利用爬蟲獲取當前博文數量與字數

SilenceHL發表於2021-06-11

由於個人部落格沒有博文統計的功能,我自己手寫了一個爬蟲,用來獲取當前的博文數量與總字數。具體思路是:先獲取整個文章列表,再逐篇遍歷文章,累計文章數量與字數。

import requests
from lxml import etree
import re
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Pattern for the "約 N 字" word-count label shown in a post's metadata.
# Raw string so that ``\d`` is not treated as an (invalid) string escape.
_WORD_COUNT_RE = re.compile(r'約 (\d+) 字')


def _fetch_html(url, timeout=10):
    """Download *url* and return the body parsed as an lxml HTML tree.

    Args:
        url: Absolute URL to fetch.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the crawl indefinitely.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-2xx HTTP status.
    """
    response = requests.get(url, timeout=timeout)
    response.raise_for_status()
    return etree.HTML(response.content.decode())


def _page_count(html):
    """Return the highest page number linked from the pagination widget.

    Falls back to 1 when the listing has no pagination (or no numeric
    links), instead of relying on a fixed ``li`` position.
    """
    numbers = [
        int(text)
        for text in html.xpath('//ul [@class="pagination"]//a/text()')
        if text.strip().isdecimal()
    ]
    return max(numbers, default=1)


def post_statistic():
    """Crawl the blog's post listing and report post count and word count.

    Walks every page of the paginated listing under ``/posts/`` to collect
    post links, then visits each post and adds up the number found in its
    "約 N 字" metadata label. Progress and the final totals are printed.

    Returns:
        A ``(post_count, word_count)`` tuple with the same totals that are
        printed.

    Raises:
        requests.RequestException: If any page cannot be downloaded.
    """
    base_url = 'https://silencehuliang.github.io'

    # The landing page of the listing is page 1; it also carries the
    # pagination widget used to find out how many pages exist.
    html = _fetch_html(base_url + '/posts/')
    page = _page_count(html)
    print('當前部落格總頁數為:{}'.format(page))

    archive_count = 0
    post_url_list = []
    for i in range(1, page + 1):
        print("開始訪問第{}頁".format(i))
        if i > 1:
            # Page 1 is already loaded above; re-requesting /page/1/ would
            # be a wasted round trip (or a double count if it served the
            # same listing again).
            html = _fetch_html(base_url + '/posts/page/{}/'.format(i))
        archive_count += len(html.xpath('//article [@class="archive-item"]'))
        post_url_list.extend(html.xpath('//a [@class="archive-item-link"]/@href'))

    num = 0
    for p_url in post_url_list:
        post_url = base_url + p_url
        html = _fetch_html(post_url)
        meta_text = html.xpath('//div [@class="post-meta"]/div[2]/text()[4]')
        match = _WORD_COUNT_RE.search(meta_text[0]) if meta_text else None
        if match is None:
            # A post without the expected word-count label should not abort
            # the whole run; report it and keep counting the rest.
            print("無法取得字數,略過:{}".format(post_url))
            continue
        num += int(match.group(1))

    print("目前博文數量為:{},總字數為:{}".format(archive_count, num))
    return archive_count, num


# Run the crawl only when this file is executed directly as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    post_statistic()
本作品採用《CC 協議》,轉載必須註明作者和本文連結

相關文章