由於個人部落格沒有博文統計的功能,於是自己手寫了一個爬蟲,用於獲取當前博文數量與字數。具體的思路是先獲取整個文章列表,然後遍歷每篇文章來統計數量與字數。
import requests
from lxml import etree
import re
import random
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
def post_statistic():
    """Crawl the blog's post listing and print post count and total words.

    Fetches the paginated listing under ``/posts/``, collects every post
    link, then visits each post and sums the word count found in its
    metadata line. Progress and the final totals are printed to stdout;
    nothing is returned.

    Raises:
        requests.RequestException: if a request fails, times out, or
            comes back with an HTTP error status.
        IndexError: if a post page does not expose a word count where the
            XPath/regex below expect it.
    """
    from urllib.parse import urljoin

    start_url = 'https://silencehuliang.github.io/posts/'
    timeout = 10  # seconds; without a timeout a stalled request never returns
    # Raw string so that ``\d`` is a regex token, not an invalid str escape.
    # NOTE(review): the pattern must match the exact characters the site
    # renders (character variant included) - confirm against a live page.
    word_count_re = re.compile(r'約 (\d+) 字')

    # One session for the whole crawl so connections to the host are reused.
    with requests.Session() as session:

        def fetch(url):
            """Download *url* and return its parsed lxml HTML tree."""
            response = session.get(url, timeout=timeout)
            # Fail loudly rather than parse an error page and miscount.
            response.raise_for_status()
            return etree.HTML(response.content.decode())

        html = fetch(start_url)

        # Total pages = highest page number linked from the pagination bar.
        # A fixed ``li[5]`` breaks when the bar has fewer items or is absent
        # (single-page blog), so fall back to 1 when no number is found.
        # NOTE(review): assumes the bar always links the last page - confirm.
        page_numbers = [
            int(text)
            for text in html.xpath('//ul[@class="pagination"]//a/text()')
            if text.strip().isdecimal()
        ]
        page = max(page_numbers, default=1)
        print('當前部落格總頁數為:{}'.format(page))

        archive_count = 0
        post_url_list = []
        for i in range(1, page + 1):
            print("開始訪問第{}頁".format(i))
            if i == 1:
                # Page 1 is the listing already downloaded above; requesting
                # /posts/page/1/ again would waste a round trip or count the
                # same posts twice.
                page_url = start_url
            else:
                page_url = 'https://silencehuliang.github.io/posts/page/{}/'.format(i)
                html = fetch(page_url)
            archive_count += len(html.xpath('//article [@class="archive-item"]'))
            # Resolve hrefs against the page they came from so both
            # root-relative and relative links yield valid URLs.
            post_url_list.extend(
                urljoin(page_url, href)
                for href in html.xpath('//a [@class="archive-item-link"]/@href')
            )

        num = 0
        for post_url in post_url_list:
            html = fetch(post_url)
            meta_text = html.xpath('//div [@class="post-meta"]/div[2]/text()[4]')
            match = word_count_re.search(meta_text[0]) if meta_text else None
            if match is None:
                # Same exception type the bare ``[0]`` lookups used to raise,
                # but naming the post that broke the assumption.
                raise IndexError('no word count found on {}'.format(post_url))
            num += int(match.group(1))

    print("目前博文數量為:{},總字數為:{}".format(archive_count, num))
if __name__ == '__main__':
    # Run the crawl only when executed as a script, not when imported.
    post_statistic()
本作品採用《CC 協議》,轉載必須註明作者和本文連結