Python 爬蟲:獲取中國工程院院士資訊

Jocks5發表於2021-12-04
import os
import os.path
import re
import time
from urllib.parse import urljoin
from urllib.request import urlopen

# Directory that receives one "<name>.txt" file per academician.
dstDir = 'YuanShi'

# Index page listing every academician of the Chinese Academy of Engineering.
startUrl = r'http://www.cae.cn/cae/html/main/col48/column_48_1.html'

# Hrefs on the index page are resolved against the site root.
_SITE_ROOT = 'http://www.cae.cn/'

# Seconds to wait on a single HTTP request before giving up, so one stalled
# connection cannot hang the whole crawl.
_TIMEOUT = 30

# One index entry: group 1 is the href, group 2 the (possibly tagged) name.
# Both groups are bounded/non-greedy so several <li> items sharing a line
# are matched one by one instead of being swallowed into a single match.
_LINK_RE = re.compile(
    r'<li class="name_list"><a href="([^"]+)" target="_blank">(.+?)</a></li>'
)
# A single-line paragraph on a member page.
_PARA_RE = re.compile(r'<p>(.+?)</p>')
# Inline links and spacing entities that are dropped from the saved text.
# Non-greedy, so text sitting between two links of one paragraph is kept.
_NOISE_RE = re.compile(r'<a.+?</a>|&ensp;|&nbsp;')
# Any HTML tag (the index wraps some names in <h3>...</h3>).
_TAG_RE = re.compile(r'<[^>]+>')
# Characters that must not end up in a file name.
_UNSAFE_RE = re.compile(r'[\\/:*?"<>|]')


def fetch(url):
    """Download *url* and return the response body decoded as UTF-8.

    Raises OSError (including urllib's URLError/HTTPError and timeouts) on
    network failure and UnicodeDecodeError if the body is not valid UTF-8.
    """
    with urlopen(url, timeout=_TIMEOUT) as fp:
        return fp.read().decode('utf-8')


def parse_member_links(html):
    """Return a list of ``(href, raw_name)`` tuples found in the index *html*."""
    return _LINK_RE.findall(html)


def member_url(href):
    """Return the absolute URL for an *href* taken from the index page."""
    # urljoin avoids the "//" that plain concatenation produces for
    # root-relative hrefs and leaves already-absolute URLs untouched.
    return urljoin(_SITE_ROOT, href)


def clean_name(raw_name):
    """Turn the scraped *raw_name* into a string usable as a file name.

    HTML tags are removed, surrounding whitespace is trimmed, and path
    separators or other characters unsafe in file names become ``_`` so the
    output file always stays inside ``dstDir``.
    """
    return _UNSAFE_RE.sub('_', _TAG_RE.sub('', raw_name).strip())


def extract_intro(html):
    """Return the biography text of a member page, or None if it has no <p>.

    The contents of all single-line ``<p>...</p>`` elements are joined with
    newlines; inline links and ``&ensp;``/``&nbsp;`` entities are removed.
    """
    paragraphs = _PARA_RE.findall(html)
    if not paragraphs:
        return None
    return _NOISE_RE.sub('', '\n'.join(paragraphs))


def main():
    """Crawl the index page and save each academician's biography to disk."""
    os.makedirs(dstDir, exist_ok=True)

    index_html = fetch(startUrl)

    # Visit every academician linked from the index page.
    for href, raw_name in parse_member_links(index_html):
        # Progress/debug output: shows which link is being processed.
        print(href)

        page_url = member_url(href)
        try:
            page_html = fetch(page_url)
        except (OSError, UnicodeDecodeError) as exc:
            # A single broken page should not abort the remaining downloads.
            print('skipped {}: {}'.format(page_url, exc))
            continue

        intro = extract_intro(page_html)
        if intro is None:
            continue

        target = os.path.join(dstDir, clean_name(raw_name)) + '.txt'
        with open(target, 'w', encoding='utf8') as fp:
            fp.write(intro)


if __name__ == '__main__':
    main()

相關文章