爬蟲作業一

宇宙無敵帥寶寶發表於2020-11-27
from bs4 import  BeautifulSoup
import requests
import time
import random
import re
# Matches an ASCII-parenthesised aside inside a chapter title, e.g. "(...)".
PAREN_NOTE = re.compile(r'\(.*?\)')
# Characters that Windows does not allow in a file name.
ILLEGAL_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|]')


def clean_title(raw_title):
    """Return *raw_title* with every ``(...)`` aside removed.

    This strips the author's parenthesised pleas for votes from chapter
    titles. Only ASCII parentheses are matched; chapters that are entirely
    author commentary are not filtered out.
    """
    return PAREN_NOTE.sub('', raw_title)


def safe_filename(name):
    """Return *name* with characters that are illegal in Windows file names
    replaced by underscores, so it can be used as a file name."""
    return ILLEGAL_FILENAME_CHARS.sub('_', name)


def main():
    """Download every chapter linked from the index page.

    Each chapter is appended to one combined text file and also written to
    its own file in the output directory. The output directory must already
    exist.
    """
    url = ''  # fill in the index-page URL yourself; omitted for copyright reasons
    index = requests.get(url=url, timeout=30)
    index.encoding = 'utf-8'
    soup = BeautifulSoup(index.text, 'lxml')
    dd_list = soup.find('div', id='list')
    # Drop the first 12 links; presumably they are not part of the main
    # chapter list -- verify against the target site.
    a_list = dd_list.find_all('a')[12:]
    print(a_list)
    # Escaped (non-raw) literal: a raw string cannot end with a backslash,
    # and the old workaround of a trailing space leaked into every file name.
    out_dir = 'C:\\爬蟲實驗\\'
    # The context manager guarantees the combined file is flushed and closed
    # even if a request fails halfway through.
    with open('./南明第一狠人.txt', 'w', encoding='utf-8') as fp:
        for a in a_list:
            # Fixed delay plus a random delay between requests; both values
            # are adjustable, but too short an interval gets the client
            # blocked by the site for requesting too frequently.
            time.sleep(4.5)
            time.sleep(random.random() * 3.24)
            # a.string is None when the link has more than one child node.
            title = clean_title(a.string or a.get_text())
            print(title)  # chapter name
            detail_url = 'http://ajnnan.com' + a['href']
            print(detail_url)  # chapter URL
            resp = requests.get(url=detail_url, timeout=30)
            # Decode the body as UTF-8 directly instead of round-tripping the
            # text through ISO-8859-1, which only works when requests guessed
            # that encoding.
            resp.encoding = 'utf-8'
            detail_s = BeautifulSoup(resp.text, 'lxml')
            div_t = detail_s.find('div', attrs={'id': 'content'})
            if div_t is None:
                # No content container on this page: report and move on
                # rather than aborting the whole run.
                print('跳過' + title, '找不到正文!')
                continue
            c = div_t.text
            fp.write(title + ':' + c + '\n')
            chapter_path = out_dir + safe_filename(title) + '.txt'
            with open(chapter_path, 'w', encoding='utf-8') as f:
                f.write(title + ':' + c + '\n')
            print('返回'+title,'爬取成功!')


if __name__ == '__main__':
    main()

在這裡插入圖片描述
在這裡插入圖片描述
稽核問題,之前的不能檢視

相關文章