爬蟲小專案

PT、小小馬發表於2019-05-10
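# Output the books listed under the "管理" (management) tag on Douban Books.
# The crawl is written in a procedural (non-object-oriented) style.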
import requests
import time
from lxml import html
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'
}

# lxml exposes its etree module through lxml.html; resolve it once, not per page.
etree = html.etree

# The tag name in the path is already percent-encoded. The query string is
# passed separately via ``params``, so no '%%' escaping of the URL is needed.
tag_url = 'https://book.douban.com/tag/%E7%AE%A1%E7%90%86'
output_path = 'doubantotal_codes.txt'
record_template = '書名:%s\n作者及出版社:%s\n豆瓣評分:%s\n評價數:%s\n作品簡介:%s\n\n'
request_timeout = 10  # seconds; without a timeout a stalled connection blocks forever
page_delay = 2.0      # seconds between listing pages (was 0.1 s per written record)


def _first_text(node, path):
    """Return the first non-blank text matched by *path* under *node*, stripped.

    Returns '' when the path matches nothing, so a missing field yields an
    empty column instead of shifting the remaining fields.
    """
    for piece in node.xpath(path):
        piece = piece.strip()
        if piece:
            return piece
    return ''


# Listing pages are addressed by a ``start`` offset in steps of 20.
for start in range(0, 990, 20):
    try:
        res = requests.get(
            url=tag_url,
            params={'start': start, 'type': 'T'},
            headers=headers,
            timeout=request_timeout,
        )
        res.raise_for_status()
    except requests.RequestException as exc:
        # Skip a page that fails instead of aborting (or hanging) the whole crawl.
        print('request for start=%s failed: %s' % (start, exc))
    else:
        cont = etree.HTML(res.text)
        records = []
        # Extract every field relative to one book's own node. Zipping
        # independent page-wide lists misaligns rows as soon as any book
        # lacks a rating or an intro.
        # NOTE(review): assumes the rating block and the intro <p> are nested
        # inside div.info on the listing page - confirm against the live HTML.
        for info in cont.xpath("//div[@class='info']"):
            records.append(record_template % (
                _first_text(info, "h2/a/text()"),
                _first_text(info, "div[@class='pub']/text()"),
                _first_text(info, ".//div[@class='star clearfix']/span[@class='rating_nums']/text()"),
                _first_text(info, ".//div[@class='star clearfix']/span[@class='pl']/text()"),
                _first_text(info, ".//p/text()"),
            ))
        if records:
            # One open per page; ``with`` closes the file even if a write fails.
            with open(output_path, 'a', encoding='utf8') as out:
                for count, record in enumerate(records, start=1):
                    out.write(record)
                    print(count)  # progress: running record number within this page
    # Throttle per request, including pages that failed or yielded no records.
    time.sleep(page_delay)

相關文章