#對豆瓣讀書中的管理標籤下的內容進行輸出
#使用程式導向的方式進行爬取
import requests
import time
from lxml import html
# Browser-like User-Agent string, split across two literals for readability.
_USER_AGENT = (
    'Mozilla/5.0 (Windows NT 6.2) AppleWebKit/535.11 '
    '(KHTML, like Gecko) Chrome/17.0.963.12 Safari/535.11'
)

# Request headers handed to requests.get() for each page fetch.
headers = {'User-Agent': _USER_AGENT}
# Walk the Douban Books listing for the "管理" (management) tag page by page
# and append every book found to doubantotal_codes.txt.

# Seconds to wait for the server before requests raises instead of hanging.
REQUEST_TIMEOUT = 10

# Output layout for one book: title, author/publisher line, rating,
# rating count and description, in that order.
RECORD_TEMPLATE = '書名:%s\n作者及出版社:%s\n豆瓣評分:%s\n評價數:%s\n作品簡介:%s\n\n'


def _non_blank(texts):
    """Return the strings in *texts* stripped of surrounding whitespace.

    Entries that are empty after stripping are dropped.
    """
    stripped = (text.strip() for text in texts)
    return [text for text in stripped if text]


etree = html.etree  # loop-invariant, so resolved once up front

# The listing is paged through the ``start`` query parameter in steps of 20.
for start in range(0, 990, 20):
    # str.format lets the percent-encoded tag be written literally; with
    # %-formatting every '%' in the URL had to be doubled.
    url = 'https://book.douban.com/tag/%E7%AE%A1%E7%90%86?start={}&type=T'.format(start)
    res = requests.get(url=url, headers=headers, timeout=REQUEST_TIMEOUT)
    cont = etree.HTML(res.text)

    titles = _non_blank(cont.xpath("//div[@class='info']/h2/a/text()"))
    publishers = _non_blank(
        cont.xpath("//div[@class='info']/div[@class='pub']/text()"))
    ratings = cont.xpath(
        "//div[@class='star clearfix']/span[@class='rating_nums']/text()")
    rating_counts = _non_blank(
        cont.xpath("//div[@class='star clearfix']/span[@class='pl']/text()"))
    # Text of every <p> on the page, minus the first two and last two
    # entries -- presumably paragraphs that are not book descriptions
    # (TODO confirm against the live page).
    descriptions = cont.xpath("//p/text()")[2:-2]

    # NOTE(review): zip() pairs the five lists purely by position and stops
    # at the shortest one, so a book without a rating or description shifts
    # the pairing for the rest of the page. Extracting the fields per book
    # node would fix this but needs the page structure to be verified.
    records = zip(titles, publishers, ratings, rating_counts, descriptions)

    # One append-mode handle per page; ``with`` closes it even when a write
    # fails part-way through.
    with open('doubantotal_codes.txt', 'a', encoding='utf8') as out_file:
        for count, record in enumerate(records, 1):
            out_file.write(RECORD_TEMPLATE % record)
            print(count)  # running count of records written for this page

    # Short pause before requesting the next page.
    time.sleep(0.1)