Scraping Biquge (筆趣閣) novel data with BeautifulSoup
# -*- coding: utf-8 -*-
# author : heart
# blog_url : https://www.cnblogs.com/ssrheart/
# time : 2024/3/30
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

headers = {
    'User-Agent': UserAgent().random,
}
proxies = {
    'http': 'http://221.6.139.190:9002'
}
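# NOTE: the proxy above is a free public endpoint copied from the original
# post; it may well be dead by now. Swap in your own proxy, or drop the
# proxies= argument to connect directly.
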
def spider_title(url):
    """Fetch the chapter list page and return [{'href': ..., 'title': ...}]."""
    html = requests.get(url=url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(html, 'lxml')
    dd_list = soup.find('div', class_='listmain').find_all('dd')
    title_list = []
    for dd in dd_list:
        # Skip the "expand all chapters" placeholder row.
        if '<<---展開全部章節--->>' in dd.text:
            continue
        href = 'https://www.bqgbb.cc' + dd.a.get('href')
        title_list.append({
            'href': href,
            'title': dd.a.text,
        })
    return title_list

def spider_content(url):
    """Fetch a single chapter page and return its body text."""
    html = requests.get(url=url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(html, 'lxml')
    content = soup.find('div', class_='Readarea ReadAjax_content').text
    return content

def save(title, content):
    """Write one chapter to xiaoshuo/<title>.txt next to this script."""
    base_dir = os.path.dirname(__file__)
    folder = os.path.join(base_dir, 'xiaoshuo')
    os.makedirs(folder, exist_ok=True)
    path = os.path.join(folder, f'{title}.txt')
    with open(path, 'w', encoding='utf-8') as f:
        f.write(content)
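# Caveat (my addition, not in the original script): chapter titles scraped from
# the site can contain characters that are illegal in filenames (e.g. '/', '?').
# A defensive save() would sanitize the title first, along the lines of:
#     safe_title = re.sub(r'[\\/:*?"<>|]', '_', title)  # needs `import re`
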
def main():
    chapters = spider_title(url='https://www.bqgbb.cc/book/11174/')
    for index, data in enumerate(chapters, start=1):
        title = data['title']
        href = data['href']
        time.sleep(random.randint(1, 3))  # throttle requests politely
        content = spider_content(url=href)
        save(title, content)
        print(f'[{index}/{len(chapters)}] {title} downloaded')


if __name__ == '__main__':
    main()
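
Requests routed through a free public proxy fail often, and the script above dies on the first timeout. A minimal retry sketch (the fetch_html helper and its retries/delay parameters are my own assumptions, not part of the original post):

import time

import requests
from fake_useragent import UserAgent

def fetch_html(url, retries=3, delay=2):
    # Try a few times, backing off a little longer after each failure.
    for attempt in range(1, retries + 1):
        try:
            resp = requests.get(
                url,
                headers={'User-Agent': UserAgent().random},
                proxies={'http': 'http://221.6.139.190:9002'},
                timeout=10,
            )
            resp.raise_for_status()  # surface HTTP 4xx/5xx as exceptions
            return resp.text
        except requests.RequestException:
            if attempt == retries:
                raise
            time.sleep(delay * attempt)

spider_title and spider_content could then call fetch_html(url) instead of requests.get(...).text.
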
Scraping Douban Top 250 data with XPath
# -*- coding: utf-8 -*-
# author : heart
# blog_url : https://www.cnblogs.com/ssrheart/
# time : 2024/3/31
import requests
from fake_useragent import UserAgent
from lxml import etree
class SpiderDB:
    def __init__(self):
        self.headers = {
            'User-Agent': UserAgent().random,
        }
        self.proxies = {
            'http': 'http://221.6.139.190:9002'
        }

    def spider_tag(self):
        """Build the 10 paginated list URLs: start=0, 25, ..., 225."""
        tagurl_list = []
        for i in range(250 // 25):
            if i == 0:
                tag_url = 'https://movie.douban.com/top250'
            else:
                tag_url = f'https://movie.douban.com/top250?start={i * 25}'
            tagurl_list.append(tag_url)
        return tagurl_list

    def spider_info(self, url):
        """Parse one list page and return a list of per-movie dicts."""
        response = requests.get(url=url, headers=self.headers, proxies=self.proxies).text
        tree = etree.HTML(response)
        info = tree.xpath('//li/div[@class="item"]/div[@class="info"]')
        data_list = []
        for i in info:
            # Some fields are missing for a few entries, hence the fallbacks.
            try:
                title = i.xpath('./div[1]/a/span[1]/text()')[0].strip()
            except IndexError:
                title = ''
            try:
                title_eng = i.xpath('./div[1]/a/span[2]/text()')[0].replace('\xa0', '').strip()
            except IndexError:
                title_eng = ''
            try:
                other_title = i.xpath('./div[1]/a/span[3]/text()')[0].replace('\xa0', '').strip()
            except IndexError:
                other_title = ''
            actor = i.xpath('./div[2]/p/text()')[0].replace('\xa0', '').strip()
            publish_time = i.xpath('./div[2]/p/text()')[1].replace('\xa0', '').strip()
            score = i.xpath('./div[2]/div/span[2]/text()')[0]
            # Strip the trailing 3-character '人评价' ("people rated") suffix.
            pingjia_people = i.xpath('./div[2]/div/span[4]/text()')[0][0:-3]
            try:
                quote = i.xpath('./div[2]/p[@class="quote"]/span/text()')[0]
            except IndexError:
                quote = ''
            data_list.append({
                'title': title,
                'title_eng': title_eng,
                'other_title': other_title,
                'actor': actor,
                'publish_time': publish_time,
                'score': score,
                'pingjia_people': pingjia_people,
                'quote': quote,
            })
        return data_list

    def main(self):
        tag_urls = self.spider_tag()
        data_list_all = []
        for url in tag_urls:
            res = self.spider_info(url)
            data_list_all.extend(res)
        print(len(data_list_all))  # 250
        # Return the data so callers can persist it (the original only printed).
        return data_list_all

if __name__ == '__main__':
    spider = SpiderDB()
    spider.main()
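
As written, the script only prints the record count and discards the scraped data. A short persistence sketch (the save_csv helper and the douban_top250.csv filename are my assumptions, not in the original post), relying on main() returning data_list_all as above:

import csv

def save_csv(data_list, path='douban_top250.csv'):
    # utf-8-sig so spreadsheet apps detect the encoding and render Chinese titles
    with open(path, 'w', newline='', encoding='utf-8-sig') as f:
        writer = csv.DictWriter(f, fieldnames=data_list[0].keys())
        writer.writeheader()
        writer.writerows(data_list)

spider = SpiderDB()
save_csv(spider.main())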