Web Scraping Examples

Posted by ssrheart on 2024-03-31

Scraping novel data from Biquge with BeautifulSoup

The script below grabs a novel's chapter index, fetches each chapter body, and writes every chapter to its own .txt file.

# -*- coding: utf-8 -*-
# author : heart
# blog_url : https://www.cnblogs.com/ssrheart/
# time : 2024/3/30
import os
import random
import time

import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent

# Randomize the User-Agent per run so requests look less uniform.
headers = {
    'User-Agent': UserAgent().random,
}
# Route requests through an HTTP proxy (swap in your own if this one is dead).
proxies = {
    'http': 'http://221.6.139.190:9002'
}


def spider_title(url):
    """Fetch the novel's index page and collect href/title for every chapter."""
    response = requests.get(url=url, headers=headers, proxies=proxies).text

    soup = BeautifulSoup(response, 'lxml')

    # Every chapter link lives in a <dd> inside the div.listmain block.
    dd_list = soup.find('div', class_='listmain').find_all('dd')
    title_list = []
    for dd in dd_list:
        # Skip the "expand all chapters" placeholder row.
        if '<<---展開全部章節--->>' in dd.text:
            continue
        href = 'https://www.bqgbb.cc' + dd.a.get('href')
        title_list.append({
            'href': href,
            'title': dd.a.text,
        })
    return title_list


def spider_content(url):
    """Fetch one chapter page and return the chapter body as plain text."""
    response = requests.get(url=url, headers=headers, proxies=proxies).text
    soup = BeautifulSoup(response, 'lxml')
    # The chapter body sits in div.Readarea.ReadAjax_content.
    content = soup.find('div', class_='Readarea ReadAjax_content').text
    return content


def save(title, content):
    """Write one chapter to xiaoshuo/<title>.txt next to this script."""
    base_dir = os.path.dirname(__file__)
    wenjian = os.path.join(base_dir, 'xiaoshuo')  # output folder
    os.makedirs(wenjian, exist_ok=True)
    lujing = os.path.join(wenjian, f'{title}.txt')  # output file path
    with open(lujing, 'w', encoding='utf-8') as f:
        f.write(content)


def main():
    chapter_list = spider_title(url='https://www.bqgbb.cc/book/11174/')
    for data in chapter_list:
        title = data['title']
        href = data['href']
        # Sleep 1-3 seconds between requests to avoid hammering the site.
        time.sleep(random.randint(1, 3))
        content = spider_content(url=href)
        save(title, content)
        print(f'{title} downloaded')


if __name__ == '__main__':
    main()
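One caveat: chapter titles scraped from the page can contain characters that are illegal in file names (for example "?" or "/"), which would make save() raise OSError. A minimal sketch of a sanitizer, with an assumed character set that is not part of the original script:

import re

def safe_filename(title):
    # Replace characters Windows/Unix forbid in file names with underscores.
    return re.sub(r'[\\/:*?"<>|]', '_', title).strip()

# Usage: save(safe_filename(title), content)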

Scraping Douban Top 250 data with XPath

The second script walks all ten pages of the Top 250 list, extracts each movie's titles, credits, release info, score, rating count, and tagline with XPath, and collects the results into a list of dicts.

  • Using XPath (lxml.etree); a minimal sketch of the API follows, then the full script.
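For orientation (an addition, not from the original post): etree.HTML parses an HTML string into an element tree, an absolute XPath ('//...') searches the whole document, and a relative XPath ('./...') searches below a given node.

from lxml import etree

# Assumed toy markup, not Douban's real HTML.
html = '<ul><li class="item"><a href="/m/1">Movie One</a></li></ul>'
tree = etree.HTML(html)
for li in tree.xpath('//li[@class="item"]'):
    print(li.xpath('./a/@href')[0])   # /m/1
    print(li.xpath('./a/text()')[0])  # Movie One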
# -*- coding: utf-8 -*-
# author : heart
# blog_url : https://www.cnblogs.com/ssrheart/
# time : 2024/3/31

import requests
from fake_useragent import UserAgent
from lxml import etree


class SpiderDB:
    def __init__(self):
        self.headers = {
            'User-Agent': UserAgent().random,
        }
        self.proxies = {
            'http': 'http://221.6.139.190:9002'
        }

    def spider_tag(self):
        # The Top 250 is paginated 25 per page: the first page has no query
        # string, later pages use ?start=25, 50, ..., 225.
        tagurl_list = []
        for i in range(250 // 25):
            if i == 0:
                tag_url = 'https://movie.douban.com/top250'
            else:
                tag_url = f'https://movie.douban.com/top250?start={i * 25}'
            tagurl_list.append(tag_url)
        return tagurl_list

    def spider_info(self, url):
        response = requests.get(url=url, headers=self.headers, proxies=self.proxies).text
        tree = etree.HTML(response)

        # One div.info per movie card on the page.
        info = tree.xpath('//li/div[@class="item"]/div[@class="info"]')
        data_list = []
        for i in info:
            # Optional fields are guarded: a missing node makes xpath()
            # return an empty list, so indexing [0] raises IndexError.
            try:
                title = i.xpath('./div[1]/a/span[1]/text()')[0].strip()
            except IndexError:
                title = ''
            try:
                title_eng = i.xpath('./div[1]/a/span[2]/text()')[0].replace('\xa0', '').strip()
            except IndexError:
                title_eng = ''
            try:
                other_title = i.xpath('./div[1]/a/span[3]/text()')[0].replace('\xa0', '').strip()
            except IndexError:
                other_title = ''

            actor = i.xpath('./div[2]/p/text()')[0].replace('\xa0', '').strip()
            publish_time = i.xpath('./div[2]/p/text()')[1].replace('\xa0', '').strip()

            score = i.xpath('./div[2]/div/span[2]/text()')[0]
            # Drop the trailing "人評價" ("people rated") suffix, keeping the count.
            pingjia_people = i.xpath('./div[2]/div/span[4]/text()')[0][0:-3]
            try:
                quote = i.xpath('./div[2]/p[@class="quote"]/span/text()')[0]
            except IndexError:
                quote = ''
            data_list.append({
                'title': title,
                'title_eng': title_eng,
                'other_title': other_title,
                'actor': actor,
                'publish_time': publish_time,
                'score': score,
                'pingjia_people': pingjia_people,
                'quote': quote,
            })
        return data_list

    def main(self):
        tag_url = self.spider_tag()
        data_list_all = []
        for url in tag_url:
            res = self.spider_info(url)
            data_list_all.extend(res)
        print(len(data_list_all))  # 250


if __name__ == '__main__':
    spider = SpiderDB()
    spider.main()
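main() stops at printing the record count. A minimal sketch of persisting the results with the standard library (an addition, not the author's code; the file name and field order are assumptions):

import csv

def save_csv(data_list_all, path='douban_top250.csv'):
    fields = ['title', 'title_eng', 'other_title', 'actor',
              'publish_time', 'score', 'pingjia_people', 'quote']
    # utf-8-sig so spreadsheet apps detect the encoding of the Chinese fields.
    with open(path, 'w', encoding='utf-8-sig', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=fields)
        writer.writeheader()
        writer.writerows(data_list_all)

# e.g. at the end of main(): save_csv(data_list_all)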
