Python爬取百度美女吧圖片

艾利金德發表於2018-04-01

# coding=utf-8
import requests
from lxml import etree
import os
import re


class TieBa(object):
    """Crawl a Baidu Tieba forum and download the first image of each thread.

    The crawler walks the forum's thread-list pages, opens every thread it
    finds, and stores the first ``.jpg`` image of the thread in the local
    ``image`` directory.
    """

    def __init__(self, word):
        """Prepare the list-page URL for the forum named *word*.

        Args:
            word: Forum keyword, substituted into the ``kw`` query parameter.
        """
        self.url = 'https://tieba.baidu.com/f?kw={}'.format(word)
        # NOTE(review): the very old browser User-Agent presumably makes the
        # site serve plain, easily parseable HTML -- confirm before changing.
        self.headers = {
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0; TUCOWS) '
        }

    def get_data(self, url, timeout=10):
        """Fetch *url* and return the raw response body.

        Args:
            url: Address to request.
            timeout: Seconds to wait for the server before giving up, so a
                stalled connection cannot hang the crawl forever.

        Returns:
            The response body as bytes.
        """
        response = requests.get(url, headers=self.headers, timeout=timeout)
        return response.content

    def parse_page(self, data):
        """Extract thread links and the next-page URL from a list page.

        Args:
            data: Raw HTML of a thread-list page.

        Returns:
            A ``(detail_list, next_url)`` tuple. ``detail_list`` is a list of
            ``{'title': ..., 'url': ...}`` dicts, one per thread link found.
            ``next_url`` is the URL of the following list page, or ``None``
            when the page has no "next page" link.
        """
        html = etree.HTML(data)
        if html is None:
            # Nothing parseable (e.g. empty body): report an empty last page.
            return [], None

        detail_list = []
        for node in html.xpath('//*[@id="thread_list"]/li/div/div[2]/div[1]/div[1]/a'):
            hrefs = node.xpath('./@href')
            if not hrefs:
                # A link without a target cannot be followed.
                continue
            titles = node.xpath('./text()')
            detail_list.append({
                'title': titles[0] if titles else '',
                'url': 'https://tieba.baidu.com' + hrefs[0],
            })

        # NOTE(review): the original matched only the Traditional-Chinese
        # label; the Simplified form is accepted as well because the page's
        # script cannot be verified from here -- confirm against live markup.
        next_links = html.xpath(
            '//*[@id="frs_list_pager"]'
            '/a[contains(text(), "下一頁") or contains(text(), "下一页")]/@href'
        )
        # The xpath result is empty on the last page, so never index blindly.
        next_url = 'http:' + next_links[0] if next_links and next_links[0] else None
        return detail_list, next_url

    def parse_detail(self, detail_list):
        """Return the ``'url'`` value of every entry in *detail_list*."""
        return [detail['url'] for detail in detail_list]

    def save_data(self, url):
        """Download the first image of the thread at *url* into ``image/``.

        The thread is skipped silently when it cannot be parsed, contains no
        image in its post content, or the image URL does not end in ``.jpg``.

        Args:
            url: Address of the thread page.
        """
        html = etree.HTML(self.get_data(url))
        if html is None:
            return

        image_urls = html.xpath('//*[contains(@id,"post_content")]/img[1]/@src')
        if not image_urls:
            return
        image_url = image_urls[0]
        print(image_url)

        # Only plain .jpg links are downloaded.
        if not image_url.endswith('.jpg'):
            return

        image_data = self.get_data(image_url)
        # The file keeps the last path segment of the image URL as its name.
        filename = os.path.join('image', image_url.split('/')[-1])
        with open(filename, 'wb') as f:
            f.write(image_data)

    def run(self):
        """Crawl every list page of the forum and save the thread images.

        Side effects: creates the ``image`` directory if needed and writes
        the first list page to ``tieba.json`` for manual inspection.
        """
        if not os.path.isdir('image'):
            os.makedirs('image')

        data = self.get_data(self.url)
        # Dump the first page so its markup can be inspected by hand.
        with open('tieba.json', 'wb') as f:
            f.write(data)

        while True:
            detail_list, next_url = self.parse_page(data)
            for url in self.parse_detail(detail_list):
                self.save_data(url)
            # Stop before requesting a non-existent page.
            if not next_url:
                break
            data = self.get_data(next_url)


if __name__ == '__main__':
    # Script entry point: crawl the forum whose keyword is given below.
    tb = TieBa('美女')
    tb.run()


相關文章