python多執行緒爬取糗事百科

艾利金德發表於2018-04-03
# coding=utf-8
import requests
from lxml import etree
import json
import threading
from queue import Queue
import time


class XiuShi(object):
    """Multi-threaded scraper for qiushibaike.com.

    Four-stage pipeline connected by queues:
        url_list -> get_data -> parse_page -> save_data
    Each stage runs in one or more daemon threads; the main thread
    blocks on Queue.join() until every queued item has been processed.
    """

    def __init__(self):
        # Listing-page URL template; url_list() fills in page numbers 1-13.
        self.url = 'https://www.qiushibaike.com/8hr/page/{}'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36'
        }
        # Explicit utf-8: json.dumps(..., ensure_ascii=False) emits non-ASCII
        # text, which would crash on platforms with a non-UTF-8 default encoding.
        self.file = open('qiushi.json', 'w', encoding='utf-8')
        self.url_list_queue = Queue()    # URLs waiting to be fetched
        self.get_data_queue = Queue()    # raw response bodies waiting to be parsed
        self.parse_page_queue = Queue()  # parsed record lists waiting to be saved

    def url_list(self):
        """Generate the 13 listing-page URLs and enqueue them for fetching."""
        print('正在生成url列表')
        for i in range(1, 14):
            self.url_list_queue.put(self.url.format(i))

    def get_data(self):
        """Worker: fetch each queued URL and enqueue the response body.

        A 503 response (rate limiting) re-queues the URL for retry
        instead of dropping it.
        """
        while True:
            print('正在傳送請求')
            url = self.url_list_queue.get()
            response = requests.get(url, headers=self.headers)
            if response.status_code == 503:
                # Throttled: put the URL back so another worker retries it.
                self.url_list_queue.put(url)
            else:
                self.get_data_queue.put(response.content)
            # Mark this queue item as fully processed (pairs with .get()).
            self.url_list_queue.task_done()

    def parse_page(self):
        """Worker: parse each queued response body into a list of post dicts."""
        while True:
            print('正在解析資料')
            data = self.get_data_queue.get()
            html = etree.HTML(data)
            # Each post lives in an element whose id contains "qiushi_tag_".
            node_list = html.xpath('//*[contains(@id,"qiushi_tag_")]')
            qiushi_list = []
            for node in node_list:
                qiu_dict = dict()
                try:
                    qiu_dict['user'] = node.xpath('./div[1]/a[2]/h2/text()')[0].strip()
                    qiu_dict['age'] = node.xpath('./div[1]/div/text()')[0]
                    qiu_dict['url'] = 'https://www.qiushibaike.com' + node.xpath('./div[1]/a[1]/@href')[0]
                    qiu_dict['gender'] = node.xpath('./div[1]/div/@class')[0].split(' ')[-1]
                except IndexError:
                    # Anonymous posts lack the author sub-tree, so the [0]
                    # indexing above raises IndexError; fall back to defaults.
                    qiu_dict['user'] = '匿名使用者'
                    qiu_dict['age'] = None
                    qiu_dict['url'] = None
                    qiu_dict['gender'] = None
                qiu_dict['content'] = ''.join(node.xpath('./a/div/span/text()')).strip()
                qiushi_list.append(qiu_dict)
            self.parse_page_queue.put(qiushi_list)
            # Mark this queue item as fully processed (pairs with .get()).
            self.get_data_queue.task_done()

    def save_data(self):
        """Worker: write each parsed record to qiushi.json, one JSON object per line."""
        while True:
            print('正在儲存資料')
            qiushi_list = self.parse_page_queue.get()
            for qiushi in qiushi_list:
                json_data = json.dumps(qiushi, ensure_ascii=False) + ',\n'
                print(json_data)
                self.file.write(json_data)
            # Mark this queue item as fully processed (pairs with .get()).
            self.parse_page_queue.task_done()

    def __del__(self):
        """Close the output file when the scraper object is collected."""
        self.file.close()

    def run(self):
        """Start all worker threads and wait until every queue is drained."""
        threading_list = []
        # One thread to produce the URLs.
        threading_list.append(threading.Thread(target=self.url_list))
        # Three fetcher threads.
        for _ in range(3):
            threading_list.append(threading.Thread(target=self.get_data))
        # Three parser threads.
        for _ in range(3):
            threading_list.append(threading.Thread(target=self.parse_page))
        # One writer thread (single writer keeps file access serialized).
        threading_list.append(threading.Thread(target=self.save_data))

        for t in threading_list:
            # Daemon threads die with the main thread; setDaemon() is
            # deprecated since Python 3.10 in favor of the attribute.
            t.daemon = True
            t.start()

        # Block until each queue reports task_done() for every put().
        for q in (self.url_list_queue, self.get_data_queue, self.parse_page_queue):
            q.join()

if __name__ == '__main__':
    # Entry point: build the scraper and run the threaded pipeline.
    # (Removed stray "複製程式碼" blog-copy artifact that made this line a SyntaxError.)
    qiu = XiuShi()
    qiu.run()


相關文章