Python 多執行緒爬蟲與單執行緒爬蟲效率對比

專注的阿熊發表於2021-03-19

import requests

from my_test import settings

import sys

import time

import pymysql

import threading

# 繼承父類 threading.Thread

class DownLoadPictures(threading.Thread):
    """Worker thread that downloads one listing page of images.

    Each instance fetches the JSON listing at its own URL, saves every image
    in that listing under ``settings.STORE_PATH`` and records it in the
    ``beautyImages`` table through its own MySQL connection.
    """

    # Seconds to wait on any single HTTP request, so that a stalled server
    # cannot keep the thread (and the caller's join()) blocked forever.
    REQUEST_TIMEOUT = 10

    # Path separators, NUL and the characters Windows rejects in file names;
    # each one is replaced by "_" before a title is used as a file name.
    _UNSAFE_FILENAME_CHARS = str.maketrans(dict.fromkeys('\\/:*?"<>|\0', '_'))

    def __init__(self, name, sn):
        """Prepare the request settings and open the database connection.

        Args:
            name: Thread name, shown in the progress output.
            sn: Value substituted into the ``sn`` query parameter of the
                listing URL (presumably the paging offset -- confirm against
                the remote API).
        """
        super().__init__()
        self.name = name
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/88.0.4324.150 Safari/537.36',
                        'Referer': 'https://image.so.com/z?ch=beauty'}
        self.url = 'https://image.so.com/zjl?ch=beauty&sn={}'.format(sn)

        self.conn = pymysql.Connect(**settings.MYSQL_CONFIG)
        self.cursor = self.conn.cursor()

    def __del__(self):
        """Close the cursor and the connection if they were opened."""
        # __init__ may have failed before either attribute was assigned, so
        # look them up defensively instead of assuming they exist.
        for attr in ('cursor', 'conn'):
            resource = getattr(self, attr, None)
            if resource is not None:
                resource.close()

    @classmethod
    def _safe_filename(cls, title):
        """Return *title* with characters unsafe in a file name replaced by "_"."""
        return title.translate(cls._UNSAFE_FILENAME_CHARS)

    def get_resp_data(self):
        """Fetch this thread's listing URL and return the decoded JSON body."""
        print(' 當前是執行緒為 {} 的圖片下載! '.format(self.name))
        resp = requests.get(self.url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
        return resp.json()

    def run(self):
        """Thread entry point: download every image in this listing page."""
        resp_data = self.get_resp_data()
        # Anything other than a literal False in 'end' is treated as
        # "no more images" for this listing URL.
        if resp_data['end'] is not False:
            print(' 連結為 {} 已無圖片 '.format(self.url))
            return
        for elem in resp_data['list']:
            self.download_picture(elem['qhimg_downurl'], elem['title'], elem['purl'])

    def download_picture(self, downloadurl, title, fromUrl):
        """Download one image and record it, unless it is already recorded.

        Args:
            downloadurl: URL the image bytes are fetched from.
            title: Image title; used for the file name and the database row.
            fromUrl: Stored in the ``fromUrl`` column of the new row.
        """
        # Bound parameters instead of string formatting: the title and URL
        # come from the remote listing and may contain quotes.
        lookup_sql = "SELECT 1 FROM beautyImages WHERE downloadUrl = %s AND title = %s LIMIT 1"
        if self.cursor.execute(lookup_sql, (downloadurl, title)):
            print(' 標題為 {} , 連結為 {} 已存在 '.format(title, downloadurl))
            return
        # Only record images that were actually written to disk; otherwise a
        # failed download would be treated as existing and never retried.
        if self._save_image(downloadurl, title):
            self._record_image(downloadurl, title, fromUrl)

    def _save_image(self, downloadurl, title):
        """Write the image to STORE_PATH; return True only if it was saved."""
        failure = ' 標題為 {} , 連結為 {} 下載失敗 , 失敗原因是 {}'
        try:
            resp = requests.get(downloadurl, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
            if resp.status_code != requests.codes.ok:
                print(failure.format(title, downloadurl, 'HTTP {}'.format(resp.status_code)))
                return False
            path = settings.STORE_PATH + '/' + self._safe_filename(title) + '.jpg'
            with open(path, 'wb') as f:
                f.write(resp.content)
        except Exception as exc:
            # Best effort per image: report the failure and let the thread
            # continue with the remaining images.
            print(failure.format(title, downloadurl, exc))
            return False
        print(' 下載完成 ')
        return True

    def _record_image(self, downloadurl, title, fromUrl):
        """Insert a row describing a saved image and commit it."""
        insert_sql = "INSERT INTO beautyImages(title, downloadUrl, fromUrl, createTime) values (%s, %s, %s, %s)"
        created = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
        try:
            self.cursor.execute(insert_sql, (title, downloadurl, fromUrl, created))
            self.conn.commit()
        except Exception as exc:
            print(' 插入標題為 {}, 連結為 {} 失敗 , 失敗原因是 {}'.format(title, downloadurl, exc))
        else:
            print(' 插入標題為 {}, 連結為 {} 成功 !'.format(title, downloadurl))

if __name__ == '__main__':
    # Time the whole run: build one worker per listing offset, start them
    # all, then wait for every worker to finish before reporting.
    began = time.time()

    workers = [DownLoadPictures(name=str(offset), sn=offset)
               for offset in range(0, 301, 30)]

    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    elapsed = time.time() - began
    print(' 多執行緒用時: {} '.format(elapsed))


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69946337/viewspace-2763870/,如需轉載,請註明出處,否則將追究法律責任。

相關文章