併發爬蟲_使用motor儲存資料

KOJYA發表於2024-10-12
 1 import redis
 2 import chardet
 3 import hashlib
 4 import asyncio
 5 import aiohttp
 6 from lxml import etree
 7 from fake_useragent import UserAgent
 8 from motor.motor_asyncio import AsyncIOMotorClient
 9 
10 
class CarSpider:
    """Concurrent crawler for che168.com car listings, stored via motor.

    Listing pages are downloaded with aiohttp, every ``specid`` found on a
    page is resolved through the parameter API, and each new record is
    inserted into the ``py_spider.car_info`` MongoDB collection.  A Redis
    set named ``car:filter`` keeps MD5 fingerprints of the records so the
    same record is not inserted twice.
    """

    # Shared by every instance; created once when the class body executes.
    user_agent = UserAgent()
    redis_client = redis.Redis()
    mongo_client = AsyncIOMotorClient('localhost', 27017)['py_spider']['car_info']

    # chardet results that are accepted as a real listing page.
    _PAGE_ENCODINGS = ('GB2312', 'ISO-8859-1')

    # (record key, index into paramtypeitems, index into paramitems).
    # The order is significant: get_md5() hashes str(item), which follows
    # the dict's insertion order.
    _FIELD_PATHS = (
        ('name', 0, 0),
        ('price', 0, 1),
        ('brand', 0, 2),
        ('altitude', 1, 2),
        ('breadth', 1, 1),
        ('length', 1, 0),
    )

    def __init__(self):
        """Set the listing-page and parameter-API URL templates."""
        self.url = 'https://www.che168.com/china/a0_0msdgscncgpi1ltocsp{}exf4x0/?pvareaid=102179#currengpostion'
        self.api_url = 'https://cacheapigo.che168.com/CarProduct/GetParam.ashx?specid={}'

    def __del__(self):
        # Close the Redis client when the spider object is finalized.
        self.redis_client.close()

    @staticmethod
    async def _run_all(coros):
        """Run the given coroutines concurrently and report their failures.

        A failing coroutine does not abort its siblings; its exception is
        printed instead of being left unretrieved on the task object.
        """
        outcomes = await asyncio.gather(*coros, return_exceptions=True)
        for outcome in outcomes:
            if isinstance(outcome, Exception):
                print('任務執行失敗:', repr(outcome))

    async def get_car_id(self, page, session):
        """Fetch one listing page and crawl the details of every car on it.

        Args:
            page: Page number substituted into the listing URL template.
            session: The aiohttp client session used for all requests.
        """
        headers = {'User-Agent': self.user_agent.random}
        async with session.get(self.url.format(page), headers=headers) as response:
            content = await response.read()

        encoding = chardet.detect(content)['encoding']
        if encoding not in self._PAGE_ENCODINGS:
            print('錯誤頁面...')
            return

        # errors='replace' keeps one stray byte from discarding the whole page.
        tree = etree.HTML(content.decode('gbk', errors='replace'))
        id_list = tree.xpath('//ul[@class="viewlist_ul"]/li/@specid') if tree is not None else []
        if not id_list:
            print('id為空...')
            return

        # Scheduling goes through asyncio itself rather than a module-level
        # event-loop variable, so the class also works when imported.
        await self._run_all(self.get_car_info(spec_id, session) for spec_id in id_list)

    async def get_car_info(self, spec_id, session):
        """Fetch the parameter data for one car and store the extracted record.

        Args:
            spec_id: Value of a listing item's ``specid`` attribute.
            session: The aiohttp client session used for all requests.
        """
        headers = {'User-Agent': self.user_agent.random}
        async with session.get(self.api_url.format(spec_id), headers=headers) as response:
            result = await response.json()

        payload = result.get('result') if isinstance(result, dict) else None
        groups = payload.get('paramtypeitems') if isinstance(payload, dict) else None
        if not groups:
            print('資料不存在...')
            return

        try:
            item = {
                key: groups[group_idx]['paramitems'][param_idx]['value']
                for key, group_idx, param_idx in self._FIELD_PATHS
            }
        except (IndexError, KeyError, TypeError):
            # The response has fewer groups/params than the fixed positions expect.
            print('資料格式異常:', spec_id)
            return

        await self.save_car_info(item)

    @staticmethod
    def get_md5(dict_item):
        """Return the hex MD5 digest of ``str(dict_item)`` encoded as UTF-8.

        The digest is the de-duplication fingerprint of a record.
        """
        return hashlib.md5(str(dict_item).encode('utf-8')).hexdigest()

    async def save_car_info(self, item):
        """Insert *item* into MongoDB unless its fingerprint is already known.

        Args:
            item: The record dict to store.

        Raises:
            Exception: Whatever ``insert_one`` raises; the fingerprint is
                removed from Redis first so the record can be retried later.
        """
        md5_hash = self.get_md5(item)
        # NOTE: redis.Redis() is the synchronous client, so this call blocks
        # the event loop for the duration of the round trip.
        if not self.redis_client.sadd('car:filter', md5_hash):
            print('資料重複...')
            return

        try:
            await self.mongo_client.insert_one(item)
        except Exception:
            # Undo the fingerprint, otherwise the record would be treated as
            # a duplicate forever although it was never stored.
            self.redis_client.srem('car:filter', md5_hash)
            raise
        print('資料插入成功:', item)

    async def main(self, first_page=1, last_page=100):
        """Crawl the listing pages ``first_page`` .. ``last_page`` concurrently.

        Args:
            first_page: First page number to crawl (inclusive).
            last_page: Last page number to crawl (inclusive).
        """
        async with aiohttp.ClientSession() as session:
            await self._run_all(
                self.get_car_id(page, session)
                for page in range(first_page, last_page + 1)
            )
80 
81 
if __name__ == '__main__':
    # Script entry point: build the spider, then drive its main() coroutine
    # to completion on the event loop.
    car_spider = CarSpider()
    # The event loop stays bound to the module-level name `loop` so that any
    # code looking it up as a global keeps working.
    loop = asyncio.get_event_loop()
    loop.run_until_complete(car_spider.main())

相關文章