This post walks through writing an asynchronous crawler based on asyncio and aiohttp, using xpath to parse the html.
The crawler implements the following features:
1. Read the crawl rules from a csv file and crawl data according to those rules; the code currently contains extraction rules for 3 different websites, and more can be added as needed.
2. Save the crawled data to a mysql database.
Given a question as input, the crawler fetches health-related content for it.
The full code is as follows:
# coding:utf-8
"""
async-apiser xpath
"""
from lxml import etree
import csv
import re
import os
import asyncio
import aiohttp
import aiomysql
from datetime import datetime
from config import Config


class HealthSpider(object):

    def __init__(self, user_id, keyword, url, hrule, drule, count, trule):
        self.user_id = user_id
        self.keyword = keyword
        self.url = url
        self.hrule = hrule
        self.drule = drule
        self.count = count
        self.trule = trule
        self.headers = {}
        self.urls_done = []
        self.urls_will = []
        self.spider_data = {}
    @staticmethod
    def handle_flag(html):
        """
        Strip inline style attributes from an html string.
        :param html:
        :return:
        """
        pattern = re.compile(r' style=".*?;"', re.S)
        return pattern.sub('', html)
    async def get_html(self, url, session):
        """
        Fetch the given url and return its html.
        :param url:
        :param session:
        :return:
        """
        try:
            async with session.get(url, headers=self.headers, timeout=5) as resp:
                if resp.status in [200, 201]:
                    data = await resp.text()
                    return data
        except Exception as e:
            raise Exception("Failed to fetch the page")
    def get_url(self, resp):
        """
        Extract the url of each item from the html.
        :param resp:
        :return:
        """
        # Parse the html of the main page
        root = etree.HTML(str(resp))
        items = root.xpath(self.hrule)
        # The html structure differs per site, so the item urls are built differently
        if self.count == 5:
            self.urls_will = ['https://dxy.com' + i for i in items[:5]]
        elif self.count == 3:
            self.urls_will = [i for i in items[:3]]
        elif self.count == 2:
            self.urls_will = [i for i in items[:2]]
    async def get_data(self, url, session, pool):
        """
        Fetch and store the detail data of one item url.
        :return:
        """
        # Fetch the html for this url
        html = await self.get_html(url, session)
        # Parse the crawled html
        root = etree.HTML(str(html))
        html_data = ''
        try:
            title = root.xpath(self.trule)
            title = ''.join(title)
        except Exception as e:
            title = ''
        try:
            data = root.xpath(self.drule)
            if data:
                # The html structure differs per site, so the data is extracted differently
                if self.count == 3:
                    html_data = ''.join(map(etree.tounicode, data))
                    # Strip the style attributes from the result
                    html_data = HealthSpider.handle_flag(html_data)
                else:
                    html_data = etree.tounicode(data[0])
                    html_data = HealthSpider.handle_flag(html_data)
        except Exception as e:
            html_data = ''
        self.urls_done.append(url)
        # Save to the database: user id, keyword, date, main url, sub url, title, html data
        if html_data:
            self.spider_data["data"].append({"title": title, "html_data": html_data})
            spide_date = datetime.now()
            data = (self.user_id, self.keyword, spide_date, self.url, url, title, html_data)
            stmt = ("INSERT INTO spider_data (user_id, keyword, spide_date, main_url, sub_url, title, html_data) "
                    "VALUES (%s, %s, %s, %s, %s, %s, %s)")
            try:
                async with pool.acquire() as conn:
                    async with conn.cursor() as cur:
                        await cur.execute(stmt, data)
            except Exception as e:
                pass
    async def start_spider(self, pool):
        """
        Crawl the collected item urls.
        :return:
        """
        async with aiohttp.ClientSession() as session:
            self.spider_data["user_id"] = self.user_id
            self.spider_data["keyword"] = self.keyword
            self.spider_data["data"] = []
            while True:
                # Stop once the url queue is empty or enough items have been crawled
                if (len(self.urls_will) == 0) or len(self.spider_data["data"]) == self.count:
                    break
                # Take the next url to crawl
                url = self.urls_will.pop()
                # Crawl it only if it has not been crawled yet
                if url not in self.urls_done:
                    await self.get_data(url, session, pool)
            return self.spider_data
    async def main(self, loop):
        # Request headers
        self.headers = {'Accept': 'text/html, application/xhtml+xml, application/xml;q=0.9,*/*;q=0.8',
                        'Accept-Encoding': 'gzip, deflate',
                        'Accept-Language': 'zh-Hans-CN, zh-Hans; q=0.5',
                        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                                      '(KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063'
                        }
        # Connect to the mysql database
        pool = await aiomysql.create_pool(host=Config.DB_HOST, port=Config.DB_PORT,
                                          user=Config.DB_USER, password=Config.DB_PASSWORD,
                                          db=Config.DB_NAME, loop=loop, charset="utf8", autocommit=True)
        async with aiohttp.ClientSession() as session:
            # Fetch the html of the main search page first
            html = await self.get_html(self.url, session)
            # Extract the item urls from it
            self.get_url(html)
            # Crawl the items and return the collected data
            data = await self.start_spider(pool)
            return data
            # asyncio.ensure_future(self.start_spider(pool))


def get_rules(keyword):
    """
    Read the xpath crawl rules from the csv file.
    :return:
    """
    csv_dict = []
    path = os.path.join(os.path.dirname(__file__), 'rules.csv')
    with open(path, 'r') as f:
        reader = csv.DictReader(f)
        for line in reader:
            url = line['url'].format(keyword)
            hrule = line['hrule']
            drule = line['drule']
            count = int(line['count'])
            title = line['trule']
            csv_dict.append({"url": url, "hrule": hrule, "drule": drule, "count": count, "trule": title})
    return csv_dict


def start_spider(keyword):
    """
    Crawl data for the given keyword.
    :param keyword:
    :return:
    """
    try:
        data_list = get_rules(keyword)
    except Exception as e:
        raise Exception("Failed to load the crawl rules")
    spider_data = []
    tasks = []
    loop = asyncio.get_event_loop()
    for i in data_list:
        spider = HealthSpider(1, keyword, i['url'], i['hrule'], i['drule'], i['count'], i['trule'])
        # Build the task list
        tasks.append(asyncio.ensure_future(spider.main(loop)))
    # Run all tasks on the event loop
    loop.run_until_complete(asyncio.wait(tasks))
    try:
        for task in tasks:
            for i in range(len(task.result()["data"])):
                spider_data.append(task.result()["data"][i])
    except Exception as e:
        pass
    # Sleep briefly so the underlying connections can close
    loop.run_until_complete(asyncio.sleep(0.250))
    loop.close()
    return spider_data


if __name__ == '__main__':
    # Crawl content related to "感冒了怎麼辦" (what to do about a cold)
    start_spider("感冒了怎麼辦")
Here is what some of the methods in the code do:
1. The handle_flag() method strips the inline style attributes from an html string while keeping all other tags, which makes the data easier to render on the front end; see the short example below.
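A minimal sketch of its effect (the sample html string below is invented for illustration):

html = '<div style="color:red;"><p style="font-size:12px;">content</p></div>'
print(HealthSpider.handle_flag(html))
# prints: <div><p>content</p></div>

Note that the regular expression only removes style attributes whose value ends with a semicolon.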
2. The get_data() method crawls the detail data and uses aiomysql to save the crawled data to the database; a possible table definition is sketched after the config file below.
The database configuration file config.py:
# coding=utf-8


class Config(object):
    DB_ENGINE = 'mysql'
    DB_HOST = '127.0.0.1'
    DB_PORT = 3306
    DB_USER = 'root'
    DB_PASSWORD = 'wyzane'
    DB_NAME = 'db_tornado'
    DB_OPTIONS = {
        'init_command': "SET sql_mode='STRICT_TRANS_TABLES'",
        'charset': 'utf8mb4',
    }
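The INSERT statement in get_data() assumes that a spider_data table already exists. Its schema is not shown in the post; a possible definition, inferred from the columns used in that INSERT (the column types and lengths are assumptions), could be:

CREATE TABLE spider_data (
    id INT AUTO_INCREMENT PRIMARY KEY,  -- surrogate key, assumed
    user_id INT,
    keyword VARCHAR(255),
    spide_date DATETIME,
    main_url VARCHAR(512),
    sub_url VARCHAR(512),
    title VARCHAR(512),
    html_data TEXT
) CHARACTER SET utf8mb4;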
3. The get_rules() method reads the crawl rules from the rules.csv file. Three different websites are crawled at the same time, and each site needs its own xpath rules and yields a different number of items, so these rules are kept in rules.csv (a plain CSV file that can be edited in Excel). The rules are read first, and then the data is crawled; a possible layout of the file is sketched below.
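The actual contents of rules.csv are not included in the post. Based on the columns that get_rules() reads, each row holds the search url (with a {} placeholder for the keyword), the xpath for the item links (hrule), the xpath for the article body (drule), the number of items to take (count, which is 5, 3 or 2 in get_url()), and the xpath for the title (trule). A possible layout, with invented URLs and xpath expressions, looks like this:

url,hrule,drule,count,trule
https://site-a.example.com/search?q={},//div[@class='result']//a/@href,//div[@class='article'],5,//h1/text()
https://site-b.example.com/search?q={},//h3[@class='title']/a/@href,//div[@id='content'],3,//h1/text()
https://site-c.example.com/so?word={},//p[@class='tit']/a/@href,//article,2,//h1/text()

The site whose count is 5 is expected to return relative links, which get_url() prefixes with https://dxy.com.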
That is the complete code for the asyncio-based asynchronous crawler. If you spot any mistakes, feedback and corrections are welcome!