import aiohttp, aiofiles
from aiohttp.client_exceptions import ClientConnectionError
import asyncio
import os
import re
# Patterns are raw strings so that `\d` reaches the regex engine verbatim
# instead of being treated as an (invalid) string escape sequence.

# Gallery links on a listing page; group 1 is the gallery URL.
RE_IMG_PAGES = re.compile(r'''<li><a href=["'](https://www\.mzitu\.com/\d+)["']''')
# "Next page" link of the paginated listing; group 1 is the next listing URL.
RE_LIST_NEXT_PAGE = re.compile(r'''next page-numbers" href=["'](https://www\.mzitu\.com/page/\d+/)["']>''')
# Main image of a gallery page; group 1 is the image URL, group 2 its alt text.
# No DOTALL flag: the <img> tag must sit on the same line as the opening <div>.
RE_IMG_INFO = re.compile(r'''<div class="main-image">.+?<img src=["']([^"']+?)["'] alt=["']([^"']+?)["']''')
# Link to the next image page of a gallery; group 1 is its URL.
# Both the Traditional (頁) and Simplified (页) spelling of the "next page"
# label are accepted.
# NOTE(review): presumably the live site serves the Simplified label - confirm.
RE_IMG_NEXT_PAGE = re.compile(r'''href=["']([^"']+?/\d+/\d+)["']><span>下一[頁页]''')
# Characters that are replaced with "_" before a title is used as a directory name.
RE_SUB_DIRNAME = re.compile(r'[<>/\\|:*?]')
async def download(url, retries=0):
    """Fetch *url* with a GET request and return the response body as bytes.

    The request carries a fixed User-Agent and a Referer pointing at the site
    root, and redirects are not followed. A response with any status other
    than 200, or a connection-level failure, counts as a failed attempt.
    At most three attempts are made in total, 10 seconds apart.

    Args:
        url: Address to fetch.
        retries: Number of attempts that have already been used up. Callers
            normally leave this at 0; a value of 3 or more fails immediately.

    Returns:
        The raw response body.

    Raises:
        ClientConnectionError: If no attempt produced a 200 response.
    """
    headers = {'User-Agent': 'Mozilla', 'Referer': 'https://www.mzitu.com/'}
    max_attempts = 3
    retry_delay = 10  # seconds between attempts

    failure = 'no attempts left'
    cause = None
    for attempt in range(retries, max_attempts):
        try:
            async with aiohttp.request('GET', url, headers=headers,
                                       allow_redirects=False,
                                       expect100=True) as resp:
                if resp.status == 200:
                    return await resp.read()
                failure = 'HTTP status {}'.format(resp.status)
                cause = None
        except ClientConnectionError as exc:
            # Transient network failures are retried like bad status codes.
            failure = 'connection error: {!r}'.format(exc)
            cause = exc
        # Only wait when another attempt will actually follow.
        if attempt + 1 < max_attempts:
            await asyncio.sleep(retry_delay)

    raise ClientConnectionError(
        'giving up on {} ({})'.format(url, failure)
    ) from cause
async def save_image(img_url, save_dir=''):
    """Download the image at *img_url* and write it into *save_dir*.

    Characters matched by RE_SUB_DIRNAME are replaced with "_" in *save_dir*
    before it is used. The file is named after the last path component of
    *img_url*. The final path is printed once the file has been written.

    Args:
        img_url: Address of the image to download.
        save_dir: Target directory name; the empty string means the current
            working directory.
    """
    img = await download(img_url)
    save_dir = RE_SUB_DIRNAME.sub('_', save_dir)
    if save_dir:
        # Create the directory up front; exist_ok avoids failing when it is
        # already there (e.g. created while saving an earlier image).
        os.makedirs(save_dir, exist_ok=True)
    save_path = os.path.join(save_dir, os.path.basename(img_url))
    async with aiofiles.open(save_path, mode='wb') as img_fp:
        await img_fp.write(img)
    print(save_path)
async def process_list_page(list_page_url, max_galleries=1):
    """Download a listing page and process the galleries it links to.

    Gallery links are extracted with RE_IMG_PAGES and handed, one after
    another, to process_img_page.

    Args:
        list_page_url: Address of the listing page.
        max_galleries: How many of the linked galleries to process, counted
            from the top of the page. Defaults to 1; pass None to process
            every gallery found.
    """
    list_page = (await download(list_page_url)).decode('utf-8')
    img_page_list = RE_IMG_PAGES.findall(list_page)
    for img_page in img_page_list[:max_galleries]:
        await process_img_page(img_page)
async def process_img_page(img_page_url):
    """Save the images of a gallery, starting at *img_page_url*.

    Each page is downloaded, every image matched by RE_IMG_INFO is saved into
    a directory named after its alt text, and the links matched by
    RE_IMG_NEXT_PAGE are followed in turn. Pages are walked iteratively, so
    the length of a gallery is not limited by the interpreter's recursion
    depth, and each page URL is visited at most once.

    Args:
        img_page_url: Address of the first gallery page to process.
    """
    pending = [img_page_url]
    visited = set()
    while pending:
        page_url = pending.pop()
        if page_url in visited:
            # Guards against repeated or cyclic "next" links.
            continue
        visited.add(page_url)

        page = (await download(page_url)).decode('utf-8')
        for img_url, img_title in RE_IMG_INFO.findall(page):
            await save_image(img_url, img_title)

        # Pushed in reverse so the first link on the page is followed first,
        # keeping the depth-first order of a recursive walk.
        pending.extend(reversed(RE_IMG_NEXT_PAGE.findall(page)))
# Listing page the crawl starts from.
base_url = 'https://www.mzitu.com/'

if __name__ == '__main__':
    # asyncio.run creates a fresh event loop, runs the crawl to completion
    # and closes the loop afterwards. The guard keeps a plain import of this
    # module from starting a crawl.
    asyncio.run(process_list_page(base_url))
# 本作品採用《CC 協議》,轉載必須註明作者和本文連結 (Licensed under a CC license; reposts must credit the author and link to this article.)