使用redis和mongodb下載小說,並用pytest做測試

weixin_34249678發表於2019-01-06

週末為了熟悉mongodb和redis,寫了一個抓取《白夜行》小說的程式,並且用pytest測試框架做單元測試, 使用了執行緒池加快下載速度:

# white_novel.py
""" 使用redis儲存網址,使用mongodb儲存內容"""

import lxml.html  # type: ignore
import requests  # type: ignore
import redis  # type: ignore
from pymongo import MongoClient, database
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
from multiprocessing.dummy import Pool
from functools import partial

class DownloadWhite:
    """Download a novel chapter by chapter.

    Chapter URLs are scraped from the book's index page and stored in a
    Redis list; downloaded chapter text is bulk-inserted into a MongoDB
    collection. A thread pool overlaps the I/O-bound chapter downloads.
    """

    # Redis list key that holds the scraped chapter URLs.
    KEY = 'urls'

    def __init__(self, workers=15, home_url='http://dongyeguiwu.zuopinj.com/5525'):
        """
        :param workers: number of threads used for concurrent downloads
        :param home_url: index page listing every chapter of the book
        """
        self.workers = workers
        self.home_url = home_url
        # decode_responses=True makes lrange() return str instead of bytes.
        self.redis_client = redis.StrictRedis(decode_responses=True)
        mongo_client = MongoClient()
        db: database.Database = mongo_client['Chapter6']
        self.collection = db['white_novel']

    def _clear(self):
        """Drop previously stored URLs and chapters so a run starts clean."""
        self.redis_client.delete(self.KEY)
        self.collection.delete_many({})

    def save_urls(self):
        """Scrape every chapter link from the index page into Redis."""
        home_page = requests.get(self.home_url).content.decode()
        selector = lxml.html.fromstring(home_page)
        useful = selector.xpath('//div[@class="book_list"]/ul/li')
        # Keep at most one href per <li> and skip link-less items: the old
        # code appended None for those, which redis-py rejects (DataError).
        urls = [href for li in useful for href in li.xpath('a/@href')[:1]]
        if urls:  # RPUSH with zero values is a Redis protocol error
            self.redis_client.rpush(self.KEY, *urls)

    def download_novel(self):
        """Download every chapter listed in Redis and insert them into Mongo."""
        client = self.redis_client
        contents = []
        urls = client.lrange(self.KEY, 0, -1)
        if not urls:
            return
        # ThreadPoolExecutor + as_completed works equally well here;
        # multiprocessing.dummy.Pool (a thread pool despite the module name)
        # is used for its simpler blocking map() API.
        pool = Pool(self.workers)
        try:
            pool.map(partial(self._download_chapter, contents=contents), urls)
        finally:
            pool.close()
            pool.join()
        print(f'at last insert {len(contents)} chapters')
        if contents:  # insert_many([]) raises pymongo InvalidOperation
            self.collection.insert_many(contents)

    @staticmethod
    def _download_chapter(url, contents: list) -> None:
        """Fetch one chapter page and append {'title', 'content'} to contents.

        list.append is atomic under the GIL, so worker threads may safely
        share the same list.
        """
        page = requests.get(url).content.decode()
        selector = lxml.html.fromstring(page)
        title = selector.xpath('//div[@class="h1title"]/h1/text()')[0]
        content = '\n'.join(selector.xpath('//div[@id="htmlContent"]/p/text()'))
        # Fixed typo: documents now store 'content' (was misspelled 'contnet').
        contents.append({'title': title, 'content': content})


if __name__ == '__main__':
    # Rebuild the URL list from scratch, then time only the download phase.
    downloader = DownloadWhite()
    downloader._clear()
    downloader.save_urls()
    t0 = time.perf_counter()
    downloader.download_novel()
    print(f'time elapse {time.perf_counter() - t0} seconds')

執行緒池的實現我試了2個方案,一種方案是ThreadPoolExecutor, 另一種方案是multiprocessing.dummy.Pool, 還用了partial這種小技巧.

不過我有個疑惑:多個執行緒往同一個列表contents裡append,這個contents是執行緒安全的嗎?
What kinds of global value mutation are thread-safe?解答了我的疑問:由於GIL的存在,許多Java中的非執行緒安全問題在Python中不存在了;只有少數類似`L[i] += 4`這樣「先讀取、再賦值」的複合語句,由於不是原子操作,才可能執行緒不安全。

由於使用了執行緒池(15個執行緒)併發下載章節,因此13章的耗時基本等於1章的耗時

at last insert 13 chapters
time elapse 0.9961462760111317 seconds

單元測試:

# test_white_novel.py
import pytest  # type: ignore
import redis  # type: ignore
from pymongo import MongoClient, collection  # type: ignore

from white_novel import DownloadWhite


@pytest.fixture(scope='function')
def wld_instance():
    """Yield a DownloadWhite whose Redis/Mongo state is wiped before and after."""
    print('start')
    downloader = DownloadWhite()
    downloader._clear()
    yield downloader
    # Teardown: leave both stores empty for the next test.
    downloader._clear()
    print('end')


@pytest.fixture(scope='module')
def redis_client():
    """Module-scoped Redis client returning decoded str responses."""
    print('init redis')
    client = redis.StrictRedis(decode_responses=True)
    return client


@pytest.fixture(scope='module')
def white_novel_collection() -> collection.Collection:
    """Module-scoped handle to the Chapter6.white_novel Mongo collection.

    Local names no longer shadow the imported ``pymongo.collection`` module
    (the old body rebound ``collection`` and ``database`` as locals).
    """
    print('init mongo')
    mongo_client = MongoClient()
    db = mongo_client['Chapter6']
    return db['white_novel']


def test_download(wld_instance, redis_client, white_novel_collection):
    """Full pipeline: 13 URLs saved and 13 chapter documents inserted."""
    wld_instance.save_urls()
    wld_instance.download_novel()
    stored_urls = redis_client.llen(wld_instance.KEY)
    stored_docs = white_novel_collection.count_documents(filter={})
    assert stored_urls == 13
    assert stored_docs == 13


def test_not_save_url_download(wld_instance, redis_client, white_novel_collection):
    """download_novel without saved URLs is a no-op: nothing stored anywhere."""
    wld_instance.download_novel()
    assert white_novel_collection.count_documents(filter={}) == 0
    assert redis_client.llen(wld_instance.KEY) == 0

def test_only_save_url(wld_instance, redis_client, white_novel_collection):
    """save_urls alone fills Redis with 13 links but writes no documents."""
    wld_instance.save_urls()
    url_count = redis_client.llen(wld_instance.KEY)
    doc_count = white_novel_collection.count_documents(filter={})
    assert url_count == 13
    assert doc_count == 0

最終抓取的結果如下:


258473-0ded84fa774ab0c9.png
redis 儲存每一章的連結列表
258473-56a485ec97c96855.png
mongodb儲存小說內容

相關文章