# Four ways in Python to parse a web page and extract the links it contains.
# (Adapted from a blog post published 2020-12-31.)
# Link extractors
# Uses four different parsing approaches; environment: Python 3.8
#
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin
import time
# Third-party library
from bs4 import BeautifulSoup, SoupStrainer

# Third-party library (optional html5lib backend, used indirectly via bs4)
# from html5lib import html5parser, treebuilders, treewalkers

# URLs to fetch and parse (tuple, so the list of targets is immutable).
URLS = ('http://python.org',)


# Output helper shared by all four parser functions.
def output(x):
    """Print every distinct item of *x* on its own line, sorted ascending."""
    unique_items = sorted(set(x))
    print('\n'.join(unique_items))


def simpleBS(url, f):
    """Parse the whole document with BeautifulSoup and print every anchor href.

    :param url: base URL used to resolve relative links
    :param f: raw page data (bytes or str) as returned by urlopen().read()
    :return: None (results are printed via output())
    """
    # Name the parser explicitly: omitting it emits a GuessedAtParserWarning
    # and may pick different parsers (and parse trees) on different machines.
    anchors = BeautifulSoup(f, 'html.parser').find_all('a')
    # Skip <a> tags without an href attribute instead of raising KeyError.
    output(urljoin(url, a['href']) for a in anchors if a.get('href') is not None)


def fasterBS(url, f):
    """Parse only <a> tags (via SoupStrainer) and print every anchor href.

    :param url: base URL used to resolve relative links
    :param f: raw page data (bytes or str) as returned by urlopen().read()
    :return: None (results are printed via output())
    """
    # SoupStrainer restricts parsing to <a> tags, which is faster on big pages.
    # Use find_all('a') to collect the tags: the original iterated the soup
    # object and sliced off the first element with [1:], silently dropping
    # the first link on the page.
    strainer = SoupStrainer('a')
    anchors = BeautifulSoup(f, 'html.parser', parse_only=strainer).find_all('a')
    # Skip <a> tags without an href attribute instead of raising KeyError.
    output(urljoin(url, a['href']) for a in anchors if a.get('href') is not None)


def htmlparser(url, f):
    """Collect anchor hrefs with a custom HTMLParser subclass and print them.

    :param url: base URL used to resolve relative links
    :param f: raw page data as bytes; decoded to str before feeding the parser
    :return: None (results are printed via output())
    """

    class AnchorParser(HTMLParser):
        def __init__(self):
            super().__init__()
            # Initialize here so the attribute always exists: the original
            # created it lazily inside handle_starttag, which crashed with
            # AttributeError on pages containing no anchors at all.
            self.data = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            for name, value in attrs:
                # Guard against bare `<a href>` attributes (value is None).
                if name == 'href' and value is not None:
                    self.data.append(value)

    parser = AnchorParser()
    # HTMLParser.feed() accepts str only, so decode the downloaded bytes.
    parser.feed(f.decode())
    output(urljoin(url, href) for href in parser.data)


def html5libparse(url, f):
    """Parse anchors using html5lib as BeautifulSoup's backend and print them.

    :param url: base URL used to resolve relative links
    :param f: raw page data as bytes; decoded to str before parsing
    :return: None (results are printed via output())
    """
    anchors = BeautifulSoup(f.decode(), 'html5lib').find_all('a')
    # Skip <a> tags without an href attribute instead of raising KeyError.
    output(urljoin(url, a['href']) for a in anchors if a.get('href') is not None)


def process(url, data):
    """Run each of the four link extractors on *data*, timing every one.

    :param url: base URL passed through to every parser function
    :param data: raw page bytes downloaded by main()
    """
    benchmarks = (
        ('\n*** simple BS', simpleBS, 'simple BS:%.3fs used'),
        ('\n*** faster BS', fasterBS, 'faster BS %.3fs used'),
        ('\n*** HTMLParse', htmlparser, 'HTMLParse %.3fs used'),
        ('\n*** HTML5lib', html5libparse, 'HTML5lib %.3fs used'),
    )
    for banner, parse_func, report_fmt in benchmarks:
        print(banner)
        started = time.time()
        parse_func(url, data)
        print(report_fmt % (time.time() - started))

# Main entry point

def main():
    """Download each URL in URLS and run all four parsers on its content."""
    for url in URLS:
        # The context manager guarantees the connection is closed even if
        # read() raises; the original called close() manually and would
        # leak the socket on error.
        with urlopen(url) as page:
            data = page.read()
        # Process the downloaded data.
        process(url, data)


main()

# (scrape artifact from the original blog page: "Related articles")