from html.parser import HTMLParser
from urllib.request import urlopen
from urllib.parse import urljoin
import time
from bs4 import BeautifulSoup, SoupStrainer
# Target pages to fetch and benchmark the anchor parsers against.
URLS = ('http://python.org',)
def output(x):
    """Print the unique items of iterable *x*, sorted, one per line.

    :param x: iterable of strings (duplicates are removed)
    """
    # sorted() already returns a list, so the intermediate list() call
    # in the original was redundant.
    print('\n'.join(sorted(set(x))))
def simpleBS(url, f):
    """Parse the full document with BeautifulSoup and print all anchor URLs.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data (bytes or str)
    """
    # Name the parser explicitly: bs4 warns when none is given and may pick
    # different parsers on different machines, changing the results.
    # href=True skips <a> tags without an href attribute, which would
    # otherwise raise KeyError below.
    anchors = BeautifulSoup(f, 'html.parser').findAll('a', href=True)
    output(urljoin(url, a['href']) for a in anchors)
def fasterBS(url, f):
    """Parse only the <a> tags via SoupStrainer and print all anchor URLs.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data (bytes or str)
    """
    # The original iterated the soup directly and sliced off the first
    # element ([1:]); with bs4 a SoupStrainer('a') soup contains only anchor
    # tags, so that slice silently dropped a real link. findAll with
    # href=True is correct and also skips anchors lacking an href.
    soup = BeautifulSoup(f, 'html.parser', parse_only=SoupStrainer('a'))
    output(urljoin(url, a['href']) for a in soup.findAll('a', href=True))
def htmlparser(url, f):
    """Extract anchor URLs with a custom HTMLParser subclass and print them.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data as bytes
    """
    class AnchorParser(HTMLParser):
        def __init__(self):
            super().__init__()
            # Collected href values. Created up front so the attribute always
            # exists: the original created it lazily inside handle_starttag,
            # so a page with no anchors made `parser.data` raise
            # AttributeError below.
            self.data = []

        def handle_starttag(self, tag, attrs):
            if tag != 'a':
                return
            for name, value in attrs:
                if name == 'href':
                    self.data.append(value)

    parser = AnchorParser()
    parser.feed(f.decode())
    output(urljoin(url, href) for href in parser.data)
def html5libparse(url, f):
    """Parse with BeautifulSoup using the html5lib tree builder; print anchor URLs.

    :param url: base URL used to resolve relative hrefs
    :param f: raw page data as bytes
    """
    # href=True skips <a> tags without an href attribute, which would
    # otherwise raise KeyError below.
    anchors = BeautifulSoup(f.decode(), 'html5lib').findAll('a', href=True)
    output(urljoin(url, a['href']) for a in anchors)
def process(url, data):
    """Run each parser implementation over *data* and report its elapsed time.

    :param url: base URL passed through to every parser
    :param data: raw page data as bytes
    """
    # (banner printed before the run, parser function, report format string)
    benchmarks = (
        ('\n*** simple BS', simpleBS, 'simple BS:%.3fs used'),
        ('\n*** faster BS', fasterBS, 'faster BS %.3fs used'),
        ('\n*** HTMLParse', htmlparser, 'HTMLParse %.3fs used'),
        ('\n*** HTML5lib', html5libparse, 'HTML5lib %.3fs used'),
    )
    for banner, parse, report in benchmarks:
        print(banner)
        begin = time.time()
        parse(url, data)
        print(report % (time.time() - begin))
def main():
    """Fetch each URL in URLS and benchmark every parser on its contents."""
    for url in URLS:
        # The response is a context manager: this closes the socket even if
        # read() raises (the original leaked the connection on error).
        with urlopen(url) as f:
            data = f.read()
        process(url, data)


# Guard the entry point so importing this module does not trigger network I/O.
if __name__ == '__main__':
    main()