Scraping: Writing Your First Web Crawler (Final Version)
Below is the first web crawler I learned to write, presented as a comparison between my own version and the example version.
1. My own final version, written while learning
import urllib.request
import urllib.error
import re                  # regular expressions
import urllib.parse        # convert relative URLs (which a browser understands but urllib cannot open) into absolute URLs
import urllib.robotparser  # parse the site's robots.txt before crawling, so we avoid anything the site forbids or restricts
import datetime            # needed by the download-throttling feature
import time                # used by Throttle.wait() to sleep between downloads
def download(url, user_agent="brain", proxy=None, num_retries=2):
    """Download the page at url. proxy enables proxy support (default None); pass a value to use one."""
    print("downloading:", url)
    header = {"user-agent": user_agent}  # set our own user agent instead of the default Python-urllib/3.6
    req = urllib.request.Request(url, headers=header)
    opener = urllib.request.build_opener()  # build an opener so a proxy can be plugged in when needed
    if proxy:  # if a proxy was given, register it so the opener routes requests through it
        proxy_params = {urllib.parse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib.request.ProxyHandler(proxy_params))
    try:
        html = opener.open(req).read()
    except urllib.error.URLError as e:  # something went wrong during the download
        print("download error:", e.reason)
        html = None
        # 4XX errors mean the request itself is at fault, while 5XX errors mean the server is,
        # so only retry the download on 5XX errors
        if num_retries > 0:
            if hasattr(e, "code") and 500 <= e.code < 600:
                return download(url, user_agent, proxy, num_retries - 1)  # recursively retry 5XX HTTP errors
    return html

# download("http://example.webscraping.com")  # normal access
# download("http://httpstat.us/500")          # test page that always returns a 5XX error
# A crawler that follows links.
# link_crawler() takes two arguments: the URL of the site to crawl and a regular expression used to follow links.
def link_crawler(seed_url, link_regex, max_depth=2):
    """First download the source of seed_url, then extract every link URL in it and match each one against
    link_regex. Any link URL that matches link_regex is put on the queue, and the next pass of
    `while crawl_queue:` performs the same steps on that URL. This repeats until crawl_queue is empty,
    at which point the function returns."""
    crawl_queue = [seed_url]
    # To avoid crawler traps, the `seen` record used to skip duplicate links is a dict that also stores the
    # depth at which each URL was reached; to disable this feature, pass a negative max_depth so the current
    # depth can never equal it.
    seen = {seed_url: 0}  # the seed_url starts at depth 0
    # seen = set(crawl_queue)  # links may point at each other, so record which links have already been crawled and skip them
    while crawl_queue:
        url = crawl_queue.pop()
        # Parse the site's robots.txt before crawling to check whether fetching is allowed, avoiding anything the site forbids or restricts.
        rp = urllib.robotparser.RobotFileParser()
        rp.set_url("http://example.webscraping.com/robots.txt")
        rp.read()
        user_agent = "brain"
        if rp.can_fetch(user_agent, url):  # robots.txt allows this user agent to fetch the URL, so continue
            # Throttle the crawl; call wait() before every download.
            throttle = Throttle(delay=5)  # the example site's robots.txt specifies a crawl delay of 5
            throttle.wait(url)
            html = download(url)  # html = download(url, headers, proxy=proxy, num_retries=num_retries) to pass extra arguments
            if html is None:  # the download failed, so skip this URL
                continue
            html = str(html)
            # filter for links matching our regular expression
            depth = seen[url]  # the crawl depth recorded for this URL, used to avoid crawler traps
            if depth != max_depth:
                for link in get_links(html):
                    if re.match(link_regex, link):
                        link = urllib.parse.urljoin(seed_url, link)  # turn the extracted relative URL path (e.g. /view/...) into an absolute URL
                        if link not in seen:  # has this link been crawled before?
                            seen[link] = depth + 1  # one level deeper than the page it was found on
                            crawl_queue.append(link)  # a new link, so queue it for crawling
        else:
            print("Blocked by robots.txt: %s" % url)
            continue
def get_links(html):
    """Return all link URLs found in an html page."""
    # Compile a pattern that matches <a href="xxx"> or <a href='xxx'> and captures the xxx URL.
    # Note that xxx is usually a relative path in the page source (e.g. /view/1), which cannot be opened on its own.
    webpage_regex = re.compile('<a href=["\'](.*?)["\']', re.IGNORECASE)
    return re.findall(webpage_regex, html)
    # return re.findall('<a[^>]+href=["\'](.*?)["\']', html)  # also works, but compiling the pattern first is preferable
class Throttle:
    """Add a delay between downloads to the same domain; call wait() before every download."""
    def __init__(self, delay):
        self.delay = delay  # value of delay between downloads for each domain
        self.domains = {}   # records when each domain was last accessed
    def wait(self, url):
        domain = urllib.parse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)
        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)  # this domain was accessed recently, so sleep
        self.domains[domain] = datetime.datetime.now()

# We only want http://example.webscraping.com/index... or http://example.webscraping.com/view...
link_crawler("http://example.webscraping.com", "/(index|view)")
2. The final version provided by the example site (just read through it; the example code is written in Python 2)
import re
import urlparse
import urllib2
import time
from datetime import datetime
import robotparser
import Queue
def link_crawler(seed_url, link_regex=None, delay=5, max_depth=-1, max_urls=-1, headers=None, user_agent='wswp', proxy=None, num_retries=1):
    """Crawl from the given seed URL following links matched by link_regex
    """
    # the queue of URL's that still need to be crawled
    crawl_queue = Queue.deque([seed_url])
    # the URL's that have been seen and at what depth
    seen = {seed_url: 0}
    # track how many URL's have been downloaded
    num_urls = 0
    rp = get_robots(seed_url)
    throttle = Throttle(delay)
    headers = headers or {}
    if user_agent:
        headers['User-agent'] = user_agent

    while crawl_queue:
        url = crawl_queue.pop()
        # check url passes robots.txt restrictions
        if rp.can_fetch(user_agent, url):
            throttle.wait(url)
            html = download(url, headers, proxy=proxy, num_retries=num_retries)
            links = []

            depth = seen[url]
            if depth != max_depth:
                # can still crawl further
                if link_regex:
                    # filter for links matching our regular expression
                    links.extend(link for link in get_links(html) if re.match(link_regex, link))

                for link in links:
                    link = normalize(seed_url, link)
                    # check whether already crawled this link
                    if link not in seen:
                        seen[link] = depth + 1
                        # check link is within same domain
                        if same_domain(seed_url, link):
                            # success! add this new link to queue
                            crawl_queue.append(link)

            # check whether have reached downloaded maximum
            num_urls += 1
            if num_urls == max_urls:
                break
        else:
            print 'Blocked by robots.txt:', url
class Throttle:
    """Throttle downloading by sleeping between requests to same domain
    """
    def __init__(self, delay):
        # amount of delay between downloads for each domain
        self.delay = delay
        # timestamp of when a domain was last accessed
        self.domains = {}

    def wait(self, url):
        domain = urlparse.urlparse(url).netloc
        last_accessed = self.domains.get(domain)

        if self.delay > 0 and last_accessed is not None:
            sleep_secs = self.delay - (datetime.now() - last_accessed).seconds
            if sleep_secs > 0:
                time.sleep(sleep_secs)
        self.domains[domain] = datetime.now()
def download(url, headers, proxy, num_retries, data=None):
    print 'Downloading:', url
    request = urllib2.Request(url, data, headers)
    opener = urllib2.build_opener()
    if proxy:
        proxy_params = {urlparse.urlparse(url).scheme: proxy}
        opener.add_handler(urllib2.ProxyHandler(proxy_params))
    try:
        response = opener.open(request)
        html = response.read()
        code = response.code
    except urllib2.URLError as e:
        print 'Download error:', e.reason
        html = ''
        if hasattr(e, 'code'):
            code = e.code
            if num_retries > 0 and 500 <= code < 600:
                # retry 5XX HTTP errors
                return download(url, headers, proxy, num_retries-1, data)
        else:
            code = None
    return html
def normalize(seed_url, link):
    """Normalize this URL by removing hash and adding domain
    """
    link, _ = urlparse.urldefrag(link)  # remove hash to avoid duplicates
    return urlparse.urljoin(seed_url, link)


def same_domain(url1, url2):
    """Return True if both URL's belong to same domain
    """
    return urlparse.urlparse(url1).netloc == urlparse.urlparse(url2).netloc


def get_robots(url):
    """Initialize robots parser for this domain
    """
    rp = robotparser.RobotFileParser()
    rp.set_url(urlparse.urljoin(url, '/robots.txt'))
    rp.read()
    return rp
def get_links(html):
    """Return a list of links from html
    """
    # a regular expression to extract all links from the webpage
    webpage_regex = re.compile('<a[^>]+href=["\'](.*?)["\']', re.IGNORECASE)
    # list of all links from the webpage
    return webpage_regex.findall(html)


if __name__ == '__main__':
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, user_agent='BadCrawler')
    link_crawler('http://example.webscraping.com', '/(index|view)', delay=0, num_retries=1, max_depth=1, user_agent='GoodCrawler')
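If you want to run this example code under Python 3, the Python 2 modules it imports map onto the standard library roughly as follows (a sketch of the import changes only; the print statements and a few other details also need updating):

import re
import time
from datetime import datetime
from collections import deque             # replaces Queue.deque
import urllib.parse as urlparse           # replaces the urlparse module
import urllib.request, urllib.error       # replace urllib2 (Request, build_opener, ProxyHandler, URLError)
import urllib.robotparser as robotparser  # replaces the robotparser module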
3. Testing the crawler
We can set the user agent to BadCrawler, the user agent that, as mentioned earlier in the chapter, is blocked by the site's robots.txt. As the run below shows, the crawler is indeed blocked, and the program exits almost as soon as it starts:
>>>seed_url = "http://example.webscraping.com/index"
>>>link_regex = "/(index|view)"
>>>link_crawler(seed_url, link_regex, user_agent="BadCrawler")
Blocked by robots.txt: http://example.webscraping.com/
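The block can also be reproduced directly with urllib.robotparser; here is a minimal sketch, assuming the example site's robots.txt still disallows the BadCrawler user agent as described in the book:

import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
rp.set_url("http://example.webscraping.com/robots.txt")
rp.read()
print(rp.can_fetch("BadCrawler", "http://example.webscraping.com/"))   # expected: False
print(rp.can_fetch("GoodCrawler", "http://example.webscraping.com/"))  # expected: True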
Now let's use the default user agent and set the maximum depth to 1, so that only the links on the home page get downloaded:
downloading: http://example.webscraping.com
downloading: http://example.webscraping.com/index/1
downloading: http://example.webscraping.com/index/2
downloading: http://example.webscraping.com/index/0
downloading: http://example.webscraping.com/view/Barbados-20
downloading: http://example.webscraping.com/view/Bangladesh-19
downloading: http://example.webscraping.com/view/Bahrain-18
downloading: http://example.webscraping.com/view/Bahamas-17
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Azerbaijan-16
downloading: http://example.webscraping.com/view/Austria-15
downloading: http://example.webscraping.com/view/Australia-14
downloading: http://example.webscraping.com/view/Aruba-13
downloading: http://example.webscraping.com/view/Armenia-12
downloading: http://example.webscraping.com/view/Argentina-11
downloading: http://example.webscraping.com/view/Antigua-and-Barbuda-10
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Antarctica-9
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Anguilla-8
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Angola-7
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Andorra-6
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/American-Samoa-5
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Algeria-4
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Albania-3
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Aland-Islands-2
download error: TOO MANY REQUESTS
downloading: http://example.webscraping.com/view/Afghanistan-1
download error: TOO MANY REQUESTS
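The repeated "download error: TOO MANY REQUESTS" lines are the server rate-limiting the crawler, which is to be expected here because the test passed delay=0 and the Throttle class only sleeps when its delay is greater than zero. Below is a minimal sketch of using the Throttle and download functions from section 1 on their own, with the 5-second delay the example site's robots.txt requests (the URLs are taken from the output above):

throttle = Throttle(delay=5)
throttle.wait("http://example.webscraping.com/view/Albania-3")        # first request to this domain: no sleep
download("http://example.webscraping.com/view/Albania-3")
throttle.wait("http://example.webscraping.com/view/Aland-Islands-2")  # same domain again: sleeps for whatever remains of the 5 seconds
download("http://example.webscraping.com/view/Aland-Islands-2")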