貼吧爬取
寫程式碼前,構思需要的功能塊;寫程式碼時,把各個功能模組名提前寫好
初始化
初始化必要引數,完成基礎設定 爬取百度貼吧lol吧:爬取地址中的get引數須傳遞(可以指定不同主題的貼吧和頁碼)
- 主題名
- 初始網址
- 請求頭
生成網址
生成每一頁的路由地址
- 根據列表生成式生成多個頁面的地址
下載
get請求給每一頁的地址,爬取頁面
儲存
儲存爬取結果到檔案中,把每一頁爬取結果寫入到對應名字的檔案中
控制流程
將以上爬取操作封裝到run函式中,方便外部物件呼叫,以後會在此新增多執行緒
- 生成要爬取的每一頁的路由地址
- 通過for迴圈遍歷每一個路由地址
- 對每個路由地址進行爬取和獲取頁碼操作,並進行儲存
原始碼
import os

import requests


class TiebaSpider:
    """Crawl pages of a Baidu Tieba forum and save each page as an HTML file."""

    def __init__(self, tieba_name_crawl):
        """
        Initialize the required parameters and basic settings.

        :param tieba_name_crawl: forum (tieba) name, passed as the ``kw``
            GET parameter of the crawl URL (e.g. ``'lol'``).
        """
        self.tieba_name = tieba_name_crawl
        # The {} placeholder receives the page offset (the pn GET parameter).
        self.url_base = 'https://tieba.baidu.com/f?kw=' + tieba_name_crawl + '&ie=utf-8&pn={}'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def make_url(self, page_count=4):
        """
        Build the URL of every page to crawl.

        :param page_count: number of page offsets to generate
            (default 4, matching the original hard-coded behaviour).
        :return: list of page URLs.
        """
        return [self.url_base.format(i) for i in range(page_count)]

    def download_url(self, url_str):
        """
        GET one page.

        :param url_str: URL of the page to fetch.
        :return: response body as text.
        """
        result = requests.get(url_str, headers=self.headers)
        return result.text

    def save_result(self, result, page_num):
        """
        Write one crawled page to ``./download/<name>~第<page>頁.html``.

        :param result: page HTML text.
        :param page_num: 1-based page number used in the file name.
        """
        # Create the target directory on first use instead of failing with
        # FileNotFoundError on a fresh checkout.
        os.makedirs('./download', exist_ok=True)
        file_path = './download/{}~第{}頁.html'.format(self.tieba_name, page_num)
        with open(file_path, 'wb') as f:
            f.write(result.encode('utf-8'))

    def run(self):
        """
        Crawl every page and save it; external entry point
        (multithreading may be added here later).
        """
        # enumerate() yields the page number directly; the original used
        # list.index(), which is O(n) per lookup and wrong if two pages
        # ever shared the same URL.
        for page_num, url_str in enumerate(self.make_url(), start=1):
            result_str = self.download_url(url_str)
            self.save_result(result=result_str, page_num=page_num)


if __name__ == '__main__':
    tieba_spider = TiebaSpider('lol')
    tieba_spider.run()
爬取糗事百科
import os

import requests
# bs4/lxml were used by earlier parsing experiments in this file; kept so
# the rest of the module keeps importing cleanly.
from bs4 import BeautifulSoup
import lxml.html


class QiushiSpider:
    """Crawl the hot-list pages of qiushibaike.com and save them as HTML files."""

    def __init__(self):
        """Set up the page-URL template and the request headers."""
        # The {} placeholder receives the page offset.
        self.url_base = 'https://www.qiushibaike.com/8hr/page/{}/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def make_url(self, page_count=4):
        """
        Build the URL of every page to crawl.

        :param page_count: number of page offsets to generate
            (default 4, matching the original hard-coded behaviour).
        :return: list of page URLs.
        """
        return [self.url_base.format(i) for i in range(page_count)]

    def download_url(self, url_str):
        """
        GET one page.

        :param url_str: URL of the page to fetch.
        :return: response body as text.
        """
        result = requests.get(url_str, headers=self.headers)
        return result.text

    def save_result(self, result, page_num):
        """
        Write one crawled page to ``./download/qiushi<page>.html``.

        :param result: page HTML text.
        :param page_num: page number used in the file name.
        """
        os.makedirs('./download', exist_ok=True)
        # 'wb' overwrites: re-running the spider must not append duplicate
        # copies of the page (the original opened the file in append mode).
        with open('./download/qiushi' + str(page_num) + '.html', 'wb') as f:
            f.write(result.encode('utf-8'))


if __name__ == '__main__':
    # Promoted from the commented-out driver code to a proper entry-point
    # guard, consistent with the other spiders in this file.
    qiushi = QiushiSpider()
    for page_num, url in enumerate(qiushi.make_url(), start=1):
        qiushi.save_result(result=qiushi.download_url(url), page_num=page_num)
爬取國家資訊
BeautifulSoup方式
import requests
from bs4 import BeautifulSoup


class CountrySoup:
    """Scrape one country page of example.webscraping.com with BeautifulSoup."""

    def __init__(self, country_name):
        """
        :param country_name: path segment identifying the country page,
            e.g. ``'Bolivia-27'``.
        """
        self.country_name = country_name
        self.url_base = 'http://example.webscraping.com/places/default/view/{}'.format(self.country_name)
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def download_url(self):
        """
        Fetch the country page, extract the country-name cell, print it.

        :return: text of the ``places_country__row`` value cell (also
            printed); the original returned nothing and printed debug
            dumps of the intermediate tags as well.
        """
        result = requests.get(self.url_base, headers=self.headers)
        soup = BeautifulSoup(result.text, 'lxml')
        # The country name sits in the <td class="w2p_fw"> cell of the
        # <tr id="places_country__row"> row.
        tr = soup.find(attrs={'id': 'places_country__row'})
        td = tr.find(attrs={'class': 'w2p_fw'})
        print(td.text)
        return td.text
lxml方式
class CountrySpider:
    """Scrape country facts from example.webscraping.com using lxml XPath."""

    def __init__(self, country_name):
        """
        :param country_name: path segment identifying the country page,
            e.g. ``'Bolivia-27'``.
        """
        self.country_name = country_name
        self.url_base = 'http://example.webscraping.com/places/default/view/{}'.format(self.country_name)
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:61.0) Gecko/20100101 Firefox/61.0'}

    def download_url(self, url_str):
        """
        Fetch one country page and pull out country, capital and area.

        :param url_str: URL of the page to scrape.
        :return: list of three labelled strings (country, capital, area).
        """
        result = requests.get(url_str, headers=self.headers)
        html = lxml.html.fromstring(result.text)
        # Each fact lives in a <td class="w2p_fw"> cell of a row with a
        # stable id attribute.
        data_country = html.xpath('//tr[@id="places_country__row"]/td[@class="w2p_fw"]/text()')
        data_capital = html.xpath('//tr[@id="places_capital__row"]/td[@class="w2p_fw"]/text()')
        data_area = html.xpath('//tr[@id="places_area__row"]/td[@class="w2p_fw"]/text()')
        return ['國家:' + data_country[0], '首都:' + data_capital[0], '國土面積:' + data_area[0]]

    def save_result(self, result):
        """
        Persist the scraped items to ``./country.txt``, space-separated.

        :param result: iterable of strings to write.
        """
        # Open the file once instead of re-opening it for every item, and
        # overwrite ('wb') so repeated runs do not pile up duplicate
        # records; the debug print of the raw result is dropped.
        with open('./country.txt', 'wb') as f:
            for item in result:
                f.write((item + ' ').encode('utf-8'))

    def run(self):
        """Scrape the configured country page and save the extracted facts."""
        self.save_result(self.download_url(self.url_base))


if __name__ == '__main__':
    s = CountrySoup('Bolivia-27')
    s.download_url()