Python requests爬蟲例項

HuangZhang_123發表於2017-06-21

作業系統：Windows
Python：3.5
歡迎加入學習交流QQ群：657341423

需要用到的庫：
requests
bs4（BeautifulSoup，需要安裝beautifulsoup4）
wxPython
docx（需要安裝python-docx）
win32api需要安裝pywin32

解釋：
requests這個用來做爬蟲，基本上不用多作解釋
wxPython和win32api主要生成軟體的操作介面，給使用者使用。
docx主要將爬取的結果，用word顯示出來。

網頁分析：
這裡以南方日報每天的報紙為例：請點選
這裡寫圖片描述
可以看到，url後面是一個日期和node為尾的，日期就代表當天的報紙，node是報紙的首頁，看到下面紅色標記的是每篇新聞的標題，點開標題看到

點開標題後，看到新的url，日期和一個content為尾的。content是代表每篇新聞的詳細報導。下面紅色標記的是我們需要的正文。
當然，有時候會附帶一些圖片的，而且是會有多張圖片。如：
這裡寫圖片描述

設計思路：
以http://epaper.southcn.com/nfdaily/html/2017-06/21/node_581.htm 開始。
然後獲取這個html裡面的帶有content的url連結。
再通過獲取的url組合成每篇新聞報導的url，然後獲取這些url的正文內容，標題和圖片。

分析node網頁的標題url
這裡寫圖片描述
可以發現，每個標題都是這樣的格式。

再分析content網頁的正文和標題，圖片。
這裡寫圖片描述

總的來說，就是在node這個網頁，獲取每篇新聞詳細資訊的url，而且這些url帶有content的，然後迴圈訪問這些url，獲取每篇新聞的標題，正文和圖片即可實現。

實現程式碼：

#爬蟲庫
from bs4 import BeautifulSoup
import requests
import queue
import re
#讀寫word
from docx import Document
from docx.shared import Inches
from io import StringIO
#介面設計
import os
import wx
import win32api
import win32con
import win32gui

		
def mkdir(path):
    """Create the directory *path* (including parents) when it is missing.

    Leading/trailing whitespace and any trailing backslashes are removed
    from *path* before it is checked.

    Returns True when the directory was created, and False when something
    already exists at that location (nothing is created in that case).
    """
    target = path.strip().rstrip("\\")
    if os.path.exists(target):
        return False
    os.makedirs(target)
    return True
# Output folder "news" under the current working directory; it is created
# at import time so that later saves into it do not fail.
mkpath = PicSavePath = os.getcwd() + "\\news"
mkdir(mkpath)
	
class MyFrame(wx.Frame):
    """Main window of the newspaper keyword crawler.

    The window offers a year selector, a month selector and a multi-line
    keyword box. Pressing the query button walks every day of the chosen
    year, downloads that day's article pages and stores each article whose
    title or body contains one of the keywords as a Word document in the
    ``news`` folder. The month selector is displayed but is not used to
    restrict the crawl.
    """

    # Root of the e-paper site; page and image URLs are built from it.
    _BASE_URL = 'http://epaper.southcn.com/nfdaily'
    # Seconds to wait for the server, so a stalled request cannot block
    # the window forever.
    _REQUEST_TIMEOUT = 15
    # Characters that are not allowed in Windows file names.
    _INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\x00-\x1f]')
    # Keywords used when the keyword box is left empty.
    _DEFAULT_KEYWORDS = (
        '南海區專利代理人培訓', '專利管理師', '南海區專利代理行業自律公約',
        '南海區專利特派員', '智慧財產權投融資對接會', '科技服務業集聚區',
        '專利質押融資', 'rttp', '南海區榮獲中國專利獎', '智慧財產權優勢示範企業',
        '智慧財產權日系列活動', '南海基地',
    )

    def __init__(self, parent, title):
        """Build the frame, lay out its widgets, centre it and show it."""
        super(MyFrame, self).__init__(parent, title=title, size=(500, 350))

        self.InitUI()
        self.Centre()
        self.Show()

    def InitUI(self):
        """Create the widgets, arrange them in a grid and bind the buttons."""
        panel = wx.Panel(self)

        hbox = wx.BoxSizer(wx.HORIZONTAL)

        # 4 rows x 2 columns with 10px gaps: label/input pairs, then buttons.
        fgs = wx.FlexGridSizer(4, 2, 10, 10)

        Sort = wx.StaticText(panel, label="年   份")
        sale = wx.StaticText(panel, label="月   份")
        keys = wx.StaticText(panel, label="關鍵字")

        self.tc1 = wx.ComboBox(panel, value="年份", choices=["2016", "2017"])
        self.tc2 = wx.ComboBox(
            panel, value="月份",
            choices=["1", "2", "3", "4", "5", "6", "7", "8", "9", "10", "11", "12"])
        self.tc3 = wx.TextCtrl(panel, style=wx.TE_MULTILINE)

        bt1 = wx.Button(panel, label="查詢")
        bt2 = wx.Button(panel, label="清除")

        fgs.AddMany([(Sort), (self.tc1, 1, wx.EXPAND), (sale),
                     (self.tc2, 1, wx.EXPAND), (keys, 1, wx.EXPAND),
                     (self.tc3, 1, wx.EXPAND), (bt1), (bt2)])
        # Let the keyword row and the input column absorb extra space.
        fgs.AddGrowableRow(2, 1)
        fgs.AddGrowableCol(1, 1)
        hbox.Add(fgs, proportion=2, flag=wx.ALL | wx.EXPAND, border=15)
        panel.SetSizer(hbox)

        bt1.Bind(wx.EVT_BUTTON, self.query)
        bt2.Bind(wx.EVT_BUTTON, self.cleanValue)

    def cleanValue(self, event):
        """Reset both selectors to their placeholder text and empty the keyword box."""
        self.tc1.ChangeValue('年份')
        self.tc2.ChangeValue('月份')
        self.tc3.Clear()

    def dowloadPic(self, imageUrl, filePath):
        """Download *imageUrl* and write the response body to *filePath*.

        Raises a ``requests`` exception on timeout, connection failure or an
        HTTP error status, so that an error page is never saved as an image.
        """
        r = requests.get(imageUrl, timeout=self._REQUEST_TIMEOUT)
        r.raise_for_status()
        with open(filePath, "wb") as code:
            code.write(r.content)

    def query(self, event):
        """Handle the query button: crawl every day of the selected year.

        Keywords are read from the keyword box, one per line; when none are
        given the built-in default list is used. A message box is shown when
        no year is selected, and another one when the crawl has finished.
        """
        queryyear = self.tc1.GetValue().strip()

        # Blank lines must be dropped: an empty string is contained in every
        # text and would make every article match.
        Keylist = [kw.strip() for kw in self.tc3.GetValue().splitlines() if kw.strip()]
        if not Keylist:
            Keylist = list(self._DEFAULT_KEYWORDS)

        if not queryyear.isdigit():
            win32api.MessageBox(0, '請選擇年份', '提示', win32con.MB_ICONASTERISK | win32con.MB_OKCANCEL)
            return

        # Every month is swept as 31 days; days that do not exist simply
        # fail inside queryDay and are skipped there.
        for month in range(1, 13):
            for day in range(1, 32):
                dateDay = '%s-%02d/%02d' % (queryyear, month, day)
                print(dateDay)
                self.queryDay(dateDay, Keylist)
        win32api.MessageBox(0, '資料採集完成', '提示', win32con.MB_ICONASTERISK | win32con.MB_OKCANCEL)

    def queryDay(self, dateDay, Keylist):
        """Crawl one day's paper and save the articles matching *Keylist*.

        *dateDay* has the form ``YYYY-MM/DD`` and is inserted into the page
        URLs. Failures are reported on stdout and never propagate: a day
        whose index page cannot be read is skipped, and a failing article
        does not stop the remaining articles of that day.
        """
        index_url = '%s/html/%s/node_2.htm' % (self._BASE_URL, dateDay)
        try:
            soup = self._fetch_soup(index_url)
            toc = soup.find_all('div', id='btdh')[0]
            links = toc.find_all('a', href=re.compile("content"))
        except Exception as e:
            # Best effort: missing days (e.g. 02/30) end up here.
            print('skip %s: %r' % (dateDay, e))
            return

        # De-duplicate and order the article page names.
        hrefs = sorted({(a.get('href') or '').strip() for a in links} - {''})

        for href in hrefs:
            try:
                self._save_article(dateDay, href, Keylist)
            except Exception as e:
                print('skip %s/%s: %r' % (dateDay, href, e))

    def _fetch_soup(self, url):
        """Fetch *url* and return its UTF-8 decoded body parsed by BeautifulSoup."""
        r = requests.get(url, timeout=self._REQUEST_TIMEOUT)
        r.raise_for_status()
        return BeautifulSoup(r.content.decode('utf-8'), "html.parser")

    def _save_article(self, dateDay, href, Keylist):
        """Save the article page *href* of *dateDay* as .docx if it matches a keyword."""
        soup = self._fetch_soup('%s/html/%s/%s' % (self._BASE_URL, dateDay, href))
        print_area = soup.find_all('div', id='print_area')[0]
        title = print_area.find_all('h1')[0].getText()
        ContentText = soup.find_all('founder-content')[0].getText()

        if not any(kw in title or kw in ContentText for kw in Keylist):
            return

        stem = href.replace('.', '')
        pic_files = []
        try:
            for j, img in enumerate(print_area.find_all('img', src=re.compile('/res/'))):
                src = img.get('src') or ''
                pic_path = '%s_%d.jpg' % (stem, j)
                # Registered before the download so that a partially written
                # file is still cleaned up below.
                pic_files.append(pic_path)
                # Only the part of src from '/res' onwards is appended to the
                # site root.
                self.dowloadPic(self._BASE_URL + src[src.find('/res'):], pic_path)

            document = Document()
            document.add_heading(title, 0)
            document.add_paragraph(ContentText)
            for p in pic_files:
                document.add_picture(p, width=Inches(1.25))

            # Titles may contain characters that are illegal in file names.
            safe_name = self._INVALID_FILENAME_CHARS.sub('_', title.strip()).rstrip('. ') or stem
            document.save(os.path.join('news', safe_name + '.docx'))
        finally:
            # The images are only temporary inputs for the document.
            for p in pic_files:
                if os.path.exists(p):
                    os.remove(p)
		
# Start the GUI: create the wx application object, open the main window
# and hand control to the event loop until the window is closed.
app = wx.App()
main_window = MyFrame(None, title='XyJw')
app.MainLoop()

程式碼設計思路：
1.程式碼執行後會生成一個使用者操作介面，使用者可以選擇年份和輸入關鍵字對新聞進行篩選（這裡的月份還沒做相應的開發）。目前只是爬取一年裡面的全部新聞，然後將每篇新聞的標題和正文與關鍵字對比，如果含有關鍵字，就會爬取下來生成word。如果沒有輸入關鍵字，會預設使用程式裡面的關鍵字進行篩選。
2.在年份設定那裡，我是預設每個月都是31日的，如果當月不存在31日的，在queryDay裡面會跳到Exception不作處理。
這裡寫圖片描述

執行結果：
這裡寫圖片描述

這裡是爬取以「廣東」為關鍵字的新聞。

擴充套件和優化

在這裡，我們會發現，網頁的爬取速度很慢。畢竟迴圈的次數比較多。而且很受網速的影響。
對於上述程式碼，可以做優化，最外的迴圈，是每次生成一個日期，然後再爬取，這裡可以用多執行緒方法，一次爬取2天或者3天的資料，如果是一次2天資料，按一個月32天計算，就是迴圈16次即可。
關鍵字優化，如果要更加精準地獲取新聞資訊，這裡可以使用自然語言處理方法。
這裡介紹一個現有的方法，就是用百度的現有自然語言處理方法：請點選
當然也可以自己編寫，具體自行百度。

python爬蟲requests模組
2019-03-01
Python爬蟲
【Python爬蟲9】Python網路爬蟲例項實戰
2017-02-17
Python爬蟲
python爬蟲例項專案大全
2017-12-06
Python爬蟲
python爬蟲之一：requests庫
2017-06-06
Python爬蟲
爬蟲——Requests模組
2019-01-13
爬蟲
爬蟲-Requests模組
2022-03-03
爬蟲
python例項，python網路爬蟲爬取大學排名!
2018-11-20
Python爬蟲
python爬蟲框架scrapy例項詳解
2013-08-14
Python爬蟲框架
Python爬蟲神器requests庫的使用
2024-11-07
Python爬蟲
爬蟲之requests庫
2022-03-20
爬蟲
Python爬蟲專案100例，附原始碼！100個Python爬蟲練手例項
2021-09-09
Python爬蟲原始碼
網路爬蟲筆記2， requests庫入門-2(程式設計例項)
2018-05-10
爬蟲筆記程式設計
python爬蟲常用庫之requests詳解
2019-03-04
Python爬蟲
Python3爬蟲實戰（requests模組）
2018-01-27
Python爬蟲
Python Beautiful Soup+requests實現爬蟲
2017-02-27
Python爬蟲
爬蟲的例項專案
2019-04-26
爬蟲
python爬蟲例項專案大全-GitHub 上有哪些優秀的 Python 爬蟲專案？
2020-10-30
Python爬蟲Github
Python爬蟲教程-06-爬蟲實現百度翻譯(requests)
2018-09-06
Python爬蟲
Python爬蟲教程-17-ajax爬取例項（豆瓣電影）
2018-09-06
Python爬蟲
Python爬蟲例項：爬取貓眼電影——破解字型反爬
2019-02-26
Python爬蟲
python爬蟲利用requests製作代理池s
2019-12-04
Python爬蟲
Python 爬蟲實戰（二）：使用 requests-html
2018-03-14
Python爬蟲HTML
JB的Python之旅-爬蟲篇--requests&Scrapy
2018-06-08
Python爬蟲
Python 爬蟲實戰（一）：使用 requests 和 BeautifulSoup
2017-12-11
Python爬蟲
Python 爬蟲入門 (二) 使用Requests來爬取圖片
2017-02-24
Python爬蟲
python爬蟲:爬蟲的簡單介紹及requests模組的簡單使用
2022-02-24
Python爬蟲
python 爬蟲實現增量去重和定時爬取例項
2020-03-06
Python爬蟲
python爬蟲之Beautiful Soup基礎知識+例項
2020-08-12
Python爬蟲
python多執行緒非同步爬蟲-Python非同步爬蟲試驗[Celery,gevent,requests]
2020-11-11
Python執行緒非同步爬蟲
Python2爬蟲利器：requests庫的基本用法
2021-09-11
Python爬蟲
使用Python和requests庫的簡單爬蟲程式
2023-11-13
Python爬蟲
python爬蟲專案（新手教程）之知乎（requests方式）
2018-06-13
Python爬蟲
Python爬蟲學習筆記-2.Requests庫
2017-05-20
Python爬蟲筆記
Python3多執行緒爬蟲例項講解
2018-01-03
Python執行緒爬蟲
Requests如何在Python爬蟲中實現get請求？
2021-09-11
Python爬蟲
Python爬蟲十六式 - 第三式：Requests的用法
2019-01-09
Python爬蟲
基於bs4+requests的python爬蟲偽裝
2018-07-20
Python爬蟲
【Python學習】爬蟲爬蟲爬蟲爬蟲~
2018-05-03
Python爬蟲

Python requests爬蟲例項

相關文章