Fourteen Web-Scraping Projects Explained in Detail (a zero-prerequisite beginner's guide anyone can follow)
The scraping projects
Project 1: Scraping movie titles (static data; supplement: scraping dynamically loaded movie data)
Project 2: Scraping dynamic comments (supplement: scraping comments from every page; the comments load dynamically)
Project 3: Scraping the Sogou page source (supplement: a collector for keyword-search result pages)
Project 4: Querying KFC restaurant locations (an Ajax request, and a POST one)
Project 5: Scraping cosmetics production-licence product data from the NMPA regulator (Ajax request)
Project 6: Downloading a single image (the response body is binary; supplement: the same with urllib)
Project 7: Scraping photos from 校花網 (batch download of static images)
Project 8: Scraping the full text of Romance of the Three Kingdoms (static)
Project 9: Using XPath to scrape multiple images and their names
Project 10: Scraping the city list from the air-quality historical-data site
Project 11: Fetching Xueqiu text data with cookies
Project 12: Building a pool of rotating proxy servers
Project 13: Recognizing image CAPTCHAs with the Chaojiying (超級鷹) service
Project 14: Simulated login to Gushiwen (古詩文網)
Project 1: Scraping Douban Top 250 movie titles (static)

import requests
from bs4 import BeautifulSoup

def get_movies():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
        'Host': 'movie.douban.com'
    }
    movie_list = []
    for i in range(0, 10):
        link = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        r = requests.get(link, headers=headers, timeout=10)
        print(str(i + 1), 'page status code:', r.status_code)
        soup = BeautifulSoup(r.text, 'lxml')
        div_list = soup.find_all('div', class_='hd')
        for each in div_list:
            movie = each.a.span.text.strip()
            movie_list.append(movie)
    return movie_list

movies = get_movies()
print(movies)
———— Supplement: scraping dynamically loaded movie data

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
url = 'https://movie.douban.com/j/search_subjects'
params = {
    'type': 'tv',
    'tag': '綜藝',  # "variety shows"
    'sort': 'recommend',
    'page_limit': '10',
    'page_start': '0',
}
response = requests.get(url=url, params=params, headers=headers)
# .json() deserializes the JSON string in the response into a dict or list
page_text = response.json()
for movie in page_text['subjects']:
    name = movie['title']
    rate = movie['rate']
    print(name, rate)
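
To fetch more than the first ten entries, the same endpoint can be paged by stepping page_start while keeping the other parameters fixed (a minimal sketch continuing the script above; how far back the API pages is an assumption):

for start in range(0, 30, 10):  # three pages of 10; the upper bound is assumed
    params['page_start'] = str(start)
    page = requests.get(url=url, params=params, headers=headers).json()
    for movie in page['subjects']:
        print(movie['title'], movie['rate'])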
Project 2: Scraping dynamic comments

import requests
import json

link = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery11240224188209204927_1611757460252&limit=10&repSeq=4272904&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=&=1611757460254'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75'
}
r = requests.get(link, headers=headers)
json_string = r.text
# Keep only the part of the string that is valid JSON (the response is JSONP-wrapped)
json_string = json_string[json_string.find('{'):-2]
# json.loads turns the JSON string in the response body into Python data
json_data = json.loads(json_string)
# Using the structure of the JSON, extract the list of comments
comment_list = json_data['results']['parents']
for eachone in comment_list:
    message = eachone['content']
    print(message)
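
The two string operations above are unwrapping JSONP: the server returns something like jQueryXXX({...}); and the slice keeps only the {...} payload. A small helper makes that intent explicit (a sketch; the wrapper format is inferred from the callback parameter in the URL, not documented by the API):

def strip_jsonp(text):
    # Return the JSON payload inside a JSONP wrapper such as callback({...});
    return text[text.find('{'):text.rfind('}') + 1]

json_data = json.loads(strip_jsonp(r.text))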
———————— Supplement to Project 2: scraping comments from every page

import requests
import json

def single_page_comment(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    r = requests.get(link, headers=headers)
    # Get the JSON as a string
    json_string = r.text
    # Keep only the part of the string that is valid JSON
    json_string = json_string[json_string.find('{'):-2]
    # json.loads turns the JSON string into Python data
    json_data = json.loads(json_string)
    comment_list = json_data['results']['parents']
    for eachone in comment_list:
        message = eachone['content']
        print(message)

for page in range(1, 4):
    link1 = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery112403473268296510956_1531502963311&limit=10&offset='
    link2 = '&repSeq=4272904&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&=1531502963316'
    page_str = str(page)
    link = link1 + page_str + link2
    print(link)
    single_page_comment(link)
Project 3: Scraping the Sogou homepage source

import requests

url = 'https://www.sogou.com/'
# Send a GET request to the page
response = requests.get(url=url)
# Get the response body as text
page_text = response.text
# Save the page as sougou.html, encoded as UTF-8
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
—————— Supplement: a collector for keyword-search result pages

import requests

# Set a User-Agent to get past UA checks
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
keyword = input('enter a key word:')
# Make the query parameter dynamic
params = {
    'query': keyword
}
url = 'https://www.sogou.com/web'
# Send a GET request to the page
response = requests.get(url=url, params=params, headers=headers)
# .encoding holds the detected encoding of the response; assigning to it changes
# how the body is decoded, so set it before reading .text (the original read
# .text first, which made the assignment a no-op)
response.encoding = 'utf-8'
page_text = response.text
fileName = keyword + '.html'
# Save the page as <keyword>.html, encoded as UTF-8
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(fileName, 'scraped!')
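
Hard-coding 'utf-8' works for Sogou, but requests can also guess the encoding from the body itself; a minimal alternative using requests' built-in detection:

response = requests.get(url=url, params=params, headers=headers)
response.encoding = response.apparent_encoding  # let requests infer the charset from the content
page_text = response.text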
Project 4: Scraping the first eight pages of KFC store addresses (an Ajax request, and note it is a POST)

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
for page in range(1, 9):
    data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',  # search keyword: Beijing
        'pageIndex': str(page),
        'pageSize': '10',
    }
    response = requests.post(url=url, headers=headers, data=data)
    page_text = response.json()
    for dic in page_text['Table1']:
        title = dic['storeName']
        addr = dic['addressDetail']
        print(title, addr)
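
To keep the results rather than just print them, the two fields map naturally onto a CSV file (a sketch continuing the script above; the output filename is arbitrary):

import csv

rows = []
for page in range(1, 9):
    data['pageIndex'] = str(page)
    resp = requests.post(url=url, headers=headers, data=data).json()
    for dic in resp['Table1']:
        rows.append([dic['storeName'], dic['addressDetail']])

with open('kfc_stores.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['storeName', 'addressDetail'])
    writer.writerows(rows)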
Project 5: Scraping cosmetics production-licence product data from the NMPA regulator (Ajax request)

import requests
import json

url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
id_list = []
all_data_list = []
for page in range(1, 6):
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    response = requests.post(url=url, headers=headers, data=data).json()
    for dic in response['list']:
        id_list.append(dic['ID'])

post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
for id in id_list:
    data = {'id': id}
    detail_json = requests.post(url=post_url, headers=headers, data=data).json()
    all_data_list.append(detail_json)

with open('allData.json', 'w', encoding='utf-8') as fp:
    json.dump(all_data_list, fp=fp, ensure_ascii=False)
print('over!!!')
———— Supplement 1: fetching a single product record (by ID)

import requests

url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
data = {
    'id': '1246978d50094d849fc45defd4d93419',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
response = requests.post(url=url, headers=headers, data=data)
page_text = response.json()
print(page_text)
———— Supplement 2: fetching all product names

import requests

class Cfda:
    def __init__(self):
        self.url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'

    def getCfda(self, datas):
        self.html = requests.post(self.url, data=datas)
        # Method 1: extract the 15 names on this page one by one
        for m in range(15):
            self.data = self.html.json()['list'][m]['EPS_NAME']
            print(self.data)
            self.data2File(self.data)
        # Method 2 would do the same functionally with map() and a lambda over the indices

    # Append a record to the output file
    def data2File(self, dat):
        with open('./cfda.txt', 'a', encoding='utf-8') as ff:
            ff.write(str(dat) + '\n')

cfda = Cfda()
for n in range(1, 5):
    data = {
        'on': 'true',
        'page': str(n),  # the original passed the literal string 'n', which re-fetched one page
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    cfda.getCfda(data)
Project 6: Downloading an image (with requests)

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
img_url = 'https://pics0.baidu.com/feed/738b4710b912c8fc1184111ed2ef6542d48821e6.jpeg?token=16e99d15cb2ef84747689a6dbebbf2ae&s=6843961A10F84C296AC389CC030070BB'
response = requests.get(url=img_url, headers=headers)
# .content is the raw binary response body
img_data = response.content
with open('i.jpg', 'wb') as fp:
    fp.write(img_data)
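
response.content loads the whole file into memory at once; for large files, requests can stream the body in chunks instead (a minimal sketch reusing img_url and headers from above):

response = requests.get(url=img_url, headers=headers, stream=True)
with open('i_streamed.jpg', 'wb') as fp:
    for chunk in response.iter_content(chunk_size=8192):
        fp.write(chunk)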
———— Supplement: downloading an image with urllib

import urllib.request

img_url = 'https://pics4.baidu.com/feed/e61190ef76c6a7ef335198d46f005756f1de66cb.jpeg?token=d8391cac8b7cb8c79188fd602eb8e5ef&s=22A021A84E1207FD96A154880300E0F2'
urllib.request.urlretrieve(img_url, './2.jpg')
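
urlretrieve gives you no way to set request headers; if a server rejects requests without a User-Agent, build the request yourself (a sketch using only standard-library calls):

import urllib.request

req = urllib.request.Request(img_url, headers={'User-Agent': 'Mozilla/5.0'})
with urllib.request.urlopen(req) as resp, open('./2.jpg', 'wb') as fp:
    fp.write(resp.read())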
Project 7: Scraping photos from 校花網 (batch download of static images)

import requests
import re
import urllib.request
import os

# Create a folder named dirName
dirName = 'ImgLibs'
# If the ImgLibs folder does not exist, create it
if not os.path.exists(dirName):
    os.mkdir(dirName)
# 1. Capture the source of the current page
url = 'http://www.521609.com/tuku/shz/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
# 2. Parse the image addresses out of the page source; (.*?) captures the address
ex = '<li>.*?<img src="(.*?)" alt=.*?</li>'
# re.S lets . match across newlines
img_src_list = re.findall(ex, page_text, re.S)
for src in img_src_list:
    src = 'http://www.521609.com' + src
    # src.split('/') splits on '/', and [-1] takes the last piece (the filename)
    imgPath = dirName + '/' + src.split('/')[-1]
    urllib.request.urlretrieve(src, imgPath)
    print(imgPath, 'downloaded!')
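
The regex works, but it is brittle against small markup changes; the same extraction with BeautifulSoup is often easier to maintain (a hedged sketch; the 'li img' selector is assumed to match this page's structure):

from bs4 import BeautifulSoup

soup = BeautifulSoup(page_text, 'lxml')
for img in soup.select('li img'):
    print('http://www.521609.com' + img['src'])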
Project 8: Scraping the novel Romance of the Three Kingdoms (static)

import requests
from bs4 import BeautifulSoup

fp = open('./sanguo.txt', 'w', encoding='utf-8')
main_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
response = requests.get(url=main_url, headers=headers)
response.encoding = 'utf-8'
page_text = response.text
# Parse out each chapter's title, detail-page URL, and content
soup = BeautifulSoup(page_text, 'lxml')
# Locate every matching a tag in the table of contents
a_list = soup.select('.book-mulu>ul>li>a')
for a in a_list:
    # The title is the string inside the li's a tag
    title = a.string
    # Build the chapter's URL
    detail_url = 'https://www.shicimingju.com' + a['href']
    # Request the detail page and parse out the chapter content
    responses = requests.get(url=detail_url, headers=headers)
    responses.encoding = 'utf-8'
    page_text_detail = responses.text
    soup = BeautifulSoup(page_text_detail, 'lxml')
    div_tag = soup.find('div', class_='chapter_content')
    content = div_tag.text
    # Write to the file
    fp.write(title + ':' + content + '\n')
    print(title, 'saved!')
fp.close()
Project 9: Using XPath to scrape multiple images and their names
# Scrape several pages
# Define a generic URL template

import requests
import os
from lxml import etree

dirName = 'GirlsLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
# Loop over pages one to five
for page in range(1, 6):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
    }
    response = requests.get(url=new_url, headers=headers)
    response.encoding = 'gbk'
    page_text = response.text
    # Parse out the image data and image names
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]//li')
    for li in li_list:
        # Local parsing: inside a local parse, ./ means the current node
        title = li.xpath('./a/img/@alt')[0] + '.jpg'
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_data = requests.get(url=img_src, headers=headers).content
        imgPath = dirName + '/' + title
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(title, 'saved!')
Project 10: Scraping the city list from the air-quality historical-data site

import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# The hot cities
hot_cities = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
# The full city list
all_cities = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')
# Both groups in a single query
tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
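
The | in the last expression is XPath's union operator: one query returns the nodes matched by either sub-expression. To actually use the combined result, bind it to a name (a trivial continuation of the code above):

cities = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
print(len(cities), 'cities in total')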
Project 11: Fetching Xueqiu text data with cookies

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
# Create a Session object; visiting the home page first makes the
# server set the cookies needed for the data request
session = requests.Session()
main_url = 'https://xueqiu.com/'
# Capture and store the cookies
session.get(url=main_url, headers=headers)
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=164856&size=15'
# This request carries the stored cookies
page_text = session.get(url=url, headers=headers).json()
print(page_text)
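
To confirm the Session really captured cookies on that first visit, you can inspect its cookie jar directly (requests exposes it as a dict):

print(session.cookies.get_dict())  # the cookies stored by the visit to the home page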
Project 12: Building a pool of rotating proxy servers

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
url = 'http://ip.ipjldl.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=5&time=1&pro=&city=&port=1&format=html&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=2'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
proxy_list = tree.xpath('//body//text()')
http_proxy = []
for proxy in proxy_list:
    dic = {
        'https': proxy
    }
    # The original appended to proxy_list here, the list being iterated,
    # which would loop forever; the pool belongs in http_proxy
    http_proxy.append(dic)
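
To use the pool, pass one entry per request via the proxies parameter; a minimal sketch of rotating with random.choice (the target URL is a placeholder):

import random

target = 'https://www.example.com'  # placeholder target URL
for _ in range(3):
    proxy = random.choice(http_proxy)
    resp = requests.get(target, headers=headers, proxies=proxy, timeout=5)
    print(proxy, resp.status_code)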
Project 13: Recognizing image CAPTCHAs with the Chaojiying (超級鷹) service

#!/usr/bin/env python
# coding: utf-8
import requests
from hashlib import md5

class Chaojiying_Client(object):
    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: the image bytes
        codetype: the CAPTCHA type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: the image ID of a mis-recognized CAPTCHA
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

def transformImgCode(imgPath, imgType):
    chaojiying = Chaojiying_Client('xby9527', 'klx190036', '912530')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']

print(transformImgCode('./a.jpg', 1902))
Project 14: Simulated login to Gushiwen (古詩文網)

import requests
from lxml import etree

# Recognize the CAPTCHA first
session = requests.Session()
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
page_text = session.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# Parse out the CAPTCHA image address
img_src = 'https://so.gushiwen.cn' + tree.xpath('//*[@id="imgCode"]/@src')[0]
# Save the CAPTCHA image locally (via the session, so the cookies match)
img_data = session.get(url=img_src, headers=headers).content
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)
code_text = transformImgCode('./code.jpg', 1902)
print(code_text)
login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
data = {
    '__VIEWSTATE': 'QGEuht8CM/gFytmYVUZI86PETVGk3UDhl5vSeiKIg/bDaQfIFSCY5Wwelqk+zoykvi7QNfCFM3Jeo/ESWqiXHqn9kc7TiqslGnGMHV6JtjPr04OXSpRLzUJhNZo=',
    '__VIEWSTATEGENERATOR': 'C93BE1AE',
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': 'youxiang@qq.com',
    'pwd': 'klx190036',
    'code': code_text,  # changes on every request
    'denglu': '登入',
}
# POST the login form
page_text_login = session.post(url=login_url, headers=headers, data=data).text
with open('./gushiwen.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text_login)
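
Whether the login actually worked is not obvious from the saved HTML alone; a quick hedged check is to look for a marker that only appears on a logged-in page (the exact marker string below is an assumption about the site):

# '退出登入' ("log out") is assumed to appear only when logged in
if '退出登入' in page_text_login:
    print('login OK')
else:
    print('login may have failed; re-check code.jpg and the form fields')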