Fourteen Web-Scraping Projects Explained in Detail (Zero Prerequisites, Anyone Can Follow)

Posted by 謝白羽 on 2021-02-04

The Projects
Project 1 # Scrape movie titles (static data; supplement: scrape dynamically loaded movie data)
Project 2 # Scrape dynamic comments (supplement: scrape the comments on several pages; the comments are dynamic data)
Project 3 # Scrape the Sogou homepage source (supplement: a page collector for arbitrary search keywords)
Project 4 # Scrape the KFC restaurant locator (an Ajax call, and a POST request at that)
Project 5 # Scrape the cosmetics production-licence data published by the medical products administration (Ajax requests)
Project 6 # Download a single image (the response body is binary; supplement: the urllib version)
Project 7 # Scrape the photos on the xiaohua site 521609.com (batch download of static images)
Project 8 # Scrape the full text of the novel Romance of the Three Kingdoms (static)
Project 9 # Use XPath to scrape image data and names from several pages
Project 10 # Scrape the city list from the air-quality history site
Project 11 # Use cookies to fetch text data from xueqiu.com
Project 12 # Build a dictionary of rotating proxy servers
Project 13 # Image-CAPTCHA recognition with Chaojiying (超級鷹)
Project 14 # Simulated login to gushiwen.cn (古詩文網)

Project 1 # Scrape the Douban Top 250 movie titles (static)
import requests
from bs4 import BeautifulSoup

def get_movies():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75',
        'Host': 'movie.douban.com'
    }
    movie_list = []
    for i in range(0, 10):
        link = 'https://movie.douban.com/top250?start=' + str(i * 25) + '&filter='
        r = requests.get(link, headers=headers, timeout=10)
        print('page', i + 1, 'status code:', r.status_code)
        soup = BeautifulSoup(r.text, 'lxml')
        div_list = soup.find_all('div', class_='hd')
        for each in div_list:
            movie = each.a.span.text.strip()
            movie_list.append(movie)
    return movie_list

movies = get_movies()
print(movies)
———— Supplement: scrape the dynamically loaded movie data
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
url = 'https://movie.douban.com/j/search_subjects'
params = {
    'type': 'tv',
    'tag': '綜藝',
    'sort': 'recommend',
    'page_limit': '10',
    'page_start': '0',
}
response = requests.get(url=url, params=params, headers=headers)
# .json() deserializes the JSON string in the response body into a dict or list
page_text = response.json()
for movie in page_text['subjects']:
    name = movie['title']
    rate = movie['rate']
    print(name, rate)
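Because page_start acts as an offset into the result set, the same endpoint can be paged through by increasing it in steps of page_limit. A minimal sketch of that idea (the parameter names come from the request above; the page count is arbitrary):

import requests

headers = {'User-Agent': 'Mozilla/5.0'}
url = 'https://movie.douban.com/j/search_subjects'
for start in range(0, 50, 10):  # five pages, ten items each
    params = {'type': 'tv', 'tag': '綜藝', 'sort': 'recommend',
              'page_limit': '10', 'page_start': str(start)}
    page_text = requests.get(url=url, params=params, headers=headers).json()
    for movie in page_text['subjects']:
        print(movie['title'], movie['rate'])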
Project 2 # Scrape dynamic comments

import requests
import json

link = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery11240224188209204927_1611757460252&limit=10&repSeq=4272904&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&code=&_=1611757460254'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.141 Safari/537.36 Edg/87.0.664.75'
}
r = requests.get(link, headers=headers)
json_string = r.text
# keep only the part of the string that is valid JSON
json_string = json_string[json_string.find('{'):-2]
# json.loads turns a JSON-formatted string into Python objects
json_data = json.loads(json_string)
# given the structure of the JSON, the comments live under results -> parents
comment_list = json_data['results']['parents']
for eachone in comment_list:
    message = eachone['content']
    print(message)
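The slice above works because the JSONP response always looks like callbackName({...});, so everything before the first { and the trailing ); can be cut off. A slightly more robust variant (a sketch, not from the original) strips the callback wrapper with a regular expression instead of fixed offsets:

import re
import json

def strip_jsonp(text):
    # match callbackName( ... ); and capture the JSON payload inside the parentheses
    match = re.search(r'^\s*[\w$.]+\s*\((.*)\)\s*;?\s*$', text, re.S)
    return json.loads(match.group(1)) if match else json.loads(text)

# usage: json_data = strip_jsonp(r.text)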
———————— Supplement to Project 2
# scrape the comments on several pages
import requests
import json

def single_page_comment(link):
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    r = requests.get(link, headers=headers)
    # the response body is a JSONP string
    json_string = r.text
    # keep only the part of the string that is valid JSON
    json_string = json_string[json_string.find('{'):-2]
    # json.loads turns a JSON-formatted string into Python objects
    json_data = json.loads(json_string)
    comment_list = json_data['results']['parents']
    for eachone in comment_list:
        message = eachone['content']
        print(message)

for page in range(1, 4):
    link1 = 'https://api-zero.livere.com/v1/comments/list?callback=jQuery112403473268296510956_1531502963311&limit=10&offset='
    link2 = '&repSeq=4272904&requestPath=%2Fv1%2Fcomments%2Flist&consumerSeq=1020&livereSeq=28583&smartloginSeq=5154&_=1531502963316'
    page_str = str(page)
    link = link1 + page_str + link2
    print(link)
    single_page_comment(link)
Project 3 # Scrape the Sogou homepage source
import requests

url = 'https://www.sogou.com/'
# send a GET request to the page
response = requests.get(url=url)
# take the response body as text
page_text = response.text
# save the page as sougou.html, encoded as utf-8
with open('./sougou.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
—————— Supplement: a page collector for keyword search results
import requests

# set a User-Agent header to get past basic UA checks
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
keyword = input('enter a key word:')
# build the query parameters dynamically
params = {
    'query': keyword
}
url = 'https://www.sogou.com/web'
response = requests.get(url=url, params=params, headers=headers)
# .encoding holds the encoding the response is decoded with; assigning to it
# changes how .text decodes the body, so set it before reading .text
response.encoding = 'utf-8'
page_text = response.text
fileName = keyword + '.html'
# save the page as <keyword>.html, encoded as utf-8
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print(fileName, 'scraped!')
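If you do not know a page's real encoding up front, requests can guess one from the raw bytes. A small sketch (not in the original) using the library's apparent_encoding attribute:

import requests

response = requests.get('https://www.sogou.com/web', params={'query': 'python'})
# apparent_encoding is the encoding detected from the body bytes;
# assigning it to .encoding makes .text decode with that guess
response.encoding = response.apparent_encoding
print(response.encoding)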
Project 4 # Scrape the first eight pages of KFC store addresses (an Ajax call, and note it is a POST request)
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
for page in range(1, 9):
    data = {
        'cname': '',
        'pid': '',
        'keyword': '北京',
        'pageIndex': str(page),
        'pageSize': '10',
    }
    response = requests.post(url=url, headers=headers, data=data)
    page_text = response.json()
    for dic in page_text['Table1']:
        title = dic['storeName']
        addr = dic['addressDetail']
        print(title, addr)
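Printing is fine for a demo; to keep the results, each row could be written out with the standard csv module. A minimal sketch (the column names match the keys used above):

import csv
import requests

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'
headers = {'User-Agent': 'Mozilla/5.0'}
with open('kfc_stores.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['storeName', 'addressDetail'])
    for page in range(1, 9):
        data = {'cname': '', 'pid': '', 'keyword': '北京',
                'pageIndex': str(page), 'pageSize': '10'}
        for dic in requests.post(url, headers=headers, data=data).json()['Table1']:
            writer.writerow([dic['storeName'], dic['addressDetail']])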
Project 5 # Scrape the cosmetics production-licence data published by the medical products administration (Ajax requests)
import requests
import json

url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
id_list = []
all_data_list = []
# step 1: collect the IDs of the records on the first five list pages
for page in range(1, 6):
    data = {
        'on': 'true',
        'page': str(page),
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    response = requests.post(url=url, headers=headers, data=data).json()
    for dic in response['list']:
        id_list.append(dic['ID'])
# step 2: fetch the detail record behind each ID
post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
for id in id_list:
    data = {'id': id}
    detail_json = requests.post(url=post_url, headers=headers, data=data).json()
    all_data_list.append(detail_json)

with open('allData.json', 'w', encoding='utf-8') as fp:
    json.dump(all_data_list, fp=fp, ensure_ascii=False)
print('over!!!')
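ensure_ascii=False is what keeps the Chinese text readable in the file instead of escaped \uXXXX sequences. The file can later be read back with the matching json.load; a quick round-trip check:

import json

with open('allData.json', 'r', encoding='utf-8') as fp:
    records = json.load(fp)
print(len(records), 'detail records loaded')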
———— Supplement 1: fetch a single product's details (by ID)
import requests

url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
data = {
    'id': '1246978d50094d849fc45defd4d93419',
}
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
response = requests.post(url=url, headers=headers, data=data)
page_text = response.json()
print(page_text)
———— Supplement 2: fetch all product names
import requests

class Cfda:
    def __init__(self):
        self.url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'

    def getCfda(self, datas):
        self.html = requests.post(self.url, data=datas)
        # approach 1: pull the 15 names on this page out one by one
        for m in range(15):
            self.data = self.html.json()['list'][m]['EPS_NAME']
            print(self.data)
            self.data2File(self.data)
        # approach 2, functional style:
        # list(map(lambda n: self.html.json()['list'][n]['EPS_NAME'], range(15)))

    # append each record to a text file
    def data2File(self, dat):
        with open('./cfda.txt', 'a', encoding='utf-8') as ff:
            ff.write(str(dat) + '\n')

cfda = Cfda()
for n in range(1, 5):
    data = {
        'on': 'true',
        'page': str(n),  # the original passed the literal string 'n', which requests the same page every time
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    cfda.getCfda(data)
Project 6 # Download an image (with requests)
import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
img_url = 'https://pics0.baidu.com/feed/738b4710b912c8fc1184111ed2ef6542d48821e6.jpeg?token=16e99d15cb2ef84747689a6dbebbf2ae&s=6843961A10F84C296AC389CC030070BB'
response = requests.get(url=img_url, headers=headers)
# .content is the raw binary response body, so the file must be opened in 'wb' mode
img_data = response.content
with open('i.jpg', 'wb') as fp:
    fp.write(img_data)
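response.content buffers the whole file in memory before writing. For large files, requests can stream the body in chunks instead; a sketch reusing the img_url and headers defined above:

with requests.get(img_url, headers=headers, stream=True) as response:
    with open('i_streamed.jpg', 'wb') as fp:
        # iterate over the body in 1 KiB chunks instead of loading it all at once
        for chunk in response.iter_content(chunk_size=1024):
            fp.write(chunk)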
———— Supplement: download an image with urllib
import urllib.request

img_url = 'https://pics4.baidu.com/feed/e61190ef76c6a7ef335198d46f005756f1de66cb.jpeg?token=d8391cac8b7cb8c79188fd602eb8e5ef&s=22A021A84E1207FD96A154880300E0F2'
# urlretrieve downloads the URL straight into a local file
urllib.request.urlretrieve(img_url, './2.jpg')
Project 7 # Scrape the photos on the xiaohua site 521609.com (batch download of static images)
import requests
import re
import urllib.request
import os

# create a folder named dirName for the images
dirName = 'ImgLibs'
# if the ImgLibs folder does not exist yet, make it
if not os.path.exists(dirName):
    os.mkdir(dirName)
# 1. grab the source of the current page
url = 'http://www.521609.com/tuku/shz/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
# 2. parse the image addresses out of the page source; (.*?) captures the address
ex = '<li>.*?<img src="(.*?)" alt=.*?</li>'
# re.S lets the pattern match across line breaks
img_src_list = re.findall(ex, page_text, re.S)
for src in img_src_list:
    src = 'http://www.521609.com' + src
    # split('/') splits the address on '/', and [-1] keeps the last piece (the file name)
    imgPath = dirName + '/' + src.split('/')[-1]
    urllib.request.urlretrieve(src, imgPath)
    print(imgPath, 'downloaded!')
Project 8 # Scrape the novel Romance of the Three Kingdoms (static)

import requests
from bs4 import BeautifulSoup

fp = open('./sanguo.txt', 'w', encoding='utf-8')
main_url = 'https://www.shicimingju.com/book/sanguoyanyi.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
response = requests.get(url=main_url, headers=headers)
response.encoding = 'utf-8'
page_text = response.text
# parse out: chapter titles, detail-page urls, chapter contents
soup = BeautifulSoup(page_text, 'lxml')
# locate all the matching <a> tags in the table of contents
a_list = soup.select('.book-mulu>ul>li>a')
for a in a_list:
    # the chapter title is the string inside the <a> tag
    title = a.string
    # build the chapter's detail-page url
    detail_url = 'https://www.shicimingju.com' + a['href']
    # request the detail page and parse out the chapter content
    responses = requests.get(url=detail_url, headers=headers)
    responses.encoding = 'utf-8'
    page_text_detail = responses.text
    soup = BeautifulSoup(page_text_detail, 'lxml')
    div_tag = soup.find('div', class_='chapter_content')
    content = div_tag.text
    # append the chapter to the file
    fp.write(title + ':' + content + '\n')
    print(title, 'saved!')
fp.close()
Project 9 # Use XPath to scrape image data and names
# scrape several pages
# define a generic url template

import os
import requests
from lxml import etree

dirName = 'GirlsLibs'
if not os.path.exists(dirName):
    os.mkdir(dirName)
url = 'http://pic.netbian.com/4kmeinv/index_%d.html'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
# loop over pages one to five (the first page has its own url)
for page in range(1, 6):
    if page == 1:
        new_url = 'http://pic.netbian.com/4kmeinv/'
    else:
        new_url = url % page
    response = requests.get(url=new_url, headers=headers)
    response.encoding = 'gbk'
    page_text = response.text
    # parse out the image data and image names
    tree = etree.HTML(page_text)
    li_list = tree.xpath('//div[@class="slist"]//li')
    for li in li_list:
        # local parsing: inside it, './' means "relative to this element"
        title = li.xpath('./a/img/@alt')[0] + '.jpg'
        img_src = 'http://pic.netbian.com' + li.xpath('./a/img/@src')[0]
        img_data = requests.get(url=img_src, headers=headers).content
        imgPath = dirName + '/' + title
        with open(imgPath, 'wb') as fp:
            fp.write(img_data)
        print(title, 'saved!')
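The './' prefix is what scopes an XPath query to a single element. A tiny standalone illustration (the HTML snippet is made up) of global versus local parsing:

from lxml import etree

tree = etree.HTML('<div class="slist"><ul><li><a><img alt="a" src="/1.jpg"/></a></li>'
                  '<li><a><img alt="b" src="/2.jpg"/></a></li></ul></div>')
for li in tree.xpath('//div[@class="slist"]//li'):
    # './' anchors the query at this <li>; starting with '//' would search the whole document again
    print(li.xpath('./a/img/@alt')[0], li.xpath('./a/img/@src')[0])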
Project 10 # Scrape the city list from the air-quality history site

import requests
from lxml import etree

url = 'https://www.aqistudy.cn/historydata/'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# the hot cities
hot_cities = tree.xpath('//div[@class="bottom"]/ul/li/a/text()')
# the second-tier cities
all_cities = tree.xpath('//div[@class="bottom"]/ul/div[2]/li/a/text()')
# both at once, using the XPath union operator |
every_city = tree.xpath('//div[@class="bottom"]/ul/li/a/text() | //div[@class="bottom"]/ul/div[2]/li/a/text()')
print(every_city)
Project 11 # Use cookies to fetch text data from xueqiu.com

import requests

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
# create a Session object
session = requests.Session()
main_url = 'https://xueqiu.com/'
# visiting the homepage first triggers the cookie-setting response,
# and the session captures and stores those cookies
session.get(url=main_url, headers=headers)
url = 'https://xueqiu.com/statuses/hot/listV2.json?since_id=-1&max_id=164856&size=15'
# this request carries the stored cookies
page_text = session.get(url=url, headers=headers).json()
print(page_text)
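What the Session buys you is automatic cookie persistence: without the homepage visit first, the JSON endpoint typically rejects the request because it carries no cookies. You can inspect what the session captured (a quick check, not in the original):

import requests

session = requests.Session()
session.get('https://xueqiu.com/', headers={'User-Agent': 'Mozilla/5.0'})
# the jar now holds whatever Set-Cookie headers the homepage sent back
print(session.cookies.get_dict())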
Project 12 # Build a dictionary of rotating proxy servers
# fetch a list of proxy IPs

import requests
from lxml import etree

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
url = 'http://ip.ipjldl.com/index.php/api/entry?method=proxyServer.generate_api_url&packid=1&fa=0&fetch_key=&groupid=0&qty=5&time=1&pro=&city=&port=1&format=html&ss=5&css=&dt=1&specialTxt=3&specialJson=&usertype=2'
page_text = requests.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
proxy_list = tree.xpath('//body//text()')
http_proxy = []
for proxy in proxy_list:
    dic = {
        'https': proxy
    }
    http_proxy.append(dic)  # the original appended to proxy_list, the list being iterated, which never terminates
# to use one: requests.get(url=new_url, headers=headers, proxies=random.choice(http_proxy))
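To actually route a request through one of these proxies, pass a randomly chosen entry via the proxies argument, as the comment above says. A sketch (assumes http_proxy was filled by the loop above; httpbin.org/ip is just a test endpoint that echoes your apparent IP):

import random
import requests

proxy = random.choice(http_proxy)  # e.g. {'https': '1.2.3.4:8888'}
response = requests.get('https://httpbin.org/ip', headers=headers,
                        proxies=proxy, timeout=10)
print(response.text)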
Project 13 # Image-CAPTCHA recognition with Chaojiying (超級鷹)

#!/usr/bin/env python
# coding:utf-8
import requests
from hashlib import md5

class Chaojiying_Client(object):
    def __init__(self, username, password, soft_id):
        self.username = username
        password = password.encode('utf8')
        self.password = md5(password).hexdigest()
        self.soft_id = soft_id
        self.base_params = {
            'user': self.username,
            'pass2': self.password,
            'softid': self.soft_id,
        }
        self.headers = {
            'Connection': 'Keep-Alive',
            'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.1; Trident/4.0)',
        }

    def PostPic(self, im, codetype):
        """
        im: the image bytes
        codetype: the task type, see http://www.chaojiying.com/price.html
        """
        params = {
            'codetype': codetype,
        }
        params.update(self.base_params)
        files = {'userfile': ('ccc.jpg', im)}
        r = requests.post('http://upload.chaojiying.net/Upload/Processing.php', data=params, files=files, headers=self.headers)
        return r.json()

    def ReportError(self, im_id):
        """
        im_id: the image ID of a wrongly recognised task
        """
        params = {
            'id': im_id,
        }
        params.update(self.base_params)
        r = requests.post('http://upload.chaojiying.net/Upload/ReportError.php', data=params, headers=self.headers)
        return r.json()

def transformImgCode(imgPath, imgType):
    # send a local image to Chaojiying and return the recognised text
    chaojiying = Chaojiying_Client('xby9527', 'klx190036', '912530')
    im = open(imgPath, 'rb').read()
    return chaojiying.PostPic(im, imgType)['pic_str']

print(transformImgCode('./a.jpg', 1902))
Project 14 # Simulated login to gushiwen.cn (古詩文網)

# simulated login
import requests
from lxml import etree

# step 1: fetch the login page and deal with its captcha
session = requests.Session()
url = 'https://so.gushiwen.cn/user/login.aspx?from=http://so.gushiwen.cn/user/collect.aspx'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4343.0 Safari/537.36'
}
page_text = session.get(url=url, headers=headers).text
tree = etree.HTML(page_text)
# parse out the captcha image address
img_src = 'https://so.gushiwen.cn' + tree.xpath('//*[@id="imgCode"]/@src')[0]
# save the captcha locally; fetch it through the session so the cookie matches the login
img_data = session.get(url=img_src, headers=headers).content
with open('./code.jpg', 'wb') as fp:
    fp.write(img_data)
# transformImgCode comes from Project 13
code_text = transformImgCode('./code.jpg', 1902)
print(code_text)
# step 2: POST the login form
login_url = 'https://so.gushiwen.cn/user/login.aspx?from=http%3a%2f%2fso.gushiwen.cn%2fuser%2fcollect.aspx'
data = {
    '__VIEWSTATE': 'QGEuht8CM/gFytmYVUZI86PETVGk3UDhl5vSeiKIg/bDaQfIFSCY5Wwelqk+zoykvi7QNfCFM3Jeo/ESWqiXHqn9kc7TiqslGnGMHV6JtjPr04OXSpRLzUJhNZo=',
    '__VIEWSTATEGENERATOR': 'C93BE1AE',
    'from': 'http://so.gushiwen.cn/user/collect.aspx',
    'email': 'youxiang@qq.com',
    'pwd': 'klx190036',
    'code': code_text,  # changes on every visit
    'denglu': '登入',
}
page_text_login = session.post(url=login_url, headers=headers, data=data).text
with open('./gushiwen.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text_login)
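Whether the login actually succeeded can be checked by looking for a logged-in-only marker in the returned page. A rough check (the marker string is an assumption, not from the original; pick whatever only appears after login on the real page):

# after the login POST above:
if '退出登錄' in page_text_login:  # a "log out" link usually only appears when logged in
    print('login ok')
else:
    print('login may have failed; re-check the captcha or the form fields')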
