python爬蟲日記01
import random
from time import sleep
import pymysql
from bs4 import BeautifulSoup
import re # 正規表示式
import urllib.request, urllib.error # 定製 url 獲取網頁資料
def main():
    """Script entry point; hands control to parser_url_save()."""
    parser_url_save()
def parser_url_save():
    """Crawl each configured listing URL and store the parsed rows.

    A URL recognised as Maoyan is downloaded and parsed by parser_html() and
    the resulting rows are written by saveData_maoyan(). Any other URL is only
    reported, because no Douban parser is implemented.
    """
    print("doing parser url ....")
    # NOTE(review): the URL literals in this function were lost when the
    # snippet was published (only dangling quote characters remained, which
    # do not parse). The values below are a reconstruction: parser_html()
    # appends a numeric page offset to the URL, which fits Maoyan's Top-100
    # board. Confirm them before relying on this code.
    urls = ['https://maoyan.com/board/4?offset=']
    for url in urls:
        if 'maoyan.com' in url:
            datalist = parser_html(url)
            saveData_maoyan(datalist)
        else:
            # Douban support is not implemented; only announce the URL kind.
            print("parser douban ...")
# Regular-expression table used by getData_maoyan().
# Maoyan title: captures the value of a title="..." attribute.
find_maoyan_title = re.compile(r'.*?title="(.*?)"')
# Maoyan link: captures the value of an href="..." attribute.
find_maoyan_link = re.compile(r'.*?href="(.*?)"')
# Maoyan picture: captures the data-src attribute of an <img> tag.
find_maoyan_pic = re.compile(r'.*?<img.*?data-src="(.*?)"')
# Maoyan score: captured as two pieces that getData_maoyan() concatenates.
# NOTE(review): both patterns expect attribute-less <p><i> tags -- confirm
# against the real page markup.
find_maoyan_score1 = re.compile(r'<p><i>(.*?)<')
find_maoyan_score2 = re.compile(r'</i><i>(.*?)<')
# Leading actors: captures the text that follows the "starring" label.
# NOTE(review): the label text here and below is hard-coded; confirm it
# matches the characters and punctuation the page actually serves.
find_maoyan_star = re.compile(r'.* 主演: (.*)')
# Release date: captures the text between the "release time" label and the
# last "<" on the line (the group is greedy).
find_maoyan_date = re.compile(r' 上映時間: (.*)<')
def parser_html(url):
    """Download ten consecutive listing pages and parse them into rows.

    Args:
        url: Listing URL prefix. The page offset (0, 10, ..., 90) is appended
            to it to build each request URL.

    Returns:
        The list of per-movie rows accumulated by getData_maoyan() across all
        downloaded pages.

    Raises:
        urllib.error.URLError: propagated unchanged when a request fails.
    """
    cookie = '###'  # placeholder value; supply a real cookie string if needed
    agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    # The headers never change between pages, so build them once.
    headers = {
        "User-Agent": agent,
        "Cookie": cookie,
    }
    data_maoyan_list = []
    for i in range(10):
        sleep(3)  # pause before every request
        req = urllib.request.Request(url + str(i * 10), headers=headers)
        # The context manager closes the HTTP response even if read() or
        # decode() raises; the original never closed it.
        with urllib.request.urlopen(req) as response:
            html = response.read().decode("utf-8")
        data_maoyan_list = getData_maoyan(html, data_maoyan_list)
    return data_maoyan_list
def _first_match(pattern, text, default=""):
    """Return the first capture of *pattern* in *text*, or *default* if none."""
    found = re.findall(pattern, text)
    return found[0] if found else default


def getData_maoyan(html, data_maoyan_list):
    """Parse one listing page and append one row per movie entry.

    Every ``<dd>`` element that carries a title yields a row
    ``[title, link, picture, score, stars, release time]``; ``score`` is
    always a float (0.0 when the entry shows no score). Optional fields that
    cannot be found become empty strings instead of raising IndexError, and
    ``<dd>`` elements without a title are skipped.

    Args:
        html: Page source text.
        data_maoyan_list: List that receives the rows; it is mutated in place.

    Returns:
        The same ``data_maoyan_list`` object, for call chaining.
    """
    html_parser = BeautifulSoup(html, "html.parser")
    # NOTE(review): the original literal was lost when the snippet was
    # published (only a dangling quote remained). This site root is a
    # reconstruction -- confirm it before relying on the generated links.
    base_url = 'https://maoyan.com'
    for item in html_parser.find_all('dd'):
        # Kept from the original: a one-second pause per entry. No request is
        # made here, so it only slows the overall crawl pace.
        sleep(1)
        anchor_html = str(item.a)

        title = _first_match(find_maoyan_title, anchor_html, None)
        if title is None:
            # Not a movie entry (no anchor or no title attribute): skip it
            # rather than abort the whole crawl.
            continue
        curr_url = base_url + _first_match(find_maoyan_link, anchor_html)
        pic = _first_match(find_maoyan_pic, anchor_html)

        # The score is split across two <i> tags (integer and fraction part).
        score_html = str(item.select("p[class='score']"))
        integer_part = _first_match(find_maoyan_score1, score_html, None)
        if integer_part is None:
            score = 0.0  # entries without a score are stored as 0.0
        else:
            score = float(
                integer_part + _first_match(find_maoyan_score2, score_html)
            )

        star = _first_match(
            find_maoyan_star, str(item.select("p[class='star']"))
        )
        releasetime = _first_match(
            find_maoyan_date, str(item.select("p[class='releasetime']"))
        )

        data_maoyan_list.append(
            [title, curr_url, pic, score, star, releasetime]
        )
    return data_maoyan_list
def saveData_maoyan(data_list):
    """Insert the scraped rows into ``luke_db.t_movie_top100_maoyan``.

    The statement is parameterized, so quotes or other special characters in
    scraped text can neither break the SQL nor inject into it. The connection
    is always closed, and nothing is committed if any insert fails.

    Args:
        data_list: Sequence of rows laid out as
            ``[title, link, picture link, score, actors, release date]``.
            Each row's position in the sequence is stored in column ``xh``.
            An empty sequence is a no-op and opens no connection.
    """
    if not data_list:
        return

    sql = (
        "insert into luke_db.t_movie_top100_maoyan "
        "(xh,m_title,m_link,m_pic,m_score,m_actor,m_pubdate) "
        "values (%s,%s,%s,%s,%s,%s,%s)"
    )
    rows = [
        (
            index,
            str(row[0]),  # title
            str(row[1]),  # link
            str(row[2]),  # picture link
            row[3],       # score
            str(row[4]),  # leading actors
            str(row[5]),  # release date
        )
        for index, row in enumerate(data_list)
    ]

    conn = pymysql.connect(
        host='xxx.xx.xx.xx',  # host (placeholder)
        # NOTE(review): MySQL's standard port is 3306; 80 is kept from the
        # original -- change it to match the actual server.
        port=80,
        user='root',  # user name
        passwd='123456',  # password (placeholder)
        db='luke_db',  # database name
    )
    try:
        print(conn)
        with conn.cursor() as cur:
            # executemany() returns the number of affected rows.
            print(cur.executemany(sql, rows))
        conn.commit()
    finally:
        # Closing without commit discards a partially applied batch.
        conn.close()
# Start the crawl only when this file is executed as a script.
if __name__ == '__main__':
    main()
來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69946337/viewspace-2771798/,如需轉載,請註明出處,否則將追究法律責任。
相關文章
- Python scrapy爬蟲學習筆記01Python爬蟲筆記
- Python爬蟲教程-01-爬蟲介紹Python爬蟲
- 爬蟲學習日記(六)爬蟲
- 爬蟲學習日記(八)爬蟲
- 爬蟲學習日記(七)爬蟲
- 爬蟲學習日記(二)爬蟲
- 爬蟲學習日記(一)爬蟲
- 爬蟲學習日記(五)爬蟲
- 爬蟲學習日記(三)爬蟲
- 【Python學習】爬蟲爬蟲爬蟲爬蟲~Python爬蟲
- 【python爬蟲】python爬蟲demoPython爬蟲
- 爬蟲學習日記(六)完成第一個爬蟲任務爬蟲
- 爬蟲學習日記(十二)解析PDF爬蟲
- python爬蟲---網頁爬蟲,圖片爬蟲,文章爬蟲,Python爬蟲爬取新聞網站新聞Python爬蟲網頁網站
- Python爬蟲 - 記一次字型反爬Python爬蟲
- 爬蟲學習日記(四)分析Freenium爬蟲
- python就是爬蟲嗎-python就是爬蟲嗎Python爬蟲
- Python爬蟲學習筆記(三)Python爬蟲筆記
- python網路爬蟲筆記(一)Python爬蟲筆記
- python爬蟲學習筆記(二)Python爬蟲筆記
- Python爬蟲:Xpath語法筆記Python爬蟲筆記
- python爬蟲—學習筆記-4Python爬蟲筆記
- python爬蟲—學習筆記-2Python爬蟲筆記
- python爬蟲Python爬蟲
- python 爬蟲Python爬蟲
- 一入爬蟲深似海,總結python爬蟲學習筆記!爬蟲Python筆記
- Python爬蟲基礎-01-帶有請求引數的爬蟲Python爬蟲
- python爬蟲學習01--電子書爬取Python爬蟲
- 3.21日 爬蟲小計爬蟲
- Java爬蟲與Python爬蟲的區別?Java爬蟲Python
- python爬蟲初探--第一個python爬蟲專案Python爬蟲
- Python asyncio 爬蟲Python爬蟲
- python爬蟲2Python爬蟲
- Python爬蟲——XPathPython爬蟲
- Python 爬蟲系列Python爬蟲
- Python爬蟲-xpathPython爬蟲
- Python爬蟲--2Python爬蟲
- python網路爬蟲_Python爬蟲:30個小時搞定Python網路爬蟲視訊教程Python爬蟲