Python Crawler Diary 01

Posted by 專注的阿熊 on 2021-05-11

import random
from time import sleep
import pymysql
from bs4 import BeautifulSoup
import re  # regular expressions, used to pull fields out of HTML snippets
import urllib.request, urllib.error  # build the request and fetch the page content

def main():
    parser_url_save()


def parser_url_save():
    print("doing parser url ....")
    # Maoyan TOP100 board (assumed URL; adjust if the board address differs)
    # Douban (handled by the else branch below)
    urls = ['https://maoyan.com/board/4?offset=']
    for url in urls:
        if url.find('maoyan') != -1:  # assumed check: Maoyan URLs go through parser_html
            datalist = parser_html(url)
            # datalist = getData_maoyan(html)
            # print(datalist)
            saveData_maoyan(datalist)
        else:
            # parser_DOUBAN(url)
            print("parser douban ...")

# Regex patterns
# Maoyan: title
find_maoyan_title = re.compile(r'.*?title="(.*?)"')
# Maoyan: link
find_maoyan_link = re.compile(r'.*?href="(.*?)"')
# Maoyan: poster image
find_maoyan_pic = re.compile(r'.*?<img.*?data-src="(.*?)"')
# Maoyan: score (integer part and fraction part)
find_maoyan_score1 = re.compile(r'<p><i>(.*?)<')
find_maoyan_score2 = re.compile(r'</i><i>(.*?)<')
# Leading actors
find_maoyan_star = re.compile(r'.* 主演: (.*)')
# Release date
find_maoyan_date = re.compile(r' 上映時間: (.*)<')
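# Quick spot-check of the title/link/picture patterns above against a made-up
# snippet (illustrative only; this is not the real Maoyan markup, and the
# href/title/data-src values are invented):
_sample = '<a href="/films/1" title="Some Movie"><img data-src="http://p0.example.com/poster.jpg"></a>'
assert re.findall(find_maoyan_title, _sample) == ['Some Movie']
assert re.findall(find_maoyan_link, _sample) == ['/films/1']
assert re.findall(find_maoyan_pic, _sample) == ['http://p0.example.com/poster.jpg']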

def parser_html(url):
    cookie = '###'
    # agent = random.choice(user_agent)
    agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.93 Safari/537.36"
    data_maoyan_list = []
    # TOP100 board: 10 pages of 10 movies (offset = 0, 10, ..., 90)
    for i in range(0, 10):
        sleep(3)  # throttle requests between pages
        url_tmp = url + str(i * 10)
        headers = {
            "User-Agent": agent,
            "Cookie": cookie
        }
        req = urllib.request.Request(url_tmp, headers=headers)
        response = urllib.request.urlopen(req)
        html = response.read().decode("utf-8")
        data_maoyan_list = getData_maoyan(html, data_maoyan_list)
    return data_maoyan_list

def getData_maoyan(html, data_maoyan_list):
    html_parser = BeautifulSoup(html, "html.parser")
    base_url = 'https://maoyan.com'  # assumed site root; the list-page links are relative
    item_list = html_parser.find_all('dd')
    for item in item_list:
        sleep(1)  # slow down between items
        # fields for a single movie
        data = []
        item_a = str(item.a)
        # title
        title = re.findall(find_maoyan_title, item_a)[0]
        # detail-page link
        curr_url = base_url + str(re.findall(find_maoyan_link, item_a)[0])
        # poster image link
        pic = re.findall(find_maoyan_pic, item_a)[0]
        # score
        item_p = item.select("p[class='score']")
        # if i * 10 == 20:
        #     print(item_p)
        score = "0.0"  # default to 0.0 when a movie has no score yet
        if len(re.findall(find_maoyan_score1, str(item_p))) > 0:
            score = float(str(re.findall(find_maoyan_score1, str(item_p))[0]) + str(
                re.findall(find_maoyan_score2, str(item_p))[0]))
        # leading actors  <p>
        item_star = item.select("p[class='star']")
        # print(str(item_star))
        star = re.findall(find_maoyan_star, str(item_star))[0]
        # release date  <p>
        item_releasetime = item.select("p[class='releasetime']")
        releasetime = re.findall(find_maoyan_date, str(item_releasetime))[0]
        # append the fields: title, curr_url, pic, score, star, releasetime
        data.append(title)
        data.append(curr_url)
        data.append(pic)
        data.append(score)
        data.append(star)
        data.append(releasetime)
        data_maoyan_list.append(data)
    return data_maoyan_list

def saveData_maoyan(data_list):
    conn = pymysql.connect(
        host='xxx.xx.xx.xx',  # host
        port=80,              # port, adjust to your MySQL setup
        user='root',          # username
        passwd='123456',      # password
        db='luke_db',         # database name
    )
    cur = conn.cursor()
    print(conn)
    # walk through the scraped rows
    for id in range(0, len(data_list)):
        # pull out the fields; string values are wrapped in double quotes for the SQL below
        ind_id = str(id)
        title = '"' + str(data_list[id][0]) + '"'     # title
        link = '"' + str(data_list[id][1]) + '"'      # detail-page link
        pic_link = '"' + str(data_list[id][2]) + '"'  # poster image link
        score = str(data_list[id][3])                 # score
        actor = '"' + str(data_list[id][4]) + '"'     # leading actors
        pub_date = '"' + str(data_list[id][5]) + '"'  # release date
        arr = [ind_id, title, link, pic_link, score, actor, pub_date]
        sql = '''
            insert into luke_db.t_movie_top100_maoyan (xh,m_title,m_link,m_pic,m_score,m_actor,m_pubdate)
             values(%s)''' % ",".join(arr)
        print(sql)
        print(cur.execute(sql))
        conn.commit()  # commit each insert
    cur.close()
    conn.close()

if __name__ == '__main__':
    main()
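The INSERT in saveData_maoyan assumes that the table luke_db.t_movie_top100_maoyan already exists. The column names are taken from that statement, but the column types below are my own guesses (a numeric score, text for everything else), so treat this as a minimal sketch of a one-time setup helper rather than the exact schema behind the original run:

import pymysql

# Minimal sketch of the target table; column names match the INSERT statement,
# column types are assumptions.
CREATE_TABLE_SQL = """
create table if not exists luke_db.t_movie_top100_maoyan (
    xh        int,            -- row number
    m_title   varchar(255),   -- title
    m_link    varchar(512),   -- detail-page link
    m_pic     varchar(512),   -- poster image link
    m_score   decimal(3, 1),  -- score, e.g. 9.5
    m_actor   varchar(255),   -- leading actors
    m_pubdate varchar(64)     -- release date text as scraped
)
"""

def create_table():
    conn = pymysql.connect(host='xxx.xx.xx.xx', port=80, user='root',
                           passwd='123456', db='luke_db')
    try:
        with conn.cursor() as cur:
            cur.execute(CREATE_TABLE_SQL)
        conn.commit()
    finally:
        conn.close()

Run it once before the first crawl; saveData_maoyan can then insert rows as-is.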


From the ITPUB blog, link: http://blog.itpub.net/69946337/viewspace-2771798/. Please credit the source when reposting.
