豆瓣top250資料爬取

qq_21959759發表於2020-11-09

import requests
import re
import pandas as pd
from pyquery import PyQuery as pq
# Browser-like User-Agent header passed to requests.get() for both the
# list pages and the movie detail pages.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/81.0.4044.129 Safari/537.36'
    ),
}

# Collect the detail-page link of every entry on the Top 250 list.
# The list is requested in steps of 25 through the `start` query parameter.
urls = []
for tag in range(0, 250, 25):
    url = f'https://movie.douban.com/top250?start={tag}'
    # A timeout keeps one stalled connection from hanging the whole crawl.
    html = requests.get(url, headers=headers, timeout=10).text
    doc = pq(html)
    items = doc('#content > div > div.article > ol > li').items()
    for item in items:
        href = item('.hd > a').attr('href')
        # Entries without a link are skipped so that no None value is
        # later handed to requests.get().
        if href:
            urls.append(href)

# Patterns for the "label: value" rows of the movie info panel, compiled once
# instead of on every call.
# NOTE(review): the markup around the capture group (`</span>` ... `<br/>`) is
# reconstructed -- the listing this script came from had the patterns broken
# across lines with the tags missing. Both the Simplified and the Traditional
# spelling of each label are accepted; confirm against a live detail page.
_COUNTRY_RE = re.compile(
    r'(?:制片国家/地区|製片國家/地區):</span>(.*?)<br\s*/?>', re.S)
_LANGUAGE_RE = re.compile(
    r'(?:语言|語言):</span>(.*?)<br\s*/?>', re.S)

# Result columns for the '.rating_per' elements: element 0 is stored as
# 'five_star_ratio', element 4 as 'one_star_ratio'.
_STAR_RATIO_KEYS = (
    'five_star_ratio',
    'four_star_ratio',
    'three_star_ratio',
    'two_star_ratio',
    'one_star_ratio',
)


def parase_page(url):
    """Scrape one movie detail page and append its fields to ``res``.

    Args:
        url: Address of the detail page to download.

    Returns:
        None. One value is appended to each column list of the module-level
        ``res`` dict, and the newly added row is printed.

    Raises:
        requests.RequestException: If the download fails or times out.
        KeyError: If ``res`` lacks one of the expected columns.

    Fields that cannot be found on the page are stored as empty strings.
    """
    html = requests.get(url, headers=headers, timeout=10).text
    doc = pq(html)

    # Pad the star breakdown so a page without the full set of '.rating_per'
    # elements yields empty strings instead of an IndexError.
    ratios = [span.text() for span in doc('.rating_per').items()]
    ratios += [''] * (len(_STAR_RATIO_KEYS) - len(ratios))

    # Build the complete row before touching ``res`` so that a failure while
    # extracting cannot leave the columns with different lengths.
    row = {
        'rank': doc('.top250-no').text(),
        'rating': doc('.rating_num').text(),
        'name': doc('#content > h1 > span[property="v:itemreviewed"]').text(),
        'type': '/'.join(
            span.text() for span in doc('span[property="v:genre"]').items()),
        'country': '/'.join(
            value.strip() for value in _COUNTRY_RE.findall(html)),
        'language': '/'.join(
            value.strip() for value in _LANGUAGE_RE.findall(html)),
        # Only the first four characters (the year) of the release date text.
        'date': doc(
            '#info > span[property="v:initialReleaseDate"]').text()[:4],
        'run_time': doc('span[property="v:runtime"]').text(),
        'comments_user': doc('span[property="v:votes"]').text(),
    }
    row.update(zip(_STAR_RATIO_KEYS, ratios))

    for key, value in row.items():
        res[key].append(value)

    # Progress output: the row just added, in the column order of ``res``.
    print([column[-1] for column in res.values()])

# Column-oriented result table: each key is a column name and each list
# receives one value per scraped movie (filled by parase_page).
res = {
    'rank': [],
    'name': [],
    'type': [],
    'country': [],
    'language': [],
    'date': [],
    'run_time': [],
    'rating': [],
    'comments_user': [],
    'five_star_ratio': [],
    'four_star_ratio': [],
    'three_star_ratio': [],
    'two_star_ratio': [],
    'one_star_ratio': [],
}
# Show the column names before the per-movie rows are printed.
print(list(res.keys()))
# Scrape every collected detail page; each call appends one row to ``res``.
for url in urls:
    parase_page(url)

# Turn the column lists into a table and write it out as a spreadsheet.
df = pd.DataFrame(res)

# NOTE(review): writing the legacy '.xls' format depends on an Excel writer
# engine that recent pandas releases may no longer provide -- if this raises,
# switch the extension to '.xlsx'. Confirm against the installed pandas.
df.to_excel('豆瓣電影Top250.xls')

豆瓣榜單top250

發給某些人做畢業設計想白嫖.
需要資料的聯絡q:940755193

相關文章