This uses selenium and BeautifulSoup.
First, as always, the most basic initialization code:
baseURL = "http://xwxmovie.cn/"
headers = {
    'Host': 'xwxmovie.cn',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/64.0.3282.167 Safari/537.36'
}

def browser_get():
    browser = webdriver.Chrome()
    browser.get(baseURL)
    html_text = browser.page_source
    page_count = get_page_count(html_text)
    get_page_data(html_text)
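Note that the headers dict is never actually passed to webdriver.Chrome(); selenium drives a real browser that sends its own headers. If you wanted the headers to take effect, a plain requests call would look roughly like the sketch below (using requests here is my own assumption, not part of the original script):

import requests

# Hypothetical alternative: fetch the page with requests so the headers dict is actually used
response = requests.get(baseURL, headers=headers)
response.encoding = response.apparent_encoding  # guard against mis-detected encoding
html_text = response.text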
At first I wanted to extract the fragments with BeautifulSoup alone, but since I had only just started learning it and was not yet familiar with many of its APIs, I ended up using a regular expression to match the region I wanted first, then using BeautifulSoup to pick out the movie title and other details:
items = re.findall(re.compile('<div id="post-.*?class="post-.*?style="position:.*?>'
                              '.*?<div class="pinbin-image">(.*?)</div>'
                              '.*?<div class="pinbin-category">(.*?)</div>'
                              '.*?<div class="pinbin-copy">(.*?)</div>'
                              '.*?</div>', re.S), html)
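For reference, the same region could also be selected directly with BeautifulSoup instead of a regular expression. A minimal sketch, assuming the class names pinbin-image / pinbin-category / pinbin-copy seen in the regex above (html is the page source from selenium):

import re
from bs4 import BeautifulSoup

soup = BeautifulSoup(html, 'html.parser')
# each movie card is a div whose id starts with "post-"
for post in soup.find_all('div', id=re.compile(r'^post-')):
    image_div = post.find('div', class_='pinbin-image')        # corresponds to group 1 of the regex
    category_div = post.find('div', class_='pinbin-category')  # corresponds to group 2
    copy_div = post.find('div', class_='pinbin-copy')          # corresponds to group 3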
Back to the regex result: now we loop over the matches one by one and pull out what we want:
for item in items:
    if item[0].strip():
        soup = BeautifulSoup(item[0].strip(), 'html.parser')
        img = soup.find('img', attrs={'class': 'attachment-detail-image wp-post-image'})
        # poster image
        print("Poster: " + img.get('src'))
    if item[1].strip():
        soup = BeautifulSoup(item[1].strip(), 'html.parser')
        categorys = soup.find_all('a')
        for category in categorys:
            print(category.get_text())
    if item[2].strip():
        soup = BeautifulSoup(item[2].strip(), 'html.parser')
        title = soup.find('a', attrs={'class': 'front-link'})
        print("Title: " + title.get_text())
        print("Link: " + title.get('href'))
        date = soup.find('p', attrs={'class': 'pinbin-date'})
        print("Date: " + date.get_text())
        brief = soup.find_all('p')
        print("Synopsis: " + brief[1].string)
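Instead of printing each field, the same loop could collect the results into a list of dicts for later use. A rough sketch with my own key names and some defensive None checks (not in the original code):

# Hypothetical variant: gather the fields instead of printing them
movies = []
for item in items:
    movie = {}
    if item[0].strip():
        soup = BeautifulSoup(item[0].strip(), 'html.parser')
        img = soup.find('img', attrs={'class': 'attachment-detail-image wp-post-image'})
        if img is not None:  # skip posts without a poster image
            movie['poster'] = img.get('src')
    if item[2].strip():
        soup = BeautifulSoup(item[2].strip(), 'html.parser')
        title = soup.find('a', attrs={'class': 'front-link'})
        if title is not None:
            movie['title'] = title.get_text()
            movie['url'] = title.get('href')
    movies.append(movie)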
Either way, that gives us the data for a single page.
If we want all of it, we first need the total number of pages and then loop over every page:
# Get the total page count
def get_page_count(html):
    soup = BeautifulSoup(html, 'html.parser')
    page_count = soup.find('span', attrs={'class': 'pages'})
    # slice the two-digit total out of the end of the pager text
    return int(page_count.get_text()[-4:-2])
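browser_get above only parses the first page and never actually uses page_count. To crawl every page you would loop from 2 to page_count and load each page in turn; a sketch assuming WordPress-style /page/N/ URLs (that URL pattern is my assumption, check it against the actual site):

def browser_get_all():
    browser = webdriver.Chrome()
    browser.get(baseURL)
    html_text = browser.page_source
    page_count = get_page_count(html_text)
    get_page_data(html_text)  # page 1
    for page in range(2, page_count + 1):
        # assumed pagination URL: http://xwxmovie.cn/page/2/, /page/3/, ...
        browser.get(baseURL + "page/%d/" % page)
        get_page_data(browser.page_source)
    browser.quit()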
The complete code is as follows:
# -*- coding: UTF-8 -*-
from selenium import webdriver
from bs4 import BeautifulSoup
import re

baseURL = "http://xwxmovie.cn/"
# kept for reference only; webdriver.Chrome() does not use this dict
headers = {
    'Host': 'xwxmovie.cn',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_3) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/64.0.3282.167 Safari/537.36'
}


def browser_get():
    browser = webdriver.Chrome()
    browser.get(baseURL)
    html_text = browser.page_source
    page_count = get_page_count(html_text)
    get_page_data(html_text)


# Get the total page count
def get_page_count(html):
    soup = BeautifulSoup(html, 'html.parser')
    page_count = soup.find('span', attrs={'class': 'pages'})
    return int(page_count.get_text()[-4:-2])


def get_page_data(html):
    items = re.findall(re.compile('<div id="post-.*?class="post-.*?style="position:.*?>'
                                  '.*?<div class="pinbin-image">(.*?)</div>'
                                  '.*?<div class="pinbin-category">(.*?)</div>'
                                  '.*?<div class="pinbin-copy">(.*?)</div>'
                                  '.*?</div>', re.S), html)
    for item in items:
        if item[0].strip():
            soup = BeautifulSoup(item[0].strip(), 'html.parser')
            img = soup.find('img', attrs={'class': 'attachment-detail-image wp-post-image'})
            # poster image
            print("Poster: " + img.get('src'))
        if item[1].strip():
            soup = BeautifulSoup(item[1].strip(), 'html.parser')
            categorys = soup.find_all('a')
            for category in categorys:
                print(category.get_text())
        if item[2].strip():
            soup = BeautifulSoup(item[2].strip(), 'html.parser')
            title = soup.find('a', attrs={'class': 'front-link'})
            print("Title: " + title.get_text())
            print("Link: " + title.get('href'))
            date = soup.find('p', attrs={'class': 'pinbin-date'})
            print("Date: " + date.get_text())
            brief = soup.find_all('p')
            print("Synopsis: " + brief[1].string)


if __name__ == '__main__':
    browser_get()