python初級爬蟲之貓眼電影

LY_0614發表於2019-02-23

**逐步爬取貓眼電影排行資訊**

一、抓取一頁的簡單思路

import requests
import re

# Browser-like User-Agent sent with the request.  Adjacent string literals are
# joined by the parser, so no indentation whitespace ends up inside the header
# value (a backslash continuation inside the quotes would embed it).
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/71.0.3578.98 Safari/537.36'
}
# The timeout keeps the script from blocking forever on a stalled connection.
r = requests.get('https://maoyan.com/board/4', headers=headers, timeout=10)
# Raw strings keep `\d` a regex escape instead of an (invalid) string escape.
# Capture groups per <dd> entry: board index, image URL, title, "star" text,
# "releasetime" text, integer part of the score, fractional part of the score.
items = re.findall(
    r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
    r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
    r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
    r.text,
    re.S,  # let `.` span newlines, since each entry covers several lines
)
print(items)

先使用requests庫抓取頁面,再使用正規表示式提取頁面相關的資訊,最後輸出結果:
(原文此處為執行結果的截圖。)

二、整理程式碼,使其更加簡潔和規範,便於分頁抓取。

import requests
import re

def get_page(url):
    """Fetch *url* with a browser-like User-Agent and return the body as text.

    Exceptions raised by ``requests`` (connection errors, timeouts, ...) are
    not caught here and propagate to the caller.
    """
    # Adjacent literals are concatenated by the parser, so the header value
    # contains single spaces only; a backslash continuation inside the quotes
    # would embed the source indentation in the User-Agent.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/71.0.3578.98 Safari/537.36'
    }
    # Without a timeout a stalled server would block this call indefinitely.
    response = requests.get(url, headers=headers, timeout=10)
    return response.text

def parse_page(html):
    """Extract every film entry from *html*.

    Returns a list of 7-tuples of strings, one per ``<dd>...</dd>`` entry, in
    capture-group order: board index, image URL (``data-src``), title, raw
    "star" text, raw "releasetime" text, integer part of the score and
    fractional part of the score.  An input without matches yields ``[]``.
    """
    # Raw string literals: `\d` must reach the regex engine untouched, and in
    # a normal string it is an invalid escape sequence.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # `.` must also match newlines; an entry spans several lines
    )
    return pattern.findall(html)
   
def main():
    """Download the first board page and print the list of parsed tuples."""
    board_url = 'https://maoyan.com/board/4'
    print(parse_page(get_page(board_url)))


main()

上面用自定義函式來規範程式碼,內容基本不變,輸出結果仍為列表型。下面來整理一下輸出的結果:

import requests
import re

def get_page(url):
    """Download *url* and return the response body decoded as text.

    A browser-like User-Agent header is sent with the request.  Errors raised
    by ``requests`` (including timeouts) are left for the caller to handle.
    """
    # Implicit literal concatenation keeps the header free of the indentation
    # whitespace that a backslash continuation inside the quotes would add.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/71.0.3578.98 Safari/537.36'
    }
    # Bound the wait so an unresponsive server cannot hang the script.
    response = requests.get(url, headers=headers, timeout=10)
    return response.text

def parse_page(html):
    """Return the film entries found in *html* as a list of 7-tuples.

    Each tuple holds, in order: board index, image URL (``data-src``), title,
    raw "star" text, raw "releasetime" text, integer part of the score and
    fractional part of the score.  All values are strings; no match gives
    an empty list.
    """
    # Written as raw strings so that `\d` is passed to the regex engine
    # verbatim instead of being treated as an invalid string escape.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # entries span multiple lines, so `.` has to match newlines
    )
    return pattern.findall(html)
    
def main():
    """Download the first board page and print one line per film.

    The first three characters of the stripped "star" text and the first five
    characters of the stripped "releasetime" text are dropped before printing,
    and the two score fragments are joined into one string.
    """
    page = get_page('https://maoyan.com/board/4')
    for rank, image, title, star, release, integer, fraction in parse_page(page):
        actors = star.strip()[3:]
        released = release.strip()[5:]
        score = integer + fraction
        print(rank, image, title, actors, released, score)


main()

(原文此處為整理後輸出結果的截圖。)
三、終極整理,同時完成分頁抓取,共抓取前100名,完整程式碼如下:

import re
import requests
from requests.exceptions import RequestException

 
def get_one_page(url):
    """Fetch *url* and return its body as text, or ``None`` on failure.

    ``None`` is returned when the request raises a ``RequestException``
    (which includes timeouts) or when the response status code is not 200.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                      'AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/71.0.3578.98 Safari/537.36'
    }
    try:
        # The timeout turns a stalled connection into a RequestException
        # instead of letting the call block forever.
        response = requests.get(url, headers=headers, timeout=10)
    except RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
 
def parse_one_page(html):
    """Yield one dict per film entry found in *html*.

    Each dict has the string-valued keys ``index``, ``image``, ``title``,
    ``actor``, ``time`` and ``score``.  ``actor`` is the stripped "star" text
    without its first three characters and ``time`` is the stripped
    "releasetime" text without its first five characters (presumably fixed
    label prefixes in the page markup -- confirm against the live site).
    ``score`` joins the integer and fractional fragments.

    A falsy *html* (``None`` or ``''``) yields nothing, so the result of a
    failed download can be passed in without raising ``TypeError``.
    """
    if not html:
        return
    # Raw strings so `\d` reaches the regex engine rather than being parsed
    # as an invalid string escape.
    pattern = re.compile(
        r'<dd>.*?board-index.*?>(\d+)</i>.*?data-src="(.*?)".*?name"><a'
        r'.*?>(.*?)</a>.*?star">(.*?)</p>.*?releasetime">(.*?)</p>'
        r'.*?integer">(.*?)</i>.*?fraction">(.*?)</i>.*?</dd>',
        re.S,  # `.` must match newlines; an entry spans several lines
    )
    for index, image, title, star, release, integer, fraction in pattern.findall(html):
        yield {
            'index': index,
            'image': image,
            'title': title,
            'actor': star.strip()[3:],
            'time': release.strip()[5:],
            'score': integer + fraction,
        }
 
def main(offset):
    """Fetch the board page starting at *offset* and print each parsed film.

    If the page cannot be fetched, a short notice is printed and the page is
    skipped.
    """
    url = 'https://maoyan.com/board/4?offset=' + str(offset)
    html = get_one_page(url)
    if html is None:
        # get_one_page returns None on a request error or a non-200 status;
        # handing None to the regex parser would raise TypeError.
        print('Failed to fetch ' + url)
        return
    for item in parse_one_page(html):
        print(item)

if __name__ == '__main__':
    # Ten pages in total, requested with offsets 0, 10, ..., 90.
    for page_offset in range(0, 100, 10):
        main(offset=page_offset)

(原文此處為分頁抓取輸出結果的截圖。)
小結:一步一步來做,先理解每一個步驟的原理,懂得相關知識點的用法,才能真正學到東西。最後的程式碼中,使用了requests庫來抓取頁面,並用re庫以正規表示式匹配、提取所需資訊。同時,在輸出端使用一個生成器來完成迭代,使每一筆輸出結果都是字典型。

相關文章