Python 爬取貓眼電影 Top100 並儲存到 CSV

sixkery發表於2018-08-25

程式碼沒什麼技術含量,希望能幫到剛入門的小白。

import requests
import re,json
from lxml import etree
import csv

class Spider():
    """Fetch the Maoyan board pages and append the extracted movie rows to a CSV file.

    Each row holds four columns: title, star, date and score (see ``HEADER``).
    """

    # Output file and the encoding used for every write to it. An explicit
    # encoding keeps the output independent of the platform's default locale.
    CSV_PATH = "data.csv"
    CSV_ENCODING = "utf-8"
    HEADER = ["title", "star", "date", "score"]

    # The board is paginated through the ``offset`` query parameter.
    BASE_URL = "http://maoyan.com/board/4?offset={}"
    PAGE_SIZE = 10
    # 10 pages x 10 entries = offsets 0..90, i.e. exactly the top 100.
    PAGE_COUNT = 10

    # Seconds to wait for the server before giving up on a page.
    TIMEOUT = 10

    def open_csv(self):
        """
        Write the header row to the CSV file if the file has no content yet.

        The file is opened in append mode so existing rows are kept; the
        header is skipped when the file already contains data, which avoids
        a second header row appearing in the middle of the file on re-runs.

        :return: None
        """
        with open(self.CSV_PATH, "a", newline="", encoding=self.CSV_ENCODING) as f:
            # In append mode the stream is positioned at the end of the file,
            # so a position of 0 means the file is new or empty.
            if f.tell() == 0:
                csv.writer(f).writerow(self.HEADER)

    def __get_page(self, url, headers):
        """
        Download a page and return its text.

        :param url: address of the page to fetch
        :param headers: HTTP headers sent with the request
        :return: the response body as text, or None when the request fails
                 or the status code is not 200
        """
        try:
            # Without a timeout a stalled connection would block forever.
            response = requests.get(url, headers=headers, timeout=self.TIMEOUT)
        except requests.RequestException:
            return None
        if response.status_code == 200:
            return response.text
        return None

    @staticmethod
    def _first_text(node, path):
        """Return the first result of evaluating *path* on *node*, or "" if there is none."""
        found = node.xpath(path)
        return found[0] if found else ""

    def __parse_page(self, html):
        """
        Parse the HTML of one board page and append the extracted rows to the CSV file.

        :param html: page source as text; None or an empty string is ignored
        :return: None
        """
        if not html:
            # A failed download yields None; there is nothing to parse.
            return

        data = etree.HTML(html)
        if data is None:
            # Defensive: no element tree was produced, so nothing can be extracted.
            return

        first = self._first_text
        rows = []
        for result in data.xpath('//*[@class="board-wrapper"]/dd/div/div'):
            # Columns: movie title, starring cast, release date, score.
            # A missing field becomes "" instead of aborting the whole scrape.
            rows.append([
                first(result, './div[1]/p[1]/a/text()'),
                first(result, './div[1]/p[2]/text()').strip(),
                first(result, './div[1]/p[3]/text()'),
                # The score text is spread over two <i> elements; join them.
                first(result, './div[2]/p/i[1]/text()') + first(result, './div[2]/p/i[2]/text()'),
            ])

        if not rows:
            return

        # Save to CSV: open the file once per page rather than once per row.
        with open(self.CSV_PATH, "a", newline="", encoding=self.CSV_ENCODING) as f:
            csv.writer(f).writerows(rows)

    def run(self):
        """
        Program entry point: write the CSV header, then fetch and store every board page.

        :return: None
        """
        self.open_csv()

        # The headers do not change between pages, so build them once.
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"
                          " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36",
        }

        for page in range(self.PAGE_COUNT):
            url = self.BASE_URL.format(self.PAGE_SIZE * page)
            html = self.__get_page(url, headers)
            if html is None:
                # Skip pages that could not be downloaded and keep going.
                continue
            self.__parse_page(html)

# Instantiate the class and start the scrape
spider = Spider()
spider.run()


相關文章