建立爬蟲專案

循序0010發表於2017-10-15
主要四個步驟:
1.執行scrapy startproject project_name 建立專案框架
  執行 scrapy genspider spider_name 'domain.com'建立爬蟲基本格式檔案;
2.編輯items.py檔案,明確要獲取的資料欄位;
3.編寫spiders/目錄下的爬蟲程式;
4.編寫儲存資料的pipelines.py檔案,注意:需開啟settings.py檔案的ITEM_PIPELINES配置;
最後,爬取:
scrapy crawl spider_name
注意:settings.py 檔案設定
ROBOTSTXT_OBEY = False #不遵從robot協議
USER_AGENT='Mozilla/5.0 (iPhone 6s; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.6.0 Mobile/14E304 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1'  #user_agent設定

以下抓取鬥魚api資料為例子:
爬取鬥魚視訊網站資料介面
http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=10

items.py檔案:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy

class DouyuItem(scrapy.Item):
    """Container for a single Douyu live-room record.

    Fields:
        nickname: the streamer's display name (used to rename the image file).
        link: URL of the room's cover image (``room_src`` in the API response).
    """

    link = scrapy.Field()      # image URL consumed by the image pipeline
    nickname = scrapy.Field()  # target file name for the downloaded image


douyu_spider.py檔案:
# -*- coding: utf-8 -*-
import scrapy
import json
from douyu.items import DouyuItem

class DouyuSpiderSpider(scrapy.Spider):
    """Crawl the Douyu mobile API and yield one DouyuItem per live room.

    Pages through ``getVerticalRoom`` 20 records at a time until the API
    returns an empty ``data`` list, then simply stops yielding so the
    engine can close the spider cleanly.
    """

    # Spider name used by `scrapy crawl douyu_spider`.
    name = 'douyu_spider'
    # Restrict the crawl to the API host (optional for pure API crawls).
    allowed_domains = ['douyucdn.cn']
    # Paginated API endpoint; `offset` advances by the page size (20).
    base_url = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    offset = 0
    # First page to fetch.
    start_urls = [base_url + str(offset)]

    def parse(self, response):
        """Parse one API page: yield an item per room, then follow the next page."""
        # response.text decodes the body using the detected charset, which is
        # safer than hard-coding utf-8 via response.body.decode().
        results = json.loads(response.text)['data']
        if not results:
            # Empty page means the listing is exhausted. Returning here
            # (instead of calling exit(), which would kill the whole process
            # without a clean Scrapy shutdown) lets the engine finish
            # normally once no requests remain.
            self.logger.info('crawling over! spider stop!')
            return

        for room in results:
            item = DouyuItem()
            item['nickname'] = room['nickname']
            item['link'] = room['room_src']
            # Hand the item to the pipelines.
            yield item

        # Schedule the next page of the API.
        self.offset += 20
        next_url = self.base_url + str(self.offset)
        yield scrapy.Request(next_url, callback=self.parse)


imagepipelines.py檔案:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
import os
from scrapy.pipelines.images import ImagesPipeline
#匯入setting.py檔案的常量
from scrapy.utils.project import get_project_settings

class ImagePipeline(ImagesPipeline):
    """Download each item's cover image and rename it after the streamer.

    Extends Scrapy's ImagesPipeline, overriding get_media_requests() to
    request the URL stored in item['link'], and item_completed() to rename
    the downloaded file from its content-hash name under 'full/' to
    '<nickname>.<ext>' directly inside IMAGES_STORE.
    """

    def get_media_requests(self, item, info):
        # One download request per item; the outcome arrives in
        # item_completed() via `results`.
        yield scrapy.Request(item['link'])

    def item_completed(self, results, item, info):
        """Rename the downloaded image, then pass the item on unchanged.

        `results` has the shape:
        [(True, {'checksum': '...', 'path': 'full/<sha1>.jpg', 'url': '...'})]
        """
        # Read IMAGES_STORE from the project settings.
        settings = get_project_settings()
        images_folder = settings['IMAGES_STORE']

        paths = [result['path'] for ok, result in results if ok]
        if not paths:
            # Download failed: keep the item instead of crashing on paths[0].
            return item

        # Rename 'full/<sha1>.<ext>' to '<nickname>.<ext>', keeping the
        # original file extension.
        ext = os.path.splitext(paths[0])[1]
        src = os.path.join(images_folder, paths[0])
        dst = os.path.join(images_folder, item['nickname'] + ext)
        os.rename(src, dst)

        try:
            # Best-effort cleanup of the 'full/' directory. It is only empty
            # once every image has been renamed out, so ignore the OSError
            # raised while other items' files are still inside.
            os.rmdir(os.path.join(images_folder, 'full'))
        except OSError:
            pass

        # Must return the item so later pipelines still receive it.
        return item
#注意,此檔案繼承scrapy的圖片下載類ImagesPipeline,並重寫了get_media_requests() 和 item_completed()

編寫好了imagepipelines.py 檔案後,需設定settings.py檔案:
# Directory where ImagesPipeline stores downloaded images.
IMAGES_STORE = './images/'
# Mobile-browser User-Agent so the mobile API endpoint responds normally.
USER_AGENT = 'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.6.0 Mobile/14E304 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Enabled pipelines; lower numbers run first, so the image pipeline (100)
# downloads before DouyuPipeline (300) processes the item.
ITEM_PIPELINES = {
   'douyu.pipelines.DouyuPipeline': 300,
   'douyu.imagepipelines.ImagePipeline': 100,
}

相關文章