There are four main steps:
1. Run scrapy startproject project_name to create the project skeleton, then run scrapy genspider spider_name 'domain.com' to generate a spider template file (the generated layout is sketched below);
2. Edit the items.py file to declare the data fields you want to extract;
3. Write the spider under the spiders/ directory;
4. Write the pipelines.py file that stores the data, and remember to enable the ITEM_PIPELINES setting in settings.py.
Finally, start crawling:
scrapy crawl spider_name
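For reference, the project generated by startproject and genspider looks roughly like this (file names can vary slightly between Scrapy versions):

project_name/
    scrapy.cfg
    project_name/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spider_name.py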
Note: settings.py configuration
ROBOTSTXT_OBEY = False  # do not obey the robots.txt protocol
USER_AGENT = 'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.6.0 Mobile/14E304 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1'  # user agent setting
The following example crawls data from the Douyu API.
Data API of the Douyu video site:
http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=10
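Before writing the spider, it helps to sanity-check what the API returns. A minimal sketch using only the standard library, assuming the endpoint returns JSON with a top-level 'data' list whose entries carry the 'nickname' and 'room_src' fields used by the spider below:

import json
import urllib.request

# Fetch one page of the Douyu API and print a few fields to inspect the payload.
url = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset=0'
with urllib.request.urlopen(url) as resp:
    payload = json.loads(resp.read().decode('utf-8'))

for room in payload.get('data', [])[:3]:
    print(room.get('nickname'), room.get('room_src'))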
items.py file:
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html
import scrapy


class DouyuItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    nickname = scrapy.Field()  # streamer nickname
    link = scrapy.Field()      # room cover image URL (room_src)
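scrapy.Item instances behave like dicts, which is how the spider below fills them in; a quick illustration with made-up values:

item = DouyuItem()
item['nickname'] = 'some_streamer'              # hypothetical value
item['link'] = 'https://example.com/cover.jpg'  # hypothetical value
print(dict(item))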
douyu_spider.py file:
# -*- coding: utf-8 -*-
import scrapy
import json
from douyu.items import DouyuItem


class DouyuSpiderSpider(scrapy.Spider):
    # spider name used by "scrapy crawl"
    name = 'douyu_spider'
    # allowed domains (optional)
    allowed_domains = ['douyucdn.cn']
    # pieces used to assemble the request url
    base_url = 'http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset='
    offset = 0
    # urls to start crawling from
    start_urls = [base_url + str(offset)]

    # parse callback
    def parse(self, response):
        # response.body is raw bytes, so decode it to utf-8 before parsing the JSON
        results = json.loads(response.body.decode('utf-8'))['data']
        if len(results) == 0:
            # no more data: stop scheduling further pages
            print('crawling over! spider stop!')
            return
        for li in results:
            item = DouyuItem()
            item['nickname'] = li['nickname']
            item['link'] = li['room_src']
            # yield the item to the pipelines
            yield item
        # follow the next page
        self.offset += 20
        href = self.base_url + str(self.offset)
        # note: yield a scrapy.Request() here so Scrapy schedules the next page
        yield scrapy.Request(href, callback=self.parse)
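The spider above simply returns when the API stops sending data. If you prefer to shut the spider down explicitly, Scrapy also offers the CloseSpider exception; a minimal sketch of that variant of the empty-result branch:

from scrapy.exceptions import CloseSpider

# inside parse(), instead of the plain return:
if len(results) == 0:
    raise CloseSpider('no more data returned by the Douyu API')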
imagepipelines.py file:
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import scrapy
import os
from scrapy.pipelines.images import ImagesPipeline
# used to read the constants defined in settings.py
from scrapy.utils.project import get_project_settings


class ImagePipeline(ImagesPipeline):

    def get_media_requests(self, item, info):
        # note: yield a Request for each image URL to download
        yield scrapy.Request(item['link'])

    def item_completed(self, results, item, info):
        '''format of results:
        [(True, {'checksum': '1e2cc73f256eff17f2d6a69388421f97',
                 'path': 'full/d8de0a55eb41d9a4ac4a39a0d1fc008f360d8b98.jpg',
                 'url': 'https://rpic.douyucdn.cn/appCovers/2017/09/24/630154_20170924201445_small.jpg'})]
        '''
        # read the constants defined in settings.py
        settings = get_project_settings()
        images_folder = settings['IMAGES_STORE']
        # relative paths of the successfully downloaded images, e.g.
        # ['full/b008d9e23bdb013f672ca7527d704e049bf0c87c.jpg']
        data = [x['path'] for ok, x in results if ok]
        if not data:
            # download failed, keep the item unchanged
            return item
        # rename the downloaded image after the streamer's nickname
        image_ext = data[0].split('.')[-1]
        os.rename(images_folder + data[0], images_folder + item['nickname'] + '.' + image_ext)
        os.rmdir(images_folder + 'full')
        # return the item so it is passed on to the next pipeline
        return item
Note: this file subclasses Scrapy's image-download pipeline ImagesPipeline and overrides get_media_requests() and item_completed(). ImagesPipeline relies on the Pillow library, so make sure it is installed.
After writing the imagepipelines.py file, configure settings.py:
# directory where downloaded images are stored
IMAGES_STORE = './images/'

USER_AGENT = 'Mozilla/5.0 (iPhone 6s; CPU iPhone OS 10_3_1 like Mac OS X) AppleWebKit/603.1.30 (KHTML, like Gecko) Version/10.0 MQQBrowser/7.6.0 Mobile/14E304 Safari/8536.25 MttCustomUA/2 QBWebViewType/1 WKType/1'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

ITEM_PIPELINES = {
    'douyu.pipelines.DouyuPipeline': 300,
    'douyu.imagepipelines.ImagePipeline': 100,
}
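ITEM_PIPELINES above also registers douyu.pipelines.DouyuPipeline, whose code is not listed in this post. A minimal sketch of what such a data-saving pipeline in pipelines.py could look like (the JSON-lines output file name is an assumption):

# -*- coding: utf-8 -*-
import json


class DouyuPipeline(object):
    # hypothetical example: append each item to a JSON-lines file

    def open_spider(self, spider):
        self.file = open('douyu_rooms.json', 'a', encoding='utf-8')

    def process_item(self, item, spider):
        line = json.dumps(dict(item), ensure_ascii=False) + '\n'
        self.file.write(line)
        return item

    def close_spider(self, spider):
        self.file.close()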