1. Custom pipelines
# One: you can write multiple Pipeline classes
#1. If a higher-priority pipeline's process_item returns a value (or None), it is automatically passed to the next pipeline's process_item.
#2. If you want only the first pipeline to run, its process_item must raise DropItem() so the item never reaches the later pipelines.
#3. You can check spider.name == 'spider name' to control which spiders go through which pipelines (see the routing sketch after this example).

# Two: demonstration

from scrapy.exceptions import DropItem

class CustomPipeline(object):
    def __init__(self, v):
        self.value = v

    @classmethod
    def from_crawler(cls, crawler):
        """
        Scrapy first uses getattr to check whether we defined from_crawler;
        if we did, it calls it to build the instance.
        """
        val = crawler.settings.getint('MMMM')
        return cls(val)

    def open_spider(self, spider):
        """Runs once, right after the spider starts."""
        print('000000')

    def close_spider(self, spider):
        """Runs once, when the spider closes."""
        print('111111')

    def process_item(self, item, spider):
        # do the work / persist the item here
        # returning the item lets the later pipelines keep processing it
        return item
        # raise DropItem() instead to drop the item so later pipelines never see it
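To make notes #2 and #3 concrete, here is a minimal routing sketch that is not part of the original example: two pipelines where the first one only handles a particular spider and uses DropItem to stop further processing. The spider name 'amazon' and the class names MongoPipeline / JsonPipeline are invented for illustration.

from scrapy.exceptions import DropItem

class MongoPipeline(object):
    """Higher priority (smaller number in ITEM_PIPELINES), so it runs first."""
    def process_item(self, item, spider):
        if spider.name != 'amazon':        # note #3: only handle the 'amazon' spider
            return item                    # pass the item through untouched
        # ... persist the item somewhere ...
        raise DropItem('handled by MongoPipeline')   # note #2: stop later pipelines

class JsonPipeline(object):
    """Lower priority, so it only sees items the first pipeline did not drop."""
    def process_item(self, item, spider):
        # ... e.g. append the item to a JSON lines file ...
        return item                        # note #1: returning passes the item on again

# settings.py -- a smaller number means higher priority
ITEM_PIPELINES = {
    'Amazon.pipelines.MongoPipeline': 200,
    'Amazon.pipelines.JsonPipeline': 300,
}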
Example
#1. settings.py
HOST = "127.0.0.1"
PORT = 27017
USER = "root"
PWD = "123"
DB = "amazon"
TABLE = "goods"

ITEM_PIPELINES = {
    'Amazon.pipelines.CustomPipeline': 200,
}

#2. pipelines.py
from pymongo import MongoClient   # this import was missing from the original snippet

class CustomPipeline(object):
    def __init__(self, host, port, user, pwd, db, table):
        self.host = host
        self.port = port
        self.user = user
        self.pwd = pwd
        self.db = db
        self.table = table

    @classmethod
    def from_crawler(cls, crawler):
        """
        Scrapy first uses getattr to check whether we defined from_crawler;
        if we did, it calls it to build the instance.
        """
        HOST = crawler.settings.get('HOST')
        PORT = crawler.settings.get('PORT')
        USER = crawler.settings.get('USER')
        PWD = crawler.settings.get('PWD')
        DB = crawler.settings.get('DB')
        TABLE = crawler.settings.get('TABLE')
        return cls(HOST, PORT, USER, PWD, DB, TABLE)

    def open_spider(self, spider):
        """Runs once, right after the spider starts."""
        self.client = MongoClient('mongodb://%s:%s@%s:%s' % (self.user, self.pwd, self.host, self.port))

    def close_spider(self, spider):
        """Runs once, when the spider closes."""
        self.client.close()

    def process_item(self, item, spider):
        # persist the item to MongoDB
        # (Collection.save() was removed in pymongo 4; use insert_one() there)
        self.client[self.db][self.table].save(dict(item))
        return item   # return the item so any later pipelines still receive it
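The pipeline above assumes the spider yields items whose fields match the "goods" collection. Below is a minimal, hypothetical item and spider showing how such items would reach it; GoodsItem, its fields, the start URL, and the CSS selectors are all assumptions for illustration, not part of the original post.

# items.py -- a hypothetical item the pipeline above could receive
import scrapy

class GoodsItem(scrapy.Item):
    title = scrapy.Field()
    price = scrapy.Field()

# spiders/amazon.py -- a hypothetical spider that feeds the pipeline
import scrapy
from Amazon.items import GoodsItem

class AmazonSpider(scrapy.Spider):
    name = 'amazon'
    start_urls = ['https://example.com/goods']    # placeholder URL

    def parse(self, response):
        for node in response.css('div.goods'):    # selector is illustrative only
            item = GoodsItem()
            item['title'] = node.css('h2::text').get()
            item['price'] = node.css('.price::text').get()
            yield item    # every yielded item is routed through ITEM_PIPELINES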
2. Custom extensions
Custom extensions (similar to Django's signals; a short Django sketch follows below)
1. Django's signals are extension points that Django reserves in advance: once a signal is triggered, the function connected to it runs.
2. The advantage of Scrapy's custom extensions is that we can attach functionality at any point we choose, whereas the features provided by the other components can only run at their fixed, predefined positions (an item-counting extension after the example below illustrates this).
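For readers unfamiliar with the Django side of the comparison, here is a minimal sketch of point 1 using django.dispatch.Signal; the signal name order_paid and the handler are invented for illustration, and it should only require Django to be installed, with no project setup.

from django.dispatch import Signal, receiver

# a custom signal; Django's built-in signals (post_save, request_finished, ...) work the same way
order_paid = Signal()

@receiver(order_paid)
def send_receipt(sender, **kwargs):
    # runs automatically every time the signal is triggered
    print('receipt for order', kwargs.get('order_id'))

# somewhere else in the code base: trigger the signal
order_paid.send(sender=None, order_id=42)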
#1. In the same directory as settings.py, create a new file -- for example extentions.py -- with the following content:

from scrapy import signals

class MyExtension(object):
    def __init__(self, value):
        self.value = value

    @classmethod
    def from_crawler(cls, crawler):
        val = crawler.settings.getint('MMMM')
        obj = cls(val)
        crawler.signals.connect(obj.spider_opened, signal=signals.spider_opened)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        return obj

    def spider_opened(self, spider):
        print('=============>open')

    def spider_closed(self, spider):
        print('=============>close')

#2. Enable it in settings.py
EXTENSIONS = {
    "Amazon.extentions.MyExtension": 200,
}
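To illustrate point 2 above (hooking in wherever we like), here is a sketch, not from the original post, of an extension that listens on two more Scrapy signals -- item_scraped and spider_closed -- to count scraped items and log the total when the spider finishes. The class name and log message are invented for illustration.

import logging
from scrapy import signals

logger = logging.getLogger(__name__)

class ItemCounterExtension(object):
    def __init__(self):
        self.count = 0

    @classmethod
    def from_crawler(cls, crawler):
        obj = cls()
        # item_scraped fires for every item that made it through the pipelines
        crawler.signals.connect(obj.item_scraped, signal=signals.item_scraped)
        crawler.signals.connect(obj.spider_closed, signal=signals.spider_closed)
        return obj

    def item_scraped(self, item, response, spider):
        self.count += 1

    def spider_closed(self, spider, reason):
        logger.info('%s closed (%s): %d items scraped', spider.name, reason, self.count)

# settings.py
EXTENSIONS = {
    "Amazon.extentions.ItemCounterExtension": 300,
}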