1. Creating a new Scrapy project
scrapy startproject <project_name>
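In this article the project is called aticleSpider (as the imports later on show), so the command is scrapy startproject aticleSpider. It generates the usual Scrapy skeleton, roughly as follows (newer Scrapy versions also add a middlewares.py):

aticleSpider/
    scrapy.cfg                # deploy/config file
    aticleSpider/
        __init__.py
        items.py              # item definitions (section 3)
        pipelines.py          # item pipelines (section 4)
        settings.py           # project settings (section 5)
        spiders/              # spider modules (section 2)
            __init__.py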
2. Writing the spider (crawling 163 Beijing news as an example)
This example uses Scrapy's ItemLoader mechanism. An ItemLoader has three particularly useful methods: add_xpath(), add_value() and add_css(). Each takes two main arguments: the first is the item field name, and the second is either a value (for add_value) or an extraction rule (for add_xpath/add_css).
Usage is shown in the full spider below.
import json
import time

import scrapy
from scrapy.http import Request

from aticleSpider.items import ArticleItem
from aticleSpider.until.common import get_md5
from aticleSpider.items import ArticleItemLoader
# from scrapy.loader import ItemLoader


class ArticleSpider(scrapy.Spider):
    # spider name; scrapy looks this up when you run "scrapy crawl"
    name = 'bj163news'
    # domains the spider is allowed to crawl
    allowed_domains = ['163.com']
    # entry URL
    start_urls = ["http://bendi.news.163.com/beijing/special/04388GGG/bjxinxiliu.js?callback=data_callback&_="]

    num = 1
    times = time.time()
    add_time = str(times).replace('.', '')[0:13]

    def parse(self, response):
        """
        1. Parse the article URLs from the list page and extract the detail fields
        2. Build the URL of the next list page and schedule it for download
        :param response:
        :return:
        """
        # strip the JSONP wrapper and parse the article list
        response_str = str(response.body.decode("gbk", "ignore")).replace('data_callback(', '').replace(')', '')
        # print(response_str)
        js = json.loads(response_str)
        for line in js:
            keys = []
            title = line['title']
            commenturl = line['commenturl']
            docurl = line['docurl']
            newstype = line['newstype']
            title_img = line['imgurl']
            for keywords in line['keywords']:
                keys.append(keywords['keyname'])
            key_words = ','.join(keys)
            # print(docurl, 'docurl')
            metas = {'title': title, 'commenturl': commenturl, 'newstype': newstype,
                     'title_img': title_img, 'key_words': key_words}
            yield Request(url=docurl, meta=metas, callback=self.parse_detail)

        # build the URL of the next list page ("_02" .. "_10", ...)
        self.num = self.num + 1
        if self.num >= 10:
            str_num = str(self.num)
        else:
            str_num = '0' + str(self.num)
        next_url = "http://bendi.news.163.com/beijing/special/04388GGG/bjxinxiliu_" + str_num + ".js?callback=data_callback&_=" + self.add_time
        # yield Request(url=next_url, callback=self.parse)

    # detail-page extraction using the item loader pattern
    def parse_detail(self, response):
        # item = ArticleItem()
        main_data = response.meta
        # docurl = response.url
        # project_doc_url = get_md5(docurl)
        # news_editor = response.xpath('//span[@class="ep-editor"]/text()').extract_first("").replace('責任編輯:', '')
        # news_source = response.xpath('//div[@class="ep-source cDGray"]/span/text()').extract_first("").replace('本文來源:', '')
        source_title = response.xpath('//div[@id="endText"]/p[@class="otitle"]/text()').extract_first(" ").replace('(原標題:', '').replace(')', '').strip()
        # news_time = response.xpath('//div[@class="post_time_source"]//text()').extract_first("").replace('來源:', '').strip()
        # content_org = response.xpath('//div[@id="endText"]').extract_first("")
        news_conts = response.xpath('//div[@id="endText"]/p')
        news_cont = []
        for one_p in news_conts:
            img = one_p.xpath('.//img')
            # print(img)
            if img != []:
                img_url = one_p.xpath('.//@src').extract_first("")
                news_cont.append({'content': img_url, 'type': 'pic'})
            else:
                try:
                    text = one_p.xpath('.//text()').extract_first("")
                    if text.find('(原標題:') > 0 or text == '':
                        continue
                    news_cont.append({'content': text.strip(), 'type': 'text'})
                except:
                    pass

        # The commented-out block below is the extraction without an item loader
        # item['content_org'] = ''
        # item['project_doc_url'] = project_doc_url
        # item['title'] = main_data.get('title', '')  # .get() with a default value
        # item['source_title'] = source_title
        # item['commenturl'] = main_data.get('commenturl', '')
        # item['newstype'] = main_data.get('newstype', '')
        # item['docurl'] = docurl
        # item['title_img'] = [main_data.get('title_img', '')]
        # item['key_words'] = main_data.get('key', '')
        # item['news_editor'] = news_editor
        # item['news_source'] = news_source
        # item['news_time'] = news_time
        # item['news_cont'] = news_cont
        # yield item

        item_loader = ArticleItemLoader(item=ArticleItem(), response=response)
        item_loader.add_value('title', main_data.get('title', ''))
        item_loader.add_value('project_doc_url', get_md5(response.url))
        item_loader.add_value('commenturl', main_data.get('commenturl', ''))
        item_loader.add_value('newstype', main_data.get('newstype', ''))
        item_loader.add_value('docurl', response.url)
        item_loader.add_value('source_title', source_title)
        item_loader.add_value('title_img', [main_data.get('title_img', '')])
        item_loader.add_value('key_words', main_data.get('key_words', ''))
        item_loader.add_xpath('news_editor', '//span[@class="ep-editor"]/text()')
        item_loader.add_xpath('news_source', '//div[@class="ep-source cDGray"]/span/text()')
        item_loader.add_xpath('content_org', '//div[@id="endText"]')
        item_loader.add_value('news_cont', news_cont)
        item_loader.add_xpath('news_time', '//div[@class="post_time_source"]//text()')
        article_item = item_loader.load_item()

        yield article_item
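The spider above only uses add_xpath() and add_value(); add_css() works the same way, taking a field name and a CSS selector. A minimal sketch of all three calls is below. The selectors and the mixin class are illustrative only, not part of this project:

from scrapy.loader import ItemLoader
from aticleSpider.items import ArticleItem

class SketchSpiderMixin:
    # illustrative callback; the field names come from ArticleItem, the selectors are hypothetical
    def parse_detail(self, response):
        loader = ItemLoader(item=ArticleItem(), response=response)
        loader.add_xpath('title', '//h1/text()')               # field name + XPath extraction rule
        loader.add_css('news_editor', 'span.ep-editor::text')  # field name + CSS extraction rule
        loader.add_value('docurl', response.url)               # field name + literal value
        yield loader.load_item()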
3. Item design with the item loader
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose


class ArticleSpiderItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


# custom article loader: the default output processor takes the first element of the list
class ArticleItemLoader(ItemLoader):
    # pass
    default_output_processor = TakeFirst()


# return the value unchanged (used to keep the original list)
def return_value(value):
    return value


# normalize the 163 time string
def return_time(value):
    return value.replace('來源:', '').strip()


# extract the editor from the 163 markup
def get_editor(value):
    # if value != '':
    return value.replace('責任編輯:', '')


# extract the article source from the 163 markup
def get_source(value):
    # if value != '':
    return value.replace('本文來源:', '')


# item for 163 news
class ArticleItem(scrapy.Item):
    """
    item = scrapy.Field()
    Field() accepts two relevant keyword arguments, the input and output processors:
        input_processor  - runs as soon as the data is received
        output_processor - runs when ItemLoader.load_item() is called and returns the final value
    In other words, input processors run while data is being collected; once collection is
    finished and load_item() is called, the output processors run and produce the final result.
    """
    # article title
    source_title = scrapy.Field()
    title = scrapy.Field()
    # article URL
    docurl = scrapy.Field()
    # md5 of the article URL
    project_doc_url = scrapy.Field()
    # headline image
    title_img = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    # local path of the downloaded image
    img_path = scrapy.Field()
    # keywords
    key_words = scrapy.Field()
    # news type, e.g. "article"
    newstype = scrapy.Field()
    # comment URL
    commenturl = scrapy.Field()
    # article editor
    news_editor = scrapy.Field(
        input_processor=MapCompose(get_editor)
    )
    # per-paragraph content: image name or text
    news_cont = scrapy.Field(
        output_processor=MapCompose(return_value)
    )
    # raw HTML of the article body
    content_org = scrapy.Field()
    # image url
    # news_pic_url = scrapy.Field()
    # publish time
    news_time = scrapy.Field(
        input_processor=MapCompose(return_time)
    )
    # article source
    news_source = scrapy.Field(
        input_processor=MapCompose(get_source)
    )
    down_img = scrapy.Field()
Declaring input/output processors in the Field definition
Processors can be declared in several places. Priority, from highest to lowest:
- field-specific attributes defined on the Item Loader (field_name_in and field_name_out)
- Field metadata (the input_processor and output_processor keyword arguments)
- the Item Loader's defaults (default_input_processor / default_output_processor)
Tip: as a rule of thumb, declare input processors on the Item Loader (field_name_in) and output processors in the Field metadata.
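The items.py above uses the Field-metadata style for its input processors. A minimal sketch of the loader-attribute style, assuming the news_editor input processor were moved onto the loader instead (a hypothetical variant, not what this project does):

from scrapy.loader import ItemLoader
from scrapy.loader.processors import MapCompose, TakeFirst

class ArticleItemLoader(ItemLoader):
    default_output_processor = TakeFirst()
    # <field_name>_in declares the input processor for that field on the loader itself
    news_editor_in = MapCompose(lambda v: v.replace('責任編輯:', ''))
    # <field_name>_out would declare an output processor the same way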
Built-in processors
- Identity: does nothing; returns the values unchanged.
- TakeFirst: returns the first non-null, non-empty value; usually used as an output processor.
- Join: joins the values into one string, using a space ' ' as the default separator.
- Compose: chains functions into a pipeline; each function receives the output of the previous one and the last output is the result.
- MapCompose: similar to Compose, but differs in how values are passed between functions. Its input is iterable: the first function is applied to every value, the results are collected into a new iterable that is fed to the second function, and so on; the final collected results are returned. Typically used as an input processor.
- SelectJmes: queries the value with a JMESPath (JSON path) expression and returns the result.
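These processors are ordinary callables, so they can be tried outside a spider. A quick standalone illustration (the input values are made up):

from scrapy.loader.processors import TakeFirst, Join, Compose, MapCompose

take_first = TakeFirst()
print(take_first(['', None, 'first real value']))   # 'first real value'

join = Join(', ')
print(join(['beijing', 'news']))                    # 'beijing, news'

# Compose passes the whole list through each function in turn
print(Compose(lambda v: v[0], str.upper)(['hello', 'world']))    # 'HELLO'

# MapCompose applies each function to every element and collects the results
print(MapCompose(str.strip, str.title)([' hello ', ' world ']))  # ['Hello', 'World']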
4. Pipeline design
Pipelines are used to process the items. In this example they handle image downloading, saving items as JSON files, and inserting the data into a database. The concrete column lists in the SQL statements are replaced with *.
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html

import codecs
# import hashlib
import json
# import random
# from scrapy.utils.python import to_bytes

import pymysql
# import time
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exporters import JsonItemExporter
from scrapy.http import Request

from aticleSpider.until.common import get_md5


# default pipeline generated by scrapy
class ArticleSpiderPipeline(object):
    def process_item(self, item, spider):
        return item


# export items to a JSON file using scrapy's built-in JsonItemExporter
class JsonExporterPipeline(object):
    def __init__(self):
        self.file = open('articleexport.json', 'wb')
        self.exporter = JsonItemExporter(self.file, encoding="utf-8", ensure_ascii=False)
        self.exporter.start_exporting()

    def close_spider(self, spider):
        self.exporter.finish_exporting()
        self.file.close()

    def process_item(self, item, spider):
        self.exporter.export_item(item)
        return item


# store the Beijing 163 news data in MySQL
class Bj163MySqlPipeline(object):
    def __init__(self, db_parms):
        self.conn = pymysql.connect(**db_parms)
        self.cursor = self.conn.cursor()

    @classmethod
    def from_settings(cls, settings):
        db_parms = dict(
            host=settings['MYSQL_HOST'],
            user=settings['MYSQL_USER'],
            password=settings['MYSQL_PASSWORD'],
            database=settings['MYSQL_DBNAME'],
            charset='utf8'
        )
        # conn = pymysql.connect(**db_parms)
        return cls(db_parms)

    def process_item(self, item, spider):
        select_sql_find = """
            select id from toon_news_163 WHERE pageurl = %s;
        """
        self.cursor.execute(select_sql_find, (item['docurl'],))
        odis = self.cursor.fetchall()
        if odis == ():
            insert_sql = """
                insert into toon_news_163(*)
                VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s);
            """
            try:
                source_title = item['source_title']
            except:
                source_title = item['title']
            self.cursor.execute(insert_sql, ('163網易北京', item['news_time'], item['docurl'], item['news_editor'],
                                             source_title, item['key_words'], item['news_source'],
                                             item['title'], item['title_img'], item['content_org'],
                                             item['img_path'], item['docurl']))
            self.conn.commit()

            # fetch the id of the row we just inserted, then insert the per-paragraph content
            select_sql = """
                select max(id) FROM toon_news_163;
            """
            self.cursor.execute(select_sql)
            oidss = self.cursor.fetchall()
            max_id = oidss[0][0]
            # print(max_id)
            content = item['news_cont']
            for i in range(0, len(content)):
                if content[i]['type'] == 'pic':
                    pic_url = content[i]['content']
                else:
                    pic_url = ''
                insert_con_sql = """
                    insert into toon_news_content_163(*)
                    VALUES (%s,%s,%s,%s,%s,%s);
                """
                self.cursor.execute(insert_con_sql, (str(max_id), content[i]['content'], content[i]['type'],
                                                     str(i + 1), 0, item['docurl']))
                self.conn.commit()
        # return item


# store items as JSON with a custom exporter
class JsonWithEncodingPipeline(object):
    def __init__(self):
        self.file = codecs.open('article.json', 'w', encoding="utf-8")

    def process_item(self, item, spider):
        lines = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.file.write(lines)
        return item

    def close_spider(self, spider):  # the hook must be named close_spider for scrapy to call it
        self.file.close()


# download the headline image of each article
class ArticleImagePipeline(ImagesPipeline):
    def file_path(self, request, response=None, info=None):
        ## start of deprecation warning block (can be removed in the future)
        def _warn():
            from scrapy.exceptions import ScrapyDeprecationWarning
            import warnings
            warnings.warn('ImagesPipeline.image_key(url) and file_key(url) methods are deprecated, '
                          'please use file_path(request, response=None, info=None) instead',
                          category=ScrapyDeprecationWarning, stacklevel=1)

        # check if called from image_key or file_key with url as first argument
        if not isinstance(request, Request):
            _warn()
            url = request
        else:
            url = request.url

        # detect if file_key() or image_key() methods have been overridden
        if not hasattr(self.file_key, '_base'):
            _warn()
            return self.file_key(url)
        elif not hasattr(self.image_key, '_base'):
            _warn()
            return self.image_key(url)
        ## end of deprecation warning block

        image_guid = get_md5(url)  # change to request.url after deprecation
        return '%s.jpg' % (image_guid)

    def item_completed(self, results, item, info):
        if "title_img" in item:
            img_path = ''
            for ok, value in results:
                if ok:  # value only carries a 'path' when the download succeeded
                    img_path = value['path']
            item['img_path'] = img_path
        # pass
        return item
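For reference, the results argument that ImagesPipeline passes to item_completed is a list with one 2-tuple per image URL: a success flag plus, on success, a dict describing the downloaded file. Roughly like the following (all values here are illustrative):

# approximate shape of "results" for one successfully downloaded image
results = [
    (True, {
        'url': 'http://example.com/some_image.jpg',                   # original image URL
        'path': 'full/0a79c461a4062ac383dc4fade7bc09f1384d3345.jpg',  # path relative to IMAGES_STORE
        'checksum': '2b00042f7481c7b056c4b410d28f33cf',               # md5 of the downloaded file
    }),
]

This is why item_completed only reads value['path'] when the success flag is True.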
5. Settings
The important parts are commented. The numbers in ITEM_PIPELINES are execution priorities: the lower the value, the earlier the pipeline runs.
# -*- coding: utf-8 -*-

# Scrapy settings for aticleSpider project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

import os

BOT_NAME = 'aticleSpider'

SPIDER_MODULES = ['aticleSpider.spiders']
NEWSPIDER_MODULE = 'aticleSpider.spiders'

# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'aticleSpider (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'aticleSpider.middlewares.AticlespiderSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'aticleSpider.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    # 'aticleSpider.pipelines.ArticleSpiderPipeline': 100,
    'aticleSpider.pipelines.Bj163MySqlPipeline': 3,
    'aticleSpider.pipelines.ArticleImagePipeline': 1,
}

IMAGES_URLS_FIELD = 'title_img'  # item field holding the image URLs; its value must be a list
project_dir = os.path.abspath(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(project_dir, '163_news')  # where downloaded images are saved

MYSQL_HOST = ''      # database host
MYSQL_USER = ''      # database user
MYSQL_DBNAME = ''    # database name
MYSQL_PASSWORD = ''  # database password

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
6. Running the Scrapy project
scrapy crawl <spider name>
Note that the argument is the spider's name attribute, not the project name.
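For this project the spider is named bj163news, so from the project root (the directory containing scrapy.cfg):

scrapy crawl bj163news

Optionally, appending -o items.json writes the scraped items to a file through Scrapy's built-in feed export, independently of the pipelines configured above.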