2017-03-28
My first small task after joining the company: a multi-level page crawler in Scrapy. I had never written a crawler, never studied Scrapy, and had never even used XPath, so it took me almost a week to finish. There are surely plenty of crude spots; I'd appreciate any suggestions.
Spider file:
# -*- coding: utf-8 -*-
import scrapy
from nosta.items import NostaItem
import time
import hashlib

class NostaSpider(scrapy.Spider):
    name = "nosta"
    allowed_domains = ["nosta.gov.cn"]
    start_urls = [
        "http://www.nosta.gov.cn/upload/2017slgb/showProject.html",
    ]

    def parse(self, response):
        for sel1 in response.xpath('//a/@href').extract():
            # Text of the link itself: the group name (group_name)
            group_name = response.xpath('//a[@href="%s"]/text()' % (sel1)).extract()[0]
            # Sequence number in front of the link (group_number)
            group_number = response.xpath('//a[@href="%s"]/parent::*/preceding-sibling::*/text()' % (sel1)).extract()[0]
            # Name of the directory the group belongs to (directory_name)
            directory_name = response.xpath('//a[@href="%s"]/parent::*/parent::*/parent::*/parent::*/preceding-sibling::*/text()' % (sel1)).extract()[0]
            # Absolute URL of the link (group_url)
            group_url = response.urljoin(sel1)
            yield scrapy.Request(
                url=group_url,
                meta={"group_name": group_name,
                      "group_number": group_number,
                      "directory_name": directory_name,
                      "group_url": group_url},
                callback=self.parse_url,
                dont_filter=True)

    def parse_url(self, response):
        group_name = response.meta["group_name"]
        group_number = response.meta["group_number"]
        directory_name = response.meta["directory_name"]
        group_url = response.meta["group_url"]
        for sel2 in response.xpath('//a/@href').extract():
            # Sequence number in front of the link (project_number)
            project_number = response.xpath('//a[@href="%s"]/parent::*/preceding-sibling::*/text()' % (sel2)).extract()[0]
            # Absolute URL of the link (project_url)
            project_url = response.urljoin(sel2)
            # Text of the link itself: the project name (project_name)
            project_name = response.xpath('//a[@href="%s"]/text()' % (sel2)).extract()[0]
            yield scrapy.Request(
                url=project_url,
                meta={"group_name": group_name,
                      "group_number": group_number,
                      "directory_name": directory_name,
                      "group_url": group_url,
                      "project_number": project_number,
                      "project_url": project_url,
                      "project_name": project_name},
                callback=self.parse_item,
                dont_filter=True)

    def parse_item(self, response):
        item = NostaItem()
        item["time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item["year"] = ["2017"]
        item["group_name"] = response.meta["group_name"]
        item["group_number"] = response.meta["group_number"]
        item["directory_name"] = response.meta["directory_name"]
        item["group_url"] = response.meta["group_url"]
        item["project_number"] = response.meta["project_number"]
        item["project_url"] = response.meta["project_url"]
        item["project_name"] = response.meta["project_name"]
        # Raw HTML source of the detail page (project_html)
        item["project_html"] = response.body
        # Links to the collaborator-relationship statements (file_urls);
        # s1 is the literal label text on the page
        s1 = u'完成人合作關係說明:'
        item["file_urls"] = ['http://www.nosta.gov.cn/upload/2017slgb' + i.replace('..', '')
                             for i in response.xpath("//td[text() = '%s']/following-sibling::*/a/@href" % (s1)).extract()]
        # Local path and name per file (files). Note: a fresh sha1 per URL;
        # my original code reused one sha1 object, so update() accumulated
        # input across the loop and every path after the first was wrong.
        item["files"] = []
        for i in item["file_urls"]:
            item["files"].append({"url": i, "path": hashlib.sha1(i).hexdigest() + ".pdf"})
        # All image links (image_urls)
        item["image_urls"] = ['http://www.nosta.gov.cn/upload/2017slgb' + i.replace('..', '')
                              for i in response.xpath('//img[@width="840px"]/@src').extract()]
        # Local path and name per image, dicts in a list (images)
        item["images"] = []
        for i in item["image_urls"]:
            item["images"].append({"url": i, "path": hashlib.sha1(i).hexdigest() + ".jpg"})
        # Field-by-field content of the detail page (project_content)
        dict3 = {}
        project_detail = response.xpath('//td[@class="label"]/text()').extract()
        for j in project_detail:
            dict3[j] = response.xpath("//td[text() = '%s']/following-sibling::*" % (j)).xpath('string(.)').extract()[0]
            # Fields rendered as images have no text; fall back to image URLs
            if not dict3[j]:
                dict3[j] = ['http://www.nosta.gov.cn/upload/2017slgb' + i.replace('..', '')
                            for i in response.xpath("//td[text() = '%s']/following-sibling::*/img/@src" % (j)).extract()]
        item["project_content"] = dict3
        yield item
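One cleanup worth mentioning (not what my code above does): extracting every @href and then re-querying the whole page with //a[@href="…"] to recover the link text and its siblings is redundant work, and it misbehaves if two links share the same href. Iterating over the <a> elements themselves and using relative XPath avoids both. A minimal sketch of what parse could look like under that approach, untested against the actual page structure:

    def parse(self, response):
        # Visit each <a> node once; relative XPath replaces the repeated
        # //a[@href="..."] whole-page lookups.
        for a in response.xpath('//a[@href]'):
            group_name = a.xpath('text()').extract_first()
            group_number = a.xpath('parent::*/preceding-sibling::*/text()').extract_first()
            # ancestor::*[4] is the 4th ancestor, i.e. parent/parent/parent/parent
            directory_name = a.xpath('ancestor::*[4]/preceding-sibling::*/text()').extract_first()
            group_url = response.urljoin(a.xpath('@href').extract_first())
            yield scrapy.Request(
                url=group_url,
                meta={"group_name": group_name,
                      "group_number": group_number,
                      "directory_name": directory_name,
                      "group_url": group_url},
                callback=self.parse_url,
                dont_filter=True)

As a bonus, extract_first() (available since Scrapy 1.0) returns None instead of raising IndexError when a node is missing, which the extract()[0] calls above would do.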
Items file:
import scrapy

class NostaItem(scrapy.Item):
    time = scrapy.Field()             # crawl timestamp
    files = scrapy.Field()            # collaborator-relationship files; list of dicts, url: remote link, path: local path (level 3)
    crawl_date = scrapy.Field()       # crawl date
    project_name = scrapy.Field()     # project name (levels 2, 3)
    group_url = scrapy.Field()        # URL of the group's index page (levels 1, 2)
    project_number = scrapy.Field()   # order within the group (level 2)
    project_content = scrapy.Field()  # detailed content of the project detail page (level 3)
    group_number = scrapy.Field()     # order of the group on the top page (level 1)
    project_url = scrapy.Field()      # project URL (levels 2, 3)
    group_name = scrapy.Field()       # group name (levels 1, 2, 3)
    image_urls = scrapy.Field()       # list of image URLs (level 3)
    file_urls = scrapy.Field()        # list of collaborator-relationship file URLs (level 3)
    year = scrapy.Field()             # award year (2017)
    images = scrapy.Field()           # list of dicts, url: remote link, path: local path (level 3)
    directory_name = scrapy.Field()   # name of the containing directory (level 1)
    project_html = scrapy.Field()     # raw HTML of the project detail page (level 3)
    current_count = scrapy.Field()    # running count of crawled items
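A detail about these fields: file_urls/image_urls are the input fields the stock FilesPipeline/ImagesPipeline read, and files/images are the output fields they write back after downloading. If I read the Scrapy 1.x behavior correctly, the pipelines overwrite whatever the spider pre-filled there, so hand-building the url/path dicts in parse_item is mostly redundant; after a successful download the item carries entries shaped roughly like this (values below are illustrative, not real output):

    # Shape of one entry Scrapy's FilesPipeline writes into item["files"]:
    {
        "url": "http://www.nosta.gov.cn/upload/2017slgb/example.pdf",  # source URL
        "path": "full/0a79653e...d4.pdf",   # path relative to FILES_STORE
        "checksum": "2a...9b",              # md5 of the downloaded body
    }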
Pipeline file:
from pymongo import MongoClient
from nosta.items import NostaItem

class NostaPipeline(object):
    def __init__(self):
        # Host and credentials are redacted placeholders
        self.client = MongoClient('IP', 27017)

    def process_item(self, item, spider):
        if isinstance(item, NostaItem):
            # Copy the fields to persist into a plain dict
            keys = ["time", "files", "project_name", "group_url",
                    "project_number", "project_content", "group_number",
                    "project_url", "group_name", "image_urls", "file_urls",
                    "year", "images", "directory_name"]
            dict1 = {k: item[k] for k in keys}

            self.db = self.client.nosta
            self.db.authenticate('', '')
            collection = self.db.nosta_2017
            collection.insert(dict1)

            # Record crawl progress in a separate database; note that
            # current_count is never set by the spider above, so this
            # raises KeyError unless some other component fills it in
            self.db = self.client.platform_info
            self.db.authenticate('', '')
            collection = self.db.crawl_info
            dict2 = {"current_count": item["current_count"]}
            if dict2["current_count"] == 1:
                dict2["start_time"] = item["time"]
            collection.update(
                {'job': '2017年國家科技獎勵'},
                {'$set': dict2})
        return item
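Re-authenticating against Mongo on every item is wasteful. Item pipelines can implement open_spider/close_spider, so the connection and auth can happen once per crawl instead. A sketch using the same placeholder host and credentials as above:

    from pymongo import MongoClient
    from nosta.items import NostaItem

    class NostaPipeline(object):
        def open_spider(self, spider):
            # Connect and authenticate once per crawl, not once per item
            self.client = MongoClient('IP', 27017)  # host is a placeholder
            self.db = self.client.nosta
            self.db.authenticate('', '')            # credentials redacted
            self.collection = self.db.nosta_2017

        def close_spider(self, spider):
            self.client.close()

        def process_item(self, item, spider):
            if isinstance(item, NostaItem):
                # Insert the whole item for brevity; filter keys as needed
                self.collection.insert(dict(item))
            return item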
Settings file (only the changed parts):
ITEM_PIPELINES = {
    'nosta.pipelines.NostaPipeline': 300,
    'scrapy.pipelines.images.ImagesPipeline': 1,
    'scrapy.pipelines.files.FilesPipeline': 1,
}
IMAGES_STORE = r'.'
FILES_STORE = r'.'
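One thing to be aware of: the stock pipelines save downloads under <STORE>/full/<sha1>.<ext>, so with both stores pointed at the working directory the PDFs and JPGs all land together in ./full/. Separate absolute paths keep them apart; the paths below are just examples, not from my actual config:

    IMAGES_STORE = '/data/nosta/images'  # example path
    FILES_STORE = '/data/nosta/files'    # example path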