Python code for personal use (a Scrapy multi-level (three-level) page crawler)

Posted by 右介 on 2017-05-09

2017-03-28

This was the first small task I was handed after starting my job: a Scrapy crawler over multiple page levels. I had never written a crawler, never learned Scrapy, and had not even used XPath before, so it took me nearly a week to get it working. There are surely plenty of clumsy spots, and I would welcome any suggestions.

The spider file:

# -*- coding: utf-8 -*-
import scrapy
from nosta.items import NostaItem
import time
import hashlib

class NostaSpider(scrapy.Spider):
    name = "nosta"
    allowed_domains = ["nosta.gov.cn"]
    start_urls = [
        "http://www.nosta.gov.cn/upload/2017slgb/showProject.html",
    ]

    def parse(self, response):  
        for sel1 in response.xpath('//a/@href').extract():
            # The link text is the group name (group_name)
            group_name = response.xpath('//a[@href="%s"]/text()' % (sel1)).extract()[0]
            # The ordinal number in front of the link (group_number)
            group_number = response.xpath('//a[@href="%s"]/parent::*/preceding-sibling::*/text()' % (sel1)).extract()[0]
            # The directory name the group falls under (directory_name)
            directory_name = response.xpath('//a[@href="%s"]/parent::*/parent::*/parent::*/parent::*/preceding-sibling::*/text()' % (sel1)).extract()[0]
            # The link itself, resolved to an absolute URL (group_url)
            group_url = response.urljoin(sel1)
            # url1 = "http://www.nosta.gov.cn/upload/2017slgb/" + sel1
            yield scrapy.Request(url=group_url, meta={"group_name": group_name, "group_number": group_number, "directory_name": directory_name, "group_url": group_url}, callback=self.parse_url, dont_filter=True)

    def parse_url(self, response): 
        # item = response.meta['item']
        group_name = response.meta["group_name"]
        group_number = response.meta["group_number"]
        directory_name = response.meta["directory_name"]
        group_url = response.meta["group_url"]
        for sel2 in response.xpath('//a/@href').extract():
            # The ordinal number in front of the link (project_number)
            project_number = response.xpath('//a[@href="%s"]/parent::*/preceding-sibling::*/text()' % (sel2)).extract()[0]
            # The link itself, resolved to an absolute URL (project_url)
            project_url = response.urljoin(sel2)
            # The link text is the project name (project_name)
            project_name = response.xpath('//a[@href="%s"]/text()' % (sel2)).extract()[0]
            # url2 = response.urljoin(sel2)
            yield scrapy.Request(url=project_url, meta={"group_name": group_name, "group_number": group_number, "directory_name": directory_name, "group_url": group_url, "project_number": project_number, "project_url": project_url, "project_name": project_name}, callback=self.parse_item, dont_filter=True)

    def parse_item(self, response):
        item = NostaItem()
        item["time"] = time.strftime("%Y-%m-%d %H:%M:%S", time.localtime())
        item["year"] = ["2017"]      
        item["group_name"] = response.meta["group_name"]
        item["group_number"] = response.meta["group_number"]
        item["directory_name"] = response.meta["directory_name"]
        item["group_url"] = response.meta["group_url"]
        item["project_number"] = response.meta["project_number"]
        item["project_url"] = response.meta["project_url"]
        item["project_name"] = response.meta["project_name"]
        # Raw HTML source of the detail page (project_html)
        item["project_html"] = response.body
        # Links to the co-author relationship statements (file_urls);
        # s1 is the literal label text next to them on the detail page
        s1 = u'完成人合作關係說明:'
        item["file_urls"] = ['http://www.nosta.gov.cn/upload/2017slgb' + i.replace('..', '') for i in response.xpath("//td[text() = '%s']/following-sibling::*/a/@href" % (s1)).extract()]
        item["files"] = []
        for i in item["file_urls"]:
            dict1 = {}
            dict1["url"] = i
            # A fresh sha1 per URL, so each path hashes only its own URL
            dict1["path"] = hashlib.sha1(i.encode('utf-8')).hexdigest() + ".pdf"
            item["files"].append(dict1)
        # All image links on the detail page (image_urls)
        item["image_urls"] = ['http://www.nosta.gov.cn/upload/2017slgb' + i.replace('..', '') for i in response.xpath('//img[@width="840px"]/@src').extract()]
        # Local path and file name for each image, as a list of dicts (images)
        item["images"] = []
        for i in item["image_urls"]:
            dict2 = {}
            dict2["url"] = i
            dict2["path"] = hashlib.sha1(i.encode('utf-8')).hexdigest() + ".jpg"
            item["images"].append(dict2)
        # The labelled field contents of the detail page (project_content)
        dict3 = {}
        project_detail = response.xpath('//td[@class="label"]/text()').extract()
        for j in project_detail:
            dict3[j] = response.xpath("//td[text() = '%s']/following-sibling::*" % (j)).xpath('string(.)').extract()[0]
            if not dict3[j]:
                # Fields whose value is an image rather than text: keep the image URLs instead
                dict3[j] = ['http://www.nosta.gov.cn/upload/2017slgb' + i.replace('..', '') for i in response.xpath("//td[text() = '%s']/following-sibling::*/img/@src" % (j)).extract()]
        item["project_content"] = dict3
        yield item
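
A note on the two sha1 blocks above: Scrapy's FilesPipeline and ImagesPipeline compute their own storage paths, and in the versions I have used they also overwrite the files / images fields once the downloads finish, so precomputed paths only line up if they match the pipelines' scheme. A minimal sketch of that default scheme, assuming the stock file_path() implementation (sha1 of the URL plus the original extension, under a full/ subdirectory):

import hashlib
import os

def default_file_path(url):
    # Mirrors the default FilesPipeline behaviour: sha1 of the URL as the
    # file name, original extension kept, stored under "full/"
    media_guid = hashlib.sha1(url.encode('utf-8')).hexdigest()
    media_ext = os.path.splitext(url)[1]  # e.g. ".pdf"
    return 'full/%s%s' % (media_guid, media_ext)

print(default_file_path('http://www.nosta.gov.cn/upload/2017slgb/example.pdf'))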

The items file:

import scrapy


class NostaItem(scrapy.Item):
    time = scrapy.Field()            # Crawl timestamp
    files = scrapy.Field()           # Co-author relationship files: list of dicts, url = remote link, path = local path (level 3)
    crawl_date = scrapy.Field()      # Crawl date
    project_name = scrapy.Field()    # Project name (levels 2 and 3)
    group_url = scrapy.Field()       # Index page URL of the group (levels 1 and 2)
    project_number = scrapy.Field()  # Order within the group (level 2)
    project_content = scrapy.Field() # Field contents of the project detail page (level 3)
    group_number = scrapy.Field()    # Order of the group on the top-level page (level 1)
    project_url = scrapy.Field()     # Project URL (levels 2 and 3)
    group_name = scrapy.Field()      # Group name (levels 1, 2 and 3)
    image_urls = scrapy.Field()      # List of image URLs (level 3)
    file_urls = scrapy.Field()       # List of co-author relationship file URLs (level 3)
    year = scrapy.Field()            # Which year (2017)
    images = scrapy.Field()          # List of dicts, url = remote link, path = local path (level 3)
    directory_name = scrapy.Field()  # Which directory it belongs to (level 1)
    project_html = scrapy.Field()    # Raw HTML of the project detail page (level 3)
    current_count = scrapy.Field()   # Running count of crawled items (expected by the pipeline)
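
A side note on these field names: file_urls / files and image_urls / images are the default field pairs that Scrapy's FilesPipeline and ImagesPipeline read from and write to, which is why the spider fills exactly those. They can be remapped in settings if other names are preferred; the defaults are:

FILES_URLS_FIELD = 'file_urls'
FILES_RESULT_FIELD = 'files'
IMAGES_URLS_FIELD = 'image_urls'
IMAGES_RESULT_FIELD = 'images'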

The pipelines file:

from pymongo import MongoClient
from nosta.items import NostaItem

class NostaPipeline(object):
    def __init__(self):
        # 'IP' is a placeholder for the real MongoDB host
        self.client = MongoClient('IP', 27017)
        
    def process_item(self, item, spider):
        if isinstance(item, NostaItem):
            dict1 = {}
            dict1["time"] = item["time"]
            dict1["files"] = item["files"]
            dict1["project_name"] = item["project_name"]
            dict1["group_url"] = item["group_url"]
            dict1["project_number"] = item["project_number"]
            dict1["project_content"] = item["project_content"]
            dict1["group_number"] = item["group_number"]
            dict1["project_url"] = item["project_url"]
            dict1["group_name"] = item["group_name"]
            dict1["image_urls"] = item["image_urls"]
            dict1["file_urls"] = item["file_urls"]
            dict1["year"] = item["year"]
            dict1["images"] = item["images"]
            dict1["directory_name"] = item["directory_name"]
            
            self.db = self.client.nosta
            self.db.authenticate('', '')  # credentials left blank here
            collection = self.db.nosta_2017
            collection.insert_one(dict1)

            self.db = self.client.platform_info
            self.db.authenticate('', '')
            collection = self.db.crawl_info
            dict2 = {}
            # current_count must be set somewhere upstream; the spider above
            # never fills it, so this raises KeyError as written
            dict2["current_count"] = item["current_count"]
            if dict2["current_count"] == 1:
                dict2["start_time"] = item["time"]
            collection.update_one({'job': '2017年國家科技獎勵'}, {'$set': dict2})

        return item
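
Connecting in __init__ and re-authenticating on every item works, but the more conventional pattern opens one connection per spider run and closes it when the spider finishes. A rough sketch, assuming a hypothetical MONGO_URI entry in settings (the names here are illustrative, not part of this project):

from pymongo import MongoClient

class MongoPipeline(object):
    def __init__(self, mongo_uri):
        self.mongo_uri = mongo_uri

    @classmethod
    def from_crawler(cls, crawler):
        # Read connection details from settings instead of hard-coding them
        return cls(mongo_uri=crawler.settings.get('MONGO_URI'))

    def open_spider(self, spider):
        # One client per crawl instead of per-item authentication
        self.client = MongoClient(self.mongo_uri)
        self.db = self.client.nosta

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # scrapy.Item converts cleanly to a dict, so no field-by-field copy
        self.db.nosta_2017.insert_one(dict(item))
        return item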

The settings file (only the changed parts):

ITEM_PIPELINES = {
   'nosta.pipelines.NostaPipeline': 300,
   'scrapy.pipelines.images.ImagesPipeline': 1,
   'scrapy.pipelines.files.FilesPipeline': 1
}

IMAGES_STORE = r'.'
FILES_STORE = r'.'
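
Two things worth knowing about this block: the priority numbers mean the media pipelines (1) run before NostaPipeline (300), so files and images are already downloaded by the time an item reaches MongoDB; and FILES_STORE / IMAGES_STORE point at the current working directory, so downloads land wherever the crawl happens to be launched from. A sketch of anchoring them to the project instead (paths are illustrative):

import os

# Resolve the stores relative to settings.py so the download location does
# not depend on where the crawl command is run from
PROJECT_ROOT = os.path.dirname(os.path.abspath(__file__))
FILES_STORE = os.path.join(PROJECT_ROOT, 'downloads', 'files')
IMAGES_STORE = os.path.join(PROJECT_ROOT, 'downloads', 'images')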

 
