Scraping Lianjia second-hand housing listings with Scrapy and storing them in MongoDB

Posted by 農村落魄小青年 on 2021-01-03

1. Create the project

scrapy startproject lianjiahouse

2. Generate a spider from the crawl template

scrapy genspider -t crawl house lianjia.com
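
The `-t crawl` flag uses Scrapy's CrawlSpider template. The generated spiders/house.py skeleton looks roughly like this (the exact boilerplate varies slightly between Scrapy versions):

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class HouseSpider(CrawlSpider):
    name = 'house'
    allowed_domains = ['lianjia.com']
    start_urls = ['http://lianjia.com/']

    rules = (
        # Placeholder rule to be replaced with real URL patterns
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item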

3. Define the fields to scrape in items.py

import scrapy


class LianjiahouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Listing title
    house_name = scrapy.Field()
    # Residential community name
    community_name = scrapy.Field()
    # District / location
    location = scrapy.Field()
    # Lianjia listing ID
    house_record = scrapy.Field()
    # Total asking price
    total_amount = scrapy.Field()
    # Price per square meter
    unit_price = scrapy.Field()
    # Basic house information
    # Gross floor area
    area_total = scrapy.Field()
    # Usable (interior) area
    area_use = scrapy.Field()
    # Layout (rooms and halls)
    house_type = scrapy.Field()
    # Orientation
    direction = scrapy.Field()
    # Renovation status
    sub_info = scrapy.Field()
    # Heating method
    heating_method = scrapy.Field()
    # Property-rights term
    house_property = scrapy.Field()
    # Floor
    floor = scrapy.Field()
    # Total number of floors
    total_floors = scrapy.Field()
    # Elevator
    is_left = scrapy.Field()
    # Elevator-to-unit ratio
    left_rate = scrapy.Field()
    # Unit structure
    structure = scrapy.Field()
    # Transaction information
    # Listing date
    release_date = scrapy.Field()
    # Last transaction date
    last_trade_time = scrapy.Field()
    # Years of ownership
    house_years = scrapy.Field()
    # Mortgage information
    pawn = scrapy.Field()
    # Transaction ownership type
    trade_property = scrapy.Field()
    # House usage
    house_usage = scrapy.Field()
    # Property ownership
    property_own = scrapy.Field()
    # Image URLs
    images_urls = scrapy.Field()
    # Downloaded image results
    images = scrapy.Field()
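
Scrapy items support dict-style access, which is how the spider fills them in and how the pipeline converts them for MongoDB; a quick illustration:

item = LianjiahouseItem()
item['house_name'] = 'example'   # fields are set with dict-style access
item['total_amount'] = '500'
dict(item)                       # plain dict, which the pipeline stores later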

4. Write pipelines.py: one pipeline stores items in MongoDB, the other saves the images


import pymongo
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline


class LianjiahousePipeline(object):
    collection_name = 'lianjiahouse'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        # Read the MongoDB connection settings from settings.py
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'lianjia')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert() is deprecated in pymongo; use insert_one()
        self.db[self.collection_name].insert_one(dict(item))
        return item


class LianjiaImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # Request every image URL collected by the spider
        for image_url in item['images_urls']:
            yield Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Group each listing's images in a folder named after the listing
        item = request.meta['item']
        image_folder = item['house_name']
        image_guid = request.url.split('/')[-1]
        return u'{0}/{1}'.format(image_folder, image_guid)
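
One caveat: house_name comes straight from the page <title> and may contain characters that Windows forbids in folder names. A small sanitizing helper (hypothetical, not part of the original code) could be applied to image_folder inside file_path:

import re

def safe_folder_name(name):
    # Replace characters Windows forbids in folder names (\ / : * ? " < > |)
    return re.sub(r'[\\/:*?"<>|]', '_', name)

Usage would be image_folder = safe_folder_name(item['house_name']).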

5. In settings.py, enable the pipelines, configure image storage and the MongoDB connection, and set ROBOTSTXT_OBEY to False

# -*- coding: utf-8 -*-

# Scrapy settings for lianjiahouse project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     https://doc.scrapy.org/en/latest/topics/settings.html
#     https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
#     https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lianjiahouse'

SPIDER_MODULES = ['lianjiahouse.spiders']
NEWSPIDER_MODULE = 'lianjiahouse.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lianjiahouse (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
   'lianjiahouse.middlewares.LianjiahouseSpiderMiddleware': 543,
}

# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
   'lianjiahouse.middlewares.LianjiahouseDownloaderMiddleware': 543,
}

# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'lianjiahouse.pipelines.LianjiahousePipeline': 300,
    'lianjiahouse.pipelines.LianjiaImagePipeline':400
}

# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Image storage settings
IMAGES_STORE = 'D:\\Scrapy\\chapter5\\5_04\\lianjia\\images'
IMAGES_URLS_FIELD = 'images_urls'
IMAGES_RESULT_FIELD = 'images'

# MongoDB connection settings
MONGO_URI = 'localhost:27017'
MONGO_DATABASE = 'lianjia'
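
pymongo.MongoClient accepts a plain host:port string like the one above as well as a full mongodb:// URI. A quick sanity check before crawling (a minimal sketch, assuming mongod is running locally):

import pymongo

client = pymongo.MongoClient('localhost:27017')
print(client.server_info()['version'])  # raises if the server is unreachable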



6. Write the spider's crawl rules and parsing logic

# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from lianjiahouse.items import LianjiahouseItem


class HouseSpider(CrawlSpider):
    name = 'house'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://bj.lianjia.com/ershoufang/']

    rules = (
        # Detail pages look like /ershoufang/<12 digits>.html
        Rule(LinkExtractor(allow=r'/ershoufang/\d{12}\.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        i = LianjiahouseItem()
        # Listing title
        i['house_name'] = response.css('title::text').extract_first().replace(' ', '')
        # Residential community
        i['community_name'] = response.css('.communityName a::text').extract_first()
        # i['location'] = response.css()
        # Lianjia listing ID
        i['house_record'] = response.css('.houseRecord .info::text').extract_first()
        # Total price
        i['total_amount'] = response.css('.overview .total::text').extract_first()
        # Basic house information
        # Price per square meter
        i['unit_price'] = response.css('.unitPriceValue::text').extract_first()
        # Gross floor area
        i['area_total'] = response.xpath('//div[@class="base"]//ul/li[3]/text()')\
            .re_first(r'\d+\.\d+')
        # Usable area
        i['area_use'] = response.xpath('//div[@class="base"]//ul/li[5]/text()')\
            .re_first(r'\d+\.\d+')
        # Layout
        i['house_type'] = response.xpath('//div[@class="base"]//ul/li[1]/text()')\
            .extract_first()
        # Orientation
        i['direction'] = response.xpath('//div[@class="base"]//ul/li[7]/text()')\
            .extract_first()
        # Renovation status
        i['sub_info'] = response.xpath('//div[@class="base"]//ul/li[9]/text()')\
            .extract_first()
        # Heating method
        i['heating_method'] = response.xpath('//div[@class="base"]//ul/li[11]/text()')\
            .extract_first()
        # Property-rights term
        i['house_property'] = response.xpath('//div[@class="base"]//ul/li[13]/text()')\
            .extract_first()
        # Floor
        i['floor'] = response.xpath('//div[@class="base"]//ul/li[2]/text()')\
            .extract_first()
        # Total number of floors
        i['total_floors'] = response.xpath('//div[@class="base"]//ul/li[2]/text()')\
            .re_first(r'\d+')
        # Elevator
        i['is_left'] = response.xpath('//div[@class="base"]//ul/li[12]/text()')\
            .extract_first()
        # Elevator-to-unit ratio
        i['left_rate'] = response.xpath('//div[@class="base"]//ul/li[10]/text()')\
            .extract_first()
        # Listing date
        i['release_date'] = response.xpath('//div[@class="transaction"]//ul/li[1]'
                                           '/span[2]/text()').extract_first()
        # Last transaction date
        i['last_trade_time'] = response.xpath('//div[@class="transaction"]//ul/li[3]'
                                              '/span[2]/text()').extract_first()
        # Years of ownership
        i['house_years'] = response.xpath('//div[@class="transaction"]//ul/li[5]'
                                          '/span[2]/text()').extract_first()
        # The mortgage field contains spaces and newlines: strip the spaces with
        # replace(), then trim the newlines with strip()
        i['pawn'] = response.xpath('//div[@class="transaction"]//ul/li[7]/span[2]'
                                   '/text()').extract_first().replace(' ', '').strip()
        # Transaction ownership type
        i['trade_property'] = response.xpath('//div[@class="transaction"]//ul/li[2]'
                                             '/span[2]/text()').extract_first()
        # House usage
        i['house_usage'] = response.xpath('//div[@class="transaction"]//ul/li[4]'
                                          '/span[2]/text()').extract_first()
        # Property ownership
        i['property_own'] = response.xpath('//div[@class="transaction"]//ul/li[6]'
                                           '/span[2]/text()').extract_first()
        # Image URLs
        i['images_urls'] = response.css('.smallpic > li::attr(data-pic)').extract()
        yield i
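
As written, the single rule only follows the detail links found on the start page. To crawl beyond page one you can add a second rule for pagination; a sketch, assuming Lianjia's list pages follow the /ershoufang/pgN/ pattern:

    rules = (
        # Follow pagination pages, but don't parse them as listings
        Rule(LinkExtractor(allow=r'/ershoufang/pg\d+/$'), follow=True),
        # Parse each listing detail page
        Rule(LinkExtractor(allow=r'/ershoufang/\d{12}\.html'), callback='parse_item'),
    )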

7. Run the spider. The scraped data shows up in the console, and you can also connect to mongod to inspect what was stored; a quick check with pymongo is shown below.
(screenshot: scraped documents in the MongoDB collection)
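
A short pymongo query to verify the results (a sketch, assuming the default settings above):

import pymongo

db = pymongo.MongoClient('localhost:27017')['lianjia']
print(db['lianjiahouse'].count_documents({}))   # number of listings stored
print(db['lianjiahouse'].find_one())            # sample document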
The downloaded images land under IMAGES_STORE, grouped into one folder per listing.
(screenshot: downloaded listing images)
Overall, Lianjia is fairly easy to crawl. The link to the full code follows.
