Scraping Lianjia second-hand housing listings with Scrapy and storing them in MongoDB
1. Create the project
scrapy startproject lianjiahouse
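This generates the standard Scrapy project layout; everything edited in the following steps lives inside it:

lianjiahouse/
    scrapy.cfg            # deploy configuration
    lianjiahouse/         # project module
        __init__.py
        items.py          # item definitions (step 3)
        middlewares.py
        pipelines.py      # pipelines (step 4)
        settings.py       # settings (step 5)
        spiders/          # spiders live here (step 6)
            __init__.py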
2. Generate a spider from the crawl template
scrapy genspider -t crawl house lianjia.com
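The -t crawl option produces a CrawlSpider skeleton instead of the basic template. It looks roughly like this (exact contents vary by Scrapy version), and step 6 fills in the real rules and parsing:

import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule

class HouseSpider(CrawlSpider):
    name = 'house'
    allowed_domains = ['lianjia.com']
    start_urls = ['http://lianjia.com/']

    rules = (
        Rule(LinkExtractor(allow=r'Items/'), callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        item = {}
        return item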
3. Define the fields to scrape in items.py
import scrapy

class LianjiahouseItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # Listing title
    house_name = scrapy.Field()
    # Community (residential compound) name
    community_name = scrapy.Field()
    # District / location
    location = scrapy.Field()
    # Lianjia listing number
    house_record = scrapy.Field()
    # Total asking price
    total_amount = scrapy.Field()
    # Price per square meter
    unit_price = scrapy.Field()
    # Basic housing information
    # Built area
    area_total = scrapy.Field()
    # Usable (inner) area
    area_use = scrapy.Field()
    # Layout (rooms and halls)
    house_type = scrapy.Field()
    # Orientation
    direction = scrapy.Field()
    # Renovation status
    sub_info = scrapy.Field()
    # Heating method
    heating_method = scrapy.Field()
    # Property rights term
    house_property = scrapy.Field()
    # Floor
    floor = scrapy.Field()
    # Total number of floors
    total_floors = scrapy.Field()
    # Elevator (yes/no)
    is_left = scrapy.Field()
    # Elevator-to-household ratio
    left_rate = scrapy.Field()
    # Unit structure
    structure = scrapy.Field()
    # Transaction information
    # Listing date
    release_date = scrapy.Field()
    # Last transaction date
    last_trade_time = scrapy.Field()
    # Years of ownership
    house_years = scrapy.Field()
    # Mortgage information
    pawn = scrapy.Field()
    # Transaction ownership type
    trade_property = scrapy.Field()
    # Housing usage
    house_usage = scrapy.Field()
    # Property ownership
    property_own = scrapy.Field()
    # Image URLs
    images_urls = scrapy.Field()
    # Downloaded image results
    images = scrapy.Field()
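A scrapy.Item behaves like a dict restricted to the declared fields, which is why the pipeline below can simply call dict(item). A quick illustration, not from the original post:

from lianjiahouse.items import LianjiahouseItem

item = LianjiahouseItem()
item['house_name'] = 'example listing'
print(dict(item))         # {'house_name': 'example listing'}
# item['bad_field'] = 1   # would raise KeyError: unsupported field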
4. Write pipelines.py: one pipeline stores the data in MongoDB, the other downloads the images
import pymongo
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline

class LianjiahousePipeline(object):
    collection_name = 'lianjiahouse'

    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'lianjia')
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # insert_one() replaces the deprecated insert()
        self.db[self.collection_name].insert_one(dict(item))
        return item

class LianjiaImagePipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['images_urls']:
            yield Request(image_url, meta={'item': item})

    def file_path(self, request, response=None, info=None, *, item=None):
        # Save each image under a folder named after the listing
        item = request.meta['item']
        image_folder = item['house_name']
        image_name = request.url.split('/')[-1]
        return '{0}/{1}'.format(image_folder, image_name)
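The from_crawler() classmethod is how Scrapy hands settings to the pipeline, so the Mongo connection details stay in settings.py. One optional extension, not in the original code: re-running the crawl inserts identical listings twice, and a unique index on the Lianjia listing number is a simple guard. A sketch of drop-in replacements for open_spider() and process_item(), assuming house_record is always scraped:

from pymongo.errors import DuplicateKeyError

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]
        # Unique index: at most one document per Lianjia listing number
        self.db[self.collection_name].create_index('house_record', unique=True)

    def process_item(self, item, spider):
        try:
            self.db[self.collection_name].insert_one(dict(item))
        except DuplicateKeyError:
            pass  # listing already stored on a previous run
        return item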
5. In settings.py, enable the pipelines, configure image storage and the MongoDB connection, and set ROBOTSTXT_OBEY to False
# -*- coding: utf-8 -*-
# Scrapy settings for lianjiahouse project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html
BOT_NAME = 'lianjiahouse'
SPIDER_MODULES = ['lianjiahouse.spiders']
NEWSPIDER_MODULE = 'lianjiahouse.spiders'
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'lianjiahouse (+http://www.yourdomain.com)'
# Obey robots.txt rules
ROBOTSTXT_OBEY = False
# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32
# Configure a delay for requests for the same website (default: 0)
# See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16
# Disable cookies (enabled by default)
#COOKIES_ENABLED = False
# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False
# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
# 'Accept-Language': 'en',
#}
# Enable or disable spider middlewares
# See https://doc.scrapy.org/en/latest/topics/spider-middleware.html
SPIDER_MIDDLEWARES = {
    'lianjiahouse.middlewares.LianjiahouseSpiderMiddleware': 543,
}
# Enable or disable downloader middlewares
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
DOWNLOADER_MIDDLEWARES = {
    'lianjiahouse.middlewares.LianjiahouseDownloaderMiddleware': 543,
}
# Enable or disable extensions
# See https://doc.scrapy.org/en/latest/topics/extensions.html
#EXTENSIONS = {
# 'scrapy.extensions.telnet.TelnetConsole': None,
#}
# Configure item pipelines
# See https://doc.scrapy.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {
    'lianjiahouse.pipelines.LianjiahousePipeline': 300,
    'lianjiahouse.pipelines.LianjiaImagePipeline': 400,
}
# Enable and configure the AutoThrottle extension (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False
# Enable and configure HTTP caching (disabled by default)
# See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'
# Image storage configuration
IMAGES_STORE = 'D:\\Scrapy\\chapter5\\5_04\\lianjia\\images'
IMAGES_URLS_FIELD = 'images_urls'
IMAGES_RESULT_FIELD = 'images'
# MongoDB connection settings
MONGO_URI = 'localhost:27017'
MONGO_DATABASE = 'lianjia'
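Before running the crawl, it's worth confirming that MongoDB is actually reachable with these settings. A minimal check, assuming a default local mongod:

import pymongo

client = pymongo.MongoClient('localhost:27017', serverSelectionTimeoutMS=2000)
# Raises ServerSelectionTimeoutError if mongod is not running
print(client.server_info()['version'])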
6. Write the crawl rules and parsing logic in the spider
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from lianjiahouse.items import LianjiahouseItem

class HouseSpider(CrawlSpider):
    name = 'house'
    allowed_domains = ['lianjia.com']
    start_urls = ['https://bj.lianjia.com/ershoufang/']

    rules = (
        Rule(LinkExtractor(allow=r'/ershoufang/\d{12}\.html'), callback='parse_item'),
    )

    def parse_item(self, response):
        i = LianjiahouseItem()
        # Listing title
        i['house_name'] = response.css('title::text').extract_first().replace(' ', '')
        # Community name
        i['community_name'] = response.css('.communityName a::text').extract_first()
        # i['location'] = response.css()
        # Lianjia listing number
        i['house_record'] = response.css('.houseRecord .info::text').extract_first()
        # Total price
        i['total_amount'] = response.css('.overview .total::text').extract_first()
        # Housing information
        # Price per square meter
        i['unit_price'] = response.css('.unitPriceValue::text').extract_first()
        # Built area
        i['area_total'] = response.xpath('//div[@class="base"]//ul/li[3]/text()')\
            .re_first(r'\d+\.\d+')
        # Usable area
        i['area_use'] = response.xpath('//div[@class="base"]//ul/li[5]/text()')\
            .re_first(r'\d+\.\d+')
        # Layout (rooms and halls)
        i['house_type'] = response.xpath('//div[@class="base"]//ul/li[1]/text()')\
            .extract_first()
        # Orientation
        i['direction'] = response.xpath('//div[@class="base"]//ul/li[7]/text()')\
            .extract_first()
        # Renovation status
        i['sub_info'] = response.xpath('//div[@class="base"]//ul/li[9]/text()')\
            .extract_first()
        # Heating method
        i['heating_method'] = response.xpath('//div[@class="base"]//ul/li[11]/text()')\
            .extract_first()
        # Property rights term
        i['house_property'] = response.xpath('//div[@class="base"]//ul/li[13]/text()')\
            .extract_first()
        # Floor
        i['floor'] = response.xpath('//div[@class="base"]//ul/li[2]/text()')\
            .extract_first()
        # Total number of floors
        i['total_floors'] = response.xpath('//div[@class="base"]//ul/li[2]/text()')\
            .re_first(r'\d+')
        # Elevator (yes/no)
        i['is_left'] = response.xpath('//div[@class="base"]//ul/li[12]/text()')\
            .extract_first()
        # Elevator-to-household ratio
        i['left_rate'] = response.xpath('//div[@class="base"]//ul/li[10]/text()')\
            .extract_first()
        # Listing date
        i['release_date'] = response.xpath('//div[@class="transaction"]//ul/li[1]'
                                           '/span[2]/text()').extract_first()
        # Last transaction date
        i['last_trade_time'] = response.xpath('//div[@class="transaction"]//ul/li[3]'
                                              '/span[2]/text()').extract_first()
        # Years of ownership
        i['house_years'] = response.xpath('//div[@class="transaction"]//ul/li[5]'
                                          '/span[2]/text()').extract_first()
        # Mortgage information contains spaces and newlines: remove the spaces
        # with replace(), then strip the newlines with strip()
        i['pawn'] = response.xpath('//div[@class="transaction"]//ul/li[7]/span[2]'
                                   '/text()').extract_first().replace(' ', '').strip()
        # Transaction ownership type
        i['trade_property'] = response.xpath('//div[@class="transaction"]//ul/li[2]'
                                             '/span[2]/text()').extract_first()
        # Housing usage
        i['house_usage'] = response.xpath('//div[@class="transaction"]//ul/li[4]'
                                          '/span[2]/text()').extract_first()
        # Property ownership
        i['property_own'] = response.xpath('//div[@class="transaction"]//ul/li[6]'
                                           '/span[2]/text()').extract_first()
        # Image URLs
        i['images_urls'] = response.css('.smallpic > li::attr(data-pic)').extract()
        yield i
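Note that the single Rule above only follows detail-page links found on the start page, so the crawl stops after the first page of results. A second rule for the paginated list pages would cover the whole listing. A sketch, assuming Lianjia's list pages still match the /ershoufang/pgN/ pattern (verify against the live site):

    rules = (
        # Follow paginated list pages (assumed URL pattern); no callback,
        # so the detail-page rule is applied to every page reached
        Rule(LinkExtractor(allow=r'/ershoufang/pg\d+/'), follow=True),
        # Parse each listing detail page
        Rule(LinkExtractor(allow=r'/ershoufang/\d{12}\.html'), callback='parse_item'),
    )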
7. Run the crawler with scrapy crawl house. The scraped data shows up in the console output, and you can also connect to mongod to inspect the stored documents; the downloaded images appear under IMAGES_STORE.
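To verify the stored data without opening a mongo shell, a few lines of pymongo suffice (database and collection names as configured above):

import pymongo

client = pymongo.MongoClient('localhost:27017')
collection = client['lianjia']['lianjiahouse']
print(collection.count_documents({}))  # number of listings stored
print(collection.find_one())           # inspect one document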
Lianjia is still fairly easy to scrape. A link to the complete code follows.