# Scrapy settings for demo1 project

#

# For simplicity, this file contains only settings considered important or

# commonly used. You can find more settings consulting the documentation:

#     https://docs.scrapy.org/en/latest/topics/settings.html

# Project (bot) name.
BOT_NAME = "demo1"

# Module where newly generated spiders are created, and the list of modules
# searched for spiders.
NEWSPIDER_MODULE = "demo1.spiders"
SPIDER_MODULES = ["demo1.spiders"]

# Crawl responsibly by identifying yourself (and your website) on the user-agent.
# Original note: a realistic value can be copied from the browser's network
# inspector; this header matters and should normally be sent.
# NOTE(review): the URL inside "(+)" was lost in this copy; the URL below is a
# placeholder to be replaced with your own site.
#USER_AGENT = 'demo1 (+http://www.yourdomain.com)'

# Do not enforce robots.txt rules.
# Original note: the robots setting defaults to True and usually has to be
# switched to False, otherwise very little can be crawled.
ROBOTSTXT_OBEY = False

# Configure maximum concurrent requests performed by Scrapy (default: 16)

#CONCURRENT_REQUESTS = 32 # Maximum number of requests the crawler may have in flight at the same time

# Configure a delay for requests for the same website (default: 0)

# See https://docs.scrapy.org/en/latest/topics/settings.html#download-delay

# See also autothrottle settings and docs

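#DOWNLOAD_DELAY = 1 # Download delay in seconds between requests to the same website. The default is 0 (no delay), not 3 seconds. With a value of 3 the crawler pauses 3 seconds after each request; 1 second is a good trade-off, and a fraction of a second also works when many files have to be fetched.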

# The download delay setting will honor only one of:

#CONCURRENT_REQUESTS_PER_DOMAIN = 16

#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)

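#COOKIES_ENABLED = False # Whether cookies are kept. Cookies are enabled by default; uncommenting this line turns them off. Leaving them on records the cookies received during the crawl, which is very useful.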

# Disable Telnet Console (enabled by default)

#TELNETCONSOLE_ENABLED = False

# Override the default request headers:

#DEFAULT_REQUEST_HEADERS = {

# 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',

# 'Accept-Language': 'en',

#} # Default request headers. The USER_AGENT setting above is itself sent as one of the request headers; adjust these headers to suit the content you are crawling.

# Enable or disable spider middlewares

# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html

#SPIDER_MIDDLEWARES = {

# 'demo1.middlewares.Demo1SpiderMiddleware': 543,

#}

# Enable or disable downloader middlewares

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html

#DOWNLOADER_MIDDLEWARES = {

# 'demo1.middlewares.Demo1DownloaderMiddleware': 543,

#}

# Enable or disable extensions

# See https://docs.scrapy.org/en/latest/topics/extensions.html

#EXTENSIONS = {

# 'scrapy.extensions.telnet.TelnetConsole': None,

#}

# Configure item pipelines

# See https://docs.scrapy.org/en/latest/topics/item-pipeline.html

#ITEM_PIPELINES = {

#'demo1.pipelines.Demo1Pipeline': 300,

#'demo1.pipelines.Demo1MySqlPipeline' : 200,

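#} # Item pipelines. The number (e.g. 300) is the priority: the lower the value, the earlier the pipeline runs. pipelines.py defines two pipelines, one for the crawled page data and one that stores to the database; their priorities are set so that, whenever there is crawled data, the database-storage pipeline runs first.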

# Enable and configure the AutoThrottle extension (disabled by default)

# See https://docs.scrapy.org/en/latest/topics/autothrottle.html

#AUTOTHROTTLE_ENABLED = True

# The initial download delay

#AUTOTHROTTLE_START_DELAY = 5

# The maximum download delay to be set in case of high latencies

#AUTOTHROTTLE_MAX_DELAY = 60

# The average number of requests Scrapy should be sending in parallel to

# each remote server

#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0

# Enable showing throttling stats for every response received:

#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)

# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings

#HTTPCACHE_ENABLED = True

#HTTPCACHE_EXPIRATION_SECS = 0

#HTTPCACHE_DIR = 'httpcache'

#HTTPCACHE_IGNORE_HTTP_CODES = []

#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

# Source article: "Crawling Dangdang books with Scrapy, explained with a worked example"

# Related articles