一文解決scrapy帶案例爬取噹噹圖書

專注的阿熊發表於2021-06-04

# Scrapy settings for the demo1 project.
#
# For simplicity, this file contains only settings considered important or
# commonly used.  Many more settings are documented at:
# https://docs.scrapy.org/en/latest/topics/settings.html

BOT_NAME = "demo1"  # Project name.

SPIDER_MODULES = ["demo1.spiders"]    # Packages where Scrapy looks for spiders.
NEWSPIDER_MODULE = "demo1.spiders"    # Package where `scrapy genspider` creates new spiders.

# Crawl responsibly by identifying yourself (and your website) on the
# User-Agent header.  Usually set to a value captured from a real browser's
# devtools — most crawls need this.
#USER_AGENT = 'demo1 (+)'

# Obey robots.txt rules.  The default is True; this project sets it to False
# because otherwise very little of the target site could be crawled.
ROBOTSTXT_OBEY = False

# Maximum number of concurrent requests performed by Scrapy (default: 16),
# i.e. how many crawler threads may run at once.
#CONCURRENT_REQUESTS = 32

# Delay, in seconds, between requests to the same website (default: 0;
# the commented value 1 is a good cost/benefit trade-off — crawl one page,
# wait one second.  For large crawls, a fraction of a second also works).
# See also the AutoThrottle settings below.
#DOWNLOAD_DELAY = 1

# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default).  Leaving cookies on records the
# cookies picked up during the crawl, which is often very useful.
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default).
#TELNETCONSOLE_ENABLED = False

# Override the default request headers.  The USER_AGENT above is ultimately
# sent as one of these headers; tune them to match the content being crawled.
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares.
#SPIDER_MIDDLEWARES = {
#    'demo1.middlewares.Demo1SpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares.
#DOWNLOADER_MIDDLEWARES = {
#    'demo1.middlewares.Demo1DownloaderMiddleware': 543,
#}

# Enable or disable extensions.
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines.  The number is the priority: the lower the value,
# the earlier the pipeline runs.  pipelines.py defines two pipelines — one
# that scrapes the pages and one that writes to MySQL; the MySQL pipeline is
# given the lower number so that, when there is crawled data, the database
# save runs first.
#ITEM_PIPELINES = {
#    'demo1.pipelines.Demo1Pipeline': 300,
#    'demo1.pipelines.Demo1MySqlPipeline' : 200,
#}

# Enable and configure the AutoThrottle extension (disabled by default).
#AUTOTHROTTLE_ENABLED = True
# The initial download delay.
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies.
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server.
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default).
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

 


來自 “ ITPUB部落格 ” ,連結:http://blog.itpub.net/69946337/viewspace-2775531/,如需轉載,請註明出處,否則將追究法律責任。

相關文章