scrapy 一些設定和問題

一個大柚子發表於2019-02-17

scrapy設定ua池

設定後在setting啟用

# settings.py — enable the custom downloader middlewares.
# Lower priority number = closer to the engine; the two custom middlewares
# share 400, so Scrapy orders them by dict insertion order.
DOWNLOADER_MIDDLEWARES = {
    'laogou.middlewares.LaogouDownloaderMiddleware': 543,
    'laogou.middlewares.randomUserAgentMiddleware': 400,
    'laogou.middlewares.randomProxyMiddleware': 400,
}
import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class randomUserAgentMiddleware(UserAgentMiddleware):
    """Downloader middleware that assigns a random User-Agent to each request.

    Register it in ``DOWNLOADER_MIDDLEWARES`` so every outgoing request
    rotates through ``user_agent_list`` instead of using Scrapy's default UA.
    """

    # Pool of desktop-browser User-Agent strings to rotate through.
    # BUGFIX(review): the original list was missing the comma after the first
    # entry, so Python's implicit string concatenation silently merged the
    # first two UAs into one invalid header value.
    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        # Deliberately does not call super().__init__(): the base class would
        # read USER_AGENT from settings, which this middleware overrides anyway.
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Pick a random UA and set it unless the request already carries one."""
        ua = random.choice(self.user_agent_list)
        if ua:
            # setdefault keeps an explicitly provided User-Agent intact.
            request.headers.setdefault('User-Agent', ua)

scrapy設定ip池

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

class randomHttpProxyMiddleware(HttpProxyMiddleware):
    """Downloader middleware that routes each request through a random proxy.

    NOTE(review): the settings snippet registers
    ``laogou.middlewares.randomProxyMiddleware`` while this class is named
    ``randomHttpProxyMiddleware`` — confirm the dotted path in settings
    matches the actual class name, otherwise Scrapy raises NameError on start.
    """

    # Candidate proxy endpoints. Free proxies like these expire quickly;
    # refresh the list or load it from an external source in production.
    ip_list = [
        'https://182.122.176.49:9999',
        'https://125.123.141.20:9999',
    ]

    def __init__(self, ip=''):
        # Skips super().__init__() on purpose: the base class reads *_proxy
        # environment variables, which this middleware replaces entirely.
        self.ip = ip

    def process_request(self, request, spider):
        """Attach a randomly chosen proxy from ip_list to the request."""
        ip = random.choice(self.ip_list)
        if ip:
            request.meta['proxy'] = ip

 

scrapy 設定自定義cookie:class LaogouwangSpider(scrapy.Spider):    

   name = `laogouwang`    
   # allowed_domains = [`www.laogou.com`]

   # start_urls = [`http://www.laogou.com/`] def start_requests(self): url = `https://www.lagou.com/` yield scrapy.Request(url=url,callback=self.parse,meta={`cookiejar`:1}) def parse(self, response): print(response.request.headers.getlist(`Cookie`)) print(response.headers.getlist(`Set-Cookie`)) url = `https://www.lagou.com/jobs/list_`+ str(settings.keys) +`?city=`+ str(settings.cidy) +`&cl=false&fromSearch=true&labelWords=&suginput=` print(response.meta[`cookiejar`])
yield scrapy.Request(url=url,callback=self.download,meta={`cookiejar`:response.meta[`cookiejar`],`id`:1},dont_filter=True)
   def download(self, response):
    # print(response.text)
print(response.request.headers.getlist(`Cookie`))
print(response.headers.getlist(`Set-Cookie`))
i = response.meta.get(`id`)
file = `false`
if i == 1:
file = `true`
data = {
"first":file,
"pn":str(i),
"kd":str(settings.keys)
}
headers_post = {
`Accept`: `application/json, text/javascript, */*; q=0.01`,
`Content-Type`: `application/x-www-form-urlencoded; charset=UTF-8`,
`Content-Length`: str(len(urllib.parse.urlencode(data))),
`Connection`: `keep-alive`,
`Referer`:str(response.url),
`User-Agent`: `Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0`,
}
print(headers_post)
print(str(response.url))
print(data)
url = `https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false`

yield scrapy.FormRequest(url=url,formdata=data,headers=headers_post,callback=self.files,dont_filter=True,meta={`cookiejar`:True,`dont_redirect`: True,`handle_httpstatus_list`: [301,302]})
meta={'cookiejar': 1} 這個是啟用 cookie 記錄,在後面的請求中使用 'cookiejar': response.meta['cookiejar'] 可以沿用並更新同一份 cookie。
注意,需要在setting中設定COOKIES_ENABLED = True

獲取請求cookies是response.request.headers.getlist(`Cookie`),響應cookies是response.headers.getlist(`Set-Cookie`)。
dont_filter=True 的作用是跳過去重過濾器、允許重複請求(並非禁止重定向)。
在meta裡使用`dont_redirect`: True,`handle_httpstatus_list`: [301,302]可以在當前scrapy請求裡禁用重定向。

scrapy 使用日誌
import datetime
import os

# Timestamped log file, e.g. logs/2019_02_17_12_30_05_laogou.log.
# BUGFIX(review): the original format string was '%Y_%m_%H_%M_%S' (no %d),
# so log names from different days of the same month could collide.
time = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
LOG_FILE = 'logs' + os.sep + str(time) + '_' + "laogou.log"
LOG_LEVEL = "DEBUG"
# BUGFIX(review): bare `true` is a NameError in Python — the constant is True.
LOG_STDOUT = True

scrapy提供五種日誌級別。

1.CRITICAL -- 關鍵錯誤
2.ERROR -- 一般級別的錯誤
3.WARNING -- 警告資訊
4.INFO -- 資訊訊息的日誌(建議生產模式使用)
5.DEBUG -- 除錯訊息的日誌(建議開發模式)
LOG_FILE 用於日誌輸出記錄的檔名 預設None
LOG_LEVEL 要記錄的最低階別 預設DEBUG
LOG_STDOUT 如果為 True,則程式的所有標準輸出和錯誤都重定向到日誌,例如 print(),預設 False

使用檔案啟動spider
# laogoustrart.py — launch the spider from a plain Python script instead of
# the `scrapy crawl` command (useful for IDE debugging).
from laogou.spiders.laogouwang import LaogouwangSpider
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

# CrawlerProcess reads the project's settings.py and runs the reactor itself.
process = CrawlerProcess(get_project_settings())
process.crawl(LaogouwangSpider)
process.start()  # blocks until the crawl finishes
 




 

 

相關文章