Setting up a User-Agent pool in Scrapy
Once the middlewares are defined, enable them in settings.py:
DOWNLOADER_MIDDLEWARES = {
    'laogou.middlewares.LaogouDownloaderMiddleware': 543,
    'laogou.middlewares.randomUserAgentMiddleware': 400,
    'laogou.middlewares.randomHttpProxyMiddleware': 401,
}
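Since the custom middleware below sets the header itself, you can optionally also disable Scrapy's built-in UserAgentMiddleware, so the global USER_AGENT setting never competes with the pool. Add this entry to the DOWNLOADER_MIDDLEWARES dict above:

    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': None,  # disables the built-in middleware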
The middleware itself, in middlewares.py:

import random

from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

class randomUserAgentMiddleware(UserAgentMiddleware):

    user_agent_list = [
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
        "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
        "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
        "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
        "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    ]

    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # pick a random User-Agent for each outgoing request
        ua = random.choice(self.user_agent_list)
        if ua:
            request.headers.setdefault('User-Agent', ua)
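If you would rather keep the list in settings.py than hard-code it, a minimal sketch of the same idea reads it through from_crawler (USER_AGENT_LIST here is an assumed custom setting, not a Scrapy built-in):

import random

class randomUserAgentFromSettingsMiddleware:

    def __init__(self, user_agent_list):
        self.user_agent_list = user_agent_list

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT_LIST is an assumed custom setting defined in settings.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        if self.user_agent_list:
            request.headers['User-Agent'] = random.choice(self.user_agent_list)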
Setting up an IP proxy pool in Scrapy
The same pattern works for proxies; in middlewares.py:

import random

from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware

class randomHttpProxyMiddleware(HttpProxyMiddleware):

    ip_list = [
        'https://182.122.176.49:9999',
        'https://125.123.141.20:9999',
    ]

    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        # assign a random proxy to each outgoing request
        ip = random.choice(self.ip_list)
        if ip:
            request.meta['proxy'] = ip
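Free proxies go stale quickly. One hedged extension, not part of the original, is a process_exception method added to randomHttpProxyMiddleware that discards a proxy after a download error and reschedules the request (a fresh proxy is assigned when process_request runs again):

    def process_exception(self, request, exception, spider):
        # assumed extension: drop the failing proxy, keeping at least one in the pool
        bad = request.meta.get('proxy')
        if bad in self.ip_list and len(self.ip_list) > 1:
            self.ip_list.remove(bad)
        # returning a Request tells Scrapy to reschedule it for download
        return request.replace(dont_filter=True)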
Setting custom cookies in Scrapy

import urllib.parse

import scrapy

from laogou import settings  # assumed import: `keys` and `cidy` live in the project settings module

class LaogouwangSpider(scrapy.Spider):
    name = 'laogouwang'
    # allowed_domains = ['www.laogou.com']
    # start_urls = ['http://www.laogou.com/']

    def start_requests(self):
        url = 'https://www.lagou.com/'
        # meta={'cookiejar': 1} starts a cookie session for this crawl
        yield scrapy.Request(url=url, callback=self.parse, meta={'cookiejar': 1})

    def parse(self, response):
        print(response.request.headers.getlist('Cookie'))
        print(response.headers.getlist('Set-Cookie'))
        url = ('https://www.lagou.com/jobs/list_' + str(settings.keys)
               + '?city=' + str(settings.cidy)
               + '&cl=false&fromSearch=true&labelWords=&suginput=')
        print(response.meta['cookiejar'])
        yield scrapy.Request(url=url, callback=self.download,
                             meta={'cookiejar': response.meta['cookiejar'], 'id': 1},
                             dont_filter=True)
    def download(self, response):
        # print(response.text)
        print(response.request.headers.getlist('Cookie'))
        print(response.headers.getlist('Set-Cookie'))
        i = response.meta.get('id')
        file = 'false'
        if i == 1:
            file = 'true'  # the API expects first=true on the first page
        data = {
            'first': file,
            'pn': str(i),
            'kd': str(settings.keys),
        }
        headers_post = {
            'Accept': 'application/json, text/javascript, */*; q=0.01',
            'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
            'Content-Length': str(len(urllib.parse.urlencode(data))),
            'Connection': 'keep-alive',
            'Referer': str(response.url),
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:65.0) Gecko/20100101 Firefox/65.0',
        }
        print(headers_post)
        print(str(response.url))
        print(data)
        url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
        yield scrapy.FormRequest(url=url, formdata=data, headers=headers_post,
                                 callback=self.files, dont_filter=True,
                                 meta={'cookiejar': True,
                                       'dont_redirect': True,
                                       'handle_httpstatus_list': [301, 302]})
meta={'cookiejar': 1} starts cookie tracking for a session; passing 'cookiejar': response.meta['cookiejar'] on follow-up requests keeps using, and updating, that same cookie jar.
Note that COOKIES_ENABLED = True must be set in settings.py.
The cookies sent with a request are available as response.request.headers.getlist('Cookie'); the cookies set by the response are response.headers.getlist('Set-Cookie').
dont_filter=True disables Scrapy's duplicate-request filter for that request (it does not affect redirects).
Putting 'dont_redirect': True and 'handle_httpstatus_list': [301, 302] in meta disables redirect handling for that single request.
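Because the cookiejar value is just a label, one spider can keep several independent cookie sessions alive at once. A short sketch in the style of the Scrapy docs (the URL list is hypothetical):

def start_requests(self):
    urls = ['https://www.lagou.com/', 'https://www.lagou.com/jobs/']  # hypothetical
    for i, url in enumerate(urls):
        # each value of i is a separate, isolated cookie jar
        yield scrapy.Request(url, meta={'cookiejar': i}, callback=self.parse)

def parse(self, response):
    # pass the same label on to keep using (and updating) that session's cookies
    yield scrapy.Request('https://www.lagou.com/jobs/positionAjax.json',
                         meta={'cookiejar': response.meta['cookiejar']},
                         callback=self.download)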
Using logging in Scrapy
In settings.py:

import datetime, os

time = datetime.datetime.now().strftime('%Y_%m_%d_%H_%M_%S')
LOG_FILE = 'logs' + os.sep + str(time) + '_' + 'laogou.log'
LOG_LEVEL = 'DEBUG'
LOG_STDOUT = True
Scrapy provides five log levels:
1. CRITICAL -- critical errors
2. ERROR -- regular errors
3. WARNING -- warning messages
4. INFO -- informational messages (recommended in production)
5. DEBUG -- debug messages (recommended during development)
LOG_FILE: file name the log is written to; defaults to None.
LOG_LEVEL: minimum level to record; defaults to DEBUG.
LOG_STDOUT: if True, all standard output and errors of the process (e.g. print()) are redirected to the log; defaults to False.
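With these settings in place, messages logged from spider code are filtered by LOG_LEVEL and written to LOG_FILE. The simplest way to emit them is the built-in spider.logger:

import logging

import scrapy

class LaogouwangSpider(scrapy.Spider):
    name = 'laogouwang'

    def parse(self, response):
        # self.logger is a standard logging.Logger named after the spider
        self.logger.info('parsed %s', response.url)
        # any stdlib logger feeds the same handlers
        logging.getLogger(__name__).debug('low-level detail')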
Starting the spider from a script
#laogoustrart.py
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from laogou.spiders.laogouwang import LaogouwangSpider

process = CrawlerProcess(get_project_settings())
process.crawl(LaogouwangSpider)
process.start()
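process.crawl also accepts a spider's name instead of the class (resolved through the project settings), and extra keyword arguments become attributes on the spider instance; 'city' below is a hypothetical example argument:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
# reference the spider by its `name`; `city` is a hypothetical spider argument
process.crawl('laogouwang', city='beijing')
process.start()  # blocks until the crawl finishes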