作用:基於全站資料的爬取(CrawlSpider 會依照規則自動提取並跟進頁面中的連結)。
首先建立專案
- scrapy startproject choutiPro
- cd choutiPro
- scrapy genspider -t crawl chouti www.xxx.com
【需求】:
爬取抽屜網段子類中所有的分頁URL
程式碼部分:
# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChoutiSpider(CrawlSpider):
    """Crawl the pagination URLs of the chouti "scoff" hot list.

    Starts from page 1 of ``/r/scoff/hot/`` and, through ``rules``, requests
    every link whose URL matches the pagination pattern. Each fetched page
    is handed to :meth:`parse_item`.
    """

    name = 'chouti'
    # Left disabled: the genspider placeholder domain does not match the
    # start URL below.
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://dig.chouti.com/r/scoff/hot/1']

    # Link extractor: ``allow`` is the regular expression a link must match
    # in order to be extracted.
    link = LinkExtractor(allow=r'/r/scoff/hot/\d+')

    rules = (
        # Rule: the page behind every extracted link is parsed by the named
        # callback.
        # follow=True keeps applying the link extractor to the pages reached
        # through the links it already extracted. With follow=False only the
        # pagination links present on the current page are fetched (ten
        # entries, per the original author's note).
        Rule(link, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Print the response object of a page matched by ``rules``.

        Args:
            response: The response Scrapy passes to the rule callback.
        """
        print(response)
【需求】:
爬取糗事百科圖片(/pic/)版塊中所有的分頁URL
程式碼部分:
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule


class ChoutiSpider(CrawlSpider):
    """Crawl the pagination URLs under qiushibaike's ``/pic/`` section.

    Two link extractors are used: one for the numbered pagination links and
    one for links ending in ``/pic/``. Every fetched page is handed to
    :meth:`parse_item`.

    NOTE(review): the class is still called ``ChoutiSpider`` although the
    spider's ``name`` is ``'qiubai'``; the class name is kept unchanged so
    existing references keep working.
    """

    name = 'qiubai'
    # Left disabled: the genspider placeholder domain does not match the
    # start URL below.
    # allowed_domains = ['www.xxx.com']
    start_urls = ['https://www.qiushibaike.com/pic/']

    # Link extractors: ``allow`` is the regular expression a link must match
    # in order to be extracted.
    # Numbered pagination links, e.g. /pic/page/3?s=5172496
    link = LinkExtractor(allow=r'/pic/page/\d+\?s=\d+')
    # Links whose URL ends in /pic/
    link1 = LinkExtractor(allow=r'/pic/$')

    rules = (
        # Rule: the page behind every extracted link is parsed by the named
        # callback.
        # follow=True keeps applying the link extractor to the pages reached
        # through the links it already extracted.
        Rule(link, callback='parse_item', follow=True),
        Rule(link1, callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Print the response object of a page matched by ``rules``.

        Args:
            response: The response Scrapy passes to the rule callback.
        """
        print(response)