Python Study Notes: Scrapy Crawler Projects in Practice

Published by 唯戀殊雨 on 2018-09-03

Contents

Mobile app packet-capture crawler

Sunshine Hotline inquiry platform

(Hands-on project 3) Sina classified-news crawler

Cosplay image download crawler

Storing data with PyMongo

Three Scrapy login-simulation strategies


Mobile app packet-capture crawler

1. items.py

class DouyuspiderItem(scrapy.Item):
    name = scrapy.Field()        # name under which the photo is saved
    imagesUrls = scrapy.Field()  # URL of the photo
    imagesPath = scrapy.Field()  # local path where the photo is stored

2. spiders/douyu.py

import scrapy
import json
from douyuSpider.items import DouyuspiderItem

class DouyuSpider(scrapy.Spider):
    name = "douyu"
    allowed_domains = ["capi.douyucdn.cn"]

    offset = 0
    url = "http://capi.douyucdn.cn/api/v1/getVerticalRoom?limit=20&offset="
    start_urls = [url + str(offset)]

    def parse(self, response):
        # parse the JSON response and take the "data" list
        data = json.loads(response.text)["data"]

        for each in data:
            item = DouyuspiderItem()
            item["name"] = each["nickname"]
            item["imagesUrls"] = each["vertical_src"]

            yield item

        # request the next page of the API
        self.offset += 20
        yield scrapy.Request(self.url + str(self.offset), callback = self.parse)

3. Configure settings.py


ITEM_PIPELINES = {'douyuSpider.pipelines.ImagesPipeline': 1}

# where the images are stored; referenced later in pipelines.py
IMAGES_STORE = "/Users/Power/lesson_python/douyuSpider/Images"

# user-agent
USER_AGENT = 'DYZB/2.290 (iPhone; iOS 9.3.4; Scale/2.00)'

4. pipelines.py

import scrapy
import os
from scrapy.pipelines.images import ImagesPipeline
from scrapy.utils.project import get_project_settings

class ImagesPipeline(ImagesPipeline):
    IMAGES_STORE = get_project_settings().get("IMAGES_STORE")

    def get_media_requests(self, item, info):
        image_url = item["imagesUrls"]
        yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # standard pattern: collect the storage paths of the images whose download
        # succeeded (see the ImagesPipeline source for details)
        image_path = [x["path"] for ok, x in results if ok]

        # rename the downloaded file after the item's name and record its final path
        os.rename(self.IMAGES_STORE + "/" + image_path[0], self.IMAGES_STORE + "/" + item["name"] + ".jpg")
        item["imagesPath"] = self.IMAGES_STORE + "/" + item["name"] + ".jpg"

        return item

# get_media_requests generates a Request for every image URL. The output of this method becomes the results argument passed to item_completed. results is a list of tuples, each of the form (success, image_info_or_failure). If success is True, image_info_or_failure is a dict with the keys url, path and checksum.
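For reference, here is a minimal sketch (not from the original project) of what results might look like for a single successfully downloaded image; the url, path and checksum values are made up for illustration:

# Hypothetical `results` value passed to item_completed, assuming one image
# was downloaded successfully; the url/path/checksum values are illustrative only.
results = [
    (True, {
        "url": "http://capi.douyucdn.cn/some/vertical_src.jpg",  # original image URL
        "path": "full/0a1b2c3d4e5f.jpg",                         # path relative to IMAGES_STORE
        "checksum": "5e1f0dab...",                               # checksum of the downloaded file
    }),
]

# The list comprehension in item_completed then reduces this to ["full/0a1b2c3d4e5f.jpg"]:
image_path = [x["path"] for ok, x in results if ok]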

Create a main.py file in the project root directory for debugging:

from scrapy import cmdline
cmdline.execute('scrapy crawl douyu'.split())

Run the program:

py2 main.py

 

Sunshine Hotline inquiry platform

http://wz.sun0769.com/index.php/question/questionType?type=4

Scrape each complaint post's number, URL, title, and body content.

items.py

import scrapy

class DongguanItem(scrapy.Item):
    # title of each post
    title = scrapy.Field()
    # number of each post
    number = scrapy.Field()
    # text content of each post
    content = scrapy.Field()
    # URL of each post
    url = scrapy.Field()

spiders/sunwz.py

Spider version

# -*- coding: utf-8 -*-

import scrapy
from dongguan.items import DongguanItem

class SunSpider(scrapy.Spider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    url = 'http://wz.sun0769.com/index.php/question/questionType?type=4&page='
    offset = 0
    start_urls = [url + str(offset)]

    def parse(self, response):
        # extract the list of post links on this page
        links = response.xpath("//div[@class='greyframe']/table//td/a[@class='news14']/@href").extract()
        # send a request for each post and handle it with parse_item
        for link in links:
            yield scrapy.Request(link, callback = self.parse_item)
        # pagination stop condition; request the next page and handle it with parse again
        if self.offset <= 71130:
            self.offset += 30
            yield scrapy.Request(self.url + str(self.offset), callback = self.parse)

    # process each post page
    def parse_item(self, response):
        item = DongguanItem()
        # title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]

        # number: the last field of the title, after the full-width colon
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # text content: first try the page layout used when the post contains images
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # if that is empty, fall back to the layout without images
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        # content is a list; join it into one string and strip leading/trailing whitespace
        item['content'] = "".join(content).strip()

        # URL
        item['url'] = response.url

        yield item

CrawlSpider version


# -*- coding: utf-8 -*-
import scrapy
from scrapy.linkextractors import LinkExtractor
from scrapy.spiders import CrawlSpider, Rule
from dongguan.items import DongguanItem
import time


class SunSpider(CrawlSpider):
    name = 'sun'
    allowed_domains = ['wz.sun0769.com']
    start_urls = ['http://wz.sun0769.com/index.php/question/questionType?type=4&page=']

    # rule for matching the paginated list pages
    pagelink = LinkExtractor(allow=('type=4'))
    # rule for matching individual posts
    contentlink = LinkExtractor(allow=r'/html/question/\d+/\d+.shtml')

    rules = [
        # special case: the links extracted from each list page must be fixed up by deal_links first
        Rule(pagelink, process_links = "deal_links", follow = True),
        Rule(contentlink, callback = 'parse_item')
    ]

    # fix up the links on each list page: replace 'Type&type=4?page=xxx' with 'Type?type=4&page=xxx'
    # (or 'Type&page=xxx?type=4' with 'Type?page=xxx&type=4'), otherwise the link cannot be requested
    def deal_links(self, links):
        for link in links:
            link.url = link.url.replace("?","&").replace("Type&", "Type?")
            print link.url
        return links


    def parse_item(self, response):
        print response.url
        item = DongguanItem()
        # title
        item['title'] = response.xpath('//div[contains(@class, "pagecenter p3")]//strong/text()').extract()[0]

        # number: the last field of the title, after the full-width colon
        item['number'] = item['title'].split(' ')[-1].split(":")[-1]

        # text content: first try the page layout used when the post contains images
        content = response.xpath('//div[@class="contentext"]/text()').extract()
        # if that is empty, fall back to the layout without images
        if len(content) == 0:
            content = response.xpath('//div[@class="c1 text14_2"]/text()').extract()
        # content is a list; join it into one string and strip leading/trailing whitespace
        item['content'] = "".join(content).strip()

        # URL
        item['url'] = response.url

        yield item

pipelines.py

# -*- coding: utf-8 -*-

# file-handling library that lets you specify the encoding
import codecs
import json

class JsonWriterPipeline(object):

    def __init__(self):
        # open a write-only file with utf-8 text encoding
        self.filename = codecs.open('sunwz.json', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        content = json.dumps(dict(item), ensure_ascii=False) + "\n"
        self.filename.write(content)
        return item

    # called when the spider closes; close the output file
    def close_spider(self, spider):
        self.filename.close()

settings.py

ITEM_PIPELINES = {
    'dongguan.pipelines.JsonWriterPipeline': 300,
}

# log file name and log level
LOG_FILE = "dg.log"
LOG_LEVEL = "DEBUG"

Create a main.py file in the project root directory for debugging:

from scrapy import cmdline
cmdline.execute('scrapy crawl sun'.split())

Run the program:

py2 main.py

 

(Hands-on project 3) Sina classified-news crawler

Crawl every major category and subcategory on the Sina news navigation page, every sub-link under each subcategory, and the news content of the sub-link pages.

Demo screenshot: (image omitted)

items.py

import scrapy
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaItem(scrapy.Item):
    # title and URL of the major category
    parentTitle = scrapy.Field()
    parentUrls = scrapy.Field()

    # title and URL of the subcategory
    subTitle = scrapy.Field()
    subUrls = scrapy.Field()

    # local directory where the subcategory is stored
    subFilename = scrapy.Field()

    # sub-links under the subcategory
    sonUrls = scrapy.Field()

    # article title and content
    head = scrapy.Field()
    content = scrapy.Field()

spiders/sina.py

# -*- coding: utf-8 -*-

from Sina.items import SinaItem
import scrapy
import os

import sys
reload(sys)
sys.setdefaultencoding("utf-8")


class SinaSpider(scrapy.Spider):
    name= "sina"
    allowed_domains= ["sina.com.cn"]
    start_urls= [
       "http://news.sina.com.cn/guide/"
    ]

    def parse(self, response):
        items = []
        # URLs and titles of all major categories
        parentUrls = response.xpath('//div[@id="tab01"]/div/h3/a/@href').extract()
        parentTitle = response.xpath('//div[@id="tab01"]/div/h3/a/text()').extract()

        # URLs and titles of all subcategories
        subUrls  = response.xpath('//div[@id="tab01"]/div/ul/li/a/@href').extract()
        subTitle = response.xpath('//div[@id="tab01"]/div/ul/li/a/text()').extract()

        # loop over all major categories
        for i in range(0, len(parentTitle)):
            # path and name of the major category's directory
            parentFilename = "./Data/" + parentTitle[i]

            # create the directory if it does not exist
            if(not os.path.exists(parentFilename)):
                os.makedirs(parentFilename)

            # loop over all subcategories
            for j in range(0, len(subUrls)):
                item = SinaItem()

                # store the major category's title and URL
                item['parentTitle'] = parentTitle[i]
                item['parentUrls'] = parentUrls[i]

                # check whether the subcategory URL starts with its major category's URL
                # (e.g. sports.sina.com.cn and sports.sina.com.cn/nba)
                if_belong = subUrls[j].startswith(item['parentUrls'])

                # if it belongs to this major category, put its storage directory
                # under the major category's directory
                if(if_belong):
                    subFilename = parentFilename + '/' + subTitle[j]
                    # create the directory if it does not exist
                    if(not os.path.exists(subFilename)):
                        os.makedirs(subFilename)

                    # store the subcategory's URL, title and directory name
                    item['subUrls'] = subUrls[j]
                    item['subTitle'] = subTitle[j]
                    item['subFilename'] = subFilename

                    items.append(item)

        # request every subcategory URL; the response, together with the meta data,
        # is handed to the second_parse callback
        for item in items:
            yield scrapy.Request( url = item['subUrls'], meta={'meta_1': item}, callback=self.second_parse)

    # recurse into each subcategory page
    def second_parse(self, response):
        # retrieve the meta data attached to this response
        meta_1 = response.meta['meta_1']

        # extract all sub-links on the subcategory page
        sonUrls = response.xpath('//a/@href').extract()

        items = []
        for i in range(0, len(sonUrls)):
            # check whether each link starts with the major category's URL and ends with .shtml
            if_belong = sonUrls[i].endswith('.shtml') and sonUrls[i].startswith(meta_1['parentUrls'])

            # if it belongs to this major category, copy the fields into a new item for passing along
            if(if_belong):
                item = SinaItem()
                item['parentTitle'] = meta_1['parentTitle']
                item['parentUrls'] = meta_1['parentUrls']
                item['subUrls'] = meta_1['subUrls']
                item['subTitle'] = meta_1['subTitle']
                item['subFilename'] = meta_1['subFilename']
                item['sonUrls'] = sonUrls[i]
                items.append(item)

        # request every sub-link URL; the response, together with the meta data,
        # is handed to the detail_parse callback
        for item in items:
            yield scrapy.Request(url=item['sonUrls'], meta={'meta_2': item}, callback = self.detail_parse)

    # parse the article page: extract the headline and body text
    def detail_parse(self, response):
        item = response.meta['meta_2']
        content = ""
        head = response.xpath('//h1[@id="main_title"]/text()').extract()
        content_list = response.xpath('//div[@id="artibody"]/p/text()').extract()

        # concatenate the text of all <p> tags
        for content_one in content_list:
            content += content_one

        item['head'] = head[0] if len(head) > 0 else ""
        item['content'] = content

        yield item

pipelines.py

from scrapy import signals
import sys
reload(sys)
sys.setdefaultencoding("utf-8")

class SinaPipeline(object):
    def process_item(self, item, spider):
        sonUrls = item['sonUrls']

        # file name is the middle part of the sub-link URL, with / replaced by _, saved as .txt
        filename = sonUrls[7:-6].replace('/','_')
        filename += ".txt"

        fp = open(item['subFilename']+'/'+filename, 'w')
        fp.write(item['content'])
        fp.close()

        return item

settings.py

BOT_NAME = 'Sina'

SPIDER_MODULES = ['Sina.spiders']
NEWSPIDER_MODULE = 'Sina.spiders'

ITEM_PIPELINES = {
    'Sina.pipelines.SinaPipeline': 300,
}

LOG_LEVEL = 'DEBUG'

Create a main.py file in the project root directory for debugging:

from scrapy import cmdline
cmdline.execute('scrapy crawl sina'.split())

Run the program:

py2 main.py

Cosplay image download crawler

items.py

class CoserItem(scrapy.Item):
    url = scrapy.Field()
    name = scrapy.Field()
    info = scrapy.Field()
    image_urls = scrapy.Field()
    images = scrapy.Field()

spiders/coser.py

# -*- coding: utf-8 -*-
from scrapy.selector import Selector
import scrapy
from scrapy.loader import ItemLoader
from Cosplay.items import CoserItem


class CoserSpider(scrapy.Spider):
    name = "coser"
    allowed_domains = ["bcy.net"]
    start_urls = (
        'http://bcy.net/cn125101',
        'http://bcy.net/cn126487',
        'http://bcy.net/cn126173'
    )

    def parse(self, response):
        sel = Selector(response)

        for link in sel.xpath("//ul[@class='js-articles l-works']/li[@class='l-work--big']/article[@class='work work--second-created']/h2[@class='work__title']/a/@href").extract():
            link = 'http://bcy.net%s' % link
            request = scrapy.Request(link, callback=self.parse_item)
            yield request

    def parse_item(self, response):
        item = ItemLoader(item=CoserItem(), response=response)
        item.add_xpath('name', "//h1[@class='js-post-title']/text()")
        item.add_xpath('info', "//div[@class='post__info']/div[@class='post__type post__info-group']/span/text()")
        urls = item.get_xpath('//img[@class="detail_std detail_clickable"]/@src')
        urls = [url.replace('/w650', '') for url in urls]
        item.add_value('image_urls', urls)
        item.add_value('url', response.url)

        return item.load_item()

pipelines.py

import requests
from Cosplay import settings
import os


class ImageDownloadPipeline(object):
    def process_item(self, item, spider):
        if 'image_urls' in item:
            images = []
            dir_path = '%s/%s' % (settings.IMAGES_STORE, spider.name)

            if not os.path.exists(dir_path):
                os.makedirs(dir_path)
            for image_url in item['image_urls']:
                us = image_url.split('/')[3:]
                image_file_name = '_'.join(us)
                file_path = '%s/%s' % (dir_path, image_file_name)
                images.append(file_path)
                if os.path.exists(file_path):
                    continue

                with open(file_path, 'wb') as handle:
                    response = requests.get(image_url, stream=True)
                    for block in response.iter_content(1024):
                        if not block:
                            break

                        handle.write(block)

            item['images'] = images
        return item

settings.py


ITEM_PIPELINES = {'Cosplay.pipelines.ImageDownloadPipeline': 1}

IMAGES_STORE = '../Images'

DOWNLOAD_DELAY = 0.25    # 250 ms of delay

Create a main.py file in the project root directory for debugging:

from scrapy import cmdline
cmdline.execute('scrapy crawl coser'.split())

Run the program:

py2 main.py

Storing data with PyMongo

Scrape movie data from the Douban Movie Top 250 (movie.douban.com/top250) and store it in MongoDB.

items.py

class DoubanspiderItem(scrapy.Item):
    # movie title
    title = scrapy.Field()
    # movie rating
    score = scrapy.Field()
    # movie information
    content = scrapy.Field()
    # short description / quote
    info = scrapy.Field()

spiders/douban.py

import scrapy
from doubanSpider.items import DoubanspiderItem


class DoubanSpider(scrapy.Spider):
    name = "douban"
    allowed_domains = ["movie.douban.com"]
    start = 0
    url = 'https://movie.douban.com/top250?start='
    end = '&filter='
    start_urls = [url + str(start) + end]

    def parse(self, response):

        movies = response.xpath("//div[@class='info']")

        for each in movies:
            # create a new item for every movie
            item = DoubanspiderItem()

            title = each.xpath('div[@class="hd"]/a/span[@class="title"]/text()').extract()
            content = each.xpath('div[@class="bd"]/p/text()').extract()
            score = each.xpath('div[@class="bd"]/div[@class="star"]/span[@class="rating_num"]/text()').extract()
            info = each.xpath('div[@class="bd"]/p[@class="quote"]/span/text()').extract()

            item['title'] = title[0]
            # join all elements of the content list into one string, separated by ;
            item['content'] = ';'.join(content)
            item['score'] = score[0]
            item['info'] = info[0]

            # hand the item over to the pipeline
            yield item

        # request the next page until start exceeds 225
        if self.start <= 225:
            self.start += 25
            yield scrapy.Request(self.url + str(self.start) + self.end, callback=self.parse)

pipelines.py


from scrapy.conf import settings
import pymongo

class DoubanspiderPipeline(object):
    def __init__(self):
        # read the host, port and database name from settings
        host = settings['MONGODB_HOST']
        port = settings['MONGODB_PORT']
        dbname = settings['MONGODB_DBNAME']

        # pymongo.MongoClient(host, port) creates the MongoDB connection
        client = pymongo.MongoClient(host=host, port=port)

        # select the database
        mdb = client[dbname]
        # get the collection where the scraped data will be stored
        self.post = mdb[settings['MONGODB_DOCNAME']]


    def process_item(self, item, spider):
        data = dict(item)
        # insert the data into the collection
        self.post.insert(data)
        return item

settings.py

BOT_NAME = 'doubanSpider'

SPIDER_MODULES = ['doubanSpider.spiders']
NEWSPIDER_MODULE = 'doubanSpider.spiders'

ITEM_PIPELINES = {
        'doubanSpider.pipelines.DoubanspiderPipeline' : 300
        }

# Crawl responsibly by identifying yourself (and your website) on the user-agent
USER_AGENT = 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_3) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/48.0.2564.116 Safari/537.36'

# MongoDB host (loopback address 127.0.0.1)
MONGODB_HOST = '127.0.0.1'
# port, default 27017
MONGODB_PORT = 27017
# database name
MONGODB_DBNAME = 'DouBan'
# collection name for this crawl's data
MONGODB_DOCNAME = 'DouBanMovies'

Running

Starting MongoDB involves two commands:

mongod: the MongoDB database server itself
mongo: the command-line shell client


sudo mongod # start the database service first, then run Scrapy
sudo mongo # start the database shell

In the mongo shell, use these commands:

# show the current database
> db

# list all databases
> show dbs

# switch to the DouBan database
> use DouBan

# list all collections
> show collections

# view the documents in the collection
> db.DouBanMovies.find()
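
To double-check the stored data from Python rather than the mongo shell, a minimal PyMongo sketch along these lines should work (it is not part of the original project, and assumes MongoDB is running locally with the database and collection names configured above):

# Verification sketch, assuming the DouBan database and DouBanMovies
# collection were populated by the pipeline above.
import pymongo

client = pymongo.MongoClient(host='127.0.0.1', port=27017)
collection = client['DouBan']['DouBanMovies']

# print the number of stored movies and the first few titles with their scores
print collection.count()
for movie in collection.find().limit(5):
    print movie['title'], movie['score']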

Three Scrapy login-simulation strategies

Note: when simulating a login, make sure the cookies middleware (COOKIES_ENABLED) is enabled in settings.py:

COOKIES_ENABLED = True  # or leave the default "# COOKIES_ENABLED = False" commented out

Strategy 1: POST the data directly (e.g. the account credentials required to log in)

Any site that only requires POST data can be handled this way. In the example below, the posted data is the account name and password:

# -*- coding: utf-8 -*-
import scrapy


class Renren1Spider(scrapy.Spider):
    name = "renren1"
    allowed_domains = ["renren.com"]

    def start_requests(self):
        url = 'http://www.renren.com/PLogin.do'
        # FormRequest is Scrapy's way of sending a POST request
        yield scrapy.FormRequest(
                url = url,
                formdata = {"email" : "mr_mao_hacker@163.com", "password" : "axxxxxxxe"},
                callback = self.parse_page)

    def parse_page(self, response):
        with open("mao2.html", "w") as filename:
            filename.write(response.body)

Strategy 2: the standard login-simulation steps

The orthodox way to simulate a login:

  1. First send a GET request for the login page and extract the parameters the login requires (for example the _xsrf token on Zhihu's login page).

  2. Then POST those parameters to the server together with the account and password to complete the login.

# -*- coding: utf-8 -*-
import scrapy



class Renren2Spider(scrapy.Spider):
    name = "renren2"
    allowed_domains = ["renren.com"]
    start_urls = (
        "http://www.renren.com/PLogin.do",
    )

    # handle the response of the login URL in start_urls and extract the
    # parameters needed for logging in (if any)
    def parse(self, response):
        # extract the parameters needed for logging in
        #_xsrf = response.xpath("//_xsrf").extract()[0]

        # send the form data and handle the response with the specified callback
        yield scrapy.FormRequest.from_response(
                response,
                formdata = {"email" : "mr_mao_hacker@163.com", "password" : "axxxxxxxe"},#, "_xsrf" : _xsrf},
                callback = self.parse_page
            )

    # after a successful login, visit a page that can only be accessed while logged in
    def parse_page(self, response):
        url = "http://www.renren.com/422167102/profile"
        yield scrapy.Request(url, callback = self.parse_newpage)

    # handle the response content
    def parse_newpage(self, response):
        with open("xiao.html", "w") as filename:
            filename.write(response.body)

Strategy 3: simulate a login directly with cookies that carry a saved login session

If nothing else works, you can simulate the login this way. It is a bit more tedious, but the success rate is essentially 100%.

# -*- coding: utf-8 -*-
import scrapy

class RenrenSpider(scrapy.Spider):
    name = "renren"
    allowed_domains = ["renren.com"]
    start_urls = (
        'http://www.renren.com/111111',
        'http://www.renren.com/222222',
        'http://www.renren.com/333333',
    )

    cookies = {
    "anonymid" : "ixrna3fysufnwv",
    "_r01_" : "1",
    "ap" : "327550029",
    "JSESSIONID" : "abciwg61A_RvtaRS3GjOv",
    "depovince" : "GW",
    "springskin" : "set",
    "jebe_key" : "f6fb270b-d06d-42e6-8b53-e67c3156aa7e%7Cc13c37f53bca9e1e7132d4b58ce00fa3%7C1484060607478%7C1%7C1486198628950",
    "t" : "691808127750a83d33704a565d8340ae9",
    "societyguester" : "691808127750a83d33704a565d8340ae9",
    "id" : "327550029",
    "xnsid" : "f42b25cf",
    "loginfrom" : "syshome"
    }

    # override the Spider's start_requests method to attach the cookie values to the requests it sends
    def start_requests(self):
        for url in self.start_urls:
            yield scrapy.FormRequest(url, cookies = self.cookies, callback = self.parse_page)

    # handle the response content
    def parse_page(self, response):
        print "===========" + response.url
        with open("deng.html", "w") as filename:
            filename.write(response.body)

 
