爬蟲實戰scrapy

艾利金德發表於2018-03-11
# -*- coding: utf-8 -*-
import scrapy
import re


class JobboleSpider(scrapy.Spider):
    """Spider that reads article metadata from blog.jobbole.com responses.

    ``parse`` extracts the title, date, tags and the like / bookmark /
    comment counters from a response.  Nodes that are missing from the page
    fall back to an empty string or ``0`` instead of raising ``IndexError``.
    """

    name = 'jobbole'
    allowed_domains = ['blog.jobbole.com']
    start_urls = ['http://blog.jobbole.com/all-posts/']

    @staticmethod
    def _first_int(text, default=0):
        """Return the first run of digits in ``text`` as an int.

        Args:
            text: String to scan; ``None`` is treated like an empty string.
            default: Value returned when ``text`` contains no digits.

        Returns:
            The parsed integer, or ``default`` when no digits are present.
        """
        # re.search (not re.match) so digits are found anywhere in the text,
        # including after a newline; \d+ keeps multi-digit counts intact.
        match = re.search(r'\d+', text or '')
        return int(match.group()) if match else default

    def parse(self, response):
        """Extract the article fields from ``response``.

        Args:
            response: The Scrapy response handed to this callback.

        Returns:
            None.  The values are extracted into locals only.
        """
        # hrefs of the a.archive-title links.
        # Prefer class-based selectors over id-based ones: an id such as
        # //*[@id="post-113735"]/div[1] is only valid for the current page, so
        # a selector built on it may fail on other pages.
        # Equivalent XPath forms:
        #   //div[@class='post-meta']/p/a[@class='archive-title']/@href
        #   //a[@class='archive-title']/@href
        url_list = response.css(
            'div.post-meta p a.archive-title::attr(href)').extract()

        # Title; empty string when the header node is absent.
        title = response.xpath(
            "//div[@class='entry-header']/h1/text()"
        ).extract_first(default='')

        # Date: trim whitespace, then the '·' separator, then whitespace again.
        crat_time = response.xpath(
            '//p[@class="entry-meta-hide-on-mobile"]/text()'
        ).extract_first(default='')
        crat_time = crat_time.strip().strip('·').strip()

        # Keyword tags, comma separated (joining an empty list yields '').
        # CSS equivalent: response.css("p.entry-meta-hide-on-mobile a::text")
        biao_qian = ','.join(response.xpath(
            "//p[@class='entry-meta-hide-on-mobile']/a/text()").extract())

        # Like count.
        dian_zan = self._first_int(response.xpath(
            '//span[@class=" btn-bluet-bigger href-style vote-post-up   register-user-only "]/h10/text()'
        ).extract_first())

        # Bookmark count.
        shou_cang = self._first_int(response.xpath(
            '//span[@class=" btn-bluet-bigger href-style bookmark-btn  register-user-only "]/text()'
        ).extract_first())

        # Comment count.  The previous pattern r'.*(\d).*?' captured only the
        # last digit, so a multi-digit count was truncated.
        comment = self._first_int(response.xpath(
            '//span[@class="btn-bluet-bigger href-style hide-on-480"]/text()'
        ).extract_first())

        # The same data can be pulled with CSS selectors:
        #   article content: response.css("div.entry").extract()[0]
        #   title:           response.css(".entry-header h1::text").extract()[0]
        #   date:            response.css("p.entry-meta-hide-on-mobile::text")
        #                        .extract()[0].strip().strip('·').strip()

        # NOTE(review): url_list, title, crat_time, biao_qian, dian_zan,
        # shou_cang and comment are extracted but not yet emitted; yield an
        # item or follow-up requests here once that is decided.

複製程式碼

相關文章