A Complete Scrapy Project

Posted by 18923489164 on 2020-05-02

# Note: always check settings.py to confirm the pipeline is enabled; otherwise it will never run.
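For reference, a minimal sketch of the settings.py entry that enables it (the module path matches the douban project and the DoubanPipeline class used below; 300 is just a conventional priority value):

ITEM_PIPELINES = {
    'douban.pipelines.DoubanPipeline': 300,  # lower number = runs earlier
}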

# -*- coding: utf-8 -*-
import scrapy
from bs4 import BeautifulSoup as bs
import re
from douban.items import DoubanItem  # import the item fields defined in items.py

# run with: scrapy crawl dou

class DouSpider(scrapy.Spider):

    name = 'dou'  # spider name
    
    start_urls = ['https://movie.douban.com/subject/30314127/reviews']  # URL(s) to start crawling from

    def parse(self, response):
         
        html = bs(response.text, 'lxml')

        cont = html.find_all('div', class_='main review-item')  # one block per review

        for i in cont:

            item = DoubanItem()  # the item stores the scraped fields and behaves like a dict
                
            name = i.header.text
            name = re.sub('\n', '', name)  # strip newlines from the reviewer header

            con = i.div.text
            con = re.sub('\n', '', con)  # strip newlines and spaces from the review body
            con = re.sub(' ', '', con)


            item['name'] = name
            item['con'] = con


            # self.log(name)  # uncomment to inspect the scraped values
            # self.log(con)
            
            yield item  # yield hands the item back immediately without ending the function
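A tiny standalone sketch (plain Python, not Scrapy-specific) of the yield behaviour noted above: a generator hands each value back as soon as it is produced and then resumes, and Scrapy consumes parse() in exactly this way.

def gen():
    for n in range(3):
        yield n  # emit n, then continue the loop on the next iteration

print(list(gen()))  # [0, 1, 2]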



import scrapy   # items.py: this defines the item fields
class DoubanItem(scrapy.Item):

    name = scrapy.Field()

    con = scrapy.Field()
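A quick usage sketch of the item (this is standard scrapy.Item behaviour: it works like a dict restricted to the declared fields):

item = DoubanItem()
item['name'] = 'some title'   # OK: 'name' is a declared field
item['con'] = 'some review'   # OK: 'con' is a declared field
# item['author'] = '...'      # would raise KeyError: 'author' is not declared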

    
# pipelines.py: the pipeline persists the item fields passed in from the spider

class DoubanPipeline(object):

    def process_item(self, item, spider):
        
        with open("douban.txt", "a", encoding='utf-8') as f:

            f.write(item['name'])
            f.write('\n')
            f.write(item['con'])
            f.write('\n\n\n')  # blank lines separate reviews in the output file

        
        return item
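Reopening the file for every item works, but a common refinement is to open it once per crawl using Scrapy's standard open_spider/close_spider pipeline hooks. A sketch with the same class name and output format as above:

class DoubanPipeline(object):

    def open_spider(self, spider):
        # called once when the spider starts
        self.f = open("douban.txt", "a", encoding='utf-8')

    def close_spider(self, spider):
        # called once when the spider finishes
        self.f.close()

    def process_item(self, item, spider):
        self.f.write(item['name'] + '\n' + item['con'] + '\n\n\n')
        return item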

