使用scrapy框架把資料非同步寫入資料庫

yaqinweiliang發表於2018-07-16

1、在settings.py中設定MySQL連線資訊

# MySQL connection settings; the pipeline's from_settings() looks these
# keys up by name.
# NOTE(review): credentials are hard-coded here — avoid committing real ones.
HOST = "101.201.70.139"
MYSQL_DBNAME = "anttest"
MYSQL_USER = "root"
MYSQL_PASSWORD = "Myjr678!@#"

2、在pipeline中新增mysql非同步連線類
from twisted.enterprise import adbapi
import MySQLdb
from MySQLdb import cursors

class mysqlTwistedpipline(object):
    """Item pipeline that keeps a Twisted adbapi connection pool for MySQL."""

    def __init__(self, dbpool):
        # Connection pool handed in by from_settings().
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Create the pipeline from the project settings.

        Reads HOST, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWORD from
        ``settings`` and creates an ``adbapi.ConnectionPool`` over the
        ``MySQLdb`` driver with those connection arguments.
        """
        conn_kwargs = {
            'host': settings['HOST'],
            'db': settings['MYSQL_DBNAME'],
            'user': settings['MYSQL_USER'],
            'passwd': settings['MYSQL_PASSWORD'],
            'charset': 'utf8',
            'cursorclass': MySQLdb.cursors.DictCursor,
            'use_unicode': True,
        }
        pool = adbapi.ConnectionPool('MySQLdb', **conn_kwargs)
        return cls(pool)

3、重寫process_item方法
def process_item(self, item, spider):
    """Schedule the database insert for ``item`` and pass the item on.

    The insert runs through ``self.dbpool.runInteraction`` so it does not
    block the crawl; failures are routed to ``self.handle_error``.

    Returns:
        The same ``item``. Scrapy hands a pipeline's return value to the
        next pipeline, so returning nothing would pass ``None`` downstream.
    """
    query = self.dbpool.runInteraction(self.do_insert, item)
    query.addErrback(self.handle_error)
    return item
4、定義錯誤處理方法handle_error
def handle_error(self, failure):
    """Report a failed asynchronous insert by printing ``failure``."""
    # Function-call form works on both Python 2 and Python 3; the bare
    # ``print failure`` statement is a SyntaxError on Python 3.
    print(failure)
5、新增向資料庫新增資料的方法
def do_insert(self, cursor, item):
    """Insert ``item`` into the database using ``cursor``.

    This is a template: put the SQL INSERT statement for ``item`` here and
    run it with ``cursor.execute(...)``. No explicit ``conn.commit()`` is
    needed when this is run via ``dbpool.runInteraction``.
    """
    # Placeholder body — replace with the actual cursor.execute(...) call.
6、在settings.py的ITEM_PIPELINES中新增該pipeline的路徑

#完整檔案
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from twisted.enterprise import adbapi
import MySQLdb
from MySQLdb import cursors
class ZhongguosoudiPipeline(object):
    """Pass-through pipeline: hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` as-is; ``spider`` is not used."""
        return item


class mysqlTwistedpipline(object):
    """Scrapy item pipeline that writes items to MySQL asynchronously.

    Inserts are run through a Twisted ``adbapi.ConnectionPool`` so that
    database writes do not block the crawl.
    """

    def __init__(self, dbpool):
        # Connection pool created by from_settings().
        self.dbpool = dbpool

    @classmethod
    def from_settings(cls, settings):
        """Create the pipeline from the project settings.

        Reads HOST, MYSQL_DBNAME, MYSQL_USER and MYSQL_PASSWORD from
        ``settings`` and creates an ``adbapi.ConnectionPool`` over the
        ``MySQLdb`` driver.
        """
        dbparms = dict(
            host=settings['HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        dbpool = adbapi.ConnectionPool('MySQLdb', **dbparms)
        return cls(dbpool)

    def process_item(self, item, spider):
        """Schedule the insert for ``item`` and pass the item on.

        Returns:
            The same ``item``. Scrapy hands a pipeline's return value to
            the next pipeline, so returning nothing would pass ``None``
            downstream.
        """
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error)
        return item

    def handle_error(self, failure):
        """Report a failed asynchronous insert by printing ``failure``."""
        # Function-call form works on both Python 2 and Python 3.
        print(failure)

    def do_insert(self, cursor, item):
        """Insert one item into the ``land_deals`` table.

        Reads ``title``, ``publish_time``, ``url`` and ``con`` from
        ``item``. province_id, province_name and transact_status are
        written as the fixed values 0, "" and 1; create_time is set by
        MySQL's ``now()``.
        """
        # Values are passed as query parameters so the driver escapes them;
        # building the statement with string formatting breaks on quotes in
        # the data and allows SQL injection. Because of that, the content
        # no longer needs its double quotes replaced and is stored as-is.
        sql_in = (
            'insert into land_deals('
            'title,publish_date,province_id,province_name,'
            'articles_link,content,create_time,transact_status)'
            'VALUES (%s,%s,%s,%s,%s,%s,now(),%s)'
        )
        params = (
            item['title'],
            item['publish_time'],
            0,
            "",
            item['url'],
            item['con'],
            1,
        )
        cursor.execute(sql_in, params)

相關文章