- Setup
  cd xx
  scrapy startproject <project_name> [dir]
  cd <project directory>
  scrapy genspider <spider_name> <domain>
  scrapy crawl <spider_name>
- spiders
  Defines the detailed crawling rules (see the sketch below)
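A minimal spider sketch; the project, spider name, domain, and selectors are illustrative assumptions, not taken from the notes above:

import scrapy

class ExampleSpider(scrapy.Spider):
    name = "example"                       # used by `scrapy crawl example`
    allowed_domains = ["example.com"]      # stay within this domain
    start_urls = ["https://example.com/"]  # first pages to request

    def parse(self, response):
        # extract fields from each matched node; selectors are placeholders
        for row in response.css("div.book"):
            yield {
                "name": row.css("a::text").get(),
                "price": row.css("span.price::text").get(),
            }
        # follow pagination, if present
        next_page = response.css("a.next::attr(href)").get()
        if next_page:
            yield response.follow(next_page, callback=self.parse)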
- items
  The data structure for scraped items (see the sketch below)
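A sketch of an items.py definition; the field names mirror those used by the MySQL pipeline below, while the class name is an assumption:

import scrapy

class BookschinaItem(scrapy.Item):
    # one Field per attribute the spider extracts
    name = scrapy.Field()
    price = scrapy.Field()
    author = scrapy.Field()
    out_date = scrapy.Field()
    publisher = scrapy.Field()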
- middlewares
  Middleware hooks between the engine and the downloader/spiders (see the sketch below)
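A minimal downloader-middleware sketch that attaches a fixed User-Agent header; the class name and header value are assumptions, and it only takes effect once registered under DOWNLOADER_MIDDLEWARES in settings.py:

class FixedUserAgentMiddleware:
    def process_request(self, request, spider):
        # set a header on every outgoing request
        request.headers["User-Agent"] = "Mozilla/5.0 (example UA)"
        return None  # let the request continue through the normal download path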
- pipelines
  Data pipelines, responsible for persistent storage and cleaning
  Example: storing items in MongoDB
import pymongo

class MongoDBPipeline(object):
    def __init__(self):
        # open the connection
        self.client = pymongo.MongoClient(host='localhost', port=27017)
        # select the database
        self.db = self.client["test"]
        # select the collection
        self.col = self.db["j"]

    def process_item(self, item, spider):
        # insert one document per item
        self.col.insert_one(dict(item))
        return item

    def close_spider(self, spider):
        self.client.close()
Example: storing items in MySQL
import pymysql

class BookschinaPipeline:
    def __init__(self):
        # database: the target database name
        self.conn = pymysql.connect(
            host='localhost',
            port=3306,
            user='root',
            password='',
            database='spiders',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
        self.cursor = self.conn.cursor()

    def process_item(self, item, spider):
        sql = """insert into bookschina_goods (name, price, author, out_date, publisher)
                 values (%s, %s, %s, %s, %s)"""
        self.cursor.execute(sql, (
            item.get('name', ''),
            item.get('price', ''),
            item.get('author', ''),
            item.get('out_date', ''),
            item.get('publisher', '')
        ))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.cursor.close()
        self.conn.close()
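For either pipeline to run, it must be enabled in settings.py; a sketch assuming the project package is named bookschina (the module path is an assumption):

# settings.py fragment -- lower numbers run earlier in the pipeline chain
ITEM_PIPELINES = {
    "bookschina.pipelines.BookschinaPipeline": 300,
    # "bookschina.pipelines.MongoDBPipeline": 400,
}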
- settings
  Configuration file (settings.py); see the sketch below
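A few commonly adjusted options, shown as a sketch with illustrative values:

# settings.py fragment
ROBOTSTXT_OBEY = False      # whether to respect robots.txt
DOWNLOAD_DELAY = 1          # seconds to wait between requests
CONCURRENT_REQUESTS = 16    # parallel requests across the crawler
DEFAULT_REQUEST_HEADERS = {
    "User-Agent": "Mozilla/5.0 (example UA)",
}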
- shell commands
  scrapy shell <url>
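Inside the shell the fetched page is exposed as response; a typical session might look like this (URL and selectors are placeholders):

scrapy shell "https://example.com"
>>> response.status                        # HTTP status of the fetched page
>>> response.css("title::text").get()      # extract with a CSS selector
>>> response.xpath("//a/@href").getall()   # extract with XPath
>>> fetch("https://example.com/page/2")    # load another URL in the same session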