Crawler practice: scraping Dangdang's programming-book listings with Scrapy and saving them to MySQL

Posted by weixin_33860722 on 2016-12-28

Scraping target

Programming-book listings on Dangdang, at:
http://category.dangdang.com/cp01.54.06.00.00.00.html

Development environment

Python 3.5 / MySQL 5.6 / Scrapy 1.3
Python runs on Windows; MySQL runs on CentOS 6.7.
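Both scrapy and pymysql have to be installed on the Windows side; assuming pip is available, pip install scrapy pymysql covers both.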

Source code

items.py

# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class Dangdang01Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    title = scrapy.Field()    # book title
    link = scrapy.Field()     # detail-page link
    comment = scrapy.Field()  # review count
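Scrapy items support dict-style assignment, which is how the spider below fills in each field (dd["title"] = ...).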

dangdang.py (the spider)

# -*- coding: utf-8 -*-
import scrapy
from dangdang01.items import Dangdang01Item
from scrapy.http import Request

class DangdangSpider(scrapy.Spider):
    name = "dangdang"
    allowed_domains = ["dangdang.com"]
    start_urls = [
        'http://category.dangdang.com/cp01.54.06.00.00.00-srsort_score_desc-f0%7C0%7C0%7C0%7C0%7C1%7C0%7C0%7C0%7C0%7C0%7C0%7C0-shlist.html']

    def parse(self, response):
        # Each book on the list page carries its title in p.name, its
        # detail-page link on the cover anchor (a.pic), and its review
        # count in the a[name='P_pl'] anchor.
        title = response.xpath("//p[@class='name']/a/text()").extract()
        link = response.xpath("//a[@class='pic']/@href").extract()
        comment = response.xpath("//a[@name='P_pl']/text()").extract()

        # The three lists are parallel: one entry per book on the page.
        for t, lnk, cmt in zip(title, link, comment):
            dd = Dangdang01Item()
            dd["title"] = t
            dd["link"] = lnk
            dd["comment"] = cmt
            yield dd

        # Queue listing pages 1-100. The loop runs on every response and so
        # re-yields the same URLs, but Scrapy's duplicate filter drops any
        # request it has already seen.
        for i in range(1, 101):
            url = ("http://category.dangdang.com/pg" + str(i) +
                   "-cp01.54.06.00.00.00-srsort_score_desc-f0%7C0%7C0%7C0%7C0%7C1%7C0%7C0%7C0%7C0%7C0%7C0%7C0-shlist.html")
            yield Request(url, callback=self.parse)

Create the database and table

(Screenshot in the original post: creating the dangdang database and the book table in the MySQL client.)
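Since the screenshot is not reproduced here, what follows is a minimal sketch of an equivalent setup, run as a one-off script. The database name (dangdang), the table name (book) and the column names (title, link, commit) are taken from pipelines.py below; the VARCHAR sizes are assumptions, because the original column types are only visible in the screenshot.

import pymysql

# Connect to the MySQL host without selecting a database yet.
conn = pymysql.connect(host="192.168.1.188", user="root",
                       password="654321", charset="utf8")
cursor = conn.cursor()
cursor.execute("CREATE DATABASE IF NOT EXISTS dangdang DEFAULT CHARACTER SET utf8")
cursor.execute("USE dangdang")
# Column names match the INSERT in pipelines.py; the sizes are guesses.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS book (
        title    VARCHAR(255),
        link     VARCHAR(255),
        `commit` VARCHAR(64)
    )
""")
conn.commit()
conn.close()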

Check the privilege grants

(Screenshot in the original post: the privilege grants for the root user.)
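Because Scrapy runs on Windows while MySQL sits on the CentOS box, the root account has to be allowed to connect from a remote host. The exact grant used here is only visible in the screenshot; on MySQL 5.6 it would typically look like GRANT ALL PRIVILEGES ON dangdang.* TO 'root'@'%' IDENTIFIED BY '654321'; followed by FLUSH PRIVILEGES;.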

pipelines.py

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymysql


class Dangdang01Pipeline(object):
    def __init__(self):
        # Connect to the remote MySQL server set up above.
        self.conn = pymysql.connect(host="192.168.1.188", user="root",
                                    password="654321", db="dangdang",
                                    charset="utf8")

    def process_item(self, item, spider):
        # Use a parameterized query so a quote inside a book title cannot
        # break the statement (the review-count column is named `commit`).
        sql = "insert into book(title, link, `commit`) values (%s, %s, %s)"
        cursor = self.conn.cursor()
        try:
            cursor.execute(sql, (item["title"], item["link"], item["comment"]))
            self.conn.commit()
        except Exception as e:
            print(e)
            self.conn.rollback()
        return item

    def close_spider(self, spider):
        # Scrapy passes the spider instance to this hook; the original
        # def close_spider(self) would raise a TypeError when called.
        self.conn.close()
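As the comment at the top of pipelines.py says, the pipeline only takes effect once it is registered in the project's settings.py, which the original post does not show. A minimal sketch (300 is just the conventional example priority):

ITEM_PIPELINES = {
    'dangdang01.pipelines.Dangdang01Pipeline': 300,
}

# Scrapy 1.x obeys robots.txt by default in new projects; if the category
# pages get filtered because of it, this (at your own risk) disables the check:
# ROBOTSTXT_OBEY = False

With the table created and the pipeline enabled, the crawl is started from the project root with scrapy crawl dangdang.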
