資料採集與融合技術第三次作業

李迦勒發表於2024-11-12

作業1

倉庫連結:https://gitee.com/jyppx000/crawl_project

作業①

要求:指定一個網站,爬取這個網站中的所有圖片,例如:中國氣象網(http://www.weather.com.cn)。使用scrapy框架分別實現單執行緒和多執行緒的方式爬取。

1.1 程式碼和圖片

我這裡列出核心程式碼

first.py

import scrapy
from bs4 import UnicodeDammit
from firstBlood.items import FirstbloodItem


class FirstSpider(scrapy.Spider):
    """Collect the address of every ``<img>`` found on the start pages.

    Each usable ``src`` value is resolved against the URL of the page it
    came from and emitted as a ``FirstbloodItem`` with the absolute address
    in its ``url`` field.
    """

    name = "first"
    # Controls which page numbers are crawled (1 through 56).
    start_urls = [f"http://www.weather.com.cn/page_{i}" for i in range(1, 57)]

    def parse(self, response, **kwargs):
        """Yield one ``FirstbloodItem`` per image on the page.

        Args:
            response: The downloaded page.
            **kwargs: Unused; accepted for callback compatibility.

        Yields:
            Items whose ``url`` field is the absolute image address.

        Any exception raised while decoding or walking the page is printed
        and ends processing of that page (best-effort crawl).
        """
        try:
            # Let UnicodeDammit choose between utf-8 and gbk for the raw body.
            dammit = UnicodeDammit(response.body, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            selector = scrapy.Selector(text=data)
            for src in selector.xpath("//img/@src").extract():
                src = src.strip()
                if not src:
                    continue
                item = FirstbloodItem()
                # Relative ("/i/a.png") and protocol-relative ("//host/a.png")
                # values cannot be downloaded as-is; make them absolute.
                item["url"] = response.urljoin(src)
                print(item["url"])
                yield item
        except Exception as err:
            print(err)

pipelines.py

import os
import urllib

class FirstbloodPipeline:
    """Download each item's image into an ``images`` directory.

    Files are named ``<n>.<ext>`` where ``n`` is the running number of
    successfully saved images. Once ``max_images`` files have been saved,
    the spider is asked to close and further items pass through untouched.
    """

    # Number of images saved so far; stored on the class, so it is shared
    # by every pipeline instance.
    count = 0
    # Upper bound on the number of images to download.
    max_images = 156
    # Extension used when the URL path does not carry one.
    default_extension = "jpg"

    @staticmethod
    def _extension_from_url(img_url, default):
        """Return the file extension of ``img_url``'s path, or ``default``."""
        from urllib.parse import urlsplit

        # Only the path component is inspected, so a query string such as
        # "?v=2" never ends up inside the file name.
        suffix = os.path.splitext(urlsplit(img_url).path)[1]
        return suffix[1:] or default

    def process_item(self, item, spider):
        """Save the image referenced by ``item['url']`` and return the item.

        Args:
            item: Mapping-like item with a ``url`` entry.
            spider: The running spider; only used to request shutdown once
                the download limit has been reached.

        Returns:
            The same ``item``, whether or not the download succeeded.
        """
        # ``import urllib`` alone does not guarantee the ``request``
        # submodule is loaded, so import it explicitly.
        import urllib.request

        if FirstbloodPipeline.count >= FirstbloodPipeline.max_images:
            spider.crawler.engine.close_spider(spider, "達到最大圖片數量,停止爬取")
            return item

        img_url = item['url']
        if not img_url:
            return item
        if img_url.startswith("//"):
            # Protocol-relative address: give it an explicit scheme.
            img_url = "http:" + img_url

        base_path = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
        image_path = os.path.join(base_path, "images")
        number = FirstbloodPipeline.count + 1
        extension = self._extension_from_url(img_url, self.default_extension)
        filename = os.path.join(image_path, f"{number}.{extension}")
        try:
            os.makedirs(image_path, exist_ok=True)  # make sure the directory exists
            urllib.request.urlretrieve(img_url, filename=filename)
        except Exception as e:
            # Best-effort download: report the failure and keep crawling.
            print(f"下載失敗: {e}")
        else:
            # Count only successful downloads so that ``max_images`` files
            # are really saved and file numbers stay contiguous.
            FirstbloodPipeline.count = number
            print(f"成功爬取:{filename}")

        return item

items.py

class FirstbloodItem(scrapy.Item):
    """Container for a single scraped image reference."""

    # Address of the image as extracted by the spider.
    url = scrapy.Field()

settings.py

# Do not consult robots.txt before fetching pages.
ROBOTSTXT_OBEY = False
# Only emit log records at ERROR level or above.
LOG_LEVEL = "ERROR"
# Concurrency limits. This is the "multi-threaded" configuration of the
# exercise: many requests may be in flight at once.
# NOTE(review): the per-domain / per-IP limits (100) are higher than the
# global limit (32), so the global limit is presumably the one that takes
# effect — confirm against the Scrapy settings reference.
CONCURRENT_REQUESTS = 32
CONCURRENT_REQUESTS_PER_DOMAIN = 100
CONCURRENT_REQUESTS_PER_IP = 100

# Item pipelines to enable, mapped to their order value.
ITEM_PIPELINES = {
    'firstBlood.pipelines.FirstbloodPipeline': 300,  
}

main.py

import os
import sys
from scrapy import cmdline

# Put this script's own directory at the front of the import path.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# Run the spider named "first" through Scrapy's command-line entry point.
cmdline.execute(['scrapy', "crawl", "first"])

1.2 作業心得

  • 在實現特定功能的同時,我注意到了程式碼結構的重要性。透過將爬蟲邏輯與資料處理邏輯分開,保持了程式碼的清晰與可維護性
  • 在處理大量資料時,如何合理地組織和儲存資料就很重要了。我在實踐中學習到了如何透過Pipeline處理和儲存圖片,並在過程中避免重複下載。這使我對資料儲存和管理有了更深入的理解
  • 透過處理下載過程中可能出現的異常,我的錯誤處理能力得到了提升。讓我在實際操作中更加從容應對各種問題

作業②

要求:熟練掌握 scrapy 中 Item、Pipeline 資料的序列化輸出方法;使用scrapy框架+Xpath+MySQL資料庫儲存技術路線爬取股票相關資訊。

2.1 程式碼和圖片

second.py

import re
import scrapy
import requests
from secondBlood.items import SecondbloodItem


class SecondSpider(scrapy.Spider):
    """Emit one ``SecondbloodItem`` per stock row returned by the quote API.

    The listing page in ``start_url`` only triggers the crawl; the quote
    data itself is fetched from the JSONP endpoint in ``api_url``.
    """

    name = "second"
    start_url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
    # JSONP endpoint queried for the actual quote rows.
    api_url = "https://68.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124009413428787683675_1696660278138&pn=1&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f12,f14,f2,f3,f4,f5,f6,f7,f15,f16,f17,f18&_=1696660278155"
    # (item field, API key) pairs. Looking values up by key replaces the
    # earlier positional mapping, which assumed every row listed its keys
    # in the order f2, f3, ..., f18.
    field_map = (
        ("stockname", "f12"),
        ("name", "f14"),
        ("newprice", "f2"),
        ("zhangdiefu", "f3"),
        ("zhangdieer", "f4"),
        ("chengjiaoliang", "f5"),
        ("chengjiaoer", "f6"),
        ("zhenfu", "f7"),
        ("zuigao", "f15"),
        ("zuidi", "f16"),
        ("jinkai", "f17"),
        ("zuoshou", "f18"),
    )

    def start_requests(self):
        """Request the listing page; its response triggers ``parse``."""
        url = SecondSpider.start_url
        yield scrapy.Request(url=url, callback=self.parse)

    @staticmethod
    def _extract_rows(text):
        """Return the per-stock dicts embedded in a JSON or JSONP body.

        Numbers are kept as the exact text the server sent (``parse_float``
        and ``parse_int`` are set to ``str``), so item values remain strings
        as they were with the earlier text-splitting approach.
        """
        import json

        body = text.strip()
        if not body.startswith("{"):
            # JSONP: strip the ``callback( ... )`` wrapper.
            body = body[body.index("(") + 1:body.rindex(")")]
        payload = json.loads(body, parse_float=str, parse_int=str)
        diff = (payload.get("data") or {}).get("diff") or []
        if isinstance(diff, dict):
            # Tolerate the rows arriving as an index -> row mapping.
            diff = list(diff.values())
        return diff

    def parse(self, response, **kwargs):
        """Fetch the quote API and yield one item per stock row.

        Args:
            response: Response for ``start_url``; its content is not used.
            **kwargs: Unused; accepted for callback compatibility.

        Yields:
            ``SecondbloodItem`` objects populated according to ``field_map``.

        Failures while fetching or decoding the API response are printed
        and result in no items (best-effort crawl).
        """
        try:
            # A timeout keeps a stalled connection from hanging the crawl.
            r = requests.get(url=self.api_url, timeout=10)
            records = [
                {field: row[key] for field, key in self.field_map}
                for row in self._extract_rows(r.text)
            ]
        except Exception as err:
            print(err)
            return

        for record in records:
            item = SecondbloodItem()
            for field, value in record.items():
                item[field] = value
            print(item)
            yield item

pipelines.py

import pymysql


class SecondbloodPipeline:
    """Persist every scraped stock item as a row of the MySQL ``stocks`` table."""

    def __init__(self):
        """Connect to the local ``stock`` database and create the table if absent."""
        connection_options = dict(
            host="127.0.0.1",
            port=3306,
            user='root',
            password='123456',
            database="stock",
            charset='utf8',
        )
        self.mydb = pymysql.connect(**connection_options)
        self.cursor = self.mydb.cursor()
        # Every column is stored as text, exactly as delivered by the spider.
        create_table_sql = '''CREATE TABLE IF NOT EXISTS stocks(
                                      stockname VARCHAR(256),
                                      name VARCHAR(256),
                                      newprice VARCHAR(256),
                                      zhangdiefu VARCHAR(256),
                                      zhangdieer VARCHAR(256),
                                      chengjiaoliang VARCHAR(256),
                                      chengjiaoer VARCHAR(256),
                                      zhenfu VARCHAR(256),
                                      zuigao VARCHAR(256),
                                      zuidi VARCHAR(256),
                                      jinkai VARCHAR(256),
                                      zuoshou VARCHAR(256)
                                   )'''
        self.cursor.execute(create_table_sql)
        self.mydb.commit()

    def process_item(self, item, spider):
        """Insert ``item`` as one row and return it.

        The insert is committed immediately; on any error the message is
        printed and the transaction is rolled back. Missing fields are
        passed to the database as ``None``.
        """
        column_order = (
            "stockname", "name", "newprice",
            "zhangdiefu", "zhangdieer",
            "chengjiaoliang", "chengjiaoer",
            "zhenfu", "zuigao",
            "zuidi", "jinkai",
            "zuoshou",
        )
        try:
            insert_sql = "INSERT INTO stocks VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
            row_values = tuple(item.get(column) for column in column_order)
            self.cursor.execute(insert_sql, row_values)
            self.mydb.commit()
            print("Successfully inserted:", item)
        except Exception as e:
            print("Error inserting item:", e)
            # Undo the failed statement so later inserts start clean.
            self.mydb.rollback()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes."""
        self.cursor.close()
        self.mydb.close()

items.py

import scrapy


class SecondbloodItem(scrapy.Item):
    """One stock quote row; one field per column of the ``stocks`` table.

    NOTE(review): most field names look like pinyin. The meanings given in
    the comments below are presumed from the names — confirm.
    """

    stockname = scrapy.Field()  # first value of each row built by the spider
    name = scrapy.Field()  # second value of each row built by the spider
    newprice = scrapy.Field()  # presumably the latest price
    zhangdiefu = scrapy.Field()  # presumably the change in percent
    zhangdieer = scrapy.Field()  # presumably the change as an amount
    chengjiaoliang = scrapy.Field()  # presumably the trading volume
    chengjiaoer = scrapy.Field()  # presumably the turnover amount
    zhenfu = scrapy.Field()  # presumably the amplitude
    zuigao = scrapy.Field()  # presumably the day's high
    zuidi = scrapy.Field()  # presumably the day's low
    jinkai = scrapy.Field()  # presumably today's opening price
    zuoshou = scrapy.Field()  # presumably the previous close

settings.py

# Do not consult robots.txt before fetching pages.
ROBOTSTXT_OBEY = False
# Only emit log records at ERROR level or above.
LOG_LEVEL = "ERROR"
# Item pipelines to enable, mapped to their order value.
ITEM_PIPELINES = {
    'secondBlood.pipelines.SecondbloodPipeline': 1,
}

main.py

import os
import sys
from scrapy import cmdline

# Put this script's own directory at the front of the import path.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# Run the spider named "second" through Scrapy's command-line entry point.
cmdline.execute(['scrapy', "crawl", "second"])

2.2 作業心得

  • 我瞭解了如何使用Scrapy框架抓取和解析網頁資料。使用requests庫獲取API資料,結合正規表示式進行資料提取

  • 在將爬取的資料儲存到MySQL資料庫的過程中,我掌握了如何在Pipeline中處理資料儲存

  • 深入瞭解了異常處理機制

作業③:

要求:熟練掌握 scrapy 中 Item、Pipeline 資料的序列化輸出方法;使用scrapy框架+Xpath+MySQL資料庫儲存技術路線爬取外匯網站資料(中國銀行外匯牌價:https://www.boc.cn/sourcedb/whpj/)。

3.1 程式碼和圖片

third.py

import time
import scrapy
from lxml import etree
from selenium import webdriver
from thirdBlood.items import ThirdbloodItem
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options


class ThirdSpider(scrapy.Spider):
    """Scrape the exchange-rate table from the page rendered by headless Chrome.

    The page is loaded with Selenium, its rendered source is parsed with
    lxml, and one ``ThirdbloodItem`` is emitted for every table row after
    the first.
    """

    name = "third"
    # Location of the chromedriver binary. Being a plain attribute, it can
    # be overridden per run with ``-a driver_path=...`` because Scrapy
    # copies spider arguments onto the instance.
    driver_path = r'D:\tools\package\chromedriver-win64\chromedriver.exe'

    def __init__(self, *args, **kwargs):
        """Start a headless Chrome session that lives as long as the spider."""
        super(ThirdSpider, self).__init__(*args, **kwargs)
        chrome_options = Options()
        chrome_options.add_argument("--headless")
        self.driver = webdriver.Chrome(service=Service(self.driver_path), options=chrome_options)

    def start_requests(self):
        """Render the page in Chrome and hand the parsed tree to ``parse``."""
        url = 'https://www.boc.cn/sourcedb/whpj/'
        self.driver.get(url)
        time.sleep(1)  # brief pause before reading the rendered source
        html = etree.HTML(self.driver.page_source)
        # The lxml tree travels in ``meta``; ``parse`` reads it from there
        # and does not look at the Scrapy response body.
        yield scrapy.Request(url, self.parse, meta={'html': html})

    @staticmethod
    def _cell_text(row, column):
        """Return the first text node of the ``column``-th ``<td>``, or ''."""
        texts = row.xpath(f'./td[{column}]/text()')
        return texts[0] if texts else ''

    def parse(self, response):
        """Yield one ``ThirdbloodItem`` per table row after the first.

        Args:
            response: Scrapy response whose ``meta['html']`` holds the lxml
                tree built from the Selenium-rendered page.

        Yields:
            Items with ``currency``, ``TBP``, ``CBP``, ``TSP``, ``CSP`` and
            ``time`` taken from table columns 1, 2, 3, 4, 5 and 8.
        """
        html = response.meta['html']
        rows = html.xpath('/html/body/div/div[5]/div[1]/div[2]/table/tbody/tr')
        # The first <tr> is skipped, exactly as the former counter did.
        for row in rows[1:]:
            item = ThirdbloodItem()
            item["currency"] = self._cell_text(row, 1)
            item["TBP"] = self._cell_text(row, 2)
            item["CBP"] = self._cell_text(row, 3)
            item["TSP"] = self._cell_text(row, 4)
            item["CSP"] = self._cell_text(row, 5)
            item["time"] = self._cell_text(row, 8)
            yield item

    def closed(self, reason):
        """Shut the browser down when the spider closes."""
        self.driver.quit()

pipelines.py

import pymysql


class ThirdbloodPipeline:
    """Insert each exchange-rate item into the MySQL ``currency`` table."""

    def process_item(self, item, spider):
        """Print the item, store it as one row, and return it.

        A fresh connection is opened and closed for every item. Any error
        (missing field, connection failure, failed insert) is printed and
        the item is still returned, so one bad row never stops the crawl.

        Args:
            item: Mapping-like item with ``currency``, ``TSP``, ``CSP``,
                ``TBP``, ``CBP`` and ``time`` entries.
            spider: The running spider (unused).

        Returns:
            The same ``item``.
        """
        # Bound up front so the ``finally`` clause can tell whether they
        # were ever created; otherwise an early failure would surface as an
        # UnboundLocalError raised from the cleanup code.
        mydb = None
        mycursor = None
        try:
            print(item["currency"])
            print(item["TSP"])
            print(item["CSP"])
            print(item["TBP"])
            print(item["CBP"])
            print(item["time"])
            print()

            # Insert the data into the database table.
            mydb = pymysql.connect(host="127.0.0.1", port=3306, user="root", passwd="123456", db="china_bank",
                                   charset="utf8")
            mycursor = mydb.cursor()
            sql = "INSERT INTO currency (currency, TSP, CSP, TBP, CBP, time) VALUES (%s, %s, %s, %s, %s, %s)"
            val = (item["currency"], item["TSP"], item["CSP"], item["TBP"], item["CBP"], item["time"])

            # Show the values about to be inserted.
            print(f"Inserting into currency: {val}")

            mycursor.execute(sql, val)
            mydb.commit()
        except Exception as err:
            print(f"Error: {err}")
        finally:
            if mycursor is not None:
                mycursor.close()
            if mydb is not None:
                mydb.close()
        return item

items.py

import scrapy


class ThirdbloodItem(scrapy.Item):
    """One row of the exchange-rate table.

    The column numbers in the comments below are the ``<td>`` positions
    that ``ThirdSpider.parse`` reads for each field.
    """

    currency = scrapy.Field()  # table column 1
    TSP = scrapy.Field()  # table column 4
    CSP = scrapy.Field()  # table column 5
    TBP = scrapy.Field()  # table column 2
    CBP = scrapy.Field()  # table column 3
    time = scrapy.Field()  # table column 8

settings.py

# Do not consult robots.txt before fetching pages.
ROBOTSTXT_OBEY = False
# Only emit log records at ERROR level or above.
LOG_LEVEL = "ERROR"
# Item pipelines to enable, mapped to their order value.
ITEM_PIPELINES = {
    'thirdBlood.pipelines.ThirdbloodPipeline': 1,
}

main.py

import os
import sys
from scrapy import cmdline

# Put this script's own directory at the front of the import path.
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
# Run the spider named "third" through Scrapy's command-line entry point.
cmdline.execute(['scrapy', "crawl", "third"])

3.2 作業心得

  • 學會了如何將Selenium與Scrapy結合使用,以處理動態網頁的資料抓取。使用Selenium能夠獲取需要的JavaScript渲染後的內容
  • 在解析抓取的HTML內容時,我熟悉了對XPath選擇器的使用。透過XPath可以精準定位到需要的資料,提高了抓取效率和資料提取的準確性

相關文章