Data Collection and Fusion Technology - Third Practical Assignment

Posted by PZn on 2024-10-30

Assignment 1

Requirement: Specify a website and crawl all of its images, for example the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both single-threaded and multi-threaded modes.

– Be sure to cap the total number of downloaded images (based on the last two digits of your student ID) as a measure to limit the crawl.

Gitee link
https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo1

Single-threaded spider code

import requests
import scrapy

from demo1.items import ImgItem


class FZuSpider(scrapy.Spider):
    name = "FZu"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw.htm"]

    def parse(self, response):
        # Collect every image URL on the current listing page.
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            # Absolute image URL with any query string stripped.
            item['img_url'] = response.urljoin(img).split('?')[0]
            # Download the image bytes synchronously with requests.
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        # Follow the link right after the current page marker to reach the next page.
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        yield scrapy.Request(url=next_url, callback=self.parse)
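
The single-threaded behaviour and the download cap are easiest to enforce through the project settings rather than in the spider itself. Below is a minimal sketch of the relevant entries in demo1's settings.py; the cap of 31 mirrors the last two digits of the student number in the Gitee link and is only an assumed example.

# demo1/settings.py (sketch; concrete values are assumptions)

# Issue requests one at a time so the spider effectively runs single-threaded.
CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 0.5  # be polite to the target site

# Stop the crawl automatically once the required number of items has been scraped.
CLOSESPIDER_ITEMCOUNT = 31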

Multi-threaded spider code

import requests
import scrapy

from demo1.items import ImgItem


# Six spiders, each starting from a different listing page and stopping at a
# fixed page number, so that separate page ranges can be crawled in parallel.
class FZuSpider1(scrapy.Spider):
    name = "FZu1"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img).split('?')[0]
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        # Link to the next listing page (the element right after the current page number).
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        # Stop once page 10 has been reached.
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 10:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider2(scrapy.Spider):
    name = "FZu2"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/711.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        # Stop once page 20 has been reached.
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 20:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider3(scrapy.Spider):
    name = "FZu3"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/701.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        # Stop once page 30 has been reached.
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 30:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider4(scrapy.Spider):
    name = "FZu4"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/691.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        # Stop once page 40 has been reached.
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 40:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider5(scrapy.Spider):
    name = "FZu5"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/681.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        # Stop once page 50 has been reached.
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 50:
            yield scrapy.Request(url=next_url, callback=self.parse)


class FZuSpider6(scrapy.Spider):
    name = "FZu6"
    allowed_domains = ["news.fzu.edu.cn"]
    start_urls = ["https://news.fzu.edu.cn/fdyw/671.htm"]

    def parse(self, response):
        imgs = response.xpath("//img/@src").extract()
        for img in imgs:
            item = ImgItem()
            item['img_url'] = response.urljoin(img)
            item['img'] = requests.get(response.urljoin(img)).content
            yield item
        part_url = response.xpath("//span[@class = 'p_no_d']/following-sibling::*[position() = 1]/a/@href").extract_first()
        next_url = response.urljoin(part_url)
        print(response.xpath("//span[@class = 'p_no_d']/text()").extract_first())
        # Stop once page 60 has been reached.
        if int(response.xpath("//span[@class = 'p_no_d']/text()").extract_first()) != 60:
            yield scrapy.Request(url=next_url, callback=self.parse)
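
The report does not show how the six spiders are launched together. A minimal sketch using Scrapy's CrawlerProcess could look like the following; it assumes all six classes live in a single module under demo1/spiders/ (the module name fzu_multi used here is hypothetical).

# run.py (sketch): launch the six spiders concurrently in one process.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from demo1.spiders.fzu_multi import (FZuSpider1, FZuSpider2, FZuSpider3,
                                     FZuSpider4, FZuSpider5, FZuSpider6)

process = CrawlerProcess(get_project_settings())
for spider_cls in (FZuSpider1, FZuSpider2, FZuSpider3,
                   FZuSpider4, FZuSpider5, FZuSpider6):
    process.crawl(spider_cls)
process.start()  # blocks until every spider has finished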

items code

import scrapy


class ImgItem(scrapy.Item):
    img_url = scrapy.Field()  # absolute URL of the image
    img = scrapy.Field()      # raw image bytes

pipelines code

import os
import re
from urllib.parse import urlparse, unquote


class Demo1Pipeline:
    def process_item(self, item, spider):
        img_url = item['img_url']
        img = item['img']

        # Parse the URL and keep only its path component.
        parsed_url = urlparse(img_url)
        path = unquote(parsed_url.path)  # decode the path in case it contains escaped characters
        filename = os.path.basename(path)  # file name portion of the path

        # Sanitize the file name by replacing characters that are illegal on most file systems.
        clean_filename = re.sub(r'[\\/:*?"<>|]', '_', filename)

        # Make sure the images directory exists.
        img_dir = 'images'
        os.makedirs(img_dir, exist_ok=True)

        # Build the full file path and write the image bytes to disk.
        file_path = os.path.join(img_dir, clean_filename)
        try:
            with open(file_path, 'wb') as f:
                f.write(img)
        except Exception as e:
            print(f"Could not create file {file_path}: {e}")
        return item
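
For the pipeline to run it has to be enabled in the project settings. A one-line sketch, assuming the default demo1 project layout with the class in demo1/pipelines.py, is:

# demo1/settings.py (sketch)
ITEM_PIPELINES = {
    'demo1.pipelines.Demo1Pipeline': 300,
}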

Results

Reflections

Efficiency

Multi-threaded crawling can significantly improve crawl efficiency, especially when downloading a large number of images. Concurrent requests reduce idle waiting time and speed up the overall crawl.

Resource consumption

Multi-threaded crawling consumes more system resources such as CPU and memory. When resources are limited, the number of concurrent requests needs to be set sensibly to avoid overloading the system.

Anti-crawling mechanisms

Many websites have anti-crawling mechanisms, and too many requests may trigger them and get the IP banned. The crawler therefore has to be designed with reasonable request intervals and concurrency levels.

Error handling

Various errors can occur during a crawl, such as network failures or 404 responses. Handling these errors properly improves the crawler's stability and reliability.

Legal compliance

When crawling website data you must comply with the relevant laws and regulations, respect the site's robots.txt file, and avoid putting excessive load on the site.

Assignment 2

Requirement: Master the serialization and output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage route to crawl stock information. Candidate site: East Money: https://www.eastmoney.com/. Output: the data is stored in a MySQL database; column headers are named in English (for example id for the sequence number, bStockNo for the stock code, and so on) and the schema is designed by each student.

Gitee link
https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo2

Spider code

import scrapy

from demo2.items import StockItem


class StockSpider(scrapy.Spider):
    name = "stock"
    allowed_domains = ["www.eastmoney.com"]
    start_urls = ["https://quote.eastmoney.com/center/gridlist.html#hs_a_board"]

    def parse(self, response):
        # Each row of the quote table is one stock; the table is rendered by
        # JavaScript, so the page is fetched through the Selenium middleware shown below.
        stocks = response.xpath("//tbody//tr")
        for stock in stocks:
            item = StockItem()
            item['id'] = stock.xpath('.//td[position() = 1]//text()').extract_first()
            item['code'] = stock.xpath('.//td[position() = 2]//text()').extract_first()
            item['name'] = stock.xpath('.//td[position() = 3]//text()').extract_first()
            item['newPrice'] = stock.xpath('.//td[position() = 5]//text()').extract_first()
            item['price_change_amplitude'] = stock.xpath('.//td[position() = 6]//text()').extract_first()
            item['price_change_Lines'] = stock.xpath('.//td[position() = 7]//text()').extract_first()
            item['volume'] = stock.xpath('.//td[position() = 8]//text()').extract_first()
            item['turnover'] = stock.xpath('.//td[position() = 9]//text()').extract_first()
            item['amplitude'] = stock.xpath('.//td[position() = 10]//text()').extract_first()
            item['highest'] = stock.xpath('.//td[position() = 11]//text()').extract_first()
            item['lowest'] = stock.xpath('.//td[position() = 12]//text()').extract_first()
            item['today'] = stock.xpath('.//td[position() = 13]//text()').extract_first()
            item['yesterday'] = stock.xpath('.//td[position() = 14]//text()').extract_first()
            yield item

items code

import scrapy


class StockItem(scrapy.Item):
    id = scrapy.Field()                      # row number
    code = scrapy.Field()                    # stock code
    name = scrapy.Field()                    # stock name
    newPrice = scrapy.Field()                # latest price
    price_change_amplitude = scrapy.Field()  # price change percentage
    price_change_Lines = scrapy.Field()      # price change amount
    volume = scrapy.Field()                  # trading volume
    turnover = scrapy.Field()                # turnover
    amplitude = scrapy.Field()               # amplitude
    highest = scrapy.Field()                 # daily high
    lowest = scrapy.Field()                  # daily low
    today = scrapy.Field()                   # today's opening price
    yesterday = scrapy.Field()               # yesterday's closing price

Using Selenium as the downloader middleware

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:
    def process_request(self, request, spider):
        # Render the page in a real browser so the JavaScript-generated quote
        # table is present in the HTML handed back to the spider.
        url = request.url
        driver = webdriver.Edge()
        driver.get(url)
        time.sleep(3)  # wait for the page scripts to finish rendering
        data = driver.page_source
        driver.quit()  # shut the browser down completely, not just the current window
        return HtmlResponse(url=url, body=data.encode('utf-8'), encoding='utf-8', request=request)
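
The middleware only takes effect once it is registered in the project settings. A minimal sketch, assuming the class lives in demo2/middlewares.py, is:

# demo2/settings.py (sketch)
DOWNLOADER_MIDDLEWARES = {
    'demo2.middlewares.SeleniumMiddleware': 543,
}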

pipelines code

import pymysql


# Database connection settings.
host = '127.0.0.1'
port = 3306
user = 'root'
password = 'yabdylm'
database = 'pycharm'


class Demo2Pipeline:
    def __init__(self):
        self.con = pymysql.connect(host=host, port=port, user=user, password=password,
                                   database=database, charset='utf8mb4')
        self.cursor = self.con.cursor()
        # Create the target table if it does not exist yet.
        self.cursor.execute("""
            CREATE TABLE IF NOT EXISTS stockData (
                id INT,
                code VARCHAR(255),
                name VARCHAR(255),
                newPrice VARCHAR(255),
                price_change_amplitude VARCHAR(255),
                price_change_Lines VARCHAR(255),
                volume VARCHAR(255),
                turnover VARCHAR(255),
                amplitude VARCHAR(255),
                highest VARCHAR(255),
                lowest VARCHAR(255),
                today VARCHAR(255),
                yesterday VARCHAR(255)
            );
        """)

    def process_item(self, item, spider):
        try:
            id = item['id']
            code = item['code']
            name = item['name']
            newPrice = item['newPrice']
            price_change_amplitude = item['price_change_amplitude']
            price_change_Lines = item['price_change_Lines']
            volume = item['volume']
            turnover = item['turnover']
            amplitude = item['amplitude']
            highest = item['highest']
            lowest = item['lowest']
            today = item['today']
            yesterday = item['yesterday']

            # Insert one row per stock.
            self.cursor.execute("""
                INSERT INTO stockData VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)
            """, (id, code, name, newPrice, price_change_amplitude, price_change_Lines, volume, turnover, amplitude,
                  highest, lowest, today, yesterday))
            self.con.commit()  # commit the transaction
        except Exception as e:
            print(f"An error occurred: {e}")
        return item

    def close_spider(self, spider):
        # Called by Scrapy when the spider finishes; close the connection here
        # rather than relying on __del__.
        self.con.close()
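
After a run, a quick check that the rows actually landed in MySQL can be done with a few lines of pymysql. This is only a verification sketch using the same connection parameters as the pipeline.

# check_stock.py (sketch): print a few stored rows for verification.
import pymysql

con = pymysql.connect(host='127.0.0.1', port=3306, user='root',
                      password='yabdylm', database='pycharm', charset='utf8mb4')
with con.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM stockData")
    print("rows stored:", cursor.fetchone()[0])
    cursor.execute("SELECT id, code, name, newPrice FROM stockData LIMIT 5")
    for row in cursor.fetchall():
        print(row)
con.close()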

Results

Reflections

Project structure and code organization:

Before writing the spider it is important to plan the project structure and code organization, including how the Item, Pipeline and Spider components are defined and how their code files are laid out. Good organization improves readability and maintainability.

Item definition:

When defining the Item, the fields have to be designed around the target site's data structure, which requires a clear understanding of that structure and some foresight about fields that may be needed later. For East Money this means fields such as the stock code, name, price and trading volume.

Assignment 3

Requirement: Master the serialization and output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage route to crawl foreign-exchange data. Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/

Gitee link
https://gitee.com/xiaoaibit/102202131_LX/tree/master/homework3/demo3

Spider code

import scrapy

from demo3.items import BankItem


class BankSpider(scrapy.Spider):
    name = "bank"
    allowed_domains = ["www.boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        # Rows of the exchange-rate table; skip the header rows at the top and
        # the footer rows at the bottom.
        banks = response.xpath('//tbody[position() = 1]/tr')
        for i in range(2, len(banks) - 2):
            bank = banks[i]
            item = BankItem()
            item['Currency'] = bank.xpath(".//td[position() = 1]//text()").extract_first()
            item['TBP'] = bank.xpath(".//td[position() = 2]//text()").extract_first()
            item['CBP'] = bank.xpath(".//td[position() = 3]//text()").extract_first()
            item['TSP'] = bank.xpath(".//td[position() = 4]//text()").extract_first()
            item['CSP'] = bank.xpath(".//td[position() = 5]//text()").extract_first()
            item['Time'] = bank.xpath(".//td[position() = 8]//text()").extract_first()
            yield item

items code

import scrapy


class BankItem(scrapy.Item):
    Currency = scrapy.Field()  # currency name
    TBP = scrapy.Field()       # telegraphic transfer buying price
    CBP = scrapy.Field()       # cash buying price
    TSP = scrapy.Field()       # telegraphic transfer selling price
    CSP = scrapy.Field()       # cash selling price
    Time = scrapy.Field()      # publication time

Selenium middleware code

import time

from scrapy.http import HtmlResponse
from selenium import webdriver


class SeleniumMiddleware:
    def process_request(self, request, spider):
        url = request.url
        driver = webdriver.Edge()
        driver.get(url)
        time.sleep(1)  # give the page a moment to finish rendering
        data = driver.page_source
        driver.quit()  # close the browser before handing back the response
        return HtmlResponse(url=url, body=data.encode('utf-8'), encoding='utf-8', request=request)

pipelines code

import pymysql
from scrapy.exceptions import DropItem

class BankPipeline:
    def __init__(self):
        # Fill in your database configuration here.
        self.host = 'localhost'
        self.database = 'pycharm'
        self.user = 'root'
        self.password = 'yabdylm'

        # Create the database connection.
        self.con = pymysql.connect(
            host=self.host,
            user=self.user,
            password=self.password,
            database=self.database,
            charset='utf8mb4'  # utf8mb4 supports the full character set
        )
        self.cursor = self.con.cursor()


    def process_item(self, item, spider):
        # SQL insert statement.
        insert_sql = """
            INSERT INTO bankData (Currency, TBP, CBP, TSP, CSP, Time)
            VALUES (%s, %s, %s, %s, %s, %s)
        """
        try:
            # Execute the insert.
            self.cursor.execute(
                insert_sql,
                (
                    item['Currency'],
                    item['TBP'],
                    item['CBP'],
                    item['TSP'],
                    item['CSP'],
                    item['Time']
                )
            )
            # Commit the transaction.
            self.con.commit()
        except pymysql.Error as e:
            # Roll back the transaction on error and drop the item.
            self.con.rollback()
            raise DropItem(f"Error inserting row {item!r} into database: {e}")

        return item

    def close_spider(self, spider):
        # Close the database connection when the spider finishes.
        self.cursor.close()
        self.con.close()
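
Unlike the Assignment 2 pipeline, this one assumes the bankData table already exists. A small sketch of how it could be created up front (once by hand, or in __init__ like in demo2) is shown below; storing every rate column as VARCHAR mirrors the demo2 schema and is an assumption.

# Sketch: create the bankData table once before the first run (column types are an assumption).
import pymysql

con = pymysql.connect(host='localhost', user='root', password='yabdylm',
                      database='pycharm', charset='utf8mb4')
with con.cursor() as cursor:
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS bankData (
            Currency VARCHAR(255),
            TBP VARCHAR(255),
            CBP VARCHAR(255),
            TSP VARCHAR(255),
            CSP VARCHAR(255),
            Time VARCHAR(255)
        );
    """)
con.commit()
con.close()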

Results

Reflections

XPath selectors:

Extracting data with XPath selectors requires some familiarity with XPath syntax. Since a site's structure can change, the selectors should be written with enough flexibility and robustness to cope with such changes.

Data cleaning and processing:

Database design:

When designing the database tables, the relationships between the data and query efficiency need to be considered, for example whether foreign keys or indexes are needed. The English column names should also follow database naming conventions, be easy to understand, and avoid clashing with reserved keywords.
