Data Collection Assignment 3

Posted by 淋祁 on 2024-11-11

Third assignment for Data Collection and Fusion Technology
Student ID / name: 102202128 林子豪

Assignment ①:
Requirement: Pick a website and crawl all of the images on it, e.g. the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both single-threaded and multi-threaded form. Be sure to limit the crawl, e.g. by total pages (last 2 digits of the student ID) and total number of downloaded images (last 3 digits). Output: print the downloaded URLs to the console, save the downloaded images to the images subfolder, and provide screenshots.

(1) Code:

items.py

import scrapy

class WeatherImageItem(scrapy.Item):
    image_urls = scrapy.Field()  # field holding the image URLs to download
    images = scrapy.Field()      # field holding the downloaded image results

spider.py

import scrapy
from weather_image_scraper.items import WeatherImageItem

class WeatherSpiderSpider(scrapy.Spider):
    name = 'weather_images'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']

    # Limits on the total number of pages and downloaded images
    total_pages = 3
    total_images = 103
    images_crawled = 0
    current_page = 1

    def parse(self, response):
        # Extract the image links on the current page
        img_urls = response.css('img::attr(src)').getall()

        for img_url in img_urls:
            if self.images_crawled < self.total_images:
                full_url = response.urljoin(img_url)        # make relative URLs absolute
                self.logger.info(f'Image URL: {full_url}')  # print the URL to the console
                item = WeatherImageItem()
                item['image_urls'] = [full_url]             # the field read by the ImagesPipeline
                yield item
                self.images_crawled += 1
            else:
                break  # stop once the image limit is reached

        # Decide whether to crawl the next page
        if self.current_page < self.total_pages:
            self.current_page += 1
            next_page_url = f'http://www.weather.com.cn/page/{self.current_page}'  # assumed pagination URL pattern
            yield scrapy.Request(url=next_page_url, callback=self.parse)

pipelines.py

from scrapy.pipelines.images import ImagesPipeline
import scrapy

class WeatherImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        item['images'] = image_paths
        return item

settings.py

ITEM_PIPELINES = {
    'weather_image_scraper.pipelines.WeatherImagesPipeline': 1,
}

IMAGES_STORE = 'C:/Users/86158/examples'  # where the downloaded images are stored

ROBOTSTXT_OBEY = False

# Single-threaded run
CONCURRENT_REQUESTS = 1

# Multi-threaded run (use this line instead of the one above)
# CONCURRENT_REQUESTS = 10

Results:

Downloaded images:

(The site only provides a limited number of images, so this is all that was crawled.)
(2) Reflections: While implementing both the single-threaded and multi-threaded crawls, I clearly felt the speed advantage of the multi-threaded version. Thanks to Scrapy's asynchronous processing, the multi-threaded crawl can issue several requests at the same time, which significantly improves crawling efficiency.
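
The two modes can also be switched without editing settings.py each time: Scrapy allows a setting to be overridden per run, e.g. scrapy crawl weather_images -s CONCURRENT_REQUESTS=16 on the command line, or from a small launcher script. The snippet below is a minimal sketch of the latter, assuming it is executed from the project root so that get_project_settings() picks up weather_image_scraper's settings; the value 16 is only an illustrative choice, not a number used in the experiment.

# run_spider.py -- illustrative launcher, not part of the original project
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

settings = get_project_settings()
settings.set('CONCURRENT_REQUESTS', 16)  # use 1 here for the single-threaded run

process = CrawlerProcess(settings)
process.crawl('weather_images')  # spider name defined in spider.py
process.start()                  # blocks until the crawl finishes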

Assignment ②
Requirement: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl stock information.
Candidate site: Eastmoney: https://www.eastmoney.com/

Output: the data is stored in a MySQL database, with the storage and output format as follows:
Table headers use English names, e.g. id for the serial number, bStockNo for the stock code, etc.; the exact naming is designed by each student.

(1) Code:

items.py

import scrapy

class StockItem(scrapy.Item):
    id = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    newPrice = scrapy.Field()
    price_change_amplitude = scrapy.Field()
    price_change_Lines = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()

spider.py

import scrapy
from stock_scraper.items import StockItem

class StockSpider(scrapy.Spider):
    name = 'stock_spider'
    allowed_domains = ['www.eastmoney.com']
    start_urls = ['https://quote.eastmoney.com/center/gridlist.html#hs_a_board']

    def parse(self, response):
        stocks = response.xpath("//tbody//tr")
        for stock in stocks:
            item = StockItem()
            item['id'] = stock.xpath('.//td[1]//text()').get()
            item['code'] = stock.xpath('.//td[2]//text()').get()
            item['name'] = stock.xpath('.//td[3]//text()').get()
            item['newPrice'] = stock.xpath('.//td[5]//text()').get()
            item['price_change_amplitude'] = stock.xpath('.//td[6]//text()').get()
            item['price_change_Lines'] = stock.xpath('.//td[7]//text()').get()
            item['volume'] = stock.xpath('.//td[8]//text()').get()
            item['turnover'] = stock.xpath('.//td[9]//text()').get()
            item['amplitude'] = stock.xpath('.//td[10]//text()').get()
            item['highest'] = stock.xpath('.//td[11]//text()').get()
            item['lowest'] = stock.xpath('.//td[12]//text()').get()
            item['today'] = stock.xpath('.//td[13]//text()').get()
            item['yesterday'] = stock.xpath('.//td[14]//text()').get()
            yield item

pipelines.py

import mysql.connector
from mysql.connector import Error

class MySQLPipeline:

    def open_spider(self, spider):
        try:
            self.connection = mysql.connector.connect(
                host='127.0.0.1',
                database='wwh',      # use your own database name
                user='root',
                password='123456'    # use your own password
            )
            self.cursor = self.connection.cursor()
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS stockData (
                    id INTEGER PRIMARY KEY AUTO_INCREMENT,
                    code VARCHAR(255) UNIQUE,  -- unique key so ON DUPLICATE KEY UPDATE can take effect
                    name VARCHAR(255),
                    newPrice VARCHAR(255),
                    price_change_amplitude VARCHAR(255),
                    price_change_Lines VARCHAR(255),
                    volume VARCHAR(255),
                    turnover VARCHAR(255),
                    amplitude VARCHAR(255),
                    highest VARCHAR(255),
                    lowest VARCHAR(255),
                    today VARCHAR(255),
                    yesterday VARCHAR(255)
                )
            ''')
        except Error as e:
            spider.logger.error(f"Error connecting to MySQL: {e}")

    def close_spider(self, spider):
        try:
            self.connection.commit()
        except Error as e:
            spider.logger.error(f"Error committing to MySQL: {e}")
        finally:
            self.cursor.close()
            self.connection.close()

    def process_item(self, item, spider):
        try:
            self.cursor.execute('''
                INSERT INTO stockData (code, name, newPrice, price_change_amplitude, price_change_Lines, volume, turnover, amplitude, highest, lowest, today, yesterday)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                newPrice=VALUES(newPrice),
                price_change_amplitude=VALUES(price_change_amplitude),
                price_change_Lines=VALUES(price_change_Lines),
                volume=VALUES(volume),
                turnover=VALUES(turnover),
                amplitude=VALUES(amplitude),
                highest=VALUES(highest),
                lowest=VALUES(lowest),
                today=VALUES(today),
                yesterday=VALUES(yesterday)
            ''', (
                item['code'],
                item['name'],
                item['newPrice'],
                item['price_change_amplitude'],
                item['price_change_Lines'],
                item['volume'],
                item['turnover'],
                item['amplitude'],
                item['highest'],
                item['lowest'],
                item['today'],
                item['yesterday']
            ))
            self.connection.commit()
        except Error as e:
            spider.logger.error(f"Error inserting data into MySQL: {e}")
        return item

middlewares.py

import time
from selenium import webdriver
from scrapy.http import HtmlResponse

class SeleniumMiddleware:
    def process_request(self, request, spider):
        # Set up the Selenium WebDriver
        driver = webdriver.Edge()

        try:
            # Open the URL
            driver.get(request.url)
            # Wait for the page to load
            time.sleep(3)
            # Grab the rendered page source
            data = driver.page_source
        finally:
            # Shut down the WebDriver
            driver.quit()

        # Return a constructed HtmlResponse object
        return HtmlResponse(url=request.url, body=data.encode('utf-8'), encoding='utf-8', request=request)
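
A note on this design: the middleware above starts and quits a fresh Edge browser for every single request, which is where most of the crawl time goes. A possible variant (a sketch only, not the code actually used here) keeps one WebDriver alive for the whole crawl and closes it when the spider finishes:

from scrapy import signals
from scrapy.http import HtmlResponse
from selenium import webdriver
import time

class SharedDriverSeleniumMiddleware:
    # Sketch: reuse a single WebDriver for all requests instead of one per request.

    @classmethod
    def from_crawler(cls, crawler):
        middleware = cls()
        # Quit the browser when the spider closes
        crawler.signals.connect(middleware.spider_closed, signal=signals.spider_closed)
        return middleware

    def __init__(self):
        self.driver = webdriver.Edge()

    def process_request(self, request, spider):
        self.driver.get(request.url)
        time.sleep(3)  # simple wait for the page to render
        data = self.driver.page_source
        return HtmlResponse(url=request.url, body=data.encode('utf-8'),
                            encoding='utf-8', request=request)

    def spider_closed(self, spider):
        self.driver.quit()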

settings.py

ITEM_PIPELINES = {
    'stock_scraper.pipelines.MySQLPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
    'stock_scraper.middlewares.SeleniumMiddleware': 543,
}

MYSQL_HOST = '127.0.0.1'
MYSQL_DATABASE = 'wwh'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
ROBOTSTXT_OBEY = False

Crawl results:

(2) Reflections: Through this exercise I gained a better understanding of parsing HTML documents with XPath; XPath selectors in Scrapy make it possible to locate and extract data from a page precisely.
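
As a small illustration of how the positional td[...] expressions used in parse() behave, the same selectors can be tried on a hand-written HTML fragment outside the spider (the row content below is made up purely for the example):

from scrapy import Selector

html = '''
<table><tbody>
  <tr><td>1</td><td>000001</td><td>demo</td><td>-</td><td>10.00</td></tr>
</tbody></table>
'''
row = Selector(text=html).xpath('//tbody//tr')[0]
print(row.xpath('.//td[2]//text()').get())  # '000001' -- the stock-code column
print(row.xpath('.//td[5]//text()').get())  # '10.00'  -- the latest-price column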

Assignment ③:
Requirement: Become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage to crawl data from a foreign-exchange website.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/

(1) Code:

items.py

import scrapy

class ForexItem(scrapy.Item):
    currency = scrapy.Field()
    tbp = scrapy.Field()
    cbp = scrapy.Field()
    tsp = scrapy.Field()
    csp = scrapy.Field()
    time = scrapy.Field()

spider.py

import scrapy
from forex_scraper.items import ForexItem

class BankSpider(scrapy.Spider):
    name = "forex_spider"
    allowed_domains = ["www.boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        # Select all rows in the first tbody
        rows = response.xpath('//tbody[1]/tr')

        # Skip the header rows at the top and the footer rows at the bottom
        for row in rows[2:-2]:
            item = ForexItem()
            item['currency'] = row.xpath(".//td[1]//text()").get()  # .get() returns the first matching text
            item['tbp'] = row.xpath(".//td[2]//text()").get()
            item['cbp'] = row.xpath(".//td[3]//text()").get()
            item['tsp'] = row.xpath(".//td[4]//text()").get()
            item['csp'] = row.xpath(".//td[5]//text()").get()
            item['time'] = row.xpath(".//td[8]//text()").get()
            yield item

pipelines.py

import mysql.connector
from mysql.connector import Error

class MySQLPipeline:
    def open_spider(self, spider):
        try:
            self.connection = mysql.connector.connect(
                host='127.0.0.1',
                user='root',          # replace with your MySQL user name
                password='123456',    # replace with your MySQL password
                database='wwh',       # replace with your database name
                charset='utf8mb4',
                use_unicode=True
            )
            self.cursor = self.connection.cursor()
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS exchange_rates (
                    id INTEGER PRIMARY KEY AUTO_INCREMENT,
                    currency VARCHAR(255),
                    tbp VARCHAR(255),
                    cbp VARCHAR(255),
                    tsp VARCHAR(255),
                    csp VARCHAR(255),
                    time VARCHAR(255)
                )
            ''')
            self.connection.commit()
        except Error as e:
            print(f"Error connecting to MySQL: {e}")

    def close_spider(self, spider):
        if self.connection.is_connected():
            self.cursor.close()
            self.connection.close()

    def process_item(self, item, spider):
        try:
            self.cursor.execute('''
                INSERT INTO exchange_rates (currency, tbp, cbp, tsp, csp, time)
                VALUES (%s, %s, %s, %s, %s, %s)
            ''', (item['currency'], item['tbp'], item['cbp'], item['tsp'], item['csp'], item['time']))
            self.connection.commit()
        except Error as e:
            print(f"Error inserting item into MySQL: {e}")
        return item

Everything else (settings and middleware) is similar to the previous assignment; a sketch of a matching settings.py is shown below.
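
A minimal sketch of what settings.py could look like for this project, assuming the Selenium middleware from the previous assignment is also copied into forex_scraper; the DOWNLOADER_MIDDLEWARES entry is only needed if the page is rendered through Selenium as before.

ITEM_PIPELINES = {
    'forex_scraper.pipelines.MySQLPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
    'forex_scraper.middlewares.SeleniumMiddleware': 543,  # only if Selenium rendering is reused
}
ROBOTSTXT_OBEY = False
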
Crawl results:

(2) Reflections: essentially the same takeaways as the previous assignment.
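
To double-check that the rows actually landed in MySQL, a quick verification script such as the following can be run (a sketch, assuming the same local database and credentials as in pipelines.py):

import mysql.connector

conn = mysql.connector.connect(host='127.0.0.1', user='root',
                               password='123456', database='wwh')
cur = conn.cursor()
cur.execute('SELECT currency, tbp, csp, time FROM exchange_rates LIMIT 5')
for row in cur.fetchall():
    print(row)  # one tuple per stored record
cur.close()
conn.close()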
