Data Collection and Fusion Technology: Homework 3
Student ID / Name: 102202128 Lin Zihao
Assignment ①:
Requirement: Pick one website and crawl all of the images on it, e.g. the China Weather Network (http://www.weather.com.cn). Use the Scrapy framework to implement the crawl in both single-threaded and multi-threaded modes. Be sure to limit the crawl, e.g. by the total number of pages (last 2 digits of the student ID) and the total number of downloaded images (last 3 digits). Output: print the downloaded URL information to the console, save the downloaded images under the images subfolder, and provide screenshots.
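Besides the counters kept inside the spider below, Scrapy's built-in close-spider settings can also enforce limits of this kind. A minimal sketch for settings.py (the values simply mirror the limits used in this report; note that CLOSESPIDER_ITEMCOUNT counts scraped items rather than finished image downloads):

CLOSESPIDER_PAGECOUNT = 3     # stop the spider after this many pages have been crawled
CLOSESPIDER_ITEMCOUNT = 103   # stop the spider after this many items have been scraped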
(1) Code as follows:
items.py
import scrapy

class WeatherImageItem(scrapy.Item):
    image_urls = scrapy.Field()  # URLs of the images to download
    images = scrapy.Field()      # download results filled in by the ImagesPipeline
spider.py
import scrapy
from weather_image_scraper.items import WeatherImageItem

class WeatherSpiderSpider(scrapy.Spider):
    name = 'weather_images'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']

    # Crawl limits: total number of pages and total number of images
    total_pages = 3
    total_images = 103
    images_crawled = 0
    current_page = 1

    def parse(self, response):
        # extract every image link on the current page
        img_urls = response.css('img::attr(src)').getall()
        for img_url in img_urls:
            if self.images_crawled < self.total_images:
                full_url = response.urljoin(img_url)
                self.logger.info('Image URL: %s', full_url)  # print the URL info to the console
                item = WeatherImageItem()
                item['image_urls'] = [full_url]  # the ImagesPipeline expects a list of URLs here
                yield item
                self.images_crawled += 1
            else:
                break  # reached the image limit, stop collecting

        # follow the next page if the page limit has not been reached yet
        if self.current_page < self.total_pages:
            self.current_page += 1
            next_page_url = f'http://www.weather.com.cn/page/{self.current_page}'  # assumed pagination URL pattern
            yield scrapy.Request(url=next_page_url, callback=self.parse)
pipelines.py
from scrapy.pipelines.images import ImagesPipeline
import scrapy

class WeatherImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        # issue one download request per image URL in the item
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        # keep the storage paths of the successfully downloaded images
        image_paths = [x['path'] for ok, x in results if ok]
        item['images'] = image_paths
        return item
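As an optional refinement (a sketch following the usual pattern from the Scrapy documentation, not part of the submitted code), item_completed can also drop items for which every image download failed:

from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline

class WeatherImagesPipeline(ImagesPipeline):
    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            # nothing was downloaded for this item, so discard it
            raise DropItem('No images downloaded')
        item['images'] = image_paths
        return item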
settings.py
ITEM_PIPELINES = {
    'weather_image_scraper.pipelines.WeatherImagesPipeline': 1,
}
IMAGES_STORE = 'C:/Users/86158/examples'  # directory where downloaded images are saved
ROBOTSTXT_OBEY = False
# Keep only one of the following lines active per run:
CONCURRENT_REQUESTS = 1    # single-threaded crawl
# CONCURRENT_REQUESTS = 10   # multi-threaded (concurrent) crawl
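Because a second assignment to CONCURRENT_REQUESTS would simply overwrite the first, the two modes are easiest to compare by overriding the setting on the command line instead of editing settings.py for every run (a usage sketch; the spider name comes from the code above):

scrapy crawl weather_images -s CONCURRENT_REQUESTS=1     # single-threaded run
scrapy crawl weather_images -s CONCURRENT_REQUESTS=10    # concurrent (multi-threaded) run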
Results:
Downloaded images:
(The site only has a limited number of images, so only this many were crawled.)
(2) Reflections: While implementing the single-threaded and multi-threaded crawls, I clearly noticed the speed advantage of the multi-threaded version. Thanks to Scrapy's asynchronous processing, the concurrent crawl can issue multiple requests at the same time, which significantly improves crawling efficiency.
Assignment ②
Requirement: Become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl stock information using the Scrapy framework + XPath + MySQL storage.
Candidate site: East Money: https://www.eastmoney.com/
Output: store and display the data in a MySQL database in the following format:
The column headers use English names defined by the students themselves, e.g. id for the serial number, bStockNo for the stock code, and so on.
(1) Code as follows:
items.py
import scrapy

class StockItem(scrapy.Item):
    id = scrapy.Field()
    code = scrapy.Field()
    name = scrapy.Field()
    newPrice = scrapy.Field()
    price_change_amplitude = scrapy.Field()
    price_change_Lines = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    highest = scrapy.Field()
    lowest = scrapy.Field()
    today = scrapy.Field()
    yesterday = scrapy.Field()
spider.py
import scrapy
from stock_scraper.items import StockItem

class StockSpider(scrapy.Spider):
    name = 'stock_spider'
    allowed_domains = ['www.eastmoney.com']
    start_urls = ['https://quote.eastmoney.com/center/gridlist.html#hs_a_board']

    def parse(self, response):
        stocks = response.xpath("//tbody//tr")
        for stock in stocks:
            item = StockItem()
            item['id'] = stock.xpath('.//td[1]//text()').get()
            item['code'] = stock.xpath('.//td[2]//text()').get()
            item['name'] = stock.xpath('.//td[3]//text()').get()
            item['newPrice'] = stock.xpath('.//td[5]//text()').get()
            item['price_change_amplitude'] = stock.xpath('.//td[6]//text()').get()
            item['price_change_Lines'] = stock.xpath('.//td[7]//text()').get()
            item['volume'] = stock.xpath('.//td[8]//text()').get()
            item['turnover'] = stock.xpath('.//td[9]//text()').get()
            item['amplitude'] = stock.xpath('.//td[10]//text()').get()
            item['highest'] = stock.xpath('.//td[11]//text()').get()
            item['lowest'] = stock.xpath('.//td[12]//text()').get()
            item['today'] = stock.xpath('.//td[13]//text()').get()
            item['yesterday'] = stock.xpath('.//td[14]//text()').get()
            yield item
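The stock grid on this page is filled in by JavaScript, which is why the Selenium downloader middleware below is needed: a plain Scrapy download usually contains no table rows at all. A quick way to check is a throwaway scrapy shell session (illustrative output, not captured from a real run):

scrapy shell 'https://quote.eastmoney.com/center/gridlist.html#hs_a_board'
>>> response.xpath('//tbody//tr')
[]   # typically empty without JavaScript rendering, hence the Selenium middleware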
pipelines.py
import mysql.connector
from mysql.connector import Error

class MySQLPipeline:
    def open_spider(self, spider):
        try:
            self.connection = mysql.connector.connect(
                host='127.0.0.1',
                database='wwh',      # use your own database name
                user='root',
                password='123456'    # use your own password
            )
            self.cursor = self.connection.cursor()
            # create the stock table if it does not exist yet
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS stockData (
                    id INTEGER PRIMARY KEY AUTO_INCREMENT,
                    code VARCHAR(255) UNIQUE,  -- UNIQUE so ON DUPLICATE KEY UPDATE below can match existing rows
                    name VARCHAR(255),
                    newPrice VARCHAR(255),
                    price_change_amplitude VARCHAR(255),
                    price_change_Lines VARCHAR(255),
                    volume VARCHAR(255),
                    turnover VARCHAR(255),
                    amplitude VARCHAR(255),
                    highest VARCHAR(255),
                    lowest VARCHAR(255),
                    today VARCHAR(255),
                    yesterday VARCHAR(255)
                )
            ''')
        except Error as e:
            spider.logger.error(f"Error connecting to MySQL: {e}")

    def close_spider(self, spider):
        try:
            self.connection.commit()
        except Error as e:
            spider.logger.error(f"Error committing to MySQL: {e}")
        finally:
            self.cursor.close()
            self.connection.close()

    def process_item(self, item, spider):
        try:
            # insert the row, or refresh the quote fields if the stock code already exists
            self.cursor.execute('''
                INSERT INTO stockData (code, name, newPrice, price_change_amplitude, price_change_Lines, volume, turnover, amplitude, highest, lowest, today, yesterday)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
                ON DUPLICATE KEY UPDATE
                    newPrice=VALUES(newPrice),
                    price_change_amplitude=VALUES(price_change_amplitude),
                    price_change_Lines=VALUES(price_change_Lines),
                    volume=VALUES(volume),
                    turnover=VALUES(turnover),
                    amplitude=VALUES(amplitude),
                    highest=VALUES(highest),
                    lowest=VALUES(lowest),
                    today=VALUES(today),
                    yesterday=VALUES(yesterday)
            ''', (
                item['code'],
                item['name'],
                item['newPrice'],
                item['price_change_amplitude'],
                item['price_change_Lines'],
                item['volume'],
                item['turnover'],
                item['amplitude'],
                item['highest'],
                item['lowest'],
                item['today'],
                item['yesterday']
            ))
            self.connection.commit()
        except Error as e:
            spider.logger.error(f"Error inserting data into MySQL: {e}")
        return item
middlewares.py
import time
from selenium import webdriver
from scrapy.http import HtmlResponse

class SeleniumMiddleware:
    def process_request(self, request, spider):
        # start a Selenium WebDriver for this request
        driver = webdriver.Edge()
        try:
            # load the URL in the browser
            driver.get(request.url)
            # wait for the page (and its JavaScript) to finish loading
            time.sleep(3)
            # grab the rendered page source
            data = driver.page_source
        finally:
            # always close the WebDriver
            driver.quit()
        # return an HtmlResponse built from the rendered page, bypassing Scrapy's own download
        return HtmlResponse(url=request.url, body=data.encode('utf-8'), encoding='utf-8', request=request)
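As written, the middleware launches a visible Edge window for every request. A small variant (a sketch, assuming a Chromium-based Edge and Selenium 4) runs the browser headless instead:

from selenium import webdriver
from selenium.webdriver.edge.options import Options

options = Options()
options.add_argument('--headless=new')  # render pages without opening a browser window
driver = webdriver.Edge(options=options)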
settings.py
ITEM_PIPELINES = {
    'stock_scraper.pipelines.MySQLPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
    'stock_scraper.middlewares.SeleniumMiddleware': 543,
}
MYSQL_HOST = '127.0.0.1'
MYSQL_DATABASE = 'wwh'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '123456'
ROBOTSTXT_OBEY = False
Crawl results:
(2) Reflections: Through this experiment I gained a better understanding of parsing HTML documents with XPath; XPath selectors in Scrapy make it possible to locate and extract data from a page precisely.
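To illustrate this, the same td-indexing pattern used in the spider can be tried on a standalone Selector (a sketch with made-up HTML, not data from the site):

from scrapy.selector import Selector

html = '<table><tbody><tr><td>1</td><td>600000</td><td>Demo Stock</td></tr></tbody></table>'
row = Selector(text=html).xpath('//tbody/tr')[0]
print(row.xpath('.//td[2]//text()').get())  # -> '600000', the second column of the row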
Assignment ③:
Requirement: Become proficient with the serialized output of Item and Pipeline data in Scrapy; crawl foreign-exchange data using the Scrapy framework + XPath + MySQL storage.
Candidate site: Bank of China: https://www.boc.cn/sourcedb/whpj/
(1) Code as follows:
items.py
import scrapy

class ForexItem(scrapy.Item):
    currency = scrapy.Field()  # currency name
    tbp = scrapy.Field()       # telegraphic transfer (spot exchange) buying price
    cbp = scrapy.Field()       # cash buying price
    tsp = scrapy.Field()       # telegraphic transfer (spot exchange) selling price
    csp = scrapy.Field()       # cash selling price
    time = scrapy.Field()      # publication time
spider.py
import scrapy
from forex_scraper.items import ForexItem

class BankSpider(scrapy.Spider):
    name = "forex_spider"
    allowed_domains = ["www.boc.cn"]
    start_urls = ["https://www.boc.cn/sourcedb/whpj/"]

    def parse(self, response):
        # select every row of the first tbody (the exchange-rate table)
        rows = response.xpath('//tbody[1]/tr')
        # skip the header rows at the top and the footer rows at the bottom
        for row in rows[2:-2]:
            item = ForexItem()
            item['currency'] = row.xpath(".//td[1]//text()").get()  # .get() returns the first match
            item['tbp'] = row.xpath(".//td[2]//text()").get()
            item['cbp'] = row.xpath(".//td[3]//text()").get()
            item['tsp'] = row.xpath(".//td[4]//text()").get()
            item['csp'] = row.xpath(".//td[5]//text()").get()
            item['time'] = row.xpath(".//td[8]//text()").get()
            yield item
pipelines.py
import mysql.connector
from mysql.connector import Error

class MySQLPipeline:
    def open_spider(self, spider):
        try:
            self.connection = mysql.connector.connect(
                host='127.0.0.1',
                user='root',          # replace with your MySQL user name
                password='123456',    # replace with your MySQL password
                database='wwh',       # replace with your database name
                charset='utf8mb4',
                use_unicode=True
            )
            self.cursor = self.connection.cursor()
            # create the exchange-rate table if it does not exist yet
            self.cursor.execute('''
                CREATE TABLE IF NOT EXISTS exchange_rates (
                    id INT AUTO_INCREMENT PRIMARY KEY,
                    currency VARCHAR(255),
                    tbp VARCHAR(255),
                    cbp VARCHAR(255),
                    tsp VARCHAR(255),
                    csp VARCHAR(255),
                    time VARCHAR(255)
                )
            ''')
            self.connection.commit()
        except Error as e:
            print(f"Error connecting to MySQL: {e}")

    def close_spider(self, spider):
        if self.connection.is_connected():
            self.cursor.close()
            self.connection.close()

    def process_item(self, item, spider):
        try:
            self.cursor.execute('''
                INSERT INTO exchange_rates (currency, tbp, cbp, tsp, csp, time)
                VALUES (%s, %s, %s, %s, %s, %s)
            ''', (item['currency'], item['tbp'], item['cbp'], item['tsp'], item['csp'], item['time']))
            self.connection.commit()
        except Error as e:
            print(f"Error inserting item into MySQL: {e}")
        return item
The other files are similar to those in the previous assignment.
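For completeness, a minimal settings.py sketch for this project (an assumption based on "similar to the previous assignment"; the project name forex_scraper comes from the imports above):

ITEM_PIPELINES = {
    'forex_scraper.pipelines.MySQLPipeline': 300,
}
ROBOTSTXT_OBEY = False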
Crawl results:
(2) Reflections: My experience here was essentially the same as in the previous assignment.