Gitee assignment link: https://gitee.com/zheng-qijian33/crawl_project/tree/master/作業3
Assignment ①:
Requirement: pick a website and crawl all of its images, e.g. China Weather (http://www.weather.com.cn). Use the Scrapy framework to implement both a single-threaded and a multi-threaded crawl.
– Be sure to limit the crawl, e.g. cap the total number of pages (last 2 digits of the student ID) and the total number of downloaded images (last 3 digits of the student ID).
Main code
weather_spider.py:
import scrapy
from scrapy import Request
from scrapy.crawler import CrawlerProcess
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import sys

# Make the weather_images project package importable
sys.path.append('D:\\資料採集實踐2\\weather_images')
from weather_images.pipelines import WeatherImagesPipeline

class WeatherSpider(scrapy.Spider):
    name = 'weather_spider'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://weather.com.cn/']

    def parse(self, response):
        # Parse the page and extract image URLs (assumes the images sit in <img> tags)
        images = response.css('img::attr(src)').getall()
        for image_url in images:
            if image_url.startswith('http'):
                yield Request(image_url, callback=self.save_image)

    def save_image(self, response):
        # Yield an item for the images pipeline to download and store
        image_guid = response.url.split('/')[-1]
        yield {
            'image_urls': [response.url],
            'image_name': image_guid
        }

# Same pipeline as in weather_images/pipelines.py (repeated here for reference)
class WeatherImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item

# Configure the Scrapy settings
process = CrawlerProcess(settings={
    'ITEM_PIPELINES': {'weather_images.pipelines.WeatherImagesPipeline': 1},
    'IMAGES_STORE': 'D:\\資料採集實踐2\\images',
    'CONCURRENT_REQUESTS': 32,      # number of concurrent requests
    'DOWNLOAD_DELAY': 1,            # delay between downloads
    'CLOSESPIDER_ITEMCOUNT': 133,   # cap on the number of downloaded images
    'CLOSESPIDER_PAGECOUNT': 33,    # cap on the total number of pages
})

# Start the crawler
process.crawl(WeatherSpider)
process.start()
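The script above is the multi-threaded run (CONCURRENT_REQUESTS = 32). For the single-threaded comparison only the concurrency needs to change; a minimal sketch of that variant, reusing the same WeatherSpider and pipeline (run as a separate script, since a CrawlerProcess can only be started once per Python process):

# Single-threaded variant: identical settings except CONCURRENT_REQUESTS = 1
single_thread_process = CrawlerProcess(settings={
    'ITEM_PIPELINES': {'weather_images.pipelines.WeatherImagesPipeline': 1},
    'IMAGES_STORE': 'D:\\資料採集實踐2\\images',
    'CONCURRENT_REQUESTS': 1,       # one request at a time
    'DOWNLOAD_DELAY': 1,
    'CLOSESPIDER_ITEMCOUNT': 133,   # image-count cap
    'CLOSESPIDER_PAGECOUNT': 33,    # page-count cap
})
single_thread_process.crawl(WeatherSpider)
single_thread_process.start()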
pipelines.py:
import scrapy
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem

class WeatherImagesPipeline(ImagesPipeline):
    def get_media_requests(self, item, info):
        for image_url in item['image_urls']:
            yield scrapy.Request(image_url)

    def item_completed(self, results, item, info):
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        item['image_paths'] = image_paths
        return item
settings.py:
BOT_NAME = 'weather_images'
SPIDER_MODULES = ['weather_images.spiders']
NEWSPIDER_MODULE = 'weather_images.spiders'
# Pipeline configuration
ITEM_PIPELINES = {
'weather_images.pipelines.WeatherImagesPipeline': 1,
}
# Image storage path
IMAGES_STORE = 'D:\\資料採集實踐2\\images'
# Multi-threaded crawl
CONCURRENT_REQUESTS = 32
# Single-threaded crawl
#CONCURRENT_REQUESTS = 1
DOWNLOAD_DELAY = 1
CLOSESPIDER_ITEMCOUNT = 133
CLOSESPIDER_PAGECOUNT = 33
# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = "weather_images (+http://www.yourdomain.com)"
# Obey robots.txt rules
ROBOTSTXT_OBEY = True
Experiment results
Run output:
Saved results:
Experiment reflections
Writing this weather-image crawler gave me a real appreciation of how powerful and flexible Scrapy is. It not only simplifies page scraping and image downloading, it also improves throughput through its asynchronous processing model. With Scrapy's ImagesPipeline I could extract image URLs and download and store the images with very little code, while configuring the number of concurrent requests and the download delay kept the load on the target site under control. Scrapy's middleware and pipeline mechanisms also make data processing and storage very flexible, which further improves the crawler's extensibility and maintainability.
Assignment ②
Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage stack to crawl stock information.
Candidate website: Eastmoney: https://www.eastmoney.com/
Main code
stocks.py:
import scrapy
import json
from scrapy.exceptions import CloseSpider
from stock_scraper.items import StockScraperItem

class StockSpider(scrapy.Spider):
    name = 'stocks'
    allowed_domains = ['eastmoney.com']
    start_urls = [
        'https://push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409840494931556277_1633338445629&pn=1&pz=10&po=1&np=1&fltt=2&invt=2&fid=f3&fs=b:MK0021&fields=f12,f14,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f18,f15,f16,f17,f23'
    ]

    def parse(self, response):
        try:
            # The cb= parameter makes the API return JSONP; strip the jQuery
            # callback wrapper (if present) before decoding the JSON payload
            text = response.text.strip()
            if not text.startswith('{'):
                text = text[text.find('(') + 1:text.rfind(')')]
            json_response = json.loads(text)
            data = json_response.get('data', {})
            stock_diff = data.get('diff', [])
            if not stock_diff:
                self.logger.warning("No stock data found in the response.")
                return
            # Iterate over the stock records
            for stock in stock_diff:
                item = StockScraperItem()
                item['stock_code'] = stock.get('f12', 'N/A')
                item['stock_name'] = stock.get('f14', 'N/A')
                item['latest_price'] = self._parse_float(stock, 'f2', 0.0)
                item['change_percent'] = self._parse_float(stock, 'f3', 0.0)
                item['change_amount'] = self._parse_float(stock, 'f4', 0.0)
                item['volume'] = stock.get('f5', '0')
                item['turnover'] = stock.get('f6', '0')
                item['amplitude'] = self._parse_float(stock, 'f7', 0.0)
                item['high'] = self._parse_float(stock, 'f15', 0.0)
                item['low'] = self._parse_float(stock, 'f16', 0.0)
                item['open_price'] = self._parse_float(stock, 'f17', 0.0)
                item['yesterday_close'] = self._parse_float(stock, 'f18', 0.0)
                yield item
        except json.JSONDecodeError as e:
            self.logger.error(f"Failed to parse JSON: {e}")
            raise CloseSpider("Invalid JSON response")
        except Exception as e:
            self.logger.error(f"An error occurred: {e}")
            raise CloseSpider("An unexpected error occurred")

    def _parse_float(self, stock_data, key, default=0.0):
        value = stock_data.get(key)
        if value is None:
            return default
        try:
            return float(value)
        except (ValueError, TypeError):
            self.logger.warning(f"Invalid value for key '{key}': {value}")
            return default
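The start URL only requests the first page of results (pn=1, pz=10 records). If more pages are needed, one option is to override start_requests and vary the pn parameter; a minimal sketch (the page range is just an example, the field list matches the start URL above):

    # Sketch: page through the API by varying the pn= query parameter
    def start_requests(self):
        base = ('https://push2.eastmoney.com/api/qt/clist/get'
                '?pn={page}&pz=10&po=1&np=1&fltt=2&invt=2&fid=f3&fs=b:MK0021'
                '&fields=f12,f14,f2,f3,f4,f5,f6,f7,f8,f9,f10,f11,f18,f15,f16,f17,f23')
        for page in range(1, 6):  # e.g. the first 5 pages
            yield scrapy.Request(base.format(page=page), callback=self.parse)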
pipelines.py:
import pymysql
from scrapy.exceptions import DropItem

class MySQLPipeline:
    def open_spider(self, spider):
        # Open the database connection
        self.con = pymysql.connect(host="localhost", port=3306, user="root", passwd="192837465",
                                   db="stocks", charset="utf8")
        self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
        self.opened = True
        self.count = 0
        # Create the table that stores the stock data
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS stock (
                stock_code VARCHAR(20),
                stock_name VARCHAR(255),
                latest_price FLOAT,
                change_percent FLOAT,
                change_amount FLOAT,
                volume VARCHAR(20),
                turnover VARCHAR(20),
                amplitude FLOAT,
                high FLOAT,
                low FLOAT,
                open_price FLOAT,
                yesterday_close FLOAT,
                PRIMARY KEY(stock_code)
            )
        ''')
        self.con.commit()

    def close_spider(self, spider):
        # Close the database connection
        self.cursor.close()
        self.con.close()

    def process_item(self, item, spider):
        # Insert the record into the table (REPLACE upserts on the primary key)
        try:
            self.cursor.execute('''
                REPLACE INTO stock (stock_code, stock_name, latest_price, change_percent, change_amount, volume,
                                    turnover, amplitude, high, low, open_price, yesterday_close)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            ''', (
                item['stock_code'], item['stock_name'], item['latest_price'], item['change_percent'],
                item['change_amount'], item['volume'], item['turnover'], item['amplitude'], item['high'],
                item['low'], item['open_price'], item['yesterday_close']
            ))
            self.con.commit()
        except pymysql.MySQLError as e:
            # The connection uses pymysql, so catch pymysql's error type
            spider.logger.error(f"Error saving item to MySQL: {e}")
            raise DropItem(f"Error saving item: {e}")
        return item
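Note that open_spider creates the stock table but assumes the stocks database itself already exists. A minimal one-off setup sketch, using the same connection parameters as the pipeline:

import pymysql

# One-off setup: create the database that MySQLPipeline connects to
con = pymysql.connect(host="localhost", port=3306, user="root",
                      passwd="192837465", charset="utf8")
with con.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS stocks")
con.commit()
con.close()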
settings.py:
# Scrapy settings for stock_scraper project
BOT_NAME = 'stock_scraper'
SPIDER_MODULES = ['stock_scraper.spiders']
NEWSPIDER_MODULE = 'stock_scraper.spiders'
# Pipeline configuration
ITEM_PIPELINES = {
    'stock_scraper.pipelines.MySQLPipeline': 300,
}

# Database configuration
MYSQL_HOST = 'localhost'        # database host
MYSQL_DATABASE = 'stocks'       # database name
MYSQL_USER = 'root'             # database user
MYSQL_PASSWORD = '192837465'    # database password

# User agent
USER_AGENT = 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/130.0.0.0 Safari/537.36 Edg/130.0.0.0'

# Other settings
REDIRECT_ENABLED = False
LOG_LEVEL = 'DEBUG'

# Maximum number of concurrent requests
CONCURRENT_REQUESTS = 16
CONCURRENT_REQUESTS_PER_DOMAIN = 8

# Download delay, to avoid overly frequent requests
DOWNLOAD_DELAY = 1  # 1 second between requests

# Enable and configure the AutoThrottle extension
AUTOTHROTTLE_ENABLED = True
AUTOTHROTTLE_START_DELAY = 5           # initial download delay
AUTOTHROTTLE_MAX_DELAY = 60            # maximum download delay under high latency
AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0  # average number of requests Scrapy sends in parallel to each remote server
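One detail worth noting: the MYSQL_* values above are not actually consumed by MySQLPipeline, which hard-codes its connection parameters in open_spider. A sketch of how the pipeline could read them instead, following the same from_crawler pattern used in assignment ③ below:

import pymysql

class MySQLPipeline:
    # Sketch: pull the connection parameters from settings.py instead of
    # hard-coding them in open_spider
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        s = crawler.settings
        pipeline.db_config = {
            'host': s.get('MYSQL_HOST', 'localhost'),
            'user': s.get('MYSQL_USER', 'root'),
            'passwd': s.get('MYSQL_PASSWORD', ''),
            'db': s.get('MYSQL_DATABASE', 'stocks'),
            'charset': 'utf8',
        }
        return pipeline

    def open_spider(self, spider):
        self.con = pymysql.connect(**self.db_config)
        self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
        # ... table creation and process_item stay the same as in pipelines.py above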
items.py:
import scrapy

class StockScraperItem(scrapy.Item):
    stock_code = scrapy.Field()
    stock_name = scrapy.Field()
    latest_price = scrapy.Field()
    change_percent = scrapy.Field()
    change_amount = scrapy.Field()
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()
    high = scrapy.Field()
    low = scrapy.Field()
    open_price = scrapy.Field()
    yesterday_close = scrapy.Field()
Experiment results
Experiment reflections
While developing this stock-data crawler I got a much firmer grasp of Scrapy's JSON parsing, data extraction, and error handling, and used a custom pipeline to store the results in a MySQL database. Asynchronous requests plus explicit error handling keep the crawler stable and the data complete. On the storage side, pymysql handles the database connection and each record is written with a REPLACE INTO upsert, so re-running the spider does not produce duplicate rows. This exercise deepened my understanding of Scrapy's flexibility and of database operations, and laid a solid foundation for more complex crawling tasks in the future.
Assignment ③:
Requirement: become proficient with the serialized output of Item and Pipeline data in Scrapy; use the Scrapy framework + XPath + MySQL storage stack to crawl foreign-exchange data.
Candidate website: Bank of China: https://www.boc.cn/sourcedb/whpj/
Main code
boc_spider.py:
import scrapy
from boc_spider.boc_spider.items import BocExchangeRateItem
from bs4 import BeautifulSoup

class ExchangeRateSpider(scrapy.Spider):
    name = "boc_spider"
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def parse(self, response):
        try:
            soup = BeautifulSoup(response.body, 'lxml')
            table = soup.find_all('table')[1]
            rows = table.find_all('tr')
            rows.pop(0)  # drop the header row
            for row in rows:
                item = BocExchangeRateItem()
                columns = row.find_all('td')
                item['currency'] = columns[0].text.strip()
                item['cash_buy'] = columns[1].text.strip()
                item['cash_sell'] = columns[2].text.strip()
                item['spot_buy'] = columns[3].text.strip()
                item['spot_sell'] = columns[4].text.strip()
                item['exchange_rate'] = columns[5].text.strip()
                item['publish_date'] = columns[6].text.strip()
                item['publish_time'] = columns[7].text.strip()
                yield item
        except Exception as err:
            self.logger.error(f"An error occurred: {err}")
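The assignment calls for XPath, while the code above uses BeautifulSoup. For reference, a rough XPath equivalent using Scrapy's built-in selectors (a sketch; the table index and column order follow the listing above and may need adjusting against the live page):

    # Sketch: XPath-based alternative to the BeautifulSoup parsing above
    def parse_with_xpath(self, response):
        rows = response.xpath('(//table)[2]//tr')[1:]  # skip the header row
        for row in rows:
            cells = [''.join(td.xpath('.//text()').getall()).strip()
                     for td in row.xpath('./td')]
            if len(cells) < 8:
                continue
            item = BocExchangeRateItem()
            (item['currency'], item['cash_buy'], item['cash_sell'],
             item['spot_buy'], item['spot_sell'], item['exchange_rate'],
             item['publish_date'], item['publish_time']) = cells[:8]
            yield item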
pipelines.py:
import pymysql
from scrapy.exceptions import DropItem

class BocExchangeRatePipeline(object):
    def __init__(self, mysql_host, mysql_db, mysql_user, mysql_password, mysql_port):
        self.mysql_host = mysql_host
        self.mysql_db = mysql_db
        self.mysql_user = mysql_user
        self.mysql_password = mysql_password
        self.mysql_port = mysql_port

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mysql_host=crawler.settings.get('MYSQL_HOST'),
            mysql_db=crawler.settings.get('MYSQL_DB'),
            mysql_user=crawler.settings.get('MYSQL_USER'),
            mysql_password=crawler.settings.get('MYSQL_PASSWORD'),
            mysql_port=crawler.settings.get('MYSQL_PORT'),
        )

    def open_spider(self, spider):
        self.connection = pymysql.connect(
            host=self.mysql_host,
            user=self.mysql_user,
            password=self.mysql_password,
            db=self.mysql_db,
            port=self.mysql_port,
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )

    def close_spider(self, spider):
        self.connection.close()

    def process_item(self, item, spider):
        with self.connection.cursor() as cursor:
            sql = """
                INSERT INTO exchange_rates (currency, cash_buy, cash_sell, spot_buy, spot_sell, exchange_rate, publish_date, publish_time)
                VALUES (%s, %s, %s, %s, %s, %s, %s, %s)
            """
            cursor.execute(sql, (
                item['currency'],
                item['cash_buy'],
                item['cash_sell'],
                item['spot_buy'],
                item['spot_sell'],
                item['exchange_rate'],
                item['publish_date'],
                item['publish_time']
            ))
        self.connection.commit()
        return item
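Unlike the stock pipeline in assignment ②, this pipeline does not create its target table, so it assumes exchange_rates already exists in boc_db. One way to remove that assumption is to create the table when the spider opens; a sketch of an extended open_spider (the column types are assumptions, since the values are stored as scraped text):

    def open_spider(self, spider):
        self.connection = pymysql.connect(
            host=self.mysql_host, user=self.mysql_user, password=self.mysql_password,
            db=self.mysql_db, port=self.mysql_port, charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor
        )
        # Sketch: create the target table if it is missing (column types assumed)
        with self.connection.cursor() as cursor:
            cursor.execute("""
                CREATE TABLE IF NOT EXISTS exchange_rates (
                    currency VARCHAR(64),
                    cash_buy VARCHAR(32),
                    cash_sell VARCHAR(32),
                    spot_buy VARCHAR(32),
                    spot_sell VARCHAR(32),
                    exchange_rate VARCHAR(32),
                    publish_date VARCHAR(32),
                    publish_time VARCHAR(32)
                )
            """)
        self.connection.commit()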
settings.py:
BOT_NAME = "boc_spider"
SPIDER_MODULES = ["boc_spider.spiders"]
NEWSPIDER_MODULE = "boc_spider.spiders"
MYSQL_HOST = 'localhost'
MYSQL_DB = 'boc_db'
MYSQL_USER = 'root'
MYSQL_PASSWORD = '192837465'
MYSQL_PORT = 3306
REQUEST_FINGERPRINTER_IMPLEMENTATION = "2.7"
TWISTED_REACTOR = "twisted.internet.asyncioreactor.AsyncioSelectorReactor"
FEED_EXPORT_ENCODING = "utf-8"
ITEM_PIPELINES = {
'boc_spider.pipelines.BocExchangeRatePipeline': 300,
}
ROBOTSTXT_OBEY = True
items.py:
import scrapy

class BocExchangeRateItem(scrapy.Item):
    currency = scrapy.Field()
    cash_buy = scrapy.Field()
    cash_sell = scrapy.Field()
    spot_buy = scrapy.Field()
    spot_sell = scrapy.Field()
    exchange_rate = scrapy.Field()
    publish_date = scrapy.Field()
    publish_time = scrapy.Field()
Experiment results
Experiment reflections
While developing this Bank of China exchange-rate crawler I combined Scrapy with BeautifulSoup for page parsing and data extraction, and used a custom pipeline to store the scraped rates in a MySQL database. By parsing the nested HTML table structure, the spider extracts the spot and cash buying/selling prices for multiple currencies, along with the publish date and publish time of each quote. On the pipeline side, pymysql handles the database connection and the row inserts, which keeps the data complete and consistent. This exercise deepened my understanding of Scrapy's flexibility and of database operations, and gave me valuable experience for more complex scraping tasks in the future.