第三次作業
作業①:
1.要求:
指定一個網站,爬取這個網站中的所有圖片,例如:中國氣象網(http://www.weather.com.cn)。使用scrapy框架分別實現單執行緒和多執行緒的方式爬取。
–務必控制總頁數(學號尾數2位)、總下載的圖片數量(尾數後3位)等限制爬取的措施。
- 輸出資訊: 將下載的Url資訊在控制檯輸出,並將下載的圖片儲存在images子檔案中,並給出截圖。
- Gitee資料夾連結
2.程式碼片段
import scrapy
import os
from urllib.parse import urljoin
from scrapy.exceptions import CloseSpider
class WeatherInfoSpider(scrapy.Spider):
    """Collect image URLs from weather.com.cn under page and image caps.

    Each visited page yields one ``{'image_url': ...}`` item per ``<img src>``
    found and then schedules the page's ``<a href>`` targets with the same
    callback. ``CloseSpider`` is raised when a page arrives after
    ``page_limit`` pages have already been visited, or as soon as
    ``image_limit`` image URLs have been yielded.
    """

    # Scrapy reads these exact attribute names: ``name`` identifies the
    # spider and ``allowed_domains`` drives off-site request filtering.
    name = 'weather'
    allowed_domains = ['weather.com.cn']
    start_urls = ['http://www.weather.com.cn/']

    # Original attribute names, kept as aliases for backward compatibility.
    spider_name = name
    allowed_sites = allowed_domains
    initial_urls = start_urls

    # Caps on visited pages / yielded images, and the running counters.
    page_limit = 17
    image_limit = 117
    page_counter = 0
    image_counter = 0

    def start_requests(self):
        """Yield the initial requests, routed to :meth:`process_page`."""
        self.logger.info('Starting requests...')
        for url in self.initial_urls:
            yield scrapy.Request(url, callback=self.process_page)

    def process_page(self, response):
        """Yield image-URL items for one page, then follow its links.

        Raises:
            CloseSpider: when the page limit was already reached on entry,
                or when the image limit is reached while yielding items.
        """
        # Stop if the page budget is already used up.
        if self.page_counter >= self.page_limit:
            raise CloseSpider('Reached maximum page limit.')
        self.page_counter += 1
        self.logger.info(f'Visited page {self.page_counter}.')

        # Extract image links.
        for image in response.css('img::attr(src)').getall():
            full_image_url = urljoin(response.url, image)
            self.image_counter += 1
            self.logger.info(f'Found image URL: {full_image_url}')
            yield {
                'image_url': full_image_url
            }
            # Stop once the image budget is used up.
            if self.image_counter >= self.image_limit:
                raise CloseSpider('Reached maximum image download limit.')

        # Extract and request links to further pages.
        for link in response.css('a::attr(href)').getall():
            if self.page_counter >= self.page_limit:
                break
            full_link = urljoin(response.url, link)
            # scrapy.Request raises ValueError for URLs without a network
            # scheme (javascript:, mailto:, ...), which would abort the rest
            # of this callback, so only follow http(s) links.
            if not full_link.lower().startswith(('http://', 'https://')):
                continue
            yield scrapy.Request(full_link, callback=self.process_page)

    def close(self, reason):
        """Log the reason the spider was closed."""
        self.logger.info(f'Spider closed. Reason: {reason}')
3.截圖
心得體會:
單執行緒爬取:適用於需要爬取的頁面較少,且目標網站響應速度較快。
多執行緒爬取:適用於需要抓取大量資料,或者目標網站的響應速度較慢時。
作業②
1.要求
熟練掌握 scrapy 中 Item、Pipeline 資料的序列化輸出方法;使用scrapy框架+Xpath+MySQL資料庫儲存技術路線爬取股票相關資訊。
-
候選網站:東方財富網:https://www.eastmoney.com/
-
-
輸出資訊:MySQL資料庫儲存和輸出格式如下:
-
表頭英文命名例如:序號id,股票程式碼:bStockNo……,由同學們自行定義設計
-
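| 序號 | 股票程式碼 | 股票名稱 | 最新報價 | 漲跌幅 | 漲跌額 | 成交量 | 振幅 | 最高 | 最低 | 今開 | 昨收 |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 688093 | N世華 | 28.47 | 10.92 | 26.13萬 | 7.6億 | 22.34 | 32.0 | 28.08 | 30.20 | 17.55 |
| 2…… | | | | | | | | | | | |

- Gitee資料夾連結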
2.程式碼片段
from typing import Any, Dict
import scrapy
import re
import json
import pymysql
class StockItem(scrapy.Item):
    """One stock quote record.

    Field order is kept as declared; the insert statement in the storage
    pipeline lists the columns in this same order.
    """

    # Price movement figures.
    latest_price = scrapy.Field()
    change_percentage = scrapy.Field()
    change_amount = scrapy.Field()

    # Trading activity figures.
    volume = scrapy.Field()
    turnover = scrapy.Field()
    amplitude = scrapy.Field()

    # Stock identity.
    code = scrapy.Field()
    name = scrapy.Field()

    # Daily price range and reference prices.
    high = scrapy.Field()
    low = scrapy.Field()
    today_open = scrapy.Field()
    yesterday_close = scrapy.Field()
class StockSpider(scrapy.Spider):
    """Fetch one page of stock quotes from the Eastmoney list API.

    The start URL carries a ``cb=`` parameter, so the response is expected
    to be JSONP (``callback({...});``). :meth:`parse` unwraps it, decodes the
    JSON and yields one :class:`StockItem` per entry of the ``diff`` list.
    """

    name = 'stock_spider'
    start_urls = [
        'http://25.push2.eastmoney.com/api/qt/clist/get?cb=jQuery1124021313927342030325_some_timestamp&pn=1&pz=20&po=1&np=1&ut=some_unique_token&fltt=2&invt=2&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f2,f3,f4,f5,f6,f7,f12,f14,f15,f16,f17,f18&_=some_other_timestamp'
    ]

    # StockItem field -> API field code, in item-field order.
    _FIELD_CODES = {
        'latest_price': 'f2',
        'change_percentage': 'f3',
        'change_amount': 'f4',
        'volume': 'f5',
        'turnover': 'f6',
        'amplitude': 'f7',
        'code': 'f12',
        'name': 'f14',
        'high': 'f15',
        'low': 'f16',
        'today_open': 'f17',
        'yesterday_close': 'f18',
    }

    def parse(self, response: scrapy.http.Response) -> Any:
        """Decode the JSONP/JSON response and yield one item per stock.

        Logs and yields nothing when the body cannot be decoded or holds no
        ``diff`` records, instead of failing on a missing regex match.
        """
        payload = response.text.strip()

        # Strip the JSONP wrapper ``callback( ... )`` unless the body is
        # already bare JSON.
        if not payload.startswith(('{', '[')):
            open_paren = payload.find('(')
            close_paren = payload.rfind(')')
            if open_paren == -1 or close_paren <= open_paren:
                self.logger.error('Unexpected response format from %s', response.url)
                return
            payload = payload[open_paren + 1:close_paren]

        try:
            parsed = json.loads(payload)
        except ValueError as exc:
            self.logger.error('Cannot decode JSON from %s: %s', response.url, exc)
            return
        if not isinstance(parsed, dict):
            self.logger.error('Unexpected JSON structure from %s', response.url)
            return

        # "diff" is looked up under "data" first, then at top level.
        container = parsed.get('data')
        if not isinstance(container, dict):
            container = parsed
        diff = container.get('diff')
        if not diff:
            self.logger.warning('No stock records found in %s', response.url)
            return

        # Accept "diff" as either a list of records or a dict of records.
        records = diff.values() if isinstance(diff, dict) else diff
        for stock_data in records:
            if not isinstance(stock_data, dict):
                continue
            item = StockItem()
            for field_name, api_code in self._FIELD_CODES.items():
                item[field_name] = stock_data.get(api_code)
            yield item
class StockPipeline:
    """Store :class:`StockItem` records in the MySQL table ``stocks``.

    ``open_spider`` connects and recreates the table, ``process_item``
    inserts and commits one row per item, ``close_spider`` releases the
    cursor and connection. Database errors are reported with ``print`` and
    do not stop the crawl.
    """

    # NOTE(review): connection credentials are hard-coded below; consider
    # moving them into the project settings.

    # Defaults so close_spider stays safe when open_spider failed before
    # these were assigned.
    connection = None
    cursor = None

    # Column order shared by the INSERT statement and the value tuple.
    _COLUMNS = (
        'latest_price', 'change_percentage', 'change_amount', 'volume',
        'turnover', 'amplitude', 'code', 'name', 'high', 'low',
        'today_open', 'yesterday_close',
    )

    def open_spider(self, spider: scrapy.Spider):
        """Connect to MySQL and recreate the ``stocks`` table."""
        try:
            self.connection = pymysql.connect(
                host='127.0.0.1',
                user='root',
                password='Cjkmysql.',
                port=3306,
                charset='utf8',
                database='chenoojkk'
            )
            self.cursor = self.connection.cursor()
            # The table is rebuilt on every run, so earlier rows are lost.
            self.cursor.execute('DROP TABLE IF EXISTS stocks')
            create_table_sql = """
            CREATE TABLE stocks (
            latest_price DOUBLE,
            change_percentage DOUBLE,
            change_amount DOUBLE,
            volume DOUBLE,
            turnover DOUBLE,
            amplitude DOUBLE,
            code VARCHAR(12) PRIMARY KEY,
            name VARCHAR(32),
            high DOUBLE,
            low DOUBLE,
            today_open DOUBLE,
            yesterday_close DOUBLE
            )
            """
            self.cursor.execute(create_table_sql)
        except Exception as e:
            print(f"Error opening spider: {e}")

    def process_item(self, item: StockItem, spider: scrapy.Spider) -> StockItem:
        """Insert one item as a row and commit; always return the item."""
        try:
            insert_sql = """
            INSERT INTO stocks (
            latest_price, change_percentage, change_amount, volume, turnover, amplitude, code, name, high, low, today_open, yesterday_close
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            values = tuple(item[column] for column in self._COLUMNS)
            self.cursor.execute(insert_sql, values)
            self.connection.commit()
        except Exception as e:
            print(f"Error processing item: {e}")
        return item

    def close_spider(self, spider: scrapy.Spider):
        """Close the cursor and connection if they were opened."""
        if self.cursor:
            self.cursor.close()
        if self.connection:
            self.connection.close()
3.截圖
心得體會:
學會了如何將資料存入資料庫,由於使用終端較難視覺化,下次將使用navicat
作業③:
1.要求
熟練掌握 scrapy 中 Item、Pipeline 資料的序列化輸出方法;使用scrapy框架+Xpath+MySQL資料庫儲存技術路線爬取外匯網站資料。
-
候選網站:中國銀行網:https://www.boc.cn/sourcedb/whpj/
-
輸出資訊:
-
Gitee資料夾連結
Currency | TBP | CBP | TSP | CSP | Time |
---|---|---|---|---|---|
阿聯酋迪拉姆 | 198.58 | 192.31 | 199.98 | 206.59 | 11:27:14 |
2.程式碼
import scrapy
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose
from w3lib.html import remove_tags
from Practical_work3.items import Work3Item
import pymysql
class Work3Spider(scrapy.Spider):
    """Scrape the Bank of China foreign-exchange rate table.

    Yields one ``Work3Item`` per table row that contains ``<td>`` cells.
    """

    name = 'work3_revised'
    # A class body has no ``self``, so the start request cannot be built
    # here with ``callback=self.parse``. Listing the URL lets Scrapy create
    # the request itself, with ``parse`` as the default callback.
    start_urls = ['https://www.boc.cn/sourcedb/whpj/']

    def parse(self, response):
        """Build an item from the cells of each data row of the rate table."""
        for row in response.css('table[align="left"] tr'):
            # Rows without <td> cells (e.g. a header row) carry no data.
            if not row.css('td'):
                continue
            loader = ItemLoader(item=Work3Item(), selector=row)
            # Extra constructor keyword arguments only become loader
            # context, so the output processor is set on the instance.
            loader.default_output_processor = TakeFirst()
            # Clean the name while it is collected instead of adding a
            # second, duplicate value afterwards.
            loader.add_css('name', 'td:nth-child(1)::text', MapCompose(remove_tags))
            loader.add_css('price1', 'td:nth-child(2)::text')
            loader.add_css('price2', 'td:nth-child(3)::text')
            loader.add_css('price3', 'td:nth-child(4)::text')
            loader.add_css('price4', 'td:nth-child(5)::text')
            loader.add_css('price5', 'td:nth-child(6)::text')
            loader.add_css('date', 'td:nth-last-child(1)::text')
            yield loader.load_item()
class Work3Pipeline:
    """Store exchange-rate items in the MySQL table ``bank``.

    Database errors are reported with ``print`` and do not stop the crawl.
    """

    # Item fields in the same order as the INSERT column list
    # (Currency, p1, p2, p3, p4, p5, Time).
    _ITEM_FIELDS = ('name', 'price1', 'price2', 'price3', 'price4', 'price5', 'date')

    def __init__(self):
        self.db = None
        self.cursor = None

    def open_spider(self, spider):
        """Connect to MySQL and recreate the ``bank`` table."""
        try:
            self.db = pymysql.connect(host='127.0.0.1', user='root', passwd='Cjkmysql.', port=3306, charset='utf8', database='chenoojkk')
            self.cursor = self.db.cursor()
            self.create_table()
        except Exception as e:
            print(f"Failed to connect to database: {e}")

    def create_table(self):
        """Drop any existing ``bank`` table and create a fresh one."""
        try:
            self.cursor.execute('DROP TABLE IF EXISTS bank')
            self.cursor.execute("""
            CREATE TABLE bank (
            Currency varchar(32),
            p1 varchar(17),
            p2 varchar(17),
            p3 varchar(17),
            p4 varchar(17),
            p5 varchar(17),
            Time varchar(32)
            )
            """)
        except Exception as e:
            print(f"Failed to create table: {e}")

    def process_item(self, item, spider):
        """Insert one item as a row and commit; always return the item.

        A field missing from the item is passed as ``None``, so a row with
        a blank cell is still inserted instead of failing with ``KeyError``.
        """
        try:
            values = tuple(item.get(field) for field in self._ITEM_FIELDS)
            self.cursor.execute("""
            INSERT INTO bank (Currency, p1, p2, p3, p4, p5, Time)
            VALUES (%s, %s, %s, %s, %s, %s, %s)
            """, values)
            self.db.commit()
        except Exception as e:
            print(f"Failed to insert item: {e}")
        return item

    def close_spider(self, spider):
        """Close the cursor and connection if they were opened."""
        if self.cursor:
            self.cursor.close()
        if self.db:
            self.db.close()
3.截圖
心得體會:
Scrapy 框架 提供了一個完整的爬取和資料儲存的解決方案,能夠處理請求、解析、資料儲存等問題,非常適合構建大規模爬蟲。