Assignment 1
1. Experiment Content
Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather Network (http://www.weather.com.cn) and store it in a database.
The code is as follows:
import requests
import sqlite3
from bs4 import BeautifulSoup

# Function that fetches the weather information for one city
def get_weather_info(city_id):
    url_template = 'http://www.weather.com.cn/weather/{}.shtml'
    complete_url = url_template.format(city_id)
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36'
    }
    response = requests.get(complete_url, headers=headers)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    forecast_data = []
    days = soup.find('ul', class_='t clearfix').find_all('li')
    for day in days:
        day_date = day.find('h1').get_text(strip=True)
        weather_condition = day.find('p', class_='wea').get_text(strip=True)
        high_temp = day.find('span').get_text(strip=True) if day.find('span') else ''
        low_temp = day.find('i').get_text(strip=True)
        temperature = f"{high_temp}/{low_temp}"
        forecast_data.append((day_date, weather_condition, temperature))
    return forecast_data

# Create the database and table
def initialize_database():
    connection = sqlite3.connect('weather_data.db')
    cursor = connection.cursor()
    cursor.execute('''
        CREATE TABLE IF NOT EXISTS forecast (
            record_id INTEGER PRIMARY KEY AUTOINCREMENT,
            location TEXT,
            forecast_date TEXT,
            condition TEXT,
            temp_range TEXT
        )
    ''')
    connection.commit()
    return connection

# Save the weather information to the database
def store_weather_data(location, weather_info, connection):
    cursor = connection.cursor()
    for day_date, condition, temp in weather_info:
        cursor.execute("INSERT INTO forecast (location, forecast_date, condition, temp_range) VALUES (?, ?, ?, ?)",
                       (location, day_date, condition, temp))
    connection.commit()

# Print the weather information stored in the database
def print_weather_data(connection):
    cursor = connection.cursor()
    cursor.execute("SELECT * FROM forecast")
    records = cursor.fetchall()
    print(f"{'ID':<5} {'城市':<10} {'日期':<15} {'天氣狀況':<20} {'溫度範圍':<15}")
    for record in records:
        print(f"{record[0]:<5} {record[1]:<10} {record[2]:<15} {record[3]:<20} {record[4]:<15}")

def main():
    # Cities and their corresponding station codes
    cities = {
        '北京': '101010100',
        '上海': '101020100',
        '福州': '101230101',
        '天津': '101030100'
    }
    # Initialise the database connection
    db_connection = initialize_database()
    # Fetch and store the weather information for each city
    for city_name, city_id in cities.items():
        print(f"獲取城市 {city_name} ({city_id}) 的天氣資訊...")
        weather_info = get_weather_info(city_id)
        store_weather_data(city_name, weather_info, db_connection)
    # Print the weather information in the database
    print_weather_data(db_connection)
    db_connection.close()

if __name__ == '__main__':
    main()
The results are as follows:
2. Reflections
While writing the code I hit a bug: TypeError: 'bool' object is not callable.
On closer inspection I found an extra pair of parentheses after strip=True, so Python was trying to call the boolean value True itself, rather than this being a syntax difference between Python versions as I first suspected. Removing the stray parentheses fixed the error; a minimal reproduction is shown below.
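A minimal reproduction of the buggy and corrected forms, using a made-up h1 tag purely for illustration:

from bs4 import BeautifulSoup

soup = BeautifulSoup('<h1>7日(今天)</h1>', 'html.parser')
tag = soup.find('h1')

# Buggy form: strip=True() first evaluates True(), i.e. it tries to call a
# boolean, which raises "TypeError: 'bool' object is not callable"
# text = tag.get_text(strip=True())

# Correct form: pass the keyword argument without the extra parentheses
text = tag.get_text(strip=True)
print(text)  # 7日(今天)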
Assignment 2
1. Experiment Content
- Requirement: use the requests and BeautifulSoup libraries to crawl stock information from one of the candidate sites and store it in a database.
- Candidate site: Eastmoney: https://www.eastmoney.com/
- Candidate site: Sina Finance stocks: http://finance.sina.com.cn/stock/
- Technique: open the F12 debugging tools in Chrome and capture packets to find the URL used to load the stock list, then analyse the values the API returns. The request parameters (f1, f2, ...) control which values come back, so they can be adjusted or trimmed as appropriate (see the sketch after this list).
- Reference: https://zhuanlan.zhihu.com/p/50099084
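To illustrate the packet-capture idea, here is a minimal sketch that calls the same clist endpoint found via F12 with a trimmed parameter set. It assumes the endpoint returns plain JSON once the jQuery cb callback parameter is omitted, and that f12/f14/f2/f3/f4 map to code, name, latest price, change percent and change; both points are assumptions based on inspecting the captured response rather than documented behaviour.

import requests

# Hypothetical trimmed request: drop the "cb" JSONP callback and keep only
# the fields we actually need, so the response can be parsed with .json()
url = "http://66.push2.eastmoney.com/api/qt/clist/get"
params = {
    "pn": 1,                      # page number
    "pz": 20,                     # page size
    "po": 1,
    "np": 1,
    "fltt": 2,
    "invt": 2,
    "fid": "f3",
    "fs": "m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048",
    "fields": "f12,f14,f2,f3,f4",  # assumed: code, name, price, change %, change
}
data = requests.get(url, params=params, timeout=10).json()
for item in data["data"]["diff"]:
    print(item["f12"], item["f14"], item["f2"], item["f3"], item["f4"])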
The code is as follows:
import requests
import re
import sqlite3

def fetch_html(page_num, query_parameters):
    base_url = ("http://66.push2.eastmoney.com/api/qt/clist/get?"
                "cb=jQuery112409097606620255823_1696662149317&pn=1&pz=20&po=1&np=" + str(page_num) +
                "&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&wbp2u=|0|0|0|web&" + query_parameters +
                "&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1696662149318")
    response = requests.get(base_url)
    regex_pattern = "\"diff\":\\[(.*?)\\]"
    extracted_data = re.findall(regex_pattern, response.text, re.S)
    return extracted_data

def process_stock_data(query_params, page_num):
    # Retrieve the JSON data from the API
    json_data = fetch_html(page_num, query_params)
    stocks_data = json_data[0].split("},")
    # Create/connect to a SQLite database
    database_connection = sqlite3.connect('stock_data.db')
    db_cursor = database_connection.cursor()
    # Create the stocks table if it doesn't exist
    db_cursor.execute('''CREATE TABLE IF NOT EXISTS stock_info (
                             id INTEGER PRIMARY KEY,
                             stock_code TEXT,
                             stock_name TEXT,
                             stock_price REAL,
                             price_change REAL,
                             price_change_percent REAL,
                             volume INTEGER,
                             turnover REAL,
                             amplitude REAL,
                             highest REAL,
                             lowest REAL,
                             open_price REAL,
                             last_close REAL
                         )''')
    for stock in stocks_data:
        # Parse stock data
        stock_info = {}
        attributes = stock.split(',')
        for attribute in attributes:
            key_value = attribute.split(':')
            key = key_value[0].strip('"')
            value = key_value[1].strip('"')
            stock_info[key] = value
        # Extract relevant stock information
        stock_code = stock_info.get('f12', 'N/A')
        stock_name = stock_info.get('f14', 'N/A')
        stock_price = float(stock_info.get('f2', 0.0))
        price_change = float(stock_info.get('f4', 0.0))
        price_change_percent = float(stock_info.get('f3', 0.0))
        volume = int(stock_info.get('f5', 0))
        turnover = float(stock_info.get('f6', 0.0))
        amplitude = float(stock_info.get('f7', 0.0))
        highest = float(stock_info.get('f15', 0.0))
        lowest = float(stock_info.get('f16', 0.0))
        open_price = float(stock_info.get('f17', 0.0))
        last_close = float(stock_info.get('f18', 0.0))
        # Insert stock data into the database
        db_cursor.execute(
            "INSERT INTO stock_info (stock_code, stock_name, stock_price, price_change, price_change_percent, volume, turnover, amplitude, highest, lowest, open_price, last_close) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)",
            (stock_code, stock_name, stock_price, price_change, price_change_percent, volume, turnover, amplitude, highest, lowest, open_price, last_close))
    database_connection.commit()
    # Query all stock data
    db_cursor.execute("SELECT * FROM stock_info")
    all_stocks = db_cursor.fetchall()
    # Get column names for displaying data
    column_names = [description[0] for description in db_cursor.description]
    print("\t".join(column_names))
    # Display each stock's information
    for stock in all_stocks:
        print("\t".join(map(str, stock)))
    # Close the database connection
    database_connection.close()

# Execute the function with specified parameters
page_number = 1
process_stock_data("fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048", page_number)
The results are as follows:
2. Reflections
The current code scrapes everything in a single pass. It could be extended to refresh the data on a schedule, or to insert only records that are not already stored instead of re-inserting everything on every run; and as the data volume grows, batch inserts would make writing to the database faster. A sketch of both ideas follows.
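A minimal sketch of both ideas, using a hypothetical stock_daily table with a UNIQUE constraint and made-up sample rows:

import sqlite3

connection = sqlite3.connect('stock_data.db')
cursor = connection.cursor()
# UNIQUE(stock_code, trade_date) lets the database reject duplicate rows itself
cursor.execute('''CREATE TABLE IF NOT EXISTS stock_daily (
                      stock_code TEXT,
                      trade_date TEXT,
                      stock_price REAL,
                      UNIQUE(stock_code, trade_date)
                  )''')
rows = [
    ('600000', '2023-10-09', 7.21),   # made-up sample rows for illustration
    ('600004', '2023-10-09', 11.02),
]
# executemany sends the whole batch in one call, and INSERT OR IGNORE skips
# rows that are already stored, so re-running the crawler only adds new data
cursor.executemany(
    "INSERT OR IGNORE INTO stock_daily (stock_code, trade_date, stock_price) VALUES (?, ?, ?)",
    rows)
connection.commit()
connection.close()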
Assignment 3
1. Experiment Content
- Requirement: crawl the information of every university on the 2021 main ranking of Chinese universities (https://www.shanghairanking.cn/rankings/bcur/2021) and store it in a database; also record the browser F12 debugging and analysis process as a GIF and include it in the blog post.
- Technique: analyse the requests the site sends and identify the API that returns the data.
The code is as follows:
import requests
from bs4 import BeautifulSoup
import sqlite3

# URL of the target page
url = 'https://www.shanghairanking.cn/rankings/bcur/2021'  # replace with the actual URL if it differs
# Fetch the page content with requests
response = requests.get(url)
content = response.content
# Parse the page content
soup = BeautifulSoup(content, 'html.parser')
# Locate the table that contains the university ranking
ranking_table = soup.find('table', class_='rk-table')
# Create or connect to the SQLite database
database = sqlite3.connect('schools_rank.db')
db_cursor = database.cursor()
# Create the table if it does not exist
db_cursor.execute('''CREATE TABLE IF NOT EXISTS university_ranking
                     (rank TEXT, school_name TEXT, province_city TEXT, school_type TEXT, total_score TEXT)''')
# Iterate over every row of the table and extract the data
for entry in ranking_table.find_all('tr')[1:]:  # skip the header row
    columns = entry.find_all('td')
    rank_value = columns[0].get_text(strip=True)
    university_name = columns[1].get_text(strip=True)
    location = columns[2].get_text(strip=True)
    type_of_school = columns[3].get_text(strip=True)
    score = columns[4].get_text(strip=True)
    # Insert the row into the database
    db_cursor.execute('''INSERT INTO university_ranking (rank, school_name, province_city, school_type, total_score)
                         VALUES (?, ?, ?, ?, ?)''', (rank_value, university_name, location, type_of_school, score))
# Commit the inserted rows
database.commit()
# Query and print every record in the database
db_cursor.execute("SELECT * FROM university_ranking")
records = db_cursor.fetchall()
for record in records:
    # Remove newline characters from the record
    cleaned_record = [field.replace('\n', '') for field in record]
    # Print the cleaned record
    print(tuple(cleaned_record))
# Close the database connection
database.close()
print("大學排名資料已成功儲存至資料庫")
The results are as follows:
The GIF of the F12 debugging process is as follows:
2. Reflections
The current code contains no exception handling, which cost me a lot of time while debugging: if anything goes wrong during a database operation or a web request, the program simply crashes. Next time I will wrap the key parts in try-except blocks so that potential exceptions can be caught and handled; a sketch is shown below.
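A minimal sketch of that error handling, reusing the URL and database name from the assignment code; the structure is one possible arrangement, not the only one:

import sqlite3
import requests

url = 'https://www.shanghairanking.cn/rankings/bcur/2021'
try:
    response = requests.get(url, timeout=10)
    response.raise_for_status()  # turn HTTP 4xx/5xx responses into exceptions
except requests.RequestException as exc:
    print(f"Request failed: {exc}")
else:
    database = None
    try:
        database = sqlite3.connect('schools_rank.db')
        # ... parse the page and insert the rows here ...
        database.commit()
    except sqlite3.Error as exc:
        print(f"Database error: {exc}")
    finally:
        if database is not None:
            database.close()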