Assignment 1
Code and screenshots:
# Methods of the weather crawler class (excerpt); self.cityCode maps city names
# to weather.com.cn city codes, and self.headers holds the request headers.
import csv
import urllib.request

from bs4 import BeautifulSoup, UnicodeDammit


def fetch_data(self, city):
    # Look up the city code and bail out early if the city is unknown.
    if city not in self.cityCode.keys():
        print(f"{city} code cannot be found")
        return None
    url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
    try:
        req = urllib.request.Request(url, headers=self.headers)
        data = urllib.request.urlopen(req).read()
        # Let UnicodeDammit pick between UTF-8 and GBK when decoding the page.
        dammit = UnicodeDammit(data, ["utf-8", "gbk"])
        return dammit.unicode_markup
    except Exception as err:
        print(f"Error fetching data for {city}: {type(err).__name__} - {err}")
        return None

def parse_data(self, data, city):
    weather_data = []
    if data is not None:
        soup = BeautifulSoup(data, "lxml")
        # Each <li> in the 7-day forecast list holds one day's forecast.
        lis = soup.select("ul[class='t clearfix'] li")
        for li in lis:
            try:
                date = li.select('h1')[0].text
                weather = li.select('p[class="wea"]')[0].text
                temp = li.select('p[class="tem"] span')[0].text + "/" + li.select('p[class="tem"] i')[0].text
                print(city, date, weather, temp)
                weather_data.append([city, date, weather, temp])
            except Exception as err:
                print(f"Error parsing data for {city}: {type(err).__name__} - {err}")
    return weather_data

def forecastCity(self, city):
    data = self.fetch_data(city)
    return self.parse_data(data, city)

def process_and_export(self, cities, filename="weather_data.csv"):
    all_weather_data = []
    for city in cities:
        city_weather_data = self.forecastCity(city)
        all_weather_data.extend(city_weather_data)
    # Write all data to CSV
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.writer(csv_file)
        writer.writerow(["City", "Date", "Weather", "Temperature"])  # Write header
        writer.writerows(all_weather_data)  # Write data rows
    print(f"Data successfully exported to {filename}")
Reflections:
This assignment consolidated my knowledge of web crawling, HTML parsing, and data processing.
There were challenges along the way, such as parsing the page and handling exceptions, but I solved them by analyzing the page structure and refining the try-except blocks. I got a lot out of it, and it gave me more confidence for the follow-up topics.
Gitee folder link: click here
Assignment 2
Process:
- 1. Locate the API:
- 2. Locate the cookie (see the request-setup sketch after this list):
- 3. Locate the URL:
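The values captured in steps 2 and 3 plug straight into the request. As a rough illustration only (the header fields and the cookie name/value below are placeholders, not the ones actually copied from the browser), the headers and cookies dictionaries used later by fetch_data might be set up like this:

# Illustrative placeholders only: copy the real values from the browser's
# developer tools (Network tab) when reproducing this.
headers = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
    "Referer": "https://quote.eastmoney.com/",
}
cookies = {
    "qgqp_b_id": "<value copied from the browser>",  # hypothetical cookie name/value
}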
Code and screenshots:
# Eastmoney stock-list crawler (excerpt); headers and cookies hold the values
# captured from the browser in the steps above.
import json

import requests


# Send the request for one page of the stock list.
def fetch_data(page_number):
    url = f'https://1.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112409952374347191637_1728983306168&pn={page_number}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&dect=1&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_={1728983306169 + page_number}'
    try:
        response = requests.get(url, headers=headers, cookies=cookies)
        return response
    except Exception as err:
        print(f"Error fetching data for page {page_number}: {type(err).__name__} - {err}")
        return None

# Parse the JSONP response into a Python dict.
def parse_data(response):
    if response and response.status_code == 200:
        json_data = response.text
        # Strip the JSONP callback wrapper and keep only the JSON payload
        # between the outermost parentheses.
        json_data = json_data[json_data.index('(') + 1: json_data.rindex(')')]
        data = json.loads(json_data)
        return data
    else:
        print(f'Failed to parse data. Status code: {response.status_code if response else "None"}')
        return None

# Fetch and parse one page.
def scrape_data(page_number):
    response = fetch_data(page_number)
    return parse_data(response)

# Crawl several pages and collect the rows (to be loaded into a DataFrame later).
def scrape_multiple_pages(max_pages=5):
    all_data = []  # accumulates the rows from every page
    for page_number in range(1, max_pages + 1):  # pages are numbered from 1
        print(f"Fetching page {page_number}...")
        data = scrape_data(page_number)
        if data and data.get('data') and 'diff' in data['data']:
            for stock in data['data']['diff']:
                # Eastmoney field codes: f14 name, f2 latest price, f3 change (%),
                # f5 volume (lots), f6 turnover, f7 amplitude, f15 high, f16 low.
                stock_info = {
                    '名稱': stock.get('f14'),       # stock name
                    '最新價': stock.get('f2'),      # latest price
                    '漲跌幅': stock.get('f3'),      # change (%)
                    '成交量(手)': stock.get('f5'),  # volume (lots)
                    '成交額': stock.get('f6'),      # turnover
                    '振幅': stock.get('f7'),        # amplitude
                    '最高價': stock.get('f15'),     # daily high
                    '最低價': stock.get('f16')      # daily low
                }
                all_data.append(stock_info)  # append one row per stock
        else:
            print(f"No data on page {page_number} or the request failed")
            break  # stop when a page comes back empty
    return all_data
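Since the comment above mentions collecting the rows into a DataFrame, here is a short driver sketch showing one way to do that with pandas and export the result. The pandas step and the output filename are assumptions; the post itself only shows the scraping functions.

# Hypothetical driver: run the scraper and dump the rows to CSV via pandas.
import pandas as pd

if __name__ == "__main__":
    rows = scrape_multiple_pages(max_pages=5)   # list of dicts, one per stock
    df = pd.DataFrame(rows)                     # columns follow the dict keys
    print(df.head())
    df.to_csv("stocks.csv", index=False, encoding="utf-8-sig")  # utf-8-sig keeps the Chinese headers readable in Excel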
Reflections:
This assignment on scraping stock information with the requests library and JSON parsing taught me a great deal. It gave me hands-on practice with crawler techniques, improved my ability to fetch and analyze data, and let me try out database interaction.
I ran into problems with page parsing, understanding the API parameters, and database storage, but worked through all of them, which gives me useful experience for what comes next.
Gitee folder link: click here
Assignment 3
Process:
- 1. Locate the API:
- 2. Analyze the API file; clearly, some values are mapped to short codes (see the illustrative mapping sketch after this list):
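As an illustration of what that mapping looks like, here is a small made-up example; the code letters and values are invented, and the real pairs are recovered from the API (payload) file. A dictionary translates each short code back to its actual value before the records are saved.

# Illustrative only. The variable is called `dict` to match the name used in
# the script below (note that it shadows the Python builtin).
dict = {
    "a": "综合",   # hypothetical: university type encoded as "a"
    "b": "理工",
    "q": "北京",   # hypothetical: province encoded as "q"
    "r": "江苏",
}

# A scraped record whose fields still carry the quoted short codes.
record = {"type": '"b"', "province": '"q"', "ranking": "15"}
decoded = {k: dict.get(v.strip('"'), v.strip('"')) for k, v in record.items()}
print(decoded)  # {'type': '理工', 'province': '北京', 'ranking': '15'}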
Code and screenshots:
# data2 (a list of regex match objects with named groups) and dict (the
# code -> value mapping recovered from the API file) are built earlier in the script.
import csv

# Apply the mapping to every element of data2.
new_data2 = []
for item in data2:
    result_dict = item.groupdict()
    updated_result_dict = {}
    for key, value in result_dict.items():
        if key in ["type", "province", "ranking"]:
            # These fields arrive as quoted short codes: strip the quotes and
            # translate the code through the mapping when possible.
            value = value.strip('"')
            if value in dict:
                updated_result_dict[key] = dict[value]
            else:
                updated_result_dict[key] = value
        else:
            updated_result_dict[key] = value
    new_data2.append(updated_result_dict)

# Re-bind data2 to the mapped records so every later use sees the decoded values.
data2 = new_data2

# From here on, printing data2 shows the mapped result.
for item in data2:
    print(item)

# Save the records to a CSV file.
with open('output.csv', 'w', newline='', encoding='utf-8') as csvfile:
    fieldnames = list(data2[0].keys()) if data2 else []
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    for item in data2:
        writer.writerow(item)
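For completeness, a hedged sketch of where data2 might come from: a regular expression with named groups run over the raw API text. The sample text, pattern, and field layout below are assumptions made for illustration; the actual payload format and regex in the original script may differ.

import re

# Made-up snippet standing in for the raw payload text pulled from the API file.
raw = 'univNameCn:"清华大学",province:"q",univCategory:"b",ranking:"1",score:99.6,'

# Hypothetical pattern with named groups matching the shape of the fields above.
pattern = re.compile(
    r'univNameCn:"(?P<name>[^"]*)",'
    r'province:(?P<province>"[^"]*"|\w+),'
    r'univCategory:(?P<type>"[^"]*"|\w+),'
    r'ranking:(?P<ranking>"[^"]*"|\w+),'
    r'score:(?P<score>[\d.]+)'
)

data2 = list(pattern.finditer(raw))  # match objects, as consumed by the loop above
print(data2[0].groupdict())          # {'name': '清华大学', 'province': '"q"', 'type': '"b"', ...}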
Reflections:
I gained a great deal from this assignment. Analyzing how the site sends its requests and finding the API that returns the data were the crux of the whole task. It made me appreciate how important it is to understand the data-fetching mechanism behind a page: you cannot stop at what is rendered, you have to dig into the data exchange underneath.
By studying this hidden information I learned how to fetch the target data more precisely, which exercised not only my technical skills but also my problem-solving thinking.
At the same time, the requirement to record the debugging and analysis process as a GIF for the blog made me pay more attention to presenting results completely and intuitively, so that I can share my work with others more effectively.