Data Collection and Fusion Technology: Assignment 1
Web Crawling in Practice
Task ①: Scraping University Ranking Information
Code Implementation
import requests
from bs4 import BeautifulSoup

# Make sure the URL is correct
target_url = "http://www.shanghairanking.cn/rankings/bcur/2020"

try:
    response = requests.get(target_url)
    response.raise_for_status()  # Check that the request succeeded
    html_content = response.content
    soup = BeautifulSoup(html_content, 'html.parser')

    ranking_table = soup.find('table')
    if ranking_table is None:
        raise ValueError("No ranking table found on the page")

    # Set column widths so the output lines up
    print(f"{'Rank':<5}{'School':<50}{'Province/City':<15}{'Type':<20}{'Score':<10}")

    table_rows = ranking_table.find_all('tr')
    for row in table_rows[1:]:  # Skip the header row
        columns = row.find_all('td')
        if len(columns) >= 5:  # Guard against short or malformed rows
            ranking = columns[0].get_text(strip=True)
            school = columns[1].get_text(strip=True)
            location = columns[2].get_text(strip=True)
            university_type = columns[3].get_text(strip=True)
            score = columns[4].get_text(strip=True)
            # Print one record, using a formatted string to align the columns
            print(f"{ranking:<5}{school:<50}{location:<15}{university_type:<20}{score:<10}")
except requests.RequestException as e:
    print(f"Problem while requesting the page: {e}")
except Exception as e:
    print(f"Problem while parsing the page: {e}")
Reflections:
While completing Task ①, I learned how to use the requests library to send HTTP requests and BeautifulSoup to parse HTML documents. In the process, I came to appreciate how important a page's structure is, and how to extract the data I need by navigating that structure.
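As a side note, BeautifulSoup also supports CSS selectors, which express the same structure-driven extraction more compactly than chained find()/find_all() calls. A minimal, self-contained sketch (the HTML fragment below is invented purely for illustration):

from bs4 import BeautifulSoup

# A made-up table fragment, just to demonstrate structure-based extraction
html = """
<table>
  <tr><th>Rank</th><th>School</th></tr>
  <tr><td>1</td><td>Tsinghua University</td></tr>
  <tr><td>2</td><td>Peking University</td></tr>
</table>
"""

soup = BeautifulSoup(html, 'html.parser')
# select() takes a CSS selector; [1:] skips the header row, as in the task code
for row in soup.select('table tr')[1:]:
    rank, school = [td.get_text(strip=True) for td in row.select('td')]
    print(rank, school)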
Task ②: Scraping Product Information from an Online Store
Code Implementation
import urllib.request
from bs4 import BeautifulSoup
import chardet

# Fetch the HTML text content
def getHTMLText(url):
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    try:
        req = urllib.request.Request(url, headers=headers)
        data = urllib.request.urlopen(req).read()  # Send the request and read the raw response bytes
        encoding = chardet.detect(data)['encoding']  # Detect the page encoding with chardet
        data = data.decode(encoding, 'ignore')  # Decode the page content
        return data
    except Exception as err:
        print("Error:", err)
        return None

# Parse the page content
def parsePage(data):
    uinfo = []
    soup = BeautifulSoup(data, 'html.parser')
    # Product entries are <li> elements whose class starts with 'line'
    items = soup.find_all('li', class_=lambda x: x and x.startswith('line'))
    for i, item in enumerate(items):
        price_tag = item.find('p', class_='price')
        name_tag = item.find('p', class_='name')
        if price_tag is None or name_tag is None:
            continue  # Skip entries missing a price or a name
        price = price_tag.span.get_text(strip=True)
        name = name_tag.a.get_text(strip=True)
        uinfo.append([i + 1, price, name])
    return uinfo

# Print the product list
def printGoodsList(uinfo):
    tplt = "{:<10} {:<10} {:<20}"
    print(tplt.format("No.", "Price", "Product Name"))
    for i in uinfo:
        print(tplt.format(i[0], i[1], i[2]))

def main():
    url = 'http://search.dangdang.com/?key=%CA%E9%B0%FC&act=input'
    data = getHTMLText(url)
    if data:
        uinfo = parsePage(data)
        printGoodsList(uinfo)

if __name__ == '__main__':
    main()
Reflections:
While completing Task ②, I learned how to send requests through urllib.request with a custom User-Agent header, detect a page's encoding with chardet before decoding it, and use a class-based filter in BeautifulSoup to pick out the product entries and extract their names and prices. In the process, I saw how these pieces fit together when dealing with real, messy web data.
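The class-prefix filter is worth calling out: find_all() accepts a callable for class_, and only tags for which it returns True are kept. A small sketch on a made-up fragment (the markup below is invented to mimic the store's list structure, not copied from the real page):

from bs4 import BeautifulSoup

# Hypothetical search-result markup; class names are assumptions for illustration
html = """
<ul>
  <li class="line1"><p class="name"><a>Backpack A</a></p><p class="price"><span>59.00</span></p></li>
  <li class="line2"><p class="name"><a>Backpack B</a></p><p class="price"><span>89.00</span></p></li>
  <li class="ad-banner">not a product</li>
</ul>
"""

soup = BeautifulSoup(html, 'html.parser')
# The callable filters class values, so the 'ad-banner' entry is excluded
for item in soup.find_all('li', class_=lambda x: x and x.startswith('line')):
    name = item.find('p', class_='name').a.get_text(strip=True)
    price = item.find('p', class_='price').span.get_text(strip=True)
    print(name, price)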
Task ③: Scraping Images from a Web Page
Code Implementation
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def fetch_page_content(url):
    """Fetch the page content"""
    try:
        response = requests.get(url)
        response.raise_for_status()  # Make sure the request succeeded
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException as e:
        print(f"Problem while requesting the page: {e}")
        return None

def parse_images_from_html(html_content, base_url):
    """Parse image links out of the HTML content"""
    soup = BeautifulSoup(html_content, 'html.parser')
    img_tags = soup.find_all('img')
    # Resolve each (possibly relative) src against the page URL
    img_urls = [urljoin(base_url, img.get('src')) for img in img_tags if img.get('src')]
    # Keep only JPEG images
    return [img_url for img_url in img_urls if img_url.lower().endswith(('.jpg', '.jpeg'))]

def download_images(img_urls, folder_name):
    """Download the images and save them to the given folder"""
    if not os.path.exists(folder_name):
        os.makedirs(folder_name)
    for img_url in img_urls:
        try:
            img_response = requests.get(img_url)
            img_response.raise_for_status()  # Make sure the request succeeded
            img_name = os.path.join(folder_name, img_url.split('/')[-1])
            with open(img_name, 'wb') as f:
                f.write(img_response.content)
            print(f'Saved image: {img_name}')
        except requests.RequestException as e:
            print(f"Problem while downloading an image: {e}")

def main():
    url = 'https://news.fzu.edu.cn/yxfd.htm'
    folder_name = 'images'
    html_content = fetch_page_content(url)
    if html_content:
        img_urls = parse_images_from_html(html_content, url)
        download_images(img_urls, folder_name)
        print('All images have been downloaded.')

if __name__ == '__main__':
    main()
Reflections:
While completing Task ③, I learned how to use urllib.parse to handle URLs and the requests library to download files. In the process, I picked up some of the finer details of making network requests, such as how to join relative URLs correctly and how to save binary content to disk.
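To make the URL-joining detail concrete, here is a tiny self-contained check of how urljoin behaves (the image paths are hypothetical, chosen only to show the three common cases):

from urllib.parse import urljoin

# urljoin resolves a link against the page it was found on
base = 'https://news.fzu.edu.cn/yxfd.htm'
print(urljoin(base, 'images/photo1.jpg'))   # relative path: replaces the last path segment
print(urljoin(base, '/assets/photo2.jpg'))  # root-relative path: keeps only the scheme and host
print(urljoin(base, 'https://cdn.example.com/p.jpg'))  # absolute URL: passes through unchanged

This is why parse_images_from_html takes base_url as a parameter: <img> tags on a page may use any of these forms, and urljoin normalizes them all into absolute URLs that requests can fetch.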