資料採集與融合技術作業一

102202107發表於2024-10-17

資料採集與融合技術作業一

網路爬蟲實踐

作業①:爬取大學排名資訊

程式碼實現

import requests
from bs4 import BeautifulSoup

# Target page: 2020 Best Chinese Universities Ranking (shanghairanking.cn).
target_url = "http://www.shanghairanking.cn/rankings/bcur/2020"

try:
    # A timeout prevents the request from hanging indefinitely on a stalled server.
    response = requests.get(target_url, timeout=10)
    response.raise_for_status()  # raise HTTPError on a non-2xx status
    html_content = response.content

    soup = BeautifulSoup(html_content, 'html.parser')
    # The ranking data lives in the first <table> of the page.
    ranking_table = soup.find('table')

    # Header row; fixed column widths keep the printed table aligned.
    print(f"{'排名':<5}{'學校名稱':<50}{'省市':<15}{'學校型別':<20}{'總分':<10}")

    table_rows = ranking_table.find_all('tr')
    # Skip the header <tr>, then read the five data cells of each row:
    # rank, school name, province/city, school type, total score.
    for row in table_rows[1:]:
        columns = row.find_all('td')
        if columns:
            ranking = columns[0].get_text(strip=True)
            school = columns[1].get_text(strip=True)
            location = columns[2].get_text(strip=True)
            university_type = columns[3].get_text(strip=True)
            score = columns[4].get_text(strip=True)

            # Same widths as the header so columns line up.
            print(f"{ranking:<5}{school:<50}{location:<15}{university_type:<20}{score:<10}")
except requests.RequestException as e:
    print(f"請求網頁時遇到問題:{e}")
except Exception as e:
    print(f"解析網頁時遇到問題:{e}")

心得體會:

在完成作業①的過程中,我學習了如何使用requests庫來傳送HTTP請求,並使用BeautifulSoup來解析HTML文件。這個過程中,我瞭解到了網頁結構的重要性,以及如何根據網頁的結構來提取所需的資料。

作業②:爬取商城商品資訊

程式碼實現

import urllib.request
from bs4 import BeautifulSoup
import chardet

# Fetch the raw HTML text of a page.
def getHTMLText(url):
    """Download *url* and return its decoded HTML text, or None on failure."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36"
    }
    try:
        request = urllib.request.Request(url, headers=headers)
        # Read the whole response body as bytes.
        raw = urllib.request.urlopen(request).read()
        # The site may not be UTF-8, so detect the encoding before decoding;
        # undecodable bytes are dropped via the 'ignore' error handler.
        detected = chardet.detect(raw)['encoding']
        return raw.decode(detected, 'ignore')
    except Exception as err:
        print("Error:", err)
        return None

# Parse product entries out of the search-result HTML.
def parsePage(data):
    """Extract product rows from a Dangdang search-result page.

    Args:
        data: HTML text of the search-result page.

    Returns:
        A list of ``[index, price, name]`` rows (index is 1-based over
        the matched items).
    """
    uinfo = []
    soup = BeautifulSoup(data, 'html.parser')

    # Product entries are <li> elements whose class attribute starts with 'line'.
    items = soup.find_all('li', class_=lambda x: x and x.startswith('line'))
    for i, item in enumerate(items):
        # Guard against malformed entries: the original code raised
        # AttributeError when an item lacked the price/name sub-tags;
        # skip such items instead of aborting the whole parse.
        price_tag = item.find('p', class_='price')
        name_tag = item.find('p', class_='name')
        if price_tag is None or price_tag.span is None:
            continue
        if name_tag is None or name_tag.a is None:
            continue
        price = price_tag.span.get_text(strip=True)
        name = name_tag.a.text
        uinfo.append([i + 1, price, name])
    return uinfo

# Pretty-print the collected products.
def printGoodsList(uinfo):
    """Print *uinfo* rows as an aligned three-column table (index, price, name)."""
    tplt = "{:<10} {:<10} {:<20}"
    # Header row uses the same template so the columns line up.
    print(tplt.format("序號", "價格", "商品名稱"))
    for index, price, name in uinfo:
        print(tplt.format(index, price, name))

def main():
    """Search Dangdang for schoolbags and print the parsed product list."""
    # GB2312-percent-encoded query for "書包" (schoolbag).
    search_url = 'http://search.dangdang.com/?key=%CA%E9%B0%FC&act=input'
    page = getHTMLText(search_url)
    if page:
        printGoodsList(parsePage(page))

if __name__ == '__main__':
    main()

心得體會:

在完成作業②的過程中,我學習了如何使用BeautifulSoup搭配chardet來處理網頁編碼並提取商品價格與名稱。這個過程中,我瞭解到了自動偵測編碼的重要性,以及如何在實際的網頁資料中定位所需的標籤。

作業③:爬取網頁中的圖片

程式碼實現

import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

def fetch_page_content(url):
    """Fetch *url* and return its body decoded as UTF-8 text.

    Returns:
        The page text on success, or None if the request fails.
    """
    try:
        # A timeout keeps the script from hanging forever on an
        # unresponsive server (requests has no default timeout).
        response = requests.get(url, timeout=10)
        response.raise_for_status()  # raise on a non-2xx status
        # Force UTF-8: the target site serves UTF-8 pages.
        response.encoding = 'utf-8'
        return response.text
    except requests.RequestException as e:
        print(f"請求網頁時遇到問題:{e}")
        return None

def parse_images_from_html(html_content, base_url):
    """Return the absolute URLs of all JPEG images referenced by the HTML."""
    soup = BeautifulSoup(html_content, 'html.parser')
    jpeg_urls = []
    for tag in soup.find_all('img'):
        src = tag.get('src')
        if not src:
            continue
        # Resolve relative src attributes against the page URL.
        absolute = urljoin(base_url, src)
        # Keep only JPEG files; extension match is case-insensitive.
        if absolute.lower().endswith(('.jpg', '.jpeg')):
            jpeg_urls.append(absolute)
    return jpeg_urls

def download_images(img_urls, folder_name):
    """Download each image URL into *folder_name*, creating it if needed.

    Failures on individual images are reported and skipped so one bad
    URL does not abort the whole batch.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(folder_name, exist_ok=True)
    for img_url in img_urls:
        try:
            # A timeout prevents a stalled download from hanging forever.
            img_response = requests.get(img_url, timeout=10)
            img_response.raise_for_status()  # raise on a non-2xx status
            # Name the local file after the last path segment of the URL.
            img_name = os.path.join(folder_name, img_url.split('/')[-1])
            with open(img_name, 'wb') as f:
                f.write(img_response.content)
            print(f'圖片已儲存:{img_name}')
        except requests.RequestException as e:
            print(f"下載圖片時遇到問題:{e}")

def main():
    """Scrape the FZU news page and download all JPEG images it references."""
    page_url = 'https://news.fzu.edu.cn/yxfd.htm'
    save_dir = 'images'

    html_content = fetch_page_content(page_url)
    if not html_content:
        return
    jpeg_urls = parse_images_from_html(html_content, page_url)
    download_images(jpeg_urls, save_dir)
    print('所有圖片已下載完畢。')

if __name__ == '__main__':
    main()

心得體會:

在完成作業③的過程中,我學習了如何使用urllib.parse來處理URL,以及如何使用requests庫來下載檔案。這個過程中,我瞭解到了網路請求中的一些細節,比如如何處理URL拼接和檔案儲存。



相關文章