| Which course does this assignment belong to | <Home - 2024 Data Collection and Fusion Technology Practice - Fuzhou University - Class Blog - Cnblogs (cnblogs.com)> |
| --- | --- |
| Where are the assignment requirements | <Assignment 2 - Assignments - 2024 Data Collection and Fusion Technology Practice - Class Blog - Cnblogs (cnblogs.com)> |
| Student ID | <102202126> |
I. Assignment Content
Assignment ①
- Requirement: scrape the 7-day weather forecast for a given set of cities from the China Weather website (http://www.weather.com.cn) and store it in a database.
- The code is as follows:
```python
from bs4 import BeautifulSoup
from bs4 import UnicodeDammit
import urllib.request
import sqlite3


class WeatherDB:
    def openDB(self):
        self.con = sqlite3.connect("weathers.db")
        self.cursor = self.con.cursor()
        try:
            self.cursor.execute(
                "CREATE TABLE weathers (wCity VARCHAR(16), wDate VARCHAR(16), wWeather VARCHAR(64), wTemp VARCHAR(32), "
                "CONSTRAINT pk_weather PRIMARY KEY (wCity, wDate))")
        except sqlite3.OperationalError:
            # If the table already exists, we ignore the error.
            pass

    def closeDB(self):
        self.con.commit()
        self.con.close()

    def insert(self, city, date, weather, temp):
        try:
            self.cursor.execute(
                "INSERT INTO weathers (wCity, wDate, wWeather, wTemp) VALUES (?, ?, ?, ?)",
                (city, date, weather, temp))
        except Exception as err:
            print(err)

    def show(self):
        self.cursor.execute("SELECT * FROM weathers")
        rows = self.cursor.fetchall()
        print("%-16s%-16s%-32s%-16s" % ("City", "Date", "Weather", "Temp"))
        for row in rows:
            print("%-16s%-16s%-32s%-16s" % (row[0], row[1], row[2], row[3]))


class WeatherForecast:
    def __init__(self):
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 6.0 x64; en-US; rv:1.9pre) Gecko/2008072421 Minefield/3.0.2pre"
        }
        self.cityCode = {"北京": "101010100", "上海": "101020100", "廣州": "101280101", "深圳": "101280601"}
        self.db = WeatherDB()  # Initialize WeatherDB instance

    def forecastCity(self, city):
        if city not in self.cityCode.keys():
            print(city + " code cannot be found")
            return
        url = "http://www.weather.com.cn/weather/" + self.cityCode[city] + ".shtml"
        try:
            req = urllib.request.Request(url, headers=self.headers)
            data = urllib.request.urlopen(req)
            data = data.read()
            dammit = UnicodeDammit(data, ["utf-8", "gbk"])
            data = dammit.unicode_markup
            soup = BeautifulSoup(data, "lxml")
            lis = soup.select("ul[class='t clearfix'] li")
            for li in lis:
                try:
                    date = li.select('h1')[0].text
                    weather = li.select('p[class="wea"]')[0].text
                    temp = li.select("p[class='tem']")[0].text.strip()
                    self.db.insert(city, date, weather, temp)
                except Exception as err:
                    print(f"Error processing weather data: {err}")
        except Exception as err:
            print(f"Error fetching weather data: {err}")

    def process(self, cities):
        self.db.openDB()  # Open database
        for city in cities:
            self.forecastCity(city)  # Scrape and store weather data for each city
        self.db.show()  # Print data from the database
        self.db.closeDB()  # Close database


# Create weather forecast instance and process specified cities
ws = WeatherForecast()
ws.process(["北京", "上海", "廣州", "深圳"])
print("Completed")
```
- Output:
- Gitee folder link: 陳家凱第二次實踐作業
- Reflections:
- Defined the WeatherDB and WeatherForecast classes, encapsulating the database operations and the weather-data processing logic inside classes.
- Used class methods to open and close the database, and to insert and display records.
- Used class methods to fetch and process the weather data (a small reuse sketch follows this list).
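One benefit of that encapsulation is that the stored data can be read back with plain sqlite3, without touching WeatherForecast at all. Below is a minimal sketch, assuming weathers.db and the weathers table created by the code above already exist:

```python
import sqlite3

# Minimal sketch: reopen weathers.db (assumed to have been created by the code above)
# and print the stored rows without going through the WeatherForecast class.
con = sqlite3.connect("weathers.db")
cursor = con.cursor()
cursor.execute("SELECT wCity, wDate, wWeather, wTemp FROM weathers ORDER BY wCity, wDate")
for wCity, wDate, wWeather, wTemp in cursor.fetchall():
    print(f"{wCity:<8}{wDate:<12}{wWeather:<16}{wTemp}")
con.close()
```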
Assignment ②
- Requirement: use the requests and BeautifulSoup libraries to scrape stock information from a chosen site and store it in a database.
- The code is as follows:
```python
import requests
import pandas as pd
import sqlite3
import json


# Use a GET request to access the server and fetch the page data
def getHtml(page):
    url = f"https://78.push2.eastmoney.com/api/qt/clist/get?cb=jQuery112408723133727080641_1728978540544&pn={page}&pz=20&po=1&np=1&ut=bd1d9ddb04089700cf9c27f6f7426281&fltt=2&invt=2&dect=1&wbp2u=|0|0|0|web&fid=f3&fs=m:0+t:6,m:0+t:80,m:1+t:2,m:1+t:23,m:0+t:81+s:2048&fields=f1,f2,f3,f4,f5,f6,f7,f8,f9,f10,f12,f13,f14,f15,f16,f17,f18,f20,f21,f23,f24,f25,f22,f11,f62,f128,f136,f115,f152&_=1728978540545"
    try:
        r = requests.get(url)
        r.raise_for_status()
        # Strip the jQuery callback wrapper and keep only the JSON part
        json_data = r.text[r.text.find("{"):r.text.rfind("}") + 1]
        data = json.loads(json_data)
        return data
    except requests.RequestException as e:
        print(f"Error fetching data: {e}")
        return None


# Fetch the stock data of a single page
def getOnePageStock(page):
    data = getHtml(page)
    if not data or 'data' not in data or 'diff' not in data['data']:
        return []
    return data['data']['diff']


# Save the stock information into an SQLite database
def saveToDatabase(stock_list):
    conn = sqlite3.connect('stocks.db')
    c = conn.cursor()
    try:
        c.execute('''CREATE TABLE IF NOT EXISTS stocks
                     (code TEXT PRIMARY KEY, name TEXT, price REAL, change REAL,
                      percent_change REAL, volume INTEGER, amount REAL)''')
        for stock in stock_list:
            c.execute("INSERT OR IGNORE INTO stocks VALUES (?, ?, ?, ?, ?, ?, ?)",
                      (stock.get('f12'), stock.get('f14'), stock.get('f2'), stock.get('f3'),
                       stock.get('f4'), stock.get('f5'), stock.get('f6')))
        conn.commit()
    except sqlite3.Error as e:
        print(f"Database error: {e}")
    finally:
        c.execute("SELECT * FROM stocks")
        rows = c.fetchall()
        df = pd.DataFrame(rows, columns=['Code', 'Name', 'Price', 'Change', 'Percent Change', 'Volume', 'Amount'])
        print(df)
        conn.close()


def main():
    all_stocks = []
    for page in range(1, 6):  # Crawl the first 5 pages
        stock_list = getOnePageStock(page)
        if stock_list:
            all_stocks.extend(stock_list)
        else:
            print(f"Failed to fetch stock data for page {page}")
    if all_stocks:
        print("Crawled stock data:")
        for stock in all_stocks:
            print(stock)
        saveToDatabase(all_stocks)
        print("Stock information has been saved to the database.")
    else:
        print("No stock data was fetched")


if __name__ == "__main__":
    main()
```
- Output:
- Gitee folder link: 陳家凱第二次實踐作業
- Reflections:
- Pagination: the data is fetched page by page in a loop, 20 records per request, which suits an API that limits how much can be requested at once (a parameterized paging sketch follows this list).
- Flexibility: in the main function the number of pages to crawl can easily be adjusted (e.g., change range(1, 6) to another range), which gives the code a degree of flexibility.
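As a concrete illustration of that flexibility, the paging loop can be pulled into a small helper whose page range is a parameter. This is only a sketch: it assumes the getOnePageStock and saveToDatabase functions defined above, and the helper name crawl_pages is my own addition:

```python
def crawl_pages(start_page=1, end_page=5):
    """Collect stock records from start_page to end_page (inclusive),
    using the getOnePageStock helper defined above (assumed available)."""
    all_stocks = []
    for page in range(start_page, end_page + 1):
        stock_list = getOnePageStock(page)
        if not stock_list:
            # Stop early when a page returns nothing (end of data or a failed request).
            print(f"No data returned for page {page}, stopping.")
            break
        all_stocks.extend(stock_list)
    return all_stocks


# Example: crawl the first 10 pages instead of 5, then save them.
# stocks = crawl_pages(1, 10)
# saveToDatabase(stocks)
```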
Assignment ③
- Requirement: scrape the information of all universities on the 2021 Chinese university main ranking (https://www.shanghairanking.cn/rankings/bcur/2021), store it in a database, and include in the blog a GIF recording of the analysis done with the browser's F12 developer tools.
- The code is as follows:
```python
import requests
import re
import sqlite3


class UniversityDB:
    def openDB(self):
        # Connect to the SQLite database
        self.con = sqlite3.connect('universities.db')
        self.cursor = self.con.cursor()
        try:
            # Create the universities table if it does not exist
            self.cursor.execute('''CREATE TABLE IF NOT EXISTS universities (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                name TEXT,
                score REAL
            )''')
        except sqlite3.Error as e:
            print(f"Error creating table: {e}")

    def closeDB(self):
        # Commit the changes and close the database connection
        self.con.commit()
        self.con.close()

    def insert(self, university_data):
        try:
            # Insert a university record into the table
            sql = "INSERT INTO universities (name, score) VALUES (?, ?)"
            self.cursor.execute(sql, university_data)
        except sqlite3.Error as e:
            print(f"Error inserting data: {e}")

    def show(self):
        # Fetch and display all records in the universities table
        self.cursor.execute("SELECT * FROM universities")
        rows = self.cursor.fetchall()
        print("{:<5}{:<20}{:<5}".format("Rank", "University", "Score"))
        for index, row in enumerate(rows):
            print("{:<5}{:<20}{:<5}".format(index + 1, row[1], row[2]))


class UniversityCrawler:
    def __init__(self):
        # Request URL and headers
        self.url = 'https://www.shanghairanking.cn/_nuxt/static/1728872418/rankings/bcur/2020/payload.js'
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/69.0.3947.100 Safari/537.36'
        }

    def crawl_universities(self):
        try:
            # Send a GET request to the URL
            response = requests.get(self.url, headers=self.headers)
            response.raise_for_status()
            response_text = response.text
        except requests.RequestException as e:
            print(f"Error fetching data: {e}")
            return []
        # Extract university names and scores with regular expressions
        name_pattern = r'univNameCn:"(.*?)"'
        score_pattern = r'score:(.*?),'
        names = re.findall(name_pattern, response_text, re.S)
        scores = re.findall(score_pattern, response_text, re.S)
        universities_data = []
        for name, score in zip(names, scores):
            try:
                # Convert the score to a float and append it to the list
                score_value = float(score)
                universities_data.append((name, score_value))
            except ValueError:
                print(f"Invalid score value: {score}")
                continue
        return universities_data

    def process(self):
        # Initialize the database and insert the crawled data
        db = UniversityDB()
        db.openDB()
        universities_data = self.crawl_universities()
        for uni_data in universities_data:
            db.insert(uni_data)
        db.show()
        db.closeDB()


if __name__ == "__main__":
    # Create a crawler instance and start processing
    crawler = UniversityCrawler()
    crawler.process()
    print("completed")
```
- Output:
- GIF:
- Gitee folder link: 陳家凱第二次實踐作業
- Reflections:
- Learned to use the json library to parse JSON data (see the JSONP-stripping sketch after this list).
- Learned to extract the specific part of a response that contains the JSON data.
- Learned to use the sqlite3 library for SQLite database operations.
- Learned to create database tables and define primary-key constraints.
- Learned to insert data and handle the exceptions that may occur.
- Learned to query and display the data stored in the database.
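To make the JSON-extraction point concrete: the Eastmoney endpoint used in Assignment ② returns a JSONP-style body (the JSON object is wrapped in a jQuery callback), so getHtml slices out the substring between the first "{" and the last "}" before calling json.loads. A minimal sketch of that step, using a shortened, hypothetical response string:

```python
import json

# Hypothetical, shortened JSONP-style response body, similar in shape to what
# the Eastmoney endpoint in Assignment ② returns.
raw = 'jQuery112408723133727080641_1728978540544({"data": {"diff": [{"f12": "000001", "f14": "平安银行", "f2": 10.5}]}});'

# Slice out the JSON object between the first '{' and the last '}', then parse it.
json_text = raw[raw.find("{"):raw.rfind("}") + 1]
data = json.loads(json_text)

for stock in data["data"]["diff"]:
    print(stock["f12"], stock["f14"], stock["f2"])
```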