Data Collection and Fusion Technology: Assignment 4

Posted by 102202133zqj on 2024-11-16

Gitee assignment link: https://gitee.com/zheng-qijian33/crawl_project/tree/master/作業4

Assignment ①:

Requirements:
Become proficient with Selenium for locating HTML elements, crawling Ajax-rendered pages, and waiting for HTML elements.
Use the Selenium framework plus MySQL storage to crawl stock data for three boards: 滬深A股 (SSE & SZSE A-shares), 上證A股 (SSE A-shares), and 深證A股 (SZSE A-shares).

Experiment code

import pymysql
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import time
from selenium.webdriver.common.by import By

class MySpider:
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/109.0.0.0 Safari/537.36 "
            "SLBrowser/9.0.3.5211 SLBChan/112"
        )
    }

    def __init__(self, url):
        self.url = url
        self.page = 0  # pages already crawled on the current board
        self.section = ["nav_hs_a_board", "nav_sh_a_board", "nav_sz_a_board"]  # ids of the board tabs to click
        self.sectionid = 0  # index of the board currently being crawled

    def startUp(self):
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # headless mode
        chrome_options.add_argument('--disable-gpu')
        chrome_options.add_argument('--no-sandbox')
        self.driver = webdriver.Chrome(options=chrome_options)
        self.driver.get(self.url)

        # Connect to MySQL and create one table per board to store its data
        try:
            print("Connecting to MySQL...")
            self.con = pymysql.connect(
                host="localhost",
                port=3306,
                user="root",
                passwd="密碼",
                db="stocks",
                charset="utf8"
            )
            self.cursor = self.con.cursor(pymysql.cursors.DictCursor)
            for stocks_table in self.section:
                self.cursor.execute(f"DROP TABLE IF EXISTS {stocks_table}")
                self.cursor.execute(
                    f"""
                    CREATE TABLE {stocks_table} (
                        id INT(4) PRIMARY KEY,
                        StockNo VARCHAR(16),
                        StockName VARCHAR(32),
                        StockQuote VARCHAR(32),
                        Changerate VARCHAR(32),
                        Chg VARCHAR(32),
                        Volume VARCHAR(32),
                        Turnover VARCHAR(32),
                        StockAmplitude VARCHAR(32),
                        Highest VARCHAR(32),
                        Lowest VARCHAR(32),
                        Pricetoday VARCHAR(32),
                        PrevClose VARCHAR(32)
                    )
                    """
                )
            print("MySQL tables created successfully.")
        except Exception as err:
            print(f"Error during MySQL setup: {err}")

    def closeUp(self):
        try:
            self.con.commit()
            self.con.close()
            self.driver.quit()
            print("Spider closed successfully.")
        except Exception as err:
            print(f"Error during closing: {err}")

    def insertDB(self, section, id, StockNo, StockName, StockQuote, Changerate, Chg, Volume, Turnover, StockAmplitude, Highest, Lowest, Pricetoday, PrevClose):
        try:
            sql = f"""
            INSERT INTO {section} (
                id, StockNo, StockName, StockQuote, Changerate, Chg, Volume, Turnover,
                StockAmplitude, Highest, Lowest, Pricetoday, PrevClose
            ) VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
            """
            self.cursor.execute(
                sql,
                (
                    id, StockNo, StockName, StockQuote, Changerate, Chg, Volume,
                    Turnover, StockAmplitude, Highest, Lowest, Pricetoday, PrevClose
                )
            )
        except Exception as err:
            print(f"Error inserting data into MySQL: {err}")

    def processSpider(self):
        time.sleep(2)
        try:
            tr_elements = self.driver.find_elements(By.XPATH, "//table[@id='table_wrapper-table']/tbody/tr")
            for tr in tr_elements:
                id = tr.find_element(By.XPATH, ".//td[1]").text
                StockNo = tr.find_element(By.XPATH, "./td[2]/a").text
                StockName = tr.find_element(By.XPATH, "./td[3]/a").text
                StockQuote = tr.find_element(By.XPATH, "./td[5]/span").text
                Changerate = tr.find_element(By.XPATH, "./td[6]/span").text
                Chg = tr.find_element(By.XPATH, "./td[7]/span").text
                Volume = tr.find_element(By.XPATH, "./td[8]").text
                Turnover = tr.find_element(By.XPATH, "./td[9]").text
                StockAmplitude = tr.find_element(By.XPATH, "./td[10]").text
                highest = tr.find_element(By.XPATH, "./td[11]/span").text
                lowest = tr.find_element(By.XPATH, "./td[12]/span").text
                Pricetoday = tr.find_element(By.XPATH, "./td[13]/span").text
                PrevClose = tr.find_element(By.XPATH, "./td[14]").text
                section = self.section[self.sectionid]
                self.insertDB(
                    section, id, StockNo, StockName, StockQuote, Changerate, Chg,
                    Volume, Turnover, StockAmplitude, highest, lowest, Pricetoday, PrevClose
                )
            # Crawl only the first two pages of each board
            self.page += 1
            print(f"Page {self.page} crawled")
            if self.page < 2:
                next_page = self.driver.find_element(By.XPATH, "//div[@class='dataTables_paginate paging_input']/a[2]")
                next_page.click()
                time.sleep(10)
                self.processSpider()
            elif self.sectionid < len(self.section) - 1:
                # Move on to the next board
                print(f"{self.section[self.sectionid]} finished")
                self.sectionid += 1
                self.page = 0
                next_section = self.driver.find_element(By.XPATH, f"//li[@id='{self.section[self.sectionid]}']/a")
                self.driver.execute_script("arguments[0].click();", next_section)
                time.sleep(10)
                self.processSpider()
        except Exception as err:
            print(f"Error during spider processing: {err}")

    def executeSpider(self):
        print("Spider starting...")
        self.startUp()
        print("Spider processing...")
        self.processSpider()
        print("Spider closing...")
        self.closeUp()

if __name__ == "__main__":
    url = "http://quote.eastmoney.com/center/gridlist.html#hs_a_board"
    spider = MySpider(url)
    spider.executeSpider()

Gitee assignment link: https://gitee.com/zheng-qijian33/crawl_project/tree/master/作業4

Run results

滬深A股 (SSE & SZSE A-shares):

深證A股 (SZSE A-shares):

上證A股 (SSE A-shares):

Reflections

Writing this crawler made me appreciate that automated data collection has to account for page structure, element locating, database operations, and exception handling all at once. Driving the page with Selenium in headless mode is flexible, but page-load waits need to be managed sensibly to keep the crawler efficient. The logic for switching boards and paging forward has to be handled carefully to keep the data complete and avoid scraping duplicates. Likewise, wrapping the database operations and handling their errors is essential to prevent data loss and crashes. Overall, modular code and a clear control flow are the keys to a stable, efficient crawler.
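One concrete way to manage the page-load waits mentioned above is to replace the fixed time.sleep calls with an explicit wait. The sketch below is only an illustration, not part of the submitted crawler; it reuses the table id table_wrapper-table from the code above and assumes the same Selenium setup.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def wait_for_quote_rows(driver, timeout=15):
    # Block until the quote-table rows are rendered, instead of sleeping a fixed time
    return WebDriverWait(driver, timeout).until(
        EC.presence_of_all_elements_located(
            (By.XPATH, "//table[@id='table_wrapper-table']/tbody/tr")
        )
    )

Inside processSpider this could replace the leading time.sleep(2), for example tr_elements = wait_for_quote_rows(self.driver).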

Assignment ②:

Requirements:
Become proficient with Selenium for locating HTML elements, simulating user login, crawling Ajax-rendered pages, and waiting for HTML elements.
Use the Selenium framework plus MySQL to crawl course information from China MOOC (icourse163.org): course number, course name, school, lead teacher, team members, enrolment count, course progress, and course description.

Experiment code

from selenium import webdriver
from selenium.webdriver.common.by import By
from lxml import etree
import time
import pymysql

driver = webdriver.Chrome()
driver.get("https://www.icourse163.org/")

driver.maximize_window()
# Locate the login button and click it
button = driver.find_element(By.XPATH, '//div[@class="_1Y4Ni"]/div')
button.click()

# Switch into the login iframe
frame = driver.find_element(By.XPATH,
                             '/html/body/div[13]/div[2]/div/div/div/div/div/div[1]/div/div[1]/div[2]/div[2]/div[1]/div/iframe')
driver.switch_to.frame(frame)
# Enter the account and password, then click login
driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[2]/div[2]/input').send_keys('phone number')  # placeholder account
driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[4]/div[2]/input[2]').send_keys('password')  # placeholder password
login_button = driver.find_element(By.XPATH, '/html/body/div[2]/div[2]/div[2]/form/div/div[6]/a')
login_button.click()
# Switch back to the default content and wait for the page to load
driver.switch_to.default_content()
time.sleep(10)
# Navigate to the search page
search_url = "https://www.icourse163.org/search.htm?search=%20#/"
driver.get(search_url)
time.sleep(10)

# Open the search results for the keyword "計算機" (computer)
driver.get('https://www.icourse163.org/search.htm?search=%E8%AE%A1%E7%AE%97%E6%9C%BA#/')
time.sleep(2)
# Click an option on the results page (brittle absolute XPath)
next_button = driver.find_element(By.XPATH, '/html/body/div[4]/div[2]/div[2]/div[2]/div/div[6]/div[1]/div[1]/label/div')
next_button.click()
time.sleep(2)

# JS snippet: scroll the page down gradually so lazily loaded course cards are rendered
js = '''
    timer = setInterval(function(){
       var scrollTop=document.documentElement.scrollTop||document.body.scrollTop;
       var ispeed=Math.floor(document.body.scrollHeight / 100);
       if(scrollTop > document.body.scrollHeight * 90 / 100){
           clearInterval(timer);
       }
       console.log('scrollTop:'+scrollTop)
       console.log('scrollHeight:'+document.body.scrollHeight)
       window.scrollTo(0, scrollTop+ispeed)
    }, 20)
    '''
for i in range(1, 5):  # crawl four result pages
    driver.execute_script(js)
    time.sleep(4)
    html = driver.page_source
    bs = etree.HTML(html)
    # Each course card on the results page
    lis = bs.xpath('/html/body/div[4]/div[2]/div[2]/div[2]/div/div[6]/div[2]/div[1]/div/div/div')
    for link in lis:
        a = link.xpath('./div[2]/div/div/div[1]/a[2]/span/text()')
        if len(a) != 0:
            if a[0] == '國家精品':  # only keep "national quality" courses
                b = link.xpath('./div[2]/div/div/div[1]/a[1]/span/text()')[0]  # course name
                c = link.xpath('./div[2]/div/div/div[2]/a[1]/text()')[0]  # school
                d = link.xpath('./div[2]/div/div/div[2]/a[2]/text()')[0]  # lead teacher
                e = link.xpath('./div[2]/div/div/div[2]/a[2]/text()')[0]  # team (same node as the lead teacher here)
                f = link.xpath('./div[2]/div/div/div[3]/span[2]/text()')[0]  # enrolment count
                try:
                    g = link.xpath('./div[2]/div/div/div[3]/div/span[2]/text()')[0]  # course progress
                except IndexError:
                    g = '已結束'  # course already finished
                try:
                    h = link.xpath('./div[2]/div/div/a/span/text()')[0]  # course brief
                except IndexError:
                    h = '空'  # no description
                mydb = pymysql.connect(
                    host="localhost",
                    user="root",
                    password="password",  # replace with your MySQL password
                    database="stocks",
                    charset='utf8mb4'
                )
                try:
                    with mydb.cursor() as cursor:
                        sql = "INSERT INTO mooc (cCourse, cCollege, cTeacher, cTeam, cCount, cProcess, cBrief) VALUES (%s, %s, %s, %s, %s, %s, %s)"
                        val = (b, c, d, e, f, g, h)
                        cursor.execute(sql, val)
                    mydb.commit()
                    print(b, c, d, e, f, g, h)
                except Exception as err:  # do not shadow the team variable e
                    print(f"Error: {err}")
                finally:
                    mydb.close()

    next_button = driver.find_element(By.XPATH, '//*[@id="j-courseCardListBox"]/div[2]/ul/li[10]')
    next_button.click()
    time.sleep(2)

Gitee assignment link: https://gitee.com/zheng-qijian33/crawl_project/tree/master/作業4

Run results


Reflections

Writing this script showed me that Selenium is powerful for complex page interaction and data scraping, but has to be used with care or it becomes slow. Explicit waits (WebDriverWait) are more reliable than time.sleep: operating on elements only after they have actually loaded makes the script noticeably more stable. Precise XPath or CSS selectors are equally critical; they require understanding the page structure and choosing paths that do not break easily. Finally, exception handling and logging matter a great deal: they prevent crashes and provide debugging information, making the script more robust and maintainable.
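As a sketch of the explicit waits mentioned above applied to the login flow, the snippet below swaps the fixed sleeps for WebDriverWait. It is only an illustration: the login-entry XPath is taken from the code above, while the iframe locator (an id starting with x-URS-iframe, common for NetEase login frames) is an assumption and should be checked against the actual page.

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

def open_login_iframe(driver, timeout=15):
    wait = WebDriverWait(driver, timeout)
    # Wait until the login entry is clickable instead of sleeping a fixed time
    wait.until(EC.element_to_be_clickable((By.XPATH, '//div[@class="_1Y4Ni"]/div'))).click()
    # Wait for the login iframe and switch into it in one step
    # (the id prefix below is an assumption; adjust it to the real page)
    wait.until(EC.frame_to_be_available_and_switch_to_it(
        (By.XPATH, '//iframe[starts-with(@id, "x-URS-iframe")]')))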

Assignment ③:

Requirements:
Get familiar with big-data related cloud services and with using Xshell.
Complete the tasks in the document 華為雲_大資料實時分析處理實驗手冊-Flume日誌採集實驗(部分)v2.docx, i.e. the five tasks below; see the handbook for the detailed steps.

Environment setup:

Task 1: Enable the MapReduce (MRS) service




Hands-on real-time analysis development:

Task 1: Generate test data with a Python script
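To give a sense of what this task involves, here is a minimal sketch of a test-data generator; the output path /tmp/flume_test/test.log, the field layout, and the one-line-per-second rate are assumptions for illustration, not the handbook's exact script.

import os
import random
import time
from datetime import datetime

# Hypothetical output path; the handbook's script may write somewhere else
LOG_PATH = "/tmp/flume_test/test.log"

def make_record():
    # One fake record: timestamp, user id, random amount (assumed layout)
    now = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    user_id = random.randint(1000, 9999)
    amount = round(random.uniform(1, 500), 2)
    return f"{now},{user_id},{amount}"

if __name__ == "__main__":
    os.makedirs(os.path.dirname(LOG_PATH), exist_ok=True)
    with open(LOG_PATH, "a", encoding="utf-8") as fp:
        for _ in range(100):  # write 100 test lines, one per second
            fp.write(make_record() + "\n")
            fp.flush()  # flush so Flume can pick the lines up immediately
            time.sleep(1)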

Task 2: Configure Kafka
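After the Kafka configuration, a quick sanity check like the following can confirm that messages reach a topic. This is not part of the handbook; it assumes the kafka-python package, a topic named flume_test, and a placeholder broker address that should be replaced with the MRS cluster's own.

from kafka import KafkaProducer  # assumes the kafka-python package is installed

# Broker address and topic name are placeholders, not values from the lab
producer = KafkaProducer(bootstrap_servers="broker-host:9092")
producer.send("flume_test", b"hello from the sanity check")
producer.flush()  # make sure the message is sent before exiting
producer.close()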



Task 3: Install the Flume client


Task 4: Configure Flume to collect data


Reflections:

Completing the Huawei Cloud real-time big-data analysis lab gave me a real sense of how flexible and efficient resource provisioning on a cloud platform can be, and walked me through the key steps: enabling the MapReduce (MRS) service, generating data with Python, configuring Kafka, and installing Flume and setting up data collection with it. These tasks made it clear that data collection and transport are at the heart of real-time analysis, and that combining Kafka with Flume yields a high-throughput, reliable data pipeline. The configuration and networking problems I ran into along the way also showed me how complex managing a distributed system is, which lays a solid foundation for real applications later on.
