9. Web Scraping Examples

Published by WangYao_BigData on 2024-12-06

Case 1: Scraping Bookschina (中圖網) TOP 1000 data

Approach:

  • Use requests and lxml to fetch the page's HTML;
  • Parse the HTML and extract the text under the relevant tags;
  • Scrape a single page first, then use a loop to scrape multiple pages.

Single-page scraping

import requests
from lxml import etree
import pandas as pd

# Fetch the HTML
url = 'https://www.bookschina.com/24hour/1_0_1/'
headers = {
    'cookie': 'adtanchu=1; indexCache=yes; ASP.NET_SessionId=yqg0xeq1lqokp3x0d0ypq2lv; Hm_lvt_6993f0ad5f90f4e1a0e6b2d471ca113a=1733368583; HMACCOUNT=539CFA4C696961D6; user_sign=59baa7d23c894dfe9ddfb64184ca96cc; BookUser=1%7cff09a25d-60b7-46a3-beb0-7ec7de9a5fd8%7c1%7c0%7c638715864084629584%7c20180722%7cfa4041993af56a1a; UserSign=069f073dff21b10b; Hm_lpvt_6993f0ad5f90f4e1a0e6b2d471ca113a=1733371462',
    'host': 'www.bookschina.com',
    'referer': 'https://www.bookschina.com/24hour/1_0_1/',
    'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36'
}
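# Note: the cookie above is a captured session value and will expire; if
# requests start failing, replace it with a fresh cookie from your own browser.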
response = requests.get(url=url, headers=headers)
response.encoding = 'gb2312'
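# If the hard-coded encoding ever proves wrong, response.apparent_encoding
# gives a detected guess that can be assigned here instead.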
html_str = response.text
html = etree.HTML(html_str)

# Create empty lists to hold the scraped data
name_list = []
author_list = []
publisher_list = []
comment_list = []
score_list = []
sellPrice_list = []
discount_list = []
priceTit_list = []

# Locate the data via its tag path in the HTML
li_list = html.xpath("//div[@class='bookList']/ul/li")
for li in li_list:
    name = str(li.xpath("./div[@class='infor']/h2/a/text()")[0])
    name_list.append(name)

    author = str(li.xpath("./div[@class='infor']/div[@class='author']/a/text()")[0])
    author_list.append(author)

    publisher = str(li.xpath("./div[@class='infor']/div[@class='publisher']/a/text()")[0])
    publisher_list.append(publisher)

    comment = str(li.xpath("./div[@class='infor']/div[@class='startWrap']/a/text()")[0])
    comment_list.append(comment.replace('條評論', ''))

    # Sum the star rating: a full star ('one') counts 1, a half star ('half')
    # counts 0.5; other <i> elements (empty stars) contribute nothing
    startWrap_list = li.xpath("./div[@class='infor']/div[@class='startWrap']/i")
    score = 0
    for star in startWrap_list:
        if star.attrib.get('class') == 'one':
            score += 1
        elif star.attrib.get('class') == 'half':
            score += 0.5
    score_list.append(score)

    sellPrice = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/span[@class='sellPrice']/text()")[0])
    sellPrice_list.append(sellPrice.replace('¥', ''))

    discount = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/span[@class='discount']/text()")[0])
    discount_list.append(discount.replace('(', '').replace(')', '').replace('折', ''))

    priceTit = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/del/text()")[0])
    priceTit_list.append(priceTit.replace('¥', ''))

dict1 = {
    '書名': name_list,
    '作者': author_list,
    '出版社': publisher_list,
    '評論數': comment_list,
    '評分': score_list,
    '售價': sellPrice_list,
    '折扣': discount_list,
    '原價': priceTit_list
}
# Use a raw string for the Windows path so '\d' is not treated as an escape
pd.DataFrame(dict1).to_csv(r'D:\desk\中圖網page1.csv', index=False)
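
Every field above is appended as a string, so the CSV columns come out as text. As a small follow-up sketch (the path and the Chinese column names are simply the ones used above), the numeric-looking columns can be coerced with pandas:

import pandas as pd

# Read back the CSV written above and coerce the numeric-looking columns;
# errors='coerce' turns any stray non-numeric strings into NaN
df = pd.read_csv(r'D:\desk\中圖網page1.csv')
for col in ['評論數', '售價', '折扣', '原價']:
    df[col] = pd.to_numeric(df[col], errors='coerce')
print(df.dtypes)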

Multi-page scraping

import requests
import pandas as pd
from lxml import etree
from datetime import datetime

# Create empty lists to hold the scraped data
name_list = []
author_list = []
publisher_list = []
comment_list = []
score_list = []
sellPrice_list = []
discount_list = []
priceTit_list = []
activeIcon_list = []

if __name__ == '__main__':
    # Record the scrape time; used to name the output CSV file
    time_now = datetime.strftime(datetime.now(), '%Y-%m-%d %H-%M-%S')

    for i in range(1, 35):
        print(f'Fetching page {i}'.center(50, '-'))

        # Fetch the HTML
        url = f'https://www.bookschina.com/24hour/1_0_{i}/'
        headers = {
            'cookie': 'adtanchu=1; indexCache=yes; ASP.NET_SessionId=yqg0xeq1lqokp3x0d0ypq2lv; Hm_lvt_6993f0ad5f90f4e1a0e6b2d471ca113a=1733368583; HMACCOUNT=539CFA4C696961D6; user_sign=59baa7d23c894dfe9ddfb64184ca96cc; BookUser=1%7cff09a25d-60b7-46a3-beb0-7ec7de9a5fd8%7c1%7c0%7c638715864084629584%7c20180722%7cfa4041993af56a1a; UserSign=069f073dff21b10b; Hm_lpvt_6993f0ad5f90f4e1a0e6b2d471ca113a=1733371462',
            'host': 'www.bookschina.com',
            'referer': f'https://www.bookschina.com/24hour/1_0_{i}/',
            'user-agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Mobile Safari/537.36'
        }
        response = requests.get(url=url, headers=headers)
        response.encoding = 'gb2312'  # 'ansi' is not a standard Python codec; the site declares gb2312
        html_str = response.text
        html = etree.HTML(html_str)

        # Locate the data's tag paths in the HTML and append each extracted text field to the lists created above
        li_list = html.xpath("//div[@class='bookList']/ul/li")
        for li in li_list:
            name = str(li.xpath("./div[@class='infor']/h2/a/text()")[0])
            name_list.append(name)

            # Some books have no author info; fall back to a placeholder
            # instead of indexing into an empty xpath result
            author = li.xpath("./div[@class='infor']/div[@class='author']/a/text()")
            author_list.append(', '.join(author) if author else '無作者資訊')

            publisher = li.xpath("./div[@class='infor']/div[@class='publisher']/a/text()")
            publisher_list.append(', '.join(publisher) if publisher else '無出版資訊')

            comment = str(li.xpath("./div[@class='infor']/div[@class='startWrap']/a/text()")[0])
            comment_list.append(comment.replace('條評論', ''))

            # Sum the star rating as in the single-page script; the loop
            # variable is named `star` so it does not shadow the page counter `i`
            startWrap_list = li.xpath("./div[@class='infor']/div[@class='startWrap']/i")
            score = 0
            for star in startWrap_list:
                if star.attrib.get('class') == 'one':
                    score += 1
                elif star.attrib.get('class') == 'half':
                    score += 0.5
            score_list.append(score)

            sellPrice = str(
                li.xpath("./div[@class='infor']/div[@class='priceWrap']/span[@class='sellPrice']/text()")[0])
            sellPrice_list.append(sellPrice.replace('¥', ''))

            discount = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/span[@class='discount']/text()")[0])
            discount_list.append(discount.replace('(', '').replace(')', '').replace('折', ''))

            priceTit = str(li.xpath("./div[@class='infor']/div[@class='priceWrap']/del/text()")[0])
            priceTit_list.append(priceTit.replace('¥', ''))


            # Not every book has an extra-discount ("折上折") badge
            try:
                activeIcon = str(li.xpath("./div[@class='infor']/div[@class='activeIcon']/a/text()")[0])
                activeIcon_list.append(activeIcon)
            except IndexError:
                activeIcon_list.append('無折上折')

    dict1 = {
        '書名': name_list,
        '作者': author_list,
        '出版社': publisher_list,
        '評論數': comment_list,
        '評分': score_list,
        '售價': sellPrice_list,
        '折扣': discount_list,
        '原價': priceTit_list,
        '折上折': activeIcon_list
    }
    pd.DataFrame(dict1).to_csv(rf'D:\desk\中圖網page1-34({time_now}).csv', index=False)
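
The loop above fires 34 requests back to back with no pause, which is easy for a server to flag. A minimal, hedged tweak (the 1-3 second range is arbitrary) is to sleep briefly at the end of each pass through the page loop:

import time
import random

# At the end of each pass through `for i in range(1, 35):`
time.sleep(random.uniform(1, 3))  # random 1-3 s pause between page requests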

Case 2: Scraping Maoyan (貓眼電影) TOP 100 data

Overall approach:

  • Use selenium's webdriver to open a browser and fetch the page HTML;
  • Parse the HTML and extract the text under the relevant tags;
  • Scrape a single page first, then use a loop to scrape multiple pages.

Single-page scraping

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By

# Create empty lists up front for the fields to be scraped
movie_rank_list = []
movie_name_list = []
movie_actor_list = []
movie_time_list = []
movie_score_list = []

# Open the browser and navigate to the target URL
browser = webdriver.Chrome()
browser.get("https://www.maoyan.com/board/4")
time.sleep(5)

# Get the list of dd elements holding the movie info (Selenium 4 locator API)
dd_list = browser.find_elements(By.XPATH, '//dl[@class="board-wrapper"]/dd')

# Iterate over the list to extract each field
for i in dd_list:
    movie_rank = i.find_element(By.XPATH, './i').text
    movie_rank_list.append(movie_rank)

    movie_name = i.find_element(By.XPATH, './div/div/div[1]/p[1]/a').text
    movie_name_list.append(movie_name)

    movie_actor = i.find_element(By.XPATH, './div/div/div[1]/p[2]').text
    movie_actor_list.append(movie_actor)

    movie_time = i.find_element(By.XPATH, './div/div/div[1]/p[3]').text
    movie_time_list.append(movie_time)

    # The score is split across two <i> tags (integer and fraction parts)
    movie_score1 = str(i.find_element(By.XPATH, './div/div/div[2]/p/i[1]').text)
    movie_score2 = str(i.find_element(By.XPATH, './div/div/div[2]/p/i[2]').text)
    movie_score = movie_score1 + movie_score2
    movie_score_list.append(movie_score)

# Put the scraped lists into a dict for conversion to a DataFrame
data_dict = {
    '排名': movie_rank_list,
    '電影名稱': movie_name_list,
    '主演': movie_actor_list,
    '上映時間': movie_time_list,
    '評分': movie_score_list
}

# Convert the dict to a DataFrame and export it as a CSV file
pd.DataFrame(data_dict).to_csv(r'D:\desk\貓眼top10.csv', index=False)

# Close the browser
browser.quit()
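
The fixed time.sleep(5) is a blunt way to wait for the page to render. A common alternative, sketched here under the assumption that the board-wrapper markup is as above, is selenium's explicit wait, which blocks only until the dd elements actually appear:

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

browser = webdriver.Chrome()
browser.get("https://www.maoyan.com/board/4")
# Wait up to 10 seconds for the movie list to be present, then proceed
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.XPATH, '//dl[@class="board-wrapper"]/dd'))
)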

Multi-page scraping

import time
import pandas as pd
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.common.exceptions import NoSuchElementException

# Create empty lists up front for the fields to be scraped
movie_rank_list = []
movie_name_list = []
movie_actor_list = []
movie_time_list = []
movie_score_list = []

# Build a function that scrapes one page; the multi-page loop below calls it once per page
def get_one_page(browser):
    # Get the list of dd elements holding the movie info
    dd_list = browser.find_elements(By.XPATH, '//dl[@class="board-wrapper"]/dd')

    # Iterate over the list to extract each field
    for i in dd_list:
        movie_rank = i.find_element(By.XPATH, './i').text
        movie_rank_list.append(movie_rank)

        movie_name = i.find_element(By.XPATH, './div/div/div[1]/p[1]/a').text
        movie_name_list.append(movie_name)

        movie_actor = i.find_element(By.XPATH, './div/div/div[1]/p[2]').text
        movie_actor_list.append(movie_actor)

        movie_time = i.find_element(By.XPATH, './div/div/div[1]/p[3]').text
        movie_time_list.append(movie_time)

        # The score is split across two <i> tags (integer and fraction parts)
        movie_score1 = str(i.find_element(By.XPATH, './div/div/div[2]/p/i[1]').text)
        movie_score2 = str(i.find_element(By.XPATH, './div/div/div[2]/p/i[2]').text)
        movie_score = movie_score1 + movie_score2
        movie_score_list.append(movie_score)


if __name__ == '__main__':
    # Open the browser and navigate to the target URL
    browser = webdriver.Chrome()
    browser.get("https://www.maoyan.com/board/4")
    time.sleep(3)

    # Click through the pages by pressing 'next page'; stop when the button is
    # missing or no longer reads '下一頁', meaning the last page has been scraped
    while True:
        get_one_page(browser)
        try:
            next_page = browser.find_element(By.XPATH, '//*[@id="app"]/div/div/div[2]/ul/li[8]/a')
        except NoSuchElementException:
            print('Scraping finished!')
            break
        if next_page.text != '下一頁':
            print('Scraping finished!')
            break
        next_page.click()
        time.sleep(3)

    # Put the scraped lists into a dict for conversion to a DataFrame
    data_dict = {
        '排名': movie_rank_list,
        '電影名稱': movie_name_list,
        '主演': movie_actor_list,
        '上映時間': movie_time_list,
        '評分': movie_score_list
    }

    # Convert the dict to a DataFrame and export it as a CSV file
    pd.DataFrame(data_dict).to_csv(r'D:\desk\貓眼top100.csv', index=False)

    # Close the browser
    browser.quit()
