Scraping comment data from the Eastmoney Guba stock forum

Posted by 莫格利 on 2020-10-12

Eastmoney Guba has anti-scraping measures, so this script avoids detection by throttling its request rate. Crawling this slowly also keeps the load on the site negligible. The method can of course be improved to fetch data faster (a brief sketch of one such improvement follows), but scraping data without authorization is not encouraged: this code is for learning and exchange only.
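For instance, one common improvement (not part of the original script) is to reuse a single requests.Session so TCP connections stay alive between requests, bound the number of retries, and rotate the User-Agent header. A minimal sketch, in which get_html_fast, the USER_AGENTS list, and the header strings are all illustrative assumptions:

import random
import requests

session = requests.Session()  # reusing one session keeps connections alive between requests
USER_AGENTS = [               # illustrative placeholder strings; substitute real browser UAs
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64)',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7)',
]

def get_html_fast(url, retries=3):
    # fetch a page with a randomly chosen User-Agent, retrying a bounded number of times
    for _ in range(retries):
        try:
            r = session.get(url, headers={'User-Agent': random.choice(USER_AGENTS)}, timeout=10)
            r.encoding = 'utf-8'
            r.raise_for_status()
            return r.text
        except requests.RequestException:
            continue
    return None  # caller should handle a failed fetch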
Now, the full script:

import requests
from bs4 import BeautifulSoup
import time
import random
import csv  # standard-library module for writing the output CSV

f = open('comment.csv', 'w', newline="", encoding='utf-8')   # create the output file (utf-8 so Chinese text round-trips)
csv_writer = csv.writer(f)  # build a csv writer on top of the file object
csv_writer.writerow(["index", "commentId", "text", "userId", "date", "likeCount", "fans"])
count = 0

def getHtml(url):  # download the page source
    header = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; LCTE; rv:11.0) like Gecko'}
    try:
        r = requests.get(url, headers=header)
        r.encoding = 'utf-8'
        r.raise_for_status()
        return r.text
    except requests.RequestException:
        return getHtml(url)  # retry on failure; note the retries are unbounded


# region random delays

# fixed delay of x seconds
def delay_x_0_s(fixed_delay_num):
    x = float(fixed_delay_num)
    time.sleep(x)


# random delay of 0 to y seconds
def delay_0_y_s(random_delay_num):
    y = float(random_delay_num)
    time.sleep(random.random() * y)


# fixed delay of x seconds, then a random delay of 0 to y seconds
# the interval includes the lower bound but excludes the upper
def delay_x_y_s(fixed_delay_num, random_delay_num):
    delay_x_0_s(fixed_delay_num)
    delay_0_y_s(random_delay_num)


# random delay of x to y seconds
# the interval includes the lower bound but excludes the upper
def delay_between_x_y_s(start_delay_num, end_delay_num):
    x = float(start_delay_num)
    y = float(end_delay_num)
    delay_x_0_s(x)
    delay_0_y_s(y - x)

for page in range(100, 1001):
    delay_between_x_y_s(2, 5)
    url = "http://guba.eastmoney.com/list,zssh000001,f_" + str(page) + ".html"
    print(url)
    html = getHtml(url)
    soup = BeautifulSoup(html, "html.parser")
    contain = soup.find_all("div", {"class": "articleh"})   # each "articleh" div holds one comment row, so contain is a list
    for i in contain[:]:   # iterate over the comment rows
        try:
            delay_between_x_y_s(2, 5)
            content = i.find("span", {"class": "l3 a3"}).find("a")    # the <a> tag inside the row's third span; it carries href and title attributes
            print(content)
            contentUrl = "http://guba.eastmoney.com" + content["href"]  # content["href"] is the relative URL of the comment's detail page, so prepend the site prefix to get the full URL
            print("contentUrl: " + contentUrl + "\n")
            commentId = content["href"][-14:-5]   # the href follows a fixed pattern: characters -14 to -5 are the comment id
            print("commentId: " + commentId + "\n")
            text = content.attrs["title"]  # the comment text (its title)
            print("text: " + text + "\n")
            userUrl = i.find("span", {"class": "l4 a4"}).find("a").attrs["href"]  # the author's profile URL, extracted the same way
            if userUrl == "/list,cjpl.html":  # the author link points at a board page, not a user profile
                continue
            print("userUrl: " + userUrl + "\n")
            userId = userUrl[23:]  # the user id is the tail of the profile URL
            if userId == "":    # skip discussion threads
                continue
            if userId in ("3006113720930996", "/3006113720930996", "7428111481466798",
                          "6712111507146464", "6255325874333310"):  # skip Guba house accounts
                continue
            print("userId: " + userId + "\n")
            delay_between_x_y_s(2, 5)
            commentHtml = getHtml(contentUrl)   # fetch the source of the comment's detail page
            soup = BeautifulSoup(commentHtml, "html.parser")
            date = soup.find("div", {"class": "zwfbtime"}).text[4:14]   # the comment's post date (characters 4 to 14 of the timestamp text)
            print("date: " + date + "\n")
            likeCount = int(soup.find("div", {"data-like_count": True}).attrs['data-like_count'])  # the like count, cast to int (HTML attribute values come back as strings)
            print("likeCount: ", likeCount, "\n")
            delay_between_x_y_s(2, 5)
            userHtml = getHtml(userUrl)  # fetch the source of the user's profile page
            soup = BeautifulSoup(userHtml, "html.parser")
            fans = int(soup.find("a", {"id": "tafansa"}).find("span").text)  # the user's follower count
            print("fans: ", fans, "\n")
            count = count + 1
            csv_writer.writerow([count, commentId, text, userId, date, likeCount, fans])
        except Exception:
            print('Hit an exception; moving on to the next post')
            continue

f.close()
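After the run, comment.csv can be loaded back for a quick sanity check. A minimal sketch using the standard csv module, assuming the column layout written above (the top-10 printout is just an example):

import csv

with open('comment.csv', newline='', encoding='utf-8') as f:
    rows = list(csv.DictReader(f))  # one dict per comment, keyed by the header row

# likeCount was written as a number but reads back as a string, so cast before sorting
rows.sort(key=lambda r: int(r['likeCount']), reverse=True)
for r in rows[:10]:
    print(r['date'], r['likeCount'], r['text'])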
