Scraping Weibo Posts and Comments on the Film The Battle at Lake Changjin (長津湖)

Posted by 專注的阿熊 on 2021-10-08

# Changjin Lake film data from Weibo
# @Time: 20211006
# @Author: heheyang

import json
import pprint
import re

import pandas as pd
import requests

def comments_singlePage_crawl(url, headers, comments_info, post_id):
    """
    Crawl a single page of comments.
    :param url: comments API url to crawl
    :param headers: request headers
    :param comments_info: dict collecting comment id/date/text
    :param post_id: id of the post the comments belong to
    """
    # Fetch the page (headers must be passed as a keyword argument,
    # otherwise requests treats them as query parameters)
    html = requests.get(url, headers=headers).text
    # Parse the JSON response
    html_dict = json.loads(html)
    comments_data = html_dict["data"]["data"]
    for comment in comments_data:
        comments_info["id"].append(post_id)
        comments_info["date"].append(comment["created_at"])
        # Strip <span>...</span> and <a>...</a> tags, keeping the plain text
        text = re.sub("<span(.*?)</span>", "", comment["text"])
        text = re.sub("<a(.*?)</a>", "", text)
        comments_info["text"].append(text)
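# A minimal sketch (assumed from how the parser above indexes the JSON, not
# taken from the original post) of the comments payload this function expects:
# {
#     "data": {
#         "data": [
#             {"created_at": "Wed Oct 06 12:00:00 +0800 2021",
#              "text": "great film<span class=\"url-icon\">...</span>"}
#         ]
#     }
# }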

def weibo_bowen_singlePage_crawl(url, headers, mblog_info, comments_info):
    """
    Crawl a single page of posts and their comments.
    :param url: page url to crawl
    :param headers: request headers
    :param mblog_info: dict collecting post id/date/text
    :param comments_info: dict collecting comment id/date/text
    """
    # Fetch the page
    html = requests.get(url, headers=headers).text
    # Parse the JSON response
    html_dict = json.loads(html)
    users = html_dict["data"]["cards"]
    # Store each post
    for user in users:
        mblog = user["mblog"]
        mblog_info["id"].append(mblog["id"])
        mblog_info["date"].append(mblog["created_at"])
        # Strip <span>...</span> and <a>...</a> tags, keeping the plain text
        text = re.sub("<span(.*?)</span>", "", mblog["text"])
        text = re.sub("<a(.*?)</a>", "", text)
        mblog_info["text"].append(text)
        # Build the comments url for this post (the API base url in front of
        # this query string was lost from the original post and must be re-added)
        comments_url = "%s&mid=%s&max_id_type=" % (mblog["id"], mblog["id"])
        # Store the comments page by page until the API stops returning data
        i = 0
        while True:
            try:
                comments_url_ = comments_url + str(i)
                comments_singlePage_crawl(comments_url_, headers, comments_info, mblog["id"])
                i += 1
            except Exception:
                # No more comment pages (or an unexpected response): stop
                break
        pprint.pprint(comments_info)
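# A sketch (again assumed from the indexing above, not from the original post)
# of the post-list payload this function expects:
# {"data": {"cards": [{"mblog": {"id": "...", "created_at": "...", "text": "..."}}]}}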

def weibo_bowen_data_crawl(url, headers):
    """
    Crawl post and comment data.
    :param url: base url of the site to crawl, without the page number
    :param headers: request headers
    :return: post info dict mblog_info and comment info dict comments_info
    """
    # Dict collecting post info
    mblog_info = {
        "id": [],
        "date": [],
        "text": []
    }
    # Dict collecting comment info
    comments_info = {
        "id": [],
        "date": [],
        "text": []
    }
    # Crawl pages 1-9 of posts (range(1, 10) stops before 10)
    for i in range(1, 10):
        url_ = url + str(i)
        # Append this page's posts and comments
        weibo_bowen_singlePage_crawl(url_, headers, mblog_info, comments_info)
    return mblog_info, comments_info

def bowen_data_store(mblog_info, comments_info):
    """
    Arrange the data and save it to Excel.
    :param mblog_info: post info dict
    :param comments_info: comment info dict
    :return: None; writes bowen_data.xlsx and bowen_comments_data.xlsx
    """
    # Sheet 1: posts
    data = pd.DataFrame(mblog_info)
    data["num"] = data.index + 1
    data["keyword"] = ["Film Changjin Lake"] * len(data["num"])
    df = data.loc[:, ["num", "keyword", "id", "date", "text"]]
    df.to_excel("bowen_data.xlsx", sheet_name="Sheet1")
    # Sheet 2: comments
    comments_data = pd.DataFrame(comments_info)
    comments_data["num"] = comments_data.index + 1
    df_c = comments_data.loc[:, ["num", "id", "date", "text"]]
    df_c.to_excel("bowen_comments_data.xlsx", sheet_name="Sheet1")
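# Note: writing .xlsx files with DataFrame.to_excel requires an Excel engine
# such as openpyxl to be installed (pip install openpyxl).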

if __name__ == '__main__':
    # Weibo url for The Battle at Lake Changjin (the API base url in front of
    # this query string was lost from the original post and must be re-added)
    url = "%3D1%26q%3D%E9%95%BF%E6%B4%A5%E6%B9%96&type=uid&value=7377392724&containerid=1076037377392724&page="
    # Request headers
    headers = {
        "cookie": "add your own",
        "user-agent": "add your own"
    }
    mblog_info, comments_info = weibo_bowen_data_crawl(url, headers)
    bowen_data_store(mblog_info, comments_info)
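
As a quick sanity check of the tag-stripping regexes used above, here is a minimal, self-contained snippet (the sample string is invented for illustration):

import re

sample = 'Great film<span class="url-icon"><img src="x"></span> see <a href="/n/user">@user</a>'
text = re.sub("<span(.*?)</span>", "", sample)
text = re.sub("<a(.*?)</a>", "", text)
print(text)  # -> "Great film see " (both tags and their contents removed)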

