Scraping Weibo Posts and Comments on the Film The Battle at Lake Changjin (長津湖)

Posted by 專注的阿熊 on 2021-10-08

# Changjin Lake film data from Weibo
# @Time: 20211006
# @Author: heheyang

import json
import pprint
import re

import pandas as pd
import requests

def comments_singlePage_crawl(url, headers, comments_info, post_id):
    """
    Crawl a single page of comments.
    :param url: comments API url to crawl
    :param headers: request headers
    :param comments_info: dict collecting comment id/date/text
    :param post_id: id of the post the comments belong to
    """
    # Fetch the page (headers must be passed as a keyword argument,
    # otherwise requests treats them as query parameters)
    html = requests.get(url, headers=headers).text
    # Parse the JSON response
    html_dict = json.loads(html)
    comments_data = html_dict["data"]["data"]
    for comment in comments_data:
        comments_info["id"].append(post_id)
        comments_info["date"].append(comment["created_at"])
        # Strip <span>...</span> and <a>...</a> tags, keeping the plain text
        text = re.sub("<span(.*?)</span>", "", comment["text"])
        text = re.sub("<a(.*?)</a>", "", text)
        comments_info["text"].append(text)
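# A minimal sketch (assumed from how the parser above indexes the JSON, not
# taken from the original post) of the comments payload this function expects:
# {
#     "data": {
#         "data": [
#             {"created_at": "Wed Oct 06 12:00:00 +0800 2021",
#              "text": "great film<span class=\"url-icon\">...</span>"}
#         ]
#     }
# }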

def weibo_bowen_singlePage_crawl(url, headers, mblog_info, comments_info):
    """
    Crawl a single page of posts and their comments.
    :param url: page url to crawl
    :param headers: request headers
    :param mblog_info: dict collecting post id/date/text
    :param comments_info: dict collecting comment id/date/text
    """
    # Fetch the page
    html = requests.get(url, headers=headers).text
    # Parse the JSON response
    html_dict = json.loads(html)
    users = html_dict["data"]["cards"]
    # Store each post
    for user in users:
        mblog = user["mblog"]
        mblog_info["id"].append(mblog["id"])
        mblog_info["date"].append(mblog["created_at"])
        # Strip <span>...</span> and <a>...</a> tags, keeping the plain text
        text = re.sub("<span(.*?)</span>", "", mblog["text"])
        text = re.sub("<a(.*?)</a>", "", text)
        mblog_info["text"].append(text)
        # Build the comments url for this post (the API base url in front of
        # this query string was lost from the original post and must be re-added)
        comments_url = "%s&mid=%s&max_id_type=" % (mblog["id"], mblog["id"])
        # Store the comments page by page until the API stops returning data
        i = 0
        while True:
            try:
                comments_url_ = comments_url + str(i)
                comments_singlePage_crawl(comments_url_, headers, comments_info, mblog["id"])
                i += 1
            except Exception:
                # No more comment pages (or an unexpected response): stop
                break
        pprint.pprint(comments_info)
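# A sketch (again assumed from the indexing above, not from the original post)
# of the post-list payload this function expects:
# {"data": {"cards": [{"mblog": {"id": "...", "created_at": "...", "text": "..."}}]}}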

def weibo_bowen_data_crawl(url, headers):
    """
    Crawl post and comment data.
    :param url: base url of the site to crawl, without the page number
    :param headers: request headers
    :return: post info dict mblog_info and comment info dict comments_info
    """
    # Dict collecting post info
    mblog_info = {
        "id": [],
        "date": [],
        "text": []
    }
    # Dict collecting comment info
    comments_info = {
        "id": [],
        "date": [],
        "text": []
    }
    # Crawl pages 1-9 of posts (range(1, 10) stops before 10)
    for i in range(1, 10):
        url_ = url + str(i)
        # Append this page's posts and comments
        weibo_bowen_singlePage_crawl(url_, headers, mblog_info, comments_info)
    return mblog_info, comments_info

def bowen_data_store(mblog_info, comments_info):
    """
    Arrange the data and save it to Excel.
    :param mblog_info: post info dict
    :param comments_info: comment info dict
    :return: None; writes bowen_data.xlsx and bowen_comments_data.xlsx
    """
    # Sheet 1: posts
    data = pd.DataFrame(mblog_info)
    data["num"] = data.index + 1
    data["keyword"] = ["Film Changjin Lake"] * len(data["num"])
    df = data.loc[:, ["num", "keyword", "id", "date", "text"]]
    df.to_excel("bowen_data.xlsx", sheet_name="Sheet1")
    # Sheet 2: comments
    comments_data = pd.DataFrame(comments_info)
    comments_data["num"] = comments_data.index + 1
    df_c = comments_data.loc[:, ["num", "id", "date", "text"]]
    df_c.to_excel("bowen_comments_data.xlsx", sheet_name="Sheet1")
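# Note: writing .xlsx files with DataFrame.to_excel requires an Excel engine
# such as openpyxl to be installed (pip install openpyxl).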

if __name__ == '__main__':
    # Weibo url for The Battle at Lake Changjin (the API base url in front of
    # this query string was lost from the original post and must be re-added)
    url = "%3D1%26q%3D%E9%95%BF%E6%B4%A5%E6%B9%96&type=uid&value=7377392724&containerid=1076037377392724&page="
    # Request headers
    headers = {
        "cookie": "add your own",
        "user-agent": "add your own"
    }
    mblog_info, comments_info = weibo_bowen_data_crawl(url, headers)
    bowen_data_store(mblog_info, comments_info)
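
As a quick sanity check of the tag-stripping regexes used above, here is a minimal, self-contained snippet (the sample string is invented for illustration):

import re

sample = 'Great film<span class="url-icon"><img src="x"></span> see <a href="/n/user">@user</a>'
text = re.sub("<span(.*?)</span>", "", sample)
text = re.sub("<a(.*?)</a>", "", text)
print(text)  # -> "Great film see " (both tags and their contents removed)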

