讀取csv檔案,列印列名稱:
import pandas as pd

# Load the post data into a DataFrame.
# Alternative input file used previously:
# data = pd.read_csv("guba_fc_result_20230413.csv")
data = pd.read_csv("guba_all_newtext_20230413.csv")

# Bare expression: shows the column names when run as the last line of a
# notebook cell (use print(data.columns) in a plain script).
data.columns
儲存檔案:
# Write the DataFrame to a CSV file without the row-index column.
data.to_csv(path_or_buf="guba_all_cutwords_20230413.csv", index=False)
統計:
# Count how many rows carry each distinct ticker_name value.
data.loc[:, 'ticker_name'].value_counts()
字串長度過濾:
# Keep rows whose 'matches' column is not the literal string '[]'.
filtered_df = data[data['matches'] != '[]']

# Of those, keep rows whose 'text' is longer than 100 characters.
long_text = filtered_df[filtered_df['text'].str.len() > 100]
畫字串長度直方圖:
import numpy as np
from matplotlib import pyplot as plt

# Character length of every text in filtered_df.
len_text = [len(text) for text in filtered_df['text']]
# Alternative columns to measure:
# len_text = [len(text) for text in data['content']]
# len_text = [len(text) for text in data['rateContent']]

# Histogram of the lengths, 20 bins.
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(len_text, bins=20)
plt.show()
按股票名稱與日期過濾:
# Keep only the rows of the selected ticker name(s).
v_data = data[data['ticker_name'].isin(['邁瑞醫療'])]

# Narrow further to the selected post date(s).
# NOTE(review): assumes post_date is stored as a 'YYYY-MM-DD' string - confirm.
v_data = v_data[v_data['post_date'].isin(['2023-03-01'])]
去除nan值:
# Drop, in place, every row that has a missing value in any column.
data.dropna(axis=0, how='any', inplace=True)
合併同名稱的資料:
# Merge all rows of the same stock into one row: group by ticker_name and
# join each group's seg strings with single spaces.
# NOTE(review): str.join raises TypeError on non-string seg values (e.g. NaN),
# so missing values must be dropped beforehand.
data = data.groupby('ticker_name')['seg'].apply(' '.join).reset_index()
data
按詞數過濾資料:
# Number of whitespace-separated words in each seg value. Kept as a temporary
# Series so no helper column has to be added to and dropped from data;
# a missing seg yields NaN here and is filtered out below.
word_count = data['seg'].str.split().str.len()

# Keep only the rows with more than 200 words.
data = data[word_count > 200]
data
統計分詞詞數:
# Total number of whitespace-separated words in seg for each ticker_name.
word_counts = (
    data.groupby('ticker_name')['seg']
    .apply(lambda x: sum(len(text.split()) for text in x))
    .reset_index()
)

# Print the result.
print(word_counts)
對分詞結果分組,儲存新的行:
import math


def split_seg(seg, chunk_size):
    """Split a whitespace-separated word string into consecutive chunks.

    Args:
        seg: String of words separated by whitespace.
        chunk_size: Maximum number of words per chunk.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words joined by
        single spaces. The list is empty when ``seg`` contains no words.
    """
    words = seg.split()
    num_chunks = math.ceil(len(words) / chunk_size)
    return [
        ' '.join(words[i * chunk_size:(i + 1) * chunk_size])
        for i in range(num_chunks)
    ]


# Rows with more than this many words are split and renamed "<ticker>_<i>".
SPLIT_THRESHOLD = 1000
# Number of words per chunk when a row is split.
# NOTE(review): the threshold is lower than the chunk size, so a row with
# 1001-3000 words becomes a single chunk named "<ticker>_0" - confirm intended.
CHUNK_SIZE = 3000

# Split the seg column into new rows.
new_rows = []
for ticker_name, seg in zip(data['ticker_name'], data['seg']):
    if len(seg.split()) > SPLIT_THRESHOLD:
        new_rows.extend(
            {'ticker_name': f'{ticker_name}_{i}', 'seg': chunk}
            for i, chunk in enumerate(split_seg(seg, CHUNK_SIZE))
        )
    else:
        new_rows.append({'ticker_name': ticker_name, 'seg': seg})

# Build the new DataFrame; explicit columns keep 'ticker_name' and 'seg'
# present even when there are no rows.
new_data = pd.DataFrame(new_rows, columns=['ticker_name', 'seg'])
new_data
對分組分詞使用tfidf演算法:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


def tokenizer(text):
    """Split already-segmented text into tokens on whitespace."""
    return text.split()


# Compute TF-IDF values over all seg documents.
# token_pattern=None: the pattern is unused when a custom tokenizer is given,
# and passing None silences scikit-learn's warning about that.
# NOTE(review): stop_words='english' only removes English stop words - confirm
# this is what is wanted for this corpus.
tfidf = TfidfVectorizer(
    tokenizer=tokenizer,
    token_pattern=None,
    stop_words='english',
)
tfidf_matrix = tfidf.fit_transform(new_data['seg'])

# Vocabulary terms, aligned with the matrix columns.
# get_feature_names() was removed in scikit-learn 1.2.
feature_names = tfidf.get_feature_names_out()

# Walk over every document group.
for ticker_name, group in new_data.groupby('ticker_name'):
    # Rows of the TF-IDF matrix belonging to this group. group.index holds
    # index labels, used here as row positions, so new_data must carry the
    # default 0..n-1 index.
    tfidf_scores = tfidf_matrix[group.index, :]
    # Sum each term's TF-IDF value over the group's rows.
    word_scores = list(zip(feature_names, tfidf_scores.sum(axis=0).tolist()[0]))
    # Sort by TF-IDF value, highest first.
    word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)
    # Print the 10 terms with the highest TF-IDF value.
    print(ticker_name)
    for word, score in word_scores[:10]:
        print(word, score)
    print()