pandas 資料處理 一些常用操作

高顏值的殺生丸發表於2023-05-15

 

讀取csv檔案,列印列名稱:

import pandas as pd

# Alternative input file, left commented out so it can be swapped in.
# data = pd.read_csv("guba_fc_result_20230413.csv")

# Load the CSV file into a DataFrame.
data = pd.read_csv("guba_all_newtext_20230413.csv")
# Bare expression: the column labels are only displayed when this is the
# last line of an interactive / notebook cell.
data.columns

  

儲存檔案:

# Write the DataFrame to a CSV file; index=False leaves the row index out.
data.to_csv("guba_all_cutwords_20230413.csv",index=False)

  

統計:

# Count how many rows carry each distinct ticker_name value
# (bare expression: displayed only in an interactive / notebook cell).
data['ticker_name'].value_counts()

  

字串長度過濾:

# Keep rows whose 'matches' value is anything other than the literal '[]'.
has_matches = data['matches'].ne('[]')
filtered_df = data[has_matches]

# Of those, keep rows whose 'text' length exceeds 100.
is_long = filtered_df['text'].str.len().gt(100)
long_text = filtered_df[is_long]

  

畫字串長度直方圖:

import numpy as np
from matplotlib import pyplot as plt

# Length of every entry in the 'text' column, as a plain Python list.
# To plot another column, use data['content'] or data['rateContent'] instead.
len_text = filtered_df['text'].map(len).tolist()

# Draw a 20-bin histogram of the lengths on a 20x8 inch, 80 dpi figure.
plt.figure(figsize=(20, 8), dpi=80)
plt.hist(len_text, bins=20)
plt.show()

  

按字串名稱過濾:

# Select rows for one ticker name on one posting date.
ticker_match = data['ticker_name'].isin(['邁瑞醫療'])
date_match = data['post_date'].isin(['2023-03-01'])
v_data = data[ticker_match & date_match]

  

去除nan值:

# Drop every row that contains at least one missing value, modifying
# `data` in place (no new DataFrame is returned).
data.dropna(inplace=True)

  

合併同名稱的資料:

# Merge all rows that belong to the same stock: group by ticker_name and
# concatenate each group's seg strings, separated by single spaces.
seg_by_ticker = data.groupby('ticker_name')['seg']
data = seg_by_ticker.apply(' '.join).reset_index()
data

  

按分詞詞數過濾資料(保留詞數超過200的行):

# Number of whitespace-separated tokens in each seg string.
data['word_count'] = data['seg'].str.split().map(len)

# Keep only rows with more than 200 tokens, then discard the helper column.
data = data[data['word_count'] > 200].drop(columns='word_count')
data

  

統計分詞詞數:

# For every ticker_name, total the whitespace-separated tokens across all of
# its seg values.
word_counts = (
    data.groupby('ticker_name')['seg']
    .apply(lambda segs: sum(len(piece.split()) for piece in segs))
    .reset_index()
)

# Show the per-ticker totals.
print(word_counts)

  

對分詞結果分組,儲存新的行:

import math

def split_seg(seg, chunk_size):
    """Split a whitespace-delimited string into chunks of ``chunk_size`` words.

    Args:
        seg: Text whose tokens are separated by whitespace.
        chunk_size: Maximum number of words placed in each chunk.

    Returns:
        A list of strings, each holding up to ``chunk_size`` words re-joined
        with single spaces; the final chunk may be shorter. A ``seg`` with
        no words yields an empty list.
    """
    tokens = seg.split()
    # Round up so a trailing partial chunk is still emitted.
    total_chunks = math.ceil(len(tokens) / chunk_size)
    return [
        ' '.join(tokens[n * chunk_size:(n + 1) * chunk_size])
        for n in range(total_chunks)
    ]

# Split long seg texts into several rows, one per chunk.
# NOTE(review): the threshold is 1000 words but the chunk size is 3000, so a
# text of 1001-3000 words becomes a single chunk renamed '<ticker>_0' --
# confirm whether the two numbers were meant to match.
new_rows = []
for name, text in zip(data['ticker_name'], data['seg']):
    if len(text.split()) <= 1000:
        # Short enough: keep the row unchanged.
        new_rows.append({'ticker_name': name, 'seg': text})
        continue
    # Long text: one row per chunk, with the chunk index appended to the name.
    new_rows.extend(
        {'ticker_name': name + '_' + str(idx), 'seg': piece}
        for idx, piece in enumerate(split_seg(text, 3000))
    )

# Build the new DataFrame from the collected rows.
new_data = pd.DataFrame(new_rows)
new_data

  

對分組分詞使用tfidf演算法:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer


# Tokenizer handed to TfidfVectorizer: splits the text on whitespace.
def tokenizer(text):
    """Return the whitespace-separated tokens of ``text`` as a list."""
    return text.split()

# Fit TF-IDF over all seg texts; each row of the matrix corresponds to one
# row of new_data.
tfidf = TfidfVectorizer(tokenizer=tokenizer, stop_words='english')
tfidf_matrix = tfidf.fit_transform(new_data['seg'])

# Vocabulary terms, in matrix-column order. get_feature_names() was
# deprecated in scikit-learn 1.0 and removed in 1.2, so prefer
# get_feature_names_out() and fall back only on old releases.
try:
    feature_names = tfidf.get_feature_names_out()
except AttributeError:
    feature_names = tfidf.get_feature_names()

# Walk through each ticker_name group.
for _, group in new_data.groupby('ticker_name'):
    # Rows of the TF-IDF matrix selected by this group's index labels
    # (assumes new_data has its default 0..n-1 index, so labels equal
    # row positions -- TODO confirm if new_data is ever re-indexed).
    tfidf_scores = tfidf_matrix[group.index, :]

    # Pair every term with its TF-IDF score summed over the group's rows.
    word_scores = list(zip(feature_names, tfidf_scores.sum(axis=0).tolist()[0]))

    # Sort by score, highest first.
    word_scores = sorted(word_scores, key=lambda x: x[1], reverse=True)

    # Print the group's name followed by its 10 highest-scoring terms.
    print(group['ticker_name'].iloc[0])
    for word, score in word_scores[:10]:
        print(word, score)
    print()

  

 

相關文章