Introduction to the LDA Topic Model and a Python Implementation

Published by 專注的阿熊 on 2022-10-31

import gensim

from gensim import corpora

import matplotlib.pyplot as plt

import matplotlib

import numpy as np

import warnings

warnings.filterwarnings('ignore')  # suppress warnings to keep the output readable

from gensim.models.coherencemodel import CoherenceModel

from gensim.models.ldamodel import LdaModel

# Prepare the data

PATH = "E:/data/output.csv"

file_object2 = open(PATH, encoding='utf-8', errors='ignore').read().split('\n')  # read the file line by line

data_set = []  # list that will hold each document as a list of tokens

for i in range(len(file_object2)):
    result = []
    seg_list = file_object2[i].split()
    for w in seg_list:  # collect the tokens of this line
        result.append(w)
    data_set.append(result)

print(data_set)

dictionary = corpora.Dictionary(data_set)  # build the id <-> word dictionary from the tokenized documents

corpus = [dictionary.doc2bow(text) for text in data_set]  # bag-of-words representation of each document
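
Very rare and very common tokens often add noise to LDA. An optional vocabulary-pruning step is sketched below; the threshold values are illustrative assumptions, not part of the original pipeline.

# Optional: prune the vocabulary before training (thresholds here are illustrative assumptions)
# dictionary.filter_extremes(no_below=5, no_above=0.5)       # drop tokens in <5 docs or in >50% of docs
# corpus = [dictionary.doc2bow(text) for text in data_set]   # rebuild the bag-of-words corpus afterwards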

# Lda = gensim.models.ldamodel.LdaModel  # (optional) alias for the LDA class

# Compute perplexity

def perplexity(num_topics):
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=30)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=15))
    bound = ldamodel.log_perplexity(corpus)  # per-word likelihood bound, computed once
    print(bound)
    return bound
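
Note that gensim's log_perplexity returns the per-word likelihood bound (base 2), not the perplexity itself. The sketch below converts it; the helper name true_perplexity is introduced purely for illustration.

def true_perplexity(num_topics):
    bound = perplexity(num_topics)  # per-word likelihood bound from the function above
    return np.exp2(-bound)          # gensim reports perplexity as 2 ** (-bound)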

# Compute topic coherence (c_v)

def coherence(num_topics):
    ldamodel = LdaModel(corpus, num_topics=num_topics, id2word=dictionary, passes=30, random_state=1)
    print(ldamodel.print_topics(num_topics=num_topics, num_words=10))
    ldacm = CoherenceModel(model=ldamodel, texts=data_set, dictionary=dictionary, coherence='c_v')
    score = ldacm.get_coherence()  # compute the c_v coherence once
    print(score)
    return score

# Plot coherence against the number of topics

x = range(1, 15)
# z = [perplexity(i) for i in x]
y = [coherence(i) for i in x]

plt.rcParams['font.sans-serif'] = ['SimHei']       # CJK-capable font (kept from the original, only needed for Chinese labels)
matplotlib.rcParams['axes.unicode_minus'] = False  # render minus signs correctly with that font
plt.plot(x, y)
plt.xlabel('Number of topics')
plt.ylabel('Coherence score')
plt.title('Coherence vs. number of topics')
plt.show()
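
The number of topics is usually chosen at (or near) the peak of the coherence curve. A minimal sketch of that selection, reusing the x and y lists computed above:

best_k = x[int(np.argmax(y))]  # topic count with the highest c_v coherence
print('Number of topics with the highest coherence:', best_k)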

from gensim.models import LdaModel

import pandas as pd

from gensim.corpora import Dictionary

from gensim import corpora, models

import csv

# Prepare the data (second corpus)

PATH = "E:/data/output1.csv"

file_object2 = open(PATH, encoding='utf-8', errors='ignore').read().split('\n')  # read the file line by line

data_set = []  # list that will hold each document as a list of tokens

for i in range(len(file_object2)):
    result = []
    seg_list = file_object2[i].split()
    for w in seg_list:  # collect the tokens of this line
        result.append(w)
    data_set.append(result)

dictionary = corpora.Dictionary(data_set)  # build the id <-> word dictionary from the tokenized documents

corpus = [dictionary.doc2bow(text) for text in data_set]

lda = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, passes=30, random_state=1)  # final model with the chosen number of topics

topic_list=lda.print_topics()

print(topic_list)
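
print_topics returns each topic as a raw weight string. For a more readable listing of the top words per topic, here is a small sketch using lda.show_topic:

# Print each topic's top 10 words in a readable form
for topic_id in range(lda.num_topics):
    words = ', '.join(word for word, _ in lda.show_topic(topic_id, topn=10))
    print(f'Topic {topic_id}: {words}')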

result_list = []  # dominant topic id for each document
for doc_topics in lda.get_document_topics(corpus):
    probs = [prob for _, prob in doc_topics]   # topic probabilities of this document
    best = probs.index(max(probs))             # position of the most probable topic
    result_list.append(doc_topics[best][0])    # store that topic's id
print(result_list)
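
pandas and csv are imported above but never used. Below is a minimal sketch of writing the per-document dominant-topic labels to disk with pandas; the output path is a hypothetical example.

# Save each document's dominant topic (output path below is a hypothetical example)
df = pd.DataFrame({'doc_id': range(len(result_list)), 'dominant_topic': result_list})
df.to_csv('E:/data/doc_topics.csv', index=False, encoding='utf-8-sig')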

import pyLDAvis.gensim

pyLDAvis.enable_notebook()

data = pyLDAvis.gensim.prepare(lda, corpus, dictionary)

pyLDAvis.save_html(data, 'E:/data/topic.html')
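
In pyLDAvis 3.x the gensim helper module was renamed to pyLDAvis.gensim_models. If you are on a newer release (an assumption about your environment), the equivalent calls are:

# Equivalent visualization calls for pyLDAvis >= 3.x
# import pyLDAvis.gensim_models
# data = pyLDAvis.gensim_models.prepare(lda, corpus, dictionary)
# pyLDAvis.save_html(data, 'E:/data/topic.html')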


From the ITPUB blog: http://blog.itpub.net/69946337/viewspace-2921183/. Please credit the source when reposting.
