Step 1: Build the corpus:
import os
import codecs
import jieba
from gensim import corpora

sourceDataDir = 'data'


def getSourceFileLists(sourceDataDir):
    # Collect every file under the two-level layout data/<sub-dir>/<file>
    fileLists = []
    subDirList = os.listdir(sourceDataDir)
    for subDir in subDirList:
        subList = os.listdir(os.path.join(sourceDataDir, subDir))
        fileList = [os.path.join(sourceDataDir, subDir, x) for x in subList
                    if os.path.isfile(os.path.join(sourceDataDir, subDir, x))]
        fileLists += fileList
    return fileLists


fileLists = getSourceFileLists(sourceDataDir)

if 0 < len(fileLists):
    punctuations = ['', '\n', '\t', ',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']

    if not os.path.exists('dict'):
        os.mkdir('dict')
    if not os.path.exists('corpus'):
        os.mkdir('corpus')

    for fileName in fileLists:
        print fileName

        hFile = None
        content = None
        try:
            # The source novels are GB18030-encoded text files
            hFile = codecs.open(fileName, 'r', 'gb18030')
            content = hFile.readlines()
        except Exception as e:
            print e
        finally:
            if hFile:
                hFile.close()

        if content:
            # Segment the whole file with jieba (full mode) and drop punctuation
            fileFenci = [x for x in jieba.cut(' '.join(content), cut_all=True)]
            fileFenci2 = [word for word in fileFenci if word not in punctuations]

            texts = [fileFenci2]

            # Drop tokens that occur only once in this document
            all_tokens = sum(texts, [])
            tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
            texts = [[word for word in text if word not in tokens_once] for text in texts]

            sFileDir, sFileName = os.path.split(fileName)
            dictFileName = 'dict/' + sFileName + '.dict'
            corpusFileName = 'corpus/' + sFileName + '.mm'

            # One dictionary and one single-document corpus per source file
            dictionary = corpora.Dictionary(texts)
            dictionary.save_as_text(dictFileName)

            corpus = [dictionary.doc2bow(text) for text in texts]
            corpora.MmCorpus.serialize(corpusFileName, corpus)

    print 'Build corpus done'
Data source:
83 novels from http://d1.txthj.com/newrar/txthj_264.rar, unpacked under the directory ./data/.
They are loaded as a two-level directory structure.
Output:
./dict and ./corpus
In the corresponding directory, xxx.dict and xxx.mm are generated, where xxx is the full name of the original file (without the path, with the extension).
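As a quick sanity check, the generated files can be loaded back with gensim's own loaders. Below is a minimal sketch; the file name novel.txt is only a placeholder for one of the 83 source files:

from gensim import corpora

# 'novel.txt' is a placeholder for one of the source file names
dictionary = corpora.Dictionary.load_from_text('dict/novel.txt.dict')
corpus = corpora.MmCorpus('corpus/novel.txt.mm')

print len(dictionary)   # number of distinct tokens kept for this file
print len(corpus)       # 1, since each .mm file stores a single document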
Step 2: Load the corpus and run similarity analysis
# -*- coding: utf-8 -*-
import os
from gensim import corpora, models, similarities


def getFileList(dir):
    return [dir + x for x in os.listdir(dir)]

dictLists = getFileList('./dict/')


class LoadDictionary(object):
    """Stream the per-file dictionaries saved under ./dict/."""
    def __init__(self, dictionary):
        self.dictionary = dictionary

    def __iter__(self):
        for dictFile in dictLists:
            sFileRaw, sFilePostfix = os.path.splitext(dictFile)
            sFileDir, sFileName = os.path.split(sFileRaw)
            (dictFile, corpusFile) = ('./dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm')
            yield self.dictionary.load_from_text(dictFile)


class LoadCorpus(object):
    """Stream the per-file corpora saved under ./corpus/."""
    def __iter__(self):
        for dictFile in dictLists:
            sFileRaw, sFilePostfix = os.path.splitext(dictFile)
            sFileDir, sFileName = os.path.split(sFileRaw)
            (dictFile, corpusFile) = ('./dict/' + sFileName + '.dict', './corpus/' + sFileName + '.mm')
            yield corpora.MmCorpus(corpusFile)


def pre_process_cn(inputs, low_freq_filter=True):
    """Tokenize the query text: jieba keyword extraction, punctuation removal,
    stemming, and optional filtering of tokens that occur only once."""
    import nltk
    import jieba.analyse
    from nltk.tokenize import word_tokenize

    texts_tokenized = []
    for document in inputs:
        texts_tokenized_tmp = []
        for word in word_tokenize(document):
            texts_tokenized_tmp += jieba.analyse.extract_tags(word, 10)
        texts_tokenized.append(texts_tokenized_tmp)

    texts_filtered_stopwords = texts_tokenized

    # Remove punctuation
    english_punctuations = [',', '.', ':', ';', '?', '(', ')', '[', ']', '&', '!', '*', '@', '#', '$', '%']
    texts_filtered = [[word for word in document if word not in english_punctuations] for document in texts_filtered_stopwords]

    # Stemming (no effect on Chinese tokens; normalizes any English tokens)
    from nltk.stem.lancaster import LancasterStemmer
    st = LancasterStemmer()
    texts_stemmed = [[st.stem(word) for word in document] for document in texts_filtered]

    if low_freq_filter:
        all_stems = sum(texts_stemmed, [])
        stems_once = set(stem for stem in set(all_stems) if all_stems.count(stem) == 1)
        texts = [[stem for stem in text if stem not in stems_once] for text in texts_stemmed]
    else:
        texts = texts_stemmed
    return texts


# Load the dictionaries one by one; each yielded dictionary replaces the
# previous one, so 'dictionary' ends up holding the last file's dictionary
dictionary = corpora.dictionary.Dictionary()
dictionary_memory_friendly = LoadDictionary(dictionary)
for vector in dictionary_memory_friendly:
    dictionary = vector

# Each .mm file stores a single document; collect them into one corpus
corpus = []
corpus_memory_friendly = LoadCorpus()
for vector in corpus_memory_friendly:
    corpus.append(vector[0])

if 0 < len(corpus):
    # TF-IDF weighting followed by a 20-topic LSI model
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]

    model = models.LsiModel(corpus_tfidf, id2word=None, num_topics=20, chunksize=2000000)
    index = similarities.Similarity('./novel_', model[corpus], num_features=len(corpus))

    # Query text to match against the 83 novels
    target_courses = ['男人們的臉上沉重而冷凝,蒙著面紗的女人們則是發出斷斷續續的哭泣聲,他們無比專注地看著前方,見證一場生與死的拉鋸戰。']
    target_text = pre_process_cn(target_courses, low_freq_filter=False)

    # Project the query into LSI space and rank all documents by similarity
    ml_course = target_text[0]
    ml_bow = dictionary.doc2bow(ml_course)
    ml_lsi = model[ml_bow]
    sims = index[ml_lsi]

    sort_sims = sorted(enumerate(sims), key=lambda item: -item[1])

    print sort_sims[0:10]
    print len(dictLists)
    print dictLists[sort_sims[1][0]]
    print dictLists[sort_sims[2][0]]
    print dictLists[sort_sims[3][0]]
Notes:
yield is used in the loader classes for better memory efficiency.
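The same pattern generalizes: any iterable that yields one vector at a time can be fed to gensim, so the full corpus never has to sit in memory. A minimal sketch, assuming a hypothetical texts.txt with one whitespace-tokenized document per line:

from gensim import corpora

class StreamedCorpus(object):
    # 'texts.txt' and the pre-built dictionary are assumptions for illustration
    def __init__(self, dictionary, path='texts.txt'):
        self.dictionary = dictionary
        self.path = path

    def __iter__(self):
        # Read, vectorize, and yield one document at a time
        for line in open(self.path):
            yield self.dictionary.doc2bow(line.split())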
Known issue:
Step 2 prints the following warning:
/usr/lib/python2.7/dist-packages/scipy/sparse/compressed.py:122: UserWarning: indices array has non-integer dtype (float64)
It does not affect the processing.
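If the message is distracting, it can be silenced with Python's standard warnings module before building the index; this is purely cosmetic and does not change the computation:

import warnings

# Hide the harmless dtype warning raised inside scipy.sparse
warnings.filterwarnings('ignore', message='indices array has non-integer dtype')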