【python技能】詞頻統計

macb007發表於2017-07-30
import jieba
from astropy.table.np_utils import  join
import os
import sys
import jieba.posseg as pseg


def main():
    """Segment ``cutTest.txt`` with jieba and print word frequencies.

    Files are resolved relative to the current working directory:

    * ``last.txt``    -- user dictionary loaded into jieba before cutting.
    * ``cutTest.txt`` -- UTF-8 input text, segmented line by line.
    * ``result.txt``  -- output; one line per input line, with the tokens
      separated by single spaces.

    After segmentation, every token and its number of occurrences is
    printed to stdout as ``token count`` (in first-seen order), followed
    by the literal line ``end``. Whitespace-only tokens are not counted.
    """
    # Imported locally so this function does not depend on edits to the
    # module-level import block.
    from collections import Counter

    current_dir = os.path.abspath('.')

    # The user dictionary must be loaded before any text is cut.
    dict_file = os.path.join(current_dir, 'last.txt')
    jieba.load_userdict(dict_file)

    file_name = os.path.join(current_dir, 'cutTest.txt')
    file_name2 = os.path.join(current_dir, 'result.txt')

    word_counts = Counter()

    # `with` guarantees both files are closed even if segmentation raises.
    with open(file_name, encoding="utf8") as src, \
            open(file_name2, 'w', encoding='utf8') as dst:
        for line in src:
            # Drop the line terminator before cutting; otherwise jieba
            # emits "\n" as a token, which pollutes both the output file
            # (extra blank lines) and the frequency table.
            tokens = list(jieba.cut(line.rstrip("\n"), cut_all=False))
            dst.write(" ".join(tokens))
            dst.write("\n")
            # Count the tokens directly rather than re-splitting the joined
            # string, and skip whitespace-only tokens: they are separators,
            # not words.
            word_counts.update(tok for tok in tokens if tok.strip())

    for word, count in word_counts.items():
        print(word, count)

    print("end")


# Run the word-frequency job only when executed as a script, not on import.
if __name__ == "__main__":
    main()

相關文章