fasttext訓練模型程式碼

永勝 發表於 2020-12-23
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# author ChenYongSheng
# date 20201222

import pandas as pd
import jieba

# Data preprocessing: load the labelled corpus and the stop-word list.
# header=0 makes the first CSV row the column names; the loop further down
# reads the `label` and `text` columns of each row.
df = pd.read_csv('data/8qi/xx.csv', header=0)
# Read the stop words inside a `with` block so the file handle is closed
# deterministically instead of being left open until garbage collection.
with open('data/all/stopwords.txt', encoding='utf-8') as stopwords_file:
    stopwords = [line.strip() for line in stopwords_file]


def remove_stopwords(text_cut, stopwords):
    """Return the tokens of ``text_cut`` that are not stop words.

    Args:
        text_cut: Iterable of tokens (e.g. the output of a word segmenter).
        stopwords: Container of tokens to drop; only ``in`` is used on it.

    Returns:
        A new list holding the kept tokens in their original order.
    """
    return [token for token in text_cut if token not in stopwords]


# Build one fastText-formatted line per CSV row
# ("__label__<label> , tok1 tok2 ...") and split the rows into a training
# list (`lines`) and a held-out list (`test_lines`).
lines = []
test_lines = []
# Hoisted set copy of the stop words: membership tests inside the loop become
# O(1) instead of scanning the whole list for every token.
stopword_set = set(stopwords)
for data in df.itertuples():
    label = '__label__' + str(data.label)
    text = str(data.text)
    text_cut = jieba.lcut(text)
    text_remove_stop = remove_stopwords(text_cut, stopword_set)
    # Join tokens in their original order. The previous prepend loop
    # (`words = word + ' ' + words`) wrote every sentence backwards, which
    # matters because the model is trained with word bigrams.
    # Whitespace-only tokens are dropped: a newline token would otherwise
    # split one example across two lines of the output file.
    words = ' '.join(word for word in text_remove_stop if word.strip())
    body = label + ' , ' + words
    # Rows whose index is a multiple of 10 are held out for testing
    # (roughly 10% of the data, assuming the default integer index).
    if data.Index % 10 == 0:
        test_lines.append(body)
    else:
        lines.append(body)

# Persist both splits, one example per line. The `with` statement already
# closes each file on exit, so no explicit close() call is needed.
with open('data/8qi/train.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in lines)

with open('data/8qi/test.txt', 'w', encoding='utf-8') as f:
    f.writelines(line + '\n' for line in test_lines)
#!/usr/bin/env python
# -*- coding: utf-8 -*-

# author ChenYongSheng
# date 20201222

import fasttext

# Model training: fit a supervised fastText classifier on the training file
# and save the resulting model to disk.

trainDataFile = 'data/8qi/train.txt'

# Hyperparameters forwarded unchanged to fasttext.train_supervised.
train_params = {
    'lr': 0.1,
    'dim': 100,
    'epoch': 30,
    'word_ngrams': 2,
    'loss': 'softmax',
}
model = fasttext.train_supervised(trainDataFile, **train_params)
model.save_model("model/fasttext_model.bin")


testDataFile = 'data/8qi/test.txt'

# Reload the model from the saved file rather than reusing the in-memory
# object, so the evaluation exercises exactly what was written to disk.
model = fasttext.load_model('model/fasttext_model.bin')

result = model.test(testDataFile)
# Pair each of the first three entries of `result` with its caption; the
# captions read "sample count", "precision" and "recall" on the test set.
report = (
    ('測試集上資料量', result[0]),
    ('測試集上準確率', result[1]),
    ('測試集上召回率', result[2]),
)
for caption, value in report:
    print(caption, value)

訓練與測試資料的每一行必須是這樣的格式:__label__分類名 + 空格 + 逗號 + 空格 + 以空格分隔的切詞結果,例如:
__label__安靜程度 , 吵不吵 房子 那套 肯德基
__label__安靜程度 , 吵
__label__安靜程度 , 位置 吵 臥室

如果報錯「ValueError: data/7期/train.txt cannot be opened for training!」,
原因是資料檔案的路徑中包含中文字元(例如此處的「7期」),請把路徑改成英文或拼音(例如 data/7qi/train.txt)。

相關文章