This post is a follow-up to the previous one:
- It works with a large dataset and shows how to handle data that cannot be loaded into memory all at once. If you have plenty of RAM, pretend I said nothing.
- Saving the trained model and using it afterwards.
- The model itself is unchanged: still a simple feedforward neural network (update: a CNN model has been added).
- If you want to run the code in this post, a GPU build or a powerful VPS is recommended; waiting on my little laptop nearly made me spit blood.
- Later exercises deal with Chinese text: 《TensorFlow練習13: 製作一個簡單的聊天機器人》 (building a simple chatbot), 《TensorFlow練習7: 基於RNN生成古詩詞》 (generating classical Chinese poetry with an RNN), and 《TensorFlow練習18: 根據姓名判斷性別》 (predicting gender from a name).
Before getting to the main content, here is the basic development workflow of a machine-learning model that I sketched out:
The dataset
Dataset used: http://help.sentiment140.com/for-students/ (sentiment analysis)
The dataset contains 1.6 million tweets labeled as negative, neutral, or positive. I don't know whether a comparable ready-made Weibo dataset exists.
Data format: a CSV file with the emoticons removed, with the following fields:
- 0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- 1 – the id of the tweet (2087)
- 2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- 3 – the query (lyx). If there is no query, then this value is NO_QUERY.
- 4 – the user that tweeted (robotickilldozr)
- 5 – the text of the tweet (Lyx is cool)
training.1600000.processed.noemoticon.csv (238 MB)
testdata.manual.2009.06.14.csv (74 KB)
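For orientation, here is a minimal sketch of peeking at the raw file with Python's csv module; the field order follows the description above, and the latin-1 encoding is an assumption carried over from the training code further down:

```python
import csv

# Peek at the first record of the raw Sentiment140 CSV.
with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f:
    row = next(csv.reader(f))
    polarity, tweet_id, date, query, user, text = row
    print(polarity, text)  # e.g. '0' and the tweet body
```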
Data preprocessing
The preprocessing code converts the raw data into training.csv and tesing.csv, which contain only the label and the tweet text. The lexcion.pickle file stores the vocabulary (lexicon).
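The preprocessing code itself is not reproduced here, so the following is only a rough sketch of what it might look like, assuming the ':%:%:%:' separator and the one-hot label literals that the training code below expects; the polarity-to-label ordering and the frequency thresholds used to prune the lexicon are illustrative placeholders:

```python
import csv
import pickle
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# 0 = negative, 2 = neutral, 4 = positive  ->  one-hot labels (assumed ordering)
polarity_to_onehot = {'0': [1, 0, 0], '2': [0, 1, 0], '4': [0, 0, 1]}


def build_lexicon(input_file, min_count=20, max_count=20000):
    """Count lemmatized words and keep the mid-frequency ones as the lexicon."""
    counter = Counter()
    with open(input_file, encoding='latin-1') as f:
        for row in csv.reader(f):
            words = [lemmatizer.lemmatize(w) for w in word_tokenize(row[5].lower())]
            counter.update(words)
    return [w for w, c in counter.items() if min_count < c < max_count]


def convert(input_file, output_file):
    """Write 'label:%:%:%:tweet' lines, keeping only what training needs."""
    with open(input_file, encoding='latin-1') as fin, open(output_file, 'w') as fout:
        for row in csv.reader(fin):
            fout.write('{}:%:%:%:{}\n'.format(polarity_to_onehot[row[0]], row[5]))


lex = build_lexicon('training.1600000.processed.noemoticon.csv')
with open('lexcion.pickle', 'wb') as f:
    pickle.dump(lex, f)

convert('training.1600000.processed.noemoticon.csv', 'training.csv')
convert('testdata.manual.2009.06.14.csv', 'tesing.csv')
```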
If the data file is too large to load into memory at once, you can import it into a database instead.
Dask can also handle large CSV files.
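As a quick illustration of the Dask route (not part of the original pipeline), the file can be read lazily and processed partition by partition; the column names below are made up for the example:

```python
import dask.dataframe as dd

# Lazily read the large CSV; nothing is loaded until .compute() is called.
cols = ['polarity', 'id', 'date', 'query', 'user', 'text']
df = dd.read_csv('training.1600000.processed.noemoticon.csv',
                 encoding='latin-1', header=None, names=cols)

# Example: count tweets per polarity without holding the whole file in RAM.
print(df.groupby('polarity').size().compute())
```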
Starting the long training run
The training program uses roughly 600 MB of memory, peaking at about 1 GB.
Run it; the trained model is saved as model.ckpt.
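The saving itself is just the tf.train.Saver pattern; here is a minimal sketch with a throwaway variable standing in for the real model:

```python
import tensorflow as tf

# Toy graph, only to illustrate the save pattern.
w = tf.Variable(tf.zeros([10, 3]), name='w')

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... training steps would run here ...
    saver.save(sess, 'model.ckpt')  # writes the model.ckpt* checkpoint files
```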
Using the trained model
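The original prediction code is not reproduced here; as a rough sketch of the pattern, the snippet below rebuilds the graph, restores model.ckpt instead of re-initializing the variables, and classifies a single tweet. It assumes the placeholders X and dropout_keep_prob and the neural_network() builder from the training script that produced the checkpoint (for example the CNN script below); tweet_to_vector is just the same bag-of-words featurization written out as a helper:

```python
import pickle
import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

with open('lexcion.pickle', 'rb') as f:
    lex = pickle.load(f)

lemmatizer = WordNetLemmatizer()


def tweet_to_vector(tweet):
    # Same bag-of-words featurization used during training.
    words = [lemmatizer.lemmatize(w) for w in word_tokenize(tweet.lower())]
    features = np.zeros(len(lex))
    for word in words:
        if word in lex:
            features[lex.index(word)] = 1
    return features


# Assumes X, dropout_keep_prob and neural_network() are already defined,
# exactly as in the script that produced model.ckpt.
output = neural_network()
saver = tf.train.Saver()

with tf.Session() as sess:
    saver.restore(sess, 'model.ckpt')  # load the saved weights, no re-init
    prediction = tf.argmax(output, 1)
    vec = tweet_to_vector("Lyx is cool")
    print(sess.run(prediction, feed_dict={X: [vec], dropout_keep_prob: 1.0}))
```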
The model used so far has been a simple feedforward network; below is a CNN version.
```python
# https://github.com/Lab41/sunny-side-up
# Note: written against the pre-1.0 TensorFlow API
# (tf.concat(axis, values) order, positional softmax_cross_entropy_with_logits args).
import os
import random
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()


def get_random_line(file, point):
    file.seek(point)
    file.readline()
    return file.readline()


# randomly pick n records from the file
def get_n_random_line(file_name, n=150):
    lines = []
    file = open(file_name, encoding='latin-1')
    total_bytes = os.stat(file_name).st_size
    for i in range(n):
        random_point = random.randint(0, total_bytes)
        lines.append(get_random_line(file, random_point))
    file.close()
    return lines


def get_test_dataset(test_file):
    with open(test_file, encoding='latin-1') as f:
        test_x = []
        test_y = []
        lemmatizer = WordNetLemmatizer()
        for line in f:
            label = line.split(':%:%:%:')[0]
            tweet = line.split(':%:%:%:')[1]
            words = word_tokenize(tweet.lower())
            words = [lemmatizer.lemmatize(word) for word in words]
            features = np.zeros(len(lex))
            for word in words:
                if word in lex:
                    features[lex.index(word)] = 1
            test_x.append(list(features))
            test_y.append(eval(label))
    return test_x, test_y


test_x, test_y = get_test_dataset('tesing.csv')

##############################################################################

input_size = len(lex)
num_classes = 3

X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])

dropout_keep_prob = tf.placeholder(tf.float32)

batch_size = 90


def neural_network():
    # embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        embedding_size = 128
        W = tf.Variable(tf.random_uniform([input_size, embedding_size], -1.0, 1.0))
        embedded_chars = tf.nn.embedding_lookup(W, X)
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
    # convolution + maxpool layer
    num_filters = 128
    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID")
            h = tf.nn.relu(tf.nn.bias_add(conv, b))
            pooled = tf.nn.max_pool(h, ksize=[1, input_size - filter_size + 1, 1, 1],
                                    strides=[1, 1, 1, 1], padding='VALID')
            pooled_outputs.append(pooled)

    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(3, pooled_outputs)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
    # dropout
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
    # output
    with tf.name_scope("output"):
        W = tf.get_variable("W", shape=[num_filters_total, num_classes],
                            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        output = tf.nn.xw_plus_b(h_drop, W, b)

    return output


def train_neural_network():
    output = neural_network()

    optimizer = tf.train.AdamOptimizer(1e-3)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, Y))
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        lemmatizer = WordNetLemmatizer()
        i = 0
        while True:
            batch_x = []
            batch_y = []

            # if the model.ckpt file already exists:
            #     saver.restore(session, 'model.ckpt')  # restore the saved session
            try:
                lines = get_n_random_line('training.csv', batch_size)
                for line in lines:
                    label = line.split(':%:%:%:')[0]
                    tweet = line.split(':%:%:%:')[1]
                    words = word_tokenize(tweet.lower())
                    words = [lemmatizer.lemmatize(word) for word in words]

                    features = np.zeros(len(lex))
                    for word in words:
                        if word in lex:
                            features[lex.index(word)] = 1  # a word may occur more than once in a sentence; += 1 would also work, the difference is minor

                    batch_x.append(list(features))
                    batch_y.append(eval(label))

                _, loss_ = sess.run([train_op, loss],
                                    feed_dict={X: batch_x, Y: batch_y, dropout_keep_prob: 0.5})
                print(loss_)
            except Exception as e:
                print(e)

            if i % 10 == 0:
                predictions = tf.argmax(output, 1)
                correct_predictions = tf.equal(predictions, tf.argmax(Y, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
                accur = sess.run(accuracy, feed_dict={X: test_x[0:50], Y: test_y[0:50], dropout_keep_prob: 1.0})
                print('Accuracy:', accur)

            i += 1


train_neural_network()
```
With the CNN model, accuracy improves significantly.