This post is a follow-up to the previous one:
- It works with a large dataset and shows how to handle data that cannot be loaded into memory all at once. If you have plenty of RAM, pretend I said nothing.
- Saving the trained model and using it afterwards.
- The model itself is unchanged: still a simple feedforward neural network (update: a CNN model has been added).
- If you want to run the code in this post, a GPU build or a powerful VPS is recommended; waiting on my little laptop nearly made me spit blood.
- Later exercises deal with Chinese text: 《TensorFlow練習13: 製作一個簡單的聊天機器人》 (building a simple chatbot), 《TensorFlow練習7: 基於RNN生成古詩詞》 (generating classical Chinese poetry with an RNN), and 《TensorFlow練習18: 根據姓名判斷性別》 (predicting gender from a name).
Before getting to the main content, here is the basic development workflow of a machine-learning model that I sketched out:
The dataset
Dataset used: http://help.sentiment140.com/for-students/ (sentiment analysis)
The dataset contains 1.6 million tweets labeled as negative, neutral, or positive. I don't know whether a comparable ready-made Weibo dataset exists.
Data format: a CSV file with the emoticons removed, with the following fields:
- 0 – the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)
- 1 – the id of the tweet (2087)
- 2 – the date of the tweet (Sat May 16 23:58:44 UTC 2009)
- 3 – the query (lyx). If there is no query, then this value is NO_QUERY.
- 4 – the user that tweeted (robotickilldozr)
- 5 – the text of the tweet (Lyx is cool)
training.1600000.processed.noemoticon.csv (238 MB)
testdata.manual.2009.06.14.csv (74 KB)
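For orientation, here is a minimal sketch of peeking at the raw file with Python's csv module; the field order follows the description above, and the latin-1 encoding is an assumption carried over from the training code further down:

```python
import csv

# Peek at the first record of the raw Sentiment140 CSV.
with open('training.1600000.processed.noemoticon.csv', encoding='latin-1') as f:
    row = next(csv.reader(f))
    polarity, tweet_id, date, query, user, text = row
    print(polarity, text)  # e.g. '0' and the tweet body
```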
Data preprocessing
The preprocessing code converts the raw data into training.csv and tesing.csv, which contain only the label and the tweet text. The lexcion.pickle file stores the vocabulary (lexicon).
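The preprocessing code itself is not reproduced here, so the following is only a rough sketch of what it might look like, assuming the ':%:%:%:' separator and the one-hot label literals that the training code below expects; the polarity-to-label ordering and the frequency thresholds used to prune the lexicon are illustrative placeholders:

```python
import csv
import pickle
from collections import Counter

from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
# 0 = negative, 2 = neutral, 4 = positive  ->  one-hot labels (assumed ordering)
polarity_to_onehot = {'0': [1, 0, 0], '2': [0, 1, 0], '4': [0, 0, 1]}


def build_lexicon(input_file, min_count=20, max_count=20000):
    """Count lemmatized words and keep the mid-frequency ones as the lexicon."""
    counter = Counter()
    with open(input_file, encoding='latin-1') as f:
        for row in csv.reader(f):
            words = [lemmatizer.lemmatize(w) for w in word_tokenize(row[5].lower())]
            counter.update(words)
    return [w for w, c in counter.items() if min_count < c < max_count]


def convert(input_file, output_file):
    """Write 'label:%:%:%:tweet' lines, keeping only what training needs."""
    with open(input_file, encoding='latin-1') as fin, open(output_file, 'w') as fout:
        for row in csv.reader(fin):
            fout.write('{}:%:%:%:{}\n'.format(polarity_to_onehot[row[0]], row[5]))


lex = build_lexicon('training.1600000.processed.noemoticon.csv')
with open('lexcion.pickle', 'wb') as f:
    pickle.dump(lex, f)

convert('training.1600000.processed.noemoticon.csv', 'training.csv')
convert('testdata.manual.2009.06.14.csv', 'tesing.csv')
```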
If the data file is too large to load into memory at once, you can import it into a database instead.
Dask can also handle large CSV files.
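As a quick illustration of the Dask route (not part of the original pipeline), the file can be read lazily and processed partition by partition; the column names below are made up for the example:

```python
import dask.dataframe as dd

# Lazily read the large CSV; nothing is loaded until .compute() is called.
cols = ['polarity', 'id', 'date', 'query', 'user', 'text']
df = dd.read_csv('training.1600000.processed.noemoticon.csv',
                 encoding='latin-1', header=None, names=cols)

# Example: count tweets per polarity without holding the whole file in RAM.
print(df.groupby('polarity').size().compute())
```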
Starting the long training run
The training program uses roughly 600 MB of memory, peaking at about 1 GB.
Run it; the trained model is saved as model.ckpt.
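The saving itself is just the tf.train.Saver pattern; here is a minimal sketch with a throwaway variable standing in for the real model:

```python
import tensorflow as tf

# Toy graph, only to illustrate the save pattern.
w = tf.Variable(tf.zeros([10, 3]), name='w')

saver = tf.train.Saver()
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # ... training steps would run here ...
    saver.save(sess, 'model.ckpt')  # writes the model.ckpt* checkpoint files
```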
Using the trained model
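The original prediction code is not reproduced here; as a rough sketch of the pattern, the snippet below rebuilds the graph, restores model.ckpt instead of re-initializing the variables, and classifies a single tweet. It assumes the placeholders X and dropout_keep_prob and the neural_network() builder from the training script that produced the checkpoint (for example the CNN script below); tweet_to_vector is just the same bag-of-words featurization written out as a helper:

```python
import pickle
import numpy as np
import tensorflow as tf
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

with open('lexcion.pickle', 'rb') as f:
    lex = pickle.load(f)

lemmatizer = WordNetLemmatizer()


def tweet_to_vector(tweet):
    # Same bag-of-words featurization used during training.
    words = [lemmatizer.lemmatize(w) for w in word_tokenize(tweet.lower())]
    features = np.zeros(len(lex))
    for word in words:
        if word in lex:
            features[lex.index(word)] = 1
    return features


# Assumes X, dropout_keep_prob and neural_network() are already defined,
# exactly as in the script that produced model.ckpt.
output = neural_network()
saver = tf.train.Saver()

with tf.Session() as sess:
    saver.restore(sess, 'model.ckpt')  # load the saved weights, no re-init
    prediction = tf.argmax(output, 1)
    vec = tweet_to_vector("Lyx is cool")
    print(sess.run(prediction, feed_dict={X: [vec], dropout_keep_prob: 1.0}))
```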
The model used so far has been a simple feedforward network; below is a CNN version.
```python
# https://github.com/Lab41/sunny-side-up
# Note: written against the pre-1.0 TensorFlow API
# (tf.concat(axis, values) order, positional softmax_cross_entropy_with_logits args).
import os
import random
import tensorflow as tf
import pickle
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

f = open('lexcion.pickle', 'rb')
lex = pickle.load(f)
f.close()


def get_random_line(file, point):
    file.seek(point)
    file.readline()
    return file.readline()


# randomly pick n records from the file
def get_n_random_line(file_name, n=150):
    lines = []
    file = open(file_name, encoding='latin-1')
    total_bytes = os.stat(file_name).st_size
    for i in range(n):
        random_point = random.randint(0, total_bytes)
        lines.append(get_random_line(file, random_point))
    file.close()
    return lines


def get_test_dataset(test_file):
    with open(test_file, encoding='latin-1') as f:
        test_x = []
        test_y = []
        lemmatizer = WordNetLemmatizer()
        for line in f:
            label = line.split(':%:%:%:')[0]
            tweet = line.split(':%:%:%:')[1]
            words = word_tokenize(tweet.lower())
            words = [lemmatizer.lemmatize(word) for word in words]
            features = np.zeros(len(lex))
            for word in words:
                if word in lex:
                    features[lex.index(word)] = 1
            test_x.append(list(features))
            test_y.append(eval(label))
    return test_x, test_y


test_x, test_y = get_test_dataset('tesing.csv')

##############################################################################

input_size = len(lex)
num_classes = 3

X = tf.placeholder(tf.int32, [None, input_size])
Y = tf.placeholder(tf.float32, [None, num_classes])

dropout_keep_prob = tf.placeholder(tf.float32)

batch_size = 90


def neural_network():
    # embedding layer
    with tf.device('/cpu:0'), tf.name_scope("embedding"):
        embedding_size = 128
        W = tf.Variable(tf.random_uniform([input_size, embedding_size], -1.0, 1.0))
        embedded_chars = tf.nn.embedding_lookup(W, X)
        embedded_chars_expanded = tf.expand_dims(embedded_chars, -1)
    # convolution + maxpool layer
    num_filters = 128
    filter_sizes = [3, 4, 5]
    pooled_outputs = []
    for i, filter_size in enumerate(filter_sizes):
        with tf.name_scope("conv-maxpool-%s" % filter_size):
            filter_shape = [filter_size, embedding_size, 1, num_filters]
            W = tf.Variable(tf.truncated_normal(filter_shape, stddev=0.1))
            b = tf.Variable(tf.constant(0.1, shape=[num_filters]))
            conv = tf.nn.conv2d(embedded_chars_expanded, W, strides=[1, 1, 1, 1], padding="VALID")
            h = tf.nn.relu(tf.nn.bias_add(conv, b))
            pooled = tf.nn.max_pool(h, ksize=[1, input_size - filter_size + 1, 1, 1],
                                    strides=[1, 1, 1, 1], padding='VALID')
            pooled_outputs.append(pooled)

    num_filters_total = num_filters * len(filter_sizes)
    h_pool = tf.concat(3, pooled_outputs)
    h_pool_flat = tf.reshape(h_pool, [-1, num_filters_total])
    # dropout
    with tf.name_scope("dropout"):
        h_drop = tf.nn.dropout(h_pool_flat, dropout_keep_prob)
    # output
    with tf.name_scope("output"):
        W = tf.get_variable("W", shape=[num_filters_total, num_classes],
                            initializer=tf.contrib.layers.xavier_initializer())
        b = tf.Variable(tf.constant(0.1, shape=[num_classes]))
        output = tf.nn.xw_plus_b(h_drop, W, b)

    return output


def train_neural_network():
    output = neural_network()

    optimizer = tf.train.AdamOptimizer(1e-3)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(output, Y))
    grads_and_vars = optimizer.compute_gradients(loss)
    train_op = optimizer.apply_gradients(grads_and_vars)

    saver = tf.train.Saver(tf.global_variables())
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        lemmatizer = WordNetLemmatizer()
        i = 0
        while True:
            batch_x = []
            batch_y = []

            # if the model.ckpt file already exists:
            #     saver.restore(session, 'model.ckpt')  # restore the saved session
            try:
                lines = get_n_random_line('training.csv', batch_size)
                for line in lines:
                    label = line.split(':%:%:%:')[0]
                    tweet = line.split(':%:%:%:')[1]
                    words = word_tokenize(tweet.lower())
                    words = [lemmatizer.lemmatize(word) for word in words]

                    features = np.zeros(len(lex))
                    for word in words:
                        if word in lex:
                            features[lex.index(word)] = 1  # a word may occur more than once in a sentence; += 1 would also work, the difference is minor

                    batch_x.append(list(features))
                    batch_y.append(eval(label))

                _, loss_ = sess.run([train_op, loss],
                                    feed_dict={X: batch_x, Y: batch_y, dropout_keep_prob: 0.5})
                print(loss_)
            except Exception as e:
                print(e)

            if i % 10 == 0:
                predictions = tf.argmax(output, 1)
                correct_predictions = tf.equal(predictions, tf.argmax(Y, 1))
                accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))
                accur = sess.run(accuracy, feed_dict={X: test_x[0:50], Y: test_y[0:50], dropout_keep_prob: 1.0})
                print('Accuracy:', accur)

            i += 1


train_neural_network()
```
With the CNN model, accuracy improves significantly.