Emotion-Cause Pair Extraction: A New Task to Emotion Analysis in Texts 程式碼筆記
一. 載入語料和預訓練詞向量(load_w2v)
Step 1. 載入訓練語料(clause_keywords.csv)並收集emotion欄位(第3欄)和clause欄位(最後一欄)中出現的所有中文詞words
# Collect every token that appears in the training corpus (duplicates included;
# they are removed in a later step).
words = []
# The context manager guarantees the file is closed, even if a malformed row
# raises; iterating the handle avoids loading the whole file into memory.
with open(train_file_path, 'r', encoding='utf-8') as inputFile1:
    for line in inputFile1:
        line = line.strip().split(',')
        # Column index 2 is read as the emotion field; the last column is the
        # clause text, whose tokens are separated by whitespace.
        emotion, clause = line[2], line[-1]
        words.extend([emotion] + clause.split())
clause_keywords.csv
Step 2. 中文詞words去重並生成 詞-id與id-詞 索引字典
# Deduplicate the collected tokens.
words = set(words)
# token -> id, with ids starting at 1 (id 0 is never assigned to a token here).
word_idx = {token: position for position, token in enumerate(words, start=1)}
# id -> token, the exact inverse of word_idx.
word_idx_rev = {position: token for token, position in word_idx.items()}
word_idx與word_idx_rev
Step 3. 載入word2vec檔案(w2v_200.txt)並生成詞和對應詞向量的對映字典w2v
# Map each word of the pretrained embedding file to its vector, kept as a list
# of strings (conversion to float happens where the vectors are consumed).
w2v = {}
# The context manager guarantees the file is closed, even if parsing raises.
with open(embedding_path, 'r', encoding='utf-8') as inputFile2:
    # The first line is skipped, not parsed (presumably the word2vec
    # "<vocab_size> <dim>" header -- confirm against the file).
    inputFile2.readline()
    for line in inputFile2:
        line = line.strip().split(' ')
        w, ebd = line[0], line[1:]
        w2v[w] = ebd
w2v_200.txt
w2v
Step 4. 將訓練語料中去重後的詞轉成詞向量。遍歷words,如果詞在w2v中,就取對應的詞向量,否則從均勻分佈[-0.1,0.1]中隨機取個200維向量
# Row 0 is an all-zero vector; real words occupy rows 1..len(words) in the
# iteration order of `words`.
embedding = [list(np.zeros(embedding_dim))]
hit = 0  # number of words found in the pretrained table
for word in words:
    if word not in w2v:
        # Out-of-vocabulary word: rand() is uniform on [0, 1), so this samples
        # uniformly from [-0.1, 0.1).
        embedding.append(list(np.random.rand(embedding_dim) / 5. - 0.1))
        continue
    # Pretrained vectors are stored as strings; convert them to floats.
    embedding.append(list(map(float, w2v[word])))
    hit += 1
print('w2v_file: {}\nall_words: {} hit_words: {}'.format(embedding_path, len(words), hit))
embedding
Step 5. 初始化位置向量
# Position-embedding table: row 0 is all zeros, followed by 200 rows drawn
# from a normal distribution with mean 0.0 and standard deviation 0.1.
embedding_pos = [list(np.zeros(embedding_dim_pos))]
for _ in range(200):
    embedding_pos.append(list(np.random.normal(loc=0.0, scale=0.1, size=embedding_dim_pos)))
# Convert both tables from nested lists to NumPy arrays.
embedding = np.array(embedding)
embedding_pos = np.array(embedding_pos)
print("embedding.shape: {} embedding_pos.shape: {}".format(embedding.shape, embedding_pos.shape))
print("load embedding done!\n")
二. 模型構建(build_model)
Step 1. 輸入x轉詞向量,得到inputs
# Replace every word id in x with the matching row of word_embedding.
x = tf.nn.embedding_lookup(word_embedding, x) ## (?,75,30,200)
# Merge the leading dimensions so that each row is one clause of max_sen_len word vectors.
inputs = tf.reshape(x, [-1, FLAGS.max_sen_len, FLAGS.embedding_dim]) ## (?,30,200)
# Dropout on the word vectors; keep_prob1 is passed as the keep probability.
inputs = tf.nn.dropout(inputs, keep_prob=keep_prob1)
Step 2. 對inputs進行BiLSTM編碼以及注意力計算
# Word-level encoding of each clause. RNN is supplied from outside this fragment
# (presumably the biLSTM encoder shown in this document -- confirm against the caller).
with tf.name_scope('word_encode'):
inputs = RNN(inputs, sen_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'word_layer' + name)
# Word-level attention: att_var pools the per-word states of a clause into one vector.
with tf.name_scope('word_attention'):
# Feature width used for the attention parameters: 2 * n_hidden.
sh2 = 2 * FLAGS.n_hidden
# Attention parameters; `name` is appended so each caller gets its own variables.
w1 = get_weight_varible('word_att_w1' + name, [sh2, sh2])
b1 = get_weight_varible('word_att_b1' + name, [sh2])
w2 = get_weight_varible('word_att_w2' + name, [sh2, 1])
s = att_var(inputs, sen_len, w1, b1, w2)
# Regroup the clause vectors by document: max_doc_len clause vectors per row.
s = tf.reshape(s, [-1, FLAGS.max_doc_len, 2 * FLAGS.n_hidden])
def biLSTM(inputs, length, n_hidden, scope):
    """Encode a batch of sequences with a bidirectional LSTM.

    Args:
        inputs: tensor of shape [batch_size, max_len, embedding_dim].
        length: tensor of shape [batch_size] with the true length of each
            sequence; passed to the RNN as ``sequence_length``.
        n_hidden: number of units of each directional LSTM cell.
        scope: variable scope handed to ``bidirectional_dynamic_rnn``.

    Returns:
        Tensor of shape [batch_size, max_len, n_hidden*2]: the forward and
        backward outputs concatenated along the last axis.
    """
    forward_cell = tf.contrib.rnn.LSTMCell(n_hidden)
    backward_cell = tf.contrib.rnn.LSTMCell(n_hidden)
    # The final states are not needed; only the per-step outputs are used.
    (forward_out, backward_out), _ = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=forward_cell,
        cell_bw=backward_cell,
        inputs=inputs,
        sequence_length=length,
        dtype=tf.float32,
        scope=scope,
    )
    return tf.concat((forward_out, backward_out), 2)
def att_var(inputs, length, w1, b1, w2):
    """Pool a batch of sequences into one vector each with learned attention.

    Args:
        inputs: tensor of shape [batch_size, max_len, n_hidden].
        length: tensor of shape [batch_size]; forwarded to
            ``softmax_by_length`` together with the raw scores.
        w1, b1: weight and bias of the tanh projection applied to every step.
        w2: weight that maps each projected step to a single score.

    Returns:
        Tensor of shape [batch_size, n_hidden]: the attention-weighted sum of
        the steps of each sequence.
    """
    max_len = tf.shape(inputs)[1]
    n_hidden = tf.shape(inputs)[2]
    # Score every time step independently: flatten to [batch*max_len, n_hidden].
    flat_steps = tf.reshape(inputs, [-1, n_hidden])
    projected = tf.tanh(tf.matmul(flat_steps, w1) + b1)
    raw_scores = tf.matmul(projected, w2)
    # One row of scores per sequence, laid out as [batch, 1, max_len].
    scores = tf.reshape(raw_scores, [-1, 1, max_len])
    weights = softmax_by_length(scores, length)
    # [batch, 1, max_len] x [batch, max_len, n_hidden] -> [batch, 1, n_hidden].
    pooled = tf.matmul(weights, inputs)
    return tf.reshape(pooled, [-1, n_hidden])
Step 3. 再經過一層BiLSTM層,然後經過softmax層給出預測結果
## cause prediction
# Clause vectors from get_s, built under a cause-specific variable name.
s = get_s(inputs, name='cause_word_encode')
# Clause-level encoding over the clauses of each document (doc_len gives the sequence lengths).
s = RNN(s, doc_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'cause_sentence_layer')
with tf.name_scope('sequence_prediction'):
# Flatten to one row per clause before the softmax layer.
s1 = tf.reshape(s, [-1, 2 * FLAGS.n_hidden])
s1 = tf.nn.dropout(s1, keep_prob=keep_prob2)
w_cause = get_weight_varible('softmax_w_cause', [2 * FLAGS.n_hidden, FLAGS.n_class])
b_cause = get_weight_varible('softmax_b_cause', [FLAGS.n_class])
# Class probabilities for every clause.
pred_cause = tf.nn.softmax(tf.matmul(s1, w_cause) + b_cause)
# Restore the per-document layout: max_doc_len clauses, n_class probabilities each.
pred_cause = tf.reshape(pred_cause, [-1, FLAGS.max_doc_len, FLAGS.n_class])
## emotion prediction
# Clause vectors from get_s, built under an emotion-specific ("pos") variable name.
s = get_s(inputs, name='pos_word_encode')
# Clause-level encoding over the clauses of each document (doc_len gives the sequence lengths).
s = RNN(s, doc_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'pos_sentence_layer')
with tf.name_scope('sequence_prediction'):
# Flatten to one row per clause before the softmax layer.
s1 = tf.reshape(s, [-1, 2 * FLAGS.n_hidden])
s1 = tf.nn.dropout(s1, keep_prob=keep_prob2)
w_pos = get_weight_varible('softmax_w_pos', [2 * FLAGS.n_hidden, FLAGS.n_class])
b_pos = get_weight_varible('softmax_b_pos', [FLAGS.n_class])
# Class probabilities for every clause.
pred_pos = tf.nn.softmax(tf.matmul(s1, w_pos) + b_pos)
# Restore the per-document layout: max_doc_len clauses, n_class probabilities each.
pred_pos = tf.reshape(pred_pos, [-1, FLAGS.max_doc_len, FLAGS.n_class])
Step 4. 計算emotion、cause權重的l2正則損失和
# L2 penalty on the softmax weight and bias of the cause branch ...
reg = tf.nn.l2_loss(w_cause) + tf.nn.l2_loss(b_cause)
# ... plus the softmax weight and bias of the emotion ("pos") branch.
reg += tf.nn.l2_loss(w_pos) + tf.nn.l2_loss(b_pos)
Step 5. 計算總損失(emotion + cause + l2),並建立優化器Optimizer
# Build the graph: emotion ("pos") predictions, cause predictions and the L2 term.
pred_pos, pred_cause, reg = build_model(word_embedding, x, sen_len, doc_len, keep_prob1, keep_prob2, y_position, y_cause)
# Sum of doc_len over the batch, cast to float; used to normalise both losses.
valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
# Cross-entropy between the label tensors and the predicted probabilities,
# summed over all entries and divided by valid_num.
# NOTE(review): tf.log returns -inf where a predicted probability is exactly 0,
# which would turn the loss into NaN -- confirm whether clipping is needed.
loss_pos = - tf.reduce_sum(y_position * tf.log(pred_pos)) / valid_num ## emotion loss
loss_cause = - tf.reduce_sum(y_cause * tf.log(pred_cause)) / valid_num ## cause loss
# Weighted sum of the two task losses and the L2 term.
loss_op = loss_cause * FLAGS.cause + loss_pos * FLAGS.pos + reg * FLAGS.l2_reg ## total loss
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate).minimize(loss_op) ## optimizer
Step 6. 結果解析
# Turn label tensors and predicted probabilities into class indices by taking
# the argmax over axis 2 (the class axis).
true_y_cause_op = tf.argmax(y_cause, 2) ## cause ground truth
pred_y_cause_op = tf.argmax(pred_cause, 2) ## cause prediction
true_y_pos_op = tf.argmax(y_position, 2) ## emotion ground truth
pred_y_pos_op = tf.argmax(pred_pos, 2) ## emotion prediction
三. 模型訓練、評估(10折,每折迭代15個epoch)
Step 1. 載入訓練集、測試集
# File names of the train/test split that belongs to the current fold.
train_file_name = 'fold{}_train.txt'.format(fold)
test_file_name = 'fold{}_test.txt'.format(fold)
# Load both splits from data_combine/; the tr_ and te_ prefixes mark train and test.
tr_doc_id, tr_y_position, tr_y_cause, tr_y_pairs, tr_x, tr_sen_len, tr_doc_len = load_data(
'data_combine/' + train_file_name, word_id_mapping, FLAGS.max_doc_len, FLAGS.max_sen_len)
te_doc_id, te_y_position, te_y_cause, te_y_pairs, te_x, te_sen_len, te_doc_len = load_data(
'data_combine/' + test_file_name, word_id_mapping, FLAGS.max_doc_len, FLAGS.max_sen_len)
def load_data(input_file, word_idx, max_doc_len=75, max_sen_len=45):
    """Load one fold file and convert it to padded NumPy arrays.

    The file is a sequence of documents. Each document consists of:

    1. a header line whose first two whitespace-separated fields are the
       document id and the number of clauses ``d_len``;
    2. a line with comma-separated ``(a, b)`` tuples (the labelled pairs);
    3. ``d_len`` clause lines; the text after the last comma of each line
       is the clause, with tokens separated by whitespace.

    Args:
        input_file: path of the fold file (read as UTF-8).
        word_idx: mapping from token to integer id.
        max_doc_len: number of clause slots allocated per document.
        max_sen_len: number of token slots allocated per clause; longer
            clauses are truncated.

    Returns:
        Tuple ``(doc_id, y_position, y_cause, y_pairs, x, sen_len, doc_len)``:

        * ``doc_id``: list of document id strings.
        * ``y_position``: array (n_docs, max_doc_len, 2); for clause ``i``
          column 1 is set when ``i + 1`` is the first element of some pair,
          otherwise column 0 is set. Unused clause slots stay all zero.
        * ``y_cause``: same layout, based on the second element of the pairs.
        * ``y_pairs``: list with the parsed pair list of every document.
        * ``x``: int32 array (n_docs, max_doc_len, max_sen_len) of word ids,
          zero-padded.
        * ``sen_len``: int32 array (n_docs, max_doc_len) of clause lengths
          after truncation.
        * ``doc_len``: array (n_docs,) of clause counts.

    Raises:
        KeyError: if a kept token is missing from ``word_idx``.
        IndexError: if a document declares more than ``max_doc_len`` clauses.
        ValueError, SyntaxError: if the pair line is not a literal tuple list.
    """
    # Local import: ast is only needed here and may not be imported by the file.
    import ast

    print('load data_file: {}'.format(input_file))
    y_position, y_cause, y_pairs, x, sen_len, doc_len = [], [], [], [], [], []
    doc_id = []
    n_cut = 0  # number of clauses that had to be truncated to max_sen_len

    # The context manager closes the file even when a malformed record raises.
    with open(input_file, 'r', encoding='utf-8') as input_stream:
        while True:
            header = input_stream.readline()
            if header == '':  # end of file
                break
            fields = header.strip().split()
            if not fields:
                # Tolerate blank separator/trailing lines instead of crashing.
                continue
            doc_id.append(fields[0])
            d_len = int(fields[1])

            # SECURITY: this line used to be passed to eval(), which executes
            # arbitrary code found in the data file. literal_eval accepts the
            # same tuple literals but nothing executable.
            pairs = ast.literal_eval('[' + input_stream.readline().strip() + ']')
            doc_len.append(d_len)
            y_pairs.append(pairs)
            # A document without pairs has no positive clause at all.
            pos, cause = zip(*pairs) if pairs else ((), ())

            y_po = np.zeros((max_doc_len, 2))
            y_ca = np.zeros((max_doc_len, 2))
            sen_len_tmp = np.zeros(max_doc_len, dtype=np.int32)
            x_tmp = np.zeros((max_doc_len, max_sen_len), dtype=np.int32)
            for i in range(d_len):
                # Clause numbers in the pairs are 1-based; the bool picks the
                # one-hot column (0 = negative, 1 = positive).
                y_po[i][int(i + 1 in pos)] = 1
                y_ca[i][int(i + 1 in cause)] = 1
                tokens = input_stream.readline().strip().split(',')[-1].split()
                if len(tokens) > max_sen_len:
                    n_cut += 1
                    tokens = tokens[:max_sen_len]
                sen_len_tmp[i] = len(tokens)
                for j, word in enumerate(tokens):
                    x_tmp[i][j] = int(word_idx[word])

            y_position.append(y_po)
            y_cause.append(y_ca)
            x.append(x_tmp)
            sen_len.append(sen_len_tmp)

    y_position, y_cause, x, sen_len, doc_len = map(np.array, [y_position, y_cause, x, sen_len, doc_len])
    # Report the shapes by name without resorting to eval().
    for var, arr in (('y_position', y_position), ('y_cause', y_cause), ('x', x),
                     ('sen_len', sen_len), ('doc_len', doc_len)):
        print('{}.shape {}'.format(var, arr.shape))
    print('n_cut {}'.format(n_cut))
    print('load data done!\n')
    return doc_id, y_position, y_cause, y_pairs, x, sen_len, doc_len
Step 2. 模型訓練
# train
# One pass over the training set in mini-batches; dropout keep probabilities
# come from FLAGS.keep_prob1 / FLAGS.keep_prob2.
for train, _ in get_batch_data(tr_x, tr_sen_len, tr_doc_len, FLAGS.keep_prob1, FLAGS.keep_prob2,
tr_y_position, tr_y_cause, FLAGS.batch_size):
# Run one optimisation step and fetch the loss, predicted/true labels and document lengths.
_, loss, pred_y_cause, true_y_cause, pred_y_pos, true_y_pos, doc_len_batch = sess.run(
[optimizer, loss_op, pred_y_cause_op, true_y_cause_op, pred_y_pos_op, true_y_pos_op, doc_len],
feed_dict=dict(zip(placeholders, train)))
# Every 10th step, report the loss and the batch metrics of both tasks.
if step % 10 == 0:
print('step {}: train loss {:.4f} '.format(step, loss))
acc, p, r, f1 = acc_prf(pred_y_cause, true_y_cause, doc_len_batch)
print('cause_predict: train acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
acc, p, r, f1 = acc_prf(pred_y_pos, true_y_pos, doc_len_batch)
print('position_predict: train acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
# NOTE(review): indentation was lost in this listing; the counter presumably
# advances once per batch, outside the `if` above -- confirm against the original script.
step = step + 1
Step 3. 模型評估(測試)
# test
# Evaluate on the whole test split in a single run, with dropout disabled (keep probabilities 1.).
test = [te_x, te_sen_len, te_doc_len, 1., 1., te_y_position, te_y_cause]
loss, pred_y_cause, true_y_cause, pred_y_pos, true_y_pos, doc_len_batch = sess.run(
[loss_op, pred_y_cause_op, true_y_cause_op, pred_y_pos_op, true_y_pos_op, doc_len],
feed_dict=dict(zip(placeholders, test)))
print('\nepoch {}: test loss {:.4f} cost time: {:.1f}s\n'.format(i, loss, time.time() - start_time))
# Cause task: metrics of this epoch, plus the best-F1 epoch seen so far.
acc, p, r, f1 = acc_prf(pred_y_cause, true_y_cause, doc_len_batch)
result_avg_cause = [acc, p, r, f1]
if f1 > max_f1_cause:
max_acc_cause, max_p_cause, max_r_cause, max_f1_cause = acc, p, r, f1
print('cause_predict: test acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
print('max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(max_acc_cause, max_p_cause,
max_r_cause, max_f1_cause))
# Emotion ("pos") task: metrics of this epoch, plus the best-F1 epoch seen so far.
acc, p, r, f1 = acc_prf(pred_y_pos, true_y_pos, doc_len_batch)
result_avg_pos = [acc, p, r, f1]
if f1 > max_f1_pos:
max_acc_pos, max_p_pos, max_r_pos, max_f1_pos = acc, p, r, f1
print('position_predict: test acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
print(
'max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(max_acc_pos, max_p_pos, max_r_pos,
max_f1_pos))
# Model selection: when the mean of the two F1 scores improves, remember this
# epoch's metrics and its test predictions.
if (result_avg_cause[-1] + result_avg_pos[-1]) / 2. > max_f1_avg:
max_f1_avg = (result_avg_cause[-1] + result_avg_pos[-1]) / 2.
result_avg_cause_max = result_avg_cause
result_avg_pos_max = result_avg_pos
te_pred_y_cause, te_pred_y_pos = pred_y_cause, pred_y_pos
# Re-predict the training split in batches of 200 with dropout disabled, so
# train predictions can be written out as well.
# NOTE(review): indentation was lost in this listing; this re-prediction is
# presumably nested under the `if` above -- confirm against the original script.
tr_pred_y_cause, tr_pred_y_pos = [], []
for train, _ in get_batch_data(tr_x, tr_sen_len, tr_doc_len, 1., 1., tr_y_position, tr_y_cause, 200,
test=True):
pred_y_cause, pred_y_pos = sess.run([pred_y_cause_op, pred_y_pos_op],
feed_dict=dict(zip(placeholders, train)))
tr_pred_y_cause.extend(list(pred_y_cause))
tr_pred_y_pos.extend(list(pred_y_pos))
# Metrics of the epoch with the best mean F1 so far.
print('Average max cause: max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}'.format(
result_avg_cause_max[0], result_avg_cause_max[1], result_avg_cause_max[2], result_avg_cause_max[3]))
print('Average max pos: max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(
result_avg_pos_max[0], result_avg_pos_max[1], result_avg_pos_max[2], result_avg_pos_max[3]))
Step 4. 結果回寫與列印
def get_pair_data(file_name, doc_id, doc_len, y_pairs, pred_y_cause, pred_y_pos, x, sen_len, word_idx_rev):
    """Write documents with their per-clause predictions to a text file.

    For every document the output contains:

    * ``"<doc_id> <doc_len>"``;
    * the ``str()`` of its labelled pair list;
    * one line per clause:
      ``"<clause_no>, <pred_pos>, <pred_cause>, <word> <word> ... "``
      (every word is followed by a single space).

    Args:
        file_name: output path; the file is overwritten (UTF-8).
        doc_id: sequence of document id strings.
        doc_len: number of clauses of each document.
        y_pairs: labelled pairs of each document.
        pred_y_cause: predicted cause label, indexed [document][clause].
        pred_y_pos: predicted emotion label, indexed [document][clause].
        x: word ids, indexed [document][clause][word].
        sen_len: number of words of each clause, indexed [document][clause].
        word_idx_rev: mapping from word id back to the word string.

    Raises:
        KeyError: if a word id within a clause is missing from ``word_idx_rev``.
    """
    # The context manager flushes and closes the file; the handle was previously
    # left open, so buffered output could be lost.
    with open(file_name, 'w', encoding='utf-8') as g:
        for i in range(len(doc_id)):
            g.write(doc_id[i] + ' ' + str(doc_len[i]) + '\n')
            g.write(str(y_pairs[i]) + '\n')
            for j in range(doc_len[i]):
                # Only the first sen_len[i][j] ids are real words; the rest is padding.
                clause = ''.join(word_idx_rev[x[i][j][k]] + ' ' for k in range(sen_len[i][j]))
                g.write(str(j + 1) + ', ' + str(pred_y_pos[i][j]) + ', ' + str(
                    pred_y_cause[i][j]) + ', ' + clause + '\n')
    print(f"write {file_name} done")
# Write the remembered test and train predictions of this fold under save_dir,
# reusing the fold's input file names.
get_pair_data(save_dir + test_file_name, te_doc_id, te_doc_len, te_y_pairs, te_pred_y_cause, te_pred_y_pos,
te_x, te_sen_len, word_idx_rev)
get_pair_data(save_dir + train_file_name, tr_doc_id, tr_doc_len, tr_y_pairs, tr_pred_y_cause, tr_pred_y_pos,
tr_x, tr_sen_len, word_idx_rev)
print('Optimization Finished!\n')
print('############# fold {} end ###############'.format(fold))
# fold += 1
# Record this fold's best-epoch metrics (acc, p, r, f1) for the cause task ...
acc_cause_list.append(result_avg_cause_max[0])
p_cause_list.append(result_avg_cause_max[1])
r_cause_list.append(result_avg_cause_max[2])
f1_cause_list.append(result_avg_cause_max[3])
# ... and for the emotion ("pos") task.
acc_pos_list.append(result_avg_pos_max[0])
p_pos_list.append(result_avg_pos_max[1])
r_pos_list.append(result_avg_pos_max[2])
f1_pos_list.append(result_avg_pos_max[3])
相關文章
- new筆記筆記
- Task01 筆記筆記
- Task01&Task02學習筆記筆記
- Task1&Task2學習筆記筆記
- celery筆記三之task和task的呼叫筆記
- Intent.FLAG_ACTIVITY_NEW_TASKIntent
- 《REBEL Relation Extraction By End-to-end Language generation》閱讀筆記筆記
- 【論文閱讀筆記】An Improved Neural Baseline for Temporal Relation Extraction筆記
- 【筆記】【Android】Activity的Task模式筆記Android模式
- 【筆記】【THM】Malware Analysis(惡意軟體分析)筆記
- Task.Run(), Task.Factory.StartNew() 和 New Task() 的行為不一致分析
- LearnVIORB程式碼框架筆記ORB框架筆記
- IP Adapter程式碼筆記APT筆記
- [SQL] Datawhale 學習筆記 Task04SQL筆記
- 人造情感(emotion)
- 分子AI預測賽Task1筆記AI筆記
- 大模型技術方向Task1筆記大模型筆記
- Python學習筆記—程式碼Python筆記
- pytorch程式碼示例筆記 -- AutogradPyTorch筆記
- pairAI
- 20240505記錄《程式碼隨想錄》筆記筆記
- shell指令碼程式設計筆記指令碼程式設計筆記
- 讀書筆記-乾淨程式碼筆記
- 程式碼大全2閱讀筆記筆記
- [例項分割]Condinst程式碼筆記筆記
- WWDC18 What’s New in LLVM 個人筆記LVM筆記
- 【論文閱讀筆記】Aspect-based sentiment analysis with alternating coattention networks筆記
- 大資料分析筆記 (7) - 時間序列分析(Time Series Analysis)大資料筆記
- celery筆記九之task執行結果檢視筆記
- C++程式碼閱讀筆記(一)筆記
- Laravel 原始碼筆記 應用程式 ApplicationLaravel原始碼筆記APP
- 《程式碼大全》閱讀筆記1(2024.10.4)筆記
- 05夢斷程式碼閱讀筆記筆記
- 03夢斷程式碼閱讀筆記筆記
- 04夢斷程式碼閱讀筆記筆記
- 《夢斷程式碼》讀書筆記(二)筆記
- 夢斷程式碼讀書筆記(一)筆記
- 《程式碼大全2》閱讀筆記01筆記