Emotion-Cause Pair Extraction: A New Task to Emotion Analysis in Texts (Code Notes)

Posted by J_Xiong0117 on 2020-12-27

1. Loading the corpus and pre-trained word embeddings (load_w2v)

Step 1. Load the training corpus (clause_keywords.csv) and collect the Chinese words appearing in its emotion and clause columns into words

    words = []
    inputFile1 = open(train_file_path, 'r', encoding='utf-8')
    for line in inputFile1.readlines():
        line = line.strip().split(',')
        emotion, clause = line[2], line[-1]  # column 3: emotion word; last column: clause text
        words.extend([emotion] + clause.split())

(screenshot of clause_keywords.csv omitted)
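
The screenshot is no longer available, so here is a hypothetical row in the layout the parsing code above assumes: the third comma-separated field is the emotion word and the last field is the space-tokenized clause. The row content is made up purely for illustration.

    # Hypothetical clause_keywords.csv row (values made up for illustration);
    # only fields[2] (emotion word) and fields[-1] (clause text) are used above.
    sample_line = '1,3,高兴,happiness,今天 失而复得 我 很 高兴'
    fields = sample_line.strip().split(',')
    emotion, clause = fields[2], fields[-1]
    print(emotion)         # 高兴
    print(clause.split())  # ['今天', '失而复得', '我', '很', '高兴']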

Step 2. Deduplicate words and build the word-to-id (word_idx) and id-to-word (word_idx_rev) dictionaries

    words = set(words)  # set of all unique words
    word_idx = dict((c, k + 1) for k, c in enumerate(words))  # word -> id (ids start at 1)
    word_idx_rev = dict((k + 1, c) for k, c in enumerate(words))  # id -> word
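
A quick toy run of the two comprehensions shows that ids start at 1; id 0 is deliberately left free for the all-zero padding row that Step 4 prepends to the embedding table. The three-word vocabulary here is just an example.

    words = set(['我', '很', '高兴'])
    word_idx = dict((c, k + 1) for k, c in enumerate(words))
    word_idx_rev = dict((k + 1, c) for k, c in enumerate(words))
    print(word_idx)      # e.g. {'很': 1, '高兴': 2, '我': 3} (set order is arbitrary)
    print(word_idx_rev)  # e.g. {1: '很', 2: '高兴', 3: '我'}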

(screenshots of word_idx and word_idx_rev omitted)

Step 3. Load the word2vec file (w2v_200.txt) and build the word-to-vector mapping dictionary w2v

    w2v = {}
    inputFile2 = open(embedding_path, 'r', encoding='utf-8')
    inputFile2.readline()  # skip the first (header) line
    for line in inputFile2.readlines():
        line = line.strip().split(' ')
        w, ebd = line[0], line[1:]
        w2v[w] = ebd

(screenshot of w2v_200.txt omitted)

(screenshot of the w2v dictionary omitted)

Step 4. Convert the deduplicated words into word vectors. Iterate over words: if a word is in w2v, take its pre-trained vector; otherwise draw a random embedding_dim-dimensional (here 200-dimensional) vector from the uniform distribution [-0.1, 0.1).

    embedding = [list(np.zeros(embedding_dim))]  # row 0: all-zero vector reserved for padding (id 0)
    hit = 0
    for item in words:
        if item in w2v:
            vec = list(map(float, w2v[item]))
            hit += 1
        else:
            vec = list(np.random.rand(embedding_dim) / 5. - 0.1)  # uniform random in [-0.1, 0.1)
        embedding.append(vec)
    print('w2v_file: {}\nall_words: {} hit_words: {}'.format(embedding_path, len(words), hit))
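
As a quick sanity check on the comment above: np.random.rand draws from [0, 1), so dividing by 5 and subtracting 0.1 yields values in [-0.1, 0.1).

    import numpy as np

    sample = np.random.rand(100000) / 5. - 0.1
    print(sample.min(), sample.max())  # both lie within [-0.1, 0.1)
    assert sample.min() >= -0.1 and sample.max() < 0.1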

(screenshot of embedding omitted)

Step 5. Initialize the position embeddings

    embedding_pos = [list(np.zeros(embedding_dim_pos))]  # row 0: all-zero padding row
    # 200 relative-position rows, drawn from a normal distribution (mean 0, std 0.1)
    embedding_pos.extend([list(np.random.normal(loc=0.0, scale=0.1, size=embedding_dim_pos)) for i in range(200)])

    embedding, embedding_pos = np.array(embedding), np.array(embedding_pos)

    print("embedding.shape: {} embedding_pos.shape: {}".format(embedding.shape, embedding_pos.shape))
    print("load embedding done!\n")


2. Building the model (build_model)

Step 1. Map the input x to word embeddings to obtain inputs

    x = tf.nn.embedding_lookup(word_embedding, x) ## (?,75,30,200)
    inputs = tf.reshape(x, [-1, FLAGS.max_sen_len, FLAGS.embedding_dim])  ## (?,30,200)
    inputs = tf.nn.dropout(inputs, keep_prob=keep_prob1)

Step 2. Encode inputs with a BiLSTM and compute word-level attention

with tf.name_scope('word_encode'):
    inputs = RNN(inputs, sen_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'word_layer' + name)
with tf.name_scope('word_attention'):
    sh2 = 2 * FLAGS.n_hidden
    w1 = get_weight_varible('word_att_w1' + name, [sh2, sh2])
    b1 = get_weight_varible('word_att_b1' + name, [sh2])
    w2 = get_weight_varible('word_att_w2' + name, [sh2, 1])
    s = att_var(inputs, sen_len, w1, b1, w2)
s = tf.reshape(s, [-1, FLAGS.max_doc_len, 2 * FLAGS.n_hidden])

def biLSTM(inputs, length, n_hidden, scope):
    ''' 
    input shape:[batch_size, max_len, embedding_dim]
    length shape:[batch_size]
    return shape:[batch_size, max_len, n_hidden*2]
    '''
    outputs, state = tf.nn.bidirectional_dynamic_rnn(
        cell_fw=tf.contrib.rnn.LSTMCell(n_hidden),
        cell_bw=tf.contrib.rnn.LSTMCell(n_hidden),
        inputs=inputs,
        sequence_length=length,
        dtype=tf.float32,
        scope=scope
    )

    return tf.concat(outputs, 2)

def att_var(inputs, length, w1, b1, w2):
    ''' 
    input shape:[batch_size, max_len, n_hidden]
    length shape:[batch_size]
    return shape:[batch_size, n_hidden]
    '''
    max_len, n_hidden = (tf.shape(inputs)[1], tf.shape(inputs)[2])
    tmp = tf.reshape(inputs, [-1, n_hidden])
    u = tf.tanh(tf.matmul(tmp, w1) + b1)
    alpha = tf.reshape(tf.matmul(u, w2), [-1, 1, max_len])
    alpha = softmax_by_length(alpha, length)
    return tf.reshape(tf.matmul(alpha, inputs), [-1, n_hidden])
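
Two helpers used above, get_weight_varible and softmax_by_length, are defined elsewhere in the repository and are not shown in this note; RNN in the calls above is presumably the biLSTM function defined here. The sketch below is a minimal reconstruction of what the call sites imply, namely a trainable-variable factory and a softmax over the word dimension masked by the real sentence length; it is not the authors' verbatim code.

    def get_weight_varible(name, shape):
        # A trainable parameter with the given name and shape; the original code
        # may use a different initializer.
        return tf.get_variable(name, shape=shape,
                               initializer=tf.random_uniform_initializer(-0.01, 0.01))

    def softmax_by_length(inputs, length):
        '''
        inputs shape:[batch_size, 1, max_len]   (unnormalized attention scores)
        length shape:[batch_size]               (number of valid words per sentence)
        return shape:[batch_size, 1, max_len]   (normalized over valid positions only)
        '''
        inputs = tf.exp(tf.cast(inputs, tf.float32))
        mask = tf.cast(tf.sequence_mask(length, maxlen=tf.shape(inputs)[2]), tf.float32)
        inputs *= tf.reshape(mask, tf.shape(inputs))
        _sum = tf.reduce_sum(inputs, axis=2, keepdims=True) + 1e-9
        return inputs / _sum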

Step 3. Pass the clause representations through another BiLSTM layer, then a softmax layer to produce the predictions


## cause prediction
s = get_s(inputs, name='cause_word_encode')
s = RNN(s, doc_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'cause_sentence_layer')
with tf.name_scope('sequence_prediction'):
    s1 = tf.reshape(s, [-1, 2 * FLAGS.n_hidden])
    s1 = tf.nn.dropout(s1, keep_prob=keep_prob2)

    w_cause = get_weight_varible('softmax_w_cause', [2 * FLAGS.n_hidden, FLAGS.n_class])
    b_cause = get_weight_varible('softmax_b_cause', [FLAGS.n_class])
    pred_cause = tf.nn.softmax(tf.matmul(s1, w_cause) + b_cause)
    pred_cause = tf.reshape(pred_cause, [-1, FLAGS.max_doc_len, FLAGS.n_class])

## emotion預測
s = get_s(inputs, name='pos_word_encode')
s = RNN(s, doc_len, n_hidden=FLAGS.n_hidden, scope=FLAGS.scope + 'pos_sentence_layer')
with tf.name_scope('sequence_prediction'):
    s1 = tf.reshape(s, [-1, 2 * FLAGS.n_hidden])
    s1 = tf.nn.dropout(s1, keep_prob=keep_prob2)

    w_pos = get_weight_varible('softmax_w_pos', [2 * FLAGS.n_hidden, FLAGS.n_class])
    b_pos = get_weight_varible('softmax_b_pos', [FLAGS.n_class])
    pred_pos = tf.nn.softmax(tf.matmul(s1, w_pos) + b_pos)
    pred_pos = tf.reshape(pred_pos, [-1, FLAGS.max_doc_len, FLAGS.n_class])

Step 4. Compute the L2 regularization loss over the emotion and cause softmax parameters

reg = tf.nn.l2_loss(w_cause) + tf.nn.l2_loss(b_cause)
reg += tf.nn.l2_loss(w_pos) + tf.nn.l2_loss(b_pos)

Step 5. Compute the total loss (emotion + cause + L2) and create the optimizer

pred_pos, pred_cause, reg = build_model(word_embedding, x, sen_len, doc_len, keep_prob1, keep_prob2, y_position, y_cause)
valid_num = tf.cast(tf.reduce_sum(doc_len), dtype=tf.float32)
loss_pos = - tf.reduce_sum(y_position * tf.log(pred_pos)) / valid_num  ## emotion loss
loss_cause = - tf.reduce_sum(y_cause * tf.log(pred_cause)) / valid_num  ## cause loss
loss_op = loss_cause * FLAGS.cause + loss_pos * FLAGS.pos + reg * FLAGS.l2_reg  ## total loss
optimizer = tf.train.AdamOptimizer(learning_rate=FLAGS.learning_rate).minimize(loss_op)  ## optimizer

Step 6. Parse the prediction results

true_y_cause_op = tf.argmax(y_cause, 2)  ## cause ground truth
pred_y_cause_op = tf.argmax(pred_cause, 2)  ## cause predictions
true_y_pos_op = tf.argmax(y_position, 2)  ## emotion ground truth
pred_y_pos_op = tf.argmax(pred_pos, 2)  ## emotion predictions

3. Model training and evaluation (10 folds, 15 epochs per fold)

Step 1. Load the training and test sets

train_file_name = 'fold{}_train.txt'.format(fold)
test_file_name = 'fold{}_test.txt'.format(fold)
tr_doc_id, tr_y_position, tr_y_cause, tr_y_pairs, tr_x, tr_sen_len, tr_doc_len = load_data(
    'data_combine/' + train_file_name, word_id_mapping, FLAGS.max_doc_len, FLAGS.max_sen_len)
te_doc_id, te_y_position, te_y_cause, te_y_pairs, te_x, te_sen_len, te_doc_len = load_data(
    'data_combine/' + test_file_name, word_id_mapping, FLAGS.max_doc_len, FLAGS.max_sen_len)

def load_data(input_file, word_idx, max_doc_len=75, max_sen_len=45):
    print('load data_file: {}'.format(input_file))
    y_position, y_cause, y_pairs, x, sen_len, doc_len = [], [], [], [], [], []
    doc_id = []

    n_cut = 0
    inputFile = open(input_file, 'r', encoding='utf-8')
    while True:
        line = inputFile.readline()
        if line == '': break
        line = line.strip().split()
        doc_id.append(line[0])
        d_len = int(line[1])
        pairs = eval('[' + inputFile.readline().strip() + ']')
        doc_len.append(d_len)
        y_pairs.append(pairs)
        pos, cause = zip(*pairs)
        y_po, y_ca = np.zeros((max_doc_len, 2)), np.zeros((max_doc_len, 2))
        sen_len_tmp = np.zeros(max_doc_len, dtype=np.int32)
        x_tmp = np.zeros((max_doc_len, max_sen_len), dtype=np.int32)
        for i in range(d_len):
            y_po[i][int(i + 1 in pos)] = 1
            y_ca[i][int(i + 1 in cause)] = 1
            words = inputFile.readline().strip().split(',')[-1]
            sen_len_tmp[i] = min(len(words.split()), max_sen_len)
            for j, word in enumerate(words.split()):
                if j >= max_sen_len:
                    n_cut += 1
                    break
                x_tmp[i][j] = int(word_idx[word])

        y_position.append(y_po)
        y_cause.append(y_ca)
        x.append(x_tmp)
        sen_len.append(sen_len_tmp)

    y_position, y_cause, x, sen_len, doc_len = map(np.array, [y_position, y_cause, x, sen_len, doc_len])
    for var in ['y_position', 'y_cause', 'x', 'sen_len', 'doc_len']:
        print('{}.shape {}'.format(var, eval(var).shape))
    print('n_cut {}'.format(n_cut))
    print('load data done!\n')
    return doc_id, y_position, y_cause, y_pairs, x, sen_len, doc_len
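
The fold files themselves are not shown. From the reading logic above, each document is stored as a header line "doc_id doc_len", then a line with the emotion-cause pair list, then doc_len clause lines whose last comma-separated field is the space-tokenized clause text (only that last field is read here). A hypothetical two-clause document in that layout, with all contents made up for illustration:

    # Hypothetical fold{k}_train.txt fragment in the layout load_data expects
    # (ids, labels and clause text are made up for illustration):
    #
    #   42 2
    #   (2,1)
    #   1,null,null,我 昨天 丢 了 钱包
    #   2,happiness,高兴,今天 失而复得 我 很 高兴
    #
    # Header: doc_id=42, doc_len=2; pair line: emotion in clause 2, cause in clause 1;
    # each clause line's last comma-separated field is the clause text read above.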

Step 2. Train the model

# train
for train, _ in get_batch_data(tr_x, tr_sen_len, tr_doc_len, FLAGS.keep_prob1, FLAGS.keep_prob2,
                                tr_y_position, tr_y_cause, FLAGS.batch_size):
    _, loss, pred_y_cause, true_y_cause, pred_y_pos, true_y_pos, doc_len_batch = sess.run(
        [optimizer, loss_op, pred_y_cause_op, true_y_cause_op, pred_y_pos_op, true_y_pos_op, doc_len],
        feed_dict=dict(zip(placeholders, train)))
    if step % 10 == 0:
        print('step {}: train loss {:.4f} '.format(step, loss))
        acc, p, r, f1 = acc_prf(pred_y_cause, true_y_cause, doc_len_batch)
        print('cause_predict: train acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
        acc, p, r, f1 = acc_prf(pred_y_pos, true_y_pos, doc_len_batch)
        print('position_predict: train acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
    step = step + 1
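
get_batch_data and acc_prf are also defined elsewhere in the repository and are not shown here. The sketch below reconstructs what the call sites imply: a batching generator that yields feed values in the same order as the placeholders list, and accuracy/precision/recall/F1 computed only over the valid clauses of each document. It is an approximation under those assumptions, not the authors' verbatim code.

    import numpy as np
    from sklearn.metrics import precision_score, recall_score, f1_score

    def batch_index(length, batch_size, test=False):
        # mini-batch indices; shuffled during training, kept in order at test time
        index = np.arange(length)
        if not test:
            np.random.shuffle(index)
        for i in range((length + batch_size - 1) // batch_size):
            yield index[i * batch_size: (i + 1) * batch_size]

    def get_batch_data(x, sen_len, doc_len, keep_prob1, keep_prob2,
                       y_position, y_cause, batch_size, test=False):
        # yields feed values in the same order as the placeholders list
        for index in batch_index(len(y_cause), batch_size, test):
            feed_list = [x[index], sen_len[index], doc_len[index],
                         keep_prob1, keep_prob2, y_position[index], y_cause[index]]
            yield feed_list, len(index)

    def acc_prf(pred_y, true_y, doc_len, average='binary'):
        # flatten predictions/labels over the valid clauses of each document, then score
        pred, true = [], []
        for i in range(pred_y.shape[0]):
            for j in range(doc_len[i]):
                pred.append(pred_y[i][j])
                true.append(true_y[i][j])
        pred, true = np.array(pred), np.array(true)
        acc = (pred == true).mean()
        p = precision_score(true, pred, average=average)
        r = recall_score(true, pred, average=average)
        f1 = f1_score(true, pred, average=average)
        return acc, p, r, f1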

Step 3. Evaluate the model on the test set

# test
test = [te_x, te_sen_len, te_doc_len, 1., 1., te_y_position, te_y_cause]
loss, pred_y_cause, true_y_cause, pred_y_pos, true_y_pos, doc_len_batch = sess.run(
    [loss_op, pred_y_cause_op, true_y_cause_op, pred_y_pos_op, true_y_pos_op, doc_len],
    feed_dict=dict(zip(placeholders, test)))
print('\nepoch {}: test loss {:.4f} cost time: {:.1f}s\n'.format(i, loss, time.time() - start_time))

acc, p, r, f1 = acc_prf(pred_y_cause, true_y_cause, doc_len_batch)
result_avg_cause = [acc, p, r, f1]
if f1 > max_f1_cause:
    max_acc_cause, max_p_cause, max_r_cause, max_f1_cause = acc, p, r, f1
print('cause_predict: test acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
print('max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(max_acc_cause, max_p_cause,
                                                                        max_r_cause, max_f1_cause))

acc, p, r, f1 = acc_prf(pred_y_pos, true_y_pos, doc_len_batch)
result_avg_pos = [acc, p, r, f1]
if f1 > max_f1_pos:
    max_acc_pos, max_p_pos, max_r_pos, max_f1_pos = acc, p, r, f1
print('position_predict: test acc {:.4f} p {:.4f} r {:.4f} f1 {:.4f}'.format(acc, p, r, f1))
print(
    'max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(max_acc_pos, max_p_pos, max_r_pos,
                                                                        max_f1_pos))

if (result_avg_cause[-1] + result_avg_pos[-1]) / 2. > max_f1_avg:
    max_f1_avg = (result_avg_cause[-1] + result_avg_pos[-1]) / 2.
    result_avg_cause_max = result_avg_cause
    result_avg_pos_max = result_avg_pos

    te_pred_y_cause, te_pred_y_pos = pred_y_cause, pred_y_pos
    tr_pred_y_cause, tr_pred_y_pos = [], []
    for train, _ in get_batch_data(tr_x, tr_sen_len, tr_doc_len, 1., 1., tr_y_position, tr_y_cause, 200,
                                    test=True):
        pred_y_cause, pred_y_pos = sess.run([pred_y_cause_op, pred_y_pos_op],
                                            feed_dict=dict(zip(placeholders, train)))
        tr_pred_y_cause.extend(list(pred_y_cause))
        tr_pred_y_pos.extend(list(pred_y_pos))
print('Average max cause: max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}'.format(
    result_avg_cause_max[0], result_avg_cause_max[1], result_avg_cause_max[2], result_avg_cause_max[3]))
print('Average max pos: max_acc {:.4f} max_p {:.4f} max_r {:.4f} max_f1 {:.4f}\n'.format(
    result_avg_pos_max[0], result_avg_pos_max[1], result_avg_pos_max[2], result_avg_pos_max[3]))

Step 4. Write back and print the results

def get_pair_data(file_name, doc_id, doc_len, y_pairs, pred_y_cause, pred_y_pos, x, sen_len, word_idx_rev):
    g = open(file_name, 'w', encoding='utf-8')
    for i in range(len(doc_id)):
        g.write(doc_id[i] + ' ' + str(doc_len[i]) + '\n')
        g.write(str(y_pairs[i]) + '\n')
        for j in range(doc_len[i]):
            clause = ''
            for k in range(sen_len[i][j]):
                clause = clause + word_idx_rev[x[i][j][k]] + ' '
            g.write(str(j + 1) + ', ' + str(pred_y_pos[i][j]) + ', ' + str(
                pred_y_cause[i][j]) + ', ' + clause + '\n')
    print(f"write {file_name} done")

get_pair_data(save_dir + test_file_name, te_doc_id, te_doc_len, te_y_pairs, te_pred_y_cause, te_pred_y_pos,
                te_x, te_sen_len, word_idx_rev)
get_pair_data(save_dir + train_file_name, tr_doc_id, tr_doc_len, tr_y_pairs, tr_pred_y_cause, tr_pred_y_pos,
                tr_x, tr_sen_len, word_idx_rev)

print('Optimization Finished!\n')
print('############# fold {} end ###############'.format(fold))
# fold += 1
acc_cause_list.append(result_avg_cause_max[0])
p_cause_list.append(result_avg_cause_max[1])
r_cause_list.append(result_avg_cause_max[2])
f1_cause_list.append(result_avg_cause_max[3])
acc_pos_list.append(result_avg_pos_max[0])
p_pos_list.append(result_avg_pos_max[1])
r_pos_list.append(result_avg_pos_max[2])
f1_pos_list.append(result_avg_pos_max[3])
