import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import random

import numpy as np
import pandas as pd
import tensorflow as tf
from random import shuffle
from sklearn.metrics import confusion_matrix, f1_score
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
from transformers import RobertaTokenizer, TFRobertaModel

# Set the Python and NumPy random seeds
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)

# Set TensorFlow's global random seed and request deterministic ops
tf.random.set_seed(seed_value)
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Load the pretrained BERT model and tokenizer
bert_model_name = './bert'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = TFBertModel.from_pretrained(bert_model_name)

# Alternatively, use a RoBERTa checkpoint:
# bert_model_name = './robert'
# tokenizer = RobertaTokenizer.from_pretrained(bert_model_name)
# bert_model = TFRobertaModel.from_pretrained(bert_model_name)


# Print detailed per-class metrics (recall, precision, F1)
def action_recall_accuracy(y_pred, y_true):
    cm = confusion_matrix(y_true, y_pred)
    num_classes = cm.shape[0]

    recall = []
    precision = []
    for i in range(num_classes):
        # Recall: correctly predicted samples / samples actually in this class (row sum)
        recall.append(cm[i, i] / cm[i, :].sum())
        # Precision: correctly predicted samples / samples predicted as this class (column sum)
        precision.append(cm[i, i] / cm[:, i].sum())

    for i in range(num_classes):
        print(f"Class {i} recall: {recall[i]:.3f}")
        print(f"Class {i} precision: {precision[i]:.3f}")

    # Per-class F1 scores, computed once instead of once per loop iteration
    scores = f1_score(y_true, y_pred, average=None)
    for i in range(num_classes):
        print(f"Class {i} F1 score: {scores[i]:.3f}")

    # Macro-average F1 across classes
    average_f1 = sum(scores) / len(scores)
    print(f"Macro-average F1: {average_f1:.3f}")


# Encode a (query, title) pair into model inputs
def encode_texts(query, title, tokenizer, max_length=128):
    encoded_dict = tokenizer.encode_plus(
        query,
        title,
        add_special_tokens=True,  # add [CLS], [SEP], etc.
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='tf'  # return TensorFlow tensors
    )
    return encoded_dict['input_ids'], encoded_dict['attention_mask']


# Build the classification model on top of BERT
def build_model(bert_model):
    input_ids = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(128,), dtype=tf.int32, name='attention_mask')

    bert_output = bert_model(input_ids, attention_mask=attention_mask)
    cls_output = bert_output.last_hidden_state[:, 0, :]  # take the [CLS] vector

    dense = tf.keras.layers.Dense(256, activation='relu')(cls_output)
    dropout = tf.keras.layers.Dropout(0.5)(dense)
    dense2 = tf.keras.layers.Dense(32, activation='relu')(dropout)
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense2)  # sigmoid for binary classification

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    return model


# Read a CSV dataset, shuffle it, and encode it into tensors
def load_dataset(file_path, tokenizer, max_length=128):
    queries = []
    titles = []
    labels = []

    data = pd.read_csv(file_path)
    all_data = []
    for query, title, label in zip(data['query'].tolist(), data['title'].tolist(), data['label'].tolist()):
        all_data.append([query, title, int(label)])
    shuffle(all_data)

    for query, title, label in all_data:
        queries.append(query)
        titles.append(title)
        labels.append(label)

    input_ids_list = []
    attention_mask_list = []
    for query, title in zip(queries, titles):
        input_ids, attention_mask = encode_texts(query, title, tokenizer, max_length)
        input_ids_list.append(input_ids)
        attention_mask_list.append(attention_mask)

    input_ids = tf.concat(input_ids_list, axis=0)
    attention_masks = tf.concat(attention_mask_list, axis=0)
    labels = tf.convert_to_tensor(labels)
    return {'input_ids': input_ids, 'attention_mask': attention_masks}, labels


# Load the training and test data
train_data, train_labels = load_dataset('train.csv', tokenizer)
test_data, test_labels = load_dataset('test.csv', tokenizer)

# Convert TensorFlow tensors to numpy arrays for sklearn's splitter
train_input_ids_np = train_data['input_ids'].numpy()
train_attention_masks_np = train_data['attention_mask'].numpy()
train_labels_np = train_labels.numpy()

# Split the training data further into training and validation sets
# (the rows were already shuffled inside load_dataset, so shuffle=False is safe)
train_input_ids, val_input_ids, train_attention_masks, val_attention_masks, train_labels, val_labels = train_test_split(
    train_input_ids_np, train_attention_masks_np, train_labels_np,
    test_size=0.05, random_state=42, shuffle=False)

# Convert the numpy arrays back to TensorFlow tensors
train_inputs = {'input_ids': tf.convert_to_tensor(train_input_ids),
                'attention_mask': tf.convert_to_tensor(train_attention_masks)}
val_inputs = {'input_ids': tf.convert_to_tensor(val_input_ids),
              'attention_mask': tf.convert_to_tensor(val_attention_masks)}
train_labels = tf.convert_to_tensor(train_labels)
val_labels = tf.convert_to_tensor(val_labels)

# Instantiate the model
model = build_model(bert_model)
model.summary()

# Class weights that favor precision: up-weighting the negative class
# penalizes false positives, at the cost of recall on the positive class
neg_weight = 10.0
pos_weight = 1.0
class_weight = {0: neg_weight, 1: pos_weight}

# Train the model one epoch at a time so it can be evaluated after each epoch
epochs = 3
batch_size = 32

# Use the labels returned by load_dataset: they match the shuffled ordering of
# test_data (re-reading test.csv would give labels in the original CSV order)
true_labels = list(test_labels.numpy())

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    history = model.fit(
        x={'input_ids': train_inputs['input_ids'], 'attention_mask': train_inputs['attention_mask']},
        y=train_labels,
        validation_data=(
            {'input_ids': val_inputs['input_ids'], 'attention_mask': val_inputs['attention_mask']},
            val_labels
        ),
        epochs=1,  # one epoch per outer-loop iteration
        batch_size=batch_size,
        shuffle=True,
        class_weight=class_weight  # apply the class weights above
    )

    # Evaluate on the test set
    loss, accuracy, auc = model.evaluate(test_data, test_labels)
    print(f"Test loss: {loss}, Test accuracy: {accuracy}, Test AUC: {auc}")

    # Decision threshold; raising it above 0.5 further reduces false positives
    threshold = 0.5

    # Compute per-class metrics from the thresholded predictions
    predictions = model.predict(test_data)
    pred_labels = [int(p > threshold) for p in predictions[:, 0]]
    action_recall_accuracy(pred_labels, true_labels)

    # Dump raw predictions after the final epoch
    if epoch == epochs - 1:
        with open("pred_rs.txt", "w", encoding="utf-8") as out:
            for label, pred in zip(true_labels, predictions[:, 0]):
                out.write("{}\t{}\n".format(label, pred))
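
# --- Optional single-pair inference sketch: an addition, not part of the
# original training pipeline; the example strings below are placeholders. ---
# Scores one (query, title) pair with the trained model, reusing encode_texts
# and the same style of decision threshold as the evaluation loop above.
def predict_pair(query, title, threshold=0.5):
    input_ids, attention_mask = encode_texts(query, title, tokenizer)
    prob = float(model.predict({'input_ids': input_ids,
                                'attention_mask': attention_mask})[0, 0])
    return prob, int(prob > threshold)

# Example usage:
# prob, label = predict_pair("example query", "example title")
# print(f"probability={prob:.3f}, predicted label={label}")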