BERT 分類的程式碼

15375357604 發表於 2024-06-21

原文網址 : https://www.cnblogs.com/qiaoqifa/p/18260563

import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import tensorflow as tf
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, TFBertModel
from transformers import RobertaTokenizer, TFRobertaModel
import pandas as pd
from random import shuffle
from sklearn.metrics import confusion_matrix, f1_score
import numpy as np
import random


# Seed NumPy's and Python's random number generators with the same value.
seed_value = 42
np.random.seed(seed_value)
random.seed(seed_value)
# Seed TensorFlow's global random generator as well.
tf.random.set_seed(seed_value)
# NOTE(review): presumably requests deterministic TensorFlow ops; this is set after
# `import tensorflow` has already run - confirm it still takes effect.
os.environ['TF_DETERMINISTIC_OPS'] = '1'

# Load the pretrained BERT model and its tokenizer from the local './bert' directory.
bert_model_name = './bert'
tokenizer = BertTokenizer.from_pretrained(bert_model_name)
bert_model = TFBertModel.from_pretrained(bert_model_name)
# Alternative (disabled): load a RoBERTa checkpoint from './robert' instead.
# bert_model_name = './robert'
# tokenizer = RobertaTokenizer.from_pretrained(bert_model_name)
# bert_model = TFRobertaModel.from_pretrained(bert_model_name)


# 計算詳細指標
def action_recall_accuracy(y_pred, y_true):
    """Print per-class precision, recall and F1, plus the unweighted mean F1.

    Args:
        y_pred: Predicted class labels, one per sample.
        y_true: Ground-truth class labels, aligned element-wise with ``y_pred``.

    Returns:
        None. Every metric is written to stdout.
    """
    # confusion_matrix(y_true, y_pred): rows index the true class,
    # columns index the predicted class.
    cm = confusion_matrix(y_true, y_pred)
    num_classes = cm.shape[0]

    def _ratio(hits, total):
        # A class that is never predicted / never present scores 0 instead of NaN.
        return hits / total if total else 0.0

    # Precision (printed as "準確率"): correct predictions / samples predicted as
    # the class, i.e. the diagonal divided by the COLUMN sum.
    accuracy = [_ratio(cm[i, i], cm[:, i].sum()) for i in range(num_classes)]
    # Recall (printed as "召回率"): correct predictions / samples that truly belong
    # to the class, i.e. the diagonal divided by the ROW sum.
    recall = [_ratio(cm[i, i], cm[i, :].sum()) for i in range(num_classes)]

    for i, (prec, rec) in enumerate(zip(accuracy, recall)):
        print(f"類別 {i} 的準確率: {prec:.3f}")
        print(f"類別 {i} 的召回率: {rec:.3f}")

    # One call yields the F1 of every class; no need to recompute it per class.
    scores = list(f1_score(y_true, y_pred, average=None))
    for i, f1 in enumerate(scores):
        print(f"類別 {i} 的F1分數: {f1:.3f}")

    # Unweighted mean of the per-class F1 scores.
    average_f1 = sum(scores) / len(scores)
    print(f"各類別F1-score的平均值: {average_f1:.3f}")


# 定義輸入處理函式
def encode_texts(query, title, tokenizer, max_length=128):
    """Tokenize one (query, title) pair via ``tokenizer.encode_plus``.

    The pair is encoded with special tokens added, padded to ``max_length``,
    truncated when too long, and TensorFlow tensors are requested.

    Args:
        query: First text of the pair.
        title: Second text of the pair.
        tokenizer: Object exposing an ``encode_plus`` method.
        max_length: Length to pad / truncate the encoded pair to.

    Returns:
        Tuple ``(input_ids, attention_mask)`` taken from the encoder output.
    """
    options = {
        'add_special_tokens': True,     # add [CLS], [SEP] and similar markers
        'max_length': max_length,
        'padding': 'max_length',
        'truncation': True,
        'return_attention_mask': True,
        'return_tensors': 'tf',         # ask for TensorFlow tensors
    }
    encoding = tokenizer.encode_plus(query, title, **options)
    ids = encoding['input_ids']
    mask = encoding['attention_mask']
    return ids, mask


# 構建模型
def build_model(bert_model, max_length=128):
    """Build and compile a binary classifier on top of a BERT encoder.

    The encoder's hidden state at position 0 (the [CLS] token) is fed through
    Dense(256, relu) -> Dropout(0.5) -> Dense(32, relu) -> Dense(1, sigmoid).

    Args:
        bert_model: Encoder called as ``bert_model(input_ids, attention_mask=...)``
            whose output exposes ``last_hidden_state``.
        max_length: Token sequence length of both inputs. Must equal the
            ``max_length`` used when tokenizing; defaults to 128.

    Returns:
        A compiled ``tf.keras.Model`` taking ``[input_ids, attention_mask]`` and
        producing one sigmoid output per sample.
    """
    input_ids = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='input_ids')
    attention_mask = tf.keras.layers.Input(shape=(max_length,), dtype=tf.int32, name='attention_mask')

    bert_output = bert_model(input_ids, attention_mask=attention_mask)
    cls_output = bert_output.last_hidden_state[:, 0, :]  # vector at the [CLS] position

    dense = tf.keras.layers.Dense(256, activation='relu')(cls_output)
    dropout = tf.keras.layers.Dropout(0.5)(dense)
    dense2 = tf.keras.layers.Dense(32, activation='relu')(dropout)
    # Binary classification: a single sigmoid unit.
    output = tf.keras.layers.Dense(1, activation='sigmoid')(dense2)

    model = tf.keras.Model(inputs=[input_ids, attention_mask], outputs=output)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=2e-5),
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    return model


# 讀取資料集
def load_dataset(file_path, tokenizer, max_length=128):
    """Read a CSV, shuffle its rows and tokenize every (query, title) pair.

    The CSV must provide ``query``, ``title`` and ``label`` columns. Rows are
    shuffled in place with ``random.shuffle``, so the returned order does NOT
    follow the file order.

    Args:
        file_path: Path of the CSV file to read.
        tokenizer: Tokenizer forwarded to ``encode_texts``.
        max_length: Sequence length forwarded to ``encode_texts``.

    Returns:
        Tuple ``(features, labels)``: ``features`` is a dict with the keys
        ``'input_ids'`` and ``'attention_mask'`` (per-row encodings concatenated
        along axis 0) and ``labels`` is a tensor of the integer labels in the
        same shuffled order.
    """
    frame = pd.read_csv(file_path)
    rows = [
        [query, title, int(label)]
        for query, title, label in zip(
            frame['query'].tolist(), frame['title'].tolist(), frame["label"].tolist()
        )
    ]
    # Shuffle whole rows so texts and labels stay paired.
    shuffle(rows)

    # Encode each pair individually; every result is an (input_ids, attention_mask) pair.
    encoded = [encode_texts(row[0], row[1], tokenizer, max_length) for row in rows]

    input_ids = tf.concat([pair[0] for pair in encoded], axis=0)
    attention_masks = tf.concat([pair[1] for pair in encoded], axis=0)
    labels = tf.convert_to_tensor([row[2] for row in rows])

    return {'input_ids': input_ids, 'attention_mask': attention_masks}, labels


# Load the training and test sets (load_dataset shuffles the rows of each file).
train_data, train_labels = load_dataset('train.csv', tokenizer)
test_data, test_labels = load_dataset('test.csv', tokenizer)

# Convert the TensorFlow tensors to NumPy arrays so scikit-learn can split them.
train_input_ids_np = train_data['input_ids'].numpy()
train_attention_masks_np = train_data['attention_mask'].numpy()
train_labels_np = train_labels.numpy()

# Hold out 5% of the training data as a validation set. No extra shuffling is done
# here (shuffle=False); the rows were already shuffled inside load_dataset.
train_input_ids, val_input_ids, train_attention_masks, val_attention_masks, train_labels, val_labels = train_test_split(
    train_input_ids_np, train_attention_masks_np, train_labels_np, test_size=0.05, random_state=42, shuffle=False)

# Convert the NumPy arrays back to TensorFlow tensors.
train_inputs = {'input_ids': tf.convert_to_tensor(train_input_ids), 'attention_mask': tf.convert_to_tensor(train_attention_masks)}
val_inputs = {'input_ids': tf.convert_to_tensor(val_input_ids), 'attention_mask': tf.convert_to_tensor(val_attention_masks)}
train_labels = tf.convert_to_tensor(train_labels)
val_labels = tf.convert_to_tensor(val_labels)

# Instantiate the model and print its layer summary.
model = build_model(bert_model)
model.summary()

# Class weights: errors on negative samples (label 0) weigh 10x more than errors on
# positive samples (label 1), trading recall for fewer false positives.
neg_weight = 10.0
pos_weight = 1.0
class_weight = {0: neg_weight, 1: pos_weight}

# Training configuration.
epochs = 3
batch_size = 32

# Decision threshold applied to the sigmoid output; raise it above 0.5 to further
# reduce false positives.
threshold = 0.5

# Ground truth must come from `test_labels`, not from re-reading test.csv:
# load_dataset() shuffles the rows, so labels in file order would not line up
# with the predictions made on `test_data`.
true_labels = test_labels.numpy().astype('int32').tolist()

for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    history = model.fit(
        x={'input_ids': train_inputs['input_ids'], 'attention_mask': train_inputs['attention_mask']},
        y=train_labels,
        validation_data=(
            {'input_ids': val_inputs['input_ids'], 'attention_mask': val_inputs['attention_mask']},
            val_labels
        ),
        epochs=1,  # one epoch per fit() call so the test set can be scored after each epoch
        batch_size=batch_size,
        shuffle=True,
        class_weight=class_weight
    )

    # Evaluate on the test set; the order matches the metrics passed to compile().
    loss, accuracy, auc = model.evaluate(test_data, test_labels)
    print(f"Test loss: {loss}, Test accuracy: {accuracy}, Test AUC: {auc}")

    # Per-class precision / recall / F1 at the chosen threshold.
    predictions = model.predict(test_data)
    pred_labels = [int(i > threshold) for i in predictions[:, 0]]
    action_recall_accuracy(pred_labels, true_labels)

    # Dump "label<TAB>score" for every test sample after the final epoch.
    # (The previous check `epoch == 5` could never be true with epochs = 3.)
    if epoch == epochs - 1:
        with open("pred_rs.txt", "w", encoding="utf-8") as out:
            for label, pred in zip(true_labels, predictions[:, 0]):
                out.write("{}\t{}\n".format(label, pred))

超詳細的 Bert 文字分類原始碼解讀 | 附原始碼
2021-06-03
文字分類原始碼
Bert文字分類實踐（二）：魔改Bert，融合TextCNN的新思路
2021-10-11
文字分類CNN
bert_dnn的程式碼
2024-06-21
DNN
使用Bert預訓練模型文字分類（內附原始碼）
2019-03-13
模型文字分類原始碼
Bert文字分類實踐（一）：實現一個簡單的分類模型
2021-10-10
文字分類模型
中文新聞情感分類 Bert-Pytorch-transformers
2019-12-24
PyTorchORM
程式設計師垃圾程式碼分類指南
2019-07-25
程式設計師
BERT-Pytorch版本程式碼pipline梳理
2022-01-16
PyTorch
我的BERT！改改字典，讓BERT安全提速不掉分（已開源）
2020-09-25
分類任務中效能度量及程式碼
2021-12-05
BERT預訓練模型的演進過程！(附程式碼)
2019-09-28
模型
pytorch深度學習分類程式碼簡單示例
2024-08-07
PyTorch深度學習
基於深度學習的時間序列分類[含程式碼]
2019-03-12
深度學習
CSP-CCF 202006-1 線性分類器滿分程式碼
2020-11-23
Collections工具類,可以使用collections工具類對程式碼中的list進行分組
2024-03-23
Runtime原始碼 Category(分類)
2018-11-01
原始碼Go
密碼體制分類
2020-07-12
密碼
【機器學習】--xgboost初始之程式碼實現分類
2018-06-18
機器學習
如何用50行程式碼構建情感分類器
2018-06-20
行程
利用機器學習進行惡意程式碼分類
2020-08-19
機器學習
深度學習 | 分類任務中類別不均衡解決策略（附程式碼）
2019-01-14
深度學習
指南：不平衡分類的成本敏感決策樹（附程式碼&連結）
2020-03-04
簡單程式碼:將回歸特徵轉換為分類特徵
2021-09-09
特徵
【BERT】詳解BERT
2024-06-15
Java中，類與類，類中的程式碼執行順序
2018-12-10
Java
Bert文字分類實踐（三）：處理樣本不均衡和提升模型魯棒性trick
2021-10-16
文字分類模型
直播賣貨小程式原始碼中，商品分類頁面是如何實現的
2020-07-22
原始碼
程式碼分層的設計之道
2019-03-04
搞定NLP領域的“變形金剛”！手把手教你用BERT進行多標籤文字分類
2019-02-19
文字分類
noise的分類
2024-06-18
js程式碼與html程式碼分離示例
2018-06-17
JSHTML
顧客類的派生（C#程式碼）
2020-11-26
C#
機器學習(三)：理解邏輯迴歸及二分類、多分類程式碼實踐
2021-02-01
機器學習邏輯迴歸
京東獲得jd商品分類API介面（父分類、根分類、子分類）
2023-04-20
API
程式設計師垃圾分類圖鑑
2019-07-15
程式設計師
什麼是程式?Linux中程式分為哪幾類?
2022-07-13
Linux
程式碼分層設計
2019-03-09
Pytorch實戰-logistic 迴歸二元分類程式碼詳細註釋
2019-12-27
PyTorch

bert分類的程式碼

相關文章