Foreword
Text classification is one of the most important tasks in NLP, and it also makes an excellent first end-to-end project for anyone getting started in the field. Although text classification looks simple, there are plenty of subtleties to it. My own ability is limited, so here I simply record and share the methods and tricks I use in practice. I hope everyone who reads this takes something away and enjoys the fun of programming.
Part 1
Model
BERT is the language representation model Google released in October 2018. As soon as it came out, it swept the state of the art on 11 NLP tasks and was, for a while, unrivaled. For the details of the Transformer inside BERT, I recommend reading this article; I won't repeat them here.
A common approach to BERT text classification is to take the first token position ([CLS]) of BERT's last-layer output as the sentence representation, followed by a fully connected layer for classification. The model is simple, so let's go straight to the code!
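As a side note, there are two common ways to pull that sentence vector out of a HuggingFace BERT output: the raw [CLS] hidden state, last_hidden_state[:, 0], and pooler_output, which is the [CLS] state passed through BERT's extra dense + tanh pooler layer. The implementation below uses pooler_output. The following is a minimal sketch of my own (not from the original post; it assumes the transformers library is installed and the bert-base-chinese checkpoint can be downloaded):

import torch
from transformers import AutoModel, AutoTokenizer

# Illustrative sketch: two ways to obtain a sentence vector from BERT
tok = AutoTokenizer.from_pretrained("bert-base-chinese")
bert = AutoModel.from_pretrained("bert-base-chinese")
enc = tok("我喜歡打籃球", return_tensors="pt")
with torch.no_grad():
    out = bert(**enc)
cls_raw = out.last_hidden_state[:, 0]  # raw [CLS] hidden state, shape [1, 768]
cls_pooled = out.pooler_output         # [CLS] state after BERT's dense + tanh pooler, shape [1, 768]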
Part 2
PyTorch implementation
# -*- coding:utf-8 -*-
# BERT text classification baseline model
# model: bert
# date: 2021.10.10 10:01
import os
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.utils.data as Data
import torch.optim as optim
import transformers
from transformers import AutoModel, AutoTokenizer
import matplotlib.pyplot as plt
train_curve = []
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Define some hyperparameters; the model is the basic Chinese BERT (bert-base-chinese)
batch_size = 2
epoches = 100
model = "bert-base-chinese"
hidden_size = 768
n_class = 2
maxlen = 8
# data: construct a few training samples
sentences = ["我喜歡打籃球", "這個相機很好看", "今天玩的特別開心", "我不喜歡你", "太糟糕了", "真是件令人傷心的事情"]
labels = [1, 1, 1, 0, 0, 0]  # 1 = positive, 0 = negative
# word_list = ' '.join(sentences).split()
# word_list = list(set(word_list))
# word_dict = {w: i for i, w in enumerate(word_list)}
# num_dict = {i: w for w, i in word_dict.items()}
# vocab_size = len(word_list)
# Convert the data into BERT's input format
# input_ids: vocabulary indices of the tokens
# attention_mask: same length as input_ids; 1 at real-token positions, 0 at padding positions
# token_type_ids: 0 for tokens of the first sentence, 1 for tokens of the second sentence
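# (Added note, not in the original code) For example, with maxlen = 8 the six-character
# sentence "我喜歡打籃球" becomes [CLS] + 6 character tokens + [SEP], exactly 8 tokens,
# so its attention_mask is all 1s; a shorter sentence would be padded with [PAD] tokens
# and get 0s in attention_mask at the padded positions. Since only a single sentence is
# passed in, token_type_ids is all 0s here.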
class MyDataset(Data.Dataset):
    def __init__(self, sentences, labels=None, with_labels=True):
        self.tokenizer = AutoTokenizer.from_pretrained(model)
        self.with_labels = with_labels
        self.sentences = sentences
        self.labels = labels

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        # Select the sentence at the given index
        sent = self.sentences[index]
        # Tokenize the sentence to get token ids, attention mask and token type ids
        encoded_pair = self.tokenizer(sent,
                                      padding='max_length',  # Pad to max_length
                                      truncation=True,       # Truncate to max_length
                                      max_length=maxlen,
                                      return_tensors='pt')   # Return torch.Tensor objects
        token_ids = encoded_pair['input_ids'].squeeze(0)            # tensor of token ids
        attn_masks = encoded_pair['attention_mask'].squeeze(0)      # binary tensor: 1 for real tokens, 0 for padding
        token_type_ids = encoded_pair['token_type_ids'].squeeze(0)  # binary tensor: 0 for 1st-sentence tokens, 1 for 2nd-sentence tokens

        if self.with_labels:  # True if the dataset has labels
            label = self.labels[index]
            return token_ids, attn_masks, token_type_ids, label
        else:
            return token_ids, attn_masks, token_type_ids
train = Data.DataLoader(dataset=MyDataset(sentences, labels), batch_size=batch_size, shuffle=True, num_workers=1)
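# (Added note, not in the original code) With batch_size = 2 and maxlen = 8, each batch
# yielded by this DataLoader is a list of four tensors:
#   token_ids      -> shape [2, 8]
#   attn_masks     -> shape [2, 8]
#   token_type_ids -> shape [2, 8]
#   label          -> shape [2]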
# model
class BertClassify(nn.Module):
    def __init__(self):
        super(BertClassify, self).__init__()
        self.bert = AutoModel.from_pretrained(model, output_hidden_states=True, return_dict=True)
        self.linear = nn.Linear(hidden_size, n_class)  # feed the [CLS] vector straight into a fully connected layer for classification
        self.dropout = nn.Dropout(0.5)

    def forward(self, X):
        input_ids, attention_mask, token_type_ids = X[0], X[1], X[2]
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)  # returns a model output object (dict-like)
        # classify with the last-layer [CLS] vector
        # outputs.pooler_output: [bs, hidden_size]
        logits = self.linear(self.dropout(outputs.pooler_output))
        return logits
bc = BertClassify().to(device)
optimizer = optim.Adam(bc.parameters(), lr=1e-3, weight_decay=1e-2)
loss_fn = nn.CrossEntropyLoss()
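# (Added note, not in the original code) nn.CrossEntropyLoss applies log-softmax internally,
# so BertClassify returns raw logits of shape [batch_size, n_class] and the targets are the
# integer class labels of shape [batch_size] coming from the DataLoader.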
# train
sum_loss = 0
total_step = len(train)
for epoch in range(epoches):
    for i, batch in enumerate(train):
        optimizer.zero_grad()
        batch = tuple(p.to(device) for p in batch)
        pred = bc([batch[0], batch[1], batch[2]])
        loss = loss_fn(pred, batch[3])
        sum_loss += loss.item()
        loss.backward()
        optimizer.step()
        if epoch % 10 == 0:
            print('[{}|{}] step:{}/{} loss:{:.4f}'.format(epoch + 1, epoches, i + 1, total_step, loss.item()))
    train_curve.append(sum_loss)
    sum_loss = 0
# test
bc.eval()
with torch.no_grad():
    test_text = ['我不喜歡打籃球']
    test = MyDataset(test_text, labels=None, with_labels=False)
    x = test.__getitem__(0)
    x = tuple(p.unsqueeze(0).to(device) for p in x)
    pred = bc([x[0], x[1], x[2]])
    pred = pred.data.max(dim=1, keepdim=True)[1]  # index of the highest logit = predicted class
    if pred[0][0] == 0:
        print('negative')
    else:
        print('positive')

pd.DataFrame(train_curve).plot()  # loss curve
plt.show()
Result for the single test sample:
Code links:
Jupyter version: https://github.com/PouringRain/blog_code/blob/main/nlp/bert_classify.ipynb
py version: https://github.com/PouringRain/blog_code/blob/main/nlp/bert_classify.py
If you like it, please give this newbie's GitHub repo a little star... ^_^