Annotated SGNS (skip-gram with negative sampling) implementation
# Defined in Section 5.2.3.3
# Skip-gram model trained with negative sampling
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights
class SGNSDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2, n_negatives=5, ns_dist=None):
        # corpus: a list of sentences, each a list of token indices.
        # vocab: the vocabulary, mapping tokens to indices.
        # context_size: context window size on each side of the target word (default 2).
        # n_negatives: number of negative samples per positive sample (default 5).
        # ns_dist: negative sampling distribution; None means a uniform distribution is used.
        self.data = []  # each element is a (target word index, list of context word indices) pair
        self.bos = vocab[BOS_TOKEN]  # index of the beginning-of-sentence token
        self.eos = vocab[EOS_TOKEN]  # index of the end-of-sentence token
        self.pad = vocab[PAD_TOKEN]  # index of the padding token
        for sentence in tqdm(corpus, desc="Dataset Construction"):  # iterate over sentences
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence)-1):
                # Model input: (w, context); the output is 0/1, indicating whether the
                # context word is a negative sample
                w = sentence[i]
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1]
                # Pad contexts shorter than 2 * context_size with the padding token
                context += [self.pad] * (2 * context_size - len(context))
                self.data.append((w, context))
        # Number of negative samples per positive sample
        self.n_negatives = n_negatives
        # Negative sampling distribution; if ns_dist is None, fall back to a uniform distribution
        self.ns_dist = ns_dist if ns_dist is not None else torch.ones(len(vocab))
    def __len__(self):
        return len(self.data)  # number of samples in the dataset
    def __getitem__(self, i):
        # i: an integer index; returns the i-th sample,
        # i.e. a (target word index, list of context word indices) pair
        return self.data[i]
    def collate_fn(self, examples):
        # examples: a list of samples in the batch,
        # each a (target word index, list of context word indices) pair
        words = torch.tensor([ex[0] for ex in examples], dtype=torch.long)  # target word indices
        contexts = torch.tensor([ex[1] for ex in examples], dtype=torch.long)  # context word indices
        batch_size, context_size = contexts.shape  # number of samples, context width (2 * window size)
        neg_contexts = []  # negative-sample word indices
        # Perform negative sampling separately for each sample in the batch
        for i in range(batch_size):
            # Make sure the negative samples exclude the current sample's true context words:
            # index_fill(0, contexts[i], .0) sets the sampling weight of those words to 0
            ns_dist = self.ns_dist.index_fill(0, contexts[i], .0)
            # Draw n_negatives * context_size indices according to the weights in ns_dist
            # (all-ones weights correspond to uniform sampling)
            neg_contexts.append(torch.multinomial(ns_dist, self.n_negatives * context_size, replacement=True))
        neg_contexts = torch.stack(neg_contexts, dim=0)
        # Returns three tensors: target words, context words, and negative-sample context words
        return words, contexts, neg_contexts
class SGNSModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        # vocab_size: vocabulary size; embedding_dim: word embedding dimension
        super(SGNSModel, self).__init__()
        # Target word embeddings
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Context word embeddings
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)
    def forward_w(self, words):
        w_embeds = self.w_embeddings(words)
        return w_embeds
    def forward_c(self, contexts):
        c_embeds = self.c_embeddings(contexts)
        return c_embeds
def get_unigram_distribution(corpus, vocab_size):
    # corpus: a list of sentences (token index lists); vocab_size: vocabulary size
    # Estimate the unigram probability distribution from the given corpus
    token_counts = torch.tensor([0] * vocab_size)
    total_count = 0
    for sentence in corpus:
        total_count += len(sentence)
        for token in sentence:
            token_counts[token] += 1
    unigram_dist = torch.div(token_counts.float(), total_count)
    # Returns a tensor with the unigram probability of every vocabulary entry
    return unigram_dist
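# Optional sanity check (a toy example added for illustration, not part of the original script):
# two short "sentences" over a 4-word vocabulary should yield probabilities that sum to 1.
toy_corpus = [[0, 1, 2], [1, 3]]
print(get_unigram_distribution(toy_corpus, 4))  # tensor([0.2000, 0.4000, 0.2000, 0.2000])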
embedding_dim = 64
context_size = 2
hidden_dim = 128  # not used by the SGNS model in this script
batch_size = 1024
num_epoch = 10
n_negatives = 10
# Load the text data
corpus, vocab = load_reuters()
# Compute the unigram probability distribution
unigram_dist = get_unigram_distribution(corpus, len(vocab))
# Derive the negative sampling distribution from the unigram distribution: p(w) ** 0.75
negative_sampling_dist = unigram_dist ** 0.75
negative_sampling_dist /= negative_sampling_dist.sum()  # renormalize to a probability distribution
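# Optional check (not in the original script): raising the probabilities to the 0.75 power
# flattens the unigram distribution, so frequent words take a smaller share and rare words a
# larger one; after renormalization the weights should still sum to (approximately) 1.
print(unigram_dist.max().item(), negative_sampling_dist.max().item())
print(negative_sampling_dist.sum().item())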
# Build the SGNS training dataset
dataset = SGNSDataset(
    corpus,
    vocab,
    context_size=context_size,
    n_negatives=n_negatives,
    ns_dist=negative_sampling_dist
)
data_loader = get_loader(dataset, batch_size)
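# Optional shape check (not in the original script; it assumes get_loader wires up
# dataset.collate_fn): each batch should contain target words of shape (batch_size,),
# contexts of shape (batch_size, 2 * context_size), and negative contexts of shape
# (batch_size, n_negatives * 2 * context_size).
words_b, contexts_b, neg_contexts_b = next(iter(data_loader))
print(words_b.shape, contexts_b.shape, neg_contexts_b.shape)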
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SGNSModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, neg_contexts = [x.to(device) for x in batch]
        optimizer.zero_grad()
        batch_size = words.shape[0]
        # Look up the vector representations of the target words, context words,
        # and negative samples in the batch
        word_embeds = model.forward_w(words).unsqueeze(dim=2)
        context_embeds = model.forward_c(contexts)
        neg_context_embeds = model.forward_c(neg_contexts)
        # Log-likelihood of classifying the positive samples correctly
        # word_embeds.shape = (batch_size, embedding_dim, 1)
        # context_embeds.shape = (batch_size, context_size, embedding_dim)
        context_loss = F.logsigmoid(torch.bmm(context_embeds, word_embeds).squeeze(dim=2))
        context_loss = context_loss.mean(dim=1)
        # Log-likelihood of classifying the negative samples correctly
        neg_context_loss = F.logsigmoid(torch.bmm(neg_context_embeds, word_embeds).squeeze(dim=2).neg())
        neg_context_loss = neg_context_loss.view(batch_size, -1, n_negatives).sum(dim=2)
        neg_context_loss = neg_context_loss.mean(dim=1)
        # Loss: negative log-likelihood
        loss = -(context_loss + neg_context_loss).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")
# Sum the word embedding matrix and the context embedding matrix as the final pretrained word vectors
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "sgns.vec")
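Before moving on to clustering and visualization, it can help to spot-check the saved vectors. The lines below are a minimal sketch, assuming save_pretrained wrote sgns.vec in the standard word2vec text format (a "count dimension" header followed by one word per line), which is the same assumption the later sections make; the query word is arbitrary and only illustrative.
from gensim.models import KeyedVectors
wv = KeyedVectors.load_word2vec_format('sgns.vec', binary=False)
query = 'bank'  # an arbitrary example word; pick any word known to be in the Reuters vocabulary
if query in wv.key_to_index:
    print(wv.most_similar(query, topn=5))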
K-means clustering and visualization
# Import the required libraries
from gensim.models.keyedvectors import KeyedVectors
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
# Load the pretrained word vectors
word_vectors = KeyedVectors.load_word2vec_format('sgns.vec', binary=False)
# Number of clusters K; 5 is used here as an example
K = 5
# Collect the word vectors to be clustered
vectors = word_vectors.vectors
# Run K-means clustering
kmeans = KMeans(n_clusters=K, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(vectors)
# Cluster label assigned to each word
labels = kmeans.labels_
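# Optional inspection (not part of the original code): print a few member words of each
# cluster to get a qualitative sense of what K-means grouped together. This relies on
# kmeans.labels_ being aligned with word_vectors.index_to_key, which holds because the
# clustering was run directly on word_vectors.vectors.
for cluster_id in range(K):
    members = [w for w, l in zip(word_vectors.index_to_key, labels) if l == cluster_id]
    print(f"Cluster {cluster_id}: {members[:10]}")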
# Visualize the clustering result
# First reduce the vectors to 2D with PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(vectors)
# Plot the clusters
plt.figure(figsize=(10, 8))
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
markers = ['o', '^', 's', 'p', '*', 'h', 'x']
for i, color in zip(range(K), colors):
    # Select the points belonging to the current cluster
    class_member_mask = (labels == i)
    xy = reduced_vectors[class_member_mask]
    plt.scatter(xy[:, 0], xy[:, 1],
                c=color,
                marker=markers[i % len(markers)],
                alpha=0.5,
                label=f'Cluster {i}')
plt.title('K-means Clustering of Word Vectors')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(scatterpoints=1)
plt.grid(True)
plt.show()
print("Finished clustering and visualization.")
Dimensionality reduction with t-SNE
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
# Load the word2vec model (the vec file is assumed to be in the standard word2vec text format)
# Note: replace the path with the actual location of your sgns.vec file
model_path = 'sgns.vec'
# Note: the first line of the vec file holds the word count and dimension;
# load_word2vec_format reads this header automatically
model = KeyedVectors.load_word2vec_format(model_path, binary=False)
# Collect the word vectors
words = model.index_to_key[:1000]  # only the first 1000 words as an example; adjust as needed
vectors = np.array([model[word] for word in words])
# Reduce to 2D with t-SNE (2D is used here as an example)
tsne = TSNE(n_components=2, random_state=42)
vectors_2d = tsne.fit_transform(vectors)
# Visualize the reduced vectors
plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = vectors_2d[i]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')
plt.title('t-SNE Visualization of Word Embeddings')
plt.show()
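If the layout above is hard to read, a common variant is to initialize t-SNE from PCA, which tends to preserve more of the global structure. The two lines below are an optional sketch, not part of the original walkthrough, and assume a reasonably recent scikit-learn in which init='pca' and learning_rate='auto' are supported.
tsne_pca = TSNE(n_components=2, init='pca', learning_rate='auto', random_state=42)
vectors_2d_pca = tsne_pca.fit_transform(vectors)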
Visualizing word vectors for 40 selected words (plants, titles, honorifics, countries)
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
# Load the word2vec model
model_path = 'sgns.vec'
model = KeyedVectors.load_word2vec_format(model_path, binary=False)
plants = ['rose', 'oak', 'maple', 'bamboo', 'orchid', 'cactus', 'palm', 'iris', 'daisy', 'lotus']
titles = ['president', 'doctor', 'engineer', 'artist', 'teacher', 'lawyer', 'architect', 'nurse', 'writer', 'scientist']
honorifics = ['mr', 'mrs', 'ms', 'dr', 'professor', 'sir', 'madam', 'lord', 'lady', 'captain']
countries = ['usa', 'china', 'france', 'brazil', 'japan', 'germany', 'india', 'australia', 'canada', 'russia']
# Merge all word lists
selected_words = plants + titles + honorifics + countries
# Keep only the words that actually appear in the model's vocabulary
valid_words = [word for word in selected_words if word in model.index_to_key]
# Look up the vectors of the selected words
vectors = np.array([model[word] for word in valid_words])
# Reduce to 2D with t-SNE; perplexity must be smaller than the number of samples
tsne = TSNE(n_components=2, perplexity=min(30, len(valid_words) - 1), random_state=42)
vectors_2d = tsne.fit_transform(vectors)
# Visualization
plt.figure(figsize=(12, 8))
# Assign a color and marker to each word according to its category
word_to_color = {word: 'r' if word in plants else 'g' if word in titles else 'b' if word in honorifics else 'y' for word in valid_words}
word_to_marker = {word: 'o' if word in plants else '^' if word in titles else 's' if word in honorifics else '*' for word in valid_words}
for i, word in enumerate(valid_words):
    x, y = vectors_2d[i]
    plt.scatter(x, y, c=word_to_color[word], marker=word_to_marker[word], alpha=0.6)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')
plt.title('t-SNE Visualization of Selected Word Embeddings')
plt.legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Plants', markerfacecolor='r', markersize=10),
    plt.Line2D([0], [0], marker='^', color='w', label='Titles', markerfacecolor='g', markersize=10),
    plt.Line2D([0], [0], marker='s', color='w', label='Honorifics', markerfacecolor='b', markersize=10),
    plt.Line2D([0], [0], marker='*', color='w', label='Countries', markerfacecolor='y', markersize=10)
])
plt.show()
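As a numeric complement to the scatter plot, cosine similarities within a category should generally be higher than across categories if the embeddings capture the groupings above. The word pairs below are just examples taken from the lists; the membership guard keeps the sketch safe when a word is missing from the Reuters vocabulary.
pairs = [('doctor', 'nurse'), ('doctor', 'bamboo'), ('china', 'japan')]
for w1, w2 in pairs:
    if w1 in model.key_to_index and w2 in model.key_to_index:
        print(w1, w2, model.similarity(w1, w2))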