Complete Code for the Second Experiment

Posted by 漫卷 on 2024-05-27

Annotated SGNS implementation

# Defined in Section 5.2.3.3
# Skip-gram with negative sampling (SGNS)


import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights


class SGNSDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2, n_negatives=5, ns_dist=None):
        # corpus: a list of sentences, each a list of token indices
        # vocab: the vocabulary, mapping tokens to indices
        # context_size=2: size of the context window on each side (default 2)
        # n_negatives=5: number of negative samples per positive sample (default 5)
        # ns_dist=None: negative sampling distribution; None means uniform sampling
        self.data = []  # each element is a tuple: (center word index, list of context word indices)
        self.bos = vocab[BOS_TOKEN]  # vocabulary index of the bos token
        self.eos = vocab[EOS_TOKEN]  # vocabulary index of the eos token
        self.pad = vocab[PAD_TOKEN]  # vocabulary index of the pad token
        for sentence in tqdm(corpus, desc="Dataset Construction"):  # iterate over sentences
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence)-1):  # skip bos/eos as center words
                # Model input: (w, context); output is 0/1, indicating whether context is a negative sample
                w = sentence[i]
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1]
                # Pad contexts shorter than 2 * context_size with the pad token
                context += [self.pad] * (2 * context_size - len(context))
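                # Worked example: with context_size=2 and sentence=[bos, 5, 6, eos],
                # at i=1 (w=5) the slices give context=[bos, 6, eos], which is then
                # padded to [bos, 6, eos, pad] so every context has length 4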
                self.data.append((w, context))

        # Number of negative samples per positive sample
        self.n_negatives = n_negatives
        # Negative sampling distribution: if ns_dist is None, fall back to a uniform distribution
        self.ns_dist = ns_dist if ns_dist is not None else torch.ones(len(vocab))

    def __len__(self):
        return len(self.data)  # size of the dataset

    def __getitem__(self, i):  # i: an integer index
        return self.data[i]  # the i-th example: (center word index, context index list)

    def collate_fn(self, examples):  # examples: a batch of (center word, context list) pairs
        words = torch.tensor([ex[0] for ex in examples], dtype=torch.long)  # center word indices
        contexts = torch.tensor([ex[1] for ex in examples], dtype=torch.long)  # context word indices
        batch_size, context_size = contexts.shape  # here context_size equals 2 * the window size
        neg_contexts = []  # negative-sample indices
        # Draw negative samples separately for each example in the batch
        for i in range(batch_size):
            # Ensure the negative samples exclude the current example's context words:
            # index_fill(0, contexts[i], .0) sets the sampling weights of the true
            # context words to 0 (dim 0, indices contexts[i], fill value 0.0)
            ns_dist = self.ns_dist.index_fill(0, contexts[i], .0)
            # Sample n_negatives * context_size indices according to the weights in
            # ns_dist (all-ones weights mean uniform sampling); replacement=True allows repeats
            neg_contexts.append(torch.multinomial(ns_dist, self.n_negatives * context_size, replacement=True))
        neg_contexts = torch.stack(neg_contexts, dim=0)
        return words, contexts, neg_contexts  # three tensors: center words, contexts, negative contexts
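
# Shape summary for a batch of B examples produced by collate_fn:
#   words:        (B,)
#   contexts:     (B, 2 * context_size)
#   neg_contexts: (B, n_negatives * 2 * context_size)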


class SGNSModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):  # vocab_size: vocabulary size; embedding_dim: embedding dimension
        super(SGNSModel, self).__init__()
        # Word (center) embeddings
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Context embeddings
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)

    def forward_w(self, words):
        w_embeds = self.w_embeddings(words)
        return w_embeds

    def forward_c(self, contexts):
        c_embeds = self.c_embeddings(contexts)
        return c_embeds


def get_unigram_distribution(corpus, vocab_size):  # corpus: list of sentences; vocab_size: vocabulary size
    # Compute the unigram distribution over the given corpus
    token_counts = torch.tensor([0] * vocab_size)
    total_count = 0
    for sentence in corpus:
        total_count += len(sentence)
        for token in sentence:
            token_counts[token] += 1
    unigram_dist = torch.div(token_counts.float(), total_count)
    return unigram_dist  # a tensor holding the unigram probability of each token
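
# Toy example (hypothetical 4-token vocabulary):
#   get_unigram_distribution([[0, 1, 1], [2, 1]], 4)
#   counts = [1, 3, 1, 0], total = 5  ->  tensor([0.2000, 0.6000, 0.2000, 0.0000])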

embedding_dim = 64
context_size = 2
hidden_dim = 128  # not used in this script
batch_size = 1024
num_epoch = 10
n_negatives = 10

# Load the text corpus
corpus, vocab = load_reuters()
# Compute the unigram distribution
unigram_dist = get_unigram_distribution(corpus, len(vocab))
# Derive the negative sampling distribution from the unigram distribution: p(w)**0.75
negative_sampling_dist = unigram_dist ** 0.75
negative_sampling_dist /= negative_sampling_dist.sum()  # renormalize
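# The 0.75 exponent flattens the distribution: frequent words are down-weighted
# and rare words up-weighted relative to raw frequency. For a hypothetical toy
# distribution p = [0.9, 0.09, 0.01], the result is roughly [0.83, 0.15, 0.03]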
# Build the SGNS training dataset
dataset = SGNSDataset(
    corpus,
    vocab,
    context_size=context_size,
    n_negatives=n_negatives,
    ns_dist=negative_sampling_dist
)
data_loader = get_loader(dataset, batch_size)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SGNSModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)

model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, neg_contexts = [x.to(device) for x in batch]
        optimizer.zero_grad()
        batch_size = words.shape[0]
        # Look up the embeddings of the batch's words, contexts, and negative samples
        word_embeds = model.forward_w(words).unsqueeze(dim=2)
        context_embeds = model.forward_c(contexts)
        neg_context_embeds = model.forward_c(neg_contexts)
        # Classification log-likelihood of the positive samples: log sigmoid(v_c . v_w)
        # word_embeds.shape == (batch_size, embedding_dim, 1)
        # context_embeds.shape == (batch_size, 2 * context_size, embedding_dim)
        context_loss = F.logsigmoid(torch.bmm(context_embeds, word_embeds).squeeze(dim=2))
        context_loss = context_loss.mean(dim=1)
        # Classification log-likelihood of the negative samples: log sigmoid(-v_neg . v_w)
        neg_context_loss = F.logsigmoid(torch.bmm(neg_context_embeds, word_embeds).squeeze(dim=2).neg())
        neg_context_loss = neg_context_loss.view(batch_size, -1, n_negatives).sum(dim=2)
        neg_context_loss = neg_context_loss.mean(dim=1)
        # Loss: the negative log-likelihood of the SGNS objective
        loss = -(context_loss + neg_context_loss).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")

# Sum the word and context embedding matrices to obtain the final pretrained vectors
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "sgns.vec")
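
Before clustering, it is worth a quick sanity check on the trained vectors. A minimal sketch, assuming save_pretrained writes the standard word2vec text format (the same assumption the loading code below makes) and that the probe words occur in the Reuters vocabulary:

from gensim.models.keyedvectors import KeyedVectors

wv = KeyedVectors.load_word2vec_format('sgns.vec', binary=False)
# Print the 5 nearest neighbors of a few probe words; skip any probe
# that is not in the vocabulary
for probe in ['bank', 'oil', 'japan']:
    if probe in wv.key_to_index:
        print(probe, '->', wv.most_similar(probe, topn=5))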

K-means clustering and visualization

# Import the required libraries
from gensim.models.keyedvectors import KeyedVectors
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np


# Load the word vectors
word_vectors = KeyedVectors.load_word2vec_format('sgns.vec', binary=False)

# Set the number of clusters K (5 as an example)
K = 5

# Collect the word vectors to be clustered
vectors = word_vectors.vectors

# Run K-means clustering
kmeans = KMeans(n_clusters=K, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(vectors)

# Get each word's cluster label
labels = kmeans.labels_
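
# Optional sanity check (a sketch): print a few member words of each cluster.
# This relies on word_vectors.index_to_key being aligned with the rows of
# vectors, which holds for gensim 4.x KeyedVectors.
for cluster_id in range(K):
    members = [word_vectors.index_to_key[j] for j in np.where(labels == cluster_id)[0][:5]]
    print(f'Cluster {cluster_id}: {members}')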

# Visualize the clustering results
# First reduce the vectors to 2D with PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(vectors)

# Plot the clusters
plt.figure(figsize=(10, 8))
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
markers = ['o', '^', 's', 'p', '*', 'h', 'x']

for i, color in zip(range(K), colors):
    # Select the points belonging to the current cluster
    class_member_mask = (labels == i)
    xy = reduced_vectors[class_member_mask]
    plt.scatter(xy[:, 0], xy[:, 1], 
                c=color, 
                marker=markers[i % len(markers)],
                alpha=0.5,
                label=f'Cluster {i}')

plt.title('K-means Clustering of Word Vectors')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(scatterpoints=1)
plt.grid(True)
plt.show()

print("Clustering and visualization complete.")

Dimensionality reduction with t-SNE

import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

# Load the word2vec model (sgns.vec is in the word2vec text format)
# Note: replace the path with the actual location of your sgns.vec file
model_path = 'sgns.vec'
# Note: load_word2vec_format expects the first line of the file to hold the
# vocabulary size and vector dimension (the standard word2vec text header)
model = KeyedVectors.load_word2vec_format(model_path, binary=False)

# Get the word vectors
words = model.index_to_key[:1000]  # only the first 1000 words as an example; adjust as needed
vectors = np.array([model[word] for word in words])

# Reduce to 2D with t-SNE
tsne = TSNE(n_components=2, random_state=42)
vectors_2d = tsne.fit_transform(vectors)

# Visualize the reduced vectors
plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = vectors_2d[i]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')

plt.title('t-SNE Visualization of Word Embeddings')
plt.show()
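
Annotating all 1000 points makes the figure hard to read. A small variation (a sketch reusing the words and vectors_2d computed above) that labels only every 20th word keeps the plot legible:

plt.figure(figsize=(10, 8))
plt.scatter(vectors_2d[:, 0], vectors_2d[:, 1], s=5, alpha=0.5)
# Label one word in twenty to avoid overlapping annotations
for i in range(0, len(words), 20):
    plt.annotate(words[i], xy=(vectors_2d[i, 0], vectors_2d[i, 1]))
plt.title('t-SNE Visualization (every 20th word labeled)')
plt.show()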

Visualizing the embeddings of 40 selected words

import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE

# Load the word2vec model
model_path = 'sgns.vec'
model = KeyedVectors.load_word2vec_format(model_path, binary=False)


plants = ['rose', 'oak', 'maple', 'bamboo', 'orchid', 'cactus', 'palm', 'iris', 'daisy', 'lotus']
titles = ['president', 'doctor', 'engineer', 'artist', 'teacher', 'lawyer', 'architect', 'nurse', 'writer', 'scientist']
honorifics = ['mr', 'mrs', 'ms', 'dr', 'professor', 'sir', 'madam', 'lord', 'lady', 'captain']
countries = ['usa', 'china', 'france', 'brazil', 'japan', 'germany', 'india', 'australia', 'canada', 'russia']

# Merge all the word lists
selected_words = plants + titles + honorifics + countries

# Keep only the words that actually appear in the model's vocabulary
valid_words = [word for word in selected_words if word in model.key_to_index]
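
# Optional check: report any words missing from the vocabulary, so the plot
# is not silently smaller than the four 10-word lists suggest
missing = [word for word in selected_words if word not in model.key_to_index]
if missing:
    print(f'Not in vocabulary: {missing}')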

# Get the vectors of the selected words
vectors = np.array([model[word] for word in valid_words])

# Reduce to 2D with t-SNE; perplexity must be at most len(valid_words) - 1
tsne = TSNE(n_components=2, perplexity=min(30, len(valid_words) - 1), random_state=42)
vectors_2d = tsne.fit_transform(vectors)

# Visualize
plt.figure(figsize=(12, 8))

# Assign a color and marker to each word according to its category
word_to_color = {word: 'r' if word in plants else 'g' if word in titles else 'b' if word in honorifics else 'y' for word in valid_words}
word_to_marker = {word: 'o' if word in plants else '^' if word in titles else 's' if word in honorifics else '*' for word in valid_words}

for i, word in enumerate(valid_words):
    x, y = vectors_2d[i]
    plt.scatter(x, y, c=word_to_color[word], marker=word_to_marker[word], alpha=0.6)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')

plt.title('t-SNE Visualization of Selected Word Embeddings')
plt.legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Plants', markerfacecolor='r', markersize=10),
    plt.Line2D([0], [0], marker='^', color='w', label='Titles', markerfacecolor='g', markersize=10),
    plt.Line2D([0], [0], marker='s', color='w', label='Honorifics', markerfacecolor='b', markersize=10),
    plt.Line2D([0], [0], marker='*', color='w', label='Countries', markerfacecolor='y', markersize=10)
])
plt.show()
