Annotated SGNS (skip-gram with negative sampling) implementation
# Defined in Section 5.2.3.3
# Skip-gram model trained with negative sampling
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torch.nn.utils.rnn import pad_sequence
from tqdm.auto import tqdm
from utils import BOS_TOKEN, EOS_TOKEN, PAD_TOKEN
from utils import load_reuters, save_pretrained, get_loader, init_weights
class SGNSDataset(Dataset):
    def __init__(self, corpus, vocab, context_size=2, n_negatives=5, ns_dist=None):
        # corpus: a list of sentences, each a list of token indices.
        # vocab: the vocabulary, mapping tokens to indices.
        # context_size: context window size on each side of the target word (default 2).
        # n_negatives: number of negative samples per positive sample (default 5).
        # ns_dist: negative sampling distribution; None means a uniform distribution is used.
        self.data = []  # each element is a (target word index, list of context word indices) pair
        self.bos = vocab[BOS_TOKEN]  # index of the beginning-of-sentence token
        self.eos = vocab[EOS_TOKEN]  # index of the end-of-sentence token
        self.pad = vocab[PAD_TOKEN]  # index of the padding token
        for sentence in tqdm(corpus, desc="Dataset Construction"):  # iterate over sentences
            sentence = [self.bos] + sentence + [self.eos]
            for i in range(1, len(sentence)-1):
                # Model input: (w, context); the output is 0/1, indicating whether the
                # context word is a negative sample
                w = sentence[i]
                left_context_index = max(0, i - context_size)
                right_context_index = min(len(sentence), i + context_size)
                context = sentence[left_context_index:i] + sentence[i+1:right_context_index+1]
                # Pad contexts shorter than 2 * context_size with the padding token
                context += [self.pad] * (2 * context_size - len(context))
                self.data.append((w, context))
        # Number of negative samples per positive sample
        self.n_negatives = n_negatives
        # Negative sampling distribution; if ns_dist is None, fall back to a uniform distribution
        self.ns_dist = ns_dist if ns_dist is not None else torch.ones(len(vocab))
    def __len__(self):
        return len(self.data)  # number of samples in the dataset
    def __getitem__(self, i):
        # i: an integer index; returns the i-th sample,
        # i.e. a (target word index, list of context word indices) pair
        return self.data[i]
    def collate_fn(self, examples):
        # examples: a list of samples in the batch,
        # each a (target word index, list of context word indices) pair
        words = torch.tensor([ex[0] for ex in examples], dtype=torch.long)  # target word indices
        contexts = torch.tensor([ex[1] for ex in examples], dtype=torch.long)  # context word indices
        batch_size, context_size = contexts.shape  # number of samples, context width (2 * window size)
        neg_contexts = []  # negative-sample word indices
        # Perform negative sampling separately for each sample in the batch
        for i in range(batch_size):
            # Make sure the negative samples exclude the current sample's true context words:
            # index_fill(0, contexts[i], .0) sets the sampling weight of those words to 0
            ns_dist = self.ns_dist.index_fill(0, contexts[i], .0)
            # Draw n_negatives * context_size indices according to the weights in ns_dist
            # (all-ones weights correspond to uniform sampling)
            neg_contexts.append(torch.multinomial(ns_dist, self.n_negatives * context_size, replacement=True))
        neg_contexts = torch.stack(neg_contexts, dim=0)
        # Returns three tensors: target words, context words, and negative-sample context words
        return words, contexts, neg_contexts
class SGNSModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        # vocab_size: vocabulary size; embedding_dim: word embedding dimension
        super(SGNSModel, self).__init__()
        # Target word embeddings
        self.w_embeddings = nn.Embedding(vocab_size, embedding_dim)
        # Context word embeddings
        self.c_embeddings = nn.Embedding(vocab_size, embedding_dim)
    def forward_w(self, words):
        w_embeds = self.w_embeddings(words)
        return w_embeds
    def forward_c(self, contexts):
        c_embeds = self.c_embeddings(contexts)
        return c_embeds
def get_unigram_distribution(corpus, vocab_size):
    # corpus: a list of sentences (token index lists); vocab_size: vocabulary size
    # Estimate the unigram probability distribution from the given corpus
    token_counts = torch.tensor([0] * vocab_size)
    total_count = 0
    for sentence in corpus:
        total_count += len(sentence)
        for token in sentence:
            token_counts[token] += 1
    unigram_dist = torch.div(token_counts.float(), total_count)
    # Returns a tensor with the unigram probability of every vocabulary entry
    return unigram_dist
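# Optional sanity check (a toy example added for illustration, not part of the original script):
# two short "sentences" over a 4-word vocabulary should yield probabilities that sum to 1.
toy_corpus = [[0, 1, 2], [1, 3]]
print(get_unigram_distribution(toy_corpus, 4))  # tensor([0.2000, 0.4000, 0.2000, 0.2000])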
embedding_dim = 64
context_size = 2
hidden_dim = 128  # not used by the SGNS model in this script
batch_size = 1024
num_epoch = 10
n_negatives = 10
# Load the text data
corpus, vocab = load_reuters()
# Compute the unigram probability distribution
unigram_dist = get_unigram_distribution(corpus, len(vocab))
# Derive the negative sampling distribution from the unigram distribution: p(w) ** 0.75
negative_sampling_dist = unigram_dist ** 0.75
negative_sampling_dist /= negative_sampling_dist.sum()  # renormalize to a probability distribution
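# Optional check (not in the original script): raising the probabilities to the 0.75 power
# flattens the unigram distribution, so frequent words take a smaller share and rare words a
# larger one; after renormalization the weights should still sum to (approximately) 1.
print(unigram_dist.max().item(), negative_sampling_dist.max().item())
print(negative_sampling_dist.sum().item())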
# Build the SGNS training dataset
dataset = SGNSDataset(
    corpus,
    vocab,
    context_size=context_size,
    n_negatives=n_negatives,
    ns_dist=negative_sampling_dist
)
data_loader = get_loader(dataset, batch_size)
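# Optional shape check (not in the original script; it assumes get_loader wires up
# dataset.collate_fn): each batch should contain target words of shape (batch_size,),
# contexts of shape (batch_size, 2 * context_size), and negative contexts of shape
# (batch_size, n_negatives * 2 * context_size).
words_b, contexts_b, neg_contexts_b = next(iter(data_loader))
print(words_b.shape, contexts_b.shape, neg_contexts_b.shape)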
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = SGNSModel(len(vocab), embedding_dim)
model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()
for epoch in range(num_epoch):
    total_loss = 0
    for batch in tqdm(data_loader, desc=f"Training Epoch {epoch}"):
        words, contexts, neg_contexts = [x.to(device) for x in batch]
        optimizer.zero_grad()
        batch_size = words.shape[0]
        # Look up the vector representations of the target words, context words,
        # and negative samples in the batch
        word_embeds = model.forward_w(words).unsqueeze(dim=2)
        context_embeds = model.forward_c(contexts)
        neg_context_embeds = model.forward_c(neg_contexts)
        # Log-likelihood of classifying the positive samples correctly
        # word_embeds.shape = (batch_size, embedding_dim, 1)
        # context_embeds.shape = (batch_size, context_size, embedding_dim)
        context_loss = F.logsigmoid(torch.bmm(context_embeds, word_embeds).squeeze(dim=2))
        context_loss = context_loss.mean(dim=1)
        # Log-likelihood of classifying the negative samples correctly
        neg_context_loss = F.logsigmoid(torch.bmm(neg_context_embeds, word_embeds).squeeze(dim=2).neg())
        neg_context_loss = neg_context_loss.view(batch_size, -1, n_negatives).sum(dim=2)
        neg_context_loss = neg_context_loss.mean(dim=1)
        # Loss: negative log-likelihood
        loss = -(context_loss + neg_context_loss).mean()
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Loss: {total_loss:.2f}")
# Sum the word embedding matrix and the context embedding matrix as the final pretrained word vectors
combined_embeds = model.w_embeddings.weight + model.c_embeddings.weight
save_pretrained(vocab, combined_embeds.data, "sgns.vec")
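Before moving on to clustering and visualization, it can help to spot-check the saved vectors. The lines below are a minimal sketch, assuming save_pretrained wrote sgns.vec in the standard word2vec text format (a "count dimension" header followed by one word per line), which is the same assumption the later sections make; the query word is arbitrary and only illustrative.
from gensim.models import KeyedVectors
wv = KeyedVectors.load_word2vec_format('sgns.vec', binary=False)
query = 'bank'  # an arbitrary example word; pick any word known to be in the Reuters vocabulary
if query in wv.key_to_index:
    print(wv.most_similar(query, topn=5))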
K-means clustering and visualization
# Import the required libraries
from gensim.models.keyedvectors import KeyedVectors
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import numpy as np
# Load the pretrained word vectors
word_vectors = KeyedVectors.load_word2vec_format('sgns.vec', binary=False)
# Number of clusters K; 5 is used here as an example
K = 5
# Collect the word vectors to be clustered
vectors = word_vectors.vectors
# Run K-means clustering
kmeans = KMeans(n_clusters=K, init='k-means++', max_iter=300, n_init=10, random_state=0)
kmeans.fit(vectors)
# Cluster label assigned to each word
labels = kmeans.labels_
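# Optional inspection (not part of the original code): print a few member words of each
# cluster to get a qualitative sense of what K-means grouped together. This relies on
# kmeans.labels_ being aligned with word_vectors.index_to_key, which holds because the
# clustering was run directly on word_vectors.vectors.
for cluster_id in range(K):
    members = [w for w, l in zip(word_vectors.index_to_key, labels) if l == cluster_id]
    print(f"Cluster {cluster_id}: {members[:10]}")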
# Visualize the clustering result
# First reduce the vectors to 2D with PCA
pca = PCA(n_components=2)
reduced_vectors = pca.fit_transform(vectors)
# Plot the clusters
plt.figure(figsize=(10, 8))
colors = ['r', 'g', 'b', 'c', 'm', 'y', 'k']
markers = ['o', '^', 's', 'p', '*', 'h', 'x']
for i, color in zip(range(K), colors):
    # Select the points belonging to the current cluster
    class_member_mask = (labels == i)
    xy = reduced_vectors[class_member_mask]
    plt.scatter(xy[:, 0], xy[:, 1],
                c=color,
                marker=markers[i % len(markers)],
                alpha=0.5,
                label=f'Cluster {i}')
plt.title('K-means Clustering of Word Vectors')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend(scatterpoints=1)
plt.grid(True)
plt.show()
print("Finished clustering and visualization.")
Dimensionality reduction with t-SNE
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
# Load the word2vec model (the vec file is assumed to be in the standard word2vec text format)
# Note: replace the path with the actual location of your sgns.vec file
model_path = 'sgns.vec'
# Note: the first line of the vec file holds the word count and dimension;
# load_word2vec_format reads this header automatically
model = KeyedVectors.load_word2vec_format(model_path, binary=False)
# Collect the word vectors
words = model.index_to_key[:1000]  # only the first 1000 words as an example; adjust as needed
vectors = np.array([model[word] for word in words])
# Reduce to 2D with t-SNE (2D is used here as an example)
tsne = TSNE(n_components=2, random_state=42)
vectors_2d = tsne.fit_transform(vectors)
# Visualize the reduced vectors
plt.figure(figsize=(10, 8))
for i, word in enumerate(words):
    x, y = vectors_2d[i]
    plt.scatter(x, y)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')
plt.title('t-SNE Visualization of Word Embeddings')
plt.show()
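If the layout above is hard to read, a common variant is to initialize t-SNE from PCA, which tends to preserve more of the global structure. The two lines below are an optional sketch, not part of the original walkthrough, and assume a reasonably recent scikit-learn in which init='pca' and learning_rate='auto' are supported.
tsne_pca = TSNE(n_components=2, init='pca', learning_rate='auto', random_state=42)
vectors_2d_pca = tsne_pca.fit_transform(vectors)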
Visualizing word vectors for 40 selected words (plants, titles, honorifics, countries)
import numpy as np
import matplotlib.pyplot as plt
from gensim.models import KeyedVectors
from sklearn.manifold import TSNE
# Load the word2vec model
model_path = 'sgns.vec'
model = KeyedVectors.load_word2vec_format(model_path, binary=False)
plants = ['rose', 'oak', 'maple', 'bamboo', 'orchid', 'cactus', 'palm', 'iris', 'daisy', 'lotus']
titles = ['president', 'doctor', 'engineer', 'artist', 'teacher', 'lawyer', 'architect', 'nurse', 'writer', 'scientist']
honorifics = ['mr', 'mrs', 'ms', 'dr', 'professor', 'sir', 'madam', 'lord', 'lady', 'captain']
countries = ['usa', 'china', 'france', 'brazil', 'japan', 'germany', 'india', 'australia', 'canada', 'russia']
# Merge all word lists
selected_words = plants + titles + honorifics + countries
# Keep only the words that actually appear in the model's vocabulary
valid_words = [word for word in selected_words if word in model.index_to_key]
# Look up the vectors of the selected words
vectors = np.array([model[word] for word in valid_words])
# Reduce to 2D with t-SNE; perplexity must be smaller than the number of samples
tsne = TSNE(n_components=2, perplexity=min(30, len(valid_words) - 1), random_state=42)
vectors_2d = tsne.fit_transform(vectors)
# Visualization
plt.figure(figsize=(12, 8))
# Assign a color and marker to each word according to its category
word_to_color = {word: 'r' if word in plants else 'g' if word in titles else 'b' if word in honorifics else 'y' for word in valid_words}
word_to_marker = {word: 'o' if word in plants else '^' if word in titles else 's' if word in honorifics else '*' for word in valid_words}
for i, word in enumerate(valid_words):
    x, y = vectors_2d[i]
    plt.scatter(x, y, c=word_to_color[word], marker=word_to_marker[word], alpha=0.6)
    plt.annotate(word, xy=(x, y), textcoords='offset points', xytext=(0, 0), ha='right', va='bottom')
plt.title('t-SNE Visualization of Selected Word Embeddings')
plt.legend(handles=[
    plt.Line2D([0], [0], marker='o', color='w', label='Plants', markerfacecolor='r', markersize=10),
    plt.Line2D([0], [0], marker='^', color='w', label='Titles', markerfacecolor='g', markersize=10),
    plt.Line2D([0], [0], marker='s', color='w', label='Honorifics', markerfacecolor='b', markersize=10),
    plt.Line2D([0], [0], marker='*', color='w', label='Countries', markerfacecolor='y', markersize=10)
])
plt.show()
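As a numeric complement to the scatter plot, cosine similarities within a category should generally be higher than across categories if the embeddings capture the groupings above. The word pairs below are just examples taken from the lists; the membership guard keeps the sketch safe when a word is missing from the Reuters vocabulary.
pairs = [('doctor', 'nurse'), ('doctor', 'bamboo'), ('china', 'japan')]
for w1, w2 in pairs:
    if w1 in model.key_to_index and w2 in model.key_to_index:
        print(w1, w2, model.similarity(w1, w2))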