6-3 Training Models with a GPU

Posted by lotuslaw on 2024-08-04

Training a deep learning model is often extremely time-consuming. Training for a few hours is routine, training for several days is common, and sometimes a model trains for dozens of days.

The training time comes mainly from two parts: data preparation and parameter iteration.

When data preparation is the main bottleneck of training time, we can use more processes to prepare the data, as sketched below.
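
For example, the number of DataLoader worker processes controls how many batches are prepared in parallel (ds below is a hypothetical Dataset, just to illustrate the knob):

from torch.utils.data import DataLoader

# more workers prepare batches in parallel with training;
# pin_memory additionally speeds up later CPU-to-GPU copies
dl = DataLoader(ds, batch_size=128, shuffle=True, num_workers=4, pin_memory=True)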

When parameter iteration becomes the main bottleneck of training time, the usual approach is to use a GPU for acceleration.

import torch 
import torchkeras 
import torchmetrics

print("torch.__version__ = ",torch.__version__)
print("torchkeras.__version__ = ",torchkeras.__version__)
print("torchmetrics.__version__ = ",torchmetrics.__version__)

"""
torch.__version__ =  2.3.1+cu121
torchkeras.__version__ =  3.9.6
torchmetrics.__version__ =  1.4.1
"""

Using a GPU to accelerate a model in PyTorch is very simple: just move the model and the data onto the GPU. The core code takes only the following few lines.

# Define the model

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)  # move the model to cuda

# Train the model

features = features.to(device)  # move the data to cuda
labels = labels.to(device)  # or: labels = labels.cuda() if torch.cuda.is_available() else labels
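
Put together, a minimal self-contained version of this pattern might look as follows (a toy linear model and random data, purely to show the two moves):

import torch
from torch import nn

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

model = nn.Linear(10, 1).to(device)        # move the model once, before training
features = torch.rand(32, 10).to(device)   # move every batch of data
labels = torch.rand(32, 1).to(device)

loss = nn.functional.mse_loss(model(features), labels)
loss.backward()                            # gradients are computed on the GPU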

Using multiple GPUs to train a model is also very simple: just wrap the model as a data-parallel model.
Once the model is moved to the GPU, a replica is created on each GPU and the data is split evenly across the GPUs for training. The core code is as follows.

# Define the model

if torch.cuda.device_count() > 1:
    model = nn.DataParallel(model)  # wrap as a data-parallel model
model.to(device)  # the wrapped model still needs to be moved to cuda

# Train the model
features = features.to(device)  # move the data to cuda
labels = labels.to(device)  # or: labels = labels.cuda() if torch.cuda.is_available() else labels
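
nn.DataParallel is the simplest option, but it runs in a single process, and PyTorch's documentation recommends DistributedDataParallel for serious multi-GPU training. A minimal sketch of that alternative, assuming a launch command such as torchrun --nproc_per_node=2 train.py:

import os
import torch
import torch.distributed as dist
from torch import nn
from torch.nn.parallel import DistributedDataParallel as DDP

dist.init_process_group(backend="nccl")      # one process per GPU
local_rank = int(os.environ["LOCAL_RANK"])   # set by torchrun
torch.cuda.set_device(local_rank)

model = nn.Linear(2, 1).cuda(local_rank)
model = DDP(model, device_ids=[local_rank])  # gradients are synchronized across processes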

1. Summary of GPU-related operations

import torch
from torch import nn

# Query GPU information
if_cuda = torch.cuda.is_available()
print("if_cuda=", if_cuda)

gpu_count = torch.cuda.device_count()
print("gpu_count=", gpu_count)

"""
if_cuda= True
gpu_count= 1
"""

# Move tensors between GPU and CPU
tensor = torch.rand((100, 100))
tensor_gpu = tensor.to("cuda:0")  # or: tensor_gpu = tensor.cuda()
print(tensor_gpu.device)
print(tensor_gpu.is_cuda)

tensor_cpu = tensor_gpu.to("cpu")  # or: tensor_cpu = tensor_gpu.cpu()
print(tensor_cpu.device)

"""
cuda:0
True
cpu
"""

# Move all tensors in a model to the GPU
net = nn.Linear(2, 1)
print(next(net.parameters()).is_cuda)
net.to("cuda:0")  # moves all parameter tensors in the model to the GPU in place; note that there is no need to reassign net = net.to("cuda:0")
print(next(net.parameters()).is_cuda)
print(next(net.parameters()).device)

"""
False
True
cuda:0
"""

# Build a model that supports data parallelism over multiple GPUs
linear = nn.Linear(2, 1)
print(next(linear.parameters()).device)

model = nn.DataParallel(linear)
print(model.device_ids)
print(next(model.module.parameters()).device)

# Note: when saving parameters, make sure to save the state_dict of model.module
torch.save(model.module.state_dict(), "model_parameter.pt")

linear = nn.Linear(2, 1)
linear.load_state_dict(torch.load("model_parameter.pt"))

"""
cpu
[0]
cuda:0
<All keys matched successfully>
"""

2. Matrix multiplication example

Below we run the same matrix multiplication on the CPU and on the GPU and compare the computational efficiency.

import time
import torch
from torch import nn

# Using the CPU
a = torch.rand((10000, 200))
b = torch.rand((200, 10000))
tic = time.time()
c = torch.matmul(a, b)
toc = time.time()

print(toc - tic)
print(a.device)
print(b.device)

"""
0.26279258728027344
cpu
cpu
"""

# Using the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
a = torch.rand((10000, 200), device=device)  # a tensor can be created directly on the GPU
b = torch.rand((200, 10000))
b = b.to(device)  # or: b = b.cuda() if torch.cuda.is_available() else b
tic = time.time()
c = torch.matmul(a, b)
toc = time.time()

print(toc - tic)
print(a.device)
print(b.device)

"""
0.5037369728088379
cuda:0
cuda:0
"""

3. Linear regression example

Below we compare the efficiency of training a linear regression model on the CPU and on the GPU. (Note that the two runs use different sample sizes, 10,000,000 on the CPU versus 1,000,000 on the GPU, so the wall-clock times below are not directly comparable.)

# Prepare data
n = 10000000  # number of samples
X = 10 * torch.rand([n, 2]) - 5.0
w0 = torch.tensor([[2.0, -3.0]])
b0 = torch.tensor([[10.0]])
Y = X@w0.t() + b0 + torch.normal(0.0, 2.0, size=[n, 1])

# Define the model
class LinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        self.w = nn.Parameter(torch.randn_like(w0))
        self.b = nn.Parameter(torch.zeros_like(b0))

    def forward(self, x):
        return x@self.w.t() + self.b

linear = LinearRegression()

# Train the model
optimizer = torch.optim.Adam(linear.parameters(), lr=0.1)
loss_fn = nn.MSELoss()

def train(epoches):
    tic = time.time()
    for epoch in range(epoches):
        optimizer.zero_grad()
        Y_pred = linear(X)
        loss = loss_fn(Y_pred, Y)
        loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print({"epoch": epoch, "loss": loss.item()})
    toc = time.time()
    print("time used:", toc - tic)
train(500)

"""
{'epoch': 0, 'loss': 211.0252227783203}
{'epoch': 50, 'loss': 33.406837463378906}
{'epoch': 100, 'loss': 9.043604850769043}
{'epoch': 150, 'loss': 4.492393970489502}
{'epoch': 200, 'loss': 4.024799346923828}
{'epoch': 250, 'loss': 4.001006603240967}
{'epoch': 300, 'loss': 4.000483512878418}
{'epoch': 350, 'loss': 4.0004801750183105}
{'epoch': 400, 'loss': 4.0004801750183105}
{'epoch': 450, 'loss': 4.0004801750183105}
time used: 48.405426263809204
"""
# Using the GPU
# Prepare data
n = 1000000  # number of samples

X = 10 * torch.rand([n, 2]) - 5.0  # torch.rand draws from a uniform distribution
w0 = torch.tensor([[2.0, -3.0]])
b0 = torch.tensor([[10.0]])
Y = X @ w0.t() + b0 + torch.normal(0.0, 2.0, size=[n, 1])  # @ is matrix multiplication; add Gaussian noise

# Move the data to the GPU
print("torch.cuda.is_available() = ", torch.cuda.is_available())
X = X.cuda()
Y = Y.cuda()
print("X.device:", X.device)
print("Y.device:", Y.device)

# Define the model
class LinearRegression(nn.Module):
    def __init__(self):
        super().__init__()
        self.w = nn.Parameter(torch.randn_like(w0))
        self.b = nn.Parameter(torch.zeros_like(b0))

    # forward pass
    def forward(self, x):
        return x @ self.w.t() + self.b

linear = LinearRegression()

# Move the model to the GPU
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
linear.to(device)

# Check that the model has indeed been moved to the GPU
print("if on cuda:", next(linear.parameters()).is_cuda)


# Train the model
optimizer = torch.optim.Adam(linear.parameters(), lr=0.1)
loss_fn = nn.MSELoss()

def train(epoches):
    tic = time.time()
    for epoch in range(epoches):
        optimizer.zero_grad()
        Y_pred = linear(X)
        loss = loss_fn(Y_pred, Y)
        loss.backward()
        optimizer.step()
        if epoch % 50 == 0:
            print({"epoch": epoch, "loss": loss.item()})
    toc = time.time()
    print("time used:", toc - tic)

train(500)

"""
torch.cuda.is_available() =  True
X.device: cuda:0
Y.device: cuda:0
if on cuda: True
{'epoch': 0, 'loss': 186.33334350585938}
{'epoch': 50, 'loss': 33.154579162597656}
{'epoch': 100, 'loss': 9.04254150390625}
{'epoch': 150, 'loss': 4.492138862609863}
{'epoch': 200, 'loss': 4.024778366088867}
{'epoch': 250, 'loss': 4.00100564956665}
{'epoch': 300, 'loss': 4.00048303604126}
{'epoch': 350, 'loss': 4.0004801750183105}
{'epoch': 400, 'loss': 4.0004801750183105}
{'epoch': 450, 'loss': 4.0004801750183105}
time used: 1.3654239177703857
"""

4. Image classification example

import torch 
from torch import nn 

import torchvision 
from torchvision import transforms

transform = transforms.Compose([transforms.ToTensor()])

# download=False assumes the dataset already exists under root; set download=True on the first run
ds_train = torchvision.datasets.MNIST(root="dataset/mnist/", train=True, download=False, transform=transform)
ds_val = torchvision.datasets.MNIST(root="dataset/mnist/", train=False, download=False, transform=transform)

dl_train =  torch.utils.data.DataLoader(ds_train, batch_size=128, shuffle=True, num_workers=2)
dl_val =  torch.utils.data.DataLoader(ds_val, batch_size=128, shuffle=False, num_workers=2)

print(len(ds_train))
print(len(ds_val))

"""
60000
10000
"""


def create_net():
    net = nn.Sequential()
    net.add_module("conv1",nn.Conv2d(in_channels=1,out_channels=32,kernel_size = 3))
    net.add_module("pool1",nn.MaxPool2d(kernel_size = 2,stride = 2))
    net.add_module("conv2",nn.Conv2d(in_channels=32,out_channels=64,kernel_size = 5))
    net.add_module("pool2",nn.MaxPool2d(kernel_size = 2,stride = 2))
    net.add_module("dropout",nn.Dropout2d(p = 0.1))
    net.add_module("adaptive_pool",nn.AdaptiveMaxPool2d((1,1)))
    net.add_module("flatten",nn.Flatten())
    net.add_module("linear1",nn.Linear(64,32))
    net.add_module("relu",nn.ReLU())
    net.add_module("linear2",nn.Linear(32,10))
    return net

net = create_net()
print(net)

"""
Sequential(
  (conv1): Conv2d(1, 32, kernel_size=(3, 3), stride=(1, 1))
  (pool1): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (conv2): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1))
  (pool2): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (dropout): Dropout2d(p=0.1, inplace=False)
  (adaptive_pool): AdaptiveMaxPool2d(output_size=(1, 1))
  (flatten): Flatten(start_dim=1, end_dim=-1)
  (linear1): Linear(in_features=64, out_features=32, bias=True)
  (relu): ReLU()
  (linear2): Linear(in_features=32, out_features=10, bias=True)
)
"""
# Train on the CPU
import os,sys,time
import numpy as np
import pandas as pd
import datetime 
from tqdm import tqdm 

import torch
from torch import nn 
from copy import deepcopy
from torchmetrics import Accuracy
# Note: for multiclass tasks use the metrics from torchmetrics; for binary classification use the metrics from torchkeras.metrics

def printlog(info):
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n"+"=========="*8 + "%s"%nowtime)
    print(str(info)+"\n")
    

net = create_net() 

loss_fn = nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(net.parameters(),lr = 0.01)   
metrics_dict = {"acc":Accuracy(task='multiclass',num_classes=10)}

epochs = 3 
ckpt_path='checkpoint.pt'

# early-stopping settings
monitor="val_acc"
patience=1
mode="max"

history = {}

for epoch in range(1, epochs+1):
    printlog("Epoch {0} / {1}".format(epoch, epochs))

    # 1,train -------------------------------------------------  
    net.train()
    
    total_loss,step = 0,0
    
    loop = tqdm(enumerate(dl_train), total =len(dl_train),file=sys.stdout)
    train_metrics_dict = deepcopy(metrics_dict) 
    
    for i, batch in loop: 
        
        features,labels = batch
        #forward
        preds = net(features)
        loss = loss_fn(preds,labels)
        
        #backward
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
            
        #metrics
        step_metrics = {"train_"+name:metric_fn(preds, labels).item() 
                        for name,metric_fn in train_metrics_dict.items()}
        
        step_log = dict({"train_loss":loss.item()},**step_metrics)

        total_loss += loss.item()
        
        step+=1
        if i!=len(dl_train)-1:
            loop.set_postfix(**step_log)
        else:
            epoch_loss = total_loss/step
            epoch_metrics = {"train_"+name:metric_fn.compute().item() 
                             for name,metric_fn in train_metrics_dict.items()}
            epoch_log = dict({"train_loss":epoch_loss},**epoch_metrics)
            loop.set_postfix(**epoch_log)

            for name,metric_fn in train_metrics_dict.items():
                metric_fn.reset()
                
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]
        

    # 2,validate -------------------------------------------------
    net.eval()
    
    total_loss,step = 0,0
    loop = tqdm(enumerate(dl_val), total =len(dl_val),file=sys.stdout)
    
    val_metrics_dict = deepcopy(metrics_dict) 
    
    with torch.no_grad():
        for i, batch in loop: 

            features,labels = batch
            
            #forward
            preds = net(features)
            loss = loss_fn(preds,labels)

            #metrics
            step_metrics = {"val_"+name:metric_fn(preds, labels).item() 
                            for name,metric_fn in val_metrics_dict.items()}

            step_log = dict({"val_loss":loss.item()},**step_metrics)

            total_loss += loss.item()
            step+=1
            if i!=len(dl_val)-1:
                loop.set_postfix(**step_log)
            else:
                epoch_loss = (total_loss/step)
                epoch_metrics = {"val_"+name:metric_fn.compute().item() 
                                 for name,metric_fn in val_metrics_dict.items()}
                epoch_log = dict({"val_loss":epoch_loss},**epoch_metrics)
                loop.set_postfix(**epoch_log)

                for name,metric_fn in val_metrics_dict.items():
                    metric_fn.reset()
                    
    epoch_log["epoch"] = epoch           
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]

    # 3,early-stopping -------------------------------------------------
    arr_scores = history[monitor]
    best_score_idx = np.argmax(arr_scores) if mode=="max" else np.argmin(arr_scores)
    if best_score_idx==len(arr_scores)-1:
        torch.save(net.state_dict(),ckpt_path)
        print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
             arr_scores[best_score_idx]))
    if len(arr_scores)-best_score_idx>patience:
        print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
            monitor,patience))
        break 
    net.load_state_dict(torch.load(ckpt_path))
    
dfhistory = pd.DataFrame(history)

"""
================================================================================2024-08-04 16:57:44
Epoch 1 / 3

100%|█████████████████████████████████████████████| 469/469 [00:44<00:00, 10.42it/s, train_acc=0.903, train_loss=0.295]
100%|██████████████████████████████████████████████████| 79/79 [00:03<00:00, 24.28it/s, val_acc=0.975, val_loss=0.0781]
<<<<<< reach best val_acc : 0.9751999974250793 >>>>>>

================================================================================2024-08-04 16:58:32
Epoch 2 / 3

100%|█████████████████████████████████████████████| 469/469 [00:44<00:00, 10.54it/s, train_acc=0.967, train_loss=0.108]
100%|██████████████████████████████████████████████████| 79/79 [00:03<00:00, 25.28it/s, val_acc=0.976, val_loss=0.0732]
<<<<<< reach best val_acc : 0.9757999777793884 >>>>>>

================================================================================2024-08-04 16:59:20
Epoch 3 / 3

100%|█████████████████████████████████████████████| 469/469 [00:42<00:00, 11.03it/s, train_acc=0.972, train_loss=0.094]
100%|██████████████████████████████████████████████████| 79/79 [00:03<00:00, 25.24it/s, val_acc=0.981, val_loss=0.0605]
<<<<<< reach best val_acc : 0.9811999797821045 >>>>>>
"""
# Train on the GPU
import os,sys,time
import numpy as np
import pandas as pd
import datetime 
from tqdm import tqdm 

import torch
from torch import nn 
from copy import deepcopy
from torchmetrics import Accuracy
# Note: for multiclass tasks use the metrics from torchmetrics; for binary classification use the metrics from torchkeras.metrics

def printlog(info):
    nowtime = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print("\n"+"=========="*8 + "%s"%nowtime)
    print(str(info)+"\n")
    
net = create_net() 


loss_fn = nn.CrossEntropyLoss()
optimizer= torch.optim.Adam(net.parameters(),lr = 0.01)   
metrics_dict = {"acc":Accuracy(task='multiclass',num_classes=10)}

# ============================ Move the model to the GPU ============================
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
loss_fn.to(device)
for name, fn in metrics_dict.items():
    fn.to(device)
# ======================================================================

epochs = 5 
ckpt_path='checkpoint.pt'

# early-stopping settings
monitor="val_acc"
patience=1
mode="max"

history = {}

for epoch in range(1, epochs+1):
    printlog("Epoch {0} / {1}".format(epoch, epochs))

    # 1,train -------------------------------------------------  
    net.train()
    
    total_loss,step = 0,0
    
    loop = tqdm(enumerate(dl_train), total =len(dl_train),file=sys.stdout)
    train_metrics_dict = deepcopy(metrics_dict) 
    
    for i, batch in loop: 
        
        features,labels = batch
        # ==================================== Move the data to the GPU ====================================
        features = features.to(device)
        labels = labels.to(device)
        # ======================================================================================
        #forward
        preds = net(features)
        loss = loss_fn(preds,labels)
        
        #backward
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
            
        #metrics
        step_metrics = {"train_"+name:metric_fn(preds, labels).item() 
                        for name,metric_fn in train_metrics_dict.items()}
        
        step_log = dict({"train_loss":loss.item()},**step_metrics)

        total_loss += loss.item()
        
        step+=1
        if i!=len(dl_train)-1:
            loop.set_postfix(**step_log)
        else:
            epoch_loss = total_loss/step
            epoch_metrics = {"train_"+name:metric_fn.compute().item() 
                             for name,metric_fn in train_metrics_dict.items()}
            epoch_log = dict({"train_loss":epoch_loss},**epoch_metrics)
            loop.set_postfix(**epoch_log)

            for name,metric_fn in train_metrics_dict.items():
                metric_fn.reset()
                
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]
        

    # 2,validate -------------------------------------------------
    net.eval()
    
    total_loss,step = 0,0
    loop = tqdm(enumerate(dl_val), total =len(dl_val),file=sys.stdout)
    
    val_metrics_dict = deepcopy(metrics_dict) 
    
    with torch.no_grad():
        for i, batch in loop: 

            features, labels = batch
            # ==================================== Move the data to the GPU ====================================
            features = features.to(device)
            labels = labels.to(device)
            # ======================================================================================
            #forward
            preds = net(features)
            loss = loss_fn(preds,labels)

            #metrics
            step_metrics = {"val_"+name:metric_fn(preds, labels).item() 
                            for name,metric_fn in val_metrics_dict.items()}

            step_log = dict({"val_loss":loss.item()},**step_metrics)

            total_loss += loss.item()
            step+=1
            if i!=len(dl_val)-1:
                loop.set_postfix(**step_log)
            else:
                epoch_loss = (total_loss/step)
                epoch_metrics = {"val_"+name:metric_fn.compute().item() 
                                 for name,metric_fn in val_metrics_dict.items()}
                epoch_log = dict({"val_loss":epoch_loss},**epoch_metrics)
                loop.set_postfix(**epoch_log)

                for name,metric_fn in val_metrics_dict.items():
                    metric_fn.reset()
                    
    epoch_log["epoch"] = epoch           
    for name, metric in epoch_log.items():
        history[name] = history.get(name, []) + [metric]

    # 3,early-stopping -------------------------------------------------
    arr_scores = history[monitor]
    best_score_idx = np.argmax(arr_scores) if mode=="max" else np.argmin(arr_scores)
    if best_score_idx==len(arr_scores)-1:
        torch.save(net.state_dict(),ckpt_path)
        print("<<<<<< reach best {0} : {1} >>>>>>".format(monitor,
             arr_scores[best_score_idx]))
    if len(arr_scores)-best_score_idx>patience:
        print("<<<<<< {} without improvement in {} epoch, early stopping >>>>>>".format(
            monitor,patience))
        break 
    net.load_state_dict(torch.load(ckpt_path))
    
dfhistory = pd.DataFrame(history)

"""
================================================================================2024-08-04 17:03:49
Epoch 1 / 5

100%|█████████████████████████████████████████████| 469/469 [00:07<00:00, 63.35it/s, train_acc=0.886, train_loss=0.347]
100%|███████████████████████████████████████████████████| 79/79 [00:02<00:00, 31.36it/s, val_acc=0.952, val_loss=0.153]
<<<<<< reach best val_acc : 0.9517999887466431 >>>>>>

================================================================================2024-08-04 17:03:59
Epoch 2 / 5

100%|█████████████████████████████████████████████| 469/469 [00:05<00:00, 91.89it/s, train_acc=0.965, train_loss=0.116]
100%|██████████████████████████████████████████████████| 79/79 [00:03<00:00, 22.98it/s, val_acc=0.975, val_loss=0.0832]
<<<<<< reach best val_acc : 0.9751999974250793 >>>>>>

================================================================================2024-08-04 17:04:07
Epoch 3 / 5

100%|████████████████████████████████████████████| 469/469 [00:04<00:00, 94.34it/s, train_acc=0.973, train_loss=0.0886]
100%|███████████████████████████████████████████████████| 79/79 [00:02<00:00, 32.07it/s, val_acc=0.98, val_loss=0.0649]
<<<<<< reach best val_acc : 0.9800000190734863 >>>>>>

================================================================================2024-08-04 17:04:15
Epoch 4 / 5

100%|████████████████████████████████████████████| 469/469 [00:05<00:00, 91.59it/s, train_acc=0.975, train_loss=0.0861]
100%|██████████████████████████████████████████████████| 79/79 [00:02<00:00, 30.38it/s, val_acc=0.979, val_loss=0.0748]
<<<<<< val_acc without improvement in 1 epoch, early stopping >>>>>>
"""

5. Using the GPU in torchkeras.KerasModel

As the examples above show, using a GPU in PyTorch is not complicated, but for those who train models frequently, moving the model and the data back and forth is still fairly tedious.

It is easy to forget to move some piece of data or some module, which then causes errors.

torchkeras.KerasModel was designed with this in mind: if a usable GPU exists in the environment, it is used automatically; otherwise the CPU is used.

By drawing on some basic capabilities of accelerate, torchkeras.KerasModel switches between GPU and CPU in a very elegant way.

See the torchkeras.KerasModel source code for the details of the implementation.

import accelerate

accelerator = accelerate.Accelerator()
print(accelerator.device)

"""
cuda
"""
from torchkeras import KerasModel
from torchmetrics import Accuracy

net = create_net()
model = KerasModel(net, loss_fn=nn.CrossEntropyLoss(), metrics_dict={"acc": Accuracy(task="multiclass", num_classes=10)},
                  optimizer=torch.optim.Adam(net.parameters(), lr=0.01))

model.fit(train_data=dl_train, val_data=dl_val, epochs=10, patience=3, monitor='val_acc', mode='max')
