使用Pytorch和卷積神經網路進行簡單的數字識別(MNIST)
說明:
- 首次發表日期:2024-10-30
- 參考:
- https://github.com/bentrevett/pytorch-image-classification/blob/master/1_mlp.ipynb
- https://www.sciencedirect.com/science/article/pii/S2214579620300502?via%3Dihub#se0070
- https://pytorch.org/tutorials/beginner/basics/buildmodel_tutorial.html
- https://blog.csdn.net/zylooooooooong/article/details/122805833
- https://www.markovml.com/blog/normalization-in-machine-learning
準備CONDA環境
# 在潤建WORKBENCH上使用
conda create -n py310torch python=3.10
conda activate py310torch
conda install pytorch==2.5.0 torchvision==0.20.0 torchaudio==2.5.0 pytorch-cuda=12.1 -c pytorch -c nvidia
pip install matplotlib tqdm ipywidgets
下載MNIST資料集
Kaggle上有MNIST資料集: https://www.kaggle.com/datasets/hojjatk/mnist-dataset
apt-get update
apt-get install curl
mkdir data
cd data
curl -L -o mnist.zip https://www.kaggle.com/api/v1/datasets/download/hojjatk/mnist-dataset
unzip mnist.zip
將MNIST資料集儲存為圖片和標註
借鑑 https://www.kaggle.com/code/hojjatk/read-mnist-dataset 中的讀取MNIST資料的程式碼,並新增儲存為圖片和標註(按PaddleOCR格式)
import numpy as np # linear algebra
import struct
from array import array
from os.path import join
import os
from PIL import Image
#
# MNIST Data Loader Class
#
class MnistDataloader(object):
    """Reader for the raw IDX-format MNIST files (images and labels)."""

    def __init__(self, training_images_filepath, training_labels_filepath,
                 test_images_filepath, test_labels_filepath):
        # Paths to the four IDX files (train/test images and labels).
        self.training_images_filepath = training_images_filepath
        self.training_labels_filepath = training_labels_filepath
        self.test_images_filepath = test_images_filepath
        self.test_labels_filepath = test_labels_filepath

    def read_images_labels(self, images_filepath, labels_filepath):
        """Parse one images/labels IDX file pair.

        Returns:
            images: list of (rows, cols) uint8 numpy arrays.
            labels: flat array of integer labels.

        Raises:
            ValueError: if either file's magic number does not match the
                IDX spec (2049 for labels, 2051 for images).
        """
        with open(labels_filepath, 'rb') as file:
            magic, size = struct.unpack(">II", file.read(8))
            if magic != 2049:
                raise ValueError('Magic number mismatch, expected 2049, got {}'.format(magic))
            labels = array("B", file.read())
        with open(images_filepath, 'rb') as file:
            magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
            if magic != 2051:
                raise ValueError('Magic number mismatch, expected 2051, got {}'.format(magic))
            image_data = np.frombuffer(file.read(), dtype=np.uint8)
        # Reshape the whole buffer once instead of the original double loop
        # (which pre-filled zero lists and then overwrote them), and use the
        # header's rows/cols instead of hard-coding 28x28 so any IDX image
        # size is handled.
        images = list(image_data.reshape(size, rows, cols))
        return images, labels

    def load_data(self):
        """Load both splits as ((x_train, y_train), (x_test, y_test))."""
        x_train, y_train = self.read_images_labels(self.training_images_filepath, self.training_labels_filepath)
        x_test, y_test = self.read_images_labels(self.test_images_filepath, self.test_labels_filepath)
        return (x_train, y_train), (x_test, y_test)
input_path = './data'
training_images_filepath = join(input_path, 'train-images-idx3-ubyte/train-images-idx3-ubyte')
training_labels_filepath = join(input_path, 'train-labels-idx1-ubyte/train-labels-idx1-ubyte')
test_images_filepath = join(input_path, 't10k-images-idx3-ubyte/t10k-images-idx3-ubyte')
test_labels_filepath = join(input_path, 't10k-labels-idx1-ubyte/t10k-labels-idx1-ubyte')
mnist_dataloader = MnistDataloader(training_images_filepath, training_labels_filepath, test_images_filepath, test_labels_filepath)
(x_train, y_train), (x_test, y_test) = mnist_dataloader.load_data()


def _export_split(images, labels, dest_dir, prefix, label_file):
    """Save one split as PNG files plus a PaddleOCR-style label file.

    Each label line is "<dest_dir basename>/<filename>\t<label>". PNG is
    used deliberately: JPEG compression would alter the pixel values.
    """
    os.makedirs(dest_dir, exist_ok=True)
    subdir = os.path.basename(dest_dir)
    label_lines = []
    for i, (image, label) in enumerate(zip(images, labels)):
        fname = f"{prefix}_{i}_{label}.png"
        Image.fromarray(np.stack(image)).save(os.path.join(dest_dir, fname))
        label_lines.append(f"{subdir}/{fname}\t{label}")
    with open(label_file, "w") as f:
        f.write("\n".join(label_lines))


_export_split(x_train, y_train, "./dataset/mnist_train", "train", "./dataset/mnist_train_rec.txt")
# Fix: the original code named test images "train_<i>_<label>.png" as well;
# the test split now gets a "test_" prefix.
_export_split(x_test, y_test, "./dataset/mnist_test", "test", "./dataset/mnist_test_rec.txt")
注意:需要儲存為PNG格式,JPG格式儲存時會壓縮導致和原np.array不一致
準備資料DataLoader
自定義Dataset子類
按照官方教程 https://pytorch.org/tutorials/beginner/basics/data_tutorial.html#creating-a-custom-dataset-for-your-files , 一個自定義的Dataset
子類需要實現 __init__、__len__ 和 __getitem__ 這三個方法。
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data as data
from torch.utils.data import Dataset
import torchvision.transforms as transforms
import torchvision.datasets as datasets
from torchvision.io import decode_image
import numpy as np
from PIL import Image
from tqdm.notebook import trange, tqdm
import time
class CustomRecDataset(Dataset):
    """Dataset driven by PaddleOCR-style recognition label files.

    Each line of a label file has the form
    ``<image path relative to data_dir>\t<integer label>``.
    """

    def __init__(self, data_dir, label_file_or_file_list, transform=None, target_transform=None):
        """
        Args:
            data_dir: root directory the label-file paths are relative to.
            label_file_or_file_list: one label file path, or a list of them.
            transform: optional callable applied to each image.
            target_transform: optional callable applied to each label.
        """
        assert os.path.exists(data_dir), f"data_dir {data_dir} does not exists"
        self.data_dir = data_dir
        self.transform = transform
        self.target_transform = target_transform
        self.label_list = []
        # Fix: the original only handled the str case, so passing a list
        # silently produced an empty dataset.
        if isinstance(label_file_or_file_list, str):
            label_file_list = [label_file_or_file_list]
        else:
            label_file_list = list(label_file_or_file_list)
        for label_file in label_file_list:
            with open(label_file, "r") as f:
                self.label_list.extend(f.readlines())

    def __len__(self):
        return len(self.label_list)

    def __getitem__(self, idx):
        img_rel_path, label = self.label_list[idx].split("\t")
        label = int(label.replace("\n", ""))
        img_path = os.path.join(self.data_dir, img_rel_path)
        # Fix: PIL Images have no .numpy() method; convert via np.array.
        image = np.array(Image.open(img_path))
        if self.transform:
            image = self.transform(image)
        if self.target_transform:
            label = self.target_transform(label)
        return image, label
Normalization
通常,我們需要將圖片資料進行Normalization處理。
為何要做Normalization:
- 每個feature具有不同的scale,即有不同的取值範圍(比如房子的價格和麵積),Normalization 可以移除scale的影響,使得不同的feature對模型的貢獻是平等的,防止偏見。
- Normalization可以防止因數值過大造成的梯度爆炸或者梯度消失。
關於Normalization,瞭解更多請參考 https://www.markovml.com/blog/normalization-in-machine-learning
進行Normalization處理需要有MEAN和STD值,CV領域經常使用來自ImageNet的MEAN和STD值。
部落格 https://blog.csdn.net/zylooooooooong/article/details/122805833 指出是否使用ImageNet的均值和標準差取決於你的資料:
- 假設你的資料是“自然場景”的普通照片(人,建築,動物,不同的照明/角度/背景等等),並且假設你的資料集和 ImageNet 存在類似的偏差(在類別平衡方面),那麼使用 ImageNet 的統計資料進行規範化就可以了。
- 如果照片是“特殊的”(顏色過濾,對比度調整,不尋常的光線,等等)或“非自然的主題”(醫學影像,衛星地圖,手繪等),我建議在模型訓練之前正確地規範化你的資料集(計算新的平均值和標準差)。
本文不使用來自ImageNet的預訓練權重,所以我們需要計算MEAN和STD值:
# Build the training dataset WITHOUT transforms so each item comes back as
# a raw 0-255 pixel array (only the training split is used -- see the note
# below about not leaking test-set statistics).
ds = CustomRecDataset('dataset', 'dataset/mnist_train_rec.txt')
images = [image for (image, label) in ds]
# Mean/std over raw pixels, rescaled to the [0, 1] range that ToTensor
# produces; dividing std by 255 is valid because std scales linearly.
mean = np.mean(images) / 255.0
std = np.std(images) / 255.0
print(mean, std)
0.1306604762738429 0.30810780385646264
注意:計算MEAN和STD值的時候,只使用訓練集的資料,避免洩露來自測試集的資料。
Transforms
# Training transforms: light augmentation (small random rotation, padded
# random crop), then convert to a tensor in [0, 1] and normalize with the
# dataset statistics computed above.
train_transforms = transforms.Compose([
    transforms.RandomRotation(5, fill=(0,)),
    transforms.RandomCrop(28, padding=2),
    transforms.ToTensor(),
    transforms.Normalize(mean=[mean], std=[std])
])
# Evaluation transforms: no augmentation, same scaling and normalization.
test_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[mean], std=[std])
])
其中:
- RandomRotation 在 (-x, +x) 度之間隨機旋轉圖片
- RandomCrop 應用padding之後,隨機裁剪
- ToTensor 轉換為Tensor並scale到 [0.0, 1.0] 之間
準備訓練集,驗證集,測試集
# Full training set (augmented) and test set (plain transforms).
train_data = CustomRecDataset('dataset', 'dataset/mnist_train_rec.txt', transform=train_transforms)
test_data = CustomRecDataset('dataset', 'dataset/mnist_test_rec.txt', transform=test_transforms)
MNIST資料集沒有直接提供驗證集,我們從訓練集裡面取出10%來當做驗證集。
注意:我們可以從訓練集取部分資料當驗證集,但是不能從測試集取。
# Deterministic 90/10 train/validation split.
generator1 = torch.Generator().manual_seed(42)
train_data, valid_data = data.random_split(train_data, [0.9, 0.1], generator=generator1)
# Fix: setting `.transform` on a torch Subset has no effect -- a Subset just
# forwards indexing to its underlying dataset, so validation images would
# still be randomly rotated/cropped. Instead, point the validation subset at
# a second dataset instance that uses the non-augmenting test transforms
# (the split indices themselves are unchanged).
valid_data.dataset = CustomRecDataset('dataset', 'dataset/mnist_train_rec.txt', transform=test_transforms)
DataLoader
因為隨機梯度下降演算法需要,我們將訓練集DataLoader的 shuffle 設為 True。
BATCH_SIZE = 64
# Only the training loader shuffles (needed for SGD-style optimization);
# validation/test order does not affect the metrics.
train_iterator = data.DataLoader(train_data,
                                 shuffle=True,
                                 batch_size=BATCH_SIZE)
valid_iterator = data.DataLoader(valid_data,
                                 batch_size=BATCH_SIZE)
test_iterator = data.DataLoader(test_data,
                                batch_size=BATCH_SIZE)
定義模型
定義一個模型,參考自 https://www.sciencedirect.com/science/article/pii/S2214579620300502?via%3Dihub#se0070
class DigitNet(nn.Module):
    """Small CNN classifier for 28x28 single-channel digit images."""

    def __init__(self, output_dim=10):
        super().__init__()
        # Spatial sizes: 28 -> conv3 -> 26 -> pool -> 13 -> conv5 -> 9
        # -> pool -> 4 -> conv3 -> 2, giving 64 * 2 * 2 = 256 features.
        stages = [
            nn.Conv2d(1, 16, 3, 1),
            nn.MaxPool2d(2, 2),
            nn.ReLU(inplace=True),
            nn.Conv2d(16, 32, 5, 1),
            nn.MaxPool2d(2, 2),
            nn.ReLU(inplace=True),
            nn.Conv2d(32, 64, 3, 1),
            nn.Flatten(),
            nn.Linear(256, 128),
            nn.Linear(128, output_dim),
        ]
        self.classifier = nn.Sequential(*stages)

    def forward(self, x):
        # Raw (unnormalized) class logits; CrossEntropyLoss applies softmax.
        return self.classifier(x)
計算模型引數量:
def count_parameters(model):
    """Return the number of trainable parameters (requires_grad=True)."""
    total = 0
    for p in model.parameters():
        if p.requires_grad:
            total += p.numel()
    return total
# Instantiate the 10-class model and report its trainable parameter count.
model = DigitNet(10)
print(f'The model has {count_parameters(model):,} trainable parameters')
0.1306604762738429 0.30810780385646264
The model has 65,674 trainable parameters
訓練
# Adam with its default hyperparameters; CrossEntropyLoss expects raw logits.
optimizer = optim.Adam(model.parameters())
criterion = nn.CrossEntropyLoss()
# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)
criterion = criterion.to(device)
def calculate_accuracy(y_pred, y):
    """Fraction of samples whose highest-scoring class equals the target."""
    predicted = y_pred.argmax(1, keepdim=True)
    n_correct = predicted.eq(y.view_as(predicted)).sum()
    return n_correct.float() / y.shape[0]
def train(model, iterator, optimizer, criterion, device):
    """Run one training epoch and return (mean loss, mean accuracy)."""
    running_loss = 0.0
    running_acc = 0.0
    model.train()
    for (x, y) in tqdm(iterator, desc="Training", leave=False):
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        y_pred = model(x)
        loss = criterion(y_pred, y)
        acc = calculate_accuracy(y_pred, y)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        running_acc += acc.item()
    n_batches = len(iterator)
    return running_loss / n_batches, running_acc / n_batches
def evaluate(model, iterator, criterion, device):
    """Evaluate without gradients; return (mean loss, mean accuracy)."""
    total_loss = 0.0
    total_acc = 0.0
    model.eval()
    with torch.no_grad():
        for (x, y) in tqdm(iterator, desc="Evaluating", leave=False):
            x, y = x.to(device), y.to(device)
            y_pred = model(x)
            loss = criterion(y_pred, y)
            acc = calculate_accuracy(y_pred, y)
            total_loss += loss.item()
            total_acc += acc.item()
    n_batches = len(iterator)
    return total_loss / n_batches, total_acc / n_batches
def epoch_time(start_time, end_time):
    """Split an elapsed interval into whole (minutes, seconds)."""
    minutes, seconds = divmod(end_time - start_time, 60)
    return int(minutes), int(seconds)
EPOCHS = 10
best_valid_loss = float('inf')
for epoch in trange(EPOCHS):
    start_time = time.monotonic()
    train_loss, train_acc = train(model, train_iterator, optimizer, criterion, device)
    valid_loss, valid_acc = evaluate(model, valid_iterator, criterion, device)
    # Checkpoint only when validation loss improves (simple model selection).
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'tut1-model.pt')
    end_time = time.monotonic()
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    print(f'Epoch: {epoch+1:02} | Epoch Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train Acc: {train_acc*100:.2f}%')
    print(f'\t Val. Loss: {valid_loss:.3f} | Val. Acc: {valid_acc*100:.2f}%')
Epoch: 01 | Epoch Time: 0m 22s
Train Loss: 0.285 | Train Acc: 91.15%
Val. Loss: 0.134 | Val. Acc: 96.06%
Epoch: 02 | Epoch Time: 0m 21s
Train Loss: 0.112 | Train Acc: 96.62%
Val. Loss: 0.115 | Val. Acc: 96.63%
Epoch: 03 | Epoch Time: 0m 21s
Train Loss: 0.089 | Train Acc: 97.26%
Val. Loss: 0.085 | Val. Acc: 97.64%
Epoch: 04 | Epoch Time: 0m 21s
Train Loss: 0.074 | Train Acc: 97.71%
Val. Loss: 0.079 | Val. Acc: 97.59%
Epoch: 05 | Epoch Time: 0m 19s
Train Loss: 0.066 | Train Acc: 97.97%
Val. Loss: 0.105 | Val. Acc: 96.66%
Epoch: 06 | Epoch Time: 0m 21s
Train Loss: 0.061 | Train Acc: 98.06%
Val. Loss: 0.073 | Val. Acc: 97.73%
Epoch: 07 | Epoch Time: 0m 21s
Train Loss: 0.056 | Train Acc: 98.29%
Val. Loss: 0.066 | Val. Acc: 97.94%
Epoch: 08 | Epoch Time: 0m 21s
Train Loss: 0.054 | Train Acc: 98.32%
Val. Loss: 0.087 | Val. Acc: 97.23%
Epoch: 09 | Epoch Time: 0m 22s
Train Loss: 0.051 | Train Acc: 98.40%
Val. Loss: 0.065 | Val. Acc: 98.39%
Epoch: 10 | Epoch Time: 0m 22s
Train Loss: 0.049 | Train Acc: 98.51%
Val. Loss: 0.065 | Val. Acc: 98.15%