We study the effect of batch normalization and layer normalization on the performance of a simple MLP on the MNIST dataset.
We provide the code for loading and preprocessing the MNIST dataset, together with the training and testing code. The code below loads and preprocesses the MNIST dataset.
import torch
import torchvision

def int2onehot(y):
    # convert integer class labels to a one-hot matrix
    onehot = torch.zeros(len(y), y.max() + 1)
    onehot[torch.arange(len(y)), y] = 1
    return onehot.long()

mnist_tr = torchvision.datasets.MNIST('~/.torchvision', train=True, download=True)
mnist_ts = torchvision.datasets.MNIST('~/.torchvision', train=False, download=True)
# normalize input to [0, 1]
x_train, y_train = mnist_tr.data.float()/255, mnist_tr.targets
x_test, y_test = mnist_ts.data.float()/255, mnist_ts.targets
# flatten each 28x28 image into a 784-dimensional vector
x_train = x_train.reshape(len(x_train), -1)
x_test = x_test.reshape(len(x_test), -1)
# use one-hot encoding of labels
y_train = int2onehot(y_train).float()
y_test = int2onehot(y_test).float()
The training code below uses the SGD optimizer to minimize an MSE loss.
import torch.nn as nn
from torch import optim
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

class DatasetWrapper(Dataset):
    def __init__(self, X, y=None):
        self.X, self.y = X, y

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        if self.y is None:
            return self.X[idx]
        else:
            return self.X[idx], self.y[idx]

def train(net, x, y, lr=0.01, momentum=0.9, batch_size=600, max_epochs=10):
    optimizer = optim.SGD(net.parameters(), lr=lr, momentum=momentum)
    acc_tr = []  # training accuracy for each epoch
    dataloader = DataLoader(DatasetWrapper(x, y), batch_size=batch_size, shuffle=True)
    # training loop
    loop = tqdm(range(max_epochs), ncols=110)
    for i in loop:
        net.train()  # normalization layers use batch statistics during training
        epoch_loss = 0
        n_batches = 0
        for (x_batch, y_batch) in dataloader:
            optimizer.zero_grad()
            loss = torch.sum((net(x_batch) - y_batch)**2) / len(y_batch)
            loss.backward()
            epoch_loss += loss.item()
            n_batches += 1
            optimizer.step()
        # evaluate network performance on the training set
        acc_tr.append(test(net, x, y, batch_size=batch_size))
        # show training progress
        loop.set_postfix(loss="%5.5f" % (epoch_loss/n_batches),
                         train_acc="%.2f%%" % (100*acc_tr[-1]))
    return acc_tr
The test code below computes a trained MLP's accuracy on a given dataset.
def test(net, x, y, batch_size=600):
    net.eval()  # normalization layers use running statistics at test time
    with torch.no_grad():
        pred_cls = []
        # make predictions on mini-batches
        dataloader = DataLoader(DatasetWrapper(x), batch_size=batch_size, shuffle=False)
        for x_batch in dataloader:
            pred_cls.append(torch.max(net(x_batch), 1)[1])
        pred_cls = torch.cat(pred_cls)  # concatenate predictions on the mini-batches
        true_cls = torch.max(y, 1)[1].cpu()
        acc = (pred_cls == true_cls).sum().float() / len(y)
    return acc
The MLP we consider has a single hidden layer with 50 units and ReLU activation.
net = nn.Sequential(nn.Linear(784, 50), nn.ReLU(), nn.Linear(50, 10))
We can use the given code to train the above model and compute its test set accuracy.
train(net, x_train, y_train)
test(net, x_test, y_test)
(a) Train the given MLP 10 times to estimate a 95\% confidence interval for the trained model's accuracy on the given test set.
Answer.
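One way to carry this out is sketched below; make_net and run_trials are hypothetical helper names, and the 1.96 factor assumes a normal approximation (the t critical value for 9 degrees of freedom, about 2.26, gives a slightly wider interval).

import math

def make_net():
    # fresh copy of the baseline MLP for each trial
    return nn.Sequential(nn.Linear(784, 50), nn.ReLU(), nn.Linear(50, 10))

def run_trials(make_net, n_trials=10):
    accs = []
    for _ in range(n_trials):
        net = make_net()  # new random initialization each run
        train(net, x_train, y_train)
        accs.append(test(net, x_test, y_test).item())
    mean = sum(accs) / len(accs)
    # sample standard deviation with Bessel's correction
    std = math.sqrt(sum((a - mean)**2 for a in accs) / (len(accs) - 1))
    halfwidth = 1.96 * std / math.sqrt(len(accs))  # normal-approximation 95% interval
    print("test accuracy: %.4f +/- %.4f" % (mean, halfwidth))
    return mean, halfwidth

run_trials(make_net)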
(b) We consider adding a single batch normalization layer nn.BatchNorm1d. There are four different ways to do this, as shown below.
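One plausible set of four placements, assuming the normalization layer is inserted at each position in the Sequential stack (the variant names and constructors are ours). Each model is wrapped in a zero-argument constructor so that run_trials from (a) rebuilds it with a fresh initialization on every trial.

# hypothetical constructors: BatchNorm1d at each of the four insertion points
bn_variants = {
    "BN before Linear1": lambda: nn.Sequential(nn.BatchNorm1d(784), nn.Linear(784, 50), nn.ReLU(), nn.Linear(50, 10)),
    "BN after Linear1":  lambda: nn.Sequential(nn.Linear(784, 50), nn.BatchNorm1d(50), nn.ReLU(), nn.Linear(50, 10)),
    "BN after ReLU":     lambda: nn.Sequential(nn.Linear(784, 50), nn.ReLU(), nn.BatchNorm1d(50), nn.Linear(50, 10)),
    "BN after Linear2":  lambda: nn.Sequential(nn.Linear(784, 50), nn.ReLU(), nn.Linear(50, 10), nn.BatchNorm1d(10)),
}
for name, make in bn_variants.items():
    print(name)
    run_trials(make)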
Compute 95\% confidence intervals for each model's test set accuracy using the same procedure as in (a).
Answer.
(c) We consider adding a single layer normalization layer nn.LayerNorm. There are four different ways to do this, as shown below.
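As in (b), the four placements below are a plausible reconstruction (the variant names and constructors are ours), with nn.LayerNorm inserted at each position in the Sequential stack.

# hypothetical constructors: LayerNorm at each of the four insertion points
ln_variants = {
    "LN before Linear1": lambda: nn.Sequential(nn.LayerNorm(784), nn.Linear(784, 50), nn.ReLU(), nn.Linear(50, 10)),
    "LN after Linear1":  lambda: nn.Sequential(nn.Linear(784, 50), nn.LayerNorm(50), nn.ReLU(), nn.Linear(50, 10)),
    "LN after ReLU":     lambda: nn.Sequential(nn.Linear(784, 50), nn.ReLU(), nn.LayerNorm(50), nn.Linear(50, 10)),
    "LN after Linear2":  lambda: nn.Sequential(nn.Linear(784, 50), nn.ReLU(), nn.Linear(50, 10), nn.LayerNorm(10)),
}
for name, make in ln_variants.items():
    print(name)
    run_trials(make)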
Compute 95\% confidence intervals for each model's test set accuracy using the same procedure as in (a).
Answer.
(d) What can you conclude from the results in (a)-(c)? What are the limitations of your conclusions?
Answer.