Learning material

LightningModule

Lightning Module{target="_blank"}

Logging

Logging{target="_blank"}

Trainer

Trainer{target="_blank"}
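
The three pieces fit together like this: a LightningModule bundles the model, the training/validation/test steps, and the optimizer; `self.log()` records metrics from inside those steps; and the Trainer runs the loops. A minimal sketch of the idea (the class name and layer sizes here are made up for illustration, not taken from the material above):

```python
import pytorch_lightning as pl
import torch
from torch import nn, optim
import torch.nn.functional as F

class TinyModel(pl.LightningModule):  # hypothetical example module
    def __init__(self):
        super().__init__()
        self.layer = nn.Linear(28 * 28, 10)

    def training_step(self, batch, batch_idx):
        x, y = batch
        loss = F.cross_entropy(self.layer(x.view(x.size(0), -1)), y)
        self.log("train_loss", loss)  # Logging: sent to the progress bar / logger
        return loss

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=1e-3)

# Trainer: owns the loops, devices, precision, checkpointing, ...
# trainer = pl.Trainer(max_epochs=3)
# trainer.fit(TinyModel(), train_dataloaders=some_train_loader)  # some_train_loader: any DataLoader
```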

An Example - MLP on MNIST

Implementation with PyTorch

import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split


class NN(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x


# Set device to cuda if a GPU is available, otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Hyperparameters
input_size = 784 # 28*28
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 3

# Load Data
entire_dataset = datasets.MNIST(
    root="dataset/", train=True, transform=transforms.ToTensor(), download=False
)
train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
test_ds = datasets.MNIST(
    root="dataset/", train=False, transform=transforms.ToTensor(), download=False
)
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, shuffle=True) # len: 782
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, shuffle=False) # len: 157
test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, shuffle=False) # len: 157

for (images, labels) in train_loader:
    print(images.shape, labels.shape)
    # torch.Size([64, 1, 28, 28]) torch.Size([64])
    break

# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train Network
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
        # Get data to cuda if possible
        data = data.to(device=device)
        targets = targets.to(device=device)

        # Get to correct shape
        data = data.reshape(data.shape[0], -1)

        # Forward
        scores = model(data)
        loss = criterion(scores, targets)

        # Backward
        optimizer.zero_grad()
        loss.backward()

        # Gradient descent or adam step
        optimizer.step()

# 100%|███████████████████████████████████████████████████████████████████| 782/782 [00:55<00:00, 14.10it/s]
# 100%|███████████████████████████████████████████████████████████████████| 782/782 [00:10<00:00, 75.29it/s]
# 100%|███████████████████████████████████████████████████████████████████| 782/782 [00:10<00:00, 75.45it/s]

# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()

    # We don't need to keep track of gradients here, so we wrap it in torch.no_grad()
    with torch.no_grad():
        # Loop through the data
        for x, y in loader:

            # Move data to device
            x = x.to(device=device)  # x.shape: [64, 1, 28, 28]
            y = y.to(device=device)  # y.shape: [64]

            # Get to correct shape
            x = x.reshape(x.shape[0], -1)  # x.shape: [64, 784] <- [64, 1, 28, 28]

            # Forward pass
            scores = model(x)  # scores.shape: [64, 10]
            _, predictions = scores.max(1)  # predictions.shape: [64]

            # Check how many we got correct
            num_correct += (predictions == y).sum()

            # Keep track of number of samples
            num_samples += predictions.size(0)

    model.train()
    return num_correct / num_samples


# Check accuracy on the training, validation, and test sets
model.to(device)

print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
# Accuracy on training set: 95.60
print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
# Accuracy on validation set: 95.60
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
# Accuracy on test set: 95.48

Implementation with PyTorch Lightning

import torch
import torch.nn.functional as F
import torchvision.datasets as datasets
import torchvision.transforms as transforms
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm
from torch.utils.data import random_split
import pytorch_lightning as pl

class NN(pl.LightningModule):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)
        self.loss_fn = nn.CrossEntropyLoss()  # <-- criterion/loss

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def training_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("val_loss", loss)
        return loss

    def test_step(self, batch, batch_idx):
        loss, scores, y = self._common_step(batch, batch_idx)
        self.log("test_loss", loss)
        return loss

    def _common_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        loss = self.loss_fn(scores, y)
        return loss, scores, y

    def predict_step(self, batch, batch_idx):
        x, y = batch
        x = x.reshape(x.size(0), -1)
        scores = self.forward(x)
        preds = torch.argmax(scores, dim=1)
        return preds

    def configure_optimizers(self):
        return optim.Adam(self.parameters(), lr=0.001)

# Hyperparameters
input_size = 784 # 28*28
num_classes = 10
learning_rate = 0.001
batch_size = 64
num_epochs = 1

# Load Data
entire_dataset = datasets.MNIST(
    root="dataset/", train=True, transform=transforms.ToTensor(), download=False
)
train_ds, val_ds = random_split(entire_dataset, [50000, 10000])
test_ds = datasets.MNIST(
    root="dataset/", train=False, transform=transforms.ToTensor(), download=False
)
train_loader = DataLoader(dataset=train_ds, batch_size=batch_size, num_workers=0, shuffle=True) # len: 782
val_loader = DataLoader(dataset=val_ds, batch_size=batch_size, num_workers=0, shuffle=False) # len: 157
test_loader = DataLoader(dataset=test_ds, batch_size=batch_size, num_workers=0, shuffle=False) # len: 157

# Take a look at the shape of the input images
for (images, labels) in train_loader:
    print(images.shape, labels.shape)
    # torch.Size([64, 1, 28, 28]) torch.Size([64])
    break

# Set device cuda for GPU if it's available otherwise run on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Initialize network
model = NN(input_size=input_size, num_classes=num_classes).to(device)

# No manual optimizer is needed here: the Trainer gets it from configure_optimizers()

# Train Network
trainer = pl.Trainer(accelerator="auto", devices="auto", min_epochs=1, max_epochs=3, precision=16)

trainer.fit(model=model,
            train_dataloaders=train_loader,
            val_dataloaders=val_loader)

trainer.validate(model=model, dataloaders=val_loader)
trainer.test(model=model, dataloaders=test_loader)

# Check accuracy on training & test to see how good our model is
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()

    # We don't need to keep track of gradients here, so we wrap it in torch.no_grad()
    with torch.no_grad():
        # Loop through the data
        for x, y in loader:

            # Move data to device
            x = x.to(device=device)  # x.shape: [64, 1, 28, 28]
            y = y.to(device=device)  # y.shape: [64]

            # Get to correct shape
            x = x.reshape(x.shape[0], -1)  # x.shape: [64, 784] <- [64, 1, 28, 28]

            # Forward pass
            scores = model(x)  # scores.shape: [64, 10]
            _, predictions = scores.max(1)  # predictions.shape: [64]

            # Check how many we got correct
            num_correct += (predictions == y).sum()

            # Keep track of number of samples
            num_samples += predictions.size(0)

    model.train()
    return num_correct / num_samples


# Check accuracy on the training, validation, and test sets
model.to(device)

print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}")
# Accuracy on training set: 95.72
print(f"Accuracy on validation set: {check_accuracy(val_loader, model)*100:.2f}")
# Accuracy on validation set: 95.35
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
# Accuracy on test set: 95.39
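
Instead of falling back to the hand-written check_accuracy loop, accuracy can also be tracked inside the LightningModule itself. A sketch of what that could look like using torchmetrics (not used in the code above; depending on the torchmetrics version, Accuracy may or may not accept the task argument):

```python
import torch
import torch.nn.functional as F
import torchmetrics
import pytorch_lightning as pl
from torch import nn

class NNWithMetrics(pl.LightningModule):
    """Same MLP as above, but with validation accuracy tracked by torchmetrics."""
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.fc1 = nn.Linear(input_size, 50)
        self.fc2 = nn.Linear(50, num_classes)
        # torchmetrics accumulates correctly across batches (and devices)
        self.val_acc = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes)

    def forward(self, x):
        return self.fc2(F.relu(self.fc1(x)))

    def training_step(self, batch, batch_idx):
        x, y = batch
        scores = self(x.reshape(x.size(0), -1))
        loss = F.cross_entropy(scores, y)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        scores = self(x.reshape(x.size(0), -1))
        loss = F.cross_entropy(scores, y)
        self.val_acc(scores, y)
        # on_epoch=True reports the accuracy over the whole validation set
        self.log("val_acc", self.val_acc, on_step=False, on_epoch=True)
        self.log("val_loss", loss)
        return loss

    def configure_optimizers(self):
        return torch.optim.Adam(self.parameters(), lr=1e-3)
```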

Checkpoint

Saving and loading checkpoints (basic){target="_blank"}

import torch
from torch import nn
import torch.nn.functional as F
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader
from torch.utils.data import random_split
import pytorch_lightning as pl

# torch.nn.Module
class Encoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Sequential(nn.Linear(28 * 28, 64), nn.ReLU(), nn.Linear(64, 3))

    def forward(self, x):
        return self.l1(x)

# torch.nn.Module
class Decoder(nn.Module):
    def __init__(self):
        super().__init__()
        self.l1 = nn.Sequential(nn.Linear(3, 64), nn.ReLU(), nn.Linear(64, 28 * 28))

    def forward(self, x):
        return self.l1(x)

# LightningModule
class LitAutoEncoder(pl.LightningModule):
    # def __init__(self, encoder=Encoder(), decoder=Decoder()):
    def __init__(self, encoder, decoder):
        super().__init__()
        self.encoder = encoder  # 784 -> 64 -> 3
        self.decoder = decoder  # 3 -> 64 -> 784
        self.save_hyperparameters()

    def forward(self, x):
        # forward defines the prediction/inference behaviour
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        return x_hat

    def training_step(self, batch, batch_idx):
        # training_step defines the train loop.
        x, _ = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        loss = F.mse_loss(x_hat, x)
        self.log("train_loss", loss)
        return loss

    def validation_step(self, batch, batch_idx):
        # this is the validation loop
        x, _ = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        val_loss = F.mse_loss(x_hat, x)
        self.log("val_loss", val_loss)

    def test_step(self, batch, batch_idx):
        # this is the test loop
        x, _ = batch
        x = x.view(x.size(0), -1)
        z = self.encoder(x)
        x_hat = self.decoder(z)
        test_loss = F.mse_loss(x_hat, x)
        self.log("test_loss", test_loss)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
        return optimizer

# Load MNIST dataset
transform = transforms.ToTensor()
# Load Data: train=True
train_val_ds = MNIST(root="dataset/", train=True, transform=transform, download=False) # 60000
# Load Data: train=False
test_ds = MNIST(root="dataset/", train=False, transform=transform, download=False) # 10000

# Split Data for training and validation
# use 20% of training data for validation and 80% for training
train_ds_size = int(len(train_val_ds) * 0.8) # 60000*0.8 = 48000
val_ds_size = len(train_val_ds) - train_ds_size # 60000-48000 = 12000
seed = torch.Generator().manual_seed(42)
train_ds, val_ds = random_split(train_val_ds, [train_ds_size, val_ds_size], generator=seed)

train_loader = DataLoader(dataset=train_ds, batch_size=64, shuffle=True) # 750: 750*64+00=48000
val_loader = DataLoader(dataset=val_ds, batch_size=64, shuffle=False) # 188: 187*64+32=12000
test_loader = DataLoader(dataset=test_ds, batch_size=64, shuffle=False) # 157: 156*64+16=10000

# Instantiating the model
autoencoder = LitAutoEncoder(Encoder(), Decoder())

# Train model and save checkpoints automatically
trainer = pl.Trainer(accelerator="auto", devices="auto", min_epochs=1, max_epochs=1, precision=16)
trainer.fit(model=autoencoder, train_dataloaders=train_loader, val_dataloaders=val_loader)
trainer.validate(model=autoencoder, dataloaders=val_loader)
trainer.test(model=autoencoder, dataloaders=test_loader)

# Load from the checkpoint
trained_model = LitAutoEncoder.load_from_checkpoint("lightning_logs/version_0/checkpoints/epoch=0-step=750.ckpt")

# Evaluate
trained_model.eval()
for (images, labels) in test_loader:
    print(images.shape, labels.shape)
    y_hat = trained_model(images)
    print(y_hat.shape)
    break

# what is in the checkpoint?
checkpoint = torch.load("lightning_logs/version_2/checkpoints/epoch=0-step=750.ckpt")
print(checkpoint.keys())
'''
dict_keys(
[
'epoch',
'global_step',
'pytorch-lightning_version',
'state_dict',
'loops',
'callbacks',
'optimizer_states',
'lr_schedulers',
'hparams_name',
'hyper_parameters'
]
)
'''
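
By default the Trainer writes one checkpoint per run under lightning_logs/version_*/checkpoints/, as used above. For more control over which checkpoint is kept, a ModelCheckpoint callback can be passed to the Trainer. A sketch that builds on the autoencoder and loaders defined above (the directory, filename pattern, and monitored metric are chosen for illustration; "val_loss" must match a key passed to self.log()):

```python
from pytorch_lightning.callbacks import ModelCheckpoint

# Keep the best checkpoint according to the logged "val_loss" metric
checkpoint_callback = ModelCheckpoint(
    dirpath="checkpoints/",  # hypothetical output directory
    filename="autoencoder-{epoch:02d}-{val_loss:.4f}",
    monitor="val_loss",
    mode="min",
    save_top_k=1,
)

trainer = pl.Trainer(max_epochs=1, callbacks=[checkpoint_callback])
trainer.fit(model=autoencoder, train_dataloaders=train_loader, val_dataloaders=val_loader)

# Path of the best checkpoint found during training
print(checkpoint_callback.best_model_path)
```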

configure_optimizers

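configure_optimizers() is where a LightningModule returns its optimizer(s) and, optionally, learning-rate scheduler(s). Beyond the single-optimizer form used above, a common pattern is to return an optimizer together with a scheduler. A sketch of a method that would go inside a LightningModule (the StepLR settings are arbitrary):

```python
def configure_optimizers(self):
    optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
    # Decay the learning rate by a factor of 0.1 every 10 epochs (values chosen for illustration)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=10, gamma=0.1)
    return {"optimizer": optimizer, "lr_scheduler": scheduler}
```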