The Paper

Deep Residual Learning for Image Recognition | arXiv

Pytorch ResNet implementation from Scratch | Aladdin Persson | YouTube

Comparison of the time complexity between two blocks

Figure 5 in DRLfIR

  • 2nd block of layer conv2_x in 34-layer (left block)

    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
  • 2nd block of layer conv2_x in 50-layer (right block)

    • 1x1, 256→64, strides=1, padding=0, 56x56→56x56
    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
    • 1x1, 64→256, strides=1, padding=0, 56x56→56x56

\begin{matrix} \rm in\_channels & \rm kernel\_size & \rm image\_size & \rm out\_channels\\ 64 & 3\times3 & 56\times56 & 64\\ 64 & 3\times3 & 56\times56 & 64\\ \hline 256 & 1\times1 & 56\times56 & 64\\ 64 & 3\times3 & 56\times56 & 64\\ 64 & 1\times1 & 56\times56 & 256\\ \end{matrix}

\begin{matrix} \text{left block} & \text{vs} & \text{right block}\\ 2\times64^2\times3^2\times56^2 & & 2\times256\times64\times56^2+64^2\times3^2\times56^2 \end{matrix}

\text{left} - \text{right} = (\underset{576}{64\times9} - \underset{512}{2\times256})\times64\times56^2

In fact, the computational complexities of these two blocks are of the same order of magnitude (about 231 million vs. 218 million multiplies), and the three-layer bottleneck block on the right is even slightly cheaper.
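The same arithmetic can be checked with a few lines of plain Python. This is a minimal sketch that counts only convolution multiplies (it ignores batch norm, ReLU, and the shortcut addition); the helper name conv_mults is just for illustration.

def conv_mults(in_ch, out_ch, k, h, w):
    # each of the out_ch * h * w output values needs in_ch * k * k multiplies
    return in_ch * out_ch * k * k * h * w

# left block: two 3x3 convs, 64->64, on 56x56 feature maps
left = 2 * conv_mults(64, 64, 3, 56, 56)

# right (bottleneck) block: 1x1 256->64, 3x3 64->64, 1x1 64->256
right = (conv_mults(256, 64, 1, 56, 56)
         + conv_mults(64, 64, 3, 56, 56)
         + conv_mults(64, 256, 1, 56, 56))

print(f"{left = :,}")          # 231,211,008
print(f"{right = :,}")         # 218,365,952
print(f"{left - right = :,}")  # 12,845,056 = 64 * 64 * 56^2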

ResNet-18/34

Note: how to compute the spatial size of a feature map before and after a convolution

Output\_size = \lfloor(Image\_size - Kernel\_size + 2\times Padding)/Stride\rfloor + 1
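A small helper makes it easy to trace the sizes used throughout the tables below; the function name conv_output_size is just for illustration.

import math

def conv_output_size(image_size, kernel_size, stride=1, padding=0):
    # spatial size after a conv/pool layer, per the formula above
    return math.floor((image_size - kernel_size + 2 * padding) / stride) + 1

print(conv_output_size(224, kernel_size=7, stride=2, padding=3))  # 112: conv1, 7x7/s2/p3
print(conv_output_size(112, kernel_size=3, stride=2, padding=1))  # 56:  maxpool, 3x3/s2/p1
print(conv_output_size(56,  kernel_size=3, stride=2, padding=1))  # 28:  conv3_x downsampling conv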

ResNet34 Structure with details

input: channels 3, size 224x224

layer conv1 in 34-layer:

  • 7x7, 3→64, stride=2, padding=3, 224x224→112x112

layer conv2_x in 34-layer:

  • 3x3 maxpool, channels=64, stride=2, padding=1, 112x112→56x56

  • 1st block

    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
  • 2nd block

    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
  • 3rd block

    • repeat 2nd block

layer conv3_x in 34-layer:

  • 1st block
    • 3x3, 64→128, strides=2, padding=1, 56x56→28x28
    • 3x3, 128→128, strides=1, padding=1, 28x28→28x28
  • 2nd block
    • 3x3, 128→128, strides=1, padding=1, 28x28→28x28
    • 3x3, 128→128, strides=1, padding=1, 28x28→28x28
  • 3rd block
    • repeat 2nd block
  • 4th block
    • repeat 2nd block

layer conv4_x in 34-layer:

  • 1st block
    • 3x3, 128→256, strides=2, padding=1, 28x28→14x14
    • 3x3, 256→256, strides=1, padding=1, 14x14→14x14
  • 2nd block
    • 3x3, 256→256, strides=1, padding=1, 14x14→14x14
    • 3x3, 256→256, strides=1, padding=1, 14x14→14x14
  • 3rd block
    • repeat 2nd block
  • 4th block
    • repeat 2nd block
  • 5th block
    • repeat 2nd block
  • 6th block
    • repeat 2nd block

layer conv5_x in 34-layer:

  • 1st block
    • 3x3, 256→512, strides=2, padding=1, 14x14→7x7
    • 3x3, 512→512, strides=1, padding=1, 7x7→7x7
  • 2nd block
    • 3x3, 512→512, strides=1, padding=1, 7x7→7x7
    • 3x3, 512→512, strides=1, padding=1, 7x7→7x7
  • 3rd block
    • repeat 2nd block

last layer in 34-layer:

  • average pool, channels 512, 7x7→1x1
  • fully connected layer, 512→1000
  • softmax
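Counting weighted layers confirms the name: 1 (conv1) + (3+4+6+3)×2 convs in the residual blocks + 1 (fc) = 34. If torchvision is installed, the stage-by-stage output sizes listed above can also be cross-checked against its reference resnet34 with forward hooks; a sketch, assuming torchvision.models.resnet34 is available:

import torch
from torchvision.models import resnet34

model = resnet34()  # random weights are enough for a shape check
model.eval()

shapes = {}
def make_hook(name):
    def hook(module, inputs, output):
        shapes[name] = tuple(output.shape)
    return hook

for name in ["maxpool", "layer1", "layer2", "layer3", "layer4"]:
    getattr(model, name).register_forward_hook(make_hook(name))

with torch.no_grad():
    model(torch.randn(1, 3, 224, 224))

print(shapes)
# {'maxpool': (1, 64, 56, 56), 'layer1': (1, 64, 56, 56), 'layer2': (1, 128, 28, 28),
#  'layer3': (1, 256, 14, 14), 'layer4': (1, 512, 7, 7)}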

Code with PyTorch

import torch
import torch.nn as nn

class Block33(nn.Module):
    """Basic residual block of ResNet-18/34: two 3x3 convolutions plus a shortcut."""
    def __init__(self, in_channels, out_channels, identity_downsample=None, stride=1):
        super(Block33, self).__init__()
        # the first 3x3 conv carries the stride (does the downsampling), as in the tables above
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU()
        self.identity_downsample = identity_downsample

    def forward(self, x):
        identity = x

        x = self.conv1(x)  # 3x3
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)  # 3x3
        x = self.bn2(x)

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x += identity
        x = self.relu(x)
        return x

class ResNet(nn.Module):
    def __init__(self, Block, layers, image_channels, num_classes):
        super(ResNet, self).__init__()
        self.in_channels = 64
        # the conv1 layer in Table 1 of the paper Deep Residual Learning for Image Recognition
        # 7x7, 64, stride 2, padding 3, 224x224->112x112
        self.conv1 = nn.Conv2d(in_channels=image_channels, out_channels=64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        # 3x3 max pool, stride 2
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # ResNet layers
        # layers=[3,4,6,3] for ResNet-34
        self.layer1 = self._make_layer33(Block, layers[0], out_channels=64, stride=1)
        self.layer2 = self._make_layer33(Block, layers[1], out_channels=128, stride=2)
        self.layer3 = self._make_layer33(Block, layers[2], out_channels=256, stride=2)
        self.layer4 = self._make_layer33(Block, layers[3], out_channels=512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        features0 = x

        x = self.layer1(x)
        features1 = x
        x = self.layer2(x)
        features2 = x
        x = self.layer3(x)
        features3 = x
        x = self.layer4(x)
        features4 = x

        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)
        return x, [features0, features1, features2, features3, features4]

    def _make_layer33(self, Block, num_residual_Blocks, out_channels, stride):
        identity_downsample = None
        layers = []

        # Match the channels of identity and F(x) when self.in_channels (identity's) differs from out_channels (F(x)'s)
        # Match the width & height of identity and F(x) when stride != 1 (the feature-map size changes)
        if stride != 1 or self.in_channels != out_channels:
            identity_downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels)
            )

        # the 3 Blocks of the 1st layer in ResNet-34:
        # 64→|64→64|→|64→64|→|64→64|
        # the 4 Blocks of the 2nd layer in ResNet-34:
        # 64→|128→128|→|128→128|→|128→128|→|128→128|
        # the first Block
        layers.append(Block(self.in_channels, out_channels, identity_downsample, stride))
        # the remaining Blocks
        for i in range(num_residual_Blocks - 1):
            layers.append(Block(out_channels, out_channels))

        # update self.in_channels to record the channel count of the current feature map
        self.in_channels = out_channels

        return nn.Sequential(*layers)  # *layers unpacks the list into positional arguments

'''
Passing the Block class as a parameter, instead of instantiating a specific Block inside ResNet,
makes it easy to build ResNets with different Blocks.
For example, the Blocks in ResNet-18/34 and ResNet-50/101 differ:
to build ResNet-50/101, define the bottleneck Block131 and pass it in place of Block33.
'''
def ResNet18(img_channels=3, num_classes=1000):
    return ResNet(Block=Block33, layers=[2, 2, 2, 2], image_channels=img_channels, num_classes=num_classes)

def ResNet34(img_channels=3, num_classes=1000):
    return ResNet(Block=Block33, layers=[3, 4, 6, 3], image_channels=img_channels, num_classes=num_classes)

if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    B, C, H, W = 8, 3, 224, 224
    x = torch.randn(size=[B, C, H, W], device=device)

    # model = ResNet18().to(device)
    model = ResNet34().to(device)

    y, features = model(x)
    print(f"{y.shape = }")
    print(f"{features[0].shape = }\n"  # [B, 64, 56, 56]
          f"{features[1].shape = }\n"  # [B, 64, 56, 56]
          f"{features[2].shape = }\n"  # [B, 128, 28, 28]
          f"{features[3].shape = }\n"  # [B, 256, 14, 14]
          f"{features[4].shape = }")   # [B, 512, 7, 7]

ResNet-50/101/152

ResNet50 Structure with details

input: channels 3, size 224x224

layer conv1 in 50-layer:

  • 7x7, 3→64, stride=2, padding=3, 224x224→112x112

layer conv2_x in 50-layer:

  • 3x3 maxpool, stride=2, padding=1, 112x112→56x56

    Some implementations set the max-pool padding to 0, which also works; the feature-map size then evolves as

    112\xrightarrow[\lfloor(112-3)/2\rfloor + 1 = 55]{\text{maxpool(k3,s2,p0)}}55 \rightarrow\cdots\rightarrow 55\xrightarrow[\lfloor(55-3+2\times1)/2\rfloor + 1 = 28]{\text{conv(k3,s2,p1)}}28

    From the stride-2 conv in conv3_x onward, the sizes are the same as with padding 1.

  • 1st block

    • 1x1, 64→64, strides=1, padding=0, 56x56→56x56
    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
    • 1x1, 64→256, strides=1, padding=0, 56x56→56x56
  • 2nd block

    • 1x1, 256→64, strides=1, padding=0, 56x56→56x56
    • 3x3, 64→64, strides=1, padding=1, 56x56→56x56
    • 1x1, 64→256, strides=1, padding=0, 56x56→56x56
  • 3rd block

    • repeat 2nd block

layer conv3_x in 50-layer:

  • 1st block
    • 1x1, 256→128, strides=1, padding=0, 56x56→56x56
    • 3x3, 128→128, strides=2, padding=1, 56x56→28x28
    • 1x1, 128→512, strides=1, padding=0, 28x28→28x28
  • 2nd block
    • 1x1, 512→128, strides=1, padding=0, 28x28→28x28
    • 3x3, 128→128, strides=1, padding=1, 28x28→28x28
    • 1x1, 128→512, strides=1, padding=0, 28x28→28x28
  • 3rd block
    • repeat 2nd block
  • 4th block
    • repeat 2nd block

layer conv4_x in 50-layer:

  • 1st block
    • 1x1, 512→256, strides=1, padding=0, 28x28→28x28
    • 3x3, 256→256, strides=2, padding=1, 28x28→14x14
    • 1x1, 256→1024, strides=1, padding=0, 14x14→14x14
  • 2nd block
    • 1x1, 1024→256, strides=1, padding=0, 14x14→14x14
    • 3x3, 256→256, strides=1, padding=1, 14x14→14x14
    • 1x1, 256→1024, strides=1, padding=0, 14x14→14x14
  • 3rd block
    • repeat 2nd block
  • 4th block
    • repeat 2nd block
  • 5th block
    • repeat 2nd block
  • 6th block
    • repeat 2nd block

layer conv5_x in 50-layer:

  • 1st block
    • 1x1, 1024→512, strides=1, padding=0, 14x14→14x14
    • 3x3, 512→512, strides=2, padding=1, 14x14→7x7
    • 1x1, 512→2048, strides=1, padding=0, 7x7→7x7
  • 2nd block
    • 1x1, 2048→512, strides=1, padding=0, 7x7→7x7
    • 3x3, 512→512, strides=1, padding=1, 7x7→7x7
    • 1x1, 512→2048, strides=1, padding=0, 7x7→7x7
  • 3rd block
    • repeat 2nd block

last layer in 50-layer:

  • average pool, 2048, 7x7→1x1
  • fully connected layer, 2048→1000
  • softmax
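As with the 34-layer network, the depth can be read off the table: 1 (conv1) + (3+4+6+3)×3 convs in the bottleneck blocks + 1 (fc) = 50 weighted layers; the pooling layers and the projection shortcuts are not counted toward the depth.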

Image of size 512

input: channels 3, size 512x512

layer conv1 in 50-layer:

  • 7x7, 3→64, stride=2, padding=3, 512x512→256x256

layer conv2_x in 50-layer:

  • 3x3 maxpool, stride=2, padding=1, 256x256→128x128

    Some implementations set the max-pool padding to 0, which also works; the feature-map size then evolves as

    256\xrightarrow[\lfloor(256-3)/2\rfloor + 1 = 127]{\text{maxpool(k3,s2,p0)}}127 \rightarrow\cdots\rightarrow 127\xrightarrow[\lfloor(127-3+2\times1)/2\rfloor + 1 = 64]{\text{conv(k3,s2,p1)}}64

    From the stride-2 conv in conv3_x onward, the sizes are the same as with padding 1.

  • 1st block

    • 1x1, 64→64, strides=1, padding=0, 128x128→128x128
    • 3x3, 64→64, strides=1, padding=1, 128x128→128x128
    • 1x1, 64→256, strides=1, padding=0, 128x128→128x128
  • 2nd block

    • 1x1, 256→64, strides=1, padding=0, 128x128→128x128
    • 3x3, 64→64, strides=1, padding=1, 128x128→128x128
    • 1x1, 64→256, strides=1, padding=0, 128x128→128x128
  • 3rd block

    • repeat 2nd block

layer conv3_x in 50-layer:

  • 1st block
    • 1x1, 256→128, strides=1, padding=0, 128x128→128x128
    • 3x3, 128→128, strides=2, padding=1, 128x128→64x64
    • 1x1, 128→512, strides=1, padding=0, 64x64→64x64
  • 2nd block
    • 1x1, 512→128, strides=1, padding=0, 64x64→64x64
    • 3x3, 128→128, strides=1, padding=1, 64x64→64x64
    • 1x1, 128→512, strides=1, padding=0, 64x64→64x64
  • 3rd block
    • repeat 2nd block
  • 4th block
    • repeat 2nd block

layer conv4_x in 50-layer:

  • 1st block
    • 1x1, 512→256, strides=1, padding=0, 64x64→64x64
    • 3x3, 256→256, strides=2, padding=1, 64x64→32x32
    • 1x1, 256→1024, strides=1, padding=0, 32x32→32x32
  • 2nd block
    • 1x1, 1024→256, strides=1, padding=0, 32x32→32x32
    • 3x3, 256→256, strides=1, padding=1, 32x32→32x32
    • 1x1, 256→1024, strides=1, padding=0, 32x32→32x32
  • 3rd block
    • repeat 2nd block
  • 4th block
    • repeat 2nd block
  • 5th block
    • repeat 2nd block
  • 6th block
    • repeat 2nd block

layer conv5_x in 50-layer:

  • 1st block
    • 1x1, 1024→512, strides=1, padding=0, 32x32→32x32
    • 3x3, 512→512, strides=2, padding=1, 32x32→16x16
    • 1x1, 512→2048, strides=1, padding=0, 16x16→16x16
  • 2nd block
    • 1x1, 2048→512, strides=1, padding=0, 16x16→16x16
    • 3x3, 512→512, strides=1, padding=1, 16x16→16x16
    • 1x1, 512→2048, strides=1, padding=0, 16x16→16x16
  • 3rd block
    • repeat 2nd block

last layer in 50-layer:

  • average pool, 2048, 16x16→1x1
  • fully connected layer, 2048→1000
  • softmax

Code with PyTorch

from typing import List, Tuple
from torch import Tensor
import torch
import torch.nn as nn

class Block131(nn.Module):
    """Bottleneck residual block of ResNet-50/101/152: 1x1 -> 3x3 -> 1x1 convolutions plus a shortcut."""
    def __init__(self, in_channels, out_channels, identity_downsample=None, stride=1):
        super(Block131, self).__init__()
        self.expansion = 4
        self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=stride, padding=1)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.conv3 = nn.Conv2d(out_channels, out_channels*self.expansion, kernel_size=1, stride=1, padding=0)
        self.bn3 = nn.BatchNorm2d(out_channels*self.expansion)
        self.relu = nn.ReLU()
        self.identity_downsample = identity_downsample

    def forward(self, x):
        identity = x

        x = self.conv1(x)  # 1x1
        x = self.bn1(x)
        x = self.relu(x)
        x = self.conv2(x)  # 3x3
        x = self.bn2(x)
        x = self.relu(x)
        x = self.conv3(x)  # 1x1
        x = self.bn3(x)

        if self.identity_downsample is not None:
            identity = self.identity_downsample(identity)

        x += identity
        x = self.relu(x)
        return x

class ResNet(nn.Module):
    def __init__(self, Block, layers, image_channels, num_classes):
        super(ResNet, self).__init__()
        self.in_channels = 64
        # the conv1 layer in Table 1 of the paper Deep Residual Learning for Image Recognition
        # 7x7, 64, stride 2
        self.conv1 = nn.Conv2d(in_channels=image_channels, out_channels=64, kernel_size=7, stride=2, padding=3)
        self.bn1 = nn.BatchNorm2d(64)
        self.relu = nn.ReLU()
        # 3x3 max pool, stride 2
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)

        # ResNet layers
        # layers=[3,4,6,3] for ResNet-50
        self.layer1 = self._make_layer131(Block, layers[0], out_channels=64, stride=1)
        self.layer2 = self._make_layer131(Block, layers[1], out_channels=128, stride=2)
        self.layer3 = self._make_layer131(Block, layers[2], out_channels=256, stride=2)
        self.layer4 = self._make_layer131(Block, layers[3], out_channels=512, stride=2)

        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512*4, num_classes)

    def forward(self, x) -> Tuple[Tensor, List[Tensor]]:
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        feature0 = x

        x = self.layer1(x)
        feature1 = x
        x = self.layer2(x)
        feature2 = x
        x = self.layer3(x)
        feature3 = x
        x = self.layer4(x)
        feature4 = x

        x = self.avgpool(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fc(x)
        return x, [feature0, feature1, feature2, feature3, feature4]

    def _make_layer131(self, Block, num_residual_Blocks, out_channels, stride):
        identity_downsample = None
        layers = []

        # Match the channels of identity and F(x) when self.in_channels (identity's) differs from out_channels*4 (F(x)'s)
        # Match the width & height of identity and F(x) when stride != 1 (the feature-map size changes)
        if stride != 1 or self.in_channels != out_channels*4:
            identity_downsample = nn.Sequential(
                nn.Conv2d(self.in_channels, out_channels*4, kernel_size=1, stride=stride),
                nn.BatchNorm2d(out_channels*4)
            )

        # self.in_channels = 64, out_channels = 64
        # 64->64, 64->64, 64->64*4(256)
        layers.append(Block(self.in_channels, out_channels, identity_downsample, stride))
        # self.in_channels = 256
        self.in_channels = out_channels*4

        # In the first layer of ResNet-50 there are 3 Blocks:
        # the 1st Block is 64->64->64*4(256)
        # the 2nd Block is 256->64->256
        # the 3rd Block is 256->64->256
        # This is why the 1st Block needs to be handled separately.
        for i in range(num_residual_Blocks - 1):
            # in the 1st layer:
            # self.in_channels = 256, out_channels = 64
            # 256->64, 64->64, 64->64*4(256)
            layers.append(Block(self.in_channels, out_channels))

        return nn.Sequential(*layers)  # *layers unpacks the list into positional arguments

'''
Passing the Block class as a parameter, instead of instantiating a specific Block inside ResNet,
makes it easy to build ResNets with different Blocks.
For example, the Blocks in ResNet-18/34 and ResNet-50/101 differ:
just define the Block33 used by ResNet-18/34 and swap Block131 for Block33 to build ResNet-18/34.
'''
def ResNet50(img_channels=3, num_classes=1000):
    return ResNet(Block=Block131, layers=[3, 4, 6, 3], image_channels=img_channels, num_classes=num_classes)

def ResNet101(img_channels=3, num_classes=1000):
    return ResNet(Block=Block131, layers=[3, 4, 23, 3], image_channels=img_channels, num_classes=num_classes)

def ResNet152(img_channels=3, num_classes=1000):
    return ResNet(Block=Block131, layers=[3, 8, 36, 3], image_channels=img_channels, num_classes=num_classes)

if __name__ == "__main__":
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    B, C, H, W = 8, 3, 224, 224
    x = torch.randn(size=[B, C, H, W], device=device)

    # model = ResNet50().to(device)
    model = ResNet101().to(device)
    # model = ResNet152().to(device)

    y, features = model(x)
    print(f"{y.shape = }")
    print(f"{features[0].shape = }\n"  # [B, 64, 56, 56]
          f"{features[1].shape = }\n"  # [B, 256, 56, 56]
          f"{features[2].shape = }\n"  # [B, 512, 28, 28]
          f"{features[3].shape = }\n"  # [B, 1024, 14, 14]
          f"{features[4].shape = }")   # [B, 2048, 7, 7]