Panoptic Feature Pyramid Networks

Panoptic FPN 整体结构图

参考实现

FPN 部分

这一部分的代码参考了 GitHub 上的 FPN-Semantic-segmentation 仓库。

'''Panoptic FPN in PyTorch.
See the paper "Panoptic Feature Pyramid Networks" for more details.
'''
from typing import Literal
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.utils.model_zoo as model_zoo
from resnet import resnet50, resnet101, resnet152, RESNET_WEIGHTS_URL

def build_backbone(
    name: Literal["resnet50", "resnet101", "resnet152"] = "resnet101",
    pretrained: bool = True,
):
    """Build a ResNet backbone, optionally loaded with pretrained weights.

    Args:
        name: Backbone architecture; one of ``"resnet50"``, ``"resnet101"``
            or ``"resnet152"``.
        pretrained: When true, fetch the ``IMAGENET1k_V2`` checkpoint listed in
            ``RESNET_WEIGHTS_URL`` for ``name`` and load it into the backbone.

    Returns:
        The constructed backbone module.

    Raises:
        ValueError: If ``name`` is not one of the supported architectures.
    """
    # Validate first so an unsupported name fails before any model is built.
    if name not in ("resnet50", "resnet101", "resnet152"):
        raise ValueError("Please choose backbone from resnet50|resnet101|resnet152.")

    constructors = {
        "resnet50": resnet50,
        "resnet101": resnet101,
        "resnet152": resnet152,
    }
    backbone = constructors[name]()

    if pretrained:
        weights_url = RESNET_WEIGHTS_URL['IMAGENET1k_V2'][name]
        pretrained_state_dict = model_zoo.load_url(url=weights_url)
        backbone.load_state_dict(pretrained_state_dict)

    return backbone

class FPN(nn.Module):
    """ResNet-backed feature pyramid with a semantic segmentation head.

    The backbone supplies the bottom-up feature maps C2-C5. A top-down pathway
    with lateral 1x1 convolutions turns them into 256-channel maps P2-P5, and a
    semantic branch (whose layers are shared between pyramid levels) fuses them
    at the P2 resolution into per-class logits, which are finally resized to
    the spatial size of the input.

    Args:
        num_classes: Number of channels (classes) of the output logits.
        backbone_name: Architecture name forwarded to ``build_backbone``.
        pretrained: Whether ``build_backbone`` should load pretrained weights.
    """

    def __init__(self, num_classes, backbone_name='resnet101', pretrained=True):
        super(FPN, self).__init__()
        # Only read by ``_make_layer``, which this class does not call itself.
        self.in_planes = 64
        self.num_classes = num_classes

        self.backbone = build_backbone(name=backbone_name, pretrained=pretrained)

        # Top layer: reduce the 2048-channel C5 map to 256 channels.
        self.toplayer = nn.Conv2d(2048, 256, kernel_size=1, stride=1, padding=0)

        # Smooth layers applied to the merged P4, P3 and P2 maps.
        self.smooth1 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.smooth3 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)

        # Lateral layers projecting C4, C3 and C2 to 256 channels.
        self.latlayer1 = nn.Conv2d(1024, 256, kernel_size=1, stride=1, padding=0)
        self.latlayer2 = nn.Conv2d( 512, 256, kernel_size=1, stride=1, padding=0)
        self.latlayer3 = nn.Conv2d( 256, 256, kernel_size=1, stride=1, padding=0)

        # Semantic branch
        self.semantic_branch = nn.Conv2d(256, 128, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(128, self.num_classes, kernel_size=1, stride=1, padding=0)
        # GroupNorm arguments are (num_groups, num_channels).
        # NOTE(review): num_groups equals num_channels here, so every channel
        # forms its own group; confirm whether a smaller group count was meant.
        self.gn1 = nn.GroupNorm(128, 128)
        self.gn2 = nn.GroupNorm(256, 256)

    def _upsample(self, x, h, w):
        """Bilinearly resize ``x`` to the spatial size ``(h, w)``."""
        return F.interpolate(x, size=(h, w), mode='bilinear', align_corners=True)

    def _make_layer(self, Bottleneck, planes, num_blocks, stride):
        """Stack ``num_blocks`` blocks; only the first one uses ``stride``.

        Updates ``self.in_planes`` to the output width of the stack. This
        helper is not called by ``__init__`` or ``forward`` of this class.
        """
        strides = [stride] + [1]*(num_blocks-1)
        layers = []
        for stride in strides:
            layers.append(Bottleneck(self.in_planes, planes, stride))
            self.in_planes = planes * Bottleneck.expansion
        return nn.Sequential(*layers)

    def _upsample_add(self, x, y):
        """Resize ``x`` to the spatial size of ``y`` and add the two maps.

        Args:
            x: Top (coarser) feature map to be upsampled.
            y: Lateral feature map that defines the target size.

        Returns:
            The element-wise sum of the resized ``x`` and ``y``.

        A fixed ``scale_factor=2`` upsampling can miss the lateral size when a
        dimension is odd, e.g. an input of [N,_,15,15] gives a [N,_,8,8]
        feature map that would upsample to [N,_,16,16]. Interpolating to an
        explicit target size avoids that mismatch.
        """
        _, _, H, W = y.size()
        return F.interpolate(x, size=(H, W), mode='bilinear', align_corners=True) + y

    def forward(self, x):
        """Compute per-class logits with the same spatial size as ``x``.

        Args:
            x: Input batch; the last two dimensions are treated as height and
                width.

        Returns:
            Logits with ``num_classes`` channels, resized to the height and
            width of ``x``.
        """
        in_h, in_w = x.shape[-2:]

        # Bottom-up using backbone. Entry 0 (the stem output) is not used here.
        features = self.backbone(x)
        c2, c3, c4, c5 = features[1:5]

        # Top-down: project each C map to 256 channels and merge with the
        # upsampled coarser level.
        p5 = self.toplayer(c5)
        p4 = self._upsample_add(p5, self.latlayer1(c4))
        p3 = self._upsample_add(p4, self.latlayer2(c3))
        p2 = self._upsample_add(p3, self.latlayer3(c2))

        # Smooth
        p4 = self.smooth1(p4)
        p3 = self.smooth2(p3)
        p2 = self.smooth3(p2)

        # Semantic: bring every level to the P2 resolution with 128 channels.
        _, _, h, w = p2.size()
        # 256->256
        s5 = self._upsample(F.relu(self.gn2(self.conv2(p5))), h, w)
        # 256->256
        s5 = self._upsample(F.relu(self.gn2(self.conv2(s5))), h, w)
        # 256->128
        s5 = self._upsample(F.relu(self.gn1(self.semantic_branch(s5))), h, w)

        # 256->256
        s4 = self._upsample(F.relu(self.gn2(self.conv2(p4))), h, w)
        # 256->128
        s4 = self._upsample(F.relu(self.gn1(self.semantic_branch(s4))), h, w)

        # 256->128
        s3 = self._upsample(F.relu(self.gn1(self.semantic_branch(p3))), h, w)

        s2 = F.relu(self.gn1(self.semantic_branch(p2)))

        # Resize to the real input size instead of assuming it is exactly
        # 4 * (h, w); the two differ whenever H or W is not a multiple of 4.
        return self._upsample(self.conv3(s2 + s3 + s4 + s5), in_h, in_w)

if __name__ == "__main__":
    # Smoke test: one forward pass on random data, printing the output shape.
    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")

    B, C, H, W = 8, 3, 512, 512
    # Named ``images`` so the builtin ``input`` is not shadowed.
    images = torch.rand(size=[B, C, H, W], device=device)

    model = FPN(num_classes=32, backbone_name="resnet101", pretrained=False).to(device)

    # Only the shape is inspected, so skip building the autograd graph.
    with torch.no_grad():
        output = model(images)
    print(output.shape) # torch.Size([8, 32, 512, 512])

ResNet 部分

其中 ResNet 的实现参考了 torch hub（torchvision）的官方代码，预训练权重同样由 PyTorch 官方发布。

'''resnet.py
ref: https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py
'''

from typing import Any, Callable, List, Optional, Tuple, Type, Union

import torch
import torch.nn as nn
from torch import Tensor

# Checkpoint download URLs, keyed first by weight-set tag and then by
# architecture name. The 'IMAGENET1k_V2' set only lists resnet50, resnet101
# and resnet152; resnet18 and resnet34 appear under 'IMAGENET1k_V1' only.
RESNET_WEIGHTS_URL = {
    'IMAGENET1k_V1': {
        'resnet18': "https://download.pytorch.org/models/resnet18-f37072fd.pth",
        'resnet34': "https://download.pytorch.org/models/resnet34-b627a593.pth",
        'resnet50': "https://download.pytorch.org/models/resnet50-0676ba61.pth",
        'resnet101': "https://download.pytorch.org/models/resnet101-63fe2227.pth",
        'resnet152': "https://download.pytorch.org/models/resnet152-394f9c45.pth",
    },

    'IMAGENET1k_V2': {
        'resnet50': "https://download.pytorch.org/models/resnet50-11ad3fa6.pth",
        'resnet101': "https://download.pytorch.org/models/resnet101-cd907fc2.pth",
        'resnet152': "https://download.pytorch.org/models/resnet152-f82ba261.pth",
    },
}

def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1) -> nn.Conv2d:
    """Build a bias-free 3x3 convolution whose padding equals its dilation."""
    conv_options = dict(
        kernel_size=3,
        stride=stride,
        padding=dilation,
        dilation=dilation,
        groups=groups,
        bias=False,
    )
    return nn.Conv2d(in_planes, out_planes, **conv_options)


def conv1x1(in_planes: int, out_planes: int, stride: int = 1) -> nn.Conv2d:
    """Build a bias-free 1x1 (pointwise) convolution."""
    pointwise = nn.Conv2d(
        in_planes,
        out_planes,
        kernel_size=1,
        stride=stride,
        bias=False,
    )
    return pointwise


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    The block output has ``planes * expansion`` channels. When ``downsample``
    is given it is applied to the input to produce the shortcut branch.
    """

    expansion: int = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
        if not (groups == 1 and base_width == 64):
            raise ValueError("BasicBlock only supports groups=1 and base_width=64")
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # With stride != 1 the first convolution and ``downsample`` both
        # reduce the spatial size of their input.
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = make_norm(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = make_norm(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        """Apply conv-norm-relu, conv-norm, add the shortcut, then relu."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu(out)


class Bottleneck(nn.Module):
    """Three-layer residual block: 1x1 reduce, 3x3, 1x1 expand.

    The stride is applied by the 3x3 convolution (``self.conv2``) rather than
    by the first 1x1 convolution (``self.conv1``) as in the original
    "Deep residual learning for image recognition" paper
    (https://arxiv.org/abs/1512.03385). This variant is also known as
    ResNet V1.5 and improves accuracy according to
    https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.
    """

    expansion: int = 4

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        super().__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
        out_planes = planes * self.expansion
        # Channel count of the inner 3x3 convolution.
        width = int(planes * (base_width / 64.0)) * groups

        # With stride != 1 the 3x3 convolution and ``downsample`` both reduce
        # the spatial size of their input.
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = make_norm(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = make_norm(width)
        self.conv3 = conv1x1(width, out_planes)
        self.bn3 = make_norm(out_planes)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        """Run the three conv-norm stages, add the shortcut, then relu."""
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        shortcut = x
        if self.downsample is not None:
            shortcut = self.downsample(x)

        out += shortcut
        return self.relu(out)


class ResNet(nn.Module):
    """ResNet whose forward pass returns intermediate feature maps.

    ``forward`` returns a list of five tensors: the stem output (after the max
    pool) followed by the outputs of ``layer1`` to ``layer4``. The ``avgpool``
    and ``fc`` modules are constructed but never applied in the forward pass.
    NOTE(review): they are presumably kept so that classification checkpoints
    load with matching state-dict keys -- confirm.
    """

    def __init__(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        layers: List[int],
        num_classes: int = 1000,
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
    ) -> None:
        """Build the stem, the four residual stages and the (unused) head.

        Args:
            block: Residual block class used for every stage.
            layers: Number of blocks in each of the four stages.
            num_classes: Output size of the ``fc`` layer.
            zero_init_residual: Zero the weight of the last norm layer of each
                residual block so the block starts out as an identity.
            groups: Group count forwarded to every block.
            width_per_group: Base width forwarded to every block.
            replace_stride_with_dilation: Three flags, one per stage 2-4; a
                true flag replaces that stage's stride with dilation.
            norm_layer: Normalization layer factory; defaults to
                ``nn.BatchNorm2d``.

        Raises:
            ValueError: If ``replace_stride_with_dilation`` does not have
                exactly three elements.
        """
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        # Running state consumed and updated by ``_make_layer``.
        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError(
                "replace_stride_with_dilation should be None "
                f"or a 3-element tuple, got {replace_stride_with_dilation}"
            )
        self.groups = groups
        self.base_width = width_per_group
        # Stem: 7x7 stride-2 convolution followed by a stride-2 max pool.
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3, bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2, dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2, dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2, dilate=replace_stride_with_dilation[2])
        # Classification head; not applied in ``_forward_impl``.
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # Kaiming-normal init for convolutions; unit weight and zero bias for
        # normalization layers.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode="fan_out", nonlinearity="relu")
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck) and m.bn3.weight is not None:
                    nn.init.constant_(m.bn3.weight, 0)  # type: ignore[arg-type]
                elif isinstance(m, BasicBlock) and m.bn2.weight is not None:
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(
        self,
        block: Type[Union[BasicBlock, Bottleneck]],
        planes: int,
        blocks: int,
        stride: int = 1,
        dilate: bool = False,
    ) -> nn.Sequential:
        """Build one residual stage of ``blocks`` blocks.

        Only the first block receives ``stride`` and the optional shortcut
        projection. Updates ``self.inplanes`` and, when ``dilate`` is true,
        ``self.dilation``.

        Args:
            block: Residual block class to instantiate.
            planes: Base channel count of the stage; the stage outputs
                ``planes * block.expansion`` channels.
            blocks: Number of blocks in the stage.
            stride: Stride of the first block.
            dilate: Replace the stride with dilation for this stage.

        Returns:
            The blocks of the stage wrapped in ``nn.Sequential``.
        """
        norm_layer = self._norm_layer
        downsample = None
        # The first block keeps the dilation in effect before this stage.
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        # A projection shortcut is needed whenever the first block changes the
        # spatial size or the channel count.
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(
            block(
                self.inplanes, planes, stride, downsample, self.groups, self.base_width, previous_dilation, norm_layer
            )
        )
        self.inplanes = planes * block.expansion
        # Remaining blocks use the default stride and the updated dilation.
        for _ in range(1, blocks):
            layers.append(
                block(
                    self.inplanes,
                    planes,
                    groups=self.groups,
                    base_width=self.base_width,
                    dilation=self.dilation,
                    norm_layer=norm_layer,
                )
            )

        return nn.Sequential(*layers)

    def _forward_impl(self, x: Tensor) -> List[Tensor]:
        """Run the stem and the four stages, collecting each stage output.

        Returns:
            ``[stem, layer1, layer2, layer3, layer4]`` outputs, in that order.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        feature0 = x

        x = self.layer1(x)
        feature1 = x
        x = self.layer2(x)
        feature2 = x
        x = self.layer3(x)
        feature3 = x
        x = self.layer4(x)
        feature4 = x

        # The classification head (avgpool -> flatten -> fc) is intentionally
        # not applied; the per-stage feature maps are returned instead.

        return [feature0, feature1, feature2, feature3, feature4]

    def forward(self, x: Tensor) -> List[Tensor]:
        """Return the list of feature maps produced by ``_forward_impl``."""
        return self._forward_impl(x)


def _resnet(
    block: Type[Union[BasicBlock, Bottleneck]],
    layers: List[int],
    num_classes: int,
    **kwargs: Any,
) -> ResNet:
    """Instantiate a ``ResNet`` from a block class and per-stage block counts.

    Args:
        block: Residual block class used for every stage.
        layers: Number of blocks in each of the four stages.
        num_classes: Forwarded to ``ResNet`` as ``num_classes``.
        **kwargs: Extra keyword arguments forwarded to ``ResNet``.
    """
    return ResNet(block, layers, num_classes=num_classes, **kwargs)


def resnet18(*, num_classes: int = 1000, **kwargs: Any) -> ResNet:
    """Build a ResNet-18 (``BasicBlock``, stage depths 2-2-2-2).

    Architecture from "Deep Residual Learning for Image Recognition"
    (https://arxiv.org/abs/1512.03385).

    Args:
        num_classes (int, optional): number of classes. Default is 1000.
        **kwargs: parameters passed to the `ResNet` base class.
    """
    stage_depths = [2, 2, 2, 2]
    return _resnet(BasicBlock, stage_depths, num_classes=num_classes, **kwargs)


def resnet34(*, num_classes: int = 1000, **kwargs: Any) -> ResNet:
    """Build a ResNet-34 (``BasicBlock``, stage depths 3-4-6-3).

    Architecture from "Deep Residual Learning for Image Recognition"
    (https://arxiv.org/abs/1512.03385).

    Args:
        num_classes (int, optional): number of classes. Default is 1000.
        **kwargs: parameters passed to the `ResNet` base class.
    """
    stage_depths = [3, 4, 6, 3]
    return _resnet(BasicBlock, stage_depths, num_classes=num_classes, **kwargs)


def resnet50(*, num_classes: int = 1000, **kwargs: Any) -> ResNet:
    """Build a ResNet-50 (``Bottleneck``, stage depths 3-4-6-3).

    Architecture from "Deep Residual Learning for Image Recognition"
    (https://arxiv.org/abs/1512.03385).

    Note:
        The bottleneck used here applies the downsampling stride in the 3x3
        convolution, whereas the original paper applies it in the first 1x1
        convolution. This variant is known as ResNet V1.5
        (https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch).

    Args:
        num_classes (int, optional): number of classes. Default is 1000.
        **kwargs: parameters passed to the `ResNet` base class.
    """
    stage_depths = [3, 4, 6, 3]
    return _resnet(Bottleneck, stage_depths, num_classes=num_classes, **kwargs)


def resnet101(*, num_classes: int = 1000, **kwargs: Any) -> ResNet:
    """Build a ResNet-101 (``Bottleneck``, stage depths 3-4-23-3).

    Architecture from "Deep Residual Learning for Image Recognition"
    (https://arxiv.org/abs/1512.03385).

    Note:
        The bottleneck used here applies the downsampling stride in the 3x3
        convolution, whereas the original paper applies it in the first 1x1
        convolution. This variant is known as ResNet V1.5
        (https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch).

    Args:
        num_classes (int, optional): number of classes. Default is 1000.
        **kwargs: parameters passed to the `ResNet` base class.
    """
    stage_depths = [3, 4, 23, 3]
    return _resnet(Bottleneck, stage_depths, num_classes=num_classes, **kwargs)


def resnet152(*, num_classes: int = 1000, **kwargs: Any) -> ResNet:
    """Build a ResNet-152 (``Bottleneck``, stage depths 3-8-36-3).

    Architecture from "Deep Residual Learning for Image Recognition"
    (https://arxiv.org/abs/1512.03385).

    Note:
        The bottleneck used here applies the downsampling stride in the 3x3
        convolution, whereas the original paper applies it in the first 1x1
        convolution. This variant is known as ResNet V1.5
        (https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch).

    Args:
        num_classes (int, optional): number of classes. Default is 1000.
        **kwargs: parameters passed to the `ResNet` base class.
    """
    stage_depths = [3, 8, 36, 3]
    return _resnet(Bottleneck, stage_depths, num_classes=num_classes, **kwargs)

if __name__ == "__main__":
    # Smoke test: load a pretrained checkpoint and print the feature shapes.
    import torch.utils.model_zoo as model_zoo

    # device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    device = torch.device("cpu")
    B, C, H, W = 8, 3, 256, 256
    # Named ``images`` so the builtin ``input`` is not shadowed.
    images = torch.randn(size=[B, C, H, W], device=device)

    # model = resnet18().to(device)
    # model = resnet34().to(device)
    model = resnet50().to(device)
    # model = resnet101().to(device)
    # model = resnet152().to(device)
    # print(model)

    # The checkpoint must match the architecture built above; a resnet18
    # checkpoint cannot be loaded into a resnet50 model.
    weights_url = RESNET_WEIGHTS_URL["IMAGENET1k_V1"]["resnet50"]
    pretrained_dict = model_zoo.load_url(url=weights_url)
    model.load_state_dict(pretrained_dict)

    features = model(images)
    print(f"{features[0].shape = }\n"
          f"{features[1].shape = }\n"
          f"{features[2].shape = }\n"
          f"{features[3].shape = }\n"
          f"{features[4].shape = }")