Pytorch常用函数

链接

项目仓库：pytorch | github

pytorch 源码：pytorch/pytorch | source code | github

torch 源码：pytorch/pytorch/torch source code | github

torchvision源码：pytorch/vision | source code | github

torch.Tensor

ways to make tensor

Uniform distribution on the interval [0,1)
torch.rand
Normal distribution with mean 0 and variance 1 (standard normal distribution)
torch.randn
Random integers generated uniformly between low (inclusive) and high (exclusive)
torch.randint
Returns a tensor filled with the scalar value 1
torch.ones

Demo

import torch

# Sample a 3x4 integer tensor; every entry is drawn uniformly from {0, 1, 2}
# because `high` is exclusive.
tensor = torch.randint(0, 3, (3, 4))
print(tensor)
# Example output (values differ from run to run):
# tensor([[1, 2, 1, 1],
#         [0, 2, 1, 2],
#         [1, 2, 0, 0]])

torch.set_printoptions

Docs
torch.set_printoptions

Demo

import torch

# Two values with long decimal expansions make the effect of the print
# precision easy to see.
tensor = torch.tensor([torch.pi, 1/3])
print(tensor)
# tensor([3.1416, 0.3333])

# Set the floating-point print precision: show 2 digits after the decimal point.
# Only the printed representation changes, not the stored values.
torch.set_printoptions(precision=2)
print(tensor)
# tensor([3.14, 0.33])

torch.Tensor.unfold

import torch

x = torch.tensor([[1.,  2.,  3.,  4.],
                  [5.,  6.,  7.,  8.],
                  [9., 10., 11., 12.]])

# Step 1: windows of size 1, step 1 along dim 0.
# The window contents become a new last dimension: [3, 4] -> [3, 4, 1].
x_unfold = x.unfold(dimension=0, size=1, step=1)
print(x_unfold)
# tensor([[[ 1.],
#          [ 2.],
#          [ 3.],
#          [ 4.]],
#         [[ 5.],
#          [ 6.],
#          [ 7.],
#          [ 8.]],
#         [[ 9.],
#          [10.],
#          [11.],
#          [12.]]])

# Step 2: non-overlapping windows of size 2 (step 2) along dim 1:
# [3, 4, 1] -> [3, 2, 1, 2].
x_unfold = x_unfold.unfold(dimension=1, size=2, step=2)
print(x_unfold)
# tensor([[[[ 1.,  2.]],
#          [[ 3.,  4.]]],
#         [[[ 5.,  6.]],
#          [[ 7.,  8.]]],
#         [[[ 9., 10.]],
#          [[11., 12.]]]])

# Step 3: merge the two leading dimensions so every 1x2 patch is one entry:
# [3, 2, 1, 2] -> [6, 1, 2].
x_unfold = x_unfold.reshape(-1, 1, 2)
print(x_unfold)
# tensor([[[ 1.,  2.]],
#         [[ 3.,  4.]],
#         [[ 5.,  6.]],
#         [[ 7.,  8.]],
#         [[ 9., 10.]],
#         [[11., 12.]]])

Tensor.views

Tensor Views

Tensor.permute

torch.Tensor.permute

torch.unique

import torch

# 3x4 tensor of random integers in [0, 5)
x = torch.randint(low=0, high=5, size=(3, 4))
print(x)
# tensor([[2, 1, 2, 4],
#         [4, 3, 4, 3],
#         [3, 2, 4, 3]])

# Sorted distinct values, together with how many times each one occurs
print(torch.unique(x, return_counts=True))
# (tensor([1, 2, 3, 4]), tensor([1, 3, 4, 4]))

torch.max

import torch

# Random 3-D tensor; each of its three dims is reduced in turn below.
input = torch.randn(2, 3, 4)
print(input) # shape: torch.Size([2, 3, 4])
# tensor([[[ 0.2506,  0.2790, -0.0562, -0.1434],
#          [ 0.8521, -0.8722,  0.4884,  1.0183],
#          [-0.7518,  0.0761,  0.0518, -0.8154]],

#         [[-3.4666, -0.4228, -0.7586, -1.5254],
#          [ 1.5851,  1.6208,  0.3673, -0.8441],
#          [ 0.7171, -1.0838,  0.9449,  0.3339]]])

# With `dim` given, torch.max returns a (values, indices) pair.
# Reducing dim 0 compares the two [3, 4] slices element by element.
output = torch.max(input, dim=0)
print(output) # values and indices both have shape torch.Size([3, 4])
# torch.return_types.max(
# values=tensor([[ 0.2506,  0.2790, -0.0562, -0.1434],
#                [ 1.5851,  1.6208,  0.4884,  1.0183],
#                [ 0.7171,  0.0761,  0.9449,  0.3339]]),
# indices=tensor([[0, 0, 0, 0],
#                 [1, 1, 0, 0],
#                 [1, 0, 1, 1]])
# )

# Reducing dim 1 takes the maximum over the 3 rows of every [3, 4] slice.
output = torch.max(input, dim=1)
print(output) # values and indices both have shape torch.Size([2, 4])
# torch.return_types.max(
# values=tensor([[0.8521, 0.2790, 0.4884, 1.0183],
#                [1.5851, 1.6208, 0.9449, 0.3339]]),
# indices=tensor([[1, 0, 1, 1],
#                 [1, 1, 2, 2]])
# )

# Reducing dim 2 takes the maximum over the 4 entries of every row.
output = torch.max(input, dim=2)
print(output) # values and indices both have shape torch.Size([2, 3])
# torch.return_types.max(
# values=tensor([[ 0.2790,  1.0183,  0.0761],
#                [-0.4228,  1.6208,  0.9449]]),
# indices=tensor([[1, 3, 1],
#                 [1, 1, 2]])
# )

torch.argmax

import torch

# Random 3-D tensor; each of its three dims is reduced in turn below.
input = torch.randn(2, 3, 4)
print(input) # shape: torch.Size([2, 3, 4])
# tensor([[[ 0.3345,  0.1024,  0.5986,  0.9027],
#          [-0.6041,  0.7728, -0.3231,  0.2327],
#          [-0.8263,  0.1184,  1.3489, -1.4658]],

#         [[-0.5787,  0.3613, -1.7988, -0.0108],
#          [-0.3335, -0.4843,  1.0525, -3.6239],
#          [-0.7718, -0.7116, -1.3299, -2.8085]]])

# dim=0: for every (row, column) position, the index (0 or 1) of the [3, 4]
# slice that holds the larger value.
output = torch.argmax(input, dim=0)
print(output) # shape: torch.Size([3, 4])
# tensor([[0, 1, 0, 0],
#         [1, 0, 1, 0],
#         [1, 0, 0, 0]])

# dim=1: for every slice and column, the index of the row with the largest value.
output = torch.argmax(input, dim=1)
print(output) # shape: torch.Size([2, 4])
# tensor([[0, 1, 2, 0],
#         [1, 0, 1, 0]])

# dim=2: for every slice and row, the index of the column with the largest value.
output = torch.argmax(input, dim=2)
print(output) # shape: torch.Size([2, 3])
# tensor([[3, 1, 2],
#         [1, 2, 1]])

torch.unsqueeze

import torch

x = torch.tensor([0, 1, 2])
print(x, x.shape)
# tensor([0, 1, 2]) torch.Size([3])

# New leading axis: [3] -> [1, 3] (a row vector)
x1 = torch.unsqueeze(x, dim=0)
print(x1, x1.shape)
# tensor([[0, 1, 2]]) torch.Size([1, 3])

# New trailing axis: [3] -> [3, 1] (a column vector)
x2 = torch.unsqueeze(x, dim=1)
print(x2, x2.shape)
# tensor([[0],
#         [1],
#         [2]]) torch.Size([3, 1])

# Broadcasting [1, 3] against [3, 1] yields all pairwise differences: y[i, j] = x[j] - x[i]
y = torch.sub(x1, x2)
print(y, y.shape)
# tensor([[ 0,  1,  2],
#         [-1,  0,  1],
#         [-2, -1,  0]]) torch.Size([3, 3])

torch.squeeze

torch.squeeze

# squeeze removes dimensions of size 1; a dimension of any other size is left untouched.
x = torch.zeros(2, 1, 2, 1, 2)  # shape: [2, 1, 2, 1, 2]
x.squeeze()                     # all size-1 dims removed     -> shape: [2, 2, 2]
x.squeeze(0)                    # dim 0 has size 2, unchanged -> shape: [2, 1, 2, 1, 2]
x.squeeze(1)                    # only dim 1 removed          -> shape: [2, 2, 1, 2]
x.squeeze((1, 2, 3))            # dims 1 and 3 removed        -> shape: [2, 2, 2]

torch.roll

x = torch.arange(1, 9).view(4, 2)
# tensor([[1, 2],
#         [3, 4],
#         [5, 6],
#         [7, 8]])

# Shift the elements along the given dimension(s); elements pushed past the
# end wrap around to the beginning.
x.roll(2, 0)
# tensor([[5, 6],
#         [7, 8],
#         [1, 2],
#         [3, 4]])
x.roll(1, 1)
# tensor([[2, 1],
#         [4, 3],
#         [6, 5],
#         [8, 7]])
x.roll(shifts=(2, 1), dims=(0, 1))
# tensor([[6, 5],
#         [8, 7],
#         [2, 1],
#         [4, 3]])

# Without dims the tensor is flattened, rolled, and then restored to its
# original shape.
x.roll(1)
# tensor([[8, 1],
#         [2, 3],
#         [4, 5],
#         [6, 7]])

Broadcasting semantics (广播机制)

Broadcasting semantics | pytorch docs

Broadcasting | numpy docs

import torch

row = [1, 2, 3]
x = torch.tensor(row).reshape(1, 3)  # row vector,    shape: torch.Size([1, 3])
y = torch.tensor(row).reshape(3, 1)  # column vector, shape: torch.Size([3, 1])

# Both operands are expanded to [3, 3] before the element-wise subtraction.
print(torch.sub(x, y))
# tensor([[ 0,  1,  2],
#         [-1,  0,  1],
#         [-2, -1,  0]])
# shape: torch.Size([3, 3])
print(torch.sub(y, x))
# tensor([[ 0, -1, -2],
#         [ 1,  0, -1],
#         [ 2,  1,  0]])
# shape: torch.Size([3, 3])

     x      -      y
            👇
[[1, 2, 3],   [[1, 1, 1],
 [1, 2, 3]  -  [2, 2, 2],
 [1, 2, 3]]    [3, 3, 3]]

Pytorch实现线性回归

torch.nn.Linear介绍

torch.nn.Linear(in_features, out_features, bias=True)

Linear是一个类, 它按照下式计算输入数据的线性变换

$y = xA^T+b$

其中 $x$ 是行向量, 当批量处理样本时, $\bold{x}$ 是 $n$ 个行向量构成的矩阵, 每一行都是一个样本.

Linear一共有三个输入参数:

第一个参数为in_features, int型: 输入数据的特征数
第二个参数为out_features, int型: 输出数据的特征数
第三个参数为bias, bool型: 默认为True, 设为False时, 偏置项为0

其中第一第二个参数决定了变换矩阵 $A^T$ 的尺寸, $A^T$ 是一个 $\rm in\_f\times out\_f$ 的矩阵

$A^T,b$ 的初始化是由 $U(-\frac{1}{\sqrt{\rm in\_f}},\frac{1}{\sqrt{\rm in\_f}})$ 的均匀分布随机初始化的

这个分布叫做 $\text{``kaiming uniform''}$ , 是15年2月何愷明 (Kaiming He)在他的论文中提出

Delving Deep into Rectifiers: Surpassing Human-Level Performance on ImageNet Classification

>>> model = nn.Linear(20, 30)
>>> input = torch.randn(128, 20)
>>> output = model(input)
>>> print(output.size())
torch.Size([128, 30])

torch.nn.MSELoss介绍

torch.nn.MSELoss

MSE为均方误差(mean square error), 而MSELoss将返回一个对象用于计算input和target间的MSE.

MSELoss的reduction有三个选项none|mean(default)|sum

1
2
3

import torch

# Input and target values
input = torch.tensor([[1., 1, 1], [1, 1, 1]])
target = torch.tensor([[1., 2, 3], [4, 5, 6]])

# Build one loss object per reduction mode and evaluate it on the same data.
# "none" keeps the element-wise squared errors, "mean" averages them over all
# 6 elements and "sum" adds them up.
for reduction in ("none", "mean", "sum"):
    loss = torch.nn.MSELoss(reduction=reduction)
    output = loss(input, target)
    print(reduction, output)
# none tensor([[ 0.,  1.,  4.],
#         [ 9., 16., 25.]])
# mean tensor(9.1667)
# sum tensor(55.)

用Linear实现线性回归

数据

1
2
3

import torch

# Three training samples with one feature each; both tensors have shape [3, 1]
# and the targets are exactly twice the inputs.
x_data = torch.tensor([1.0, 2.0, 3.0]).reshape(3, 1)
y_data = torch.tensor([2.0, 4.0, 6.0]).reshape(3, 1)

模型搭建

class LinearModel(torch.nn.Module):
    """Linear regression model with a single input and a single output feature."""

    def __init__(self):
        super().__init__()
        # One weight and one bias: y_pred = x * w + b
        self.linear = torch.nn.Linear(in_features=1, out_features=1)

    def forward(self, x):
        """Return the prediction of the linear layer for ``x``."""
        return self.linear(x)


model = LinearModel()

损失函数

# Sum (rather than average) the squared errors over all samples
criterion = torch.nn.MSELoss(reduction="sum")

优化器

# Plain stochastic gradient descent over all model parameters
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

参数训练

for epoch in range(100):
    # Forward pass: predictions and loss for the whole data set
    y_pred = model(x_data)
    loss = criterion(y_pred, y_data)
    # .item() extracts the Python float, so the log shows the bare number
    # instead of a tensor repr that includes grad_fn
    print(epoch, loss.item())

    # Backward pass: clear the gradients of the previous step, backpropagate,
    # then update the parameters
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

打印输出

# Print the learned A^T and bias
print("A^T = ", model.linear.weight.item())
print("b = ", model.linear.bias.item())

# Test the model on a new input; no gradients are needed for inference, so
# the forward pass runs under no_grad instead of detaching through `.data`
x_test = torch.tensor([4.0])
with torch.no_grad():
    y_test = model(x_test)
print("y_pred = ", y_test)

torch.nn.modules.linear

class Linear(Module):
    """Linear layer whose forward pass is ``F.linear(input, weight, bias)``.

    ``weight`` is stored with shape ``(out_features, in_features)``; ``bias``
    has shape ``(out_features,)`` or is registered as ``None`` when the layer
    is built with ``bias=False``.
    """
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: Tensor

    def __init__(self, in_features: int, out_features: int, bias: bool = True,
                 device=None, dtype=None) -> None:
        """Allocate the parameters and initialize them.

        Args:
            in_features: size of the last dimension of the weight matrix.
            out_features: size of the first dimension of the weight matrix
                and length of the bias.
            bias: when False, no bias parameter is created.
            device: forwarded to ``torch.empty`` when allocating parameters.
            dtype: forwarded to ``torch.empty`` when allocating parameters.
        """
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        # torch.empty leaves the memory uninitialized; reset_parameters() below
        # fills in the actual starting values.
        self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
        if bias:
            self.bias = Parameter(torch.empty(out_features, **factory_kwargs))
        else:
            # Register the name so that self.bias exists and is None.
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self) -> None:
        """(Re)initialize ``weight`` and, if present, ``bias`` in place."""
        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
        # https://github.com/pytorch/pytorch/issues/57109
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            # Guard against division by zero when fan_in is 0.
            bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
            init.uniform_(self.bias, -bound, bound)

    def forward(self, input: Tensor) -> Tensor:
        """Return ``F.linear(input, self.weight, self.bias)``."""
        return F.linear(input, self.weight, self.bias) # <-- Note

Note that modules.Linear uses F.linear to compute the forward result:

torch.nn.functional.linear(input, weight, bias=None)

For an incoming input $x$ , weight matrix $A$ and bias $b$ , F.linear applies the linear transformation:

$y = xA^T + b$

If the shape of input $x$ is (*, in_features) where * means any number of additional dimensions including none and shape of weight matrix $A$ is (out_features, in_features), the shape of output will be (*, out_features).

Example 1:

$x: \begin{bmatrix} \colorbox{yellow}{1, 2, 3} \\ \colorbox{pink}{4, 5, 6}\end{bmatrix},\quad A: \begin{bmatrix} 1, 0, 0\\0, 1, 0\end{bmatrix},\quad b: None$

$y = xA^T = \begin{bmatrix} \colorbox{yellow}{1, 2, 3} \\ \colorbox{pink}{4, 5, 6}\end{bmatrix} \begin{bmatrix} 1, 0 \\ 0, 1 \\ 0, 0\end{bmatrix} = \begin{bmatrix} \colorbox{yellow}{1, 2} \\ \colorbox{pink}{4, 5}\end{bmatrix}$

import torch
from torch.nn import functional as F

# Two samples (rows) with three features each
input = torch.tensor([[1, 2, 3], [4, 5, 6]])    # shape: [2, 3]

# Each row of the weight matrix picks one input feature: the first and the second
weight = torch.tensor([[1, 0, 0], [0, 1, 0]])   # shape: [2, 3]

# No bias, so output = input @ weight.T
output = F.linear(input, weight, bias=None)     # shape: [2, 2]

print(output)
# tensor([[1, 2],
#         [4, 5]])

Example 2:

$x: \begin{bmatrix} \begin{bmatrix} \colorbox{yellow}{1, 2, 3} \\ \colorbox{pink}{4, 5, 6}\end{bmatrix}, \begin{bmatrix} \colorbox{yellow}{7, 8, 9} \\ \colorbox{pink}{0, 1, 2}\end{bmatrix} \end{bmatrix},\quad A: \begin{bmatrix} 1, 0, 0\\0, 1, 0\end{bmatrix},\quad b: None$

$y = xA^T = \begin{bmatrix} \begin{bmatrix} \colorbox{yellow}{1, 2, 3} \\ \colorbox{pink}{4, 5, 6}\end{bmatrix}, \begin{bmatrix} \colorbox{yellow}{7, 8, 9} \\ \colorbox{pink}{0, 1, 2}\end{bmatrix} \end{bmatrix} \begin{bmatrix} 1, 0 \\ 0, 1 \\ 0, 0\end{bmatrix} = \begin{bmatrix} \begin{bmatrix}\colorbox{yellow}{1, 2} \\ \colorbox{pink}{4, 5}\end{bmatrix}, \begin{bmatrix}\colorbox{yellow}{7, 8} \\ \colorbox{pink}{0, 1}\end{bmatrix} \end{bmatrix}$

import torch
import torch.nn.functional as F

# Same values as the matrices in the formula of Example 2: a batch of two
# [2, 3] matrices.
input = torch.tensor([[[1, 2, 3],
                       [4, 5, 6]],
                      [[7, 8, 9],
                       [0, 1, 2]]]) # shape: [2, 2, 3]

weight = torch.tensor([[1, 0, 0],
                       [0, 1, 0]])  # shape: [2, 3]

# Only the last dimension is transformed (3 -> 2); the leading dims are kept.
output = F.linear(input, weight)    # shape: [2, 2, 2]

print(output)
# tensor([[[1, 2],
#          [4, 5]],

#         [[7, 8],
#          [0, 1]]])

Note: The basic elements that PyTorch (and deep learning in general) operates on are vectors, more specifically row vectors. A matrix can be thought of as a set of row vectors, and a higher-dimensional tensor can likewise be viewed as a larger set of row vectors. Therefore, when you apply a transformation to a tensor with 2 or more dimensions, what you actually transform is a large set of row vectors: each vector in the set is transformed, and the shape of the set is kept.

torch.nn

pytorch/torch/nn | source code | github

Softmax

torch.nn.Softmax

torch.nn.Softmax(dim=None)

dim (int) – A dimension along which Softmax will be computed (so every slice along dim will sum to 1).

import torch

input = torch.randn(2, 3)
print(input)
# tensor([[-1.3018, -0.9303,  0.2111],
#         [ 0.7206,  0.0933, -0.8475]])
print(input.sum(dim=0)) # tensor([-0.5812, -0.8370, -0.6364])
print(input.sum(dim=1)) # tensor([-2.0210, -0.0337])

# Normalize along dim 0: every column sums to 1, the rows do not.
softmax_dim0 = torch.nn.Softmax(dim=0)
output = softmax_dim0(input)
print(output.sum(dim=0)) # tensor([1., 1., 1.])
print(output.sum(dim=1)) # tensor([1.1236, 1.8764])

# Normalize along dim 1: every row sums to 1, the columns do not.
softmax_dim1 = torch.nn.Softmax(dim=1)
output = softmax_dim1(input)
print(output.sum(dim=0)) # tensor([0.7170, 0.5139, 0.7691])
print(output.sum(dim=1)) # tensor([1., 1.])

import torch

input = torch.randn(2, 3, 4)
print(input)
# tensor([[[ 1.4035, -1.5199, -0.5251,  0.0535],
#          [ 0.2582,  2.2726, -0.6482, -1.5214],
#          [-0.4722, -0.4668, -0.3910, -0.3729]],

#         [[ 0.3186,  1.2075, -0.4249,  0.3235],
#          [ 0.0873, -1.7824,  0.5938, -2.2797],
#          [-0.9566, -1.0026,  0.0575,  0.3927]]])

# Whichever dim Softmax normalizes over, summing over that same dim gives 1
# everywhere, and the summed dim disappears from the shape.
softmax_dim0 = torch.nn.Softmax(dim=0)
output = softmax_dim0(input)
print(output.sum(dim=0)) # shape: torch.Size([3, 4])
# tensor([[1.0000, 1.0000, 1.0000, 1.0000],
#         [1.0000, 1.0000, 1.0000, 1.0000],
#         [1.0000, 1.0000, 1.0000, 1.0000]])

softmax_dim1 = torch.nn.Softmax(dim=1)
output = softmax_dim1(input)
print(output.sum(dim=1)) # shape: torch.Size([2, 4])
# tensor([[1., 1., 1., 1.],
#         [1., 1., 1., 1.]])

softmax_dim2 = torch.nn.Softmax(dim=2)
output = softmax_dim2(input)
print(output.sum(dim=2)) # shape: torch.Size([2, 3])
# tensor([[1.0000, 1.0000, 1.0000],
#         [1.0000, 1.0000, 1.0000]])

LogSoftmax

torch.nn.LogSoftmax

torch.nn.LogSoftmax()对输入的tensor进行 $\rm log\circ softmax$ 操作, 例如 $(n+1)\times (c+1)$ 维度的输入tensor(n+1个样本, c+1个类别)

$\begin{split} \rm tensor([&[a_{00},a_{01},\cdots,a_{0c}],\\ &[a_{10},a_{11},\cdots,a_{1c}],\\ &\cdots\\ &[a_{n0},a_{n1},\cdots,a_{nc}] ]) \end{split}$

经过 $\rm torch.nn.LogSoftmax(dim=1)$ 操作后返回

\begin{split}
\rm tensor([&\rm log\circ softmax([a_{00},a_{01},\cdots,a_{0c}]),\\
&\rm log\circ softmax([a_{10},a_{11},\cdots,a_{1c}]),\\
&\vdots\\
&\rm log\circ softmax([a_{n0},a_{n1},\cdots,a_{nc}])
])
\end{split}

其中 $\rm log\circ softmax$ 如下计算

\begin{array}{c}
[[\log(\frac{[e^{a_{00}},e^{a_{01}},\cdots,e^{a_{0c}}]}{\sum_{j=0}^{c}e^{a_{0j}}})],\\
\vdots\\
[\log(\frac{[e^{a_{n0}},e^{a_{n1}},\cdots,e^{a_{nc}}]}{\sum_{j=0}^{c}e^{a_{nj}}})]]
\end{array}
=
\begin{array}{c}
 [[a_{00},a_{01},\cdots,a_{0c}],\\
 \vdots\\
 [a_{n0},a_{n1},\cdots,a_{nc}]]
\end{array}
-
\begin{array}{c}
[[\log(e^{a_{00}}+\cdots+e^{a_{0c}})],\\
\vdots\\
[\log(e^{a_{n0}}+\cdots+e^{a_{nc}})]]
\end{array}

经过 $\rm torch.nn.LogSoftmax(dim=0)$ 操作将返回

$\begin{split} \rm tensor( \left[\rm l\circ s\begin{pmatrix}a_{00}\\a_{10}\\\vdots\\a_{n0}\end{pmatrix}, l\circ s\begin{pmatrix}a_{01}\\a_{11}\\\vdots\\a_{n1}\end{pmatrix},\cdots, l\circ s\begin{pmatrix}a_{0c}\\a_{1c}\\\vdots\\a_{nc}\end{pmatrix} \right]) \end{split}$

$\rm log \circ softmax(\bold{z})$ 的性质

$\begin{split} \log \circ \text{ softmax}(\bold{z}) &= \log([\frac{e^{z_1}}{\sum e^z}, \cdots, \frac{e^{z_n}}{\sum e^z}])\\ &= [\log(\frac{e^{z_1}}{\sum e^z}), \cdots, \log(\frac{e^{z_n}}{\sum e^z})]\\ &= [z_1-\log(\sum e^z), \cdots, z_n-\log(\sum e^z)]\\ &= \bold{z}-\log(\sum_{i=1}^ne^{z_i}) \end{split}$

Pytorch演示:

1
2
3

import torch
from torch import nn

# Input tensor: torch.ones(2, 3)
input = torch.ones(2, 3)
print(input)
# tensor([[1., 1., 1.],
#         [1., 1., 1.]])

# dim=1: normalize over the 3 entries of each row
m = nn.LogSoftmax(dim=1)
print(m(input))
# tensor([[-1.0986, -1.0986, -1.0986],
#         [-1.0986, -1.0986, -1.0986]])

# dim=0: normalize over the 2 entries of each column
m = nn.LogSoftmax(dim=0)
print(m(input))
# tensor([[-0.6931, -0.6931, -0.6931],
#         [-0.6931, -0.6931, -0.6931]])

其中 $1-\log(3e) = -\log(3)\approx -1.0986,\qquad 1-\log(2e)=-\log(2)\approx-0.6931$

LogSoftmax 对平移保持不变

$\begin{split} \rm log\circ softmax(\bold{x}+c) &= [x_0+c,\cdots,x_n+c] - \log(\sum \bold{e}^{\bold{x}+c})\\ &=\bold{x}+c-\log(e^c\sum\bold{e^x})\\ &=\bold{x}-\log(\sum\bold{e^x})\\ &=\rm log\circ softmax(\bold{x}) \end{split}$

例如

$\begin{split} &\rm log\circ softmax\left( \begin{bmatrix} 0,1,2,3,4 \end{bmatrix} \right) \\ =&\rm log\circ softmax\left( \begin{bmatrix} 5,6,7,8,9 \end{bmatrix} \right) \\ =&\rm log\circ softmax\left( \begin{bmatrix} 10,11,12,13,14 \end{bmatrix} \right) \\ =&[-4.4519, -3.4519, -2.4519, -1.4519, -0.4519] \end{split}$

NLLLoss

Pytorch Docs
torch.nn.NLLLoss
Descriptions
NLLLoss全称Negative Log Likelihood Loss, 用于计算输入 $x$ 和输出 $y$ 的NLLLoss

对于输入的 $\rm input$

$\begin{split} \rm tensor([&[a_{00},a_{01},\cdots,a_{0c}],\\ &[a_{10},a_{11},\cdots,a_{1c}],\\ &\cdots\\ &[a_{n0},a_{n1},\cdots,a_{nc}] ]) \end{split}$

以及目标 $\rm target = [y_0,y_1,\cdots,y_n],\quad y_{0,\cdots, n}\in\{0,1,\cdots,c\}$

进行 $\rm torch.nn.NLLLoss(reduction=\text{``none|sum|mean''})$ 操作
- reduction="none", 输出
  
  $\rm tensor([\rm -a_{0,y_0}, -a_{1,y_1},\cdots,-a_{n,y_n}])$
  
  其中 $a_{0,y_0}$ 是第0个样本的第 $y_0$ 个值； $a_{1,y_1}$ 是第1个样本的第 $y_1$ 个值…
- reduction="sum", 输出
  
  $\rm sum([\rm -a_{0,y_0}, -a_{1,y_1},\cdots,-a_{n,y_n}])$
- reduction="mean"(default), 输出
  
  $\rm mean([\rm -a_{0,y_0}, -a_{1,y_1},\cdots,-a_{n,y_n}])$

NLLLoss on Segmentation

Descriptions
For the inputs in segmentation, a Prediction with shape (Batch_size, Class_num, Height, Width) and a Mask or Ground Truth with shape (Batch_size, Height, Width), NLLLoss gives the following output with shape (Batch_size, Height, Width):

$\begin{gather*} \rm Output[b,h,w] = - Prediction[b,\underset{\in \{0,\cdots,C-1\}}{\underline{Mask[b,h,w]}},h,w]\\ b\in\{0,\cdots,B-1\},h\in\{0,\cdots,H-1\},w\in\{0,\cdots,W-1\} \end{gather*}$

Demo

import torch
from torch import nn

# Batch_Size: 2, Channels(Classes): 3, Height: 4, Width: 5
B, C, H, W = 2, 3, 4, 5

# Ground-truth class index for every pixel, each in {0, 1, 2}
mask = torch.randint(low=0, high=C, size=(B, H, W))
# mask.shape = torch.Size([2, 4, 5])

# Distinct values everywhere, so each selected entry can be identified
prediction = torch.arange(B*C*H*W, dtype=torch.float).view(B, C, H, W)
# prediction.shape = torch.Size([2, 3, 4, 5])

# reduction='none' keeps one loss value per pixel
nllloss = nn.NLLLoss(reduction='none')
loss = nllloss(prediction, mask)
# loss.shape = torch.Size([2, 4, 5])

# Check the formula pixel by pixel: the loss is the negated prediction of the
# class that the mask selects at that pixel.
for b in range(B):
    for h in range(H):
        for w in range(W):
            cls = mask[b, h, w]
            picked = prediction[b, cls, h, w]
            assert loss[b, h, w] == -picked, "Unequal"

NLLLoss + LogSoftmax

通常会首先对 $\rm input$ 取 $\rm torch.nn.LogSoftmax(dim=1)$ , 再进行 $\rm NLLLoss$ , 这样得到的就是交叉熵损失

直接进行 $\rm NLLLoss$ 操作

import torch
from torch import nn

# NLLLoss with reduction="none" returns one value per sample, without any
# sum/mean reduction
loss = nn.NLLLoss(reduction="none")
# Input: 3 samples with 5 class scores each
input = torch.arange(15, dtype=torch.float64).reshape(3, 5)
# Target: 3 labels, each in [0, 5)
target = torch.tensor([1, 0, 4])
# Loss computed directly from input and target
output = loss(input, target)

print(input)
# tensor([[ 0.,  1.,  2.,  3.,  4.],
#         [ 5.,  6.,  7.,  8.,  9.],
#         [10., 11., 12., 13., 14.]], dtype=torch.float64)
print(target)
# tensor([1, 0, 4])
print(output)
# tensor([ -1.,  -5., -14.], dtype=torch.float64)

先取 $\rm LogSoftmax$ 再进行 $\rm NLLLoss$ 操作

# LogSoftmax over the class dimension
m = nn.LogSoftmax(dim=1)
# Apply LogSoftmax to input first, then compute the loss against target
output = loss(m(input), target)

print(m(input))
# tensor([[-4.4519, -3.4519, -2.4519, -1.4519, -0.4519],
#         [-4.4519, -3.4519, -2.4519, -1.4519, -0.4519],
#         [-4.4519, -3.4519, -2.4519, -1.4519, -0.4519]], dtype=torch.float64)
print(target)
# tensor([1, 0, 4])
print(output)
# tensor([3.4519, 4.4519, 0.4519], dtype=torch.float64)

CrossEntropyLoss

Pytorch Docs
torch.nn.CrossEntropyLoss
Formulas

$\rm CrossEntropyLoss(\textcolor{red}{input}, target) = NLLLoss(\textcolor{blue}{LogSoftmax(\textcolor{red}{input})},target)$
Description
对于输入的 $\rm input$ ，其有(n+1)个样本，(c+1)个类别

$\begin{split} \rm tensor([&[a_{00},a_{01},\cdots,a_{0c}],\\ &[a_{10},a_{11},\cdots,a_{1c}],\\ &\cdots\\ &[a_{n0},a_{n1},\cdots,a_{nc}] ]) \end{split}$

以及目标 $\rm target = [y_0,y_1,\cdots,y_n],\quad y_{0,\cdots, n}\in\{0,1,\cdots,c\}$

进行 $\rm torch.nn.CrossEntropyLoss(reduction=\text{``none|sum|mean''})$ 操作
- reduction="none", 输出
  
  $\begin{split} &\rm [-log\circ softmax([a_{00},\cdots,a_{0c}])_{\color{red}y_0},\cdots,-log\circ softmax([a_{n0},\cdots,a_{nc}])_{\color{red}y_n}] \\ =&\rm [-\log\frac{\exp(a_{0\color{red}y_0})}{\exp(a_{00})+\cdots+\exp(a_{0c})},\cdots,-\log\frac{\exp(a_{n\color{red}y_n})}{\exp(a_{n0})+\cdots+\exp(a_{nc})}] \\ =& [l_0, \cdots, l_n] \end{split}$
- reduction="sum", 输出
  
  ${\rm sum}([l_0, \cdots, l_n])$
- reduction="mean"(default), 输出
  
  ${\rm mean}([l_0, \cdots, l_n])$

Pytorch演示

import torch
from torch import nn

# Input and target
input = torch.arange(15, dtype=torch.float64).reshape(3, 5)
target = torch.tensor([1, 0, 4])

# Path 1: LogSoftmax followed by NLLLoss; path 2: CrossEntropyLoss on the raw input
m = nn.LogSoftmax(dim=1)
NLloss = nn.NLLLoss(reduction="none")
CRloss = nn.CrossEntropyLoss(reduction="none")
output_NL = NLloss(m(input), target)
output_CR = CRloss(input, target)

print(m(input))
# tensor([[-4.4519, -3.4519, -2.4519, -1.4519, -0.4519],
#         [-4.4519, -3.4519, -2.4519, -1.4519, -0.4519],
#         [-4.4519, -3.4519, -2.4519, -1.4519, -0.4519]], dtype=torch.float64)
print(target)
# tensor([1, 0, 4])
print(output_NL)
# tensor([3.4519, 4.4519, 0.4519], dtype=torch.float64)
print(output_CR)
# tensor([3.4519, 4.4519, 0.4519], dtype=torch.float64)

注意上面的两个输出, $\rm NLloss(\textcolor{blue}{m(\textcolor{red}{input})}, target)$ 和 $\rm CRloss(\textcolor{red}{input}, target)$ 的计算结果是一样的.

CrossEntropyLoss on Classification

对于一个分类器如下, 这两种方法使用 loss 是等价的

import torch
from torch import nn


class Classifier(nn.Module):
    """Minimal classifier: one linear layer mapping features to class logits."""

    def __init__(self, in_features: int, class_num: int) -> None:
        super().__init__()
        self.mlp = nn.Linear(in_features=in_features, out_features=class_num)

    def forward(self, input):
        """Return the raw (unnormalized) logits; the last dim has ``class_num`` entries."""
        return self.mlp(input)


Batch, Channel = 8, 512
Classes = 10
inputs = torch.rand(Batch, Channel) # shape: torch.Size([8, 512])
labels = torch.randint(low=0, high=Classes, size=(Batch,)) # shape: torch.Size([8])

classifier = Classifier(in_features=Channel, class_num=Classes)
outputs = classifier(inputs) # torch.Size([8, 10])

# LogSoftmax computes log(softmax(x)) in a single, numerically stable step;
# torch.log(Softmax(x)) yields -inf as soon as a probability underflows to 0.
log_softmax = nn.LogSoftmax(dim=-1)
NLloss = nn.NLLLoss(reduction="none")
CRloss = nn.CrossEntropyLoss(reduction="none")

# Both paths produce the same per-sample losses
loss_nl = NLloss(log_softmax(outputs), labels)
loss_cr = CRloss(outputs, labels)
print(loss_nl)
print(loss_cr)

CrossEntropyLoss 本身已经封装了一层 -Log(Softmax(x)), 所以如果模型本身已经对输出做了 Log(Softmax(x)) 操作, 可以考虑使用 NLLLoss 作为损失函数. 而对于没有使用 LogSoftmax 归一化的模型输出, 使用 CrossEntropyLoss 会更简洁, 但是注意在做 Evaluation 的时候需要额外对 outputs 做一次 Softmax(x) 操作作为概率输出, 再取 Softmax(x) 中最大值的 index 作为预测的 label (Softmax 不改变各类别的大小顺序, 所以如果只需要 label, 直接对 outputs 取 argmax 结果相同).

对于 CrossEntropy 的公式应该如下描述

$CrossEntropy(x, label) = \sum_{i=0}^{B-1} \left( -\log(\underline{\text{sft}(x[i])} [label[i]]) % +\sum_{j\neq label[i]} \log(\underline{\text{sft}(x[i])} [j]) \right)$

其中 x 是一个 Batch_size × Class_num 的 Tensor, 而 label 对应的是一个 int (非 one-hot 型) 的 indices. 对于 x 中的一个样本 x[i] ∈ [1 × C], 对其做 softmax 得到 softmax(x[i]) ∈ [1 × C] 表示每个类别的概率. 取这个样本的真实标签值 label[i] (这里不是 one-hot 编码, 只是 int 值). 使用 softmax(x[i]) 与 label[i] 做交叉熵, 得到的就是这个样本的交叉熵损失函数

$celoss[i] = -\log(\underline{sft(x[i])} [label[i]]) % +\sum_{j\neq label[i]} \log(\underline{sft(x[i])} [j])$

上式只包含一项: 正确类别的预测概率取 -log softmax (公式源码中 % 之后的第二项已被注释掉, 标准的交叉熵损失不包含对错误类别的求和项). 取负号是因为我们使用梯度下降法来降低 loss: 我们希望正确类别的预测概率变高, 也就是取负号后的 -log 概率变低. 错误类别不需要单独出现在损失中, 因为 softmax 的各项之和为 1, 正确类别的概率升高时, 其余类别的概率自然随之降低.

关于为什么要取 log?

CrossEntropyLoss on Segmentation

Descriptions
For the inputs in segmentation, a Prediction with shape (Batch_size, Class_num, Height, Width) and a Mask or Ground Truth with shape (Batch_size, Height, Width), CrossEntropyLoss will do $\rm -Log\circ Softmax$ on every pixel of every batch of Prediction, which has Class_num entries, and choose the i-th(depending on mask) entry and finally gives the output with shape (Batch_size, Height, Width):

$\begin{gather*} \rm Output[b,h,w] = - \underline{LogSoftmax(dim=1)(Prediction)} \Big[b,\underset{\text{choose i-th class}}{\underline{Mask[b,h,w]}},h,w\Big]\\ b\in\{0,\cdots,B-1\},h\in\{0,\cdots,H-1\},w\in\{0,\cdots,W-1\} \end{gather*}$

The Prediction is actually the output of a model/network.

Demo

import torch
from torch import nn

# Batch_Size: 2, Channels(Classes): 3, Height: 4, Width: 5
B, C, H, W = 2, 3, 4, 5
mask = torch.randint(low=0, high=C, size=(B, H, W)) # ∈ {0, 1, 2}
# mask.shape = torch.Size([2, 4, 5])
prediction = torch.arange(B*C*H*W, dtype=torch.float).reshape(B, C, H, W)
# prediction.shape = torch.Size([2, 3, 4, 5])
crossentropyloss = nn.CrossEntropyLoss(reduction='none')
loss = crossentropyloss(prediction, mask)
# loss.shape = torch.Size([2, 4, 5])

# Reference: LogSoftmax over the class dimension, then pick the class that
# the mask selects at every pixel.
log_softmax_prediction = nn.LogSoftmax(dim=1)(prediction)
for b in range(B):
    for h in range(H):
        for w in range(W):
            expected = -log_softmax_prediction[b, mask[b, h, w], h, w]
            # Compare with a tolerance: the two code paths are not guaranteed
            # to round floating-point results identically.
            assert torch.isclose(loss[b, h, w], expected), "Unequal"

RuntimeError: CUDA error: device-side assert triggered

This error happens when your mask(target) values are out of bounds. For example

Reproduction

import torch
from torch import nn

# NOTE(review): `device` is computed here but never used below; the tensors
# are moved with .cuda() directly, so this snippet needs a CUDA device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Batch_Size: 2, Height: 4, Width: 5
# randint(low=1, high=4) draws labels from {1, 2, 3}
mask = torch.randint(low=1, high=4, size=(2, 4, 5)).cuda()
# Batch_Size: 2, Channels(Classes): 3, Height: 4, Width: 5
# With 3 classes the valid labels are 0, 1 and 2, so label 3 is out of bounds
output = torch.randn(2, 3, 4, 5).cuda()

CE_Loss = nn.CrossEntropyLoss()
loss = CE_Loss(output, mask)
'''RuntimeError: CUDA error: device-side assert triggered'''

The mask contains values 1, 2 and 3, but output has only 3 classes which are “0”, “1” and “2”, so the value 3 is out of bounds.
When you run this code on cpu, you will have this error

Run above code on cpu

...
# loss = CE_Loss(output, mask)
loss = CE_Loss(output.cpu(), mask.cpu())
'''IndexError: Target 3 is out of bounds.'''

Conv2d

torch.nn.Conv2d

使用 Conv2d 方法实现卷积非常方便, 只需要设置 in_channels, out_channels, kernel_size 三个参数就可以使用了, 不需要关注输入输出图片的长和宽.
其它参数包括 stride, padding, bias 也都可以指定, 详细的参数可以查看官方文档.
下面是一个实例

import torch

# Randomly generate a matrix (an image); the layout is (N, C, H, W)
input = torch.randn(1,   # batch_size
                    5,   # in_channels
                    100, # height
                    100) # width

# Define a convolution
# It consists of 10 kernels, each of size 5x(3x3)
conv = torch.nn.Conv2d(in_channels=5,
                      out_channels=10,
                      kernel_size=3)

# Apply the convolution defined above to input
output = conv(input)

print(input.shape)       #torch.Size([1, 5, 100, 100])
print(conv.weight.shape) #torch.Size([10, 5, 3, 3])
print(output.shape)      #torch.Size([1, 10, 98, 98])

转置卷积

移动窗口的卷积运算可以转换成矩阵乘法, 将输入图片中的像素按从左到右从上到下的顺序排列成一个列向量, 卷积核的每个窗口都可以同样排成一个行向量, 整个卷积核对应着一个矩阵, 其行数是卷积窗口的数量, 列数是输入图片的像素数.
例如, 对于 $4\times4$ 的输入图片和 $3\times3$ 的卷积核, 做 padding=0, strides=1 的卷积运算, 得到一个 $2\times2$ 的输出. 将这个过程用矩阵乘法表示如下:

$\scriptsize{ \begin{bmatrix} y_{00} \\ y_{01} \\ y_{10} \\ y_{11} \end{bmatrix} = \left[\begin{array}{cccc|cccc|cccc|cccc} w_{0,0} & w_{0,1} & w_{0,2} & 0 & w_{1,0} & w_{1,1} & w_{1,2} & 0 & w_{2,0} & w_{2,1} & w_{2,2} & 0 & 0 & 0 & 0 & 0 \\ 0 & w_{0,0} & w_{0,1} & w_{0,2} & 0 & w_{1,0} & w_{1,1} & w_{1,2} & 0 & w_{2,0} & w_{2,1} & w_{2,2} & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & w_{0,0} & w_{0,1} & w_{0,2} & 0 & w_{1,0} & w_{1,1} & w_{1,2} & 0 & w_{2,0} & w_{2,1} & w_{2,2} & 0 \\ 0 & 0 & 0 & 0 & 0 & w_{0,0} & w_{0,1} & w_{0,2} & 0 & w_{1,0} & w_{1,1} & w_{1,2} & 0 & w_{2,0} & w_{2,1} & w_{2,2} \end{array}\right] \begin{bmatrix} x_{00} \\ x_{01} \\ x_{02} \\ x_{03} \\ \vdots \\ x_{30} \\ x_{31} \\ x_{32} \\ x_{33} \end{bmatrix} }$

上面矩阵中的每一行都对应着一个卷积的窗口, 它们和输入图片做 element-wise 的乘积再求和, 即得到对应窗口位置的卷积输出, 如下示意:

\scriptsize
\begin{bmatrix}
\textcolor{red}{y_{00} }\\ y_{01} \\ y_{10} \\ y_{11}
\end{bmatrix}
=
\begin{matrix}
\textcolor{red}{
\begin{bmatrix}
w_{0,0} & w_{0,1} & w_{0,2} & 0 \\
w_{1,0} & w_{1,1} & w_{1,2} & 0 \\
w_{2,0} & w_{2,1} & w_{2,2} & 0 \\
0 & 0 & 0 & 0
\end{bmatrix}}
\\
\begin{bmatrix}
0 & w_{0,0} & w_{0,1} & w_{0,2} \\
0 & w_{1,0} & w_{1,1} & w_{1,2} \\
0 & w_{2,0} & w_{2,1} & w_{2,2} \\
0 & 0 & 0 & 0
\end{bmatrix}
\\
\begin{bmatrix}
0 & 0 & 0 & 0 \\
w_{0,0} & w_{0,1} & w_{0,2} & 0 \\
w_{1,0} & w_{1,1} & w_{1,2} & 0 \\
w_{2,0} & w_{2,1} & w_{2,2} & 0
\end{bmatrix}
\\
\begin{bmatrix}
0 & 0 & 0 & 0 \\
0 & w_{0,0} & w_{0,1} & w_{0,2} \\
0 & w_{1,0} & w_{1,1} & w_{1,2} \\
0 & w_{2,0} & w_{2,1} & w_{2,2}
\end{bmatrix}
\end{matrix}
*
\textcolor{red}{
\begin{bmatrix}
x_{00} & x_{01} & x_{02} & x_{03} \\
x_{10} & x_{11} & x_{12} & x_{13} \\
x_{20} & x_{21} & x_{22} & x_{23} \\
x_{30} & x_{31} & x_{32} & x_{33}
\end{bmatrix}}

而转置卷积是在上述矩阵乘法形式的卷积的基础上, 将变换矩阵转置. 从而原本 $\overset{4\times4}{16}\rightarrow \overset{2\times2}{4}$ 的线性变换, 变成了 $\overset{2\times2}{4}\rightarrow \overset{4\times4}{16}$ 的线性变换. 如此可以对图片进行放大, 进行上采样.

$\scriptsize{ \begin{bmatrix} \textcolor{red}{y'_{00}} \\ y'_{01} \\ y'_{02} \\ y'_{03} \\ \vdots \\ y'_{30} \\ y'_{31} \\ y'_{32} \\ y'_{33} \end{bmatrix} = \left[\begin{array}{cccc} \textcolor{red}{w_{0,0}} & \textcolor{red}{0} & \textcolor{red}{0} & \textcolor{red}{0} \\ w_{0,1} & w_{0,0} & 0 & 0 \\ w_{0,2} & w_{0,1} & 0 & 0 \\ 0 & w_{0,2} & 0 & 0 \\ \hline w_{1,0} & 0 & w_{0,0} & 0 \\ w_{1,1} & w_{1,0} & w_{0,1} & w_{0,0} \\ w_{1,2} & w_{1,1} & w_{0,2} & w_{0,1} \\ 0 & w_{1,2} & 0 & w_{0,2}\\ \hline w_{2,0} & 0 & w_{1,0} & 0 \\ w_{2,1} & w_{2,0} & w_{1,1} & w_{1,0} \\ w_{2,2} & w_{2,1} & w_{1,2} & w_{1,1} \\ 0 & w_{2,2} & 0 & w_{1,2} \\ \hline 0 & 0 & w_{2,0} & 0 \\ 0 & 0 & w_{2,1} & w_{2,0} \\ 0 & 0 & w_{2,2} & w_{2,1} \\ 0 & 0 & 0 & w_{2,2} \end{array}\right] \textcolor{red}{ \begin{bmatrix} x'_{00} \\ x'_{01} \\ x'_{10} \\ x'_{11} \end{bmatrix}} }$

转置矩阵中的每一列都对应着一个卷积的窗口, 上面的过程对应着, 卷积窗口和输入图片作乘积再求和, 如下示意:

\tiny
\begin{split}
\begin{bmatrix}
y'_{00} & y'_{01} & y'_{02} & y'_{03}\\
y'_{10} & y'_{11} & y'_{12} & y'_{13}\\
y'_{20} & y'_{21} & y'_{22} & y'_{23}\\
y'_{30} & y'_{31} & y'_{32} & y'_{33}
\end{bmatrix}
&=
\left\{
\textcolor{red}{
\begin{bmatrix}
w_{0,0} & w_{0,1} & w_{0,2} & 0 \\
w_{1,0} & w_{1,1} & w_{1,2} & 0 \\
w_{2,0} & w_{2,1} & w_{2,2} & 0 \\
0 & 0 & 0 & 0
\end{bmatrix}},
\begin{bmatrix}
0 & w_{0,0} & w_{0,1} & w_{0,2} \\
0 & w_{1,0} & w_{1,1} & w_{1,2} \\
0 & w_{2,0} & w_{2,1} & w_{2,2} \\
0 & 0 & 0 & 0
\end{bmatrix},
\begin{bmatrix}
0 & 0 & 0 & 0 \\
w_{0,0} & w_{0,1} & w_{0,2} & 0 \\
w_{1,0} & w_{1,1} & w_{1,2} & 0 \\
w_{2,0} & w_{2,1} & w_{2,2} & 0
\end{bmatrix},
\begin{bmatrix}
0 & 0 & 0 & 0 \\
0 & w_{0,0} & w_{0,1} & w_{0,2} \\
0 & w_{1,0} & w_{1,1} & w_{1,2} \\
0 & w_{2,0} & w_{2,1} & w_{2,2}
\end{bmatrix}
\right\}
\begin{bmatrix}
\textcolor{red}{x'_{00}}\\ x'_{01}\\ x'_{10}\\ x'_{11}
\end{bmatrix}\\
&=
\begin{bmatrix}
w_{0,0} & w_{0,1} & w_{0,2} & 0 \\
w_{1,0} & w_{1,1} & w_{1,2} & 0 \\
w_{2,0} & w_{2,1} & w_{2,2} & 0 \\
0 & 0 & 0 & 0
\end{bmatrix}x'_{00}+
\begin{bmatrix}
0 & w_{0,0} & w_{0,1} & w_{0,2} \\
0 & w_{1,0} & w_{1,1} & w_{1,2} \\
0 & w_{2,0} & w_{2,1} & w_{2,2} \\
0 & 0 & 0 & 0
\end{bmatrix}x'_{01}+
\begin{bmatrix}
0 & 0 & 0 & 0 \\
w_{0,0} & w_{0,1} & w_{0,2} & 0 \\
w_{1,0} & w_{1,1} & w_{1,2} & 0 \\
w_{2,0} & w_{2,1} & w_{2,2} & 0
\end{bmatrix}x'_{10}+
\begin{bmatrix}
0 & 0 & 0 & 0 \\
0 & w_{0,0} & w_{0,1} & w_{0,2} \\
0 & w_{1,0} & w_{1,1} & w_{1,2} \\
0 & w_{2,0} & w_{2,1} & w_{2,2}
\end{bmatrix}x'_{11}\\
&=
\left\{
\begin{matrix}
\textcolor{red}{
\begin{bmatrix}
w_{0,0} & w_{0,1} & w_{0,2} & 0 \\
w_{1,0} & w_{1,1} & w_{1,2} & 0 \\
w_{2,0} & w_{2,1} & w_{2,2} & 0 \\
0 & 0 & 0 & 0
\end{bmatrix}}&
\begin{bmatrix}
0 & w_{0,0} & w_{0,1} & w_{0,2} \\
0 & w_{1,0} & w_{1,1} & w_{1,2} \\
0 & w_{2,0} & w_{2,1} & w_{2,2} \\
0 & 0 & 0 & 0
\end{bmatrix}\\
\begin{bmatrix}
0 & 0 & 0 & 0 \\
w_{0,0} & w_{0,1} & w_{0,2} & 0 \\
w_{1,0} & w_{1,1} & w_{1,2} & 0 \\
w_{2,0} & w_{2,1} & w_{2,2} & 0
\end{bmatrix}&
\begin{bmatrix}
0 & 0 & 0 & 0 \\
0 & w_{0,0} & w_{0,1} & w_{0,2} \\
0 & w_{1,0} & w_{1,1} & w_{1,2} \\
0 & w_{2,0} & w_{2,1} & w_{2,2}
\end{bmatrix}
\end{matrix}
\right\}*
\begin{bmatrix}
\textcolor{red}{x'_{00}} & x'_{01} \\ x'_{10} & x'_{11}
\end{bmatrix}
\end{split}

这样相当于 $\left[\begin{smallmatrix}x_{00}&x_{01}\\x_{10}&x_{11}\end{smallmatrix}\right]$ 作为权重作用在 $4\times4$ 的输出上

\scriptsize
\left[
\begin{array}{ccc}
x_{00} & \fbox{$\begin{matrix} x_{00}+x_{01} \end{matrix}$} & x_{01}\\\\
\fbox{$\begin{matrix} x_{00}\\+\\x_{10} \end{matrix}$} &
\fbox{$\begin{matrix} x_{00}+x_{01}\\+\\x_{10}+x_{11} \end{matrix}$} &
\fbox{$\begin{matrix} x_{01}\\+\\x_{11} \end{matrix}$}\\\\
x_{10} & \fbox{$\begin{matrix} x_{10}+x_{11} \end{matrix}$} & x_{11}\\
\end{array}
\right]

这个结果相当于用原卷积核左右镜像+上下镜像后的矩阵作为卷积核, 对两层 padding 的输入图像作卷积

$\fbox{$\begin{matrix} w_{2,2} & w_{2,1} & w_{2,0} \\ w_{1,2} & w_{1,1} & w_{1,0} \\ w_{0,2} & w_{0,1} & w_{0,0} \end{matrix}$} \begin{bmatrix} 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & x'_{00} & x'_{01} & 0 & 0 \\ 0 & 0 & x'_{10} & x'_{11} & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 \\ \end{bmatrix}$

ConvTranspose2d

torch.nn.ConvTranspose2d

import torch

# Build a random input batch laid out as (N, C, H, W):
# batch_size=1, in_channels=32, height=28, width=28
input = torch.randn((1, 32, 28, 28))

# Define a transposed convolution
# 4.5 No zero padding, non-unit strides, transposed
# k=2, s=2, p=0 => o'=s(i'-1)+k=2i'
# The output is twice the spatial size of the input: 28x28 -> 56x56
tranposed_conv = torch.nn.ConvTranspose2d(32, 16, kernel_size=2, stride=2)

# Apply the transposed convolution defined above to the input
output = tranposed_conv(input)

for shape in (input.shape, output.shape, tranposed_conv.weight.shape):
    print(shape)
# torch.Size([1, 32, 28, 28])
# torch.Size([1, 16, 56, 56])
# torch.Size([32, 16, 2, 2])

ModuleList

torch.nn.ModuleList

定义一个简单的 ModuleList, 其中包括 4 个 Module

import torch
import torch.nn as nn

# A ModuleList holding four layers
modulelist = nn.ModuleList([
    nn.Conv2d(1, 20, 5),
    nn.ReLU(),
    nn.Conv2d(20, 64, 5),
    nn.ReLU(),
])

# Print the ModuleList
print(modulelist)
# ModuleList(
#   (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
#   (1): ReLU()
#   (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
#   (3): ReLU()
# )

# Print the type of the ModuleList.
# Use the builtin type(): `modulelist.type` is the bound method Module.type
# (used to cast parameters and buffers), not the class of the object.
print(type(modulelist))
# <class 'torch.nn.modules.container.ModuleList'>

可以迭代

# Iterate over the ModuleList directly to print each Module it holds
for layer in modulelist:
    print(layer)
# Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
# ReLU()
# Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
# ReLU()

# Iterate with enumerate to print each Module together with its position
for index, layer in enumerate(modulelist):
    print(index, layer)
# 0 Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
# 1 ReLU()
# 2 Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
# 3 ReLU()

可以使用 [] 中括号来访问 ModuleList 中的元素, 下标从 0 开始, 注意不要越界

# Access the elements of the ModuleList by index with []
# The upper bound comes from len() so the index can never go out of range,
# even if layers are added to or removed from the list.
for i in range(len(modulelist)):
    print(modulelist[i])
# Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
# ReLU()
# Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
# ReLU()

借助 ModuleList 来定义一个简单的网络模型

class Net(nn.Module):
    """Network that applies the layers of a ModuleList one after another.

    Args:
        layers: Optional iterable of ``nn.Module`` objects to chain. When
            omitted, the module-level ``modulelist`` is used.
    """

    def __init__(self, layers=None) -> None:
        super().__init__()
        # Keep the container as an attribute so its layers are registered as
        # submodules; a ModuleList that is only referenced as a global is
        # invisible to parameters(), state_dict() and .to().
        self.layers = modulelist if layers is None else nn.ModuleList(layers)

    def forward(self, x):
        """Pass ``x`` through every layer in order and return the result."""
        for module in self.layers:
            x = module(x)
        return x

测试这个模型

# Input: batch_size=1, channels=1, height=100, width=100
input = torch.randn(1, 1, 100, 100)

# Instantiate the network
net = Net()

# Output: call the module itself instead of net.forward(), so that
# Module.__call__ runs any registered hooks around forward()
output = net(input)

print(input.shape)  #torch.Size([1, 1, 100, 100])
print(output.shape) #torch.Size([1, 64, 92, 92])

Sequential

torch.nn.Sequential

Sequential 和 ModuleList 用法非常相似, 不同的地方在于 ModuleList 只是单纯的一个 Module 的 List, 而 Sequential 则将其中的 Module 按顺序串联成一个模型. PyTorch 中可以直接调用定义好的 Sequential, 而 ModuleList 则不行.

定义一个简单的 Sequential Container

import torch
import torch.nn as nn

# A Sequential container chaining four layers in order
sequential = nn.Sequential(
    nn.Conv2d(1, 20, 5),
    nn.ReLU(),
    nn.Conv2d(20, 64, 5),
    nn.ReLU(),
)

# Print the Sequential itself
print(sequential)
# Sequential(
#   (0): Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
#   (1): ReLU()
#   (2): Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
#   (3): ReLU()
# )

# Print the type of the Sequential.
# Use the builtin type(): `sequential.type` is the bound method Module.type
# (used to cast parameters and buffers), not the class of the object.
print(type(sequential))
# <class 'torch.nn.modules.container.Sequential'>

可以迭代

# Iterate over the Sequential directly to print each Module it holds
for layer in sequential:
    print(layer)
# Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
# ReLU()
# Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
# ReLU()

# Iterate with enumerate to print each Module together with its position
for index, layer in enumerate(sequential):
    print(index, layer)
# 0 Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
# 1 ReLU()
# 2 Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
# 3 ReLU()

可以使用 [] 中括号来访问 Sequential 中的元素, 下标从 0 开始, 注意不要越界

# Access the elements of the Sequential by index with []
# The upper bound comes from len() so the index can never go out of range,
# even if layers are added to or removed from the container.
for i in range(len(sequential)):
    print(sequential[i])
# Conv2d(1, 20, kernel_size=(5, 5), stride=(1, 1))
# ReLU()
# Conv2d(20, 64, kernel_size=(5, 5), stride=(1, 1))
# ReLU()

Sequential 本身已经是一个模型, 可以直接调用

# Input: batch_size=1, channels=1, height=100, width=100
input = torch.randn((1, 1, 100, 100))
# The Sequential is called like any other model
output = sequential(input)

for tensor in (input, output):
    print(tensor.shape)
# torch.Size([1, 1, 100, 100])
# torch.Size([1, 64, 92, 92])

functional

torch.nn.functional

one_hot

torch.nn.functional.one_hot

import torch
from torch.nn import functional as F

# Class indices cycling through 0, 1, 2
x = torch.arange(5) % 3
# tensor([0, 1, 2, 0, 1])

# Without num_classes, the number of columns is inferred from the data
# (largest class value + 1, here 3)
F.one_hot(x)
# tensor([[1, 0, 0],
#         [0, 1, 0],
#         [0, 0, 1],
#         [1, 0, 0],
#         [0, 1, 0]])

# An explicit num_classes widens every row to 5 columns
F.one_hot(x, num_classes=5)
# tensor([[1, 0, 0, 0, 0],
#         [0, 1, 0, 0, 0],
#         [0, 0, 1, 0, 0],
#         [1, 0, 0, 0, 0],
#         [0, 1, 0, 0, 0]])

# A 2-D index tensor: one_hot appends the class axis as the last dimension
y = (torch.arange(6) % 4).reshape(3, 2)
print(y.shape, '\n', y)
# torch.Size([3, 2])
# tensor([[0, 1],
#         [2, 3],
#         [0, 1]])

Y = F.one_hot(y, num_classes=4)
print(Y.shape, '\n', Y)
# torch.Size([3, 2, 4])
# tensor([[[1, 0, 0, 0],
#          [0, 1, 0, 0]],

#         [[0, 0, 1, 0],
#          [0, 0, 0, 1]],

#         [[1, 0, 0, 0],
#          [0, 1, 0, 0]]])

nn.Dropout

torch.nn.Dropout

torch.autocast

AUTOMATIC MIXED PRECISION PACKAGE - TORCH.AMP{target="_blank"}

import torch
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from tqdm import tqdm

# Train a linear MNIST classifier for 3 epochs, running the forward pass
# (model + loss) under CPU autocast with bfloat16.

# loading data
# NOTE: download=True fetches MNIST into dataset/ when it is not already there
trainset = datasets.MNIST(root="dataset/", train=True, transform=transforms.ToTensor(), download=True)
trainloader = DataLoader(dataset=trainset, batch_size=16)

# linear classifier: 28*28 -> 10
model = torch.nn.Linear(in_features=28*28, out_features=10)

# initializing loss function and optimizer
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# training
for epoch in range(3):
    for images, labels in tqdm(trainloader, desc=f"epoch {epoch}"):
        # images.shape: [16, 1, 28, 28], labels.shape: [16]
        # flatten every image to a vector so it matches the Linear layer's in_features
        inputs = images.view(images.size(0), -1)
        # inputs.shape: [16, 784]

        # clear the gradients left over from the previous step
        optimizer.zero_grad()

        # enables autocasting for the forward pass (model + loss)
        with torch.autocast(device_type="cpu", dtype=torch.bfloat16):
            output = model(inputs)
            loss = criterion(output, labels)
        
        # forward without autocasting
        # output = model(inputs)
        # loss = criterion(output, labels)

        # backward
        # backward() and step() run after the autocast block has been exited
        loss.backward()
        optimizer.step()

Torchvision

Documents
pytorch.org/vision
Source code in github for torchvision
pytorch/vision | source code | github

torchvision.transforms

Documents
Transforming and augmenting images
Getting started with transforms v2

Demonstration

import torch
from torchvision.transforms import v2
from PIL import Image
import matplotlib.pyplot as plt

# Show an image and its mask before and after each transform in TransformsList.
# You can download the astronaut.jpg image from
# https://github.com/pytorch/vision/blob/main/gallery/
# and convert the annotation.json file to a mask yourself
path2image = r"./coco/images/astronaut.jpg"
path2mask = r"./mask.png"

TransformsList = [
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    v2.RandomHorizontalFlip(p=0.5), # probability = 0.5
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    # 3 means and stds for 3 channels
]

ToFloatImage = v2.Compose([
    v2.ToImage(), # Convert PIL Image to Torchvision Image
    v2.ToDtype(torch.float32, scale=True) # Convert uint8 to float32
])

# read image and mask
image = Image.open(path2image) # PIL Image
mask = Image.open(path2mask) # PIL Image
# convert PIL Image to Torchvision Image
image = ToFloatImage(image) # Float Torchvision Image
mask = ToFloatImage(mask) # Float Torchvision Image

plt.close()
# One column for the originals plus one per transform; row 0 = image, row 1 = mask
_, axes = plt.subplots(
    nrows=2, ncols=len(TransformsList)+1,
    figsize=(3*(len(TransformsList)+1), 3*2) # (size*cols, size*rows)
)

axes[0][0].imshow(image.permute(1,2,0)) # C×H×W -permute-> H×W×C
axes[0][0].set_title("Original")
axes[1][0].imshow(mask.permute(1,2,0)) # C×H×W -permute-> H×W×C

torch.manual_seed(1)
for i, transform in enumerate(TransformsList):
    # Don't transform image and mask separately when the transform contains
    # random operators: each call would draw its own random parameters.
    # trans_image = transform(image) # <-- Don't
    # trans_mask = transform(mask)  # <-- Don't
    trans_image, trans_mask = transform(image, mask) # <-- Instead
    print(f"{type(transform).__name__}: {trans_image.shape = }")
    print(f"{type(transform).__name__}: {trans_mask.shape = }")
    # RandomResizedCrop: trans_image.shape = torch.Size([3, 224, 224])
    # RandomResizedCrop: trans_mask.shape = torch.Size([1, 224, 224])
    # RandomHorizontalFlip: trans_image.shape = torch.Size([3, 512, 512])
    # RandomHorizontalFlip: trans_mask.shape = torch.Size([1, 512, 512])
    # Normalize: trans_image.shape = torch.Size([3, 512, 512])
    # Normalize: trans_mask.shape = torch.Size([3, 512, 512]) <-- Note the channels of mask 

    axes[0][i+1].imshow(trans_image.permute(1,2,0)) # C×H×W -permute-> H×W×C
    axes[0][i+1].set_title(type(transform).__name__)
    axes[1][i+1].imshow(trans_mask.permute(1,2,0)) # C×H×W -permute-> H×W×C
plt.show()

astronaut-mask-transforms

transforms.v2.Compose

Documents
torchvision.transforms.v2.Compose

Demonstration

# torch must be imported as well: the pipeline refers to torch.uint8 and torch.float32
import torch
from torchvision.transforms import v2

# Typical v2 preprocessing pipeline: to image -> uint8 -> random crop -> float32 -> normalize
transforms = v2.Compose([
    # Convert to tensor, only needed if you had a PIL image
    v2.ToImage(),
    # optional, most input are already uint8 at this point
    v2.ToDtype(torch.uint8, scale=True),
    v2.RandomResizedCrop(size=(224, 224), antialias=True),
    # Or v2.Resize(antialias=True),
    # Normalize expects float input
    v2.ToDtype(torch.float32, scale=True),
    v2.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

Trick

# Sketch: keep the individual transforms in one nn.ModuleList so the same
# list can be composed into a single pipeline and also walked one by one.
from torch import nn
from torchvision.transforms import v2
translist = nn.ModuleList([
    ...  # placeholder: put the transform modules here
])
# Compose every transform held in the list into one callable pipeline
transform = v2.Compose([transform for transform in translist])
# The list can still be iterated to reach each transform separately
for trans in translist:
    pass

v2-api-reference

Documents
v2-api-reference

transforms.v2.ToTensor

Documents
torchvision.transforms.v2.ToTensor

Use the command below instead

v2.Compose([
    v2.ToImage(), # Convert PIL Image to Torchvision Image
    v2.ToDtype(torch.float32, scale=True) # Convert uint8 to float32
])

transforms.v2.ToImage

Documents
transforms.v2.ToImage

transforms.v2.Normalize

Documents
torchvision.transforms.v2.Normalize
Some understanding
If the input image has n channels and v2.Normalize has n means and stds
1
v2.Normalize(mean=[m1, ..., mn], std=[s1, ..., sn])
v2.Normalize normalizes every channel of the input image (or array) with the corresponding mean and std

$channel(i) = (channel(i) - mean(i)) / std(i), \quad i = 1, \cdots, n$

If the input image has only 1 channel and v2.Normalize has n (more than 1) means and stds,
v2.Normalize will return n channels: the single channel is normalized once with each (mean, std) pair

$channel(i) = (channel - mean(i)) / std(i), \quad i = 1, \cdots, n$

Demonstration

import torch
from torchvision.transforms import v2

# Two 3x3 channels: the main diagonal and the anti-diagonal
channel1 = [[1, 0, 0], [0, 1, 0], [0, 0, 1]]
channel2 = [[0, 0, 1], [0, 1, 0], [1, 0, 0]]
t = torch.tensor([channel1, channel2], dtype=torch.float32)
print(t)
# tensor([[[1., 0., 0.],
#          [0., 1., 0.],
#          [0., 0., 1.]],

#         [[0., 0., 1.],
#          [0., 1., 0.],
#          [1., 0., 0.]]])

# Each value v becomes (v - mean) / std with the mean and std of its channel.
# Channel 1 (mean=0, std=1):
#   1 -> (1 - 0) / 1 = 1
#   0 -> (0 - 0) / 1 = 0
# Channel 2 (mean=1, std=1):
#   1 -> (1 - 1) / 1 = 0
#   0 -> (0 - 1) / 1 = -1
transform = v2.Normalize(mean=[0, 1], std=[1, 1])
trans_t = transform(t)
print(trans_t)
# tensor([[[ 1.,  0.,  0.],
#          [ 0.,  1.,  0.],
#          [ 0.,  0.,  1.]],

#         [[-1., -1.,  0.],
#          [-1.,  0., -1.],
#          [ 0., -1., -1.]]])

# Channel 1 (mean=0, std=0.5):
#   1 -> (1 - 0) / 0.5 = 2
#   0 -> (0 - 0) / 0.5 = 0
# Channel 2 (mean=0, std=2):
#   1 -> (1 - 0) / 2 = 0.5
#   0 -> (0 - 0) / 2 = 0
transform = v2.Normalize(mean=[0, 0], std=[0.5, 2])
trans_t = transform(t)
# Print floating point values with one digit after the decimal point
torch.set_printoptions(precision=1)
print(trans_t)
# tensor([[[2.0, 0.0, 0.0],
#          [0.0, 2.0, 0.0],
#          [0.0, 0.0, 2.0]],

#         [[0.0, 0.0, 0.5],
#          [0.0, 0.5, 0.0],
#          [0.5, 0.0, 0.0]]])

Paper that proposed Batch Normalization
Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift

torchvision.datasets

Documents
torchvision.datasets

MNIST

Documents
torchvision.datasets.MNIST
Extended reading
MNIST的均值和方差(0.1307,), (0.3081,)是怎么计算出来的？
Note
MNIST所继承的VisionDataset是制作与torchvision相兼容数据的基类, VisionDataset依然是Dataset的子类

Demonstration

from torchvision import datasets
import random

# train=True loads the training split
train_dataset = datasets.MNIST(root='../dataset/mnist/',
                               train=True,
                               download=False,  # set to True on the first run to download the dataset
                               transform=None)
# train=False loads the test split
# Both splits must share the same root; the path needs the slash after '..'
test_dataset = datasets.MNIST(root='../dataset/mnist/',
                              train=False,
                              download=False,  # set to True on the first run to download the dataset
                              transform=None)

print(type(train_dataset)) # <class 'torchvision.datasets.mnist.MNIST'>
print(len(train_dataset))  # 60000
print(len(test_dataset))   # 10000

# The dataset supports len() and indexing, so random.choice can draw one sample
item = random.choice(train_dataset)
print(type(item))  # <class 'tuple'>
print(item)        # (<PIL.Image.Image image mode=L size=28x28 at 0x1CF74469760>, 5)

Note
train_dataset和test_dataset是两个可以用下标访问的数据集(Dataset)对象, 其中的元素为元组;
train_dataset包含60000个元组; test_dataset包含10000个元组;
每个元组都包含2个元素, 其中第一个元素是图片, 第二个元素是图片的标签.

CIFAR10

该数据集共有 60000 张彩色图像，这些图像是 32*32，分为 10 个类，每类 6000 张图。
这里面有 50000 张用于训练，构成了 5 个训练批，每一批 10000 张图；另外 10000 用于测试，单独构成一批。
测试批的数据里，取自 10 类中的每一类，每一类随机取 1000 张，一共 10000 张，剩下的 50000 张图像就随机排列组成了训练批。
注意一个训练批中的各类图像并不一定数量相同，但总训练集的 50000 张图像中每一类都有 5000 张图。

torch.optim

torch.optim | Pytorch Docs{target="_blank"}

torch.optim 是pytorch的一个工具包, 里面包含了常见的神经网络的优化算法

下面以torch.optim.SGD为例, 介绍如何使用这个工具包

torch.optim.SGD

SGD为随机梯度下降法(stochastic gradient descent)

torch.optim.SGD | Pytorch Docs{target="_blank"}

自定义一个数据集

这个数据集有108个样本, 每个样本都是 $[0,1)$ 上均匀生成的一个64维向量, 每个样本对应一个16维的标签

from torch.utils.data import Dataset
# A custom dataset class that inherits from Dataset
class CustomDataset(Dataset):
    """Pair samples with labels so that item ``i`` is ``(input[i], target[i])``."""

    def __init__(self, input, target):
        self.input = input
        self.target = target

    def __len__(self):
        # The dataset size is taken from the samples
        return len(self.input)

    def __getitem__(self, index):
        sample = self.input[index]
        label = self.target[index]
        return sample, label

import torch
# Build some simple random data
input = torch.rand(108, 64)   # 108 samples, each a 64-dimensional vector
target = torch.rand(108, 16)  # 108 labels, each a 16-dimensional vector

# Wrap the tensors in CustomDataset to get an indexable Dataset
dataset = CustomDataset(input=input, target=target)

from torch.utils.data import DataLoader
# Load the dataset in batches of 18 samples
dataset_loaded = DataLoader(dataset=dataset, batch_size=18)

自定义一个模型

import torch.nn as nn
import torch.nn.functional as F
# A simple two-layer MLP
class Model(nn.Module):
    """Two fully connected layers (64 -> 32 -> 16), each followed by ReLU."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(in_features=64, out_features=32)
        self.layer2 = nn.Linear(in_features=32, out_features=16)

    def forward(self, x):
        hidden = F.relu(self.layer1(x))
        return F.relu(self.layer2(hidden))


# Model instance
model = Model()
# Loss function: mean squared error
loss_fn = nn.MSELoss(reduction="mean")

使用torch.optim.SGD对上面自定义的数据和模型进行训练

import torch.optim as optim
# Build the optimizer from the model's parameters
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

# To give the parameters of different layers different learning rates,
# call optim.SGD with per-parameter groups as shown below
# optimizer = optim.SGD([
#                 {'params': model.layer1.parameters()},
#                 {'params': model.layer2.parameters(), 'lr': 1e-3}
#             ], lr=1e-2, momentum=0.9)

# One pass over the loaded dataset, one optimizer update per batch
for input, target in dataset_loaded:
    optimizer.zero_grad()            # clear gradients from the previous batch
    output = model(input)            # forward pass
    loss = loss_fn(output, target)   # loss between prediction and label
    loss.backward()                  # compute gradients
    optimizer.step()                 # update the parameters

zero_grad
torch.optim.Optimizer.zero_grad

step
torch.optim.Optimizer.step

TensorBoard

Docs
How to use TensorBoard with PyTorch
torch.utils.tensorboard
Video tutorial
Pytorch TensorBoard Tutorial

PyTorch Profiler With TensorBoard

Docs
PyTorch Profiler With TensorBoard
torch-tb-profiler