1. 试着增加迭代轮数。对比LeNet的结果有什么不同？为什么？

py 复制代码

import torch
from torch import nn
from d2l import torch as d2l

# AlexNet-style network for single-channel input with 10 output classes.
net = nn.Sequential(
    # Stage 1: a large 11x11 window with stride 4 shrinks the output height
    # and width early; 96 output channels is far more than LeNet uses.
    nn.Conv2d(in_channels=1, out_channels=96, kernel_size=11, stride=4, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Stage 2: smaller 5x5 window; padding of 2 keeps height and width
    # unchanged while the channel count grows to 256.
    nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    # Stage 3: three consecutive 3x3 convolutions. Channels grow to 384 and
    # drop back to 256 in the last one; pooling is applied only after the
    # third convolution, not after the first two.
    nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Flatten(),
    # Classifier head: two 4096-unit hidden layers (several times wider than
    # LeNet's), each followed by dropout to reduce overfitting.
    nn.Linear(in_features=6400, out_features=4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=4096, out_features=4096),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    # Output layer: 10 classes for Fashion-MNIST instead of the paper's 1000.
    nn.Linear(in_features=4096, out_features=10),
)

py 复制代码

# Feed one random single-channel 224x224 tensor through the network one
# layer at a time, printing the output shape after every layer.
X = torch.randn((1, 1, 224, 224))
for layer in net:
    X = layer(X)
    print(f"{type(layer).__name__} output shape:\t {X.shape}")

py 复制代码

Conv2d output shape:	 torch.Size([1, 96, 54, 54])
ReLU output shape:	 torch.Size([1, 96, 54, 54])
MaxPool2d output shape:	 torch.Size([1, 96, 26, 26])
Conv2d output shape:	 torch.Size([1, 256, 26, 26])
ReLU output shape:	 torch.Size([1, 256, 26, 26])
MaxPool2d output shape:	 torch.Size([1, 256, 12, 12])
Conv2d output shape:	 torch.Size([1, 384, 12, 12])
ReLU output shape:	 torch.Size([1, 384, 12, 12])
Conv2d output shape:	 torch.Size([1, 384, 12, 12])
ReLU output shape:	 torch.Size([1, 384, 12, 12])
Conv2d output shape:	 torch.Size([1, 256, 12, 12])
ReLU output shape:	 torch.Size([1, 256, 12, 12])
MaxPool2d output shape:	 torch.Size([1, 256, 5, 5])
Flatten output shape:	 torch.Size([1, 6400])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1, 4096])
ReLU output shape:	 torch.Size([1, 4096])
Dropout output shape:	 torch.Size([1, 4096])
Linear output shape:	 torch.Size([1, 10])

py 复制代码

# Fashion-MNIST iterators with batch size 128; the loader is asked to
# resize images to 224, the input size used in the shape check.
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(
    batch_size=batch_size,
    resize=224,
)

py 复制代码

# Train `net` for 10 epochs at learning rate 0.01 on the device returned by
# d2l.try_gpu().
lr = 0.01
num_epochs = 10
d2l.train_ch6(
    net, train_iter, test_iter,
    num_epochs, lr, d2l.try_gpu(),
)

py 复制代码

loss 0.326, train acc 0.880, test acc 0.884
4997.8 examples/sec on cuda:0

py 复制代码

# Same learning rate, 20 epochs.
lr = 0.01
num_epochs = 20
d2l.train_ch6(
    net, train_iter, test_iter,
    num_epochs, lr, d2l.try_gpu(),
)

py 复制代码

loss 0.246, train acc 0.910, test acc 0.901
4995.5 examples/sec on cuda:0

py 复制代码

# Same learning rate, 40 epochs.
lr = 0.01
num_epochs = 40
d2l.train_ch6(
    net, train_iter, test_iter,
    num_epochs, lr, d2l.try_gpu(),
)

py 复制代码

# 10m 17.2s
loss 0.157, train acc 0.941, test acc 0.914
4994.7 examples/sec on cuda:0

2. AlexNet对Fashion-MNIST数据集来说可能太复杂了。

尝试简化模型以加快训练速度，同时确保准确性不会显著下降。
设计一个更好的模型，可以直接在 $28 \times 28$ 图像上工作。

py 复制代码

import torch.nn as nn

# Slimmed-down AlexNet variant: fewer channels and narrower dense layers.
simplified_net = nn.Sequential(
    # Stage 1: 7x7 window with stride 2 to cut parameters and computation.
    nn.Conv2d(in_channels=1, out_channels=32, kernel_size=7, stride=2, padding=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    # Stage 2: 5x5 convolution with a reduced channel count (64).
    nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    # Stage 3: two 3x3 convolutions at 128 channels, then pooling.
    nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=128, out_channels=128, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
    # Collapse the convolutional feature maps into one vector per sample.
    nn.Flatten(),
    # Classifier head: 128 channels * 2 * 2 = 512 inputs, and two hidden
    # layers narrowed to 2048 units, each followed by dropout.
    nn.Linear(in_features=128 * 2 * 2, out_features=2048),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=2048, out_features=2048),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    # Output layer with 10 classes.
    nn.Linear(in_features=2048, out_features=10),
)

print(simplified_net)

py 复制代码

Sequential(
  (0): Conv2d(1, 32, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
  (1): ReLU()
  (2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (3): Conv2d(32, 64, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
  (4): ReLU()
  (5): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (6): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (7): ReLU()
  (8): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (9): ReLU()
  (10): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (11): Flatten(start_dim=1, end_dim=-1)
  (12): Linear(in_features=512, out_features=2048, bias=True)
  (13): ReLU()
  (14): Dropout(p=0.5, inplace=False)
  (15): Linear(in_features=2048, out_features=2048, bias=True)
  (16): ReLU()
  (17): Dropout(p=0.5, inplace=False)
  (18): Linear(in_features=2048, out_features=10, bias=True)
)

py 复制代码

# Feed one random single-channel 28x28 tensor through the simplified
# network one layer at a time, printing the output shape after every layer.
X = torch.randn((1, 1, 28, 28))
for layer in simplified_net:
    X = layer(X)
    print(f"{type(layer).__name__} output shape:\t {X.shape}")

py 复制代码

Conv2d output shape:	 torch.Size([1, 32, 14, 14])
ReLU output shape:	 torch.Size([1, 32, 14, 14])
MaxPool2d output shape:	 torch.Size([1, 32, 7, 7])
Conv2d output shape:	 torch.Size([1, 64, 7, 7])
ReLU output shape:	 torch.Size([1, 64, 7, 7])
MaxPool2d output shape:	 torch.Size([1, 64, 4, 4])
Conv2d output shape:	 torch.Size([1, 128, 4, 4])
ReLU output shape:	 torch.Size([1, 128, 4, 4])
Conv2d output shape:	 torch.Size([1, 128, 4, 4])
ReLU output shape:	 torch.Size([1, 128, 4, 4])
MaxPool2d output shape:	 torch.Size([1, 128, 2, 2])
Flatten output shape:	 torch.Size([1, 512])
Linear output shape:	 torch.Size([1, 2048])
ReLU output shape:	 torch.Size([1, 2048])
Dropout output shape:	 torch.Size([1, 2048])
Linear output shape:	 torch.Size([1, 2048])
ReLU output shape:	 torch.Size([1, 2048])
Dropout output shape:	 torch.Size([1, 2048])
Linear output shape:	 torch.Size([1, 10])

py 复制代码

# Fashion-MNIST iterators with batch size 128 and no resize argument.
batch_size = 128
train_iter, test_iter = d2l.load_data_fashion_mnist(
    batch_size=batch_size,
)

py 复制代码

# Train `simplified_net` for 10 epochs at learning rate 0.01 on the device
# returned by d2l.try_gpu().
lr = 0.01
num_epochs = 10
d2l.train_ch6(
    simplified_net, train_iter, test_iter,
    num_epochs, lr, d2l.try_gpu(),
)

py 复制代码

loss 0.488, train acc 0.818, test acc 0.825
99055.9 examples/sec on cuda:0

py 复制代码

# Same learning rate, 20 epochs.
lr = 0.01
num_epochs = 20
d2l.train_ch6(
    simplified_net, train_iter, test_iter,
    num_epochs, lr, d2l.try_gpu(),
)

py 复制代码

loss 0.347, train acc 0.872, test acc 0.867
100890.6 examples/sec on cuda:0

py 复制代码

# Same learning rate, 40 epochs.
lr = 0.01
num_epochs = 40
d2l.train_ch6(
    simplified_net, train_iter, test_iter,
    num_epochs, lr, d2l.try_gpu(),
)

py 复制代码

loss 0.245, train acc 0.909, test acc 0.893
100245.4 examples/sec on cuda:0

py 复制代码

# Same learning rate, 100 epochs.
lr = 0.01
num_epochs = 100
d2l.train_ch6(
    simplified_net, train_iter, test_iter,
    num_epochs, lr, d2l.try_gpu(),
)

py 复制代码

loss 0.103, train acc 0.962, test acc 0.905
98952.8 examples/sec on cuda:0

3. 修改批量大小，并观察模型精度和GPU显存变化。

py 复制代码

# Batch-size experiment: 256 samples per batch, resize argument 224.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(
    batch_size=batch_size,
    resize=224,
)

py 复制代码

# Batch-size experiment: 1024 samples per batch, resize argument 224.
batch_size = 1024
train_iter, test_iter = d2l.load_data_fashion_mnist(
    batch_size=batch_size,
    resize=224,
)

4. 分析了AlexNet的计算性能。

在AlexNet中主要是哪部分占用显存？
在AlexNet中主要是哪部分需要更多的计算？
计算结果时显存带宽如何？

1. 在 AlexNet 中主要是哪部分占用显存？

在 AlexNet 中，显存的占用主要来源于以下几个方面：

1.1 卷积层和全连接层的权重参数

卷积层权重参数：卷积层有大量的过滤器，每个过滤器都有自己的权重参数。卷积层的权重参数数量可以通过以下公式计算： $\text{卷积层参数数} = (\text{过滤器宽度} \times \text{过滤器高度} \times \text{输入通道数} + 1) \times \text{输出通道数}$，其中，+1 表示每个过滤器都有一个偏置参数。
全连接层权重参数：全连接层的参数量通常比卷积层要多，因为每个节点都与前一层的每个节点相连。全连接层的参数数量可以通过以下公式计算： $\text{全连接层参数数} = (\text{输入节点数} + 1) \times \text{输出节点数}$，其中，+1 表示每个输出节点都有一个偏置参数。

1.2 中间激活值（Feature Maps）

卷积层和全连接层的中间激活值（feature maps）也占用了大量显存。尤其是卷积层的输出特征图，由于其较大的空间维度和较多的通道数，会占用大量的显存。

2. 在 AlexNet 中主要是哪部分需要更多的计算？

在 AlexNet 中，计算量主要集中在卷积层上。

2.1 卷积层计算量

卷积运算：卷积层的计算复杂度通常比全连接层要高。卷积运算的计算量（以乘加次数计）可以通过以下公式计算： $\text{卷积计算量} = \text{输出通道数} \times \text{输出宽度} \times \text{输出高度} \times (\text{输入通道数} \times \text{过滤器宽度} \times \text{过滤器高度})$。由于卷积操作需要对每个位置进行大量的乘法和加法运算，因此计算量非常大。

2.2 全连接层计算量

矩阵乘法：全连接层的计算量主要来自于矩阵乘法。虽然全连接层的参数量可能较大，但其计算量通常不如卷积层大。全连接层的计算量（以乘加次数计）可以通过以下公式计算： $\text{全连接层计算量} = \text{输入节点数} \times \text{输出节点数}$。

3. 计算结果时显存带宽如何？

3.1 显存带宽的重要性

显存带宽：显存带宽是指显存与GPU之间的数据传输速率。对于深度学习模型，显存带宽是一个重要的瓶颈，因为模型在前向传播和反向传播过程中，需要频繁地在显存和GPU之间传输数据。
数据传输量：模型的权重参数和中间激活值需要频繁地从显存传输到GPU进行计算，然后再传回显存进行存储。因此，显存带宽对模型的训练速度和推理速度有着重要的影响。

3.2 AlexNet中的显存带宽

中间激活值传输：在前向传播过程中，每一层的输出（即中间激活值）需要传输到下一层进行计算。在反向传播过程中，梯度信息也需要传输。
参数更新：在每次梯度更新时，权重参数需要从显存传输到GPU进行更新，然后传回显存存储。

3.3 优化显存带宽利用

批处理：通过增加批处理大小，可以更高效地利用显存带宽，因为大批量的数据可以一起传输，减少了传输次数。
模型压缩：减少模型的参数量和中间激活值的大小，如通过剪枝、量化等方法，可以减少显存带宽的需求。
高效的内存管理：使用诸如混合精度训练（使用半精度浮点数进行计算）等技术，可以减少显存带宽的压力。

总的来说，AlexNet 的显存占用主要集中在卷积层和全连接层的权重参数以及中间激活值上；计算量主要集中在卷积层；显存带宽的利用率在数据传输中起着重要作用，是影响模型性能的一个关键因素。

5. 将dropout和ReLU应用于LeNet-5，效果有提升吗？再试试预处理会怎么样？

py 复制代码

# Baseline LeNet-5: Sigmoid activations, average pooling, no dropout.
net = nn.Sequential(
    # Convolutional feature extractor.
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.Sigmoid(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    # Dense classifier: 16 * 5 * 5 = 400 flattened features -> 120 -> 84 -> 10.
    nn.Linear(in_features=16 * 5 * 5, out_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.Sigmoid(),
    nn.Linear(in_features=84, out_features=10),
)

py 复制代码

# Fashion-MNIST iterators with batch size 256 and no resize argument.
batch_size = 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)

py 复制代码

# Train `net` for 10 epochs at learning rate 0.9 on the device returned by
# d2l.try_gpu().
lr = 0.9
num_epochs = 10
d2l.train_ch6(
    net, train_iter, test_iter,
    num_epochs, lr, d2l.try_gpu(),
)

py 复制代码

loss 0.468, train acc 0.825, test acc 0.820
257837.9 examples/sec on cuda:0

py 复制代码

# LeNet-5 variant: ReLU replaces Sigmoid, and dropout (p=0.5) follows each
# of the two hidden dense layers.
net = nn.Sequential(
    # Convolutional feature extractor.
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, padding=2),
    nn.ReLU(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.ReLU(),
    nn.AvgPool2d(kernel_size=2, stride=2),
    nn.Flatten(),
    # Dense classifier: 16 * 5 * 5 = 400 flattened features -> 120 -> 84 -> 10.
    nn.Linear(in_features=16 * 5 * 5, out_features=120),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=120, out_features=84),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(in_features=84, out_features=10),
)

py 复制代码

loss 0.396, train acc 0.861, test acc 0.868
243008.4 examples/sec on cuda:0

深度卷积神经网络（AlexNet）｜现代卷积神经网络｜动手学深度学习

1. 试着增加迭代轮数。对比LeNet的结果有什么不同？为什么？

2. AlexNet对Fashion-MNIST数据集来说可能太复杂了。

3. 修改批量大小，并观察模型精度和GPU显存变化。

4. 分析了AlexNet的计算性能。

1. 在 AlexNet 中主要是哪部分占用显存？

1.1 卷积层和全连接层的权重参数

1.2 中间激活值（Feature Maps）

2. 在 AlexNet 中主要是哪部分需要更多的计算？

2.1 卷积层计算量

2.2 全连接层计算量

3. 计算结果时显存带宽如何？

3.1 显存带宽的重要性

3.2 AlexNet中的显存带宽

3.3 优化显存带宽利用

5. 将dropout和ReLU应用于LeNet-5，效果有提升吗？再试试预处理会怎么样？