Code Listing 3-5: Multi-head attention
python
import torch
import torch.nn as nn
class MultiHeadAttention(nn.Module):
    def __init__(self, d_in, d_out,
                 context_length, dropout, num_heads, qkv_bias=False):
        super().__init__()
        assert (d_out % num_heads == 0), \
            "d_out must be divisible by num_heads"
        self.d_out = d_out
        self.num_heads = num_heads
        # Reduce the projection dimension to match the desired output dimension
        self.head_dim = d_out // num_heads
        self.W_query = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_key = nn.Linear(d_in, d_out, bias=qkv_bias)
        self.W_value = nn.Linear(d_in, d_out, bias=qkv_bias)
        # Linear layer to combine the head outputs
        self.out_proj = nn.Linear(d_out, d_out)
        self.dropout = nn.Dropout(dropout)
        self.register_buffer(
            "mask",
            torch.triu(torch.ones(context_length, context_length),
                       diagonal=1)
        )

    def forward(self, x):
        b, num_tokens, d_in = x.shape  # (2, 6, 3)
        # shape: (b, num_tokens, d_out)
        keys = self.W_key(x)  # (2, 6, 3) @ (3, 2) => (2, 6, 2)
        queries = self.W_query(x)
        values = self.W_value(x)
        # Implicitly split the matrices by adding a num_heads dimension,
        # then unroll the last dimension:
        # (b, num_tokens, d_out) -> (b, num_tokens, num_heads, head_dim)
        keys = keys.view(b, num_tokens, self.num_heads, self.head_dim)  # (2, 6, 2) => (2, 6, 2, 1)
        values = values.view(b, num_tokens, self.num_heads, self.head_dim)
        queries = queries.view(
            b, num_tokens, self.num_heads, self.head_dim
        )
        # Transpose from (b, num_tokens, num_heads, head_dim)
        # to (b, num_heads, num_tokens, head_dim)
        keys = keys.transpose(1, 2)  # (2, 6, 2, 1) => (2, 2, 6, 1)
        values = values.transpose(1, 2)
        queries = queries.transpose(1, 2)
        # Compute the dot products for each head
        attn_scores = queries @ keys.transpose(2, 3)  # (2, 2, 6, 1) @ (2, 2, 1, 6) => (2, 2, 6, 6)
        mask_bool = self.mask.bool()[:num_tokens, :num_tokens]
        attn_scores.masked_fill_(mask_bool, -torch.inf)
        attn_weights = torch.softmax(
            attn_scores / keys.shape[-1]**0.5, dim=-1)
        attn_weights = self.dropout(attn_weights)
        # Tensor shape: (b, num_tokens, num_heads, head_dim)
        context_vec = (attn_weights @ values).transpose(1, 2)  # (2, 2, 6, 6) @ (2, 2, 6, 1) => (2, 2, 6, 1) => (2, 6, 2, 1)
        # Combine the heads, where self.d_out = self.num_heads * self.head_dim
        context_vec = context_vec.contiguous().view(
            b, num_tokens, self.d_out
        )  # (2, 6, 2, 1) => (2, 6, 2)
        # Add an optional linear projection
        context_vec = self.out_proj(context_vec)
        return context_vec

inputs = torch.tensor(
    [[0.43, 0.15, 0.89],  # Your     (x^1)
     [0.55, 0.87, 0.66],  # journey  (x^2)
     [0.57, 0.85, 0.64],  # starts
     [0.22, 0.58, 0.33],  # with
     [0.77, 0.25, 0.10],  # one
     [0.05, 0.80, 0.55]]  # step
)
batch = torch.stack((inputs, inputs), dim=0)
torch.manual_seed(123)
batch_size, context_length, d_in = batch.shape # (2, 6, 3)
d_out = 2
mha = MultiHeadAttention(d_in, d_out, context_length, 0, num_heads=2)
context_vecs = mha(batch)
print(context_vecs.shape, context_vecs)
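The core trick in this listing is how `.view` and `.transpose` split `d_out` into per-head slices without copying data. The toy example below is my own illustration, not part of the listing; it assumes b=1, num_tokens=3, d_out=4, and num_heads=2 to make the reshaping visible.
python
import torch

b, num_tokens, d_out, num_heads = 1, 3, 4, 2
head_dim = d_out // num_heads

x = torch.arange(b * num_tokens * d_out, dtype=torch.float32).reshape(b, num_tokens, d_out)
# Split the last dimension into (num_heads, head_dim)...
x_split = x.view(b, num_tokens, num_heads, head_dim)
# ...then move the head dimension forward so each head gets its own
# (num_tokens, head_dim) slice for an independent attention computation.
x_heads = x_split.transpose(1, 2)
print(x.shape, x_heads.shape)  # torch.Size([1, 3, 4]) torch.Size([1, 2, 3, 2])
print(x_heads[0, 0])  # columns 0-1 of every token -> head 0
print(x_heads[0, 1])  # columns 2-3 of every token -> head 1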
Code Listing 4-2: Layer normalization class
Layer normalization improves the stability and efficiency of neural network training. The main idea is to adjust the activations (outputs) of a layer so that they have a mean of 0 and unit variance (a variance of 1): subtract the mean and divide the result by the square root of the variance (that is, the standard deviation). The normalized outputs still contain negative values, but their mean is 0 and their variance is 1. This adjustment helps the weights converge effectively and makes the training process more consistent and reliable.
python
import torch
import torch.nn as nn
torch.manual_seed(123)
batch_example = torch.randn(2, 5)
layer = nn.Sequential(nn.Linear(5, 6), nn.ReLU())
out = layer(batch_example)
print(out)
"""
tensor([[0.2260, 0.3470, 0.0000, 0.2216, 0.0000, 0.0000],
[0.2133, 0.2394, 0.0000, 0.5198, 0.3297, 0.0000]],
grad_fn=<ReluBackward0>)
"""
# Mean
mean = out.mean(dim=-1, keepdim=True)
# Variance
var = out.var(dim=-1, keepdim=True)
print("Mean:", mean)
print("Variance:", var)
"""
Mean: tensor([[0.1324],
[0.2170]], grad_fn=<MeanBackward1>)
Variance: tensor([[0.0231],
[0.0398]], grad_fn=<VarBackward0>)
"""
# Normalize
out_norm = (out - mean) / torch.sqrt(var)
mean = out_norm.mean(dim=-1, keepdim=True)
var = out_norm.var(dim=-1, keepdim=True)
print("out_norm:", out_norm)
"""
out_norm: tensor([[ 0.6159, 1.4126, -0.8719, 0.5872, -0.8719, -0.8719],
[-0.0189, 0.1121, -1.0876, 1.5173, 0.5647, -1.0876]],
grad_fn=<DivBackward0>)
"""
torch.set_printoptions(sci_mode=False)
print("mean:", mean)
print("variance:", var)
"""
mean: tensor([[ 0.0000],
[ 0.0000]], grad_fn=<MeanBackward1>)
variance: tensor([[1.0000],
[1.0000]], grad_fn=<VarBackward0>)
"""
python
import torch
import torch.nn as nn
class LayerNorm(nn.Module):
    def __init__(self, emb_dim):
        super().__init__()
        self.eps = 1e-5  # small constant added to the variance to avoid division by zero
        # Trainable parameters that let the model rescale and shift the normalized values
        self.scale = nn.Parameter(torch.ones(emb_dim))
        self.shift = nn.Parameter(torch.zeros(emb_dim))

    def forward(self, x):
        mean = x.mean(dim=-1, keepdim=True)
        var = x.var(dim=-1, keepdim=True, unbiased=False)  # biased variance (divide by n)
        norm_x = (x - mean) / torch.sqrt(var + self.eps)
        return self.scale * norm_x + self.shift

torch.manual_seed(123)
batch_example = torch.randn(2, 5)
ln = LayerNorm(emb_dim=5)
out_ln = ln(batch_example)
mean = out_ln.mean(dim=-1, keepdim=True)
var = out_ln.var(dim=-1, unbiased=False, keepdim=True)
torch.set_printoptions(sci_mode=False)
print("Mean:", mean)
print("Variance:", var)
"""
Mean: tensor([[ 0.0000],
[ -0.0000]], grad_fn=<MeanBackward1>)
Variance: tensor([[1.0000],
[1.0000]], grad_fn=<VarBackward0>)
"""
Code Listing 4-3: The GELU activation function

python
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
class GELU(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, x):
        # tanh approximation of the GELU activation
        return 0.5 * x * (1 + torch.tanh(
            torch.sqrt(torch.tensor(2.0 / torch.pi)) *
            (x + 0.044715 * torch.pow(x, 3))
        ))


gelu, relu = GELU(), nn.ReLU()
x = torch.linspace(-3, 3, 100)  # Create 100 sample data points between -3 and 3
y_gelu, y_relu = gelu(x), relu(x)
plt.figure(figsize=(8, 3))
for i, (y, label) in enumerate(zip([y_gelu, y_relu], ["GELU", "ReLU"]), 1):
    plt.subplot(1, 2, i)
    plt.plot(x, y)
    plt.title(f"{label} activation function")  # Subplot title
    plt.xlabel("x")  # x-axis label
    plt.ylabel(f"{label}(x)")
    plt.grid(True)  # Show the grid
plt.tight_layout()  # Automatically adjust the layout to prevent label overlap
plt.savefig('gelu_relu')
plt.show()
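For comparison (my own addition, not part of the listing), recent PyTorch versions ship this same tanh approximation as nn.GELU(approximate="tanh"), while nn.GELU() computes the exact erf-based variant; the sketch below checks the match and the size of the approximation error.
python
import torch
import torch.nn as nn

x = torch.linspace(-3, 3, 100)
tanh_gelu = nn.GELU(approximate="tanh")
exact_gelu = nn.GELU()
print(torch.allclose(GELU()(x), tanh_gelu(x), atol=1e-5))  # expected to print: True
print(torch.max(torch.abs(exact_gelu(x) - tanh_gelu(x))).item())  # approximation error stays small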
