（深度学习记录）第TR4周：Pytorch复现Transformer

🍨 本文为🔗365天深度学习训练营中的学习记录博客
🍖 原作者：K同学啊 | 接辅导、项目定制

🏡我的环境：

语言环境：Python3.11.4
编译器：Jupyter Notebook
torcch版本：2.0.1

python 复制代码

import torch
import torch.nn as nn
 
class MultiHeadAttention(nn.Module):
    def __init__(self, hid_dim, n_heads, dropout):
        super().__init__()
 
        self.hid_dim = hid_dim
        self.n_heads = n_heads
 
        # hid_dim必须整除
        assert hid_dim % n_heads == 0
        # 定义wq
        self.w_q = nn.Linear(hid_dim, hid_dim)
        # 定义wk
        self.w_k = nn.Linear(hid_dim, hid_dim)
        # 定义wv
        self.w_v = nn.Linear(hid_dim, hid_dim)
 
        self.fc = nn.Linear(hid_dim, hid_dim)
        self.do = nn.Dropout(dropout)
 
        self.scale = torch.sqrt(torch.FloatTensor([hid_dim//n_heads]))
 
    def forward(self, query, key, value, mask=None):
        # Q与KV在句子长度这一个维度上数值可以不一样
        bsz = query.shape[0]
        Q = self.w_q(query)
        K = self.w_k(key)
        V = self.w_v(value)
 
        # 将QKV拆成多组，方案是将向量直接拆开了
        # (64, 12, 300) -> (64, 12, 6, 50) -> (64, 6, 12, 50)
        # (64, 10, 300) -> (64, 10, 6, 50) -> (64, 6, 10, 50)
        # (64, 10, 300) -> (64, 10, 6, 50) -> (64, 6, 10, 50)
        Q = Q.view(bsz, -1, self.n_heads, self.hid_dim//self.n_heads).permute(0, 2, 1, 3)
        K = K.view(bsz, -1, self.n_heads, self.hid_dim//self.n_heads).permute(0, 2, 1, 3)
        V = V.view(bsz, -1, self.n_heads, self.hid_dim//self.n_heads).permute(0, 2, 1, 3)
 
        # 第1步，Q x K / scale
        # (64, 6, 12, 50) x (64, 6, 50, 10) -> (64, 6, 12, 10)
        attention = torch.matmul(Q, K.permute(0, 1, 3, 2)) / self.scale
 
        # 需要mask掉的地方，attention设置的很小很小
        if mask is not None:
            attention = attention.masked_fill(mask == 0, -1e10)
 
        # 第2步，做softmax 再dropout得到attention
        attention = self.do(torch.softmax(attention, dim=-1))
 
 
        # 第3步，attention结果与k相乘，得到多头注意力的结果
        # (64, 6, 12, 10) x (64, 6, 10, 50) -> (64, 6, 12, 50)
        x = torch.matmul(attention, V)
 
        # 把结果转回去
        # (64, 6, 12, 50) -> (64, 12, 6, 50)
        x = x.permute(0, 2, 1, 3).contiguous()
 
        # 把结果合并
        # (64, 12, 6, 50) -> (64, 12, 300)
        x = x.view(bsz, -1, self.n_heads * (self.hid_dim // self.n_heads))
        x = self.fc(x)
        return x
 
query = torch.rand(64, 12, 300)
key = torch.rand(64, 10, 300)
value = torch.rand(64, 10, 300)
attention = MultiHeadAttention(hid_dim=300, n_heads=6, dropout=0.1)
output = attention(query, key, value)
print(output.shape)

多头注意力机制拓展了模型关注不同位置的能力，赋予Attention层多个"子表示空间"。