Transformer: Multi-Head Attention Mechanism (PyTorch)

  1. Schematic (the computation is recapped as formulas below)

  2. Code

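As a quick recap of the mechanism referenced in item 1, following the standard Transformer formulation: each head applies scaled dot-product attention to its own slice of the embedding, and the head outputs are concatenated and projected back to the model dimension.

latex
\mathrm{Attention}(Q, K, V) = \mathrm{softmax}\!\left(\frac{QK^{\top}}{\sqrt{d_k}}\right)V

\mathrm{MultiHead}(Q, K, V) = \mathrm{Concat}(\mathrm{head}_1, \ldots, \mathrm{head}_h)\,W^{O},
\quad \mathrm{head}_i = \mathrm{Attention}(Q W_i^{Q},\, K W_i^{K},\, V W_i^{V})

Here d_k is the per-head dimension, i.e. head_dim = embed_size / heads (512 / 8 = 64 in the code below).
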
python
import torch
import torch.nn as nn


class Multi_Head_Self_Attention(nn.Module):
    def __init__(self, embed_size, heads):
        super(Multi_Head_Self_Attention, self).__init__()
        self.embed_size = embed_size
        self.heads = heads
        self.head_dim = embed_size // heads
        # embed_size must be divisible by heads so each head gets an equal slice
        assert self.head_dim * heads == embed_size, "embed_size must be divisible by heads"

        self.queries = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.keys = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.values = nn.Linear(self.embed_size, self.embed_size, bias=False)
        self.fc_out = nn.Linear(self.embed_size, self.embed_size, bias=False)

    def forward(self, queries, keys, values, mask):
        N = queries.shape[0]  # batch_size
        query_len = queries.shape[1]  # sequence_length
        key_len = keys.shape[1]  # sequence_length 
        value_len = values.shape[1]  # sequence_length

        queries = self.queries(queries)
        keys = self.keys(keys)
        values = self.values(values)

        # Split the embedding into self.heads pieces
        # batch_size, sequence_length, embed_size(512) --> 
        # batch_size, sequence_length, heads(8), head_dim(64)
        queries = queries.reshape(N, query_len, self.heads, self.head_dim)
        keys = keys.reshape(N, key_len, self.heads, self.head_dim)
        values = values.reshape(N, value_len, self.heads, self.head_dim)

        # batch_size, sequence_length, heads(8), head_dim(64) --> 
        # batch_size, heads(8), sequence_length, head_dim(64)
        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)

        # Scaled dot-product attention
        score = torch.matmul(queries, keys.transpose(-2, -1)) / (self.head_dim ** (1/2))

        if mask is not None:
            score = score.masked_fill(mask == 0, float("-inf"))
        # batch_size, heads(8), sequence_length, sequence_length
        attention = torch.softmax(score, dim=-1)

        out = torch.matmul(attention, values)
        # batch_size, heads(8), sequence_length, head_dim(64) -->
        # batch_size, sequence_length, heads(8), head_dim(64) -->
        # batch_size, sequence_length, embed_size(512)
        # merge the heads back together so the output can be fed into the following layers
        out = out.transpose(1, 2).contiguous().reshape(N, query_len, self.embed_size)
        out = self.fc_out(out)

        return out
    

batch_size = 64
sequence_length = 10
embed_size = 512
heads = 8
mask = None

Q = torch.randn(batch_size, sequence_length, embed_size)  
K = torch.randn(batch_size, sequence_length, embed_size)  
V = torch.randn(batch_size, sequence_length, embed_size)  

model = Multi_Head_Self_Attention(embed_size, heads)
output = model(Q, K, V, mask)
print(output.shape)
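
The demo above runs with mask=None. A minimal sketch of how the mask argument could be exercised, continuing the demo, assuming a causal (lower-triangular) mask; the name causal_mask and its shape are illustrative, the only requirements imposed by the masked_fill call above are that blocked positions equal 0 and that the mask broadcasts over the (batch_size, heads, sequence_length, sequence_length) score tensor.

python
# Hypothetical causal mask: each query position attends only to itself and earlier positions.
# Shape (sequence_length, sequence_length) broadcasts over the 4-D score tensor.
causal_mask = torch.tril(torch.ones(sequence_length, sequence_length))

masked_output = model(Q, K, V, causal_mask)
print(masked_output.shape)  # expected: torch.Size([64, 10, 512])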