Self-Attention 和 Multi-Head Attention 实现 demo

#encoding:utf-8

from math import sqrt

import torch

import torch.nn as nn

class Self_Attention(nn.Module):
    """Single-head scaled dot-product self-attention.

    The input is projected to queries, keys and values by three independent
    linear layers, and the result is ``softmax(Q @ K^T / sqrt(dim_k)) @ V``.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dim_k: Output size of the query and key projections.
        dim_v: Output size of the value projection (and of the result).
    """

    def __init__(self, input_dim, dim_k, dim_v):
        super(Self_Attention, self).__init__()
        self.q = nn.Linear(input_dim, dim_k)
        self.k = nn.Linear(input_dim, dim_k)
        self.v = nn.Linear(input_dim, dim_v)
        # Scale for the raw scores; keeps the logits from growing with dim_k.
        self.norm_fact = 1 / sqrt(dim_k)

    def forward(self, x):
        """Apply self-attention to ``x``.

        Args:
            x: 3-D tensor of shape ``(batch, seq_len, input_dim)``
                (``torch.bmm`` below requires exactly three dimensions).

        Returns:
            Tensor of shape ``(batch, seq_len, dim_v)``.
        """
        print("x.shape:", x.shape)
        # nn.Linear has no ``shape`` attribute; report its weight matrix.
        print("q.shape:", self.q.weight.shape)

        Q = self.q(x)
        print("Q.shape:", Q.shape)
        K = self.k(x)
        print("K.shape:", K.shape)
        V = self.v(x)
        print("V.shape:", V.shape)

        # The scale must be applied to the logits *before* the softmax;
        # scaling afterwards would make each attention row sum to
        # norm_fact instead of 1.
        scores = torch.bmm(Q, K.permute(0, 2, 1)) * self.norm_fact
        atten = torch.softmax(scores, dim=-1)

        output = torch.bmm(atten, V)
        return output

# --- Demo: single-head self-attention on a random input -------------------
print("\n")
print("self attention:")

# Random input tensor of shape (4, 3, 1024).
x = torch.randn(4, 3, 1024)
print(x)
print("input size:", x.shape)

# input_dim matches the last dimension of x; the result has dim_v features.
self_attention = Self_Attention(input_dim=1024, dim_k=128, dim_v=5)
res = self_attention(x)

print("\n")
print(res)
print("output size:", res.shape)
print("\n")

class Self_Attention_Muti_Head(nn.Module):
    """Multi-head scaled dot-product self-attention (no output projection).

    Queries, keys and values are produced by three linear layers, split
    along the feature dimension into ``nums_head`` heads, attended
    independently per head, and concatenated back together.

    Args:
        input_dim: Size of the last dimension of the input tensor.
        dim_k: Total query/key width across all heads.
        dim_v: Total value width across all heads (and of the result).
        nums_head: Number of heads; must divide both ``dim_k`` and ``dim_v``.

    Raises:
        AssertionError: If ``dim_k`` or ``dim_v`` is not divisible by
            ``nums_head``.
    """

    def __init__(self, input_dim, dim_k, dim_v, nums_head):
        super(Self_Attention_Muti_Head, self).__init__()
        assert dim_k % nums_head == 0, "dim_k must be divisible by nums_head"
        assert dim_v % nums_head == 0, "dim_v must be divisible by nums_head"

        self.q = nn.Linear(input_dim, dim_k)
        self.k = nn.Linear(input_dim, dim_k)
        self.v = nn.Linear(input_dim, dim_v)

        self.nums_head = nums_head
        self.dim_k = dim_k
        self.dim_v = dim_v
        # Each head only sees dim_k // nums_head key features, so that is
        # the width the scores are normalised by.
        self._norm_fact = 1 / sqrt(dim_k // nums_head)

    def forward(self, x):
        """Apply multi-head self-attention to ``x``.

        Args:
            x: Tensor of shape ``(batch, seq_len, input_dim)``.

        Returns:
            Tensor of shape ``(batch, seq_len, dim_v)``.
        """
        batch, seq_len = x.shape[0], x.shape[1]
        head_k = self.dim_k // self.nums_head
        head_v = self.dim_v // self.nums_head

        # Split the feature dimension into heads, then move the head axis
        # in front of the sequence axis: (batch, heads, seq_len, head_dim).
        # A bare reshape(-1, batch, seq_len, head_dim) would interleave
        # elements from different samples and positions.
        Q = self.q(x).reshape(batch, seq_len, self.nums_head, head_k).transpose(1, 2)
        K = self.k(x).reshape(batch, seq_len, self.nums_head, head_k).transpose(1, 2)
        V = self.v(x).reshape(batch, seq_len, self.nums_head, head_v).transpose(1, 2)

        print("x.shape:", x.shape)
        print("Q.shape", Q.size())

        # Scale the logits before the softmax.
        scores = torch.matmul(Q, K.transpose(-1, -2)) * self._norm_fact
        atten = torch.softmax(scores, dim=-1)

        # (batch, heads, seq_len, head_v) -> (batch, seq_len, heads * head_v)
        output = torch.matmul(atten, V).transpose(1, 2).reshape(batch, seq_len, self.dim_v)
        return output

# --- Demo: multi-head self-attention on a random input --------------------
print("\n")
print("multi head attention:")

# Random input tensor of shape (4, 3, 1024).
x = torch.randn(4, 3, 1024)
print(x)
print(x.shape)

# dim_k=128 and dim_v=6 are both divisible by the 2 heads requested.
self_attention = Self_Attention_Muti_Head(
    input_dim=1024,
    dim_k=128,
    dim_v=6,
    nums_head=2,
)
res = self_attention(x)

print("\n")
print(res)
print(res.shape)


有个问题:

根据文献:https://arxiv.org/pdf/1911.02150.pdf,感觉这里实现的 Multi-Head Attention 和 Grouped-Query Attention 的意思是一样的:

也就是和下面这张经典图中的 Grouped-query 意思是一样的:

哪里没理解到位?

相关推荐
胖达不服输1 分钟前
「日拱一码」021 机器学习——特征工程
人工智能·python·机器学习·特征工程
小哥谈1 小时前
论文解析篇 | YOLOv12:以注意力机制为核心的实时目标检测算法
人工智能·深度学习·yolo·目标检测·机器学习·计算机视觉
screenCui1 小时前
macOS运行python程序遇libiomp5.dylib库冲突错误解决方案
开发语言·python·macos
水龙吟啸1 小时前
从零开始搭建深度学习大厦系列-2.卷积神经网络基础(5-9)
人工智能·pytorch·深度学习·cnn·mxnet
小眼睛羊羊1 小时前
pyinstaller打包paddleocr
python
java1234_小锋1 小时前
基于Python的旅游推荐协同过滤算法系统(去哪儿网数据分析及可视化(Django+echarts))
python·数据分析·旅游
蓝婷儿1 小时前
Python 机器学习核心入门与实战进阶 Day 4 - 支持向量机(SVM)原理与分类实战
python·机器学习·支持向量机
%d%d22 小时前
python 在运行时没有加载修改后的版本
java·服务器·python
HollowKnightZ2 小时前
论文阅读笔记:VI-Net: Boosting Category-level 6D Object Pose Estimation
人工智能·深度学习·计算机视觉
yzx9910132 小时前
AI大模型平台
大数据·人工智能·深度学习·机器学习