AI 基础知识十三 Transformer注意力机制(Attention)

注意力机制

Transformer 的核心是自注意力与多头注意力机制,让序列中每个位置都能动态关注全局相关信息,并行捕捉长程依赖。

自注意力公式

多头注意力公式

计算步骤

参照论文说明

  1. Q、 K矩阵相乘

  2. 缩放处理

  3. 加掩码处理 是可选项

  4. Softmax 归一化指数函数

  5. 与V矩阵相乘

本着简单的原则,用一个实例来说明Q,K,V计算过程

实例

自注意力

实现代码

例子 "Welcome to Machine Learning Pad Pad" 经过**词嵌入**和**位置编码**,得到 6×4 矩阵;为了方便计算,对这个矩阵手动设置特定的数据。

cpp 复制代码
// Vocabulary mapping and construction of the running example input.
/*

	{"Pad",      0},
	{"Welcome",  1},
	{"to",       2},
	{"Machine",  3},
	{"Learning", 4}

	1. Welcome to Machine Learning Pad Pad -- > [1,2,3,4,0,0]

	2. Embedding + PositionalEncoding        ->  x

	3. x: [6 ,4]  

*/

	// Hand-picked "embedding" values so intermediate matrices are easy to
	// read; shape is [1, 6, 4] = [batch, seq, dim].
	auto x = torch::tensor({
				{{1.0, 0.0, 0.0, 0.0}, // Welcome
				 {2.0, 0.0, 0.0, 0.0}, // to
				 {3.0, 0.0, 0.0, 0.0}, // Machine
				 {4.0, 0.0, 0.0, 0.0}, // Learning
				 {0.0, 0.0, 0.0, 0.0}, // Pad
				 {0.0, 0.0, 0.0, 0.0}  // Pad
				} }, torch::kFloat);

	

Q、K、V 是一组权重矩阵,其维度等于词嵌入的维度;为了方便计算,将它们都设定为单位矩阵。

cpp 复制代码
/// Single-head self-attention module (demo). Q/K/V are bias-free Linear
/// projections whose weights are overwritten with the identity matrix so the
/// computation is easy to follow by hand.
class SelfAttention : public torch::nn::Module
{
public:
	SelfAttention()
	{


	}
	/// Create the Q/K/V projections ([dim, dim], no bias) and set their
	/// weights to the identity matrix. dim: embedding dimension.
	void InitQKV(int64_t dim)
	{
		auto linear = torch::nn::LinearOptions(dim, dim).bias(false);

		Q = register_module("q", torch::nn::Linear(linear));
		K = register_module("k", torch::nn::Linear(linear));
		V = register_module("v", torch::nn::Linear(linear));

		norm_fact = 1.0 / sqrt(dim); // scaling factor 1/sqrt(d_k)

		auto onesw = torch::eye(dim);  // identity matrix

		Q->weight.set_data(onesw);
		K->weight.set_data(onesw);
		V->weight.set_data(onesw);
	}

	
	torch::nn::Linear Q{ nullptr };
	torch::nn::Linear K{ nullptr };
	torch::nn::Linear V{ nullptr };
	double norm_fact = 0 ; // 1/sqrt(dim), set by InitQKV
};

torch::nn::Transformer 要求输入张量形状为 **[seq, batch, dim]**,这里简化为 **[seq, dim]**。

参照论文实现计算步骤

cpp 复制代码
	/// Single-head self-attention forward pass for a 2-D input.
	/// x:    [seq, dim] — embedded (and position-encoded) token sequence.
	/// mask: optional additive mask, broadcastable to [seq, seq]; added to the
	///       raw scores before softmax (e.g. -inf to block positions).
	/// Returns the attended output, shape [seq, dim].
	/// NOTE(review): InitQKV re-registers modules on every call, so this demo
	/// forward() is only safe to call once per instance — confirm if reused.
	auto forward(torch::Tensor x,torch::Tensor mask = {})
	{
		torch::Tensor q ;
		torch::Tensor k ;
		torch::Tensor v;
		torch::Tensor kt;
		torch::Tensor out;

		auto dim = x.dim();
	
	
		// x: [seq, dim]
		InitQKV(x.size(1));
		
		/// 1. Project x through Q/K/V; the weights are identity, so q = k = v = x
		 q = Q->forward(x);
		 k = K->forward(x);
		 v = V->forward(x);

		 cout << "q k v \n" << q << endl;

		
		kt = k.transpose(0, 1);// kt is the transpose of k, kt: [dim, seq]
		

		cout << "kt \n" << kt << endl;

		auto attn_score = torch::matmul(q, kt); //2.  q:[seq, dim] X kt: [dim,seq] -> [seq, seq]
		cout << "q X kt \n" << attn_score << endl;

		attn_score = attn_score * norm_fact;     //3. scale scores by 1/sqrt(dim)
		cout << "scale q.X.kt  \n" << attn_score << endl;

		// optional additive mask, applied before softmax
		if (mask.defined())
		{
			attn_score += mask;
		}


		attn_score = torch::softmax(attn_score, -1);//4. softmax: each row sums to 1
		cout << "torch::softmax q.X.kt  \n" << attn_score << endl;

		out = torch::matmul(attn_score, v); /// 5. multiply by V: [seq, seq] X v:[seq, dim] -> [seq, dim]

		cout << "torch::matmul V  \n" << out << endl;
	
		return out;
	}

重点解析

  1. $Q \times K^T$ —— 注意力得分矩阵

实际意义是

矩阵相乘结果

每个字符都能与其他字符产生运算,也就是它能根据上下文来确定语义;设字符序列长度为 N,Transformer 的时间复杂度为 $O(N^2)$。

  1. Softmax 归一化指数函数

数学公式:输入向量

$x = (x_1, x_2, \ldots, x_n)$

第 $i$ 个元素的 Softmax 输出为 $\mathrm{softmax}(x_i) = \dfrac{e^{x_i}}{\sum_{j=1}^{n} e^{x_j}}$

每行内所有数据相加等于1, 原数据按一定比例缩小

  1. 与V矩阵相乘

qkv现在全部建立关系了

当要求输入张量形状为 **[seq, batch, dim]** 时,计算流程完全一样,只需相应地变换处理张量形状。

高维张量矩阵相乘

公式:a[..., M, N] × b[..., N, K] = [..., M, K],可以看到最后两维的运算和二维矩阵相乘一样,前面的维度作为批量维。

整理之后的代码,支持二维、三维输入张量

cpp 复制代码
	/// Single-head self-attention supporting 2-D and 3-D inputs.
	/// x:    [seq, dim] or [batch, seq, dim] (3-D is moved to [seq, batch, dim])
	/// mask: optional additive mask applied to the scores before softmax.
	/// NOTE(review): in the 3-D branch q stays [seq, batch, dim] while kt/v
	/// are batch-first, so the batched matmul only lines up via broadcasting
	/// when batch == 1 — verify before using with batch > 1.
	auto forward(torch::Tensor x,torch::Tensor mask = {})
	{
		torch::Tensor q ;
		torch::Tensor k ;
		torch::Tensor v;
		torch::Tensor kt;
		torch::Tensor out;

		auto dim = x.dim();
	

		if (dim == 3)
		{
			//x:  [batch, seq, dim]  --->  [seq, batch, dim]
			x = x.permute({1,0,2});
			InitQKV(x.size(2));
		}
		else
		{
			// x: [seq, dim]
			InitQKV(x.size(1));
		}
		/// 1. Project x through Q/K/V; the weights are identity, so q = k = v = x
		 q = Q->forward(x);
		 k = K->forward(x);
		 v = V->forward(x);

		 cout << "q k v \n" << q << endl;

		if (dim == 3)
		{
			kt = k.permute({ 1,2,0 }); // kt: [batch, dim, seq]
		 	v = v.permute({ 1,0,2 }); // v: [batch, seq, dim]
		}
		else
		{
			kt = k.transpose(0, 1);// kt is the transpose of k, kt: [dim, seq]
		}

	
		cout << "kt \n" << kt << endl;

		auto attn_score = torch::matmul(q, kt); //2. attention scores: [.., seq, seq]
		cout << "q X kt \n" << attn_score << endl;

		attn_score = attn_score * norm_fact;     //3. scale by 1/sqrt(dim)
		cout << "scale q.X.kt  \n" << attn_score << endl;

		// optional additive mask, applied before softmax
		if (mask.defined())
		{
			attn_score += mask;
		}


		attn_score = torch::softmax(attn_score, -1);//4. softmax: each row sums to 1
		cout << "torch::softmax q.X.kt  \n" << attn_score << endl;

		out = torch::matmul(attn_score, v); /// 5. softmax(q.Kt/sqrt(d)) * V

		cout << "torch::matmul V  \n" << out << endl;
	
		return out;
	}

多头注意力

在"自注意力"的基础上增加

  1. 将维度拆分为多份(多个头),分别用于 Q、K、V 计算

  2. 将多份结果重新拼接

  3. 最后加输出投影

输入两维张量时,写一个函数forward

  1. 输入张量x 形状**[seq, dim], q、k、v** 形状 [seq, dim]

  2. q、k、v 拆分成 [H, S, Dk] , seq简写S, H:头数量, Dk = dim/ H

cpp 复制代码
q = q.view({ seq,H,Dk }); //q: [seq, dim] ->   [S, H, Dk] 
k = k.view({ seq,H,Dk });
v = v.view({ seq,H,Dk });

q = q.permute({ 1,0,2 }); //[S, H, Dk] --->[H, S, Dk]
k = k.permute({ 1,0,2 });
v = v.permute({ 1,0,2 });
  1. 形状 [H, Dk, S]
cpp 复制代码
auto kt = k.permute({ 0,2,1 }); //kt:  [H, S, Dk] --> [H, Dk, S]
  1. 与V矩阵相乘之后 输出形状 [H, S, Dk], 要转换成 [S, H, Dk]
cpp 复制代码
auto out = torch::matmul(attn_score, v); // [H, S, S] * [H, S, Dk]  ->  out: [H, S, Dk]
  1. [S, H, Dk] 拼接成 **[seq, dim],**最后输出投影
cpp 复制代码
out = out.transpose(1, 0).contiguous().view({ seq, dim }); //  [H, S, Dk] --> [S, H, Dk] -> [seq, dim]

cout << "torch::matmul QK * V  \n" << out.squeeze() << endl;

out = Wo->forward(out);

输入三维张量时,写一个函数 forward2 去实现;除了张量形状调整不同外其他都一样,实现细节请看代码。

cpp 复制代码
	/// Multi-head attention for a 2-D input.
	/// x:    [seq, dim] (a size-1 batch dim is squeezed away first)
	/// head: number of heads; dim % head must be 0 (asserted in InitQKV)
	/// mask: optional additive mask applied to the scores before softmax
	/// Returns [seq, dim] after concatenating heads and projecting with Wo.
	/// NOTE(review): squeeze_() removes ALL size-1 dims, so this path assumes
	/// batch == 1 — confirm against callers.
	auto forward(torch::Tensor x, int64_t head = 2, torch::Tensor mask = {})
	{
		x.squeeze_(); //x: [batch, seq ,dim]  -->   [seq, dim]
		assert(x.dim() == 2);
		//x: [seq, dim]
		  
		InitQKV(x.size(1), head);

		auto seq = x.size(0);
		auto dim = x.size(1);

		// Project, then split dim into H heads of size Dk = dim / H
		auto q = Q->forward(x);
		auto k = K->forward(x);
		auto v = V->forward(x);
		q = q.view({ seq,H,Dk }); //q: [seq, dim] ->   [S, H, Dk] 
		k = k.view({ seq,H,Dk });
		v = v.view({ seq,H,Dk });

		q = q.permute({ 1,0,2 }); //[S, H, Dk] --->[H, S, Dk]
		k = k.permute({ 1,0,2 });
		v = v.permute({ 1,0,2 });

		cout << "q k v \n" << q << endl;

		auto kt = k.permute({ 0,2,1 }); //kt:  [H, S, Dk] --> [H, Dk, S]

		cout << "kt \n" << kt << endl;

		auto attn_score = torch::matmul(q, kt);  // [H, S, Dk] *  [H, Dk, S]
		cout << "q X kt \n" << attn_score << endl;


		attn_score = attn_score * norm_fact; // scale by 1/sqrt(dim)
		cout << "scale q.X.kt  \n" << attn_score << endl;

		// optional additive mask
		if (mask.defined())
		{
			attn_score += mask;
		}

		attn_score = torch::softmax(attn_score, -1); /// attn_score: [H, S, S]
		cout << "torch::softmax q.X.kt  \n" << attn_score.squeeze() << endl;

		auto out = torch::matmul(attn_score, v); // [H, S, S] * [H, S, Dk]  ->  out: [H, S, Dk]
		// concatenate heads back into one vector per position
		out = out.transpose(1, 0).contiguous().view({ seq, dim }); //  [H, S, Dk] --> [S, H, Dk] -> [seq, dim]

		cout << "torch::matmul QK * V  \n" << out.squeeze() << endl;
		
		out = Wo->forward(out); // output projection

		return out;
	}


	/// Multi-head attention for a 3-D (batched) input.
	/// x:    [batch, seq, dim]; internally moved to [seq, batch, dim]
	/// head: number of heads; dim % head must be 0 (asserted in InitQKV)
	/// mask: optional additive mask applied to the scores before softmax
	/// NOTE(review): the final transpose(1, 2) + view relabels [B, S, H*Dk]
	/// memory as [seq, batch, dim]; this is only layout-correct when
	/// batch == 1 — verify before batched use.
	auto forward2(torch::Tensor x, int64_t head = 2,torch::Tensor mask = {})
	{
		assert(x.dim() == 3);
	  
		x = x.permute({ 1,0,2 });  //   x: x: [batch, seq, dim]-->  [seq, batch, dim]

		InitQKV(x.size(2), head);

		auto seq = x.size(0);
		auto batch = x.size(1);
		auto dim = x.size(2);

		// Project, then split dim into H heads of size Dk = dim / H
		auto q = Q->forward(x);
		auto k = K->forward(x);
		auto v = V->forward(x);
		q = q.view({ seq,batch,H,Dk}); //q: [seq, batch, dim] ->   [S, B, H, Dk] 
		k = k.view({ seq,batch,H,Dk });
		v = v.view({ seq,batch,H,Dk });
		
		q = q.permute({1,2,0,3}); //[S, B, H, Dk] --->[B, H, S, Dk]
		k = k.permute({ 1,2,0,3 });
		v = v.permute({ 1,2,0,3 });
		
		cout << "q k v \n" << q << endl;

		auto kt = k.permute({ 0,1,3,2}); //kt:  [B, H, S, Dk] --> [B, H, Dk, S]

		cout << "kt \n" << kt.squeeze() << endl;

		auto attn_score = torch::matmul(q, kt); // [B, H, S, S]
		cout << "q X kt \n" << attn_score << endl;


		attn_score = attn_score * norm_fact; // scale by 1/sqrt(dim)
		cout << "scale q.X.kt  \n" << attn_score << endl;

		// optional additive mask
		if (mask.defined())
		{
			attn_score += mask;
		}

		attn_score = torch::softmax(attn_score, -1); /// attn_score: [B, H, S, S]
		cout << "torch::softmax q.X.kt  \n" << attn_score << endl;

		auto out = torch::matmul(attn_score, v); // [B, H, S, S] * [B, H, S, Dk]  ->  out: [B, H, S, Dk]
		out = out.transpose(1, 2).contiguous().view({ seq,batch, dim }); //  [B, H, S, Dk] --> [B, S, H, Dk] -> [seq,batch, dim]
		
		cout << "torch::matmul QK * V  \n" << out << endl;
		
		out = Wo->forward(out); // output projection

		return out;
	}

完整代码

cpp 复制代码
#include <torch/torch.h>
#include <iostream>
#include <torch/serialize.h>
#include <regex>
//#include <iostream>
#include <fstream>
using namespace std;


// Placeholder for a position-wise feed-forward network (FFN) block;
// intentionally left unimplemented in this attention demo.
class FeedForwardNet : public torch::nn::Module
{
	//Q = register_module("q", torch::nn::Linear(linear));
};



/// Single-head self-attention demo:
///   Attention(Q, K, V) = softmax(Q·Kᵀ / sqrt(d)) · V
/// as in "Attention Is All You Need". Q/K/V are bias-free Linear projections
/// whose weights are overwritten with the identity matrix, so intermediate
/// values are easy to verify by hand.
class SelfAttention : public torch::nn::Module
{
public:
	SelfAttention() = default;

	/// Create the Q/K/V projections ([dim, dim], no bias) and set their
	/// weights to the identity matrix.
	/// dim: embedding dimension of the input tokens.
	/// Must only be called once per instance — register_module() throws on a
	/// duplicate name; forward() guards this.
	void InitQKV(int64_t dim)
	{
		auto linear = torch::nn::LinearOptions(dim, dim).bias(false);

		Q = register_module("q", torch::nn::Linear(linear));
		K = register_module("k", torch::nn::Linear(linear));
		V = register_module("v", torch::nn::Linear(linear));

		norm_fact = 1.0 / sqrt(dim); // scaling factor 1/sqrt(d_k)

		auto onesw = torch::eye(dim); // identity matrix

		Q->weight.set_data(onesw);
		K->weight.set_data(onesw);
		V->weight.set_data(onesw);
	}

	/// Scaled dot-product attention.
	/// x:    [seq, dim] or [batch, seq, dim]
	/// mask: optional additive mask applied to the scores before softmax
	///       (e.g. -inf on blocked positions).
	/// Returns [seq, dim] for 2-D input, [batch, seq, dim] for 3-D input.
	auto forward(torch::Tensor x, torch::Tensor mask = {})
	{
		const auto ndim = x.dim();

		// Lazily create the projections on the first call only; calling
		// register_module() twice with the same name would throw.
		if (!Q)
		{
			InitQKV(x.size(-1));
		}

		if (ndim == 3)
		{
			// x: [batch, seq, dim] ---> [seq, batch, dim]
			x = x.permute({ 1, 0, 2 });
		}

		/// 1. Project x; the weights are identity, so q = k = v = x.
		auto q = Q->forward(x);
		auto k = K->forward(x);
		auto v = V->forward(x);

		cout << "q k v \n" << q << endl;

		torch::Tensor kt;
		if (ndim == 3)
		{
			// Move every operand to batch-first so the batched matmul lines
			// up (the original left q as [seq, batch, dim], which only
			// worked via broadcasting when batch == 1):
			// q, v: [batch, seq, dim], kt: [batch, dim, seq]
			q = q.permute({ 1, 0, 2 });
			v = v.permute({ 1, 0, 2 });
			kt = k.permute({ 1, 2, 0 });
		}
		else
		{
			kt = k.transpose(0, 1); // kt is the transpose of k: [dim, seq]
		}

		cout << "kt \n" << kt << endl;

		/// 2. Raw attention scores: [.., seq, dim] x [.., dim, seq] -> [.., seq, seq]
		auto attn_score = torch::matmul(q, kt);
		cout << "q X kt \n" << attn_score << endl;

		/// 3. Scale by 1/sqrt(dim)
		attn_score = attn_score * norm_fact;
		cout << "scale q.X.kt  \n" << attn_score << endl;

		// Optional additive mask, applied before softmax
		if (mask.defined())
		{
			attn_score += mask;
		}

		/// 4. Softmax over the key dimension: each row sums to 1
		attn_score = torch::softmax(attn_score, -1);
		cout << "torch::softmax q.X.kt  \n" << attn_score << endl;

		/// 5. Weighted sum of values: [.., seq, seq] x [.., seq, dim] -> [.., seq, dim]
		auto out = torch::matmul(attn_score, v);

		cout << "torch::matmul V  \n" << out << endl;

		return out;
	}

	torch::nn::Linear Q{ nullptr };
	torch::nn::Linear K{ nullptr };
	torch::nn::Linear V{ nullptr };
	double norm_fact = 0; // 1/sqrt(dim), set by InitQKV
};

/// Multi-head attention demo. Splits dim into `head` heads of size
/// Dk = dim / head, runs scaled dot-product attention per head, concatenates
/// the heads and applies the output projection Wo. All weights are set to the
/// identity matrix so results are easy to verify by hand.
class MultiHeadAttention: public torch::nn::Module
{
public:
	/// Create the Q/K/V/Wo projections (all [dim, dim], no bias) with
	/// identity weights. Requires dim % head == 0.
	/// Must only be called once per instance — register_module() throws on a
	/// duplicate name; forward()/forward2() guard this.
	void InitQKV(int64_t dim, int64_t head = 2)
	{
		assert(dim % head == 0);

		auto linear = torch::nn::LinearOptions(dim, dim).bias(false);

		Q = register_module("q", torch::nn::Linear(linear));
		K = register_module("k", torch::nn::Linear(linear));
		V = register_module("v", torch::nn::Linear(linear));
		Wo = register_module("Wo", torch::nn::Linear(linear)); // output projection

		norm_fact = 1.0 / sqrt(dim); // scaling factor

		Dk = dim / head; // per-head dimension
		H = head;

		auto onesw = torch::eye(dim); // identity matrix
		Q->weight.set_data(onesw);
		K->weight.set_data(onesw);
		V->weight.set_data(onesw);
		Wo->weight.set_data(onesw);
	}

	/// Un-batched multi-head attention.
	/// x:    [seq, dim] (a size-1 batch dim is squeezed away first)
	/// head: number of heads; dim % head must be 0
	/// mask: optional additive mask applied to the scores before softmax
	/// Returns [seq, dim].
	auto forward(torch::Tensor x, int64_t head = 2, torch::Tensor mask = {})
	{
		x.squeeze_(); // x: [1, seq, dim] --> [seq, dim]
		assert(x.dim() == 2);

		// Lazily create the projections on the first call only
		if (!Q)
		{
			InitQKV(x.size(1), head);
		}

		auto seq = x.size(0);
		auto dim = x.size(1);

		// Project, then split dim into H heads of size Dk
		auto q = Q->forward(x);
		auto k = K->forward(x);
		auto v = V->forward(x);
		q = q.view({ seq, H, Dk }); // [seq, dim] -> [S, H, Dk]
		k = k.view({ seq, H, Dk });
		v = v.view({ seq, H, Dk });

		q = q.permute({ 1, 0, 2 }); // [S, H, Dk] -> [H, S, Dk]
		k = k.permute({ 1, 0, 2 });
		v = v.permute({ 1, 0, 2 });

		cout << "q k v \n" << q << endl;

		auto kt = k.permute({ 0, 2, 1 }); // [H, S, Dk] -> [H, Dk, S]

		cout << "kt \n" << kt << endl;

		auto attn_score = torch::matmul(q, kt); // [H, S, Dk] x [H, Dk, S] -> [H, S, S]
		cout << "q X kt \n" << attn_score << endl;

		attn_score = attn_score * norm_fact; // scale by 1/sqrt(dim)
		cout << "scale q.X.kt  \n" << attn_score << endl;

		// Optional additive mask, applied before softmax
		if (mask.defined())
		{
			attn_score += mask;
		}

		attn_score = torch::softmax(attn_score, -1); // [H, S, S]
		cout << "torch::softmax q.X.kt  \n" << attn_score.squeeze() << endl;

		auto out = torch::matmul(attn_score, v); // [H, S, S] x [H, S, Dk] -> [H, S, Dk]
		// Concatenate heads: [H, S, Dk] -> [S, H, Dk] -> [seq, dim]
		out = out.transpose(1, 0).contiguous().view({ seq, dim });

		cout << "torch::matmul QK * V  \n" << out.squeeze() << endl;

		out = Wo->forward(out); // output projection

		return out;
	}

	/// Batched multi-head attention.
	/// x:    [batch, seq, dim]
	/// head: number of heads; dim % head must be 0
	/// mask: optional additive mask applied to the scores before softmax
	/// Returns [seq, batch, dim] — the layout torch::nn::Transformer expects.
	auto forward2(torch::Tensor x, int64_t head = 2, torch::Tensor mask = {})
	{
		assert(x.dim() == 3);

		x = x.permute({ 1, 0, 2 }); // [batch, seq, dim] --> [seq, batch, dim]

		// Lazily create the projections on the first call only
		if (!Q)
		{
			InitQKV(x.size(2), head);
		}

		auto seq = x.size(0);
		auto batch = x.size(1);
		auto dim = x.size(2);

		// Project, then split dim into H heads of size Dk
		auto q = Q->forward(x);
		auto k = K->forward(x);
		auto v = V->forward(x);
		q = q.view({ seq, batch, H, Dk }); // [seq, batch, dim] -> [S, B, H, Dk]
		k = k.view({ seq, batch, H, Dk });
		v = v.view({ seq, batch, H, Dk });

		q = q.permute({ 1, 2, 0, 3 }); // [S, B, H, Dk] -> [B, H, S, Dk]
		k = k.permute({ 1, 2, 0, 3 });
		v = v.permute({ 1, 2, 0, 3 });

		cout << "q k v \n" << q << endl;

		auto kt = k.permute({ 0, 1, 3, 2 }); // [B, H, S, Dk] -> [B, H, Dk, S]

		cout << "kt \n" << kt.squeeze() << endl;

		auto attn_score = torch::matmul(q, kt); // [B, H, S, S]
		cout << "q X kt \n" << attn_score << endl;

		attn_score = attn_score * norm_fact; // scale by 1/sqrt(dim)
		cout << "scale q.X.kt  \n" << attn_score << endl;

		// Optional additive mask, applied before softmax
		if (mask.defined())
		{
			attn_score += mask;
		}

		attn_score = torch::softmax(attn_score, -1); // [B, H, S, S]
		cout << "torch::softmax q.X.kt  \n" << attn_score << endl;

		auto out = torch::matmul(attn_score, v); // [B, H, S, S] x [B, H, S, Dk] -> [B, H, S, Dk]
		// Concatenate heads back to [seq, batch, dim]:
		// [B, H, S, Dk] -> [S, B, H, Dk] -> view [seq, batch, dim].
		// (The original transpose(1, 2) produced [B, S, H, Dk], whose memory
		// layout is [batch, seq, dim] — only correct when batch == 1.)
		out = out.permute({ 2, 0, 1, 3 }).contiguous().view({ seq, batch, dim });

		cout << "torch::matmul QK * V  \n" << out << endl;

		out = Wo->forward(out); // output projection

		return out;
	}

	torch::nn::Linear Q{ nullptr };
	torch::nn::Linear K{ nullptr };
	torch::nn::Linear V{ nullptr };
	torch::nn::Linear Wo{ nullptr };

	double norm_fact = 0; // 1/sqrt(dim), set by InitQKV
	int64_t Dk = 0;       // per-head dimension (dim / H), set by InitQKV
	int64_t H = 1;        // number of heads, set by InitQKV
};


/// Demo driver: runs the single-head and multi-head attention examples on a
/// hand-crafted 6x4 "embedding" so the intermediate matrices are easy to read.
void TransformerAttentionMain()
{
	// "Welcome to Machine Learning Pad Pad" after embedding + positional
	// encoding, simplified to hand-picked values. Shape: [1, 6, 4].
	auto x = torch::tensor({
				{{1.0, 0.0, 0.0, 0.0}, // Welcome
				 {2.0, 0.0, 0.0, 0.0}, // to
				 {3.0, 0.0, 0.0, 0.0}, // Machine
				 {4.0, 0.0, 0.0, 0.0}, // Learning
				 {0.0, 0.0, 0.0, 0.0}, // Pad
				 {0.0, 0.0, 0.0, 0.0}  // Pad
				} }, torch::kFloat);

	cout << "input\n" << x << endl;

	cout << "-------------SelfAttention--------------------\n" << endl;
	auto x1 = x.squeeze(); // [1, 6, 4] -> [6, 4]
	auto atten = SelfAttention();
	atten.forward(x1); // result printed by forward(); return value not needed here
	cout << "-------------SelfAttention--------------------\n" << endl;


	cout << "\n\n-------------MultiHeadAttention--------------------\n" << endl;
	auto multiAtten = MultiHeadAttention();
	multiAtten.forward2(x, 1); // batched input, single head
	cout << "-------------MultiHeadAttention--------------------\n" << endl;

}

感谢大家的支持,如有问题欢迎提问指正。

相关推荐
AI周红伟1 小时前
All in Token, 移动,电信,联通,阿里,百度,华为,字节,Token石油战争,Token经济,百度要“重写”AI价值度量
大数据·人工智能·机器学习·百度·copilot·openclaw
AI周红伟1 小时前
Token经济学:AI时代的新货币战争,All in Token, 新时代的石油战争,华为,阿里,百度,字节的石油战争
大数据·人工智能·机器学习·百度·华为·copilot·openclaw
YUDAMENGNIUBI2 小时前
day20_逻辑回归
算法·机器学习·逻辑回归
XM_jhxx4 小时前
±0.03mm的精度怎么保证?翌东塑胶用AI赋能质量管控升级
人工智能
阿正的梦工坊5 小时前
深入理解 PyTorch 中的 unsqueeze 操作
人工智能·pytorch·python
秦歌6666 小时前
DeepAgents框架详解和文件后端
人工智能·langchain
测试员周周7 小时前
【Appium 系列】第06节-页面对象实现 — LoginPage 实战
开发语言·前端·人工智能·python·功能测试·appium·测试用例
霸道流氓气质7 小时前
基于 Milvus Lite 的 Spring AI RAG 向量库实践方案与示例
人工智能·spring·milvus
ar01237 小时前
AR巡检平台:构筑智能巡检新模式的数字化引擎
人工智能·ar
语音之家7 小时前
【预讲会征集】ACL 2026 论文预讲会
人工智能·论文·acl