1. Absolute Sinusoidal Positional Encoding (Sinusoidal PE)
Source: the original Transformer (Vaswani et al., 2017)
Each position is encoded with fixed sine/cosine functions, so no trainable parameters are needed.
$$PE_{(pos,\ 2i)} = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

$$PE_{(pos,\ 2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

where $pos$ is the position in the sequence, $i$ is the dimension index, and $d_{model}$ is the embedding dimension.
```python
import torch
import math

def sinusoidal_pe(max_len: int, d_model: int) -> torch.Tensor:
    """
    Returns a tensor of shape (max_len, d_model).
    """
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1).float()  # (max_len, 1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
    )  # (d_model/2,)
    pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
    pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
    return pe

# Example
pe = sinusoidal_pe(max_len=512, d_model=128)
print(pe.shape)  # torch.Size([512, 128])
```
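A minimal usage sketch, assuming the `sinusoidal_pe` function above: the fixed encoding is simply added to the token embeddings before the first Transformer layer. The vocabulary size and token IDs below are made-up placeholders.

```python
import torch
import torch.nn as nn

vocab_size, d_model = 1000, 128  # made-up example sizes
embedding = nn.Embedding(vocab_size, d_model)

token_ids = torch.randint(0, vocab_size, (2, 64))  # (batch, seq_len)
token_emb = embedding(token_ids)                   # (2, 64, 128)

pe = sinusoidal_pe(max_len=512, d_model=128)
x = token_emb + pe[: token_ids.size(1)]            # broadcasts over the batch dim
print(x.shape)  # torch.Size([2, 64, 128])
```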
2. Learned Positional Encoding (Learned PE)
Source: BERT, GPT, etc.
The position index is mapped directly to a trainable embedding matrix. Simple and flexible, but it cannot extrapolate to lengths unseen during training.

$$PE = \text{Embedding}(pos), \quad pos \in \{0, 1, \dots, L-1\}$$
```python
import torch
import torch.nn as nn

class LearnedPE(nn.Module):
    def __init__(self, max_len: int, d_model: int):
        super().__init__()
        self.embedding = nn.Embedding(max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch, seq_len, d_model)
        """
        seq_len = x.size(1)
        positions = torch.arange(seq_len, device=x.device)  # (seq_len,)
        return x + self.embedding(positions)  # broadcast addition

# Example
model = LearnedPE(max_len=512, d_model=128)
x = torch.randn(2, 64, 128)
out = model(x)
print(out.shape)  # torch.Size([2, 64, 128])
```
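To make the extrapolation limitation concrete, here is a small sketch (using the `LearnedPE` module above) of what happens when a sequence is longer than `max_len`: the position lookup falls outside the embedding table.

```python
model = LearnedPE(max_len=512, d_model=128)
too_long = torch.randn(2, 600, 128)  # seq_len = 600 > max_len = 512
try:
    model(too_long)
except IndexError as err:
    # position 512 and beyond have no row in the embedding table
    print("cannot extrapolate beyond max_len:", err)
```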
3. Relative Positional Encoding (Relative PE, Shaw et al. 2018)
Source: Self-Attention with Relative Position Representations
Instead of encoding absolute positions, the attention computation is augmented with the relative distance between each query and key.
The modified attention score is:

$$e_{ij} = \frac{\mathbf{q}_i \left(\mathbf{k}_j + \mathbf{a}^{K}_{ij}\right)^{T}}{\sqrt{d_k}}$$

$$\mathbf{a}^{K}_{ij} = w^{K}_{\text{clip}(j-i,\ -k,\ k)}$$

where $\text{clip}(j-i, -k, k)$ truncates the relative distance to $[-k, k]$, and $w^K$ are learnable relative-position vectors.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class RelativeAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int, max_relative: int = 16):
        super().__init__()
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.max_relative = max_relative
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        # Relative-position embeddings: 2*max_relative+1 positions in total
        self.rel_embed = nn.Embedding(2 * max_relative + 1, self.d_k)

    def _relative_index(self, seq_len: int) -> torch.Tensor:
        """Build the relative-position index matrix, shape: (seq_len, seq_len)."""
        range_vec = torch.arange(seq_len)
        dist = range_vec.unsqueeze(0) - range_vec.unsqueeze(1)  # (L, L), entry [i, j] = j - i
        dist_clipped = dist.clamp(-self.max_relative, self.max_relative)
        return dist_clipped + self.max_relative  # shift into [0, 2k]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, L, _ = x.shape
        Q = self.W_q(x).view(B, L, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(B, L, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(B, L, self.n_heads, self.d_k).transpose(1, 2)
        # Standard attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1))  # (B, H, L, L)
        # Relative-position bias
        rel_idx = self._relative_index(L).to(x.device)  # (L, L)
        rel_emb = self.rel_embed(rel_idx)               # (L, L, d_k)
        rel_scores = torch.einsum('bhid,ijd->bhij', Q, rel_emb)
        scores = (scores + rel_scores) / (self.d_k ** 0.5)
        attn = F.softmax(scores, dim=-1)
        out = torch.matmul(attn, V)  # (B, H, L, d_k)
        out = out.transpose(1, 2).contiguous().view(B, L, -1)
        return out

# Example
attn = RelativeAttention(d_model=128, n_heads=4)
x = torch.randn(2, 32, 128)
print(attn(x).shape)  # torch.Size([2, 32, 128])
```
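To see what the clipping does, you can print the relative index matrix for a short sequence. A small illustrative check using the module above, with `max_relative=2` so distances beyond ±2 collapse onto the boundary buckets:

```python
small = RelativeAttention(d_model=128, n_heads=4, max_relative=2)
print(small._relative_index(6))
# tensor([[2, 3, 4, 4, 4, 4],
#         [1, 2, 3, 4, 4, 4],
#         [0, 1, 2, 3, 4, 4],
#         [0, 0, 1, 2, 3, 4],
#         [0, 0, 0, 1, 2, 3],
#         [0, 0, 0, 0, 1, 2]])
```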
4. Rotary Positional Encoding (RoPE)
Source: RoFormer (Su et al., 2021); widely adopted by LLaMA, GPT-NeoX, and others
Core idea: rotate the query/key vectors according to their positions, so that $\mathbf{q}_m^{T} \mathbf{k}_n$ naturally carries information about the relative position $(m - n)$.
In the two-dimensional case, the rotation matrix is:

$$R_{\theta,m} = \begin{pmatrix} \cos m\theta & -\sin m\theta \\ \sin m\theta & \cos m\theta \end{pmatrix}$$

Generalizing to $d$ dimensions, the vector is split into pairs, and the $i$-th pair uses the frequency:

$$\theta_i = 10000^{-2i/d}$$

The inner product then satisfies:

$$(R_{\theta,m} \mathbf{q})^{T} (R_{\theta,n} \mathbf{k}) = \mathbf{q}^{T} R_{\theta,n-m} \mathbf{k}$$
```python
import torch

def precompute_rope_freqs(d: int, max_len: int, base: float = 10000.0):
    """Precompute the RoPE cos/sin caches."""
    # Frequencies: shape (d/2,)
    theta = 1.0 / (base ** (torch.arange(0, d, 2).float() / d))
    positions = torch.arange(max_len).float()  # (max_len,)
    freqs = torch.outer(positions, theta)      # (max_len, d/2)
    cos = torch.cos(freqs)                     # (max_len, d/2)
    sin = torch.sin(freqs)                     # (max_len, d/2)
    return cos, sin

def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """
    x: (batch, n_heads, seq_len, d_head)
    cos/sin: (seq_len, d_head/2)
    """
    seq_len, d = x.shape[-2], x.shape[-1]
    x1 = x[..., : d // 2]  # first half
    x2 = x[..., d // 2 :]  # second half
    cos = cos[:seq_len].unsqueeze(0).unsqueeze(0)  # (1, 1, L, d/2)
    sin = sin[:seq_len].unsqueeze(0).unsqueeze(0)
    # Rotation: [x1, x2] -> [x1*cos - x2*sin, x2*cos + x1*sin]
    x_rot = torch.cat([x1 * cos - x2 * sin,
                       x2 * cos + x1 * sin], dim=-1)
    return x_rot

# Example
d_head, max_len = 64, 512
cos_cache, sin_cache = precompute_rope_freqs(d_head, max_len)
q = torch.randn(2, 8, 32, d_head)  # (batch, heads, seq, d_head)
q_rope = apply_rope(q, cos_cache, sin_cache)
print(q_rope.shape)  # torch.Size([2, 8, 32, 64])
```
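As a quick numerical sanity check of the inner-product property above (a minimal sketch built on the functions and caches just defined): rotating a query/key pair at positions (3, 7) and at (13, 17) should give the same dot product, since only the offset n − m matters.

```python
q = torch.randn(1, 1, 1, d_head)  # a single query vector
k = torch.randn(1, 1, 1, d_head)  # a single key vector

def rotate_at(vec: torch.Tensor, pos: int) -> torch.Tensor:
    """Apply RoPE as if `vec` sat at absolute position `pos`."""
    return apply_rope(vec, cos_cache[pos : pos + 1], sin_cache[pos : pos + 1])

score_a = (rotate_at(q, 3) * rotate_at(k, 7)).sum()    # positions (3, 7), offset 4
score_b = (rotate_at(q, 13) * rotate_at(k, 17)).sum()  # positions (13, 17), offset 4
print(torch.allclose(score_a, score_b, atol=1e-5))     # True
```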
5. ALiBi (Attention with Linear Biases)
Source: Train Short, Test Long (Press et al., 2022)
Instead of modifying the input embeddings, a negative bias proportional to the relative distance is added directly to the attention scores, which naturally supports length extrapolation.

$$\text{Attn}(i, j) = \mathbf{q}_i \mathbf{k}_j^{T} - m_h \cdot |i - j|$$

where $m_h$ is the slope of the $h$-th attention head, assigned as a geometric sequence (for example, with $H = 8$ heads the slopes are $\frac{1}{2}, \frac{1}{4}, \dots, \frac{1}{256}$):

$$m_h = \frac{1}{2^{8h/H}}, \quad h = 1, \dots, H$$
```python
import torch
import math

def get_alibi_slopes(n_heads: int) -> torch.Tensor:
    """Compute the slope m_h for each head."""
    def get_slopes_power_of_2(n):
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        return [start * (start ** i) for i in range(n)]

    if math.log2(n_heads).is_integer():
        slopes = get_slopes_power_of_2(n_heads)
    else:
        # For non-power-of-2 head counts, fill in with slopes from the next power of 2
        closest = 2 ** math.floor(math.log2(n_heads))
        slopes = get_slopes_power_of_2(closest)
        extra = get_slopes_power_of_2(2 * closest)[0::2][:n_heads - closest]
        slopes = slopes + extra
    return torch.tensor(slopes, dtype=torch.float32)

def build_alibi_bias(n_heads: int, seq_len: int) -> torch.Tensor:
    """
    Return the ALiBi bias matrix, shape: (1, n_heads, seq_len, seq_len).
    """
    slopes = get_alibi_slopes(n_heads)  # (n_heads,)
    positions = torch.arange(seq_len)
    # Relative distance matrix: |i - j|, shape (seq_len, seq_len)
    dist = (positions.unsqueeze(0) - positions.unsqueeze(1)).abs().float()
    # Multiply each head by its slope and negate
    bias = -slopes.view(-1, 1, 1) * dist.unsqueeze(0)  # (n_heads, L, L)
    return bias.unsqueeze(0)  # (1, n_heads, L, L)

# Example: add the bias to the attention scores
n_heads, seq_len = 8, 64
alibi_bias = build_alibi_bias(n_heads, seq_len)
print(alibi_bias.shape)  # torch.Size([1, 8, 64, 64])
# Usage: scores = qk_scores + alibi_bias
```
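A minimal sketch of plugging the bias into a plain attention computation; the q/k/v tensors and `d_head` below are made-up example values, not from any particular model. Because the bias depends only on distance, it can be rebuilt for any sequence length at inference time, which is where the extrapolation ability comes from.

```python
import torch.nn.functional as F

d_head = 32
q = torch.randn(1, n_heads, seq_len, d_head)
k = torch.randn(1, n_heads, seq_len, d_head)
v = torch.randn(1, n_heads, seq_len, d_head)

scores = q @ k.transpose(-2, -1) / d_head ** 0.5  # (1, n_heads, L, L)
scores = scores + alibi_bias                      # add the distance penalty
out = F.softmax(scores, dim=-1) @ v
print(out.shape)  # torch.Size([1, 8, 64, 32])

# Rebuild the bias for a longer sequence at inference time; no retraining needed
longer_bias = build_alibi_bias(n_heads, 4096)
print(longer_bias.shape)  # torch.Size([1, 8, 4096, 4096])
```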
Comparison Summary
| Method | Parameters | Length extrapolation | Relative-position aware | Typical models |
|---|---|---|---|---|
| Sinusoidal PE | none | limited | indirect | original Transformer |
| Learned PE | yes | ✗ | no | BERT, GPT-2 |
| Relative PE | few | limited | ✓ | Transformer-XL |
| RoPE | none | good | ✓ | LLaMA, Qwen |
| ALiBi | none | ✓ strongest | ✓ | BLOOM, MPT |