1. Absolute Sinusoidal Positional Encoding (Sinusoidal PE)
Source: the original Transformer (Vaswani et al., 2017)
Each position is encoded with fixed sine/cosine functions, so no trainable parameters are needed.
$$PE_{(pos,\ 2i)} = \sin\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

$$PE_{(pos,\ 2i+1)} = \cos\left(\frac{pos}{10000^{2i/d_{model}}}\right)$$

where $pos$ is the position in the sequence, $i$ is the dimension index, and $d_{model}$ is the embedding dimension.
```python
import torch
import math

def sinusoidal_pe(max_len: int, d_model: int) -> torch.Tensor:
    """
    Returns a tensor of shape (max_len, d_model).
    """
    pe = torch.zeros(max_len, d_model)
    position = torch.arange(0, max_len).unsqueeze(1).float()  # (max_len, 1)
    div_term = torch.exp(
        torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model)
    )  # (d_model/2,)
    pe[:, 0::2] = torch.sin(position * div_term)  # even dimensions
    pe[:, 1::2] = torch.cos(position * div_term)  # odd dimensions
    return pe

# Example
pe = sinusoidal_pe(max_len=512, d_model=128)
print(pe.shape)  # torch.Size([512, 128])
```
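A minimal usage sketch, assuming the `sinusoidal_pe` function above: the fixed encoding is simply added to the token embeddings before the first Transformer layer. The vocabulary size and token IDs below are made-up placeholders.

```python
import torch
import torch.nn as nn

vocab_size, d_model = 1000, 128  # made-up example sizes
embedding = nn.Embedding(vocab_size, d_model)

token_ids = torch.randint(0, vocab_size, (2, 64))  # (batch, seq_len)
token_emb = embedding(token_ids)                   # (2, 64, 128)

pe = sinusoidal_pe(max_len=512, d_model=128)
x = token_emb + pe[: token_ids.size(1)]            # broadcasts over the batch dim
print(x.shape)  # torch.Size([2, 64, 128])
```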
2. Learned Positional Encoding (Learned PE)
Source: BERT, GPT, etc.
The position index is mapped directly to a trainable embedding matrix. Simple and flexible, but it cannot extrapolate to lengths unseen during training.

$$PE = \text{Embedding}(pos), \quad pos \in \{0, 1, \dots, L-1\}$$
```python
import torch
import torch.nn as nn

class LearnedPE(nn.Module):
    def __init__(self, max_len: int, d_model: int):
        super().__init__()
        self.embedding = nn.Embedding(max_len, d_model)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """
        x: (batch, seq_len, d_model)
        """
        seq_len = x.size(1)
        positions = torch.arange(seq_len, device=x.device)  # (seq_len,)
        return x + self.embedding(positions)  # broadcast addition

# Example
model = LearnedPE(max_len=512, d_model=128)
x = torch.randn(2, 64, 128)
out = model(x)
print(out.shape)  # torch.Size([2, 64, 128])
```
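To make the extrapolation limitation concrete, here is a small sketch (using the `LearnedPE` module above) of what happens when a sequence is longer than `max_len`: the position lookup falls outside the embedding table.

```python
model = LearnedPE(max_len=512, d_model=128)
too_long = torch.randn(2, 600, 128)  # seq_len = 600 > max_len = 512
try:
    model(too_long)
except IndexError as err:
    # position 512 and beyond have no row in the embedding table
    print("cannot extrapolate beyond max_len:", err)
```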
3. Relative Positional Encoding (Relative PE, Shaw et al. 2018)
Source: Self-Attention with Relative Position Representations
Instead of encoding absolute positions, the attention computation is augmented with the relative distance between each query and key.
The modified attention score is:

$$e_{ij} = \frac{\mathbf{q}_i \left(\mathbf{k}_j + \mathbf{a}^{K}_{ij}\right)^{T}}{\sqrt{d_k}}$$

$$\mathbf{a}^{K}_{ij} = w^{K}_{\text{clip}(j-i,\ -k,\ k)}$$

where $\text{clip}(j-i, -k, k)$ truncates the relative distance to $[-k, k]$, and $w^K$ are learnable relative-position vectors.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F

class RelativeAttention(nn.Module):
    def __init__(self, d_model: int, n_heads: int, max_relative: int = 16):
        super().__init__()
        self.n_heads = n_heads
        self.d_k = d_model // n_heads
        self.max_relative = max_relative
        self.W_q = nn.Linear(d_model, d_model)
        self.W_k = nn.Linear(d_model, d_model)
        self.W_v = nn.Linear(d_model, d_model)
        # Relative-position embeddings: 2*max_relative+1 positions in total
        self.rel_embed = nn.Embedding(2 * max_relative + 1, self.d_k)

    def _relative_index(self, seq_len: int) -> torch.Tensor:
        """Build the relative-position index matrix, shape: (seq_len, seq_len)."""
        range_vec = torch.arange(seq_len)
        dist = range_vec.unsqueeze(0) - range_vec.unsqueeze(1)  # (L, L), entry [i, j] = j - i
        dist_clipped = dist.clamp(-self.max_relative, self.max_relative)
        return dist_clipped + self.max_relative  # shift into [0, 2k]

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        B, L, _ = x.shape
        Q = self.W_q(x).view(B, L, self.n_heads, self.d_k).transpose(1, 2)
        K = self.W_k(x).view(B, L, self.n_heads, self.d_k).transpose(1, 2)
        V = self.W_v(x).view(B, L, self.n_heads, self.d_k).transpose(1, 2)
        # Standard attention scores
        scores = torch.matmul(Q, K.transpose(-2, -1))  # (B, H, L, L)
        # Relative-position bias
        rel_idx = self._relative_index(L).to(x.device)  # (L, L)
        rel_emb = self.rel_embed(rel_idx)               # (L, L, d_k)
        rel_scores = torch.einsum('bhid,ijd->bhij', Q, rel_emb)
        scores = (scores + rel_scores) / (self.d_k ** 0.5)
        attn = F.softmax(scores, dim=-1)
        out = torch.matmul(attn, V)  # (B, H, L, d_k)
        out = out.transpose(1, 2).contiguous().view(B, L, -1)
        return out

# Example
attn = RelativeAttention(d_model=128, n_heads=4)
x = torch.randn(2, 32, 128)
print(attn(x).shape)  # torch.Size([2, 32, 128])
```
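To see what the clipping does, you can print the relative index matrix for a short sequence. A small illustrative check using the module above, with `max_relative=2` so distances beyond ±2 collapse onto the boundary buckets:

```python
small = RelativeAttention(d_model=128, n_heads=4, max_relative=2)
print(small._relative_index(6))
# tensor([[2, 3, 4, 4, 4, 4],
#         [1, 2, 3, 4, 4, 4],
#         [0, 1, 2, 3, 4, 4],
#         [0, 0, 1, 2, 3, 4],
#         [0, 0, 0, 1, 2, 3],
#         [0, 0, 0, 0, 1, 2]])
```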
4. Rotary Positional Encoding (RoPE)
Source: RoFormer (Su et al., 2021); widely adopted by LLaMA, GPT-NeoX, and others
Core idea: rotate the query/key vectors according to their positions, so that $\mathbf{q}_m^{T} \mathbf{k}_n$ naturally carries information about the relative position $(m - n)$.
In the two-dimensional case, the rotation matrix is:

$$R_{\theta,m} = \begin{pmatrix} \cos m\theta & -\sin m\theta \\ \sin m\theta & \cos m\theta \end{pmatrix}$$

Generalizing to $d$ dimensions, the vector is split into pairs, and the $i$-th pair uses the frequency:

$$\theta_i = 10000^{-2i/d}$$

The inner product then satisfies:

$$(R_{\theta,m} \mathbf{q})^{T} (R_{\theta,n} \mathbf{k}) = \mathbf{q}^{T} R_{\theta,n-m} \mathbf{k}$$
```python
import torch

def precompute_rope_freqs(d: int, max_len: int, base: float = 10000.0):
    """Precompute the RoPE cos/sin caches."""
    # Frequencies: shape (d/2,)
    theta = 1.0 / (base ** (torch.arange(0, d, 2).float() / d))
    positions = torch.arange(max_len).float()  # (max_len,)
    freqs = torch.outer(positions, theta)      # (max_len, d/2)
    cos = torch.cos(freqs)                     # (max_len, d/2)
    sin = torch.sin(freqs)                     # (max_len, d/2)
    return cos, sin

def apply_rope(x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor) -> torch.Tensor:
    """
    x: (batch, n_heads, seq_len, d_head)
    cos/sin: (seq_len, d_head/2)
    """
    seq_len, d = x.shape[-2], x.shape[-1]
    x1 = x[..., : d // 2]  # first half
    x2 = x[..., d // 2 :]  # second half
    cos = cos[:seq_len].unsqueeze(0).unsqueeze(0)  # (1, 1, L, d/2)
    sin = sin[:seq_len].unsqueeze(0).unsqueeze(0)
    # Rotation: [x1, x2] -> [x1*cos - x2*sin, x2*cos + x1*sin]
    x_rot = torch.cat([x1 * cos - x2 * sin,
                       x2 * cos + x1 * sin], dim=-1)
    return x_rot

# Example
d_head, max_len = 64, 512
cos_cache, sin_cache = precompute_rope_freqs(d_head, max_len)
q = torch.randn(2, 8, 32, d_head)  # (batch, heads, seq, d_head)
q_rope = apply_rope(q, cos_cache, sin_cache)
print(q_rope.shape)  # torch.Size([2, 8, 32, 64])
```
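As a quick numerical sanity check of the inner-product property above (a minimal sketch built on the functions and caches just defined): rotating a query/key pair at positions (3, 7) and at (13, 17) should give the same dot product, since only the offset n − m matters.

```python
q = torch.randn(1, 1, 1, d_head)  # a single query vector
k = torch.randn(1, 1, 1, d_head)  # a single key vector

def rotate_at(vec: torch.Tensor, pos: int) -> torch.Tensor:
    """Apply RoPE as if `vec` sat at absolute position `pos`."""
    return apply_rope(vec, cos_cache[pos : pos + 1], sin_cache[pos : pos + 1])

score_a = (rotate_at(q, 3) * rotate_at(k, 7)).sum()    # positions (3, 7), offset 4
score_b = (rotate_at(q, 13) * rotate_at(k, 17)).sum()  # positions (13, 17), offset 4
print(torch.allclose(score_a, score_b, atol=1e-5))     # True
```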
5. ALiBi (Attention with Linear Biases)
Source: Train Short, Test Long (Press et al., 2022)
Instead of modifying the input embeddings, a negative bias proportional to the relative distance is added directly to the attention scores, which naturally supports length extrapolation.

$$\text{Attn}(i, j) = \mathbf{q}_i \mathbf{k}_j^{T} - m_h \cdot |i - j|$$

where $m_h$ is the slope of the $h$-th attention head, assigned as a geometric sequence (for example, with $H = 8$ heads the slopes are $\frac{1}{2}, \frac{1}{4}, \dots, \frac{1}{256}$):

$$m_h = \frac{1}{2^{8h/H}}, \quad h = 1, \dots, H$$
```python
import torch
import math

def get_alibi_slopes(n_heads: int) -> torch.Tensor:
    """Compute the slope m_h for each head."""
    def get_slopes_power_of_2(n):
        start = 2 ** (-(2 ** -(math.log2(n) - 3)))
        return [start * (start ** i) for i in range(n)]

    if math.log2(n_heads).is_integer():
        slopes = get_slopes_power_of_2(n_heads)
    else:
        # For non-power-of-2 head counts, fill in with slopes from the next power of 2
        closest = 2 ** math.floor(math.log2(n_heads))
        slopes = get_slopes_power_of_2(closest)
        extra = get_slopes_power_of_2(2 * closest)[0::2][:n_heads - closest]
        slopes = slopes + extra
    return torch.tensor(slopes, dtype=torch.float32)

def build_alibi_bias(n_heads: int, seq_len: int) -> torch.Tensor:
    """
    Return the ALiBi bias matrix, shape: (1, n_heads, seq_len, seq_len).
    """
    slopes = get_alibi_slopes(n_heads)  # (n_heads,)
    positions = torch.arange(seq_len)
    # Relative distance matrix: |i - j|, shape (seq_len, seq_len)
    dist = (positions.unsqueeze(0) - positions.unsqueeze(1)).abs().float()
    # Multiply each head by its slope and negate
    bias = -slopes.view(-1, 1, 1) * dist.unsqueeze(0)  # (n_heads, L, L)
    return bias.unsqueeze(0)  # (1, n_heads, L, L)

# Example: add the bias to the attention scores
n_heads, seq_len = 8, 64
alibi_bias = build_alibi_bias(n_heads, seq_len)
print(alibi_bias.shape)  # torch.Size([1, 8, 64, 64])
# Usage: scores = qk_scores + alibi_bias
```
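A minimal sketch of plugging the bias into a plain attention computation; the q/k/v tensors and `d_head` below are made-up example values, not from any particular model. Because the bias depends only on distance, it can be rebuilt for any sequence length at inference time, which is where the extrapolation ability comes from.

```python
import torch.nn.functional as F

d_head = 32
q = torch.randn(1, n_heads, seq_len, d_head)
k = torch.randn(1, n_heads, seq_len, d_head)
v = torch.randn(1, n_heads, seq_len, d_head)

scores = q @ k.transpose(-2, -1) / d_head ** 0.5  # (1, n_heads, L, L)
scores = scores + alibi_bias                      # add the distance penalty
out = F.softmax(scores, dim=-1) @ v
print(out.shape)  # torch.Size([1, 8, 64, 32])

# Rebuild the bias for a longer sequence at inference time; no retraining needed
longer_bias = build_alibi_bias(n_heads, 4096)
print(longer_bias.shape)  # torch.Size([1, 8, 4096, 4096])
```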
Comparison Summary
| Method | Parameters | Length extrapolation | Relative-position aware | Typical models |
|---|---|---|---|---|
| Sinusoidal PE | none | limited | indirect | original Transformer |
| Learned PE | yes | ✗ | no | BERT, GPT-2 |
| Relative PE | few | limited | ✓ | Transformer-XL |
| RoPE | none | good | ✓ | LLaMA, Qwen |
| ALiBi | none | ✓ strongest | ✓ | BLOOM, MPT |