TimeDART
TimeDART: A Diffusion Autoregressive Transformer for Self-Supervised Time Series Representation
主要思想是将时间点分成Patch,再进行Patch级别的独立加噪
Patch 长度 P 的选择敏感,可以考虑增加尺度的思想
c
Input(B, T, C)
|
|-- Instance Normalization(B, T, N)
|
|-- Patching Embedding z(B, T/P, N) 将时间维度变成一个个Patch
|
|-- CAUSAL TRANSFORMER ENCODER X_qk = f1(z)
| #先在L维前面拼一个长度P的随机可学习向量SOS,然后丢弃最后P个点
| #然后正弦Position Embedding(SOS不做)
| #Causal Mask Transformer Encoder
|-- PATCH-LEVEL DIFFUSION X_v = f2(z)
| #对z进行Patch级别的加噪
|-- Decoder部分,根据X_qk和X_v去做 mask self attention
|
--- 反Instance Normalization
Patching Embedding
原本的embedding对T上每个点进行embedding
$x_{1:L} = [x_1, x_2, \ldots, x_L]$
现在则切割成
$x_{1:L} = [x_{1:P},\, x_{P+1:2P},\, \ldots,\, x_{L-P+1:L}]$
每隔P分割一次,得到序列z
然后进行一次Embedding(Linear)
py
class InstanceNorm1D(nn.Module):
    """Per-instance, per-channel normalization over the time axis.

    Expects input of shape (B, T, C); statistics are computed along T.
    Returns the normalized tensor plus the mean/std needed to undo the
    transform via `inverse`.
    """

    def __init__(self, eps=1e-5):
        super().__init__()
        # Small constant added to std to avoid division by zero.
        self.eps = eps

    def forward(self, x):
        mu = x.mean(dim=1, keepdim=True)
        sigma = x.std(dim=1, keepdim=True) + self.eps
        normalized = (x - mu) / sigma
        return normalized, mu, sigma

    def inverse(self, x, mean, std):
        # Reverse the normalization using previously computed statistics.
        return std * x + mean
CAUSAL TRANSFORMER ENCODER
$z^{\text{in}}_{1:N} = \operatorname{Concat}[\text{SOS},\, z_{1:N-1}] + \text{PE}_{1:N}$
$f\!\left(z^{\text{in}}_{1:N}\right) = \operatorname{Encoder}\!\left(z^{\text{in}}_{1:N},\, M\right)$
py
class CausalEncoder(nn.Module):
    """Causal (autoregressive) Transformer encoder over patch embeddings.

    Prepends a learnable SOS token and drops the last patch, so the output
    at position i only attends to patches strictly before i; adds sinusoidal
    positional encodings; runs a Transformer encoder under an
    upper-triangular attention mask.

    NOTE(review): the positional encoding is applied to the whole sequence,
    SOS slot included, while the surrounding notes say the SOS should be
    left without PE — confirm which is intended.
    """

    def __init__(self, dim, depth, heads):
        super().__init__()
        layer = nn.TransformerEncoderLayer(
            d_model=dim, nhead=heads, batch_first=True
        )
        self.encoder = nn.TransformerEncoder(layer, depth)
        # Learnable start-of-sequence embedding, broadcast over the batch.
        self.sos = nn.Parameter(torch.randn(1, 1, dim))
        self.pe = SinusoidalPE(dim)

    def causal_mask(self, N, device):
        # True above the diagonal = future positions are disallowed.
        full = torch.ones(N, N, device=device)
        return torch.triu(full, diagonal=1).bool()

    def forward(self, z):
        B, N, D = z.shape
        start = self.sos.expand(B, 1, D)
        # Concat[SOS, z_{1:N-1}]: shift the sequence right by one patch.
        shifted = torch.cat([start, z[:, :-1]], dim=1)
        shifted = self.pe(shifted)
        return self.encoder(shifted, self.causal_mask(N, z.device))
PATCH-LEVEL DIFFUSION
对于每个patch进行多次加噪
$x_j^{s} = \sqrt{\alpha(s)}\, x_j^{s-1} + \sqrt{1-\alpha(s)}\,\epsilon, \quad \epsilon \sim \mathcal{N}(0, I)$
$\alpha(s) = \dfrac{\bar{\alpha}(s)}{\bar{\alpha}(s-1)}$
$\bar{\alpha}(s) = \dfrac{\cos^2\!\left( \frac{s/S + \epsilon}{1 + \epsilon} \cdot \frac{\pi}{2} \right)}{\cos^2\!\left( \frac{\epsilon}{1 + \epsilon} \cdot \frac{\pi}{2} \right)}$
其中 $\bar{\alpha}(s)$ 是累计噪声保留率，$S$ 是总扩散步数，$s$ 是当前扩散步数，$\epsilon$ 是稳定系数（取 0.008）
py
class CosineDiffusion:
    """Cosine noise schedule for patch-level diffusion.

    alpha_bar(s) = cos^2(((s/S + eps) / (1 + eps)) * pi/2)
                   / cos^2((eps / (1 + eps)) * pi/2)

    The denominator normalizes the schedule so that alpha_bar(0) == 1
    (no noise) and alpha_bar(S) ~= 0 (almost pure noise), matching the
    formula in the notes above.

    Args:
        S: total number of diffusion steps.
        eps: small offset (0.008 in the paper) stabilizing the schedule
            near s = 0.
    """

    def __init__(self, S, eps=0.008):
        self.S = S
        self.eps = eps

    def alpha_bar(self, s):
        """Cumulative signal-retention rate at step s (scalar or tensor)."""
        # as_tensor accepts Python scalars too; torch.cos requires a tensor.
        s = torch.as_tensor(s, dtype=torch.float32)
        f = torch.cos(((s / self.S + self.eps) / (1 + self.eps)) * math.pi / 2) ** 2
        # Normalize by f(0) so the schedule starts exactly at 1.
        f0 = math.cos((self.eps / (1 + self.eps)) * math.pi / 2) ** 2
        return f / f0

    def alpha(self, s):
        """Per-step retention rate: alpha(s) = alpha_bar(s) / alpha_bar(s - 1)."""
        s = torch.as_tensor(s, dtype=torch.float32)
        return self.alpha_bar(s) / self.alpha_bar(s - 1)

    def q_sample(self, x0, s):
        """Sample x_s ~ q(x_s | x_0) in closed form.

        Args:
            x0: clean patch embeddings, shape (B, N, D).
            s: diffusion step — a Python scalar, a 0-dim tensor, or a
                1-D tensor of per-sample steps of length B.

        Returns:
            (noisy, noise): the noised patches and the Gaussian noise used.
        """
        device = x0.device
        ab = self.alpha_bar(torch.as_tensor(s, dtype=torch.float32, device=device))
        if ab.dim() > 0:
            # Per-sample steps: broadcast over patches and features.
            ab = ab.view(-1, 1, 1)
        noise = torch.randn_like(x0)
        noisy = torch.sqrt(ab) * x0 + torch.sqrt(1 - ab) * noise
        return noisy, noise
Decoder
py
class DenoisingDecoder(nn.Module):
def __init__(self, dim, heads):
super().__init__()
self.attn = nn.MultiheadAttention(
embed_dim=dim,
num_heads=heads,
batch_first=True
)
self.ffn = nn.Sequential(
nn.Linear(dim, 4 * dim),
nn.GELU(),
nn.Linear(4 * dim, dim)
)
def forward(self, z_noisy, z_enc):
"""
z_noisy: (B, N, D) queries
z_enc: (B, N, D) keys / values
"""
# index-aligned attention: no mixing across N
out, _ = self.attn(
query=z_noisy,
key=z_enc,
value=z_enc,
need_weights=False
)
return self.ffn(out)