rust-candle学习笔记12-实现因果注意力

参考:about-pytorch

定义结构体:

rust 复制代码
/// Single-head causal self-attention layer with a fused query/key/value projection.
struct CausalAttention {
    /// Bias-free linear layer producing queries, keys and values in a single projection.
    w_qkv: Linear,
    /// Square attention mask built once in `new` and broadcast over the batch in `forward`.
    mask: Tensor,
    /// Scalar tensor; `forward` divides the attention scores by its square root.
    d_model: Tensor,
    /// Dropout applied to the attention weights when `forward` is called with `train = true`.
    dropout: Dropout,
    /// Device handed to `new`; the mask and the scaling scalar are created on it.
    device: Device,
}

定义new方法:

rust 复制代码
impl CausalAttention {
    /// Builds a causal attention layer.
    ///
    /// # Arguments
    /// * `vb` - variable builder; the fused projection weights are registered under `"w_qkv"`.
    /// * `embedding_dim` - size of the last axis of the input fed to the projection.
    /// * `out_dim` - size of each of the query, key and value vectors (the projection emits
    ///   `3 * out_dim` features).
    /// * `seq_len` - side length of the pre-computed square mask.
    /// * `dropout` - drop probability handed to `Dropout::new`.
    /// * `device` - device on which the mask and the scaling scalar are created.
    ///
    /// # Errors
    /// Propagates any error from creating the linear layer or the helper tensors.
    fn new(
        vb: VarBuilder,
        embedding_dim: usize,
        out_dim: usize,
        seq_len: usize,
        dropout: f32,
        device: Device,
    ) -> Result<Self> {
        Ok(Self {
            w_qkv: linear_no_bias(embedding_dim, 3 * out_dim, vb.pp("w_qkv"))?,
            // Scaled dot-product attention divides the scores by sqrt(d_k), where d_k is the
            // key dimension. Keys have `out_dim` features here, so the scalar must come from
            // `out_dim` rather than from the input embedding size.
            d_model: Tensor::new(out_dim as f32, &device)?,
            mask: Tensor::tril2(seq_len, DType::U32, &device)?,
            dropout: Dropout::new(dropout),
            device,
        })
    }
}

定义forward方法(需写在上面的 `impl CausalAttention` 块内;其中调用的 `masked_fill` 辅助函数本文未给出,需另行定义):

rust 复制代码
    /// Runs causal self-attention over `x` and returns the attended values.
    ///
    /// The projection of `x` through `w_qkv` must be rank 3 (it is unpacked with `dims3`);
    /// its axes are treated as `(batch, sequence, features)` and the feature axis is split
    /// into three equal parts used as queries, keys and values.
    ///
    /// # Arguments
    /// * `x` - input tensor fed to the fused projection.
    /// * `train` - forwarded to the dropout layer that acts on the attention weights.
    ///
    /// # Errors
    /// Propagates any tensor error, including the case where the sequence length of `x`
    /// exceeds the size of the mask built in `new`.
    fn forward(&self, x: &Tensor, train: bool) -> Result<Tensor> {
        let qkv = self.w_qkv.forward(x)?;
        let (batch_size, seq_len, _) = qkv.dims3()?;

        // Split the fused projection: index 0 = queries, 1 = keys, 2 = values.
        let qkv = qkv.reshape((batch_size, seq_len, 3, ()))?;
        let q = qkv.get_on_dim(2, 0)?.reshape((batch_size, seq_len, ()))?;
        let k = qkv.get_on_dim(2, 1)?.reshape((batch_size, seq_len, ()))?;
        let v = qkv.get_on_dim(2, 2)?.reshape((batch_size, seq_len, ()))?;

        let attn_score = q.matmul(&k.t()?)?;
        let last_dim = attn_score.rank() - 1;

        // Crop the pre-built mask to the current sequence length so that inputs shorter than
        // the `seq_len` given to `new` can still be broadcast against the scores.
        let mask = self
            .mask
            .narrow(0, 0, seq_len)?
            .narrow(1, 0, seq_len)?
            .broadcast_as(attn_score.shape())?;

        // NOTE(review): `masked_fill` is defined outside this snippet. The mask comes from
        // `tril2` (ones on and below the diagonal), so confirm that the helper writes -inf
        // where the mask is 0 (future positions) and not where it is 1.
        let attn_score = masked_fill(&attn_score, &mask, f32::NEG_INFINITY)?;

        // Scaling after masking is safe: -inf stays -inf when divided by a positive scalar.
        let attn_score = attn_score.broadcast_div(&self.d_model.sqrt()?)?;
        let attn_weights = ops::softmax(&attn_score, last_dim)?;
        let attn_weights = self.dropout.forward(&attn_weights, train)?;

        let attn_output = attn_weights.matmul(&v)?;
        Ok(attn_output)
    }

测试(注意:这里 `train` 为 `true` 且 dropout 概率为 0.5,因此每次运行的输出都会不同):

rust 复制代码
/// Smoke test: builds a `CausalAttention` layer with freshly initialised weights, runs it on
/// a small batch and prints the result.
///
/// Dropout is active (`train = true`, probability 0.5), so the printed values are expected
/// to differ from run to run.
fn main() -> Result<()> {
    let device = Device::cuda_if_available(0)?;
    let varmap = VarMap::new();
    let vb = VarBuilder::from_varmap(&varmap, candle_core::DType::F32, &device);

    // One sample of 6 tokens with 3 features each, stored row by row.
    let sample = [
        0.43f32, 0.15, 0.89,
        0.55, 0.87, 0.66,
        0.57, 0.85, 0.64,
        0.22, 0.58, 0.33,
        0.77, 0.25, 0.10,
        0.05, 0.80, 0.55,
    ];
    // The batch holds the same sample twice, giving shape (2, 6, 3).
    let input = Tensor::from_vec(sample.repeat(2), (2, 6, 3), &device)?;

    // embedding_dim = 3, out_dim = 2, seq_len = 6, dropout probability = 0.5.
    // `vb` and `device` are not needed afterwards, so they are moved instead of cloned.
    let model = CausalAttention::new(vb, 3, 2, 6, 0.5, device)?;
    let output = model.forward(&input, true)?;
    println!("output: {:?}\n", output);
    println!("output: {:?}\n", output.to_vec3::<f32>()?);
    Ok(())
}
相关推荐
Hcoco_me几秒前
大模型面试题39:KV Cache 完全指南
人工智能·深度学习·自然语言处理·transformer·word2vec
小途软件1 分钟前
基于计算机视觉的课堂行为编码研究
人工智能·python·深度学习·计算机视觉·语言模型·自然语言处理·django
●VON1 分钟前
DeepSeek-V3.2 模型在 OpenJiuWen 中的部署实践
学习·华为·von·openjiuwen
小途软件2 分钟前
基于计算机视觉的桥梁索力测试方法
人工智能·python·语言模型·自然语言处理·django
咚咚王者3 分钟前
人工智能之核心基础 机器学习 第七章 监督学习总结
人工智能·学习·机器学习
狮子座明仔13 分钟前
DeepSeek开年王炸:mHC架构——用流形约束重构残差连接的革命性突破
人工智能·语言模型·自然语言处理
斯外戈的小白16 分钟前
【NLP】Transformer在pytorch 的实现+情感分析案例+生成式任务案例
pytorch·自然语言处理·transformer
grd418 分钟前
Electron for OpenHarmony 实战:Pagination 分页组件实现
python·学习
W|J21 分钟前
ES 学习笔记
笔记·学习·elasticsearch
张人玉22 分钟前
西门子 S7 PLC 通信 WPF 应用分析笔记
笔记·c#·wpf·plc