rust-candle学习笔记11-实现一个简单的自注意力

参考:about-pytorch

定义ScaledDotProductAttention结构体:

rust 复制代码
use candle_core::{Result, Device, Tensor};
use candle_nn::{Linear, Module, linear_no_bias, VarMap, VarBuilder, ops};

/// Self-attention layer with separate query, key and value projections.
struct ScaledDotProductAttention {
    /// Bias-free linear projection that produces the queries.
    wq: Linear,
    /// Bias-free linear projection that produces the keys.
    wk: Linear,
    /// Bias-free linear projection that produces the values.
    wv: Linear,
    /// Scalar tensor; `forward` divides the attention scores by its square root.
    d_model: Tensor,
    /// Device passed to `new`; stored on the struct but not read by `forward`.
    device: Device,
}

为ScaledDotProductAttention结构体实现new方法:

rust 复制代码
impl ScaledDotProductAttention {
    /// Builds the layer with three bias-free projections of shape
    /// `embedding_dim -> out_dim`.
    ///
    /// * `vb` - variable builder; the weights are requested under the `wq`, `wk`
    ///   and `wv` sub-paths.
    /// * `embedding_dim` - size of the input feature dimension.
    /// * `out_dim` - size of the query/key/value vectors.
    /// * `device` - device on which the scaling constant is allocated.
    ///
    /// # Errors
    ///
    /// Propagates any failure from creating the projections or the scale tensor.
    fn new(vb: VarBuilder, embedding_dim: usize, out_dim: usize, device: Device) -> Result<Self> {
        Ok(Self {
            wq: linear_no_bias(embedding_dim, out_dim, vb.pp("wq"))?,
            wk: linear_no_bias(embedding_dim, out_dim, vb.pp("wk"))?,
            wv: linear_no_bias(embedding_dim, out_dim, vb.pp("wv"))?,
            // Scaled dot-product attention divides the scores by sqrt(d_k), where d_k
            // is the width of the query/key vectors. That is `out_dim` here, not the
            // input embedding size, so the scale must be built from `out_dim`.
            d_model: Tensor::new(out_dim as f32, &device)?,
            device,
        })
    }
}

为ScaledDotProductAttention结构体实现Module trait的forward方法:

rust 复制代码
impl Module for ScaledDotProductAttention {
    /// Runs self-attention over `xs` and returns the attended values.
    ///
    /// `xs` is projected into queries, keys and values; the query/key products are
    /// divided by `sqrt(self.d_model)`, normalised with a softmax over the last
    /// axis, and used to weight the values.
    ///
    /// Presumably `xs` is shaped `(batch, seq_len, embedding_dim)`, as in the demo
    /// `main` — the code itself only requires shapes that `matmul` accepts.
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let queries = self.wq.forward(xs)?;
        let keys = self.wk.forward(xs)?;
        let values = self.wv.forward(xs)?;

        // Raw similarity of every query with every key.
        let keys_t = keys.t()?;
        let raw_scores = queries.matmul(&keys_t)?;

        // The scale is a scalar tensor, hence the broadcasting division.
        let scale = self.d_model.sqrt()?;
        let scores = raw_scores.broadcast_div(&scale)?;

        // Normalise over the last axis.
        let last_axis = scores.rank() - 1;
        let weights = ops::softmax(&scores, last_axis)?;

        weights.matmul(&values)
    }
}

融合qkv实现:

定义ScaledDotProductAttentionFusedQKV结构体:

rust 复制代码
/// Self-attention layer that computes queries, keys and values with a single
/// fused linear projection.
struct ScaledDotProductAttentionFusedQKV {
    /// Bias-free linear projection whose output is split three ways into
    /// queries, keys and values by `forward`.
    w_qkv: Linear,
    /// Scalar tensor; `forward` divides the attention scores by its square root.
    d_model: Tensor,
    /// Device passed to `new`; stored on the struct but not read by `forward`.
    device: Device,
}

为结构体实现new方法:

rust 复制代码
impl ScaledDotProductAttentionFusedQKV {
    /// Builds the layer with one bias-free projection of shape
    /// `embedding_dim -> 3 * out_dim`.
    ///
    /// * `vb` - variable builder; the weight is requested under the `w_qkv` sub-path.
    /// * `embedding_dim` - size of the input feature dimension.
    /// * `out_dim` - size of each of the query, key and value vectors.
    /// * `device` - device on which the scaling constant is allocated.
    ///
    /// # Errors
    ///
    /// Propagates any failure from creating the projection or the scale tensor.
    fn new(vb: VarBuilder, embedding_dim: usize, out_dim: usize, device: Device) -> Result<Self> {
        Ok(Self {
            w_qkv: linear_no_bias(embedding_dim, 3 * out_dim, vb.pp("w_qkv"))?,
            // Scaled dot-product attention divides the scores by sqrt(d_k), where d_k
            // is the width of the query/key vectors. Each third of the fused
            // projection is `out_dim` wide, so the scale is built from `out_dim`
            // rather than the input embedding size.
            d_model: Tensor::new(out_dim as f32, &device)?,
            device,
        })
    }
}

为ScaledDotProductAttentionFusedQKV结构体实现Module trait的forward方法:

rust 复制代码
impl Module for ScaledDotProductAttentionFusedQKV {
    /// Runs self-attention over `xs` using the fused Q/K/V projection.
    ///
    /// The projection output must be rank 3 (`dims3` fails otherwise). Its last
    /// dimension is cut into three equal, consecutive slices that serve as
    /// queries, keys and values, in that order. The query/key products are
    /// divided by `sqrt(self.d_model)`, normalised with a softmax over the last
    /// axis, and used to weight the values.
    fn forward(&self, xs: &Tensor) -> Result<Tensor> {
        let fused = self.w_qkv.forward(xs)?;
        let (_, _, fused_width) = fused.dims3()?;
        let part_width = fused_width / 3;

        // Returns the `index`-th third of the last axis. `contiguous` gives each
        // slice its own dense layout before it is fed to the matmuls.
        let third = |index: usize| -> Result<Tensor> {
            fused
                .narrow(2, index * part_width, part_width)?
                .contiguous()
        };
        let queries = third(0)?;
        let keys = third(1)?;
        let values = third(2)?;

        let raw_scores = queries.matmul(&keys.t()?)?;
        // The scale is a scalar tensor, hence the broadcasting division.
        let scores = raw_scores.broadcast_div(&self.d_model.sqrt()?)?;

        let last_axis = scores.rank() - 1;
        let weights = ops::softmax(&scores, last_axis)?;

        weights.matmul(&values)
    }
}

测试:

rust 复制代码
/// Demo entry point: builds a small batch, runs it through the fused-QKV
/// attention layer and prints the resulting tensor.
fn main() -> Result<()> {
    // One sample: six tokens with three features each.
    // Formatting is skipped so that each row stays one token.
    #[rustfmt::skip]
    const SAMPLE: [f32; 18] = [
        0.43, 0.15, 0.89,
        0.55, 0.87, 0.66,
        0.57, 0.85, 0.64,
        0.22, 0.58, 0.33,
        0.77, 0.25, 0.10,
        0.05, 0.80, 0.55,
    ];

    let device = Device::cuda_if_available(0)?;
    let varmap = VarMap::new();
    let vb = VarBuilder::from_varmap(&varmap, candle_core::DType::F32, &device);

    // The batch holds the same sample twice, giving shape (2, 6, 3).
    let batch = [SAMPLE, SAMPLE].concat();
    let input = Tensor::from_vec(batch, (2, 6, 3), &device)?;

    // To try the unfused layer instead, construct the model with
    // `ScaledDotProductAttention::new(vb.clone(), 3, 2, device.clone())?`.
    let model = ScaledDotProductAttentionFusedQKV::new(vb.clone(), 3, 2, device.clone())?;
    let output = model.forward(&input)?;

    // First the tensor's debug summary, then its values as nested vectors.
    println!("output: {:?}\n", output);
    println!("output: {:?}\n", output.to_vec3::<f32>()?);
    Ok(())
}
相关推荐
doiito2 小时前
【Agent Harness】Gliding Horse L2 作战地图深度优化:给多 Agent 上下文装上“精准导航”
ai·rust·架构设计·系统设计·ai agent
花褪残红青杏小9 小时前
Rust图像处理第8节-暗角 & 复古胶片特效:四周衰减中心高亮
rust·webassembly·图形学
独孤留白1 天前
从C到Rust:Rust 的 Trait 不是Interface,那是什么?
rust
花褪残红青杏小1 天前
Rust图像处理第7节-马赛克像素化:分块取平均色实现打码风格
rust·webassembly·图形学
doiito2 天前
【Agent Harness】Gliding Horse 设计细节 -- 不跟风开发自己的AI Agent
架构·rust·agent
doiito2 天前
【Agent Harness】Gliding Horse 核心设计理念,不跟风开发自己的AI Agent
ai·rust·架构设计·系统设计·ai agent
花褪残红青杏小2 天前
Rust图像处理第6节- 均值模糊 & 中值模糊:3×3 邻域的两种经典玩法
rust·webassembly·图形学
子兮曰3 天前
前端工具链的「Rust 化」:一场没有赢家的军备竞赛?
前端·后端·rust
星栈3 天前
写 Dioxus Demo 不难,难的是把它写成项目
前端·rust·前端框架
mCell3 天前
【锐评】桌面端技术营销:别拿跑分当工程判断
前端·rust·electron