- 核心机制理解
- 项目结构与工具
- [Tokenizer 实现](#tokenizer-实现)
- [Batcher 实现](#batcher-实现)
- [Model 实现](#model-实现)
- [简单 Embedding 模型](#简单-embedding-模型)
- 前向传播测试
- [Generate 实现](#generate-实现)
- 训练实现
本视频是从0手搓GPT,主要目标是复现GPT最核心的部分。使用 Rust 实现(受 Andrej Karpathy 的 Python 视频启发,保持相同的变量命名风格)
核心机制
神经网络基础
- 前馈(Forward):输入向量经网络变换(线性/非线性),映射到输出向量
- 反馈(Backpropagation):根据期望值与实际输出的差异调整网络参数
- 训练:反复执行前馈→反馈循环,直到输出逼近期望结果

```
V1 → [网络变换] → V2' (期望 V2)
          ↓
     调整网络参数
```
大语言模型的核心逻辑
语言可预测性假设:给定上文,下一个字符不是什么都行,而是有规律的。
"发展"概念:将语言中的规律理解为一种"发展":
蓝色字符串 → 橙色字符串
- 蓝色:已知上文(输入)
- 橙色:期望下一个字符(目标)
蓝色能发展成橙色,就是语言中存在的一个事实
"事实"的抽象与训练
- 从语料中提取无数个"发展"事实
- 将这些事实交给神经网络训练
- 网络参数中隐含了这些事实背后的规律
- 训练后,给定蓝色部分,网络能预测橙色部分
窗口容量 :蓝色/橙色窗口的字符数量 = 最大生成长度(context window)。
Tokenizer 实现
语料加载与字符提取
rust
use std::collections::HashSet;
// Load the corpus from disk; panics if src/input.txt is missing.
let text = std::fs::read_to_string("src/input.txt").unwrap();
// Collect the distinct characters of the corpus.
// NOTE(review): `mut` is unnecessary here, and collecting straight into a
// BTreeSet would dedupe and sort in a single step.
let mut chars = text.chars().collect::<HashSet<_>>();
// Re-collect through a BTreeSet to obtain a sorted, deduplicated vocabulary.
let vocab: Vec<char> = chars.into_iter().collect::<BTreeSet<_>>().into_iter().collect();
- 使用 HashSet 去重
- 使用 BTreeSet 排序(按字符的自然顺序)
- 语料包含莎士比亚作品集
双向映射构建
rust
type TokenId = i32;
// char → token_id lookup table.
let mut stoi: HashMap<char, TokenId> = HashMap::new();
for (i, ch) in vocab.iter().enumerate() {
    stoi.insert(*ch, i as TokenId);
}
// token_id → char lookup table (inverse of `stoi`; both loops enumerate
// `vocab` in the same order, so the two maps stay consistent).
let mut itos: HashMap<TokenId, char> = HashMap::new();
for (i, ch) in vocab.iter().enumerate() {
    itos.insert(i as TokenId, *ch);
}
示例映射关系:
N → 53,Z → 39,E → 44
编码解码函数
rust
// Encode: string → token-id sequence (panics on a character outside the vocab).
fn encode(s: &str, stoi: &HashMap<char, TokenId>) -> Vec<TokenId> {
    s.chars()
        .map(|ch| *stoi.get(&ch).unwrap())
        .collect()
}
// Decode: token-id sequence → string (panics on an id outside the vocab).
fn decode(tokens: &[TokenId], itos: &HashMap<TokenId, char>) -> String {
    tokens.iter()
        .map(|id| *itos.get(id).unwrap())
        .collect()
}
// Vocabulary size, derived from the actual char→id map instead of the
// hard-coded 66, so it stays correct when the corpus (and vocab) changes.
fn vocab_size(stoi: &HashMap<char, TokenId>) -> usize {
    stoi.len()
}
Batcher 实现
训练集/测试集划分
rust
// Encode the whole corpus into token ids.
let tokens: Vec<TokenId> = encode(&text, &stoi);
// 90% train / 10% test split by position (no shuffling).
let split_idx = (tokens.len() as f32 * 0.9) as usize;
let train_data = &tokens[..split_idx];
let test_data = &tokens[split_idx..];
批次采样逻辑
rust
// Batch hyperparameters.
let batch_size = 4;
let block_size = 8; // context window size (tokens per sequence)
// 从语料中随机采样起始位置
fn get_batch(data: &[TokenId], batch_size: usize, block_size: usize) -> (Tensor, Tensor) {
use rand::seq::SliceRandom;
let max_start = data.len() - block_size - 1;
// 随机采样 batch_size 个起始位置
let mut rng = rand::thread_rng();
let start_positions: Vec<usize> = (0..batch_size)
.map(|_| {
let idx = (0..max_start).choose(&mut rng).unwrap();
idx
})
.collect();
// 构建 X 和 Y
let mut x_list: Vec<Vec<TokenId>> = Vec::new();
let mut y_list: Vec<Vec<TokenId>> = Vec::new();
for &start in &start_positions {
// X: 从起始位置取 block_size 个 token
let x = data[start..start + block_size].to_vec();
// Y: 从 X 的下一个位置开始,取 block_size 个 token
let y = data[start + 1..start + 1 + block_size].to_vec();
x_list.push(x);
y_list.push(y);
}
// 转成 Tensor
let x_flat: Vec<i32> = x_list.into_iter().flatten().collect();
let y_flat: Vec<i32> = y_list.into_iter().flatten().collect();
let x = Tensor::of_slice(&x_flat).reshape(&[batch_size as i64, block_size as i64]);
let y = Tensor::of_slice(&y_flat).reshape(&[batch_size as i64, block_size as i64]);
(x, y)
}
- 相邻批次之间的连续序列不必紧挨,可以是语料中任意位置
- Y 比 X 整体向后偏移 1 位
- 保证 X 和 Y 的一一对应关系
X/Y 数据构建
示例(假设采样到不同位置):
Batch 1: X = [42, 45, 51, ...] Y = [45, 51, 53, ...]
Batch 2: X = [67, 12, 33, ...] Y = [12, 33, 44, ...]
...
Model 实现
简单 Embedding 模型
rust
use tch::{nn, nn::Module, Tensor, Kind, Device};
// Build the minimal model: token embedding followed by a linear projection
// back to vocabulary logits.
fn create_model(vocab_size: usize, embedding_dim: usize, device: Device) -> nn::Sequential {
    // NOTE(review): this VarStore is dropped when the function returns, so the
    // parameters cannot later be handed to an optimizer — for training, the
    // VarStore should be created by the caller (or returned alongside the model).
    let root = nn::VarStore::new(device);
    // tch's nn::embedding / nn::linear take a Path (vs.root() / name),
    // i64 dimensions, and a config struct — not a bare &str and &VarStore.
    nn::seq()
        .add(nn::embedding(
            root.root() / "embed",
            vocab_size as i64,
            embedding_dim as i64,
            Default::default(),
        ))
        .add(nn::linear(
            root.root() / "linear",
            embedding_dim as i64,
            vocab_size as i64,
            Default::default(),
        ))
}
- Embedding 层:将 token_id 映射为 embedding 向量
- Linear 层:将 embedding 映射为 vocab_size 维的 logits
前向传播测试
rust
// Configuration for a quick forward-pass smoke test.
let vocab_size = 66;
let embedding_dim = 512;
let device = Device::cuda_if_available();
let model = create_model(vocab_size, embedding_dim, device);
// Draw one training batch (targets unused for a pure forward pass).
let (x, _y) = get_batch(&train_data, batch_size, block_size);
// Forward pass through embedding + linear.
let logits = model.forward(&x);
// logits shape: [batch_size, block_size, vocab_size]
// each position holds unnormalized scores for the next token
输出形状:
- x: [4, 8] — 4 个批次,每批 8 个 token
- logits: [4, 8, 66] — 每个 token 对应 66 维向量
Generate 实现
自回归生成逻辑
rust
// Autoregressive generation: repeatedly feed the tail of the context through
// the model, sample the next token from the softmax of the last position's
// logits, and append it. Returns the decoded prompt + generated text.
fn generate(
    model: &dyn Module,
    itos: &HashMap<TokenId, char>,
    prompt: &[TokenId],
    max_new_tokens: usize,
) -> String {
    // NOTE(review): `device()` is not a method of tch's `Module` trait — the
    // target Device likely has to be passed in explicitly. TODO confirm.
    let device = model.device();
    // Copy the prompt into a growable, mutable context buffer.
    let mut context: Vec<TokenId> = prompt.to_vec();
    // Iterative generation loop.
    for _ in 0..max_new_tokens {
        // Keep only the trailing `block_size` tokens as model input.
        // NOTE(review): `block_size` is not defined in this snippet's scope —
        // the later variant of this function takes it as a parameter.
        let start_idx = context.len().saturating_sub(block_size);
        // Shape the window as a [1, seq_len] batch on the model's device.
        let x = Tensor::of_slice(&context[start_idx..])
            .reshape(&[1, context[start_idx..].len() as i64])
            .to_device(device);
        // Model prediction for every position in the window.
        let logits = model.forward(&x);
        // Only the last position's logits predict the next token.
        let last_logit = logits.get(0).get(-1);
        // Softmax turns logits into a probability distribution.
        // NOTE(review): tch's softmax also takes a Kind (dtype) argument — verify.
        let probs = Tensor::softmax(&last_logit, -1);
        // Sample the next token and extend the context.
        let next_token = sample_from_probs(&probs);
        context.push(next_token);
    }
    decode(&context, itos)
}
采样与概率分布
rust
use rand::Rng;
// Sample a token id from a (possibly unnormalized) probability tensor on the
// CPU using inverse-transform sampling.
fn sample_from_probs(probs: &Tensor) -> TokenId {
    let weights: Vec<f32> = probs.into();
    let total: f32 = weights.iter().sum();
    // Draw a threshold in [0, 1) and walk the normalized cumulative mass
    // until it crosses the threshold.
    let threshold: f32 = rand::thread_rng().gen_range(0.0..1.0);
    let mut acc = 0.0_f32;
    // Fallback to the last index, guarding against float round-off leaving
    // the cumulative sum fractionally below 1.0.
    let mut idx = weights.len() - 1;
    for (i, w) in weights.iter().enumerate() {
        acc += w / total;
        if threshold <= acc {
            idx = i;
            break;
        }
    }
    idx as TokenId
}
训练实现
交叉熵损失计算
rust
fn compute_loss(
model: &nn::Sequential,
x: &Tensor,
target: &Tensor,
block_size: usize,
vocab_size: usize,
) -> Tensor {
// 前向传播
let logits = model.forward(x);
// 重塑: [batch, block, vocab] -> [batch*block, vocab]
let logits_flat = logits.reshape(&[-1, vocab_size as i64]);
// 重塑目标: [batch, block] -> [batch*block]
let target_flat = target.reshape(&[-1]);
// 计算交叉熵
let loss = Tensor::cross_entropy_loss(&logits_flat, &target_flat, None, 0.0, 0.0);
loss
}
损失计算过程:
- 对 logits 最后一维 softmax 转为概率分布
- 与 target 的 one-hot 编码计算交叉熵
- 返回标量损失值
反向传播与优化器
rust
use tch::nn::OptimizerConfig;
// Learning rate for the parameter updates.
let lr = 0.01;
// NOTE(review): `nn::Optimizer::new` and `model.var_store()` are not tch
// APIs; optimizers are built from the VarStore that created the model, e.g.
// `nn::Adam::default().build(&vs, lr)` (which is what the `OptimizerConfig`
// import above is for). TODO confirm against the tch version in use.
let mut opt = nn::Optimizer::new(&model.var_store(), lr);
for iter in 0..max_iters {
    // Compute the loss on the current batch.
    let loss = compute_loss(&model, &x, &y, block_size, vocab_size);
    // Backprop and apply the update in one step.
    opt.backward_step(&loss);
}
训练循环
rust
// Training hyperparameters.
let max_iters = 12000;
let eval_interval = 1000;
let batch_size = 4;
let block_size = 8;
for iter in 0..max_iters {
    // Sample a fresh random batch from the training split.
    let (x, y) = get_batch(&train_data, batch_size, block_size);
    // Cross-entropy loss for this batch.
    let loss = compute_loss(&model, &x, &y, block_size, vocab_size);
    // Backprop and apply the parameter update.
    opt.backward_step(&loss);
    // Periodic evaluation.
    // NOTE(review): `eval_x` / `eval_y` are not defined in this snippet —
    // presumably drawn via get_batch on the test split. TODO confirm.
    if iter % eval_interval == 0 {
        let eval_loss = compute_loss(&model, &eval_x, &eval_y, block_size, vocab_size);
        println!("iter {}: loss = {:.4}", iter, eval_loss.double_value(&[]));
    }
}
// Switch to evaluation mode after training.
// NOTE(review): `set_mode` is not a method on tch's Sequential — verify.
model.set_mode(false); // or model.set_mode(true) for training mode
训练参数:
- 学习率:0.01
- 训练迭代:12000 次
- 评估间隔:1000 次
代码
rust
use std::collections::{HashMap, HashSet, BTreeSet};
use rand::seq::SliceRandom;
use rand::Rng;
use tch::{nn, nn::Module, Tensor, Kind, Device};
type TokenId = i32;
// ==================== Tokenizer ====================
// Encode: string → token-id sequence (panics on a character outside the vocab).
fn encode(s: &str, stoi: &HashMap<char, TokenId>) -> Vec<TokenId> {
    s.chars()
        .map(|ch| *stoi.get(&ch).unwrap())
        .collect()
}
// Decode: token-id sequence → string (panics on an id outside the vocab).
fn decode(tokens: &[TokenId], itos: &HashMap<TokenId, char>) -> String {
    tokens.iter()
        .map(|id| *itos.get(id).unwrap())
        .collect()
}
// Vocabulary size, derived from the actual char→id map instead of the
// hard-coded 66, so it stays correct when the corpus (and vocab) changes.
fn vocab_size(stoi: &HashMap<char, TokenId>) -> usize {
    stoi.len()
}
// ==================== Batcher ====================
// Randomly sample `batch_size` windows of `block_size` tokens from `data`.
// Returns (X, Y) tensors of shape [batch_size, block_size], where Y is X
// shifted one token to the right (next-token prediction targets).
fn get_batch(data: &[TokenId], batch_size: usize, block_size: usize) -> (Tensor, Tensor) {
    // Need at least block_size + 1 tokens to form a single (X, Y) pair.
    assert!(data.len() > block_size, "corpus shorter than block_size + 1");
    let max_start = data.len() - block_size - 1;
    let mut rng = rand::thread_rng();
    // `Rng::gen_range` picks a start in O(1). The original
    // `(0..max_start).choose(&mut rng)` required the unimported
    // `IteratorRandom` trait and scanned the whole range. The inclusive
    // bound also makes the last valid window reachable (off-by-one fix).
    let start_positions: Vec<usize> = (0..batch_size)
        .map(|_| rng.gen_range(0..=max_start))
        .collect();
    // Flatten all windows directly; no intermediate Vec<Vec<_>> needed.
    let mut x_flat: Vec<i32> = Vec::with_capacity(batch_size * block_size);
    let mut y_flat: Vec<i32> = Vec::with_capacity(batch_size * block_size);
    for &start in &start_positions {
        // X: block_size tokens from `start`; Y: the same window shifted by 1.
        x_flat.extend_from_slice(&data[start..start + block_size]);
        y_flat.extend_from_slice(&data[start + 1..start + 1 + block_size]);
    }
    let x = Tensor::of_slice(&x_flat).reshape(&[batch_size as i64, block_size as i64]);
    let y = Tensor::of_slice(&y_flat).reshape(&[batch_size as i64, block_size as i64]);
    (x, y)
}
// ==================== Model ====================
// Build the minimal model: token embedding followed by a linear projection
// back to vocabulary logits.
fn create_model(vocab_size: usize, embedding_dim: usize, device: Device) -> nn::Sequential {
    // NOTE(review): this VarStore is dropped when the function returns, so the
    // parameters cannot later be handed to an optimizer — for training, the
    // VarStore should be created by the caller (or returned alongside the model).
    let root = nn::VarStore::new(device);
    // tch's nn::embedding / nn::linear take a Path (vs.root() / name),
    // i64 dimensions, and a config struct — not a bare &str and &VarStore.
    nn::seq()
        .add(nn::embedding(
            root.root() / "embed",
            vocab_size as i64,
            embedding_dim as i64,
            Default::default(),
        ))
        .add(nn::linear(
            root.root() / "linear",
            embedding_dim as i64,
            vocab_size as i64,
            Default::default(),
        ))
}
// ==================== Generate ====================
// Sample a token id from a (possibly unnormalized) probability tensor on the
// CPU using inverse-transform sampling.
fn sample_from_probs(probs: &Tensor) -> TokenId {
    let weights: Vec<f32> = probs.into();
    let total: f32 = weights.iter().sum();
    // Draw a threshold in [0, 1) and walk the normalized cumulative mass
    // until it crosses the threshold.
    let threshold: f32 = rand::thread_rng().gen_range(0.0..1.0);
    let mut acc = 0.0_f32;
    // Fallback to the last index, guarding against float round-off leaving
    // the cumulative sum fractionally below 1.0.
    let mut idx = weights.len() - 1;
    for (i, w) in weights.iter().enumerate() {
        acc += w / total;
        if threshold <= acc {
            idx = i;
            break;
        }
    }
    idx as TokenId
}
// Autoregressive generation: feed the trailing `block_size` tokens of the
// context through the model, sample the next token from the softmax of the
// final position's logits, append it, and repeat. Returns the decoded
// prompt + generated text.
fn generate(
    model: &dyn Module,
    itos: &HashMap<TokenId, char>,
    prompt: &[TokenId],
    max_new_tokens: usize,
    block_size: usize,
) -> String {
    // NOTE(review): `device()` is not a method of tch's `Module` trait — the
    // target Device likely has to be passed in explicitly. TODO confirm.
    let device = model.device();
    // Copy the prompt into a growable context buffer.
    let mut context: Vec<TokenId> = prompt.to_vec();
    for _ in 0..max_new_tokens {
        // Clamp the model input to the trailing `block_size` tokens.
        let start_idx = context.len().saturating_sub(block_size);
        // Shape the window as a [1, seq_len] batch on the model's device.
        let x = Tensor::of_slice(&context[start_idx..])
            .reshape(&[1, context[start_idx..].len() as i64])
            .to_device(device);
        let logits = model.forward(&x);
        // Only the last position's logits predict the next token.
        let last_logit = logits.get(0).get(-1);
        // Softmax turns logits into a probability distribution.
        // NOTE(review): tch's softmax also takes a Kind (dtype) argument — verify.
        let probs = Tensor::softmax(&last_logit, -1);
        let next_token = sample_from_probs(&probs);
        context.push(next_token);
    }
    decode(&context, itos)
}
// ==================== Training ====================
fn compute_loss(
model: &nn::Sequential,
x: &Tensor,
target: &Tensor,
block_size: usize,
vocab_size: usize,
) -> Tensor {
let logits = model.forward(x);
let logits_flat = logits.reshape(&[-1, vocab_size as i64]);
let target_flat = target.reshape(&[-1]);
Tensor::cross_entropy_loss(&logits_flat, &target_flat, None, 0.0, 0.0)
}
// Training loop: each iteration samples a random batch, computes the
// cross-entropy loss, and steps the optimizer; every `eval_interval`
// iterations the loss on a batch from the test split is printed.
// NOTE(review): several calls below are not part of the published tch API:
// `model.device()`, `model.var_store()`, `nn::Optimizer::new`,
// `model.set_mode(..)` and `Sequential::clone()` do not exist — tch builds
// optimizers from the VarStore that created the model, e.g.
// `nn::Adam::default().build(&vs, lr)`. The VarStore likely needs to be
// threaded through this function's interface. TODO confirm and rework.
fn train_model(
    model: &nn::Sequential,
    train_data: &[TokenId],
    test_data: &[TokenId],
    batch_size: usize,
    block_size: usize,
    vocab_size: usize,
    max_iters: usize,
    eval_interval: usize,
) -> nn::Sequential {
    // NOTE(review): unused, and not a Module/Sequential method — see above.
    let device = model.device();
    // Fixed learning rate 0.01, matching the notes.
    let mut opt = nn::Optimizer::new(&model.var_store(), 0.01);
    for iter in 0..max_iters {
        // Fresh random batch each iteration.
        let (x, y) = get_batch(train_data, batch_size, block_size);
        let loss = compute_loss(model, &x, &y, block_size, vocab_size);
        // Backprop and apply the update in one step.
        opt.backward_step(&loss);
        // Periodic evaluation on held-out data.
        if iter % eval_interval == 0 {
            let (eval_x, eval_y) = get_batch(test_data, batch_size, block_size);
            let eval_loss = compute_loss(model, &eval_x, &eval_y, block_size, vocab_size);
            println!("iter {}: loss = {:.4}", iter, eval_loss.double_value(&[]));
        }
    }
    // Switch to evaluation mode after training.
    model.set_mode(false);
    model.clone()
}
// ==================== Main ====================
// Entry point: build the character tokenizer from the corpus, train the
// embedding+linear model, then generate a short sample of text.
fn main() {
    // Load the corpus; panics if src/input.txt is missing.
    let text = std::fs::read_to_string("src/input.txt").unwrap();
    // Build the vocabulary: dedupe via HashSet, then sort via BTreeSet.
    // NOTE(review): `mut` on `chars` is unnecessary (it is consumed below).
    let mut chars = text.chars().collect::<HashSet<_>>();
    let vocab: Vec<char> = chars.into_iter().collect::<BTreeSet<_>>().into_iter().collect();
    // char → id table.
    let mut stoi: HashMap<char, TokenId> = HashMap::new();
    for (i, ch) in vocab.iter().enumerate() {
        stoi.insert(*ch, i as TokenId);
    }
    // id → char table (inverse of stoi; same enumeration order).
    let mut itos: HashMap<TokenId, char> = HashMap::new();
    for (i, ch) in vocab.iter().enumerate() {
        itos.insert(i as TokenId, *ch);
    }
    let vs = vocab_size(&stoi);
    println!("Vocab size: {}", vs);
    // Encode the full corpus into token ids.
    let tokens: Vec<TokenId> = encode(&text, &stoi);
    // 90% train / 10% test split by position.
    let split_idx = (tokens.len() as f32 * 0.9) as usize;
    let train_data = tokens[..split_idx].to_vec();
    let test_data = tokens[split_idx..].to_vec();
    // Hyperparameters.
    let batch_size = 4;
    let block_size = 8;
    let embedding_dim = 256;
    let max_iters = 12000;
    let eval_interval = 1000;
    // Prefer CUDA when available, otherwise CPU.
    let device = Device::cuda_if_available();
    println!("Using device: {:?}", device);
    // Build the embedding + linear model.
    let model = create_model(vs, embedding_dim, device);
    // Train the model.
    let trained_model = train_model(
        &model,
        &train_data,
        &test_data,
        batch_size,
        block_size,
        vs,
        max_iters,
        eval_interval,
    );
    // Generate 100 tokens continuing from the prompt "A".
    let prompt = encode("A", &stoi);
    let generated = generate(&trained_model, &itos, &prompt, 100, block_size);
    println!("Generated: {}", generated);
}
总结
| 组件 | 作用 | 关键参数 |
|---|---|---|
| Tokenizer | 字符 ↔ token ID 双向映射 | vocab_size = 66 |
| Batcher | 批次数据采样 | batch_size=4, block_size=8 |
| Model | 简单 embedding + linear | embedding_dim=256 |
| Generate | 自回归文本生成 | 逐 token 采样 |
| Train | 交叉熵损失 + 优化器 | lr=0.01, iter=12000 |
语言模型本质上是在学习语料中无数个"上文发展成下文"的事实,神经网络将这些事实的规律编码到参数中,从而具备泛化能力