前言
RNN虽然能处理序列数据,但有两个致命问题:
- 无法并行:必须按顺序计算,速度慢
- 长期依赖:即使有LSTM,长序列时早期信息还是会"遗忘"
2017年,Google提出了Transformer,用自注意力机制(Self-Attention)完全替代了RNN,开启了AI的新时代。
今天,我们用C语言从零实现一个简化版Transformer:
· 实现多头自注意力(Multi-Head Self-Attention)
· 实现前馈网络、层归一化、残差连接
· 实现位置编码
· 理解ChatGPT的底层原理
一、Transformer的核心原理
- 自注意力机制
核心问题:如何让模型知道句子中哪些词更重要?
```
句子:"我 爱 北京 天安门"
计算"爱"的注意力:
- 对"我"的关注度: 0.3
- 对"爱"的关注度: 0.5(自己)
- 对"北京"的关注度: 0.1
- 对"天安门"的关注度: 0.1
输出 = 0.3×"我" + 0.5×"爱" + 0.1×"北京" + 0.1×"天安门"
```
- 计算公式
```
Attention(Q,K,V) = softmax(Q·K^T / √d_k) · V
- Q(Query):查询,当前词要问"我该关注谁"
- K(Key):键,每个词的标签,用于匹配查询
- V(Value):值,每个词的实际信息
```
- Transformer结构
```
输入: "我 爱 你"
↓
词嵌入(Word Embedding) + 位置编码(Positional Encoding)
↓
┌─────────────────────────────────────┐
│ Transformer Block × N │
│ ┌─────────────────────────────┐ │
│ │ 多头自注意力 + 残差连接 │ │
│ │ ↓ │ │
│ │ 层归一化 │ │
│ └─────────────────────────────┘ │
│ ┌─────────────────────────────┐ │
│ │ 前馈网络 + 残差连接 │ │
│ │ ↓ │ │
│ │ 层归一化 │ │
│ └─────────────────────────────┘ │
└─────────────────────────────────────┘
↓
输出概率
```
二、完整代码实现
- 基础数据结构
```c
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <string.h>
#include <time.h>
// Dense 2-D float matrix; element (r, c) is stored at data[r * cols + c].
typedef struct {
int rows; // number of rows
int cols; // number of columns
float *data; // buffer of rows * cols floats
} matrix_t;
/*
 * Allocate a rows x cols matrix with every element set to zero.
 *
 * Returns a heap-allocated matrix that the caller releases with mat_free(),
 * or NULL when a dimension is negative or an allocation fails.
 */
matrix_t *mat_create(int rows, int cols) {
    if (rows < 0 || cols < 0) {
        return NULL;
    }

    matrix_t *m = malloc(sizeof *m);
    if (m == NULL) {
        return NULL;
    }

    size_t count = (size_t)rows * (size_t)cols;
    m->rows = rows;
    m->cols = cols;
    m->data = calloc(count, sizeof *m->data);
    /* calloc may legitimately return NULL for a zero-element request. */
    if (m->data == NULL && count != 0) {
        free(m);
        return NULL;
    }
    return m;
}
/* Release a matrix together with its element buffer; NULL is ignored. */
void mat_free(matrix_t *m) {
    if (m == NULL) {
        return;
    }
    free(m->data);
    free(m);
}
/*
 * Overwrite every element of m with a pseudo-random value from rand().
 * rand() / RAND_MAX lies in [0, 1]; it is shifted to [-0.5, 0.5] and then
 * stretched by 2 * scale.
 */
void mat_random(matrix_t *m, float scale) {
    const int count = m->rows * m->cols;
    float *cell = m->data;
    for (int n = 0; n < count; n++) {
        cell[n] = ((float)rand() / RAND_MAX - 0.5) * 2 * scale;
    }
}
/*
 * Copy src's rows * cols floats into dst's buffer.
 * The size is taken from src only; dst's dimensions are not checked.
 */
void mat_copy(matrix_t *src, matrix_t *dst) {
    const size_t bytes = src->rows * src->cols * sizeof(float);
    memcpy(dst->data, src->data, bytes);
}
/*
 * Matrix product C = A x B.
 *
 * A is [n, k], B is [k, m], and C must already be allocated as [n, m].
 * C must not be the same matrix as A or B, because C is written while A and
 * B are still being read.
 *
 * If the shapes are incompatible, a message is printed to stderr and C is
 * left untouched; multiplying anyway would index past the end of a buffer.
 */
void mat_mul(matrix_t *A, matrix_t *B, matrix_t *C) {
    if (A->cols != B->rows || C->rows != A->rows || C->cols != B->cols) {
        fprintf(stderr,
                "mat_mul: shape mismatch [%d,%d] x [%d,%d] -> [%d,%d]\n",
                A->rows, A->cols, B->rows, B->cols, C->rows, C->cols);
        return;
    }

    for (int i = 0; i < A->rows; i++) {
        for (int j = 0; j < B->cols; j++) {
            float sum = 0;
            for (int k = 0; k < A->cols; k++) {
                sum += A->data[i * A->cols + k] * B->data[k * B->cols + j];
            }
            C->data[i * C->cols + j] = sum;
        }
    }
}
/*
 * Element-wise sum C = A + B over A's rows * cols elements.
 * Each element is read before the same index is written, so C may be the
 * same matrix as A or B.
 */
void mat_add(matrix_t *A, matrix_t *B, matrix_t *C) {
    const int total = A->rows * A->cols;
    const float *lhs = A->data;
    const float *rhs = B->data;
    float *dst = C->data;
    for (int n = 0; n < total; n++) {
        dst[n] = lhs[n] + rhs[n];
    }
}
/* Multiply every element of m by scale, in place. */
void mat_scale(matrix_t *m, float scale) {
    float *cell = m->data;
    int remaining = m->rows * m->cols;
    while (remaining-- > 0) {
        *cell *= scale;
        cell++;
    }
}
/*
 * Row-wise softmax, in place.
 * The row maximum is subtracted before exponentiation so that expf() never
 * receives a positive argument; the row is then divided by the sum of its
 * exponentials.
 */
void mat_softmax(matrix_t *m) {
    for (int r = 0; r < m->rows; r++) {
        float *row = m->data + r * m->cols;

        /* Pass 1: largest entry of the row. */
        float peak = row[0];
        for (int c = 1; c < m->cols; c++) {
            if (row[c] > peak) {
                peak = row[c];
            }
        }

        /* Pass 2: exponentiate the shifted values and accumulate their sum. */
        float total = 0;
        for (int c = 0; c < m->cols; c++) {
            row[c] = expf(row[c] - peak);
            total += row[c];
        }

        /* Pass 3: normalise. */
        for (int c = 0; c < m->cols; c++) {
            row[c] /= total;
        }
    }
}
```
- 位置编码
```c
// 位置编码(让模型知道词的位置信息)
matrix_t *positional_encoding(int seq_len, int d_model) {
matrix_t *pe = mat_create(seq_len, d_model);
for (int pos = 0; pos < seq_len; pos++) {
for (int i = 0; i < d_model; i++) {
float angle = pos / powf(10000.0, 2.0 * i / d_model);
if (i % 2 == 0) {
pe->data[pos * d_model + i] = sinf(angle);
} else {
pe->data[pos * d_model + i] = cosf(angle);
}
}
}
return pe;
}
```
- 自注意力机制
```c
// Single attention head: projection weights plus the dimensions it was created with.
typedef struct {
matrix_t *W_q; // query weights; attn_head_forward computes Q [seq_len, d_k] = x [seq_len, d_model] * W_q
matrix_t *W_k; // key weights; K [seq_len, d_k] = x * W_k
matrix_t *W_v; // value weights; V [seq_len, d_v] = x * W_v
matrix_t *W_o; // output weights; output [seq_len, d_model] = attention [seq_len, d_v] * W_o
int d_model; // width of input and output rows
int d_k; // width of query/key rows
int d_v; // width of value rows
} attention_head_t;
/*
 * Allocate one attention head and randomly initialise its weights.
 *
 * The weight shapes match how attn_head_forward() multiplies them
 * (input on the left, weight on the right, no transpose):
 *   W_q, W_k : [d_model, d_k]
 *   W_v      : [d_model, d_v]
 *   W_o      : [d_v, d_model]
 *
 * Returns NULL if the head structure cannot be allocated; otherwise the
 * caller releases the head with attn_head_free().
 */
attention_head_t *attn_head_create(int d_model, int d_k, int d_v) {
    attention_head_t *head = malloc(sizeof *head);
    if (head == NULL) {
        return NULL;
    }
    head->d_model = d_model;
    head->d_k = d_k;
    head->d_v = d_v;

    /* Rows = width of the left operand, cols = width of the product. */
    head->W_q = mat_create(d_model, d_k);
    head->W_k = mat_create(d_model, d_k);
    head->W_v = mat_create(d_model, d_v);
    head->W_o = mat_create(d_v, d_model);

    /* Uniform initialisation in [-scale, scale]. */
    float scale = sqrt(2.0f / d_model);
    mat_random(head->W_q, scale);
    mat_random(head->W_k, scale);
    mat_random(head->W_v, scale);
    mat_random(head->W_o, scale);
    return head;
}
/* Release an attention head and its four weight matrices; NULL is ignored. */
void attn_head_free(attention_head_t *head) {
    if (head == NULL) {
        return;
    }
    mat_free(head->W_q);
    mat_free(head->W_k);
    mat_free(head->W_v);
    mat_free(head->W_o);
    free(head);
}
/*
 * Forward pass of a single attention head.
 *
 * x is [seq_len, d_model]. Steps, all via mat_mul with the weight as the
 * right-hand operand (no transpose is applied to the weights):
 *   Q = x * W_q,  K = x * W_k,  V = x * W_v
 *   scores = softmax(Q * K^T / sqrt(d_k))     (row-wise)
 *   output = (scores * V) * W_o
 *
 * Returns a newly allocated [seq_len, d_model] matrix owned by the caller.
 */
matrix_t *attn_head_forward(attention_head_t *head, matrix_t *x) {
    const int n = x->rows;
    const int dk = head->d_k;

    /* Project the input into query, key and value spaces. */
    matrix_t *Q = mat_create(n, dk);
    matrix_t *K = mat_create(n, dk);
    matrix_t *V = mat_create(n, head->d_v);
    mat_mul(x, head->W_q, Q);
    mat_mul(x, head->W_k, K);
    mat_mul(x, head->W_v, V);

    /* Materialise K^T so that mat_mul can form Q * K^T. */
    matrix_t *K_t = mat_create(dk, n);
    for (int col = 0; col < dk; col++) {
        for (int row = 0; row < n; row++) {
            K_t->data[col * n + row] = K->data[row * dk + col];
        }
    }

    /* Scaled dot-product scores, normalised per query row. */
    matrix_t *scores = mat_create(n, n);
    mat_mul(Q, K_t, scores);
    mat_free(K_t);
    mat_scale(scores, 1.0 / sqrt(head->d_k));
    mat_softmax(scores);

    /* Mix the values with the attention weights. */
    matrix_t *weighted = mat_create(n, head->d_v);
    mat_mul(scores, V, weighted);

    /* Project back to the model width. */
    matrix_t *output = mat_create(n, head->d_model);
    mat_mul(weighted, head->W_o, output);

    mat_free(weighted);
    mat_free(scores);
    mat_free(V);
    mat_free(K);
    mat_free(Q);
    return output;
}
```
- 多头注意力
```c
// Multi-head attention layer: an array of independent attention heads.
typedef struct {
attention_head_t **heads; // array of num_heads head pointers
int num_heads; // number of entries in heads
int d_model; // model width passed to every head
int d_k; // per-head width; mh_attn_create sets it to d_model / num_heads
} multihead_attention_t;
/*
 * Build a multi-head attention layer with num_heads heads.
 * The per-head width is d_model / num_heads (integer division) and is used
 * for both d_k and d_v of every head.
 * The caller releases the layer with mh_attn_free().
 */
multihead_attention_t *mh_attn_create(int d_model, int num_heads) {
    multihead_attention_t *mha = malloc(sizeof(multihead_attention_t));
    mha->num_heads = num_heads;
    mha->d_model = d_model;
    mha->d_k = d_model / num_heads;

    mha->heads = malloc(sizeof(attention_head_t*) * num_heads);
    const int head_dim = mha->d_k;
    int idx = 0;
    while (idx < num_heads) {
        mha->heads[idx] = attn_head_create(d_model, head_dim, head_dim);
        idx++;
    }
    return mha;
}
/* Release a multi-head attention layer and every head it owns; NULL is ignored. */
void mh_attn_free(multihead_attention_t *mha) {
    if (mha == NULL) {
        return;
    }
    for (int h = 0; h < mha->num_heads; h++) {
        attn_head_free(mha->heads[h]);
    }
    free(mha->heads);
    free(mha);
}
/*
 * Multi-head attention forward pass.
 *
 * Every head applies its own W_o and returns a [seq_len, d_model] matrix, so
 * the heads are combined by element-wise summation. This is equivalent to
 * the usual Concat(head_1..head_h) * W_O with W_O split row-wise into the
 * per-head W_o blocks.
 *
 * Returns a new [seq_len, d_model] matrix; the caller frees it with
 * mat_free().
 */
matrix_t *mh_attn_forward(multihead_attention_t *mha, matrix_t *x) {
    int seq_len = x->rows;
    int d_model = mha->d_model;

    /* mat_create zero-fills the buffer, so it can serve as the accumulator. */
    matrix_t *combined = mat_create(seq_len, d_model);
    for (int h = 0; h < mha->num_heads; h++) {
        /* out has d_model columns per row; it must be read with a row stride
         * of d_model, never d_k. */
        matrix_t *out = attn_head_forward(mha->heads[h], x);
        mat_add(combined, out, combined);
        mat_free(out);
    }
    return combined;
}
```
- 前馈网络
```c
// Position-wise feed-forward network parameters.
typedef struct {
matrix_t *W1; // first projection; ff_forward multiplies x by W1 into a [seq_len, d_ff] buffer
matrix_t *W2; // second projection; ff_forward multiplies the hidden buffer by W2 into [seq_len, d_model]
matrix_t *b1; // bias added to each hidden row; ff_forward reads entries 0 .. d_ff-1
matrix_t *b2; // bias added to each output row; ff_forward reads entries 0 .. d_model-1
int d_model; // width of input and output rows
int d_ff; // width of the hidden layer
} feed_forward_t;
/*
 * Allocate a feed-forward network and randomly initialise its weights.
 *
 * The weight shapes match how ff_forward() multiplies them (input on the
 * left, weight on the right):
 *   W1 : [d_model, d_ff]     hidden = x * W1
 *   W2 : [d_ff, d_model]     output = hidden * W2
 * The biases b1 (d_ff entries) and b2 (d_model entries) start at zero because
 * mat_create() zero-fills its buffer.
 *
 * Returns NULL if the structure cannot be allocated; otherwise the caller
 * releases it with ff_free().
 */
feed_forward_t *ff_create(int d_model, int d_ff) {
    feed_forward_t *ff = malloc(sizeof *ff);
    if (ff == NULL) {
        return NULL;
    }
    ff->d_model = d_model;
    ff->d_ff = d_ff;

    /* Rows = width of the left operand, cols = width of the product. */
    ff->W1 = mat_create(d_model, d_ff);
    ff->W2 = mat_create(d_ff, d_model);
    ff->b1 = mat_create(d_ff, 1);
    ff->b2 = mat_create(d_model, 1);

    float scale = sqrt(2.0f / d_model);
    mat_random(ff->W1, scale);
    mat_random(ff->W2, scale);
    return ff;
}
/* Release a feed-forward network and its weight and bias matrices; NULL is ignored. */
void ff_free(feed_forward_t *ff) {
    if (ff == NULL) {
        return;
    }
    mat_free(ff->W1);
    mat_free(ff->W2);
    mat_free(ff->b1);
    mat_free(ff->b2);
    free(ff);
}
/* ReLU activation: returns x when x is strictly positive, otherwise 0. */
float relu(float x) {
    if (x > 0) {
        return x;
    }
    return 0;
}
/* Apply ReLU to every element of m in place: anything not strictly positive becomes 0. */
void mat_relu(matrix_t *m) {
    const int count = m->rows * m->cols;
    float *cell = m->data;
    for (int n = 0; n < count; n++) {
        if (!(cell[n] > 0)) {
            cell[n] = 0;
        }
    }
}
/*
 * Feed-forward network forward pass: FFN(x) = max(0, x*W1 + b1) * W2 + b2.
 *
 * x is [seq_len, d_model]. Returns a newly allocated [seq_len, d_model]
 * matrix owned by the caller.
 */
matrix_t *ff_forward(feed_forward_t *ff, matrix_t *x) {
    const int n_rows = x->rows;
    const int hidden_width = ff->d_ff;
    const int out_width = ff->d_model;

    /* First layer: projection, per-row bias, then ReLU. */
    matrix_t *hidden = mat_create(n_rows, hidden_width);
    mat_mul(x, ff->W1, hidden);
    for (int r = 0; r < n_rows; r++) {
        float *row = hidden->data + r * hidden_width;
        for (int c = 0; c < hidden_width; c++) {
            row[c] += ff->b1->data[c];
        }
    }
    mat_relu(hidden);

    /* Second layer: projection back to the model width, plus bias. */
    matrix_t *output = mat_create(n_rows, out_width);
    mat_mul(hidden, ff->W2, output);
    for (int r = 0; r < n_rows; r++) {
        float *row = output->data + r * out_width;
        for (int c = 0; c < out_width; c++) {
            row[c] += ff->b2->data[c];
        }
    }

    mat_free(hidden);
    return output;
}
```
- 层归一化
```c
// Layer normalisation parameters.
typedef struct {
float *gamma; // per-column scale, size entries (ln_create sets each to 1.0)
float *beta; // per-column shift, size entries (ln_create sets each to 0.0)
float eps; // added to the variance before the square root in ln_forward
int size; // number of entries in gamma and beta
} layer_norm_t;
/*
 * Allocate a layer-norm parameter set for rows of width size.
 * gamma starts at 1 and beta at 0, so the layer initially only normalises.
 * The caller releases it with ln_free().
 */
layer_norm_t *ln_create(int size, float eps) {
    layer_norm_t *ln = malloc(sizeof(layer_norm_t));
    ln->size = size;
    ln->eps = eps;

    float *scale = malloc(size * sizeof(float));
    float *shift = malloc(size * sizeof(float));
    ln->gamma = scale;
    ln->beta = shift;

    for (int k = 0; k < size; k++) {
        scale[k] = 1.0;
        shift[k] = 0.0;
    }
    return ln;
}
/* Release a layer-norm parameter set; NULL is ignored. */
void ln_free(layer_norm_t *ln) {
    if (ln == NULL) {
        return;
    }
    free(ln->gamma);
    free(ln->beta);
    free(ln);
}
/*
 * Layer normalisation, applied independently to each row of x:
 *   y = gamma * (x - mean) / sqrt(var + eps) + beta
 * where mean and var (population variance) are taken over the row.
 *
 * Returns a newly allocated matrix with x's shape, owned by the caller.
 */
matrix_t *ln_forward(layer_norm_t *ln, matrix_t *x) {
    const int n_rows = x->rows;
    const int width = x->cols;
    matrix_t *output = mat_create(n_rows, width);

    for (int r = 0; r < n_rows; r++) {
        const float *in = x->data + r * width;
        float *out = output->data + r * width;

        /* Row mean. */
        float mean = 0;
        for (int c = 0; c < width; c++) {
            mean += in[c];
        }
        mean /= width;

        /* Row variance (divides by width, not width - 1). */
        float var = 0;
        for (int c = 0; c < width; c++) {
            float diff = in[c] - mean;
            var += diff * diff;
        }
        var /= width;

        /* eps keeps the divisor away from zero for constant rows. */
        float std = sqrtf(var + ln->eps);

        for (int c = 0; c < width; c++) {
            float normalized = (in[c] - mean) / std;
            out[c] = ln->gamma[c] * normalized + ln->beta[c];
        }
    }
    return output;
}
```
- Transformer块
```c
// One Transformer block: attention and feed-forward sublayers, each followed by a layer norm.
typedef struct {
multihead_attention_t *mha; // self-attention sublayer
feed_forward_t *ff; // feed-forward sublayer
layer_norm_t *ln1; // applied to x + attention output in block_forward
layer_norm_t *ln2; // applied to the first norm's output + feed-forward output in block_forward
int d_model; // model width
} transformer_block_t;
/*
 * Allocate a Transformer block: one multi-head attention layer, one
 * feed-forward network and two layer norms (eps = 1e-5).
 * The caller releases it with block_free().
 */
transformer_block_t *block_create(int d_model, int num_heads, int d_ff) {
    transformer_block_t *block = malloc(sizeof(transformer_block_t));

    /* Attention is created before the feed-forward net; both draw from rand(). */
    block->mha = mh_attn_create(d_model, num_heads);
    block->ff = ff_create(d_model, d_ff);

    block->ln1 = ln_create(d_model, 1e-5);
    block->ln2 = ln_create(d_model, 1e-5);
    block->d_model = d_model;
    return block;
}
/* Release a Transformer block and all of its sublayers; NULL is ignored. */
void block_free(transformer_block_t *block) {
    if (block == NULL) {
        return;
    }
    mh_attn_free(block->mha);
    ff_free(block->ff);
    ln_free(block->ln1);
    ln_free(block->ln2);
    free(block);
}
// Transformer块前向传播(带残差连接)
matrix_t *block_forward(transformer_block_t *block, matrix_t *x) {
// 多头自注意力 + 残差连接 + 层归一化
matrix_t *attn_out = mh_attn_forward(block->mha, x);
matrix_t *residual1 = mat_create(x->rows, x->cols);
mat_add(x, attn_out, residual1);
mat_free(attn_out);
matrix_t *norm1 = ln_forward(block->ln1, residual1);
mat_free(residual1);
// 前馈网络 + 残差连接 + 层归一化
matrix_t *ff_out = ff_forward(block->ff, norm1);
matrix_t *residual2 = mat_create(norm1->rows, norm1->cols);
mat_add(norm1, ff_out, residual2);
mat_free(ff_out);
mat_free(norm1);
matrix_t *output = ln_forward(block->ln2, residual2);
mat_free(residual2);
return output;
}
```
- 完整Transformer
```c
// Whole model: a stack of Transformer blocks plus the sizes it was created with.
typedef struct {
transformer_block_t **blocks; // array of num_blocks block pointers, applied in index order
int num_blocks; // number of entries in blocks
int d_model; // model width
int vocab_size; // vocabulary size; bounds the index search in transformer_predict
} transformer_t;
/*
 * Allocate a model made of num_blocks identical-shape Transformer blocks.
 * The caller releases it with transformer_free().
 */
transformer_t *transformer_create(int vocab_size, int d_model,
                                  int num_heads, int num_blocks, int d_ff) {
    transformer_t *model = malloc(sizeof(transformer_t));
    model->vocab_size = vocab_size;
    model->d_model = d_model;
    model->num_blocks = num_blocks;

    model->blocks = malloc(sizeof(transformer_block_t*) * num_blocks);
    int layer = 0;
    while (layer < num_blocks) {
        model->blocks[layer] = block_create(d_model, num_heads, d_ff);
        layer++;
    }
    return model;
}
/* Release a model and every block it owns; NULL is ignored. */
void transformer_free(transformer_t *model) {
    if (model == NULL) {
        return;
    }
    for (int layer = 0; layer < model->num_blocks; layer++) {
        block_free(model->blocks[layer]);
    }
    free(model->blocks);
    free(model);
}
/*
 * Placeholder embedding layer: returns a new [seq_len, d_model] matrix filled
 * with rand()-based values in [-0.5, 0.5].
 *
 * tokens and vocab_size are accepted but not read, so the result does not
 * depend on the token ids; a trained model would look rows up in a learned
 * table instead.
 */
matrix_t *word_embedding(int *tokens, int seq_len, int vocab_size, int d_model) {
    (void)tokens;
    (void)vocab_size;

    matrix_t *embed = mat_create(seq_len, d_model);
    for (int pos = 0; pos < seq_len; pos++) {
        float *row = embed->data + pos * d_model;
        for (int dim = 0; dim < d_model; dim++) {
            row[dim] = ((float)rand() / RAND_MAX - 0.5);
        }
    }
    return embed;
}
/*
 * Full forward pass: embedding + positional encoding, then every block in
 * order. Returns the final [seq_len, d_model] hidden states; the caller
 * frees them with mat_free().
 */
matrix_t *transformer_forward(transformer_t *model, int *tokens, int seq_len) {
    matrix_t *hidden = word_embedding(tokens, seq_len, model->vocab_size, model->d_model);

    /* Add position information in place. */
    matrix_t *pos = positional_encoding(seq_len, model->d_model);
    mat_add(hidden, pos, hidden);
    mat_free(pos);

    /* Each block returns a fresh matrix; release the previous one as we go. */
    transformer_block_t **blk = model->blocks;
    for (int left = model->num_blocks; left > 0; left--, blk++) {
        matrix_t *next = block_forward(*blk, hidden);
        mat_free(hidden);
        hidden = next;
    }
    return hidden;
}
/*
 * Predict the next token id as the arg-max over the last position's output.
 *
 * The model has no output projection to the vocabulary, so the final hidden
 * vector (d_model values) stands in for the logits. Only
 * min(vocab_size, d_model) entries exist to compare; scanning vocab_size
 * entries unconditionally would read past the end of the row whenever
 * vocab_size > d_model.
 *
 * Returns the winning index, or -1 when seq_len <= 0 or d_model <= 0 (there
 * is no last position, or no value, to inspect).
 */
int transformer_predict(transformer_t *model, int *tokens, int seq_len) {
    if (seq_len <= 0 || model->d_model <= 0) {
        return -1;
    }

    matrix_t *output = transformer_forward(model, tokens, seq_len);
    const float *last_row = output->data + (seq_len - 1) * model->d_model;

    /* Never look beyond the d_model values that the row actually holds. */
    int limit = model->vocab_size < model->d_model ? model->vocab_size
                                                   : model->d_model;

    float max_prob = last_row[0];
    int pred = 0;
    for (int i = 1; i < limit; i++) {
        if (last_row[i] > max_prob) {
            max_prob = last_row[i];
            pred = i;
        }
    }

    mat_free(output);
    return pred;
}
```
- 简单测试
```c
/*
 * Demo driver: builds a small model, prints its configuration, runs one
 * prediction on a fixed token sequence and prints what the simplified
 * implementation leaves out.
 */
int main() {
    const int vocab_size = 10;
    const int d_model = 32;
    const int num_heads = 4;
    const int num_blocks = 2;
    const int d_ff = 64;

    /* Fixed demo input; the sequence length is derived from the array. */
    int tokens[] = {1, 2, 3, 4, 5};
    const int seq_len = (int)(sizeof tokens / sizeof tokens[0]);

    fputs("=== Transformer 模型测试 ===\n\n", stdout);
    srand(time(NULL));

    transformer_t *model = transformer_create(vocab_size, d_model,
                                              num_heads, num_blocks, d_ff);

    printf("模型创建完成:\n"
           " 词表大小: %d\n"
           " 嵌入维度: %d\n"
           " 注意力头数: %d\n"
           " Transformer块数: %d\n"
           " 前馈网络维度: %d\n"
           " 序列长度: %d\n\n",
           vocab_size, d_model, num_heads, num_blocks, d_ff, seq_len);

    fputs("输入序列: ", stdout);
    for (int i = 0; i < seq_len; i++) {
        printf("%d ", tokens[i]);
    }
    fputs("\n", stdout);

    int pred = transformer_predict(model, tokens, seq_len);
    printf("预测下一个词: %d\n", pred);
    transformer_free(model);

    fputs("\n=== 注意 ===\n"
          "这是简化版实现,完整Transformer还需要:\n"
          "1. 交叉熵损失 + 反向传播\n"
          "2. 掩码注意力(masked attention)\n"
          "3. 编码器-解码器结构(用于翻译)\n"
          "4. Adam优化器 + 学习率预热\n",
          stdout);
    return 0;
}
```
三、Transformer vs RNN对比
| 特性 | RNN/LSTM | Transformer |
| --- | --- | --- |
| 并行计算 | ❌ 顺序 | ✅ 完全并行 |
| 长期依赖 | ❌ 会遗忘 | ✅ 全局注意力 |
| 训练速度 | 慢 | 快 |
| 参数量 | 少 | 多 |
| 推理速度 | 快 | 慢(需要缓存KV) |
| 内存占用 | 小 | 大(O(n²)注意力) |
四、现代Transformer变体
| 模型 | 创新点 | 代表产品 |
| --- | --- | --- |
| BERT | 双向注意力 | 谷歌搜索 |
| GPT | 单向自回归 | ChatGPT |
| T5 | 统一框架 | 谷歌翻译 |
| RoBERTa | 改进训练 | Facebook |
| ALBERT | 参数共享 | 轻量级 |
五、总结
通过这篇文章,你学会了:
· Transformer的核心原理(自注意力 + 多头 + 残差 + 层归一化)
· 完整的多头自注意力实现
· 前馈网络、位置编码、层归一化
· 完整的Transformer块和模型
· ChatGPT的底层架构理解
Transformer是当前大语言模型的基石。虽然完整实现有更多细节(掩码、反向传播、训练),但核心思想就是这个。
下一篇预告:《大语言模型(LLM)入门:从Transformer到ChatGPT》
评论区分享一下你对Transformer的理解~