从零实现Transformer:从注意力机制到ChatGPT

前言

RNN虽然能处理序列数据,但有两个致命问题:

  1. 无法并行:必须按顺序计算,速度慢

  2. 长期依赖:即使有LSTM,长序列时早期信息还是会"遗忘"

2017年,Google提出了Transformer,用自注意力机制(Self-Attention)完全替代了RNN,开启了AI的新时代。

今天,我们用C语言从零实现一个简化版Transformer:

· 实现多头自注意力(Multi-Head Self-Attention)

· 实现前馈网络、层归一化、残差连接

· 实现位置编码

· 理解ChatGPT的底层原理


一、Transformer的核心原理

  1. 自注意力机制

核心问题:如何让模型知道句子中哪些词更重要?

```

句子:"我 爱 北京 天安门"

计算"爱"的注意力:

  • 对"我"的关注度: 0.3

  • 对"爱"的关注度: 0.5(自己)

  • 对"北京"的关注度: 0.1

  • 对"天安门"的关注度: 0.1

输出 = 0.3×"我" + 0.5×"爱" + 0.1×"北京" + 0.1×"天安门"

```

  2. 计算公式

```

Attention(Q,K,V) = softmax(Q·K^T / √d_k) · V

  • Q(Query):查询,当前词要问"我该关注谁"

  • K(Key):键,每个词的标签,用于匹配查询

  • V(Value):值,每个词的实际信息

```

  3. Transformer结构

```

输入: "我 爱 你"

词嵌入(Word Embedding) + 位置编码(Positional Encoding)

┌─────────────────────────────────────┐

│ Transformer Block × N │

│ ┌─────────────────────────────┐ │

│ │ 多头自注意力 + 残差连接 │ │

│ │ ↓ │ │

│ │ 层归一化 │ │

│ └─────────────────────────────┘ │

│ ┌─────────────────────────────┐ │

│ │ 前馈网络 + 残差连接 │ │

│ │ ↓ │ │

│ │ 层归一化 │ │

│ └─────────────────────────────┘ │

└─────────────────────────────────────┘

输出概率

```


二、完整代码实现

  1. 基础数据结构

```c

#include <stdio.h>

#include <stdlib.h>

#include <math.h>

#include <string.h>

#include <time.h>

// Dense row-major matrix of single-precision floats.
typedef struct {
    int rows, cols;  // dimensions; element (r, c) is stored at data[r * cols + c]
    float *data;     // buffer of rows * cols values, allocated by mat_create()
} matrix_t;

// Allocate a rows x cols matrix with every element set to zero.
//
// Returns NULL when a dimension is negative, when rows * cols does not fit
// in size_t, or when an allocation fails; nothing is leaked on those paths.
// The caller owns the result and releases it with mat_free().
matrix_t *mat_create(int rows, int cols) {
    if (rows < 0 || cols < 0) {
        return NULL;
    }

    // Multiply in size_t so a large rows * cols cannot overflow int.
    if (cols != 0 && (size_t)rows > (size_t)-1 / (size_t)cols) {
        return NULL;
    }
    size_t count = (size_t)rows * (size_t)cols;

    matrix_t *m = malloc(sizeof *m);
    if (m == NULL) {
        return NULL;
    }

    // calloc(0, ...) may legally return NULL; request one element instead so
    // that a NULL result always means "out of memory".
    m->data = calloc(count != 0 ? count : 1, sizeof *m->data);
    if (m->data == NULL) {
        free(m);
        return NULL;
    }

    m->rows = rows;
    m->cols = cols;
    return m;
}

// Release a matrix together with its element buffer. NULL is ignored.
void mat_free(matrix_t *m) {
    if (m == NULL) {
        return;
    }
    free(m->data);
    free(m);
}

// Overwrite every element with a rand()-derived value spread over
// roughly [-scale, scale].
void mat_random(matrix_t *m, float scale) {
    const int total = m->rows * m->cols;
    float *cell = m->data;

    for (int n = 0; n < total; n++) {
        cell[n] = ((float)rand() / RAND_MAX - 0.5) * 2 * scale;
    }
}

// Copy src's rows * cols floats into dst's buffer. dst's own dimensions are
// neither checked nor updated, so dst must already be at least as large.
void mat_copy(matrix_t *src, matrix_t *dst) {
    size_t bytes = src->rows * src->cols * sizeof(float);
    memcpy(dst->data, src->data, bytes);
}

// Matrix product C = A x B.
//
// A must be [n, k], B must be [k, m] and C must be [n, m]. C must not share
// its buffer with A or B, because C is written while A and B are still read.
// When the shapes do not line up, a diagnostic goes to stderr and C is left
// untouched rather than indexing past the end of a buffer.
void mat_mul(matrix_t *A, matrix_t *B, matrix_t *C) {
    if (A->cols != B->rows || C->rows != A->rows || C->cols != B->cols) {
        fprintf(stderr,
                "mat_mul: shape mismatch [%d,%d] x [%d,%d] -> [%d,%d]\n",
                A->rows, A->cols, B->rows, B->cols, C->rows, C->cols);
        return;
    }

    const int inner = A->cols;
    const int out_cols = B->cols;

    for (int i = 0; i < A->rows; i++) {
        const float *a_row = A->data + i * inner;
        float *c_row = C->data + i * out_cols;

        for (int j = 0; j < out_cols; j++) {
            float sum = 0;
            for (int k = 0; k < inner; k++) {
                sum += a_row[k] * B->data[k * out_cols + j];
            }
            c_row[j] = sum;
        }
    }
}

// Element-wise sum C = A + B over A's rows * cols entries.
// C may be the same matrix as A or B: each element is read before it is written.
void mat_add(matrix_t *A, matrix_t *B, matrix_t *C) {
    const int total = A->rows * A->cols;
    const float *lhs = A->data;
    const float *rhs = B->data;
    float *dst = C->data;

    for (int n = 0; n < total; n++) {
        dst[n] = lhs[n] + rhs[n];
    }
}

// Multiply every element of m by scale, in place.
void mat_scale(matrix_t *m, float scale) {
    for (int n = m->rows * m->cols; n-- > 0; ) {
        m->data[n] *= scale;
    }
}

// Row-wise softmax, in place.
// The row maximum is subtracted before exponentiation so expf() never sees a
// positive argument.
void mat_softmax(matrix_t *m) {
    const int width = m->cols;

    for (int r = 0; r < m->rows; r++) {
        float *row = m->data + r * width;

        // Pass 1: largest entry of the row.
        float peak = row[0];
        for (int c = 1; c < width; c++) {
            if (row[c] > peak) {
                peak = row[c];
            }
        }

        // Pass 2: exponentiate and accumulate the normaliser.
        float total = 0;
        for (int c = 0; c < width; c++) {
            row[c] = expf(row[c] - peak);
            total += row[c];
        }

        // Pass 3: normalise so the row sums to one.
        for (int c = 0; c < width; c++) {
            row[c] /= total;
        }
    }
}

```

  2. 位置编码

```c

// Sinusoidal positional encoding (gives the model access to token order).
//
// Returns a new [seq_len, d_model] matrix where, for pair index p = i / 2,
//   PE(pos, 2p)     = sin(pos / 10000^(2p / d_model))
//   PE(pos, 2p + 1) = cos(pos / 10000^(2p / d_model))
// Both members of a sin/cos pair share one frequency, so the exponent is
// built from the pair index, not from the raw column index.
// Returns NULL if the matrix cannot be allocated. The caller frees the
// result with mat_free().
matrix_t *positional_encoding(int seq_len, int d_model) {
    matrix_t *pe = mat_create(seq_len, d_model);
    if (pe == NULL) {
        return NULL;
    }

    for (int pos = 0; pos < seq_len; pos++) {
        float *row = pe->data + pos * d_model;

        for (int i = 0; i < d_model; i++) {
            // 2 * (i / 2) rounds the column down to the even member of its pair.
            float exponent = (float)(2 * (i / 2)) / (float)d_model;
            float angle = (float)pos / powf(10000.0f, exponent);

            row[i] = (i % 2 == 0) ? sinf(angle) : cosf(angle);
        }
    }

    return pe;
}

```

  3. 自注意力机制

```c

// One attention head: projection weights plus the dimensions they were built for.
// attn_head_forward() right-multiplies activations by these matrices, which is
// what determines the layouts noted on each field.
typedef struct {
    matrix_t *W_q;  // query projection,  used as [d_model, d_k]
    matrix_t *W_k;  // key projection,    used as [d_model, d_k]
    matrix_t *W_v;  // value projection,  used as [d_model, d_v]
    matrix_t *W_o;  // output projection, used as [d_v, d_model]
    int d_model;    // width of the model's hidden vectors
    int d_k;        // width of the query/key vectors
    int d_v;        // width of the value vectors
} attention_head_t;

// Build one attention head with randomly initialised weights.
//
// attn_head_forward() computes x * W with x shaped [seq_len, d_model], so the
// input projections are stored as [d_model, d_k] / [d_model, d_v] and the
// output projection as [d_v, d_model]. Any other layout makes mat_mul walk
// off the end of its buffers.
//
// Returns NULL if any allocation fails. Release with attn_head_free().
attention_head_t *attn_head_create(int d_model, int d_k, int d_v) {
    attention_head_t *head = malloc(sizeof *head);
    if (head == NULL) {
        return NULL;
    }

    head->d_model = d_model;
    head->d_k = d_k;
    head->d_v = d_v;

    head->W_q = mat_create(d_model, d_k);
    head->W_k = mat_create(d_model, d_k);
    head->W_v = mat_create(d_model, d_v);
    head->W_o = mat_create(d_v, d_model);

    if (head->W_q == NULL || head->W_k == NULL ||
        head->W_v == NULL || head->W_o == NULL) {
        // mat_free() ignores NULL, so the partially built head unwinds safely.
        mat_free(head->W_q);
        mat_free(head->W_k);
        mat_free(head->W_v);
        mat_free(head->W_o);
        free(head);
        return NULL;
    }

    float scale = sqrt(2.0f / d_model);
    mat_random(head->W_q, scale);
    mat_random(head->W_k, scale);
    mat_random(head->W_v, scale);
    mat_random(head->W_o, scale);

    return head;
}

// Release an attention head and its four weight matrices. NULL is ignored.
void attn_head_free(attention_head_t *head) {
    if (head == NULL) {
        return;
    }

    matrix_t *owned[] = { head->W_q, head->W_k, head->W_v, head->W_o };
    for (size_t n = 0; n < sizeof owned / sizeof owned[0]; n++) {
        mat_free(owned[n]);
    }

    free(head);
}

// Forward pass of a single attention head.
//
// x is [seq_len, d_model]. Computes
//   Q = x W_q,  K = x W_k,  V = x W_v
//   output = softmax(Q K^T / sqrt(d_k)) V W_o
// and returns a newly allocated [seq_len, d_model] matrix that the caller
// releases with mat_free().
matrix_t *attn_head_forward(attention_head_t *head, matrix_t *x) {
    const int n = x->rows;
    const int dk = head->d_k;

    // Project the input into query, key and value space.
    matrix_t *Q = mat_create(n, dk);
    matrix_t *K = mat_create(n, dk);
    matrix_t *V = mat_create(n, head->d_v);
    mat_mul(x, head->W_q, Q);
    mat_mul(x, head->W_k, K);
    mat_mul(x, head->W_v, V);

    // mat_mul has no transposed variant, so materialise K^T as [d_k, seq_len].
    matrix_t *K_t = mat_create(dk, n);
    for (int r = 0; r < n; r++) {
        for (int c = 0; c < dk; c++) {
            K_t->data[c * n + r] = K->data[r * dk + c];
        }
    }

    // Scaled dot-product scores, normalised per query row.
    matrix_t *scores = mat_create(n, n);
    mat_mul(Q, K_t, scores);
    mat_free(K_t);
    mat_scale(scores, 1.0 / sqrt(head->d_k));
    mat_softmax(scores);

    // Weighted mix of the values, then projection back to model width.
    matrix_t *attention = mat_create(n, head->d_v);
    mat_mul(scores, V, attention);

    matrix_t *output = mat_create(n, head->d_model);
    mat_mul(attention, head->W_o, output);

    mat_free(attention);
    mat_free(scores);
    mat_free(V);
    mat_free(K);
    mat_free(Q);

    return output;
}

```

  4. 多头注意力

```c

// Multi-head attention layer: a set of independently parameterised heads.
typedef struct {
    attention_head_t **heads;  // array of num_heads owned head pointers
    int num_heads;             // number of entries in heads
    int d_model;               // width of the model's hidden vectors
    int d_k;                   // per-head width, set to d_model / num_heads
} multihead_attention_t;

// Build a multi-head attention layer with num_heads heads, each working on
// d_model / num_heads query/key/value dimensions.
//
// Returns NULL when num_heads is not positive (which would otherwise divide
// by zero) or when any allocation fails; partially built state is released
// before returning. Release a successful result with mh_attn_free().
multihead_attention_t *mh_attn_create(int d_model, int num_heads) {
    if (num_heads <= 0) {
        return NULL;
    }

    multihead_attention_t *mha = malloc(sizeof *mha);
    if (mha == NULL) {
        return NULL;
    }

    mha->num_heads = num_heads;
    mha->d_model = d_model;
    mha->d_k = d_model / num_heads;

    mha->heads = calloc((size_t)num_heads, sizeof *mha->heads);
    if (mha->heads == NULL) {
        free(mha);
        return NULL;
    }

    for (int i = 0; i < num_heads; i++) {
        mha->heads[i] = attn_head_create(d_model, mha->d_k, mha->d_k);
        if (mha->heads[i] == NULL) {
            // Unwind only the heads that were actually built.
            for (int j = 0; j < i; j++) {
                attn_head_free(mha->heads[j]);
            }
            free(mha->heads);
            free(mha);
            return NULL;
        }
    }

    return mha;
}

// Release a multi-head attention layer, every head it owns, and the head
// array itself. NULL is ignored.
void mh_attn_free(multihead_attention_t *mha) {
    if (mha == NULL) {
        return;
    }

    int h = 0;
    while (h < mha->num_heads) {
        attn_head_free(mha->heads[h]);
        h++;
    }

    free(mha->heads);
    free(mha);
}

// Forward pass of multi-head attention.
//
// x is [seq_len, d_model]. Every head already applies its own output
// projection, so attn_head_forward() returns a full [seq_len, d_model]
// matrix per head. Summing those matrices is algebraically the same as
// concatenating the raw head outputs and multiplying by one stacked output
// projection, and it keeps every row aligned with its own position.
//
// Returns a newly allocated [seq_len, d_model] matrix; the caller releases
// it with mat_free().
matrix_t *mh_attn_forward(multihead_attention_t *mha, matrix_t *x) {
    int seq_len = x->rows;
    int d_model = mha->d_model;
    int num_heads = mha->num_heads;

    // mat_create() zero-fills, so this starts out as the additive identity.
    matrix_t *combined = mat_create(seq_len, d_model);

    for (int h = 0; h < num_heads; h++) {
        matrix_t *out = attn_head_forward(mha->heads[h], x);

        // Element-wise accumulation; mat_add() permits output == input.
        mat_add(combined, out, combined);

        mat_free(out);
    }

    return combined;
}

```

  5. 前馈网络

```c

// Position-wise feed-forward layer: FFN(x) = max(0, x W1 + b1) W2 + b2.
typedef struct {
    matrix_t *W1;  // first projection;  ff_forward() uses it as [d_model, d_ff]
    matrix_t *W2;  // second projection; ff_forward() uses it as [d_ff, d_model]
    matrix_t *b1;  // bias added after W1, d_ff entries
    matrix_t *b2;  // bias added after W2, d_model entries
    int d_model;   // input/output width
    int d_ff;      // hidden width
} feed_forward_t;

// Build a feed-forward layer with random weights and zero biases.
//
// ff_forward() computes x * W1 with x shaped [seq_len, d_model] and then
// hidden * W2 with hidden shaped [seq_len, d_ff], so W1 is stored as
// [d_model, d_ff] and W2 as [d_ff, d_model]. Any other layout makes mat_mul
// index outside its buffers whenever d_ff differs from d_model.
//
// Returns NULL if any allocation fails. Release with ff_free().
feed_forward_t *ff_create(int d_model, int d_ff) {
    feed_forward_t *ff = malloc(sizeof *ff);
    if (ff == NULL) {
        return NULL;
    }

    ff->d_model = d_model;
    ff->d_ff = d_ff;

    ff->W1 = mat_create(d_model, d_ff);
    ff->W2 = mat_create(d_ff, d_model);
    // Biases stay at the zeros that mat_create() produces.
    ff->b1 = mat_create(d_ff, 1);
    ff->b2 = mat_create(d_model, 1);

    if (ff->W1 == NULL || ff->W2 == NULL || ff->b1 == NULL || ff->b2 == NULL) {
        // mat_free() ignores NULL, so the partially built layer unwinds safely.
        mat_free(ff->W1);
        mat_free(ff->W2);
        mat_free(ff->b1);
        mat_free(ff->b2);
        free(ff);
        return NULL;
    }

    float scale = sqrt(2.0f / d_model);
    mat_random(ff->W1, scale);
    mat_random(ff->W2, scale);

    return ff;
}

// Release a feed-forward layer with its weights and biases. NULL is ignored.
void ff_free(feed_forward_t *ff) {
    if (ff == NULL) {
        return;
    }

    mat_free(ff->W1);
    mat_free(ff->W2);
    mat_free(ff->b1);
    mat_free(ff->b2);
    free(ff);
}

// ReLU activation: positive inputs pass through, everything else becomes 0.
float relu(float x) {
    if (x > 0) {
        return x;
    }
    return 0;
}

// Apply relu() to every element of m, in place.
void mat_relu(matrix_t *m) {
    const int total = m->rows * m->cols;
    float *cell = m->data;

    for (int n = 0; n < total; n++) {
        cell[n] = relu(cell[n]);
    }
}

// Add bias->data[c] to column c of every row of m; width is m's row stride.
static void ff_add_bias(matrix_t *m, int rows, int width, matrix_t *bias) {
    for (int r = 0; r < rows; r++) {
        float *row = m->data + r * width;
        for (int c = 0; c < width; c++) {
            row[c] += bias->data[c];
        }
    }
}

// Forward pass of the feed-forward layer:
//   FFN(x) = max(0, x W1 + b1) W2 + b2
// x is [seq_len, d_model]. Returns a newly allocated [seq_len, d_model]
// matrix that the caller releases with mat_free().
matrix_t *ff_forward(feed_forward_t *ff, matrix_t *x) {
    const int n = x->rows;

    // Expand to the hidden width, add the bias, apply the non-linearity.
    matrix_t *hidden = mat_create(n, ff->d_ff);
    mat_mul(x, ff->W1, hidden);
    ff_add_bias(hidden, n, ff->d_ff, ff->b1);
    mat_relu(hidden);

    // Project back down to the model width and add the second bias.
    matrix_t *output = mat_create(n, ff->d_model);
    mat_mul(hidden, ff->W2, output);
    ff_add_bias(output, n, ff->d_model, ff->b2);

    mat_free(hidden);
    return output;
}

```

  6. 层归一化

```c

// Layer normalisation parameters.
typedef struct {
    float *gamma;  // per-feature scale, size entries
    float *beta;   // per-feature shift, size entries
    float eps;     // added to the variance before the square root
    int size;      // number of entries in gamma and beta
} layer_norm_t;

// Build a layer-norm parameter set for vectors of `size` features, starting
// from the identity transform (gamma = 1, beta = 0).
//
// Returns NULL when size is negative or an allocation fails; nothing is
// leaked on those paths. Release with ln_free().
layer_norm_t *ln_create(int size, float eps) {
    if (size < 0) {
        return NULL;
    }

    layer_norm_t *ln = malloc(sizeof *ln);
    if (ln == NULL) {
        return NULL;
    }

    ln->size = size;
    ln->eps = eps;

    // malloc(0) may legally return NULL; ask for one element instead so that
    // NULL always means "out of memory".
    size_t count = size > 0 ? (size_t)size : 1;
    ln->gamma = malloc(count * sizeof *ln->gamma);
    ln->beta = malloc(count * sizeof *ln->beta);
    if (ln->gamma == NULL || ln->beta == NULL) {
        free(ln->gamma);
        free(ln->beta);
        free(ln);
        return NULL;
    }

    for (int i = 0; i < size; i++) {
        ln->gamma[i] = 1.0f;
        ln->beta[i] = 0.0f;
    }

    return ln;
}

// Release layer-norm parameters. NULL is ignored.
void ln_free(layer_norm_t *ln) {
    if (ln == NULL) {
        return;
    }

    free(ln->gamma);
    free(ln->beta);
    free(ln);
}

// Layer normalisation, applied independently to each row of x:
//   LayerNorm(x) = gamma * (x - mean) / sqrt(var + eps) + beta
// The variance is the biased estimate (sum of squares divided by the feature
// count). Returns a newly allocated matrix shaped like x; the caller releases
// it with mat_free().
matrix_t *ln_forward(layer_norm_t *ln, matrix_t *x) {
    const int n_rows = x->rows;
    const int width = x->cols;

    matrix_t *output = mat_create(n_rows, width);

    for (int r = 0; r < n_rows; r++) {
        const float *in = x->data + r * width;
        float *out = output->data + r * width;

        // Row mean.
        float mean = 0;
        for (int c = 0; c < width; c++) {
            mean += in[c];
        }
        mean /= width;

        // Row variance around that mean.
        float var = 0;
        for (int c = 0; c < width; c++) {
            float diff = in[c] - mean;
            var += diff * diff;
        }
        var /= width;

        // eps keeps the divisor away from zero for constant rows.
        float std = sqrtf(var + ln->eps);

        // Standardise, then apply the learned scale and shift.
        for (int c = 0; c < width; c++) {
            float normalized = (in[c] - mean) / std;
            out[c] = ln->gamma[c] * normalized + ln->beta[c];
        }
    }

    return output;
}

```

  7. Transformer块

```c

// One Transformer block: attention and feed-forward sub-layers, each followed
// by its own layer normalisation.
typedef struct {
    multihead_attention_t *mha;  // self-attention sub-layer
    feed_forward_t *ff;          // feed-forward sub-layer
    layer_norm_t *ln1;           // normalisation applied after attention + residual
    layer_norm_t *ln2;           // normalisation applied after feed-forward + residual
    int d_model;                 // width of the model's hidden vectors
} transformer_block_t;

// Build one Transformer block: multi-head attention, a feed-forward layer
// and two layer norms (eps = 1e-5).
//
// Returns NULL if the block itself or any sub-layer cannot be created;
// whatever was built is released first. Release a successful result with
// block_free().
transformer_block_t *block_create(int d_model, int num_heads, int d_ff) {
    transformer_block_t *block = malloc(sizeof *block);
    if (block == NULL) {
        return NULL;
    }

    block->d_model = d_model;
    block->mha = mh_attn_create(d_model, num_heads);
    block->ff = ff_create(d_model, d_ff);
    block->ln1 = ln_create(d_model, 1e-5);
    block->ln2 = ln_create(d_model, 1e-5);

    if (block->mha == NULL || block->ff == NULL ||
        block->ln1 == NULL || block->ln2 == NULL) {
        // Each *_free() ignores NULL, so only the parts that exist are released.
        mh_attn_free(block->mha);
        ff_free(block->ff);
        ln_free(block->ln1);
        ln_free(block->ln2);
        free(block);
        return NULL;
    }

    return block;
}

// Release a Transformer block and every sub-layer it owns. NULL is ignored.
void block_free(transformer_block_t *block) {
    if (block == NULL) {
        return;
    }

    mh_attn_free(block->mha);
    ff_free(block->ff);
    ln_free(block->ln1);
    ln_free(block->ln2);
    free(block);
}

// Transformer块前向传播(带残差连接)

matrix_t *block_forward(transformer_block_t *block, matrix_t *x) {

// 多头自注意力 + 残差连接 + 层归一化

matrix_t *attn_out = mh_attn_forward(block->mha, x);

matrix_t *residual1 = mat_create(x->rows, x->cols);

mat_add(x, attn_out, residual1);

mat_free(attn_out);

matrix_t *norm1 = ln_forward(block->ln1, residual1);

mat_free(residual1);

// 前馈网络 + 残差连接 + 层归一化

matrix_t *ff_out = ff_forward(block->ff, norm1);

matrix_t *residual2 = mat_create(norm1->rows, norm1->cols);

mat_add(norm1, ff_out, residual2);

mat_free(ff_out);

mat_free(norm1);

matrix_t *output = ln_forward(block->ln2, residual2);

mat_free(residual2);

return output;

}

```

  8. 完整Transformer

```c

// Whole model: a stack of Transformer blocks plus the sizes it was built with.
typedef struct {
    transformer_block_t **blocks;  // array of num_blocks owned block pointers
    int num_blocks;                // number of entries in blocks
    int d_model;                   // width of the model's hidden vectors
    int vocab_size;                // number of distinct token ids
} transformer_t;

// Build a model made of num_blocks Transformer blocks.
//
// Returns NULL when num_blocks is negative or when any allocation or block
// construction fails; everything built so far is released first. Release a
// successful result with transformer_free().
transformer_t *transformer_create(int vocab_size, int d_model,
                                  int num_heads, int num_blocks, int d_ff) {
    if (num_blocks < 0) {
        return NULL;
    }

    transformer_t *model = malloc(sizeof *model);
    if (model == NULL) {
        return NULL;
    }

    model->vocab_size = vocab_size;
    model->d_model = d_model;
    model->num_blocks = num_blocks;

    // Request at least one slot so that a NULL result always means failure.
    model->blocks = calloc(num_blocks > 0 ? (size_t)num_blocks : 1,
                           sizeof *model->blocks);
    if (model->blocks == NULL) {
        free(model);
        return NULL;
    }

    for (int i = 0; i < num_blocks; i++) {
        model->blocks[i] = block_create(d_model, num_heads, d_ff);
        if (model->blocks[i] == NULL) {
            // Unwind only the blocks that were actually built.
            for (int j = 0; j < i; j++) {
                block_free(model->blocks[j]);
            }
            free(model->blocks);
            free(model);
            return NULL;
        }
    }

    return model;
}

// Release the model, every block it owns, and the block array. NULL is ignored.
void transformer_free(transformer_t *model) {
    if (model == NULL) {
        return;
    }

    int b = 0;
    while (b < model->num_blocks) {
        block_free(model->blocks[b]);
        b++;
    }

    free(model->blocks);
    free(model);
}

// Pseudo-random but reproducible embedding component in [-0.5, 0.5) for the
// given token id and feature index. A 32-bit integer mix stands in for a
// trained embedding table.
static float embedding_value(int token, int dim) {
    unsigned long h = ((unsigned long)token * 2654435761ul +
                       (unsigned long)dim * 40503ul + 1ul) & 0xFFFFFFFFul;
    h ^= h >> 16;
    h = (h * 0x45D9F3Bul) & 0xFFFFFFFFul;
    h ^= h >> 16;
    h = (h * 0x45D9F3Bul) & 0xFFFFFFFFul;
    h ^= h >> 16;

    // Keep 24 bits: exactly representable in a float mantissa.
    return (float)(h & 0xFFFFFFul) / 16777216.0f - 0.5f;
}

// Embedding layer (untrained stand-in for a learned table).
//
// Row i of the result depends only on tokens[i], so equal tokens always map
// to equal vectors and the model's output actually depends on its input.
// Token ids outside [0, vocab_size) are folded into that range.
//
// Returns a new [seq_len, d_model] matrix, or NULL if it cannot be allocated.
// The caller releases it with mat_free().
matrix_t *word_embedding(int *tokens, int seq_len, int vocab_size, int d_model) {
    matrix_t *embed = mat_create(seq_len, d_model);
    if (embed == NULL) {
        return NULL;
    }

    for (int i = 0; i < seq_len; i++) {
        int token = tokens[i];
        if (vocab_size > 0) {
            token %= vocab_size;
            if (token < 0) {
                token += vocab_size;  // C's % keeps the sign of the dividend
            }
        }

        float *row = embed->data + i * d_model;
        for (int j = 0; j < d_model; j++) {
            row[j] = embedding_value(token, j);
        }
    }

    return embed;
}

// Full forward pass: embed the tokens, add positional encodings, then run the
// result through every block in order.
// Returns a newly allocated [seq_len, d_model] matrix that the caller
// releases with mat_free().
matrix_t *transformer_forward(transformer_t *model, int *tokens, int seq_len) {
    matrix_t *hidden = word_embedding(tokens, seq_len, model->vocab_size, model->d_model);

    // Positional information is summed into the embeddings in place.
    matrix_t *positions = positional_encoding(seq_len, model->d_model);
    mat_add(hidden, positions, hidden);
    mat_free(positions);

    // Each block yields a fresh matrix; the previous one is dropped right away.
    int layer = 0;
    while (layer < model->num_blocks) {
        matrix_t *next = block_forward(model->blocks[layer], hidden);
        mat_free(hidden);
        hidden = next;
        layer++;
    }

    return hidden;
}

// Pick the next token id from the hidden state at the last position.
//
// The model has no learned projection onto the vocabulary, so the first
// min(vocab_size, d_model) features of the last row are treated as logits
// and the index of the largest one is returned. Capping the scan at d_model
// keeps it inside the row when vocab_size is larger than d_model.
//
// Returns -1 when seq_len is not positive, when the forward pass yields no
// output, or when there is no feature to scan.
int transformer_predict(transformer_t *model, int *tokens, int seq_len) {
    if (seq_len <= 0) {
        return -1;
    }

    matrix_t *output = transformer_forward(model, tokens, seq_len);
    if (output == NULL) {
        return -1;
    }

    int limit = model->vocab_size < model->d_model ? model->vocab_size
                                                   : model->d_model;
    int pred = -1;

    if (limit > 0) {
        const float *last = output->data + (seq_len - 1) * model->d_model;
        float best = last[0];
        pred = 0;

        for (int i = 1; i < limit; i++) {
            if (last[i] > best) {
                best = last[i];
                pred = i;
            }
        }
    }

    mat_free(output);
    return pred;
}

```

  9. 简单测试

```c

// Demo driver: builds a small model, prints its configuration, runs one
// prediction on a fixed token sequence and lists what the toy implementation
// leaves out.
int main(void) {
    // Hyper-parameters of the demo model.
    const int vocab_size = 10;
    const int d_model = 32;
    const int num_heads = 4;
    const int num_blocks = 2;
    const int d_ff = 64;

    // Fixed input; the sequence length follows from the array itself.
    int tokens[] = {1, 2, 3, 4, 5};
    const int seq_len = (int)(sizeof tokens / sizeof tokens[0]);

    printf("=== Transformer 模型测试 ===\n\n");

    // Weights are drawn from rand(), so seed it before building the model.
    srand((unsigned)time(NULL));

    transformer_t *model = transformer_create(vocab_size, d_model,
                                              num_heads, num_blocks, d_ff);

    printf("模型创建完成:\n");
    printf(" 词表大小: %d\n", vocab_size);
    printf(" 嵌入维度: %d\n", d_model);
    printf(" 注意力头数: %d\n", num_heads);
    printf(" Transformer块数: %d\n", num_blocks);
    printf(" 前馈网络维度: %d\n", d_ff);
    printf(" 序列长度: %d\n\n", seq_len);

    printf("输入序列: ");
    for (const int *tok = tokens; tok < tokens + seq_len; tok++) {
        printf("%d ", *tok);
    }
    printf("\n");

    int pred = transformer_predict(model, tokens, seq_len);
    printf("预测下一个词: %d\n", pred);

    transformer_free(model);

    printf("\n=== 注意 ===\n");
    printf("这是简化版实现,完整Transformer还需要:\n");
    printf("1. 交叉熵损失 + 反向传播\n");
    printf("2. 掩码注意力(masked attention)\n");
    printf("3. 编码器-解码器结构(用于翻译)\n");
    printf("4. Adam优化器 + 学习率预热\n");

    return 0;
}

```


三、Transformer vs RNN对比

| 特性 | RNN/LSTM | Transformer |
| --- | --- | --- |
| 并行计算 | ❌ 顺序 | ✅ 完全并行 |
| 长期依赖 | ❌ 会遗忘 | ✅ 全局注意力 |
| 训练速度 | 慢 | 快 |
| 参数量 | 少 | 多 |
| 推理速度 | 快 | 慢(需要缓存KV) |
| 内存占用 | 小 | 大(O(n²)注意力) |


四、现代Transformer变体

| 模型 | 创新点 | 代表产品 |
| --- | --- | --- |
| BERT | 双向注意力 | 谷歌搜索 |
| GPT | 单向自回归 | ChatGPT |
| T5 | 统一框架 | 谷歌翻译 |
| RoBERTa | 改进训练 | Facebook |
| ALBERT | 参数共享 | 轻量级 |


五、总结

通过这篇文章,你学会了:

· Transformer的核心原理(自注意力 + 多头 + 残差 + 层归一化)

· 完整的多头自注意力实现

· 前馈网络、位置编码、层归一化

· 完整的Transformer块和模型

· ChatGPT的底层架构理解

Transformer是当前大语言模型的基石。虽然完整实现有更多细节(掩码、反向传播、训练),但核心思想就是这个。

下一篇预告:《大语言模型(LLM)入门:从Transformer到ChatGPT》


评论区分享一下你对Transformer的理解~

相关推荐
weixin_537217064 小时前
聊天技巧资源合集
经验分享
学习中的码虫4 小时前
(C++)从this构造shared_ptr导致多控制块的处理
c++
进击的荆棘4 小时前
优选算法——哈希表
c++·算法·leetcode·哈希算法·散列表
RH2312114 小时前
2026.5.17数据结构 八大排序
数据结构·算法·排序算法
毋语天4 小时前
NumPy 完全入门指南:核心数据结构与高效数值计算
数据结构·numpy
蜡笔小马4 小时前
12.C++设计模式-模板方法模式
c++·设计模式·模板方法模式
江屿风4 小时前
【C++笔记】内存管理流食般投喂
开发语言·c++·笔记
雪度娃娃4 小时前
行为型设计模式——备忘录模式
服务器·c++·设计模式·备忘录模式
khalil10204 小时前
代码随想录算法训练营Day-55 图论06 | 108.冗余连接、109.冗余连接II
c++·算法·leetcode·图论·并查集