前言
你有没有遇到过这种情况:一个请求经过3个微服务,结果报错了,但不知道是哪个服务出的问题,也不知道调用链路是什么样的。
分布式链路追踪通过为每个请求生成唯一的TraceId,记录经过的每个Span,让我们能清晰地看到请求的完整路径。
今天我们用C语言从零实现一个分布式链路追踪系统:
· TraceId生成与传递
· Span的创建和管理
· 链路数据收集和存储
· 可视化查询
· 采样策略
一、链路追踪核心原理
- 核心概念
```
┌──────────────────────────────────────────────────────────────┐
│ Trace │
│ ┌────────────┐ ┌────────────┐ ┌────────────┐ │
│ │ Span A │────│ Span B │────│ Span C │ │
│ │ (服务A) │ │ (服务B) │ │ (服务C) │ │
│ └────────────┘ └────────────┘ └────────────┘ │
│ │ │ │ │
│ ┌────┴────┐ ┌────┴────┐ ┌────┴────┐ │
│ │Span A-1 │ │Span B-1 │ │Span C-1 │ │
│ └─────────┘ └─────────┘ └─────────┘ │
└──────────────────────────────────────────────────────────────┘
```
| 概念 | 说明 |
| --- | --- |
| Trace | 一次完整请求的调用链 |
| TraceId | 全局唯一标识,贯穿整个调用链 |
| Span | 一次调用(一个服务/一个操作) |
| SpanId | 当前Span的唯一标识 |
| ParentSpanId | 父Span的标识 |
| Annotation | 时间戳标注(开始、结束等) |
- 数据传递方式
```
HTTP Header:
X-B3-TraceId: 52a7e3b2d1c4e8f9
X-B3-SpanId: a1b2c3d4e5f6
X-B3-ParentSpanId: 9876543210
RPC Context:
trace_id: 52a7e3b2d1c4e8f9
span_id: a1b2c3d4e5f6
parent_span_id: 9876543210
```
二、完整代码实现
- 基础数据结构
```c
#include <errno.h>
#include <signal.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

#include <arpa/inet.h>
#include <fcntl.h>
#include <netinet/in.h>
#include <pthread.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <sys/time.h>
#include <unistd.h>
// Buffer capacities in bytes. The two ID capacities include room for the
// terminating NUL: a trace id is 32 hex characters and a span id is 16 hex
// characters, so the buffers must hold 33 and 17 bytes respectively.
#define MAX_TRACE_ID 33
#define MAX_SPAN_ID 17
#define MAX_SERVICE_NAME 64
#define MAX_SPAN_NAME 128
// Maximum number of key/value tags stored on one span.
#define MAX_TAGS 16
// Timestamp in microseconds.
typedef long long timestamp_t;
// Span结构
typedef struct span {
char trace_idMAX_TRACE_ID; // 全局TraceId
char span_idMAX_SPAN_ID; // SpanId
char parent_span_idMAX_SPAN_ID; // 父SpanId
char service_nameMAX_SERVICE_NAME; // 服务名
char span_nameMAX_SPAN_NAME; // Span名称
timestamp_t start_time; // 开始时间
timestamp_t end_time; // 结束时间
struct {
char key64;
char value256;
} tagsMAX_TAGS;
int tag_count;
struct span *next;
} span_t;
// Trace结构
typedef struct trace {
char trace_idMAX_TRACE_ID;
span_t *spans;
int span_count;
struct trace *next;
} trace_t;
// Trace collector: the stored traces plus the settings used by the
// collector and query services.
typedef struct tracer {
trace_t *traces; // all stored traces (linked list)
int max_traces; // maximum number of traces to keep
int sampling_rate; // sampling rate (0-100)
pthread_mutex_t mutex; // locked around accesses to the traces list
int collector_port; // collector port
int query_port; // query port
int running; // service loops keep accepting while this is non-zero
} tracer_t;
```
- ID生成器
```c
// Generates a trace id: 16 random bytes rendered as lowercase hex.
//
// buf  - destination buffer; left untouched when NULL.
// size - capacity of buf in bytes, including the terminating NUL.
//
// At most 32 hex characters are written, fewer when size is smaller than 33,
// and the result is always NUL-terminated when size > 0. Nothing is ever
// written at or beyond buf[size].
// Randomness comes from rand(), so the caller is expected to seed it.
void generate_trace_id(char *buf, int size) {
    static const char hex_digits[] = "0123456789abcdef";

    if (buf == NULL || size <= 0) {
        return;
    }

    int max_chars = size - 1; // reserve one byte for the terminator
    if (max_chars > 32) {
        max_chars = 32;
    }

    int pos = 0;
    while (pos < max_chars) {
        unsigned int byte = (unsigned int)rand() & 0xFFu;
        buf[pos++] = hex_digits[byte >> 4];
        if (pos < max_chars) {
            buf[pos++] = hex_digits[byte & 0x0Fu];
        }
    }
    buf[pos] = '\0';
}
// Generates a span id: 8 random bytes formatted as 16 lowercase hex digits.
// The text is written with snprintf, so it is cut short (and still
// NUL-terminated) when size is smaller than 17.
void generate_span_id(char *buf, int size) {
    unsigned long long value = 0;
    int remaining = 8;

    // Build the 64-bit value one random byte at a time, most significant first.
    while (remaining-- > 0) {
        unsigned long long low_byte = (unsigned long long)(rand() & 0xFF);
        value = (value << 8) | low_byte;
    }

    snprintf(buf, size, "%016llx", value);
}
// 获取当前时间戳(微秒)
timestamp_t get_timestamp_us() {
struct timeval tv;
gettimeofday(&tv, NULL);
return (timestamp_t)tv.tv_sec * 1000000 + tv.tv_usec;
}
// 从HTTP头提取Trace信息
void extract_trace_from_headers(const char *headers,
char *trace_id, char *span_id, char *parent_span_id) {
// 简化实现,实际需要解析headers
// X-B3-TraceId: xxx
// X-B3-SpanId: xxx
// X-B3-ParentSpanId: xxx
const char *patterns\[\] = {"X-B3-TraceId:", "X-B3-SpanId:", "X-B3-ParentSpanId:"};
char *dests\[\] = {trace_id, span_id, parent_span_id};
for (int i = 0; i < 3; i++) {
char *start = strstr(headers, patternsi);
if (start) {
start += strlen(patternsi);
while (*start == ' ') start++;
char *end = strchr(start, '\r');
if (!end) end = strchr(start, '\n');
if (end) {
int len = end - start;
if (len < 64) {
strncpy(destsi, start, len);
destsilen = '\0';
}
}
}
}
}
```
- 核心追踪器
```c
// Creates a tracer with an empty trace list.
//
// collector_port - port stored for the collector service.
// query_port     - port stored for the query service.
//
// Defaults set here: max_traces = 10000, sampling_rate = 100, running = 1.
// Returns the new tracer, or NULL when allocation or mutex initialisation
// fails. The caller owns the returned object.
tracer_t *tracer_create(int collector_port, int query_port) {
    tracer_t *t = malloc(sizeof *t);
    if (t == NULL) {
        return NULL;
    }

    t->traces = NULL;
    t->max_traces = 10000;
    t->sampling_rate = 100;
    t->collector_port = collector_port;
    t->query_port = query_port;
    t->running = 1;

    if (pthread_mutex_init(&t->mutex, NULL) != 0) {
        free(t);
        return NULL;
    }

    printf("链路追踪启动: collector=%d, query=%d\n", collector_port, query_port);
    return t;
}
// Allocates a span, gives it a fresh span id and stamps its start time.
//
// trace_id       - id of the owning trace.
// parent_span_id - parent span id; NULL or "" leaves the parent field empty.
// service_name   - name of the service producing the span.
// span_name      - name of the operation.
//
// Every string is copied with a bound, so an over-long argument is truncated
// to the field's capacity instead of overflowing it. A NULL string argument
// is stored as "".
// Returns the new span (caller owns it until it is linked into a trace), or
// NULL when allocation fails.
span_t *create_span(const char *trace_id, const char *parent_span_id,
                    const char *service_name, const char *span_name) {
    span_t *s = calloc(1, sizeof *s); // zeroed: empty strings, no tags, next == NULL
    if (s == NULL) {
        return NULL;
    }

    snprintf(s->trace_id, sizeof s->trace_id, "%s", trace_id ? trace_id : "");
    generate_span_id(s->span_id, (int)sizeof s->span_id);
    if (parent_span_id != NULL && parent_span_id[0] != '\0') {
        snprintf(s->parent_span_id, sizeof s->parent_span_id, "%s", parent_span_id);
    }
    snprintf(s->service_name, sizeof s->service_name, "%s", service_name ? service_name : "");
    snprintf(s->span_name, sizeof s->span_name, "%s", span_name ? span_name : "");

    s->start_time = get_timestamp_us();
    return s;
}
// Marks a span as finished by recording the current time as its end time.
void finish_span(span_t *s) {
    timestamp_t now = get_timestamp_us();
    s->end_time = now;
}
// Appends a key/value tag to a span.
//
// The call is a no-op when any argument is NULL or the span already holds
// MAX_TAGS tags. Key and value are copied with a bound, so over-long text is
// truncated to the slot's capacity instead of overflowing it.
void span_add_tag(span_t *s, const char *key, const char *value) {
    if (s == NULL || key == NULL || value == NULL) {
        return;
    }
    if (s->tag_count < 0 || s->tag_count >= MAX_TAGS) {
        return;
    }

    int slot = s->tag_count;
    snprintf(s->tags[slot].key, sizeof s->tags[slot].key, "%s", key);
    snprintf(s->tags[slot].value, sizeof s->tags[slot].value, "%s", value);
    s->tag_count++;
}
// Stores a trace in the tracer, evicting the oldest stored trace when the
// list already holds max_traces entries.
//
// New traces are inserted at the head of the list, so the oldest trace is the
// tail; that is the node removed and freed (together with its spans) when the
// limit is reached.
// The tracer takes ownership of `trace`: the caller must not free it, and must
// not save the same trace twice. The list is updated under t->mutex.
void tracer_save(tracer_t *t, trace_t *trace) {
    if (t == NULL || trace == NULL) {
        return;
    }

    pthread_mutex_lock(&t->mutex);

    // One pass: count the stored traces and remember the link that points at
    // the last node, so the tail can be detached without a second walk.
    int count = 0;
    trace_t **tail_link = &t->traces;
    for (trace_t **link = &t->traces; *link != NULL; link = &(*link)->next) {
        count++;
        tail_link = link;
    }

    if (count > 0 && count >= t->max_traces) {
        trace_t *oldest = *tail_link;
        *tail_link = NULL;

        span_t *span = oldest->spans;
        while (span != NULL) {
            span_t *next = span->next;
            free(span);
            span = next;
        }
        free(oldest);
    }

    // Insert at the head of the list.
    trace->next = t->traces;
    t->traces = trace;

    pthread_mutex_unlock(&t->mutex);
}
// Starts a new trace with a root span, subject to the tracer's sampling rate.
//
// t            - tracer whose sampling_rate (0-100) decides whether to sample.
// service_name - service name for the root span.
// span_name    - operation name for the root span.
//
// Returns the new trace, whose `spans` list holds just the root span, or NULL
// when the request is not sampled or an allocation fails. A sampling_rate of
// 0 samples nothing and 100 samples everything.
// The caller owns the trace until it is handed to tracer_save().
trace_t *tracer_start_trace(tracer_t *t, const char *service_name, const char *span_name) {
    if (t == NULL) {
        return NULL;
    }
    if ((rand() % 100) >= t->sampling_rate) {
        return NULL; // not sampled
    }

    trace_t *trace = calloc(1, sizeof *trace);
    if (trace == NULL) {
        return NULL;
    }
    generate_trace_id(trace->trace_id, (int)sizeof trace->trace_id);

    // Create the root span (no parent).
    span_t *root = create_span(trace->trace_id, NULL, service_name, span_name);
    if (root == NULL) {
        free(trace);
        return NULL;
    }

    trace->spans = root;
    trace->span_count = 1;
    return trace;
}
// Creates a child span inside an existing trace and links it at the head of
// the trace's span list.
//
// t              - not used by this function.
// trace          - trace to extend; NULL (e.g. an unsampled request) yields NULL.
// service_name   - service name for the new span.
// span_name      - operation name for the new span.
// parent_span_id - id of the parent span.
//
// Returns the new span, which is owned by `trace`, or NULL when `trace` is
// NULL or the span could not be created. No locking is performed here.
span_t *tracer_start_span(tracer_t *t, trace_t *trace,
                          const char *service_name, const char *span_name,
                          const char *parent_span_id) {
    (void)t;

    if (trace == NULL) {
        return NULL;
    }

    span_t *s = create_span(trace->trace_id, parent_span_id, service_name, span_name);
    if (s == NULL) {
        return NULL;
    }

    s->next = trace->spans;
    trace->spans = s;
    trace->span_count++;
    return s;
}
```
- 数据收集服务
```c
// 解析并保存收集到的Span数据
void handle_collector_request(tracer_t *t, int client_fd) {
char buffer65536;
int n = recv(client_fd, buffer, sizeof(buffer) - 1, 0);
if (n <= 0) {
close(client_fd);
return;
}
buffern = '\0';
// 解析Trace数据(简化版,实际用JSON/Protobuf)
// 格式: TRACE|trace_id|service_name|span_name|parent_id|duration|tag1=val1|tag2=val2...
trace_t *trace = malloc(sizeof(trace_t));
memset(trace, 0, sizeof(trace_t));
trace->spans = NULL;
trace->span_count = 0;
char *line = strtok(buffer, "\n");
while (line) {
// 解析每一行Span数据
char trace_id64, service64, span_name128, parent_id64;
long long duration;
if (sscanf(line, "SPAN|%\^\||%\^\||%\^\||%\^\||%lld",
trace_id, service, span_name, parent_id, &duration) == 5) {
if (trace->span_count == 0) {
strcpy(trace->trace_id, trace_id);
}
span_t *s = create_span(trace_id, parent_id, service, span_name);
s->end_time = s->start_time + duration;
// 解析tags
char *tags_start = strchr(line, '|');
for (int i = 0; i < 5 && tags_start; i++) {
tags_start = strchr(tags_start + 1, '|');
}
if (tags_start) {
tags_start++;
char *tag = strtok(tags_start, "|");
while (tag) {
char *eq = strchr(tag, '=');
if (eq) {
*eq = '\0';
span_add_tag(s, tag, eq + 1);
}
tag = strtok(NULL, "|");
}
}
s->next = trace->spans;
trace->spans = s;
trace->span_count++;
}
line = strtok(NULL, "\n");
}
if (trace->span_count > 0) {
tracer_save(t, trace);
send(client_fd, "OK\n", 3, 0);
} else {
free(trace);
send(client_fd, "ERROR\n", 6, 0);
}
close(client_fd);
}
// Collector service thread: listens on t->collector_port and handles one
// client at a time with handle_collector_request().
//
// arg is the tracer_t* passed to pthread_create(). Always returns NULL.
// The loop waits for connections with a short select() timeout so that it
// re-checks t->running regularly and can exit once the flag is cleared,
// instead of blocking in accept() forever.
// NOTE(review): t->running is a plain int written by another thread; consider
// an atomic flag.
void *collector_thread(void *arg) {
    tracer_t *t = (tracer_t *)arg;

    int server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (server_fd < 0) {
        perror("collector socket");
        return NULL;
    }

    int opt = 1;
    if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
        perror("collector setsockopt"); // not fatal: only affects quick restarts
    }

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons((unsigned short)t->collector_port);

    if (bind(server_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(server_fd, 128) < 0) {
        perror("collector bind/listen");
        close(server_fd);
        return NULL;
    }
    printf("收集服务启动: %d\n", t->collector_port);

    while (t->running) {
        fd_set readable;
        FD_ZERO(&readable);
        FD_SET(server_fd, &readable);
        struct timeval timeout = { .tv_sec = 0, .tv_usec = 200000 };

        int ready = select(server_fd + 1, &readable, NULL, NULL, &timeout);
        if (ready < 0) {
            if (errno == EINTR) {
                continue;
            }
            perror("collector select");
            break;
        }
        if (ready == 0) {
            continue; // timeout: loop around to re-check the running flag
        }

        int client_fd = accept(server_fd, NULL, NULL);
        if (client_fd < 0) {
            continue;
        }
        handle_collector_request(t, client_fd);
    }

    close(server_fd);
    return NULL;
}
```
- 查询服务
```c
// Appends formatted text to buf at *used without ever writing past cap.
// Returns 1 on success; returns 0 and leaves the buffer content unchanged
// when the text does not fit.
static int query_append(char *buf, size_t cap, size_t *used, const char *fmt, ...) {
    if (*used >= cap) {
        return 0;
    }

    va_list args;
    va_start(args, fmt);
    int written = vsnprintf(buf + *used, cap - *used, fmt, args);
    va_end(args);

    if (written < 0 || (size_t)written >= cap - *used) {
        buf[*used] = '\0'; // drop the partial write
        return 0;
    }
    *used += (size_t)written;
    return 1;
}

// Answers one query request and closes client_fd.
//
// Supported requests:
//   QUERY|<trace_id>  -> "TRACE|<id>\n" followed by one
//                        "SPAN|span_id|parent_id|service|duration[|k=v...]\n"
//                        line per span, or "NOT_FOUND\n".
//   LIST              -> up to 100 stored trace ids, one per line.
// Any other request is closed without a reply.
// The trace list is read under t->mutex; the reply is sent after unlocking.
// The response is built in a fixed buffer: spans or ids that no longer fit
// are omitted rather than overflowing it.
void handle_query_request(tracer_t *t, int client_fd, const char *request) {
    char response[65536];
    size_t used = 0;
    response[0] = '\0';

    if (strncmp(request, "QUERY|", 6) == 0) {
        const char *id_start = request + 6;
        size_t id_len = strcspn(id_start, "\r\n"); // id ends at the line break
        char trace_id[MAX_TRACE_ID];
        int found = 0;

        // An id too long for the buffer cannot match any stored trace.
        if (id_len < sizeof trace_id) {
            memcpy(trace_id, id_start, id_len);
            trace_id[id_len] = '\0';

            pthread_mutex_lock(&t->mutex);
            for (trace_t *trace = t->traces; trace != NULL; trace = trace->next) {
                if (strcmp(trace->trace_id, trace_id) != 0) {
                    continue;
                }
                found = 1;

                // Build the response.
                query_append(response, sizeof response, &used, "TRACE|%s\n", trace_id);
                for (span_t *span = trace->spans; span != NULL; span = span->next) {
                    size_t line_start = used;
                    long long duration = span->end_time - span->start_time;

                    int ok = query_append(response, sizeof response, &used,
                                          "SPAN|%s|%s|%s|%lld",
                                          span->span_id, span->parent_span_id,
                                          span->service_name, duration);
                    for (int i = 0; ok && i < span->tag_count; i++) {
                        ok = query_append(response, sizeof response, &used, "|%s=%s",
                                          span->tags[i].key, span->tags[i].value);
                    }
                    ok = ok && query_append(response, sizeof response, &used, "\n");

                    if (!ok) {
                        // Out of space: drop the incomplete line and stop.
                        used = line_start;
                        response[used] = '\0';
                        break;
                    }
                }
                break;
            }
            pthread_mutex_unlock(&t->mutex);
        }

        if (found) {
            send(client_fd, response, used, 0);
        } else {
            send(client_fd, "NOT_FOUND\n", 10, 0);
        }
    } else if (strncmp(request, "LIST", 4) == 0) {
        // List the most recent traces.
        pthread_mutex_lock(&t->mutex);
        int listed = 0;
        for (trace_t *trace = t->traces; trace != NULL && listed < 100; trace = trace->next) {
            if (!query_append(response, sizeof response, &used, "%s\n", trace->trace_id)) {
                break;
            }
            listed++;
        }
        pthread_mutex_unlock(&t->mutex);
        send(client_fd, response, used, 0);
    }

    close(client_fd);
}
// Query service thread: listens on t->query_port, reads one request per
// connection and passes it to handle_query_request(), which closes the client.
//
// arg is the tracer_t* passed to pthread_create(). Always returns NULL.
// The loop waits for connections with a short select() timeout so that it
// re-checks t->running regularly and can exit once the flag is cleared,
// instead of blocking in accept() forever.
// NOTE(review): t->running is a plain int written by another thread; consider
// an atomic flag.
void *query_thread(void *arg) {
    tracer_t *t = (tracer_t *)arg;

    int server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (server_fd < 0) {
        perror("query socket");
        return NULL;
    }

    int opt = 1;
    if (setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt)) < 0) {
        perror("query setsockopt"); // not fatal: only affects quick restarts
    }

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = htonl(INADDR_ANY);
    addr.sin_port = htons((unsigned short)t->query_port);

    if (bind(server_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(server_fd, 128) < 0) {
        perror("query bind/listen");
        close(server_fd);
        return NULL;
    }
    printf("查询服务启动: %d\n", t->query_port);

    while (t->running) {
        fd_set readable;
        FD_ZERO(&readable);
        FD_SET(server_fd, &readable);
        struct timeval timeout = { .tv_sec = 0, .tv_usec = 200000 };

        int ready = select(server_fd + 1, &readable, NULL, NULL, &timeout);
        if (ready < 0) {
            if (errno == EINTR) {
                continue;
            }
            perror("query select");
            break;
        }
        if (ready == 0) {
            continue; // timeout: loop around to re-check the running flag
        }

        int client_fd = accept(server_fd, NULL, NULL);
        if (client_fd < 0) {
            continue;
        }

        char buffer[4096];
        ssize_t n = recv(client_fd, buffer, sizeof(buffer) - 1, 0);
        if (n > 0) {
            buffer[n] = '\0';
            handle_query_request(t, client_fd, buffer);
        } else {
            close(client_fd);
        }
    }

    close(server_fd);
    return NULL;
}
```
- HTTP中间件(集成示例)
```c
// Simulated HTTP request handler showing how tracing is wired into a service.
//
// t         - tracer used to start and store traces.
// trace     - trace to extend when the request headers already carry a trace
//             id; replaced by a newly started trace when they do not.
// headers   - raw request headers, searched for X-B3-* trace context.
// client_fd - socket the HTTP response is written to (not closed here).
//
// Any non-NULL trace is handed to tracer_save(), which takes ownership of it,
// so the caller must not save or free that trace again.
// A "200 OK" response is always sent; it carries an X-B3-TraceId header
// whenever a trace id is known, including for requests that were not sampled
// locally but arrived with an id.
void handle_http_request(tracer_t *t, trace_t *trace, const char *headers, int client_fd) {
    // Extract the incoming trace context.
    char trace_id[MAX_TRACE_ID] = "";
    char span_id[MAX_SPAN_ID] = "";
    char parent_span_id[MAX_SPAN_ID] = "";
    extract_trace_from_headers(headers, trace_id, span_id, parent_span_id);

    // No incoming trace id: this is a new request, so start a new trace.
    span_t *root = NULL;
    if (trace_id[0] == '\0') {
        trace = tracer_start_trace(t, "gateway", "http.request");
        if (trace != NULL) {
            root = trace->spans;
            snprintf(trace_id, sizeof trace_id, "%s", trace->trace_id);
            snprintf(span_id, sizeof span_id, "%s", root->span_id);
        }
    }

    // Create the span for this service, parented to the incoming/root span.
    span_t *span = tracer_start_span(t, trace, "backend-service", "process.request", span_id);
    if (span != NULL) {
        span_add_tag(span, "http.method", "GET");
        span_add_tag(span, "http.path", "/api/users");
        // ... business logic goes here
        finish_span(span);
    }

    // A root span created above ends when the request handling ends.
    if (root != NULL) {
        finish_span(root);
    }

    // Store the trace; after this call it belongs to the tracer.
    if (trace != NULL) {
        tracer_save(t, trace);
    }

    // Respond, echoing the trace id when there is one.
    char response[256];
    if (trace_id[0] != '\0') {
        snprintf(response, sizeof(response),
                 "HTTP/1.1 200 OK\r\n"
                 "X-B3-TraceId: %s\r\n"
                 "\r\n"
                 "OK\n", trace_id);
    } else {
        snprintf(response, sizeof(response),
                 "HTTP/1.1 200 OK\r\n"
                 "\r\n"
                 "OK\n");
    }
    send(client_fd, response, strlen(response), 0);
}
```
- 采样策略
```c
// Available sampling strategies.
typedef enum {
    SAMPLING_RATE = 0,  // fixed-percentage sampling
    SAMPLING_QUOTA = 1, // quota-based sampling
    SAMPLING_LAST = 2,  // tail sampling
} sampling_strategy_t;
// Sampling configuration: the selected strategy plus the one parameter that
// strategy reads from the union.
typedef struct {
sampling_strategy_t strategy;
union {
int rate; // 0-100
int quota; // number of samples per minute
double latency; // tail-sampling latency threshold (ms)
};
} sampling_config_t;
// Decides whether a request should be sampled under the given configuration.
//
// config     - strategy and its parameter.
// start_time - not used by this function.
//
// Returns 1 to sample, 0 to skip:
//   SAMPLING_RATE  - samples when rand() % 100 is below config->rate.
//   SAMPLING_QUOTA - samples the first config->quota requests of each
//                    60-second window.
//   SAMPLING_LAST  - always samples; the tail-sampling decision needs
//                    post-processing that is not done here.
// The quota counters are function-level statics, so they are shared by every
// caller and are not protected by any lock.
int should_sample(sampling_config_t *config, timestamp_t start_time) {
    static int quota_used = 0;
    static time_t last_reset = 0;

    (void)start_time;

    switch (config->strategy) {
    case SAMPLING_RATE:
        return (rand() % 100) < config->rate;
    case SAMPLING_QUOTA: {
        // Braces give `now` its own scope: before C23 a declaration may not
        // directly follow a case label.
        time_t now = time(NULL);
        if (now - last_reset >= 60) {
            quota_used = 0;
            last_reset = now;
        }
        if (quota_used < config->quota) {
            quota_used++;
            return 1;
        }
        return 0;
    }
    case SAMPLING_LAST:
        // Record every request first; sampling is decided afterwards.
        return 1;
    default:
        return 1;
    }
}
```
三、测试代码
```c
int main() {
printf("=== 分布式链路追踪测试 ===\n\n");
srand(time(NULL));
// 创建追踪器
tracer_t *t = tracer_create(9090, 9091);
// 启动收集和查询服务
pthread_t collect_thread, query_thread;
pthread_create(&collect_thread, NULL, collector_thread, t);
pthread_create(&query_thread, NULL, query_thread, t);
// 模拟三个服务调用
printf("模拟分布式调用链...\n");
// 服务A: 网关收到请求
trace_t *trace = tracer_start_trace(t, "gateway", "http.request");
if (trace) {
span_t *gateway_span = trace->spans;
span_add_tag(gateway_span, "http.method", "GET");
span_add_tag(gateway_span, "http.path", "/api/order");
span_add_tag(gateway_span, "status", "200");
// 模拟调用服务B (延时)
usleep(50000);
// 服务B: 订单服务
span_t *order_span = tracer_start_span(t, trace, "order-service", "get.order",
gateway_span->span_id);
if (order_span) {
span_add_tag(order_span, "order.id", "12345");
span_add_tag(order_span, "db.operation", "select");
// 模拟调用服务C (延时)
usleep(30000);
// 服务C: 用户服务
span_t *user_span = tracer_start_span(t, trace, "user-service", "get.user",
order_span->span_id);
if (user_span) {
span_add_tag(user_span, "user.id", "67890");
span_add_tag(user_span, "cache.hit", "true");
usleep(20000);
finish_span(user_span);
}
finish_span(order_span);
}
finish_span(gateway_span);
// 保存完整的Trace
tracer_save(t, trace);
printf("Trace完成: %s\n", trace->trace_id);
}
// 等待数据写入
sleep(1);
printf("\n查询最近Trace:\n");
// 通过模拟查询(实际可以通过客户端发送查询请求)
sleep(2);
t->running = 0;
pthread_join(collect_thread, NULL);
pthread_join(query_thread, NULL);
free(t);
return 0;
}
```
四、编译和运行
```bash
gcc -o tracing tracing.c -lpthread
./tracing
```
输出示例:
```
=== 分布式链路追踪测试 ===
链路追踪启动: collector=9090, query=9091
收集服务启动: 9090
查询服务启动: 9091
模拟分布式调用链...
Trace完成: 52a7e3b2d1c4e8f9a1b2c3d4e5f6a7b8
查询最近Trace:
52a7e3b2d1c4e8f9a1b2c3d4e5f6a7b8
```
五、可视化示例
```html
<!DOCTYPE html>
<html>
<head><title>链路追踪</title></head>
<body>
<h1>Trace: <span id="trace_id"></span></h1>
<div id="timeline"></div>
<script>
/**
 * Fetches a trace from /api/trace/<traceId> and renders its spans as
 * horizontal bars inside #timeline; the id is shown in #trace_id.
 *
 * Each bar's offset and width are the span's start and duration as a
 * percentage of the whole trace. Load failures and empty traces are reported
 * inside #timeline instead of being thrown.
 *
 * @param {string} traceId id of the trace to display
 * @returns {Promise<void>} settles once rendering (or error display) is done
 */
function renderTrace(traceId) {
    const timeline = document.getElementById('timeline');
    document.getElementById('trace_id').textContent = traceId;

    // Encode the id so characters such as '/' or '?' cannot alter the URL.
    return fetch('/api/trace/' + encodeURIComponent(traceId))
        .then(r => {
            if (!r.ok) {
                throw new Error('HTTP ' + r.status);
            }
            return r.json();
        })
        .then(data => {
            const spans = Array.isArray(data.spans) ? data.spans : [];
            timeline.textContent = '';
            if (spans.length === 0) {
                // Math.min() of nothing is Infinity, so stop before computing.
                timeline.textContent = '没有Span数据';
                return;
            }

            // Draw the timeline.
            const minTime = Math.min(...spans.map(s => s.start));
            const maxTime = Math.max(...spans.map(s => s.end));
            // A zero-length trace would otherwise divide by zero below.
            const duration = Math.max(maxTime - minTime, 1);

            spans.forEach(span => {
                const left = (span.start - minTime) / duration * 100;
                const width = (span.end - span.start) / duration * 100;

                // Render the span as a bar.
                const bar = document.createElement('div');
                bar.style.marginLeft = left + '%';
                bar.style.width = Math.max(width, 0.5) + '%'; // keep very short spans visible
                bar.style.height = '18px';
                bar.style.marginBottom = '4px';
                bar.style.background = '#4a90d9';
                bar.style.whiteSpace = 'nowrap';
                // NOTE(review): assumes each span has a `name` field — confirm against the API.
                bar.textContent = span.name || '';
                timeline.appendChild(bar);
            });
        })
        .catch(err => {
            timeline.textContent = '加载Trace失败: ' + err.message;
        });
}
</script>
</body>
</html>
```
六、总结
通过这篇文章,你学会了:
· 链路追踪的核心概念(Trace、Span、TraceId)
· 分布式上下文传递(HTTP头/RPC上下文)
· Span的创建、管理和标签
· 数据收集和查询服务
· 采样策略(比例、配额、尾采样)
分布式链路追踪是可观测性三大支柱之一。掌握它,你就拥有了排查微服务问题的"火眼金睛"。
下一篇预告:《从零实现一个服务网格:Sidecar与流量管理》
评论区分享一下你遇到过的最难排查的分布式问题~