从零实现一个分布式链路追踪:TraceId与Span

前言

你有没有遇到过这种情况:一个请求经过3个微服务,结果报错了,但不知道是哪个服务出的问题,也不知道调用链路是什么样的。

分布式链路追踪通过为每个请求生成唯一的TraceId,记录经过的每个Span,让我们能清晰地看到请求的完整路径。

今天我们用C语言从零实现一个分布式链路追踪系统:

· TraceId生成与传递

· Span的创建和管理

· 链路数据收集和存储

· 可视化查询

· 采样策略


一、链路追踪核心原理

  1. 核心概念

```
┌──────────────────────────────────────────────────────────────┐
│                            Trace                             │
│      ┌────────────┐    ┌────────────┐    ┌────────────┐      │
│      │   Span A   │────│   Span B   │────│   Span C   │      │
│      │  (服务A)   │    │  (服务B)   │    │  (服务C)   │      │
│      └────────────┘    └────────────┘    └────────────┘      │
│            │                 │                 │             │
│       ┌────┴────┐       ┌────┴────┐       ┌────┴────┐        │
│       │Span A-1 │       │Span B-1 │       │Span C-1 │        │
│       └─────────┘       └─────────┘       └─────────┘        │
└──────────────────────────────────────────────────────────────┘
```

| 概念 | 说明 |
| --- | --- |
| Trace | 一次完整请求的调用链 |
| TraceId | 全局唯一标识,贯穿整个调用链 |
| Span | 一次调用(一个服务/一个操作) |
| SpanId | 当前Span的唯一标识 |
| ParentSpanId | 父Span的标识 |
| Annotation | 时间戳标注(开始、结束等) |

  2. 数据传递方式

```
HTTP Header:
X-B3-TraceId: 52a7e3b2d1c4e8f9a1b2c3d4e5f6a7b8
X-B3-SpanId: a1b2c3d4e5f6a7b8
X-B3-ParentSpanId: 9876543210abcdef

RPC Context:
trace_id: 52a7e3b2d1c4e8f9a1b2c3d4e5f6a7b8
span_id: a1b2c3d4e5f6a7b8
parent_span_id: 9876543210abcdef
```


二、完整代码实现

  1. 基础数据结构

```c

#include <stdio.h>
#include <stdlib.h>
#include <stdarg.h>
#include <string.h>
#include <unistd.h>
#include <pthread.h>
#include <time.h>
#include <sys/time.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <errno.h>
#include <signal.h>
#include <fcntl.h>

/*
 * Buffer capacities. Every ID capacity includes room for the terminating NUL:
 * a trace ID is 16 random bytes rendered as 32 hex characters, a span ID is
 * 8 random bytes rendered as 16 hex characters.
 */
#define MAX_TRACE_ID 33
#define MAX_SPAN_ID 17
#define MAX_SERVICE_NAME 64
#define MAX_SPAN_NAME 128
#define MAX_TAGS 16

/* Timestamp in microseconds. */
typedef long long timestamp_t;

// Span结构

typedef struct span {

char trace_idMAX_TRACE_ID; // 全局TraceId

char span_idMAX_SPAN_ID; // SpanId

char parent_span_idMAX_SPAN_ID; // 父SpanId

char service_nameMAX_SERVICE_NAME; // 服务名

char span_nameMAX_SPAN_NAME; // Span名称

timestamp_t start_time; // 开始时间

timestamp_t end_time; // 结束时间

struct {

char key64;

char value256;

} tagsMAX_TAGS;

int tag_count;

struct span *next;

} span_t;

// Trace结构

typedef struct trace {

char trace_idMAX_TRACE_ID;

span_t *spans;

int span_count;

struct trace *next;

} trace_t;

/* Trace collector: the stored traces plus the settings of both network services. */
typedef struct tracer {
    trace_t        *traces;          /* all stored traces */
    int             max_traces;      /* maximum number of stored traces */
    int             sampling_rate;   /* sampling rate (0-100) */
    pthread_mutex_t mutex;
    int             collector_port;  /* collector port */
    int             query_port;      /* query port */
    int             running;
} tracer_t;

```

  2. ID生成器

```c

/*
 * Generate a trace ID: up to 16 random bytes rendered as lowercase hex.
 *
 * buf  - destination buffer
 * size - capacity of buf in bytes, including the terminating NUL
 *
 * A full 32-character ID needs size >= 33. With a smaller buffer the ID is
 * shortened to the number of whole bytes that fit, so the function never
 * writes outside buf[0 .. size-1]. Nothing is written when buf is NULL or
 * size <= 0.
 *
 * Randomness comes from rand(); the caller is expected to seed it.
 */
void generate_trace_id(char *buf, int size) {
    static const char hex_digits[] = "0123456789abcdef";

    if (!buf || size <= 0) {
        return;
    }

    int byte_count = (size - 1) / 2;   /* two hex characters per byte */
    if (byte_count > 16) {
        byte_count = 16;
    }

    int pos = 0;
    for (int i = 0; i < byte_count; i++) {
        unsigned int byte = (unsigned int)rand() & 0xFFu;
        buf[pos++] = hex_digits[byte >> 4];
        buf[pos++] = hex_digits[byte & 0x0Fu];
    }
    buf[pos] = '\0';
}

/*
 * Generate a span ID: 8 random bytes formatted as 16 lowercase hex digits.
 * Output is written with snprintf, so it is cut short when size is below 17.
 */
void generate_span_id(char *buf, int size) {
    unsigned long long value = 0;
    int remaining = 8;

    /* Shift in one random byte per round, most significant byte first. */
    while (remaining-- > 0) {
        value = (value << 8) | (rand() & 0xFF);
    }

    snprintf(buf, size, "%016llx", value);
}

// 获取当前时间戳(微秒)

timestamp_t get_timestamp_us() {

struct timeval tv;

gettimeofday(&tv, NULL);

return (timestamp_t)tv.tv_sec * 1000000 + tv.tv_usec;

}

// 从HTTP头提取Trace信息

void extract_trace_from_headers(const char *headers,

char *trace_id, char *span_id, char *parent_span_id) {

// 简化实现,实际需要解析headers

// X-B3-TraceId: xxx

// X-B3-SpanId: xxx

// X-B3-ParentSpanId: xxx

const char *patterns\[\] = {"X-B3-TraceId:", "X-B3-SpanId:", "X-B3-ParentSpanId:"};

char *dests\[\] = {trace_id, span_id, parent_span_id};

for (int i = 0; i < 3; i++) {

char *start = strstr(headers, patternsi);

if (start) {

start += strlen(patternsi);

while (*start == ' ') start++;

char *end = strchr(start, '\r');

if (!end) end = strchr(start, '\n');

if (end) {

int len = end - start;

if (len < 64) {

strncpy(destsi, start, len);

destsilen = '\0';

}

}

}

}

}

```

  3. 核心追踪器

```c

/*
 * Create a tracer.
 *
 * collector_port - port used by the collector service
 * query_port     - port used by the query service
 *
 * Returns a heap-allocated tracer with max_traces = 10000, sampling_rate = 100
 * and running = 1, or NULL if allocation or mutex initialisation fails.
 * The caller owns the result.
 */
tracer_t *tracer_create(int collector_port, int query_port) {
    tracer_t *t = calloc(1, sizeof *t);
    if (!t) {
        return NULL;
    }

    if (pthread_mutex_init(&t->mutex, NULL) != 0) {
        free(t);
        return NULL;
    }

    t->traces = NULL;
    t->max_traces = 10000;
    t->sampling_rate = 100;
    t->collector_port = collector_port;
    t->query_port = query_port;
    t->running = 1;

    printf("链路追踪启动: collector=%d, query=%d\n", collector_port, query_port);
    return t;
}

/*
 * Allocate a span, give it a fresh span ID and stamp its start time.
 *
 * trace_id       - ID of the owning trace
 * parent_span_id - parent span ID; NULL or "" for a span without a parent
 * service_name   - service name
 * span_name      - span name
 *
 * Strings longer than the corresponding field are truncated rather than
 * overflowing it; NULL strings are stored as "". Returns NULL when allocation
 * fails. The caller owns the result until it is linked into a trace.
 */
span_t *create_span(const char *trace_id, const char *parent_span_id,
                    const char *service_name, const char *span_name) {
    span_t *s = calloc(1, sizeof *s);
    if (!s) {
        return NULL;
    }

    snprintf(s->trace_id, sizeof s->trace_id, "%s", trace_id ? trace_id : "");
    generate_span_id(s->span_id, sizeof(s->span_id));

    if (parent_span_id && parent_span_id[0] != '\0') {
        snprintf(s->parent_span_id, sizeof s->parent_span_id, "%s", parent_span_id);
    }

    snprintf(s->service_name, sizeof s->service_name, "%s", service_name ? service_name : "");
    snprintf(s->span_name, sizeof s->span_name, "%s", span_name ? span_name : "");

    s->start_time = get_timestamp_us();
    s->tag_count = 0;
    s->next = NULL;
    return s;
}

/* Mark a span as finished by recording the current time as its end time. */
void finish_span(span_t *s) {
    const timestamp_t finished_at = get_timestamp_us();

    s->end_time = finished_at;
}

/*
 * Append a key/value tag to a span.
 *
 * The tag is dropped silently when the span already holds MAX_TAGS tags or
 * when any argument is NULL. Keys and values longer than their fields are
 * truncated instead of overflowing.
 */
void span_add_tag(span_t *s, const char *key, const char *value) {
    if (!s || !key || !value) {
        return;
    }
    if (s->tag_count < 0 || s->tag_count >= MAX_TAGS) {
        return;
    }

    snprintf(s->tags[s->tag_count].key, sizeof s->tags[s->tag_count].key, "%s", key);
    snprintf(s->tags[s->tag_count].value, sizeof s->tags[s->tag_count].value, "%s", value);
    s->tag_count++;
}

/*
 * Store a finished trace in the tracer, taking ownership of it.
 *
 * New traces are pushed onto the head of the list, so the oldest trace is the
 * tail. When the list already holds max_traces entries, the tail is unlinked
 * and freed together with its spans before the new trace is inserted.
 */
void tracer_save(tracer_t *t, trace_t *trace) {
    if (!t || !trace) {
        return;
    }

    pthread_mutex_lock(&t->mutex);

    /* Count the stored traces and remember the link that points at the tail. */
    int count = 0;
    trace_t **tail_link = NULL;
    for (trace_t **link = &t->traces; *link; link = &(*link)->next) {
        count++;
        tail_link = link;
    }

    if (tail_link && count >= t->max_traces) {
        trace_t *oldest = *tail_link;
        *tail_link = NULL;

        span_t *span = oldest->spans;
        while (span) {
            span_t *next = span->next;
            free(span);
            span = next;
        }
        free(oldest);
    }

    /* Insert at the head. */
    trace->next = t->traces;
    t->traces = trace;

    pthread_mutex_unlock(&t->mutex);
}

/*
 * Start a new trace with a root span.
 *
 * t            - tracer whose sampling_rate (0-100) decides whether to sample
 * service_name - service name for the root span
 * span_name    - name of the root span
 *
 * Returns the new trace, whose only span is the root span, or NULL when the
 * request is not sampled or an allocation fails. A sampling rate of 0 samples
 * nothing and a rate of 100 or more samples everything. The caller owns the
 * trace until it is handed to tracer_save().
 */
trace_t *tracer_start_trace(tracer_t *t, const char *service_name, const char *span_name) {
    if (!t) {
        return NULL;
    }
    if (t->sampling_rate < 100 && (rand() % 100) >= t->sampling_rate) {
        return NULL;   /* not sampled */
    }

    trace_t *trace = calloc(1, sizeof *trace);
    if (!trace) {
        return NULL;
    }
    generate_trace_id(trace->trace_id, sizeof(trace->trace_id));

    /* Create the root span; it has no parent. */
    span_t *root = create_span(trace->trace_id, NULL, service_name, span_name);
    if (!root) {
        free(trace);
        return NULL;
    }

    root->next = NULL;
    trace->spans = root;
    trace->span_count = 1;
    trace->next = NULL;
    return trace;
}

/*
 * Create a child span inside an existing trace.
 *
 * t              - tracer (currently unused)
 * trace          - trace to extend; NULL means the request is not traced
 * service_name   - service name for the new span
 * span_name      - name of the new span
 * parent_span_id - ID of the parent span
 *
 * The span is pushed onto the head of trace->spans, which then owns it.
 * Returns the span, or NULL when trace is NULL or the span cannot be created.
 */
span_t *tracer_start_span(tracer_t *t, trace_t *trace,
                          const char *service_name, const char *span_name,
                          const char *parent_span_id) {
    (void)t;

    if (!trace) {
        return NULL;
    }

    span_t *s = create_span(trace->trace_id, parent_span_id, service_name, span_name);
    if (!s) {
        return NULL;
    }

    s->next = trace->spans;
    trace->spans = s;
    trace->span_count++;
    return s;
}

```

  4. 数据收集服务

```c

// 解析并保存收集到的Span数据

void handle_collector_request(tracer_t *t, int client_fd) {

char buffer65536;

int n = recv(client_fd, buffer, sizeof(buffer) - 1, 0);

if (n <= 0) {

close(client_fd);

return;

}

buffern = '\0';

// 解析Trace数据(简化版,实际用JSON/Protobuf)

// 格式: TRACE|trace_id|service_name|span_name|parent_id|duration|tag1=val1|tag2=val2...

trace_t *trace = malloc(sizeof(trace_t));

memset(trace, 0, sizeof(trace_t));

trace->spans = NULL;

trace->span_count = 0;

char *line = strtok(buffer, "\n");

while (line) {

// 解析每一行Span数据

char trace_id64, service64, span_name128, parent_id64;

long long duration;

if (sscanf(line, "SPAN|%\^\||%\^\||%\^\||%\^\||%lld",

trace_id, service, span_name, parent_id, &duration) == 5) {

if (trace->span_count == 0) {

strcpy(trace->trace_id, trace_id);

}

span_t *s = create_span(trace_id, parent_id, service, span_name);

s->end_time = s->start_time + duration;

// 解析tags

char *tags_start = strchr(line, '|');

for (int i = 0; i < 5 && tags_start; i++) {

tags_start = strchr(tags_start + 1, '|');

}

if (tags_start) {

tags_start++;

char *tag = strtok(tags_start, "|");

while (tag) {

char *eq = strchr(tag, '=');

if (eq) {

*eq = '\0';

span_add_tag(s, tag, eq + 1);

}

tag = strtok(NULL, "|");

}

}

s->next = trace->spans;

trace->spans = s;

trace->span_count++;

}

line = strtok(NULL, "\n");

}

if (trace->span_count > 0) {

tracer_save(t, trace);

send(client_fd, "OK\n", 3, 0);

} else {

free(trace);

send(client_fd, "ERROR\n", 6, 0);

}

close(client_fd);

}

/*
 * Collector service thread entry point.
 *
 * arg - the tracer_t* whose collector_port is listened on.
 *
 * Accepts connections one at a time and passes each to
 * handle_collector_request(). The loop waits in select() with a one-second
 * timeout instead of blocking in accept(), so clearing t->running ends the
 * thread within about a second. Returns NULL; exits early if the listening
 * socket cannot be set up.
 *
 * NOTE(review): t->running is a plain int shared between threads without
 * synchronisation — consider an atomic flag.
 */
void *collector_thread(void *arg) {
    tracer_t *t = (tracer_t *)arg;

    int server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (server_fd < 0) {
        perror("collector: socket");
        return NULL;
    }

    int opt = 1;
    setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = INADDR_ANY;
    addr.sin_port = htons((unsigned short)t->collector_port);

    if (bind(server_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(server_fd, 128) < 0) {
        perror("collector: bind/listen");
        close(server_fd);
        return NULL;
    }

    printf("收集服务启动: %d\n", t->collector_port);

    while (t->running) {
        fd_set readable;
        FD_ZERO(&readable);
        FD_SET(server_fd, &readable);
        struct timeval timeout = { .tv_sec = 1, .tv_usec = 0 };

        /* Timeout or interruption: loop round and re-check the running flag. */
        if (select(server_fd + 1, &readable, NULL, NULL, &timeout) <= 0) {
            continue;
        }

        int client_fd = accept(server_fd, NULL, NULL);
        if (client_fd < 0) {
            continue;
        }
        handle_collector_request(t, client_fd);
    }

    close(server_fd);
    return NULL;
}

```

  5. 查询服务

```c

/*
 * Append printf-style formatted text to buf at offset *used without ever
 * writing past cap bytes. On success advances *used and returns 0. If the
 * text does not fit, buf is restored to its previous content and -1 is
 * returned.
 */
static int query_appendf(char *buf, size_t cap, size_t *used, const char *fmt, ...) {
    if (*used >= cap) {
        return -1;
    }

    va_list args;
    va_start(args, fmt);
    int n = vsnprintf(buf + *used, cap - *used, fmt, args);
    va_end(args);

    if (n < 0 || (size_t)n >= cap - *used) {
        buf[*used] = '\0';
        return -1;
    }
    *used += (size_t)n;
    return 0;
}

/*
 * Answer one query and close client_fd.
 *
 * Supported requests:
 *   QUERY|<trace_id>  - replies "TRACE|<trace_id>\n" followed by one
 *                       "SPAN|span_id|parent_span_id|service|duration[|k=v...]\n"
 *                       line per span, or "NOT_FOUND\n"
 *   LIST              - replies with up to 100 stored trace IDs, one per line
 * Any other request gets no reply. The trace_id ends at the first CR or LF.
 * A response that would exceed the 64 KiB buffer is cut at the last complete
 * line. The tracer mutex is held only while the response is built.
 */
void handle_query_request(tracer_t *t, int client_fd, const char *request) {
    char response[65536];
    size_t used = 0;
    response[0] = '\0';

    if (strncmp(request, "QUERY|", 6) == 0) {
        const char *wanted = request + 6;
        size_t wanted_len = strcspn(wanted, "\r\n");
        int found = 0;

        pthread_mutex_lock(&t->mutex);
        for (trace_t *trace = t->traces; trace; trace = trace->next) {
            if (strlen(trace->trace_id) != wanted_len ||
                memcmp(trace->trace_id, wanted, wanted_len) != 0) {
                continue;
            }

            found = 1;
            query_appendf(response, sizeof response, &used, "TRACE|%s\n", trace->trace_id);

            for (span_t *span = trace->spans; span; span = span->next) {
                size_t line_start = used;
                long long duration = span->end_time - span->start_time;

                int rc = query_appendf(response, sizeof response, &used, "SPAN|%s|%s|%s|%lld",
                                       span->span_id, span->parent_span_id,
                                       span->service_name, duration);
                for (int i = 0; rc == 0 && i < span->tag_count; i++) {
                    rc = query_appendf(response, sizeof response, &used, "|%s=%s",
                                       span->tags[i].key, span->tags[i].value);
                }
                if (rc == 0) {
                    rc = query_appendf(response, sizeof response, &used, "\n");
                }
                if (rc != 0) {
                    /* Out of room: drop the partial line and stop. */
                    used = line_start;
                    response[used] = '\0';
                    break;
                }
            }
            break;   /* only the first matching trace is reported */
        }
        pthread_mutex_unlock(&t->mutex);

        if (found) {
            send(client_fd, response, used, 0);
        } else {
            send(client_fd, "NOT_FOUND\n", 10, 0);
        }
    } else if (strncmp(request, "LIST", 4) == 0) {
        /* List the most recently stored traces. */
        pthread_mutex_lock(&t->mutex);
        int count = 0;
        for (trace_t *trace = t->traces; trace && count < 100; trace = trace->next) {
            if (query_appendf(response, sizeof response, &used, "%s\n", trace->trace_id) != 0) {
                break;
            }
            count++;
        }
        pthread_mutex_unlock(&t->mutex);

        send(client_fd, response, used, 0);
    }

    close(client_fd);
}

/*
 * Query service thread entry point.
 *
 * arg - the tracer_t* whose query_port is listened on.
 *
 * Accepts connections one at a time, reads a single request of at most 4095
 * bytes and passes it to handle_query_request(), which closes the connection.
 * The loop waits in select() with a one-second timeout instead of blocking in
 * accept(), so clearing t->running ends the thread within about a second.
 * Returns NULL; exits early if the listening socket cannot be set up.
 *
 * NOTE(review): t->running is a plain int shared between threads without
 * synchronisation — consider an atomic flag.
 */
void *query_thread(void *arg) {
    tracer_t *t = (tracer_t *)arg;

    int server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (server_fd < 0) {
        perror("query: socket");
        return NULL;
    }

    int opt = 1;
    setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));

    struct sockaddr_in addr;
    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    addr.sin_addr.s_addr = INADDR_ANY;
    addr.sin_port = htons((unsigned short)t->query_port);

    if (bind(server_fd, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
        listen(server_fd, 128) < 0) {
        perror("query: bind/listen");
        close(server_fd);
        return NULL;
    }

    printf("查询服务启动: %d\n", t->query_port);

    while (t->running) {
        fd_set readable;
        FD_ZERO(&readable);
        FD_SET(server_fd, &readable);
        struct timeval timeout = { .tv_sec = 1, .tv_usec = 0 };

        /* Timeout or interruption: loop round and re-check the running flag. */
        if (select(server_fd + 1, &readable, NULL, NULL, &timeout) <= 0) {
            continue;
        }

        int client_fd = accept(server_fd, NULL, NULL);
        if (client_fd < 0) {
            continue;
        }

        char buffer[4096];
        ssize_t n = recv(client_fd, buffer, sizeof(buffer) - 1, 0);
        if (n > 0) {
            buffer[n] = '\0';
            handle_query_request(t, client_fd, buffer);
        } else {
            close(client_fd);
        }
    }

    close(server_fd);
    return NULL;
}

```

  6. HTTP中间件(集成示例)

```c

/*
 * Simulated HTTP request handler showing how tracing is integrated.
 *
 * t         - tracer
 * trace     - trace to extend when the request headers already carry a trace
 *             ID; ignored and replaced by a new trace when they do not
 * headers   - raw request headers
 * client_fd - connection the response is written to
 *
 * When the headers carry no X-B3-TraceId, a new trace is started (subject to
 * sampling) and its root span becomes the parent of the processing span.
 * Otherwise the caller's X-B3-SpanId is used as the parent. A response is
 * always sent; it carries X-B3-TraceId whenever a trace ID is known.
 *
 * NOTE(review): the trace is handed to tracer_save() at the end — a trace
 * passed in by the caller must not already be stored in the tracer.
 */
void handle_http_request(tracer_t *t, trace_t *trace, const char *headers, int client_fd) {
    char trace_id[MAX_TRACE_ID] = "";
    char span_id[MAX_SPAN_ID] = "";
    char parent_span_id[MAX_SPAN_ID] = "";
    span_t *root = NULL;

    extract_trace_from_headers(headers, trace_id, span_id, parent_span_id);

    /* No incoming trace context: this is a new request, start a new trace. */
    if (trace_id[0] == '\0') {
        trace = tracer_start_trace(t, "gateway", "http.request");
        if (trace) {
            root = trace->spans;
            snprintf(trace_id, sizeof trace_id, "%s", trace->trace_id);
            snprintf(span_id, sizeof span_id, "%s", root->span_id);
        }
    }

    span_t *span = tracer_start_span(t, trace, "backend-service", "process.request", span_id);
    if (span) {
        span_add_tag(span, "http.method", "GET");
        span_add_tag(span, "http.path", "/api/users");
        /* ... business logic goes here ... */
        finish_span(span);
    }

    if (root) {
        finish_span(root);   /* the root span created above ends with the request */
    }
    if (trace) {
        tracer_save(t, trace);
    }

    /* Always answer the client, sampled or not. */
    char response[256];
    if (trace_id[0] != '\0') {
        snprintf(response, sizeof(response),
                 "HTTP/1.1 200 OK\r\n"
                 "X-B3-TraceId: %s\r\n"
                 "\r\n"
                 "OK\n", trace_id);
    } else {
        snprintf(response, sizeof(response),
                 "HTTP/1.1 200 OK\r\n"
                 "\r\n"
                 "OK\n");
    }
    send(client_fd, response, strlen(response), 0);
}

```

  7. 采样策略

```c

/* Available sampling strategies. */
typedef enum {
    SAMPLING_RATE  = 0,  /* fixed-percentage sampling */
    SAMPLING_QUOTA = 1,  /* quota sampling */
    SAMPLING_LAST  = 2   /* tail sampling */
} sampling_strategy_t;

/* Sampling configuration: the strategy plus the one parameter it uses. */
typedef struct {
    sampling_strategy_t strategy;
    union {
        int    rate;     /* 0-100 */
        int    quota;    /* samples per minute */
        double latency;  /* tail-sampling latency threshold (ms) */
    };
} sampling_config_t;

/*
 * Decide whether a request should be sampled.
 *
 * config     - sampling strategy and its parameter
 * start_time - request start time (currently unused)
 *
 * Returns 1 to sample and 0 to skip:
 *   SAMPLING_RATE  - samples roughly config->rate percent of calls
 *   SAMPLING_QUOTA - samples at most config->quota calls per 60-second window
 *   SAMPLING_LAST  - always 1; the real decision is made after the fact
 *
 * The quota counters are function-level statics shared by all callers and are
 * not protected by a lock.
 */
int should_sample(sampling_config_t *config, timestamp_t start_time) {
    static int quota_used = 0;
    static time_t last_reset = 0;

    (void)start_time;

    switch (config->strategy) {
    case SAMPLING_RATE:
        return (rand() % 100) < config->rate;

    case SAMPLING_QUOTA: {
        /* Braces give `now` its own scope: a declaration may not directly
         * follow a case label before C23. */
        time_t now = time(NULL);
        if (now - last_reset >= 60) {
            quota_used = 0;
            last_reset = now;
        }
        if (quota_used < config->quota) {
            quota_used++;
            return 1;
        }
        return 0;
    }

    case SAMPLING_LAST:
        /* Record every request first; sampling is decided afterwards. */
        return 1;

    default:
        return 1;
    }
}

```


三、测试代码

```c

int main() {

printf("=== 分布式链路追踪测试 ===\n\n");

srand(time(NULL));

// 创建追踪器

tracer_t *t = tracer_create(9090, 9091);

// 启动收集和查询服务

pthread_t collect_thread, query_thread;

pthread_create(&collect_thread, NULL, collector_thread, t);

pthread_create(&query_thread, NULL, query_thread, t);

// 模拟三个服务调用

printf("模拟分布式调用链...\n");

// 服务A: 网关收到请求

trace_t *trace = tracer_start_trace(t, "gateway", "http.request");

if (trace) {

span_t *gateway_span = trace->spans;

span_add_tag(gateway_span, "http.method", "GET");

span_add_tag(gateway_span, "http.path", "/api/order");

span_add_tag(gateway_span, "status", "200");

// 模拟调用服务B (延时)

usleep(50000);

// 服务B: 订单服务

span_t *order_span = tracer_start_span(t, trace, "order-service", "get.order",

gateway_span->span_id);

if (order_span) {

span_add_tag(order_span, "order.id", "12345");

span_add_tag(order_span, "db.operation", "select");

// 模拟调用服务C (延时)

usleep(30000);

// 服务C: 用户服务

span_t *user_span = tracer_start_span(t, trace, "user-service", "get.user",

order_span->span_id);

if (user_span) {

span_add_tag(user_span, "user.id", "67890");

span_add_tag(user_span, "cache.hit", "true");

usleep(20000);

finish_span(user_span);

}

finish_span(order_span);

}

finish_span(gateway_span);

// 保存完整的Trace

tracer_save(t, trace);

printf("Trace完成: %s\n", trace->trace_id);

}

// 等待数据写入

sleep(1);

printf("\n查询最近Trace:\n");

// 通过模拟查询(实际可以通过客户端发送查询请求)

sleep(2);

t->running = 0;

pthread_join(collect_thread, NULL);

pthread_join(query_thread, NULL);

free(t);

return 0;

}

```


四、编译和运行

```bash

gcc -o tracing tracing.c -lpthread

./tracing

```

输出示例:

```

=== 分布式链路追踪测试 ===

链路追踪启动: collector=9090, query=9091

收集服务启动: 9090

查询服务启动: 9091

模拟分布式调用链...

Trace完成: 52a7e3b2d1c4e8f9a1b2c3d4e5f6a7b8

查询最近Trace:

52a7e3b2d1c4e8f9a1b2c3d4e5f6a7b8

```


五、可视化示例

```html

<!DOCTYPE html>

<html>

<head><title>链路追踪</title></head>

<body>

<h1>Trace: <span id="trace_id"></span></h1>

<div id="timeline"></div>

<script>

/**
 * Fetch a trace and draw its spans as horizontal bars on a shared timeline.
 *
 * Reads `/api/trace/<traceId>` and expects JSON of the form
 * `{ spans: [{ start, end }, ...] }`. Each bar is positioned and sized as a
 * percentage of the whole trace duration. Shows an error message in the
 * timeline when the request fails.
 *
 * @param {string} traceId - ID of the trace to display
 */
function renderTrace(traceId) {
    const timeline = document.getElementById('timeline');
    document.getElementById('trace_id').textContent = traceId;

    fetch('/api/trace/' + encodeURIComponent(traceId))
        .then(r => {
            if (!r.ok) {
                throw new Error('HTTP ' + r.status);
            }
            return r.json();
        })
        .then(data => {
            const spans = (data && data.spans) || [];
            timeline.innerHTML = '';
            if (spans.length === 0) {
                return;
            }

            // Timeline bounds; fall back to 1 so a zero-length trace does not divide by zero.
            const minTime = Math.min(...spans.map(s => s.start));
            const maxTime = Math.max(...spans.map(s => s.end));
            const duration = (maxTime - minTime) || 1;

            spans.forEach(span => {
                const left = (span.start - minTime) / duration * 100;
                const width = (span.end - span.start) / duration * 100;

                // One bar per span; keep very short spans visible.
                const bar = document.createElement('div');
                bar.style.marginLeft = left + '%';
                bar.style.width = Math.max(width, 0.5) + '%';
                bar.style.height = '16px';
                bar.style.marginBottom = '4px';
                bar.style.background = '#4a90d9';
                timeline.appendChild(bar);
            });
        })
        .catch(err => {
            timeline.textContent = '加载失败: ' + err.message;
        });
}

</script>

</body>

</html>

```


六、总结

通过这篇文章,你学会了:

· 链路追踪的核心概念(Trace、Span、TraceId)

· 分布式上下文传递(HTTP头/RPC上下文)

· Span的创建、管理和标签

· 数据收集和查询服务

· 采样策略(比例、配额、尾采样)

分布式链路追踪是可观测性三大支柱之一。掌握它,你就拥有了排查微服务问题的"火眼金睛"。

下一篇预告:《从零实现一个服务网格:Sidecar与流量管理》


评论区分享一下你遇到过的最难排查的分布式问题~

相关推荐
森G2 小时前
78、框架分析------服务器源码解析----云视频服务项目
服务器·c++·qt
我不是懒洋洋2 小时前
【C++】string(string的成员变量、auto和范围for、string常用接口的说明、OJ题目、string的模拟实现)
c语言·开发语言·c++·visual studio
Brilliantwxx2 小时前
【C++】 C++11 知识点梳理(中)
开发语言·c++
j7~2 小时前
【C++】STL--Vector容器--拆析解剖Vector的实现以及Vector的底层详解(2)
开发语言·c++·动态二维数组·vector深度剖析·vector的实现·杨辉三角形
旖-旎3 小时前
《LeetCode 130 被围绕的区域 FloodFill DFS 解法》
c++·算法·深度优先·力扣·floodfill
一只旭宝10 小时前
【C++入门精讲22】常见设计模式
c++·设计模式
c++之路12 小时前
Bazel C++ 构建系列文档(三):构建第一个 C++ 项目
开发语言·c++
旖-旎12 小时前
《LeetCode 695 岛屿的最大面积 FloodFill DFS 解法》
c++·算法·力扣·深度优先遍历·floodfill
森G12 小时前
61、信号与槽机制在 TCP 编程中的应用---------网络编程
网络·c++·qt·网络协议·tcp/ip