从零实现一个全链路监控平台:Metrics与Alerting

前言

你有没有想过:当系统出现故障时,你怎么知道是哪个服务、哪个接口出了问题?CPU飙高、内存泄漏、接口变慢——这些问题怎样才能提前发现?

全链路监控平台的基础是指标(Metrics)——可观测性三大支柱(Metrics + Logging + Tracing)之一。今天我们从零实现:

· 指标采集(Counter、Gauge、Histogram)

· 指标聚合与存储

· 告警规则引擎

· 告警通知

· 可视化Dashboard


一、监控平台核心原理

  1. 架构图

```

┌─────────────────────────────────────────────────────────────┐
│                          应用服务                           │
│  ┌──────────┐   ┌──────────┐   ┌──────────┐                 │
│  │ 指标采集 │ → │ 指标聚合 │ → │ 指标上报 │                 │
│  └──────────┘   └──────────┘   └──────────┘                 │
└─────────────────────────────────────────────────────────────┘
                               ↓
┌─────────────────────────────────────────────────────────────┐
│                          监控中心                           │
│  ┌─────────────┐  ┌─────────────┐  ┌─────────────┐          │
│  │ 指标存储    │  │ 规则引擎    │  │ 告警通知    │          │
│  │ (时序数据)  │  │ (阈值判断)  │  │ (钉钉/邮件) │          │
│  └─────────────┘  └─────────────┘  └─────────────┘          │
└─────────────────────────────────────────────────────────────┘
                               ↓
                        ┌─────────────┐
                        │ Dashboard   │
                        │ (可视化)    │
                        └─────────────┘

```

  2. 核心概念

| 概念 | 说明 | 示例 |
| --- | --- | --- |
| Counter | 只增不减的计数器 | 请求总数 |
| Gauge | 可增可减的测量值 | CPU使用率、内存使用量 |
| Histogram | 分布统计 | 请求延迟(P50/P95/P99) |
| Label | 维度标签 | 服务名、接口名、状态码 |


二、完整代码实现

  1. 基础数据结构

```c

#include <stdio.h>

#include <stdlib.h>

#include <string.h>

#include <unistd.h>

#include <pthread.h>

#include <time.h>

#include <errno.h>

#include <math.h>

// Size, in bytes, of the metric-name buffers declared in this file.
#define MAX_METRIC_NAME 128

// Number of label slots reserved per metric series.
#define MAX_LABELS 8

// Size, in bytes, of a label key buffer.
#define MAX_LABEL_KEY 32

// Size, in bytes, of a label value buffer.
#define MAX_LABEL_VALUE 64

// Number of bucket slots reserved per histogram series.
#define MAX_HISTOGRAM_BUCKETS 20

// NOTE(review): not referenced by any code visible in this file — confirm it is still needed.
#define MAX_ALERTS 100

// Kind of value a metric series holds.
typedef enum {
    METRIC_COUNTER   = 0,  // accumulated through metric_counter_add()
    METRIC_GAUGE     = 1,  // overwritten through metric_gauge_set()
    METRIC_HISTOGRAM = 2,  // fed through metric_histogram_observe()
    METRIC_SUMMARY   = 3   // declared, but no code visible in this file produces it
} metric_type_t;

// 标签

typedef struct label {

char keyMAX_LABEL_KEY;

char valueMAX_LABEL_VALUE;

} label_t;

// 指标值

typedef struct metric_value {

metric_type_t type;

char nameMAX_METRIC_NAME;

label_t labelsMAX_LABELS;

int label_count;

double value;

double sum;

double count;

double bucketsMAX_HISTOGRAM_BUCKETS;

double bucket_upperMAX_HISTOGRAM_BUCKETS;

int bucket_count;

time_t timestamp;

struct metric_value *next;

} metric_value_t;

// 告警规则

typedef struct alert_rule {

char name64;

char metric_nameMAX_METRIC_NAME;

char condition16; // ">", "<", ">=", "<=", "=="

double threshold;

int for_seconds; // 持续时间

char severity16; // "critical", "warning", "info"

char message256;

struct alert_rule *next;

} alert_rule_t;

// 告警事件

typedef struct alert_event {

char rule_name64;

char metric_nameMAX_METRIC_NAME;

char severity16;

char message256;

double current_value;

time_t start_time;

time_t end_time;

int active;

struct alert_event *next;

} alert_event_t;

// Root object of the monitoring platform. It holds the heads of the three
// linked lists (series, rules, events) and the mutex the functions in this
// file take before walking or modifying them.
struct monitor_platform {
    metric_value_t *metrics;       // head of the metric series list
    alert_rule_t   *alert_rules;   // head of the alert rule list
    alert_event_t  *alert_events;  // head of the alert event list
    pthread_mutex_t mutex;         // guards the three lists above
    int             retention_days;  // set at creation; not consulted by the visible code
    int             running;       // non-zero while the evaluation thread should keep looping
};

typedef struct monitor_platform monitor_platform_t;

```

  2. 指标采集

```c

// Allocate and initialise a monitoring platform.
//
// The returned object starts with empty metric/rule/event lists,
// retention_days = 7 and running = 1.
//
// Returns NULL if the allocation or the mutex initialisation fails.
// (Previously an allocation failure was passed straight to memset, and the
// pthread_mutex_init result was ignored.)
monitor_platform_t *monitor_create(void) {
    // calloc zero-fills, so the three list heads start out as NULL.
    monitor_platform_t *mp = calloc(1, sizeof *mp);
    if (mp == NULL) {
        return NULL;
    }

    mp->retention_days = 7;
    mp->running = 1;

    if (pthread_mutex_init(&mp->mutex, NULL) != 0) {
        free(mp);
        return NULL;
    }

    printf("监控平台启动\n");
    return mp;
}

// Add `delta` to the counter series identified by `name` plus its label set,
// creating the series (with initial value `delta`) on first use.
//
// mp          platform that owns the series list
// name        metric name; truncated to MAX_METRIC_NAME - 1 bytes
// labels      array of `label_count` labels; may be NULL when label_count is 0
// label_count number of labels; clamped to [0, MAX_LABELS]
// delta       amount to add
//
// Labels are compared position by position, so the same labels in a different
// order identify a different series. On allocation failure the update is
// dropped and a message is written to stderr.
void metric_counter_add(monitor_platform_t *mp, const char *name,
                        label_t *labels, int label_count, double delta) {
    // Clamp so the stored count can never index past labels[MAX_LABELS - 1].
    if (labels == NULL || label_count < 0) {
        label_count = 0;
    } else if (label_count > MAX_LABELS) {
        label_count = MAX_LABELS;
    }

    // Compare against the name exactly as it would be stored, so an over-long
    // name still finds its own (truncated) series instead of creating a new one
    // on every call.
    char key[MAX_METRIC_NAME];
    snprintf(key, sizeof key, "%s", name);

    pthread_mutex_lock(&mp->mutex);

    for (metric_value_t *m = mp->metrics; m != NULL; m = m->next) {
        if (m->type != METRIC_COUNTER || m->label_count != label_count ||
            strcmp(m->name, key) != 0) {
            continue;
        }

        int same_labels = 1;
        for (int i = 0; i < label_count; i++) {
            if (strcmp(m->labels[i].key, labels[i].key) != 0 ||
                strcmp(m->labels[i].value, labels[i].value) != 0) {
                same_labels = 0;
                break;
            }
        }

        if (same_labels) {
            m->value += delta;
            m->timestamp = time(NULL);
            pthread_mutex_unlock(&mp->mutex);
            return;
        }
    }

    // No existing series: create one. calloc zero-fills the histogram fields
    // that a counter never touches, so nothing is left uninitialised.
    metric_value_t *created = calloc(1, sizeof *created);
    if (created == NULL) {
        pthread_mutex_unlock(&mp->mutex);
        fprintf(stderr, "metric_counter_add: out of memory, update dropped\n");
        return;
    }

    created->type = METRIC_COUNTER;
    memcpy(created->name, key, sizeof created->name);
    created->label_count = label_count;
    for (int i = 0; i < label_count; i++) {
        snprintf(created->labels[i].key, sizeof created->labels[i].key,
                 "%s", labels[i].key);
        snprintf(created->labels[i].value, sizeof created->labels[i].value,
                 "%s", labels[i].value);
    }
    created->value = delta;
    created->timestamp = time(NULL);

    // Push onto the head of the list.
    created->next = mp->metrics;
    mp->metrics = created;

    pthread_mutex_unlock(&mp->mutex);
}

// Set the gauge series identified by `name` plus its label set to `value`,
// creating the series on first use.
//
// mp          platform that owns the series list
// name        metric name; truncated to MAX_METRIC_NAME - 1 bytes
// labels      array of `label_count` labels; may be NULL when label_count is 0
// label_count number of labels; clamped to [0, MAX_LABELS]
// value       new reading; replaces the previous one
//
// Labels are compared position by position. On allocation failure the update
// is dropped and a message is written to stderr.
void metric_gauge_set(monitor_platform_t *mp, const char *name,
                      label_t *labels, int label_count, double value) {
    // Clamp so the stored count can never index past labels[MAX_LABELS - 1].
    if (labels == NULL || label_count < 0) {
        label_count = 0;
    } else if (label_count > MAX_LABELS) {
        label_count = MAX_LABELS;
    }

    // Look up by the name as it would be stored (possibly truncated).
    char key[MAX_METRIC_NAME];
    snprintf(key, sizeof key, "%s", name);

    pthread_mutex_lock(&mp->mutex);

    for (metric_value_t *m = mp->metrics; m != NULL; m = m->next) {
        if (m->type != METRIC_GAUGE || m->label_count != label_count ||
            strcmp(m->name, key) != 0) {
            continue;
        }

        int same_labels = 1;
        for (int i = 0; i < label_count; i++) {
            if (strcmp(m->labels[i].key, labels[i].key) != 0 ||
                strcmp(m->labels[i].value, labels[i].value) != 0) {
                same_labels = 0;
                break;
            }
        }

        if (same_labels) {
            m->value = value;
            m->timestamp = time(NULL);
            pthread_mutex_unlock(&mp->mutex);
            return;
        }
    }

    // No existing series: create one. calloc zero-fills the unused fields.
    metric_value_t *created = calloc(1, sizeof *created);
    if (created == NULL) {
        pthread_mutex_unlock(&mp->mutex);
        fprintf(stderr, "metric_gauge_set: out of memory, update dropped\n");
        return;
    }

    created->type = METRIC_GAUGE;
    memcpy(created->name, key, sizeof created->name);
    created->label_count = label_count;
    for (int i = 0; i < label_count; i++) {
        snprintf(created->labels[i].key, sizeof created->labels[i].key,
                 "%s", labels[i].key);
        snprintf(created->labels[i].value, sizeof created->labels[i].value,
                 "%s", labels[i].value);
    }
    created->value = value;
    created->timestamp = time(NULL);

    // Push onto the head of the list.
    created->next = mp->metrics;
    mp->metrics = created;

    pthread_mutex_unlock(&mp->mutex);
}

// Record one observation in the histogram series identified by `name` plus its
// label set, creating the series with the default bucket bounds on first use.
//
// mp          platform that owns the series list
// name        metric name; truncated to MAX_METRIC_NAME - 1 bytes
// labels      array of `label_count` labels; may be NULL when label_count is 0
// label_count number of labels; clamped to [0, MAX_LABELS]
// value       the observed sample
//
// Buckets are cumulative: an observation is counted in every bucket whose
// upper bound is >= value. The first observation was already recorded that
// way; later observations used to stop at the first matching bucket, which
// made the counts inconsistent. `sum` and `count` include every observation,
// also those above the largest bound.
// On allocation failure the observation is dropped and a message is written
// to stderr.
void metric_histogram_observe(monitor_platform_t *mp, const char *name,
                              label_t *labels, int label_count, double value) {
    // Default bucket upper bounds; the count must not exceed MAX_HISTOGRAM_BUCKETS.
    static const double default_upper[] = {
        0.005, 0.01, 0.025, 0.05, 0.1, 0.25, 0.5, 1, 2.5, 5, 10
    };
    const int default_count =
        (int)(sizeof default_upper / sizeof default_upper[0]);

    // Clamp so the stored count can never index past labels[MAX_LABELS - 1].
    if (labels == NULL || label_count < 0) {
        label_count = 0;
    } else if (label_count > MAX_LABELS) {
        label_count = MAX_LABELS;
    }

    // Look up by the name as it would be stored (possibly truncated).
    char key[MAX_METRIC_NAME];
    snprintf(key, sizeof key, "%s", name);

    pthread_mutex_lock(&mp->mutex);

    // Find the matching series; `m` is NULL afterwards if there is none.
    metric_value_t *m = mp->metrics;
    for (; m != NULL; m = m->next) {
        if (m->type != METRIC_HISTOGRAM || m->label_count != label_count ||
            strcmp(m->name, key) != 0) {
            continue;
        }

        int same_labels = 1;
        for (int i = 0; i < label_count; i++) {
            if (strcmp(m->labels[i].key, labels[i].key) != 0 ||
                strcmp(m->labels[i].value, labels[i].value) != 0) {
                same_labels = 0;
                break;
            }
        }
        if (same_labels) {
            break;
        }
    }

    if (m == NULL) {
        // calloc zero-fills sum, count, buckets and `value`, so the series
        // starts empty and the shared recording code below handles the first
        // observation like any other.
        m = calloc(1, sizeof *m);
        if (m == NULL) {
            pthread_mutex_unlock(&mp->mutex);
            fprintf(stderr,
                    "metric_histogram_observe: out of memory, observation dropped\n");
            return;
        }

        m->type = METRIC_HISTOGRAM;
        memcpy(m->name, key, sizeof m->name);
        m->label_count = label_count;
        for (int i = 0; i < label_count; i++) {
            snprintf(m->labels[i].key, sizeof m->labels[i].key,
                     "%s", labels[i].key);
            snprintf(m->labels[i].value, sizeof m->labels[i].value,
                     "%s", labels[i].value);
        }

        m->bucket_count = default_count;
        for (int i = 0; i < default_count; i++) {
            m->bucket_upper[i] = default_upper[i];
        }

        // Push onto the head of the list.
        m->next = mp->metrics;
        mp->metrics = m;
    }

    m->sum += value;
    m->count += 1;
    for (int i = 0; i < m->bucket_count; i++) {
        if (value <= m->bucket_upper[i]) {
            m->buckets[i] += 1;
        }
    }
    m->timestamp = time(NULL);

    pthread_mutex_unlock(&mp->mutex);
}

```

  3. 告警规则引擎

```c

// Register a threshold alert rule and log it to stdout.
//
// mp           platform that owns the rule list
// name         rule identifier (stored in a 64-byte buffer)
// metric_name  metric the rule watches
// condition    comparison operator: ">", "<", ">=", "<=" or "=="
// threshold    value the metric is compared against
// for_seconds  intended hold duration; stored with the rule
// severity     "critical", "warning" or "info" (stored in a 16-byte buffer)
// message      human-readable description (stored in a 256-byte buffer)
//
// Every string is copied with truncation to its field size; the previous
// unbounded strcpy could overflow the fixed-size fields. On allocation
// failure nothing is registered and a message is written to stderr.
void monitor_add_alert_rule(monitor_platform_t *mp, const char *name,
                            const char *metric_name, const char *condition,
                            double threshold, int for_seconds,
                            const char *severity, const char *message) {
    // Build the rule completely before taking the lock, so the critical
    // section is only the list insertion.
    alert_rule_t *rule = calloc(1, sizeof *rule);
    if (rule == NULL) {
        fprintf(stderr, "monitor_add_alert_rule: out of memory, rule '%s' not added\n",
                name);
        return;
    }

    snprintf(rule->name, sizeof rule->name, "%s", name);
    snprintf(rule->metric_name, sizeof rule->metric_name, "%s", metric_name);
    snprintf(rule->condition, sizeof rule->condition, "%s", condition);
    snprintf(rule->severity, sizeof rule->severity, "%s", severity);
    snprintf(rule->message, sizeof rule->message, "%s", message);
    rule->threshold = threshold;
    rule->for_seconds = for_seconds;

    pthread_mutex_lock(&mp->mutex);
    rule->next = mp->alert_rules;
    mp->alert_rules = rule;
    pthread_mutex_unlock(&mp->mutex);

    printf("告警 添加规则: %s (%s %s %.2f)\n",
           name, metric_name, condition, threshold);
}

// Evaluate "value <condition> threshold".
//
// `condition` is one of ">", "<", ">=", "<=" or "==". Equality is approximate:
// the two numbers count as equal when they differ by less than 0.0001.
// Returns 1 when the comparison holds, 0 when it does not or when the operator
// is not recognised.
int check_condition(double value, const char *condition, double threshold) {
    static const char *const operators[] = { ">", "<", ">=", "<=", "==" };
    const size_t operator_count = sizeof operators / sizeof operators[0];

    // Index of the operator in the table; operator_count means "unknown".
    size_t which = 0;
    while (which < operator_count && strcmp(condition, operators[which]) != 0) {
        which++;
    }

    switch (which) {
    case 0:
        return value > threshold;
    case 1:
        return value < threshold;
    case 2:
        return value >= threshold;
    case 3:
        return value <= threshold;
    case 4:
        return fabs(value - threshold) < 0.0001;
    default:
        return 0;
    }
}

// Evaluate every alert rule against the current metric values, creating new
// alert events and resolving recovered ones. Runs entirely under mp->mutex.
//
// A rule is judged once per pass over all series that carry its metric name:
// it fires if any of them satisfies the condition, and it resolves only when
// none does. Previously each series was judged on its own, so two series with
// the same name but different labels could fire and resolve the same alert in
// one pass, and the event list grew on every evaluation.
//
// A rule with no matching series leaves existing events untouched.
// If the event allocation fails the alert is skipped for this pass and a
// message is written to stderr.
//
// NOTE(review): rule->for_seconds is stored with the rule but not enforced
// here — alerts fire on the first violating evaluation.
void monitor_evaluate_alerts(monitor_platform_t *mp) {
    pthread_mutex_lock(&mp->mutex);

    const time_t now = time(NULL);

    for (alert_rule_t *rule = mp->alert_rules; rule != NULL; rule = rule->next) {
        int has_series = 0;       // at least one series carries the metric name
        int triggered = 0;        // at least one of them violates the rule
        double offending = 0.0;   // value of the first violating series

        for (metric_value_t *m = mp->metrics; m != NULL; m = m->next) {
            if (strcmp(m->name, rule->metric_name) != 0) {
                continue;
            }
            has_series = 1;
            if (check_condition(m->value, rule->condition, rule->threshold)) {
                triggered = 1;
                offending = m->value;
                break;
            }
        }

        if (!has_series) {
            continue;
        }

        if (!triggered) {
            // Resolve every active event that belongs to this rule.
            for (alert_event_t *evt = mp->alert_events; evt != NULL; evt = evt->next) {
                if (evt->active && strcmp(evt->rule_name, rule->name) == 0) {
                    evt->active = 0;
                    evt->end_time = now;
                    printf("告警 %s 已恢复\n", rule->name);
                }
            }
            continue;
        }

        // Fire only when the rule has no active event yet.
        int already_active = 0;
        for (alert_event_t *evt = mp->alert_events; evt != NULL; evt = evt->next) {
            if (evt->active && strcmp(evt->rule_name, rule->name) == 0) {
                already_active = 1;
                break;
            }
        }
        if (already_active) {
            continue;
        }

        alert_event_t *fired = calloc(1, sizeof *fired);
        if (fired == NULL) {
            fprintf(stderr, "monitor_evaluate_alerts: out of memory, alert '%s' skipped\n",
                    rule->name);
            continue;
        }

        snprintf(fired->rule_name, sizeof fired->rule_name, "%s", rule->name);
        snprintf(fired->metric_name, sizeof fired->metric_name, "%s", rule->metric_name);
        snprintf(fired->severity, sizeof fired->severity, "%s", rule->severity);
        snprintf(fired->message, sizeof fired->message,
                 "%s (当前值: %.2f)", rule->message, offending);
        fired->current_value = offending;
        fired->start_time = now;
        fired->end_time = 0;
        fired->active = 1;

        // Push onto the head of the event list.
        fired->next = mp->alert_events;
        mp->alert_events = fired;

        printf("告警 %s: %s (%.2f %s %.2f)\n",
               rule->severity, rule->name, offending,
               rule->condition, rule->threshold);
    }

    pthread_mutex_unlock(&mp->mutex);
}

```

  4. 监控线程

```c

// Read the platform's `running` flag while holding its mutex, so the read is
// ordered against any writer that changes the flag under the same mutex.
static int monitor_eval_keep_running(monitor_platform_t *mp) {
    pthread_mutex_lock(&mp->mutex);
    int keep_running = mp->running;
    pthread_mutex_unlock(&mp->mutex);
    return keep_running;
}

// Thread entry point: evaluate the alert rules every 10 seconds until the
// platform's `running` flag is cleared.
//
// arg      the monitor_platform_t to evaluate
// returns  always NULL
//
// The wait is split into one-second naps so a stop request is noticed within
// about a second; the previous single sleep(10) could delay pthread_join by
// up to ten seconds.
void *monitor_eval_thread(void *arg) {
    enum { EVAL_INTERVAL_SECONDS = 10 };

    monitor_platform_t *mp = arg;
    int waited = 0;

    while (monitor_eval_keep_running(mp)) {
        sleep(1);
        if (++waited >= EVAL_INTERVAL_SECONDS) {
            waited = 0;
            // monitor_evaluate_alerts() takes mp->mutex itself, so it must be
            // called with the mutex released.
            monitor_evaluate_alerts(mp);
        }
    }

    return NULL;
}

```

  5. Dashboard生成

```c

// Write `text` to `fp` with the five HTML-significant characters replaced by
// entities, so metric names, labels and alert messages cannot inject markup.
static void dashboard_write_escaped(FILE *fp, const char *text) {
    for (const char *p = text; *p != '\0'; p++) {
        switch (*p) {
        case '&':
            fputs("&amp;", fp);
            break;
        case '<':
            fputs("&lt;", fp);
            break;
        case '>':
            fputs("&gt;", fp);
            break;
        case '"':
            fputs("&quot;", fp);
            break;
        case '\'':
            fputs("&#39;", fp);
            break;
        default:
            fputc((unsigned char)*p, fp);
            break;
        }
    }
}

// Render the current metrics and active alerts as a static HTML page.
//
// mp        platform to snapshot; its mutex is held while the lists are read
// filename  path of the HTML file to create or overwrite
//
// Changes from the earlier version: the page declares its UTF-8 encoding (it
// contains non-ASCII text), all dynamic strings are HTML-escaped, the label
// loop never reads past labels[MAX_LABELS - 1], and fopen/fclose failures are
// reported on stderr instead of being ignored.
void monitor_generate_dashboard(monitor_platform_t *mp, const char *filename) {
    FILE *fp = fopen(filename, "w");
    if (fp == NULL) {
        perror(filename);
        return;
    }

    fputs("<!DOCTYPE html>\n"
          "<html><head><meta charset='utf-8'><title>监控Dashboard</title>\n"
          "<style>\n"
          "body{font-family:monospace;padding:20px;background:#1a1a2e;color:#eee}\n"
          ".metric{background:#16213e;padding:15px;margin:10px 0;border-radius:8px}\n"
          ".metric-name{color:#e94560;font-weight:bold}\n"
          ".metric-value{color:#0f3460;font-size:24px}\n"
          ".label{color:#aaa;font-size:12px}\n"
          ".critical{color:#ff6b6b}\n"
          ".warning{color:#ffd93d}\n"
          ".info{color:#6bcb77}\n"
          "</style></head><body>\n"
          "<h1>📊 监控Dashboard</h1>\n",
          fp);

    pthread_mutex_lock(&mp->mutex);

    // One card per metric series.
    for (metric_value_t *m = mp->metrics; m != NULL; m = m->next) {
        fputs("<div class='metric'>\n <span class='metric-name'>", fp);
        dashboard_write_escaped(fp, m->name);
        fputs("</span>\n", fp);

        if (m->label_count > 0) {
            const int shown =
                m->label_count > MAX_LABELS ? MAX_LABELS : m->label_count;

            fputs(" <span class='label'>", fp);
            for (int i = 0; i < shown; i++) {
                dashboard_write_escaped(fp, m->labels[i].key);
                fputc('=', fp);
                dashboard_write_escaped(fp, m->labels[i].value);
                fputc(' ', fp);
            }
            fputs("</span>\n", fp);
        }

        switch (m->type) {
        case METRIC_COUNTER:
            fprintf(fp, " <div class='metric-value'>%.0f</div>\n", m->value);
            break;
        case METRIC_GAUGE:
            fprintf(fp, " <div class='metric-value'>%.2f</div>\n", m->value);
            break;
        case METRIC_HISTOGRAM:
            fprintf(fp, " <div class='metric-value'>count=%.0f, sum=%.2f</div>\n",
                    m->count, m->sum);
            break;
        default:
            // Other types have no value line.
            break;
        }

        fputs("</div>\n", fp);
    }

    // Active alerts, coloured by severity; unknown severities use "info".
    fputs("<h2>🚨 活跃告警</h2>\n", fp);
    for (alert_event_t *evt = mp->alert_events; evt != NULL; evt = evt->next) {
        if (!evt->active) {
            continue;
        }

        const char *css_class = "info";
        if (strcmp(evt->severity, "critical") == 0) {
            css_class = "critical";
        } else if (strcmp(evt->severity, "warning") == 0) {
            css_class = "warning";
        }

        fprintf(fp, "<div class='metric %s'>", css_class);
        dashboard_write_escaped(fp, evt->severity);
        fputs(": ", fp);
        dashboard_write_escaped(fp, evt->message);
        fputs("</div>\n", fp);
    }

    pthread_mutex_unlock(&mp->mutex);

    fputs("</body></html>\n", fp);

    // fclose flushes buffered output, so a failure here means the file is incomplete.
    if (fclose(fp) != 0) {
        perror(filename);
        return;
    }

    printf("Dashboard 已生成: %s\n", filename);
}

```

  6. 测试代码

```c

void test_monitor() {

printf("=== 全链路监控平台测试 ===\n\n");

monitor_platform_t *mp = monitor_create();

// 添加告警规则

monitor_add_alert_rule(mp, "cpu_high", "cpu_usage", ">", 80.0, 30,

"critical", "CPU使用率过高");

monitor_add_alert_rule(mp, "error_rate_high", "error_rate", ">", 5.0, 60,

"warning", "错误率过高");

monitor_add_alert_rule(mp, "memory_low", "memory_free", "<", 1024.0, 60,

"critical", "可用内存不足");

// 启动监控线程

pthread_t eval_tid;

pthread_create(&eval_tid, NULL, monitor_eval_thread, mp);

// 模拟指标采集

printf("模拟 开始采集指标...\n");

for (int i = 0; i < 50; i++) {

label_t labels2;

strcpy(labels0.key, "service");

strcpy(labels0.value, "order-service");

strcpy(labels1.key, "env");

strcpy(labels1.value, "prod");

// 模拟Counter

metric_counter_add(mp, "http_requests_total", labels, 2, 10 + rand() % 50);

// 模拟Gauge

double cpu = 30 + (rand() % 80);

metric_gauge_set(mp, "cpu_usage", labels, 2, cpu);

// 模拟Histogram

double latency = (rand() % 1000) / 100.0;

metric_histogram_observe(mp, "request_duration_ms", labels, 2, latency);

// 模拟错误率

double error_rate = (rand() % 10) / 100.0;

metric_gauge_set(mp, "error_rate", labels, 2, error_rate * 100);

// 模拟内存

double memory_free = 500 + (rand() % 4000);

metric_gauge_set(mp, "memory_free", labels, 2, memory_free);

usleep(100000);

if (i % 10 == 0) {

printf("模拟 采集 %d/50\n", i);

}

}

sleep(2);

// 生成Dashboard

monitor_generate_dashboard(mp, "dashboard.html");

mp->running = 0;

pthread_join(eval_tid, NULL);

printf("\n✅ 测试完成,打开 dashboard.html 查看监控面板\n");

free(mp);

}

// Program entry point: seed the pseudo-random generator from the wall clock,
// run the monitoring demo once and exit with status 0.
int main() {
    const time_t seed = time(NULL);

    srand(seed);
    test_monitor();

    return 0;
}

```


三、编译和运行

```bash

gcc -o monitor monitor.c -lpthread -lm

./monitor

```


四、Prometheus vs 本实现

| 特性 | 本实现 | Prometheus |
| --- | --- | --- |
| 指标采集 | ✅ | ✅ |
| 多类型 | Counter/Gauge/Histogram | Counter/Gauge/Histogram/Summary |
| 标签支持 | ✅ | ✅ |
| 告警规则 | ✅ | ✅ (Alertmanager) |
| 持久化 | ❌ | ✅ (TSDB) |
| 查询语言 | ❌ | ✅ (PromQL) |
| 可视化 | ✅ (简单) | ✅ (Grafana) |
| 服务发现 | ❌ | ✅ |


五、总结

通过这篇文章,你学会了:

· 监控平台的核心原理(指标采集、告警、可视化)

· 三种指标类型(Counter、Gauge、Histogram)

· 告警规则引擎

· 标签(维度)的使用

· Dashboard生成

全链路监控是可观测性的核心。掌握它,你就拥有了提前发现系统问题的能力。

下一篇预告:《从零实现一个分布式任务调度平台:XXL-JOB的核心设计》


评论区分享一下你用监控系统发现过什么问题~