使用 OpenTelemetry 对 Spring Boot 应用的指标和链路进行可观测性监控的实践：使用 Zipkin 展示链路追踪数据，使用 Grafana 展示指标

1.安装docker，docker-compose

bash 复制代码

（1）安装依赖包
yum install -y yum-utils device-mapper-persistent-data lvm2
 
（2）部署docker

tar  xvf docker-20.10.19.tgz
cp docker/* /usr/bin/



vim /usr/lib/systemd/system/docker.service

[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
 
 
[Service]
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
ExecStart=/usr/bin/dockerd
ExecReload=/bin/kill -s HUP $MAINPID
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target




systemctl enable docker
systemctl restart docker 



vim /etc/docker/daemon.json

{
   "data-root": "/data/docker/ucsp-data/docker"
}


mkdir -p /data/docker/ucsp-data/docker
cp -r /var/lib/docker/*  /data/docker/ucsp-data/docker



systemctl daemon-reload
systemctl restart docker
docker   version
[root@localhost ~]# docker   version
Client:
 Version:           20.10.19
 API version:       1.41
 Go version:        go1.18.7
 Git commit:        d85ef84
 Built:             Thu Oct 13 16:43:07 2022
 OS/Arch:           linux/amd64
 Context:           default
 Experimental:      true

Server: Docker Engine - Community
 Engine:
  Version:          20.10.19
  API version:      1.41 (minimum version 1.12)
  Go version:       go1.18.7
  Git commit:       c964641
  Built:            Thu Oct 13 16:48:41 2022
  OS/Arch:          linux/amd64
  Experimental:     false
 containerd:
  Version:          v1.6.8
  GitCommit:        9cd3357b7fd7218e4aec3eae239db1f68a5a6ec6
 runc:
  Version:          1.1.4
  GitCommit:        v1.1.4-0-g5fd4c4d1
 docker-init:
  Version:          0.19.0
  GitCommit:        de40ad0
[root@localhost ~]#

安装docker-compose

bash 复制代码

##安装docker-compose
cp ./docker-compose-linux-x86_64 /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose

配置docker国内镜像源

bash 复制代码

##配置DockerHub 国内镜像源
vim /etc/docker/daemon.json 添加如下：
"registry-mirrors": [
        "https://docker.1ms.run",
        "https://docker.xuanyuan.me"
    ]


##查看配置
[root@localhost ~]# cat /etc/docker/daemon.json
{
   "data-root": "/data/docker/ucsp-data/docker",
   "registry-mirrors": [
        "https://docker.1ms.run",
        "https://docker.xuanyuan.me"
    ]
}
[root@localhost ~]#


##重启
systemctl daemon-reload
systemctl restart docker

修改docker-compose.yml

复制代码

[root@localhost OpenTelemetry]# cat docker-compose.yml 
version: '3.8'

services:
  # OpenTelemetry Collector (数据收集和转发)
  otel-collector:
    image: otel/opentelemetry-collector:latest
    container_name: otel-collector
    command: ["--config=/etc/otel-config.yaml"]
    volumes:
      - ./otel-config.yaml:/etc/otel-config.yaml
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP http
      - "8889:8889"  # Prometheus exporter metrics
    depends_on:
      - prometheus

  # Prometheus (指标存储)
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alert.rules.yml:/etc/prometheus/alert.rules.yml
    ports:
      - "9090:9090"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--web.external-url=http://localhost:9090"
    depends_on:
      - alertmanager

  # Alertmanager (告警管理)
  alertmanager:
    image: prom/alertmanager:latest
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"

  # Grafana (可视化)
  grafana:
    image: grafana/grafana:latest
    volumes:
      - grafana-storage:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    ports:
      - "3000:3000"
    depends_on:
      - prometheus

  zipkin:
    image: openzipkin/zipkin:latest
    container_name: zipkin
    ports:
      - "9411:9411"
    deploy:
      resources:
        limits:
          memory: 2G
    environment:
      JAVA_OPTS: "-Xmx1g -Xms1g -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError"

  redis:
    image: "redis:6.2"
    container_name: redis
    hostname: redis
    restart: always
    environment:
      TZ: Asia/Shanghai
    ports:
      - "6379:6379"

  victoriametrics:
    image: victoriametrics/victoria-metrics:v1.79.12
    container_name: victoriametrics
    hostname: victoriametrics
    restart: always
    environment:
      TZ: Asia/Shanghai
    ports:
      - "8428:8428"
    command:
      - "--loggerTimezone=Asia/Shanghai"

  nightingale:
    image: flashcatcloud/nightingale:latest
    container_name: nightingale
    hostname: nightingale
    restart: always
    environment:
      GIN_MODE: release
      TZ: Asia/Shanghai
      WAIT_HOSTS: redis:6379
    volumes:
      - ./etc-nightingale:/app/etc
    ports:
      - "17000:17000"
      - "20090:20090"
    depends_on:
      - redis
      - victoriametrics
    command: >
      sh -c "/app/n9e"

  categraf:
    image: "flashcatcloud/categraf:latest"
    container_name: "categraf"
    hostname: "categraf01"
    restart: always
    environment:
      TZ: Asia/Shanghai
      HOST_PROC: /hostfs/proc
      HOST_SYS: /hostfs/sys
      HOST_MOUNT_PREFIX: /hostfs
      WAIT_HOSTS: nightingale:17000, nightingale:20090
    volumes:
      - ./etc-categraf:/etc/categraf/conf
      - /:/hostfs
    depends_on:
      - nightingale

volumes:
  grafana-storage:

修改otel-config.yaml

bash 复制代码

[root@localhost OpenTelemetry]# cat otel-config.yaml 
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: '0.0.0.0:4317'
      http:
        endpoint: '0.0.0.0:4318'
processors:
  batch:

exporters:
  # NOTE: Prior to v0.86.0 use `logging` instead of `debug`.
  prometheus:
    endpoint: "0.0.0.0:8889"
  debug:
    verbosity: detailed
  zipkin:
    endpoint: "http://zipkin:9411/api/v2/spans"

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: []
      exporters: [zipkin]
    metrics:
      receivers: [otlp]
      processors: []
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      exporters: [debug]

修改prometheus.yml

bash 复制代码

[root@localhost OpenTelemetry]# cat prometheus.yml 
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - '/etc/prometheus/alert.rules.yml'

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  - job_name: 'otel-collector'
    metrics_path: '/metrics'
    scrape_interval: 5s
    static_configs:
      - targets: ['otel-collector:8889']
  - job_name: 'prometheus'
    static_configs:
      - targets: ['prometheus:9090']
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']

修改alertmanager.yml

bash 复制代码

[root@localhost OpenTelemetry]# cat alertmanager.yml 
route:
  group_by: ['alertname']
  receiver: 'email-notifications'

receivers:
- name: 'email-notifications'
  email_configs:
  - to: 'your-email@example.com'
    from: 'alertmanager@example.com'
    smarthost: 'smtp.example.com:587'
    auth_username: 'your-email@example.com'
    auth_password: 'your-password'
    send_resolved: true

修改alert.rules.yml

bash 复制代码

[root@localhost OpenTelemetry]# cat alert.rules.yml 
groups:
- name: example
  rules:
  - alert: HighRequestLatency
    expr: histogram_quantile(0.95, sum(rate(http_server_duration_seconds_bucket[5m])) by (le)) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "High request latency on {{ $labels.instance }}"
      description: "Request latency is {{ $value }} seconds"

修改./etc-nightingale/config.toml

bash 复制代码

[root@localhost OpenTelemetry]# cat ./etc-nightingale/config.toml 
[Global]
RunMode = "release"

[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "INFO"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256

[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 17000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120

[HTTP.ShowCaptcha]
Enable = false 

[HTTP.APIForAgent]
Enable = true 
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"

[HTTP.APIForService]
Enable = false
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"

[HTTP.JWTAuth]
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"

[HTTP.TokenAuth]
Enable = false
HeaderUserTokenKey = "X-User-Token"

[HTTP.ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]

[HTTP.RSA]
# open RSA
OpenRSA = false

[DB]
# postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
DSN="nightingale:Nightingale_324@tcp(10.12.12.80:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# enable auto migrate or not
# EnableAutoMigrate = false

[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "redis:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel
RedisType = "standalone"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""

[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "default"

# [Alert.Alerting]
# NotifyConcurrency = 10

[Center]
MetricsYamlFile = "./etc/metrics.yaml"
I18NHeaderKey = "X-Language"

[Center.AnonymousAccess]
PromQuerier = false
AlertDetail = false

[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
ForceUseServerTS = true

# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"

# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000

[[Pushgw.Writers]] 
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://victoriametrics:8428/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"

[Ibex]
Enable = true
RPCListen = "0.0.0.0:20090"

springboot 应用启动参数添加

bash 复制代码

nohup java -server -Xms256m -Xmx512m -javaagent:/路径/opentelemetry-javaagent.jar \
-Dotel.resource.attributes=service.name=myService \
-Dotel.exporter.otlp.endpoint=http://localhost:4318 \
-Dotel.service.name=my-java-app \
-Dotel.traces.exporter=zipkin \
-Dotel.exporter.zipkin.endpoint=http://localhost:9411/api/v2/spans \
-jar springboot_app.jar >test.log 2>&1 &

截图如下：

Prometheus监控

http://IP:9090/graph

grafana监控

http://IP:3000/

zipkin监控

http://IP:9411/

夜莺监控

http://IP:17000