使用opentelemetry 可观测监控springboot应用的指标、链路实践,使用zipkin展示链路追踪数据,使用grafana展示指标

1.安装docker,docker-compose

bash 复制代码
(1)安装依赖包
yum install -y yum-utils device-mapper-persistent-data lvm2
 
2.2、部署docker

tar  xvf docker-20.10.19.tgz
cp docker/* /usr/bin/



vim /usr/lib/systemd/system/docker.service

[Unit]
Description=Docker Application Container Engine
Documentation=https://docs.docker.com
After=network-online.target firewalld.service
Wants=network-online.target
 
 
[Service]
Type=notify
# the default is not to use systemd for cgroups because the delegate issues still
# exists and systemd currently does not support the cgroup feature set required
# for containers run by docker
ExecStart=/usr/bin/dockerd
ExecReload=/bin/kill -s HUP $MAINPID
# Having non-zero Limit*s causes performance problems due to accounting overhead
# in the kernel. We recommend using cgroups to do container-local accounting.
LimitNOFILE=infinity
LimitNPROC=infinity
LimitCORE=infinity
# Uncomment TasksMax if your systemd version supports it.
# Only systemd 226 and above support this version.
#TasksMax=infinity
TimeoutStartSec=0
# set delegate yes so that systemd does not reset the cgroups of docker containers
Delegate=yes
# kill only the docker process, not all processes in the cgroup
KillMode=process
# restart the docker process if it exits prematurely
Restart=on-failure
StartLimitBurst=3
StartLimitInterval=60s
[Install]
WantedBy=multi-user.target




systemctl enable docker
systemctl restart docker 



vim /etc/docker/daemon.json

{
   "data-root": "/data/docker/ucsp-data/docker "
}
```


mkdir -p /data/docker/ucsp-data/docker
cp -r /var/lib/docker/*  /data/docker/ucsp-data/docker



systemctl daemon-reload
systemctl restart docker
docker   version
[root@localhost ~]# docker   version
Client:
 Version:           20.10.19
 API version:       1.41
 Go version:        go1.18.7
 Git commit:        d85ef84
 Built:             Thu Oct 13 16:43:07 2022
 OS/Arch:           linux/amd64
 Context:           default
 Experimental:      true

Server: Docker Engine - Community
 Engine:
  Version:          20.10.19
  API version:      1.41 (minimum version 1.12)
  Go version:       go1.18.7
  Git commit:       c964641
  Built:            Thu Oct 13 16:48:41 2022
  OS/Arch:          linux/amd64
  Experimental:     false
 containerd:
  Version:          v1.6.8
  GitCommit:        9cd3357b7fd7218e4aec3eae239db1f68a5a6ec6
 runc:
  Version:          1.1.4
  GitCommit:        v1.1.4-0-g5fd4c4d1
 docker-init:
  Version:          0.19.0
  GitCommit:        de40ad0
[root@localhost ~]#

安装docker-compose

bash 复制代码
##安装docker-compose
cp ./docker-compose-linux-x86_64 /usr/local/bin/docker-compose
chmod +x /usr/local/bin/docker-compose
ln -s /usr/local/bin/docker-compose /usr/bin/docker-compose

配置docker国内镜像源

bash 复制代码
##配置DockerHub 国内镜像源
vim /etc/docker/daemon.json 添加如下:
"registry-mirrors": [
        "https://docker.1ms.run",
        "https://docker.xuanyuan.me"
    ]


##查看配置
[root@localhost ~]# cat /etc/docker/daemon.json
{
   "data-root": "/data/docker/ucsp-data/docker",
   "registry-mirrors": [
        "https://docker.1ms.run",
        "https://docker.xuanyuan.me"
    ]
}
[root@localhost ~]#


##重启
systemctl daemon-reload
systemctl restart docker

修改docker-compose.yaml

复制代码
[root@localhost OpenTelemetry]# cat docker-compose.yml 
version: '3.8'

services:
  # OpenTelemetry Collector (数据收集和转发)
  otel-collector:
    image: otel/opentelemetry-collector:latest
    container_name: otel-collector
    command: ["--config=/etc/otel-config.yaml"]
    volumes:
      - ./otel-config.yaml:/etc/otel-config.yaml
    ports:
      - "4317:4317"  # OTLP gRPC
      - "4318:4318"  # OTLP http
      - "8889:8889"  # Promentheus exporter metrics
    depends_on:
      - prometheus

  # Prometheus (指标存储)
  prometheus:
    image: prom/prometheus:latest
    volumes:
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9090:9090"
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--web.external-url=http://localhost:9090"
    depends_on:
      - alertmanager

  # Alertmanager (告警管理)
  alertmanager:
    image: prom/alertmanager:latest
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    ports:
      - "9093:9093"
    command:
      - "--config.file=/etc/alertmanager/alertmanager.yml"

  # Grafana (可视化)
  grafana:
    image: grafana/grafana:latest
    volumes:
      - grafana-storage:/var/lib/grafana
    environment:
      - GF_SECURITY_ADMIN_PASSWORD=admin
    ports:
      - "3000:3000"
    depends_on:
      - prometheus

  zipkin:
    image: openzipkin/zipkin:latest
    container_name: zipkin
    ports:
      - "9411:9411"
    deploy:
      resources:
        limits:
          memory: 2G
    environment:
      JAVA_OPTS: "-Xmx1g -Xms1g -XX:+UseG1GC -XX:+HeapDumpOnOutOfMemoryError"

  redis:
    image: "redis:6.2"
    container_name: redis
    hostname: redis
    restart: always
    environment:
      TZ: Asia/Shanghai
    ports:
      - "6379:6379"

  victoriametrics:
    image: victoriametrics/victoria-metrics:v1.79.12
    container_name: victoriametrics
    hostname: victoriametrics
    restart: always
    environment:
      TZ: Asia/Shanghai
    ports:
      - "8428:8428"
    command:
      - "--loggerTimezone=Asia/Shanghai"

  nightingale:
    image: flashcatcloud/nightingale:latest
    container_name: nightingale
    hostname: nightingale
    restart: always
    environment:
      GIN_MODE: release
      TZ: Asia/Shanghai
      WAIT_HOSTS: redis:6379
    volumes:
      - ./etc-nightingale:/app/etc
    ports:
      - "17000:17000"
      - "20090:20090"
    depends_on:
      - redis
      - victoriametrics
    command: >
      sh -c "/app/n9e"

  categraf:
    image: "flashcatcloud/categraf:latest"
    container_name: "categraf"
    hostname: "categraf01"
    restart: always
    environment:
      TZ: Asia/Shanghai
      HOST_PROC: /hostfs/proc
      HOST_SYS: /hostfs/sys
      HOST_MOUNT_PREFIX: /hostfs
      WAIT_HOSTS: nightingale:17000, nightingale:20090
    volumes:
      - ./etc-categraf:/etc/categraf/conf
      - /:/hostfs
    depends_on:
      - nightingale

volumes:
  grafana-storage:

修改otel-config.yaml

bash 复制代码
[root@localhost OpenTelemetry]# cat otel-config.yaml 
receivers:
  otlp:
    protocols:
      grpc:
        endpoint: '0.0.0.0:4317'
      http:
        endpoint: '0.0.0.0:4318'
processors:
  batch:

exporters:
  # NOTE: Prior to v0.86.0 use `logging` instead of `debug`.
  prometheus:
    endpoint: "0.0.0.0:8889"
  debug:
    verbosity: detailed
  zipkin:
    endpoint: "http://0.0.0.0:9411/api/v2/spans"

service:
  pipelines:
    traces:
      receivers: [otlp]
      processors: []
      exporters: [zipkin]
    metrics:
      receivers: [otlp]
      processors: []
      exporters: [prometheus]
    logs:
      receivers: [otlp]
      exporters: [debug]

修改prometheus.yml

bash 复制代码
[root@localhost OpenTelemetry]# cat prometheus.yml 
global:
  scrape_interval: 15s
  evaluation_interval: 15s

rule_files:
  - '/etc/prometheus/alert.rules.yml'

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['alertmanager:9093']

scrape_configs:
  - job_name: 'otel-collector'
    metrics_path: '/metrics'
    scrape_interval: 5s
    static_configs:
      - targets: ['otel-collector:8889']
  - job_name: 'prometheus'
    static_configs:
      - targets: ['prometheus:9090']
  - job_name: 'alertmanager'
    static_configs:
      - targets: ['alertmanager:9093']

修改alertmanager.yml

bash 复制代码
[root@localhost OpenTelemetry]# cat alertmanager.yml 
route:
  group_by: ['alertname']
  receiver: 'email-notifications'

receivers:
- name: 'email-notifications'
  email_configs:
  - to: 'your-email@example.com'
    from: 'alertmanager@example.com'
    smarthost: 'smtp.example.com:587'
    auth_username: 'your-email@example.com'
    auth_password: 'your-password'
    send_resolved: true

修改alert.rules.yml

bash 复制代码
[root@localhost OpenTelemetry]# cat alert.rules.yml 
groups:
- name: example
  rules:
  - alert: HighRequestLatency
    expr: histogram_quantile(0.95, sum(rate(http_server_duration_seconds_bucket[5m])) by (le) > 1
    for: 10m
    labels:
      severity: critical
    annotations:
      summary: "High request latency on {{ $labels.instance }}"
      description: "Request latency is {{ $value }} seconds"

修改./etc-nightingale/config.toml

bash 复制代码
[root@localhost OpenTelemetry]# cat ./etc-nightingale/config.toml 
[Global]
RunMode = "release"

[Log]
# log write dir
Dir = "logs"
# log level: DEBUG INFO WARNING ERROR
Level = "INFO"
# stdout, stderr, file
Output = "stdout"
# # rotate by time
# KeepHours = 4
# # rotate by size
# RotateNum = 3
# # unit: MB
# RotateSize = 256

[HTTP]
# http listening address
Host = "0.0.0.0"
# http listening port
Port = 17000
# https cert file path
CertFile = ""
# https key file path
KeyFile = ""
# whether print access log
PrintAccessLog = false
# whether enable pprof
PProf = false
# expose prometheus /metrics?
ExposeMetrics = true
# http graceful shutdown timeout, unit: s
ShutdownTimeout = 30
# max content length: 64M
MaxContentLength = 67108864
# http server read timeout, unit: s
ReadTimeout = 20
# http server write timeout, unit: s
WriteTimeout = 40
# http server idle timeout, unit: s
IdleTimeout = 120

[HTTP.ShowCaptcha]
Enable = false 

[HTTP.APIForAgent]
Enable = true 
# [HTTP.APIForAgent.BasicAuth]
# user001 = "ccc26da7b9aba533cbb263a36c07dcc5"

[HTTP.APIForService]
Enable = false
[HTTP.APIForService.BasicAuth]
user001 = "ccc26da7b9aba533cbb263a36c07dcc5"

[HTTP.JWTAuth]
# unit: min
AccessExpired = 1500
# unit: min
RefreshExpired = 10080
RedisKeyPrefix = "/jwt/"

[HTTP.TokenAuth]
Enable = false
HeaderUserTokenKey = "X-User-Token"

[HTTP.ProxyAuth]
# if proxy auth enabled, jwt auth is disabled
Enable = false
# username key in http proxy header
HeaderUserNameKey = "X-User-Name"
DefaultRoles = ["Standard"]

[HTTP.RSA]
# open RSA
OpenRSA = false

[DB]
# postgres: DSN="host=127.0.0.1 port=5432 user=root dbname=n9e_v6 password=1234 sslmode=disable"
DSN="nightingale:Nightingale_324@tcp(10.12.12.80:3306)/n9e_v6?charset=utf8mb4&parseTime=True&loc=Local&allowNativePasswords=true"
# enable debug mode or not
Debug = false
# mysql postgres
DBType = "mysql"
# unit: s
MaxLifetime = 7200
# max open connections
MaxOpenConns = 150
# max idle connections
MaxIdleConns = 50
# enable auto migrate or not
# EnableAutoMigrate = false

[Redis]
# address, ip:port or ip1:port,ip2:port for cluster and sentinel(SentinelAddrs)
Address = "redis:6379"
# Username = ""
# Password = ""
# DB = 0
# UseTLS = false
# TLSMinVersion = "1.2"
# standalone cluster sentinel
RedisType = "standalone"
# Mastername for sentinel type
# MasterName = "mymaster"
# SentinelUsername = ""
# SentinelPassword = ""

[Alert]
[Alert.Heartbeat]
# auto detect if blank
IP = ""
# unit ms
Interval = 1000
EngineName = "default"

# [Alert.Alerting]
# NotifyConcurrency = 10

[Center]
MetricsYamlFile = "./etc/metrics.yaml"
I18NHeaderKey = "X-Language"

[Center.AnonymousAccess]
PromQuerier = false
AlertDetail = false

[Pushgw]
# use target labels in database instead of in series
LabelRewrite = true
ForceUseServerTS = true

# [Pushgw.DebugSample]
# ident = "xx"
# __name__ = "xx"

# [Pushgw.WriterOpt]
# QueueMaxSize = 1000000
# QueuePopSize = 1000

[[Pushgw.Writers]] 
# Url = "http://127.0.0.1:8480/insert/0/prometheus/api/v1/write"
Url = "http://victoriametrics:8428/api/v1/write"
# Basic auth username
BasicAuthUser = ""
# Basic auth password
BasicAuthPass = ""
# timeout settings, unit: ms
Headers = ["X-From", "n9e"]
Timeout = 10000
DialTimeout = 3000
TLSHandshakeTimeout = 30000
ExpectContinueTimeout = 1000
IdleConnTimeout = 90000
# time duration, unit: ms
KeepAlive = 30000
MaxConnsPerHost = 0
MaxIdleConns = 100
MaxIdleConnsPerHost = 100
## Optional TLS Config
# UseTLS = false
# TLSCA = "/etc/n9e/ca.pem"
# TLSCert = "/etc/n9e/cert.pem"
# TLSKey = "/etc/n9e/key.pem"
# InsecureSkipVerify = false
# [[Writers.WriteRelabels]]
# Action = "replace"
# SourceLabels = ["__address__"]
# Regex = "([^:]+)(?::\\d+)?"
# Replacement = "$1:80"
# TargetLabel = "__address__"

[Ibex]
Enable = true
RPCListen = "0.0.0.0:20090"

springboot 应用启动参数添加

bash 复制代码
nohup java -server -Xms256m -Xmx512m -javaagent:/路径/opentelemetry-javaagent.jar \
-Dotel.resource.attributes=service.name=myService \
-Dotel.exporter.otlp.endpoint=http://localhost:4318 \
-Dotel.service.name=my-java-app \
-Dotel.traces.exporter=zipkin \
-Dotel.exporter.zipkin.endpoint=http://localhost:9411/api/v2/spans \
-jar springboot_app.jar >test.log 2>&1 &

截图如下:

Prometheus监控

http://ip::9090/graph

grafana监控

http://IP:3000/

zipkin监控

http://IP:9411/

夜莺监控

http://iP:17000

相关推荐
失伟29 分钟前
Stratovirt安装及使用
运维·虚拟化
捧月华如2 小时前
Linux 系统性能压测工具全景指南(含工程实战)
linux·运维·服务器
s19134838482d2 小时前
vlan实验报告
运维·服务器·网络
想唱rap2 小时前
线程的同步与互斥
linux·运维·服务器·数据库·mysql
格林威2 小时前
SSD 写入速度测试命令(Linux)(基于工业相机高速存储)
linux·运维·开发语言·人工智能·数码相机·计算机视觉·工业相机
勇闯逆流河3 小时前
【LInux】linux控制(进程替换,自主shell的实现详解)
linux·运维·服务器
IMPYLH3 小时前
Linux 的 ls 命令
linux·运维·服务器·bash
Agent产品评测局4 小时前
企业发票管理自动化落地,验真归档全流程实现方法:2026企业级智能体选型与实测指南
运维·网络·人工智能·ai·chatgpt·自动化
wwj888wwj4 小时前
Ansible基础(复习1)
linux·运维·ansible
DYuW5gBmH4 小时前
Anthropic 开源 Bloom:基于 LLM 的自动化行为评估框架
运维·microsoft·自动化