一、prometheus搭建
支持监控的exporter官方文档地址:
https://prometheus.io/docs/instrumenting/exporters/
1.配置文件构成
全局、报警、规则、抓取
Prometheus 的配置文件(prometheus.yml)由四大部分组成:
-
global
全局默认参数:多久抓一次、多久算一次报警、对外的"身份证"标签。
-
alerting
报警出口:算出火警后往哪台 Alertmanager 送;没有就空着。
-
rule_files
报警规则/记录规则文件列表,Prometheus 启动时会把它们加载进来。
-
scrape_configs
抓取任务列表:告诉 Prometheus 去哪些地址、多久一次、怎么贴标签、怎样服务发现。
2.配置文件模板
# prometheus.yml template. Replace the 10.xx.xx.xx / xxxx placeholders with real
# addresses and ports before use.

# Global defaults
global:
  scrape_interval: 5s
  evaluation_interval: 5s
  external_labels:
    monitor: 'dashboard'

# Alerting outlet (uncomment and point it at an Alertmanager to enable)
#alerting:
#  alertmanagers:
#    - static_configs:
#        - targets:
#            - "10.xx.xx.xx:9093"

# Alerting / recording rule files
rule_files:
  - '/etc/prometheus/rules/*.yml'

# Scrape jobs; each job below sets its own scrape_interval of 15s
scrape_configs:
  # Example
  - job_name: 'prometheus/pushgateway'
    scrape_interval: 15s
    static_configs:
      - targets: ['prometheus:9090']

  # Host monitoring
  - job_name: 'node_exporter'
    scrape_interval: 15s
    static_configs:
      - targets: ['10.xx.xx.xx:xxxx']

  # Container monitoring
  - job_name: 'cadvisor'
    scrape_interval: 15s
    static_configs:
      - targets: ['10.xx.xx.xx:xxxx']

  # Backend service
  - job_name: 'qc6_metrics'
    metrics_path: '/metrics/qc_server'
    scrape_interval: 15s
    static_configs:
      - targets: ['10.233.63.6:9990']
    relabel_configs:
      # Write a fixed, readable name into the instance label
      - source_labels: [__address__]
        target_label: instance
        replacement: "qc6_仿真"
3.docker-compose文件示例(注意:下方示例定义的是 grafana 服务;部署 prometheus 时需替换为 prometheus 对应的镜像、端口与挂载目录)
# NOTE(review): this compose example defines the grafana service although it sits
# under the Prometheus section; confirm whether a Prometheus service was intended.
version: "3"
services:
  grafana:
    image: grafana/grafana:10.4.19
    #image: grafana/grafana:12.0.0
    container_name: grafana
    user: root
    restart: always
    ports:
      # host port 13000 -> container port 3000
      - "13000:3000"
    volumes:
      # configuration, data and log directories kept on the host
      - /data/grafana/conf:/etc/grafana
      - /data/grafana/data:/var/lib/grafana
      - /nfsdata/prod_env/app_log/middle/grafana:/var/log/grafana
      #- ./log:/var/log/grafana
    #networks:
      #- lightning-network
二、监控采集容器搭建
1.node-exporter搭建
docker-compose 文件示例
# docker-compose service for node_exporter (host metrics)
version: '3'
services:
  node_exporter:
    image: prom/node-exporter:latest
    container_name: node_exporter
    command:
      # must match the container-side mount point of the host root below
      - '--path.rootfs=/host'
    pid: host
    restart: always
    environment:
      - TZ=Asia/Shanghai
    ports:
      - "9100:9100"
    volumes:
      # Mount the whole host root filesystem at /host inside the container,
      # read-only (ro); rslave lets the container see mount points added on the
      # host later, so node_exporter reads the real /proc, /sys and other system files.
      - '/:/host:ro,rslave'
2.cadvisor搭建
docker-compose文件示例
# docker-compose service for cAdvisor (container metrics)
version: '3.2'
services:
  cadvisor:
    image: harbor:443/cmamoc/cadvisor:v0.37.0
    container_name: cadvisor
    ports:
      # quoted like the other compose examples so YAML always reads it as a string
      - "8080:8080"
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      # host-side Docker data directory is /data/docker/ in this example
      - /data/docker/:/var/lib/docker:ro
三、grafana的搭建
仪表盘模板地址
https://grafana.com/grafana/dashboards/
官方文档地址
https://grafana.com/docs/grafana/latest/
1.docker-compose文件示例
# docker-compose service for Grafana
version: "3"
services:
  grafana:
    image: grafana/grafana:10.4.19
    #image: grafana/grafana:12.0.0
    container_name: grafana
    user: root
    restart: always
    ports:
      # host port 13000 -> container port 3000
      - "13000:3000"
    volumes:
      # configuration, data and log directories kept on the host
      - /data/grafana/conf:/etc/grafana
      - /data/grafana/data:/var/lib/grafana
      - /nfsdata/prod_env/app_log/middle/grafana:/var/log/grafana
      #- ./log:/var/log/grafana
    #networks:
      #- lightning-network
2.配置文件
1.控制 Grafana 服务器的基本行为
[server]
# Protocol: http or https
protocol = http
# Address the server binds to (0.0.0.0 = all interfaces)
http_addr = 0.0.0.0
# HTTP port
http_port = 3000
# Public-facing host name (not the bind address)
domain = localhost
# Public URL used for redirects and generated links
root_url = http://localhost:3000
# Serve Grafana from the sub-path given in root_url (for reverse proxies)
serve_from_sub_path = false
# Enable GZIP compression
enable_gzip = false
# Static files path
static_root_path = public
# TLS certificate and key (placeholder paths; relevant when protocol = https)
cert_file = /path/to/cert.pem
cert_key = /path/to/key.pem
2. 数据库配置 [database]
[database]
# Database type: sqlite3, mysql, postgres
type = sqlite3
# SQLite database file path (used with type = sqlite3)
path = grafana.db
# MySQL/PostgreSQL connection settings (placeholder credentials; replace before use)
host = 127.0.0.1:3306
name = grafana
user = grafana
password = secret
# Connection pool settings
# NOTE(review): conn_max_lifetime is presumably in seconds (14400 = 4 hours) - confirm
max_idle_conn = 2
max_open_conn = 0
conn_max_lifetime = 14400
# Log SQL queries
log_queries = false
3. 安全配置 [security]
[security]
# Initial admin password (change after first login)
admin_password = admin
# Secret key used for signing/encryption (must be changed!)
secret_key = SW2YcwTIb9zpOOhoPsMm
# Cookie security settings
# Set to true only when Grafana is served over HTTPS
cookie_secure = false
# One of: lax, strict, none
cookie_samesite = lax
# Number of days a login is remembered
# NOTE(review): confirm this key is still honoured by the deployed Grafana version
login_remember_days = 7
# Initial admin creation and brute-force login protection switches
disable_initial_admin_creation = false
disable_brute_force_login_protection = false
# Regular expression for allowed user names
# NOTE(review): confirm allowed_username is a supported [security] option
allowed_username = ^[A-Za-z0-9_-]+$
4. 用户配置 [users]
[users]
# Allow users to sign up
allow_sign_up = false
# Automatically assign new users to an organization, with this role
auto_assign_org = true
auto_assign_org_role = Viewer
# Whether viewers may edit
viewers_can_edit = false
# Allow users to create organizations
allow_org_create = false
# Default theme: dark, light, system
default_theme = dark
# Hint text shown in the login field
login_hint = email or username
# External user management (link URL, link name, info text); empty = not set
external_manage_link_url =
external_manage_link_name =
external_manage_info =
5. 认证配置 [auth]
[auth]
# Disable the login form (when using OAuth/LDAP)
disable_login_form = false
# Disable the sign-out menu entry
disable_signout_menu = false
# Max age of the OAuth state cookie, in seconds
oauth_state_cookie_max_age = 600
# Maximum API key lifetime, in seconds (86400 = 1 day)
api_key_max_seconds_to_live = 86400
# Anonymous access
[auth.anonymous]
enabled = false
org_name = Main Org.
org_role = Viewer
# Basic authentication
[auth.basic]
enabled = true
6. 日志配置 [log]
[log]
# Log modes, space separated: console, file, syslog
mode = console file
# Log level: debug, info, warn, error, critical
level = info
# Log filters (empty = none)
filters =
# Log file settings
# NOTE(review): confirm log_line_limit and log_file_name are supported keys
# Maximum characters per line
log_line_limit = 1000
# Log file name
log_file_name = grafana.log
# Console output format: console or json
# NOTE(review): confirm console_format is supported; it may belong in [log.console] as format
console_format = console
7.仪表板配置 [dashboards]
[dashboards]
# Number of dashboard versions to keep
versions_to_keep = 20
# Minimum dashboard refresh interval
min_refresh_interval = 5s
# Path to the default home dashboard (empty = not set)
default_home_dashboard_path =
# NOTE(review): presumably restricts selectable time zones - confirm timezone_options is a supported key
timezone_options =
8.警报配置 [alerting]
[alerting]
# NOTE(review): these look like legacy-alerting options; confirm they apply to the
# deployed Grafana version before relying on them.
# Enable alerting and the execution of alert rules
enabled = true
execute_alerts = true
# Notification timeout (seconds) and maximum attempts
notification_timeout_seconds = 30
max_attempts = 3
# Alert evaluation timeout (seconds)
evaluation_timeout_seconds = 30
# Default alert state on error/timeout and on no-data/null values
error_or_timeout = alerting
nodata_or_nullvalues = no_data
# Limit on concurrent renders (presumably images for alert notifications - confirm)
concurrent_render_limit = 5