Prometheus 监控 RabbitMQ 策略

一般用官方的 rabbitmq_exporter 采集数据,然后在 Prometheus(下文简称“普米”)里配置抓取即可。但如果 RabbitMQ 节点的队列数超过了 5000,rabbitmq_exporter 往往就会瘫痪:它采集的信息太多,尤其是每个队列的细节,所以队列一多,rabbitmq_exporter 就没法用了。因此我们不得不自己写脚本探测 MQ,脚本分享如下:

首先 pip3 install prometheus-client

python 复制代码
import prometheus_client as prom
import pandas as pd
from sqlalchemy import create_engine
import requests,time
# Custom Prometheus metrics for RabbitMQ, collected from the management
# HTTP API of each node and exposed on a local HTTP port for scraping.
port = '15672'
username = 'username'
password = 'password'
# Value of the "region" label attached to every sample. The name was used
# but never defined before (NameError on the first sample); this is a
# placeholder, set it to match your deployment.
region = 'default'
# Seconds to wait for each management API response.
api_timeout = 5
# Seconds to sleep between two collection rounds.
scrape_interval = 30

label_names = ['node', 'region']
g0 = prom.Gauge("rabbitmq_up", 'life of the node', labelnames=label_names)
g1 = prom.Gauge("rabbitmq_queues", 'total queue num of the node', labelnames=label_names)
g2 = prom.Gauge("rabbitmq_channels", 'total channel num of the node', labelnames=label_names)
g3 = prom.Gauge("rabbitmq_connections", 'total connection num of the node', labelnames=label_names)
g4 = prom.Gauge("rabbitmq_consumers", 'total consumer num of the node', labelnames=label_names)
g5 = prom.Gauge("rabbitmq_exchanges", 'total exchange num of the node', labelnames=label_names)
g6 = prom.Gauge("rabbitmq_messages", 'total messages of the node', labelnames=label_names)
g7 = prom.Gauge("rabbitmq_vhosts", 'total vhost num of the node', labelnames=label_names)
g8 = prom.Gauge("rabbitmq_node_mem_used", 'mem used of the node', labelnames=label_names)
g9 = prom.Gauge("rabbitmq_node_mem_limit", 'mem limit of the node', labelnames=label_names)
g10 = prom.Gauge("rabbitmq_node_mem_alarm", 'mem alarm of the node', labelnames=label_names)
g11 = prom.Gauge("rabbitmq_node_disk_free_alarm", 'free disk alarm of the node', labelnames=label_names)


def fetch_json(node, path):
    """Return the decoded JSON body of ``GET /api/<path>`` on *node*.

    Raises:
        requests.RequestException: connection failure, timeout, or a
            non-2xx HTTP status (e.g. wrong credentials).
        ValueError: the response body is not valid JSON.
    """
    response = requests.get(
        url=f"http://{node}:{port}/api/{path}",
        auth=(username, password),
        timeout=api_timeout,
    )
    response.raise_for_status()
    return response.json()


def set_gauge(gauge, node, value):
    """Set *gauge* for *node*; skip values the API did not report.

    ``Gauge.set`` converts its argument with ``float()``, so a missing key
    (``None``) would raise TypeError and stop the exporter.
    """
    if value is None:
        return
    gauge.labels(node=node, region=region).set(value)


prom.start_http_server(8086)
# RabbitMQ nodes to monitor.
nodelist = ['1.1.1.1', '1.1.1.2', '1.1.1.3']
while True:
    for node in nodelist:
        # The overview request doubles as the liveness probe, so the same
        # endpoint is no longer fetched twice per round.
        try:
            overview = fetch_json(node, 'overview')
        except (requests.RequestException, ValueError):
            set_gauge(g0, node, 0)
            continue
        set_gauge(g0, node, 1)

        try:
            nodes = fetch_json(node, 'nodes')
            vhosts = fetch_json(node, 'vhosts')
        except (requests.RequestException, ValueError):
            # Keep the exporter alive; this node is retried next round.
            continue

        object_totals = overview.get('object_totals') or {}
        queue_totals = overview.get('queue_totals') or {}
        # NOTE(review): this takes the first entry returned by /api/nodes,
        # as before; in a cluster that entry may not be the node being
        # probed - confirm against your topology.
        node_info = nodes[0] if nodes else {}

        set_gauge(g1, node, object_totals.get('queues'))
        set_gauge(g2, node, object_totals.get('channels'))
        set_gauge(g3, node, object_totals.get('connections'))
        set_gauge(g4, node, object_totals.get('consumers'))
        set_gauge(g5, node, object_totals.get('exchanges'))
        set_gauge(g6, node, queue_totals.get('messages'))
        set_gauge(g7, node, len(vhosts))
        set_gauge(g8, node, node_info.get('mem_used'))
        set_gauge(g9, node, node_info.get('mem_limit'))
        set_gauge(g10, node, node_info.get('mem_alarm'))
        set_gauge(g11, node, node_info.get('disk_free_alarm'))
    time.sleep(scrape_interval)

用 python3 执行这个脚本后,会在 8086 端口启动一个 HTTP 服务,用浏览器访问该端口即可看到指标页面,如下

于是就可以用普米采集了

注意 honor_labels 配置项:开启后,采集到的数据里自带的 job、instance 等标签不会被普米自身附加的同名标签覆盖

或者写脚本推送到Pushgateway也可以,如下

python 复制代码
import prometheus_client as prom
import pandas as pd
from sqlalchemy import create_engine
import requests,time
# Custom Prometheus metrics for RabbitMQ, collected from the management
# HTTP API of each node and pushed to a Pushgateway.
port = '15672'
username = 'username'
password = 'password'
# Value of the "region" label attached to every sample. The name was used
# but never defined before (NameError on the first sample); this is a
# placeholder, set it to match your deployment.
region = 'default'
# Seconds to wait for each management API response.
api_timeout = 5
# Seconds to sleep between two collection rounds.
scrape_interval = 30

registry = prom.CollectorRegistry()
# Pushgateway address. An earlier placeholder assignment to ``url`` was
# immediately overwritten by this one, so only the effective value is kept.
url = 'http://172.18.2.143:8181'
job = 'rabbitmq'

label_names = ['node', 'region']
g0 = prom.Gauge("rabbitmq_up", 'life of the node', labelnames=label_names, registry=registry)
g1 = prom.Gauge("rabbitmq_queues", 'total queue num of the node', labelnames=label_names, registry=registry)
g2 = prom.Gauge("rabbitmq_channels", 'total channel num of the node', labelnames=label_names, registry=registry)
g3 = prom.Gauge("rabbitmq_connections", 'total connection num of the node', labelnames=label_names, registry=registry)
g4 = prom.Gauge("rabbitmq_consumers", 'total consumer num of the node', labelnames=label_names, registry=registry)
g5 = prom.Gauge("rabbitmq_exchanges", 'total exchange num of the node', labelnames=label_names, registry=registry)
g6 = prom.Gauge("rabbitmq_messages", 'total messages of the node', labelnames=label_names, registry=registry)
g7 = prom.Gauge("rabbitmq_vhosts", 'total vhost num of the node', labelnames=label_names, registry=registry)
g8 = prom.Gauge("rabbitmq_node_mem_used", 'mem used of the node', labelnames=label_names, registry=registry)
g9 = prom.Gauge("rabbitmq_node_mem_limit", 'mem limit of the node', labelnames=label_names, registry=registry)
g10 = prom.Gauge("rabbitmq_node_mem_alarm", 'mem alarm of the node', labelnames=label_names, registry=registry)
g11 = prom.Gauge("rabbitmq_node_disk_free_alarm", 'free disk alarm of the node', labelnames=label_names, registry=registry)


def fetch_json(node, path):
    """Return the decoded JSON body of ``GET /api/<path>`` on *node*.

    Raises:
        requests.RequestException: connection failure, timeout, or a
            non-2xx HTTP status (e.g. wrong credentials).
        ValueError: the response body is not valid JSON.
    """
    response = requests.get(
        url=f"http://{node}:{port}/api/{path}",
        auth=(username, password),
        timeout=api_timeout,
    )
    response.raise_for_status()
    return response.json()


def set_gauge(gauge, node, value):
    """Set *gauge* for *node*; skip values the API did not report.

    ``Gauge.set`` converts its argument with ``float()``, so a missing key
    (``None``) would raise TypeError and stop the script.
    """
    if value is None:
        return
    gauge.labels(node=node, region=region).set(value)


# RabbitMQ nodes to monitor.
nodelist = ['1.1.1.1', '1.1.1.2', '1.1.1.3']
while True:
    for node in nodelist:
        # The overview request doubles as the liveness probe, so the same
        # endpoint is no longer fetched twice per round.
        try:
            overview = fetch_json(node, 'overview')
        except (requests.RequestException, ValueError):
            set_gauge(g0, node, 0)
            continue
        set_gauge(g0, node, 1)

        try:
            nodes = fetch_json(node, 'nodes')
            vhosts = fetch_json(node, 'vhosts')
        except (requests.RequestException, ValueError):
            # Keep the loop alive; this node is retried next round.
            continue

        object_totals = overview.get('object_totals') or {}
        queue_totals = overview.get('queue_totals') or {}
        # NOTE(review): this takes the first entry returned by /api/nodes,
        # as before; in a cluster that entry may not be the node being
        # probed - confirm against your topology.
        node_info = nodes[0] if nodes else {}

        set_gauge(g1, node, object_totals.get('queues'))
        set_gauge(g2, node, object_totals.get('channels'))
        set_gauge(g3, node, object_totals.get('connections'))
        set_gauge(g4, node, object_totals.get('consumers'))
        set_gauge(g5, node, object_totals.get('exchanges'))
        set_gauge(g6, node, queue_totals.get('messages'))
        set_gauge(g7, node, len(vhosts))
        set_gauge(g8, node, node_info.get('mem_used'))
        set_gauge(g9, node, node_info.get('mem_limit'))
        set_gauge(g10, node, node_info.get('mem_alarm'))
        set_gauge(g11, node, node_info.get('disk_free_alarm'))

    # Push once per round, after every node has been visited. The push used
    # to sit inside the per-node loop behind the `continue`, so a down node's
    # rabbitmq_up=0 was never pushed when it was the last node (or when all
    # nodes were down).
    try:
        prom.push_to_gateway(url, job=job, registry=registry, timeout=20)
    except OSError as err:
        # urllib's URLError/HTTPError are OSError subclasses; an unreachable
        # gateway must not terminate the collection loop.
        print(f"push to gateway failed: {err}")
    time.sleep(scrape_interval)
相关推荐
川石课堂软件测试1 天前
涨薪技术|Prometheus使用Recoding Rules优化性能
功能测试·测试工具·jmeter·mysql·面试·单元测试·prometheus
BPM_宏天低代码2 天前
宏天CRM系统的消息中心:基于RabbitMQ的实践
分布式·rabbitmq
yunson_Liu2 天前
prometheus添加es监控模块
elasticsearch·prometheus
少许极端2 天前
消息队列4-RabbitMQ的高级特性-TTL机制、死信队列、延迟队列
分布式·消息队列·rabbitmq
014-code2 天前
RabbitMQ 生产端可靠投递(confirm、return、重试)
分布式·消息队列·rabbitmq
014-code2 天前
RabbitMQ 消费端幂等实战(重复消息、去重、重放怎么处理)
分布式·消息队列·rabbitmq
8Qi82 天前
微服务通信:同步 vs 异步与MQ选型指南
java·分布式·微服务·云原生·中间件·架构·rabbitmq
redaijufeng3 天前
SpringBoot中整合RabbitMQ(测试+部署上线 最完整)
spring boot·rabbitmq·java-rabbitmq
杜子不疼.3 天前
Prometheus Pushgateway:批量离线任务指标监控实践
prometheus
糖炒栗子03263 天前
后端消息投递可靠性:基于 RabbitMQ 的“双重防线-幂等闭环”模式
java·后端·rabbitmq