Prometheus Metrics

Key Parameters

# Node IP and port (instance label), written as <node-ip:port> in the queries below
For example: 192.168.1.226:9100, 192.168.1.227:9100, 192.168.1.228:9100

# HDFS DataNode IP and port, written as <datanode-ip:port>
For example: 192.168.1.226:30003, 192.168.1.227:30003, 192.168.1.228:30003

# YARN NodeManager IP and port, written as <nodemanager-ip:port>
For example: 192.168.1.226:30005, 192.168.1.227:30005, 192.168.1.228:30005
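
To query a specific host, substitute one of the concrete instance values above for the placeholder. A minimal example using the sample addresses from this section (it assumes the 30003 targets are scraped under the hdfs-datanode job defined in section I):

# 1-minute load average of node 192.168.1.226
node_load1{instance="192.168.1.226:9100"}

# Availability of the DataNode on the same host
up{job="hdfs-datanode",instance="192.168.1.226:30003"}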

I. Availability Monitoring (0 means the target is abnormal or not started, 1 means it is running)

# Availability of each cluster node
up{job="node_exporter"}

# hdfs-namenode availability
up{job="hdfs-namenode"}

# hdfs-datanode availability
up{job="hdfs-datanode"}

# yarn-resourcemanager availability
up{job="yarn-resourcemanager"}

# yarn-nodemanager availability
up{job="yarn-nodemanager"}

# zookeeper availability
up{job="zookeeper"}

# kafka availability
up{job="kafka"}

# hiveserver2 availability
up{job="hiveserver2"}

# metastore availability
up{job="metastore"}

II. Node Monitoring

# Total disk capacity of the "/" filesystem (GiB)
node_filesystem_size_bytes{instance="<node-ip:port>",mountpoint="/"} / 1024 / 1024 / 1024

# Available disk capacity of the "/" filesystem (GiB)
node_filesystem_avail_bytes{instance="<node-ip:port>",mountpoint="/"} / 1024 / 1024 / 1024

# CPU load (1-minute load average)
node_load1{instance="<node-ip:port>"}

# CPU utilization (%)
100 - avg(irate(node_cpu_seconds_total{mode="idle",instance="<node-ip:port>"}[1m])) by (instance) * 100

# Memory saturation (pages paged in plus paged out per second)
sum(rate(node_vmstat_pgpgin{instance="<node-ip:port>"}[1m]) + rate(node_vmstat_pgpgout{instance="<node-ip:port>"}[1m])) by (instance)

# Physical memory utilization (%)
(node_memory_MemTotal_bytes{instance="<node-ip:port>"} - (node_memory_MemFree_bytes{instance="<node-ip:port>"} + node_memory_Buffers_bytes{instance="<node-ip:port>"} + node_memory_Cached_bytes{instance="<node-ip:port>"})) / node_memory_MemTotal_bytes{instance="<node-ip:port>"} * 100

# Swap memory utilization (%)
(node_memory_SwapTotal_bytes{instance="<node-ip:port>"} - node_memory_SwapFree_bytes{instance="<node-ip:port>"}) / node_memory_SwapTotal_bytes{instance="<node-ip:port>"} * 100

# Disk partition utilization of the "/" partition (%)
(node_filesystem_size_bytes{mountpoint="/",instance="<node-ip:port>"} - node_filesystem_free_bytes{mountpoint="/",instance="<node-ip:port>"}) / node_filesystem_size_bytes{mountpoint="/",instance="<node-ip:port>"} * 100

# Disk throughput: read rate (KiB/s)
irate(node_disk_read_bytes_total{device="sda",instance="<node-ip:port>"}[1m]) / 1024

# Disk throughput: write rate (KiB/s)
irate(node_disk_written_bytes_total{device="sda",instance="<node-ip:port>"}[1m]) / 1024

# Disk IOPS: reads completed per second
irate(node_disk_reads_completed_total{device="sda",instance="<node-ip:port>"}[1m])

# Disk IOPS: writes completed per second
irate(node_disk_writes_completed_total{device="sda",instance="<node-ip:port>"}[1m])

# Network traffic: receive rate (KiB/s)
irate(node_network_receive_bytes_total{device!="lo",device!~"docker.*",instance="<node-ip:port>"}[1m]) / 1024

# Network traffic: transmit rate (KiB/s)
irate(node_network_transmit_bytes_total{device!="lo",device!~"docker.*",instance="<node-ip:port>"}[1m]) / 1024
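
The node-level expressions can also be aggregated across the whole cluster by dropping the instance filter. A minimal sketch, assuming the node_exporter job name from section I:

# Cluster-wide CPU utilization (%), averaged over all node_exporter targets
100 - avg(irate(node_cpu_seconds_total{job="node_exporter",mode="idle"}[1m])) * 100

# Highest "/" partition utilization (%) among all nodes
max((node_filesystem_size_bytes{job="node_exporter",mountpoint="/"} - node_filesystem_free_bytes{job="node_exporter",mountpoint="/"}) / node_filesystem_size_bytes{job="node_exporter",mountpoint="/"} * 100)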

III. Service Monitoring

1. HDFS Monitoring
# HDFS NameNode JVM heap memory usage (%)
Hadoop_NameNode_MemHeapUsedM/Hadoop_NameNode_MemHeapMaxM * 100

# HDFS DataNode JVM heap memory usage (%)
Hadoop_DataNode_MemHeapUsedM{instance="<datanode-ip:port>"} / Hadoop_DataNode_MemHeapMaxM{instance="<datanode-ip:port>"} * 100

# HDFS capacity used (%)
Hadoop_NameNode_PercentUsed

# HDFS: CPU utilization (%)
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"hdfs.*"})*100

# HDFS: NameNode average RPC processing time (ms)
Hadoop_NameNode_RpcProcessingTimeAvgTime{job="hdfs-namenode"}

# HDFS: NameNode RPC call queue length (count)
Hadoop_NameNode_CallQueueLength{job="hdfs-namenode"}

# HDFS: DataNode average RPC processing time (ms)
Hadoop_DataNode_RpcProcessingTimeAvgTime{job="hdfs-datanode"}

# HDFS: DataNode RPC call queue length (count)
Hadoop_DataNode_CallQueueLength{job="hdfs-datanode"}

# HDFS: number of blocked NameNode threads
sum(Hadoop_NameNode_ThreadsBlocked{job="hdfs-namenode"})

# HDFS: number of waiting NameNode threads
sum(Hadoop_NameNode_ThreadsWaiting{job="hdfs-namenode"})

# HDFS: number of blocked DataNode threads
sum(Hadoop_DataNode_ThreadsBlocked{job="hdfs-datanode"})

# HDFS: number of waiting DataNode threads
sum(Hadoop_DataNode_ThreadsWaiting{job="hdfs-datanode"})

# HDFS: capacity currently used across DataNodes (MiB)
Hadoop_NameNode_CapacityUsed{name="FSNamesystem"}/1024/1024

# HDFS: total raw capacity across DataNodes (GiB)
Hadoop_NameNode_CapacityTotal{name="FSNamesystem"}/1024/1024/1024

# HDFS: number of live DataNodes
Hadoop_NameNode_NumLiveDataNodes

# HDFS: number of dead DataNodes
Hadoop_NameNode_NumDeadDataNodes

# HDFS: total number of allocated blocks
Hadoop_NameNode_BlocksTotal{name="FSNamesystem"}

# HDFS: NameNode bytes-received rate (B/s)
irate(Hadoop_NameNode_ReceivedBytes[1m])

# HDFS: NameNode bytes-sent rate (B/s)
irate(Hadoop_NameNode_SentBytes[1m])

# HDFS: total number of NameNode file create operations
Hadoop_NameNode_CreateFileOps

# HDFS: total number of NameNode file delete operations
Hadoop_NameNode_DeleteFileOps

# HDFS: current number of NameNode connections (TotalLoad)
Hadoop_NameNode_TotalLoad{name="FSNamesystem"}

# HDFS: current number of files and directories
Hadoop_NameNode_FilesTotal{name="FSNamesystem"}

# HDFS: DataNode bytes-read rate (B/s)
sum(irate(Hadoop_DataNode_BytesRead[1m]))

# HDFS: DataNode bytes-written rate (B/s)
sum(irate(Hadoop_DataNode_BytesWritten[1m]))

# HDFS: total number of DataNode block read operations
sum(Hadoop_DataNode_ReadBlockOpNumOps)

# HDFS: total number of DataNode block write operations
sum(Hadoop_DataNode_WriteBlockOpNumOps)

# HDFS: DataNode bytes-received rate (B/s)
sum(irate(Hadoop_DataNode_ReceivedBytes[1m]))

# HDFS: DataNode bytes-sent rate (B/s)
sum(irate(Hadoop_DataNode_SentBytes[1m]))
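
The aggregate DataNode rates above hide which node carries the traffic; grouping by the instance label breaks them down per DataNode. A minimal sketch, assuming each DataNode target exposes its own instance label as described in the parameter section:

# HDFS: per-DataNode read throughput (B/s)
sum by (instance) (irate(Hadoop_DataNode_BytesRead[1m]))

# HDFS: per-DataNode write throughput (B/s)
sum by (instance) (irate(Hadoop_DataNode_BytesWritten[1m]))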

2. YARN Monitoring
# YARN ResourceManager JVM heap memory usage (%)
Hadoop_ResourceManager_MemHeapUsedM/Hadoop_ResourceManager_MemHeapMaxM * 100

# YARN NodeManager JVM heap memory usage (%)
Hadoop_NodeManager_MemHeapUsedM{instance="<nodemanager-ip:port>"} / Hadoop_NodeManager_MemHeapMaxM{instance="<nodemanager-ip:port>"} * 100

# YARN: CPU utilization (%)
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"yarn.*"})*100

# YARN: ResourceManager average RPC processing time (ms)
Hadoop_ResourceManager_RpcProcessingTimeAvgTime{job="yarn-resourcemanager"}

# YARN: ResourceManager RPC call queue length (count)
Hadoop_ResourceManager_CallQueueLength{job="yarn-resourcemanager"}

# YARN: NodeManager average RPC processing time (ms)
Hadoop_NodeManager_RpcProcessingTimeAvgTime{job="yarn-nodemanager"}

# YARN: NodeManager RPC call queue length (count)
Hadoop_NodeManager_CallQueueLength{job="yarn-nodemanager"}

# YARN: number of blocked ResourceManager threads
sum(Hadoop_ResourceManager_ThreadsBlocked{job="yarn-resourcemanager"})

# YARN: number of blocked NodeManager threads
sum(Hadoop_NodeManager_ThreadsBlocked{job="yarn-nodemanager"})

# YARN: number of waiting ResourceManager threads
sum(Hadoop_ResourceManager_ThreadsWaiting{job="yarn-resourcemanager"})

# YARN: number of waiting NodeManager threads
sum(Hadoop_NodeManager_ThreadsWaiting{job="yarn-nodemanager"})

# YARN: number of active NodeManagers
Hadoop_ResourceManager_NumActiveNMs

# YARN: number of lost NodeManagers
Hadoop_ResourceManager_NumLostNMs

# YARN: number of unhealthy NodeManagers
Hadoop_ResourceManager_NumUnhealthyNMs

# YARN: number of running applications
sum(Hadoop_ResourceManager_AppsRunning{q1 != ""})

# YARN: number of submitted applications
sum(Hadoop_ResourceManager_AppsSubmitted{q1 != ""})

# YARN: number of pending applications
sum(Hadoop_ResourceManager_AppsPending{q1 != ""})

# YARN: number of completed applications
sum(Hadoop_ResourceManager_AppsCompleted{q1 != ""})

# YARN: number of killed applications
sum(Hadoop_ResourceManager_AppsKilled{q1 != ""})

# YARN: number of failed applications
sum(Hadoop_ResourceManager_AppsFailed{q1 != ""})

# YARN: allocated memory (MB)
sum(Hadoop_ResourceManager_AllocatedMB{q1 != ""})

# YARN: allocated vcores
sum(Hadoop_ResourceManager_AllocatedVCores{q1 != ""})

# YARN: allocated containers
sum(Hadoop_ResourceManager_AllocatedContainers{q1 != ""})

# YARN: available memory (MB)
sum(Hadoop_ResourceManager_AvailableMB{q1 != ""})

# YARN: NodeManager heartbeats reported in the last 1 minute
increase(Hadoop_ResourceManager_NodeUpdateNumOps[1m])

# YARN: average NodeManager heartbeat processing time (ms)
Hadoop_ResourceManager_NodeUpdateAvgTime
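
Allocated and available memory can be combined into a single utilization ratio. A rough sketch, assuming AllocatedMB and AvailableMB are reported for the same set of queues (the q1 label):

# YARN: cluster memory utilization (%)
sum(Hadoop_ResourceManager_AllocatedMB{q1 != ""}) / (sum(Hadoop_ResourceManager_AllocatedMB{q1 != ""}) + sum(Hadoop_ResourceManager_AvailableMB{q1 != ""})) * 100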

3. Hive Monitoring
# Hive: CPU utilization (%)
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"hiveserver2|metastore"})*100

# Hive: JVM heap memory usage (%)
sum(jvm_memory_bytes_used{job=~"hiveserver2|metastore",area="heap"})/sum(jvm_memory_bytes_max{job=~"hiveserver2|metastore",area="heap"})*100

# Hive: JVM GC time over the last 5 minutes (ms)
increase(jvm_gc_collection_seconds_sum{job=~"hiveserver2|metastore"}[5m]) * 1000

# Hive: JVM GC count over the last 5 minutes
increase(jvm_gc_collection_seconds_count{job=~"hiveserver2|metastore"}[5m])

# Hive: current number of JVM threads
sum(jvm_threads_current{job=~"hiveserver2|metastore"})

# Hive: number of JVM daemon threads
sum(jvm_threads_daemon{job=~"hiveserver2|metastore"})

# Hive: number of deadlocked JVM threads
sum(jvm_threads_deadlocked{job=~"hiveserver2|metastore"})

# Hive: number of blocked JVM threads
sum(jvm_threads_state{job=~"hiveserver2|metastore",state="BLOCKED"})

# Hive: number of waiting JVM threads
sum(jvm_threads_state{job=~"hiveserver2|metastore",state="WAITING"})

# Hive: total number of classes loaded by the JVM
sum(jvm_classes_loaded_total{job=~"hiveserver2|metastore"})

# Hive: total user and system CPU time (s)
max(process_cpu_seconds_total{job=~"hiveserver2|metastore"})
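
Dividing the GC time by the GC count gives the average pause per collection, which is often easier to alert on than the raw totals. A minimal sketch built from the two GC expressions above:

# Hive: average GC pause per collection over the last 5 minutes (ms)
increase(jvm_gc_collection_seconds_sum{job=~"hiveserver2|metastore"}[5m]) * 1000 / increase(jvm_gc_collection_seconds_count{job=~"hiveserver2|metastore"}[5m])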

4. Kafka Monitoring
# Kafka: CPU utilization (%)
irate(process_cpu_seconds_total{job="kafka"}[5m])*100

# Kafka: JVM memory usage (MiB)
sum without(area)(jvm_memory_bytes_used{job="kafka"}) / 1024 / 1024

# Kafka: broker network throughput, inbound (B/s)
sum(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka",topic!=""}[5m]))

# Kafka: broker network throughput, outbound (B/s)
sum(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka",topic!=""}[5m]))

# Kafka: number of active controllers
sum(kafka_controller_kafkacontroller_activecontrollercount{job="kafka"})

# Kafka: number of online partitions
sum(kafka_server_replicamanager_partitioncount{job="kafka"})

# Kafka: number of offline partitions
sum(kafka_controller_kafkacontroller_offlinepartitionscount{job="kafka"})

# Kafka: number of online brokers
count(kafka_server_replicamanager_leadercount{job="kafka"})

# Kafka: broker message rate (messages/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job="kafka"}[5m]))

# Kafka: broker inbound byte rate (B/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka"}[5m]))

# Kafka: broker outbound byte rate (B/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka"}[5m]))

# Kafka: per-topic message rate (messages/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job="kafka",topic!=""}[5m]))

# Kafka: per-topic inbound byte rate (B/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka",topic!=""}[5m]))

# Kafka: per-topic outbound byte rate (B/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka",topic!=""}[5m]))

# Kafka: average idle ratio of the request handler (I/O) thread pool
kafka_server_kafkarequesthandlerpool_requesthandleravgidlepercent_total{job="kafka"}

# Kafka: number of active connections (by instance and listener)
sum(kafka_server_socketservermetrics_connection_count{job="kafka"}) by (instance, listener)
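
The per-topic rates above return one series per topic; topk() keeps only the busiest ones, which is convenient on a dashboard. A minimal sketch using the message-rate expression from this section:

# Kafka: top 5 topics by message rate (messages/s)
topk(5, sum without(instance)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job="kafka",topic!=""}[5m])))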

5. ZooKeeper Monitoring
# ZooKeeper: quorum size
zookeeper_QuorumSize{job="zookeeper"}

# ZooKeeper: number of followers
avg(zookeeper_QuorumSize{job="zookeeper"}) - 1

# ZooKeeper: health check via tick time (ms)
avg(zookeeper_TickTime{job="zookeeper"}) by (instance)

# ZooKeeper: average request latency (ms)
zookeeper_AvgRequestLatency{job="zookeeper"}

# ZooKeeper: number of znodes
zookeeper_InMemoryDataTree_NodeCount{job="zookeeper"}

# ZooKeeper: number of active connections
zookeeper_NumAliveConnections{job="zookeeper"}

# ZooKeeper: number of outstanding requests
zookeeper_OutstandingRequests{job="zookeeper"}

# ZooKeeper: number of watches
zookeeper_InMemoryDataTree_WatchCount{job="zookeeper"}

# ZooKeeper: JVM GC time over the last 5 minutes (ms)
increase(jvm_gc_collection_seconds_sum{job="zookeeper"}[5m]) * 1000

# ZooKeeper: JVM GC count over the last 5 minutes
increase(jvm_gc_collection_seconds_count{job="zookeeper"}[5m])

# ZooKeeper: JVM heap memory usage (%)
jvm_memory_bytes_used{job="zookeeper", area="heap"}/jvm_memory_bytes_max{job="zookeeper", area="heap"}*100