Prometheus指标

文章目录

Prometheus指标

主要参数解释

复制代码
# 节点IP和端口(instance)
例如:192.168.1.226:9100、192.168.1.227:9100、192.168.1.228:9100

# HDFS-DataNode-IP和端口
例如:192.168.1.226:30003、192.168.1.227:30003、192.168.1.228:30003

# Yarn-NodeManager-IP和端口
例如:192.168.1.226:30005、192.168.1.227:30005、192.168.1.228:30005

一、可用性监测(0代表存在异常或未启动,1代表运行中)

复制代码
# 各个节点的可用性监测
up{job="node_exporter"}

# hdfs-namenode可用性监测
up{job="hdfs-namenode"}

# hdfs-datanode可用性监测
up{job="hdfs-datanode"}

# yarn-resourcemanager可用性监测
up{job="yarn-resourcemanager"}

# yarn-nodemanager可用性监测
up{job="yarn-nodemanager"}

# zookeeper可用性监测
up{job="zookeeper"}

# kafka可用性监测
up{job="kafka"}

# hiveserver2可用性监测
up{job="hiveserver2"}

# metastore可用性监测
up{job="metastore"}

二、节点监测

复制代码
# 磁盘总容量("/"分区,单位:GB)
node_filesystem_size_bytes{instance="节点IP和端口",mountpoint="/"}/1024/1024/1024

# 磁盘可用容量("/"分区,单位:GB)
node_filesystem_avail_bytes{instance="节点IP和端口",mountpoint="/"} /1024 /1024/1024

# 系统平均负载(最近1分钟)
node_load1{instance="节点IP和端口"}

# CPU使用率
100-avg(irate(node_cpu_seconds_total{mode="idle",instance="节点IP和端口"}[1m])) by (instance)*100

# 内存饱和度
sum((rate(node_vmstat_pgpgin{instance="节点IP和端口"}[1m])+rate(node_vmstat_pgpgout{instance="节点IP和端口"}[1m]))) by (instance)

# 物理内存使用率
(node_memory_MemTotal_bytes{instance="节点IP和端口"} - (node_memory_MemFree_bytes{instance="节点IP和端口"} + node_memory_Buffers_bytes{instance="节点IP和端口"} + node_memory_Cached_bytes{instance="节点IP和端口"}))/node_memory_MemTotal_bytes{instance="节点IP和端口"} * 100

# SWAP内存使用率(未配置SWAP即SwapTotal为0时,结果为NaN)
(node_memory_SwapTotal_bytes{instance="节点IP和端口"} - node_memory_SwapFree_bytes{instance="节点IP和端口"})/node_memory_SwapTotal_bytes{instance="节点IP和端口"} * 100

# 磁盘分区使用率("/"分区)
(node_filesystem_size_bytes{mountpoint="/",instance="节点IP和端口"} - node_filesystem_free_bytes{mountpoint="/",instance="节点IP和端口"})/node_filesystem_size_bytes{mountpoint="/",instance="节点IP和端口"} * 100

# 磁盘吞吐量(每秒读速率,单位:KB/s)
irate(node_disk_read_bytes_total{device="sda",instance="节点IP和端口"}[1m]) / 1024

# 磁盘吞吐量(每秒写速率,单位:KB/s)
irate(node_disk_written_bytes_total{device="sda",instance="节点IP和端口"}[1m]) / 1024

# 磁盘IOPS(每秒读次数)
irate(node_disk_reads_completed_total{device="sda",instance="节点IP和端口"}[1m])

# 磁盘IOPS(每秒写次数)
irate(node_disk_writes_completed_total{device="sda",instance="节点IP和端口"}[1m])

# 网卡流量(下载速率,单位:KB/s)
irate(node_network_receive_bytes_total{device != "lo",device !~ "docker.*",instance="节点IP和端口"}[1m]) / 1024

# 网卡流量(上传速率,单位:KB/s)
irate(node_network_transmit_bytes_total{device != "lo",device !~ "docker.*",instance="节点IP和端口"}[1m]) / 1024

三、服务监测

1.HDFS监测
复制代码
# HDFS NameNode JVM堆内存使用率
Hadoop_NameNode_MemHeapUsedM/Hadoop_NameNode_MemHeapMaxM * 100

# HDFS DataNode JVM堆内存使用率
Hadoop_DataNode_MemHeapUsedM{instance="HDFS-DataNode-IP和端口"}/Hadoop_DataNode_MemHeapMaxM{instance="HDFS-DataNode-IP和端口"} * 100

# HDFS 容量使用率(百分比)
Hadoop_NameNode_PercentUsed

# HDFS:CPU 使用率
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"hdfs.*"})*100

# HDFS:NameNode RPC请求平均处理时间 (单位:ms)
Hadoop_NameNode_RpcProcessingTimeAvgTime{job="hdfs-namenode"}

# HDFS:NameNode RPC处理队列长度 (无单位或1/个)
Hadoop_NameNode_CallQueueLength{job="hdfs-namenode"}

# HDFS:DataNode RPC请求平均处理时间 (单位:ms)
Hadoop_DataNode_RpcProcessingTimeAvgTime{job="hdfs-datanode"}

# HDFS:DataNode RPC处理队列长度 (无单位或1/个)
Hadoop_DataNode_CallQueueLength{job="hdfs-datanode"}

# HDFS:NameNode 线程阻塞数量(无单位或1/个)
sum(Hadoop_NameNode_ThreadsBlocked{job="hdfs-namenode"})

# HDFS:NameNode 线程等待数量(无单位或1/个)
sum(Hadoop_NameNode_ThreadsWaiting{job="hdfs-namenode"})

# HDFS:DataNode 线程阻塞数量(无单位或1/个)
sum(Hadoop_DataNode_ThreadsBlocked{job="hdfs-datanode"})

# HDFS:DataNode 线程等待数量(无单位或1/个)
sum(Hadoop_DataNode_ThreadsWaiting{job="hdfs-datanode"})

# HDFS:集群当前已使用容量(由NameNode统计,单位:MB)
Hadoop_NameNode_CapacityUsed{name="FSNamesystem"}/1024/1024

# HDFS:集群原始总容量(由NameNode统计,单位:GB)
Hadoop_NameNode_CapacityTotal{name="FSNamesystem"}/1024/1024/1024

# HDFS:当前运行正常的DataNode数量
Hadoop_NameNode_NumLiveDataNodes

# HDFS:当前运行异常的DataNode数量
Hadoop_NameNode_NumDeadDataNodes

# HDFS:当前分配的block数量
Hadoop_NameNode_BlocksTotal{name="FSNamesystem"}

# HDFS:NameNode接收字节速率(单位:B/s)
irate(Hadoop_NameNode_ReceivedBytes[1m])

# HDFS:NameNode发送字节速率(单位:B/s)
irate(Hadoop_NameNode_SentBytes[1m])

# HDFS:NameNode创建文件的操作总数
Hadoop_NameNode_CreateFileOps

# HDFS:NameNode删除文件的操作总数
Hadoop_NameNode_DeleteFileOps

# HDFS:NameNode当前连接数
Hadoop_NameNode_TotalLoad{name="FSNamesystem"}

# HDFS:NameNode当前文件和目录的数量
Hadoop_NameNode_FilesTotal{name="FSNamesystem"}

# HDFS:DataNode读取字节速率(单位:B/s)
sum(irate(Hadoop_DataNode_BytesRead[1m]))

# HDFS:DataNode写入字节速率(单位:B/s)
sum(irate(Hadoop_DataNode_BytesWritten[1m]))

# HDFS:DataNode读操作总数
sum(Hadoop_DataNode_ReadBlockOpNumOps)

# HDFS:DataNode写操作总数
sum(Hadoop_DataNode_WriteBlockOpNumOps)

# HDFS:DataNode接收字节速率(单位:B/s)
sum(irate(Hadoop_DataNode_ReceivedBytes[1m]))

# HDFS:DataNode发送字节速率(单位:B/s)
sum(irate(Hadoop_DataNode_SentBytes[1m]))
2.Yarn监测
复制代码
# Yarn ResourceManager JVM堆内存使用率
Hadoop_ResourceManager_MemHeapUsedM/Hadoop_ResourceManager_MemHeapMaxM * 100

# Yarn NodeManager JVM堆内存使用率
Hadoop_NodeManager_MemHeapUsedM{instance="Yarn-NodeManager-IP和端口"}/Hadoop_NodeManager_MemHeapMaxM{instance="Yarn-NodeManager-IP和端口"} * 100

# Yarn:CPU 使用率
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"yarn.*"})*100

# Yarn:ResourceManager RPC请求平均处理时间(单位:ms)
Hadoop_ResourceManager_RpcProcessingTimeAvgTime{job="yarn-resourcemanager"}

# Yarn:ResourceManager RPC处理队列长度(无单位或1/个)
Hadoop_ResourceManager_CallQueueLength{job="yarn-resourcemanager"}

# Yarn:NodeManager RPC请求平均处理时间(单位:ms)
Hadoop_NodeManager_RpcProcessingTimeAvgTime{job="yarn-nodemanager"}

# Yarn:NodeManager RPC处理队列长度(无单位或1/个)
Hadoop_NodeManager_CallQueueLength{job="yarn-nodemanager"}

# Yarn:ResourceManager 线程阻塞数量(无单位或1/个)
sum(Hadoop_ResourceManager_ThreadsBlocked{job="yarn-resourcemanager"})

# Yarn:NodeManager 线程阻塞数量(无单位或1/个)
sum(Hadoop_NodeManager_ThreadsBlocked{job="yarn-nodemanager"})

# Yarn:ResourceManager 线程等待数量(无单位或1/个)
sum(Hadoop_ResourceManager_ThreadsWaiting{job="yarn-resourcemanager"})

# Yarn:NodeManager 线程等待数量(无单位或1/个)
sum(Hadoop_NodeManager_ThreadsWaiting{job="yarn-nodemanager"})

# Yarn:NM存活节点数量统计
Hadoop_ResourceManager_NumActiveNMs

# Yarn:NM丢失节点数量统计
Hadoop_ResourceManager_NumLostNMs

# Yarn:NM不健康节点数量统计
Hadoop_ResourceManager_NumUnhealthyNMs

# Yarn:app运行数量
sum(Hadoop_ResourceManager_AppsRunning{q1 != ""})

# Yarn:app提交数量
sum(Hadoop_ResourceManager_AppsSubmitted{q1 != ""})

# Yarn:app等待数量
sum(Hadoop_ResourceManager_AppsPending{q1 != ""})

# Yarn:app完成数量
sum(Hadoop_ResourceManager_AppsCompleted{q1 != ""})

# Yarn:app被kill的数量
sum(Hadoop_ResourceManager_AppsKilled{q1 != ""})

# Yarn:app失败数量
sum(Hadoop_ResourceManager_AppsFailed{q1 != ""})

# Yarn:已分配的内存大小(单位:MB)
sum(Hadoop_ResourceManager_AllocatedMB{q1 != ""})

# Yarn:已分配的核数量
sum(Hadoop_ResourceManager_AllocatedVCores{q1 != ""})

# Yarn:已分配的Container数量
sum(Hadoop_ResourceManager_AllocatedContainers{q1 != ""})

# Yarn:可用的内存大小(单位:MB)
sum(Hadoop_ResourceManager_AvailableMB{q1 != ""})

# Yarn:1min内NM心跳汇报次数
increase(Hadoop_ResourceManager_NodeUpdateNumOps[1m])

# Yarn:NM心跳汇报处理时间
Hadoop_ResourceManager_NodeUpdateAvgTime
3.Hive监测
复制代码
# Hive:CPU 使用率
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"hiveserver2|metastore"})*100

# Hive:JVM堆内存使用率
sum(jvm_memory_bytes_used{job=~"hiveserver2|metastore",area="heap"})/sum(jvm_memory_bytes_max{job=~"hiveserver2|metastore",area="heap"})*100

# Hive:JVM GC Time(单位:ms)
increase(jvm_gc_collection_seconds_sum{job=~"hiveserver2|metastore"}[5m]) * 1000

# Hive:JVM GC Count(无单位)
increase(jvm_gc_collection_seconds_count{job=~"hiveserver2|metastore"}[5m])

# Hive:jvm当前线程数量(无单位)
sum(jvm_threads_current{job=~"hiveserver2|metastore"})

# Hive:jvm后台线程数量(无单位)
sum(jvm_threads_daemon{job=~"hiveserver2|metastore"})

# Hive:jvm死锁线程数量(无单位)
sum(jvm_threads_deadlocked{job=~"hiveserver2|metastore"})

# Hive:jvm线程阻塞数量(无单位)
sum(jvm_threads_state{job=~"hiveserver2|metastore",state="BLOCKED"})

# Hive:jvm线程等待数量(无单位)
sum(jvm_threads_state{job=~"hiveserver2|metastore",state="WAITING"})

# Hive:jvm自启动以来累计加载的类数量(无单位;当前已加载的类数量对应指标 jvm_classes_loaded)
sum(jvm_classes_loaded_total{job=~"hiveserver2|metastore"})

# Hive:用户和系统的总cpu使用时间(单位:s)
max(process_cpu_seconds_total{job=~"hiveserver2|metastore"})
4.Kafka监测
复制代码
# Kafka:CPU使用率(%)
irate(process_cpu_seconds_total{job="kafka"}[5m])*100

# Kafka:JVM内存使用情况(单位:MiB)
sum without(area)(jvm_memory_bytes_used{job="kafka"}) / 1024 / 1024

# Kafka:Broker网络吞吐量-流入(单位:B/s)
sum(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka",topic!=""}[5m]))

# Kafka:Broker网络吞吐量-流出(单位:B/s)
sum(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka",topic!=""}[5m]))

# Kafka:激活状态控制器数量(无单位)
sum(kafka_controller_kafkacontroller_activecontrollercount{job="kafka"})

# Kafka:在线分区数(无单位)
sum(kafka_server_replicamanager_partitioncount{job="kafka"})

# Kafka:离线分区数(无单位)
sum(kafka_controller_kafkacontroller_offlinepartitionscount{job="kafka"})

# Kafka:在线Broker数量(无单位)
count(kafka_server_replicamanager_leadercount{job="kafka"})

# Kafka:Broker消息速率(单位:条/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job="kafka"}[5m]))

# Kafka:Broker入站流量速率(B/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka"}[5m]))

# Kafka:Broker出站流量速率(B/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka"}[5m]))

# Kafka:topic消息速率(单位:条/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job="kafka",topic!=""}[5m]))

# Kafka:topic入站流量速率(B/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka",topic!=""}[5m]))

# Kafka:topic出站流量速率(B/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka",topic!=""}[5m]))

# Kafka:I-O线程池平均空闲比例(%)
kafka_server_kafkarequesthandlerpool_requesthandleravgidlepercent_total{job="kafka"}

# Kafka:活跃连接数
sum(kafka_server_socketservermetrics_connection_count{job="kafka"}) by (instance, listener)
5.Zookeeper监测
复制代码
# Zookeeper:Quorum Size(无单位)
zookeeper_QuorumSize{job="zookeeper"}

# Zookeeper:Follower个数(无单位)
avg(zookeeper_QuorumSize{job="zookeeper"}) -1

# Zookeeper:健康检查(Ticktime)(单位:ms)
avg(zookeeper_TickTime{job="zookeeper"}) by (instance)

# Zookeeper:平均请求延迟
zookeeper_AvgRequestLatency{job="zookeeper"}

# Zookeeper:Znode个数
zookeeper_InMemoryDataTree_NodeCount{job="zookeeper"}

# Zookeeper:活跃连接数
zookeeper_NumAliveConnections{job="zookeeper"}

# Zookeeper:未完成请求数
zookeeper_OutstandingRequests{job="zookeeper"}

# Zookeeper:监视器数量
zookeeper_InMemoryDataTree_WatchCount{job="zookeeper"}

# Zookeeper:JVM GC Time(单位:ms)
increase(jvm_gc_collection_seconds_sum{job="zookeeper"}[5m]) * 1000

# Zookeeper:JVM GC Count(无单位)
increase(jvm_gc_collection_seconds_count{job="zookeeper"}[5m])

# Zookeeper:JVM堆内存使用率(%)
jvm_memory_bytes_used{job="zookeeper", area="heap"}/jvm_memory_bytes_max{job="zookeeper", area="heap"}*100
相关推荐
xiao-xiang2 天前
redis-集成prometheus监控(k8s)
数据库·redis·kubernetes·k8s·grafana·prometheus
陈陈CHENCHEN2 天前
【Kubernetes】在 K8s 上部署 Prometheus
kubernetes·prometheus
云游4 天前
大模型性能指标的监控系统(prometheus3.5.0)和可视化工具(grafana12.1.0)基础篇
grafana·prometheus·可视化·监控
qq_232045575 天前
非容器方式安装Prometheus和Grafana,以及nginx配置访问Grafana
nginx·grafana·prometheus
夜莺云原生监控6 天前
Prometheus 监控 Kubernetes Cluster 最新极简教程
容器·kubernetes·prometheus
SRETalk6 天前
Prometheus 监控 Kubernetes Cluster 最新极简教程
kubernetes·prometheus
川石课堂软件测试6 天前
JMeter并发测试与多进程测试
功能测试·jmeter·docker·容器·kubernetes·单元测试·prometheus
SRETalk7 天前
夜莺监控的几种架构模式详解
prometheus·victoriametrics·nightingale·夜莺监控
Ditglu.8 天前
使用Prometheus + Grafana + node_exporter实现Linux服务器性能监控
服务器·grafana·prometheus
SRETalk8 天前
监控系统如何选型:Zabbix vs Prometheus
zabbix·prometheus