Prometheus Metrics

Explanation of Key Parameters

# Node IP and port (the instance label); written below as node-ip:port
e.g. 192.168.1.226:9100, 192.168.1.227:9100, 192.168.1.228:9100

# HDFS DataNode IP and port; written below as hdfs-datanode-ip:port
e.g. 192.168.1.226:30003, 192.168.1.227:30003, 192.168.1.228:30003

# YARN NodeManager IP and port; written below as yarn-nodemanager-ip:port
e.g. 192.168.1.226:30005, 192.168.1.227:30005, 192.168.1.228:30005
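For reference, a minimal prometheus.yml sketch that would produce the job and instance labels used throughout this post. The targets are the example addresses above; the remaining jobs (hdfs-namenode, yarn-resourcemanager, zookeeper, kafka, hiveserver2, metastore) follow the same pattern with their own exporter ports, which depend on your deployment:

scrape_configs:
  - job_name: "node_exporter"
    static_configs:
      - targets: ["192.168.1.226:9100", "192.168.1.227:9100", "192.168.1.228:9100"]
  - job_name: "hdfs-datanode"
    static_configs:
      - targets: ["192.168.1.226:30003", "192.168.1.227:30003", "192.168.1.228:30003"]
  - job_name: "yarn-nodemanager"
    static_configs:
      - targets: ["192.168.1.226:30005", "192.168.1.227:30005", "192.168.1.228:30005"]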

I. Availability Monitoring (0 = abnormal or not started, 1 = running)

# Availability of each node
up{job="node_exporter"}

# HDFS NameNode availability
up{job="hdfs-namenode"}

# HDFS DataNode availability
up{job="hdfs-datanode"}

# YARN ResourceManager availability
up{job="yarn-resourcemanager"}

# YARN NodeManager availability
up{job="yarn-nodemanager"}

# ZooKeeper availability
up{job="zookeeper"}

# Kafka availability
up{job="kafka"}

# HiveServer2 availability
up{job="hiveserver2"}

# Hive Metastore availability
up{job="metastore"}
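These up checks translate directly into alerting rules. A minimal rule-file sketch; the alert name, for duration, and severity label are illustrative choices, not part of the original setup:

groups:
  - name: availability
    rules:
      - alert: TargetDown
        # Fires when any scraped component (node, NameNode, DataNode, Kafka, ...) stops responding
        expr: up == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "{{ $labels.job }} on {{ $labels.instance }} is down"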

II. Node Monitoring

# Total capacity of the "/" filesystem (GB)
node_filesystem_size_bytes{instance="node-ip:port",mountpoint="/"}/1024/1024/1024

# Available capacity of the "/" filesystem (GB)
node_filesystem_avail_bytes{instance="node-ip:port",mountpoint="/"}/1024/1024/1024

# CPU load (1-minute load average)
node_load1{instance="node-ip:port"}

# CPU usage (%)
100-avg(irate(node_cpu_seconds_total{mode="idle",instance="node-ip:port"}[1m])) by (instance)*100

# Memory saturation (pages paged in + out per second)
sum((rate(node_vmstat_pgpgin{instance="node-ip:port"}[1m])+rate(node_vmstat_pgpgout{instance="node-ip:port"}[1m]))) by (instance)

# Physical memory usage (%)
(node_memory_MemTotal_bytes{instance="node-ip:port"} - (node_memory_MemFree_bytes{instance="node-ip:port"} + node_memory_Buffers_bytes{instance="node-ip:port"} + node_memory_Cached_bytes{instance="node-ip:port"}))/node_memory_MemTotal_bytes{instance="node-ip:port"} * 100

# Swap usage (%)
(node_memory_SwapTotal_bytes{instance="node-ip:port"} - node_memory_SwapFree_bytes{instance="node-ip:port"})/node_memory_SwapTotal_bytes{instance="node-ip:port"} * 100

# Disk partition usage for the "/" partition (%)
(node_filesystem_size_bytes{mountpoint="/",instance="node-ip:port"} - node_filesystem_free_bytes{mountpoint="/",instance="node-ip:port"})/node_filesystem_size_bytes{mountpoint="/",instance="node-ip:port"} * 100

# Disk throughput: read rate (KiB/s)
irate(node_disk_read_bytes_total{device="sda",instance="node-ip:port"}[1m]) / 1024

# Disk throughput: write rate (KiB/s)
irate(node_disk_written_bytes_total{device="sda",instance="node-ip:port"}[1m]) / 1024

# Disk IOPS: reads completed per second
irate(node_disk_reads_completed_total{device="sda",instance="node-ip:port"}[1m])

# Disk IOPS: writes completed per second
irate(node_disk_writes_completed_total{device="sda",instance="node-ip:port"}[1m])

# NIC traffic: receive rate (KiB/s)
irate(node_network_receive_bytes_total{device != "lo",device !~ "docker.*",instance="node-ip:port"}[1m]) / 1024

# NIC traffic: transmit rate (KiB/s)
irate(node_network_transmit_bytes_total{device != "lo",device !~ "docker.*",instance="node-ip:port"}[1m]) / 1024
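The longer node expressions above are good candidates for recording rules, so dashboards and alerts can reuse them. A sketch based on the CPU and physical-memory expressions from this section; the rule names follow the usual level:metric:operation convention and are my own choice:

groups:
  - name: node-recording-rules
    rules:
      # CPU usage per instance (%), same expression as above without the instance filter
      - record: instance:node_cpu_usage:percent
        expr: 100 - avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[1m])) * 100
      # Physical memory usage per instance (%)
      - record: instance:node_memory_usage:percent
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100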

III. Service Monitoring

1. HDFS Monitoring
# HDFS NameNode JVM heap usage (%)
Hadoop_NameNode_MemHeapUsedM/Hadoop_NameNode_MemHeapMaxM * 100

# HDFS DataNode JVM heap usage (%)
Hadoop_DataNode_MemHeapUsedM{instance="hdfs-datanode-ip:port"}/Hadoop_DataNode_MemHeapMaxM{instance="hdfs-datanode-ip:port"} * 100

# HDFS capacity used (%)
Hadoop_NameNode_PercentUsed

# HDFS: CPU usage (%)
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"hdfs.*"})*100

# HDFS: NameNode average RPC processing time (ms)
Hadoop_NameNode_RpcProcessingTimeAvgTime{job="hdfs-namenode"}

# HDFS: NameNode RPC call queue length (count)
Hadoop_NameNode_CallQueueLength{job="hdfs-namenode"}

# HDFS: DataNode average RPC processing time (ms)
Hadoop_DataNode_RpcProcessingTimeAvgTime{job="hdfs-datanode"}

# HDFS: DataNode RPC call queue length (count)
Hadoop_DataNode_CallQueueLength{job="hdfs-datanode"}

# HDFS: NameNode blocked threads (count)
sum(Hadoop_NameNode_ThreadsBlocked{job="hdfs-namenode"})

# HDFS: NameNode waiting threads (count)
sum(Hadoop_NameNode_ThreadsWaiting{job="hdfs-namenode"})

# HDFS: DataNode blocked threads (count)
sum(Hadoop_DataNode_ThreadsBlocked{job="hdfs-datanode"})

# HDFS: DataNode waiting threads (count)
sum(Hadoop_DataNode_ThreadsWaiting{job="hdfs-datanode"})

# HDFS: currently used capacity, reported by the NameNode (MB)
Hadoop_NameNode_CapacityUsed{name="FSNamesystem"}/1024/1024

# HDFS: total raw capacity, reported by the NameNode (GB)
Hadoop_NameNode_CapacityTotal{name="FSNamesystem"}/1024/1024/1024

# HDFS: number of DataNodes currently live
Hadoop_NameNode_NumLiveDataNodes

# HDFS: number of DataNodes currently dead
Hadoop_NameNode_NumDeadDataNodes

# HDFS: number of allocated blocks
Hadoop_NameNode_BlocksTotal{name="FSNamesystem"}

# HDFS: NameNode bytes-received rate (B/s)
irate(Hadoop_NameNode_ReceivedBytes[1m])

# HDFS: NameNode bytes-sent rate (B/s)
irate(Hadoop_NameNode_SentBytes[1m])

# HDFS: total number of NameNode file-create operations
Hadoop_NameNode_CreateFileOps

# HDFS: total number of NameNode file-delete operations
Hadoop_NameNode_DeleteFileOps

# HDFS: current NameNode connection count
Hadoop_NameNode_TotalLoad{name="FSNamesystem"}

# HDFS: current number of files and directories
Hadoop_NameNode_FilesTotal{name="FSNamesystem"}

# HDFS: DataNode bytes-read rate (B/s)
sum(irate(Hadoop_DataNode_BytesRead[1m]))

# HDFS: DataNode bytes-written rate (B/s)
sum(irate(Hadoop_DataNode_BytesWritten[1m]))

# HDFS: total number of DataNode read-block operations
sum(Hadoop_DataNode_ReadBlockOpNumOps)

# HDFS: total number of DataNode write-block operations
sum(Hadoop_DataNode_WriteBlockOpNumOps)

# HDFS: DataNode bytes-received rate (B/s)
sum(irate(Hadoop_DataNode_ReceivedBytes[1m]))

# HDFS: DataNode bytes-sent rate (B/s)
sum(irate(Hadoop_DataNode_SentBytes[1m]))
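
Two of the HDFS metrics above map naturally onto alerts: dead DataNodes and overall capacity. A sketch; the thresholds, durations, and alert names are illustrative, not from the original setup:

groups:
  - name: hdfs-alerts
    rules:
      - alert: HdfsDeadDataNodes
        # Any DataNode reported dead by the NameNode
        expr: Hadoop_NameNode_NumDeadDataNodes > 0
        for: 5m
        labels:
          severity: critical
      - alert: HdfsCapacityHigh
        # HDFS usage above 80% (example threshold)
        expr: Hadoop_NameNode_PercentUsed > 80
        for: 15m
        labels:
          severity: warning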

2. YARN Monitoring

# YARN ResourceManager JVM heap usage (%)
Hadoop_ResourceManager_MemHeapUsedM/Hadoop_ResourceManager_MemHeapMaxM * 100

# YARN NodeManager JVM heap usage (%)
Hadoop_NodeManager_MemHeapUsedM{instance="yarn-nodemanager-ip:port"}/Hadoop_NodeManager_MemHeapMaxM{instance="yarn-nodemanager-ip:port"} * 100

# YARN: CPU usage (%)
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"yarn.*"})*100

# YARN: ResourceManager average RPC processing time (ms)
Hadoop_ResourceManager_RpcProcessingTimeAvgTime{job="yarn-resourcemanager"}

# YARN: ResourceManager RPC call queue length (count)
Hadoop_ResourceManager_CallQueueLength{job="yarn-resourcemanager"}

# YARN: NodeManager average RPC processing time (ms)
Hadoop_NodeManager_RpcProcessingTimeAvgTime{job="yarn-nodemanager"}

# YARN: NodeManager RPC call queue length (count)
Hadoop_NodeManager_CallQueueLength{job="yarn-nodemanager"}

# YARN: ResourceManager blocked threads (count)
sum(Hadoop_ResourceManager_ThreadsBlocked{job="yarn-resourcemanager"})

# YARN: NodeManager blocked threads (count)
sum(Hadoop_NodeManager_ThreadsBlocked{job="yarn-nodemanager"})

# YARN: ResourceManager waiting threads (count)
sum(Hadoop_ResourceManager_ThreadsWaiting{job="yarn-resourcemanager"})

# YARN: NodeManager waiting threads (count)
sum(Hadoop_NodeManager_ThreadsWaiting{job="yarn-nodemanager"})

# YARN: number of active NodeManagers
Hadoop_ResourceManager_NumActiveNMs

# YARN: number of lost NodeManagers
Hadoop_ResourceManager_NumLostNMs

# YARN: number of unhealthy NodeManagers
Hadoop_ResourceManager_NumUnhealthyNMs

# YARN: number of running applications
sum(Hadoop_ResourceManager_AppsRunning{q1 != ""})

# YARN: number of submitted applications
sum(Hadoop_ResourceManager_AppsSubmitted{q1 != ""})

# YARN: number of pending applications
sum(Hadoop_ResourceManager_AppsPending{q1 != ""})

# YARN: number of completed applications
sum(Hadoop_ResourceManager_AppsCompleted{q1 != ""})

# YARN: number of killed applications
sum(Hadoop_ResourceManager_AppsKilled{q1 != ""})

# YARN: number of failed applications
sum(Hadoop_ResourceManager_AppsFailed{q1 != ""})

# YARN: allocated memory (MB)
sum(Hadoop_ResourceManager_AllocatedMB{q1 != ""})

# YARN: allocated vcores
sum(Hadoop_ResourceManager_AllocatedVCores{q1 != ""})

# YARN: allocated containers
sum(Hadoop_ResourceManager_AllocatedContainers{q1 != ""})

# YARN: available memory (MB)
sum(Hadoop_ResourceManager_AvailableMB{q1 != ""})

# YARN: NodeManager heartbeats reported in the last 1 minute
increase(Hadoop_ResourceManager_NodeUpdateNumOps[1m])

# YARN: average NodeManager heartbeat processing time
Hadoop_ResourceManager_NodeUpdateAvgTime
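
As with HDFS, the NodeManager state counters lend themselves to simple alerts. A sketch; names, durations, and severities are illustrative:

groups:
  - name: yarn-alerts
    rules:
      - alert: YarnUnhealthyNodeManagers
        # ResourceManager reports at least one unhealthy NodeManager
        expr: Hadoop_ResourceManager_NumUnhealthyNMs > 0
        for: 5m
        labels:
          severity: warning
      - alert: YarnLostNodeManagers
        # ResourceManager has lost contact with at least one NodeManager
        expr: Hadoop_ResourceManager_NumLostNMs > 0
        for: 5m
        labels:
          severity: critical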

3. Hive Monitoring

# Hive: CPU usage (%)
sum(java_lang_OperatingSystem_ProcessCpuLoad{job=~"hiveserver2|metastore"})*100

# Hive: JVM heap usage (%)
sum(jvm_memory_bytes_used{job=~"hiveserver2|metastore",area="heap"})/sum(jvm_memory_bytes_max{job=~"hiveserver2|metastore",area="heap"})*100

# Hive: JVM GC time (ms)
increase(jvm_gc_collection_seconds_sum{job=~"hiveserver2|metastore"}[5m]) * 1000

# Hive: JVM GC count
increase(jvm_gc_collection_seconds_count{job=~"hiveserver2|metastore"}[5m])

# Hive: current JVM thread count
sum(jvm_threads_current{job=~"hiveserver2|metastore"})

# Hive: JVM daemon thread count
sum(jvm_threads_daemon{job=~"hiveserver2|metastore"})

# Hive: JVM deadlocked thread count
sum(jvm_threads_deadlocked{job=~"hiveserver2|metastore"})

# Hive: JVM blocked thread count
sum(jvm_threads_state{job=~"hiveserver2|metastore",state="BLOCKED"})

# Hive: JVM waiting thread count
sum(jvm_threads_state{job=~"hiveserver2|metastore",state="WAITING"})

# Hive: number of classes currently loaded in the JVM
sum(jvm_classes_loaded_total{job=~"hiveserver2|metastore"})

# Hive: total user and system CPU time (s)
max(process_cpu_seconds_total{job=~"hiveserver2|metastore"})
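
The heap-usage and deadlock expressions above can be turned into alerts covering HiveServer2 and the Metastore. A sketch; the 85% threshold is an example value:

groups:
  - name: hive-alerts
    rules:
      - alert: HiveHeapUsageHigh
        # Combined heap usage of HiveServer2 and Metastore above 85% (example threshold)
        expr: sum(jvm_memory_bytes_used{job=~"hiveserver2|metastore",area="heap"}) / sum(jvm_memory_bytes_max{job=~"hiveserver2|metastore",area="heap"}) * 100 > 85
        for: 10m
        labels:
          severity: warning
      - alert: HiveDeadlockedThreads
        # Any deadlocked JVM thread reported
        expr: sum(jvm_threads_deadlocked{job=~"hiveserver2|metastore"}) > 0
        for: 1m
        labels:
          severity: critical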

4. Kafka Monitoring

# Kafka: CPU usage (%)
irate(process_cpu_seconds_total{job="kafka"}[5m])*100

# Kafka: JVM memory used (MiB)
sum without(area)(jvm_memory_bytes_used{job="kafka"}) / 1024 / 1024

# Kafka: broker network throughput, inbound (B/s)
sum(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka",topic!=""}[5m]))

# Kafka: broker network throughput, outbound (B/s)
sum(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka",topic!=""}[5m]))

# Kafka: number of active controllers
sum(kafka_controller_kafkacontroller_activecontrollercount{job="kafka"})

# Kafka: online partition count
sum(kafka_server_replicamanager_partitioncount{job="kafka"})

# Kafka: offline partition count
sum(kafka_controller_kafkacontroller_offlinepartitionscount{job="kafka"})

# Kafka: number of online brokers
count(kafka_server_replicamanager_leadercount{job="kafka"})

# Kafka: per-broker message rate (msg/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job="kafka"}[5m]))

# Kafka: per-broker inbound traffic rate (B/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka"}[5m]))

# Kafka: per-broker outbound traffic rate (B/s)
sum without(topic)(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka"}[5m]))

# Kafka: per-topic message rate (msg/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_messagesinpersec{job="kafka",topic!=""}[5m]))

# Kafka: per-topic inbound traffic rate (B/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_bytesinpersec{job="kafka",topic!=""}[5m]))

# Kafka: per-topic outbound traffic rate (B/s)
sum without(instance)(rate(kafka_server_brokertopicmetrics_bytesoutpersec{job="kafka",topic!=""}[5m]))

# Kafka: request handler (I/O) thread pool average idle ratio
kafka_server_kafkarequesthandlerpool_requesthandleravgidlepercent_total{job="kafka"}

# Kafka: active connections
sum(kafka_server_socketservermetrics_connection_count{job="kafka"}) by (instance, listener)
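
The controller and partition counters above are the usual basis for Kafka health alerts. A sketch; the names and for durations are illustrative:

groups:
  - name: kafka-alerts
    rules:
      - alert: KafkaOfflinePartitions
        # Any partition without an active leader
        expr: sum(kafka_controller_kafkacontroller_offlinepartitionscount{job="kafka"}) > 0
        for: 2m
        labels:
          severity: critical
      - alert: KafkaControllerCountAbnormal
        # Exactly one active controller is expected across the cluster
        expr: sum(kafka_controller_kafkacontroller_activecontrollercount{job="kafka"}) != 1
        for: 2m
        labels:
          severity: critical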

5. ZooKeeper Monitoring

# ZooKeeper: quorum size (count)
zookeeper_QuorumSize{job="zookeeper"}

# ZooKeeper: number of followers (count)
avg(zookeeper_QuorumSize{job="zookeeper"}) - 1

# ZooKeeper: health check, tickTime (ms)
avg(zookeeper_TickTime{job="zookeeper"}) by (instance)

# ZooKeeper: average request latency
zookeeper_AvgRequestLatency{job="zookeeper"}

# ZooKeeper: znode count
zookeeper_InMemoryDataTree_NodeCount{job="zookeeper"}

# ZooKeeper: active connections
zookeeper_NumAliveConnections{job="zookeeper"}

# ZooKeeper: outstanding requests
zookeeper_OutstandingRequests{job="zookeeper"}

# ZooKeeper: watch count
zookeeper_InMemoryDataTree_WatchCount{job="zookeeper"}

# ZooKeeper: JVM GC time (ms)
increase(jvm_gc_collection_seconds_sum{job="zookeeper"}[5m]) * 1000

# ZooKeeper: JVM GC count
increase(jvm_gc_collection_seconds_count{job="zookeeper"}[5m])

# ZooKeeper: JVM heap usage (%)
jvm_memory_bytes_used{job="zookeeper", area="heap"}/jvm_memory_bytes_max{job="zookeeper", area="heap"}*100
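
A matching alert sketch for the ZooKeeper metrics above; the outstanding-request and heap thresholds are example values to tune against your own baseline:

groups:
  - name: zookeeper-alerts
    rules:
      - alert: ZookeeperOutstandingRequestsHigh
        # Requests queuing up faster than the ensemble can serve them
        expr: zookeeper_OutstandingRequests{job="zookeeper"} > 10
        for: 5m
        labels:
          severity: warning
      - alert: ZookeeperHeapUsageHigh
        # JVM heap usage above 85% (example threshold)
        expr: jvm_memory_bytes_used{job="zookeeper", area="heap"} / jvm_memory_bytes_max{job="zookeeper", area="heap"} * 100 > 85
        for: 10m
        labels:
          severity: warning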