Azure AKS集群监控告警表达式配置

背景需求

Azure AKS集群中,需要对部署的服务进行监控和告警,需要创建并启用预警规则,而这里怎么去监控每个pod级别的CPU和内存,需要自己写搜索查询

解决方法

搜索和查询的语句如下,需要自己替换其中的部分信息,其中的clusterID需要在AKS集群中的概述-右侧JSON视图-id,其中id就是我们要的clusterID,然后替换下面的clusterID字段内容。最后的 percentage 也需要根据实际情况来设置对应值。

复制代码
#内存
let metricUsageCounterName = 'memoryRssBytes';
let metricLimitCounterName = 'memoryLimitBytes';
let clusterID = "/subscriptions/xxxxxxxxxxx";
let CachedFilteredPerfTable = Perf
| where ObjectName == 'K8SContainer'
| where InstanceName startswith clusterID
| project Node = Computer, TimeGenerated, CounterName, CounterValue, InstanceName;
let LimitsTable = CachedFilteredPerfTable
| where CounterName =~ metricLimitCounterName
| summarize arg_max(TimeGenerated, *) by Node, InstanceName
| project Node, InstanceName, LimitsValue = CounterValue, TimeGenerated;
let MetaDataTable = KubePodInventory
| where isnotempty(ClusterName) | where isnotempty(Namespace) | where isnotempty(Computer)
| where ClusterId =~ clusterID
| project TimeGenerated, ClusterId, Namespace, ControllerName, Node = Computer, Pod = Name, ContainerInstance = ContainerName, ContainerID, ControllerKind = ControllerKind
| summarize arg_max(TimeGenerated, *) by Node, ContainerInstance
| project Namespace, ControllerName, Node, Pod, ContainerInstance, InstanceName = strcat(ClusterId, '/', ContainerInstance), ContainerID, ControllerKind, 
ContainerName = tostring(split(ContainerInstance, '/')[1]), LastPodInventoryTimeGenerated = TimeGenerated, ClusterId
| join kind=leftouter (LimitsTable) on Node, InstanceName
| project Namespace, ControllerName, Node, Pod, InstanceName, ContainerID, LimitsValue, ControllerKind, ContainerName, ContainerInstance, LastPodInventoryTimeGenerated, ClusterId;
let AggregationTable = CachedFilteredPerfTable
| where CounterName =~ metricUsageCounterName
| project TimeGenerated, Node, InstanceName, CounterValue
| summarize  Aggregation = percentile(CounterValue, 95) by Node, InstanceName 
| project Node, InstanceName, Aggregation;
MetaDataTable
| join kind= leftouter( AggregationTable ) on Node, InstanceName
| order by ContainerName asc, ContainerName
| extend ContainerIdentity = strcat(ContainerName, '|', Pod)
| extend percentage = Aggregation/LimitsValue * 100
| project ContainerIdentity, percentage, Aggregation, LimitsValue, Node, ControllerName, ControllerKind, ContainerID, ContainerInstance, InstanceName, Namespace, LastPodInventoryTimeGenerated, ClusterId
| where percentage > 80






#CPU
let metricUsageCounterName = 'cpuUsageNanoCores';
let metricLimitCounterName = 'cpuLimitNanoCores'; 
let clusterID = "/subscriptions/xxxxxxxxxxx";
let CachedFilteredPerfTable = Perf
| where ObjectName == 'K8SContainer'
| where InstanceName startswith clusterID
| project Node = Computer, TimeGenerated, CounterName, CounterValue, InstanceName;
let LimitsTable = CachedFilteredPerfTable
| where CounterName =~ metricLimitCounterName
| summarize arg_max(TimeGenerated, *) by Node, InstanceName
| project Node, InstanceName, LimitsValue = CounterValue/1000000, TimeGenerated;
let MetaDataTable = KubePodInventory
| where isnotempty(ClusterName) | where isnotempty(Namespace) | where isnotempty(Computer)
| where ClusterId =~ clusterID
| project TimeGenerated, ClusterId, Namespace, ControllerName, Node = Computer, Pod = Name, ContainerInstance = ContainerName, ContainerID, ControllerKind = ControllerKind
| summarize arg_max(TimeGenerated, *) by Node, ContainerInstance
| project Namespace, ControllerName, Node, Pod, ContainerInstance, InstanceName = strcat(ClusterId, '/', ContainerInstance), ContainerID, ControllerKind, 
ContainerName = tostring(split(ContainerInstance, '/')[1]), LastPodInventoryTimeGenerated = TimeGenerated, ClusterId
| join kind=leftouter (LimitsTable) on Node, InstanceName
| project Namespace, ControllerName, Node, Pod, InstanceName, ContainerID, LimitsValue, ControllerKind, ContainerName, ContainerInstance, LastPodInventoryTimeGenerated, ClusterId;
let AggregationTable = CachedFilteredPerfTable
| where CounterName =~ metricUsageCounterName
| project TimeGenerated, Node, InstanceName, CounterValue = CounterValue/1000000
| summarize  Aggregation = percentile(CounterValue, 95) by Node, InstanceName 
| project Node, InstanceName, Aggregation;
MetaDataTable
| join kind= leftouter( AggregationTable ) on Node, InstanceName
| order by ContainerName asc, ContainerName
| extend ContainerIdentity = strcat(ContainerName, '|', Pod)
| extend percentage = Aggregation/LimitsValue * 100
| project ContainerIdentity, percentage, Aggregation, LimitsValue, Node, ControllerName, ControllerKind, ContainerID, ContainerInstance, InstanceName, Namespace, LastPodInventoryTimeGenerated, ClusterId
| where percentage > 80
相关推荐
JimCarter14 小时前
使用Azure Devops Pipeline将Docker应用部署到你的Raspberry Pi上
docker·azure·树莓派·devops·orangepi·香橙派·raspberrypi
2601_9618752417 小时前
高考真题电子版|2025高考全科真题分类PDF
金融·pdf·云计算·azure·七牛云存储·交友·高考
步步为营DotNet19 小时前
Microsoft.Extensions.AI 在 .NET 后端性能优化中的应用与解析
人工智能·microsoft·.net
xiaoshuaishuai819 小时前
C# 逆向分析Privazer
数据库·microsoft·c#
BizViewStudio19 小时前
2026 年 GEO 成为企业线上流量增长核心风口|2026 品牌 GEO 运营指南,6 家全链路优化服务商解析
运维·网络·人工智能·microsoft·ai
骑士雄师2 天前
17.2 通过 Config 传入用户名 → 工具1存入 State → 工具2读取 State 并返回答案
服务器·windows·microsoft
热爱学习的小翁同学2 天前
Azure Automation Runbook 获取托管标识的访问令牌(Access Token)
microsoft·azure
川石课堂软件测试2 天前
UI自动化测试|XPath元素定位实践
功能测试·测试工具·jmeter·microsoft·ui·postman·harmonyos
骑士雄师2 天前
16.1深入讲解 LangGraph 的静态配置 configurable
windows·microsoft
jghhh012 天前
C# 图片水印工具(支持9个位置)
数据库·microsoft·c#