Azure AKS集群监控告警表达式配置

背景需求

Azure AKS集群中,需要对部署的服务进行监控和告警,需要创建并启用预警规则,而这里怎么去监控每个pod级别的CPU和内存,需要自己写搜索查询

解决方法

搜索和查询的语句如下,需要自己替换其中的部分信息,其中的clusterID需要在AKS集群中的概述-右侧JSON视图-id,其中id就是我们要的clusterID,然后替换下面的clusterID字段内容。最后的 percentage 也需要根据实际情况来设置对应值。

#内存
let metricUsageCounterName = 'memoryRssBytes';
let metricLimitCounterName = 'memoryLimitBytes';
let clusterID = "/subscriptions/xxxxxxxxxxx";
let CachedFilteredPerfTable = Perf
| where ObjectName == 'K8SContainer'
| where InstanceName startswith clusterID
| project Node = Computer, TimeGenerated, CounterName, CounterValue, InstanceName;
let LimitsTable = CachedFilteredPerfTable
| where CounterName =~ metricLimitCounterName
| summarize arg_max(TimeGenerated, *) by Node, InstanceName
| project Node, InstanceName, LimitsValue = CounterValue, TimeGenerated;
let MetaDataTable = KubePodInventory
| where isnotempty(ClusterName) | where isnotempty(Namespace) | where isnotempty(Computer)
| where ClusterId =~ clusterID
| project TimeGenerated, ClusterId, Namespace, ControllerName, Node = Computer, Pod = Name, ContainerInstance = ContainerName, ContainerID, ControllerKind = ControllerKind
| summarize arg_max(TimeGenerated, *) by Node, ContainerInstance
| project Namespace, ControllerName, Node, Pod, ContainerInstance, InstanceName = strcat(ClusterId, '/', ContainerInstance), ContainerID, ControllerKind, 
ContainerName = tostring(split(ContainerInstance, '/')[1]), LastPodInventoryTimeGenerated = TimeGenerated, ClusterId
| join kind=leftouter (LimitsTable) on Node, InstanceName
| project Namespace, ControllerName, Node, Pod, InstanceName, ContainerID, LimitsValue, ControllerKind, ContainerName, ContainerInstance, LastPodInventoryTimeGenerated, ClusterId;
let AggregationTable = CachedFilteredPerfTable
| where CounterName =~ metricUsageCounterName
| project TimeGenerated, Node, InstanceName, CounterValue
| summarize  Aggregation = percentile(CounterValue, 95) by Node, InstanceName 
| project Node, InstanceName, Aggregation;
MetaDataTable
| join kind= leftouter( AggregationTable ) on Node, InstanceName
| order by ContainerName asc, ContainerName
| extend ContainerIdentity = strcat(ContainerName, '|', Pod)
| extend percentage = Aggregation/LimitsValue * 100
| project ContainerIdentity, percentage, Aggregation, LimitsValue, Node, ControllerName, ControllerKind, ContainerID, ContainerInstance, InstanceName, Namespace, LastPodInventoryTimeGenerated, ClusterId
| where percentage > 80






#CPU
let metricUsageCounterName = 'cpuUsageNanoCores';
let metricLimitCounterName = 'cpuLimitNanoCores'; 
let clusterID = "/subscriptions/xxxxxxxxxxx";
let CachedFilteredPerfTable = Perf
| where ObjectName == 'K8SContainer'
| where InstanceName startswith clusterID
| project Node = Computer, TimeGenerated, CounterName, CounterValue, InstanceName;
let LimitsTable = CachedFilteredPerfTable
| where CounterName =~ metricLimitCounterName
| summarize arg_max(TimeGenerated, *) by Node, InstanceName
| project Node, InstanceName, LimitsValue = CounterValue/1000000, TimeGenerated;
let MetaDataTable = KubePodInventory
| where isnotempty(ClusterName) | where isnotempty(Namespace) | where isnotempty(Computer)
| where ClusterId =~ clusterID
| project TimeGenerated, ClusterId, Namespace, ControllerName, Node = Computer, Pod = Name, ContainerInstance = ContainerName, ContainerID, ControllerKind = ControllerKind
| summarize arg_max(TimeGenerated, *) by Node, ContainerInstance
| project Namespace, ControllerName, Node, Pod, ContainerInstance, InstanceName = strcat(ClusterId, '/', ContainerInstance), ContainerID, ControllerKind, 
ContainerName = tostring(split(ContainerInstance, '/')[1]), LastPodInventoryTimeGenerated = TimeGenerated, ClusterId
| join kind=leftouter (LimitsTable) on Node, InstanceName
| project Namespace, ControllerName, Node, Pod, InstanceName, ContainerID, LimitsValue, ControllerKind, ContainerName, ContainerInstance, LastPodInventoryTimeGenerated, ClusterId;
let AggregationTable = CachedFilteredPerfTable
| where CounterName =~ metricUsageCounterName
| project TimeGenerated, Node, InstanceName, CounterValue = CounterValue/1000000
| summarize  Aggregation = percentile(CounterValue, 95) by Node, InstanceName 
| project Node, InstanceName, Aggregation;
MetaDataTable
| join kind= leftouter( AggregationTable ) on Node, InstanceName
| order by ContainerName asc, ContainerName
| extend ContainerIdentity = strcat(ContainerName, '|', Pod)
| extend percentage = Aggregation/LimitsValue * 100
| project ContainerIdentity, percentage, Aggregation, LimitsValue, Node, ControllerName, ControllerKind, ContainerID, ContainerInstance, InstanceName, Namespace, LastPodInventoryTimeGenerated, ClusterId
| where percentage > 80
相关推荐
Elastic 中国社区官方博客2 小时前
使用 Elastic AI Assistant for Search 和 Azure OpenAI 实现从 0 到 60 的转变
大数据·人工智能·elasticsearch·microsoft·搜索引擎·ai·azure
A_cot12 小时前
理解设计模式与 UML 类图:构建稳健软件架构的基石
microsoft·设计模式·简单工厂模式·工厂方法模式·uml
C-cat.15 小时前
Linux|进程程序替换
linux·服务器·microsoft
Smartdaili China18 小时前
如何在 Microsoft Edge 中设置代理: 快速而简单的方法
前端·爬虫·安全·microsoft·edge·社交·动态住宅代理
小白不太白9501 天前
设计模式之 外观模式
microsoft·设计模式·外观模式
万里沧海寄云帆1 天前
Word 插入分节符页码更新问题
windows·microsoft·word
huaqianzkh2 天前
学习C#中的Parallel类
windows·microsoft·c#
Code哈哈笑2 天前
【Java 学习】初识类和对象、this引用
java·学习·microsoft
code_shenbing2 天前
跨平台WPF框架Avalonia教程 三
前端·microsoft·ui·c#·wpf·跨平台·界面设计
宝桥南山3 天前
.NET 9 - BinaryFormatter移除
microsoft·微软·c#·asp.net·.net·.netcore