# 巡检时可能会遇到集群节点内存使用率超过 90% 的情况。此时可以筛选出一批允许重启的 Pod(例如本脚本中名称以 svc- 开头的 Pod),编写 shell 脚本,在节点内存超过阈值时自动触发这些 Pod 的重启。注意:脚本依赖 kubectl top,需要集群中已安装 metrics 组件。
#!/bin/bash
# 设置内存使用率阈值为90%
MEMORY_THRESHOLD=90
# 初始化一个数组来存储需要处理的节点名
EXCEED_NODES=()
# 获取所有节点的内存使用信息
NODES=$(sudo kubectl top node)
# 逐行处理节点信息
while IFS= read -r line; do
# 跳过表头行
if [[ $line =~ ^NAME ]]; then
continue
fi
# 提取节点名称和内存使用百分比
NODE=$(echo "$line" | awk '{print $1}')
MEMORY_PERCENT=$(echo "$line" | awk '{print $4}' | tr -d '%')
# 检查是否超过阈值
if [[ $MEMORY_PERCENT -gt $MEMORY_THRESHOLD ]]; then
EXCEED_NODES+=("$NODE")
echo "Node $NODE memory usage is over $MEMORY_THRESHOLD%, checking for svc- prefixed Pods..."
else
echo "Node $NODE is within acceptable memory usage."
fi
done <<< "$(echo "$NODES" | awk '/^[^[:space:]]/ {print}')"
# 特定命名空间
NAMESPACE="test"
# 对于每一个超过阈值的节点,查找并重启svc-开头的Pods
for NODE in "${EXCEED_NODES[@]}"; do
# 获取该节点上所有名称以svc-开头的Pods
PODS=$(sudo kubectl get pods -n $NAMESPACE -o wide | grep "$NODE" | grep "^.*\ssvc-" | awk '{print $1}')
# 如果有符合条件的Pods,则尝试删除它们
if [ -n "$PODS" ]; then
for POD in $PODS; do
sudo kubectl delete pod $POD -n $NAMESPACE
echo "Deleted Pod $NAMESPACE/$POD"
done
else
echo "No svc- prefixed Pods found on Node $NODE."
fi
done
# 输出最终状态
if [ ${#EXCEED_NODES[@]} -eq 0 ]; then
echo "No Pods deleted, all nodes are below the threshold."
else
echo "Some Pods were deleted due to high memory usage."
fi