1、调整内核参数,避免过度分配内存
1.1、编辑 /etc/sysctl.conf
# 严格控制内存分配(overcommit 策略):0=启发式,1=总是允许超额分配,2=禁止超额分配(推荐)
vm.overcommit_memory = 2
# 当 overcommit_memory=2 时,允许提交的地址空间上限为 swap + RAM × overcommit_ratio%(此处为 90%)
vm.overcommit_ratio = 90
1.2、执行 sysctl -p 生效
2、编写脚本并通过 crontab 定时执行(CentOS 7):当整体CPU使用率连续5分钟(可配置)超过99%时,自动终止除操作系统核心进程外CPU占用最高的3个进程,并记录被终止进程的详细信息。
#!/bin/bash
# Configuration parameters.
LOG_FILE="/home/sunxy/cpukill/cpu_monitor.log"
COUNT_FILE="/home/sunxy/cpukill/cpu_monitor_count.txt"
THRESHOLD=99      # overall CPU usage threshold, in percent
DURATION=5        # consecutive over-threshold runs required before cleanup
                  # (equals minutes if the script is scheduled once per minute)
MAX_PROCESSES=3   # maximum number of processes terminated per cleanup

# Names of core processes that must never be terminated (tuned for CentOS 7).
# Adjust this list to match the services running on the target host.
CORE_PROCESSES=("systemd" "kthreadd" "ksoftirqd" "rcu_sched" "rcu_bh" "migration" "watchdog" "kworker" "kswapd" "vmstat" "journald" "sshd" "crond" "rsyslogd" "network" "udevd")

# Make sure the directories holding the log file and the counter file exist.
# A failure is reported on stderr but is not fatal, so the rest of the script
# still gets a chance to run.
mkdir -p -- "$(dirname -- "$LOG_FILE")" \
  || printf 'warning: cannot create directory for %s\n' "$LOG_FILE" >&2
mkdir -p -- "$(dirname -- "$COUNT_FILE")" \
  || printf 'warning: cannot create directory for %s\n' "$COUNT_FILE" >&2
#######################################
# Append a timestamped message to the log file and echo it to stdout.
# Globals:   LOG_FILE (read) - path of the log file to append to
# Arguments: $1 - message text
# Outputs:   "<YYYY-MM-DD HH:MM:SS> - <message>" appended to LOG_FILE;
#            the bare message on stdout
#######################################
log_message() {
  local message=$1
  local timestamp
  timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  printf '%s - %s\n' "$timestamp" "$message" >> "$LOG_FILE"
  printf '%s\n' "$message"
}
#######################################
# Check whether a command belongs to the protected core process list.
# The match is a case-sensitive substring test, so "/usr/sbin/sshd" and
# "[kworker/0:1]" match the entries "sshd" and "kworker".
# Globals:   CORE_PROCESSES (read) - array of protected name fragments
# Arguments: $1 - command name or path of the candidate process
# Returns:   0 if the command matches a core process entry, 1 otherwise
#######################################
is_core_process() {
  local process_name=$1
  local core_proc   # kept local so the loop variable does not leak to callers
  for core_proc in "${CORE_PROCESSES[@]}"; do
    if [[ "$process_name" == *"$core_proc"* ]]; then
      return 0
    fi
  done
  return 1
}
#######################################
# Read the persisted count of consecutive over-threshold runs.
# Globals:   COUNT_FILE (read) - path of the counter file
# Outputs:   the stored count on stdout as a plain decimal integer; 0 when
#            the file is missing, unreadable, empty, or does not hold a
#            non-negative integer of at most 9 digits
#######################################
read_count() {
  local value=""
  if [[ -f "$COUNT_FILE" ]]; then
    # read returns non-zero when the file lacks a trailing newline, but
    # still fills the variable, so the status is deliberately ignored.
    IFS= read -r value < "$COUNT_FILE" || true
  fi
  if [[ "$value" =~ ^[0-9]{1,9}$ ]]; then
    # Force base 10 so a zero-padded value such as "08" is not parsed as octal.
    printf '%d\n' "$((10#$value))"
  else
    printf '0\n'
  fi
}
#######################################
# Persist the count of consecutive over-threshold runs.
# Globals:   COUNT_FILE (read) - path of the counter file (overwritten)
# Arguments: $1 - count to store
# Returns:   status of the write to COUNT_FILE
#######################################
write_count() {
  local count=$1
  printf '%s\n' "$count" > "$COUNT_FILE"
}
#######################################
# Main monitoring routine: performs one sampling pass.
#
# Samples overall CPU usage, updates the persisted counter of consecutive
# over-threshold runs, and once the counter reaches DURATION kills up to
# MAX_PROCESSES of the highest-CPU non-core processes, logging each one.
#
# Globals:   THRESHOLD, DURATION, MAX_PROCESSES (read)
# Calls:     log_message, read_count, write_count, is_core_process
# Returns:   0 normally; 1 when CPU usage could not be parsed from top
#            (the counter is left untouched in that case)
#######################################
main() {
  local high_cpu_count cpu_usage_int top_processes kill_count
  local user pid cpu cmd process_detail

  log_message "CPU监控脚本执行"

  # Load the current counter; anything that is not a plain non-negative
  # integer is treated as 0 so the arithmetic below cannot fail.
  high_cpu_count=$(read_count)
  if [[ "$high_cpu_count" =~ ^[0-9]+$ ]]; then
    high_cpu_count=$((10#$high_cpu_count))
  else
    high_cpu_count=0
  fi

  # Overall CPU usage = 100 - idle, rounded to an integer.
  # Two batch frames are taken and only the last is used: the first frame of
  # top may reflect averages since boot rather than current load.
  # sed -n ... p prints only when the idle field is actually found, so an
  # unexpected top format yields an empty result instead of a bogus 100%.
  cpu_usage_int=$(LC_ALL=C top -bn2 -d 1 \
    | grep "Cpu(s)" \
    | sed -n 's/.*, *\([0-9.][0-9.]*\)%* id.*/\1/p' \
    | tail -n 1 \
    | awk '{printf "%.0f\n", 100 - $1}')

  if [[ ! "$cpu_usage_int" =~ ^[0-9]+$ ]]; then
    log_message "无法获取CPU使用率,跳过本次检查"
    return 1
  fi

  log_message "当前CPU使用率: ${cpu_usage_int}% (阈值: ${THRESHOLD}%)"

  # Update the consecutive-run counter.
  if (( cpu_usage_int >= THRESHOLD )); then
    high_cpu_count=$((high_cpu_count + 1))
    log_message "CPU使用率超过阈值,连续计数: ${high_cpu_count}/${DURATION}"
    write_count "$high_cpu_count"
  else
    high_cpu_count=0
    log_message "CPU使用率低于阈值,重置计数"
    write_count "$high_cpu_count"
  fi

  # Enough consecutive over-threshold runs: start the cleanup.
  if (( high_cpu_count >= DURATION )); then
    log_message "CPU使用率连续${DURATION}分钟超过${THRESHOLD}%,开始清理进程"

    # Top 20 candidates by CPU as "user pid %cpu command".
    # NOTE(review): ps reports %cpu as CPU time divided by process lifetime,
    # not an instantaneous value - confirm this ranking is what is wanted.
    top_processes=$(ps aux --sort=-%cpu | awk 'NR>1 {print $1 " " $2 " " $3 " " $11}' | head -20)

    kill_count=0
    while read -r user pid cpu cmd; do
      if (( kill_count >= MAX_PROCESSES )); then
        break
      fi
      # Skip blank or malformed lines (e.g. when ps produced no output).
      [[ "$pid" =~ ^[0-9]+$ ]] || continue
      # Never target init, this script, or the process that launched it.
      if (( pid <= 1 || pid == $$ || pid == PPID )); then
        continue
      fi
      if is_core_process "$cmd"; then
        continue
      fi

      log_message "准备终止进程: PID=$pid, USER=$user, CPU=$cpu%, COMMAND=$cmd"
      # Capture details before the kill; afterwards the process is gone.
      process_detail=$(ps -p "$pid" -o pid,ppid,user,group,cmd,%cpu,%mem,etime,args --no-headers)
      log_message "进程详细信息: $process_detail"

      # NOTE(review): SIGKILL gives the process no chance to clean up;
      # consider sending SIGTERM first if a grace period is acceptable.
      if kill -9 "$pid"; then
        log_message "成功终止进程: PID=$pid"
        kill_count=$((kill_count + 1))
      else
        log_message "终止进程失败: PID=$pid"
      fi
    done <<< "$top_processes"

    # Start a fresh observation window after the cleanup.
    high_cpu_count=0
    write_count "$high_cpu_count"
    log_message "清理完成,共终止 ${kill_count} 个进程"
  fi
}
# Run the main routine, forwarding any command-line arguments.
main "$@"