背景:
1. 生产环境某个应用的CPU偶尔出现瞬时冲高,每次持续2-5分钟,运维人员来不及立即定位原因
2. 该应用运行在pod中,尚未接入集成监控系统,需要自行实现监测
3. pod中不允许安装crontab工具,无法使用定时任务,因此改用常驻循环脚本
脚本内容如下:
#!/bin/bash
#
# CPU spike watcher for a pod without cron or integrated monitoring.
#
# Every POLL_INTERVAL seconds the overall CPU figure is read from `top`.
# When it exceeds CPU_THRESHOLD, the first five java rows reported by top are
# inspected: for each one the ids of its first five listed threads (as hex
# "nid=0x..." lines) and a `jstack -l` dump are appended to
# "<pid>-<timestamp>.log" in the current directory.
#
# `set -e` is deliberately not used: this is a long-running watcher and a
# transient failure of top/jstack (e.g. the process exited between samples)
# must not terminate it.

readonly CPU_THRESHOLD=60 # percent; a sample above this triggers a dump
readonly POLL_INTERVAL=10 # seconds; the periodic log message also says "10s"

#######################################
# Print the first numeric figure on the third line of `top` batch output.
# Presumably this is the user-space CPU percentage of procps top — confirm
# on the target image. Text up to the colon is stripped first so the value is
# found even when the label and number are not separated by a space.
# Outputs: the number on stdout; nothing if top produced no third line.
#######################################
read_cpu_usage() {
  LC_ALL=C top -bn 1 | awk 'NR == 3 { sub(/^[^:]*:/, ""); print $1 + 0 }'
}

#######################################
# Numeric "greater than" that works for decimals.
# `expr A \> B` silently falls back to string comparison when an operand is
# not an integer (so "7.5" > "60" would be true); awk compares numerically.
# Arguments: $1 - value, $2 - limit
# Returns:   0 if value > limit, 1 otherwise (including non-numeric value)
#######################################
cpu_exceeds() {
  local value=$1 limit=$2
  [[ "$value" =~ ^[0-9]+([.][0-9]+)?$ ]] || return 1
  awk -v v="$value" -v l="$limit" 'BEGIN { exit !(v + 0 > l + 0) }'
}

#######################################
# Append thread ids and a jstack dump of one process to its log file.
# Arguments: $1 - process id, $2 - CPU figure top reported for the process
# Outputs:   one summary line on stdout; "<pid>-<timestamp>.log" is appended
#######################################
dump_process() {
  local pid=$1 proc_cpu=$2
  local file tid
  local -a tids

  # Rows 8-12 of `top -H` batch output are the first five thread rows after
  # the summary/header lines. Only all-digit ids are kept, so the word
  # splitting below is intentional and safe.
  # shellcheck disable=SC2207
  tids=($(LC_ALL=C top -bn 1 -H -p "$pid" \
    | awk 'NR >= 8 && NR <= 12 && $1 ~ /^[0-9]+$/ { print $1 }'))

  file="${pid}-$(date '+%Y%m%d%H%M%S').log"
  echo "print process $pid, tids=${tids[*]}, cpu=$proc_cpu, file=$file"

  {
    # jstack prints native thread ids as hex "nid=0x..."; writing the ids in
    # the same form lets them be searched for directly in the dump below.
    for tid in "${tids[@]}"; do
      printf 'nid=0x%x\n' "$tid"
    done
    jstack -l "$pid"
  } >> "$file" || echo "jstack failed or $file is not writable (pid $pid)" >&2
}

#######################################
# Dump the first five rows of top output that mention "java".
# Assumes top's default column layout (PID in column 1, %CPU in column 9).
# Outputs: the collected "pid cpu" pairs on one line, then one line per dump
#######################################
dump_java_processes() {
  local -a pacpu
  local i

  # Flat list of "pid cpu pid cpu ..." pairs; rows without a numeric pid or
  # with fewer than nine columns are skipped so the pairs cannot misalign.
  # shellcheck disable=SC2207
  pacpu=($(LC_ALL=C top -bn 1 \
    | awk '/java/ && NF >= 9 && $1 ~ /^[0-9]+$/ && n < 5 { print $1, $9; n++ }'))
  echo "${pacpu[*]}"

  for ((i = 0; i + 1 < ${#pacpu[@]}; i += 2)); do
    dump_process "${pacpu[i]}" "${pacpu[i + 1]}"
  done
}

#######################################
# Sample CPU usage forever and trigger dumps on a surge.
#######################################
main() {
  local cpu

  while true; do
    cpu=$(read_cpu_usage)
    echo "$(date '+%Y%m%d%H%M%S'): 间隔10s检索CPU情况,CPU=$cpu"
    if cpu_exceeds "$cpu" "$CPU_THRESHOLD"; then
      echo "cpu surge, find process, cpu=$cpu"
      dump_java_processes
    fi
    sleep "$POLL_INTERVAL"
  done
}

main "$@"