本文提供完整的Linux服务器性能问题排查指南,从基础监控到高级诊断,包含详细的步骤和操作代码,帮助您快速定位并解决CPU和内存飙高问题。
性能问题排查基础准备
环境准备和工具安装
在开始排查前,确保系统已安装必要的性能分析工具。
bash
#!/bin/bash
# Performance-toolchain installer.
# Installs the analysis tools used by the scripts in this guide and creates the
# /opt/performance working tree. Must be run as root because it drives the
# system package manager and writes under /opt.

if [ "$(id -u)" -ne 0 ]; then
  echo "请使用root权限运行此脚本" >&2
  exit 1
fi

# Remember a failed package install so the final message is honest.
install_status=0

# Pick the package manager from the distribution marker file.
if [ -f /etc/redhat-release ]; then
  # CentOS/RHEL
  yum update -y
  yum install -y epel-release
  # bc is included because the investigation scripts use it for float comparisons.
  yum install -y htop iotop nethogs sysstat dstat perf bpftrace bcc-tools \
    stress-ng procps-ng numactl lsof strace ltrace gdb \
    tcpdump net-tools iproute bc || install_status=$?
elif [ -f /etc/debian_version ]; then
  # Ubuntu/Debian
  apt-get update
  apt-get install -y htop iotop nethogs sysstat dstat linux-tools-common \
    linux-tools-generic bpfcc-tools stress-ng procps \
    numactl lsof strace ltrace gdb tcpdump net-tools \
    iproute2 bc || install_status=$?
else
  echo "不支持的Linux发行版" >&2
  exit 1
fi

# Create the working directories even when some packages failed: the other
# scripts only need the directory tree plus whichever tools did install.
mkdir -p /opt/performance/{scripts,logs,reports} || exit 1
cd /opt/performance || exit 1

if [ "$install_status" -ne 0 ]; then
  echo "⚠️ 部分工具安装失败 (退出码: $install_status),请检查上面的输出" >&2
  exit "$install_status"
fi
echo "✅ 性能排查工具安装完成"
基础监控脚本
创建基础性能监控脚本,用于快速获取系统状态。
bash
#!/bin/bash
# basic_monitor.sh - basic system monitoring script
# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -euo pipefail
# ANSI color escape sequences; stored as literal backslash text and rendered by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color
# Print a level-tagged, colorized message to stdout.
# Arguments: $1 - level (INFO, WARN, ERROR, DEBUG); remaining args - message text
# Globals:   GREEN, YELLOW, RED, BLUE, NC (read)
# Outputs:   one line on stdout; backslash escapes in the message are
#            interpreted because `echo -e` is used.
log() {
  local level=$1
  shift
  local message=$*
  case $level in
    "INFO") echo -e "${GREEN}[INFO]${NC} $message" ;;
    "WARN") echo -e "${YELLOW}[WARN]${NC} $message" ;;
    "ERROR") echo -e "${RED}[ERROR]${NC} $message" ;;
    "DEBUG") echo -e "${BLUE}[DEBUG]${NC} $message" ;;
    # Unknown levels are printed untinted rather than being dropped silently.
    *) echo -e "[$level] $message" ;;
  esac
}
# Print the host identity: hostname, OS name, kernel release, boot time and uptime.
# Globals: CYAN, GREEN, NC (read)
system_overview() {
  # Each probe is captured with `local x=$(...)`, so a failing probe yields an
  # empty field instead of aborting the report.
  local host_name=$(hostname)
  # Prefer the RHEL release file; otherwise take PRETTY_NAME from os-release.
  local os_name=$(cat /etc/redhat-release 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME | cut -d= -f2 | tr -d '\"')
  local kernel_release=$(uname -r)
  local booted_at=$(uptime -s)
  local up_for=$(uptime -p)

  echo -e "\n${CYAN}=== 系统概览 ===${NC}"
  echo -e "${GREEN}主机名:${NC} ${host_name}"
  echo -e "${GREEN}操作系统:${NC} ${os_name}"
  echo -e "${GREEN}内核版本:${NC} ${kernel_release}"
  echo -e "${GREEN}启动时间:${NC} ${booted_at}"
  echo -e "${GREEN}运行时间:${NC} ${up_for}"
}
# Print CPU model and core count, overall and per-core utilisation, the load
# average, per-CPU interrupt totals and the context-switch counter.
# Globals:  CYAN, GREEN, NC (read)
# Requires: top, mpstat (sysstat), a readable /proc
cpu_monitor() {
  echo -e "\n${CYAN}=== CPU监控 ===${NC}"

  local cpu_cores=$(nproc)
  local cpu_model=$(grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2 | sed 's/^ *//')
  echo -e "${GREEN}CPU型号:${NC} $cpu_model"
  echo -e "${GREEN}CPU核心数:${NC} $cpu_cores"

  # LC_ALL=C keeps the decimal point and the English labels the parsers rely on.
  # The sed expression extracts the number in front of "id" (idle) in both the
  # old ("98.0%id") and new ("98.0 id") top layouts.
  local cpu_usage=$(LC_ALL=C top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1"%"}')
  echo -e "${GREEN}CPU使用率:${NC} $cpu_usage"

  local load_avg=$(uptime | awk -F'load average:' '{print $2}')
  echo -e "${GREEN}负载平均值:${NC} $load_avg"

  echo -e "${GREEN}每个核心使用率:${NC}"
  # %idle is read as the last column instead of a fixed index, so the result
  # does not depend on how many columns this sysstat release prints.
  LC_ALL=C mpstat -P ALL 1 1 | awk '
    /Average/ && $2 != "all" && $2 != "CPU" {
      printf " CPU %s: %.1f%%\n", $2, 100 - $NF
    }'

  echo -e "${GREEN}中断统计:${NC}"
  head -1 /proc/interrupts
  # The header row names the CPUs; every following row is "label: n0 n1 ...".
  # Only the per-CPU count columns are summed; ERR/MIS rows carry a single
  # global counter and are skipped. At most five CPUs are printed.
  awk '
    NR == 1 { ncpu = NF; for (i = 1; i <= NF; i++) name[i] = $i; next }
    $1 == "ERR:" || $1 == "MIS:" { next }
    NF > ncpu {
      for (i = 1; i <= ncpu; i++)
        if ($(i + 1) ~ /^[0-9]+$/) total[i] += $(i + 1)
    }
    END {
      for (i = 1; i <= ncpu && i <= 5; i++)
        printf " %s: %.0f次\n", name[i], total[i]
    }
  ' /proc/interrupts

  local context_switches=$(grep ctxt /proc/stat | awk '{print $2}')
  echo -e "${GREEN}上下文切换:${NC} $context_switches 次"
}
# Print the memory summary from `free`, selected /proc/meminfo counters, the
# memory pressure file (when the kernel provides it) and reclaimable slab size.
# Globals: CYAN, GREEN, NC (read)
memory_monitor() {
  echo -e "\n${CYAN}=== 内存监控 ===${NC}"

  # `free -h` is printed once in full (header, Mem, Swap). Captured first so a
  # failing `free` leaves the section empty instead of aborting the report.
  local mem_info=$(free -h)
  echo -e "${GREEN}内存使用:${NC}"
  echo "$mem_info"

  echo -e "\n${GREEN}详细内存信息:${NC}"
  grep -E "MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree" /proc/meminfo | sed 's/^/ /'

  echo -e "\n${GREEN}内存压力:${NC}"
  local mem_pressure=$(cat /proc/pressure/memory 2>/dev/null || echo "无法获取内存压力信息")
  echo " $mem_pressure"

  # SReclaimable is reported in kB; fall back to 0 if the field is missing so
  # the arithmetic below cannot fail on an empty value.
  local slab_memory=$(awk '/SReclaimable/ {print $2}' /proc/meminfo)
  echo -e "${GREEN}可回收Slab内存:${NC} $((${slab_memory:-0} / 1024)) MB"
}
# Print the five processes using the most CPU, the five using the most memory,
# and any zombie processes.
# Globals: CYAN, GREEN, RED, NC (read)
process_monitor() {
  echo -e "\n${CYAN}=== 进程监控 ===${NC}"

  # awk does the row limiting itself and reads ps to the end. With
  # `set -o pipefail`, `ps | head` could make ps die of SIGPIPE on a busy host
  # and abort the whole script.
  local top5='NR==1 {printf " %-8s %-8s %-6s %-6s %s\n", "USER", "PID", "%CPU", "%MEM", "COMMAND"}
NR>1 && NR<=6 {printf " %-8s %-8s %-6.1f %-6.1f %s\n", $1, $2, $3, $4, $11}'

  echo -e "${GREEN}CPU占用Top 5进程:${NC}"
  ps aux --sort=-%cpu | awk "$top5"

  echo -e "\n${GREEN}内存占用Top 5进程:${NC}"
  ps aux --sort=-%mem | awk "$top5"

  # STAT starts with Z for zombies but usually carries modifiers (Z+, Zs, ...),
  # so match on the prefix. One snapshot feeds both the count and the listing.
  local zombies=$(ps aux | awk '$8 ~ /^Z/ {print " PID: "$2", 命令: "$11}')
  if [ -n "$zombies" ]; then
    local zombie_count=$(grep -c '' <<<"$zombies")
    echo -e "${RED}发现僵尸进程:${NC} $zombie_count 个"
    echo "$zombies"
  else
    echo -e "${GREEN}僵尸进程:${NC} 0 个"
  fi
}
# Print interface counters, the socket summary and a per-state TCP socket count.
# Globals:  CYAN, GREEN, NC (read)
# Requires: ip, ss (iproute2)
network_monitor() {
  echo -e "\n${CYAN}=== 网络监控 ===${NC}"

  echo -e "${GREEN}网络接口统计:${NC}"
  # First 12 matching lines; awk reads to the end so the producer never gets
  # SIGPIPE under `set -o pipefail`.
  ip -s link show | awk '/^[0-9]|RX|TX/ && ++shown <= 12'

  echo -e "\n${GREEN}网络连接统计:${NC}"
  ss -s | awk 'NR <= 10'

  echo -e "\n${GREEN}TCP连接状态:${NC}"
  # With a single socket type selected, ss prints no Netid column, so the state
  # is the FIRST field ($2 would be Recv-Q). -n skips name resolution.
  ss -tan | awk 'NR>1 {count[$1]++} END {for(state in count) printf " %s: %d\n", state, count[state]}'
}
# Print filesystems and inode tables that are more than 80% used, followed by
# the first lines of an extended iostat report.
# Globals: CYAN, GREEN, NC (read)
disk_monitor() {
  # Shared awk filter: always keep the header row, then rows whose fifth
  # column (the percentage) exceeds 80.
  local over_80='NR==1 {print " "$0} $5+0 > 80 {print " "$0}'

  echo -e "\n${CYAN}=== 磁盘监控 ===${NC}"

  echo -e "${GREEN}磁盘使用情况:${NC}"
  df -h | awk "$over_80"

  echo -e "\n${GREEN}inode使用情况:${NC}"
  df -i | awk "$over_80"

  echo -e "\n${GREEN}磁盘IO统计:${NC}"
  iostat -x 1 1 | head -10
}
# Entry point: print the report banner, run every monitor section in order and
# print the closing banner.
# Globals: PURPLE, NC (read)
main() {
  local divider="${PURPLE}================================${NC}"
  local section

  echo -e "${PURPLE}🖥️ Linux系统性能监控报告${NC}"
  echo -e "${PURPLE}生成时间: $(date)${NC}"
  echo -e "$divider"

  for section in system_overview cpu_monitor memory_monitor \
    process_monitor network_monitor disk_monitor; do
    "$section"
  done

  echo -e "\n$divider"
  echo -e "${PURPLE}监控报告生成完成${NC}"
}
# Run the entry point (no arguments are forwarded).
main
CPU性能问题深度排查
CPU问题排查脚本
bash
#!/bin/bash
# cpu_investigation.sh - in-depth CPU performance investigation script
# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -euo pipefail
# ANSI color escape sequences; rendered by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'
# Globals: a per-run output directory named after the start time, and the log file inside it.
INVESTIGATION_DIR="/opt/performance/investigation_$(date +%Y%m%d_%H%M%S)"
LOG_FILE="${INVESTIGATION_DIR}/cpu_investigation.log"
# Create the investigation directory, mirror all further output into the log
# file, and print the start banner.
# Globals: INVESTIGATION_DIR, LOG_FILE, CYAN, NC (read)
init_investigation() {
  mkdir -p "$INVESTIGATION_DIR"

  # From here on stdout goes through tee (appending to LOG_FILE) and stderr
  # follows stdout.
  exec > >(tee -a "$LOG_FILE") 2>&1

  local banner_line
  for banner_line in \
    "${CYAN}=== CPU性能深度排查开始 ===${NC}" \
    "调查目录: $INVESTIGATION_DIR" \
    "日志文件: $LOG_FILE" \
    "开始时间: $(date)"; do
    echo -e "$banner_line"
  done
}
# Announce an investigation step on stdout and append a timestamped [STEP]
# marker to the log file.
# Arguments: $1 - step title
# Globals:   BLUE, NC, LOG_FILE (read)
log_step() {
  local title=$1
  echo -e "\n${BLUE}>>> ${title}${NC}"
  printf '%s\n' "[STEP] ${title} - $(date)" >> "$LOG_FILE"
}
# Quick CPU triage: print current CPU utilisation and load average, and flag
# utilisation above 50%/80% or a 1-minute load above the core count.
# Globals:  GREEN, YELLOW, RED, NC (read)
# Requires: top, uptime, nproc, awk (no dependency on bc)
quick_cpu_analysis() {
  log_step "快速CPU问题识别"

  # Utilisation = 100 - idle. The idle value is located by its "id" tag rather
  # than by field position, because top glues the label to the first number
  # when it is three digits wide ("%Cpu(s):100.0 us") and shifts every field.
  local cpu_usage
  cpu_usage=$(LC_ALL=C top -bn1 | awk '
    /Cpu\(s\)/ && !seen {
      seen = 1
      line = $0
      sub(/^[^:]*:/, "", line)
      n = split(line, part, ",")
      for (i = 1; i <= n; i++) {
        if (part[i] ~ /id/) {
          gsub(/[^0-9.]/, "", part[i])
          print 100 - part[i]
          break
        }
      }
    }') || true

  if [ -z "$cpu_usage" ]; then
    echo -e "${YELLOW}⚠️ 无法获取CPU使用率${NC}"
  else
    echo -e "${GREEN}当前CPU使用率:${NC} ${cpu_usage}%"
    # Floating-point comparisons are done in awk; `exit !(cond)` maps true to 0.
    if awk -v v="$cpu_usage" 'BEGIN { exit !(v + 0 > 80) }'; then
      echo -e "${RED}⚠️ CPU使用率过高${NC}"
    elif awk -v v="$cpu_usage" 'BEGIN { exit !(v + 0 > 50) }'; then
      echo -e "${YELLOW}⚠️ CPU使用率偏高${NC}"
    else
      echo -e "${GREEN}✅ CPU使用率正常${NC}"
    fi
  fi

  local load_avg=$(uptime | awk -F'load average:' '{print $2}')
  local cpu_cores=$(nproc)
  echo -e "${GREEN}负载平均值:${NC} $load_avg"
  echo -e "${GREEN}CPU核心数:${NC} $cpu_cores"

  # The 1-minute load is the text before the first comma.
  local load1=${load_avg%%,*}
  if awk -v l="$load1" -v c="$cpu_cores" 'BEGIN { exit !(l + 0 > c + 0) }'; then
    echo -e "${RED}⚠️ 系统负载过高${NC}"
  fi
}
# Save a CPU-sorted process snapshot, print its top ten rows, then print
# details for every process in the snapshot using more than 10% CPU.
# Globals: INVESTIGATION_DIR, GREEN, YELLOW, NC (read)
detailed_process_analysis() {
  log_step "详细进程分析"

  local snapshot="${INVESTIGATION_DIR}/process_snapshot_$(date +%H%M%S).txt"
  ps aux --sort=-%cpu > "$snapshot"

  echo -e "${GREEN}CPU占用Top 10进程:${NC}"
  head -11 "$snapshot" | awk 'NR==1 {printf " %-8s %-8s %-6s %-6s %-12s %s\n", "USER", "PID", "%CPU", "%MEM", "TIME", "COMMAND"}
NR>1 {printf " %-8s %-8s %-6.1f %-6.1f %-12s %s\n", $1, $2, $3, $4, $10, $11}'

  echo -e "\n${GREEN}可疑进程分析:${NC}"
  awk '$3 > 10.0 {print $2}' "$snapshot" | while read hot_pid; do
    # Skip blank entries and the header token.
    [ -n "$hot_pid" ] || continue
    [ "$hot_pid" != "PID" ] || continue

    echo -e " ${YELLOW}分析进程 PID: $hot_pid${NC}"

    # The process may have exited since the snapshot was taken.
    local status_path="/proc/$hot_pid/status"
    [ -f "$status_path" ] || continue

    local process_name=$(ps -p "$hot_pid" -o comm= 2>/dev/null || echo "未知")
    local thread_count=$(grep "Threads" "$status_path" | awk '{print $2}')
    local cpu_affinity=$(taskset -cp "$hot_pid" 2>/dev/null | awk -F: '{print $2}' | sed 's/^ *//')
    echo -e " 进程名: $process_name"
    echo -e " 线程数: $thread_count"
    echo -e " CPU亲和性: $cpu_affinity"

    local state=$(grep "State" "$status_path" | awk '{print $2}')
    echo -e " 进程状态: $state"
  done
}
# For every process above 5% CPU, list its non-main threads above 1% CPU and,
# when gdb is available, save each such thread's backtrace.
# Globals:  INVESTIGATION_DIR, GREEN, CYAN, YELLOW, NC (read)
# Outputs:  stack_<pid>_all.txt (full gdb dump) and stack_<pid>_<tid>.txt files
# Requires: ps; gdb is optional (no dependency on bc)
thread_level_analysis() {
  log_step "线程级别分析"
  echo -e "${GREEN}高CPU进程的线程分析:${NC}"

  ps -eo pid,%cpu,comm --no-headers | awk '$2 > 5.0 {print $1}' | while read -r pid; do
    [ -d "/proc/$pid/task" ] || continue

    echo -e "\n ${CYAN}进程 $pid 的线程:${NC}"
    local process_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
    echo -e " 进程名: $process_name"

    local all_stacks="${INVESTIGATION_DIR}/stack_${pid}_all.txt"

    # A single `ps -L` lists every thread; awk drops the main thread (tid == pid,
    # already covered by the process list) and keeps threads above 1% CPU.
    # `|| true` keeps a process that exits mid-scan from aborting the run.
    ps -L -p "$pid" -o tid,pcpu,comm --no-headers 2>/dev/null \
      | awk -v pid="$pid" '$1 != pid && $2 + 0 > 1.0 {print $1, $2}' \
      | while read -r thread_id thread_cpu; do
          echo -e " ${YELLOW}线程 $thread_id: ${thread_cpu}% CPU${NC}"
          command -v gdb >/dev/null 2>&1 || continue

          # Attach gdb at most once per process; every attach stops the target.
          if [ ! -s "$all_stacks" ]; then
            gdb -ex "set pagination 0" -ex "thread apply all bt" -batch -p "$pid" \
              > "$all_stacks" 2>/dev/null || true
          fi

          # gdb identifies kernel thread ids as "LWP <tid>", not "Thread <tid>".
          local stack_file="${INVESTIGATION_DIR}/stack_${pid}_${thread_id}.txt"
          grep -E -A 20 "LWP ${thread_id}([^0-9]|\$)" "$all_stacks" > "$stack_file" 2>/dev/null || true
        done || true
  done
}
# Print run-queue statistics (sar -q), raw scheduler counters from /proc/stat,
# a per-CPU time breakdown from mpstat, and the start of /proc/softirqs.
# Globals:  GREEN, NC (read)
# Requires: sar, mpstat (sysstat)
cpu_scheduling_analysis() {
  log_step "CPU调度分析"

  echo -e "${GREEN}运行队列长度:${NC}"
  sar -q 1 3 | tail -3

  echo -e "\n${GREEN}CPU调度统计:${NC}"
  awk '/cpu|ctxt/ && ++shown <= 5' /proc/stat

  # mpstat has no run-queue column (the previous code printed %soft under that
  # label), so this section reports what mpstat actually measures. Columns are
  # looked up by header name so the layout of the installed sysstat is irrelevant.
  echo -e "\n${GREEN}每个CPU的时间分布:${NC}"
  LC_ALL=C mpstat -P ALL 1 1 | awk '
    $1 == "Average:" && $2 == "CPU" {
      for (i = 3; i <= NF; i++) col[$i] = i
      usr = ("%usr" in col) ? col["%usr"] : col["%user"]
      next
    }
    $1 == "Average:" && $2 != "all" && ("%idle" in col) {
      printf " CPU %s: usr %.1f%% sys %.1f%% iowait %.1f%% soft %.1f%% idle %.1f%%\n",
        $2, $usr, $(col["%sys"]), $(col["%iowait"]), $(col["%soft"]), $(col["%idle"])
    }'

  echo -e "\n${GREEN}软中断分布:${NC}"
  head -5 /proc/softirqs
}
# Record ten seconds of system-wide perf samples and write the first 50 lines
# of the perf report to perf_report.txt. Skipped when perf is not installed.
# Globals: INVESTIGATION_DIR, GREEN, YELLOW, NC (read)
perf_counter_analysis() {
  log_step "性能计数器分析"

  if ! command -v perf >/dev/null 2>&1; then
    echo -e "${YELLOW}perf工具未安装,跳过性能计数器分析${NC}"
    return 0
  fi

  echo -e "${GREEN}系统级性能分析:${NC}"
  local capture="${INVESTIGATION_DIR}/perf_system_$(date +%H%M%S).data"
  local summary="${INVESTIGATION_DIR}/perf_report.txt"

  # Sample in the background, then wait; `timeout` ends the recording.
  timeout 10 perf record -a -g -o "$capture" >/dev/null 2>&1 &
  local recorder=$!
  echo -e " 正在收集性能数据(10秒)..."
  wait $recorder 2>/dev/null || true

  # Nothing to report when the capture file was never created.
  [ -f "$capture" ] || return 0

  echo -e " 性能数据已保存: $capture"
  perf report -i "$capture" --stdio | head -50 > "$summary" 2>/dev/null || true
  echo -e " 性能报告已生成: $summary"
}
# Attach `strace -c` for five seconds to each of the first three processes
# above 20% CPU and print the first 20 lines of each summary.
# Globals: INVESTIGATION_DIR, GREEN, NC (read)
system_call_analysis() {
  log_step "系统调用分析"

  ps -eo pid,%cpu,comm --no-headers | awk '$2 > 20.0 {print $1}' | head -3 | while read target; do
    [ -n "$target" ] || continue
    [ -d "/proc/$target" ] || continue

    local process_name=$(ps -p "$target" -o comm=)
    echo -e "${GREEN}跟踪进程 $target ($process_name) 的系统调用:${NC}"

    local strace_file="${INVESTIGATION_DIR}/strace_${target}_$(date +%H%M%S).log"
    # Trace in the background for five seconds, then wait for the pipeline.
    timeout 5 strace -c -p "$target" 2>&1 | head -20 > "$strace_file" 2>&1 &
    local tracer=$!
    wait $tracer 2>/dev/null || true

    if [ ! -s "$strace_file" ]; then
      echo -e " 跟踪失败或没有系统调用活动"
    else
      cat "$strace_file"
      echo -e " 完整跟踪日志: $strace_file"
    fi
  done
}
# Write the CPU investigation report: system overview, warnings found in the
# log, recommendations, and the list of collected files.
# Globals: INVESTIGATION_DIR, LOG_FILE, GREEN, CYAN, NC (read)
# Outputs: ${INVESTIGATION_DIR}/cpu_investigation_report.txt
generate_report() {
  log_step "生成调查报告"
  local report_file="${INVESTIGATION_DIR}/cpu_investigation_report.txt"
  # Unquoted here-document: every $(...) below is evaluated while the report is
  # written. In the file listing, `-type f` is repeated for each -name test
  # because `-o` binds looser than the implicit AND; otherwise directories
  # matching *.log or *.data would be listed too.
  cat > "$report_file" << EOF
CPU性能问题调查报告
===================
调查时间: $(date)
调查目录: $INVESTIGATION_DIR
📊 系统概览
-----------
主机名: $(hostname)
CPU核心数: $(nproc)
当前负载: $(uptime | awk -F'load average:' '{print $2}')
CPU使用率: $(top -bn1 | grep "Cpu(s)" | awk '{print 100 - $8}')%
🔍 发现的问题
------------
$(grep -E "⚠️|ERROR|WARN" "$LOG_FILE" | head -10)
💡 建议措施
----------
1. 检查高CPU进程的合法性
2. 分析线程级别的性能问题
3. 检查应用程序配置和代码
4. 考虑系统调优参数
5. 监控系统资源使用趋势
📁 收集的文件
------------
$(find "$INVESTIGATION_DIR" -type f -name "*.txt" -o -type f -name "*.log" -o -type f -name "*.data" | while IFS= read -r file; do
echo "- $(basename "$file")"
done)
注意: 详细分析请查看各个日志文件
EOF
  echo -e "${GREEN}调查报告已生成:${NC} $report_file"
  echo -e "\n${CYAN}=== CPU性能深度排查完成 ===${NC}"
  echo -e "所有日志和报告保存在: $INVESTIGATION_DIR"
}
# Entry point: run every CPU investigation stage in order.
main() {
  local stage
  for stage in \
    init_investigation \
    quick_cpu_analysis \
    detailed_process_analysis \
    thread_level_analysis \
    cpu_scheduling_analysis \
    perf_counter_analysis \
    system_call_analysis \
    generate_report; do
    "$stage"
  done
}
# Run the entry point, forwarding the script's command-line arguments.
main "$@"
内存性能问题深度排查
bash
#!/bin/bash
# memory_investigation.sh - in-depth memory performance investigation script
# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -euo pipefail
# ANSI color escape sequences; rendered by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'
# Globals: a per-run output directory named after the start time, and the log file inside it.
INVESTIGATION_DIR="/opt/performance/memory_investigation_$(date +%Y%m%d_%H%M%S)"
LOG_FILE="${INVESTIGATION_DIR}/memory_investigation.log"
# Create the investigation directory, mirror all further output into the log
# file, and print the start banner.
# Globals: INVESTIGATION_DIR, LOG_FILE, CYAN, NC (read)
init_investigation() {
  mkdir -p "$INVESTIGATION_DIR"

  # From here on stdout goes through tee (appending to LOG_FILE) and stderr
  # follows stdout.
  exec > >(tee -a "$LOG_FILE") 2>&1

  local banner_line
  for banner_line in \
    "${CYAN}=== 内存性能深度排查开始 ===${NC}" \
    "调查目录: $INVESTIGATION_DIR" \
    "开始时间: $(date)"; do
    echo -e "$banner_line"
  done
}
# Announce an investigation step on stdout.
# Arguments: $1 - step title
# Globals:   BLUE, NC (read)
log_step() {
  local title=$1
  echo -e "\n${BLUE}>>> ${title}${NC}"
}
# Quick memory triage: print `free -h`, the used-memory percentage derived from
# the "available" column, and whether any swap is in use.
# Globals: GREEN, YELLOW, RED, NC (read)
quick_memory_analysis() {
  log_step "快速内存分析"

  echo -e "${GREEN}内存使用概况:${NC}"
  free -h

  # Row 2 of `free -m` is the Mem line: column 2 = total, column 7 = available.
  local available_mem=$(free -m | awk 'NR==2{print $7}')
  local total_mem=$(free -m | awk 'NR==2{print $2}')
  local mem_usage_percent=$(( (total_mem - available_mem) * 100 / total_mem ))
  echo -e "${GREEN}内存使用率:${NC} ${mem_usage_percent}%"

  local usage_verdict
  if [ $mem_usage_percent -gt 90 ]; then
    usage_verdict="${RED}⚠️ 内存使用率超过90%,可能存在内存压力${NC}"
  elif [ $mem_usage_percent -gt 70 ]; then
    usage_verdict="${YELLOW}⚠️ 内存使用率超过70%,需要关注${NC}"
  else
    usage_verdict="${GREEN}✅ 内存使用率正常${NC}"
  fi
  echo -e "$usage_verdict"

  # Row 3 is the Swap line: column 3 = used (MB).
  local swap_used=$(free -m | awk 'NR==3{print $3}')
  local swap_verdict="${GREEN}✅ Swap未使用${NC}"
  if [ $swap_used -gt 0 ]; then
    swap_verdict="${YELLOW}⚠️ Swap正在使用: ${swap_used}MB${NC}"
  fi
  echo -e "$swap_verdict"
}
# Save /proc/meminfo and print key counters in MB, the first lines of
# /proc/buddyinfo, and the huge-page settings.
# Globals: INVESTIGATION_DIR, GREEN, NC (read)
# Outputs: ${INVESTIGATION_DIR}/meminfo_<HHMMSS>.txt
# Returns: 0 even when the kernel reports no huge-page fields
detailed_memory_stats() {
  log_step "详细内存统计"
  local meminfo_file="${INVESTIGATION_DIR}/meminfo_$(date +%H%M%S).txt"
  cat /proc/meminfo > "$meminfo_file"

  echo -e "${GREEN}关键内存指标:${NC}"
  # Keys are matched exactly and anchored; the trailing colon is stripped so the
  # label is not followed by "::". meminfo values are in kB.
  awk '/^(MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapCached|SwapTotal|SwapFree|Slab|SReclaimable|SUnreclaim):/ {
    key = $1
    sub(/:$/, "", key)
    printf " %-20s: %8d MB\n", key, int($2 / 1024)
  }' "$meminfo_file"

  echo -e "\n${GREEN}内存碎片信息:${NC}"
  if [ -f /proc/buddyinfo ]; then
    head -5 /proc/buddyinfo
  fi

  echo -e "\n${GREEN}大页内存信息:${NC}"
  # grep exits 1 when nothing matches; as the last command that would make the
  # function fail and abort the script under `set -e`.
  grep -E "HugePages_Total|HugePages_Free|Hugepagesize" "$meminfo_file" \
    || echo " 内核未提供大页内存信息"
}
# Print the ten processes using the most memory, then for up to five processes
# above 1% memory save /proc/<pid>/smaps and print PSS and swap totals plus the
# first mapping-size lines.
# Globals: INVESTIGATION_DIR, GREEN, CYAN, NC (read)
# Outputs: ${INVESTIGATION_DIR}/maps_<pid>.txt per analysed process
process_memory_analysis() {
  log_step "进程内存分析"

  echo -e "${GREEN}内存占用Top 10进程:${NC}"
  # awk limits the rows itself and reads ps to the end, so ps cannot die of
  # SIGPIPE under `set -o pipefail`.
  ps aux --sort=-%mem | awk 'NR==1 {printf " %-8s %-8s %-6s %-6s %-8s %s\n", "USER", "PID", "%CPU", "%MEM", "RSS", "COMMAND"}
NR>1 && NR<=11 {printf " %-8s %-8s %-6.1f %-6.1f %-8s %s\n", $1, $2, $3, $4, $6, $11}'

  ps aux --sort=-%mem | awk 'NR>1 && $4 > 1.0 && ++picked <= 5 {print $2}' | while read -r pid; do
    [ -d "/proc/$pid" ] || continue

    echo -e "\n ${CYAN}分析高内存进程 PID: $pid${NC}"
    local process_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
    echo -e " 进程名: $process_name"

    [ -f "/proc/$pid/smaps" ] || continue
    local maps_file="${INVESTIGATION_DIR}/maps_${pid}.txt"
    # The process may exit, or smaps may be unreadable, between the checks.
    cat "/proc/$pid/smaps" > "$maps_file" 2>/dev/null || continue

    # Sum exactly the "Pss:" and "Swap:" fields from the saved copy. Unanchored
    # patterns also match SwapPss (and Swap lines for the PSS total), which
    # inflated both numbers.
    local pss=$(awk '/^Pss:/ {sum += $2} END {print sum + 0 " kB"}' "$maps_file")
    local swap=$(awk '/^Swap:/ {sum += $2} END {print sum + 0 " kB"}' "$maps_file")
    echo -e " 实际使用(PSS): $pss"
    echo -e " Swap使用: $swap"

    echo -e " 内存段分析:"
    grep -E "Size|Rss|Pss|Shared" "$maps_file" | awk 'NR <= 20 {print " " $0}' || true
  done
}
# Print the head of slabtop's report (when available) and the ten slab caches
# holding the most memory according to /proc/slabinfo.
# Globals: INVESTIGATION_DIR, GREEN, YELLOW, NC (read)
# Outputs: ${INVESTIGATION_DIR}/slabinfo_<HHMMSS>.txt
slab_memory_analysis() {
  log_step "Slab内存分析"

  echo -e "${GREEN}Slab内存使用:${NC}"
  if command -v slabtop >/dev/null 2>&1; then
    slabtop -o -s c | awk 'NR <= 20'
  else
    echo -e " ${YELLOW}slabtop命令不可用${NC}"
  fi

  # /proc/slabinfo is root-only on most systems; a failed copy just skips the table.
  local slabinfo_file="${INVESTIGATION_DIR}/slabinfo_$(date +%H%M%S).txt"
  cat /proc/slabinfo > "$slabinfo_file" 2>/dev/null || true

  if [ -s "$slabinfo_file" ]; then
    echo -e "\n${GREEN}Slab占用Top 10:${NC}"
    # Columns: name, active_objs, num_objs, objsize. Size is active_objs *
    # objsize (column 4; column 3 is an object count). The two header lines are
    # skipped, and rows are ranked by computed size rather than object count.
    awk 'NR > 2 && $2 ~ /^[0-9]+$/ && $4 ~ /^[0-9]+$/ {print $1, int($2 * $4 / 1024)}' "$slabinfo_file" \
      | sort -k2,2nr \
      | awk 'NR <= 10 {printf " %-20s: %8d KB\n", $1, $2}'
  fi
}
# Record a short memory-usage trend, report the change in used memory over the
# sampling window, list recent OOM-killer messages, and print memory pressure.
# Globals: INVESTIGATION_DIR, GREEN, RED, NC (read)
# Outputs: ${INVESTIGATION_DIR}/memory_trend.log (time,used,free,available in MB)
memory_leak_detection() {
  log_step "内存泄漏检测"
  echo -e "${GREEN}检查可能的内存泄漏:${NC}"

  local mem_usage_log="${INVESTIGATION_DIR}/memory_trend.log"
  local samples=5 interval=2
  local i first_used="" last_used=""
  for ((i = 1; i <= samples; i++)); do
    local timestamp=$(date '+%H:%M:%S')
    local mem_info=$(free -m | awk 'NR==2{print $3 "," $4 "," $7}')
    echo "$timestamp,$mem_info" >> "$mem_usage_log"
    last_used=${mem_info%%,*}
    if [ -z "$first_used" ]; then
      first_used=$last_used
    fi
    # No pause after the final sample; it would only delay the report.
    if ((i < samples)); then
      sleep "$interval"
    fi
  done
  echo -e " 内存趋势已记录到: $mem_usage_log"
  echo -e " 采样期间已用内存变化: $((${last_used:-0} - ${first_used:-0})) MB"

  echo -e "\n${GREEN}OOM Killer记录:${NC}"
  # Decide on the captured text, not on pipeline exit codes: the result no
  # longer depends on `pipefail`, and an unreadable dmesg simply yields no lines.
  local oom_lines
  oom_lines=$(dmesg 2>/dev/null | grep -i "killed process" | tail -5) || true
  if [ -n "$oom_lines" ]; then
    echo "$oom_lines"
    echo -e "${RED}⚠️ 检测到OOM Killer活动${NC}"
  else
    echo -e " 未发现最近的OOM Killer活动"
  fi

  echo -e "\n${GREEN}内存回收压力:${NC}"
  if [ -f /proc/pressure/memory ]; then
    cat /proc/pressure/memory
  else
    echo " 内核不支持内存压力监测"
  fi
}
# Print active swap devices, a short vmstat sample, and cumulative paging and
# swapping volumes converted to MB.
# Globals: GREEN, NC (read)
swap_analysis() {
  log_step "交换空间分析"

  echo -e "${GREEN}交换空间使用:${NC}"
  swapon --show

  echo -e "\n${GREEN}交换活动统计:${NC}"
  if command -v vmstat >/dev/null 2>&1; then
    vmstat 1 3 | tail -3
  fi

  echo -e "\n${GREEN}页面交换统计:${NC}"
  # The counters use different units: pgpgin/pgpgout are KiB, while
  # pswpin/pswpout are pages, so each pair gets its own conversion.
  local page_kb=$(( $(getconf PAGESIZE 2>/dev/null || echo 4096) / 1024 ))
  awk -v page_kb="$page_kb" '
    $1 == "pgpgin" || $1 == "pgpgout" { printf " %-15s: %8d MB\n", $1, $2 / 1024 }
    $1 == "pswpin" || $1 == "pswpout" { printf " %-15s: %8d MB\n", $1, $2 * page_kb / 1024 }
  ' /proc/vmstat
}
# Write the memory investigation report: memory overview, warnings found in the
# log, recommendations, and the list of collected files.
# Globals: INVESTIGATION_DIR, LOG_FILE, GREEN, CYAN, NC (read)
# Outputs: ${INVESTIGATION_DIR}/memory_investigation_report.txt
generate_memory_report() {
  log_step "生成内存调查报告"
  local report_file="${INVESTIGATION_DIR}/memory_investigation_report.txt"
  # Unquoted here-document: every $(...) below is evaluated while the report is
  # written. In the file listing, `-type f` is repeated for each -name test
  # because `-o` binds looser than the implicit AND; otherwise a directory
  # matching *.log would be listed too.
  cat > "$report_file" << EOF
内存性能问题调查报告
===================
调查时间: $(date)
调查目录: $INVESTIGATION_DIR
📊 内存概览
-----------
总内存: $(free -h | awk 'NR==2{print $2}')
已使用: $(free -h | awk 'NR==2{print $3}')
可用内存: $(free -h | awk 'NR==2{print $7}')
内存使用率: $(free | awk 'NR==2{printf "%.1f%%", $3/$2*100}')
🔍 发现的问题
------------
$(grep -E "⚠️|WARN" "$LOG_FILE" | head -10)
💡 建议措施
----------
1. 检查高内存进程的合理性
2. 分析内存泄漏可能性
3. 优化应用程序内存使用
4. 考虑增加物理内存
5. 调整Swappiness参数
6. 清理Slab缓存
📊 内存优化建议
---------------
1. 定期监控内存使用趋势
2. 设置适当的内存限制
3. 优化应用程序内存分配
4. 考虑使用内存压缩
5. 配置适当的内存回收策略
📁 收集的文件
------------
$(find "$INVESTIGATION_DIR" -type f -name "*.txt" -o -type f -name "*.log" | while IFS= read -r file; do
echo "- $(basename "$file")"
done)
注意: 详细分析请查看各个日志文件
EOF
  echo -e "${GREEN}内存调查报告已生成:${NC} $report_file"
  echo -e "\n${CYAN}=== 内存性能深度排查完成 ===${NC}"
  echo -e "所有日志和报告保存在: $INVESTIGATION_DIR"
}
# Entry point: run every memory investigation stage in order.
main() {
  local stage
  for stage in \
    init_investigation \
    quick_memory_analysis \
    detailed_memory_stats \
    process_memory_analysis \
    slab_memory_analysis \
    memory_leak_detection \
    swap_analysis \
    generate_memory_report; do
    "$stage"
  done
}
# Run the entry point, forwarding the script's command-line arguments.
main "$@"
高级性能诊断工具
实时性能监控面板
bash
#!/bin/bash
# performance_dashboard.sh - live performance monitoring dashboard
# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -euo pipefail
# ANSI color escape sequences; used inside printf format strings, which render them.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'
# Clear the terminal and move the cursor to the top-left corner.
clear_screen() {
  printf '\033[2J'   # erase the whole display
  printf '\033[H'    # cursor to row 1, column 1
}
# Draw the dashboard title box with the host name and current time.
# Arguments: $1 - total box width in columns (default 80)
# Globals:   PURPLE, NC (read)
show_header() {
  local width=${1:-80}
  local inner=$((width - 2))

  # Pre-build the two repeated fillers: a run of "=" for the borders and a run
  # of spaces for the empty rows.
  local blank rule
  printf -v blank '%*s' "$inner" ''
  rule=${blank// /=}

  printf "${PURPLE}"
  printf '╔%s╗\n' "$rule"
  printf '║%s║\n' "$blank"
  printf "║ 🚀 Linux实时性能监控面板 %*s║\n" $((width-40)) ""
  printf '║%s║\n' "$blank"
  printf "║ 主机: %-20s 时间: %-19s║\n" "$(hostname)" "$(date '+%Y-%m-%d %H:%M:%S')"
  printf '║%s║\n' "$blank"
  printf '╚%s╝\n' "$rule"
  printf "${NC}\n"
}
# Draw the CPU panel: utilisation, load average, core count, total interrupts
# and context switches since boot.
# Arguments: $1 - panel width in columns
# Globals:   CYAN, BLUE, GREEN, YELLOW, NC (read)
show_cpu_section() {
  local width=$1

  # Build the horizontal rule with parameter expansion. `tr ' ' '─'` cannot be
  # used: tr works on bytes, and "─" is three bytes in UTF-8, so it emitted a
  # single invalid byte per column.
  local rule
  printf -v rule '%*s' $((width - 2)) ''
  rule=${rule// /─}

  printf "${CYAN}🖥️ CPU监控${NC}\n"
  printf "${BLUE}┌%s┐${NC}\n" "$rule"

  # Utilisation = 100 - idle; the idle value is found by its "id" tag so the
  # parse survives top gluing the label to a three-digit first value.
  local cpu_usage=$(LC_ALL=C top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{printf "%.1f", 100 - $1}')
  printf "${BLUE}│${NC} %-15s: ${GREEN}%5.1f%%${NC} %*s ${BLUE}│${NC}\n" "使用率" "${cpu_usage:-0}" $((width-30)) ""

  local load_avg=$(uptime | awk -F'load average:' '{print $2}' | sed 's/^ *//')
  local cpu_cores=$(nproc)
  printf "${BLUE}│${NC} %-15s: ${YELLOW}%s${NC} %*s ${BLUE}│${NC}\n" "负载" "$load_avg" $((width-35-${#load_avg})) ""
  printf "${BLUE}│${NC} %-15s: ${CYAN}%d 核心${NC} %*s ${BLUE}│${NC}\n" "CPU核心" "$cpu_cores" $((width-28)) ""

  # Field 2 of the "intr" line in /proc/stat is the total interrupt count
  # (previously the number of cpu lines was shown under this label).
  local interrupts=$(awk '/^intr/ {print $2}' /proc/stat)
  local context_switches=$(awk '/^ctxt/ {print $2}' /proc/stat)
  printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "中断" "${interrupts:-0}" $((width-28)) ""
  printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "上下文切换" "${context_switches:-0}" $((width-33)) ""

  printf "${BLUE}└%s┘${NC}\n" "$rule"
  printf "\n"
}
# Draw the memory panel: total/used/available memory, a coloured usage bar and
# swap usage (when swap is configured).
# Arguments: $1 - panel width in columns
# Globals:   CYAN, BLUE, GREEN, YELLOW, RED, NC (read)
show_memory_section() {
  local width=$1

  # Multi-byte glyphs are produced with parameter expansion; byte-oriented
  # `tr ' ' '─'` would write only the first byte of each glyph.
  local rule
  printf -v rule '%*s' $((width - 2)) ''
  rule=${rule// /─}

  printf "${CYAN}💾 内存监控${NC}\n"
  printf "${BLUE}┌%s┐${NC}\n" "$rule"

  # Row 2 of `free -m` is the Mem line: total, used, ..., available.
  local mem_total=$(free -m | awk 'NR==2{print $2}')
  local mem_used=$(free -m | awk 'NR==2{print $3}')
  local mem_available=$(free -m | awk 'NR==2{print $7}')
  local mem_usage_percent=0
  if [ "${mem_total:-0}" -gt 0 ]; then
    mem_usage_percent=$((mem_used * 100 / mem_total))
  fi
  printf "${BLUE}│${NC} %-15s: ${GREEN}%'d MB${NC} %*s ${BLUE}│${NC}\n" "总内存" "$mem_total" $((width-30)) ""
  printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d MB${NC} %*s ${BLUE}│${NC}\n" "已使用" "$mem_used" $((width-30)) ""
  printf "${BLUE}│${NC} %-15s: ${GREEN}%'d MB${NC} %*s ${BLUE}│${NC}\n" "可用内存" "$mem_available" $((width-32)) ""
  printf "${BLUE}│${NC} %-15s: " "使用率"

  # 20-cell progress bar: filled cells for the used share, dots for the rest.
  local bar_width=20
  local filled=$((mem_usage_percent * bar_width / 100))
  local empty=$((bar_width - filled))
  local bar_fill bar_rest
  printf -v bar_fill '%*s' "$filled" ''
  printf -v bar_rest '%*s' "$empty" ''
  bar_fill=${bar_fill// /■}
  bar_rest=${bar_rest// /·}

  if [ $mem_usage_percent -gt 90 ]; then
    printf "${RED}"
  elif [ $mem_usage_percent -gt 70 ]; then
    printf "${YELLOW}"
  else
    printf "${GREEN}"
  fi
  printf "%3d%% [" "$mem_usage_percent"
  printf '%s%s' "$bar_fill" "$bar_rest"
  printf "] %*s ${BLUE}│${NC}\n" $((width-42)) ""

  # Row 3 is the Swap line; it is only shown when swap exists.
  local swap_total=$(free -m | awk 'NR==3{print $2}')
  local swap_used=$(free -m | awk 'NR==3{print $3}')
  if [ "${swap_total:-0}" -gt 0 ]; then
    printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d/%'d MB${NC} %*s ${BLUE}│${NC}\n" "Swap" "$swap_used" "$swap_total" $((width-40)) ""
  fi

  printf "${BLUE}└%s┘${NC}\n" "$rule"
  printf "\n"
}
# Draw the disk panel: up to three filesystems above 50% usage and the %util of
# the busiest block device.
# Arguments: $1 - panel width in columns
# Globals:   CYAN, BLUE, GREEN, YELLOW, RED, NC (read)
# Note:      the iostat sample takes about one second.
show_disk_section() {
  local width=$1

  # Multi-byte border glyphs are built with parameter expansion because
  # byte-oriented `tr` cannot emit them.
  local rule
  printf -v rule '%*s' $((width - 2)) ''
  rule=${rule// /─}

  printf "${CYAN}💽 磁盘监控${NC}\n"
  printf "${BLUE}┌%s┐${NC}\n" "$rule"

  # -P keeps every filesystem on a single line even when device names are long.
  df -hP | awk 'NR>1 && $5+0 > 50 && ++shown <= 3 {print $1,$2,$3,$4,$5,$6}' | \
    while read -r fs size used avail percent mount; do
      local percent_num=${percent%\%}
      printf "${BLUE}│${NC} %-15s: " "$mount"
      if [ $percent_num -gt 90 ]; then
        printf "${RED}"
      elif [ $percent_num -gt 70 ]; then
        printf "${YELLOW}"
      else
        printf "${GREEN}"
      fi
      printf "%-5s %-4s %-4s %3s %*s ${BLUE}│${NC}\n" "$used" "/" "$size" "$percent" $((width-45)) ""
    done

  if command -v iostat >/dev/null 2>&1; then
    # %util only exists in the extended (-x) report; plain `iostat -d` ends in a
    # throughput column. Two reports are requested and only the last one (the
    # live one-second interval) is used; the first is the since-boot average.
    # The column is located by header name and the maximum across devices is shown.
    local io_util=$(LC_ALL=C iostat -dx 1 2 | awk '
      /%util/ { for (i = 1; i <= NF; i++) if ($i == "%util") col = i; max = 0; next }
      col && NF >= col && $col + 0 > max { max = $col + 0 }
      END { printf "%.1f", max }')
    printf "${BLUE}│${NC} %-15s: ${CYAN}%5.1f%%${NC} %*s ${BLUE}│${NC}\n" "IO使用率" "${io_util:-0}" $((width-30)) ""
  fi

  printf "${BLUE}└%s┘${NC}\n" "$rule"
  printf "\n"
}
# 进程监控部分
show_process_section() {
local width=$1
printf "${CYAN}🔄 进程监控${NC}\n"
printf "${BLUE}┌%*s┐${NC}\n" $((width-2)) "" | tr ' ' '─'
# CPU占用最高的进程
printf "${BLUE}│${NC} ${GREEN}%-6s ${YELLOW}%-8s ${CYAN}%-6s ${NC}%-20s %*s ${BLUE}│${NC}\n" "CPU%" "PID" "MEM%" "进程名" $((width-50)) ""
printf "${BLUE}│%*s│${NC}\n" $((width-2)) "" | tr ' ' '─'
ps aux --sort=-%cpu | awk 'NR>1 && NR<=6 {
printf "│ %-6.1f %-8s %-6.1f %-20s %*s │\n",
$3, $2, $4, substr($11,1,18), '"$((width-45))"', ""
}' | while read line; do
echo -e "${BLUE}$line${NC}"
done
printf "${BLUE}└%*s┘${NC}\n" $((width-2)) "" | tr ' ' '─'
printf "\n"
}
# Draw the network panel: TCP socket counts and byte counters of the interface
# that carries the default route.
# Arguments: $1 - panel width in columns
# Globals:   CYAN, BLUE, GREEN, YELLOW, NC (read)
show_network_section() {
  local width=$1

  # Multi-byte border glyphs are built with parameter expansion because
  # byte-oriented `tr` cannot emit them.
  local rule
  printf -v rule '%*s' $((width - 2)) ''
  rule=${rule// /─}

  printf "${CYAN}🌐 网络监控${NC}\n"
  printf "${BLUE}┌%s┐${NC}\n" "$rule"

  # One ss snapshot yields both numbers; the header row is excluded from the
  # total (it used to be counted as a connection). -n skips name resolution.
  local counts=$(ss -tan | awk 'NR > 1 { total++; if ($1 ~ /^ESTAB/) estab++ } END { print total + 0, estab + 0 }')
  local tcp_connections=${counts% *}
  local established_connections=${counts#* }
  printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "TCP连接" "${tcp_connections:-0}" $((width-28)) ""
  printf "${BLUE}│${NC} %-15s: ${GREEN}%'d${NC} %*s ${BLUE}│${NC}\n" "已建立" "${established_connections:-0}" $((width-28)) ""

  # Interface of the first default route, if there is one.
  local interface=$(ip route | awk '/default/ && !seen++ {print $5}')
  if [ -n "$interface" ]; then
    local rx_bytes=$(cat "/sys/class/net/$interface/statistics/rx_bytes" 2>/dev/null || echo 0)
    local tx_bytes=$(cat "/sys/class/net/$interface/statistics/tx_bytes" 2>/dev/null || echo 0)
    printf "${BLUE}│${NC} %-15s: ${CYAN}%s${NC} %*s ${BLUE}│${NC}\n" "活动接口" "$interface" $((width-31)) ""
    printf "${BLUE}│${NC} %-15s: ${GREEN}%'d${NC} %*s ${BLUE}│${NC}\n" "接收字节" "$rx_bytes" $((width-31)) ""
    printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d${NC} %*s ${BLUE}│${NC}\n" "发送字节" "$tx_bytes" $((width-31)) ""
  fi

  printf "${BLUE}└%s┘${NC}\n" "$rule"
  printf "\n"
}
# Draw the load panel: 1-minute load, core count and a load level derived from
# load-per-core.
# Arguments: $1 - panel width in columns
# Globals:   CYAN, BLUE, GREEN, YELLOW, RED, NC (read)
show_load_visualization() {
  local width=$1

  # Multi-byte border glyphs are built with parameter expansion because
  # byte-oriented `tr` cannot emit them.
  local rule
  printf -v rule '%*s' $((width - 2)) ''
  rule=${rule// /─}

  printf "${CYAN}📊 系统负载可视化${NC}\n"
  printf "${BLUE}┌%s┐${NC}\n" "$rule"

  local load1=$(uptime | awk -F'load average:' '{print $2}' | awk -F, '{print $1}' | sed 's/ //g')
  local cpu_cores=$(nproc)
  printf "${BLUE}│${NC} %-20s: ${CYAN}%s${NC} %*s ${BLUE}│${NC}\n" "1分钟负载" "$load1" $((width-35)) ""
  printf "${BLUE}│${NC} %-20s: ${CYAN}%d${NC} %*s ${BLUE}│${NC}\n" "CPU核心数" "$cpu_cores" $((width-33)) ""

  # Load per core as an integer percentage: 100 means one runnable task per core.
  local load_value=$(echo "$load1" | awk '{print int($1*100)}')
  local load_percent=$((load_value / cpu_cores))
  printf "${BLUE}│${NC} %-20s: " "负载水平"
  if [ $load_percent -gt 150 ]; then
    printf "${RED}严重超载 🔴${NC}"
  elif [ $load_percent -gt 100 ]; then
    printf "${RED}超载 🟠${NC}"
  elif [ $load_percent -gt 70 ]; then
    printf "${YELLOW}较高 🟡${NC}"
  elif [ $load_percent -gt 30 ]; then
    printf "${GREEN}正常 🟢${NC}"
  else
    printf "${GREEN}较低 🔵${NC}"
  fi
  printf " %*s ${BLUE}│${NC}\n" $((width-40)) ""

  printf "${BLUE}└%s┘${NC}\n" "$rule"
  printf "\n"
}
# Dashboard loop: redraw every panel, then sleep for the refresh interval,
# until interrupted with Ctrl+C.
# Globals: COLUMNS (read, default 80), PURPLE, GREEN, NC (read)
main() {
  local refresh_rate=3
  local terminal_width=${COLUMNS:-80}

  # Never draw narrower than 80 columns.
  if [ $terminal_width -lt 80 ]; then
    terminal_width=80
  fi

  # Ctrl+C clears the screen and exits cleanly.
  trap 'clear_screen; echo -e "\n${GREEN}监控已退出${NC}"; exit 0' INT

  local panel
  while :; do
    clear_screen
    for panel in show_header show_cpu_section show_memory_section \
      show_disk_section show_process_section show_network_section \
      show_load_visualization; do
      "$panel" $terminal_width
    done

    printf "${PURPLE}按 Ctrl+C 退出监控 | 刷新间隔: ${refresh_rate}秒${NC}\n"
    sleep $refresh_rate
  done
}
# Run the entry point, forwarding the script's command-line arguments.
main "$@"
性能问题排查流程图
以下图表展示了完整的性能问题排查流程:
flowchart TD
A[性能告警] --> B{识别问题类型}
B -->|CPU飙高| C[CPU问题排查]
B -->|内存飙高| D[内存问题排查]
B -->|IO问题| E[磁盘IO排查]
B -->|网络问题| F[网络问题排查]
C --> C1[快速系统检查]
C1 --> C2[进程级分析]
C2 --> C3[线程级分析]
C3 --> C4[性能剖析]
C4 --> C5[系统调用分析]
C5 --> C6[生成解决方案]
D --> D1[内存使用分析]
D1 --> D2[进程内存分析]
D2 --> D3[Slab缓存分析]
D3 --> D4[内存泄漏检测]
D4 --> D5[交换空间分析]
D5 --> D6[内存优化方案]
E --> E1[磁盘使用检查]
E1 --> E2[IO性能分析]
E2 --> E3[文件系统检查]
E3 --> E4[IO调度分析]
E4 --> E5[存储优化]
F --> F1[网络连接分析]
F1 --> F2[带宽使用检查]
F2 --> F3[网络错误分析]
F3 --> F4[连接追踪]
F4 --> F5[网络优化]
C6 --> G[实施解决方案]
D6 --> G
E5 --> G
F5 --> G
G --> H[监控改进效果]
H --> I{问题解决?}
I -->|是| J[问题解决]
I -->|否| K[重新分析]
K --> B
style A fill:#e74c3c,color:#fff
style J fill:#27ae60,color:#fff
style C fill:#3498db,color:#fff
style D fill:#9b59b6,color:#fff
style E fill:#e67e22,color:#fff
style F fill:#34495e,color:#fff
应急处理和进程管理
紧急进程管理脚本
bash
#!/bin/bash
# emergency_process_manager.sh - emergency process management script
# Strict mode: abort on errors, on unset variables and on failing pipeline stages.
set -euo pipefail
# ANSI color escape sequences; rendered by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'
# Print a timestamped, level-tagged message and append it to the audit log.
# Arguments: $1 - level (free text, e.g. INFO or ERROR); remaining args - message
# Globals:   EMERGENCY_LOG_FILE (read, optional) - log path, defaults to
#            /var/log/emergency_process.log; YELLOW, NC (read)
# Returns:   0 even when the log file cannot be written; logging must not abort
#            a kill operation under `set -e`.
log() {
  local level=$1
  shift
  local message=$*
  local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
  local log_file=${EMERGENCY_LOG_FILE:-/var/log/emergency_process.log}
  local line="[$timestamp] [$level] $message"

  echo -e "$line"
  # Best-effort append: warn on stderr instead of failing when the file is not
  # writable (for example when not running as root).
  if ! { echo -e "$line" >> "$log_file"; } 2>/dev/null; then
    echo -e "${YELLOW}警告: 无法写入日志文件 $log_file${NC}" >&2
  fi
}
# Print the ten processes using the most CPU and the ten using the most memory.
# Globals: CYAN, YELLOW, NC (read)
show_process_list() {
  echo -e "\n${CYAN}=== 当前高资源占用进程 ===${NC}"

  # awk limits the rows itself (NR<=11) and reads ps to the end. With
  # `set -o pipefail`, `ps | head` could make ps die of SIGPIPE on a host with
  # many processes and abort the whole menu.
  echo -e "${YELLOW}🖥️ CPU占用Top 10:${NC}"
  ps aux --sort=-%cpu | awk 'NR==1 {printf " %-8s %-8s %-6s %-6s %-12s %s\n", "USER", "PID", "%CPU", "%MEM", "TIME", "COMMAND"}
NR>1 && NR<=11 {printf " %-8s %-8s %-6.1f %-6.1f %-12s %s\n", $1, $2, $3, $4, $10, $11}'

  echo -e "\n${YELLOW}💾 内存占用Top 10:${NC}"
  ps aux --sort=-%mem | awk 'NR==1 {printf " %-8s %-8s %-6s %-6s %-8s %s\n", "USER", "PID", "%CPU", "%MEM", "RSS", "COMMAND"}
NR>1 && NR<=11 {printf " %-8s %-8s %-6.1f %-6.1f %-8s %s\n", $1, $2, $3, $4, $6, $11}'
}
# Report processes above 50% CPU (with details read from /proc) and processes
# above 10% memory.
# Globals: CYAN, RED, GREEN, NC (read)
analyze_suspicious_processes() {
  echo -e "\n${CYAN}=== 可疑进程分析 ===${NC}"

  # One "pid user cpu command" line per process above the CPU threshold.
  local cpu_hogs=$(ps aux --sort=-%cpu | awk 'NR>1 && $3 > 50.0 {print $2, $1, $3, $11}')
  if [ -z "$cpu_hogs" ]; then
    echo -e "${GREEN}✅ 未发现异常高CPU占用进程${NC}"
  else
    echo -e "${RED}⚠️ 发现高CPU占用进程:${NC}"
    echo "$cpu_hogs" | while read hog_pid hog_user hog_cpu hog_cmd; do
      echo -e " ${RED}PID: $hog_pid, 用户: $hog_user, CPU: ${hog_cpu}%, 命令: $hog_cmd${NC}"
      # Details are only available while the process still exists.
      if [ -d "/proc/$hog_pid" ]; then
        local process_name=$(ps -p "$hog_pid" -o comm= 2>/dev/null || echo "未知")
        # `ps -L` prints a header line plus one line per thread.
        local thread_count=$(ps -L -p "$hog_pid" | wc -l)
        local start_time=$(ps -p "$hog_pid" -o lstart= 2>/dev/null || echo "未知")
        echo -e " 进程名: $process_name"
        echo -e " 线程数: $((thread_count - 1))"
        echo -e " 启动时间: $start_time"
        local exe_path=$(readlink "/proc/$hog_pid/exe" 2>/dev/null || echo "无法访问")
        echo -e " 可执行文件: $exe_path"
      fi
      echo ""
    done
  fi

  # One "pid user mem command" line per process above the memory threshold.
  local mem_hogs=$(ps aux --sort=-%mem | awk 'NR>1 && $4 > 10.0 {print $2, $1, $4, $11}')
  if [ -z "$mem_hogs" ]; then
    echo -e "${GREEN}✅ 未发现异常高内存占用进程${NC}"
  else
    echo -e "${RED}⚠️ 发现高内存占用进程:${NC}"
    echo "$mem_hogs" | while read hog_pid hog_user hog_mem hog_cmd; do
      echo -e " ${RED}PID: $hog_pid, 用户: $hog_user, 内存: ${hog_mem}%, 命令: $hog_cmd${NC}"
    done
  fi
}
# Terminate one process: validate the PID, ask for confirmation when it looks
# like a critical system process, send the signal, wait up to ten seconds, and
# escalate to SIGKILL if it is still present.
# Arguments: $1 - PID (digits only); $2 - signal name or number (default TERM)
# Globals:   RED, GREEN, YELLOW, NC (read)
# Returns:   0 when the process is gone, 1 on invalid input, user
#            cancellation, or failure to signal/terminate
safely_kill_process() {
  local pid=${1:-}
  local signal=${2:-TERM}

  # Reject empty or non-numeric input up front; an empty PID would otherwise
  # pass the /proc check because "/proc/" itself is a directory.
  if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
    echo -e "${RED}错误: 无效的PID: $pid${NC}"
    return 1
  fi
  if [ ! -d "/proc/$pid" ]; then
    echo -e "${RED}错误: 进程 $pid 不存在${NC}"
    return 1
  fi

  local process_info
  process_info=$(ps -p "$pid" -o user,pid,%cpu,%mem,comm --no-headers 2>/dev/null) || true
  if [ -z "$process_info" ]; then
    echo -e "${RED}错误: 无法获取进程 $pid 的信息${NC}"
    return 1
  fi
  echo -e "${YELLOW}准备终止进程:${NC}"
  echo -e " 进程信息: $process_info"

  # Ask before touching PID 1 or anything whose name contains a critical name.
  local process_name=$(echo "$process_info" | awk '{print $5}')
  local critical_processes=("systemd" "init" "kernel" "sshd" "bash" "sh")
  local is_critical=0 critical confirm=""
  if [ "$pid" -eq 1 ]; then
    is_critical=1
  fi
  for critical in "${critical_processes[@]}"; do
    if [[ "$process_name" == *"$critical"* ]]; then
      is_critical=1
      break
    fi
  done
  if [ "$is_critical" -eq 1 ]; then
    echo -e "${RED}警告: 进程 $process_name 可能是关键系统进程,终止可能导致系统不稳定${NC}"
    # End of input counts as "no".
    read -r -p "是否继续? (y/N): " confirm || confirm=""
    if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
      echo -e "${GREEN}操作已取消${NC}"
      return 1
    fi
  fi

  echo -e "${YELLOW}发送 $signal 信号到进程 $pid...${NC}"
  if ! kill -s "$signal" "$pid" 2>/dev/null; then
    echo -e "${RED}❌ 无法发送信号到进程 $pid${NC}"
    log "ERROR" "无法发送信号到进程 $pid"
    return 1
  fi
  echo -e "${GREEN}✅ 信号发送成功${NC}"

  local wait_time=0
  local max_wait=10
  while [ -d "/proc/$pid" ] && [ $wait_time -lt $max_wait ]; do
    sleep 1
    # Plain assignment: `((wait_time++))` returns status 1 when the old value is
    # 0, which terminates the script under `set -e`.
    wait_time=$((wait_time + 1))
    echo -e " 等待进程退出... ($wait_time/$max_wait 秒)"
  done

  if [ -d "/proc/$pid" ]; then
    echo -e "${RED}进程未正常退出,发送KILL信号...${NC}"
    # The process may exit between the check and the kill; that is not an error.
    kill -KILL "$pid" 2>/dev/null || true
    sleep 1
  fi

  if [ ! -d "/proc/$pid" ]; then
    echo -e "${GREEN}✅ 进程已成功终止${NC}"
    log "INFO" "进程 $pid ($process_name) 已被终止"
  else
    echo -e "${RED}❌ 无法终止进程 $pid${NC}"
    log "ERROR" "无法终止进程 $pid"
    return 1
  fi
}
# Interactively select processes by CPU threshold, memory threshold, command
# name or user, list them, and terminate them after confirmation.
# Globals: CYAN, YELLOW, RED, GREEN, NC (read)
# Returns: 0 normally (including "nothing matched" and cancellation),
#          1 on an invalid menu choice or invalid input
batch_kill_processes() {
  echo -e "\n${CYAN}=== 批量进程管理 ===${NC}"

  echo -e "${YELLOW}选择终止条件:${NC}"
  echo " 1) CPU使用率 > 指定百分比"
  echo " 2) 内存使用率 > 指定百分比"
  echo " 3) 指定进程名"
  echo " 4) 指定用户"

  local option="" threshold="" process_name="" username="" confirm=""
  local processes=""
  # This script's own shell and PID 1 are never selected.
  local self=$$
  local number_re='^[0-9]+([.][0-9]+)?$'

  read -r -p "请选择选项 (1-4): " option || true
  case $option in
    1)
      read -r -p "请输入CPU使用率阈值 (例如: 50): " threshold || true
      if [[ ! "$threshold" =~ $number_re ]]; then
        echo -e "${RED}无效的阈值: $threshold${NC}"
        return 1
      fi
      processes=$(ps aux --sort=-%cpu | awk -v threshold="$threshold" -v self="$self" \
        'NR>1 && $2 != self && $2 != 1 && $3 + 0 > threshold + 0 {print $2, $1, $3, $11}') || true
      ;;
    2)
      read -r -p "请输入内存使用率阈值 (例如: 10): " threshold || true
      if [[ ! "$threshold" =~ $number_re ]]; then
        echo -e "${RED}无效的阈值: $threshold${NC}"
        return 1
      fi
      processes=$(ps aux --sort=-%mem | awk -v threshold="$threshold" -v self="$self" \
        'NR>1 && $2 != self && $2 != 1 && $4 + 0 > threshold + 0 {print $2, $1, $4, $11}') || true
      ;;
    3)
      read -r -p "请输入进程名: " process_name || true
      # An empty name would match every process.
      if [ -z "$process_name" ]; then
        echo -e "${RED}进程名不能为空${NC}"
        return 1
      fi
      # index() is a literal substring match; the input is not treated as a regex.
      processes=$(ps aux | awk -v name="$process_name" -v self="$self" \
        'NR>1 && $2 != self && $2 != 1 && index($11, name) > 0 {print $2, $1, $3, $11}') || true
      ;;
    4)
      read -r -p "请输入用户名: " username || true
      processes=$(ps aux | awk -v user="$username" -v self="$self" \
        'NR>1 && $2 != self && $2 != 1 && $1 == user {print $2, $1, $3, $11}') || true
      ;;
    *)
      echo -e "${RED}无效选项${NC}"
      return 1
      ;;
  esac

  if [ -z "$processes" ]; then
    echo -e "${GREEN}没有找到匹配的进程${NC}"
    return 0
  fi

  local pid user usage command
  echo -e "\n${YELLOW}找到以下进程:${NC}"
  while read -r pid user usage command; do
    echo -e " PID: $pid, 用户: $user, 资源: $usage%, 命令: $command"
  done <<<"$processes"

  read -r -p "是否终止这些进程? (y/N): " confirm || true
  if [[ "$confirm" =~ ^[Yy]$ ]]; then
    # The list is fed on fd 3 so stdin stays free for safely_kill_process's own
    # confirmation prompt; on stdin the prompt would swallow the next entry.
    # `|| true` lets the batch continue after one failure.
    while read -r pid user usage command <&3; do
      echo -e "\n${YELLOW}处理进程 $pid...${NC}"
      safely_kill_process "$pid" || true
    done 3<<<"$processes"
  else
    echo -e "${GREEN}操作已取消${NC}"
  fi
}
# Interactively place a process into a dedicated cgroup with a CPU or memory
# limit. Handles both the legacy per-controller layout (cgroup v1) and the
# unified hierarchy (cgroup v2).
# Globals: CGROUP_ROOT (read, optional) - cgroup mount point, default /sys/fs/cgroup
#          CYAN, YELLOW, RED, GREEN, NC (read)
# Returns: 1 when the cgroup filesystem is unavailable, 0 otherwise (input
#          errors are reported on screen)
# NOTE(review): on systemd-managed hosts a transient unit (systemd-run) may be
# the preferred way to apply limits; this writes to the cgroup tree directly.
set_process_limits() {
  echo -e "\n${CYAN}=== 进程资源限制 ===${NC}"

  local cgroup_root=${CGROUP_ROOT:-/sys/fs/cgroup}
  if [ ! -d "$cgroup_root" ]; then
    echo -e "${RED}错误: cgroups不可用${NC}"
    return 1
  fi

  # The unified (v2) hierarchy exposes cgroup.controllers at its root.
  local unified=0
  if [ -f "$cgroup_root/cgroup.controllers" ]; then
    unified=1
  fi

  echo -e "${YELLOW}设置进程资源限制:${NC}"
  echo " 1) CPU限制"
  echo " 2) 内存限制"
  echo " 3) IO限制"

  local option="" pid="" cpu_limit="" mem_limit="" cgroup_dir
  read -r -p "请选择选项 (1-3): " option || true
  case $option in
    1)
      read -r -p "请输入要限制的PID: " pid || true
      read -r -p "请输入CPU使用率限制 (例如: 50): " cpu_limit || true
      if [[ ! "$pid" =~ ^[0-9]+$ ]] || [ ! -d "/proc/$pid" ]; then
        echo -e "${RED}错误: 进程 $pid 不存在${NC}"
        return 0
      fi
      # Values go into shell arithmetic, so only positive integers are accepted.
      if [[ ! "$cpu_limit" =~ ^[1-9][0-9]*$ ]]; then
        echo -e "${RED}错误: 无效的CPU限制: $cpu_limit${NC}"
        return 0
      fi
      # Percent of one core -> quota in microseconds per 100000us period.
      local cpu_quota=$((cpu_limit * 1000))
      if [ "$unified" -eq 1 ]; then
        cgroup_dir="$cgroup_root/limit_$pid"
        mkdir -p "$cgroup_dir"
        # Best effort: the controller may already be enabled for child groups.
        echo "+cpu" 2>/dev/null > "$cgroup_root/cgroup.subtree_control" || true
        echo "$cpu_quota 100000" > "$cgroup_dir/cpu.max"
      else
        cgroup_dir="$cgroup_root/cpu/limit_$pid"
        mkdir -p "$cgroup_dir"
        echo "$cpu_quota" > "$cgroup_dir/cpu.cfs_quota_us"
        echo "100000" > "$cgroup_dir/cpu.cfs_period_us"
      fi
      echo "$pid" > "$cgroup_dir/cgroup.procs"
      echo -e "${GREEN}✅ 已设置进程 $pid 的CPU限制为 ${cpu_limit}%${NC}"
      ;;
    2)
      read -r -p "请输入要限制的PID: " pid || true
      read -r -p "请输入内存限制 (MB): " mem_limit || true
      if [[ ! "$pid" =~ ^[0-9]+$ ]] || [ ! -d "/proc/$pid" ]; then
        echo -e "${RED}错误: 进程 $pid 不存在${NC}"
        return 0
      fi
      if [[ ! "$mem_limit" =~ ^[1-9][0-9]*$ ]]; then
        echo -e "${RED}错误: 无效的内存限制: $mem_limit${NC}"
        return 0
      fi
      local mem_bytes=$((mem_limit * 1024 * 1024))
      if [ "$unified" -eq 1 ]; then
        cgroup_dir="$cgroup_root/limit_$pid"
        mkdir -p "$cgroup_dir"
        echo "+memory" 2>/dev/null > "$cgroup_root/cgroup.subtree_control" || true
        echo "$mem_bytes" > "$cgroup_dir/memory.max"
      else
        cgroup_dir="$cgroup_root/memory/limit_$pid"
        mkdir -p "$cgroup_dir"
        echo "$mem_bytes" > "$cgroup_dir/memory.limit_in_bytes"
      fi
      echo "$pid" > "$cgroup_dir/cgroup.procs"
      echo -e "${GREEN}✅ 已设置进程 $pid 的内存限制为 ${mem_limit}MB${NC}"
      ;;
    3)
      echo -e "${YELLOW}IO限制功能需要更复杂的配置,建议使用systemd或手动配置cgroup${NC}"
      ;;
    *)
      echo -e "${RED}无效选项${NC}"
      ;;
  esac
}
# Print tuning suggestions: CPU advice when utilisation exceeds 80%, memory
# advice when used memory exceeds 80%, and general advice always.
# Globals:  CYAN, YELLOW, NC (read)
# Requires: top, free, awk (no dependency on bc)
system_optimization_suggestions() {
  echo -e "\n${CYAN}=== 系统优化建议 ===${NC}"

  # Utilisation = 100 - idle; the idle value is found by its "id" tag rather
  # than by field position, because top glues the label to a three-digit first
  # value ("%Cpu(s):100.0 us") and shifts every field.
  local cpu_usage
  cpu_usage=$(LC_ALL=C top -bn1 | awk '
    /Cpu\(s\)/ && !seen {
      seen = 1
      line = $0
      sub(/^[^:]*:/, "", line)
      n = split(line, part, ",")
      for (i = 1; i <= n; i++) {
        if (part[i] ~ /id/) {
          gsub(/[^0-9.]/, "", part[i])
          print 100 - part[i]
          break
        }
      }
    }') || true
  # Floating-point comparison in awk; `exit !(cond)` maps true to status 0.
  if awk -v v="${cpu_usage:-0}" 'BEGIN { exit !(v + 0 > 80) }'; then
    echo -e "${YELLOW}🖥️ CPU优化建议:${NC}"
    echo " • 检查并优化高CPU进程"
    echo " • 考虑增加CPU核心或升级硬件"
    echo " • 调整进程优先级 (nice/renice)"
    echo " • 使用cgroups限制资源使用"
  fi

  # Used share of total memory, from the Mem row of `free`.
  local mem_usage=$(free | awk 'NR==2 && $2 > 0 {printf "%.1f", $3/$2*100}')
  if awk -v v="${mem_usage:-0}" 'BEGIN { exit !(v + 0 > 80) }'; then
    echo -e "${YELLOW}💾 内存优化建议:${NC}"
    echo " • 检查内存泄漏"
    echo " • 优化应用程序内存使用"
    echo " • 增加物理内存"
    echo " • 调整swappiness参数"
    echo " • 清理缓存: echo 3 > /proc/sys/vm/drop_caches"
  fi

  echo -e "${YELLOW}🔧 通用优化建议:${NC}"
  echo " • 定期更新系统和软件"
  echo " • 监控系统资源使用趋势"
  echo " • 配置适当的监控告警"
  echo " • 优化应用程序配置"
  echo " • 考虑使用负载均衡"
}
# Interactive main menu: show the options, run the chosen action, and repeat
# until the user selects exit or input ends.
# Globals: PURPLE, GREEN, YELLOW, BLUE, CYAN, RED, NC (read)
main_menu() {
  local choice pid
  while true; do
    clear
    echo -e "${PURPLE}================================${NC}"
    echo -e "${PURPLE} Linux紧急进程管理器${NC}"
    echo -e "${PURPLE}================================${NC}"
    echo -e "${GREEN}1) 显示进程列表${NC}"
    echo -e "${GREEN}2) 分析可疑进程${NC}"
    echo -e "${YELLOW}3) 终止单个进程${NC}"
    echo -e "${YELLOW}4) 批量终止进程${NC}"
    echo -e "${BLUE}5) 设置进程资源限制${NC}"
    echo -e "${CYAN}6) 系统优化建议${NC}"
    echo -e "${RED}7) 退出${NC}"
    echo -e "${PURPLE}================================${NC}"

    # End of input (Ctrl+D or a closed pipe) leaves the menu instead of looping.
    if ! read -r -p "请选择操作 (1-7): " choice; then
      echo ""
      exit 0
    fi

    # Every action ends in `|| true`: under `set -e` a non-zero return (invalid
    # PID, cancelled operation, ...) would otherwise terminate the whole menu.
    case $choice in
      1) show_process_list || true ;;
      2) analyze_suspicious_processes || true ;;
      3)
        pid=""
        read -r -p "请输入要终止的PID: " pid || true
        safely_kill_process "$pid" || true
        ;;
      4) batch_kill_processes || true ;;
      5) set_process_limits || true ;;
      6) system_optimization_suggestions || true ;;
      7)
        echo -e "${GREEN}再见!${NC}"
        exit 0
        ;;
      *)
        echo -e "${RED}无效选择,请重新输入${NC}"
        ;;
    esac

    echo -e "\n按Enter键继续..."
    read -r || true
  done
}
# Start the interactive menu.
main_menu
总结
本文提供了完整的Linux服务器性能问题排查解决方案,包含:
核心排查能力
- 实时监控: 全面的系统性能监控面板
- 深度诊断: CPU、内存、IO、网络的详细分析
- 进程管理: 安全的进程终止和资源限制
- 问题定位: 从系统级别到线程级别的精准定位
实用工具集
- 基础监控脚本: 快速获取系统状态
- 深度排查工具: 性能计数器、系统调用分析
- 应急处理: 紧急情况下的进程管理
- 优化建议: 基于实际情况的调优建议
关键排查流程
- 快速识别: 使用监控面板快速定位问题类型
- 深度分析: 使用专业工具进行根本原因分析
- 安全处理: 在了解影响的前提下处理问题进程
- 持续优化: 实施长期优化方案防止问题复发
希望本文能帮助运维人员快速响应和解决性能问题。