性能排查必看!当Linux服务器CPU/内存飙高,如何快速定位并"干掉"罪魁祸首进程?

本文提供完整的Linux服务器性能问题排查指南,从基础监控到高级诊断,包含详细的步骤和操作代码,帮助您快速定位并解决CPU和内存飙高问题。

性能问题排查基础准备

环境准备和工具安装

在开始排查前,确保系统已安装必要的性能分析工具。

bash 复制代码
#!/bin/bash
# Installer for the performance-troubleshooting toolchain.
# Detects the distribution family, installs the analysis tools and creates the
# /opt/performance working tree. Any failing step aborts the script, so the
# success banner is only printed after everything really succeeded.

set -euo pipefail

# Report where the installation stopped; errexit then terminates the script
trap 'echo "❌ 安装失败 (第 ${LINENO} 行)" >&2' ERR

# Package installation and writing under /opt both require root
if [ "$(id -u)" -ne 0 ]; then
    echo "请以root用户运行此脚本" >&2
    exit 1
fi

# Detect the distribution family
if [ -f /etc/redhat-release ]; then
    # CentOS/RHEL
    yum update -y
    yum install -y epel-release
    yum install -y htop iotop nethogs sysstat dstat perf bpftrace bcc-tools \
                   stress-ng procps-ng numactl lsof strace ltrace gdb \
                   tcpdump net-tools iproute
elif [ -f /etc/debian_version ]; then
    # Ubuntu/Debian
    apt-get update
    apt-get install -y htop iotop nethogs sysstat dstat linux-tools-common \
                      linux-tools-generic bpfcc-tools stress-ng procps \
                      numactl lsof strace ltrace gdb tcpdump net-tools \
                      iproute2
else
    echo "不支持的Linux发行版" >&2
    exit 1
fi

# Create the working directories used by the investigation scripts
mkdir -p /opt/performance/{scripts,logs,reports}
cd /opt/performance

echo "✅ 性能排查工具安装完成"

基础监控脚本

创建基础性能监控脚本,用于快速获取系统状态。

bash 复制代码
#!/bin/bash
# basic_monitor.sh - basic system monitoring script

set -euo pipefail

# Colour definitions: ANSI escape sequences kept as literal backslash text;
# they are expanded by `echo -e` at print time.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Logging helper: print a colourised, level-tagged message to stdout.
# Globals:   GREEN, YELLOW, RED, BLUE, NC (read)
# Arguments: $1  - level: INFO, WARN, ERROR or DEBUG; any other level is
#                  printed without a colour instead of being dropped
#            $2… - message words (joined with single spaces)
# Outputs:   "[LEVEL] message" on stdout
log() {
    local level=$1
    shift
    local message=$*
    local tint

    case $level in
        "INFO")  tint=$GREEN ;;
        "WARN")  tint=$YELLOW ;;
        "ERROR") tint=$RED ;;
        "DEBUG") tint=$BLUE ;;
        *)       tint='' ;;
    esac

    echo -e "${tint}[${level}]${NC} $message"
}

# System overview: print hostname, OS release, kernel version, boot time and
# time since boot, one labelled line each.
# Globals: CYAN, GREEN, NC (read)
system_overview() {
    # RHEL-style release file first, otherwise PRETTY_NAME from os-release
    local os_release=$(cat /etc/redhat-release 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME | cut -d= -f2 | tr -d '\"')

    printf '%b\n' \
        "\n${CYAN}=== 系统概览 ===${NC}" \
        "${GREEN}主机名:${NC} $(hostname)" \
        "${GREEN}操作系统:${NC} ${os_release}" \
        "${GREEN}内核版本:${NC} $(uname -r)" \
        "${GREEN}启动时间:${NC} $(uptime -s)" \
        "${GREEN}运行时间:${NC} $(uptime -p)"
}

# CPU monitoring: print CPU model, core count, overall utilisation, load
# average, per-core utilisation, per-CPU interrupt totals and the total
# context-switch count.
# Globals: CYAN, GREEN, YELLOW, NC (read)
# Outputs: the report on stdout
cpu_monitor() {
    echo -e "\n${CYAN}=== CPU监控 ===${NC}"

    # Basic CPU information
    local cpu_cores=$(nproc)
    local cpu_model=$(grep -m1 "model name" /proc/cpuinfo | cut -d: -f2 | sed 's/^ *//')
    echo -e "${GREEN}CPU型号:${NC} $cpu_model"
    echo -e "${GREEN}CPU核心数:${NC} $cpu_cores"

    # Overall utilisation = 100 - idle. LC_ALL=C keeps labels and the decimal
    # separator in the form the sed expression expects.
    local cpu_usage=$(LC_ALL=C top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1"%"}')
    echo -e "${GREEN}CPU使用率:${NC} $cpu_usage"

    # Load average
    local load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}')
    echo -e "${GREEN}负载平均值:${NC} $load_avg"

    # Per-core utilisation. The %idle column is located from the header row
    # because its position differs between sysstat versions.
    echo -e "${GREEN}每个核心使用率:${NC}"
    if command -v mpstat >/dev/null 2>&1; then
        LC_ALL=C mpstat -P ALL 1 1 | awk '
            /^Average/ && $2 == "CPU" {
                for (i = 3; i <= NF; i++) if ($i == "%idle") idle = i
                next
            }
            /^Average/ && $2 != "all" && idle {
                printf "  CPU %s: %.1f%%\n", $2, 100 - $idle
            }'
    else
        echo -e "  ${YELLOW}mpstat命令不可用${NC}"
    fi

    # Interrupt totals per CPU (first five CPUs). The header row has one
    # column per CPU; every IRQ row is "label count... description".
    echo -e "${GREEN}中断统计:${NC}"
    head -n 1 /proc/interrupts
    awk '
        NR == 1 { ncpu = NF; next }
        NF > ncpu && $1 !~ /^(ERR|MIS):$/ {
            for (i = 1; i <= ncpu; i++) sum[i] += $(i + 1)
        }
        END {
            for (i = 1; i <= ncpu && i <= 5; i++) printf "  CPU%d: %d次\n", i - 1, sum[i]
        }' /proc/interrupts

    # Context switches since boot
    local context_switches=$(awk '/^ctxt / {print $2}' /proc/stat)
    echo -e "${GREEN}上下文切换:${NC} $context_switches 次"
}

# Memory monitoring: print the `free -h` table, selected /proc/meminfo fields,
# memory pressure (PSI) and the reclaimable slab size.
# Globals: CYAN, GREEN, NC (read)
# Outputs: the report on stdout
memory_monitor() {
    echo -e "\n${CYAN}=== 内存监控 ===${NC}"

    # Print the table as-is: filtering on "Mem:"/"Swap:" duplicated the Mem
    # row and matches nothing when `free` prints localized labels.
    echo -e "${GREEN}内存使用:${NC}"
    free -h

    # Selected /proc/meminfo fields, indented
    echo -e "\n${GREEN}详细内存信息:${NC}"
    grep -E "MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree" /proc/meminfo \
        | sed 's/^/  /' || true

    # Memory pressure; the file is absent or unreadable when PSI is disabled
    echo -e "\n${GREEN}内存压力:${NC}"
    sed 's/^/  /' /proc/pressure/memory 2>/dev/null || echo "  无法获取内存压力信息"

    # Reclaimable slab, kB -> MB (0 when the field is missing)
    local slab_memory=$(awk '/SReclaimable/ {print $2}' /proc/meminfo)
    echo -e "${GREEN}可回收Slab内存:${NC} $(( ${slab_memory:-0} / 1024 )) MB"
}

# Process monitoring: print the top five processes by CPU and by memory, then
# list zombie processes.
# Globals: CYAN, GREEN, RED, NC (read)
# Outputs: the report on stdout
process_monitor() {
    echo -e "\n${CYAN}=== 进程监控 ===${NC}"

    # Header + first five rows. awk does the row limiting itself so `ps` is
    # never cut off by a closed pipe (which fails the script under pipefail).
    local top5='
        NR == 1 { printf "  %-8s %-8s %-6s %-6s %s\n", "USER", "PID", "%CPU", "%MEM", "COMMAND"; next }
        NR <= 6 { printf "  %-8s %-8s %-6.1f %-6.1f %s\n", $1, $2, $3, $4, $11 }'

    # Top CPU consumers
    echo -e "${GREEN}CPU占用Top 5进程:${NC}"
    ps aux --sort=-%cpu | awk "$top5"

    # Top memory consumers
    echo -e "\n${GREEN}内存占用Top 5进程:${NC}"
    ps aux --sort=-%mem | awk "$top5"

    # Zombies: STAT starts with Z but usually carries modifiers (Z+, Zs, ...)
    local zombies=$(ps aux | awk '$8 ~ /^Z/ {print "  PID: "$2", 命令: "$11}')
    if [ -n "$zombies" ]; then
        local zombie_count=$(grep -c '' <<< "$zombies")
        echo -e "${RED}发现僵尸进程:${NC} $zombie_count 个"
        echo "$zombies"
    else
        echo -e "${GREEN}僵尸进程:${NC} 0 个"
    fi
}

# Network monitoring: print interface counters, the socket summary and a
# count of TCP sockets per state.
# Globals: CYAN, GREEN, NC (read)
# Outputs: the report on stdout
network_monitor() {
    echo -e "\n${CYAN}=== 网络监控 ===${NC}"

    # Interface statistics (first 12 matching lines; awk reads all input so
    # the producer never sees a closed pipe)
    echo -e "${GREEN}网络接口统计:${NC}"
    ip -s link show | awk '/^[0-9]|RX|TX/ && ++n <= 12'

    # Socket summary
    echo -e "\n${GREEN}网络连接统计:${NC}"
    ss -s | awk 'NR <= 10'

    # TCP sockets per state. With a single socket type selected, ss prints no
    # Netid column, so the state is the FIRST field. -n skips name lookups.
    echo -e "\n${GREEN}TCP连接状态:${NC}"
    ss -t -a -n | awk 'NR>1 {count[$1]++} END {for(state in count) printf "  %s: %d\n", state, count[state]}'
}

# Disk monitoring: list filesystems above 80% space usage and above 80% inode
# usage (each under the df header row), then the first ten lines of an
# extended iostat sample.
# Globals: CYAN, GREEN, NC (read)
disk_monitor() {
    # Always keep the header row; keep data rows whose 5th column exceeds 80
    local over_80='NR==1 {print "  "$0} $5+0 > 80 {print "  "$0}'

    printf '%b\n' "\n${CYAN}=== 磁盘监控 ===${NC}"

    # Space usage
    printf '%b\n' "${GREEN}磁盘使用情况:${NC}"
    df -h | awk "$over_80"

    # Inode usage
    printf '%b\n' "\n${GREEN}inode使用情况:${NC}"
    df -i | awk "$over_80"

    # I/O statistics
    printf '%b\n' "\n${GREEN}磁盘IO统计:${NC}"
    iostat -x 1 1 | head -10
}

# Entry point: print the report banner, run every monitor section in order,
# then print the closing banner.
# Globals: PURPLE, NC (read)
main() {
    local rule="${PURPLE}================================${NC}"
    local section

    printf '%b\n' \
        "${PURPLE}🖥️  Linux系统性能监控报告${NC}" \
        "${PURPLE}生成时间: $(date)${NC}" \
        "$rule"

    for section in system_overview cpu_monitor memory_monitor \
                   process_monitor network_monitor disk_monitor; do
        "$section"
    done

    printf '%b\n' "\n${rule}" "${PURPLE}监控报告生成完成${NC}"
}

# Run the main function (it takes no arguments)
main

CPU性能问题深度排查

CPU问题排查脚本

bash 复制代码
#!/bin/bash
# cpu_investigation.sh - in-depth CPU performance investigation script

set -euo pipefail

# Colour definitions: ANSI escape sequences, expanded by `echo -e` at print time
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Global variables
# Per-run output directory, named with the start timestamp
INVESTIGATION_DIR="/opt/performance/investigation_$(date +%Y%m%d_%H%M%S)"
# Log file inside the per-run directory
LOG_FILE="${INVESTIGATION_DIR}/cpu_investigation.log"

# Initialisation: create the investigation directory, mirror all further
# stdout/stderr of the script into the log file through tee, and print the
# start banner.
# Globals: INVESTIGATION_DIR, LOG_FILE, CYAN, NC (read)
init_investigation() {
    mkdir -p "$INVESTIGATION_DIR"

    # stdout goes to tee (terminal + log); stderr then follows stdout
    exec > >(tee -a "$LOG_FILE") 2>&1

    printf '%b\n' \
        "${CYAN}=== CPU性能深度排查开始 ===${NC}" \
        "调查目录: $INVESTIGATION_DIR" \
        "日志文件: $LOG_FILE" \
        "开始时间: $(date)"
}

# Record an investigation step: print a highlighted step title to stdout and
# append a timestamped "[STEP]" line to the log file.
# Globals:   BLUE, NC, LOG_FILE (read)
# Arguments: $1 - step title
log_step() {
    local title=$1

    printf '%b\n' "\n${BLUE}>>> ${title}${NC}"
    printf '%s\n' "[STEP] ${title} - $(date)" >> "$LOG_FILE"
}

# Quick CPU triage: report overall CPU utilisation and the load average, and
# flag high utilisation or a 1-minute load above the core count.
# Globals: GREEN, YELLOW, RED, NC (read)
# Outputs: findings on stdout
quick_cpu_analysis() {
    log_step "快速CPU问题识别"

    # Utilisation = 100 - idle. The idle value is found through its "id"
    # label: a fixed field number breaks when top prints "ni,100.0 id"
    # without a space. LC_ALL=C pins labels and the decimal separator.
    local cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ && !seen {
            gsub(/[,%]/, " ")
            for (i = 2; i <= NF; i++)
                if ($i == "id") { printf "%.1f", 100 - $(i - 1); seen = 1 }
        }')

    if [ -z "$cpu_usage" ]; then
        echo -e "${YELLOW}⚠️  无法获取CPU使用率${NC}"
    else
        echo -e "${GREEN}当前CPU使用率:${NC} ${cpu_usage}%"

        # Floating-point comparison done in awk, so `bc` is not required
        local verdict=$(awk -v u="$cpu_usage" 'BEGIN { print (u > 80) ? "high" : (u > 50) ? "elevated" : "ok" }')
        case $verdict in
            high)     echo -e "${RED}⚠️  CPU使用率过高${NC}" ;;
            elevated) echo -e "${YELLOW}⚠️  CPU使用率偏高${NC}" ;;
            *)        echo -e "${GREEN}✅ CPU使用率正常${NC}" ;;
        esac
    fi

    # Load average and core count
    local load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}')
    local cpu_cores=$(nproc)
    echo -e "${GREEN}负载平均值:${NC} $load_avg"
    echo -e "${GREEN}CPU核心数:${NC} $cpu_cores"

    # 1-minute load compared with the number of cores
    local load1=${load_avg%%,*}
    load1=${load1// /}
    if awk -v l="$load1" -v c="$cpu_cores" 'BEGIN { exit !(l + 0 > c + 0) }'; then
        echo -e "${RED}⚠️  系统负载过高${NC}"
    fi
}

# Detailed process analysis: save a CPU-sorted `ps aux` snapshot, print its
# top ten rows, then print name, thread count, CPU affinity and state for
# every process in the snapshot using more than 10% CPU.
# Globals: INVESTIGATION_DIR, GREEN, YELLOW, NC (read)
detailed_process_analysis() {
    log_step "详细进程分析"

    # Snapshot of the current process table
    local snapshot="${INVESTIGATION_DIR}/process_snapshot_$(date +%H%M%S).txt"
    ps aux --sort=-%cpu > "$snapshot"

    printf '%b\n' "${GREEN}CPU占用Top 10进程:${NC}"
    awk '
        NR > 11 { exit }
        NR == 1 { printf "  %-8s %-8s %-6s %-6s %-12s %s\n", "USER", "PID", "%CPU", "%MEM", "TIME", "COMMAND"; next }
                { printf "  %-8s %-8s %-6.1f %-6.1f %-12s %s\n", $1, $2, $3, $4, $10, $11 }' "$snapshot"

    # Suspicious processes
    printf '%b\n' "\n${GREEN}可疑进程分析:${NC}"
    awk '$3 > 10.0 {print $2}' "$snapshot" | while read pid; do
        # Skip blanks and the header row
        [ -n "$pid" ] || continue
        [ "$pid" != "PID" ] || continue

        printf '%b\n' "  ${YELLOW}分析进程 PID: $pid${NC}"

        # Details come from /proc; the process may already be gone
        local proc_status="/proc/$pid/status"
        [ -f "$proc_status" ] || continue

        local process_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
        local thread_count=$(grep "Threads" "$proc_status" | awk '{print $2}')
        local cpu_affinity=$(taskset -cp "$pid" 2>/dev/null | awk -F: '{print $2}' | sed 's/^ *//')
        local state=$(grep "State" "$proc_status" | awk '{print $2}')

        printf '%b\n' \
            "    进程名: $process_name" \
            "    线程数: $thread_count" \
            "    CPU亲和性: $cpu_affinity" \
            "    进程状态: $state"
    done
}

# Thread-level analysis: for every process above 5% CPU, list its non-main
# threads above 1% CPU and save each such thread's gdb backtrace to
# stack_<pid>_<tid>.txt in the investigation directory.
# Globals: INVESTIGATION_DIR, GREEN, CYAN, YELLOW, NC (read)
thread_level_analysis() {
    log_step "线程级别分析"

    echo -e "${GREEN}高CPU进程的线程分析:${NC}"

    # Processes using more than 5% CPU
    ps -eo pid,%cpu,comm --no-headers | awk '$2 > 5.0 {print $1}' | while read -r pid; do
        [ -d "/proc/$pid/task" ] || continue

        echo -e "\n  ${CYAN}进程 $pid 的线程:${NC}"

        local process_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
        echo -e "    进程名: $process_name"

        # A single ps call per process; the main thread (tid == pid) is
        # skipped because it is already covered by the process list. The
        # floating-point filter runs in awk, so `bc` is not required.
        local hot_threads=$(ps -L -p "$pid" -o tid,pcpu --no-headers 2>/dev/null |
            awk -v leader="$pid" '$1 != leader && $2 > 1.0 {print $1, $2}')
        [ -n "$hot_threads" ] || continue

        # Attach gdb once per process: every attach stops the target
        local backtrace=""
        if command -v gdb >/dev/null 2>&1; then
            backtrace=$(gdb -ex "set pagination 0" -ex "thread apply all bt" -batch -p "$pid" 2>/dev/null || true)
        fi

        local thread_id thread_cpu
        while read -r thread_id thread_cpu; do
            echo -e "    ${YELLOW}线程 $thread_id: ${thread_cpu}% CPU${NC}"

            # gdb numbers threads itself; the kernel thread id only appears
            # as "(LWP <tid>)" in each thread header line.
            if [ -n "$backtrace" ]; then
                local stack_file="${INVESTIGATION_DIR}/stack_${pid}_${thread_id}.txt"
                grep -F -A 20 "(LWP $thread_id)" <<< "$backtrace" > "$stack_file" || true
            fi
        done <<< "$hot_threads"
    done
}

# CPU scheduling analysis: print run-queue samples from sar, scheduler
# counters from /proc/stat, the per-CPU soft-interrupt share from mpstat and
# the first lines of /proc/softirqs.
# Globals: GREEN, NC (read)
cpu_scheduling_analysis() {
    log_step "CPU调度分析"

    echo -e "${GREEN}运行队列长度:${NC}"
    sar -q 1 3 | tail -3

    # Aggregate CPU line plus scheduler counters. Matching every "cpuN" line
    # and truncating to five hid ctxt on hosts with four or more cores.
    echo -e "\n${GREEN}CPU调度统计:${NC}"
    grep -E "^(cpu |ctxt |processes |procs_running |procs_blocked )" /proc/stat || true

    # mpstat has no per-CPU run-queue column; the value that used to be
    # printed here under a "run queue" label was %soft. It is now located by
    # its header name and labelled as what it is.
    echo -e "\n${GREEN}每个CPU的软中断占用(%soft):${NC}"
    LC_ALL=C mpstat -P ALL 1 1 | awk '
        /^Average/ && $2 == "CPU" {
            for (i = 3; i <= NF; i++) if ($i == "%soft") col = i
            next
        }
        /^Average/ && $2 != "all" && col {
            printf "  CPU %s: %.2f%%\n", $2, $col
        }'

    # Soft-interrupt distribution
    echo -e "\n${GREEN}软中断分布:${NC}"
    head -n 5 /proc/softirqs
}

# Performance-counter analysis: record a 10-second system-wide perf profile
# with call graphs and write the first 50 lines of the text report to
# perf_report.txt. Skipped with a notice when perf is not installed.
# Globals: INVESTIGATION_DIR, GREEN, YELLOW, NC (read)
perf_counter_analysis() {
    log_step "性能计数器分析"

    if ! command -v perf >/dev/null 2>&1; then
        echo -e "${YELLOW}perf工具未安装,跳过性能计数器分析${NC}"
        return 0
    fi

    echo -e "${GREEN}系统级性能分析:${NC}"

    local perf_file="${INVESTIGATION_DIR}/perf_system_$(date +%H%M%S).data"
    local report_file="${INVESTIGATION_DIR}/perf_report.txt"

    # "-- sleep 10" lets perf bound the sampling window itself and finalise
    # the data file on its own.
    echo -e "  正在收集性能数据(10秒)..."
    perf record -a -g -o "$perf_file" -- sleep 10 >/dev/null 2>&1 || true

    if [ -f "$perf_file" ]; then
        echo -e "  性能数据已保存: $perf_file"

        # Generate the report. stderr is silenced on perf itself, and awk
        # consumes the whole stream so perf never sees a closed pipe.
        if perf report -i "$perf_file" --stdio 2>/dev/null | awk 'NR <= 50' > "$report_file"; then
            echo -e "  性能报告已生成: $report_file"
        else
            echo -e "  ${YELLOW}性能报告生成失败${NC}"
        fi
    else
        echo -e "  ${YELLOW}性能数据采集失败${NC}"
    fi
}

# System-call analysis: attach `strace -c` for five seconds to each of the
# first three processes above 20% CPU and print the first 20 lines of the
# captured output, which is also saved per process.
# Globals: INVESTIGATION_DIR, GREEN, NC (read)
system_call_analysis() {
    log_step "系统调用分析"

    # Candidates: processes above 20% CPU, at most three
    ps -eo pid,%cpu,comm --no-headers | awk '$2 > 20.0 {print $1}' | head -3 | while read pid; do
        # Ignore blanks and processes that have already exited
        if [ -z "$pid" ] || [ ! -d "/proc/$pid" ]; then
            continue
        fi

        local process_name=$(ps -p "$pid" -o comm=)
        printf '%b\n' "${GREEN}跟踪进程 $pid ($process_name) 的系统调用:${NC}"

        local strace_file="${INVESTIGATION_DIR}/strace_${pid}_$(date +%H%M%S).log"

        # Trace for five seconds in the background, then wait for it
        timeout 5 strace -c -p "$pid" 2>&1 | head -20 > "$strace_file" 2>&1 &
        wait $! 2>/dev/null || true

        if [ ! -s "$strace_file" ]; then
            printf '%b\n' "  跟踪失败或没有系统调用活动"
            continue
        fi

        cat "$strace_file"
        printf '%b\n' "  完整跟踪日志: $strace_file"
    done
}

# Generate the report: write cpu_investigation_report.txt with a system
# summary, the warnings found in the log, fixed recommendations and the list
# of collected files.
# Globals: INVESTIGATION_DIR, LOG_FILE, GREEN, CYAN, NC (read)
generate_report() {
    log_step "生成调查报告"

    local report_file="${INVESTIGATION_DIR}/cpu_investigation_report.txt"

    # Create the report first so it shows up in its own file listing
    : > "$report_file"

    # Utilisation = 100 - idle, with idle located through its "id" label
    # (a fixed field number breaks on "ni,100.0 id").
    local cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ && !seen {
            gsub(/[,%]/, " ")
            for (i = 2; i <= NF; i++)
                if ($i == "id") { printf "%.1f", 100 - $(i - 1); seen = 1 }
        }')
    local cpu_display=${cpu_usage:+${cpu_usage}%}

    # First ten warning lines from the log, with ANSI colour codes removed
    local findings=$(grep -E "⚠️|ERROR|WARN" "$LOG_FILE" 2>/dev/null |
        sed $'s/\033\\[[0-9;]*m//g' | awk 'NR <= 10')

    # The -name alternatives are grouped so that -type f applies to all of them
    local collected=$(find "$INVESTIGATION_DIR" -type f \
        \( -name "*.txt" -o -name "*.log" -o -name "*.data" \) |
        sed 's|.*/|- |' | sort)

    cat > "$report_file" << EOF
CPU性能问题调查报告
===================

调查时间: $(date)
调查目录: $INVESTIGATION_DIR

📊 系统概览
-----------
主机名: $(hostname)
CPU核心数: $(nproc)
当前负载: $(LC_ALL=C uptime | awk -F'load average:' '{print $2}')
CPU使用率: ${cpu_display:-N/A}

🔍 发现的问题
------------
$findings

💡 建议措施
----------
1. 检查高CPU进程的合法性
2. 分析线程级别的性能问题
3. 检查应用程序配置和代码
4. 考虑系统调优参数
5. 监控系统资源使用趋势

📁 收集的文件
------------
$collected

注意: 详细分析请查看各个日志文件
EOF

    echo -e "${GREEN}调查报告已生成:${NC} $report_file"
    echo -e "\n${CYAN}=== CPU性能深度排查完成 ===${NC}"
    echo -e "所有日志和报告保存在: $INVESTIGATION_DIR"
}

# Entry point: run every investigation stage in its fixed order.
main() {
    local stage
    for stage in \
        init_investigation \
        quick_cpu_analysis \
        detailed_process_analysis \
        thread_level_analysis \
        cpu_scheduling_analysis \
        perf_counter_analysis \
        system_call_analysis \
        generate_report
    do
        "$stage"
    done
}

# Run the main function, forwarding the script's command-line arguments
main "$@"

内存性能问题深度排查

bash 复制代码
#!/bin/bash
# memory_investigation.sh - in-depth memory performance investigation script

set -euo pipefail

# Colour definitions: ANSI escape sequences, expanded by `echo -e` at print time
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Global variables
# Per-run output directory, named with the start timestamp
INVESTIGATION_DIR="/opt/performance/memory_investigation_$(date +%Y%m%d_%H%M%S)"
# Log file inside the per-run directory
LOG_FILE="${INVESTIGATION_DIR}/memory_investigation.log"

# Initialisation: create the investigation directory, mirror all further
# stdout/stderr of the script into the log file through tee, and print the
# start banner.
# Globals: INVESTIGATION_DIR, LOG_FILE, CYAN, NC (read)
init_investigation() {
    mkdir -p "$INVESTIGATION_DIR"

    # stdout goes to tee (terminal + log); stderr then follows stdout
    exec > >(tee -a "$LOG_FILE") 2>&1

    printf '%b\n' \
        "${CYAN}=== 内存性能深度排查开始 ===${NC}" \
        "调查目录: $INVESTIGATION_DIR" \
        "开始时间: $(date)"
}

# Record an investigation step: print a highlighted step title preceded by a
# blank line.
# Globals:   BLUE, NC (read)
# Arguments: $1 - step title
log_step() {
    local title=$1
    printf '%b\n' "\n${BLUE}>>> ${title}${NC}"
}

# Quick memory analysis: print `free -h`, derive the usage percentage from
# the total and available columns of `free -m`, grade it (>90 / >70 /
# normal), and report whether any swap is in use.
# Globals: GREEN, YELLOW, RED, NC (read)
quick_memory_analysis() {
    log_step "快速内存分析"

    # Overview table
    printf '%b\n' "${GREEN}内存使用概况:${NC}"
    free -h

    # Usage = (total - available) / total, as an integer percentage
    local total_mem=$(free -m | awk 'NR==2{print $2}')
    local available_mem=$(free -m | awk 'NR==2{print $7}')
    local mem_usage_percent=$(( (total_mem - available_mem) * 100 / total_mem ))

    printf '%b\n' "${GREEN}内存使用率:${NC} ${mem_usage_percent}%"

    local usage_verdict
    if (( mem_usage_percent > 90 )); then
        usage_verdict="${RED}⚠️  内存使用率超过90%,可能存在内存压力${NC}"
    elif (( mem_usage_percent > 70 )); then
        usage_verdict="${YELLOW}⚠️  内存使用率超过70%,需要关注${NC}"
    else
        usage_verdict="${GREEN}✅ 内存使用率正常${NC}"
    fi
    printf '%b\n' "$usage_verdict"

    # Swap usage in MB, third row of `free -m`
    local swap_used=$(free -m | awk 'NR==3{print $3}')
    local swap_verdict="${GREEN}✅ Swap未使用${NC}"
    if (( swap_used > 0 )); then
        swap_verdict="${YELLOW}⚠️  Swap正在使用: ${swap_used}MB${NC}"
    fi
    printf '%b\n' "$swap_verdict"
}

# Detailed memory statistics: snapshot /proc/meminfo into the investigation
# directory, print key fields in MB, then show buddy-allocator and huge-page
# information.
# Globals: INVESTIGATION_DIR, GREEN, NC (read)
detailed_memory_stats() {
    log_step "详细内存统计"

    local meminfo_file="${INVESTIGATION_DIR}/meminfo_$(date +%H%M%S).txt"
    cat /proc/meminfo > "$meminfo_file"

    # Key figures, kB -> MB. The trailing ":" of each key is stripped because
    # the output format adds its own separator.
    echo -e "${GREEN}关键内存指标:${NC}"
    awk '/MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree|Slab|SReclaimable|SUnreclaim/ {
            key = $1
            sub(/:$/, "", key)
            printf "  %-20s: %8d MB\n", key, int($2 / 1024)
        }' "$meminfo_file"

    # Memory fragmentation
    echo -e "\n${GREEN}内存碎片信息:${NC}"
    if [ -f /proc/buddyinfo ]; then
        head -n 5 /proc/buddyinfo
    fi

    # Huge pages. A kernel without hugepage support has no such fields; that
    # must not fail the function (and, under errexit, the whole script).
    echo -e "\n${GREEN}大页内存信息:${NC}"
    grep -E "HugePages_Total|HugePages_Free|Hugepagesize" "$meminfo_file" \
        || echo "  未发现大页内存信息"
}

# Process memory analysis: print the top ten processes by memory, then for up
# to five processes above 1% memory save their smaps and report total PSS,
# total swap and the first per-mapping size lines.
# Globals: INVESTIGATION_DIR, GREEN, CYAN, YELLOW, NC (read)
process_memory_analysis() {
    log_step "进程内存分析"

    # Top ten by memory. awk limits the rows itself so `ps` is never cut off
    # by a closed pipe (which fails the script under pipefail).
    echo -e "${GREEN}内存占用Top 10进程:${NC}"
    ps aux --sort=-%mem | awk '
        NR == 1 { printf "  %-8s %-8s %-6s %-6s %-8s %s\n", "USER", "PID", "%CPU", "%MEM", "RSS", "COMMAND"; next }
        NR <= 11 { printf "  %-8s %-8s %-6.1f %-6.1f %-8s %s\n", $1, $2, $3, $4, $6, $11 }'

    # Per-process detail for the heaviest consumers
    ps aux --sort=-%mem | awk 'NR>1 && $4 > 1.0 && ++n <= 5 {print $2}' | while read -r pid; do
        [ -d "/proc/$pid" ] || continue

        echo -e "\n  ${CYAN}分析高内存进程 PID: $pid${NC}"

        local process_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
        echo -e "    进程名: $process_name"

        # Memory map: copy once, then analyse the copy so all figures come
        # from the same snapshot.
        local maps_file="${INVESTIGATION_DIR}/maps_${pid}.txt"
        [ -f "/proc/$pid/smaps" ] || continue
        if ! cat "/proc/$pid/smaps" > "$maps_file" 2>/dev/null; then
            echo -e "    ${YELLOW}无法读取内存映射(权限不足或进程已退出)${NC}"
            continue
        fi

        # Anchored keys: an unanchored "Pss" also matches Pss_Anon/Pss_File,
        # and "Swap" also matches SwapPss, which inflates both totals.
        local pss=$(awk '/^Pss:/ {sum += $2} END {print sum + 0 " kB"}' "$maps_file")
        local swap=$(awk '/^Swap:/ {sum += $2} END {print sum + 0 " kB"}' "$maps_file")

        echo -e "    实际使用(PSS): $pss"
        echo -e "    Swap使用: $swap"

        # First 20 size-related lines of the map
        echo -e "    内存段分析:"
        awk '/Size|Rss|Pss|Shared/ && ++n <= 20 { sub(/^[ \t]+/, ""); print "      " $0 }' "$maps_file"
    done
}

# Slab memory analysis: show the head of `slabtop` sorted by cache size, then
# the ten largest slab caches computed from /proc/slabinfo.
# Globals: INVESTIGATION_DIR, GREEN, YELLOW, NC (read)
slab_memory_analysis() {
    log_step "Slab内存分析"

    echo -e "${GREEN}Slab内存使用:${NC}"
    if command -v slabtop >/dev/null 2>&1; then
        # awk reads the whole stream so slabtop never sees a closed pipe; a
        # slabtop failure (it needs root) is reported instead of aborting.
        slabtop -o -s c | awk 'NR <= 20' \
            || echo -e "  ${YELLOW}slabtop执行失败(可能需要root权限)${NC}"
    else
        echo -e "  ${YELLOW}slabtop命令不可用${NC}"
    fi

    # Snapshot of /proc/slabinfo (readable by root only)
    local slabinfo_file="${INVESTIGATION_DIR}/slabinfo_$(date +%H%M%S).txt"
    cat /proc/slabinfo > "$slabinfo_file" 2>/dev/null || true

    if [ -s "$slabinfo_file" ]; then
        # Columns: name <active_objs> <num_objs> <objsize> ...; the first two
        # lines are headers. Size = active_objs * objsize, ranked by size.
        echo -e "\n${GREEN}Slab占用Top 10:${NC}"
        awk 'NR > 2 && NF >= 4 { printf "%s %d\n", $1, $2 * $4 / 1024 }' "$slabinfo_file" \
            | sort -k2,2nr \
            | awk 'NR <= 10 { printf "  %-20s: %8d KB\n", $1, $2 }'
    fi
}

# Memory-leak detection: sample memory usage five times at two-second
# intervals into memory_trend.log, report the change over the window, list
# recent OOM-killer events and show memory reclaim pressure.
# Globals: INVESTIGATION_DIR, GREEN, RED, NC (read)
memory_leak_detection() {
    log_step "内存泄漏检测"

    echo -e "${GREEN}检查可能的内存泄漏:${NC}"

    # Memory growth trend: used,free,available (MB) per sample
    local mem_usage_log="${INVESTIGATION_DIR}/memory_trend.log"
    local samples=5 interval=2
    local first_used="" last_used=""
    local i

    for ((i = 1; i <= samples; i++)); do
        local timestamp=$(date '+%H:%M:%S')
        local mem_info=$(free -m | awk 'NR==2{print $3 "," $4 "," $7}')
        echo "$timestamp,$mem_info" >> "$mem_usage_log"

        last_used=${mem_info%%,*}
        if [ -z "$first_used" ]; then
            first_used=$last_used
        fi

        # No pause needed after the final sample
        if ((i < samples)); then
            sleep "$interval"
        fi
    done

    echo -e "  内存趋势已记录到: $mem_usage_log"
    if [[ "$first_used" =~ ^[0-9]+$ && "$last_used" =~ ^[0-9]+$ ]]; then
        echo -e "  采样期间已用内存变化: $((last_used - first_used)) MB"
    fi

    # OOM killer activity. The decision is based on captured output, so it
    # does not depend on pipefail being active; dmesg may be denied to
    # non-root users.
    echo -e "\n${GREEN}OOM Killer记录:${NC}"
    local oom_events=$(dmesg 2>/dev/null | grep -i "killed process" | tail -5)
    if [ -n "$oom_events" ]; then
        echo "$oom_events"
        echo -e "${RED}⚠️  检测到OOM Killer活动${NC}"
    else
        echo -e "  未发现最近的OOM Killer活动"
    fi

    # Memory reclaim pressure (PSI)
    echo -e "\n${GREEN}内存回收压力:${NC}"
    if [ -f /proc/pressure/memory ]; then
        cat /proc/pressure/memory 2>/dev/null || echo "  内核不支持内存压力监测"
    else
        echo "  内核不支持内存压力监测"
    fi
}

# Swap analysis: show active swap devices, three vmstat samples and the
# paging/swapping counters from /proc/vmstat converted to MB.
# Globals: GREEN, NC (read)
swap_analysis() {
    log_step "交换空间分析"

    echo -e "${GREEN}交换空间使用:${NC}"
    swapon --show || true

    echo -e "\n${GREEN}交换活动统计:${NC}"
    if command -v vmstat >/dev/null 2>&1; then
        vmstat 1 3 | tail -3
    fi

    # Paging counters. pgpgin/pgpgout are reported in KiB, while
    # pswpin/pswpout count pages, so only the latter are scaled by the page
    # size (queried instead of assuming 4 KiB).
    echo -e "\n${GREEN}页面交换统计:${NC}"
    local page_bytes=$(getconf PAGESIZE 2>/dev/null || echo 4096)
    local page_kb=$((page_bytes / 1024))
    grep -E "^(pgpgin|pgpgout|pswpin|pswpout) " /proc/vmstat \
        | awk -v page_kb="$page_kb" '{
              kb = ($1 ~ /^pswp/) ? $2 * page_kb : $2
              printf "  %-15s: %8d MB\n", $1, kb / 1024
          }' || true
}

# Generate the memory report: write memory_investigation_report.txt with a
# memory summary, the warnings found in the log, fixed recommendations and
# the list of collected files.
# Globals: INVESTIGATION_DIR, LOG_FILE, GREEN, CYAN, NC (read)
generate_memory_report() {
    log_step "生成内存调查报告"

    local report_file="${INVESTIGATION_DIR}/memory_investigation_report.txt"

    # Create the report first so it shows up in its own file listing
    : > "$report_file"

    # First ten warning lines from the log, with ANSI colour codes removed
    local findings=$(grep -E "⚠️|WARN" "$LOG_FILE" 2>/dev/null |
        sed $'s/\033\\[[0-9;]*m//g' | awk 'NR <= 10')

    # The -name alternatives are grouped so that -type f applies to both
    local collected=$(find "$INVESTIGATION_DIR" -type f \
        \( -name "*.txt" -o -name "*.log" \) |
        sed 's|.*/|- |' | sort)

    cat > "$report_file" << EOF
内存性能问题调查报告
===================

调查时间: $(date)
调查目录: $INVESTIGATION_DIR

📊 内存概览
-----------
总内存: $(free -h | awk 'NR==2{print $2}')
已使用: $(free -h | awk 'NR==2{print $3}')
可用内存: $(free -h | awk 'NR==2{print $7}')
内存使用率: $(free | awk 'NR==2{printf "%.1f%%", $3/$2*100}')

🔍 发现的问题
------------
$findings

💡 建议措施
----------
1. 检查高内存进程的合理性
2. 分析内存泄漏可能性
3. 优化应用程序内存使用
4. 考虑增加物理内存
5. 调整Swappiness参数
6. 清理Slab缓存

📊 内存优化建议
---------------
1. 定期监控内存使用趋势
2. 设置适当的内存限制
3. 优化应用程序内存分配
4. 考虑使用内存压缩
5. 配置适当的内存回收策略

📁 收集的文件
------------
$collected

注意: 详细分析请查看各个日志文件
EOF

    echo -e "${GREEN}内存调查报告已生成:${NC} $report_file"
    echo -e "\n${CYAN}=== 内存性能深度排查完成 ===${NC}"
    echo -e "所有日志和报告保存在: $INVESTIGATION_DIR"
}

# Entry point: run every memory investigation stage in its fixed order.
main() {
    local stage
    for stage in \
        init_investigation \
        quick_memory_analysis \
        detailed_memory_stats \
        process_memory_analysis \
        slab_memory_analysis \
        memory_leak_detection \
        swap_analysis \
        generate_memory_report
    do
        "$stage"
    done
}

# Run the main function, forwarding the script's command-line arguments
main "$@"

高级性能诊断工具

实时性能监控面板

bash 复制代码
#!/bin/bash
# performance_dashboard.sh - real-time performance monitoring dashboard

set -euo pipefail

# Colour definitions: ANSI escape sequences kept as literal backslash text;
# they are expanded when used inside printf format strings or by `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'

# Clear the screen: emit ESC[2J (erase display) followed by ESC[H (cursor to
# the home position).
clear_screen() {
    printf '%b' '\033[2J' '\033[H'
}

# Show the dashboard header: a boxed title with the hostname and current time.
# Globals:   PURPLE, NC (read)
# Arguments: $1 - box width in columns (default 80)
show_header() {
    local width=${1:-80}
    local inner=$((width - 2))

    # Horizontal rule: `inner` spaces turned into "=" characters
    local rule
    printf -v rule '%*s' "$inner" ""
    rule=${rule// /=}

    printf "${PURPLE}"
    printf '╔%s╗\n' "$rule"
    printf "║%*s║\n" "$inner" ""
    printf "║   🚀 Linux实时性能监控面板 %*s║\n" $((width-40)) ""
    printf "║%*s║\n" "$inner" ""
    printf "║   主机: %-20s 时间: %-19s║\n" "$(hostname)" "$(date '+%Y-%m-%d %H:%M:%S')"
    printf "║%*s║\n" "$inner" ""
    printf '╚%s╝\n' "$rule"
    printf "${NC}\n"
}

# CPU panel: utilisation, load average, core count, total interrupts and
# context switches inside a box.
# Globals:   CYAN, BLUE, GREEN, YELLOW, NC (read)
# Arguments: $1 - panel width in columns
show_cpu_section() {
    local width=$1

    # Box rule built with parameter expansion: GNU tr works on bytes, so
    # `tr ' ' '─'` would emit only the first byte of the multibyte character.
    local rule
    printf -v rule '%*s' $((width - 2)) ''
    rule=${rule// /─}

    printf "${CYAN}🖥️  CPU监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # Utilisation = 100 - idle, with idle located through its "id" label
    # (a fixed field number breaks on "ni,100.0 id").
    local cpu_usage=$(LC_ALL=C top -bn1 | awk '
        /Cpu\(s\)/ && !seen {
            gsub(/[,%]/, " ")
            for (i = 2; i <= NF; i++)
                if ($i == "id") { printf "%.1f", 100 - $(i - 1); seen = 1 }
        }')
    printf "${BLUE}│${NC} %-15s: ${GREEN}%5.1f%%${NC} %*s ${BLUE}│${NC}\n" "使用率" "${cpu_usage:-0}" $((width-30)) ""

    # Load average
    local load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | sed 's/^ *//')
    local cpu_cores=$(nproc)
    printf "${BLUE}│${NC} %-15s: ${YELLOW}%s${NC} %*s ${BLUE}│${NC}\n" "负载" "$load_avg" $((width-35-${#load_avg})) ""
    printf "${BLUE}│${NC} %-15s: ${CYAN}%d 核心${NC} %*s ${BLUE}│${NC}\n" "CPU核心" "$cpu_cores" $((width-28)) ""

    # Interrupts and context switches since boot: the first value of the
    # "intr" line and the value of the "ctxt" line in /proc/stat.
    local interrupts=$(awk '/^intr / {print $2}' /proc/stat)
    local context_switches=$(awk '/^ctxt / {print $2}' /proc/stat)
    printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "中断" "${interrupts:-0}" $((width-28)) ""
    printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "上下文切换" "${context_switches:-0}" $((width-33)) ""

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Memory panel: total/used/available memory, a coloured usage bar and swap
# usage inside a box.
# Globals:   CYAN, BLUE, GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - panel width in columns
show_memory_section() {
    local width=$1

    # Box rule built with parameter expansion: GNU tr works on bytes, so
    # `tr ' ' '─'` would emit only the first byte of the multibyte character.
    local rule
    printf -v rule '%*s' $((width - 2)) ''
    rule=${rule// /─}

    printf "${CYAN}💾 内存监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # One `free` snapshot so all figures are mutually consistent
    local snapshot=$(free -m)
    local mem_total mem_used mem_available swap_total swap_used
    read -r mem_total mem_used mem_available <<< "$(awk 'NR==2 {print $2, $3, $7}' <<< "$snapshot")"
    read -r swap_total swap_used <<< "$(awk 'NR==3 {print $2, $3}' <<< "$snapshot")"
    mem_total=${mem_total:-0}
    mem_used=${mem_used:-0}
    mem_available=${mem_available:-0}
    swap_total=${swap_total:-0}
    swap_used=${swap_used:-0}

    local mem_usage_percent=0
    if ((mem_total > 0)); then
        mem_usage_percent=$((mem_used * 100 / mem_total))
    fi

    printf "${BLUE}│${NC} %-15s: ${GREEN}%'d MB${NC} %*s ${BLUE}│${NC}\n" "总内存" "$mem_total" $((width-30)) ""
    printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d MB${NC} %*s ${BLUE}│${NC}\n" "已使用" "$mem_used" $((width-30)) ""
    printf "${BLUE}│${NC} %-15s: ${GREEN}%'d MB${NC} %*s ${BLUE}│${NC}\n" "可用内存" "$mem_available" $((width-32)) ""
    printf "${BLUE}│${NC} %-15s: " "使用率"

    # Coloured progress bar, 20 cells wide
    local bar_width=20
    local filled=$((mem_usage_percent * bar_width / 100))
    if ((filled > bar_width)); then
        filled=$bar_width
    fi
    local empty=$((bar_width - filled))

    local bar_filled bar_empty
    printf -v bar_filled '%*s' "$filled" ''
    printf -v bar_empty '%*s' "$empty" ''
    bar_filled=${bar_filled// /■}
    bar_empty=${bar_empty// /·}

    if ((mem_usage_percent > 90)); then
        printf "${RED}"
    elif ((mem_usage_percent > 70)); then
        printf "${YELLOW}"
    else
        printf "${GREEN}"
    fi

    printf "%3d%% [%s%s] %*s ${BLUE}│${NC}\n" "$mem_usage_percent" "$bar_filled" "$bar_empty" $((width-42)) ""

    # Swap usage, shown only when swap is configured
    if ((swap_total > 0)); then
        printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d/%'d MB${NC} %*s ${BLUE}│${NC}\n" "Swap" "$swap_used" "$swap_total" $((width-40)) ""
    fi

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Disk panel: up to three filesystems above 50% usage and the busiest
# device's I/O utilisation inside a box.
# Globals:   CYAN, BLUE, GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - panel width in columns
show_disk_section() {
    local width=$1

    # Box rule built with parameter expansion: GNU tr works on bytes, so
    # `tr ' ' '─'` would emit only the first byte of the multibyte character.
    local rule
    printf -v rule '%*s' $((width - 2)) ''
    rule=${rule// /─}

    printf "${CYAN}💽 磁盘监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # Filesystems above 50% usage (first three). -P keeps each filesystem on
    # a single line; awk limits the rows itself instead of `head`.
    df -hP | awk 'NR>1 && $5+0 > 50 && ++n <= 3 {print $1,$2,$3,$4,$5,$6}' | \
    while read -r fs size used avail percent mount; do
        local percent_num=${percent%\%}
        local tint=$GREEN
        if ((percent_num > 90)); then
            tint=$RED
        elif ((percent_num > 70)); then
            tint=$YELLOW
        fi

        printf "${BLUE}│${NC} %-15s: ${tint}%-5s %-4s %-4s %3s %*s ${BLUE}│${NC}\n" \
            "$mount" "$used" "/" "$size" "$percent" $((width-45)) ""
    done

    # I/O utilisation: %util is the last column of the EXTENDED report (-x).
    # The plain -d report ends in a kB counter, which is not a percentage.
    # The busiest device is shown; a single report holds since-boot averages.
    if command -v iostat >/dev/null 2>&1; then
        local io_util=$(LC_ALL=C iostat -dx 1 1 | awk '
            $1 !~ /^Device/ && NF > 5 && $NF ~ /^[0-9.]+$/ { if ($NF + 0 > max) max = $NF + 0 }
            END { printf "%.1f", max + 0 }')
        printf "${BLUE}│${NC} %-15s: ${CYAN}%5.1f%%${NC} %*s ${BLUE}│${NC}\n" "IO使用率" "${io_util:-0}" $((width-30)) ""
    fi

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Process panel: the five processes using the most CPU, inside a box.
# Globals:   CYAN, BLUE, GREEN, YELLOW, NC (read)
# Arguments: $1 - panel width in columns
show_process_section() {
    local width=$1

    # Box rule built with parameter expansion: GNU tr works on bytes, so
    # `tr ' ' '─'` would emit only the first byte of the multibyte character.
    local rule
    printf -v rule '%*s' $((width - 2)) ''
    rule=${rule// /─}

    printf "${CYAN}🔄 进程监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # Column titles and separator row
    printf "${BLUE}│${NC} ${GREEN}%-6s ${YELLOW}%-8s ${CYAN}%-6s ${NC}%-20s %*s ${BLUE}│${NC}\n" "CPU%" "PID" "MEM%" "进程名" $((width-50)) ""
    printf "${BLUE}│%s│${NC}\n" "$rule"

    # Rows 2-6 of the CPU-sorted process table; awk only extracts the fields
    # and the shell does the formatting, so no shell value is spliced into
    # the awk program.
    ps aux --sort=-%cpu | awk 'NR>1 && NR<=6 {print $3, $2, $4, substr($11,1,18)}' | \
    while read -r cpu pid mem cmd; do
        printf "${BLUE}│ %-6.1f %-8s %-6.1f %-20s %*s │${NC}\n" \
            "$cpu" "$pid" "$mem" "$cmd" $((width-45)) ""
    done

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Network panel: TCP socket counts and byte counters of the default-route
# interface, inside a box.
# Globals:   CYAN, BLUE, GREEN, YELLOW, NC (read)
# Arguments: $1 - panel width in columns
show_network_section() {
    local width=$1

    # Box rule built with parameter expansion: GNU tr works on bytes, so
    # `tr ' ' '─'` would emit only the first byte of the multibyte character.
    local rule
    printf -v rule '%*s' $((width - 2)) ''
    rule=${rule// /─}

    printf "${CYAN}🌐 网络监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # Socket counts from a single ss call. The header row is excluded (it
    # used to be counted as a connection); -n skips name resolution.
    local tcp_connections established_connections
    read -r tcp_connections established_connections <<< "$(ss -t -a -n | awk '
        NR > 1 { total++; if ($1 == "ESTAB") estab++ }
        END { print total + 0, estab + 0 }')"

    printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "TCP连接" "${tcp_connections:-0}" $((width-28)) ""
    printf "${BLUE}│${NC} %-15s: ${GREEN}%'d${NC} %*s ${BLUE}│${NC}\n" "已建立" "${established_connections:-0}" $((width-28)) ""

    # Interface carrying the default route
    local interface=$(ip route | awk '/default/ {print $5}' | head -1)
    if [ -n "$interface" ]; then
        local rx_bytes=$(cat "/sys/class/net/$interface/statistics/rx_bytes" 2>/dev/null || echo 0)
        local tx_bytes=$(cat "/sys/class/net/$interface/statistics/tx_bytes" 2>/dev/null || echo 0)

        printf "${BLUE}│${NC} %-15s: ${CYAN}%s${NC} %*s ${BLUE}│${NC}\n" "活动接口" "$interface" $((width-31)) ""
        printf "${BLUE}│${NC} %-15s: ${GREEN}%'d${NC} %*s ${BLUE}│${NC}\n" "接收字节" "$rx_bytes" $((width-31)) ""
        printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d${NC} %*s ${BLUE}│${NC}\n" "发送字节" "$tx_bytes" $((width-31)) ""
    fi

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Load panel: the 1-minute load, the core count and a graded load level
# (load per core as a percentage), inside a box.
# Globals:   CYAN, BLUE, GREEN, YELLOW, RED, NC (read)
# Arguments: $1 - panel width in columns
show_load_visualization() {
    local width=$1

    # Box rule built with parameter expansion: GNU tr works on bytes, so
    # `tr ' ' '─'` would emit only the first byte of the multibyte character.
    local rule
    printf -v rule '%*s' $((width - 2)) ''
    rule=${rule// /─}

    printf "${CYAN}📊 系统负载可视化${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # 1-minute load; LC_ALL=C pins the label and the decimal separator
    local load1=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk -F, '{print $1}' | sed 's/ //g')
    local cpu_cores=$(nproc)

    printf "${BLUE}│${NC} %-20s: ${CYAN}%s${NC} %*s ${BLUE}│${NC}\n" "1分钟负载" "$load1" $((width-35)) ""
    printf "${BLUE}│${NC} %-20s: ${CYAN}%d${NC} %*s ${BLUE}│${NC}\n" "CPU核心数" "$cpu_cores" $((width-33)) ""

    # Load per core as an integer percentage
    local load_value=$(echo "$load1" | awk '{print int($1*100)}')
    local load_percent=$((load_value / cpu_cores))

    printf "${BLUE}│${NC} %-20s: " "负载水平"

    if ((load_percent > 150)); then
        printf "${RED}严重超载 🔴${NC}"
    elif ((load_percent > 100)); then
        printf "${RED}超载 🟠${NC}"
    elif ((load_percent > 70)); then
        printf "${YELLOW}较高 🟡${NC}"
    elif ((load_percent > 30)); then
        printf "${GREEN}正常 🟢${NC}"
    else
        printf "${GREEN}较低 🔵${NC}"
    fi

    printf " %*s ${BLUE}│${NC}\n" $((width-40)) ""
    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Main monitoring loop: redraw every dashboard panel every three seconds
# until interrupted with Ctrl+C.
# Globals: COLUMNS (read, default 80), PURPLE, GREEN, NC (read)
main() {
    local refresh_rate=3
    local terminal_width=${COLUMNS:-80}
    local panel

    # The layout needs at least 80 columns
    if [ $terminal_width -lt 80 ]; then
        terminal_width=80
    fi

    # Leave cleanly on Ctrl+C
    trap 'clear_screen; echo -e "\n${GREEN}监控已退出${NC}"; exit 0' INT

    while true; do
        clear_screen

        for panel in show_header show_cpu_section show_memory_section \
                     show_disk_section show_process_section \
                     show_network_section show_load_visualization; do
            "$panel" $terminal_width
        done

        # Footer
        printf "${PURPLE}按 Ctrl+C 退出监控 | 刷新间隔: ${refresh_rate}秒${NC}\n"

        sleep $refresh_rate
    done
}

# Run the main function, forwarding the script's command-line arguments
main "$@"

性能问题排查流程图

以下图表展示了完整的性能问题排查流程:

flowchart TD
    A[性能告警] --> B{识别问题类型}
    B -->|CPU飙高| C[CPU问题排查]
    B -->|内存飙高| D[内存问题排查]
    B -->|IO问题| E[磁盘IO排查]
    B -->|网络问题| F[网络问题排查]

    C --> C1[快速系统检查]
    C1 --> C2[进程级分析]
    C2 --> C3[线程级分析]
    C3 --> C4[性能剖析]
    C4 --> C5[系统调用分析]
    C5 --> C6[生成解决方案]

    D --> D1[内存使用分析]
    D1 --> D2[进程内存分析]
    D2 --> D3[Slab缓存分析]
    D3 --> D4[内存泄漏检测]
    D4 --> D5[交换空间分析]
    D5 --> D6[内存优化方案]

    E --> E1[磁盘使用检查]
    E1 --> E2[IO性能分析]
    E2 --> E3[文件系统检查]
    E3 --> E4[IO调度分析]
    E4 --> E5[存储优化]

    F --> F1[网络连接分析]
    F1 --> F2[带宽使用检查]
    F2 --> F3[网络错误分析]
    F3 --> F4[连接追踪]
    F4 --> F5[网络优化]

    C6 --> G[实施解决方案]
    D6 --> G
    E5 --> G
    F5 --> G

    G --> H[监控改进效果]
    H --> I{问题解决?}
    I -->|是| J[问题解决]
    I -->|否| K[重新分析]
    K --> B

    style A fill:#e74c3c,color:#fff
    style J fill:#27ae60,color:#fff
    style C fill:#3498db,color:#fff
    style D fill:#9b59b6,color:#fff
    style E fill:#e67e22,color:#fff
    style F fill:#34495e,color:#fff

应急处理和进程管理

紧急进程管理脚本

bash 复制代码
#!/bin/bash
# emergency_process_manager.sh - emergency process management script

set -euo pipefail

# Colour definitions: ANSI escape sequences, expanded by `echo -e` at print time
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'

# Logging helper: print a timestamped, level-tagged message to stdout and
# append the same line to the log file.
# Globals:   EMERGENCY_LOG_FILE (read, optional) - log path; defaults to
#            /var/log/emergency_process.log
# Arguments: $1  - level tag (free text, e.g. INFO)
#            $2… - message words (joined with single spaces)
# Outputs:   "[timestamp] [level] message" on stdout
log() {
    local level=$1
    shift
    local message=$*
    local timestamp=$(date '+%Y-%m-%d %H:%M:%S')
    local log_file=${EMERGENCY_LOG_FILE:-/var/log/emergency_process.log}
    local line="[$timestamp] [$level] $message"

    echo -e "$line"

    # Best effort: an unwritable log file (e.g. when not running as root)
    # must not abort an emergency tool through errexit/pipefail.
    { echo -e "$line" >> "$log_file"; } 2>/dev/null || true
}

# Show the process list: the ten processes using the most CPU and the ten
# using the most memory.
# Globals: CYAN, YELLOW, NC (read)
# Outputs: two formatted tables on stdout
show_process_list() {
    echo -e "\n${CYAN}=== 当前高资源占用进程 ===${NC}"

    # awk limits the rows itself and reads all of its input. Cutting `ps`
    # with `head` closes the pipe early, and the resulting SIGPIPE makes the
    # pipeline fail (and the script exit) under `set -eo pipefail`.

    # Top 10 by CPU
    echo -e "${YELLOW}🖥️  CPU占用Top 10:${NC}"
    ps aux --sort=-%cpu | awk '
        NR == 1 { printf "  %-8s %-8s %-6s %-6s %-12s %s\n", "USER", "PID", "%CPU", "%MEM", "TIME", "COMMAND"; next }
        NR <= 11 { printf "  %-8s %-8s %-6.1f %-6.1f %-12s %s\n", $1, $2, $3, $4, $10, $11 }'

    # Top 10 by memory
    echo -e "\n${YELLOW}💾 内存占用Top 10:${NC}"
    ps aux --sort=-%mem | awk '
        NR == 1 { printf "  %-8s %-8s %-6s %-6s %-8s %s\n", "USER", "PID", "%CPU", "%MEM", "RSS", "COMMAND"; next }
        NR <= 11 { printf "  %-8s %-8s %-6.1f %-6.1f %-8s %s\n", $1, $2, $3, $4, $6, $11 }'
}

# Report processes whose CPU share exceeds 50% or whose memory share
# exceeds 10%, with extra detail for the CPU offenders.
# Globals:
#   CYAN, RED, GREEN, NC (read) - colour escape sequences
# Outputs:
#   A human-readable report on stdout.
# Notes:
#   * CPU candidates that no longer exist in /proc are dropped before
#     reporting. The `ps` snapshot routinely contains the short-lived `ps`
#     process itself with a very high %CPU, which would otherwise be flagged.
#   * The thread count is read from /proc/<pid>/status instead of counting
#     `ps -L` lines, so a process that exits mid-report shows "未知" rather
#     than a negative number.
analyze_suspicious_processes() {
    echo -e "\n${CYAN}=== 可疑进程分析 ===${NC}"

    local pid user cpu mem command entry
    local process_name thread_count start_time exe_path
    local -a high_cpu=()

    # Collect CPU offenders that are still alive.
    while read -r pid user cpu command; do
        [[ -n "$pid" && -d "/proc/$pid" ]] || continue
        high_cpu+=("$pid $user $cpu $command")
    done < <(ps aux --sort=-%cpu | awk 'NR>1 && $3 > 50.0 {print $2, $1, $3, $11}')

    if (( ${#high_cpu[@]} > 0 )); then
        echo -e "${RED}⚠️  发现高CPU占用进程:${NC}"
        for entry in "${high_cpu[@]}"; do
            read -r pid user cpu command <<< "$entry"
            echo -e "  ${RED}PID: $pid, 用户: $user, CPU: ${cpu}%, 命令: $command${NC}"

            # The process may still disappear between collection and here.
            if [[ -d "/proc/$pid" ]]; then
                process_name=$(ps -p "$pid" -o comm= 2>/dev/null) || process_name="未知"
                thread_count=$(awk '/^Threads:/ {print $2}' "/proc/$pid/status" 2>/dev/null) \
                    || thread_count=""
                start_time=$(ps -p "$pid" -o lstart= 2>/dev/null) || start_time="未知"
                exe_path=$(readlink "/proc/$pid/exe" 2>/dev/null) || exe_path="无法访问"

                echo -e "     进程名: $process_name"
                echo -e "     线程数: ${thread_count:-未知}"
                echo -e "     启动时间: $start_time"
                echo -e "     可执行文件: $exe_path"
            fi
            echo ""
        done
    else
        echo -e "${GREEN}✅ 未发现异常高CPU占用进程${NC}"
    fi

    # Memory offenders: anything above 10% of physical memory.
    local high_mem=""
    high_mem=$(ps aux --sort=-%mem | awk 'NR>1 && $4 > 10.0 {print $2, $1, $4, $11}') || true

    if [[ -n "$high_mem" ]]; then
        echo -e "${RED}⚠️  发现高内存占用进程:${NC}"
        while read -r pid user mem command; do
            echo -e "  ${RED}PID: $pid, 用户: $user, 内存: ${mem}%, 命令: $command${NC}"
        done <<< "$high_mem"
    else
        echo -e "${GREEN}✅ 未发现异常高内存占用进程${NC}"
    fi
}

# Terminate a process gracefully, escalating to SIGKILL if it does not exit.
# Globals:
#   RED, GREEN, YELLOW, NC (read) - colour escape sequences
# Arguments:
#   $1 - PID of the process to terminate (positive integer)
#   $2 - signal to send first, name or number without the dash (default: TERM)
# Outputs:
#   Progress messages on stdout; the outcome is also recorded through `log`.
# Returns:
#   0 if the process is gone afterwards; 1 on invalid PID, missing process,
#   user cancellation, or if the process could not be signalled or killed.
# Notes:
#   Liveness is judged by the existence of /proc/<pid>.
safely_kill_process() {
    local pid=${1:-}
    local signal=${2:-TERM}

    # Reject empty or non-numeric input up front: "/proc/" itself is a
    # directory, so an empty PID would otherwise pass the existence check,
    # and PID 0 would address the caller's whole process group.
    if [[ ! "$pid" =~ ^[1-9][0-9]*$ ]]; then
        echo -e "${RED}错误: 无效的PID: ${pid}${NC}"
        return 1
    fi

    if [[ ! -d "/proc/$pid" ]]; then
        echo -e "${RED}错误: 进程 $pid 不存在${NC}"
        return 1
    fi

    local process_info
    process_info=$(ps -p "$pid" -o user,pid,%cpu,%mem,comm --no-headers 2>/dev/null) \
        || process_info=""
    if [[ -z "$process_info" ]]; then
        echo -e "${RED}错误: 无法获取进程 $pid 的信息${NC}"
        return 1
    fi

    echo -e "${YELLOW}准备终止进程:${NC}"
    echo -e "  进程信息: $process_info"

    # Ask for confirmation when the command name looks like a system process.
    local process_name
    process_name=$(awk '{print $5}' <<< "$process_info")
    local -a critical_processes=("systemd" "init" "kernel" "sshd" "bash" "sh")
    local critical confirm=""

    for critical in "${critical_processes[@]}"; do
        if [[ "$process_name" == *"$critical"* ]]; then
            echo -e "${RED}警告: 进程 $process_name 可能是关键系统进程,终止可能导致系统不稳定${NC}"
            # EOF on stdin counts as "no".
            read -r -p "是否继续? (y/N): " confirm || confirm=""
            if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
                echo -e "${GREEN}操作已取消${NC}"
                return 1
            fi
            break
        fi
    done

    echo -e "${YELLOW}发送 $signal 信号到进程 $pid...${NC}"
    if ! kill "-$signal" "$pid" 2>/dev/null; then
        echo -e "${RED}❌ 无法发送信号到进程 $pid${NC}"
        log "ERROR" "无法发送信号到进程 $pid" || true
        return 1
    fi
    echo -e "${GREEN}✅ 信号发送成功${NC}"

    # Give the process up to max_wait seconds to exit on its own.
    # The counter uses an assignment, not ((waited++)): the latter returns
    # status 1 when the old value is 0 and would abort a `set -e` script.
    local waited=0
    local -r max_wait=10
    while [[ -d "/proc/$pid" ]] && (( waited < max_wait )); do
        sleep 1
        waited=$((waited + 1))
        echo -e "  等待进程退出... ($waited/$max_wait 秒)"
    done

    if [[ -d "/proc/$pid" ]]; then
        echo -e "${RED}进程未正常退出,发送KILL信号...${NC}"
        # The process may exit between the check and the kill; that is fine.
        kill -KILL "$pid" 2>/dev/null || true
        # Allow a few seconds for the kernel and the parent to clean up.
        local attempt
        for attempt in 1 2 3 4 5; do
            [[ -d "/proc/$pid" ]] || break
            sleep 1
        done
    fi

    if [[ -d "/proc/$pid" ]]; then
        echo -e "${RED}❌ 无法终止进程 $pid${NC}"
        log "ERROR" "无法终止进程 $pid" || true
        return 1
    fi

    echo -e "${GREEN}✅ 进程已成功终止${NC}"
    log "INFO" "进程 $pid ($process_name) 已被终止" || true
    return 0
}

# Interactively select a set of processes and terminate them one by one.
# Globals:
#   CYAN, YELLOW, RED, GREEN, NC (read) - colour escape sequences
# Input (stdin):
#   selection criterion (1-4), its value, and a y/N confirmation
# Outputs:
#   Prompts, the list of matching processes and per-process progress.
# Returns:
#   0 when the batch ran, was cancelled, or nothing matched; 1 on an invalid
#   option or an invalid/empty criterion value.
# Notes:
#   * Thresholds must be numeric and the process name non-empty. An empty
#     value would otherwise make awk match every process on the system.
#   * This script's own PID is never selected.
#   * The kill loop reads the PID list from file descriptor 3, leaving stdin
#     free for the confirmation prompt inside safely_kill_process.
batch_kill_processes() {
    echo -e "\n${CYAN}=== 批量进程管理 ===${NC}"

    echo -e "${YELLOW}选择终止条件:${NC}"
    echo "  1) CPU使用率 > 指定百分比"
    echo "  2) 内存使用率 > 指定百分比"
    echo "  3) 指定进程名"
    echo "  4) 指定用户"

    local option="" threshold="" process_name="" username="" confirm=""
    local processes=""
    local pid user usage command
    local -r number_re='^[0-9]+([.][0-9]+)?$'

    read -r -p "请选择选项 (1-4): " option || option=""

    case "$option" in
        1)
            read -r -p "请输入CPU使用率阈值 (例如: 50): " threshold || threshold=""
            if [[ ! "$threshold" =~ $number_re ]]; then
                echo -e "${RED}错误: 阈值必须是数字${NC}"
                return 1
            fi
            processes=$(ps aux --sort=-%cpu \
                | awk -v threshold="$threshold" -v self="$$" \
                    'NR>1 && $2 != self && $3 > threshold {print $2, $1, $3, $11}') || true
            ;;
        2)
            read -r -p "请输入内存使用率阈值 (例如: 10): " threshold || threshold=""
            if [[ ! "$threshold" =~ $number_re ]]; then
                echo -e "${RED}错误: 阈值必须是数字${NC}"
                return 1
            fi
            processes=$(ps aux --sort=-%mem \
                | awk -v threshold="$threshold" -v self="$$" \
                    'NR>1 && $2 != self && $4 > threshold {print $2, $1, $4, $11}') || true
            ;;
        3)
            read -r -p "请输入进程名: " process_name || process_name=""
            if [[ -z "$process_name" ]]; then
                echo -e "${RED}错误: 进程名不能为空${NC}"
                return 1
            fi
            # The name is applied as an awk regular expression to the first
            # word of the command line.
            processes=$(ps aux \
                | awk -v name="$process_name" -v self="$$" \
                    'NR>1 && $2 != self && $11 ~ name {print $2, $1, $3, $11}') || true
            ;;
        4)
            read -r -p "请输入用户名: " username || username=""
            processes=$(ps aux \
                | awk -v user="$username" -v self="$$" \
                    'NR>1 && $2 != self && $1 == user {print $2, $1, $3, $11}') || true
            ;;
        *)
            echo -e "${RED}无效选项${NC}"
            return 1
            ;;
    esac

    if [[ -z "$processes" ]]; then
        echo -e "${GREEN}没有找到匹配的进程${NC}"
        return 0
    fi

    echo -e "\n${YELLOW}找到以下进程:${NC}"
    while read -r pid user usage command; do
        echo -e "  PID: $pid, 用户: $user, 资源: $usage%, 命令: $command"
    done <<< "$processes"

    read -r -p "是否终止这些进程? (y/N): " confirm || confirm=""
    if [[ "$confirm" =~ ^[Yy]$ ]]; then
        while read -r pid user usage command <&3; do
            echo -e "\n${YELLOW}处理进程 $pid...${NC}"
            # One failure must not stop the remaining processes from being handled.
            safely_kill_process "$pid" \
                || echo -e "${RED}进程 $pid 处理失败,继续处理下一个${NC}"
        done 3<<< "$processes"
    else
        echo -e "${GREEN}操作已取消${NC}"
    fi
    return 0
}

# Write a value into a cgroup control file, reporting failures.
# Arguments: $1 - value, $2 - target file
# Returns:   0 on success, 1 if the file could not be written
_cgroup_write() {
    # The braces make 2>/dev/null also cover a failed ">" redirection.
    if ! { printf '%s\n' "$1" > "$2"; } 2>/dev/null; then
        echo -e "${RED}错误: 无法写入 $2${NC}"
        return 1
    fi
}

# Interactively cap the CPU or memory usage of one process via cgroups.
# Globals:
#   CYAN, YELLOW, RED, GREEN, NC (read) - colour escape sequences
#   CGROUP_ROOT (read, optional) - cgroup mount point, default /sys/fs/cgroup
# Input (stdin):
#   option (1-3), PID, and the limit (CPU percent of one core, or MB)
# Returns:
#   1 if cgroups are unavailable or a cgroup file cannot be created/written;
#   0 otherwise. Invalid user input is reported on stdout and, like the
#   "process does not exist" case, does not produce a failure status.
# Notes:
#   * Both hierarchies are handled. If <root>/cgroup.controllers exists, the
#     unified (v2) layout is used (cpu.max / memory.max in <root>/limit_<pid>).
#     Otherwise the per-controller v1 layout is used (<root>/cpu/limit_<pid>
#     and <root>/memory/limit_<pid>).
#   * PID and limit must be positive integers; they are fed into shell
#     arithmetic, which must never see arbitrary text.
set_process_limits() {
    echo -e "\n${CYAN}=== 进程资源限制 ===${NC}"

    local cgroup_root=${CGROUP_ROOT:-/sys/fs/cgroup}
    if [[ ! -d "$cgroup_root" ]]; then
        echo -e "${RED}错误: cgroups不可用${NC}"
        return 1
    fi

    echo -e "${YELLOW}设置进程资源限制:${NC}"
    echo "  1) CPU限制"
    echo "  2) 内存限制"
    echo "  3) IO限制"

    local option="" pid="" limit="" controller=""
    read -r -p "请选择选项 (1-3): " option || option=""

    case "$option" in
        1) controller="cpu" ;;
        2) controller="memory" ;;
        3)
            echo -e "${YELLOW}IO限制功能需要更复杂的配置,建议使用systemd或手动配置cgroup${NC}"
            return 0
            ;;
        *)
            echo -e "${RED}无效选项${NC}"
            return 0
            ;;
    esac

    read -r -p "请输入要限制的PID: " pid || pid=""
    if [[ "$controller" == "cpu" ]]; then
        read -r -p "请输入CPU使用率限制 (例如: 50): " limit || limit=""
    else
        read -r -p "请输入内存限制 (MB): " limit || limit=""
    fi

    if [[ ! "$pid" =~ ^[1-9][0-9]*$ || ! -d "/proc/$pid" ]]; then
        echo -e "${RED}错误: 进程 $pid 不存在${NC}"
        return 0
    fi
    if [[ ! "$limit" =~ ^[1-9][0-9]*$ ]]; then
        echo -e "${RED}错误: 限制值必须是正整数${NC}"
        return 0
    fi

    # CFS bandwidth: quota per 100000us period; 1% of one core == 1000us.
    local -r period_us=100000
    local quota_us=$((limit * 1000))
    local mem_bytes=$((limit * 1024 * 1024))
    local cgroup_dir

    if [[ -f "$cgroup_root/cgroup.controllers" ]]; then
        # cgroup v2: one unified tree; the controller must be enabled for
        # children of the root before its files appear (best effort).
        cgroup_dir="$cgroup_root/limit_$pid"
        if ! grep -qw -- "$controller" "$cgroup_root/cgroup.subtree_control" 2>/dev/null; then
            { printf '+%s\n' "$controller" > "$cgroup_root/cgroup.subtree_control"; } \
                2>/dev/null || true
        fi
    else
        # cgroup v1: one tree per controller.
        cgroup_dir="$cgroup_root/$controller/limit_$pid"
    fi

    if ! mkdir -p "$cgroup_dir" 2>/dev/null; then
        echo -e "${RED}错误: 无法创建cgroup目录 $cgroup_dir${NC}"
        return 1
    fi

    if [[ -f "$cgroup_root/cgroup.controllers" ]]; then
        if [[ "$controller" == "cpu" ]]; then
            _cgroup_write "$quota_us $period_us" "$cgroup_dir/cpu.max" || return 1
        else
            _cgroup_write "$mem_bytes" "$cgroup_dir/memory.max" || return 1
        fi
    else
        if [[ "$controller" == "cpu" ]]; then
            _cgroup_write "$period_us" "$cgroup_dir/cpu.cfs_period_us" || return 1
            _cgroup_write "$quota_us" "$cgroup_dir/cpu.cfs_quota_us" || return 1
        else
            _cgroup_write "$mem_bytes" "$cgroup_dir/memory.limit_in_bytes" || return 1
        fi
    fi

    # Move the process into the cgroup so the limit takes effect.
    _cgroup_write "$pid" "$cgroup_dir/cgroup.procs" || return 1

    if [[ "$controller" == "cpu" ]]; then
        echo -e "${GREEN}✅ 已设置进程 $pid 的CPU限制为 ${limit}%${NC}"
    else
        echo -e "${GREEN}✅ 已设置进程 $pid 的内存限制为 ${limit}MB${NC}"
    fi
    return 0
}

# Print tuning suggestions based on current CPU and memory utilisation.
# Globals:
#   CYAN, YELLOW, NC (read) - colour escape sequences
# Outputs:
#   CPU advice when CPU usage is above 80%, memory advice when memory usage
#   is above 80%, and general advice unconditionally.
# Notes:
#   * The idle percentage is located by searching the `top` summary line for
#     the number in front of "id" instead of a fixed field position. top
#     prints "ni,100.0 id" without a space when idle reaches 100.0, which
#     shifts the fields and used to be read as 100% usage.
#   * Thresholds are compared with awk, so `bc` is not required.
system_optimization_suggestions() {
    echo -e "\n${CYAN}=== 系统优化建议 ===${NC}"

    local cpu_usage="" mem_usage=""

    # LC_ALL=C keeps the decimal separator a dot regardless of locale.
    # The pattern accepts both "98.5 id" and the older "98.5%id" layout.
    cpu_usage=$(LC_ALL=C top -bn1 2>/dev/null | awk '
        !seen && /Cpu\(s\)/ && match($0, /[0-9.]+[ \t%]*id/) {
            idle = substr($0, RSTART, RLENGTH)
            sub(/[^0-9.]+$/, "", idle)
            printf "%.1f", 100 - idle
            seen = 1
        }') || true

    if [[ -n "$cpu_usage" ]] \
        && awk -v usage="$cpu_usage" 'BEGIN { exit !(usage > 80) }'; then
        echo -e "${YELLOW}🖥️  CPU优化建议:${NC}"
        echo "  • 检查并优化高CPU进程"
        echo "  • 考虑增加CPU核心或升级硬件"
        echo "  • 调整进程优先级 (nice/renice)"
        echo "  • 使用cgroups限制资源使用"
    fi

    # Second line of `free` is the "Mem:" row: used / total * 100.
    mem_usage=$(free 2>/dev/null \
        | awk 'NR==2 && $2 > 0 {printf "%.1f", $3/$2*100}') || true

    if [[ -n "$mem_usage" ]] \
        && awk -v usage="$mem_usage" 'BEGIN { exit !(usage > 80) }'; then
        echo -e "${YELLOW}💾 内存优化建议:${NC}"
        echo "  • 检查内存泄漏"
        echo "  • 优化应用程序内存使用"
        echo "  • 增加物理内存"
        echo "  • 调整swappiness参数"
        echo "  • 清理缓存: echo 3 > /proc/sys/vm/drop_caches"
    fi

    echo -e "${YELLOW}🔧 通用优化建议:${NC}"
    echo "  • 定期更新系统和软件"
    echo "  • 监控系统资源使用趋势"
    echo "  • 配置适当的监控告警"
    echo "  • 优化应用程序配置"
    echo "  • 考虑使用负载均衡"
}

# Run one menu action in a subshell so that its failure cannot terminate
# the menu loop.
# Arguments: $1 - function to call, $2.. - its arguments
# Returns:   0 always; a non-zero status of the action is reported on stdout
_run_menu_action() {
    local errexit_on=0 status=0
    [[ $- == *e* ]] && errexit_on=1

    # errexit is switched off around the subshell so a failing action does
    # not abort the script. It is re-enabled inside the subshell so the
    # action itself still stops at its first unhandled error. A plain
    # `action || true` would silently disable errexit inside the action.
    set +e
    (
        if (( errexit_on )); then set -e; fi
        "$@"
    )
    status=$?
    if (( errexit_on )); then set -e; fi

    if (( status != 0 )); then
        echo -e "${YELLOW}操作未成功完成 (退出码: $status)${NC}"
    fi
    return 0
}

# Interactive main menu: show the options, dispatch the chosen action and
# repeat until the user selects "exit".
# Globals:
#   PURPLE, GREEN, YELLOW, BLUE, CYAN, RED, NC (read) - colour escape sequences
# Input (stdin):
#   menu choice (1-7), a PID for option 3, Enter to continue
# Returns:
#   Exits the script with status 0 on option 7; returns 0 when stdin reaches
#   end-of-file, so a closed stdin does not abort the script or loop forever.
main_menu() {
    local choice="" pid="" discard=""

    while true; do
        # clear fails when no usable terminal is configured; that must not
        # end the script.
        clear 2>/dev/null || true
        echo -e "${PURPLE}================================${NC}"
        echo -e "${PURPLE}    Linux紧急进程管理器${NC}"
        echo -e "${PURPLE}================================${NC}"
        echo -e "${GREEN}1) 显示进程列表${NC}"
        echo -e "${GREEN}2) 分析可疑进程${NC}"
        echo -e "${YELLOW}3) 终止单个进程${NC}"
        echo -e "${YELLOW}4) 批量终止进程${NC}"
        echo -e "${BLUE}5) 设置进程资源限制${NC}"
        echo -e "${CYAN}6) 系统优化建议${NC}"
        echo -e "${RED}7) 退出${NC}"
        echo -e "${PURPLE}================================${NC}"

        if ! read -r -p "请选择操作 (1-7): " choice; then
            echo
            return 0
        fi

        case "$choice" in
            1) _run_menu_action show_process_list ;;
            2) _run_menu_action analyze_suspicious_processes ;;
            3)
                if read -r -p "请输入要终止的PID: " pid; then
                    _run_menu_action safely_kill_process "$pid"
                fi
                ;;
            4) _run_menu_action batch_kill_processes ;;
            5) _run_menu_action set_process_limits ;;
            6) _run_menu_action system_optimization_suggestions ;;
            7)
                echo -e "${GREEN}再见!${NC}"
                exit 0
                ;;
            *)
                echo -e "${RED}无效选择,请重新输入${NC}"
                ;;
        esac

        echo -e "\n按Enter键继续..."
        read -r discard || return 0
    done
}

# Launch the interactive main menu
main_menu

总结

本文提供了完整的Linux服务器性能问题排查解决方案,包含:

核心排查能力

  • 实时监控: 全面的系统性能监控面板
  • 深度诊断: CPU、内存、IO、网络的详细分析
  • 进程管理: 安全的进程终止和资源限制
  • 问题定位: 从系统级别到线程级别的精准定位

实用工具集

  • 基础监控脚本: 快速获取系统状态
  • 深度排查工具: 性能计数器、系统调用分析
  • 应急处理: 紧急情况下的进程管理
  • 优化建议: 基于实际情况的调优建议

关键排查流程

  1. 快速识别: 使用监控面板快速定位问题类型
  2. 深度分析: 使用专业工具进行根本原因分析
  3. 安全处理: 在了解影响的前提下处理问题进程
  4. 持续优化: 实施长期优化方案防止问题复发

希望本文能帮助运维人员快速响应并解决性能问题。

相关推荐
oak隔壁找我2 小时前
MyBatis 源码深度解析
java·后端
lang201509282 小时前
Spring 4.1新特性:深度优化与生态整合
java·后端·spring
纳就这样吧2 小时前
Spring Cloud中@EnableDiscoveryClient注解详解
spring boot·后端·spring cloud
DBLens数据库管理和开发工具2 小时前
GROUP BY隐性排序:MySQL 5.x 与 8.x 的行为大不同
后端
oak隔壁找我2 小时前
Spring框架原理深度源码级解析
java·后端
yinke小琪2 小时前
谈谈项目中单点登录的实现原理
java·后端
brzhang3 小时前
我且问你,如果有人用 AI 抄你的产品,爱卿又当如何应对?
前端·后端·架构
渣哥3 小时前
面试必问:Spring Bean 的作用域类型有多少种?
javascript·后端·面试
christine-rr3 小时前
MySQL数据库管理、DDL、DQL、DML、DCL等总结
linux·数据库·mysql