性能排查必看!当Linux服务器CPU/内存飙高,如何快速定位并"干掉"罪魁祸首进程?

本文提供完整的Linux服务器性能问题排查指南,从基础监控到高级诊断,包含详细的步骤和操作代码,帮助您快速定位并解决CPU和内存飙高问题。

性能问题排查基础准备

环境准备和工具安装

在开始排查前,确保系统已安装必要的性能分析工具。

bash 复制代码
#!/bin/bash
# Performance-troubleshooting tool installer.
# Installs the analysis tools used throughout this guide and creates the
# /opt/performance working directory. Must be run as root.

# The package manager and /opt both need root. Fail early with a clear
# message instead of letting yum/apt fail and still reporting success.
if [ "$(id -u)" -ne 0 ]; then
    echo "请以root权限运行此脚本" >&2
    exit 1
fi

# Exit status of the main package installation (0 = everything installed)
install_status=0

# Detect the distribution family from its release marker file
if [ -f /etc/redhat-release ]; then
    # CentOS/RHEL
    yum update -y
    yum install -y epel-release
    # bc provides floating-point arithmetic for shell scripts
    yum install -y htop iotop nethogs sysstat dstat perf bpftrace bcc-tools \
                   stress-ng procps-ng numactl lsof strace ltrace gdb \
                   tcpdump net-tools iproute bc || install_status=$?
elif [ -f /etc/debian_version ]; then
    # Ubuntu/Debian
    apt-get update
    apt-get install -y htop iotop nethogs sysstat dstat linux-tools-common \
                      linux-tools-generic bpfcc-tools stress-ng procps \
                      numactl lsof strace ltrace gdb tcpdump net-tools \
                      iproute2 bc || install_status=$?
else
    echo "不支持的Linux发行版"
    exit 1
fi

# Create the working directory tree and switch into it
mkdir -p /opt/performance/{scripts,logs,reports}
cd /opt/performance || exit 1

# Only claim success when the package installation actually succeeded
if [ "$install_status" -ne 0 ]; then
    echo "❌ 部分性能排查工具安装失败 (退出码: $install_status)" >&2
    exit "$install_status"
fi

echo "✅ 性能排查工具安装完成"

基础监控脚本

创建基础性能监控脚本,用于快速获取系统状态。

bash 复制代码
#!/bin/bash
# basic_monitor.sh - basic system monitoring script

set -euo pipefail

# Color definitions: ANSI escape sequences stored as literal backslash text;
# they are interpreted when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m' # No Color

# Print a message prefixed with a colored severity tag.
# Arguments: $1 - level (INFO, WARN, ERROR or DEBUG); remaining args - message
# Outputs:   one line on stdout; any other level prints nothing
log() {
    local severity=$1 tag_color
    shift
    local text=$*

    # Pick the tag color; unknown levels are silently ignored
    case $severity in
        INFO)  tag_color=$GREEN ;;
        WARN)  tag_color=$YELLOW ;;
        ERROR) tag_color=$RED ;;
        DEBUG) tag_color=$BLUE ;;
        *)     return 0 ;;
    esac

    echo -e "${tag_color}[${severity}]${NC} $text"
}

# Print host identity: hostname, OS name, kernel release, boot time, uptime.
# The OS name comes from /etc/redhat-release when present, otherwise from the
# PRETTY_NAME entry of /etc/os-release with its quotes removed.
system_overview() {
    echo -e "\n${CYAN}=== 系统概览 ===${NC}"
    echo -e "${GREEN}主机名:${NC} $(hostname)"

    # "|| true": a missing release file must not abort the report under set -e
    local os_name
    os_name=$(cat /etc/redhat-release 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME | cut -d= -f2 | tr -d '\"') || true
    echo -e "${GREEN}操作系统:${NC} ${os_name}"

    echo -e "${GREEN}内核版本:${NC} $(uname -r)"
    echo -e "${GREEN}启动时间:${NC} $(uptime -s)"
    echo -e "${GREEN}运行时间:${NC} $(uptime -p)"
}

# Report CPU model, core count, utilisation (overall and per core), load
# average, per-CPU interrupt totals and the context-switch counter.
# Reads /proc/cpuinfo, /proc/interrupts and /proc/stat; runs nproc, top,
# uptime and mpstat.
cpu_monitor() {
    echo -e "\n${CYAN}=== CPU监控 ===${NC}"

    # CPU model and core count
    local cpu_cores=$(nproc)
    local cpu_model=$(grep "model name" /proc/cpuinfo | head -1 | cut -d: -f2 | sed 's/^ *//')
    echo -e "${GREEN}CPU型号:${NC} $cpu_model"
    echo -e "${GREEN}CPU核心数:${NC} $cpu_cores"

    # Overall utilisation = 100 - idle. LC_ALL=C keeps "." as the decimal
    # separator so the sed/awk parsing works in every locale.
    local cpu_usage=$(LC_ALL=C top -bn1 | grep "Cpu(s)" | sed "s/.*, *\([0-9.]*\)%* id.*/\1/" | awk '{print 100 - $1"%"}')
    echo -e "${GREEN}CPU使用率:${NC} $cpu_usage"

    # Load average
    local load_avg=$(uptime | awk -F'load average:' '{print $2}')
    echo -e "${GREEN}负载平均值:${NC} $load_avg"

    # Per-core utilisation. LC_ALL=C guarantees the "Average:" row label
    # (it is translated in other locales); %idle is always the last column,
    # whatever the sysstat version.
    echo -e "${GREEN}每个核心使用率:${NC}"
    LC_ALL=C mpstat -P ALL 1 1 | awk '
        $1 == "Average:" && $2 ~ /^[0-9]+$/ {
            printf "  CPU %s: %.1f%%\n", $2, 100 - $NF
        }'

    # Interrupt totals per CPU (first 5 CPUs). The header row defines how many
    # CPU columns exist; every data row is "<label>:" followed by one counter
    # per CPU and a free-form description, so only numeric fields inside the
    # CPU columns are summed.
    echo -e "${GREEN}中断统计:${NC}"
    local irq_table
    irq_table=$(cat /proc/interrupts)
    echo "${irq_table%%$'\n'*}"
    awk '
        NR == 1 { ncpu = NF; next }
        {
            for (i = 1; i <= ncpu; i++)
                if ($(i + 1) ~ /^[0-9]+$/) sum[i] += $(i + 1)
        }
        END {
            for (i = 1; i <= ncpu && i <= 5; i++)
                printf "  CPU%d: %.0f次\n", i - 1, sum[i]
        }' <<< "$irq_table"

    # Context switches since boot
    local context_switches=$(grep ctxt /proc/stat | awk '{print $2}')
    echo -e "${GREEN}上下文切换:${NC} $context_switches 次"
}

# Report memory usage: the `free -h` summary, selected /proc/meminfo counters,
# memory pressure (PSI) when the kernel exposes it, and reclaimable slab size.
memory_monitor() {
    echo -e "\n${CYAN}=== 内存监控 ===${NC}"

    # Header plus the Mem/Swap rows, each printed exactly once
    local mem_info=$(free -h)
    echo -e "${GREEN}内存使用:${NC}"
    awk 'NR == 1 || /Mem:|Swap:/' <<< "$mem_info"

    # Selected /proc/meminfo counters, indented by two spaces
    echo -e "\n${GREEN}详细内存信息:${NC}"
    grep -E "MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree" /proc/meminfo | while IFS= read -r line; do
        echo "  $line"
    done

    # Memory pressure; the file is absent on kernels without PSI
    echo -e "\n${GREEN}内存压力:${NC}"
    local mem_pressure=$(cat /proc/pressure/memory 2>/dev/null || echo "无法获取内存压力信息")
    echo "  $mem_pressure"

    # Reclaimable slab, reported by the kernel in kB; default to 0 so a
    # missing counter cannot break the arithmetic expansion
    local slab_memory=$(awk '/SReclaimable/ {print $2}' /proc/meminfo)
    echo -e "${GREEN}可回收Slab内存:${NC} $(( ${slab_memory:-0} / 1024 )) MB"
}

# Report the top 5 processes by CPU and by memory, and list zombie processes.
process_monitor() {
    echo -e "\n${CYAN}=== 进程监控 ===${NC}"

    # awk limits the rows itself (header + 5). Cutting the stream with `head`
    # can kill `ps` with SIGPIPE, which aborts the script under
    # `set -o pipefail` on hosts with many processes.
    local top5='NR==1 {printf "  %-8s %-8s %-6s %-6s %s\n", "USER", "PID", "%CPU", "%MEM", "COMMAND"}
                NR>1 && NR<=6 {printf "  %-8s %-8s %-6.1f %-6.1f %s\n", $1, $2, $3, $4, $11}'

    # Highest CPU consumers
    echo -e "${GREEN}CPU占用Top 5进程:${NC}"
    ps aux --sort=-%cpu | awk "$top5"

    # Highest memory consumers
    echo -e "\n${GREEN}内存占用Top 5进程:${NC}"
    ps aux --sort=-%mem | awk "$top5"

    # Zombies: the STAT column starts with "Z" but usually carries extra flags
    # ("Z+", "Zs", ...), so match the prefix rather than the exact string.
    local -a zombies=()
    mapfile -t zombies < <(ps aux | awk '$8 ~ /^Z/ {print "  PID: "$2", 命令: "$11}')
    local zombie_count=${#zombies[@]}
    if [ "$zombie_count" -gt 0 ]; then
        echo -e "${RED}发现僵尸进程:${NC} $zombie_count 个"
        printf '%s\n' "${zombies[@]}"
    else
        echo -e "${GREEN}僵尸进程:${NC} 0 个"
    fi
}

# Report network state: per-interface RX/TX counters, the socket summary and
# the number of TCP sockets in each connection state.
network_monitor() {
    echo -e "\n${CYAN}=== 网络监控 ===${NC}"

    # Interface counters: first 12 matching lines. awk limits the rows itself
    # so no upstream command is cut off by an early-exiting `head`.
    echo -e "${GREEN}网络接口统计:${NC}"
    ip -s link show | awk '/^[0-9]|RX|TX/ && ++shown <= 12'

    # Socket summary
    echo -e "\n${GREEN}网络连接统计:${NC}"
    ss -s | awk 'NR <= 10'

    # TCP sockets per state. With only -t selected, `ss` prints the state in
    # the FIRST column (column 2 is Recv-Q). -n skips name resolution, which
    # only affects the address columns that are not used here.
    echo -e "\n${GREEN}TCP连接状态:${NC}"
    ss -t -a -n | awk 'NR>1 {count[$1]++} END {for(state in count) printf "  %s: %d\n", state, count[state]}'
}

# Report disk state: filesystems and inode tables above 80% usage (each with
# its header row) followed by the first 10 lines of extended iostat output.
disk_monitor() {
    # Shared awk filter: always keep the header, keep data rows whose 5th
    # column (use percentage) is above 80
    local over_80='NR==1 {print "  "$0} $5+0 > 80 {print "  "$0}'

    echo -e "\n${CYAN}=== 磁盘监控 ===${NC}"

    # Space usage
    echo -e "${GREEN}磁盘使用情况:${NC}"
    df -h | awk "$over_80"

    # Inode usage
    echo -e "\n${GREEN}inode使用情况:${NC}"
    df -i | awk "$over_80"

    # I/O statistics
    echo -e "\n${GREEN}磁盘IO统计:${NC}"
    iostat -x 1 1 | head -10
}

# Entry point: print the report banner, run every monitor section in order,
# then print the closing banner.
main() {
    local rule="${PURPLE}================================${NC}"

    echo -e "${PURPLE}🖥️  Linux系统性能监控报告${NC}"
    echo -e "${PURPLE}生成时间: $(date)${NC}"
    echo -e "$rule"

    # Sections run in this fixed order
    local section
    for section in system_overview cpu_monitor memory_monitor \
                   process_monitor network_monitor disk_monitor; do
        "$section"
    done

    echo -e "\n$rule"
    echo -e "${PURPLE}监控报告生成完成${NC}"
}

# Run the main function (no command-line arguments are forwarded)
main

CPU性能问题深度排查

CPU问题排查脚本

bash 复制代码
#!/bin/bash
# cpu_investigation.sh - in-depth CPU performance investigation script

set -euo pipefail

# Color definitions: ANSI escape sequences stored as literal backslash text;
# they are interpreted when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Global variables: a per-run output directory (timestamped at script start)
# and the log file inside it
INVESTIGATION_DIR="/opt/performance/investigation_$(date +%Y%m%d_%H%M%S)"
LOG_FILE="${INVESTIGATION_DIR}/cpu_investigation.log"

# Prepare the investigation: create the output directory, mirror all further
# stdout/stderr of the script into the log file, and print the start banner.
init_investigation() {
    mkdir -p "$INVESTIGATION_DIR"

    # From here on everything written to stdout or stderr is also appended
    # to $LOG_FILE through tee
    exec > >(tee -a "$LOG_FILE") 2>&1

    local banner_line
    for banner_line in \
        "${CYAN}=== CPU性能深度排查开始 ===${NC}" \
        "调查目录: $INVESTIGATION_DIR" \
        "日志文件: $LOG_FILE" \
        "开始时间: $(date)"; do
        echo -e "$banner_line"
    done
}

# Announce an investigation step on stdout and append a timestamped
# "[STEP]" marker to the log file.
# Arguments: $1 - step title
log_step() {
    local step_title=$1

    echo -e "\n${BLUE}>>> ${step_title}${NC}"
    printf '%s\n' "[STEP] ${step_title} - $(date)" >> "$LOG_FILE"
}

# Quick CPU triage: print the current CPU utilisation with a severity verdict
# (>80% high, >50% elevated) and warn when the 1-minute load average exceeds
# the number of cores.
quick_cpu_analysis() {
    log_step "快速CPU问题识别"

    # Utilisation = 100 - idle. The idle value is located by its "id" label
    # rather than by field position: top prints "ni,100.0 id" without a space
    # when idle is 100.0, which shifts every positional field. LC_ALL=C keeps
    # "." as the decimal separator. awk reads all of top's output so top is
    # never cut off with SIGPIPE.
    local cpu_usage
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        !found && /Cpu\(s\)/ && match($0, /[0-9]+(\.[0-9]+)?[ %]*id/) {
            usage = 100 - (substr($0, RSTART, RLENGTH) + 0)
            found = 1
        }
        END { if (found) printf "%.1f", usage; else exit 1 }') || cpu_usage=""

    if [ -z "$cpu_usage" ]; then
        echo -e "${YELLOW}⚠️  无法获取CPU使用率${NC}"
    else
        echo -e "${GREEN}当前CPU使用率:${NC} ${cpu_usage}%"

        # Floating-point comparison done in awk, so bc is not required
        local cpu_level
        cpu_level=$(awk -v u="$cpu_usage" 'BEGIN {
            if (u + 0 > 80) print "high"
            else if (u + 0 > 50) print "elevated"
            else print "normal"
        }')
        case $cpu_level in
            high)     echo -e "${RED}⚠️  CPU使用率过高${NC}" ;;
            elevated) echo -e "${YELLOW}⚠️  CPU使用率偏高${NC}" ;;
            *)        echo -e "${GREEN}✅ CPU使用率正常${NC}" ;;
        esac
    fi

    # Load average versus core count
    local load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}')
    local cpu_cores=$(nproc)
    echo -e "${GREEN}负载平均值:${NC} $load_avg"
    echo -e "${GREEN}CPU核心数:${NC} $cpu_cores"

    # The 1-minute value is the first comma-separated field
    local load1=${load_avg%%,*}
    if awk -v load="$load1" -v cores="$cpu_cores" 'BEGIN { exit !(load + 0 > cores + 0) }'; then
        echo -e "${RED}⚠️  系统负载过高${NC}"
    fi
}

# Save a CPU-sorted process snapshot into the investigation directory, print
# its top 10 rows, then print name, thread count, CPU affinity and state for
# every process in the snapshot using more than 10% CPU.
detailed_process_analysis() {
    log_step "详细进程分析"

    # Snapshot of all processes, highest CPU first
    local snapshot="${INVESTIGATION_DIR}/process_snapshot_$(date +%H%M%S).txt"
    ps aux --sort=-%cpu > "$snapshot"

    echo -e "${GREEN}CPU占用Top 10进程:${NC}"
    head -11 "$snapshot" | awk 'NR==1 {printf "  %-8s %-8s %-6s %-6s %-12s %s\n", "USER", "PID", "%CPU", "%MEM", "TIME", "COMMAND"} 
                                   NR>1 {printf "  %-8s %-8s %-6.1f %-6.1f %-12s %s\n", $1, $2, $3, $4, $10, $11}'

    # Inspect every process above the 10% CPU threshold
    echo -e "\n${GREEN}可疑进程分析:${NC}"
    local pid
    while read -r pid; do
        # Skip blank entries and the header row
        if [ -z "$pid" ] || [ "$pid" = "PID" ]; then
            continue
        fi
        echo -e "  ${YELLOW}分析进程 PID: $pid${NC}"

        # The process may have exited since the snapshot was taken
        local proc_status="/proc/$pid/status"
        [ -f "$proc_status" ] || continue

        local process_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
        local thread_count=$(grep "Threads" "$proc_status" | awk '{print $2}')
        local cpu_affinity=$(taskset -cp "$pid" 2>/dev/null | awk -F: '{print $2}' | sed 's/^ *//')

        echo -e "    进程名: $process_name"
        echo -e "    线程数: $thread_count"
        echo -e "    CPU亲和性: $cpu_affinity"

        # Scheduler state letter from /proc/<pid>/status
        local state=$(grep "State" "$proc_status" | awk '{print $2}')
        echo -e "    进程状态: $state"
    done < <(awk '$3 > 10.0 {print $2}' "$snapshot")
}

# Thread-level analysis: for every process above 5% CPU, list its threads
# (other than the main thread) above 1% CPU and, when gdb is available, save
# each hot thread's backtrace to stack_<pid>_<tid>.txt in the investigation
# directory.
thread_level_analysis() {
    log_step "线程级别分析"

    echo -e "${GREEN}高CPU进程的线程分析:${NC}"

    local pid tid thread_cpu process_name stack_dump
    local -a hot_threads
    while read -r pid; do
        [ -d "/proc/$pid/task" ] || continue
        echo -e "\n  ${CYAN}进程 $pid 的线程:${NC}"

        process_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
        echo -e "    进程名: $process_name"

        # One `ps -L` call per process. awk does the floating-point threshold
        # test (no bc needed) and skips the main thread, which is already
        # covered by the process list.
        hot_threads=()
        while read -r tid thread_cpu; do
            hot_threads+=("$tid")
            echo -e "    ${YELLOW}线程 $tid: ${thread_cpu}% CPU${NC}"
        done < <(ps -L -p "$pid" -o tid,pcpu,comm --no-headers 2>/dev/null |
                 awk -v main_tid="$pid" '$1 != main_tid && $2 + 0 > 1.0 {print $1, $2}')

        [ "${#hot_threads[@]}" -gt 0 ] || continue
        command -v gdb >/dev/null 2>&1 || continue

        # Attach gdb ONCE per process (every attach stops the target) and
        # reuse the dump for all hot threads. stdin is /dev/null so gdb can
        # never consume the PID list feeding this loop.
        stack_dump=$(gdb -ex "set pagination 0" -ex "thread apply all bt" -batch -p "$pid" 2>/dev/null </dev/null) || true

        # gdb labels threads by kernel thread id as "LWP <tid>", not
        # "Thread <tid>"; the trailing class prevents prefix matches
        for tid in "${hot_threads[@]}"; do
            printf '%s\n' "$stack_dump" |
                grep -A 20 -E "LWP ${tid}([^0-9]|\$)" > "${INVESTIGATION_DIR}/stack_${pid}_${tid}.txt" 2>/dev/null || true
        done
    done < <(ps -eo pid,%cpu,comm --no-headers | awk '$2 > 5.0 {print $1}')
}

# CPU scheduling overview: run-queue length samples from sar, raw scheduler
# counters from /proc/stat, per-CPU utilisation from mpstat and the head of
# the softirq table.
cpu_scheduling_analysis() {
    log_step "CPU调度分析"

    # System-wide run-queue length and load (3 one-second samples)
    echo -e "${GREEN}运行队列长度:${NC}"
    sar -q 1 3 | tail -3

    echo -e "\n${GREEN}CPU调度统计:${NC}"
    grep -E "cpu|ctxt" /proc/stat | head -5

    # mpstat has no per-CPU run-queue column (its 8th column is %soft), so
    # report what it really measures: utilisation, i.e. 100 - %idle, where
    # %idle is always the last column. LC_ALL=C guarantees the "Average:"
    # label and "." decimals.
    echo -e "\n${GREEN}每个CPU的使用率:${NC}"
    LC_ALL=C mpstat -P ALL 1 1 | awk '
        $1 == "Average:" && $2 ~ /^[0-9]+$/ {
            printf "  CPU %s: 使用率 %.2f%%\n", $2, 100 - $NF
        }'

    # Softirq distribution (header row plus the first four softirq types)
    echo -e "\n${GREEN}软中断分布:${NC}"
    head -5 /proc/softirqs
}

# Record a 10-second system-wide perf profile with call graphs and write the
# first 50 lines of its report to perf_report.txt. Skipped with a notice when
# perf is not installed.
perf_counter_analysis() {
    log_step "性能计数器分析"

    # Nothing to do without perf
    if ! command -v perf >/dev/null 2>&1; then
        echo -e "${YELLOW}perf工具未安装,跳过性能计数器分析${NC}"
        return 0
    fi

    echo -e "${GREEN}系统级性能分析:${NC}"

    # System-wide sampling, stopped by timeout after 10 seconds; it runs in
    # the background so the progress message appears while data is collected
    local perf_file="${INVESTIGATION_DIR}/perf_system_$(date +%H%M%S).data"
    timeout 10 perf record -a -g -o "$perf_file" >/dev/null 2>&1 &
    local perf_pid=$!

    echo -e "  正在收集性能数据(10秒)..."
    wait $perf_pid 2>/dev/null || true

    # No data file means the recording did not produce anything to report
    [ -f "$perf_file" ] || return 0
    echo -e "  性能数据已保存: $perf_file"

    # Text report, best effort
    perf report -i "$perf_file" --stdio | head -50 > "${INVESTIGATION_DIR}/perf_report.txt" 2>/dev/null || true
    echo -e "  性能报告已生成: ${INVESTIGATION_DIR}/perf_report.txt"
}

# Trace system calls of up to three processes above 20% CPU: each one is
# attached with `strace -c` for 5 seconds and the first 20 lines of the
# summary are saved to a per-process log and printed.
system_call_analysis() {
    log_step "系统调用分析"

    # Candidate PIDs: CPU above 20%, at most three
    ps -eo pid,%cpu,comm --no-headers | awk '$2 > 20.0 {print $1}' | head -3 | while read pid; do
        # Skip blank entries and processes that have already exited
        if [ -z "$pid" ] || [ ! -d "/proc/$pid" ]; then
            continue
        fi

        local process_name=$(ps -p "$pid" -o comm=)
        echo -e "${GREEN}跟踪进程 $pid ($process_name) 的系统调用:${NC}"

        local strace_file="${INVESTIGATION_DIR}/strace_${pid}_$(date +%H%M%S).log"

        # 5-second trace in the background; failures are tolerated
        timeout 5 strace -c -p "$pid" 2>&1 | head -20 > "$strace_file" 2>&1 &
        local strace_pid=$!
        wait $strace_pid 2>/dev/null || true

        # An empty file means the attach failed or nothing was captured
        if [ ! -s "$strace_file" ]; then
            echo -e "  跟踪失败或没有系统调用活动"
            continue
        fi
        cat "$strace_file"
        echo -e "  完整跟踪日志: $strace_file"
    done
}

# Write the CPU investigation report (system overview, warnings found in the
# log, recommendations, list of collected files) into the investigation
# directory and print where it was saved.
generate_report() {
    log_step "生成调查报告"

    local report_file="${INVESTIGATION_DIR}/cpu_investigation_report.txt"

    # Utilisation = 100 - idle, with idle located by its "id" label: top
    # prints "ni,100.0 id" without a space at 100% idle, which breaks
    # positional ($8) parsing. LC_ALL=C keeps "." as the decimal separator.
    local cpu_usage
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        !found && /Cpu\(s\)/ && match($0, /[0-9]+(\.[0-9]+)?[ %]*id/) {
            usage = 100 - (substr($0, RSTART, RLENGTH) + 0)
            found = 1
        }
        END { if (found) printf "%.1f", usage; else exit 1 }') || cpu_usage="N/A"

    # Warning lines from the log, with the ANSI color codes stripped so the
    # plain-text report stays readable. No match is not an error.
    local issues
    issues=$(grep -E "⚠️|ERROR|WARN" "$LOG_FILE" | sed $'s/\033\\[[0-9;]*m//g' | head -10) || true

    # Create the report first so that it appears in its own file list.
    : > "$report_file"

    # The parentheses make -type f apply to all three name patterns;
    # without them it only restricts "*.txt".
    local collected
    collected=$(find "$INVESTIGATION_DIR" -type f \( -name "*.txt" -o -name "*.log" -o -name "*.data" \) | while IFS= read -r file; do
        echo "- $(basename "$file")"
    done)

    cat > "$report_file" << EOF
CPU性能问题调查报告
===================

调查时间: $(date)
调查目录: $INVESTIGATION_DIR

📊 系统概览
-----------
主机名: $(hostname)
CPU核心数: $(nproc)
当前负载: $(uptime | awk -F'load average:' '{print $2}')
CPU使用率: ${cpu_usage}%

🔍 发现的问题
------------
${issues}

💡 建议措施
----------
1. 检查高CPU进程的合法性
2. 分析线程级别的性能问题
3. 检查应用程序配置和代码
4. 考虑系统调优参数
5. 监控系统资源使用趋势

📁 收集的文件
------------
${collected}

注意: 详细分析请查看各个日志文件
EOF

    echo -e "${GREEN}调查报告已生成:${NC} $report_file"
    echo -e "\n${CYAN}=== CPU性能深度排查完成 ===${NC}"
    echo -e "所有日志和报告保存在: $INVESTIGATION_DIR"
}

# Entry point: run every investigation stage in order, from initialisation
# to report generation.
main() {
    local stage
    for stage in \
        init_investigation \
        quick_cpu_analysis \
        detailed_process_analysis \
        thread_level_analysis \
        cpu_scheduling_analysis \
        perf_counter_analysis \
        system_call_analysis \
        generate_report; do
        "$stage"
    done
}

# Run the main function, forwarding the script's command-line arguments
main "$@"

内存性能问题深度排查

bash 复制代码
#!/bin/bash
# memory_investigation.sh - in-depth memory performance investigation script

set -euo pipefail

# Color definitions: ANSI escape sequences stored as literal backslash text;
# they are interpreted when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'
NC='\033[0m'

# Global variables: a per-run output directory (timestamped at script start)
# and the log file inside it
INVESTIGATION_DIR="/opt/performance/memory_investigation_$(date +%Y%m%d_%H%M%S)"
LOG_FILE="${INVESTIGATION_DIR}/memory_investigation.log"

# Prepare the investigation: create the output directory, mirror all further
# stdout/stderr of the script into the log file, and print the start banner.
init_investigation() {
    mkdir -p "$INVESTIGATION_DIR"

    # From here on everything written to stdout or stderr is also appended
    # to $LOG_FILE through tee
    exec > >(tee -a "$LOG_FILE") 2>&1

    local banner_line
    for banner_line in \
        "${CYAN}=== 内存性能深度排查开始 ===${NC}" \
        "调查目录: $INVESTIGATION_DIR" \
        "开始时间: $(date)"; do
        echo -e "$banner_line"
    done
}

# Announce an investigation step on stdout.
# Arguments: $1 - step title
log_step() {
    local step_title=$1
    echo -e "\n${BLUE}>>> ${step_title}${NC}"
}

# Quick memory triage: print the `free -h` summary, the used percentage
# computed as (total - available) / total with a severity verdict
# (>90% critical, >70% warning), and whether any swap is in use.
quick_memory_analysis() {
    log_step "快速内存分析"

    # Human-readable overview
    echo -e "${GREEN}内存使用概况:${NC}"
    free -h

    # Row 2 of `free -m` is the Mem row: column 2 = total, column 7 = available
    local total_mem=$(free -m | awk 'NR==2{print $2}')
    local available_mem=$(free -m | awk 'NR==2{print $7}')
    local mem_usage_percent=$(( (total_mem - available_mem) * 100 / total_mem ))

    echo -e "${GREEN}内存使用率:${NC} ${mem_usage_percent}%"

    # Verdict, checked from the healthy end
    if [ "$mem_usage_percent" -le 70 ]; then
        echo -e "${GREEN}✅ 内存使用率正常${NC}"
    elif [ "$mem_usage_percent" -le 90 ]; then
        echo -e "${YELLOW}⚠️  内存使用率超过70%,需要关注${NC}"
    else
        echo -e "${RED}⚠️  内存使用率超过90%,可能存在内存压力${NC}"
    fi

    # Row 3 is the Swap row: column 3 = used
    local swap_used=$(free -m | awk 'NR==3{print $3}')
    if [ "$swap_used" -gt 0 ]; then
        echo -e "${YELLOW}⚠️  Swap正在使用: ${swap_used}MB${NC}"
    else
        echo -e "${GREEN}✅ Swap未使用${NC}"
    fi
}

# Save a /proc/meminfo snapshot into the investigation directory and print
# the key counters in MB, the buddy allocator table (fragmentation hint) and
# the huge page counters.
detailed_memory_stats() {
    log_step "详细内存统计"

    local meminfo_file="${INVESTIGATION_DIR}/meminfo_$(date +%H%M%S).txt"
    cat /proc/meminfo > "$meminfo_file"

    # Key counters converted from kB to MB. The trailing ":" of each key is
    # removed because the output format adds its own separator. awk never
    # fails on "no match", so this cannot abort the script under set -e.
    echo -e "${GREEN}关键内存指标:${NC}"
    awk '/MemTotal|MemFree|MemAvailable|Buffers|Cached|SwapTotal|SwapFree|Slab|SReclaimable|SUnreclaim/ {
        key = $1
        sub(/:$/, "", key)
        printf "  %-20s: %8d MB\n", key, $2 / 1024
    }' "$meminfo_file"

    # Free-block counts per order and zone
    echo -e "\n${GREEN}内存碎片信息:${NC}"
    if [ -f /proc/buddyinfo ]; then
        head -5 /proc/buddyinfo
    fi

    # Huge pages. Kernels without hugepage support have no such counters; as
    # the last command of the function, a bare failing grep would make the
    # function return 1 and terminate the script under set -e.
    echo -e "\n${GREEN}大页内存信息:${NC}"
    grep -E "HugePages_Total|HugePages_Free|Hugepagesize" "$meminfo_file" || echo "  无大页内存信息"
}

# Per-process memory analysis: print the top 10 processes by memory, then for
# up to five processes above 1% memory save their smaps into the
# investigation directory and print PSS, swap usage and the first mapping
# size lines.
process_memory_analysis() {
    log_step "进程内存分析"

    # Top 10 by memory. awk limits the rows itself: cutting the stream with
    # `head` can kill `ps` with SIGPIPE, which aborts the script under
    # `set -o pipefail` on hosts with many processes.
    echo -e "${GREEN}内存占用Top 10进程:${NC}"
    ps aux --sort=-%mem | awk 'NR==1 {printf "  %-8s %-8s %-6s %-6s %-8s %s\n", "USER", "PID", "%CPU", "%MEM", "RSS", "COMMAND"}
                               NR>1 && NR<=11 {printf "  %-8s %-8s %-6.1f %-6.1f %-8s %s\n", $1, $2, $3, $4, $6, $11}'

    # Detailed view of up to five processes above 1% memory
    local pid
    while read -r pid; do
        [ -d "/proc/$pid" ] || continue
        echo -e "\n  ${CYAN}分析高内存进程 PID: $pid${NC}"

        local process_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
        echo -e "    进程名: $process_name"

        local maps_file="${INVESTIGATION_DIR}/maps_${pid}.txt"
        [ -f "/proc/$pid/smaps" ] || continue

        # smaps of other users' processes is unreadable without root, and the
        # process may exit at any time; neither should abort the whole run
        if ! cat "/proc/$pid/smaps" > "$maps_file" 2>/dev/null; then
            echo -e "    无法读取内存映射(权限不足或进程已退出)"
            continue
        fi

        # Totals are computed from the saved snapshot so that they match the
        # file on disk. Anchored patterns: "Pss" alone would also add up the
        # Swap, SwapPss and Pss_* lines, and "Swap" alone would count SwapPss.
        local pss=$(awk '/^Pss:/ {sum += $2} END {print sum + 0 " kB"}' "$maps_file")
        local swap=$(awk '/^Swap:/ {sum += $2} END {print sum + 0 " kB"}' "$maps_file")

        echo -e "    实际使用(PSS): $pss"
        echo -e "    Swap使用: $swap"

        # First 20 size-related lines of the mappings
        echo -e "    内存段分析:"
        awk '/Size|Rss|Pss|Shared/ && shown++ < 20 {print "      " $0}' "$maps_file"
    done < <(ps aux --sort=-%mem | awk 'NR>1 && $4 > 1.0 && picked++ < 5 {print $2}')
}

# Slab analysis: print the head of `slabtop` (sorted by cache size) when it
# is available, save /proc/slabinfo into the investigation directory and
# print the ten caches that occupy the most memory.
slab_memory_analysis() {
    log_step "Slab内存分析"

    echo -e "${GREEN}Slab内存使用:${NC}"
    if command -v slabtop >/dev/null 2>&1; then
        # slabtop needs read access to /proc/slabinfo (normally root only);
        # a failure here must not abort the rest of the investigation. awk
        # consumes the whole stream so slabtop is never cut off by SIGPIPE.
        slabtop -o -s c | awk 'NR <= 20' || echo -e "  ${YELLOW}slabtop执行失败(可能需要root权限)${NC}"
    else
        echo -e "  ${YELLOW}slabtop命令不可用${NC}"
    fi

    # Raw snapshot, best effort (unreadable without root)
    local slabinfo_file="${INVESTIGATION_DIR}/slabinfo_$(date +%H%M%S).txt"
    cat /proc/slabinfo > "$slabinfo_file" 2>/dev/null || true

    if [ -s "$slabinfo_file" ]; then
        echo -e "\n${GREEN}Slab占用Top 10:${NC}"
        # slabinfo columns: name active_objs num_objs objsize ...
        # Size of a cache = active_objs * objsize (bytes), shown in KB and
        # sorted by that size. The two header rows are skipped.
        awk '$1 != "slabinfo" && $1 != "#" && NF >= 4 {
                printf "%s %d\n", $1, $2 * $4 / 1024
             }' "$slabinfo_file" |
            sort -k2,2nr |
            awk 'NR <= 10 {printf "  %-20s: %8d KB\n", $1, $2}'
    fi
}

# Collect leak indicators: five used/free/available samples taken two
# seconds apart and appended to memory_trend.log, recent OOM-killer messages
# from the kernel log, and the memory pressure (PSI) figures when available.
memory_leak_detection() {
    log_step "内存泄漏检测"

    echo -e "${GREEN}检查可能的内存泄漏:${NC}"

    # Trend samples, one CSV row each: time,used,free,available (MB)
    local mem_usage_log="${INVESTIGATION_DIR}/memory_trend.log"

    local sample
    for (( sample = 1; sample <= 5; sample++ )); do
        local timestamp=$(date '+%H:%M:%S')
        local mem_info=$(free -m | awk 'NR==2{print $3 "," $4 "," $7}')
        echo "$timestamp,$mem_info" >> "$mem_usage_log"
        sleep 2
    done

    echo -e "  内存趋势已记录到: $mem_usage_log"

    # OOM-killer activity: the last five matching kernel messages are shown
    # by the pipeline inside the condition itself
    echo -e "\n${GREEN}OOM Killer记录:${NC}"
    if dmesg | grep -i "killed process" | tail -5; then
        echo -e "${RED}⚠️  检测到OOM Killer活动${NC}"
    else
        echo -e "  未发现最近的OOM Killer活动"
    fi

    # Reclaim pressure; the file only exists on kernels with PSI
    echo -e "\n${GREEN}内存回收压力:${NC}"
    if [ ! -f /proc/pressure/memory ]; then
        echo "  内核不支持内存压力监测"
    else
        cat /proc/pressure/memory
    fi
}

# Swap analysis: list the active swap areas, show three one-second vmstat
# samples, and print the cumulative paging/swapping counters from
# /proc/vmstat converted to MB.
swap_analysis() {
    log_step "交换空间分析"

    echo -e "${GREEN}交换空间使用:${NC}"
    swapon --show

    echo -e "\n${GREEN}交换活动统计:${NC}"
    if command -v vmstat >/dev/null 2>&1; then
        vmstat 1 3 | tail -3
    fi

    # Cumulative counters since boot. The units differ per counter:
    #   pgpgin/pgpgout are reported by the kernel in KiB,
    #   pswpin/pswpout count pages (page size taken from getconf).
    echo -e "\n${GREEN}页面交换统计:${NC}"
    local page_kb=$(( $(getconf PAGESIZE 2>/dev/null || echo 4096) / 1024 ))
    grep -E "pgpgin|pgpgout|pswpin|pswpout" /proc/vmstat | \
    awk -v page_kb="$page_kb" '
        $1 ~ /^pgpg/ { mb = $2 / 1024 }
        $1 ~ /^pswp/ { mb = $2 * page_kb / 1024 }
        { printf "  %-15s: %8d MB\n", $1, mb }'
}

# Write the memory investigation report (overview, warnings found in the
# log, recommendations, list of collected files) into the investigation
# directory and print where it was saved.
generate_memory_report() {
    log_step "生成内存调查报告"

    local report_file="${INVESTIGATION_DIR}/memory_investigation_report.txt"

    # Warning lines from the log, with the ANSI color codes stripped so the
    # plain-text report stays readable. No match is not an error.
    local issues
    issues=$(grep -E "⚠️|WARN" "$LOG_FILE" | sed $'s/\033\\[[0-9;]*m//g' | head -10) || true

    # Create the report first so that it appears in its own file list.
    : > "$report_file"

    # The parentheses make -type f apply to both name patterns; without
    # them it only restricts "*.txt".
    local collected
    collected=$(find "$INVESTIGATION_DIR" -type f \( -name "*.txt" -o -name "*.log" \) | while IFS= read -r file; do
        echo "- $(basename "$file")"
    done)

    cat > "$report_file" << EOF
内存性能问题调查报告
===================

调查时间: $(date)
调查目录: $INVESTIGATION_DIR

📊 内存概览
-----------
总内存: $(free -h | awk 'NR==2{print $2}')
已使用: $(free -h | awk 'NR==2{print $3}')
可用内存: $(free -h | awk 'NR==2{print $7}')
内存使用率: $(free | awk 'NR==2{printf "%.1f%%", $3/$2*100}')

🔍 发现的问题
------------
${issues}

💡 建议措施
----------
1. 检查高内存进程的合理性
2. 分析内存泄漏可能性
3. 优化应用程序内存使用
4. 考虑增加物理内存
5. 调整Swappiness参数
6. 清理Slab缓存

📊 内存优化建议
---------------
1. 定期监控内存使用趋势
2. 设置适当的内存限制
3. 优化应用程序内存分配
4. 考虑使用内存压缩
5. 配置适当的内存回收策略

📁 收集的文件
------------
${collected}

注意: 详细分析请查看各个日志文件
EOF

    echo -e "${GREEN}内存调查报告已生成:${NC} $report_file"
    echo -e "\n${CYAN}=== 内存性能深度排查完成 ===${NC}"
    echo -e "所有日志和报告保存在: $INVESTIGATION_DIR"
}

# Entry point: run every investigation stage in order, from initialisation
# to report generation.
main() {
    local stage
    for stage in \
        init_investigation \
        quick_memory_analysis \
        detailed_memory_stats \
        process_memory_analysis \
        slab_memory_analysis \
        memory_leak_detection \
        swap_analysis \
        generate_memory_report; do
        "$stage"
    done
}

# Run the main function, forwarding the script's command-line arguments
main "$@"

高级性能诊断工具

实时性能监控面板

bash 复制代码
#!/bin/bash
# performance_dashboard.sh - real-time performance monitoring dashboard

set -euo pipefail

# Color definitions: ANSI escape sequences stored as literal backslash text;
# in this script they are expanded inside printf format strings.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'

# Clear the terminal and move the cursor to the top-left corner using the
# ANSI sequences ESC[2J and ESC[H.
clear_screen() {
    printf '%b' '\033[2J\033[H'
}

# Draw the dashboard title box with the host name and the current time.
# Arguments: $1 - box width in columns (default 80)
show_header() {
    local width=${1:-80}
    local inner=$((width-2))

    # Horizontal rule: a run of "=" as wide as the inside of the box
    local rule
    printf -v rule '%*s' "$inner" ""
    rule=${rule// /=}

    printf '%b' "${PURPLE}"
    printf "╔%s╗\n" "$rule"
    printf "║%*s║\n" "$inner" ""
    printf "║   🚀 Linux实时性能监控面板 %*s║\n" $((width-40)) ""
    printf "║%*s║\n" "$inner" ""
    printf "║   主机: %-20s 时间: %-19s║\n" "$(hostname)" "$(date '+%Y-%m-%d %H:%M:%S')"
    printf "║%*s║\n" "$inner" ""
    printf "╚%s╝\n" "$rule"
    printf '%b\n' "${NC}"
}

# Draw the CPU panel: utilisation, load average, core count, total
# interrupts and context switches since boot.
# Arguments: $1 - panel width in columns
show_cpu_section() {
    local width=$1

    # Border built with parameter expansion. GNU tr works on bytes, so
    # `tr ' ' '─'` writes only the first byte of the 3-byte box character
    # and corrupts the output.
    local rule
    printf -v rule '%*s' $((width-2)) ""
    rule=${rule// /─}

    printf "${CYAN}🖥️  CPU监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # Utilisation = 100 - idle, with idle located by its "id" label: top
    # prints "ni,100.0 id" without a space at 100% idle, which breaks
    # positional ($8) parsing. Falls back to 0.0 when top cannot be parsed.
    local cpu_usage
    cpu_usage=$(LC_ALL=C top -bn1 | awk '
        !found && /Cpu\(s\)/ && match($0, /[0-9]+(\.[0-9]+)?[ %]*id/) {
            usage = 100 - (substr($0, RSTART, RLENGTH) + 0)
            found = 1
        }
        END { if (found) printf "%.1f", usage; else exit 1 }') || cpu_usage="0.0"
    printf "${BLUE}│${NC} %-15s: ${GREEN}%5.1f%%${NC} %*s ${BLUE}│${NC}\n" "使用率" "$cpu_usage" $((width-30)) ""

    # Load average and core count
    local load_avg=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | sed 's/^ *//')
    local cpu_cores=$(nproc)
    printf "${BLUE}│${NC} %-15s: ${YELLOW}%s${NC} %*s ${BLUE}│${NC}\n" "负载" "$load_avg" $((width-35-${#load_avg})) ""
    printf "${BLUE}│${NC} %-15s: ${CYAN}%d 核心${NC} %*s ${BLUE}│${NC}\n" "CPU核心" "$cpu_cores" $((width-28)) ""

    # Totals since boot from /proc/stat: the first number on the "intr" line
    # is the total interrupt count, "ctxt" is the context-switch count
    local interrupts=$(awk '/^intr/ {print $2}' /proc/stat 2>/dev/null)
    local context_switches=$(awk '/^ctxt/ {print $2}' /proc/stat 2>/dev/null)
    printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "中断" "${interrupts:-0}" $((width-28)) ""
    printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "上下文切换" "${context_switches:-0}" $((width-33)) ""

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Draw the memory panel: total/used/available memory, a colored usage bar
# and, when swap is configured, the swap usage.
# Arguments: $1 - panel width in columns
show_memory_section() {
    local width=$1

    # Border built with parameter expansion. GNU tr works on bytes, so
    # `tr ' ' '─'` writes only the first byte of the 3-byte box character
    # and corrupts the output.
    local rule
    printf -v rule '%*s' $((width-2)) ""
    rule=${rule// /─}

    printf "${CYAN}💾 内存监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # One `free` snapshot for all figures so they are mutually consistent;
    # missing values default to 0 (the "+0" in awk)
    local mem_total mem_used mem_available swap_total swap_used
    read -r mem_total mem_used mem_available swap_total swap_used < <(
        free -m | awk 'NR==2 {t=$2; u=$3; a=$7} NR==3 {st=$2; su=$3}
                       END {print t+0, u+0, a+0, st+0, su+0}')

    # Guard against a zero total (free unavailable) and keep the bar in range
    local mem_usage_percent=0
    if [ "$mem_total" -gt 0 ]; then
        mem_usage_percent=$((mem_used * 100 / mem_total))
    fi
    if [ "$mem_usage_percent" -gt 100 ]; then
        mem_usage_percent=100
    fi

    printf "${BLUE}│${NC} %-15s: ${GREEN}%'d MB${NC} %*s ${BLUE}│${NC}\n" "总内存" "$mem_total" $((width-30)) ""
    printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d MB${NC} %*s ${BLUE}│${NC}\n" "已使用" "$mem_used" $((width-30)) ""
    printf "${BLUE}│${NC} %-15s: ${GREEN}%'d MB${NC} %*s ${BLUE}│${NC}\n" "可用内存" "$mem_available" $((width-32)) ""
    printf "${BLUE}│${NC} %-15s: " "使用率"

    # Progress bar: 20 cells, filled in proportion to the usage
    local bar_width=20
    local filled=$((mem_usage_percent * bar_width / 100))
    local bar_fill bar_rest
    printf -v bar_fill '%*s' "$filled" ""
    printf -v bar_rest '%*s' $((bar_width - filled)) ""
    bar_fill=${bar_fill// /■}
    bar_rest=${bar_rest// /·}

    # Bar color by severity
    if [ "$mem_usage_percent" -gt 90 ]; then
        printf "${RED}"
    elif [ "$mem_usage_percent" -gt 70 ]; then
        printf "${YELLOW}"
    else
        printf "${GREEN}"
    fi

    printf "%3d%% [%s%s] %*s ${BLUE}│${NC}\n" "$mem_usage_percent" "$bar_fill" "$bar_rest" $((width-42)) ""

    # Swap row only when swap is configured
    if [ "$swap_total" -gt 0 ]; then
        printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d/%'d MB${NC} %*s ${BLUE}│${NC}\n" "Swap" "$swap_used" "$swap_total" $((width-40)) ""
    fi

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Draw the disk panel: up to three filesystems above 50% usage (colored by
# severity) and the utilisation of the busiest block device.
# Arguments: $1 - panel width in columns
show_disk_section() {
    local width=$1

    # Border built with parameter expansion. GNU tr works on bytes, so
    # `tr ' ' '─'` writes only the first byte of the 3-byte box character
    # and corrupts the output.
    local rule
    printf -v rule '%*s' $((width-2)) ""
    rule=${rule// /─}

    printf "${CYAN}💽 磁盘监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # Filesystems above 50%. -P keeps every filesystem on one line; awk caps
    # the list at three rows itself.
    local fs size used avail percent mount percent_num usage_color
    while read -r fs size used avail percent mount; do
        percent_num=${percent%\%}

        if [ "$percent_num" -gt 90 ]; then
            usage_color=$RED
        elif [ "$percent_num" -gt 70 ]; then
            usage_color=$YELLOW
        else
            usage_color=$GREEN
        fi

        printf "${BLUE}│${NC} %-15s: ${usage_color}%-5s %-4s %-4s %3s %*s ${BLUE}│${NC}\n" \
            "$mount" "$used" "/" "$size" "$percent" $((width-45)) ""
    done < <(df -hP | awk 'NR>1 && $5+0 > 50 && shown++ < 3 {print $1,$2,$3,$4,$5,$6}')

    # Device utilisation. %util only exists in extended (-x) output, where it
    # is the last column. The first report is the average since boot, so two
    # reports are requested and only the last one is kept (every "Device"
    # header resets the maximum). Utilisations of different devices cannot be
    # added up meaningfully, so the busiest device is shown.
    if command -v iostat >/dev/null 2>&1; then
        local io_util
        io_util=$(LC_ALL=C iostat -dx 1 2 | awk '
            /^Device/ { busiest = 0; in_report = 1; next }
            in_report && NF > 1 && $NF + 0 > busiest { busiest = $NF + 0 }
            END { printf "%.1f", busiest }') || io_util="0.0"
        printf "${BLUE}│${NC} %-15s: ${CYAN}%5.1f%%${NC} %*s ${BLUE}│${NC}\n" "IO使用率" "$io_util" $((width-30)) ""
    fi

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Draw the process panel: the five processes using the most CPU with their
# PID, memory share and command name.
# Arguments: $1 - panel width in columns
show_process_section() {
    local width=$1

    # Border built with parameter expansion. GNU tr works on bytes, so
    # `tr ' ' '─'` writes only the first byte of the 3-byte box character
    # and corrupts the output.
    local rule
    printf -v rule '%*s' $((width-2)) ""
    rule=${rule// /─}

    printf "${CYAN}🔄 进程监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # Column titles and separator
    printf "${BLUE}│${NC} ${GREEN}%-6s ${YELLOW}%-8s ${CYAN}%-6s ${NC}%-20s %*s ${BLUE}│${NC}\n" "CPU%" "PID" "MEM%" "进程名" $((width-50)) ""
    printf "${BLUE}│%s│${NC}\n" "$rule"

    # Top five rows. The padding width is handed to awk with -v instead of
    # being spliced into the program text. awk reads the whole ps output, so
    # ps is never cut off, and each row is printed verbatim (no backslash
    # interpretation of command names).
    local row
    ps aux --sort=-%cpu | awk -v pad="$((width-45))" 'NR>1 && NR<=6 {
        printf "│ %-6.1f %-8s %-6.1f %-20s %*s │\n",
        $3, $2, $4, substr($11,1,18), pad, ""
    }' | while IFS= read -r row; do
        printf "${BLUE}%s${NC}\n" "$row"
    done

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Draw the network panel: number of TCP sockets, number of established
# connections and the byte counters of the default-route interface.
# Arguments: $1 - panel width in columns
show_network_section() {
    local width=$1

    # Border built with parameter expansion. GNU tr works on bytes, so
    # `tr ' ' '─'` writes only the first byte of the 3-byte box character
    # and corrupts the output.
    local rule
    printf -v rule '%*s' $((width-2)) ""
    rule=${rule// /─}

    printf "${CYAN}🌐 网络监控${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # Socket counts. The first output line of ss is the column header and
    # must not be counted; -n avoids slow name lookups. awk always prints a
    # number, even when nothing matches.
    local tcp_connections=$(ss -t -a -n | awk 'NR>1 {total++} END {print total+0}')
    local established_connections=$(ss -t -a -n | awk 'NR>1 && $1 == "ESTAB" {total++} END {print total+0}')

    printf "${BLUE}│${NC} %-15s: ${CYAN}%'d${NC} %*s ${BLUE}│${NC}\n" "TCP连接" "$tcp_connections" $((width-28)) ""
    printf "${BLUE}│${NC} %-15s: ${GREEN}%'d${NC} %*s ${BLUE}│${NC}\n" "已建立" "$established_connections" $((width-28)) ""

    # Interface carrying the default route, with its cumulative byte counters
    local interface=$(ip route | awk '/default/ {print $5}' | head -1)
    if [ -n "$interface" ]; then
        local rx_bytes=$(cat "/sys/class/net/$interface/statistics/rx_bytes" 2>/dev/null || echo 0)
        local tx_bytes=$(cat "/sys/class/net/$interface/statistics/tx_bytes" 2>/dev/null || echo 0)

        printf "${BLUE}│${NC} %-15s: ${CYAN}%s${NC} %*s ${BLUE}│${NC}\n" "活动接口" "$interface" $((width-31)) ""
        printf "${BLUE}│${NC} %-15s: ${GREEN}%'d${NC} %*s ${BLUE}│${NC}\n" "接收字节" "$rx_bytes" $((width-31)) ""
        printf "${BLUE}│${NC} %-15s: ${YELLOW}%'d${NC} %*s ${BLUE}│${NC}\n" "发送字节" "$tx_bytes" $((width-31)) ""
    fi

    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Draw the load panel: 1-minute load average, core count and a verdict
# derived from the load per core (in percent).
# Arguments: $1 - panel width in columns
show_load_visualization() {
    local width=$1

    # Border built with parameter expansion. GNU tr works on bytes, so
    # `tr ' ' '─'` writes only the first byte of the 3-byte box character
    # and corrupts the output.
    local rule
    printf -v rule '%*s' $((width-2)) ""
    rule=${rule// /─}

    printf "${CYAN}📊 系统负载可视化${NC}\n"
    printf "${BLUE}┌%s┐${NC}\n" "$rule"

    # 1-minute load. LC_ALL=C keeps "." as the decimal separator; with a
    # decimal comma the comma-split below would cut the value apart.
    local load1=$(LC_ALL=C uptime | awk -F'load average:' '{print $2}' | awk -F, '{print $1}' | sed 's/ //g')
    local cpu_cores=$(nproc)

    printf "${BLUE}│${NC} %-20s: ${CYAN}%s${NC} %*s ${BLUE}│${NC}\n" "1分钟负载" "$load1" $((width-35)) ""
    printf "${BLUE}│${NC} %-20s: ${CYAN}%d${NC} %*s ${BLUE}│${NC}\n" "CPU核心数" "$cpu_cores" $((width-33)) ""

    # Load per core in percent (100 = one runnable task per core). Defaults
    # keep the arithmetic valid when a value could not be read.
    local load_value=$(echo "$load1" | awk '{print int($1*100)}')
    local load_percent=$(( ${load_value:-0} / ${cpu_cores:-1} ))

    printf "${BLUE}│${NC} %-20s: " "负载水平"

    if [ "$load_percent" -gt 150 ]; then
        printf "${RED}严重超载 🔴${NC}"
    elif [ "$load_percent" -gt 100 ]; then
        printf "${RED}超载 🟠${NC}"
    elif [ "$load_percent" -gt 70 ]; then
        printf "${YELLOW}较高 🟡${NC}"
    elif [ "$load_percent" -gt 30 ]; then
        printf "${GREEN}正常 🟢${NC}"
    else
        printf "${GREEN}较低 🔵${NC}"
    fi

    printf " %*s ${BLUE}│${NC}\n" $((width-40)) ""
    printf "${BLUE}└%s┘${NC}\n" "$rule"
    printf "\n"
}

# Main monitoring loop: redraw every panel every 3 seconds until interrupted
# with Ctrl+C. The panel width comes from $COLUMNS (default 80) and is never
# narrower than 80 columns.
main() {
    local refresh_rate=3
    local terminal_width=${COLUMNS:-80}

    # Enforce the minimum width
    ! [ $terminal_width -lt 80 ] || terminal_width=80

    # Leave cleanly on Ctrl+C
    trap 'clear_screen; echo -e "\n${GREEN}监控已退出${NC}"; exit 0' INT

    local panel
    while :; do
        clear_screen

        # Panels are drawn top to bottom in this order
        for panel in show_header show_cpu_section show_memory_section \
                     show_disk_section show_process_section \
                     show_network_section show_load_visualization; do
            "$panel" $terminal_width
        done

        # Footer
        printf "${PURPLE}按 Ctrl+C 退出监控 | 刷新间隔: ${refresh_rate}秒${NC}\n"

        sleep $refresh_rate
    done
}

# Run the main function, forwarding the script's command-line arguments
main "$@"

性能问题排查流程图

以下图表展示了完整的性能问题排查流程:

flowchart TD
    A[性能告警] --> B{识别问题类型}
    B -->|CPU飙高| C[CPU问题排查]
    B -->|内存飙高| D[内存问题排查]
    B -->|IO问题| E[磁盘IO排查]
    B -->|网络问题| F[网络问题排查]

    C --> C1[快速系统检查]
    C1 --> C2[进程级分析]
    C2 --> C3[线程级分析]
    C3 --> C4[性能剖析]
    C4 --> C5[系统调用分析]
    C5 --> C6[生成解决方案]

    D --> D1[内存使用分析]
    D1 --> D2[进程内存分析]
    D2 --> D3[Slab缓存分析]
    D3 --> D4[内存泄漏检测]
    D4 --> D5[交换空间分析]
    D5 --> D6[内存优化方案]

    E --> E1[磁盘使用检查]
    E1 --> E2[IO性能分析]
    E2 --> E3[文件系统检查]
    E3 --> E4[IO调度分析]
    E4 --> E5[存储优化]

    F --> F1[网络连接分析]
    F1 --> F2[带宽使用检查]
    F2 --> F3[网络错误分析]
    F3 --> F4[连接追踪]
    F4 --> F5[网络优化]

    C6 --> G[实施解决方案]
    D6 --> G
    E5 --> G
    F5 --> G

    G --> H[监控改进效果]
    H --> I{问题解决?}
    I -->|是| J[问题解决]
    I -->|否| K[重新分析]
    K --> B

    style A fill:#e74c3c,color:#fff
    style J fill:#27ae60,color:#fff
    style C fill:#3498db,color:#fff
    style D fill:#9b59b6,color:#fff
    style E fill:#e67e22,color:#fff
    style F fill:#34495e,color:#fff

应急处理和进程管理

紧急进程管理脚本

bash 复制代码
#!/bin/bash
# emergency_process_manager.sh - emergency process management script

set -euo pipefail

# Color definitions: ANSI escape sequences stored as literal backslash text;
# they are interpreted when printed with `echo -e`.
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
PURPLE='\033[0;35m'
CYAN='\033[0;36m'
NC='\033[0m'

# Print a timestamped log line and append the same line to
# /var/log/emergency_process.log.
# Arguments: $1 - level tag; remaining args - message text
log() {
    local severity=$1
    # "local x=$(...)" on purpose: a failing date must not abort the caller
    local stamp=$(date '+%Y-%m-%d %H:%M:%S')

    # ${*:2} joins every argument after the level into one message string
    echo -e "[$stamp] [$severity] ${*:2}" | tee -a "/var/log/emergency_process.log"
}

# Print the ten processes using the most CPU and the ten using the most
# memory.
show_process_list() {
    echo -e "\n${CYAN}=== 当前高资源占用进程 ===${NC}"

    # awk limits the rows itself (header + 10). Cutting the stream with
    # `head` can kill `ps` with SIGPIPE; under `set -o pipefail` the pipeline
    # then fails, and because it is the last command the function would
    # return that failure and abort the script under `set -e`.

    # Top 10 by CPU
    echo -e "${YELLOW}🖥️  CPU占用Top 10:${NC}"
    ps aux --sort=-%cpu | awk 'NR==1 {printf "  %-8s %-8s %-6s %-6s %-12s %s\n", "USER", "PID", "%CPU", "%MEM", "TIME", "COMMAND"}
                               NR>1 && NR<=11 {printf "  %-8s %-8s %-6.1f %-6.1f %-12s %s\n", $1, $2, $3, $4, $10, $11}'

    # Top 10 by memory
    echo -e "\n${YELLOW}💾 内存占用Top 10:${NC}"
    ps aux --sort=-%mem | awk 'NR==1 {printf "  %-8s %-8s %-6s %-6s %-8s %s\n", "USER", "PID", "%CPU", "%MEM", "RSS", "COMMAND"}
                               NR>1 && NR<=11 {printf "  %-8s %-8s %-6.1f %-6.1f %-8s %s\n", $1, $2, $3, $4, $6, $11}'
}

# Flag processes that exceed fixed resource thresholds.
#
# Globals:
#   RED, GREEN, CYAN, NC (read) - color escape sequences
# Outputs:
#   Every process above 50% CPU, followed by its name, thread count, start
#   time and executable path while /proc/<pid> still exists; then every
#   process above 10% memory. A green "nothing found" line is printed for a
#   category with no matches.
analyze_suspicious_processes() {
    local pid user cpu mem command

    echo -e "\n${CYAN}=== 可疑进程分析 ===${NC}"

    # PID, owner, %CPU and first command word of each process above 50% CPU
    local cpu_hogs=$(ps aux --sort=-%cpu | awk 'NR>1 && $3 > 50.0 {print $2, $1, $3, $11}')

    if [ -z "$cpu_hogs" ]; then
        echo -e "${GREEN}✅ 未发现异常高CPU占用进程${NC}"
    else
        echo -e "${RED}⚠️  发现高CPU占用进程:${NC}"
        while read pid user cpu command; do
            echo -e "  ${RED}PID: $pid, 用户: $user, CPU: ${cpu}%, 命令: $command${NC}"

            # Extra details are only available while the process is alive
            if [ -d "/proc/$pid" ]; then
                local comm_name=$(ps -p "$pid" -o comm= 2>/dev/null || echo "未知")
                # ps -L prints one line per thread plus a header line
                local ps_lines=$(ps -L -p "$pid" | wc -l)
                local started_at=$(ps -p "$pid" -o lstart= 2>/dev/null || echo "未知")

                echo -e "     进程名: $comm_name"
                echo -e "     线程数: $((ps_lines - 1))"
                echo -e "     启动时间: $started_at"

                # Resolve the binary behind the process
                local binary=$(readlink "/proc/$pid/exe" 2>/dev/null || echo "无法访问")
                echo -e "     可执行文件: $binary"
            fi
            echo ""
        done <<< "$cpu_hogs"
    fi

    # PID, owner, %MEM and first command word of each process above 10% memory
    local mem_hogs=$(ps aux --sort=-%mem | awk 'NR>1 && $4 > 10.0 {print $2, $1, $4, $11}')

    if [ -z "$mem_hogs" ]; then
        echo -e "${GREEN}✅ 未发现异常高内存占用进程${NC}"
    else
        echo -e "${RED}⚠️  发现高内存占用进程:${NC}"
        while read pid user mem command; do
            echo -e "  ${RED}PID: $pid, 用户: $user, 内存: ${mem}%, 命令: $command${NC}"
        done <<< "$mem_hogs"
    fi
}

# Terminate a process gracefully, escalating to SIGKILL if it does not exit.
#
# Globals:
#   RED, GREEN, YELLOW, NC (read) - color escape sequences
# Arguments:
#   $1 - PID of the target process (decimal digits only)
#   $2 - signal name or number to send first (default: TERM)
# Behavior:
#   Shows the target's ps summary, asks for confirmation when its command
#   name contains one of the names in critical_processes, sends the signal,
#   waits up to 10 seconds for /proc/<pid> to disappear and then falls back
#   to SIGKILL. The outcome is recorded through log().
# Returns:
#   0 when the process is gone afterwards; 1 when the PID is invalid or
#   unknown, the user declines, the signal cannot be sent, or the process
#   survives SIGKILL.
safely_kill_process() {
    local pid=${1:-}
    local signal=${2:-TERM}

    # Reject anything that is not a plain number: values such as "" or "self"
    # would otherwise pass the /proc directory test below.
    if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
        echo -e "${RED}错误: 无效的PID: $pid${NC}"
        return 1
    fi

    if [ ! -d "/proc/$pid" ]; then
        echo -e "${RED}错误: 进程 $pid 不存在${NC}"
        return 1
    fi

    local process_info
    process_info=$(ps -p "$pid" -o user,pid,%cpu,%mem,comm --no-headers 2>/dev/null) || true
    if [ -z "$process_info" ]; then
        echo -e "${RED}错误: 无法获取进程 $pid 的信息${NC}"
        return 1
    fi

    echo -e "${YELLOW}准备终止进程:${NC}"
    echo -e "  进程信息: $process_info"

    # Ask before touching anything that looks like a critical system process
    local process_name
    process_name=$(awk '{print $5}' <<< "$process_info")
    local critical_processes=("systemd" "init" "kernel" "sshd" "bash" "sh")
    local critical
    local confirm=""

    for critical in "${critical_processes[@]}"; do
        if [[ "$process_name" == *"$critical"* ]]; then
            echo -e "${RED}警告: 进程 $process_name 可能是关键系统进程,终止可能导致系统不稳定${NC}"
            # EOF on stdin counts as "no"
            read -r -p "是否继续? (y/N): " confirm || confirm=""
            if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
                echo -e "${GREEN}操作已取消${NC}"
                return 1
            fi
            break
        fi
    done

    # Send the requested signal
    echo -e "${YELLOW}发送 $signal 信号到进程 $pid...${NC}"
    if ! kill -"$signal" "$pid" 2>/dev/null; then
        echo -e "${RED}❌ 无法发送信号到进程 $pid${NC}"
        log "ERROR" "无法发送信号到进程 $pid"
        return 1
    fi
    echo -e "${GREEN}✅ 信号发送成功${NC}"

    # Wait for the process to exit
    local wait_time=0
    local max_wait=10

    while [ -d "/proc/$pid" ] && [ "$wait_time" -lt "$max_wait" ]; do
        sleep 1
        # Not ((wait_time++)): that returns status 1 when the old value is 0
        # and would terminate a script running under `set -e`.
        wait_time=$((wait_time + 1))
        echo -e "  等待进程退出... ($wait_time/$max_wait 秒)"
    done

    if [ -d "/proc/$pid" ]; then
        echo -e "${RED}进程未正常退出,发送KILL信号...${NC}"
        # The process may exit between the check and the kill; the final
        # /proc test below decides the outcome, not kill's exit status.
        kill -KILL "$pid" 2>/dev/null || true
        sleep 1
    fi

    if [ -d "/proc/$pid" ]; then
        echo -e "${RED}❌ 无法终止进程 $pid${NC}"
        log "ERROR" "无法终止进程 $pid"
        return 1
    fi

    echo -e "${GREEN}✅ 进程已成功终止${NC}"
    log "INFO" "进程 $pid ($process_name) 已被终止"
}

# Interactively select a group of processes and terminate them one by one.
#
# Globals:
#   RED, GREEN, YELLOW, CYAN, NC (read) - color escape sequences
# Input (stdin):
#   The selection mode (1-4), the matching criterion, and a y/N confirmation.
# Behavior:
#   Selects processes from `ps aux` by CPU threshold, memory threshold,
#   command-name regular expression or owner, lists them, and after
#   confirmation passes each PID to safely_kill_process. This script's own
#   shell is never selected.
# Returns:
#   0 when nothing matched, the user cancelled, or the batch ran;
#   1 on an invalid option or an empty / non-numeric criterion.
batch_kill_processes() {
    local option threshold process_name username pid user usage command
    local processes=""
    local confirm=""
    local failed=0
    local number_re='^[0-9]+([.][0-9]+)?$'

    echo -e "\n${CYAN}=== 批量进程管理 ===${NC}"

    # Choose the selection criterion
    echo -e "${YELLOW}选择终止条件:${NC}"
    echo "  1) CPU使用率 > 指定百分比"
    echo "  2) 内存使用率 > 指定百分比"
    echo "  3) 指定进程名"
    echo "  4) 指定用户"

    read -r -p "请选择选项 (1-4): " option || option=""

    # Every criterion is validated first: an empty threshold or name would
    # otherwise match (and offer to kill) every process on the system.
    case $option in
        1)
            read -r -p "请输入CPU使用率阈值 (例如: 50): " threshold || threshold=""
            if [[ ! "$threshold" =~ $number_re ]]; then
                echo -e "${RED}错误: 阈值必须是数字${NC}"
                return 1
            fi
            processes=$(ps aux --sort=-%cpu | awk -v threshold="$threshold" -v self="$$" \
                'NR>1 && $2 != self && $3 + 0 > threshold + 0 {print $2, $1, $3, $11}') || true
            ;;
        2)
            read -r -p "请输入内存使用率阈值 (例如: 10): " threshold || threshold=""
            if [[ ! "$threshold" =~ $number_re ]]; then
                echo -e "${RED}错误: 阈值必须是数字${NC}"
                return 1
            fi
            processes=$(ps aux --sort=-%mem | awk -v threshold="$threshold" -v self="$$" \
                'NR>1 && $2 != self && $4 + 0 > threshold + 0 {print $2, $1, $4, $11}') || true
            ;;
        3)
            read -r -p "请输入进程名: " process_name || process_name=""
            if [ -z "$process_name" ]; then
                echo -e "${RED}错误: 进程名不能为空${NC}"
                return 1
            fi
            # The name is matched as a regular expression against the first
            # word of the command line.
            processes=$(ps aux | awk -v name="$process_name" -v self="$$" \
                'NR>1 && $2 != self && $11 ~ name {print $2, $1, $3, $11}') || true
            ;;
        4)
            read -r -p "请输入用户名: " username || username=""
            if [ -z "$username" ]; then
                echo -e "${RED}错误: 用户名不能为空${NC}"
                return 1
            fi
            processes=$(ps aux | awk -v user="$username" -v self="$$" \
                'NR>1 && $2 != self && $1 == user {print $2, $1, $3, $11}') || true
            ;;
        *)
            echo -e "${RED}无效选项${NC}"
            return 1
            ;;
    esac

    if [ -z "$processes" ]; then
        echo -e "${GREEN}没有找到匹配的进程${NC}"
        return 0
    fi

    echo -e "\n${YELLOW}找到以下进程:${NC}"
    while read -r pid user usage command; do
        echo -e "  PID: $pid, 用户: $user, 资源: $usage%, 命令: $command"
    done <<< "$processes"

    read -r -p "是否终止这些进程? (y/N): " confirm || confirm=""
    if [[ ! "$confirm" =~ ^[Yy]$ ]]; then
        echo -e "${GREEN}操作已取消${NC}"
        return 0
    fi

    # The list is read from file descriptor 3 so stdin stays free for the
    # confirmation prompts inside safely_kill_process; a failed kill is
    # counted instead of aborting the rest of the batch.
    while read -r -u 3 pid _; do
        echo -e "\n${YELLOW}处理进程 $pid...${NC}"
        safely_kill_process "$pid" || failed=$((failed + 1))
    done 3<<< "$processes"

    if [ "$failed" -gt 0 ]; then
        echo -e "\n${YELLOW}有 $failed 个进程未能终止${NC}"
    fi
    return 0
}

# Write one value into a cgroup control file. The shell's own redirection
# error is silenced so the caller can report failures in its own words.
# Arguments: $1 - value to write, $2 - path of the control file
# Returns:   0 on success, non-zero when the file cannot be written
_cgroup_write() {
    local value=$1
    local target=$2
    { printf '%s\n' "$value" > "$target"; } 2>/dev/null
}

# Interactively cap the CPU or memory usage of a running process via cgroups.
#
# Globals:
#   RED, GREEN, YELLOW, CYAN, NC (read) - color escape sequences
# Input (stdin):
#   Menu option (1 CPU, 2 memory, 3 IO), then the PID and the limit:
#   CPU as a percentage of one core, memory in MB.
# Behavior:
#   Creates a dedicated cgroup named limit_<pid>, writes the limit and moves
#   the process into it. Both layouts are handled: the unified cgroup v2
#   hierarchy (detected by /sys/fs/cgroup/cgroup.controllers; uses cpu.max /
#   memory.max) and the legacy v1 per-controller hierarchies (uses
#   cpu.cfs_quota_us + cpu.cfs_period_us / memory.limit_in_bytes).
# Returns:
#   0 on success and after rejected user input (a message is printed);
#   1 when cgroups are unavailable or a cgroup file cannot be written.
set_process_limits() {
    local cgroup_root="/sys/fs/cgroup"
    local option pid limit controller cgroup_dir
    local applied=1

    echo -e "\n${CYAN}=== 进程资源限制 ===${NC}"

    # Make sure cgroups are available at all
    if [ ! -d "$cgroup_root" ]; then
        echo -e "${RED}错误: cgroups不可用${NC}"
        return 1
    fi

    echo -e "${YELLOW}设置进程资源限制:${NC}"
    echo "  1) CPU限制"
    echo "  2) 内存限制"
    echo "  3) IO限制"

    read -r -p "请选择选项 (1-3): " option || option=""

    case $option in
        1)
            controller="cpu"
            read -r -p "请输入要限制的PID: " pid || pid=""
            read -r -p "请输入CPU使用率限制 (例如: 50): " limit || limit=""
            ;;
        2)
            controller="memory"
            read -r -p "请输入要限制的PID: " pid || pid=""
            read -r -p "请输入内存限制 (MB): " limit || limit=""
            ;;
        3)
            echo -e "${YELLOW}IO限制功能需要更复杂的配置,建议使用systemd或手动配置cgroup${NC}"
            return 0
            ;;
        *)
            echo -e "${RED}无效选项${NC}"
            return 0
            ;;
    esac

    # Both values end up in paths and in $(( )) arithmetic, so only plain
    # decimal numbers are accepted (no leading zero: it would be read as octal).
    if [[ ! "$pid" =~ ^[0-9]+$ ]]; then
        echo -e "${RED}错误: 无效的PID: $pid${NC}"
        return 0
    fi
    if [[ ! "$limit" =~ ^[1-9][0-9]{0,8}$ ]]; then
        echo -e "${RED}错误: 限制值必须是正整数${NC}"
        return 0
    fi
    if [ ! -d "/proc/$pid" ]; then
        echo -e "${RED}错误: 进程 $pid 不存在${NC}"
        return 0
    fi

    if [ -f "$cgroup_root/cgroup.controllers" ]; then
        # cgroup v2: one unified hierarchy
        cgroup_dir="$cgroup_root/limit_$pid"
        mkdir -p "$cgroup_dir" 2>/dev/null || applied=0
        # Best effort: the controller must be enabled for child groups; this
        # write fails harmlessly when it cannot be changed from here.
        _cgroup_write "+$controller" "$cgroup_root/cgroup.subtree_control" || true
        if [ "$applied" -eq 1 ]; then
            if [ "$controller" = "cpu" ]; then
                # "<quota> <period>" in microseconds: limit% of one core
                _cgroup_write "$((limit * 1000)) 100000" "$cgroup_dir/cpu.max" || applied=0
            else
                _cgroup_write "$((limit * 1024 * 1024))" "$cgroup_dir/memory.max" || applied=0
            fi
        fi
    else
        # cgroup v1: one hierarchy per controller
        if [ ! -d "$cgroup_root/$controller" ]; then
            echo -e "${RED}错误: 未找到 $controller cgroup控制器${NC}"
            return 1
        fi
        cgroup_dir="$cgroup_root/$controller/limit_$pid"
        mkdir -p "$cgroup_dir" 2>/dev/null || applied=0
        if [ "$applied" -eq 1 ]; then
            if [ "$controller" = "cpu" ]; then
                # Percentage converted to a CFS quota over a 100ms period
                _cgroup_write "$((limit * 1000))" "$cgroup_dir/cpu.cfs_quota_us" || applied=0
                _cgroup_write "100000" "$cgroup_dir/cpu.cfs_period_us" || applied=0
            else
                _cgroup_write "$((limit * 1024 * 1024))" "$cgroup_dir/memory.limit_in_bytes" || applied=0
            fi
        fi
    fi

    # Move the process into the new cgroup
    if [ "$applied" -eq 1 ]; then
        _cgroup_write "$pid" "$cgroup_dir/cgroup.procs" || applied=0
    fi

    if [ "$applied" -ne 1 ]; then
        echo -e "${RED}错误: 无法设置进程 $pid 的资源限制 (需要root权限?)${NC}"
        return 1
    fi

    if [ "$controller" = "cpu" ]; then
        echo -e "${GREEN}✅ 已设置进程 $pid 的CPU限制为 ${limit}%${NC}"
    else
        echo -e "${GREEN}✅ 已设置进程 $pid 的内存限制为 ${limit}MB${NC}"
    fi
}

# Print tuning suggestions based on the current CPU and memory utilisation.
#
# Globals:
#   CYAN, YELLOW, NC (read) - color escape sequences
# Outputs:
#   CPU advice when overall CPU usage is above 80%, memory advice when used
#   memory is above 80% of the total, and a general checklist in every case.
#
# CPU usage is 100 minus the idle value of top's "Cpu(s)" summary line. The
# idle value is located by its "id" label instead of a fixed column, because
# top prints a fully idle system as "ni,100.0 id" (no space), which shifts
# every column. Thresholds are compared with awk, so `bc` is not required.
system_optimization_suggestions() {
    echo -e "\n${CYAN}=== 系统优化建议 ===${NC}"

    # CPU suggestions
    local cpu_usage
    # awk consumes all of top's output (no early exit), so top cannot be hit
    # by SIGPIPE under pipefail; LC_ALL=C keeps "." as the decimal separator.
    cpu_usage=$(LC_ALL=C top -bn1 2>/dev/null | awk '
        !found && /Cpu\(s\)/ {
            gsub(/,/, " ")
            for (i = 2; i <= NF; i++) {
                if ($i == "id") {
                    printf "%.1f", 100 - $(i - 1); found = 1; break
                }
                if ($i ~ /%id$/) {
                    idle = $i; sub(/%id$/, "", idle)
                    printf "%.1f", 100 - idle; found = 1; break
                }
            }
        }') || true

    if awk -v usage="$cpu_usage" 'BEGIN { exit !(usage + 0 > 80) }'; then
        echo -e "${YELLOW}🖥️  CPU优化建议:${NC}"
        echo "  • 检查并优化高CPU进程"
        echo "  • 考虑增加CPU核心或升级硬件"
        echo "  • 调整进程优先级 (nice/renice)"
        echo "  • 使用cgroups限制资源使用"
    fi

    # Memory suggestions: used / total from the second line of free's output
    local mem_usage
    mem_usage=$(LC_ALL=C free 2>/dev/null | awk 'NR==2 && $2 > 0 {printf "%.1f", $3/$2*100}') || true

    if awk -v usage="$mem_usage" 'BEGIN { exit !(usage + 0 > 80) }'; then
        echo -e "${YELLOW}💾 内存优化建议:${NC}"
        echo "  • 检查内存泄漏"
        echo "  • 优化应用程序内存使用"
        echo "  • 增加物理内存"
        echo "  • 调整swappiness参数"
        echo "  • 清理缓存: echo 3 > /proc/sys/vm/drop_caches"
    fi

    # General suggestions
    echo -e "${YELLOW}🔧 通用优化建议:${NC}"
    echo "  • 定期更新系统和软件"
    echo "  • 监控系统资源使用趋势"
    echo "  • 配置适当的监控告警"
    echo "  • 优化应用程序配置"
    echo "  • 考虑使用负载均衡"
}

# Interactive main menu: show the options, dispatch the chosen action, repeat.
#
# Globals:
#   RED, GREEN, YELLOW, BLUE, PURPLE, CYAN, NC (read) - color escape sequences
# Input (stdin):
#   The menu choice (1-7), the PID for option 3, and Enter to continue.
# Behavior:
#   Loops until the user picks option 7 (exits the script with status 0) or
#   stdin reaches end-of-file (returns 0). An action that reports failure,
#   such as a nonexistent PID, returns to the menu: the script runs under
#   `set -e`, so without the guards below any failing action would terminate
#   the whole program.
main_menu() {
    local choice pid

    while true; do
        # clear fails when TERM is unset; that must not end the menu
        clear 2>/dev/null || true
        echo -e "${PURPLE}================================${NC}"
        echo -e "${PURPLE}    Linux紧急进程管理器${NC}"
        echo -e "${PURPLE}================================${NC}"
        echo -e "${GREEN}1) 显示进程列表${NC}"
        echo -e "${GREEN}2) 分析可疑进程${NC}"
        echo -e "${YELLOW}3) 终止单个进程${NC}"
        echo -e "${YELLOW}4) 批量终止进程${NC}"
        echo -e "${BLUE}5) 设置进程资源限制${NC}"
        echo -e "${CYAN}6) 系统优化建议${NC}"
        echo -e "${RED}7) 退出${NC}"
        echo -e "${PURPLE}================================${NC}"

        # End-of-file on stdin leaves the menu instead of looping forever
        read -r -p "请选择操作 (1-7): " choice || break

        # The actions print their own error messages; `|| true` only keeps a
        # failure from propagating to `set -e`.
        case $choice in
            1) show_process_list || true ;;
            2) analyze_suspicious_processes || true ;;
            3)
                read -r -p "请输入要终止的PID: " pid || pid=""
                safely_kill_process "$pid" || true
                ;;
            4) batch_kill_processes || true ;;
            5) set_process_limits || true ;;
            6) system_optimization_suggestions || true ;;
            7)
                echo -e "${GREEN}再见!${NC}"
                exit 0
                ;;
            *)
                echo -e "${RED}无效选择,请重新输入${NC}"
                ;;
        esac

        echo -e "\n按Enter键继续..."
        read -r _ || break
    done
}

# Start the interactive main menu.
main_menu

总结

本文提供了完整的Linux服务器性能问题排查解决方案,包含:

核心排查能力

  • 实时监控: 全面的系统性能监控面板
  • 深度诊断: CPU、内存、IO、网络的详细分析
  • 进程管理: 安全的进程终止和资源限制
  • 问题定位: 从系统级别到线程级别的精准定位

实用工具集

  • 基础监控脚本: 快速获取系统状态
  • 深度排查工具: 性能计数器、系统调用分析
  • 应急处理: 紧急情况下的进程管理
  • 优化建议: 基于实际情况的调优建议

关键排查流程

  1. 快速识别: 使用监控面板快速定位问题类型
  2. 深度分析: 使用专业工具进行根本原因分析
  3. 安全处理: 在了解影响的前提下处理问题进程
  4. 持续优化: 实施长期优化方案防止问题复发

希望本文能帮助运维人员快速响应并解决性能问题。

相关推荐
我也不曾来过12 分钟前
传输层协议UDP和TCP
linux·网络·udp
molihuan2 分钟前
最新VMware Ubuntu 1分钟极速安装 植物人教程
linux·ubuntu
sdm07042713 分钟前
深刻理解进程信号
linux·运维·服务器
Simonhans17 分钟前
Linux安装Bun
linux·bun
lagrahhn24 分钟前
ES索引的基础和进阶内容
后端·elasticsearch·搜索引擎
SamDeepThinking26 分钟前
秒杀系统怎么区分真实用户和黄牛脚本?
java·后端·架构
stark张宇26 分钟前
深入Go运行时:数值溢出、浮点精度与栈堆分配决策
后端·go
fliter29 分钟前
Rust 里最让人头疼的两个类型:Pin 和 Unpin,究竟解决了什么问题?
后端
70asunflower30 分钟前
Ubuntu `tree` 命令完全指南:让目录结构一目了然
linux·数据库·ubuntu
老四啊laosi30 分钟前
【Linux系统】16. 进程程序替换
linux·exec·程序替换