服务器操作卡顿问题解决

1、目前使用服务器时发现操作有明显的卡顿感,先用常用方法从进程角度排查

bash 复制代码
# Overall view, sorted by CPU usage
ps aux --sort=-%cpu | head -20

# Sorted by memory usage
ps aux --sort=-%mem | head -20

# Per-thread view (to find the bottleneck thread of a multi-threaded program)
top -H -p <PID>  # then press P to sort by CPU, M to sort by memory

# Process tree, showing parent/child relationships
pstree -p <PID>
bash 复制代码
hanwang@k8s-master-node:~/work/ai-coding$ ps aux --sort=-%mem | head -20
USER         PID %CPU %MEM    VSZ   RSS TTY      STAT START   TIME COMMAND
libvirt+    4186 11.4 28.3 15033916 8392752 ?    Sl   3月26 406:23 /usr/bin/qemu-system-x86_64 -name guest=devstack-vm,debug-threads=on -S -object {"qom-type":"secret","id":"masterKey0","format":"raw","file":"/var/lib/libvirt/qemu/domain-1-devstack-vm/master-key.aes"} -machine pc-q35-8.2,usb=off,dump-guest-core=off,memory-backend=pc.ram,hpet=off,acpi=on -accel kvm -cpu host,migratable=on -m size=8388608k -object {"qom-type":"memory-backend-ram","id":"pc.ram","size":8589934592} -overcommit mem-lock=off -smp 4,sockets=4,cores=1,threads=1 -uuid 5e6f16c3-0a69-412e-a303-ac90d506e8f4 -display none -no-user-config -nodefaults -chardev socket,id=charmonitor,fd=32,server=on,wait=off -mon chardev=charmonitor,id=monitor,mode=control -rtc base=utc,driftfix=slew -global kvm-pit.lost_tick_policy=delay -no-shutdown -global ICH9-LPC.disable_s3=1 -global ICH9-LPC.disable_s4=1 -boot strict=on -device {"driver":"pcie-root-port","port":8,"chassis":1,"id":"pci.1","bus":"pcie.0","multifunction":true,"addr":"0x1"} -device {"driver":"pcie-root-port","port":9,"chassis":2,"id":"pci.2","bus":"pcie.0","addr":"0x1.0x1"} -device {"driver":"pcie-root-port","port":10,"chassis":3,"id":"pci.3","bus":"pcie.0","addr":"0x1.0x2"} -device {"driver":"pcie-root-port","port":11,"chassis":4,"id":"pci.4","bus":"pcie.0","addr":"0x1.0x3"} -device {"driver":"pcie-root-port","port":12,"chassis":5,"id":"pci.5","bus":"pcie.0","addr":"0x1.0x4"} -device {"driver":"pcie-root-port","port":13,"chassis":6,"id":"pci.6","bus":"pcie.0","addr":"0x1.0x5"} -device {"driver":"pcie-root-port","port":14,"chassis":7,"id":"pci.7","bus":"pcie.0","addr":"0x1.0x6"} -device {"driver":"pcie-root-port","port":15,"chassis":8,"id":"pci.8","bus":"pcie.0","addr":"0x1.0x7"} -device {"driver":"pcie-root-port","port":16,"chassis":9,"id":"pci.9","bus":"pcie.0","multifunction":true,"addr":"0x2"} -device {"driver":"pcie-root-port","port":17,"chassis":10,"id":"pci.10","bus":"pcie.0","addr":"0x2.0x1"} -device 
{"driver":"pcie-root-port","port":18,"chassis":11,"id":"pci.11","bus":"pcie.0","addr":"0x2.0x2"} -device {"driver":"pcie-root-port","port":19,"chassis":12,"id":"pci.12","bus":"pcie.0","addr":"0x2.0x3"} -device {"driver":"pcie-root-port","port":20,"chassis":13,"id":"pci.13","bus":"pcie.0","addr":"0x2.0x4"} -device {"driver":"pcie-root-port","port":21,"chassis":14,"id":"pci.14","bus":"pcie.0","addr":"0x2.0x5"} -device {"driver":"qemu-xhci","p2":15,"p3":15,"id":"usb","bus":"pci.2","addr":"0x0"} -device {"driver":"virtio-serial-pci","id":"virtio-serial0","bus":"pci.3","addr":"0x0"} -blockdev {"driver":"file","filename":"/var/lib/libvirt/images/devstack/base.img","node-name":"libvirt-3-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-3-format","read-only":true,"discard":"unmap","driver":"qcow2","file":"libvirt-3-storage","backing":null} -blockdev {"driver":"file","filename":"/var/lib/libvirt/images/devstack-vm-1.qcow2","node-name":"libvirt-2-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-2-format","read-only":false,"discard":"unmap","driver":"qcow2","file":"libvirt-2-storage","backing":"libvirt-3-format"} -device {"driver":"virtio-blk-pci","bus":"pci.4","addr":"0x0","drive":"libvirt-2-format","id":"virtio-disk0","bootindex":1} -blockdev {"driver":"file","filename":"/var/lib/libvirt/images/devstack/seed.iso","node-name":"libvirt-1-storage","auto-read-only":true,"discard":"unmap"} -blockdev {"node-name":"libvirt-1-format","read-only":true,"driver":"raw","file":"libvirt-1-storage"} -device {"driver":"ide-cd","bus":"ide.0","drive":"libvirt-1-format","id":"sata0-0-0"} -netdev {"type":"tap","fd":"30","vhost":true,"vhostfd":"36","id":"hostnet0"} -device {"driver":"virtio-net-pci","netdev":"hostnet0","id":"net0","mac":"52:54:00:f6:f2:55","bus":"pci.1","addr":"0x0"} -chardev pty,id=charserial0 -device {"driver":"isa-serial","chardev":"charserial0","id":"serial0","index":0} -chardev 
socket,id=charchannel0,fd=31,server=on,wait=off -device {"driver":"virtserialport","bus":"virtio-serial0.0","nr":1,"chardev":"charchannel0","id":"channel0","name":"org.qemu.guest_agent.0"} -audiodev {"id":"audio1","driver":"none"} -global ICH9-LPC.noreboot=off -watchdog-action reset -device {"driver":"virtio-balloon-pci","id":"balloon0","bus":"pci.5","addr":"0x0"} -object {"qom-type":"rng-random","id":"objrng0","filename":"/dev/urandom"} -device {"driver":"virtio-rng-pci","rng":"objrng0","id":"rng0","bus":"pci.6","addr":"0x0"} -sandbox on,obsolete=deny,elevateprivileges=deny,spawn=deny,resourcecontrol=deny -msg timestamp=on
hanwang     8557  0.2 15.3 14077228 4546868 ?    Sl   3月26   7:47 /usr/share/elasticsearch/jdk/bin/java -Des.networkaddress.cache.ttl=60 -Des.networkaddress.cache.negative.ttl=10 -Djava.security.manager=allow -XX:+AlwaysPreTouch -Xss1m -Djava.awt.headless=true -Dfile.encoding=UTF-8 -Djna.nosys=true -XX:-OmitStackTraceInFastThrow -Dio.netty.noUnsafe=true -Dio.netty.noKeySetOptimization=true -Dio.netty.recycler.maxCapacityPerThread=0 -Dlog4j.shutdownHookEnabled=false -Dlog4j2.disable.jmx=true -Dlog4j2.formatMsgNoLookups=true -Djava.locale.providers=SPI,COMPAT --add-opens=java.base/java.io=org.elasticsearch.preallocate -Des.cgroups.hierarchy.override=/ -XX:+UseG1GC -Djava.io.tmpdir=/tmp/elasticsearch-14664739578300384182 --add-modules=jdk.incubator.vector -XX:+HeapDumpOnOutOfMemoryError -XX:+ExitOnOutOfMemoryError -XX:HeapDumpPath=data -XX:ErrorFile=logs/hs_err_pid%p.log -Xlog:gc*,gc+age=trace,safepoint:file=logs/gc.log:utctime,level,pid,tags:filecount=32,filesize=64m -Xms3849m -Xmx3849m -XX:MaxDirectMemorySize=2018508800 -XX:G1HeapRegionSize=4m -XX:InitiatingHeapOccupancyPercent=30 -XX:G1ReservePercent=15 -Des.distribution.type=docker --module-path /usr/share/elasticsearch/lib --add-modules=jdk.net --add-modules=ALL-MODULE-PATH -m org.elasticsearch.server/org.elasticsearch.bootstrap.Elasticsearch
hanwang   625102 13.3  2.0 74796540 616464 pts/5 Tl   09:50   0:11 /usr/lib/node_modules/opencode-ai/bin/.opencode
hanwang     7571  0.9  1.9 726924 585612 ?       Ss   3月26  32:17 /usr/bin/Xvfb :1 -screen 0 15360x8640x24 -dpi 96 +extension COMPOSITE +extension DAMAGE +extension GLX +extension RANDR +extension RENDER +extension MIT-SHM +extension XFIXES +extension XTEST +iglx +render -nolisten tcp -ac -noreset -shmem
root        8339  4.0  1.8 1320308 555536 ?      Ssl  3月26 143:50 kube-apiserver --advertise-address=192.168.18.133 --allow-privileged=true --audit-log-format=json --audit-log-maxage=7 --audit-log-maxbackup=10 --audit-log-maxsize=100 --audit-log-path=/var/log/kubernetes/audit.log --audit-policy-file=/etc/kubernetes/audit-policy.yml --authorization-mode=Node,RBAC --client-ca-file=/etc/kubernetes/pki/ca.crt --enable-admission-plugins=NodeRestriction --enable-aggregator-routing=true --enable-bootstrap-token-auth=true --etcd-cafile=/etc/kubernetes/pki/etcd/ca.crt --etcd-certfile=/etc/kubernetes/pki/apiserver-etcd-client.crt --etcd-keyfile=/etc/kubernetes/pki/apiserver-etcd-client.key --etcd-servers=https://127.0.0.1:2379 --feature-gates= --kubelet-client-certificate=/etc/kubernetes/pki/apiserver-kubelet-client.crt --kubelet-client-key=/etc/kubernetes/pki/apiserver-kubelet-client.key --kubelet-preferred-address-types=InternalIP,ExternalIP,Hostname --proxy-client-cert-file=/etc/kubernetes/pki/front-proxy-client.crt --proxy-client-key-file=/etc/kubernetes/pki/front-proxy-client.key --requestheader-allowed-names=front-proxy-client --requestheader-client-ca-file=/etc/kubernetes/pki/front-proxy-ca.crt --requestheader-extra-headers-prefix=X-Remote-Extra- --requestheader-group-headers=X-Remote-Group --requestheader-username-headers=X-Remote-User --secure-port=6443 --service-account-issuer=https://kubernetes.default.svc.cluster.local --service-account-key-file=/etc/kubernetes/pki/sa.pub --service-account-signing-key-file=/etc/kubernetes/pki/sa.key --service-cluster-ip-range=10.96.0.0/22 --tls-cert-file=/etc/kubernetes/pki/apiserver.crt --tls-private-key-file=/etc/kubernetes/pki/apiserver.key
hanwang  3823945  0.4  1.6 7008400 486596 pts/4  Sl+  3月28   2:45 /home/hanwang/work/cowpaw/venv/bin/python3 /home/hanwang/work/cowpaw/venv/bin/copaw app --host 0.0.0.0
pcp         5294  0.4  1.5 2221828 469296 ?      Ssl  3月26  15:49 mysqld
pcp        13524  0.4  1.5 2353940 467044 ?      Ssl  3月26  16:50 mysqld
hanwang     7963 38.0  1.4 3689060 427180 pts/0  SLsl+ 3月26 1346:21 /usr/bin/retroarch -f
pcp         5574  0.5  1.3 2331232 388364 ?      Ssl  3月26  19:10 mysqld --max_connections=1000 --character-set-server=utf8mb4 --collation-server=utf8mb4_unicode_ci --default-authentication-plugin=mysql_native_password --tls_version=TLSv1.2,TLSv1.3 --init-file /data/application/init.sql --binlog_expire_logs_seconds=604800
hanwang  2764530  3.4  1.1 34785652 333112 ?     Sl   3月28  49:18 /home/hanwang/.qoder-server/bin/16c1dec9a3994f378353c6947b77c2dcf96e7c49/node --dns-result-order=ipv4first /home/hanwang/.qoder-server/bin/16c1dec9a3994f378353c6947b77c2dcf96e7c49/out/bootstrap-fork --type=extensionHost --transformURIs --useHostProxy=false
hanwang     6566  0.0  0.9 6587772 292244 ?      Sl   3月26   0:14 /snap/snap-store/1216/usr/bin/snap-store --gapplication-service
hanwang     4359  0.0  0.9 5169488 275436 ?      Ssl  3月26   2:57 /usr/bin/gnome-shell
root        5253  0.0  0.8 273463132 264468 ?    Ssl  3月26   0:47 /jellyfin/jellyfin
hanwang    17060  0.0  0.8 879348 253756 ?       SNl  3月26   2:42 /usr/bin/python3 /usr/bin/update-manager --no-update --no-focus-on-map
472         5575  0.3  0.8 1804476 244240 ?      Ssl  3月26  13:32 grafana server --homepath=/usr/share/grafana --config=/etc/grafana/grafana.ini --packaging=docker cfg:default.log.mode=console cfg:default.paths.data=/var/lib/grafana cfg:default.paths.logs=/var/log/grafana cfg:default.paths.plugins=/var/lib/grafana/plugins cfg:default.paths.provisioning=/etc/grafana/provisioning
hanwang  2764730  0.3  0.7 5805520 216720 ?      Sl   3月28   5:22 /home/hanwang/.qoder-server/bin/16c1dec9a3994f378353c6947b77c2dcf96e7c49/extensions/aicoding-agent/bin/x86_64_linux/Qoder start --workDir /home/hanwang/.config/Qoder/4fd368a50cab7edf1f493950ad5427e90b9ac12fca180ac55f52481aaf7b6541/SharedClientCache
root        1078  0.2  0.6 1638684 199424 ?      S<Lsl 3月26   7:45 ovs-vswitchd unix:/var/run/openvswitch/db.sock -vconsole:emer -vsyslog:err -vfile:info --mlockall --no-chdir --log-file=/var/log/openvswitch/ovs-vswitchd.log --pidfile=/var/run/openvswitch/ovs-vswitchd.pid --detach
root         424  0.0  0.6 252252 178004 ?       S<s  3月26   0:30 /usr/lib/systemd/systemd-journald

2、问题发现

进程 PID %MEM RSS (实际内存) 说明
qemu-system-x86_64 (devstack-vm) 4186 28.3% 8.0 GB KVM 虚拟机
Elasticsearch 8557 15.3% 4.3 GB 搜索引擎
opencode-ai 625102 2.0% 616 MB AI 编码助手
Xvfb 7571 1.9% 585 MB 虚拟显示服务器
kube-apiserver 8339 1.8% 555 MB K8s API 服务器
cowpaw (Python) 3823945 1.6% 486 MB Python 应用
MySQL (3个实例) 5294/13524/5574 4.3% 共 ~1.3 GB 数据库
retroarch 7963 1.4% 427 MB 游戏模拟器
qoder-server (Node) 2764530 1.1% 333 MB 代码服务器

3、内存占用排在前两位的服务(devstack 虚拟机和 Elasticsearch)目前暂时不用,考虑先关掉

bash 复制代码
sudo virsh shutdown devstack-vm  # shut the VM down to release ~8 GB; allow some time for it to finish

4、总结了一套诊断脚本

bash 复制代码
#!/bin/bash
#===============================================================================
#
#          FILE: system_diagnosis.sh
#
#         USAGE: ./system_diagnosis.sh [options]
#
#   DESCRIPTION: 服务器卡顿分层排查脚本
#                1. 快速概览层 (1分钟)
#                2. 系统资源层 (CPU/内存/磁盘/网络)
#                3. 进程服务层 (TOP进程/僵尸进程/服务状态)
#                4. 深度分析层 (IO延迟/系统调用/硬件错误)
#
#       OPTIONS: -q (快速模式), -d (深度模式), -o <file> (输出到文件)
#  REQUIREMENTS: root权限(部分功能), sysstat/iperf3等工具(可选)
#          BUGS: 报告至 hanwang@example.com
#         NOTES: 建议在卡顿期间运行,多次采样更准确
#        AUTHOR: Han Wang, Cloud Platform Architect
#       VERSION: 2.0
#       CREATED: 2026-03-29
#===============================================================================

set -o pipefail

# ANSI colour escape sequences, stored as literal backslash text; they only
# turn into real escapes when a caller prints them through `echo -e`.
NC='\033[0m'
RED='\033[0;31m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'
BLUE='\033[0;34m'
CYAN='\033[0;36m'

# Run-time settings shared by the functions below.
REPORT_FILE=             # output file chosen with -o; empty means terminal only
QUICK_MODE=false         # switched to true by -q
DEEP_MODE=false          # switched to true by -d
SAMPLE_INTERVAL=1        # seconds handed to mpstat/iostat
SAMPLE_COUNT=3           # NOTE(review): not referenced anywhere in the visible script
START_TIME="$(date +%s)" # epoch seconds, used to report the total run time

# Logging helpers.
#
# log_info: print "$1" on stdout behind a green "[INFO]" tag.
# Globals: GREEN, NC (read). Backslash escapes in the tag and in the message
# are expanded, exactly as `echo -e` would.
log_info() {
    printf '%b\n' "${GREEN}[INFO]${NC} $1"
}

# log_warn: print "$1" on stdout behind a yellow "[WARN]" tag.
# Globals: YELLOW, NC (read). Backslash escapes are expanded as by `echo -e`.
log_warn() {
    printf '%b\n' "${YELLOW}[WARN]${NC} $1"
}

# log_error: print "$1" on stdout behind a red "[ERROR]" tag.
# Globals: RED, NC (read). Backslash escapes are expanded as by `echo -e`.
log_error() {
    printf '%b\n' "${RED}[ERROR]${NC} $1"
}

# log_section: print "$1" as a section title framed by two cyan rules, with one
# blank line before the frame and one after it.
# Globals: CYAN, NC (read).
log_section() {
    local rule="${CYAN}========================================${NC}"

    printf '\n%b\n%b\n%b\n\n' "$rule" "${CYAN}  $1${NC}" "$rule"
}

# Warn, without aborting, when the script is not running as root.
# Globals: EUID (read).
# Outputs: two log_warn messages followed by a blank line when EUID is
#          non-zero; nothing when it is zero.
check_root() {
    if (( EUID == 0 )); then
        return 0
    fi

    log_warn "当前非root用户,部分诊断信息可能无法获取"
    log_warn "建议: sudo ./system_diagnosis.sh"
    echo ""
}

# Report which of the optional helper tools are not installed.
# Looks up vmstat, iostat, mpstat, pidstat and ss with `command -v`; when any
# are missing it prints their names and an install hint and carries on (the
# later checks skip or degrade when a tool is absent).
# Outputs: nothing when every tool is found.
check_dependencies() {
    local tool
    local -a missing_tools=()

    for tool in vmstat iostat mpstat pidstat ss; do
        if ! command -v "$tool" > /dev/null 2>&1; then
            missing_tools+=("$tool")
        fi
    done

    if (( ${#missing_tools[@]} > 0 )); then
        log_warn "缺少工具: ${missing_tools[*]}"
        # vmstat is shipped by procps, iostat/mpstat/pidstat by sysstat,
        # ss by iproute2.
        log_info "安装: sudo apt-get install -y sysstat iproute2 procps"
        echo ""
    fi
}

# Print the report banner followed by basic host facts.
# Outputs: the ASCII-art banner, then start time, hostname, OS description,
#          kernel release and machine architecture, each through log_info.
print_header() {
    local os_name

    cat << 'EOF'
    ____  _   _ _____    _     _       _             
   / ___|| | | | ____|  | |   (_)_ __ | |_ ___  _ __ 
   \___ \| |_| |  _|    | |   | | '_ \| __/ _ \| '__|
    ___) |  _  | |___   | |___| | | | | || (_) | |   
   |____/|_| |_|_____|  |_____|_|_| |_|\__\___/|_|   
                                                     
   服务器卡顿分层排查脚本 v2.0
   Author: Han Wang | Cloud Platform Architect
EOF
    echo ""

    log_info "开始时间: $(date '+%Y-%m-%d %H:%M:%S')"
    log_info "主机名: $(hostname)"

    # lsb_release first; if it fails, fall back to PRETTY_NAME in /etc/os-release.
    os_name=$(lsb_release -ds 2>/dev/null || cat /etc/os-release | grep PRETTY_NAME | cut -d'"' -f2)
    log_info "系统: ${os_name}"

    log_info "内核: $(uname -r)"
    log_info "架构: $(uname -m)"
    echo ""
}

#===============================================================================
# Layer 1: quick overview (one-minute health check)
#===============================================================================
# Prints the load average, memory/swap usage and per-filesystem disk usage and
# flags each against fixed thresholds:
#   load   > cores      -> warn,  > cores*2 -> error
#   memory > 70 %       -> warn,  > 90 %    -> error
#   swap   in use       -> warn,  > 50 %    -> error
#   disk   > 80 %       -> highlighted, > 90 % -> highlighted as critical
#
# uptime, free and df are run with LC_ALL=C because their labels ("Mem:",
# "load average:") and decimal separator are parsed below and change with the
# user's locale. All arithmetic is done in awk/bash, so no `bc` is needed.
# Globals: BLUE, RED, YELLOW, NC (read).
layer1_quick_overview() {
    local uptime_info cpu_cores load_threshold load_level load_1min
    local mem_report mem_level mem_usage swap_level swap_usage
    local line usage

    log_section "第一层:快速概览 (1分钟健康检查)"

    # ---- load average ----
    echo -e "${BLUE}[系统负载]${NC}"
    uptime_info=$(LC_ALL=C uptime)
    echo "  $uptime_info"

    cpu_cores=$(nproc 2>/dev/null)
    [[ $cpu_cores =~ ^[0-9]+$ ]] || cpu_cores=1
    load_threshold=$((cpu_cores * 2))

    # The level is printed first so that an unparsable load leaves only the
    # second field empty.
    read -r load_level load_1min < <(
        awk -F'load average:' -v cores="$cpu_cores" '{
            split($2, avg, ",")
            gsub(/ /, "", avg[1])
            load = avg[1] + 0
            level = (load > cores * 2) ? "error" : ((load > cores) ? "warn" : "ok")
            print level, avg[1]
        }' <<< "$uptime_info"
    )

    case $load_level in
        error) log_error "1分钟负载 $load_1min 超过阈值 $load_threshold (CPU核数×2),系统过载!" ;;
        warn)  log_warn "1分钟负载 $load_1min 超过CPU核数 $cpu_cores,负载较高" ;;
        *)     log_info "负载正常: $load_1min / $cpu_cores 核" ;;
    esac
    echo ""

    # ---- memory ----
    echo -e "${BLUE}[内存概览]${NC}"
    LC_ALL=C free -h | awk '/^(Mem|Swap):/ { print "  " $0 }'

    mem_report=$(LC_ALL=C free)
    read -r mem_level mem_usage < <(
        awk '/^Mem:/ && $2 > 0 {
            pct = $3 * 100 / $2
            level = (pct > 90) ? "error" : ((pct > 70) ? "warn" : "ok")
            printf "%s %.1f\n", level, pct
        }' <<< "$mem_report"
    )

    case $mem_level in
        error) log_error "内存使用率 ${mem_usage}%,严重不足!" ;;
        warn)  log_warn "内存使用率 ${mem_usage}%,压力较大" ;;
        ok)    log_info "内存使用率 ${mem_usage}%,状态良好" ;;
        *)     log_warn "无法解析 free 输出,跳过内存使用率判断" ;;
    esac

    # ---- swap: only reported when swap exists and is in use ----
    read -r swap_level swap_usage < <(
        awk '/^Swap:/ && $2 > 0 && $3 > 0 {
            pct = $3 * 100 / $2
            level = (pct > 50) ? "error" : "warn"
            printf "%s %.1f\n", level, pct
        }' <<< "$mem_report"
    )

    case $swap_level in
        error) log_error "Swap使用率 ${swap_usage}%,频繁换页导致卡顿!" ;;
        warn)  log_warn "Swap使用率 ${swap_usage}%,注意内存压力" ;;
    esac
    echo ""

    # ---- disks ----
    # -P keeps every filesystem on a single line, so field 5 is always Use%.
    echo -e "${BLUE}[磁盘概览]${NC}"
    while IFS= read -r line; do
        read -r _ _ _ _ usage _ <<< "$line"
        usage=${usage%\%}
        if [[ $usage =~ ^[0-9]+$ ]] && (( usage > 90 )); then
            echo -e "  ${RED}[警告]${NC} $line"
        elif [[ $usage =~ ^[0-9]+$ ]] && (( usage > 80 )); then
            echo -e "  ${YELLOW}[注意]${NC} $line"
        else
            echo "  $line"
        fi
    done < <(LC_ALL=C df -hP 2>/dev/null | grep -E '^/dev')
    echo ""
}

#===============================================================================
# Layer 2: detailed system-resource checks (CPU / memory / disk I/O / network)
#===============================================================================
# Globals: BLUE, NC, SAMPLE_INTERVAL (read).
#
# Parsing notes:
#   * sysstat tools run under LC_ALL=C so timestamps and decimals have one
#     fixed shape; columns are located by header name, not by position.
#   * mpstat: per-core busy time is 100 - %idle; %steal comes from the "all"
#     row of the same sample.
#   * iostat: two reports are requested and only the last one is used, because
#     the first report holds averages since boot.
#   * /proc/diskstats: average latency is ms spent / I/Os completed, i.e.
#     field 7 / field 4 for reads and field 11 / field 8 for writes.
#   * ss prints TCP states with a hyphen (CLOSE-WAIT, TIME-WAIT).
layer2_system_resources() {
    local interval=${SAMPLE_INTERVAL:-1}
    local mpstat_out core_hot steal_time
    local dirty oom_lines
    local iostat_out disk_util
    local tcp_states close_wait time_wait

    log_section "第二层:系统资源层详细检查"

    # ---- CPU ----
    echo -e "${BLUE}[CPU 详细分析]${NC}"
    # nproc counts the processing units available, i.e. logical CPUs.
    echo "  逻辑核数: $(nproc)"
    echo "  型号: $(awk '/^model name/ { sub(/^[^:]*:[ \t]*/, ""); print; exit }' /proc/cpuinfo 2>/dev/null)"

    if command -v mpstat &> /dev/null; then
        mpstat_out=$(LC_ALL=C mpstat -P ALL "$interval" 1 2>/dev/null)
        echo ""
        echo "  各核负载分布 (mpstat -P ALL):"
        tail -n +4 <<< "$mpstat_out" | head -n 20 | sed 's/^/    /'

        # Emits "<1|0> <steal>": whether any single core is more than 80 % busy,
        # and the %steal value of the "all" row (empty when not reported).
        read -r core_hot steal_time < <(
            awk '
                /%idle/ {
                    cpu_col = 0; idle_col = 0; steal_col = 0
                    for (i = 1; i <= NF; i++) {
                        if ($i == "CPU") cpu_col = i
                        else if ($i == "%idle") idle_col = i
                        else if ($i == "%steal") steal_col = i
                    }
                    next
                }
                cpu_col && idle_col && NF >= idle_col {
                    if ($cpu_col == "all") {
                        if (steal_col) steal = $steal_col
                    } else if ($cpu_col ~ /^[0-9]+$/) {
                        busy = 100 - $idle_col
                        if (busy > max) max = busy
                    }
                }
                END {
                    hot = (max > 80) ? 1 : 0
                    printf "%d %s\n", hot, steal
                }
            ' <<< "$mpstat_out"
        )

        if [[ $core_hot == 1 ]]; then
            log_warn "检测到单核使用率超过80%,可能存在单线程瓶颈"
        fi
    fi

    # Steal time (relevant when this host is itself a virtual machine).
    # Falls back to top's summary line when mpstat gave no value.
    if [[ -z $steal_time ]]; then
        steal_time=$(LC_ALL=C top -bn1 2>/dev/null |
            sed -n '/Cpu(s)/ s/.*[ ,]\([0-9][0-9.]*\) *st.*/\1/p' | head -1)
    fi
    if [[ -n $steal_time ]] && awk -v v="$steal_time" 'BEGIN { exit !(v + 0 > 5) }'; then
        log_error "CPU Steal Time: ${steal_time}%,宿主机超售严重!"
    fi
    echo ""

    # ---- memory pressure ----
    echo -e "${BLUE}[内存压力检测]${NC}"
    echo "  关键指标:"
    awk '
        BEGIN { n = split("MemTotal MemFree MemAvailable Buffers Cached Active Inactive Dirty Writeback AnonPages", want, " ") }
        { key = $1; sub(/:$/, "", key); seen[key] = $2 " " $3 }
        END {
            for (i = 1; i <= n; i++)
                if (want[i] in seen) print "    " want[i] ": " seen[want[i]]
        }
    ' /proc/meminfo 2>/dev/null

    dirty=$(awk '$1 == "Dirty:" { print $2 }' /proc/meminfo 2>/dev/null)
    if [[ $dirty =~ ^[0-9]+$ ]] && (( dirty > 102400 )); then  # > 100 MB
        log_warn "Dirty内存 ${dirty}KB,大量数据待写入磁盘,即将触发IO风暴"
    fi

    # OOM history: any kernel line mentioning "out of memory" counts.
    oom_lines=$(dmesg 2>/dev/null | grep -i "out of memory" | tail -5)
    if [[ -n $oom_lines ]]; then
        log_error "检测到历史 OOM 事件!"
        echo "  最近5条:"
        sed 's/^/    /' <<< "$oom_lines"
    fi
    echo ""

    # ---- disk I/O ----
    echo -e "${BLUE}[磁盘 IO 分析]${NC}"
    if command -v iostat &> /dev/null; then
        # Keep only the text from the last "avg-cpu" header onwards.
        iostat_out=$(LC_ALL=C iostat -xz "$interval" 2 2>/dev/null |
            awk '/^avg-cpu/ { buf = "" } { buf = buf $0 "\n" } END { printf "%s", buf }')
        echo "  IO统计 (iostat -xz):"
        head -n 20 <<< "$iostat_out" | sed 's/^/    /'

        disk_util=$(awk '
            /^Device/ {
                util_col = 0
                for (i = 1; i <= NF; i++) if ($i == "%util") util_col = i
                next
            }
            util_col && /^(nvme|sd|vd)/ {
                if (!seen || $util_col + 0 > max) max = $util_col + 0
                seen = 1
            }
            END { if (seen) printf "%.2f\n", max }
        ' <<< "$iostat_out")

        if [[ -n $disk_util ]]; then
            if awk -v v="$disk_util" 'BEGIN { exit !(v + 0 > 90) }'; then
                log_error "磁盘利用率 ${disk_util}%,IO 饱和!"
            elif awk -v v="$disk_util" 'BEGIN { exit !(v + 0 > 70) }'; then
                log_warn "磁盘利用率 ${disk_util}%,IO 压力较大"
            fi
        fi
    else
        log_warn "未安装 sysstat,无法获取 IO 统计"
    fi

    # Average latency since boot, whole disks only.
    echo ""
    echo "  磁盘延迟估算 (读取/写入 ms):"
    awk '
        $3 ~ /^(nvme[0-9]+n[0-9]+|sd[a-z]+|vd[a-z]+)$/ {
            rd = ($4 > 0) ? sprintf("%.2f", $7 / $4) : "N/A"
            wr = ($8 > 0) ? sprintf("%.2f", $11 / $8) : "N/A"
            printf "    %s: 读延迟=%sms, 写延迟=%sms\n", $3, rd, wr
        }
    ' /proc/diskstats 2>/dev/null
    echo ""

    # ---- network ----
    echo -e "${BLUE}[网络连接分析]${NC}"
    echo "  连接状态统计:"
    ss -s 2>/dev/null | sed 's/^/    /'

    echo ""
    echo "  TCP 状态分布:"
    tcp_states=$(ss -ant 2>/dev/null |
        awk 'NR > 1 { ++n[$1] } END { for (s in n) print s, n[s] }')
    if [[ -n $tcp_states ]]; then
        sort -k2,2nr <<< "$tcp_states" | sed 's/^/    /'
    fi

    close_wait=$(awk '$1 == "CLOSE-WAIT" { print $2 }' <<< "$tcp_states")
    time_wait=$(awk '$1 == "TIME-WAIT" { print $2 }' <<< "$tcp_states")

    if (( ${close_wait:-0} > 100 )); then
        log_error "CLOSE_WAIT 连接数 $close_wait,存在连接泄漏!"
    fi

    if (( ${time_wait:-0} > 10000 )); then
        log_warn "TIME_WAIT 连接数 $time_wait,端口可能耗尽"
    fi
    echo ""
}

#===============================================================================
# Layer 3: process and service checks
#===============================================================================

# Print the ten processes that `ps aux` ranks highest by the given sort key,
# numbered 1..10, colouring a row red/yellow when its value exceeds the
# high/medium threshold.
# Arguments: $1 - ps sort key without the leading "-%" (cpu or mem)
#            $2 - 1-based column of `ps aux` holding that value
#            $3 - high threshold, $4 - medium threshold
# Globals:   RED, YELLOW, NC (read)
_layer3_top_processes() {
    local sort_key=$1 column=$2 high=$3 medium=$4

    ps aux --sort="-%${sort_key}" | awk -v col="$column" -v hi="$high" -v mid="$medium" \
        -v red="$RED" -v yellow="$YELLOW" -v reset="$NC" '
        NR == 1 { next }   # header row
        NR > 11 { exit }
        {
            value = $col + 0
            color = (value > hi) ? red : ((value > mid) ? yellow : "")
            printf "    %s%d. %s%s\n", color, NR - 1, $0, (color == "" ? "" : reset)
        }'
}

# Report top CPU/memory consumers, zombie and D-state processes, the state of
# a fixed list of systemd units and the configured cron jobs.
# Process state is matched on the first letter of STAT, so variants such as
# "Z+" or "Ds" are counted; PID/PPID come from `ps -eo`, whose columns are
# requested explicitly.
# Globals: BLUE, RED, YELLOW, NC (read).
layer3_process_services() {
    local proc_table zombies zombie_count parent_pids d_state_procs
    local service cron_file
    local -a critical_services=("sshd" "systemd-journald" "cron" "networking" "docker" "containerd" "kubelet")

    log_section "第三层:进程服务层检查"

    # ---- top consumers ----
    echo -e "${BLUE}[TOP 资源消耗进程]${NC}"

    echo "  CPU 占用 TOP 10:"
    _layer3_top_processes cpu 3 50 20

    echo ""
    echo "  内存占用 TOP 10:"
    _layer3_top_processes mem 4 10 5
    echo ""

    # One snapshot serves both state checks: pid, ppid, state, command line.
    proc_table=$(ps -eo pid=,ppid=,stat=,args= 2>/dev/null)

    # ---- zombies ----
    echo -e "${BLUE}[僵尸进程检查]${NC}"
    zombies=$(awk '$3 ~ /^Z/' <<< "$proc_table")
    if [[ -n $zombies ]]; then
        zombie_count=$(awk 'END { print NR }' <<< "$zombies")
        log_error "发现 $zombie_count 个僵尸进程!"
        echo "  僵尸进程列表:"
        awk '{ print "    PID: " $1, "PPID: " $2, "CMD: " $4 }' <<< "$zombies" | head -10
        echo "  父进程:"
        parent_pids=$(awk '{ print $2 }' <<< "$zombies" | sort -un | paste -sd, -)
        ps -o pid,comm -p "$parent_pids" 2>/dev/null | sed 's/^/    /'
    else
        log_info "未发现僵尸进程"
    fi
    echo ""

    # ---- D state (uninterruptible sleep, usually waiting on I/O) ----
    echo -e "${BLUE}[D 状态进程检查 (不可中断睡眠)]${NC}"
    d_state_procs=$(awk '$3 ~ /^D/' <<< "$proc_table")
    if [[ -n $d_state_procs ]]; then
        log_error "发现 D 状态进程(IO 等待/死锁):"
        head -5 <<< "$d_state_procs" | sed 's/^/    /'
    else
        log_info "未发现 D 状态进程"
    fi
    echo ""

    # ---- systemd units ----
    echo -e "${BLUE}[关键服务状态]${NC}"
    for service in "${critical_services[@]}"; do
        if systemctl is-active --quiet "$service" 2>/dev/null; then
            echo "  [运行中] $service"
        elif systemctl is-failed --quiet "$service" 2>/dev/null; then
            echo -e "  ${RED}[失败] $service${NC}"
        else
            echo "  [未运行/未安装] $service"
        fi
    done
    echo ""

    # ---- cron ----
    # Lists what is configured; no schedule is evaluated against the clock.
    echo -e "${BLUE}[定时任务检查]${NC}"
    echo "  当前时间: $(date '+%H:%M')"
    echo "  已配置的 cron 任务 (当前用户 crontab 及 /etc/cron.d):"
    crontab -l 2>/dev/null | grep -Ev '^[[:space:]]*(#|$)' | sed 's/^/    /'
    for cron_file in /etc/cron.d/*; do
        [[ -e $cron_file ]] || continue
        echo "    $cron_file"
    done
    echo ""
}

#===============================================================================
# Layer 4: deep analysis (optional, only when DEEP_MODE is true)
#===============================================================================
# Prints PSI I/O and memory pressure, a 5-second per-process context-switch
# sample, kernel event counters, machine-check records and recent kernel
# messages that look like errors.
# Globals: DEEP_MODE, BLUE, NC (read).
# Returns: 0; prints nothing when DEEP_MODE is not "true".
layer4_deep_analysis() {
    if [[ "$DEEP_MODE" != true ]]; then
        return 0
    fi

    log_section "第四层:深度分析 (Deep Mode)"

    # ---- pressure stall information ----
    echo -e "${BLUE}[IO 压力详细分析]${NC}"
    if [[ -r /proc/pressure/io ]]; then
        echo "  PSI IO 压力统计:"
        sed 's/^/    /' /proc/pressure/io 2>/dev/null
    fi

    if [[ -r /proc/pressure/memory ]]; then
        echo ""
        echo "  PSI 内存压力统计:"
        sed 's/^/    /' /proc/pressure/memory 2>/dev/null
    fi
    echo ""

    # ---- per-process sampling ----
    # pidstat has no system-call view; -w reports voluntary and involuntary
    # context switches per task, so the heading says exactly that.
    echo -e "${BLUE}[进程上下文切换采样 (pidstat -w)]${NC}"
    if command -v pidstat &> /dev/null; then
        echo "  采样 5 秒,查看各进程上下文切换频率..."
        LC_ALL=C pidstat -w 1 5 2>/dev/null | tail -20 | sed 's/^/    /'
    fi
    echo ""

    # ---- context switches / interrupts / CPU ticks since boot ----
    # Matched case-insensitively: the tick counters are labelled "cpu ticks"
    # in lower case. LC_ALL=C keeps the labels in English.
    echo -e "${BLUE}[上下文切换统计]${NC}"
    if command -v vmstat &> /dev/null; then
        echo "  (cs: 上下文切换, in: 中断, us/sy/id/wa: CPU时间分布)"
        LC_ALL=C vmstat -s | grep -iE "(context switch|interrupt|cpu)" | sed 's/^/    /'
    fi
    echo ""

    # ---- hardware errors ----
    echo -e "${BLUE}[硬件错误检查]${NC}"
    if command -v mcelog &> /dev/null; then
        mcelog --client 2>/dev/null | head -10 | sed 's/^/    /'
    else
        log_info "未安装 mcelog,跳过硬件错误检查"
    fi

    # ---- kernel log ----
    echo ""
    echo "  最近内核错误 (dmesg):"
    dmesg -T 2>/dev/null | grep -iE "(error|fail|warn|oom|killed)" | tail -10 | sed 's/^/    /'
    echo ""
}

#===============================================================================
# Diagnostic summary
#===============================================================================
# Prints the elapsed time and a numbered list of suggestions based on a fresh
# reading of load, memory usage and process states. Items are numbered in the
# order they are emitted, so the list has no gaps.
# uptime and free run under LC_ALL=C because their labels are parsed; process
# states are matched on their first letter so "Z+", "Ds" etc. are counted.
# Globals: START_TIME, BLUE, GREEN, NC (read); END_TIME, DURATION (written).
generate_report() {
    local load cores mem_usage zombie_count d_count
    local item=0

    log_section "诊断报告总结"

    END_TIME=$(date +%s)
    DURATION=$((END_TIME - ${START_TIME:-$END_TIME}))

    echo "排查耗时: ${DURATION} 秒"
    echo ""

    echo -e "${BLUE}[优化建议]${NC}"

    load=$(LC_ALL=C uptime | awk -F'load average:' '{ split($2, avg, ","); print avg[1] + 0 }')
    cores=$(nproc 2>/dev/null)
    [[ $cores =~ ^[0-9]+$ ]] || cores=1
    mem_usage=$(LC_ALL=C free | awk '/^Mem:/ && $2 > 0 { printf "%.0f", $3 / $2 * 100 }')
    read -r zombie_count d_count < <(
        ps -eo stat= 2>/dev/null |
            awk '$1 ~ /^Z/ { z++ } $1 ~ /^D/ { d++ } END { print z + 0, d + 0 }'
    )

    if awk -v l="$load" -v c="$cores" 'BEGIN { exit !(l + 0 > c + 0) }'; then
        item=$((item + 1))
        echo "  ${item}. [高负载] 系统负载过高,建议:"
        echo "     - 检查 CPU 占用最高的进程,考虑优化或迁移"
        echo "     - 如果是多核系统,检查是否存在单线程瓶颈"
    fi

    if (( ${mem_usage:-0} > 80 )); then
        item=$((item + 1))
        echo "  ${item}. [内存不足] 内存使用率 ${mem_usage}%,建议:"
        echo "     - 找出内存占用 TOP 进程,关闭不必要的服务"
        echo "     - 检查是否有内存泄漏(RSS 持续增长的进程)"
        echo "     - 考虑增加物理内存或启用 swap(临时方案)"
    fi

    if (( ${zombie_count:-0} > 0 )); then
        item=$((item + 1))
        echo "  ${item}. [僵尸进程] 存在 $zombie_count 个僵尸进程,建议重启父进程或系统"
    fi

    if (( ${d_count:-0} > 0 )); then
        item=$((item + 1))
        echo "  ${item}. [IO 阻塞] 存在 $d_count 个 D 状态进程,建议:"
        echo "     - 检查磁盘健康状态 (smartctl)"
        echo "     - 检查是否有 NFS/iSCSI 等网络存储挂死"
    fi

    echo ""
    echo -e "${GREEN}诊断完成!如需进一步分析,请查看上述详细输出。${NC}"
}

#===============================================================================
# Entry point
#===============================================================================

# Print the command-line synopsis on stdout.
_main_usage() {
    echo "Usage: $0 [-q] [-d] [-o <output_file>]"
    echo "  -q    快速模式 (仅第一层)"
    echo "  -d    深度模式 (包含第四层)"
    echo "  -o    输出到文件"
}

# Parse the options, optionally mirror all output into a report file, then run
# the diagnostic layers in order.
# Options: -q layer 1 only; -d also run layer 4; -o FILE append output to FILE;
#          -h show usage.
# Globals: QUICK_MODE, DEEP_MODE, REPORT_FILE (written).
# Exits:   0 after -h; 2 on an unknown option or a missing option argument;
#          1 when the report file cannot be written.
main() {
    local opt
    local OPTIND=1   # let main be called more than once in the same shell

    # Leading ':' selects silent mode so the two error cases can be told apart.
    while getopts ":qdo:h" opt; do
        case "$opt" in
            q) QUICK_MODE=true ;;
            d) DEEP_MODE=true ;;
            o) REPORT_FILE="$OPTARG" ;;
            h)
                _main_usage
                exit 0
                ;;
            :)
                echo "错误: 选项 -$OPTARG 需要参数" >&2
                _main_usage >&2
                exit 2
                ;;
            *)
                echo "错误: 未知选项 -$OPTARG" >&2
                _main_usage >&2
                exit 2
                ;;
        esac
    done

    # Quick mode runs layer 1 only, so a simultaneous -d has no effect.
    if [[ "$QUICK_MODE" == true && "$DEEP_MODE" == true ]]; then
        log_warn "同时指定了 -q 和 -d: 快速模式优先,深度分析将被跳过"
    fi

    # Mirror stdout and stderr into the report file while still showing them.
    if [[ -n "$REPORT_FILE" ]]; then
        if ! : >> "$REPORT_FILE"; then
            log_error "无法写入报告文件: $REPORT_FILE"
            exit 1
        fi
        exec > >(tee -a "$REPORT_FILE") 2>&1
    fi

    check_root
    check_dependencies
    print_header

    layer1_quick_overview

    if [[ "$QUICK_MODE" != true ]]; then
        layer2_system_resources
        layer3_process_services
        layer4_deep_analysis
    fi

    generate_report
}

# Script entry point: hand every command-line argument to main.
main "$@"

执行

复制代码
bash system_diagnosis.sh
相关推荐
vortex52 天前
进程管理器大横评:从 PM2 到 Systemd 的选型与实战
linux·shell·进程管理
Irene19914 天前
Shell 相关基础入门,在 Ubuntu 与 CentOS Shell 中的语法差异总结(bash、dash、sh)
shell
小肝一下4 天前
5. 基础IO
android·linux·shell·基础io·操作系统底层·伊涅夫·伊雷娜
红茶要加冰6 天前
七、正则表达式
linux·运维·正则表达式·shell
lifewange6 天前
WSL安装问题解决
shell
AdCj36 天前
放弃第三方框架,用系统自带工具玩转 Shell 测试
shell·测试
红茶要加冰6 天前
九、文本处理三剑客——sed
linux·运维·服务器·正则表达式·shell
红茶要加冰7 天前
五、流程控制之循环
linux·运维·shell
红茶要加冰7 天前
二、shell中的变量
linux·运维·shell
Irene19917 天前
大数据开发(Hadoop/Spark 生态)在 Ubuntu 环境下:5 个高频率使用的功能性 Shell 脚本
shell