Linux进程调度与多核CPU深度解析——从内核调度器到实战优化

前言

做嵌入式Linux开发这些年，对进程调度这块一直是既熟悉又陌生。熟悉是因为天天跟进程线程打交道，陌生是因为很少深入去研究内核到底是怎么调度的。

直到去年做一个多核ARM平台的实时性优化项目，被逼着把Linux调度器的代码翻了个遍，才真正理解了这套机制。今天把这些内容整理出来，从最基本的进程切换讲到多核调度、从CFS调度器讲到实时调度策略。

进程与线程基础

从内核视角看进程

在Linux内核里，进程和线程都是用task_struct结构体表示的，内核并不严格区分它们：

c 复制代码

/**
 * Simplified view of struct task_struct, the kernel's per-task descriptor.
 * Both processes and threads are represented by this one structure.
 * The real definition lives in include/linux/sched.h and is far larger
 * (the original author cites 600+ fields); only fields relevant to the
 * scheduling discussion are shown here.
 */
struct task_struct {
    /* Scheduling */
    volatile long state;            // task state (see the TASK_* flags)
    int prio;                       // dynamic priority
    int static_prio;                // static priority
    int normal_prio;                // normal priority
    unsigned int rt_priority;       // real-time priority
    const struct sched_class *sched_class;  // scheduling class in charge of this task
    struct sched_entity se;         // CFS scheduling entity
    struct sched_rt_entity rt;      // real-time scheduling entity
    unsigned int policy;            // scheduling policy
    int nr_cpus_allowed;            // number of CPUs the task may run on
    cpumask_t cpus_allowed;         // CPU affinity mask
    
    /* Process relationships */
    struct task_struct *parent;     // parent task
    struct list_head children;      // list of children
    struct list_head sibling;       // linkage in the parent's children list
    struct task_struct *group_leader;   // thread-group leader
    
    /* Memory management */
    struct mm_struct *mm;           // memory descriptor (shared between threads)
    struct mm_struct *active_mm;
    
    /* File system */
    struct fs_struct *fs;           // file-system information
    struct files_struct *files;     // open files
    
    /* Signal handling */
    struct signal_struct *signal;
    struct sighand_struct *sighand;
    
    /* CPU context */
    struct thread_struct thread;    // architecture-specific CPU state
    
    /* Identity */
    pid_t pid;                      // task ID
    pid_t tgid;                     // thread-group ID (pid of the main thread)
    
    /* ... many more fields omitted */
};

进程状态

复制代码

                    ┌─────────────────────────────────────────────────────┐
                    │              Linux进程状态转换图                      │
                    └─────────────────────────────────────────────────────┘

                                    fork()
                                      │
                                      ▼
                              ┌───────────────┐
                              │  TASK_NEW     │
                              │   (新建)      │
                              └───────┬───────┘
                                      │ 初始化完成
                                      ▼
         ┌────────────────────────────────────────────────────────┐
         │                                                        │
         │                  ┌───────────────┐                     │
         │    ┌────────────→│TASK_RUNNING   │←────────────┐       │
         │    │             │  (就绪/运行)   │             │       │
         │    │             └───────┬───────┘             │       │
         │    │                     │                     │       │
         │    │            被调度执行│                     │       │
         │    │                     ▼                     │       │
         │    │             ┌───────────────┐             │       │
         │    │             │   CPU执行中   │             │       │
         │    │             └───────┬───────┘             │       │
         │    │                     │                     │       │
         │    │         ┌───────────┼───────────┐         │       │
         │    │         │           │           │         │       │
         │    │    时间片用完   等待I/O     等待信号    唤醒│       │
         │    │         │           │           │         │       │
         │    │         ▼           ▼           ▼         │       │
         │    │   ┌─────────┐ ┌───────────┐ ┌─────────┐  │       │
         │    └───│ 重新    │ │TASK_      │ │TASK_    │──┘       │
         │        │ 排队    │ │INTERRUPTIB│ │UNINTERRU│          │
         │        └─────────┘ │LE(可中断) │ │PTIBLE   │          │
         │                    └─────┬─────┘ └────┬────┘          │
         │                          │            │               │
         │                     I/O完成       特定事件             │
         │                          │            │               │
         │                          └─────┬──────┘               │
         │                                │                      │
         │                                ▼                      │
         │                        ┌───────────────┐              │
         │                        │  回到就绪队列  │──────────────┘
         │                        └───────────────┘
         │
         │     exit()
         │        │
         │        ▼
         │  ┌───────────────┐         ┌───────────────┐
         └─→│ TASK_DEAD     │────────→│   TASK_ZOMBIE │
            │  (退出中)      │         │    (僵尸)     │
            └───────────────┘         └───────┬───────┘
                                              │ 父进程wait()
                                              ▼
                                        ┌───────────┐
                                        │  回收资源  │
                                        └───────────┘

c 复制代码

/**
 * Task state flags stored in task_struct::state.
 * NOTE(review): the numeric values differ between kernel versions
 * (e.g. TASK_PARKED / TASK_DEAD were renumbered upstream) - confirm
 * against the kernel tree you are actually reading.
 */
#define TASK_RUNNING            0x0000  // runnable or currently running
#define TASK_INTERRUPTIBLE      0x0001  // interruptible sleep
#define TASK_UNINTERRUPTIBLE    0x0002  // uninterruptible sleep
#define __TASK_STOPPED          0x0004  // stopped
#define __TASK_TRACED           0x0008  // being traced (debugger)
#define EXIT_DEAD               0x0010  // final death
#define EXIT_ZOMBIE             0x0020  // zombie
#define TASK_DEAD               0x0040  // dead
#define TASK_WAKEKILL           0x0080
#define TASK_WAKING             0x0100
#define TASK_PARKED             0x0200
#define TASK_NOLOAD             0x0400
#define TASK_NEW                0x0800

/**
 * 用户空间查看进程状态
 * /proc/<pid>/status 中的 State 字段
 */
// R (Running)      - 运行或就绪
// S (Sleeping)     - 可中断睡眠
// D (Disk Sleep)   - 不可中断睡眠
// T (Stopped)      - 停止
// Z (Zombie)       - 僵尸
// X (Dead)         - 死亡

进程 vs 线程

c 复制代码

/**
 * fork() vs pthread_create(): the essential difference.
 *
 * Both end up in the clone() system call; only the flags differ.
 */

// fork() creates a process - almost every resource is copied.
// Roughly equivalent to: clone(SIGCHLD, 0)
pid_t pid = fork();

// pthread_create() creates a thread - most resources are shared.
// Roughly equivalent to: clone(CLONE_VM | CLONE_FS | CLONE_FILES | CLONE_SIGHAND |
//                              CLONE_THREAD | CLONE_SYSVSEM | CLONE_SETTLS |
//                              CLONE_PARENT_SETTID | CLONE_CHILD_CLEARTID, 0)

/**
 * Meaning of the clone flags used above.
 */
#define CLONE_VM        0x00000100  // share the address space
#define CLONE_FS        0x00000200  // share file-system information
#define CLONE_FILES     0x00000400  // share the file-descriptor table
#define CLONE_SIGHAND   0x00000800  // share signal handlers
#define CLONE_THREAD    0x00010000  // same thread group
#define CLONE_SYSVSEM   0x00040000  // share System V semaphore undo state

线程和进程的资源共享对比：

复制代码

┌─────────────────────────────────────────────────────────────────┐
│                    进程 vs 线程 资源对比                          │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│   资源类型          │    多进程(fork)    │   多线程(pthread)     │
│  ──────────────────┼───────────────────┼──────────────────────│
│   内存空间(mm)      │    独立(复制)      │       共享           │
│   代码段            │    独立(COW)      │       共享           │
│   数据段            │    独立(COW)      │       共享           │
│   堆               │    独立(COW)      │       共享           │
│   栈               │      独立         │       独立           │
│   文件描述符        │    独立(复制)      │       共享           │
│   信号处理          │      独立         │       共享           │
│   PID              │      不同         │       相同(TGID)     │
│   寄存器/PC         │      独立         │       独立           │
│   调度             │      独立         │       独立           │
│                                                                 │
│   COW = Copy-On-Write（写时复制）                               │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

上下文切换详解

什么触发上下文切换

c 复制代码

/**
 * 上下文切换的触发条件
 */

// 1. 时间片耗尽（抢占式调度）
//    调度器定时检查，当前进程运行时间超过分配的时间片

// 2. 主动让出CPU
//    - 调用 sleep(), usleep(), nanosleep()
//    - 调用阻塞式I/O (read, write, recv, send等)
//    - 调用 sched_yield()
//    - 等待锁 (mutex, semaphore等)
//    - 等待条件变量

// 3. 被更高优先级进程抢占
//    - 高优先级进程从睡眠中被唤醒
//    - 高优先级进程被创建

// 4. 中断处理后
//    - 从中断返回用户态时检查是否需要调度

上下文切换过程

复制代码

┌─────────────────────────────────────────────────────────────────┐
│                    上下文切换详细流程                             │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│   进程A运行中                                                    │
│       │                                                         │
│       │ ① 触发切换（时钟中断/系统调用/主动让出）                   │
│       ▼                                                         │
│   ┌─────────────────────────────────────────────┐              │
│   │  保存进程A的CPU上下文                        │              │
│   │  - 通用寄存器 (rax,rbx,rcx...)              │              │
│   │  - 程序计数器 (rip/PC)                      │              │
│   │  - 栈指针 (rsp/SP)                          │              │
│   │  - 标志寄存器 (rflags)                      │              │
│   │  - 段寄存器                                 │              │
│   │  → 保存到 A 的 task_struct->thread          │              │
│   └─────────────────────────────────────────────┘              │
│       │                                                         │
│       │ ② 调用调度器选择下一个进程                               │
│       ▼                                                         │
│   ┌─────────────────────────────────────────────┐              │
│   │  schedule() -> __schedule()                 │              │
│   │  - pick_next_task() 选择进程B               │              │
│   │  - 更新统计信息                             │              │
│   └─────────────────────────────────────────────┘              │
│       │                                                         │
│       │ ③ 切换地址空间（如果是不同进程）                         │
│       ▼                                                         │
│   ┌─────────────────────────────────────────────┐              │
│   │  switch_mm()                                │              │
│   │  - 加载进程B的页表基地址到CR3寄存器          │              │
│   │  - 刷新TLB                                  │              │
│   └─────────────────────────────────────────────┘              │
│       │                                                         │
│       │ ④ 切换内核栈和CPU上下文                                  │
│       ▼                                                         │
│   ┌─────────────────────────────────────────────┐              │
│   │  switch_to(A, B)                            │              │
│   │  - 切换内核栈指针                           │              │
│   │  - 从 B 的 task_struct->thread 恢复寄存器   │              │
│   │  - 恢复程序计数器（跳转到B的执行点）         │              │
│   └─────────────────────────────────────────────┘              │
│       │                                                         │
│       ▼                                                         │
│   进程B开始运行                                                  │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

x86-64上下文切换代码分析

c 复制代码

/**
 * 上下文切换的核心代码
 * arch/x86/kernel/process_64.c
 */

/**
 * switch_to - switch the CPU context from @prev to @next.
 *
 * Expands to a call to __switch_to_asm(prev, next) and stores its return
 * value in @last. Wrapped in do { } while (0) so it behaves as a single
 * statement.
 */
#define switch_to(prev, next, last)                     \
do {                                                    \
    ((last) = __switch_to_asm((prev), (next)));         \
} while (0)

/**
 * __switch_to_asm - 汇编实现的上下文切换
 * arch/x86/entry/entry_64.S
 */
/*
SYM_FUNC_START(__switch_to_asm)
    // 保存 prev 进程的寄存器到栈上
    pushq   %rbp
    pushq   %rbx
    pushq   %r12
    pushq   %r13
    pushq   %r14
    pushq   %r15
    
    // 切换栈指针
    // prev->thread.sp = rsp
    movq    %rsp, TASK_threadsp(%rdi)
    // rsp = next->thread.sp
    movq    TASK_threadsp(%rsi), %rsp
    
    // 恢复 next 进程的寄存器
    popq    %r15
    popq    %r14
    popq    %r13
    popq    %r12
    popq    %rbx
    popq    %rbp
    
    // 跳转到 __switch_to() 做其他清理工作
    jmp     __switch_to
SYM_FUNC_END(__switch_to_asm)
*/

/**
 * __switch_to - C half of the x86-64 context switch (simplified excerpt).
 * @prev_p: task being switched out
 * @next_p: task being switched in
 *
 * Reached via a jump from __switch_to_asm once the stack pointer has been
 * switched. Handles the per-task CPU state that is not saved on the stack:
 * FPU, debug registers, TLS descriptors, the per-CPU "current" pointers,
 * the I/O permission bitmap and PKRU.
 *
 * Return: @prev_p, which the switch_to() macro stores into its "last"
 * argument.
 */
__visible __notrace_funcgraph struct task_struct *
__switch_to(struct task_struct *prev_p, struct task_struct *next_p)
{
    struct thread_struct *prev = &prev_p->thread;
    struct thread_struct *next = &next_p->thread;
    /* The excerpt used `cpu` below without declaring it. */
    int cpu = smp_processor_id();

    /* FPU state of the outgoing task */
    switch_fpu_prepare(prev_p, cpu);

    /* Debug registers, only if the incoming task has TIF_DEBUG set */
    if (unlikely(test_tsk_thread_flag(next_p, TIF_DEBUG)))
        switch_to_debugreg();

    /* TLS descriptors of the incoming task */
    load_TLS(next, cpu);

    /* Publish next_p as this CPU's current task and record its stack top */
    this_cpu_write(current_task, next_p);
    this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p));

    /* I/O permission bitmap */
    switch_to_bitmap(prev, next);

    /* PKRU is only rewritten when the two tasks actually differ */
    if (prev_p->thread.pkru != next_p->thread.pkru)
        write_pkru(next_p->thread.pkru);

    return prev_p;
}

查看上下文切换统计

bash 复制代码

# 查看系统级上下文切换次数
$ cat /proc/stat | grep ctxt
ctxt 12345678901

# 查看单个进程的上下文切换
$ cat /proc/<pid>/status | grep ctxt
voluntary_ctxt_switches:	150      # 主动切换（等待I/O等）
nonvoluntary_ctxt_switches:	42   # 被动切换（时间片用完）

# 使用 vmstat 实时查看
$ vmstat 1
procs -----------memory---------- ---swap-- -----io---- -system-- ------cpu-----
 r  b   swpd   free   buff  cache   si   so    bi    bo   in   cs us sy id wa st
 1  0      0 123456  12345 234567    0    0     0     0  123  456  1  1 98  0  0
                                                              ↑
                                                          上下文切换次数/秒

# 使用 pidstat 查看进程级别
$ pidstat -w 1
Linux 5.4.0 ...
03:45:01 PM   UID       PID   cswch/s nvcswch/s  Command
03:45:02 PM     0      1234     12.00      3.00  myprocess

Linux调度器架构

调度类层次结构

Linux使用模块化的调度器设计，不同类型的进程使用不同的调度类：

复制代码

┌─────────────────────────────────────────────────────────────────┐
│                    Linux调度类优先级                             │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│   优先级高                                                       │
│      ↑                                                          │
│      │    ┌─────────────────────────────────────────────┐      │
│      │    │  stop_sched_class (最高优先级)               │      │
│      │    │  - 用于停止CPU (migration/stop任务)          │      │
│      │    │  - 不能被抢占                                │      │
│      │    └─────────────────────────────────────────────┘      │
│      │                         │                                │
│      │                         ▼                                │
│      │    ┌─────────────────────────────────────────────┐      │
│      │    │  dl_sched_class (Deadline调度类)            │      │
│      │    │  - SCHED_DEADLINE策略                       │      │
│      │    │  - 基于截止时间的实时调度                    │      │
│      │    │  - 最严格的实时保证                          │      │
│      │    └─────────────────────────────────────────────┘      │
│      │                         │                                │
│      │                         ▼                                │
│      │    ┌─────────────────────────────────────────────┐      │
│      │    │  rt_sched_class (实时调度类)                │      │
│      │    │  - SCHED_FIFO: 先进先出，无时间片            │      │
│      │    │  - SCHED_RR: 轮转，有时间片                  │      │
│      │    │  - 优先级 1-99                              │      │
│      │    └─────────────────────────────────────────────┘      │
│      │                         │                                │
│      │                         ▼                                │
│      │    ┌─────────────────────────────────────────────┐      │
│      │    │  fair_sched_class (CFS完全公平调度)         │      │
│      │    │  - SCHED_NORMAL: 普通进程                   │      │
│      │    │  - SCHED_BATCH: 批处理进程                  │      │
│      │    │  - nice值 -20 到 +19                        │      │
│      │    │  - 大多数进程使用此调度类                    │      │
│      │    └─────────────────────────────────────────────┘      │
│      │                         │                                │
│      ▼                         ▼                                │
│   优先级低  ┌─────────────────────────────────────────────┐      │
│            │  idle_sched_class (空闲调度类)              │      │
│            │  - SCHED_IDLE: 只在系统空闲时运行           │      │
│            │  - 优先级最低                               │      │
│            └─────────────────────────────────────────────┘      │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

CFS完全公平调度器

CFS（Completely Fair Scheduler）自2.6.23起是Linux普通进程的默认调度器（6.6内核起被EEVDF取代，但权重、vruntime等基本概念仍然沿用），核心思想是让每个进程获得公平的CPU时间。

c 复制代码

/**
 * CFS的核心数据结构
 */

/**
 * Scheduling entity - one schedulable unit (a task or a task group).
 * Simplified: only the fields used in this discussion are shown.
 */
struct sched_entity {
    struct load_weight      load;           // weight (derived from the nice value)
    struct rb_node          run_node;       // node in the CFS red-black tree
    unsigned int            on_rq;          // whether the entity is on a run queue
    
    u64                     exec_start;     // timestamp at which the current run started
    u64                     sum_exec_runtime;   // total execution time
    u64                     vruntime;       // virtual runtime (the key CFS quantity)
    u64                     prev_sum_exec_runtime;
    
    // Group scheduling
    struct sched_entity     *parent;
    struct cfs_rq           *cfs_rq;        // CFS run queue this entity belongs to
    struct cfs_rq           *my_q;          // CFS run queue owned by this entity (group scheduling)
};

/**
 * CFS run queue (simplified).
 */
struct cfs_rq {
    struct load_weight      load;           // total weight of the queue
    unsigned int            nr_running;     // number of runnable entities
    
    u64                     min_vruntime;   // minimum virtual runtime
    
    struct rb_root_cached   tasks_timeline; // red-black tree ordered by vruntime
    struct sched_entity     *curr;          // entity currently running
    struct sched_entity     *next;          // entity to run next
    struct sched_entity     *last;          // entity that ran last
    
    // ... statistics omitted
};

虚拟运行时间（vruntime）

vruntime是CFS的核心概念，它是进程实际运行时间按权重归一化后的累计值：权重越大（nice值越小），vruntime增长越慢；调度器总是优先选择vruntime最小的进程运行：

c 复制代码

/**
 * vruntime计算
 * 
 * vruntime = actual_runtime × (NICE_0_WEIGHT / weight)
 * 
 * 权重由nice值决定:
 * nice = 0 时，weight = 1024
 * nice 每减1，权重约增加25%
 * nice 每加1，权重约减少20%
 */

/*
 * Load weight for each nice level. Index 0 is nice -20, index 20 is
 * nice 0 (weight 1024), index 39 is nice +19. Each row below covers
 * ten consecutive nice levels.
 */
static const int sched_prio_to_weight[40] = {
    /* nice -20 .. -11 */
    88761, 71755, 56483, 46273, 36291, 29154, 23254, 18705, 14949, 11916,
    /* nice -10 ..  -1 */
     9548,  7620,  6100,  4904,  3906,  3121,  2501,  1991,  1586,  1277,
    /* nice   0 ..  +9 */
     1024,   820,   655,   526,   423,   335,   272,   215,   172,   137,
    /* nice +10 .. +19 */
      110,    87,    70,    56,    45,    36,    29,    23,    18,    15,
};

/**
 * 更新vruntime
 */
static void update_curr(struct cfs_rq *cfs_rq)
{
    struct sched_entity *curr = cfs_rq->curr;
    u64 now = rq_clock_task(rq_of(cfs_rq));
    u64 delta_exec;
    
    if (unlikely(!curr))
        return;
    
    // 计算实际运行时间
    delta_exec = now - curr->exec_start;
    if (unlikely((s64)delta_exec <= 0))
        return;
    
    curr->exec_start = now;
    curr->sum_exec_runtime += delta_exec;
    
    // 计算vruntime增量
    // delta_vruntime = delta_exec × NICE_0_WEIGHT / weight
    curr->vruntime += calc_delta_fair(delta_exec, curr);
    
    // 更新最小vruntime
    update_min_vruntime(cfs_rq);
}

/**
 * calc_delta_fair - convert a runtime delta into a vruntime delta.
 * @delta: time that was actually run
 * @se:    entity whose weight is used for scaling
 *
 * Return: @delta unchanged when the entity's weight equals NICE_0_LOAD;
 * otherwise the result of __calc_delta(delta, NICE_0_LOAD, &se->load).
 */
static inline u64 calc_delta_fair(u64 delta, struct sched_entity *se)
{
    /* Only entities with a non-default weight need rescaling */
    if (unlikely(se->load.weight != NICE_0_LOAD))
        return __calc_delta(delta, NICE_0_LOAD, &se->load);

    return delta;
}

CFS红黑树

复制代码

CFS使用红黑树组织所有可运行进程，按vruntime排序:

                    ┌───────────────────┐
                    │   vruntime=1000   │ (根节点)
                    │     进程D         │
                    └─────────┬─────────┘
                              │
              ┌───────────────┴───────────────┐
              │                               │
      ┌───────┴───────┐               ┌───────┴───────┐
      │ vruntime=800  │               │ vruntime=1200 │
      │    进程B      │               │    进程F      │
      └───────┬───────┘               └───────┬───────┘
              │                               │
      ┌───────┴───────┐               ┌───────┴───────┐
      │               │               │               │
  ┌───┴───┐       ┌───┴───┐       ┌───┴───┐       ┌───┴───┐
  │vr=700 │       │vr=900 │       │vr=1100│       │vr=1500│
  │进程A  │       │进程C  │       │进程E  │       │进程G  │
  └───────┘       └───────┘       └───────┘       └───────┘
      ↑
  最左节点 = vruntime最小 = 下一个要运行的进程
  
查找复杂度: O(1) (缓存了最左节点)
插入/删除: O(log n)

c 复制代码

/**
 * pick_next_entity - choose the entity that should run next.
 * @cfs_rq: CFS run queue to pick from
 *
 * Return: the entity owning the first (leftmost) node of the timeline
 * tree, i.e. the one with the smallest vruntime, or NULL if the tree
 * has no first node.
 */
static struct sched_entity *pick_next_entity(struct cfs_rq *cfs_rq)
{
    struct rb_node *first = rb_first_cached(&cfs_rq->tasks_timeline);

    return first ? rb_entry(first, struct sched_entity, run_node) : NULL;
}

/**
 * __enqueue_entity - insert an entity into the run queue's timeline tree.
 * @cfs_rq: CFS run queue that owns the tree
 * @se:     entity to insert
 *
 * Walks down from the root, going left whenever entity_before(se, other)
 * holds and right otherwise, then links the node at the empty slot found.
 * Tracks whether the walk ever went right so the cached-leftmost
 * information can be passed to rb_insert_color_cached().
 */
static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se)
{
    struct rb_node **slot = &cfs_rq->tasks_timeline.rb_root.rb_node;
    struct rb_node *above = NULL;
    bool is_leftmost = true;

    while (*slot) {
        struct sched_entity *other;

        above = *slot;
        other = rb_entry(above, struct sched_entity, run_node);

        if (!entity_before(se, other)) {
            /* Went right at least once: cannot be the new leftmost node */
            is_leftmost = false;
            slot = &above->rb_right;
            continue;
        }

        slot = &above->rb_left;
    }

    rb_link_node(&se->run_node, above, slot);
    rb_insert_color_cached(&se->run_node, &cfs_rq->tasks_timeline, is_leftmost);
}

实时调度器

c 复制代码

/**
 * Real-time scheduling policies.
 */
#define SCHED_FIFO      1   // first-in first-out, no time-slice limit
#define SCHED_RR        2   // round-robin, with a time slice

/**
 * 实时优先级: 1-99 (数值越大优先级越高)
 * 实时进程总是优先于普通进程运行
 */

/**
 * Real-time scheduling entity (simplified).
 */
struct sched_rt_entity {
    struct list_head        run_list;       // linkage in the per-priority run list
    unsigned long           timeout;        // NOTE(review): the original labels this "time slice (SCHED_RR)";
                                            // upstream it appears to be the RLIMIT_RTTIME watchdog counter - confirm
    unsigned int            time_slice;     // remaining time slice
    unsigned short          on_rq;
    unsigned short          on_list;
    
    struct sched_rt_entity  *back;          // used for group scheduling
    struct sched_rt_entity  *parent;
};

/**
 * Real-time run queue storage: one list per priority level plus a bitmap.
 */
struct rt_prio_array {
    DECLARE_BITMAP(bitmap, MAX_RT_PRIO+1);  // bitmap marking which priorities have queued tasks (one extra bit)
    struct list_head queue[MAX_RT_PRIO];    // one queue per priority (100 per the original author)
};

/**
 * 选择最高优先级的实时进程
 */
static struct task_struct *pick_next_task_rt(struct rq *rq)
{
    struct rt_rq *rt_rq = &rq->rt;
    struct rt_prio_array *array = &rt_rq->active;
    struct sched_rt_entity *rt_se;
    struct task_struct *p;
    int idx;
    
    // 找到最高优先级（位图中第一个1）
    idx = sched_find_first_bit(array->bitmap);
    if (idx >= MAX_RT_PRIO)
        return NULL;  // 没有实时进程
    
    // 取该优先级队列的第一个进程
    rt_se = list_entry(array->queue[idx].next, struct sched_rt_entity, run_list);
    p = rt_task_of(rt_se);
    
    return p;
}

多核CPU调度

多核调度架构

复制代码

┌─────────────────────────────────────────────────────────────────┐
│                    多核CPU调度架构                               │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│    ┌─────────┐  ┌─────────┐  ┌─────────┐  ┌─────────┐         │
│    │  CPU 0  │  │  CPU 1  │  │  CPU 2  │  │  CPU 3  │         │
│    └────┬────┘  └────┬────┘  └────┬────┘  └────┬────┘         │
│         │            │            │            │               │
│    ┌────┴────┐  ┌────┴────┐  ┌────┴────┐  ┌────┴────┐         │
│    │ Run     │  │ Run     │  │ Run     │  │ Run     │         │
│    │ Queue 0 │  │ Queue 1 │  │ Queue 2 │  │ Queue 3 │         │
│    │         │  │         │  │         │  │         │         │
│    │ [P1,P5] │  │ [P2,P6] │  │ [P3,P7] │  │ [P4]    │         │
│    │         │  │         │  │         │  │         │         │
│    └────┬────┘  └────┬────┘  └────┬────┘  └────┬────┘         │
│         │            │            │            │               │
│         └────────────┴─────┬──────┴────────────┘               │
│                            │                                   │
│                    ┌───────┴───────┐                           │
│                    │ 负载均衡器    │                           │
│                    │ Load Balancer│                           │
│                    └───────────────┘                           │
│                                                                 │
│    每个CPU有独立的运行队列，负载均衡器定期检查并迁移进程         │
│                                                                 │
└─────────────────────────────────────────────────────────────────┘

NUMA架构感知

复制代码

┌─────────────────────────────────────────────────────────────────┐
│                      NUMA架构示意                                │
├─────────────────────────────────────────────────────────────────┤
│                                                                 │
│   ┌─────────────────────────┐  ┌─────────────────────────┐     │
│   │       NUMA Node 0       │  │       NUMA Node 1       │     │
│   │  ┌──────┐  ┌──────┐    │  │    ┌──────┐  ┌──────┐  │     │
│   │  │CPU 0 │  │CPU 1 │    │  │    │CPU 2 │  │CPU 3 │  │     │
│   │  └──────┘  └──────┘    │  │    └──────┘  └──────┘  │     │
│   │       ↑          ↑      │  │      ↑          ↑      │     │
│   │       └────┬─────┘      │  │      └────┬─────┘      │     │
│   │            ↓            │  │           ↓            │     │
│   │   ┌────────────────┐   │  │  ┌────────────────┐   │     │
│   │   │ Local Memory   │   │  │  │ Local Memory   │   │     │
│   │   │    (快)        │   │  │  │    (快)        │   │     │
│   │   └────────┬───────┘   │  │  └───────┬────────┘   │     │
│   │            │            │  │          │            │     │
│   └────────────┼────────────┘  └──────────┼────────────┘     │
│                │                          │                   │
│                └──────────┬───────────────┘                   │
│                           │                                   │
│                    互联总线(慢)                                │
│                                                               │
│   访问本地内存: ~100ns                                         │
│   访问远程内存: ~300ns (3倍延迟!)                              │
│                                                               │
└─────────────────────────────────────────────────────────────────┘

负载均衡

c 复制代码

/**
 * 调度域层次结构
 * 从CPU到整个系统，多层级负载均衡
 */

/*
调度域层次示例 (2个NUMA节点，每节点2个核，每核2个超线程):

              ┌─────────────────────────────────────┐
      Level 2 │           NUMA Domain              │
              │  (所有CPU，负载均衡代价最高)         │
              └─────────────────┬───────────────────┘
                                │
                ┌───────────────┴───────────────┐
                │                               │
        ┌───────┴───────┐               ┌───────┴───────┐
Level 1 │  Node Domain  │               │  Node Domain  │
        │   (Node 0)    │               │   (Node 1)    │
        └───────┬───────┘               └───────┬───────┘
                │                               │
        ┌───────┴───────┐               ┌───────┴───────┐
        │               │               │               │
    ┌───┴───┐       ┌───┴───┐       ┌───┴───┐       ┌───┴───┐
L0  │SMT Dom│       │SMT Dom│       │SMT Dom│       │SMT Dom│
    │Core 0 │       │Core 1 │       │Core 2 │       │Core 3 │
    └───┬───┘       └───┬───┘       └───┬───┘       └───┬───┘
        │               │               │               │
    ┌───┴───┐       ┌───┴───┐       ┌───┴───┐       ┌───┴───┐
    │CPU0│1 │       │CPU2│3 │       │CPU4│5 │       │CPU6│7 │
    └───────┘       └───────┘       └───────┘       └───────┘
*/

/**
 * 负载均衡触发时机
 */
// 1. 定时器触发 (scheduler_tick)
// 2. 进程唤醒时 (wake_up)
// 3. 进程创建时 (fork/clone)
// 4. CPU空闲时 (idle_balance)

/**
 * 负载计算
 */
static unsigned long cpu_load(struct rq *rq)
{
    // 基于可运行进程的权重和数
    return rq->cfs.runnable_load_avg;
}

/**
 * find_busiest_queue - locate the most loaded run queue in a group.
 * @sd:    scheduling domain (not used by this simplified version)
 * @group: scheduling group whose CPUs are scanned
 * @idle:  idle type (not used by this simplified version)
 *
 * Return: the run queue with the largest cpu_load() among the group's
 * CPUs, or NULL if no queue has a load greater than zero. On ties the
 * first queue encountered wins.
 */
static struct rq *find_busiest_queue(struct sched_domain *sd,
                                      struct sched_group *group,
                                      enum cpu_idle_type idle)
{
    struct rq *winner = NULL;
    unsigned long top_load = 0;
    int cpu;

    for_each_cpu(cpu, sched_group_span(group)) {
        struct rq *candidate = cpu_rq(cpu);
        unsigned long candidate_load = cpu_load(candidate);

        if (candidate_load <= top_load)
            continue;

        top_load = candidate_load;
        winner = candidate;
    }

    return winner;
}

/**
 * 进程迁移
 */
static int move_tasks(struct rq *this_rq, struct rq *busiest_rq,
                      unsigned long max_load_move)
{
    struct list_head *tasks = &busiest_rq->cfs_tasks;
    struct task_struct *p;
    unsigned long load_moved = 0;
    
    while (!list_empty(tasks) && load_moved < max_load_move) {
        p = list_first_entry(tasks, struct task_struct, se.group_node);
        
        // 检查是否允许迁移
        if (!can_migrate_task(p, busiest_rq, this_rq))
            continue;
        
        // 从源队列移除
        deactivate_task(busiest_rq, p, 0);
        // 加入目标队列
        activate_task(this_rq, p, 0);
        
        load_moved += task_load(p);
    }
    
    return load_moved;
}

CPU亲和性

c 复制代码

/**
 * CPU亲和性 - 限制进程运行在特定CPU上
 */

#include <sched.h>
#include <stdio.h>
#include <string.h>

/**
 * set_cpu_affinity - restrict a task to a single CPU.
 * @pid: target task, passed straight to sched_setaffinity()
 * @cpu: index of the only CPU to allow
 *
 * Return: whatever sched_setaffinity() returns.
 */
int set_cpu_affinity(pid_t pid, int cpu)
{
    cpu_set_t only_cpu;
    int rc;

    CPU_ZERO(&only_cpu);
    CPU_SET(cpu, &only_cpu);

    rc = sched_setaffinity(pid, sizeof(only_cpu), &only_cpu);
    return rc;
}

/**
 * set_cpu_affinity_multi - restrict a task to a set of CPUs.
 * @pid:      target task, passed straight to sched_setaffinity()
 * @cpus:     array of CPU indices to allow
 * @num_cpus: number of entries read from @cpus
 *
 * Return: whatever sched_setaffinity() returns.
 */
int set_cpu_affinity_multi(pid_t pid, int *cpus, int num_cpus)
{
    cpu_set_t allowed;
    int idx = 0;

    CPU_ZERO(&allowed);

    /* The index is advanced separately because CPU_SET is a macro. */
    while (idx < num_cpus) {
        CPU_SET(cpus[idx], &allowed);
        idx++;
    }

    return sched_setaffinity(pid, sizeof(allowed), &allowed);
}

/**
 * get_cpu_affinity - read a task's CPU affinity mask.
 * @pid:  target task, passed straight to sched_getaffinity()
 * @mask: output set; cleared first, then filled by sched_getaffinity()
 *
 * Return: whatever sched_getaffinity() returns.
 */
int get_cpu_affinity(pid_t pid, cpu_set_t *mask)
{
    int rc;

    CPU_ZERO(mask);
    rc = sched_getaffinity(pid, sizeof(cpu_set_t), mask);

    return rc;
}

/**
 * get_current_cpu - report which CPU the caller is running on.
 *
 * Return: the value of sched_getcpu().
 */
int get_current_cpu(void)
{
    int current_cpu = sched_getcpu();

    return current_cpu;
}

/**
 * thread_func - example thread body that pins itself to one CPU.
 * @arg: pointer to an int holding the CPU index to bind to
 *
 * Builds a one-CPU mask, applies it to the calling thread with
 * pthread_setaffinity_np(), then prints the CPU reported by
 * sched_getcpu().
 *
 * pthread_setaffinity_np() reports failure through its return value and
 * does not set errno, so the message is built with strerror(rc) rather
 * than perror().
 *
 * Return: always NULL.
 */
void *thread_func(void *arg)
{
    int cpu_id = *(int *)arg;
    cpu_set_t mask;
    int rc;

    /* Bind the calling thread to the requested CPU */
    CPU_ZERO(&mask);
    CPU_SET(cpu_id, &mask);

    rc = pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask);
    if (rc != 0) {
        fprintf(stderr, "pthread_setaffinity_np: %s\n", strerror(rc));
    }

    printf("Thread running on CPU %d\n", sched_getcpu());

    /* Real work would go here... */

    return NULL;
}

/**
 * 命令行设置CPU亲和性
 */
// 启动时指定
// $ taskset -c 0,1 ./myprogram        # 绑定到CPU 0和1
// $ taskset -c 0-3 ./myprogram        # 绑定到CPU 0-3

// 运行中修改
// $ taskset -pc 2,3 <pid>             # 修改进程的CPU亲和性

调度策略实战

设置调度策略

c 复制代码

/**
 * 调度策略设置
 */
#include <sched.h>
#include <sys/resource.h>

/**
 * set_realtime_priority - switch a task to a real-time scheduling policy.
 * @pid:      target task, passed straight to sched_setscheduler()
 * @policy:   scheduling policy, e.g. SCHED_FIFO or SCHED_RR
 * @priority: real-time priority (1-99 per the original author)
 *
 * Return: whatever sched_setscheduler() returns.
 */
int set_realtime_priority(pid_t pid, int policy, int priority)
{
    struct sched_param rt_param = { .sched_priority = priority };

    return sched_setscheduler(pid, policy, &rt_param);
}

/**
 * set_nice - set the nice value of a task.
 * @pid:        target task, passed to setpriority() with PRIO_PROCESS
 * @nice_value: requested nice value; -20 is the highest priority and
 *              +19 the lowest (per the original author)
 *
 * Return: whatever setpriority() returns.
 */
int set_nice(pid_t pid, int nice_value)
{
    int rc = setpriority(PRIO_PROCESS, pid, nice_value);

    return rc;
}

/**
 * Attribute block passed to the sched_setattr system call when
 * configuring SCHED_DEADLINE.
 * NOTE(review): recent kernel/libc headers may already declare
 * struct sched_attr; if so this local definition would conflict -
 * confirm against the target toolchain.
 */
#include <linux/sched.h>

struct sched_attr {
    uint32_t size;
    uint32_t sched_policy;
    uint64_t sched_flags;
    int32_t  sched_nice;
    uint32_t sched_priority;
    uint64_t sched_runtime;     // CPU time needed in each period
    uint64_t sched_deadline;    // deadline
    uint64_t sched_period;      // period
};

/**
 * set_deadline_policy - put a task under SCHED_DEADLINE.
 * @pid:      target task, passed straight to the system call
 * @runtime:  stored in sched_runtime
 * @deadline: stored in sched_deadline
 * @period:   stored in sched_period
 *
 * All other attribute fields are left at zero. The request is issued
 * through syscall(SYS_sched_setattr, ...) with flags 0.
 *
 * Return: whatever syscall() returns.
 */
int set_deadline_policy(pid_t pid, uint64_t runtime, uint64_t deadline, uint64_t period)
{
    struct sched_attr attr = {0};

    attr.size = sizeof(attr);
    attr.sched_policy = SCHED_DEADLINE;
    attr.sched_runtime = runtime;
    attr.sched_deadline = deadline;
    attr.sched_period = period;

    return syscall(SYS_sched_setattr, pid, &attr, 0);
}

/**
 * Usage example: applies the three helpers above to the current process,
 * one after another, reporting any failure with perror().
 * The three calls are presented as alternative options, but as written
 * all of them are executed in sequence.
 */
int main()
{
    pid_t pid = getpid();
    
    // Option 1: real-time FIFO policy, priority 50
    if (set_realtime_priority(pid, SCHED_FIFO, 50) != 0) {
        perror("Failed to set SCHED_FIFO");
        // may require root privileges
    }
    
    // Option 2: nice value.
    // NOTE(review): the original says this works for unprivileged users,
    // but lowering nice to a negative value normally needs CAP_SYS_NICE
    // or a suitable RLIMIT_NICE - confirm.
    if (set_nice(pid, -10) != 0) {
        perror("Failed to set nice");
    }
    
    // Option 3: SCHED_DEADLINE (requires root per the original author)
    // 2 ms of CPU time in every 10 ms period, with a 5 ms deadline
    if (set_deadline_policy(pid, 2000000, 5000000, 10000000) != 0) {
        perror("Failed to set SCHED_DEADLINE");
    }
    
    // Real work would go here...
    
    return 0;
}

完整的多线程调度示例

c 复制代码

/**
 * 多线程调度综合示例
 */
#define _GNU_SOURCE
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <pthread.h>
#include <sched.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/resource.h>
#include <time.h>
#include <errno.h>

#define NUM_THREADS 4
#define WORK_ITERATIONS 100000000

/**
 * Per-thread configuration handed to thread_main().
 *
 * The experiments initialise this struct positionally, so the field order
 * is part of its contract.
 */
typedef struct {
    int thread_id;          // index of this thread; also selects its slot in results[]
    int cpu_id;             // CPU to bind to; -1 means do not bind
    int sched_policy;       // scheduling policy
    int priority;           // priority (nice value or real-time priority)
    int (*work_func)(int);  // workload function; called with the iteration count
} thread_config_t;

/**
 * Per-thread result, filled in by thread_main().
 */
typedef struct {
    int thread_id;          // copied from the thread's configuration
    double elapsed_time;    // wall time of the workload, in seconds
    long long result;       // value returned by the workload function
    int actual_cpu;         // CPU reported by sched_getcpu() just before the workload
} thread_result_t;

// One result slot per thread, indexed by thread_id
static thread_result_t results[NUM_THREADS];

/**
 * CPU-bound workload.
 *
 * Runs `iterations` rounds of integer mixing on a volatile accumulator (so
 * the compiler cannot remove the loop) and returns the low 31 bits.
 *
 * All arithmetic is done on unsigned 64-bit values: the signed version
 * overflowed in `i * i` (int) and in the left shift once the accumulator
 * grew, both of which are undefined behaviour. Unsigned arithmetic wraps
 * in a well-defined way.
 *
 * @param iterations  Number of outer rounds; values <= 0 do no work.
 * @return            Non-negative checksum of the accumulated value.
 */
int cpu_intensive_work(int iterations)
{
    volatile unsigned long long sum = 0;
    for (int i = 0; i < iterations; i++) {
        sum += (unsigned long long)i * (unsigned long long)i;
        // Simulate some extra computation
        for (int j = 0; j < 10; j++) {
            sum ^= (sum << 3) | (sum >> 5);
        }
    }
    return (int)(sum & 0x7FFFFFFFULL);
}

/**
 * I/O-bound workload.
 *
 * Performs iterations / 1000 rounds; each round adds the round index to an
 * accumulator and then calls usleep(1) to stand in for an I/O wait.
 *
 * @param iterations  Requested iteration count (scaled down by 1000).
 * @return            Low 31 bits of the accumulated sum.
 */
int io_intensive_work(int iterations)
{
    const int rounds = iterations / 1000;
    volatile long long total = 0;
    int step = 0;

    while (step < rounds) {
        total += step;
        // Simulated I/O wait
        usleep(1);
        step++;
    }

    return (int)(total & 0x7FFFFFFF);
}

/**
 * Read CLOCK_MONOTONIC and express it as a single nanosecond count.
 *
 * @return  Seconds * 1e9 plus the nanosecond part of the clock reading.
 */
long long get_time_ns(void)
{
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    long long ns = now.tv_sec * 1000000000LL;
    ns += now.tv_nsec;
    return ns;
}

/**
 * Return the ID of the calling thread, obtained through the raw
 * SYS_gettid system call.
 */
pid_t gettid(void)
{
    long tid = syscall(SYS_gettid);

    return (pid_t)tid;
}

/**
 * 线程主函数
 */
void *thread_main(void *arg)
{
    thread_config_t *config = (thread_config_t *)arg;
    thread_result_t *result = &results[config->thread_id];
    
    result->thread_id = config->thread_id;
    
    // 1. 设置CPU亲和性
    if (config->cpu_id >= 0) {
        cpu_set_t mask;
        CPU_ZERO(&mask);
        CPU_SET(config->cpu_id, &mask);
        
        if (pthread_setaffinity_np(pthread_self(), sizeof(mask), &mask) != 0) {
            fprintf(stderr, "Thread %d: Failed to set affinity to CPU %d\n",
                    config->thread_id, config->cpu_id);
        }
    }
    
    // 2. 设置调度策略
    struct sched_param param;
    
    switch (config->sched_policy) {
    case SCHED_FIFO:
    case SCHED_RR:
        param.sched_priority = config->priority;
        if (pthread_setschedparam(pthread_self(), config->sched_policy, &param) != 0) {
            fprintf(stderr, "Thread %d: Failed to set RT policy (need root)\n",
                    config->thread_id);
        }
        break;
        
    case SCHED_OTHER:
        // 设置nice值
        if (setpriority(PRIO_PROCESS, 0, config->priority) != 0) {
            fprintf(stderr, "Thread %d: Failed to set nice to %d\n",
                    config->thread_id, config->priority);
        }
        break;
    }
    
    // 获取实际运行的CPU
    result->actual_cpu = sched_getcpu();
    
    printf("Thread %d started: TID=%d, CPU=%d, Policy=%d, Priority=%d\n",
           config->thread_id, gettid(), result->actual_cpu,
           config->sched_policy, config->priority);
    
    // 3. 执行工作
    long long start = get_time_ns();
    
    long long work_result = config->work_func(WORK_ITERATIONS);
    
    long long end = get_time_ns();
    
    result->elapsed_time = (end - start) / 1000000000.0;
    result->result = work_result;
    
    printf("Thread %d finished: %.3f seconds, CPU=%d\n",
           config->thread_id, result->elapsed_time, sched_getcpu());
    
    return NULL;
}

/**
 * Print a short overview of the system's scheduling setup: number of online
 * CPUs, the priority ranges of the main policies, the round-robin time slice
 * reported for the calling process, and the per-CPU lines of /proc/stat.
 */
void print_scheduling_info(void)
{
    printf("\n=== System Scheduling Info ===\n");

    // Number of CPUs
    int num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    printf("Online CPUs: %d\n", num_cpus);

    // Scheduler information
    printf("\nScheduling policies:\n");
    printf("  SCHED_OTHER (CFS): priority 0, nice -20~19\n");
    printf("  SCHED_FIFO:        priority 1-99\n");

    // sched_rr_get_interval() reports the interval through its timespec
    // argument and returns only a status code, so it needs a real buffer
    // and the interval must be taken from that buffer.
    struct timespec rr_slice;
    if (sched_rr_get_interval(0, &rr_slice) == 0) {
        long slice_ms = (long)rr_slice.tv_sec * 1000L + rr_slice.tv_nsec / 1000000L;
        printf("  SCHED_RR:          priority 1-99, time slice=%ld ms\n", slice_ms);
    } else {
        printf("  SCHED_RR:          priority 1-99, time slice=unknown\n");
    }

    // Per-CPU load
    printf("\nCPU Load (from /proc/stat):\n");
    FILE *fp = fopen("/proc/stat", "r");
    if (fp) {
        char line[256];
        // The "cpu" lines come first; stop at the first line that is not one
        while (fgets(line, sizeof(line), fp)) {
            if (strncmp(line, "cpu", 3) == 0) {
                printf("  %s", line);
            } else {
                break;
            }
        }
        fclose(fp);
    }
}

/**
 * Experiment 1: compare scheduling policies.
 *
 * Starts NUM_THREADS unbound threads running the CPU-bound workload with
 * different nice values / policies and prints the time each one took.
 * Threads that could not be created are reported and left out of the join
 * and of the result table.
 */
void experiment_scheduling_policies(void)
{
    static const char *const policy_name[] = {"OTHER", "FIFO", "RR"};

    printf("\n\n=== Experiment 1: Scheduling Policies ===\n");

    pthread_t threads[NUM_THREADS];
    int started[NUM_THREADS] = {0};
    thread_config_t configs[NUM_THREADS];

    // One thread per policy/priority combination
    configs[0] = (thread_config_t){0, -1, SCHED_OTHER, 0,  cpu_intensive_work};  // normal, nice=0
    configs[1] = (thread_config_t){1, -1, SCHED_OTHER, 10, cpu_intensive_work};  // normal, nice=10
    configs[2] = (thread_config_t){2, -1, SCHED_OTHER, -10, cpu_intensive_work}; // normal, nice=-10
    configs[3] = (thread_config_t){3, -1, SCHED_RR,    50, cpu_intensive_work};  // real-time RR (needs root)

    // Create the threads; remember which ones actually started so that we
    // never join an uninitialised pthread_t
    for (int i = 0; i < NUM_THREADS; i++) {
        int rc = pthread_create(&threads[i], NULL, thread_main, &configs[i]);
        if (rc != 0) {
            fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(rc));
        } else {
            started[i] = 1;
        }
    }

    // Wait for completion
    for (int i = 0; i < NUM_THREADS; i++) {
        if (started[i]) {
            pthread_join(threads[i], NULL);
        }
    }

    // Print the results
    printf("\nResults:\n");
    printf("%-10s %-10s %-15s %-12s\n", "Thread", "Policy", "Nice/Priority", "Time(s)");
    printf("----------------------------------------------\n");
    for (int i = 0; i < NUM_THREADS; i++) {
        if (!started[i]) {
            continue;
        }
        printf("%-10d %-10s %-15d %-12.3f\n",
               i, policy_name[configs[i].sched_policy],
               configs[i].priority, results[i].elapsed_time);
    }
}

/**
 * Experiment 2: effect of CPU affinity.
 *
 * Binds two threads to CPU 0, one to CPU 1 and one to CPU 2 (or CPU 1 again
 * when only two CPUs are online), runs the CPU-bound workload in each and
 * prints the timings. Does nothing on single-CPU systems. Threads that
 * could not be created are reported and left out of the join and the table.
 */
void experiment_cpu_affinity(void)
{
    printf("\n\n=== Experiment 2: CPU Affinity ===\n");

    int num_cpus = sysconf(_SC_NPROCESSORS_ONLN);
    if (num_cpus < 2) {
        printf("Need at least 2 CPUs for this experiment\n");
        return;
    }

    pthread_t threads[4];
    int started[4] = {0};
    thread_config_t configs[4];

    // Two threads share one CPU; the other two get their own CPU when possible
    configs[0] = (thread_config_t){0, 0, SCHED_OTHER, 0, cpu_intensive_work};  // bound to CPU0
    configs[1] = (thread_config_t){1, 0, SCHED_OTHER, 0, cpu_intensive_work};  // bound to CPU0
    configs[2] = (thread_config_t){2, 1, SCHED_OTHER, 0, cpu_intensive_work};  // bound to CPU1
    configs[3] = (thread_config_t){3, (num_cpus > 2) ? 2 : 1, SCHED_OTHER, 0, cpu_intensive_work};

    // Remember which threads actually started so that we never join an
    // uninitialised pthread_t
    for (int i = 0; i < 4; i++) {
        int rc = pthread_create(&threads[i], NULL, thread_main, &configs[i]);
        if (rc != 0) {
            fprintf(stderr, "Failed to create thread %d: %s\n", i, strerror(rc));
        } else {
            started[i] = 1;
        }
    }

    for (int i = 0; i < 4; i++) {
        if (started[i]) {
            pthread_join(threads[i], NULL);
        }
    }

    printf("\nResults:\n");
    printf("%-10s %-10s %-10s %-12s\n", "Thread", "Bound CPU", "Actual CPU", "Time(s)");
    printf("----------------------------------------------\n");
    for (int i = 0; i < 4; i++) {
        if (!started[i]) {
            continue;
        }
        printf("%-10d %-10d %-10d %-12.3f\n",
               i, configs[i].cpu_id, results[i].actual_cpu, results[i].elapsed_time);
    }

    // With only two CPUs, threads 2 and 3 share CPU1 as well, so the
    // "2x slower" expectation holds only when a third CPU is available
    if (num_cpus > 2) {
        printf("\nNote: Threads 0&1 share CPU0, expect ~2x time vs threads 2&3\n");
    } else {
        printf("\nNote: Threads 0&1 share CPU0 and threads 2&3 share CPU1, expect similar times\n");
    }
}

/**
 * Experiment 3: context-switch overhead.
 *
 * Times 100000 sched_yield() calls, prints the total and the per-call
 * average, then echoes every "ctxt_switches" line from this process's
 * /proc/<pid>/status.
 */
void experiment_context_switch(void)
{
    char status_file[64];
    char row[256];

    printf("\n\n=== Experiment 3: Context Switch Overhead ===\n");

    // Path of this process's status file, read after the measurement
    snprintf(status_file, sizeof(status_file), "/proc/%d/status", getpid());

    // Time a burst of yields
    const long long t_begin = get_time_ns();
    for (int remaining = 100000; remaining > 0; remaining--) {
        sched_yield();
    }
    const long long t_finish = get_time_ns();

    const long long elapsed_ns = t_finish - t_begin;
    double avg_switch_us = elapsed_ns / 100000.0 / 1000.0;

    printf("100000 sched_yield() calls: %.3f ms\n", elapsed_ns / 1000000.0);
    printf("Average per yield: %.3f us\n", avg_switch_us);

    // Report the context-switch counters
    FILE *status = fopen(status_file, "r");
    if (status == NULL) {
        return;
    }
    while (fgets(row, sizeof(row), status) != NULL) {
        if (strstr(row, "ctxt_switches") != NULL) {
            printf("%s", row);
        }
    }
    fclose(status);
}

int main(int argc, char *argv[])
{
    printf("===========================================\n");
    printf("   Linux Scheduling Experiments\n");
    printf("===========================================\n");
    
    print_scheduling_info();
    
    experiment_scheduling_policies();
    experiment_cpu_affinity();
    experiment_context_switch();
    
    printf("\n===========================================\n");
    printf("   All experiments completed!\n");
    printf("===========================================\n");
    
    return 0;
}

编译和运行：

bash 复制代码

# 编译
gcc -o sched_test sched_test.c -lpthread -lrt -O2

# 普通用户运行（部分实时功能可能受限）
./sched_test

# root运行（完整功能）
sudo ./sched_test

进程间通信与同步

多进程共享CPU时的通信

c 复制代码

/**
 * 进程间通信方式对比
 */

/*
┌─────────────────────────────────────────────────────────────────┐
│                    IPC方式对比                                   │
├───────────────┬──────────────┬──────────────┬───────────────────┤
│    方式       │    速度      │   复杂度     │      适用场景      │
├───────────────┼──────────────┼──────────────┼───────────────────┤
│ 管道(pipe)    │    中等      │     低       │ 父子进程单向通信   │
│ 命名管道      │    中等      │     低       │ 无关进程通信       │
│ 消息队列      │    中等      │     中       │ 结构化消息传递     │
│ 共享内存      │    最快      │     高       │ 大量数据共享       │
│ 信号量        │    快        │     中       │ 同步/互斥          │
│ 信号          │    快        │     低       │ 简单通知           │
│ Socket        │    较慢      │     中       │ 网络/跨机器通信    │
└───────────────┴──────────────┴──────────────┴───────────────────┘
*/

/**
 * 共享内存 + 信号量示例
 * 最高效的进程间数据共享方式
 */
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <semaphore.h>

#define SHM_NAME "/my_shm"
#define SEM_NAME "/my_sem"
#define SHM_SIZE 4096

/**
 * Layout of the data placed in the shared memory object.
 * Written by producer_process() and read by consumer_process().
 */
typedef struct {
    int counter;            // sequence number of the latest message
    char message[256];      // text of the latest message
    int producer_done;      // set to 1 by the producer once it has finished
} shared_data_t;

/**
 * Producer process.
 *
 * Creates the shared memory object and the named semaphore, publishes ten
 * messages 100 ms apart (posting the semaphore after each one), then sets
 * producer_done and posts once more so the consumer can stop.
 *
 * Every setup step is checked; on failure the error is reported with
 * perror() and whatever was acquired so far is released.
 *
 * Note: the semaphore only counts published messages. It does not stop the
 * producer from overwriting a message the consumer has not read yet.
 */
void producer_process(void)
{
    shared_data_t *data = MAP_FAILED;
    sem_t *sem = SEM_FAILED;

    // Create the shared memory object
    int fd = shm_open(SHM_NAME, O_CREAT | O_RDWR, 0666);
    if (fd < 0) {
        perror("shm_open");
        return;
    }
    if (ftruncate(fd, SHM_SIZE) != 0) {
        perror("ftruncate");
        goto out;
    }

    data = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE,
                MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) {
        perror("mmap");
        goto out;
    }

    // An object left over from an earlier run may still carry the flag
    data->producer_done = 0;

    // Create the semaphore
    sem = sem_open(SEM_NAME, O_CREAT, 0666, 0);
    if (sem == SEM_FAILED) {
        perror("sem_open");
        goto out;
    }

    // Produce the data
    for (int i = 0; i < 10; i++) {
        data->counter = i;
        snprintf(data->message, sizeof(data->message), "Message %d", i);
        printf("Producer: wrote counter=%d\n", i);

        sem_post(sem);  // notify the consumer
        usleep(100000); // 100ms
    }

    data->producer_done = 1;
    sem_post(sem);

out:
    // Clean up; the names themselves are unlinked by the consumer
    if (sem != SEM_FAILED) {
        sem_close(sem);
    }
    if (data != MAP_FAILED) {
        munmap(data, SHM_SIZE);
    }
    close(fd);
}

/**
 * Consumer process.
 *
 * Opens the existing shared memory object and semaphore, prints one message
 * per semaphore post until producer_done is set, then releases its handles
 * and unlinks both names.
 *
 * Every setup step is checked; on failure the error is reported with
 * perror() and the function returns without unlinking anything.
 */
void consumer_process(void)
{
    // Open the shared memory object
    int fd = shm_open(SHM_NAME, O_RDWR, 0666);
    if (fd < 0) {
        perror("shm_open");
        return;
    }

    shared_data_t *data = mmap(NULL, SHM_SIZE, PROT_READ | PROT_WRITE,
                               MAP_SHARED, fd, 0);
    if (data == MAP_FAILED) {
        perror("mmap");
        close(fd);
        return;
    }

    // Open the semaphore
    sem_t *sem = sem_open(SEM_NAME, 0);
    if (sem == SEM_FAILED) {
        perror("sem_open");
        munmap(data, SHM_SIZE);
        close(fd);
        return;
    }

    // Consume the data
    while (1) {
        sem_wait(sem);  // wait for the producer

        if (data->producer_done) {
            break;
        }

        printf("Consumer: read counter=%d, message='%s'\n",
               data->counter, data->message);
    }

    // Clean up
    sem_close(sem);
    munmap(data, SHM_SIZE);
    close(fd);
    shm_unlink(SHM_NAME);
    sem_unlink(SEM_NAME);
}

多线程同步原语

c 复制代码

/**
 * 线程同步原语性能对比
 */

#include <pthread.h>
#include <stdatomic.h>

/**
 * 1. Mutex
 *
 * Takes the statically initialised global mutex, runs the (empty) critical
 * section and releases it again.
 */
pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;

void mutex_example(void)
{
    pthread_mutex_t *const lock = &mutex;

    pthread_mutex_lock(lock);
    /* critical section goes here */
    pthread_mutex_unlock(lock);
}

/**
 * 2. Spinlock
 * Suited to very short critical sections.
 *
 * The global spinlock is initialised exactly once, on first use. Calling
 * pthread_spin_init() on every entry would re-initialise a lock that
 * another thread may currently hold.
 */
pthread_spinlock_t spinlock;
static pthread_once_t spinlock_once = PTHREAD_ONCE_INIT;

/* One-time initialiser for the global spinlock. */
static void spinlock_init_once(void)
{
    pthread_spin_init(&spinlock, PTHREAD_PROCESS_PRIVATE);
}

void spinlock_example(void)
{
    pthread_once(&spinlock_once, spinlock_init_once);

    pthread_spin_lock(&spinlock);
    // Critical section (should be very short)
    pthread_spin_unlock(&spinlock);
}

/**
 * 3. Read-write lock
 * Suited to read-mostly workloads.
 *
 * Shows one read-side and one write-side acquisition of the statically
 * initialised global lock, each released before the next begins.
 */
pthread_rwlock_t rwlock = PTHREAD_RWLOCK_INITIALIZER;

void rwlock_example(void)
{
    pthread_rwlock_t *const lock = &rwlock;

    /* Reader side */
    pthread_rwlock_rdlock(lock);
    /* read the shared data here */
    pthread_rwlock_unlock(lock);

    /* Writer side */
    pthread_rwlock_wrlock(lock);
    /* modify the shared data here */
    pthread_rwlock_unlock(lock);
}

/**
 * 4. Condition variable
 *
 * `ready` is the predicate, protected by cond_mutex. waiter_thread() blocks
 * until it becomes non-zero; signaler_thread() sets it and signals the
 * condition.
 */
pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
pthread_mutex_t cond_mutex = PTHREAD_MUTEX_INITIALIZER;
int ready = 0;

/* Waits, with cond_mutex held, until `ready` is set. Always returns NULL. */
void *waiter_thread(void *arg)
{
    (void)arg;

    pthread_mutex_lock(&cond_mutex);
    for (;;) {
        // Re-check the predicate after every wake-up
        if (ready) {
            break;
        }
        pthread_cond_wait(&cond, &cond_mutex);
    }
    /* handle the event here */
    pthread_mutex_unlock(&cond_mutex);

    return NULL;
}

/* Sets `ready` under cond_mutex and signals one waiter. Always returns NULL. */
void *signaler_thread(void *arg)
{
    (void)arg;

    pthread_mutex_lock(&cond_mutex);
    ready = 1;
    pthread_cond_signal(&cond);
    pthread_mutex_unlock(&cond_mutex);

    return NULL;
}

/**
 * 5. Atomic operations (fastest, lock-free)
 *
 * Increments and decrements the global counter atomically, then tries to
 * swap it from 0 to 1 with a strong compare-and-exchange.
 */
atomic_int atomic_counter = 0;

void atomic_example(void)
{
    atomic_fetch_add_explicit(&atomic_counter, 1, memory_order_seq_cst);
    atomic_fetch_sub_explicit(&atomic_counter, 1, memory_order_seq_cst);

    // Only succeeds when the counter is currently 0
    int want_zero = 0;
    atomic_compare_exchange_strong_explicit(&atomic_counter, &want_zero, 1,
                                            memory_order_seq_cst,
                                            memory_order_seq_cst);
}

/**
 * 6. Barrier
 * Makes several threads meet at one point before any of them continues.
 *
 * @param num_threads  Number of threads that must reach the barrier.
 */
pthread_barrier_t barrier;

void barrier_example(int num_threads)
{
    const unsigned int participants = (unsigned int)num_threads;

    pthread_barrier_init(&barrier, NULL, participants);

    // Each participating thread then calls
    //     pthread_barrier_wait(&barrier);
    // and continues only after all of them have arrived.
}

/**
 * 同步原语性能测试
 */
#define ITERATIONS 10000000

/**
 * Micro-benchmark of lock/unlock (or increment) cost for a mutex, a
 * spinlock and an atomic counter. Each primitive is exercised ITERATIONS
 * times from the calling thread only, and the average cost per operation
 * is printed in nanoseconds.
 *
 * The mutex and spinlock are destroyed after use, and the spinlock section
 * is skipped if the lock cannot be initialised.
 */
void benchmark_sync_primitives(void)
{
    long long start, end;

    // Mutex
    pthread_mutex_t mtx = PTHREAD_MUTEX_INITIALIZER;
    start = get_time_ns();
    for (int i = 0; i < ITERATIONS; i++) {
        pthread_mutex_lock(&mtx);
        pthread_mutex_unlock(&mtx);
    }
    end = get_time_ns();
    printf("Mutex: %.2f ns/op\n", (double)(end - start) / ITERATIONS);
    pthread_mutex_destroy(&mtx);

    // Spinlock
    pthread_spinlock_t spin;
    if (pthread_spin_init(&spin, PTHREAD_PROCESS_PRIVATE) != 0) {
        fprintf(stderr, "Spinlock: pthread_spin_init failed, skipping\n");
    } else {
        start = get_time_ns();
        for (int i = 0; i < ITERATIONS; i++) {
            pthread_spin_lock(&spin);
            pthread_spin_unlock(&spin);
        }
        end = get_time_ns();
        printf("Spinlock: %.2f ns/op\n", (double)(end - start) / ITERATIONS);
        pthread_spin_destroy(&spin);
    }

    // Atomic
    atomic_int counter = 0;
    start = get_time_ns();
    for (int i = 0; i < ITERATIONS; i++) {
        atomic_fetch_add(&counter, 1);
    }
    end = get_time_ns();
    printf("Atomic: %.2f ns/op\n", (double)(end - start) / ITERATIONS);
}

调试与性能分析

常用调试工具

bash 复制代码

# 1. 查看进程调度信息
$ cat /proc/<pid>/sched
nr_switches                  :              1234
nr_voluntary_switches        :               800
nr_involuntary_switches      :               434
se.exec_start                :        1234567890
se.vruntime                  :         987654321
policy                       :                 0
prio                         :               120

# 2. 查看进程状态
$ cat /proc/<pid>/status
State:  S (sleeping)
Threads:        4
voluntary_ctxt_switches:        150
nonvoluntary_ctxt_switches:     42
Cpus_allowed:   ff
Cpus_allowed_list:      0-7

# 3. 实时监控调度延迟
$ sudo perf sched latency
-------------------------------------------------
 Task                  |   Runtime ms  |  Count
-------------------------------------------------
 migration/0           |      0.012 ms |      3
 kworker/0:1           |      0.045 ms |      7
 myprocess             |     45.123 ms |    156

# 4. 跟踪调度事件
$ sudo perf sched record ./myprogram
$ sudo perf sched map

# 5. 使用ftrace跟踪调度
$ echo 1 > /sys/kernel/debug/tracing/events/sched/sched_switch/enable
$ cat /sys/kernel/debug/tracing/trace

# 6. 查看CPU运行队列
$ cat /proc/schedstat

# 7. 查看调度域（注意：5.13及之后的内核已将该接口移到debugfs，路径为 /sys/kernel/debug/sched/domains/cpu0/domain0/name）
$ cat /proc/sys/kernel/sched_domain/cpu0/domain0/name

性能调优脚本

c 复制代码

/**
 * 调度性能分析工具
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <time.h>

/**
 * Scheduling statistics of one process, as read from /proc/<pid>/sched.
 */
typedef struct {
    unsigned long nr_switches;              // total context switches
    unsigned long nr_voluntary_switches;    // voluntary context switches
    unsigned long nr_involuntary_switches;  // involuntary context switches
    unsigned long long vruntime;            // integer part of se.vruntime
    int policy;                             // scheduling policy number
    int priority;                           // value of the "prio" line
} sched_stats_t;

/**
 * Read the scheduling statistics of a process.
 *
 * The output structure is zeroed first, so fields missing from the file
 * (and every field on failure) read as 0 instead of being left
 * uninitialised. Each line is matched from its start, so a key can only
 * match the line that actually begins with it.
 *
 * @param pid    Process whose /proc/<pid>/sched file is read.
 * @param stats  Output structure; must not be NULL.
 * @return       0 on success, -1 if stats is NULL or the file cannot be opened.
 */
int read_sched_stats(pid_t pid, sched_stats_t *stats)
{
    char path[64];
    char line[256];
    FILE *fp;

    if (!stats) {
        return -1;
    }
    memset(stats, 0, sizeof(*stats));

    snprintf(path, sizeof(path), "/proc/%d/sched", (int)pid);
    fp = fopen(path, "r");
    if (!fp) {
        return -1;
    }

    while (fgets(line, sizeof(line), fp)) {
        // sscanf stores nothing unless the whole key matches from the line start
        if (sscanf(line, "nr_switches : %lu", &stats->nr_switches) == 1) {
            continue;
        }
        if (sscanf(line, "nr_voluntary_switches : %lu", &stats->nr_voluntary_switches) == 1) {
            continue;
        }
        if (sscanf(line, "nr_involuntary_switches : %lu", &stats->nr_involuntary_switches) == 1) {
            continue;
        }
        if (sscanf(line, "se.vruntime : %llu", &stats->vruntime) == 1) {
            continue;
        }
        if (sscanf(line, "policy : %d", &stats->policy) == 1) {
            continue;
        }
        sscanf(line, "prio : %d", &stats->priority);
    }

    fclose(fp);
    return 0;
}

/**
 * CPU time counters taken from one "cpu" line of /proc/stat.
 */
typedef struct {
    unsigned long user;
    unsigned long nice;
    unsigned long system;
    unsigned long idle;
    unsigned long iowait;
    unsigned long irq;
    unsigned long softirq;
} cpu_stats_t;

/**
 * Read the CPU time counters of one CPU, or of the whole system.
 *
 * The output structure is zeroed first and the call succeeds only when all
 * seven counters were parsed, so a successful return never leaves a field
 * uninitialised.
 *
 * @param cpu_id  CPU index; a negative value selects the aggregate "cpu" line.
 * @param stats   Output structure; must not be NULL.
 * @return        0 on success; -1 if stats is NULL, /proc/stat cannot be
 *                opened, the line is missing, or it could not be parsed.
 */
int read_cpu_stats(int cpu_id, cpu_stats_t *stats)
{
    if (!stats) {
        return -1;
    }
    memset(stats, 0, sizeof(*stats));

    FILE *fp = fopen("/proc/stat", "r");
    if (!fp) {
        return -1;
    }

    char line[256];
    char cpu_name[16];

    // The trailing space keeps "cpu1 " from matching "cpu10 ..."
    if (cpu_id < 0) {
        snprintf(cpu_name, sizeof(cpu_name), "cpu ");  // aggregate line
    } else {
        snprintf(cpu_name, sizeof(cpu_name), "cpu%d ", cpu_id);
    }
    size_t name_len = strlen(cpu_name);

    int rc = -1;
    while (fgets(line, sizeof(line), fp)) {
        if (strncmp(line, cpu_name, name_len) == 0) {
            int parsed = sscanf(line + name_len, "%lu %lu %lu %lu %lu %lu %lu",
                                &stats->user, &stats->nice, &stats->system, &stats->idle,
                                &stats->iowait, &stats->irq, &stats->softirq);
            if (parsed == 7) {
                rc = 0;
            } else {
                // Do not hand back a half-filled structure
                memset(stats, 0, sizeof(*stats));
            }
            break;
        }
    }

    fclose(fp);
    return rc;
}

/**
 * Compute the CPU utilisation between two samples.
 *
 * Utilisation is the share of the total counter increase that was not
 * spent in the idle counter. The differences are guarded against unsigned
 * wrap-around: if the totals did not advance the result is 0, and the idle
 * delta is clamped to the range [0, total delta].
 *
 * @param prev  Earlier sample.
 * @param curr  Later sample.
 * @return      Utilisation in percent (0-100); 0 when either pointer is
 *              NULL or no time has elapsed between the samples.
 */
float calc_cpu_usage(cpu_stats_t *prev, cpu_stats_t *curr)
{
    if (!prev || !curr) {
        return 0;
    }

    unsigned long prev_total = prev->user + prev->nice + prev->system +
                               prev->idle + prev->iowait + prev->irq + prev->softirq;
    unsigned long curr_total = curr->user + curr->nice + curr->system +
                               curr->idle + curr->iowait + curr->irq + curr->softirq;

    // Covers both "no time elapsed" and counters that went backwards
    if (curr_total <= prev_total) {
        return 0;
    }
    unsigned long total_diff = curr_total - prev_total;

    unsigned long idle_diff = 0;
    if (curr->idle > prev->idle) {
        idle_diff = curr->idle - prev->idle;
    }
    if (idle_diff > total_diff) {
        idle_diff = total_diff;
    }

    return 100.0 * (total_diff - idle_diff) / total_diff;
}

/**
 * Periodically sample a process and print its context-switch rates,
 * the system-wide CPU utilisation and its vruntime.
 *
 * The interval must be positive because the rates are computed by dividing
 * by it. Monitoring stops with a message as soon as either statistics
 * source can no longer be read (for example when the process has exited),
 * so unreadable samples are never printed.
 *
 * @param pid           Process to monitor.
 * @param interval_sec  Seconds between samples; must be > 0.
 * @param count         Number of samples to print.
 */
void monitor_process(pid_t pid, int interval_sec, int count)
{
    sched_stats_t prev_stats, curr_stats;
    cpu_stats_t prev_cpu, curr_cpu;

    if (interval_sec <= 0) {
        fprintf(stderr, "monitor_process: interval must be > 0 (got %d)\n", interval_sec);
        return;
    }

    printf("Monitoring PID %d every %d seconds...\n\n", pid, interval_sec);
    printf("%-6s %-12s %-12s %-12s %-10s %-10s\n",
           "Time", "Voluntary", "Involuntary", "Total", "CPU%", "vruntime");
    printf("--------------------------------------------------------------\n");

    // Baseline sample
    if (read_sched_stats(pid, &prev_stats) != 0) {
        fprintf(stderr, "Cannot read scheduling stats for PID %d\n", pid);
        return;
    }
    if (read_cpu_stats(-1, &prev_cpu) != 0) {
        fprintf(stderr, "Cannot read CPU stats from /proc/stat\n");
        return;
    }

    for (int i = 0; i < count; i++) {
        sleep(interval_sec);

        if (read_sched_stats(pid, &curr_stats) != 0) {
            fprintf(stderr, "Cannot read scheduling stats for PID %d, stopping\n", pid);
            return;
        }
        if (read_cpu_stats(-1, &curr_cpu) != 0) {
            fprintf(stderr, "Cannot read CPU stats from /proc/stat, stopping\n");
            return;
        }

        // Deltas since the previous sample, reported per second
        unsigned long vol = curr_stats.nr_voluntary_switches - prev_stats.nr_voluntary_switches;
        unsigned long invol = curr_stats.nr_involuntary_switches - prev_stats.nr_involuntary_switches;
        float cpu = calc_cpu_usage(&prev_cpu, &curr_cpu);

        printf("%-6d %-12lu %-12lu %-12lu %-10.1f %-10llu\n",
               i * interval_sec,
               vol / interval_sec,
               invol / interval_sec,
               (vol + invol) / interval_sec,
               cpu,
               curr_stats.vruntime);

        prev_stats = curr_stats;
        prev_cpu = curr_cpu;
    }
}

/**
 * Command-line entry point: monitor <pid> [interval] [count].
 *
 * The interval defaults to 1 second and the count to 10 samples. All three
 * values must be positive integers: a zero interval would cause a division
 * by zero in the monitor, and a non-numeric argument parses as 0.
 *
 * @return  0 after monitoring, 1 on missing or invalid arguments.
 */
int main(int argc, char *argv[])
{
    if (argc < 2) {
        printf("Usage: %s <pid> [interval] [count]\n", argv[0]);
        return 1;
    }

    pid_t pid = atoi(argv[1]);
    int interval = argc > 2 ? atoi(argv[2]) : 1;
    int count = argc > 3 ? atoi(argv[3]) : 10;

    if (pid <= 0 || interval <= 0 || count <= 0) {
        fprintf(stderr, "Invalid argument: pid, interval and count must be positive integers\n");
        printf("Usage: %s <pid> [interval] [count]\n", argv[0]);
        return 1;
    }

    monitor_process(pid, interval, count);

    return 0;
}

总结

Linux进程调度的核心知识点：

模块	关键技术
进程结构	task_struct、进程状态、进程vs线程
上下文切换	寄存器保存/恢复、页表切换、switch_to
CFS调度器	vruntime、红黑树、公平调度
实时调度	SCHED_FIFO/RR/DEADLINE、优先级
多核调度	每CPU运行队列、负载均衡、NUMA感知
CPU亲和性	cpumask、绑核、迁移控制
同步原语	mutex、spinlock、atomic、条件变量

调度优化建议：

场景	优化策略
CPU密集型	绑定CPU、减少上下文切换、适当nice值
I/O密集型	使用异步I/O、合理线程数
实时任务	SCHED_FIFO/DEADLINE、CPU隔离
高并发服务	合理负载均衡、避免锁竞争
NUMA系统	本地内存访问、CPU-内存绑定

这套内容是从内核代码和实际项目中整理出来的，代码都是可运行的。有问题欢迎评论区讨论~

参考资料：

《深入Linux内核架构》

Linux内核源码 kernel/sched/

《Linux内核设计与实现》

LWN.net 调度器相关文章