系统调用是用户态程序进入内核的唯一合法通道。在 x86_64 架构上,syscall 指令提供了最低延迟的陷门。本文基于 Linux 6.8.12 源码,完整剖析 64 位系统调用的生命周期:从 CPU 的 MSR 初始化,到 entry_SYSCALL_64 的现场保护,再到 do_syscall_64 的分发执行,以及最终 sysret 与 iret 的路径选择。同时,我们还将看到内核如何为了安全而在性能上做出的必要取舍。
一、硬件准备:syscall_init 与 MSR 魔法
x86_64 CPU 通过一组模型特定寄存器(MSR) 控制 syscall / sysret 的行为。Linux 在启动时通过 syscall_init() 配置这些寄存器(arch/x86/kernel/cpu/common.c)。
c
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
// ... 兼容 32 位与 SYSENTER 处理 ...
wrmsrl(MSR_SYSCALL_MASK, X86_EFLAGS_CF|X86_EFLAGS_PF| ... |X86_EFLAGS_ID);
}
MSR_STAR:syscall进入内核时,CPU 从该寄存器的[47:32]位加载CS和SS(内核段),退出时从[63:48]位加载用户段。MSR_LSTAR:存放内核系统调用入口点entry_SYSCALL_64。MSR_SYSCALL_MASK:当syscall执行时,硬件会将RFLAGS与该掩码做AND操作,从而清除中断标志IF、方向标志DF等,保证内核态运行时不会意外被中断。
一个小细节:syscall_init 没有标记为 __init,因为系统休眠唤醒后需要重新加载这些 MSR,所以该函数必须保留在运行时镜像中。
二、入口冲锋:entry_SYSCALL_64 的栈切换与现场保存
当用户程序执行 syscall 时,CPU 自动完成以下动作:
- 将 RIP 保存到 RCX,将 RFLAGS 保存到 R11;
- 从 MSR_LSTAR 加载 entry_SYSCALL_64 的地址到 RIP;
- 从 MSR_STAR 加载内核 CS/SS,并按 MSR_SYSCALL_MASK 屏蔽 RFLAGS;
- 不自动切换 RSP(仍指向用户栈),也不压栈任何内容。
因此入口代码的第一要务是切换到内核栈并保存所有寄存器 (arch/x86/entry/entry_64.S):
assembly
SYM_CODE_START(entry_SYSCALL_64)
swapgs /* 交换 GS 基址,访问 per-CPU 数据 */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* 暂存用户栈顶 */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp /* 切换到内核栈 */
/* 构建 struct pt_regs */
pushq $__USER_DS /* ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* sp (用户栈) */
pushq %r11 /* flags */
pushq $__USER_CS /* cs */
pushq %rcx /* ip (用户返回地址) */
pushq %rax /* orig_ax (系统调用号) */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS /* 保存剩余通用寄存器,rax 设为 -ENOSYS */
压栈顺序与 struct pt_regs 的定义完全匹配。注意 RCX 和 R11 被特意压入栈中------它们保存了用户态的 RIP 和 RFLAGS,将在返回时使用。
三、C 世界:do_syscall_64 与系统调用表
栈建立完成后,调用 do_syscall_64(struct pt_regs *regs, int nr)(arch/x86/entry/common.c):
c
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1)
regs->ax = __x64_sys_ni_syscall(regs);
instrumentation_end();
syscall_exit_to_user_mode(regs);
// ... 返回路径决策 ...
}
核心分发函数 do_syscall_x64 会通过系统调用表查找对应的内核函数:
c
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
unsigned int unr = nr;
if (likely(unr < NR_syscalls)) {
unr = array_index_nospec(unr, NR_syscalls);
regs->ax = x64_sys_call(regs, unr);
return true;
}
return false;
}
x64_sys_call 是根据系统调用表 arch/x86/entry/syscalls/syscall_64.tbl 自动生成的巨大 switch 语句,将系统调用号映射到实际函数(如 __x64_sys_read)。
注意 :代码中有一行调试输出 trace_printk("do_syscall_64: nr=%d, ip=0x%lx\n", nr, regs->ip);,这通常是内核开发期间用于追踪系统调用行为的临时日志,生产内核中不应出现。
四、返回决策:SYSRET 快速路 vs IRET 慢速路
系统调用返回时,内核可以选择两条路:
- 快速路:使用 sysretq 指令,开销极小但条件苛刻。
- 慢速路:使用 iretq 指令,能够处理各种复杂情况(如改变 CS、SS、信号返回等)。
do_syscall_64 在返回前会执行一系列检查,决定是否能够使用 sysret:
c
/* 1. Xen PV 虚拟机强制走 IRET */
if (cpu_feature_enabled(X86_FEATURE_XENPV))
return false;
/* 2. RCX == RIP 且 R11 == RFLAGS 才能用 SYSRET */
if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
return false;
/* 3. CS/SS 必须为标准用户段 */
if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
return false;
/* 4. RIP 必须在用户空间范围内(非规范地址) */
if (unlikely(regs->ip >= TASK_SIZE_MAX))
return false;
/* 5. 不能有 RF 或 TF 标志 */
if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
return false;
return true; /* 所有检查通过,使用 SYSRET */
- RCX/RIP 匹配:因为 sysret 从 RCX 恢复 RIP,如果用户态修改了 RCX,直接跳转会导致不可预知的结果。
- 非规范地址:某些 CPU 在 sysret 遇到非规范 RIP 时会在内核态触发 #GP,可被利用为安全漏洞,因此必须拦截。
- TF 标志:如果用户态请求单步调试,sysret 恢复 TF 后会在返回用户态后立即触发 #DB,破坏执行流。
五、快速返回的实现细节
如果决策通过,汇编代码会执行快速返回路径:
assembly
syscall_return_via_sysret:
IBRS_EXIT
POP_REGS pop_rdi=0 /* 恢复除 RDI、RSP 外的所有寄存器 */
movq %rsp, %rdi /* 保存当前栈指针 */
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp /* 切到 trampoline 栈 */
pushq RSP-RDI(%rdi) /* 压入原用户 RSP */
pushq (%rdi) /* 压入原 RDI */
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
swapgs
CLEAR_CPU_BUFFERS
sysretq
最后一步 sysretq 由硬件完成:
- RIP = RCX(用户返回地址);
- RFLAGS = R11;
- CS/SS 从 MSR_STAR 加载用户段;
- 特权级切换到 Ring 3。
整个过程不经过任何软件中断或任务切换,因此延迟极低。
六、安全博弈:缓解现代 CPU 漏洞的代价
Linux 6.8.12 在系统调用路径中集成了大量针对推测执行漏洞的缓解措施:
| 宏 / 标签 | 作用 |
|---|---|
| IBRS_ENTER | 阻止用户态间接分支预测污染内核 |
| UNTRAIN_RET | 清空返回预测器(RSB),防御 Retbleed |
| CLEAR_BRANCH_HISTORY | 冲刷分支历史缓冲区,防御 Branch History Injection(BHI) |
| CLEAR_CPU_BUFFERS | 在返回用户态前清除 CPU 内部缓冲区(如填充缓冲区) |
| SWITCH_TO_KERNEL_CR3 | 配合内核页表隔离(KPTI),切换内核页表 |
这些措施在 entry_SYSCALL_64 中被精确放置在用户态可控状态刚刚进入内核时 ,以及返回用户态的前一刻 。
当然,它们也带来了可测量的性能开销:一次简单的 getpid 系统调用可能因为额外的 IBRS 和 RSB 清理而增加数十纳秒。然而在安全威胁面前,这已经是内核开发者能够做出的最优权衡。
七、总结:一条指令背后的复杂性
从用户态 syscall 到内核态 sysret,整个流程涉及:
- MSR 的精心配置(STAR、LSTAR、SYSCALL_MASK)
- 栈的两次切换(用户栈 → 内核栈 → trampoline 栈)
- 上下文的完整保存与恢复(pt_regs)
- 系统调用表的快速分发(x64_sys_call)
- 安全返回的条件判断(do_syscall_64 中的五重检查)
- 多层次的漏洞缓解(IBRS、UNTRAIN_RET、CLEAR_BRANCH_HISTORY 等)
每一处代码都凝聚了内核开发者对性能、正确性与安全的反复权衡。了解这些细节,不仅有助于写出更高效的应用程序,也能帮助我们真正理解"用户态与内核态边界"这一操作系统的核心抽象。

## 源码
/*
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
if (ia32_enabled()) {
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
} else {
wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
}
/*
* Flags to clear on syscall; clear as much as possible
* to minimize user space-kernel interference.
*/
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
X86_EFLAGS_AC|X86_EFLAGS_ID);
}
/*
 * 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
 *
 * This is the only entry point used for 64-bit system calls. The
 * hardware interface is reasonably well designed and the register to
 * argument mapping Linux uses fits well with the registers that are
 * available when SYSCALL is used.
 *
 * SYSCALL instructions can be found inlined in libc implementations as
 * well as some other programs and libraries. There are also a handful
 * of SYSCALL instructions in the vDSO used, for example, as a
 * clock_gettimeofday fallback.
 *
 * 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
 * then loads new ss, cs, and rip from previously programmed MSRs.
 * rflags gets masked by a value from another MSR (so CLD and CLAC
 * are not needed). SYSCALL does not save anything on the stack
 * and does not change rsp.
 *
 * Registers on entry:
 * rax system call number
 * rcx return address
 * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
 * rdi arg0
 * rsi arg1
 * rdx arg2
 * r10 arg3 (needs to be moved to rcx to conform to C ABI)
 * r8 arg4
 * r9 arg5
 * (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
 *
 * Only called from user space.
 *
 * When user can change pt_regs->foo always force IRET. That is because
 * it deals with uncanonical addresses better. SYSRET has trouble
 * with them due to bugs in both AMD and Intel CPUs.
 */
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_ENTRY
ENDBR
/* GSBASE still holds the user value here; swap in the kernel per-CPU base. */
swapgs
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
/* KPTI: switch CR3 to kernel page tables; RSP is the only usable scratch reg. */
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
/* Switch to the task's kernel stack; user RSP was stashed in tss.sp2 above. */
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
movq %rsp, %rdi
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi
/* clobbers %rax, make sure it is after saving the syscall nr */
/* Speculation mitigations run before any C code is reachable. */
IBRS_ENTER
UNTRAIN_RET
CLEAR_BRANCH_HISTORY
call do_syscall_64 /* returns with IRQs disabled */
/*
 * Try to use SYSRET instead of IRET if we're returning to
 * a completely clean 64-bit userspace context. If we're not,
 * go to the slow exit path.
 * In the Xen PV case we must use iret anyway.
 */
/* %al holds do_syscall_64()'s bool return: zero => take the IRET slow path. */
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
 * We win! This label is here just for ease of understanding
 * perf profiles. Nothing jumps here.
 */
syscall_return_via_sysret:
IBRS_EXIT
POP_REGS pop_rdi=0
/*
 * Now all regs are restored except RSP and RDI.
 * Save old stack pointer and switch to trampoline stack.
 */
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
UNWIND_HINT_END_OF_STACK
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
 * We are on the trampoline stack. All regs except RDI are live.
 * We can do future final exit work right here.
 */
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs
/* Flush CPU-internal buffers just before dropping to user mode (MDS-class mitigations). */
CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Never reached architecturally; traps if execution somehow runs past sysretq. */
int3
SYM_CODE_END(entry_SYSCALL_64)
/* Returns true to return using SYSRET, or false to use IRET */
/*
 * C-level 64-bit syscall handler, called from entry_SYSCALL_64 with a
 * freshly built pt_regs in @regs and the sign-extended syscall number in @nr.
 *
 * noinstr: runs before/after the instrumentation_begin()/end() window, so
 * only the code between those markers may be instrumented.
 */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
/* Try native 64-bit dispatch first, then x32; nr == -1 is "skip" (e.g. seccomp). */
if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
/* Invalid system call, but still a system call. */
regs->ax = __x64_sys_ni_syscall(regs);
}
instrumentation_end();
syscall_exit_to_user_mode(regs);
/*
 * Check that the register state is valid for using SYSRET to exit
 * to userspace. Otherwise use the slower but fully capable IRET
 * exit path.
 */
/* XEN PV guests always use the IRET path */
if (cpu_feature_enabled(X86_FEATURE_XENPV))
return false;
/* SYSRET requires RCX == RIP and R11 == EFLAGS */
if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
return false;
/* CS and SS must match the values set in MSR_STAR */
if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
return false;
/*
 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
 * in kernel space. This essentially lets the user take over
 * the kernel, since userspace controls RSP.
 *
 * TASK_SIZE_MAX covers all user-accessible addresses other than
 * the deprecated vsyscall page.
 */
if (unlikely(regs->ip >= TASK_SIZE_MAX))
return false;
/*
 * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
 * restoring TF results in a trap from userspace immediately after
 * SYSRET.
 */
if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
return false;
/* Use SYSRET to exit to userspace */
return true;
}
/*
 * Dispatch a native 64-bit system call.
 *
 * Returns true when @nr was in range and dispatched (regs->ax holds the
 * syscall's return value); false when out of range, so the caller can try
 * the x32 path or fall back to -ENOSYS.
 *
 * Fix: dropped the leftover debug trace_printk() (and its "yym-gaizao"
 * marker) — per-syscall tracing does not belong in a production kernel's
 * hot path; use the existing syscall tracepoints instead.
 */
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
	/*
	 * Convert negative numbers to very high and thus out of range
	 * numbers for comparisons.
	 */
	unsigned int unr = nr;

	if (likely(unr < NR_syscalls)) {
		/* Clamp the index under speculation (Spectre v1 hardening). */
		unr = array_index_nospec(unr, NR_syscalls);
		regs->ax = x64_sys_call(regs, unr);
		return true;
	}
	return false;
}
/*
 * Generated dispatcher: map syscall number @nr to its __x64_sys_*()
 * implementation via a switch whose cases come from <asm/syscalls_64.h>;
 * unknown numbers fall through to __x64_sys_ni_syscall().
 *
 * Fix: removed the stray ';' after the closing brace — an extra top-level
 * semicolon is not valid strict ISO C (pre-C23) and draws compiler warnings.
 */
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	#include <asm/syscalls_64.h>
	default: return __x64_sys_ni_syscall(regs);
	}
}