I. Overall System Call Architecture
Linux system calls are handled in layers; the path from a user-space write() into the kernel passes through the following stages:
```text
user-space write() → glibc wrapper → syscall instruction → entry_SYSCALL_64 →
do_syscall_64 → SYSCALL_DEFINE3(write) → ksys_write() → vfs_write()
```
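The top of this chain is visible from ordinary user code. As a quick illustration (a minimal sketch, not from the kernel sources), the same write reaches entry_SYSCALL_64 whether it goes through the glibc wrapper or the generic syscall(2) wrapper:

```c
/* build: cc -O2 demo_write.c -o demo_write */
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	const char msg[] = "via glibc write()\n";

	/* glibc wrapper: loads rax/rdi/rsi/rdx and executes `syscall` */
	write(STDOUT_FILENO, msg, sizeof(msg) - 1);

	/* generic wrapper: same instruction, syscall number passed explicitly */
	syscall(SYS_write, STDOUT_FILENO, "via syscall()\n", 14);
	return 0;
}
```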
II. Entering the Kernel from User Space
1. Hardware level (the syscall instruction)
```assembly
; user space executes:  syscall
; the hardware then automatically:
;   1. saves rip into rcx
;   2. saves rflags into r11
;   3. loads the kernel cs, ss, and rip from the pre-programmed MSRs
```
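This register contract can be exercised directly with inline assembly. A minimal sketch; the clobber list reflects exactly what the hardware touches (rcx and r11), plus memory for the buffer:

```c
#include <stddef.h>

/* raw write(2): rax = syscall number, args in rdi/rsi/rdx;
 * the syscall instruction clobbers rcx (return rip) and r11 (rflags) */
static long raw_write(int fd, const void *buf, size_t len)
{
	long ret;
	asm volatile("syscall"
	             : "=a"(ret)
	             : "a"(1L /* __NR_write */), "D"((long)fd), "S"(buf), "d"(len)
	             : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	raw_write(1, "hello from raw syscall\n", 23);
	return 0;
}
```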
2. Entry point: entry_SYSCALL_64
This is where the syscall instruction lands in the kernel. Its main work, abridged (the full listing is at the end):
```assembly
SYM_CODE_START(entry_SYSCALL_64)
	swapgs                               ; switch GS base to reach kernel per-CPU data
	movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)         ; stash the user stack pointer
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp                ; switch to the kernel page tables
	movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp  ; switch to the kernel stack

	; build struct pt_regs on the stack (saved user-mode state)
	pushq $__USER_DS                     ; ss
	pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)  ; sp
	pushq %r11                           ; flags
	pushq $__USER_CS                     ; cs
	pushq %rcx                           ; ip (user return address)
	pushq %rax                           ; orig_ax (syscall number)
	PUSH_AND_CLEAR_REGS rax=$-ENOSYS     ; save and clear the remaining GPRs

	; hand off to the C dispatcher
	movq   %rsp, %rdi                    ; arg0: pointer to pt_regs
	movslq %eax, %rsi                    ; arg1: sign-extended syscall number
	call   do_syscall_64
```
3. MSR initialization
```c
void syscall_init(void)
{
	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);  /* 64-bit entry point */

	/* rflags bits to clear on syscall entry */
	wrmsrl(MSR_SYSCALL_MASK,
	       X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|...);
}
```
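These MSRs can be inspected from user space through the msr driver. A hedged sketch (assumes the msr module is loaded, root privileges, and no kernel lockdown; MSR_LSTAR is architecturally 0xC0000082):

```c
/* build: cc msr_dump.c -o msr_dump; run: sudo ./msr_dump
 * needs: modprobe msr (CONFIG_X86_MSR) */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_LSTAR 0xC0000082  /* target rip for 64-bit SYSCALL */

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) { perror("open /dev/cpu/0/msr"); return 1; }
	/* the msr driver interprets the file offset as the MSR number */
	if (pread(fd, &val, sizeof(val), MSR_LSTAR) != sizeof(val)) {
		perror("pread"); return 1;
	}
	printf("MSR_LSTAR = %#llx (address of entry_SYSCALL_64)\n",
	       (unsigned long long)val);
	close(fd);
	return 0;
}
```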
III. System Call Dispatch and Execution
1. do_syscall_64 - the dispatcher
```c
bool do_syscall_64(struct pt_regs *regs, int nr)
{
	nr = syscall_enter_from_user_mode(regs, nr);  /* audit, tracing, seccomp */

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* invalid syscall number, but still a syscall */
		regs->ax = __x64_sys_ni_syscall(regs);  /* returns -ENOSYS */
	}

	syscall_exit_to_user_mode(regs);

	/* decide whether the fast SYSRET exit is usable; for example,
	 * SYSRET requires RCX == RIP and R11 == EFLAGS */
	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
		return false;
	/* ... further checks: CS/SS values, canonical RIP, RF/TF ... */
	return true;
}
```
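syscall_enter_from_user_mode() is where seccomp, ptrace, and audit hook in before any dispatch happens. A small demonstration using the long-standing seccomp strict mode, which leaves only read, write, _exit, and sigreturn callable (a sketch, not the kernel's code):

```c
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);

	write(1, "write still works\n", 18);  /* on the allow list */
	/* any other syscall, e.g. getpid(), is answered with SIGKILL
	 * at syscall entry, before the dispatcher ever runs */
	syscall(SYS_exit, 0);                 /* exit_group is not allowed */
	return 0;
}
```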
2. The system call table
```c
/* the table itself (today kept for tracing; see the note in the full source) */
const sys_call_ptr_t sys_call_table[] = {
	[0] = __x64_sys_read,
	[1] = __x64_sys_write,
	[2] = __x64_sys_open,
	/* ... */
};

/* actual dispatch: a switch generated from <asm/syscalls_64.h> */
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	case __NR_write: return __x64_sys_write(regs);
	case __NR_read:  return __x64_sys_read(regs);
	/* ... */
	default:         return __x64_sys_ni_syscall(regs);
	}
}
```
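The default branch is easy to observe from user space: ask for a syscall number that does not exist and the dispatcher routes it to __x64_sys_ni_syscall():

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long ret = syscall(100000);  /* far beyond NR_syscalls */
	/* the glibc wrapper turns -ENOSYS into ret == -1, errno == ENOSYS */
	printf("ret=%ld errno=%d (%s)\n", ret, errno, strerror(errno));
	return 0;
}
```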
3. SYSCALL_DEFINEx macro expansion
```c
/* the definition in fs/read_write.c */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)
{
	return ksys_write(fd, buf, count);
}

/* among other stubs, the macro generates: */
long __x64_sys_write(const struct pt_regs *regs)
{
	return __se_sys_write(regs->di, regs->si, regs->dx);
}
```
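Spelled out end to end, the generated chain for write looks roughly like the sketch below (simplified from the __SYSCALL_DEFINEx machinery shown in the listings at the end). The intermediate __se_ wrapper carries every argument as a long before casting back to the declared types:

```c
/* ABI stub: unpacks the arguments from pt_regs */
long __x64_sys_write(const struct pt_regs *regs)
{
	return __se_sys_write(regs->di, regs->si, regs->dx);
}

/* sign-extension wrapper: all arguments travel as long */
static long __se_sys_write(unsigned long fd, unsigned long buf,
			   unsigned long count)
{
	return __do_sys_write((unsigned int)fd,
			      (const char __user *)buf, (size_t)count);
}

/* the body the author wrote inside SYSCALL_DEFINE3 */
static inline long __do_sys_write(unsigned int fd, const char __user *buf,
				  size_t count)
{
	return ksys_write(fd, buf, count);
}
```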
IV. The write System Call Implementation
1. ksys_write - the syscall body
```c
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);  /* resolve the fd to a struct file */
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;  /* snapshot the current file position */
			ppos = &pos;
		}
		ret = vfs_write(f.file, buf, count, ppos);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;  /* commit the advanced position */
		fdput_pos(f);
	}
	return ret;
}
```
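This pos/ppos dance is what makes an ordinary write advance the file offset, while pwrite(2), which supplies its own offset, leaves f_pos alone. A quick demonstration (uses a hypothetical scratch file /tmp/fpos_demo):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/fpos_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	write(fd, "abcd", 4);       /* takes the f_pos update path */
	printf("after write : f_pos = %lld\n", (long long)lseek(fd, 0, SEEK_CUR));

	pwrite(fd, "WXYZ", 4, 0);   /* own offset, f_pos untouched */
	printf("after pwrite: f_pos = %lld\n", (long long)lseek(fd, 0, SEEK_CUR));

	close(fd);
	return 0;  /* prints 4 both times */
}
```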
2. vfs_write - the virtual filesystem layer
```c
/* abridged -- the full listing is at the end */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* 1. permission and pointer checks */
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!access_ok(buf, count))
		return -EFAULT;

	/* 2. validate the write range */
	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;

	/* 3. dispatch to the concrete file operation */
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);

	/* 4. notification and accounting */
	if (ret > 0) {
		fsnotify_modify(file);    /* inotify/fanotify modification event */
		add_wchar(current, ret);  /* per-task bytes-written accounting */
	}
	inc_syscw(current);               /* per-task write-syscall counter */
	return ret;
}
```
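The f_op->write dispatch is where drivers plug in. Below is a minimal sketch of a hypothetical misc-device module (the name vfs_write_demo is made up) whose handler is exactly what vfs_write() would call for this device:

```c
#include <linux/fs.h>
#include <linux/minmax.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/uaccess.h>

static ssize_t demo_write(struct file *file, const char __user *buf,
			  size_t count, loff_t *ppos)
{
	char kbuf[64];
	size_t n = min(count, sizeof(kbuf) - 1);

	if (copy_from_user(kbuf, buf, n))  /* the user pointer may fault */
		return -EFAULT;
	kbuf[n] = '\0';
	pr_info("demo_write: got %zu bytes: %s\n", n, kbuf);
	return count;                      /* claim everything was consumed */
}

static const struct file_operations demo_fops = {
	.owner = THIS_MODULE,
	.write = demo_write,
};

static struct miscdevice demo_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name  = "vfs_write_demo",       /* appears as /dev/vfs_write_demo */
	.fops  = &demo_fops,
};

module_misc_device(demo_dev);
MODULE_LICENSE("GPL");
```

After loading, `echo hi > /dev/vfs_write_demo` exercises the entire path described above, ending in demo_write().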
V. Returning from the System Call
1. Fast return path (SYSRET)
```assembly
syscall_return_via_sysret:
	POP_REGS pop_rdi=0                      ; restore all GPRs except rdi/rsp
	movq %rsp, %rdi                         ; remember the kernel stack pointer
	movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp  ; switch to the trampoline stack
	pushq RSP-RDI(%rdi)                     ; copy the user rsp across
	pushq (%rdi)                            ; copy the user rdi across
	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi     ; switch back to the user page tables
	popq %rdi
	popq %rsp
	swapgs                                  ; back to the user GS base
	sysretq                                 ; fast return to user space
```
2. Slow return path (IRET)
If the SYSRET conditions are not met (e.g. a non-canonical return address, or RF/TF must be restored), the kernel returns through IRET instead:
- restores the complete context, including states SYSRET cannot handle
- safer, but slower
VI. Key Data Structures
1. pt_regs - the saved user-mode registers
```c
struct pt_regs {
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
	unsigned long bp;
	unsigned long bx;
	/* ... */
	unsigned long di;       /* arg0 */
	unsigned long si;       /* arg1 */
	unsigned long dx;       /* arg2 */
	unsigned long cx;       /* user return address (arg3 arrives in r10, since syscall clobbers rcx) */
	unsigned long ax;       /* syscall number on entry, return value on exit */
	unsigned long orig_ax;  /* the original syscall number */
	/* ... */
};
```
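User space sees the same layout through ptrace: struct user_regs_struct mirrors pt_regs, and at a syscall-entry stop orig_rax carries the syscall number while rax already holds -ENOSYS. A minimal sketch that catches the child's write(1, "hi\n", 3):

```c
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		kill(getpid(), SIGSTOP);          /* let the parent attach */
		write(1, "hi\n", 3);
		_exit(0);
	}
	waitpid(pid, NULL, 0);                    /* child stopped */
	ptrace(PTRACE_SYSCALL, pid, NULL, NULL);  /* run to next syscall entry */
	waitpid(pid, NULL, 0);

	struct user_regs_struct regs;
	ptrace(PTRACE_GETREGS, pid, NULL, &regs);
	/* expect orig_rax == 1 (__NR_write), rdi == 1, rdx == 3 */
	printf("orig_rax=%llu rdi=%llu rsi=%#llx rdx=%llu\n",
	       regs.orig_rax, regs.rdi, regs.rsi, regs.rdx);

	ptrace(PTRACE_CONT, pid, NULL, NULL);
	waitpid(pid, NULL, 0);
	return 0;
}
```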
2. struct file - the open-file object behind a descriptor
```c
struct file {
	fmode_t f_mode;                      /* access-mode flags (FMODE_*) */
	loff_t f_pos;                        /* current file position */
	const struct file_operations *f_op;  /* table of file operations */
	/* ... */
};
```
VII. Security Mechanisms
1. Address validation
```c
access_ok(buf, count)  /* is the user buffer within the user address range? */
```
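The effect of a failed user-memory check is directly observable: hand write(2) a pointer that no mapping backs and the copy from user space faults, so the syscall returns EFAULT instead of crashing the kernel:

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* address 1 is in the user range, so access_ok() passes;
	 * the copy_from_user() in the driver faults instead */
	ssize_t ret = write(STDOUT_FILENO, (const void *)1, 16);
	printf("ret=%zd errno=%d (%s)\n", ret, errno, strerror(errno));
	return 0;  /* prints ret=-1 errno=14 (Bad address) */
}
```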
2. Permission checks
```c
rw_verify_area(WRITE, file, pos, count)  /* range, overflow, and security-hook checks */
```
3. Bounds limiting
```c
if (count > MAX_RW_COUNT) count = MAX_RW_COUNT;  /* cap a single transfer */
```
4. Control-flow protection
```c
ENDBR        /* Intel CET: valid indirect-branch landing pad */
IBRS_ENTER   /* speculative-execution (Spectre v2) mitigation */
UNWIND_HINT  /* annotation for the ORC stack unwinder */
```
VIII. Performance Optimizations
- Fast system calls: syscall/sysret instead of the legacy int 0x80 interrupt gate
- Register-passed arguments: up to six arguments travel in registers, so nothing is copied via the stack
- SYSRET fast path: the faster return instruction whenever the exit conditions allow it (see the microbenchmark sketch below)
- Page-table switching: optimized CR3 switches on the PTI trampoline
- per-CPU data: swapgs gives one-instruction access to per-CPU state
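The combined round-trip cost is easy to measure with a hypothetical microbenchmark like the one below; absolute numbers vary widely with the CPU and with which mitigations are active:

```c
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	enum { N = 1000000 };
	struct timespec a, b;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (int i = 0; i < N; i++)
		syscall(SYS_getpid);  /* raw syscall, bypassing any library caching */
	clock_gettime(CLOCK_MONOTONIC, &b);

	double ns = (b.tv_sec - a.tv_sec) * 1e9 + (b.tv_nsec - a.tv_nsec);
	printf("%.1f ns per syscall round trip\n", ns / N);
	return 0;
}
```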
Flow Summary
user-space write() → syscall instruction → hardware mode switch → entry_SYSCALL_64 → save context → do_syscall_64 → syscall dispatch → __x64_sys_write → ksys_write → vfs_write → concrete filesystem operation → return value → restore context → sysret/iret → user space
This architecture achieves efficient system call handling while preserving the established ABI, and it is a key ingredient of modern Linux performance.
IX. Kernel Source Listings
The abridged snippets above come from fs/read_write.c, arch/x86/kernel/cpu/common.c, arch/x86/entry/entry_64.S, arch/x86/entry/common.c, arch/x86/entry/syscall_64.c, and the syscall-wrapper headers. The relevant excerpts in full:
```c
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (ret)
return ret;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
file_start_write(file);
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else if (file->f_op->write_iter)
ret = new_sync_write(file, buf, count, pos);
else
ret = -EINVAL;
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file);
return ret;
}
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos, *ppos = file_ppos(f.file);
if (ppos) {
pos = *ppos;
ppos = &pos;
}
ret = vfs_write(f.file, buf, count, ppos);
if (ret >= 0 && ppos)
f.file->f_pos = pos;
fdput_pos(f);
}
return ret;
}
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
return ksys_write(fd, buf, count);
}
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
if (ia32_enabled()) {
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
} else {
wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
}
/*
* Flags to clear on syscall; clear as much as possible
* to minimize user space-kernel interference.
*/
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
X86_EFLAGS_AC|X86_EFLAGS_ID);
}
```
```assembly
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
* This is the only entry point used for 64-bit system calls. The
* hardware interface is reasonably well designed and the register to
* argument mapping Linux uses fits well with the registers that are
* available when SYSCALL is used.
*
* SYSCALL instructions can be found inlined in libc implementations as
* well as some other programs and libraries. There are also a handful
* of SYSCALL instructions in the vDSO used, for example, as a
* clock_gettimeofday fallback.
*
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Registers on entry:
* rax system call number
* rcx return address
* r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
* rsi arg1
* rdx arg2
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
* (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
* When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_ENTRY
ENDBR
swapgs
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
movq %rsp, %rdi
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
CLEAR_BRANCH_HISTORY
call do_syscall_64 /* returns with IRQs disabled */
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
* In the Xen PV case we must use iret anyway.
*/
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
* We win! This label is here just for ease of understanding
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
IBRS_EXIT
POP_REGS pop_rdi=0
/*
* Now all regs are restored except RSP and RDI.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
UNWIND_HINT_END_OF_STACK
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs
CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_64)
```
```c
/* Returns true to return using SYSRET, or false to use IRET */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
/* Invalid system call, but still a system call. */
regs->ax = __x64_sys_ni_syscall(regs);
}
instrumentation_end();
syscall_exit_to_user_mode(regs);
/*
* Check that the register state is valid for using SYSRET to exit
* to userspace. Otherwise use the slower but fully capable IRET
* exit path.
*/
/* XEN PV guests always use the IRET path */
if (cpu_feature_enabled(X86_FEATURE_XENPV))
return false;
/* SYSRET requires RCX == RIP and R11 == EFLAGS */
if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
return false;
/* CS and SS must match the values set in MSR_STAR */
if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
return false;
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
* in kernel space. This essentially lets the user take over
* the kernel, since userspace controls RSP.
*
* TASK_SIZE_MAX covers all user-accessible addresses other than
* the deprecated vsyscall page.
*/
if (unlikely(regs->ip >= TASK_SIZE_MAX))
return false;
/*
* SYSRET cannot restore RF. It can restore TF, but unlike IRET,
* restoring TF results in a trap from userspace immediately after
* SYSRET.
*/
if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
return false;
/* Use SYSRET to exit to userspace */
return true;
}
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
/*
* Convert negative numbers to very high and thus out of range
* numbers for comparisons.
*/
unsigned int unr = nr;
if (likely(unr < NR_syscalls)) {
unr = array_index_nospec(unr, NR_syscalls);
regs->ax = x64_sys_call(regs, unr);
return true;
}
return false;
}
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
#undef __SYSCALL
/*
* The sys_call_table[] is no longer used for system calls, but
* kernel/trace/trace_syscalls.c still wants to know the system
* call address.
*/
#define __SYSCALL(nr, sym) __x64_##sym,
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
#undef __SYSCALL
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
#include <asm/syscalls_64.h>
default: return __x64_sys_ni_syscall(regs);
}
};
#define __SYSCALL_DEFINEx(x, name, ...) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
__X64_SYS_STUBx(x, name, __VA_ARGS__) \
__IA32_SYS_STUBx(x, name, __VA_ARGS__) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#ifdef CONFIG_X86_64
#define __X64_SYS_STUB0(name) \
__SYS_STUB0(x64, sys_##name)
#define __X64_SYS_STUBx(x, name, ...) \
__SYS_STUBx(x64, sys##name, \
SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__))
#define __X64_COND_SYSCALL(name) \
__COND_SYSCALL(x64, sys_##name)
#else /* CONFIG_X86_64 */
#define __X64_SYS_STUB0(name)
#define __X64_SYS_STUBx(x, name, ...)
#define __X64_COND_SYSCALL(name)
#endif /* CONFIG_X86_64 */
#define __SYS_STUBx(abi, name, ...) \
long __##abi##_##name(const struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
long __##abi##_##name(const struct pt_regs *regs) \
{ \
return __se_##name(__VA_ARGS__); \
}
```