I. Overall System Call Architecture
Linux system calls are handled in layers; the path from a user-space write() into the kernel passes through the following stages:
```text
user-space write() → glibc wrapper → syscall instruction → entry_SYSCALL_64 →
do_syscall_64 → SYSCALL_DEFINE3(write) → ksys_write() → vfs_write()
```
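The top of this chain is visible from ordinary user code. As a quick illustration (a minimal sketch, not from the kernel sources), the same write reaches entry_SYSCALL_64 whether it goes through the glibc wrapper or the generic syscall(2) wrapper:

```c
/* build: cc -O2 demo_write.c -o demo_write */
#include <unistd.h>
#include <sys/syscall.h>

int main(void)
{
	const char msg[] = "via glibc write()\n";

	/* glibc wrapper: loads rax/rdi/rsi/rdx and executes `syscall` */
	write(STDOUT_FILENO, msg, sizeof(msg) - 1);

	/* generic wrapper: same instruction, syscall number passed explicitly */
	syscall(SYS_write, STDOUT_FILENO, "via syscall()\n", 14);
	return 0;
}
```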
II. Entering the Kernel from User Space
1. Hardware level (the syscall instruction)
```assembly
; user space executes:  syscall
; the hardware then automatically:
;   1. saves rip into rcx
;   2. saves rflags into r11
;   3. loads the kernel cs, ss, and rip from the pre-programmed MSRs
```
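This register contract can be exercised directly with inline assembly. A minimal sketch; the clobber list reflects exactly what the hardware touches (rcx and r11), plus memory for the buffer:

```c
#include <stddef.h>

/* raw write(2): rax = syscall number, args in rdi/rsi/rdx;
 * the syscall instruction clobbers rcx (return rip) and r11 (rflags) */
static long raw_write(int fd, const void *buf, size_t len)
{
	long ret;
	asm volatile("syscall"
	             : "=a"(ret)
	             : "a"(1L /* __NR_write */), "D"((long)fd), "S"(buf), "d"(len)
	             : "rcx", "r11", "memory");
	return ret;
}

int main(void)
{
	raw_write(1, "hello from raw syscall\n", 23);
	return 0;
}
```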
2. Entry point: entry_SYSCALL_64
This is where the syscall instruction lands in the kernel. Its main work, abridged (the full listing is at the end):
```assembly
SYM_CODE_START(entry_SYSCALL_64)
	swapgs                               ; switch GS base to reach kernel per-CPU data
	movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)         ; stash the user stack pointer
	SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp                ; switch to the kernel page tables
	movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp  ; switch to the kernel stack

	; build struct pt_regs on the stack (saved user-mode state)
	pushq $__USER_DS                     ; ss
	pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2)  ; sp
	pushq %r11                           ; flags
	pushq $__USER_CS                     ; cs
	pushq %rcx                           ; ip (user return address)
	pushq %rax                           ; orig_ax (syscall number)
	PUSH_AND_CLEAR_REGS rax=$-ENOSYS     ; save and clear the remaining GPRs

	; hand off to the C dispatcher
	movq   %rsp, %rdi                    ; arg0: pointer to pt_regs
	movslq %eax, %rsi                    ; arg1: sign-extended syscall number
	call   do_syscall_64
```
3. MSR initialization
```c
void syscall_init(void)
{
	wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
	wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);  /* 64-bit entry point */

	/* rflags bits to clear on syscall entry */
	wrmsrl(MSR_SYSCALL_MASK,
	       X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|...);
}
```
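These MSRs can be inspected from user space through the msr driver. A hedged sketch (assumes the msr module is loaded, root privileges, and no kernel lockdown; MSR_LSTAR is architecturally 0xC0000082):

```c
/* build: cc msr_dump.c -o msr_dump; run: sudo ./msr_dump
 * needs: modprobe msr (CONFIG_X86_MSR) */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

#define MSR_LSTAR 0xC0000082  /* target rip for 64-bit SYSCALL */

int main(void)
{
	uint64_t val;
	int fd = open("/dev/cpu/0/msr", O_RDONLY);

	if (fd < 0) { perror("open /dev/cpu/0/msr"); return 1; }
	/* the msr driver interprets the file offset as the MSR number */
	if (pread(fd, &val, sizeof(val), MSR_LSTAR) != sizeof(val)) {
		perror("pread"); return 1;
	}
	printf("MSR_LSTAR = %#llx (address of entry_SYSCALL_64)\n",
	       (unsigned long long)val);
	close(fd);
	return 0;
}
```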
III. System Call Dispatch and Execution
1. do_syscall_64 - the dispatcher
```c
bool do_syscall_64(struct pt_regs *regs, int nr)
{
	nr = syscall_enter_from_user_mode(regs, nr);  /* audit, tracing, seccomp */

	if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
		/* invalid syscall number, but still a syscall */
		regs->ax = __x64_sys_ni_syscall(regs);  /* returns -ENOSYS */
	}

	syscall_exit_to_user_mode(regs);

	/* decide whether the fast SYSRET exit is usable; for example,
	 * SYSRET requires RCX == RIP and R11 == EFLAGS */
	if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
		return false;
	/* ... further checks: CS/SS values, canonical RIP, RF/TF ... */
	return true;
}
```
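syscall_enter_from_user_mode() is where seccomp, ptrace, and audit hook in before any dispatch happens. A small demonstration using the long-standing seccomp strict mode, which leaves only read, write, _exit, and sigreturn callable (a sketch, not the kernel's code):

```c
#include <linux/seccomp.h>
#include <sys/prctl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	prctl(PR_SET_SECCOMP, SECCOMP_MODE_STRICT);

	write(1, "write still works\n", 18);  /* on the allow list */
	/* any other syscall, e.g. getpid(), is answered with SIGKILL
	 * at syscall entry, before the dispatcher ever runs */
	syscall(SYS_exit, 0);                 /* exit_group is not allowed */
	return 0;
}
```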
2. The system call table
```c
/* the table itself (today kept for tracing; see the note in the full source) */
const sys_call_ptr_t sys_call_table[] = {
	[0] = __x64_sys_read,
	[1] = __x64_sys_write,
	[2] = __x64_sys_open,
	/* ... */
};

/* actual dispatch: a switch generated from <asm/syscalls_64.h> */
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
	switch (nr) {
	case __NR_write: return __x64_sys_write(regs);
	case __NR_read:  return __x64_sys_read(regs);
	/* ... */
	default:         return __x64_sys_ni_syscall(regs);
	}
}
```
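The default branch is easy to observe from user space: ask for a syscall number that does not exist and the dispatcher routes it to __x64_sys_ni_syscall():

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	long ret = syscall(100000);  /* far beyond NR_syscalls */
	/* the glibc wrapper turns -ENOSYS into ret == -1, errno == ENOSYS */
	printf("ret=%ld errno=%d (%s)\n", ret, errno, strerror(errno));
	return 0;
}
```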
3. SYSCALL_DEFINEx macro expansion
```c
/* the definition in fs/read_write.c */
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf, size_t, count)
{
	return ksys_write(fd, buf, count);
}

/* among other stubs, the macro generates: */
long __x64_sys_write(const struct pt_regs *regs)
{
	return __se_sys_write(regs->di, regs->si, regs->dx);
}
```
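Spelled out end to end, the generated chain for write looks roughly like the sketch below (simplified from the __SYSCALL_DEFINEx machinery shown in the listings at the end). The intermediate __se_ wrapper carries every argument as a long before casting back to the declared types:

```c
/* ABI stub: unpacks the arguments from pt_regs */
long __x64_sys_write(const struct pt_regs *regs)
{
	return __se_sys_write(regs->di, regs->si, regs->dx);
}

/* sign-extension wrapper: all arguments travel as long */
static long __se_sys_write(unsigned long fd, unsigned long buf,
			   unsigned long count)
{
	return __do_sys_write((unsigned int)fd,
			      (const char __user *)buf, (size_t)count);
}

/* the body the author wrote inside SYSCALL_DEFINE3 */
static inline long __do_sys_write(unsigned int fd, const char __user *buf,
				  size_t count)
{
	return ksys_write(fd, buf, count);
}
```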
IV. The write System Call Implementation
1. ksys_write - the syscall body
```c
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
	struct fd f = fdget_pos(fd);  /* resolve the fd to a struct file */
	ssize_t ret = -EBADF;

	if (f.file) {
		loff_t pos, *ppos = file_ppos(f.file);
		if (ppos) {
			pos = *ppos;  /* snapshot the current file position */
			ppos = &pos;
		}
		ret = vfs_write(f.file, buf, count, ppos);
		if (ret >= 0 && ppos)
			f.file->f_pos = pos;  /* commit the advanced position */
		fdput_pos(f);
	}
	return ret;
}
```
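This pos/ppos dance is what makes an ordinary write advance the file offset, while pwrite(2), which supplies its own offset, leaves f_pos alone. A quick demonstration (uses a hypothetical scratch file /tmp/fpos_demo):

```c
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/fpos_demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	write(fd, "abcd", 4);       /* takes the f_pos update path */
	printf("after write : f_pos = %lld\n", (long long)lseek(fd, 0, SEEK_CUR));

	pwrite(fd, "WXYZ", 4, 0);   /* own offset, f_pos untouched */
	printf("after pwrite: f_pos = %lld\n", (long long)lseek(fd, 0, SEEK_CUR));

	close(fd);
	return 0;  /* prints 4 both times */
}
```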
2. vfs_write - the virtual filesystem layer
```c
/* abridged -- the full listing is at the end */
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
	ssize_t ret;

	/* 1. permission and pointer checks */
	if (!(file->f_mode & FMODE_WRITE))
		return -EBADF;
	if (!access_ok(buf, count))
		return -EFAULT;

	/* 2. validate the write range */
	ret = rw_verify_area(WRITE, file, pos, count);
	if (ret)
		return ret;

	/* 3. dispatch to the concrete file operation */
	if (file->f_op->write)
		ret = file->f_op->write(file, buf, count, pos);
	else if (file->f_op->write_iter)
		ret = new_sync_write(file, buf, count, pos);

	/* 4. notification and accounting */
	if (ret > 0) {
		fsnotify_modify(file);    /* inotify/fanotify modification event */
		add_wchar(current, ret);  /* per-task bytes-written accounting */
	}
	inc_syscw(current);               /* per-task write-syscall counter */
	return ret;
}
```
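The f_op->write dispatch is where drivers plug in. Below is a minimal sketch of a hypothetical misc-device module (the name vfs_write_demo is made up) whose handler is exactly what vfs_write() would call for this device:

```c
#include <linux/fs.h>
#include <linux/minmax.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/uaccess.h>

static ssize_t demo_write(struct file *file, const char __user *buf,
			  size_t count, loff_t *ppos)
{
	char kbuf[64];
	size_t n = min(count, sizeof(kbuf) - 1);

	if (copy_from_user(kbuf, buf, n))  /* the user pointer may fault */
		return -EFAULT;
	kbuf[n] = '\0';
	pr_info("demo_write: got %zu bytes: %s\n", n, kbuf);
	return count;                      /* claim everything was consumed */
}

static const struct file_operations demo_fops = {
	.owner = THIS_MODULE,
	.write = demo_write,
};

static struct miscdevice demo_dev = {
	.minor = MISC_DYNAMIC_MINOR,
	.name  = "vfs_write_demo",       /* appears as /dev/vfs_write_demo */
	.fops  = &demo_fops,
};

module_misc_device(demo_dev);
MODULE_LICENSE("GPL");
```

After loading, `echo hi > /dev/vfs_write_demo` exercises the entire path described above, ending in demo_write().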
V. Returning from the System Call
1. Fast return path (SYSRET)
```assembly
syscall_return_via_sysret:
	POP_REGS pop_rdi=0                      ; restore all GPRs except rdi/rsp
	movq %rsp, %rdi                         ; remember the kernel stack pointer
	movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp  ; switch to the trampoline stack
	pushq RSP-RDI(%rdi)                     ; copy the user rsp across
	pushq (%rdi)                            ; copy the user rdi across
	SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi     ; switch back to the user page tables
	popq %rdi
	popq %rsp
	swapgs                                  ; back to the user GS base
	sysretq                                 ; fast return to user space
```
2. Slow return path (IRET)
If the SYSRET conditions are not met (e.g. a non-canonical return address, or RF/TF must be restored), the kernel returns through IRET instead:
- restores the complete context, including states SYSRET cannot handle
- safer, but slower
VI. Key Data Structures
1. pt_regs - the saved user-mode registers
```c
struct pt_regs {
	unsigned long r15;
	unsigned long r14;
	unsigned long r13;
	unsigned long r12;
	unsigned long bp;
	unsigned long bx;
	/* ... */
	unsigned long di;       /* arg0 */
	unsigned long si;       /* arg1 */
	unsigned long dx;       /* arg2 */
	unsigned long cx;       /* user return address (arg3 arrives in r10, since syscall clobbers rcx) */
	unsigned long ax;       /* syscall number on entry, return value on exit */
	unsigned long orig_ax;  /* the original syscall number */
	/* ... */
};
```
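User space sees the same layout through ptrace: struct user_regs_struct mirrors pt_regs, and at a syscall-entry stop orig_rax carries the syscall number while rax already holds -ENOSYS. A minimal sketch that catches the child's write(1, "hi\n", 3):

```c
#include <signal.h>
#include <stdio.h>
#include <sys/ptrace.h>
#include <sys/user.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
	pid_t pid = fork();
	if (pid == 0) {
		ptrace(PTRACE_TRACEME, 0, NULL, NULL);
		kill(getpid(), SIGSTOP);          /* let the parent attach */
		write(1, "hi\n", 3);
		_exit(0);
	}
	waitpid(pid, NULL, 0);                    /* child stopped */
	ptrace(PTRACE_SYSCALL, pid, NULL, NULL);  /* run to next syscall entry */
	waitpid(pid, NULL, 0);

	struct user_regs_struct regs;
	ptrace(PTRACE_GETREGS, pid, NULL, &regs);
	/* expect orig_rax == 1 (__NR_write), rdi == 1, rdx == 3 */
	printf("orig_rax=%llu rdi=%llu rsi=%#llx rdx=%llu\n",
	       regs.orig_rax, regs.rdi, regs.rsi, regs.rdx);

	ptrace(PTRACE_CONT, pid, NULL, NULL);
	waitpid(pid, NULL, 0);
	return 0;
}
```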
2. struct file - the open-file object behind a descriptor
```c
struct file {
	fmode_t f_mode;                      /* access-mode flags (FMODE_*) */
	loff_t f_pos;                        /* current file position */
	const struct file_operations *f_op;  /* table of file operations */
	/* ... */
};
```
VII. Security Mechanisms
1. Address validation
```c
access_ok(buf, count)  /* is the user buffer within the user address range? */
```
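The effect of a failed user-memory check is directly observable: hand write(2) a pointer that no mapping backs and the copy from user space faults, so the syscall returns EFAULT instead of crashing the kernel:

```c
#include <errno.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	/* address 1 is in the user range, so access_ok() passes;
	 * the copy_from_user() in the driver faults instead */
	ssize_t ret = write(STDOUT_FILENO, (const void *)1, 16);
	printf("ret=%zd errno=%d (%s)\n", ret, errno, strerror(errno));
	return 0;  /* prints ret=-1 errno=14 (Bad address) */
}
```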
2. Permission checks
```c
rw_verify_area(WRITE, file, pos, count)  /* range, overflow, and security-hook checks */
```
3. Bounds limiting
```c
if (count > MAX_RW_COUNT) count = MAX_RW_COUNT;  /* cap a single transfer */
```
4. Control-flow protection
```c
ENDBR        /* Intel CET: valid indirect-branch landing pad */
IBRS_ENTER   /* speculative-execution (Spectre v2) mitigation */
UNWIND_HINT  /* annotation for the ORC stack unwinder */
```
VIII. Performance Optimizations
- Fast system calls: syscall/sysret instead of the legacy int 0x80 interrupt gate
- Register-passed arguments: up to six arguments travel in registers, so nothing is copied via the stack
- SYSRET fast path: the faster return instruction whenever the exit conditions allow it (see the microbenchmark sketch below)
- Page-table switching: optimized CR3 switches on the PTI trampoline
- per-CPU data: swapgs gives one-instruction access to per-CPU state
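The combined round-trip cost is easy to measure with a hypothetical microbenchmark like the one below; absolute numbers vary widely with the CPU and with which mitigations are active:

```c
#include <stdio.h>
#include <sys/syscall.h>
#include <time.h>
#include <unistd.h>

int main(void)
{
	enum { N = 1000000 };
	struct timespec a, b;

	clock_gettime(CLOCK_MONOTONIC, &a);
	for (int i = 0; i < N; i++)
		syscall(SYS_getpid);  /* raw syscall, bypassing any library caching */
	clock_gettime(CLOCK_MONOTONIC, &b);

	double ns = (b.tv_sec - a.tv_sec) * 1e9 + (b.tv_nsec - a.tv_nsec);
	printf("%.1f ns per syscall round trip\n", ns / N);
	return 0;
}
```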
Flow Summary
user-space write() → syscall instruction → hardware mode switch → entry_SYSCALL_64 → save context → do_syscall_64 → syscall dispatch → __x64_sys_write → ksys_write → vfs_write → concrete filesystem operation → return value → restore context → sysret/iret → user space
This architecture achieves efficient system call handling while preserving the established ABI, and it is a key ingredient of modern Linux performance.
IX. Kernel Source Listings
The abridged snippets above come from fs/read_write.c, arch/x86/kernel/cpu/common.c, arch/x86/entry/entry_64.S, arch/x86/entry/common.c, arch/x86/entry/syscall_64.c, and the syscall-wrapper headers. The relevant excerpts in full:
```c
ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos)
{
ssize_t ret;
if (!(file->f_mode & FMODE_WRITE))
return -EBADF;
if (!(file->f_mode & FMODE_CAN_WRITE))
return -EINVAL;
if (unlikely(!access_ok(buf, count)))
return -EFAULT;
ret = rw_verify_area(WRITE, file, pos, count);
if (ret)
return ret;
if (count > MAX_RW_COUNT)
count = MAX_RW_COUNT;
file_start_write(file);
if (file->f_op->write)
ret = file->f_op->write(file, buf, count, pos);
else if (file->f_op->write_iter)
ret = new_sync_write(file, buf, count, pos);
else
ret = -EINVAL;
if (ret > 0) {
fsnotify_modify(file);
add_wchar(current, ret);
}
inc_syscw(current);
file_end_write(file);
return ret;
}
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count)
{
struct fd f = fdget_pos(fd);
ssize_t ret = -EBADF;
if (f.file) {
loff_t pos, *ppos = file_ppos(f.file);
if (ppos) {
pos = *ppos;
ppos = &pos;
}
ret = vfs_write(f.file, buf, count, ppos);
if (ret >= 0 && ppos)
f.file->f_pos = pos;
fdput_pos(f);
}
return ret;
}
SYSCALL_DEFINE3(write, unsigned int, fd, const char __user *, buf,
size_t, count)
{
return ksys_write(fd, buf, count);
}
/* May not be marked __init: used by software suspend */
void syscall_init(void)
{
wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS);
wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64);
if (ia32_enabled()) {
wrmsrl_cstar((unsigned long)entry_SYSCALL_compat);
/*
* This only works on Intel CPUs.
* On AMD CPUs these MSRs are 32-bit, CPU truncates MSR_IA32_SYSENTER_EIP.
* This does not cause SYSENTER to jump to the wrong location, because
* AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit).
*/
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP,
(unsigned long)(cpu_entry_stack(smp_processor_id()) + 1));
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat);
} else {
wrmsrl_cstar((unsigned long)entry_SYSCALL32_ignore);
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)GDT_ENTRY_INVALID_SEG);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, 0ULL);
}
/*
* Flags to clear on syscall; clear as much as possible
* to minimize user space-kernel interference.
*/
wrmsrl(MSR_SYSCALL_MASK,
X86_EFLAGS_CF|X86_EFLAGS_PF|X86_EFLAGS_AF|
X86_EFLAGS_ZF|X86_EFLAGS_SF|X86_EFLAGS_TF|
X86_EFLAGS_IF|X86_EFLAGS_DF|X86_EFLAGS_OF|
X86_EFLAGS_IOPL|X86_EFLAGS_NT|X86_EFLAGS_RF|
X86_EFLAGS_AC|X86_EFLAGS_ID);
}
```
```assembly
/*
* 64-bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
* This is the only entry point used for 64-bit system calls. The
* hardware interface is reasonably well designed and the register to
* argument mapping Linux uses fits well with the registers that are
* available when SYSCALL is used.
*
* SYSCALL instructions can be found inlined in libc implementations as
* well as some other programs and libraries. There are also a handful
* of SYSCALL instructions in the vDSO used, for example, as a
* clock_gettimeofday fallback.
*
* 64-bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
* then loads new ss, cs, and rip from previously programmed MSRs.
* rflags gets masked by a value from another MSR (so CLD and CLAC
* are not needed). SYSCALL does not save anything on the stack
* and does not change rsp.
*
* Registers on entry:
* rax system call number
* rcx return address
* r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
* rsi arg1
* rdx arg2
* r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
* (note: r12-r15, rbp, rbx are callee-preserved in C ABI)
*
* Only called from user space.
*
* When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
SYM_CODE_START(entry_SYSCALL_64)
UNWIND_HINT_ENTRY
ENDBR
swapgs
/* tss.sp2 is scratch space. */
movq %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
movq PER_CPU_VAR(pcpu_hot + X86_top_of_stack), %rsp
SYM_INNER_LABEL(entry_SYSCALL_64_safe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
/* Construct struct pt_regs on stack */
pushq $__USER_DS /* pt_regs->ss */
pushq PER_CPU_VAR(cpu_tss_rw + TSS_sp2) /* pt_regs->sp */
pushq %r11 /* pt_regs->flags */
pushq $__USER_CS /* pt_regs->cs */
pushq %rcx /* pt_regs->ip */
SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
pushq %rax /* pt_regs->orig_ax */
PUSH_AND_CLEAR_REGS rax=$-ENOSYS
/* IRQs are off. */
movq %rsp, %rdi
/* Sign extend the lower 32bit as syscall numbers are treated as int */
movslq %eax, %rsi
/* clobbers %rax, make sure it is after saving the syscall nr */
IBRS_ENTER
UNTRAIN_RET
CLEAR_BRANCH_HISTORY
call do_syscall_64 /* returns with IRQs disabled */
/*
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context. If we're not,
* go to the slow exit path.
* In the Xen PV case we must use iret anyway.
*/
ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
"jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
/*
* We win! This label is here just for ease of understanding
* perf profiles. Nothing jumps here.
*/
syscall_return_via_sysret:
IBRS_EXIT
POP_REGS pop_rdi=0
/*
* Now all regs are restored except RSP and RDI.
* Save old stack pointer and switch to trampoline stack.
*/
movq %rsp, %rdi
movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp
UNWIND_HINT_END_OF_STACK
pushq RSP-RDI(%rdi) /* RSP */
pushq (%rdi) /* RDI */
/*
* We are on the trampoline stack. All regs except RDI are live.
* We can do future final exit work right here.
*/
STACKLEAK_ERASE_NOCLOBBER
SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi
popq %rdi
popq %rsp
SYM_INNER_LABEL(entry_SYSRETQ_unsafe_stack, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
swapgs
CLEAR_CPU_BUFFERS
sysretq
SYM_INNER_LABEL(entry_SYSRETQ_end, SYM_L_GLOBAL)
ANNOTATE_NOENDBR
int3
SYM_CODE_END(entry_SYSCALL_64)
```
```c
/* Returns true to return using SYSRET, or false to use IRET */
__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
instrumentation_begin();
if (!do_syscall_x64(regs, nr) && !do_syscall_x32(regs, nr) && nr != -1) {
/* Invalid system call, but still a system call. */
regs->ax = __x64_sys_ni_syscall(regs);
}
instrumentation_end();
syscall_exit_to_user_mode(regs);
/*
* Check that the register state is valid for using SYSRET to exit
* to userspace. Otherwise use the slower but fully capable IRET
* exit path.
*/
/* XEN PV guests always use the IRET path */
if (cpu_feature_enabled(X86_FEATURE_XENPV))
return false;
/* SYSRET requires RCX == RIP and R11 == EFLAGS */
if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
return false;
/* CS and SS must match the values set in MSR_STAR */
if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
return false;
/*
* On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
* in kernel space. This essentially lets the user take over
* the kernel, since userspace controls RSP.
*
* TASK_SIZE_MAX covers all user-accessible addresses other than
* the deprecated vsyscall page.
*/
if (unlikely(regs->ip >= TASK_SIZE_MAX))
return false;
/*
* SYSRET cannot restore RF. It can restore TF, but unlike IRET,
* restoring TF results in a trap from userspace immediately after
* SYSRET.
*/
if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
return false;
/* Use SYSRET to exit to userspace */
return true;
}
static __always_inline bool do_syscall_x64(struct pt_regs *regs, int nr)
{
/*
* Convert negative numbers to very high and thus out of range
* numbers for comparisons.
*/
unsigned int unr = nr;
if (likely(unr < NR_syscalls)) {
unr = array_index_nospec(unr, NR_syscalls);
regs->ax = x64_sys_call(regs, unr);
return true;
}
return false;
}
#define __SYSCALL(nr, sym) extern long __x64_##sym(const struct pt_regs *);
#include <asm/syscalls_64.h>
#undef __SYSCALL
/*
* The sys_call_table[] is no longer used for system calls, but
* kernel/trace/trace_syscalls.c still wants to know the system
* call address.
*/
#define __SYSCALL(nr, sym) __x64_##sym,
const sys_call_ptr_t sys_call_table[] = {
#include <asm/syscalls_64.h>
};
#undef __SYSCALL
#define __SYSCALL(nr, sym) case nr: return __x64_##sym(regs);
long x64_sys_call(const struct pt_regs *regs, unsigned int nr)
{
switch (nr) {
#include <asm/syscalls_64.h>
default: return __x64_sys_ni_syscall(regs);
}
};
#define __SYSCALL_DEFINEx(x, name, ...) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)); \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__));\
__X64_SYS_STUBx(x, name, __VA_ARGS__) \
__IA32_SYS_STUBx(x, name, __VA_ARGS__) \
static long __se_sys##name(__MAP(x,__SC_LONG,__VA_ARGS__)) \
{ \
long ret = __do_sys##name(__MAP(x,__SC_CAST,__VA_ARGS__));\
__MAP(x,__SC_TEST,__VA_ARGS__); \
__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__)); \
return ret; \
} \
static inline long __do_sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))
#ifdef CONFIG_X86_64
#define __X64_SYS_STUB0(name) \
__SYS_STUB0(x64, sys_##name)
#define __X64_SYS_STUBx(x, name, ...) \
__SYS_STUBx(x64, sys##name, \
SC_X86_64_REGS_TO_ARGS(x, __VA_ARGS__))
#define __X64_COND_SYSCALL(name) \
__COND_SYSCALL(x64, sys_##name)
#else /* CONFIG_X86_64 */
#define __X64_SYS_STUB0(name)
#define __X64_SYS_STUBx(x, name, ...)
#define __X64_COND_SYSCALL(name)
#endif /* CONFIG_X86_64 */
#define __SYS_STUBx(abi, name, ...) \
long __##abi##_##name(const struct pt_regs *regs); \
ALLOW_ERROR_INJECTION(__##abi##_##name, ERRNO); \
long __##abi##_##name(const struct pt_regs *regs) \
{ \
return __se_##name(__VA_ARGS__); \
}
```