引言:操作系统的"第一声啼哭"
当计算机电源按下,BIOS/UEFI与引导加载程序将Linux内核映像载入内存,内核完成一系列复杂的初始化工作后,最终要启动第一个用户态进程——init。这个过程宛如一个新生命的诞生:内核是"母亲",init是"第一个孩子"。而kernel_init函数正是这场分娩的主刀医生。本文将沿着Linux内核源码,从kernel_init函数出发,一路剖析内核如何加载并执行init进程,覆盖内核执行用户程序的完整路径:execve系统调用、二进制格式处理、直至ELF加载器的精细操作。
阅读本文前,建议读者对Linux内核进程、虚拟内存、可执行文件格式有基本了解。本文将基于Linux 5.x/6.x内核源码,关键函数和数据结构均取自真实代码(略有简化以突出主线)。
第一章:一切从kernel_init开始
1.1 内核初始化的最后冲刺
Linux内核的启动入口是start_kernel,完成大部分子系统的初始化后,会创建init内核线程(PID=1),该线程执行函数kernel_init。源码如下:
c
scss
static int __ref kernel_init(void *unused)
{
int ret;
/* 等待kthreadd线程完全就绪 */
wait_for_completion(&kthreadd_done);
kernel_init_freeable();
/* 等待所有异步__init代码完成,以便释放初始化内存 */
async_synchronize_full();
system_state = SYSTEM_FREEING_INITMEM;
kprobe_free_init_mem();
ftrace_free_init_mem();
kgdb_free_init_mem();
exit_boot_config();
free_initmem(); /* 释放__init节的内存 */
mark_readonly(); /* 将内核文本设为只读 */
/* 内核映射已定型,更新用户态页表以完成PTI */
pti_finalize();
system_state = SYSTEM_RUNNING;
numa_default_policy();
rcu_end_inkernel_boot();
do_sysctl_args();
/* 尝试执行ramdisk中的init命令 */
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/* 依次尝试execute_command、默认init路径等 */
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
if (CONFIG_DEFAULT_INIT[0] != '\0') {
ret = run_init_process(CONFIG_DEFAULT_INIT);
if (ret)
pr_err("Default init %s failed (error %d)\n",
CONFIG_DEFAULT_INIT, ret);
else
return 0;
}
/* 最后的挣扎:常见位置 */
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/admin-guide/init.rst for guidance.");
}
逐步解读:
-
wait_for_completion(&kthreadd_done):确保内核线程管理器kthreadd已就绪,因为后续可能创建内核线程。 -
kernel_init_freeable():完成可推迟(freeable)的初始化,例如启动SMP、挂载rootfs等。注意此函数会调用do_basic_setup(),后者会执行do_initcalls(),启动所有设备驱动和内置模块。 -
async_synchronize_full():等待所有异步初始化(用async_schedule调度)完成,这样在释放.init内存前不会丢失代码。 -
然后系统状态变为
SYSTEM_FREEING_INITMEM,释放__init节的内存(包括大量初始化函数和数据),调用mark_readonly()使内核代码段只读,增强安全。 -
pti_finalize():完成页表隔离(Page Table Isolation),用于防御Meltdown漏洞。 -
系统状态变为
SYSTEM_RUNNING,设置NUMA缺省策略,结束RCU内核启动阶段,处理内核启动参数(do_sysctl_args)。 -
接下来进入核心:尝试执行init进程。搜索顺序为:
ramdisk_execute_command(由rdinit=内核参数指定,默认为/init,对应initramfs中的init;执行失败只打印错误,然后继续往下尝试)。execute_command(由init=内核参数指定;一旦指定而执行失败,内核直接panic,不再尝试后续路径)。- 编译时默认的init路径(
CONFIG_DEFAULT_INIT)。 - 传统位置:
/sbin/init、/etc/init、/bin/init、/bin/sh。
-
若全部失败,内核panic。
其中,run_init_process和try_to_run_init_process封装了实际执行程序的动作。
1.2 run_init_process:第一次execve
c
scss
static int run_init_process(const char *init_filename)
{
const char *const *p;
argv_init[0] = init_filename;
pr_info("Run %s as init process\n", init_filename);
pr_debug(" with arguments:\n");
for (p = argv_init; *p; p++)
pr_debug(" %s\n", *p);
pr_debug(" with environment:\n");
for (p = envp_init; *p; p++)
pr_debug(" %s\n", *p);
return kernel_execve(init_filename, argv_init, envp_init);
}
argv_init和envp_init是内核预先定义的参数和环境变量,例如argv_init = { "init", NULL },envp_init包含了HOME=/、TERM=linux等基本环境。最终调用kernel_execve。
try_to_run_init_process只是对run_init_process的包装,在返回-ENOENT(文件不存在)时不打印错误,而其他错误(如权限不足、格式错误)会报警。
第二章:内核中的execve——kernel_execve剖析
用户态通过execve系统调用陷入内核,而kernel_execve是内核内部直接执行用户程序的接口,用于启动init以及用户态辅助程序(usermode helper,即call_usermodehelper)等场景。其实现如下:
c
ini
int kernel_execve(const char *kernel_filename,
const char *const *argv, const char *const *envp)
{
struct filename *filename;
struct linux_binprm *bprm;
int fd = AT_FDCWD;
int retval;
/* 内核线程调用execve毫无意义 */
if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
return -EINVAL;
filename = getname_kernel(kernel_filename);
if (IS_ERR(filename))
return PTR_ERR(filename);
bprm = alloc_bprm(fd, filename, 0);
if (IS_ERR(bprm)) {
retval = PTR_ERR(bprm);
goto out_ret;
}
retval = count_strings_kernel(argv);
if (WARN_ON_ONCE(retval == 0))
retval = -EINVAL;
if (retval < 0)
goto out_free;
bprm->argc = retval;
retval = count_strings_kernel(envp);
if (retval < 0)
goto out_free;
bprm->envc = retval;
retval = bprm_stack_limits(bprm);
if (retval < 0)
goto out_free;
retval = copy_string_kernel(bprm->filename, bprm);
if (retval < 0)
goto out_free;
bprm->exec = bprm->p;
retval = copy_strings_kernel(bprm->envc, envp, bprm);
if (retval < 0)
goto out_free;
retval = copy_strings_kernel(bprm->argc, argv, bprm);
if (retval < 0)
goto out_free;
retval = bprm_execve(bprm);
out_free:
free_bprm(bprm);
out_ret:
putname(filename);
return retval;
}
2.1 关键数据结构:linux_binprm
struct linux_binprm是内核中表示"待执行的二进制程序"的核心结构体,它承载了执行一个程序所需的所有信息:参数列表、环境变量、文件描述符、内存布局等。分配通过alloc_bprm完成。
kernel_execve的主要动作:
- 获取文件名 :
getname_kernel复制内核空间路径名到可访问的结构。 - 分配bprm :初始化
bprm,包括分配mm_struct、设置栈起始地址等。 - 统计参数/环境变量个数 :
count_strings_kernel遍历指针数组,计算字符串数量,并检查数量是否超过MAX_ARG_STRINGS;单个字符串的长度上限(MAX_ARG_STRLEN)则在后面复制字符串时检查。 - 检查栈空间限制 :
bprm_stack_limits确保参数+环境不会超过栈空间大小(通常限制为栈大小的1/4)。 - 复制参数和环境到栈 :
copy_string_kernel和copy_strings_kernel将文件名、参数、环境变量字符串逐字节复制到新进程的用户态栈中(同时也复制到内核空间缓存,但最终用户态栈会保留这些字符串)。注意这里使用的是bprm->p指针,它从用户栈顶向下移动。 - 调用核心函数 :
bprm_execve执行实际的二进制加载。
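一个细节:文件名已经保存在bprm->filename中,为什么还要用copy_string_kernel(bprm->filename, bprm)再拷贝一份到栈上?注意这份拷贝并不是argv[0]——argv[0]来自随后复制的argv数组(对init来说就是argv_init[0])。文件名最先被压入栈顶,紧接着的bprm->exec = bprm->p记录了它在新栈中的位置,ELF加载器随后在create_elf_tables中通过辅助向量AT_EXECFN把这个地址告诉用户态。而bprm->filename本身是内核空间的字符串,用户态无法访问,所以必须在用户栈上另存一份。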
2.2 bprm_execve:通往新程序的大门
c
ini
static int bprm_execve(struct linux_binprm *bprm)
{
int retval;
retval = prepare_bprm_creds(bprm);
if (retval)
return retval;
check_unsafe_exec(bprm);
current->in_execve = 1;
sched_mm_cid_before_execve(current);
sched_exec();
retval = security_bprm_creds_for_exec(bprm);
if (retval)
goto out;
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
sched_mm_cid_after_execve(current);
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
return retval;
out:
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);
sched_mm_cid_after_execve(current);
current->fs->in_exec = 0;
current->in_execve = 0;
return retval;
}
该函数的核心是exec_binprm,之前还做了几件重要的事:
prepare_bprm_creds:准备新的凭证(credentials),即将执行新程序时切换用户ID、组ID等。这一步会克隆当前进程的凭证,以备后续根据setuid位等修改。check_unsafe_exec:检查当前进程是否处于不安全状态(例如正在被ptrace跟踪,或有多个线程),会导致某些安全策略拒绝执行。sched_exec:调度器钩子,为新执行的程序做NUMA亲和性调整。security_bprm_creds_for_exec:LSM(Linux安全模块)钩子,例如SELinux可以在此时检查权限。
bprm->point_of_no_return标志在begin_new_exec(稍后看到)中设置,一旦越过此点,新程序的部分资源已经替换,无法回滚,如果后续失败只能强制发送信号杀死进程。
2.3 exec_binprm:二进制格式识别与解释器循环
c
ini
static int exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid;
int ret, depth;
old_pid = current->pid;
rcu_read_lock();
old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
rcu_read_unlock();
for (depth = 0;; depth++) {
struct file *exec;
if (depth > 5)
return -ELOOP;
ret = search_binary_handler(bprm);
if (ret < 0)
return ret;
if (!bprm->interpreter)
break;
exec = bprm->file;
bprm->file = bprm->interpreter;
bprm->interpreter = NULL;
allow_write_access(exec);
if (unlikely(bprm->have_execfd)) {
if (bprm->executable) {
fput(exec);
return -ENOEXEC;
}
bprm->executable = exec;
} else
fput(exec);
}
audit_bprm(bprm);
trace_sched_process_exec(current, old_pid, bprm);
ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
proc_exec_connector(current);
return 0;
}
这段代码体现了Linux支持多种可执行文件格式(如ELF、a.out、脚本)及解释器(如shebang)的灵活架构。关键点:
search_binary_handler:遍历已注册的二进制格式处理程序链表,尝试识别并加载二进制文件。- 如果
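bprm->interpreter被设置(例如binfmt_script处理#!脚本、binfmt_misc处理已注册的格式时;ELF的动态链接器不走这条路径,它由load_elf_binary内部自行打开并加载),则循环处理:将当前文件替换为解释器文件,重新调用search_binary_handler。depth超过5时返回-ELOOP,防止解释器无限嵌套。 bprm->have_execfd表示需要把原始可执行文件以文件描述符的形式(通过辅助向量AT_EXECFD)传给解释器,主要用于binfmt_misc,这里不详细展开。- 成功加载后,记录审计、tracepoint、ptrace事件等。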
2.3.1 search_binary_handler:格式匹配的艺术
c
ini
static int search_binary_handler(struct linux_binprm *bprm)
{
bool need_retry = IS_ENABLED(CONFIG_MODULES);
struct linux_binfmt *fmt;
int retval;
retval = prepare_binprm(bprm);
if (retval < 0)
return retval;
retval = security_bprm_check(bprm);
if (retval)
return retval;
retval = -ENOENT;
retry:
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
if (!try_module_get(fmt->module))
continue;
read_unlock(&binfmt_lock);
retval = fmt->load_binary(bprm);
read_lock(&binfmt_lock);
put_binfmt(fmt);
if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
read_unlock(&binfmt_lock);
return retval;
}
}
read_unlock(&binfmt_lock);
if (need_retry) {
if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
printable(bprm->buf[2]) && printable(bprm->buf[3]))
return retval;
if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
return retval;
need_retry = false;
goto retry;
}
return retval;
}
逻辑:
-
prepare_binprm:读取文件头部(BINPRM_BUF_SIZE字节,自5.1起为256字节)到bprm->buf。注意在较新的内核中,setuid/setgid位的处理已不在这里,而是推迟到begin_new_exec调用的bprm_creds_from_file中完成。 -
security_bprm_check:再次安全检查。 -
遍历
formats链表,每个元素对应一种二进制格式(ELF、a.out、脚本等)。每个格式提供load_binary函数。try_module_get防止模块在加载过程中被卸载。- 调用
fmt->load_binary(bprm)尝试解析。若返回-ENOEXEC(不是当前格式),继续尝试下一个;否则返回(无论成功或失败)。
-
若所有格式都不识别,且
need_retry为真(内核启用了模块支持,对应的格式处理程序可能尚未加载),并且文件头的前4个字节不全是可打印字符(否则多半是文本文件,直接返回错误),则通过request_module请求加载名为binfmt-xxxx的模块(xxxx是文件头偏移2、3处两个字节组成的16位值的十六进制表示),然后仅重试一次。
其中formats链表在内核初始化时注册了elf_format、script_format等。例如ELF格式定义为:
c
ini
static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
.load_shlib = load_elf_library,
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
};
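脚本格式(#!解释器)的load_binary(即load_script)会解析第一行,提取解释器路径,打开解释器文件并存入bprm->interpreter,然后返回0。exec_binprm看到返回值非负且bprm->interpreter非空,便把bprm->file换成解释器,进入下一轮search_binary_handler。(如果返回的是-ENOEXEC,search_binary_handler只会继续尝试下一种格式,exec_binprm收到负值后直接返回,并不会再次循环。)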
第三章:ELF加载器深度解析(load_elf_binary)
ELF(Executable and Linkable Format)是Linux上最常用的可执行文件格式。load_elf_binary是Linux内核中最复杂、最关键的代码之一,负责将ELF文件映射到进程地址空间,并跳转到入口点。
由于源码过长,我们将它拆解成若干步骤,并提取核心部分讲解。
3.1 一致性检查与头部解析
c
objectivec
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL;
unsigned long load_bias = 0, phdr_addr = 0;
int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
unsigned long elf_brk;
int retval, i;
unsigned long elf_entry;
unsigned long e_entry;
unsigned long interp_load_addr = 0;
... // 省略变量声明
struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf; // 头部已在prepare_binprm中读入
retval = -ENOEXEC;
if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
goto out;
if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
goto out;
if (!elf_check_arch(elf_ex))
goto out;
if (elf_check_fdpic(elf_ex))
goto out;
if (!bprm->file->f_op->mmap)
goto out;
- 首先验证ELF魔数
\177ELF、文件类型(可执行或共享对象,ET_EXEC为静态位置可执行文件,ET_DYN为位置无关可执行PIE或共享库)、体系架构匹配、非FDPIC(另一种嵌入式ABI)、文件系统支持内存映射。 - 加载程序头表(Program Header Table):
load_elf_phdrs读取所有程序头。
3.2 处理PT_INTERP:动态链接器的识别
c
ini
elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
if (!elf_phdata)
goto out;
elf_ppnt = elf_phdata;
for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) {
char *elf_interpreter;
if (elf_ppnt->p_type == PT_GNU_PROPERTY) {
elf_property_phdata = elf_ppnt;
continue;
}
if (elf_ppnt->p_type != PT_INTERP)
continue;
// 读取解释器路径(如 /lib64/ld-linux-x86-64.so.2)
retval = -ENOEXEC;
if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
goto out_free_ph;
elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
if (!elf_interpreter)
goto out_free_ph;
retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz,
elf_ppnt->p_offset);
if (retval < 0)
goto out_free_interp;
if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
goto out_free_interp;
interpreter = open_exec(elf_interpreter);
kfree(elf_interpreter);
retval = PTR_ERR(interpreter);
if (IS_ERR(interpreter))
goto out_free_ph;
would_dump(bprm, interpreter);
interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
if (!interp_elf_ex) {
retval = -ENOMEM;
goto out_free_file;
}
retval = elf_read(interpreter, interp_elf_ex,
sizeof(*interp_elf_ex), 0);
if (retval < 0)
goto out_free_dentry;
break;
}
- 遍历程序头,找到
PT_INTERP段,读取其中的动态链接器路径(例如/lib64/ld-linux-x86-64.so.2)。然后通过open_exec打开该文件,并将它的ELF头读入interp_elf_ex。 - 后续会加载解释器本身的程序头,并可能在地址空间布局时区分PIE程序和动态链接器。
3.3 处理GNU_STACK和属性段
c
ini
elf_ppnt = elf_phdata;
for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++)
switch (elf_ppnt->p_type) {
case PT_GNU_STACK:
if (elf_ppnt->p_flags & PF_X)
executable_stack = EXSTACK_ENABLE_X;
else
executable_stack = EXSTACK_DISABLE_X;
break;
case PT_LOPROC ... PT_HIPROC:
retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
bprm->file, false,
&arch_state);
if (retval)
goto out_free_dentry;
break;
}
PT_GNU_STACK控制栈是否可执行(用于NX保护)。默认情况下,现代Linux要求栈不可执行,但某些古老二进制可能要求可执行栈,内核会据此设置executable_stack。PT_LOPROC到PT_HIPROC是处理器特定的段,由体系架构代码处理(如ARM的.ARM.attributes)。
3.4 动态链接器的一致性检查
如果存在解释器(即动态链接情况),需要检查解释器的ELF头合法性、体系结构,并加载其程序头表。同样也会处理PT_GNU_PROPERTY(用于Intel CET等硬件特性)和PT_LOPROC。
3.5 关键转换:begin_new_exec
c
ini
retval = begin_new_exec(bprm);
if (retval)
goto out_free_dentry;
begin_new_exec是execve的核心转折点,它负责:
- 清空当前进程的内存映射(
mm),但保留内核栈等。 - 重置信号处理、线程信息、文件系统等。
- 设置
bprm->point_of_no_return = 1(从此无法回头)。 - 复制新的凭证(根据setuid等)。
- 更新进程的
comm字段为文件名。
此函数调用后,原进程的用户空间上下文彻底消失。
3.6 设置新进程的内存布局
c
scss
SET_PERSONALITY2(*elf_ex, &arch_state);
if (elf_read_implies_exec(*elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
current->flags |= PF_RANDOMIZE;
setup_new_exec(bprm);
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0)
goto out_free_dentry;
SET_PERSONALITY2设置进程的personality(如PER_LINUX、PER_LINUX32),可能影响内存布局、系统调用行为。对于32位兼容程序尤其重要。elf_read_implies_exec:如果ELF标志指示整个地址空间可执行(老旧二进制),或栈要求可执行,则设置READ_IMPLIES_EXEC。PF_RANDOMIZE标志决定是否启用ASLR(地址空间布局随机化)。setup_new_exec:进一步初始化新的执行环境(如清除FPU状态)。setup_arg_pages:在用户栈顶部建立参数和环境变量区域,随机化栈顶位置。
3.7 映射ELF的LOAD段
这是最复杂的步骤。遍历所有PT_LOAD段,对于每个段,计算内存保护属性(PROT_READ, PROT_WRITE, PROT_EXEC)和映射标志(MAP_PRIVATE),然后调用elf_load(实际上是vm_mmap的封装)将文件内容映射到指定地址。
对于ET_EXEC(固定地址可执行文件),地址由p_vaddr直接指定;对于ET_DYN(PIE或动态库),需要计算随机化偏移load_bias。
PIE处理的核心逻辑:
c
ini
if (elf_ex->e_type == ET_DYN) {
if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
if (alignment)
load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED_NOREPLACE;
} else
load_bias = 0;
load_bias = ELF_PAGESTART(load_bias - vaddr);
total_size = total_mapping_size(elf_phdata, elf_ex->e_phnum);
// 然后映射整个总大小,使用MAP_FIXED_NOREPLACE,避免覆盖已有映射
}
- 如果是动态链接的可执行程序 (有
PT_INTERP),load_bias设为ELF_ET_DYN_BASE(通常为TASK_SIZE / 3 * 2,即地址空间下半部靠上位置),并加入随机偏移(若ASLR开启)。然后调整load_bias,使其与vaddr对齐,保证映射后虚拟地址为load_bias + vaddr。 - 对于动态链接器本身 (没有
PT_INTERP),load_bias = 0,意味着它将被映射到由内核mmap随机选择的位置(不固定),这可以防止动态链接器与程序冲突。 MAP_FIXED_NOREPLACE标志要求映射不能覆盖已有的VMA,增强了安全性。
对于第一个PT_LOAD段,还会记录reloc_func_desc(即load_bias),供体系架构代码调整函数描述符(如PowerPC64)。
映射每个段后,更新start_code、end_code、start_data、end_data,以及elf_brk(程序的堆起始地址,由最后一个LOAD段的p_memsz后边界确定)。
3.8 加载解释器(若存在)
如果有动态链接器,调用load_elf_interp加载它。该函数与加载主程序类似,但行为更简单:它将解释器的LOAD段映射到进程空间,并返回加载的基址(interp_load_addr)。然后解释器入口点为interp_load_addr + interp_elf_ex->e_entry。否则,主程序入口点为e_entry + load_bias。
3.9 设置进程的辅助向量(auxv)
create_elf_tables在用户栈上构建辅助向量(AT_PHDR、AT_ENTRY、AT_RANDOM等),这些是动态链接器和libc初始化所需的额外信息。例如AT_ENTRY告诉动态链接器主程序入口点。
3.10 最终调整和启动
c
ini
mm = current->mm;
mm->end_code = end_code;
mm->start_code = start_code;
mm->start_data = start_data;
mm->end_data = end_data;
mm->start_stack = bprm->p;
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
elf_ex->e_type == ET_DYN && !interpreter) {
mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
}
mm->brk = mm->start_brk = arch_randomize_brk(mm);
}
// 可选:映射零页(遗留的SVr4行为)
if (current->personality & MMAP_PAGE_ZERO) {
error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE, 0);
}
regs = current_pt_regs();
#ifdef ELF_PLAT_INIT
ELF_PLAT_INIT(regs, reloc_func_desc);
#endif
finalize_exec(bprm);
START_THREAD(elf_ex, regs, elf_entry, bprm->p);
retval = 0;
out:
return retval;
- 设置
mm结构中代码段、数据段、栈顶的位置。 - 随机化堆的起始地址(brk):当randomize_va_space大于1时调用arch_randomize_brk;对于没有解释器的ET_DYN(直接运行的动态链接器或静态PIE),会先把brk的基址移到
ELF_ET_DYN_BASE,使堆远离映像所在的mmap区域,然后同样进行随机化。 - 通过
ELF_PLAT_INIT宏设置体系结构相关的寄存器初始值(仅当该体系结构定义了此宏时)。 finalize_exec把exec期间确定的栈资源限制(RLIMIT_STACK)写回进程。START_THREAD是对各体系结构start_thread的封装:它并不立即切换上下文,而是改写保存的用户态寄存器(pt_regs)——栈指针设为bprm->p,程序计数器设为elf_entry;当execve沿系统调用返回路径(对init而言是kernel_init返回后的ret_from_fork)回到用户态时,CPU便从新程序的入口开始执行。
至此,init进程或任何后续execve的程序成功启动。
第四章:完整流程图与关键数据结构
我们可以用一幅图概括整个流程:
text
scss
kernel_init()
│
├── run_init_process(init_path)
│ │
│ └── kernel_execve()
│ │
│ ├── alloc_bprm()
│ ├── copy_strings_to_stack()
│ └── bprm_execve()
│ │
│ ├── prepare_bprm_creds()
│ ├── exec_binprm()
│ │ │
│ │ └── search_binary_handler()
│ │ │
│ │ ├── prepare_binprm()
│ │ ├── list_for_each_entry(fmt)
│ │ │ └── fmt->load_binary()
│ │ │ └── load_elf_binary()
│ │ │ ├── 解析ELF和解释器
│ │ │ ├── begin_new_exec()
│ │ │ ├── setup_arg_pages()
│ │ │ ├── 映射LOAD段
│ │ │ ├── 加载解释器
│ │ │ ├── create_elf_tables()
│ │ │ └── START_THREAD()
│ │ └── request_module() [若需]
│ └── 返回用户空间
关键数据结构:
struct linux_binprm:存储执行程序的所有参数、环境、文件指针、内存准备状态。struct linux_binfmt:二进制格式驱动,提供load_binary方法。struct elfhdr/struct elf_phdr:ELF头和程序头。struct mm_struct:进程内存描述符,exec时被重置。
第五章:常见问题与调试技巧
5.1 为什么init必须是PID=1?
kernel_init函数以内核线程运行,PID=1。执行execve后,它变成用户进程,但PID保持不变。PID=1是系统第一个用户进程,负责收养孤儿进程,处理SIGCHLD等,具有特殊的权限和角色。
5.2 如果init崩溃,系统会怎样?
如果全局init进程(PID=1)退出或被致命信号杀死,内核在do_exit中会检测到退出的是init(is_global_init),并直接panic("Attempted to kill init!")。原因在于PID=1承担着收养孤儿进程的职责:普通进程退出时,forget_original_parent会把它的子进程过继给init(或最近的subreaper);init自己一旦消失,这些进程便无人接管,系统无法继续正常运行。这就是为什么"init进程不能死"。
5.3 如何调试内核execve过程?
- 使用
printk动态打印:在kernel_init、load_elf_binary等函数中添加pr_info,但需重新编译内核。 - 使用
ftrace跟踪函数调用:echo function > /sys/kernel/debug/tracing/current_tracer,然后过滤exec*和load_elf_binary。 - 使用
kgdb或kprobe动态断点。 - 用户态工具如
strace可以跟踪execve系统调用,但无法查看内核内部细节。
5.4 为什么ELF加载中要使用MAP_FIXED_NOREPLACE?
传统的MAP_FIXED会无条件覆盖目标地址范围内已有的映射:如果ELF段的地址恰好与已存在的区域(例如已经建立的栈)重叠,旧映射会被悄悄替换,可能被恶意构造的二进制利用。MAP_FIXED_NOREPLACE在地址冲突时返回错误(-EEXIST)而不是覆盖,增加安全性。对于ET_EXEC固定地址程序,地址是硬编码的,可能与其他映射冲突,内核需要仔细处理,这也是现代内核改进的一环。
5.5 脚本文件如何执行?
脚本格式处理器的load_binary会读取第一行#! /path/to/interpreter,打开解释器文件并存入bprm->interpreter,同时修改argv,使解释器的参数中包含原脚本的路径,然后返回0。外层exec_binprm循环发现bprm->interpreter非空,便把bprm->file替换为解释器文件并重新搜索二进制格式,此时解释器是真正的二进制(如/bin/sh),再次进入load_elf_binary或其它格式加载。整个过程对用户透明。
第六章:总结与现代内核的演进
通过以上源码分析,我们完整地走通了从内核启动到用户态init进程的第一行指令的全过程。这不仅是一次代码漫步,更是理解Linux内核进程管理、内存管理、文件系统交互的绝佳案例。
从kernel_init到load_elf_binary,我们看到了:
- 内核如何逐步释放初始化资源,将自己化为只读并进入运行状态。
- execve机制如何精心准备用户栈、参数、环境,并安全地切换执行上下文。
- 二进制格式的插件化设计如何优雅地支持多样化和嵌套解释器。
- ELF加载器的复杂性,尤其是PIE、ASLR、动态链接器加载等现代操作系统安全特性的实现细节。
Linux内核发展的数十年间,这个流程不断演进:增加了MAP_FIXED_NOREPLACE、PT_GNU_PROPERTY、CET支持、随机化的brk等等,但核心架构保持稳定。理解这一过程,对于从事系统开发、性能优化、安全研究的人来说,都是不可或缺的基础。
最后,不妨在自己的Linux机器上尝试strace /bin/init(如果系统允许)或者用gdb跟踪一个简单程序的execve,观察前述的数据结构和调用顺序。理论结合实践,才能深入掌握这个操作系统的"起搏器"。
#源码
static
{
int ret;
/*
* Wait until kthreadd is all set-up.
*/
wait_for_completion(&kthreadd_done);
kernel_init_freeable();
/* need to finish all async __init code before freeing the memory */
async_synchronize_full();
system_state = SYSTEM_FREEING_INITMEM;
kprobe_free_init_mem();
ftrace_free_init_mem();
kgdb_free_init_mem();
exit_boot_config();
free_initmem();
mark_readonly();
/*
* Kernel mappings are now finalized - update the userspace page-table
* to finalize PTI.
*/
pti_finalize();
system_state = SYSTEM_RUNNING;
numa_default_policy();
rcu_end_inkernel_boot();
do_sysctl_args();
if (ramdisk_execute_command) {
ret = run_init_process(ramdisk_execute_command);
if (!ret)
return 0;
pr_err("Failed to execute %s (error %d)\n",
ramdisk_execute_command, ret);
}
/*
* We try each of these until one succeeds.
*
* The Bourne shell can be used instead of init if we are
* trying to recover a really broken machine.
*/
if (execute_command) {
ret = run_init_process(execute_command);
if (!ret)
return 0;
panic("Requested init %s failed (error %d).",
execute_command, ret);
}
if (CONFIG_DEFAULT_INIT[0] != '\0') {
ret = run_init_process(CONFIG_DEFAULT_INIT);
if (ret)
pr_err("Default init %s failed (error %d)\n",
CONFIG_DEFAULT_INIT, ret);
else
return 0;
}
if (!try_to_run_init_process("/sbin/init") ||
!try_to_run_init_process("/etc/init") ||
!try_to_run_init_process("/bin/init") ||
!try_to_run_init_process("/bin/sh"))
return 0;
panic("No working init found. Try passing init= option to kernel. "
"See Linux Documentation/admin-guide/init.rst for guidance.");
}
/*
 * Attempt to exec @init_filename as the init process.
 *
 * The path is stored in argv_init[0], the argument and environment vectors
 * are logged at debug level, and the result of kernel_execve() is returned
 * unchanged.
 */
static int run_init_process(const char *init_filename)
{
	const char *const *arg;
	const char *const *env;

	argv_init[0] = init_filename;
	pr_info("Run %s as init process\n", init_filename);

	pr_debug(" with arguments:\n");
	arg = argv_init;
	while (*arg) {
		/* Keep the increment outside pr_debug(): it may compile away. */
		pr_debug(" %s\n", *arg);
		arg++;
	}

	pr_debug(" with environment:\n");
	env = envp_init;
	while (*env) {
		pr_debug(" %s\n", *env);
		env++;
	}

	return kernel_execve(init_filename, argv_init, envp_init);
}
/*
 * Wrapper around run_init_process() for the fallback init paths.
 *
 * Returns whatever run_init_process() returned.  A failure other than
 * -ENOENT is reported with pr_err(); -ENOENT is passed back silently.
 */
static int try_to_run_init_process(const char *init_filename)
{
	int err = run_init_process(init_filename);

	/* Success and "not found" need no diagnostic. */
	if (err == 0 || err == -ENOENT)
		return err;

	pr_err("Starting init: %s exists but couldn't execute it (error %d)\n",
	       init_filename, err);
	return err;
}
/*
 * kernel_execve - exec a program whose path and vectors live in kernel memory.
 *
 * @kernel_filename: path of the program to run
 * @argv:            NULL-terminated argument vector; must not be empty
 * @envp:            NULL-terminated environment vector
 *
 * Returns the result of bprm_execve() when everything up to that point
 * succeeded, otherwise the negative error of the step that failed.
 * Returns -EINVAL (with a one-time warning) when called from a task that
 * has PF_KTHREAD set or when @argv contains no strings.
 */
int kernel_execve(const char *kernel_filename,
		  const char *const *argv, const char *const *envp)
{
	struct linux_binprm *bprm;
	struct filename *name;
	int rc;

	/* It is non-sense for kernel threads to call execve */
	if (WARN_ON_ONCE(current->flags & PF_KTHREAD))
		return -EINVAL;

	name = getname_kernel(kernel_filename);
	if (IS_ERR(name))
		return PTR_ERR(name);

	bprm = alloc_bprm(AT_FDCWD, name, 0);
	if (IS_ERR(bprm)) {
		rc = PTR_ERR(bprm);
		goto put_name;
	}

	/* An empty argv is rejected, not merely counted as zero. */
	rc = count_strings_kernel(argv);
	if (WARN_ON_ONCE(rc == 0))
		rc = -EINVAL;
	if (rc < 0)
		goto release;
	bprm->argc = rc;

	rc = count_strings_kernel(envp);
	if (rc < 0)
		goto release;
	bprm->envc = rc;

	rc = bprm_stack_limits(bprm);
	if (rc < 0)
		goto release;

	/* The filename goes in first; bprm->exec remembers where it landed. */
	rc = copy_string_kernel(bprm->filename, bprm);
	if (rc < 0)
		goto release;
	bprm->exec = bprm->p;

	/* Environment strings, then argument strings, then the exec itself. */
	rc = copy_strings_kernel(bprm->envc, envp, bprm);
	if (rc >= 0)
		rc = copy_strings_kernel(bprm->argc, argv, bprm);
	if (rc >= 0)
		rc = bprm_execve(bprm);

release:
	free_bprm(bprm);
put_name:
	putname(name);
	return rc;
}
/*
 * bprm_execve - carry out the exec described by an already filled-in bprm.
 *
 * Prepares credentials, flags the task as being inside execve, runs the
 * security hook for exec credentials and then calls exec_binprm().
 *
 * Returns the non-negative result of exec_binprm() on success, or the
 * non-zero/negative value of the step that failed.  A failure of
 * prepare_bprm_creds() returns immediately, before any task state has
 * been touched; every later exit clears in_exec and in_execve again.
 */
static int bprm_execve(struct linux_binprm *bprm)
{
int retval;
retval = prepare_bprm_creds(bprm);
if (retval)
return retval;
/*
* Check for unsafe execution states before exec_binprm(), which
* will call back into begin_new_exec(), into bprm_creds_from_file(),
* where setuid-ness is evaluated.
*/
check_unsafe_exec(bprm);
current->in_execve = 1;
sched_mm_cid_before_execve(current);
sched_exec();
/* Set the unchanging part of bprm->cred */
retval = security_bprm_creds_for_exec(bprm);
if (retval)
goto out;
retval = exec_binprm(bprm);
if (retval < 0)
goto out;
/* Success path: pairs with sched_mm_cid_before_execve() above. */
sched_mm_cid_after_execve(current);
/* execve succeeded */
current->fs->in_exec = 0;
current->in_execve = 0;
rseq_execve(current);
user_events_execve(current);
acct_update_integrals(current);
task_numa_free(current, false);
return retval;
out:
/*
* If past the point of no return ensure the code never
* returns to the userspace process. Use an existing fatal
* signal if present otherwise terminate the process with
* SIGSEGV.
*/
if (bprm->point_of_no_return && !fatal_signal_pending(current))
force_fatal_sig(SIGSEGV);
/* Failure path: same marker cleanup as on success, minus the post-exec hooks. */
sched_mm_cid_after_execve(current);
current->fs->in_exec = 0;
current->in_execve = 0;
return retval;
}
/* binfmt handlers will call back into begin_new_exec() on success. */
/*
 * exec_binprm - find a handler for bprm->file and follow any chain of
 * interpreters the handlers nominate.
 *
 * Returns 0 once a handler has succeeded without leaving an interpreter
 * pending.  Returns a negative error otherwise: whatever
 * search_binary_handler() reported, -ELOOP when depth exceeds 5, or
 * -ENOEXEC when have_execfd is set and an executable was already recorded.
 */
static int exec_binprm(struct linux_binprm *bprm)
{
pid_t old_pid, old_vpid;
int ret, depth;
/* Need to fetch pid before load_binary changes it */
old_pid = current->pid;
rcu_read_lock();
old_vpid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
rcu_read_unlock();
/*
 * Keep searching while handlers hand back an interpreter.  The pass
 * with depth == 0 handles the original file; passes with depth 1..5
 * handle rewrites, and a further rewrite fails hard with -ELOOP.
 */
for (depth = 0;; depth++) {
struct file *exec;
if (depth > 5)
return -ELOOP;
ret = search_binary_handler(bprm);
if (ret < 0)
return ret;
/* No interpreter requested: the image has been loaded, stop here. */
if (!bprm->interpreter)
break;
/* The interpreter becomes the file to load on the next pass. */
exec = bprm->file;
bprm->file = bprm->interpreter;
bprm->interpreter = NULL;
allow_write_access(exec);
if (unlikely(bprm->have_execfd)) {
/* Only one previous file can be kept as bprm->executable. */
if (bprm->executable) {
fput(exec);
return -ENOEXEC;
}
/* Reference to the previous file is handed over, not dropped. */
bprm->executable = exec;
} else
fput(exec);
}
/* Post-exec notifications, using the pids sampled before loading. */
audit_bprm(bprm);
trace_sched_process_exec(current, old_pid, bprm);
ptrace_event(PTRACE_EVENT_EXEC, old_vpid);
proc_exec_connector(current);
return 0;
}
/*
 * cycle the list of binary formats handler, until one recognizes the image
 *
 * Returns the result of the first handler that either does not answer
 * -ENOEXEC or has taken bprm past the point of no return.  If no handler
 * qualifies, returns the last value seen (-ENOENT when no handler was
 * tried at all).  Failures of prepare_binprm() or security_bprm_check()
 * are returned directly.
 */
static int search_binary_handler(struct linux_binprm *bprm)
{
/* A module-load retry is only possible when CONFIG_MODULES is enabled. */
bool need_retry = IS_ENABLED(CONFIG_MODULES);
struct linux_binfmt *fmt;
int retval;
retval = prepare_binprm(bprm);
if (retval < 0)
return retval;
retval = security_bprm_check(bprm);
if (retval)
return retval;
retval = -ENOENT;
retry:
read_lock(&binfmt_lock);
list_for_each_entry(fmt, &formats, lh) {
/* Skip formats whose module reference cannot be taken. */
if (!try_module_get(fmt->module))
continue;
/* binfmt_lock is dropped around the handler call and re-taken after. */
read_unlock(&binfmt_lock);
retval = fmt->load_binary(bprm);
read_lock(&binfmt_lock);
/* NOTE(review): presumably releases the reference taken by try_module_get() - confirm. */
put_binfmt(fmt);
/* Anything but "-ENOEXEC before the point of no return" ends the search. */
if (bprm->point_of_no_return || (retval != -ENOEXEC)) {
read_unlock(&binfmt_lock);
return retval;
}
}
read_unlock(&binfmt_lock);
if (need_retry) {
/* Four printable leading bytes: give up without requesting a module. */
if (printable(bprm->buf[0]) && printable(bprm->buf[1]) &&
printable(bprm->buf[2]) && printable(bprm->buf[3]))
return retval;
/* Module name is built from the 16-bit value at offset 2 of the header buffer. */
if (request_module("binfmt-%04x", *(ushort *)(bprm->buf + 2)) < 0)
return retval;
/* Walk the list a second time, but never request a module twice. */
need_retry = false;
goto retry;
}
return retval;
}
/*
 * Binary-format descriptor for ELF: wires load_elf_binary() and
 * load_elf_library() into the load_binary/load_shlib hooks.  The core-dump
 * members are only populated when CONFIG_COREDUMP is defined.
 */
static struct linux_binfmt elf_format = {
.module = THIS_MODULE,
.load_binary = load_elf_binary,
.load_shlib = load_elf_library,
#ifdef CONFIG_COREDUMP
.core_dump = elf_core_dump,
.min_coredump = ELF_EXEC_PAGESIZE,
#endif
};
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
unsigned long load_bias = 0, phdr_addr = 0;
int first_pt_load = 1;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
struct elf_phdr *elf_property_phdata = NULL;
unsigned long elf_brk;
int retval, i;
unsigned long elf_entry;
unsigned long e_entry;
unsigned long interp_load_addr = 0;
unsigned long start_code, end_code, start_data, end_data;
unsigned long reloc_func_desc __maybe_unused = 0;
int executable_stack = EXSTACK_DEFAULT;
struct elfhdr *elf_ex = (struct elfhdr *)bprm->buf;
struct elfhdr *interp_elf_ex = NULL;
struct arch_elf_state arch_state = INIT_ARCH_ELF_STATE;
struct mm_struct *mm;
struct pt_regs *regs;
retval = -ENOEXEC;
/* First of all, some simple consistency checks */
if (memcmp(elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
goto out;
if (elf_ex->e_type != ET_EXEC && elf_ex->e_type != ET_DYN)
goto out;
if (!elf_check_arch(elf_ex))
goto out;
if (elf_check_fdpic(elf_ex))
goto out;
if (!bprm->file->f_op->mmap)
goto out;
elf_phdata = load_elf_phdrs(elf_ex, bprm->file);
if (!elf_phdata)
goto out;
elf_ppnt = elf_phdata;
for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++) {
char *elf_interpreter;
if (elf_ppnt->p_type == PT_GNU_PROPERTY) {
elf_property_phdata = elf_ppnt;
continue;
}
if (elf_ppnt->p_type != PT_INTERP)
continue;
/*
* This is the program interpreter used for shared libraries -
* for now assume that this is an a.out format binary.
*/
retval = -ENOEXEC;
if (elf_ppnt->p_filesz > PATH_MAX || elf_ppnt->p_filesz < 2)
goto out_free_ph;
retval = -ENOMEM;
elf_interpreter = kmalloc(elf_ppnt->p_filesz, GFP_KERNEL);
if (!elf_interpreter)
goto out_free_ph;
retval = elf_read(bprm->file, elf_interpreter, elf_ppnt->p_filesz,
elf_ppnt->p_offset);
if (retval < 0)
goto out_free_interp;
/* make sure path is NULL terminated */
retval = -ENOEXEC;
if (elf_interpreter[elf_ppnt->p_filesz - 1] != '\0')
goto out_free_interp;
interpreter = open_exec(elf_interpreter);
kfree(elf_interpreter);
retval = PTR_ERR(interpreter);
if (IS_ERR(interpreter))
goto out_free_ph;
/*
* If the binary is not readable then enforce mm->dumpable = 0
* regardless of the interpreter's permissions.
*/
would_dump(bprm, interpreter);
interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
if (!interp_elf_ex) {
retval = -ENOMEM;
goto out_free_file;
}
/* Get the exec headers */
retval = elf_read(interpreter, interp_elf_ex,
sizeof(*interp_elf_ex), 0);
if (retval < 0)
goto out_free_dentry;
break;
out_free_interp:
kfree(elf_interpreter);
goto out_free_ph;
}
elf_ppnt = elf_phdata;
for (i = 0; i < elf_ex->e_phnum; i++, elf_ppnt++)
switch (elf_ppnt->p_type) {
case PT_GNU_STACK:
if (elf_ppnt->p_flags & PF_X)
executable_stack = EXSTACK_ENABLE_X;
else
executable_stack = EXSTACK_DISABLE_X;
break;
case PT_LOPROC ... PT_HIPROC:
retval = arch_elf_pt_proc(elf_ex, elf_ppnt,
bprm->file, false,
&arch_state);
if (retval)
goto out_free_dentry;
break;
}
/* Some simple consistency checks for the interpreter */
if (interpreter) {
retval = -ELIBBAD;
/* Not an ELF interpreter */
if (memcmp(interp_elf_ex->e_ident, ELFMAG, SELFMAG) != 0)
goto out_free_dentry;
/* Verify the interpreter has a valid arch */
if (!elf_check_arch(interp_elf_ex) ||
elf_check_fdpic(interp_elf_ex))
goto out_free_dentry;
/* Load the interpreter program headers */
interp_elf_phdata = load_elf_phdrs(interp_elf_ex,
interpreter);
if (!interp_elf_phdata)
goto out_free_dentry;
/* Pass PT_LOPROC..PT_HIPROC headers to arch code */
elf_property_phdata = NULL;
elf_ppnt = interp_elf_phdata;
for (i = 0; i < interp_elf_ex->e_phnum; i++, elf_ppnt++)
switch (elf_ppnt->p_type) {
case PT_GNU_PROPERTY:
elf_property_phdata = elf_ppnt;
break;
case PT_LOPROC ... PT_HIPROC:
retval = arch_elf_pt_proc(interp_elf_ex,
elf_ppnt, interpreter,
true, &arch_state);
if (retval)
goto out_free_dentry;
break;
}
}
retval = parse_elf_properties(interpreter ?: bprm->file,
elf_property_phdata, &arch_state);
if (retval)
goto out_free_dentry;
/*
* Allow arch code to reject the ELF at this point, whilst it's
* still possible to return an error to the code that invoked
* the exec syscall.
*/
retval = arch_check_elf(elf_ex,
!!interpreter, interp_elf_ex,
&arch_state);
if (retval)
goto out_free_dentry;
/* Flush all traces of the currently running executable */
retval = begin_new_exec(bprm);
if (retval)
goto out_free_dentry;
/* Do this immediately, since STACK_TOP as used in setup_arg_pages
may depend on the personality. */
SET_PERSONALITY2(*elf_ex, &arch_state);
if (elf_read_implies_exec(*elf_ex, executable_stack))
current->personality |= READ_IMPLIES_EXEC;
if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space)
current->flags |= PF_RANDOMIZE;
setup_new_exec(bprm);
/* Do this so that we can load the interpreter, if need be. We will
change some of these later */
retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP),
executable_stack);
if (retval < 0)
goto out_free_dentry;
elf_brk = 0;
start_code = ~0UL;
end_code = 0;
start_data = 0;
end_data = 0;
/* Now we do a little grungy work by mmapping the ELF image into
the correct location in memory. */
for(i = 0, elf_ppnt = elf_phdata;
i < elf_ex->e_phnum; i++, elf_ppnt++) {
int elf_prot, elf_flags;
unsigned long k, vaddr;
unsigned long total_size = 0;
unsigned long alignment;
if (elf_ppnt->p_type != PT_LOAD)
continue;
elf_prot = make_prot(elf_ppnt->p_flags, &arch_state,
!!interpreter, false);
elf_flags = MAP_PRIVATE;
vaddr = elf_ppnt->p_vaddr;
/*
* The first time through the loop, first_pt_load is true:
* layout will be calculated. Once set, use MAP_FIXED since
* we know we've already safely mapped the entire region with
* MAP_FIXED_NOREPLACE in the once-per-binary logic following.
*/
if (!first_pt_load) {
elf_flags |= MAP_FIXED;
} else if (elf_ex->e_type == ET_EXEC) {
/*
* This logic is run once for the first LOAD Program
* Header for ET_EXEC binaries. No special handling
* is needed.
*/
elf_flags |= MAP_FIXED_NOREPLACE;
} else if (elf_ex->e_type == ET_DYN) {
/*
* This logic is run once for the first LOAD Program
* Header for ET_DYN binaries to calculate the
* randomization (load_bias) for all the LOAD
* Program Headers.
*
* There are effectively two types of ET_DYN
* binaries: programs (i.e. PIE: ET_DYN with INTERP)
* and loaders (ET_DYN without INTERP, since they
* _are_ the ELF interpreter). The loaders must
* be loaded away from programs since the program
* may otherwise collide with the loader (especially
* for ET_EXEC which does not have a randomized
* position). For example to handle invocations of
* "./ld.so someprog" to test out a new version of
* the loader, the subsequent program that the
* loader loads must avoid the loader itself, so
* they cannot share the same load range. Sufficient
* room for the brk must be allocated with the
* loader as well, since brk must be available with
* the loader.
*
* Therefore, programs are loaded offset from
* ELF_ET_DYN_BASE and loaders are loaded into the
* independently randomized mmap region (0 load_bias
* without MAP_FIXED nor MAP_FIXED_NOREPLACE).
*/
if (interpreter) {
load_bias = ELF_ET_DYN_BASE;
if (current->flags & PF_RANDOMIZE)
load_bias += arch_mmap_rnd();
alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum);
if (alignment)
load_bias &= ~(alignment - 1);
elf_flags |= MAP_FIXED_NOREPLACE;
} else
load_bias = 0;
/*
* Since load_bias is used for all subsequent loading
* calculations, we must lower it by the first vaddr
* so that the remaining calculations based on the
* ELF vaddrs will be correctly offset. The result
* is then page aligned.
*/
load_bias = ELF_PAGESTART(load_bias - vaddr);
/*
* Calculate the entire size of the ELF mapping
* (total_size), used for the initial mapping,
* due to load_addr_set which is set to true later
* once the initial mapping is performed.
*
* Note that this is only sensible when the LOAD
* segments are contiguous (or overlapping). If
* used for LOADs that are far apart, this would
* cause the holes between LOADs to be mapped,
* running the risk of having the mapping fail,
* as it would be larger than the ELF file itself.
*
* As a result, only ET_DYN does this, since
* some ET_EXEC (e.g. ia64) may have large virtual
* memory holes between LOADs.
*
*/
total_size = total_mapping_size(elf_phdata,
elf_ex->e_phnum);
if (!total_size) {
retval = -EINVAL;
goto out_free_dentry;
}
}
error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt,
elf_prot, elf_flags, total_size);
if (BAD_ADDR(error)) {
retval = IS_ERR_VALUE(error) ?
PTR_ERR((void*)error) : -EINVAL;
goto out_free_dentry;
}
if (first_pt_load) {
first_pt_load = 0;
if (elf_ex->e_type == ET_DYN) {
load_bias += error -
ELF_PAGESTART(load_bias + vaddr);
reloc_func_desc = load_bias;
}
}
/*
* Figure out which segment in the file contains the Program
* Header table, and map to the associated memory address.
*/
if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
elf_ppnt->p_vaddr;
}
k = elf_ppnt->p_vaddr;
if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
if (start_data < k)
start_data = k;
/*
* Check to see if the section's size will overflow the
* allowed task size. Note that p_filesz must always be
* <= p_memsz so it is only necessary to check p_memsz.
*/
if (BAD_ADDR(k) || elf_ppnt->p_filesz > elf_ppnt->p_memsz ||
elf_ppnt->p_memsz > TASK_SIZE ||
TASK_SIZE - elf_ppnt->p_memsz < k) {
/* set_brk can never work. Avoid overflows. */
retval = -EINVAL;
goto out_free_dentry;
}
k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz;
if ((elf_ppnt->p_flags & PF_X) && end_code < k)
end_code = k;
if (end_data < k)
end_data = k;
k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz;
if (k > elf_brk)
elf_brk = k;
}
e_entry = elf_ex->e_entry + load_bias;
phdr_addr += load_bias;
elf_brk += load_bias;
start_code += load_bias;
end_code += load_bias;
start_data += load_bias;
end_data += load_bias;
current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk);
if (interpreter) {
elf_entry = load_elf_interp(interp_elf_ex,
interpreter,
load_bias, interp_elf_phdata,
&arch_state);
if (!IS_ERR_VALUE(elf_entry)) {
/*
* load_elf_interp() returns relocation
* adjustment
*/
interp_load_addr = elf_entry;
elf_entry += interp_elf_ex->e_entry;
}
if (BAD_ADDR(elf_entry)) {
retval = IS_ERR_VALUE(elf_entry) ?
(int)elf_entry : -EINVAL;
goto out_free_dentry;
}
reloc_func_desc = interp_load_addr;
allow_write_access(interpreter);
fput(interpreter);
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
} else {
elf_entry = e_entry;
if (BAD_ADDR(elf_entry)) {
retval = -EINVAL;
goto out_free_dentry;
}
}
kfree(elf_phdata);
set_binfmt(&elf_format);
#ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES
retval = ARCH_SETUP_ADDITIONAL_PAGES(bprm, elf_ex, !!interpreter);
if (retval < 0)
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
e_entry, phdr_addr);
if (retval < 0)
goto out;
mm = current->mm;
mm->end_code = end_code;
mm->start_code = start_code;
mm->start_data = start_data;
mm->end_data = end_data;
mm->start_stack = bprm->p;
if ((current->flags & PF_RANDOMIZE) && (randomize_va_space > 1)) {
/*
* For architectures with ELF randomization, when executing
* a loader directly (i.e. no interpreter listed in ELF
* headers), move the brk area out of the mmap region
* (since it grows up, and may collide early with the stack
* growing down), and into the unused ELF_ET_DYN_BASE region.
*/
if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) &&
elf_ex->e_type == ET_DYN && !interpreter) {
mm->brk = mm->start_brk = ELF_ET_DYN_BASE;
}
mm->brk = mm->start_brk = arch_randomize_brk(mm);
#ifdef compat_brk_randomized
current->brk_randomized = 1;
#endif
}
if (current->personality & MMAP_PAGE_ZERO) {
/* Why this, you ask??? Well SVr4 maps page 0 as read-only,
and some applications "depend" upon this behavior.
Since we do not have the power to recompile these, we
emulate the SVr4 behavior. Sigh. */
error = vm_mmap(NULL, 0, PAGE_SIZE, PROT_READ | PROT_EXEC,
MAP_FIXED | MAP_PRIVATE, 0);
}
regs = current_pt_regs();
#ifdef ELF_PLAT_INIT
/*
* The ABI may specify that certain registers be set up in special
* ways (on i386 %edx is the address of a DT_FINI function, for
* example. In addition, it may also specify (eg, PowerPC64 ELF)
* that the e_entry field is the address of the function descriptor
* for the startup routine, rather than the address of the startup
* routine itself. This macro performs whatever initialization to
* the regs structure is required as well as any relocations to the
* function descriptor entries when executing dynamically links apps.
*/
ELF_PLAT_INIT(regs, reloc_func_desc);
#endif
finalize_exec(bprm);
START_THREAD(elf_ex, regs, elf_entry, bprm->p);
retval = 0;
out:
return retval;
/* error cleanup */
out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
out_free_file:
allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
out_free_ph:
kfree(elf_phdata);
goto out;
}