一、引言
touch 命令是 Linux 系统中最常用的小工具之一,它的主要功能是修改文件的时间戳(atime - 访问时间,mtime - 修改时间),如果文件不存在则默认创建一个空文件。这个看似简单的命令背后,实际上隐藏着从用户空间到内核空间,再到具体文件系统(如 ext4)的一整套复杂而精妙的调用链条。
本文基于 Linux 内核 6.8.12 源码以及 GNU Coreutils 中 touch 的实现,逐层剖析当你执行 touch file.txt 时,系统究竟做了哪些工作。我们将沿着代码的执行路径,从 main 函数开始,经过参数解析、时间戳处理,然后深入到 open 系统调用、VFS 层、路径查找、dentry 缓存、inode 分配,直到最终在磁盘上留下痕迹。对于文件已存在的情况,我们也会简要分析时间戳更新的系统调用路径。
整篇文章超过 8000 字,包含大量源码级的中文注释,力求让读者对 touch 有一个全景式的理解。
二、用户空间:coreutils 中的 touch 实现
2.1 主函数流程概览
GNU coreutils 中的 touch 源码(片段)如下所示。我们首先分析 main 函数,它负责解析命令行参数,确定要修改哪些时间属性,以及时间值的来源。
c
ini
// Entry point of touch, first part of the excerpt: set up the program,
// parse the command-line options and reject conflicting time sources.
int main (int argc, char **argv)
{
int c;
bool date_set = false; // becomes true once an explicit time has been parsed (in this part: by -t)
bool ok = true;
char const *flex_date = NULL; // argument of -d, if given
// Program name and locale setup
initialize_main (&argc, &argv);
set_program_name (argv[0]);
setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
atexit (close_stdout); // run close_stdout at exit
change_times = 0; // bit mask (CH_ATIME / CH_MTIME) of the timestamps to change; declared outside this function
no_create = use_ref = false; // -c: do not create files; -r: take times from a reference file
// Parse short and long options with getopt_long
while ((c = getopt_long (argc, argv, "acd:fhmr:t:", longopts, NULL)) != -1)
{
switch (c)
{
case 'a':
change_times |= CH_ATIME; // request an access-time change
break;
case 'c':
no_create = true; // do not create a missing file
break;
case 'd':
flex_date = optarg; // free-form date string, parsed later
break;
case 'f': // accepted and ignored
break;
case 'h': // act on a symbolic link itself rather than on its target
no_dereference = true;
break;
case 'm':
change_times |= CH_MTIME; // request a modification-time change
break;
case 'r':
use_ref = true; // take the timestamps from a reference file
ref_file = optarg;
break;
case 't':
// Parse the fixed-format stamp [[CC]YY]MMDDhhmm[.ss]; exit on a malformed value
if (! posixtime (&newtime[0].tv_sec, optarg,
PDS_LEADING_YEAR | PDS_CENTURY | PDS_SECONDS))
error (EXIT_FAILURE, 0, _("invalid date format %s"), quote (optarg));
newtime[0].tv_nsec = 0;
newtime[1] = newtime[0]; // same value for both newtime entries
date_set = true;
break;
case TIME_OPTION: /* --time=WORD: OR in the mask that WORD maps to through time_args / time_masks */
change_times |= XARGMATCH ("--time", optarg, time_args, time_masks);
break;
// ... help and version options elided in this excerpt
default:
usage (EXIT_FAILURE);
}
}
// No -a, -m or --time was given: change both timestamps
if (change_times == 0)
change_times = CH_ATIME | CH_MTIME;
// -t cannot be combined with -r or -d. (-r together with -d is accepted:
// at this point date_set can only have been set by -t.)
if (date_set && (use_ref || flex_date))
{
error (0, 0, _("cannot specify times from more than one source"));
usage (EXIT_FAILURE);
}
核心逻辑注释:
- change_times 是一个位掩码,可取 CH_ATIME(1 << 0)和 CH_MTIME(1 << 1)。若用户未明确指定,则两者都改。
- 时间戳的来源有三种:-t 固定格式字符串、-r 参考文件、-d 自由日期描述(如 "yesterday")。其中 -t 不能与 -r 或 -d 同时使用,否则报错;而 -r 与 -d 可以组合使用,此时 -d 的描述会以参考文件的时间戳为基准进行解析。
- newtime[0] 和 newtime[1] 分别代表要设置的访问时间和修改时间,类型为 struct timespec(秒 + 纳秒)。
2.2 从参考文件或自由日期获取时间戳
继续看主函数后面如何处理 -r 和 -d 选项:
c
ini
// Derive newtime[] from -r (reference file) and/or -d (free-form date).
if (use_ref)
{
struct stat ref_stats;
// lstat when no_dereference is set, stat otherwise; exit if the call fails
if (no_dereference ? lstat (ref_file, &ref_stats)
: stat (ref_file, &ref_stats))
error (EXIT_FAILURE, errno,
_("failed to get attributes of %s"), quoteaf (ref_file));
// Copy the reference file's access and modification times
newtime[0] = get_stat_atime (&ref_stats);
newtime[1] = get_stat_mtime (&ref_stats);
date_set = true;
// -d given together with -r: evaluate the date string relative to the
// reference file's times, for each timestamp that is being changed
if (flex_date)
{
if (change_times & CH_ATIME)
newtime[0] = date_relative (flex_date, newtime[0]);
if (change_times & CH_MTIME)
newtime[1] = date_relative (flex_date, newtime[1]);
}
}
else
{
if (flex_date)
{
struct timespec now = current_timespec (); // current time
newtime[1] = newtime[0] = date_relative (flex_date, now);
date_set = true;
// If both timestamps are being changed and the parsed time equals `now`,
// treat "-d now" as if -d were absent: date_set goes back to false, so the
// request is handled like a plain touch. The call is still made; this only
// lets touch succeed more often under restrictive permissions.
if (change_times == (CH_ATIME | CH_MTIME)
&& newtime[0].tv_sec == now.tv_sec
&& newtime[0].tv_nsec == now.tv_nsec)
{
struct timespec notnow, notnow1;
notnow.tv_sec = now.tv_sec ^ 1; // a different base time: low bit of the seconds flipped
notnow.tv_nsec = now.tv_nsec;
notnow1 = date_relative (flex_date, notnow);
// Confirm it really was "-d now" and not a timestamp that merely
// happens to equal the current time
if (notnow1.tv_sec == notnow.tv_sec
&& notnow1.tv_nsec == notnow.tv_nsec)
date_set = false;
}
}
}
要点:
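- date_relative 是一个强大的函数,可以解析 "now"、"yesterday"、"+1 day" 等自然语言,内部调用 parse_datetime。
- 对 -d now 的特判,是为了让它与"不带任何时间选项"的 touch 行为一致:date_set 被改回 false 之后,后面会走 amtime_now 分支,向 utimensat 传入 NULL。传入显式时间戳时,调用者必须是文件属主(或具备相应特权);而传入 NULL 时,只要对文件有写权限即可。因此在权限受限的情况下 touch 更容易成功。注意这里并没有跳过系统调用,时间戳仍然会被更新为当前时间。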
2.3 过时的 MMDDhhmm[YY] 格式兼容
在 POSIX 200112 之前的版本中,touch 允许使用一种紧凑的日期格式。代码中保留了对这种旧用法的支持:
c
ini
// Obsolete 'MMDDhhmm[YY]' first operand: only tried when no time has been set yet,
// at least two non-option arguments remain, and posix2_version () is below 200112
if (!date_set && 2 <= argc - optind && posix2_version () < 200112
&& posixtime (&newtime[0].tv_sec, argv[optind],
PDS_TRAILING_YEAR | PDS_PRE_2000))
{
newtime[0].tv_nsec = 0;
newtime[1] = newtime[0];
date_set = true;
// Warn about the obsolete usage unless POSIXLY_CORRECT is set in the environment
if (! getenv ("POSIXLY_CORRECT"))
{
struct tm const *tm = localtime (&newtime[0].tv_sec);
// Skip the warning if localtime fails
if (tm)
error (0, 0,
_("warning: 'touch %s' is obsolete; use "
"'touch -t %04ld%02d%02d%02d%02d.%02d'"),
argv[optind],
tm->tm_year + 1900L, tm->tm_mon + 1, tm->tm_mday,
tm->tm_hour, tm->tm_min, tm->tm_sec);
}
optind++; // consume the date operand
}
2.4 最终准备:未设置时间时的默认行为
如果最终 date_set 仍为 false,说明用户没有提供任何时间源:
c
ini
// No explicit time source was given: use the current time.
if (!date_set)
{
// Both timestamps are being changed: just record that in amtime_now
// (touch () then passes a NULL times pointer to fdutimensat)
if (change_times == (CH_ATIME | CH_MTIME))
amtime_now = true;
else
// Only one timestamp is being changed: mark both entries UTIME_NOW here;
// touch () later overwrites the tv_nsec of the unwanted entry with UTIME_OMIT
newtime[1].tv_nsec = newtime[0].tv_nsec = UTIME_NOW;
}
UTIME_NOW 和 UTIME_OMIT 是 utimensat/futimens 接口使用的特殊 tv_nsec 取值,POSIX 规定它们定义在 <sys/stat.h> 中:
- UTIME_NOW:将对应的时间戳设置为当前时间。
- UTIME_OMIT:保持对应的时间戳不变。
2.5 文件操作的核心:touch 函数
主函数最后遍历所有剩余的参数(文件路径),对每个文件调用 touch 函数,并累积返回状态:
c
ini
// At least one file operand is required
if (optind == argc)
{
error (0, 0, _("missing file operand"));
usage (EXIT_FAILURE);
}
// Touch every remaining operand; one failure makes the exit status EXIT_FAILURE
// but does not stop the loop
for (; optind < argc; ++optind)
ok &= touch (argv[optind]);
return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
touch 函数实现了一个文件的具体处理逻辑,是用户空间和内核交互的直接入口:
c
ini
// Update the timestamps of FILE, creating it first unless -c or -h was given.
// Returns true on success (or when a failure is deliberately not diagnosed),
// false after printing a diagnostic.
static bool touch (char const *file)
{
int fd = -1;
int open_errno = 0;
struct timespec const *t = newtime; // the two timestamps to apply
// The operand "-" stands for standard output
if (streq (file, "-"))
fd = STDOUT_FILENO;
else if (! (no_create || no_dereference))
{
// Try to open FILE, creating it if necessary (O_CREAT).
// fd_reopen returns the file on descriptor STDIN_FILENO on success and a
// negative value on failure; the open error is kept for the diagnostic below.
fd = fd_reopen (STDIN_FILENO, file,
O_WRONLY | O_CREAT | O_NONBLOCK | O_NOCTTY, MODE_RW_UGO);
if (fd < 0)
open_errno = errno;
}
// Only one of the two timestamps is being set: mark the other as UTIME_OMIT
if (change_times != (CH_ATIME | CH_MTIME))
{
if (change_times == CH_MTIME)
newtime[0].tv_nsec = UTIME_OMIT; // leave atime alone
else
{
affirm (change_times == CH_ATIME);
newtime[1].tv_nsec = UTIME_OMIT; // leave mtime alone
}
}
// With amtime_now, pass NULL instead of explicit times so the call will not
// fail if we have write access to the file but do not own it
if (amtime_now)
t = NULL;
char const *file_opt = fd == STDOUT_FILENO ? NULL : file;
int atflag = no_dereference ? AT_SYMLINK_NOFOLLOW : 0;
// Set the times through fdutimensat (presumably a wrapper around
// futimens/utimensat; its definition is not shown here); keep errno on failure
int utime_errno = (fdutimensat (fd, AT_FDCWD, file_opt, t, atflag) == 0
? 0 : errno);
// fd == STDIN_FILENO means fd_reopen opened FILE on descriptor 0 above;
// close it now that the times have been set
if (fd == STDIN_FILENO)
{
if (close (STDIN_FILENO) != 0)
{
error (0, errno, _("failed to close %s"), quoteaf (file));
return false;
}
}
else if (fd == STDOUT_FILENO)
{
// Do not diagnose "touch -c - >&-" (standard output closed): EBADF with -c counts as success
if (utime_errno == EBADF && no_create)
return true;
}
// Setting the times failed: pick the more useful diagnostic
if (utime_errno != 0)
{
struct stat st;
// Report the open error, unless open failed only because FILE is a directory
// (EISDIR, or EINVAL/EEXIST on something stat reports as a directory)
if (open_errno
&& ! (open_errno == EISDIR
|| ((open_errno == EINVAL || open_errno == EEXIST)
&& stat (file, &st) == 0 && S_ISDIR (st.st_mode))))
{
error (0, open_errno, _("cannot touch %s"), quoteaf (file));
}
else
{
// With -c, a file that does not exist (ENOENT) is silently accepted
if (no_create && utime_errno == ENOENT)
return true;
error (0, utime_errno, _("setting times of %s"), quoteaf (file));
}
return false;
}
return true;
}
关键点:
fd_reopen尝试将打开的文件描述符变成指定的数值(这里是STDIN_FILENO=0)。这样做可以避免消耗额外的文件描述符,同时也方便后续处理。其实现如下:
c
ini
/* Open FILE with FLAGS and MODE so that, on success, it ends up on
   descriptor DESIRED_FD.  Returns DESIRED_FD on success, or a negative
   value with errno set when open or dup2 fails. */
int fd_reopen (int desired_fd, char const *file, int flags, mode_t mode)
{
  int opened = open (file, flags, mode);

  /* Done if open failed or already landed on the requested descriptor. */
  if (opened < 0 || opened == desired_fd)
    return opened;

  /* Duplicate onto the requested slot, then release the temporary
     descriptor without letting close () disturb the errno left by dup2. */
  int result = dup2 (opened, desired_fd);
  int dup_errno = errno;
  close (opened);
  errno = dup_errno;
  return result;
}
- utimensat 系统调用是真正负责修改时间戳的入口。它的原型是:

  int utimensat(int dirfd, const char *pathname, const struct timespec times[2], int flags);

  其中 times 数组的两个元素分别对应访问时间和修改时间。如果 times 为 NULL,则相当于两者都设为当前时间。
现在,我们已经从用户空间看到了 touch 如何准备参数并调用 open(创建文件)和 utimensat(更新时间)。接下来的旅程将进入 Linux 内核。
三、从用户空间到内核:系统调用的边界
open 和 utimensat 是 C 库函数,它们最终会触发对应的系统调用。由于我们提供的源码片段中包含了 openat 的完整内核实现,而 utimensat 的实现没有给出,本文将重点剖析 open 路径------即当目标文件不存在且未使用 -c 时,touch 如何创建新文件。对于已存在文件的时间戳更新,我们将简要介绍 utimensat 的内核处理。
3.1 glibc 中的 open 包装
在 glibc 中,open 函数通常是一个弱别名,最终调用 __libc_open:
c
scss
// Open FILE with flags OFLAG. The optional third argument (the mode) is only
// read when __OPEN_NEEDS_MODE (oflag) is true; otherwise a mode of 0 is passed.
int __libc_open (const char *file, int oflag, ...)
{
int mode = 0;
// Fetch the mode from the variable arguments only when the flags call for one (e.g. O_CREAT)
if (__OPEN_NEEDS_MODE (oflag))
{
va_list arg;
va_start (arg, oflag);
mode = va_arg (arg, int);
va_end (arg);
}
// Issue the openat system call with AT_FDCWD as the directory descriptor
return SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag, mode);
}
// Export the implementation under the names __open and open as well
libc_hidden_def (__libc_open)
weak_alias (__libc_open, __open)
weak_alias (__libc_open, open)
可见,所有 open 调用都被转换为 openat,并传入 AT_FDCWD 表示路径解释相对于当前目录。
3.2 内核系统调用入口:openat
内核源码中,openat 的系统调用定义如下:
c
arduino
/*
 * openat system call entry: forwards to do_sys_open(), adding O_LARGEFILE to
 * the caller's flags whenever force_o_largefile() is true.
 */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
		umode_t, mode)
{
	int open_flags = flags;

	if (force_o_largefile())
		open_flags |= O_LARGEFILE;

	return do_sys_open(dfd, filename, open_flags, mode);
}
这里 SYSCALL_DEFINE4 是内核宏,用于生成系统调用处理函数。接着调用 do_sys_open:
c
arduino
/*
 * Pack flags and mode into a struct open_how with build_open_how() and
 * delegate to do_sys_openat2(); returns that function's result.
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_how how;

	how = build_open_how(flags, mode);

	return do_sys_openat2(dfd, filename, &how);
}
do_sys_openat2 是实际干活的地方:
c
scss
/*
 * Common implementation behind the open-family system calls.
 *
 * Converts HOW into struct open_flags, copies the path name in with
 * getname(), reserves a descriptor and opens the file with do_filp_open().
 * Returns the new descriptor, or the negative error produced by whichever
 * step failed.
 */
static long do_sys_openat2(int dfd, const char __user *filename,
			   struct open_how *how)
{
	struct open_flags op;
	struct filename *name;
	struct file *filp;
	int ret;

	/* A non-zero result from build_open_flags() is returned as is. */
	ret = build_open_flags(how, &op);
	if (ret)
		return ret;

	name = getname(filename);
	if (IS_ERR(name))
		return PTR_ERR(name);

	/* Reserve a descriptor number; on failure only the name needs releasing. */
	ret = get_unused_fd_flags(how->flags);
	if (ret < 0)
		goto out_putname;

	filp = do_filp_open(dfd, name, &op);
	if (IS_ERR(filp)) {
		/* Give the reserved descriptor back and report the open error. */
		put_unused_fd(ret);
		ret = PTR_ERR(filp);
	} else {
		/* Bind the opened file to the reserved descriptor. */
		fd_install(ret, filp);
	}

out_putname:
	putname(name);
	return ret;
}
函数作用注释:
build_open_flags将用户标志(如O_CREAT、O_RDWR)转换为 VFS 层使用的open_flags结构,同时进行合法性检查。getname负责从用户态复制路径字符串并分配内核内存。get_unused_fd_flags在当前进程的打开文件表中找一个空闲位置。do_filp_open是 VFS 打开文件的核心,返回一个struct file *。- 最后
fd_install将文件描述符与struct file关联起来,使得用户态可以通过 fd 访问。
四、VFS 层:路径查找与打开
4.1 do_filp_open 与 path_openat
do_filp_open 只是简单封装,然后调用 path_openat:
c
ini
/*
 * Open PATHNAME relative to DFD, making up to three path_openat() attempts:
 * first with LOOKUP_RCU added, again with the plain lookup flags if that
 * attempt returned -ECHILD, and once more with LOOKUP_REVAL added if the
 * latest result is -ESTALE.  Returns the last attempt's result.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	struct nameidata nd;
	struct file *result;
	int lookup = op->lookup_flags;

	set_nameidata(&nd, dfd, pathname, NULL);

	result = path_openat(&nd, op, lookup | LOOKUP_RCU);
	if (unlikely(result == ERR_PTR(-ECHILD)))
		result = path_openat(&nd, op, lookup);
	if (unlikely(result == ERR_PTR(-ESTALE)))
		result = path_openat(&nd, op, lookup | LOOKUP_REVAL);

	restore_nameidata();
	return result;
}
path_openat 是 VFS 打开文件的主函数:
c
ini
// One open attempt for the path described by nd, using the given lookup flags.
// Returns the opened struct file on success, or an ERR_PTR on failure.
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
struct file *file;
int error;
// Allocate an empty struct file for the requested open flags and the current credentials
file = alloc_empty_file(op->open_flag, current_cred());
if (IS_ERR(file))
return file;
// Dispatch on the kind of open
if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(nd, flags, op, file); // __O_TMPFILE is set
} else if (unlikely(file->f_flags & O_PATH)) {
error = do_o_path(nd, flags, file); // O_PATH is set
} else {
const char *s = path_init(nd, flags); // start the path walk
// link_path_walk handles the leading components and open_last_lookups the last one;
// a non-NULL string returned by open_last_lookups is walked again
while (!(error = link_path_walk(s, nd)) &&
(s = open_last_lookups(nd, file, op)) != NULL)
;
if (!error)
error = do_open(nd, file, op); // open the final object
terminate_walk(nd);
}
if (likely(!error)) {
if (likely(file->f_mode & FMODE_OPENED))
return file;
// No error was reported but the file is not marked opened: warn and fail with -EINVAL
WARN_ON(1);
error = -EINVAL;
}
fput(file);
// Translate -EOPENSTALE into the retry codes that do_filp_open checks for
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
else
error = -ESTALE;
}
return ERR_PTR(error);
}
4.2 解析最后分量:open_last_lookups
对于普通文件(非 O_TMPFILE 非 O_PATH),最关键的是 open_last_lookups。它负责处理路径的最后一部分(文件名),并且如果设置了 O_CREAT,会尝试创建该文件。
c
scss
// Handle the last path component of an open. Returns NULL when the walk is
// finished, an ERR_PTR on failure, or otherwise whatever step_into/handle_dots
// return (path_openat walks a non-NULL string again).
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry; // dentry of the parent directory
int open_flag = op->open_flag;
bool got_write = false;
struct dentry *dentry;
const char *res;
nd->flags |= op->intent; // add the lookup intent carried in op
// Last component is not a normal name (nd->last_type != LAST_NORM): hand it to handle_dots
if (nd->last_type != LAST_NORM) {
if (nd->depth)
put_link(nd);
return handle_dots(nd, nd->last_type);
}
// Without O_CREAT, try lookup_fast first
if (!(open_flag & O_CREAT)) {
// A character follows the last component (e.g. a trailing slash)
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
dentry = lookup_fast(nd);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
if (likely(dentry))
goto finish_lookup; // found: skip straight to the final step
if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
return ERR_PTR(-ECHILD);
} else {
// O_CREAT: leave RCU mode first (try_to_unlazy), or request a retry with -ECHILD
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
}
audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
// When creating, a name followed by a trailing slash is rejected with -EISDIR
if (unlikely(nd->last.name[nd->last.len]))
return ERR_PTR(-EISDIR);
}
// Opens that may write ask for mount write access up front
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
got_write = !mnt_want_write(nd->path.mnt);
// A failure here is not fatal yet: it is only recorded in got_write,
// and lookup_open decides what to do with it.
}
// Take the directory inode lock exclusively for O_CREAT, shared otherwise
if (open_flag & O_CREAT)
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
// Look up, and possibly create, the dentry
dentry = lookup_open(nd, file, op, got_write);
if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
fsnotify_create(dir->d_inode, dentry); // send the create notification
// Drop the lock taken above
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
inode_unlock_shared(dir->d_inode);
if (got_write)
mnt_drop_write(nd->path.mnt);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
// lookup_open already opened or created the file: switch nd->path.dentry to it and finish
if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
dput(nd->path.dentry);
nd->path.dentry = dentry;
return NULL;
}
finish_lookup:
if (nd->depth)
put_link(nd);
// Step into the final dentry (WALK_TRAILING); a non-NULL result clears the open/create intent flags
res = step_into(nd, WALK_TRAILING, dentry);
if (unlikely(res))
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
return res;
}
4.3 真正的创建:lookup_open
lookup_open 是 VFS 中实现文件创建的关键函数,它会在父目录中查找文件,如果不存在且设置了 O_CREAT,则调用具体文件系统的 create 方法。
c
ini
/*
 * Look up the last component in the parent directory and, when O_CREAT is
 * set and nothing exists yet, create it through the directory's ->create.
 *
 * @nd:        path-walk state; nd->path is the parent, nd->last the name
 * @file:      file being opened; FMODE_CREATED is set here on creation
 * @op:        open flags and mode of the request
 * @got_write: whether the caller obtained mount write access
 *
 * Returns the dentry on success or an ERR_PTR on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
{
	struct mnt_idmap *idmap;
	struct dentry *dir = nd->path.dentry;
	struct inode *dir_inode = dir->d_inode;
	int open_flag = op->open_flag;
	struct dentry *dentry;
	int error, create_error = 0;
	umode_t mode = op->mode;
	/* On-stack wait queue head handed to d_alloc_parallel() below. */
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	if (unlikely(IS_DEADDIR(dir_inode)))
		return ERR_PTR(-ENOENT);

	file->f_mode &= ~FMODE_CREATED;
	/* Try the dentry cache first. */
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			/* Nothing cached: allocate a dentry for the lookup. */
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return dentry;
		}
		/* Still in lookup: the filesystem is asked further down. */
		if (d_in_lookup(dentry))
			break;

		/* Cached dentry: revalidate it; > 0 means it can be used. */
		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		/* Revalidation returned 0: invalidate it and start over. */
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	/* Positive dentry (it already has an inode): nothing to create. */
	if (dentry->d_inode) {
		return dentry;
	}

	/* From here on the dentry is negative or still in lookup. */
	idmap = mnt_idmap(nd->path.mnt);
	/* With O_CREAT, prepare the mode and check creation permission. */
	if (open_flag & O_CREAT) {
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;	/* O_EXCL: drop O_TRUNC */
		mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
		if (likely(got_write))
			create_error = may_o_create(idmap, &nd->path, dentry, mode);
		else
			create_error = -EROFS;	/* no mount write access */
	}
	/* Creation is not allowed: keep looking up, but do not create. */
	if (create_error)
		open_flag &= ~O_CREAT;
	/* Prefer the filesystem's ->atomic_open when it provides one. */
	if (dir_inode->i_op->atomic_open) {
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
	}

	/* No ->atomic_open: finish an in-progress lookup through ->lookup. */
	if (d_in_lookup(dentry)) {
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags);
		d_lookup_done(dentry);
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			/* ->lookup returned a different dentry: use it instead. */
			dput(dentry);
			dentry = res;
		}
	}

	/* Still negative and O_CREAT survived: create the file. */
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
		file->f_mode |= FMODE_CREATED;	/* mark the file as created */
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
			goto out_dput;
		}

		/* Call the filesystem's ->create method. */
		error = dir_inode->i_op->create(idmap, dir_inode, dentry,
						mode, open_flag & O_EXCL);
		if (error)
			goto out_dput;
	}
	/* Creation was refused earlier and the file does not exist: report that error. */
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
	}
	return dentry;

out_dput:
	dput(dentry);
	return ERR_PTR(error);
}
注释:
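- d_lookup 在 dcache 哈希表中查找 dentry;未命中时由 d_alloc_parallel 分配一个处于 in-lookup 状态的新 dentry(如果其他进程正在查找同名项,等待发生在 d_alloc_parallel 内部)。d_in_lookup(dentry) 为真表示该 dentry 的查找尚未完成,需要当前调用者随后调用文件系统的 lookup 方法并以 d_lookup_done 结束查找。
- 如果 dentry 是 negative(d_inode == NULL)并且有 O_CREAT,则调用父目录 inode 操作中的 create 方法。
- vfs_prepare_mode 根据 umask 等规则调整最终的文件权限。
- may_o_create 检查是否有权限在父目录中创建文件。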
五、ext4 文件系统的具体实现
当 VFS 调用父目录的 create 方法时,对于 ext4 文件系统,实际执行的是 ext4_create。
5.1 ext4 目录 inode 操作表
ext4 定义了自己的目录 inode 操作表:
c
ini
// inode_operations table for ext4 directories (excerpt); lookup_open reaches
// ext4_create through the .create member of this table.
const struct inode_operations ext4_dir_inode_operations = {
.create = ext4_create, // create a regular file
.lookup = ext4_lookup,
.link = ext4_link,
.unlink = ext4_unlink,
.symlink = ext4_symlink,
.mkdir = ext4_mkdir,
.rmdir = ext4_rmdir,
.mknod = ext4_mknod,
.tmpfile = ext4_tmpfile,
.rename = ext4_rename2,
.setattr = ext4_setattr,
.getattr = ext4_getattr,
// ... remaining members elided in this excerpt
};
5.2 ext4_create 函数
c
ini
// ->create implementation for ext4 directories: allocate a new inode inside a
// journal handle, set it up with the regular-file operation tables and link it
// into dir under dentry's name. Returns 0 on success or a negative error code.
static int ext4_create(struct mnt_idmap *idmap, struct inode *dir,
struct dentry *dentry, umode_t mode, bool excl)
{
handle_t *handle;
struct inode *inode;
int err, credits, retries = 0;
err = dquot_initialize(dir);
if (err)
return err;
// Journal credits to reserve for the operation
credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
// Allocate the new inode; the handle used for it is then fetched
// with ext4_journal_current_handle()
inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name,
0, NULL, EXT4_HT_DIR, credits);
handle = ext4_journal_current_handle();
err = PTR_ERR(inode);
if (!IS_ERR(inode)) {
// Install the regular-file inode/file operations and the address-space operations
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
ext4_set_aops(inode);
// Add the new inode to the directory; ext4_add_nondir sets inode to NULL when it succeeds
err = ext4_add_nondir(handle, dentry, &inode);
if (!err)
ext4_fc_track_create(handle, dentry);
}
if (handle)
ext4_journal_stop(handle);
// Drop our inode reference if we still hold one
if (!IS_ERR_OR_NULL(inode))
iput(inode);
// On -ENOSPC, retry while ext4_should_retry_alloc allows it
if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
goto retry;
return err;
}
5.3 分配新的 inode:__ext4_new_inode
ext4_new_inode_start_handle 是一个宏,最终调用 __ext4_new_inode。这是 ext4 中最复杂的函数之一,负责从磁盘上找到一个空闲的 inode 并初始化。
c
scss
// Allocate and initialise a new ext4 inode under directory dir (abridged excerpt).
// Returns the new inode on success or an ERR_PTR on failure.
// NOTE(review): parts of the function are elided in this excerpt, so several
// identifiers used below have no visible declaration or initialisation.
struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
handle_t *handle, struct inode *dir,
umode_t mode, const struct qstr *qstr,
__u32 goal, uid_t *owner, __u32 i_flags,
int handle_type, unsigned int line_no,
int nblocks)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
ext4_group_t ngroups, group = 0;
unsigned long ino = 0;
struct inode *inode;
struct ext4_group_desc *gdp = NULL;
struct ext4_sb_info *sbi;
int err;
// ... remaining variable declarations elided: ret2, i, group_desc_bh and ei are used
// below without a visible declaration, and ngroups is never assigned in this excerpt
sb = dir->i_sb;
sbi = EXT4_SB(sb);
// Allocate a new in-memory inode for this superblock
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
// Mode and ownership: explicit owner[] if given; with the GRPID mount option the
// gid comes from the directory; otherwise inode_init_owner decides
if (owner) {
inode->i_mode = mode;
i_uid_write(inode, owner[0]);
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
inode_fsuid_set(inode, idmap);
inode->i_gid = dir->i_gid;
} else
inode_init_owner(idmap, inode, dir, mode);
// Pick the starting group: find_group_orlov for directories, find_group_other for everything else
if (S_ISDIR(mode))
ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
else
ret2 = find_group_other(sb, dir, &group, mode);
if (ret2 == -1) {
err = -ENOSPC;
goto out;
}
// Scan up to ngroups groups, starting at the chosen one, for a free inode
for (i = 0; i < ngroups; i++, ino = 0) {
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp) {
err = -EIO;
goto out;
}
if (ext4_free_inodes_count(sb, gdp) == 0)
goto next_group;
// Read this group's inode bitmap; on failure move on to the next group
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
if (IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
goto next_group;
}
repeat_in_this_group:
// Find a free bit in the bitmap
ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
if (!ret2)
goto next_group;
// Inode numbers below EXT4_FIRST_INO in group 0 are reserved; a clear bit there is
// reported as an error and the group's inode bitmap is marked corrupted
if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
ext4_error(sb, "reserved inode found cleared - inode=%lu", ino + 1);
ext4_mark_group_bitmap_corrupted(sb, group,
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto next_group;
}
// Get journal write access to the bitmap buffer
err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
EXT4_JTR_NONE);
if (err)
goto out;
ext4_lock_group(sb, group);
// Test-and-set the bit while holding the group lock
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
if (ret2) {
// The bit was already set: search again with the lock held
ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
if (ret2) {
ext4_set_bit(ino, inode_bitmap_bh->b_data);
ret2 = 0;
} else {
ret2 = 1;
}
}
ext4_unlock_group(sb, group);
ino++;
if (!ret2)
goto got; // an inode bit was claimed
if (ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
next_group:
if (++group == ngroups)
group = 0;
}
err = -ENOSPC;
goto out;
got:
// Mark the bitmap buffer dirty through the journal handle
err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
if (err)
goto out;
// Decrement the free-inode count in the group descriptor
ext4_lock_group(sb, group);
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (S_ISDIR(mode)) {
ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
// Also account the directory in the flex group, when flex groups are in use
if (sbi->s_log_groups_per_flex) {
ext4_group_t f = ext4_flex_group(sbi, group);
atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, f)->used_dirs);
}
}
// Refresh the group descriptor checksum
ext4_group_desc_csum_set(sb, group, gdp);
ext4_unlock_group(sb, group);
// Inode number = index within the group plus the group's base
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
inode->i_blocks = 0;
simple_inode_init_ts(inode); // initialise the timestamps
EXT4_I(inode)->i_crtime = inode_get_mtime(inode); // creation time
// Inherit the directory flags selected by EXT4_FL_INHERITED, filtered by ext4_mask_flags
ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_flags |= i_flags;
ei->i_block_group = group;
// Insert the (locked) inode into the inode hash; a failure is reported as a possible double allocation
if (insert_inode_locked(inode) < 0) {
err = -EIO;
ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
inode->i_ino);
goto out;
}
inode->i_generation = get_random_u32();
// With metadata checksums enabled, derive the per-inode checksum seed from
// the inode number and the generation
if (ext4_has_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
}
ext4_set_inode_state(inode, EXT4_STATE_NEW);
// Extended attribute, ACL and encryption setup elided in this excerpt
// ...
// Mark the inode dirty
// NOTE(review): the label fail_free_drop is not part of this excerpt
err = ext4_mark_inode_dirty(handle, inode);
if (err)
goto fail_free_drop;
brelse(inode_bitmap_bh);
return inode;
out:
// Error path: drop the inode and release the bitmap buffer
iput(inode);
brelse(inode_bitmap_bh);
return ERR_PTR(err);
}
这段代码展示了 ext4 如何在成千上万的 inode 中选择一个、更新位图、更新组描述符、初始化 inode 结构,最后将其插入 VFS 的 inode 缓存中。
5.4 添加目录项:ext4_add_nondir
有了 inode 之后,需要将其与文件名关联起来,即创建目录项(dentry)并写入父目录的块中。
c
scss
// Link the freshly allocated inode *inodep into dentry's parent directory.
// On success the dentry is instantiated with the inode and *inodep is set to
// NULL; on failure the inode's link count is dropped and it is put on the
// orphan list. Returns 0 or a negative error code.
static int ext4_add_nondir(handle_t *handle,
struct dentry *dentry, struct inode **inodep)
{
struct inode *dir = d_inode(dentry->d_parent);
struct inode *inode = *inodep;
int err = ext4_add_entry(handle, dentry, inode);
if (!err) {
err = ext4_mark_inode_dirty(handle, inode);
if (IS_DIRSYNC(dir))
ext4_handle_sync(handle);
d_instantiate_new(dentry, inode); // bind the inode to the dentry
*inodep = NULL; // the caller must not iput() the inode any more
return err;
}
// Failure: drop the link count and add the inode to the orphan list
drop_nlink(inode);
ext4_mark_inode_dirty(handle, inode);
ext4_orphan_add(handle, inode);
unlock_new_inode(inode);
return err;
}
ext4_add_entry 负责在目录文件中添加一个条目(文件名 + inode 号)。对于 ext4,目录可以是线性列表或索引树(htree)。这里不再深入。
d_instantiate_new 将新分配的 inode 关联到 dentry,并将 dentry 标记为有效。之后,该文件就可以被路径查找找到了。
六、打开文件并返回文件描述符
创建完 inode 和 dentry 后,lookup_open 返回到 open_last_lookups,然后回到 path_openat。接着调用 do_open 完成最后的打开动作。
6.1 do_open 函数
c
ini
// Final step of an open: finish the walk if needed, apply the O_CREAT/O_EXCL,
// directory and permission checks, open the file and truncate it when
// requested. Returns 0 on success or a negative error code.
static int do_open(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct mnt_idmap *idmap;
int open_flag = op->open_flag;
bool do_truncate;
int acc_mode;
int error;
// Unless the file was already opened or created, finish the path walk first
if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
error = complete_walk(nd);
if (error)
return error;
}
if (!(file->f_mode & FMODE_CREATED))
audit_inode(nd->name, nd->path.dentry, 0);
idmap = mnt_idmap(nd->path.mnt);
// O_CREAT checks: O_EXCL on a file we did not create, directories, sticky-directory rules
if (open_flag & O_CREAT) {
if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
return -EEXIST;
if (d_is_dir(nd->path.dentry))
return -EISDIR;
error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry));
if (unlikely(error))
return error;
}
// A directory was required (LOOKUP_DIRECTORY) but the result cannot be looked up in
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
return -ENOTDIR;
do_truncate = false;
acc_mode = op->acc_mode;
if (file->f_mode & FMODE_CREATED) {
// Just created: drop O_TRUNC and pass an access mode of 0 to may_open
open_flag &= ~O_TRUNC;
acc_mode = 0;
} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
// Existing regular file with O_TRUNC: take mount write access for the truncate
error = mnt_want_write(nd->path.mnt);
if (error)
return error;
do_truncate = true;
}
// Permission check for the open itself
error = may_open(idmap, &nd->path, acc_mode, open_flag);
if (!error && !(file->f_mode & FMODE_OPENED))
error = vfs_open(&nd->path, file); // not opened yet: open it now
if (!error)
error = ima_file_check(file, op->acc_mode);
if (!error && do_truncate)
error = handle_truncate(idmap, file);
// Release the write access taken for the truncate
if (do_truncate)
mnt_drop_write(nd->path.mnt);
return error;
}
6.2 vfs_open 和 do_dentry_open
vfs_open 非常简单,它设置 file->f_path 然后调用 do_dentry_open:
c
arduino
/*
 * Open FILE at PATH: record *PATH in file->f_path, then hand over to
 * do_dentry_open() with the dentry's backing inode and a NULL third
 * argument.  Returns whatever do_dentry_open() returns.
 */
int vfs_open(const struct path *path, struct file *file)
{
	struct inode *inode;

	file->f_path = *path;
	inode = d_backing_inode(path->dentry);

	return do_dentry_open(file, inode, NULL);
}
do_dentry_open 执行实际的打开操作,包括:
- 设置 f_inode、f_mapping。
- 对常规文件增加读写计数。
- 获取文件操作表 f_op(从 inode 的 i_fop)。
- 调用文件系统特定的 open 方法(如果存在)。
- 设置标志 FMODE_OPENED。
至此,struct file 已经完全构建好,可以返回给用户态。
七、回到用户空间:时间戳更新
对于我们之前的 touch 流程,创建完文件后,还需要设置时间戳。touch 函数中调用了 fdutimensat,它封装了 utimensat 系统调用。由于内核源码中未提供 utimensat 的实现,我们简要概述其过程:
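- utimensat 系统调用从用户空间拷贝 times 数组。
- 根据 dirfd、路径和 flags,找到目标文件的 struct path。
- 调用 vfs_utimes,它把时间参数转换成 struct iattr(ATTR_ATIME、ATTR_MTIME 等标志),然后在持有 inode 锁的情况下调用 notify_change。
- notify_change 负责权限检查(显式时间要求调用者是文件属主或具有 CAP_FOWNER;"设为当前时间"的情形下有写权限即可),随后调用文件系统的 inode->i_op->setattr(ext4 中是 ext4_setattr),更新 inode 的 atime、mtime 和 ctime。注意 update_time / generic_update_time 用于读写文件时的隐式时间戳更新,并不在 utimensat 的路径上。
- 如果文件系统开启了日志,则这次 inode 修改会被记入事务。
- 标记 inode 为脏,等待回写磁盘。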
对于 amtime_now 的特殊处理:当 t 参数为 NULL 时,内核会将 atime 和 mtime 都设置为当前时间,并且此时的权限要求比传入显式时间戳更宽松——调用者只要是文件属主、对文件有写权限,或者具备相应特权,三者满足其一即可;而传入显式时间戳时,调用者必须是文件属主或具有 CAP_FOWNER。NULL 并不表示"失败时忽略错误":如果调用者既不是属主、也没有写权限,调用仍然会失败。
八、总结
通过上述漫长的代码之旅,我们完整地剖析了 touch 命令从用户输入到内核完成文件创建和时间戳更新的全过程。回顾一下关键路径:
- 命令行解析 :
main函数处理各种选项,确定时间来源和要修改的时间类型。 - 准备时间戳 :通过
-t、-r、-d或默认当前时间,构造struct timespec数组。 - 对每个文件 :调用
touch函数,该函数根据情况先open(可能带有O_CREAT),然后utimensat。 open的系统调用链 :open->openat->do_sys_open->do_sys_openat2->do_filp_open->path_openat。- VFS 路径查找 :
open_last_lookups处理最后分量,lookup_open在 dcache 中查找或创建 dentry。 - 文件系统创建 :对于 ext4,调用
ext4_create->__ext4_new_inode(分配 inode)->ext4_add_nondir(添加目录项)。 - 打开文件 :
do_open执行权限检查和可能的截断,vfs_open关联struct file。 - 设置时间戳 :
utimensat系统调用更新 inode 中的时间字段。 - 返回用户空间 :文件描述符与
struct file关联,touch函数完成,关闭临时文件描述符。
整个流程体现了 Linux 内核 VFS 层的精妙设计:通过通用的接口(如 dentry、inode、file_operations)将具体文件系统的差异封装起来,使得上层工具可以统一操作。同时,dcache(目录项缓存)和 inode 缓存极大提升了路径查找的性能。
touch 命令虽小,却涉及了文件系统最核心的几个概念:路径解析、目录项、inode 分配、时间戳更新。希望通过本文的详细解读,读者能对 Linux 系统有更深刻的理解,并能够举一反三,分析其他文件操作命令的实现。 #源码
scss
/* Entry point of touch: parse the options, work out which timestamps to
   change and where the new values come from (-t, -r, -d, the obsolete
   leading date operand, or the current time), then call touch () on every
   file operand.  Exits with EXIT_SUCCESS only if every operand succeeded. */
int
main (int argc, char **argv)
{
int c;
/* True once an explicit timestamp has been chosen.  */
bool date_set = false;
bool ok = true;
/* Argument of -d, if given.  */
char const *flex_date = NULL;
initialize_main (&argc, &argv);
set_program_name (argv[0]);
setlocale (LC_ALL, "");
bindtextdomain (PACKAGE, LOCALEDIR);
textdomain (PACKAGE);
atexit (close_stdout);
change_times = 0;
no_create = use_ref = false;
while ((c = getopt_long (argc, argv, "acd:fhmr:t:", longopts, NULL)) != -1)
{
switch (c)
{
case 'a':
change_times |= CH_ATIME;
break;
case 'c':
no_create = true;
break;
case 'd':
flex_date = optarg;
break;
case 'f':
/* Accepted and ignored.  */
break;
case 'h':
no_dereference = true;
break;
case 'm':
change_times |= CH_MTIME;
break;
case 'r':
use_ref = true;
ref_file = optarg;
break;
case 't':
/* Fixed-format time stamp; exit on a malformed value.  */
if (! posixtime (&newtime[0].tv_sec, optarg,
PDS_LEADING_YEAR | PDS_CENTURY | PDS_SECONDS))
error (EXIT_FAILURE, 0, _("invalid date format %s"),
quote (optarg));
newtime[0].tv_nsec = 0;
newtime[1] = newtime[0];
date_set = true;
break;
case TIME_OPTION: /* --time */
change_times |= XARGMATCH ("--time", optarg,
time_args, time_masks);
break;
case_GETOPT_HELP_CHAR;
case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);
default:
usage (EXIT_FAILURE);
}
}
/* No -a, -m or --time: change both timestamps.  */
if (change_times == 0)
change_times = CH_ATIME | CH_MTIME;
/* -t cannot be combined with -r or -d; at this point date_set can only
   have been set by -t.  (-r together with -d is handled below.)  */
if (date_set && (use_ref || flex_date))
{
error (0, 0, _("cannot specify times from more than one source"));
usage (EXIT_FAILURE);
}
if (use_ref)
{
struct stat ref_stats;
/* Don't use (no_dereference?lstat:stat) (args), since stat
might be an object-like macro. */
if (no_dereference ? lstat (ref_file, &ref_stats)
: stat (ref_file, &ref_stats))
error (EXIT_FAILURE, errno,
_("failed to get attributes of %s"), quoteaf (ref_file));
newtime[0] = get_stat_atime (&ref_stats);
newtime[1] = get_stat_mtime (&ref_stats);
date_set = true;
/* With -d as well, evaluate the date string relative to the
   reference file's times.  */
if (flex_date)
{
if (change_times & CH_ATIME)
newtime[0] = date_relative (flex_date, newtime[0]);
if (change_times & CH_MTIME)
newtime[1] = date_relative (flex_date, newtime[1]);
}
}
else
{
if (flex_date)
{
struct timespec now = current_timespec ();
newtime[1] = newtime[0] = date_relative (flex_date, now);
date_set = true;
/* If neither -a nor -m is specified, treat "-d now" as if
it were absent; this lets "touch" succeed more often in
the presence of restrictive permissions. */
if (change_times == (CH_ATIME | CH_MTIME)
&& newtime[0].tv_sec == now.tv_sec
&& newtime[0].tv_nsec == now.tv_nsec)
{
/* Check that it really was "-d now", and not a timestamp
that just happens to be the current time. */
struct timespec notnow, notnow1;
notnow.tv_sec = now.tv_sec ^ 1;
notnow.tv_nsec = now.tv_nsec;
notnow1 = date_relative (flex_date, notnow);
if (notnow1.tv_sec == notnow.tv_sec
&& notnow1.tv_nsec == notnow.tv_nsec)
date_set = false;
}
}
}
/* The obsolete 'MMDDhhmm[YY]' form is valid IFF there are
two or more non-option arguments. */
if (!date_set && 2 <= argc - optind && posix2_version () < 200112
&& posixtime (&newtime[0].tv_sec, argv[optind],
PDS_TRAILING_YEAR | PDS_PRE_2000))
{
newtime[0].tv_nsec = 0;
newtime[1] = newtime[0];
date_set = true;
if (! getenv ("POSIXLY_CORRECT"))
{
struct tm const *tm = localtime (&newtime[0].tv_sec);
/* Technically, it appears that even a deliberate attempt to cause
the above localtime to return NULL will always fail because our
posixtime implementation rejects all dates for which localtime
would fail. However, skip the warning if it ever fails. */
if (tm)
error (0, 0,
_("warning: 'touch %s' is obsolete; use "
"'touch -t %04ld%02d%02d%02d%02d.%02d'"),
argv[optind],
tm->tm_year + 1900L, tm->tm_mon + 1, tm->tm_mday,
tm->tm_hour, tm->tm_min, tm->tm_sec);
}
/* Consume the date operand.  */
optind++;
}
/* No explicit time: use the current time.  When both timestamps change,
   only amtime_now is set (touch () then passes NULL times); otherwise
   both entries are marked UTIME_NOW and touch () replaces the unwanted
   one with UTIME_OMIT.  */
if (!date_set)
{
if (change_times == (CH_ATIME | CH_MTIME))
amtime_now = true;
else
newtime[1].tv_nsec = newtime[0].tv_nsec = UTIME_NOW;
}
if (optind == argc)
{
error (0, 0, _("missing file operand"));
usage (EXIT_FAILURE);
}
/* Process every operand; a failure does not stop the loop.  */
for (; optind < argc; ++optind)
ok &= touch (argv[optind]);
return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}
/* Update the timestamps of FILE, creating it first unless -c or -h was
   given.  The operand "-" stands for standard output.
   Return true on success (or when a failure is deliberately not
   diagnosed), false after printing a diagnostic.  */
static bool
touch (char const *file)
{
  int open_errno = 0;
  int fd = -1;

  if (streq (file, "-"))
    fd = STDOUT_FILENO;
  else if (!no_create && !no_dereference)
    {
      /* Try to open FILE, creating it if necessary.  On success it
         ends up on descriptor STDIN_FILENO.  */
      fd = fd_reopen (STDIN_FILENO, file,
                      O_WRONLY | O_CREAT | O_NONBLOCK | O_NOCTTY, MODE_RW_UGO);
      if (fd < 0)
        open_errno = errno;
    }

  /* When only one of the time values is being set, mark the other one
     so that it is left untouched.  */
  bool set_both = change_times == (CH_ATIME | CH_MTIME);
  if (!set_both && change_times == CH_MTIME)
    newtime[0].tv_nsec = UTIME_OMIT;
  else if (!set_both)
    {
      affirm (change_times == CH_ATIME);
      newtime[1].tv_nsec = UTIME_OMIT;
    }

  /* Pass NULL to futimens so it will not fail if we have
     write access to the file, but don't own it.  */
  struct timespec const *t = amtime_now ? NULL : newtime;

  char const *file_opt = file;
  if (fd == STDOUT_FILENO)
    file_opt = NULL;

  int atflag = 0;
  if (no_dereference)
    atflag = AT_SYMLINK_NOFOLLOW;

  int utime_errno = 0;
  if (fdutimensat (fd, AT_FDCWD, file_opt, t, atflag) != 0)
    utime_errno = errno;

  if (fd == STDIN_FILENO)
    {
      /* The file was opened on descriptor 0 above; release it.  */
      if (close (STDIN_FILENO) != 0)
        {
          error (0, errno, _("failed to close %s"), quoteaf (file));
          return false;
        }
    }
  else if (fd == STDOUT_FILENO && utime_errno == EBADF && no_create)
    {
      /* Do not diagnose "touch -c - >&-".  */
      return true;
    }

  if (utime_errno == 0)
    return true;

  /* Don't diagnose with open_errno if FILE is a directory, as that
     would give a bogus diagnostic for e.g., 'touch /' (assuming we
     don't own / or have write access).  On Solaris 10 and probably
     other systems, opening a directory like "." fails with EINVAL.
     (On SunOS 4 it was EPERM but that's obsolete.)  On macOS 26
     opening "/" fails with EEXIST.  */
  struct stat st;
  bool open_hit_directory
    = (open_errno == EISDIR
       || ((open_errno == EINVAL || open_errno == EEXIST)
           && stat (file, &st) == 0 && S_ISDIR (st.st_mode)));

  if (open_errno && !open_hit_directory)
    {
      /* This wording covers both "the file does not exist and the parent
         directory is unwritable" and "the file exists but isn't
         writable"; the two are not distinguished.  */
      error (0, open_errno, _("cannot touch %s"), quoteaf (file));
      return false;
    }

  /* With -c, a file that does not exist is not an error.  */
  if (no_create && utime_errno == ENOENT)
    return true;

  error (0, utime_errno, _("setting times of %s"), quoteaf (file));
  return false;
}
/* Open FILE with FLAGS and MODE, arranging for the resulting descriptor
   to be DESIRED_FD.  This behaves like 'open', except that a successful
   call always returns DESIRED_FD.  On failure a negative value is
   returned; if the duplication step fails, the errno it set is
   preserved across the close of the temporary descriptor.  */
int
fd_reopen (int desired_fd, char const *file, int flags, mode_t mode)
{
  int opened = open (file, flags, mode);

  /* Nothing more to do if the open failed or already landed on the
     requested descriptor.  */
  if (opened < 0 || opened == desired_fd)
    return opened;

  /* Move the descriptor into place, then drop the temporary one while
     keeping the errno value left by dup2.  */
  int result = dup2 (opened, desired_fd);
  int dup_errno = errno;
  close (opened);
  errno = dup_errno;
  return result;
}
/* x86-64 syscall table entry (arch/x86/entry/syscalls/syscall_64.tbl):
   257 common openat sys_openat */
#ifndef __OFF_T_MATCHES_OFF64_T
/* Open FILE using the access flags in OFLAG.  The optional third
   argument (the file protection, needed when O_CREAT or O_TMPFILE is in
   OFLAG) is only fetched when __OPEN_NEEDS_MODE (OFLAG) says so;
   otherwise a mode of 0 is passed on.  The request is issued as an
   openat call relative to AT_FDCWD through SYSCALL_CANCEL.  */
int
__libc_open (const char *file, int oflag, ...)
{
  int mode;

  if (!__OPEN_NEEDS_MODE (oflag))
    mode = 0;
  else
    {
      va_list ap;
      va_start (ap, oflag);
      mode = va_arg (ap, int);
      va_end (ap);
    }

  return SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag, mode);
}
libc_hidden_def (__libc_open)
weak_alias (__libc_open, __open)
libc_hidden_weak (__open)
weak_alias (__libc_open, open)
#endif
/*
 * openat system call entry point: add O_LARGEFILE to the caller's flags
 * when force_o_largefile() says so, then hand over to do_sys_open().
 */
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
		umode_t, mode)
{
	const int open_flags = force_o_largefile() ? (flags | O_LARGEFILE)
						   : flags;

	return do_sys_open(dfd, filename, open_flags, mode);
}
/*
 * Convert the (flags, mode) pair into a struct open_how with
 * build_open_how() and forward the request to do_sys_openat2().
 */
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_how how;

	how = build_open_how(flags, mode);
	return do_sys_openat2(dfd, filename, &how);
}
/*
 * Common open path: validate @how into a struct open_flags, copy the
 * user-supplied @filename, reserve a file descriptor, open the file and
 * install it on that descriptor.
 *
 * Returns the new descriptor, or the error produced by whichever step
 * failed (build_open_flags(), getname(), get_unused_fd_flags() or
 * do_filp_open()).
 */
static long do_sys_openat2(int dfd, const char __user *filename,
			   struct open_how *how)
{
	struct open_flags op;
	struct filename *name;
	struct file *filp;
	int fd;

	/* a non-zero result here is returned to the caller as is */
	fd = build_open_flags(how, &op);
	if (fd)
		return fd;

	name = getname(filename);
	if (IS_ERR(name))
		return PTR_ERR(name);

	fd = get_unused_fd_flags(how->flags);
	if (fd < 0)
		goto out_putname;

	filp = do_filp_open(dfd, name, &op);
	if (IS_ERR(filp)) {
		/* give the reserved descriptor back and report the error */
		put_unused_fd(fd);
		fd = PTR_ERR(filp);
		goto out_putname;
	}
	fd_install(fd, filp);

out_putname:
	putname(name);
	return fd;
}
/* Prototype of do_filp_open(): opens @pathname relative to @dfd using @op. */
extern struct file *do_filp_open(int dfd, struct filename *pathname,
const struct open_flags *op);
/*
 * Open @pathname relative to @dfd according to @op.
 *
 * path_openat() is tried with LOOKUP_RCU added first.  If that attempt
 * reports -ECHILD it is repeated with the plain lookup flags, and if the
 * outcome is -ESTALE it is repeated once more with LOOKUP_REVAL added.
 *
 * Returns whatever the last path_openat() call returned.
 */
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	const int lookup = op->lookup_flags;
	struct nameidata nd;
	struct file *filp;

	set_nameidata(&nd, dfd, pathname, NULL);

	filp = path_openat(&nd, op, lookup | LOOKUP_RCU);
	if (unlikely(filp == ERR_PTR(-ECHILD)))
		filp = path_openat(&nd, op, lookup);
	if (unlikely(filp == ERR_PTR(-ESTALE)))
		filp = path_openat(&nd, op, lookup | LOOKUP_REVAL);

	restore_nameidata();
	return filp;
}
static struct file *path_openat(struct nameidata *nd,
const struct open_flags *op, unsigned flags)
{
struct file *file;
int error;
file = alloc_empty_file(op->open_flag, current_cred());
if (IS_ERR(file))
return file;
if (unlikely(file->f_flags & __O_TMPFILE)) {
error = do_tmpfile(nd, flags, op, file);
} else if (unlikely(file->f_flags & O_PATH)) {
error = do_o_path(nd, flags, file);
} else {
const char *s = path_init(nd, flags);
while (!(error = link_path_walk(s, nd)) &&
(s = open_last_lookups(nd, file, op)) != NULL)
;
if (!error)
error = do_open(nd, file, op);
terminate_walk(nd);
}
if (likely(!error)) {
if (likely(file->f_mode & FMODE_OPENED))
return file;
WARN_ON(1);
error = -EINVAL;
}
fput(file);
if (error == -EOPENSTALE) {
if (flags & LOOKUP_RCU)
error = -ECHILD;
else
error = -ESTALE;
}
return ERR_PTR(error);
}
/*
 * Handle the last component of the path during an open.
 *
 * @nd:   lookup state; nd->path.dentry is used as the parent directory
 * @file: the file being opened; after lookup_open() its f_mode is checked
 *        for FMODE_OPENED / FMODE_CREATED
 * @op:   open flags and intent of this open
 *
 * Returns NULL when there is nothing further to walk, an ERR_PTR() on
 * failure, or otherwise the string returned by handle_dots() or
 * step_into().
 */
static const char *open_last_lookups(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct dentry *dir = nd->path.dentry;
int open_flag = op->open_flag;
bool got_write = false;
struct dentry *dentry;
const char *res;
/* fold the intent bits of this open into the lookup flags */
nd->flags |= op->intent;
/* the last component is not a normal name: let handle_dots() deal with it */
if (nd->last_type != LAST_NORM) {
if (nd->depth)
put_link(nd);
return handle_dots(nd, nd->last_type);
}
if (!(open_flag & O_CREAT)) {
/* something follows the last component (the "trailing slashes" test) */
if (nd->last.name[nd->last.len])
nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
/* we _can_ be in RCU mode here */
dentry = lookup_fast(nd);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
/* lookup_fast() found a dentry: skip lookup_open() entirely */
if (likely(dentry))
goto finish_lookup;
if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
return ERR_PTR(-ECHILD);
} else {
/* create side of things */
if (nd->flags & LOOKUP_RCU) {
if (!try_to_unlazy(nd))
return ERR_PTR(-ECHILD);
}
audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
/* trailing slashes? */
if (unlikely(nd->last.name[nd->last.len]))
return ERR_PTR(-EISDIR);
}
/* remember whether write access to the mount could be obtained */
if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
got_write = !mnt_want_write(nd->path.mnt);
/*
* do _not_ fail yet - we might not need that or fail with
* a different error; let lookup_open() decide; we'll be
* dropping this one anyway.
*/
}
/* parent is locked exclusively for O_CREAT, shared otherwise */
if (open_flag & O_CREAT)
inode_lock(dir->d_inode);
else
inode_lock_shared(dir->d_inode);
dentry = lookup_open(nd, file, op, got_write);
/* send the create notification while the parent is still locked */
if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
fsnotify_create(dir->d_inode, dentry);
if (open_flag & O_CREAT)
inode_unlock(dir->d_inode);
else
inode_unlock_shared(dir->d_inode);
if (got_write)
mnt_drop_write(nd->path.mnt);
if (IS_ERR(dentry))
return ERR_CAST(dentry);
/* already opened or just created: install the dentry and stop walking */
if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
dput(nd->path.dentry);
nd->path.dentry = dentry;
return NULL;
}
finish_lookup:
if (nd->depth)
put_link(nd);
res = step_into(nd, WALK_TRAILING, dentry);
/* step_into() returned something to continue with: drop the open intent bits */
if (unlikely(res))
nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
return res;
}
/*
* Look up and maybe create and open the last component.
*
* Must be called with parent locked (exclusive in O_CREAT case).
*
* Returns the dentry of the last component on success, that is, if
* the file was successfully atomically created (if necessary) and opened, or
* the file was not completely opened at this time, though lookups and
* creations were performed.
* These cases are distinguished by presence of FMODE_OPENED on file->f_mode.
* In the latter case the returned dentry might be negative if O_CREAT
* hadn't been specified.
*
* FMODE_CREATED is cleared on entry and set on file->f_mode when the
* directory's ->create() method is invoked here.
*
* An ERR_PTR() is returned on failure.
*/
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
const struct open_flags *op,
bool got_write)
{
struct mnt_idmap *idmap;
struct dentry *dir = nd->path.dentry;
struct inode *dir_inode = dir->d_inode;
int open_flag = op->open_flag;
struct dentry *dentry;
int error, create_error = 0;
umode_t mode = op->mode;
DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);
if (unlikely(IS_DEADDIR(dir_inode)))
return ERR_PTR(-ENOENT);
file->f_mode &= ~FMODE_CREATED;
/*
* Find the dentry in the dcache or allocate a new one.  A dentry that is
* still in-lookup is used as is; a cached one must pass d_revalidate():
* a positive result accepts it, a negative one is an error, and zero
* invalidates it and retries with a fresh allocation.
*/
dentry = d_lookup(dir, &nd->last);
for (;;) {
if (!dentry) {
dentry = d_alloc_parallel(dir, &nd->last, &wq);
if (IS_ERR(dentry))
return dentry;
}
if (d_in_lookup(dentry))
break;
error = d_revalidate(dentry, nd->flags);
if (likely(error > 0))
break;
if (error)
goto out_dput;
d_invalidate(dentry);
dput(dentry);
dentry = NULL;
}
if (dentry->d_inode) {
/* Cached positive dentry: will open in f_op->open */
return dentry;
}
/*
* Checking write permission is tricky, because we don't know if we are
* going to actually need it: O_CREAT opens should work as long as the
* file exists. But checking existence breaks atomicity. The trick is
* to check access and if not granted clear O_CREAT from the flags.
*
* Another problem is returning the "right" error value (e.g. for an
* O_EXCL open we want to return EEXIST not EROFS).
*/
if (unlikely(!got_write))
open_flag &= ~O_TRUNC;
idmap = mnt_idmap(nd->path.mnt);
if (open_flag & O_CREAT) {
if (open_flag & O_EXCL)
open_flag &= ~O_TRUNC;
mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
if (likely(got_write))
create_error = may_o_create(idmap, &nd->path,
dentry, mode);
else
create_error = -EROFS;
}
/* creation is not allowed: continue as a plain lookup, keeping the reason */
if (create_error)
open_flag &= ~O_CREAT;
/*
* The filesystem provides ->atomic_open: delegate to atomic_open().  If it
* reports -ENOENT while creation had been refused above, return the
* creation error instead.
*/
if (dir_inode->i_op->atomic_open) {
dentry = atomic_open(nd, dentry, file, open_flag, mode);
if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
dentry = ERR_PTR(create_error);
return dentry;
}
/* freshly allocated dentry: ask the filesystem's ->lookup() about it */
if (d_in_lookup(dentry)) {
struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
nd->flags);
d_lookup_done(dentry);
if (unlikely(res)) {
if (IS_ERR(res)) {
error = PTR_ERR(res);
goto out_dput;
}
/* ->lookup() handed back a different dentry: use that one */
dput(dentry);
dentry = res;
}
}
/* Negative dentry, just create the file */
if (!dentry->d_inode && (open_flag & O_CREAT)) {
file->f_mode |= FMODE_CREATED;
audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
if (!dir_inode->i_op->create) {
error = -EACCES;
goto out_dput;
}
error = dir_inode->i_op->create(idmap, dir_inode, dentry,
mode, open_flag & O_EXCL);
if (error)
goto out_dput;
}
/* still negative and creation had been refused: report why */
if (unlikely(create_error) && !dentry->d_inode) {
error = create_error;
goto out_dput;
}
return dentry;
out_dput:
dput(dentry);
return ERR_PTR(error);
}
/*
 * Inode operation table for ext4 directories.  Directories can handle
 * most operations...
 */
const struct inode_operations ext4_dir_inode_operations = {
	/* name lookup and creation/removal of directory entries */
	.lookup		= ext4_lookup,
	.create		= ext4_create,
	.link		= ext4_link,
	.unlink		= ext4_unlink,
	.symlink	= ext4_symlink,
	.mkdir		= ext4_mkdir,
	.rmdir		= ext4_rmdir,
	.mknod		= ext4_mknod,
	.tmpfile	= ext4_tmpfile,
	.rename		= ext4_rename2,

	/* attributes, extended attributes and ACLs */
	.setattr	= ext4_setattr,
	.getattr	= ext4_getattr,
	.listxattr	= ext4_listxattr,
	.get_inode_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,

	/* extent mapping and file attribute queries */
	.fiemap		= ext4_fiemap,
	.fileattr_get	= ext4_fileattr_get,
	.fileattr_set	= ext4_fileattr_set,
};
/*
 * Create a regular file for @dentry in directory @dir.
 *
 * When this is called the directory cache entry for the new file
 * already exists but is still negative (it has no inode).  A new inode
 * is allocated, given the ext4 regular-file operation tables and linked
 * into the directory by ext4_add_nondir(), which instantiates the
 * dentry on success.
 *
 * An attempt that fails with -ENOSPC is repeated for as long as
 * ext4_should_retry_alloc() asks for another try.  @excl is not used
 * here.
 *
 * Returns the error code of the last attempt (0 if nothing failed).
 */
static int ext4_create(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode, bool excl)
{
	struct inode *inode;
	handle_t *handle;
	int retries = 0;
	int credits;
	int err;

	err = dquot_initialize(dir);
	if (err)
		return err;

	credits = EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
		  EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3;

	do {
		inode = ext4_new_inode_start_handle(idmap, dir, mode,
						    &dentry->d_name, 0, NULL,
						    EXT4_HT_DIR, credits);
		handle = ext4_journal_current_handle();

		if (IS_ERR(inode)) {
			err = PTR_ERR(inode);
		} else {
			inode->i_op = &ext4_file_inode_operations;
			inode->i_fop = &ext4_file_operations;
			ext4_set_aops(inode);
			err = ext4_add_nondir(handle, dentry, &inode);
			if (!err)
				ext4_fc_track_create(handle, dentry);
		}

		if (handle)
			ext4_journal_stop(handle);
		/*
		 * ext4_add_nondir() clears inode on success; otherwise the
		 * reference is dropped here.
		 */
		if (!IS_ERR_OR_NULL(inode))
			iput(inode);
	} while (err == -ENOSPC &&
		 ext4_should_retry_alloc(dir->i_sb, &retries));

	return err;
}
/*
* Wrapper around __ext4_new_inode() for callers that do not pass a journal
* handle: NULL is passed as the handle, 0 as i_flags, @type as the handle
* type, and __LINE__ of the expansion site as the line number argument.
*/
#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \
type, nblocks) \
__ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \
0, (type), __LINE__, (nblocks))
/*
* There are two policies for allocating an inode. If the new inode is
* a directory, then a forward search is made for a block group with both
* free space and a low directory-to-inode ratio; if that fails, then of
* the groups with above-average free space, that group with the fewest
* directories already is chosen.
*
* For other inodes, search forward from the parent directory's block
* group to find a free inode.
*
* Parameters, as used in the body:
* @idmap:       passed to inode_fsuid_set() / inode_init_owner() when the
*               owner is derived rather than given
* @handle:      journal handle to use; if NULL, one is started here with
*               __ext4_journal_start_sb() unless EXT4_FC_REPLAY is set
* @dir:         parent directory; NULL or a zero link count yields -EPERM
* @mode:        mode of the new inode
* @qstr:        name, passed to find_group_orlov() and ext4_init_security()
* @goal:        preferred inode number; 0 selects sbi->s_inode_goal
* @owner:       optional array, owner[0] is the uid and owner[1] the gid
* @i_flags:     extra flags OR-ed into the new inode's ext4 flags
* @handle_type, @line_no, @nblocks:
*               forwarded to __ext4_journal_start_sb() when the handle is
*               started here; @nblocks is first increased by the result of
*               ext4_xattr_credits_for_new_inode() when that is consulted
*
* Returns the new inode on success or an ERR_PTR() on failure.
*/
struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
handle_t *handle, struct inode *dir,
umode_t mode, const struct qstr *qstr,
__u32 goal, uid_t *owner, __u32 i_flags,
int handle_type, unsigned int line_no,
int nblocks)
{
struct super_block *sb;
struct buffer_head *inode_bitmap_bh = NULL;
struct buffer_head *group_desc_bh;
ext4_group_t ngroups, group = 0;
unsigned long ino = 0;
struct inode *inode;
struct ext4_group_desc *gdp = NULL;
struct ext4_inode_info *ei;
struct ext4_sb_info *sbi;
int ret2, err;
struct inode *ret;
ext4_group_t i;
ext4_group_t flex_group;
struct ext4_group_info *grp = NULL;
bool encrypt = false;
/* Cannot create files in a deleted directory */
if (!dir || !dir->i_nlink)
return ERR_PTR(-EPERM);
sb = dir->i_sb;
sbi = EXT4_SB(sb);
if (unlikely(ext4_forced_shutdown(sb)))
return ERR_PTR(-EIO);
ngroups = ext4_get_groups_count(sb);
trace_ext4_request_inode(dir, mode);
inode = new_inode(sb);
if (!inode)
return ERR_PTR(-ENOMEM);
ei = EXT4_I(inode);
/*
* Initialize owners and quota early so that we don't have to account
* for quota initialization worst case in standard inode creating
* transaction
*/
if (owner) {
inode->i_mode = mode;
i_uid_write(inode, owner[0]);
i_gid_write(inode, owner[1]);
} else if (test_opt(sb, GRPID)) {
inode->i_mode = mode;
inode_fsuid_set(inode, idmap);
inode->i_gid = dir->i_gid;
} else
inode_init_owner(idmap, inode, dir, mode);
/* project id: inherited from the directory when it asks for that */
if (ext4_has_feature_project(sb) &&
ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
ei->i_projid = EXT4_I(dir)->i_projid;
else
ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);
if (!(i_flags & EXT4_EA_INODE_FL)) {
err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
if (err)
goto out;
}
err = dquot_initialize(inode);
if (err)
goto out;
/* a handle will be started below: add the xattr credits to its size */
if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
if (ret2 < 0) {
err = ret2;
goto out;
}
nblocks += ret2;
}
if (!goal)
goal = sbi->s_inode_goal;
/* a usable goal fixes the starting group and bit directly */
if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
ret2 = 0;
goto got_group;
}
if (S_ISDIR(mode))
ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
else
ret2 = find_group_other(sb, dir, &group, mode);
got_group:
EXT4_I(dir)->i_last_alloc_group = group;
err = -ENOSPC;
if (ret2 == -1)
goto out;
/*
* Normally we will only go through one pass of this loop,
* unless we get unlucky and it turns out the group we selected
* had its last inode grabbed by someone else.
*/
for (i = 0; i < ngroups; i++, ino = 0) {
err = -EIO;
gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
if (!gdp)
goto out;
/*
* Check free inodes count before loading bitmap.
*/
if (ext4_free_inodes_count(sb, gdp) == 0)
goto next_group;
if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
grp = ext4_get_group_info(sb, group);
/*
* Skip groups with already-known suspicious inode
* tables
*/
if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
goto next_group;
}
brelse(inode_bitmap_bh);
inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
/* Skip groups with suspicious inode tables */
if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
&& EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
IS_ERR(inode_bitmap_bh)) {
inode_bitmap_bh = NULL;
goto next_group;
}
repeat_in_this_group:
ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
if (!ret2)
goto next_group;
if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
ext4_error(sb, "reserved inode found cleared - "
"inode=%lu", ino + 1);
ext4_mark_group_bitmap_corrupted(sb, group,
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto next_group;
}
/* no handle was supplied: start one now, sized by nblocks */
if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
BUG_ON(nblocks <= 0);
handle = __ext4_journal_start_sb(NULL, dir->i_sb,
line_no, handle_type, nblocks, 0,
ext4_trans_default_revoke_credits(sb));
if (IS_ERR(handle)) {
err = PTR_ERR(handle);
ext4_std_error(sb, err);
goto out;
}
}
BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
EXT4_JTR_NONE);
if (err) {
ext4_std_error(sb, err);
goto out;
}
/* claim the bit under the group lock; ret2 == 0 afterwards means it is ours */
ext4_lock_group(sb, group);
ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
if (ret2) {
/* Someone already took the bit. Repeat the search
* with lock held.
*/
ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
if (ret2) {
ext4_set_bit(ino, inode_bitmap_bh->b_data);
ret2 = 0;
} else {
ret2 = 1; /* we didn't grab the inode */
}
}
ext4_unlock_group(sb, group);
ino++; /* the inode bitmap is zero-based */
if (!ret2)
goto got; /* we grabbed the inode! */
if (ino < EXT4_INODES_PER_GROUP(sb))
goto repeat_in_this_group;
next_group:
/* advance to the following group, wrapping around at the end */
if (++group == ngroups)
group = 0;
}
err = -ENOSPC;
goto out;
got:
BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
if (err) {
ext4_std_error(sb, err);
goto out;
}
BUFFER_TRACE(group_desc_bh, "get_write_access");
err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
EXT4_JTR_NONE);
if (err) {
ext4_std_error(sb, err);
goto out;
}
/* We may have to initialize the block bitmap if it isn't already */
if (ext4_has_group_desc_csum(sb) &&
gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
struct buffer_head *block_bitmap_bh;
block_bitmap_bh = ext4_read_block_bitmap(sb, group);
if (IS_ERR(block_bitmap_bh)) {
err = PTR_ERR(block_bitmap_bh);
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
EXT4_JTR_NONE);
if (err) {
brelse(block_bitmap_bh);
ext4_std_error(sb, err);
goto out;
}
BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
/* err from this call is only checked after the locked section below */
err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);
/* recheck and clear flag under lock if we still need to */
ext4_lock_group(sb, group);
if (ext4_has_group_desc_csum(sb) &&
(gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
ext4_free_group_clusters_set(sb, gdp,
ext4_free_clusters_after_init(sb, group, gdp));
ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
brelse(block_bitmap_bh);
if (err) {
ext4_std_error(sb, err);
goto out;
}
}
/*
* Update the relevant bg descriptor fields.  Both branches leave the
* group locked; it is unlocked after the counters and checksums below
* have been updated.
*/
if (ext4_has_group_desc_csum(sb)) {
int free;
/* note: this declaration shadows the function-level grp */
struct ext4_group_info *grp = NULL;
if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
grp = ext4_get_group_info(sb, group);
if (!grp) {
err = -EFSCORRUPTED;
goto out;
}
down_read(&grp->alloc_sem); /*
* protect vs itable
* lazyinit
*/
}
ext4_lock_group(sb, group); /* while we modify the bg desc */
free = EXT4_INODES_PER_GROUP(sb) -
ext4_itable_unused_count(sb, gdp);
if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
free = 0;
}
/*
* Check the relative inode number against the last used
* relative inode number in this group. if it is greater
* we need to update the bg_itable_unused count
*/
if (ino > free)
ext4_itable_unused_set(sb, gdp,
(EXT4_INODES_PER_GROUP(sb) - ino));
if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
up_read(&grp->alloc_sem);
} else {
ext4_lock_group(sb, group);
}
ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
if (S_ISDIR(mode)) {
ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
if (sbi->s_log_groups_per_flex) {
ext4_group_t f = ext4_flex_group(sbi, group);
atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
f)->used_dirs);
}
}
if (ext4_has_group_desc_csum(sb)) {
ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh,
EXT4_INODES_PER_GROUP(sb) / 8);
ext4_group_desc_csum_set(sb, group, gdp);
}
ext4_unlock_group(sb, group);
BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
if (err) {
ext4_std_error(sb, err);
goto out;
}
/* filesystem-wide and per-flex-group counters */
percpu_counter_dec(&sbi->s_freeinodes_counter);
if (S_ISDIR(mode))
percpu_counter_inc(&sbi->s_dirs_counter);
if (sbi->s_log_groups_per_flex) {
flex_group = ext4_flex_group(sbi, group);
atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
flex_group)->free_inodes);
}
/* ino was made one-based above, so this is the filesystem-wide number */
inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
/* This is the optimal IO size (for stat), not the fs block size */
/* NOTE(review): the comment above does not seem to describe the assignment below - confirm */
inode->i_blocks = 0;
simple_inode_init_ts(inode);
ei->i_crtime = inode_get_mtime(inode);
memset(ei->i_data, 0, sizeof(ei->i_data));
ei->i_dir_start_lookup = 0;
ei->i_disksize = 0;
/* Don't inherit extent flag from directory, amongst others. */
ei->i_flags =
ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_flags |= i_flags;
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
ei->i_last_alloc_group = ~0;
ext4_set_inode_flags(inode, true);
if (IS_DIRSYNC(inode))
ext4_handle_sync(handle);
if (insert_inode_locked(inode) < 0) {
/*
* Likely a bitmap corruption causing inode to be allocated
* twice.
*/
err = -EIO;
ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
inode->i_ino);
ext4_mark_group_bitmap_corrupted(sb, group,
EXT4_GROUP_INFO_IBITMAP_CORRUPT);
goto out;
}
inode->i_generation = get_random_u32();
/* Precompute checksum seed for inode metadata */
if (ext4_has_metadata_csum(sb)) {
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
sizeof(gen));
}
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
ext4_set_inode_state(inode, EXT4_STATE_NEW);
ei->i_extra_isize = sbi->s_want_extra_isize;
ei->i_inline_off = 0;
if (ext4_has_feature_inline_data(sb) &&
(!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
ret = inode;
/* from here on, failures unwind through fail_drop / fail_free_drop */
err = dquot_alloc_inode(inode);
if (err)
goto fail_drop;
/*
* Since the encryption xattr will always be unique, create it first so
* that it's less likely to end up in an external xattr block and
* prevent its deduplication.
*/
if (encrypt) {
err = fscrypt_set_context(inode, handle);
if (err)
goto fail_free_drop;
}
if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
err = ext4_init_acl(handle, inode, dir);
if (err)
goto fail_free_drop;
err = ext4_init_security(handle, inode, dir, qstr);
if (err)
goto fail_free_drop;
}
if (ext4_has_feature_extents(sb)) {
/* set extent flag only for directory, file and normal symlink*/
if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
ext4_ext_tree_init(handle, inode);
}
}
if (ext4_handle_valid(handle)) {
ei->i_sync_tid = handle->h_transaction->t_tid;
ei->i_datasync_tid = handle->h_transaction->t_tid;
}
err = ext4_mark_inode_dirty(handle, inode);
if (err) {
ext4_std_error(sb, err);
goto fail_free_drop;
}
ext4_debug("allocating inode %lu\n", inode->i_ino);
trace_ext4_allocate_inode(inode, dir, mode);
brelse(inode_bitmap_bh);
return ret;
/* error unwinding: each label falls through into the next one */
fail_free_drop:
dquot_free_inode(inode);
fail_drop:
clear_nlink(inode);
unlock_new_inode(inode);
out:
dquot_drop(inode);
inode->i_flags |= S_NOQUOTA;
iput(inode);
brelse(inode_bitmap_bh);
return ERR_PTR(err);
}
/*
 * Link a non-directory inode into the parent directory of @dentry.
 *
 * On success the inode reference is consumed by the dentry
 * instantiation, which is signalled to the caller by setting *inodep to
 * NULL; the value returned is then the result of marking the inode
 * dirty.  On failure the link count is dropped, the inode is put on the
 * orphan list and unlocked, and the caller remains responsible for
 * dropping the inode reference in a safe context.
 */
static int ext4_add_nondir(handle_t *handle,
		struct dentry *dentry, struct inode **inodep)
{
	struct inode *inode = *inodep;
	struct inode *parent = d_inode(dentry->d_parent);
	int err;

	err = ext4_add_entry(handle, dentry, inode);
	if (err) {
		/* could not add the directory entry: undo and hand back */
		drop_nlink(inode);
		ext4_mark_inode_dirty(handle, inode);
		ext4_orphan_add(handle, inode);
		unlock_new_inode(inode);
		return err;
	}

	err = ext4_mark_inode_dirty(handle, inode);
	if (IS_DIRSYNC(parent))
		ext4_handle_sync(handle);
	d_instantiate_new(dentry, inode);
	*inodep = NULL;
	return err;
}
/*
* Handle the last step of open()
*
* @nd:   lookup state; nd->path is the object being opened
* @file: the file being opened; its f_mode tells whether it has already
*        been opened (FMODE_OPENED) or was just created (FMODE_CREATED)
* @op:   open flags and access mode of this open
*
* Returns 0 on success or a negative errno.  A positive value coming back
* from the helpers called here is warned about and turned into -EINVAL.
*/
static int do_open(struct nameidata *nd,
struct file *file, const struct open_flags *op)
{
struct mnt_idmap *idmap;
int open_flag = op->open_flag;
bool do_truncate;
int acc_mode;
int error;
/* neither opened nor created so far: finish the walk first */
if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
error = complete_walk(nd);
if (error)
return error;
}
if (!(file->f_mode & FMODE_CREATED))
audit_inode(nd->name, nd->path.dentry, 0);
idmap = mnt_idmap(nd->path.mnt);
if (open_flag & O_CREAT) {
/* O_EXCL was requested but the file was not created by this open */
if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
return -EEXIST;
if (d_is_dir(nd->path.dentry))
return -EISDIR;
error = may_create_in_sticky(idmap, nd,
d_backing_inode(nd->path.dentry));
if (unlikely(error))
return error;
}
if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
return -ENOTDIR;
do_truncate = false;
acc_mode = op->acc_mode;
if (file->f_mode & FMODE_CREATED) {
/* Don't check for write permission, don't truncate */
open_flag &= ~O_TRUNC;
acc_mode = 0;
} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
/* truncating an existing regular file: hold mount write access until the end */
error = mnt_want_write(nd->path.mnt);
if (error)
return error;
do_truncate = true;
}
/* each of the following steps runs only if no earlier one failed */
error = may_open(idmap, &nd->path, acc_mode, open_flag);
if (!error && !(file->f_mode & FMODE_OPENED))
error = vfs_open(&nd->path, file);
if (!error)
error = ima_file_check(file, op->acc_mode);
if (!error && do_truncate)
error = handle_truncate(idmap, file);
if (unlikely(error > 0)) {
WARN_ON(1);
error = -EINVAL;
}
/* pairs with the mnt_want_write() taken for the truncate case above */
if (do_truncate)
mnt_drop_write(nd->path.mnt);
return error;
}
/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flag initialized
 *
 * Copies @path into @file and opens the backing inode of its dentry
 * through do_dentry_open(), passing no explicit open method.  Returns
 * the value returned by do_dentry_open().
 */
int vfs_open(const struct path *path, struct file *file)
{
	struct inode *inode;

	file->f_path = *path;
	inode = d_backing_inode(path->dentry);
	return do_dentry_open(file, inode, NULL);
}
/*
* Attach @inode to @f and run the open method.
*
* @f:     file being opened; f->f_path must already be set, a reference to
*         it is taken here
* @inode: inode to open
* @open:  open method to call; if NULL, f->f_op->open is used instead, and
*         if that is NULL as well no open method is called
*
* Returns 0 on success or a negative errno.  On the cleanup_all and
* cleanup_file paths the path reference taken here is dropped again and
* f's path and inode pointers are cleared.
*/
static int do_dentry_open(struct file *f,
struct inode *inode,
int (*open)(struct inode *, struct file *))
{
static const struct file_operations empty_fops = {};
int error;
path_get(&f->f_path);
f->f_inode = inode;
f->f_mapping = inode->i_mapping;
f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
f->f_sb_err = file_sample_sb_err(f);
/* O_PATH: mark the file opened with an empty operations table and stop here */
if (unlikely(f->f_flags & O_PATH)) {
f->f_mode = FMODE_PATH | FMODE_OPENED;
f->f_op = &empty_fops;
return 0;
}
/* read-only opens bump the read count; writers to non-special files get write access */
if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
i_readcount_inc(inode);
} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
error = file_get_write_access(f);
if (unlikely(error))
goto cleanup_file;
f->f_mode |= FMODE_WRITER;
}
/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
f->f_mode |= FMODE_ATOMIC_POS;
f->f_op = fops_get(inode->i_fop);
if (WARN_ON(!f->f_op)) {
error = -ENODEV;
goto cleanup_all;
}
error = security_file_open(f);
if (error)
goto cleanup_all;
error = break_lease(file_inode(f), f->f_flags);
if (error)
goto cleanup_all;
/* normally all 3 are set; ->open() can clear them if needed */
f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
if (!open)
open = f->f_op->open;
if (open) {
error = open(inode, f);
if (error)
goto cleanup_all;
}
f->f_mode |= FMODE_OPENED;
/* derive the capability bits from the methods the file operations provide */
if ((f->f_mode & FMODE_READ) &&
likely(f->f_op->read || f->f_op->read_iter))
f->f_mode |= FMODE_CAN_READ;
if ((f->f_mode & FMODE_WRITE) &&
likely(f->f_op->write || f->f_op->write_iter))
f->f_mode |= FMODE_CAN_WRITE;
if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
f->f_mode &= ~FMODE_LSEEK;
if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
f->f_mode |= FMODE_CAN_ODIRECT;
/* these flags only matter while opening; drop them from the file's flags */
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
f->f_iocb_flags = iocb_flags(f);
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
/*
* NOTE(review): this return does not go through cleanup_all; FMODE_OPENED
* is already set at this point, so presumably the caller's fput() does
* the teardown - confirm.
*/
if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
return -EINVAL;
/*
* XXX: Huge page cache doesn't support writing yet. Drop all page
* cache for this file before processing writes.
*/
if (f->f_mode & FMODE_WRITE) {
/*
* Paired with smp_mb() in collapse_file() to ensure nr_thps
* is up to date and the update to i_writecount by
* get_write_access() is visible. Ensures subsequent insertion
* of THPs into the page cache will fail.
*/
smp_mb();
if (filemap_nr_thps(inode->i_mapping)) {
struct address_space *mapping = inode->i_mapping;
filemap_invalidate_lock(inode->i_mapping);
/*
* unmap_mapping_range() only needs to be called once
* here, because private pages do not need to be
* unmapped (e.g. the data segment of a dynamic
* shared library).
*/
unmap_mapping_range(mapping, 0, 0, 0);
truncate_inode_pages(mapping, 0);
filemap_invalidate_unlock(inode->i_mapping);
}
}
/*
* Once we return a file with FMODE_OPENED, __fput() will call
* fsnotify_close(), so we need fsnotify_open() here for symmetry.
*/
fsnotify_open(f);
return 0;
cleanup_all:
/* a positive error value is unexpected here: warn once and use -EINVAL */
if (WARN_ON_ONCE(error > 0))
error = -EINVAL;
fops_put(f->f_op);
put_file_access(f);
cleanup_file:
path_put(&f->f_path);
f->f_path.mnt = NULL;
f->f_path.dentry = NULL;
f->f_inode = NULL;
return error;
}