走进 Linux 内核:从 touch 命令到磁盘 inode 的完整旅程

一句 touch file.txt,背后是一整个操作系统在为你忙碌

前言

几乎每个 Linux 用户都曾用过 touch 命令。你可能知道,当文件不存在时它会创建一个空文件;当文件已存在时,它会更新文件的访问时间和修改时间。但你是否想过,这个简单的命令是如何一步步将请求从键盘传递到磁盘的?用户空间的 main 函数做了哪些判断?内核 VFS(虚拟文件系统)如何将路径解析成 dentry?ext4 文件系统又是怎样从数十亿个 inode 中找到一个空闲位,然后将其写入磁盘?

本文以 Linux 内核 6.8.12 源码GNU Coreutils touch 实现 为基础,完整呈现一条从命令行参数到硬件交互的逻辑链。我们将深入分析每一个关键函数,用中文注释拆解核心代码,最终串联起一幅关于"文件创建与时间戳更新"的全景图。全文超过 8000 字,适合对操作系统原理感兴趣的中高级开发者。


第一部分:用户空间 ------ touch 的命令行交响曲

1.1 程序入口与参数解析

touch 命令的实现在 coreutils 的 src/touch.c 中。main 函数是整个用户态流程的指挥家。

c

复制代码
int main (int argc, char **argv)
{
  int c;
  bool date_set = false;          // 是否已经确定了时间来源
  bool ok = true;
  char const *flex_date = NULL;   // 存储 -d 选项的灵活日期字符串

  // 初始化 —— 设置程序名、本地化、错误退出时刷新 stdout
  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);
  atexit (close_stdout);

  change_times = 0;               // 位掩码:记录要修改哪些时间(atime/mtime)
  no_create = use_ref = false;    // -c 不创建文件,-r 使用参考文件

  // 使用 getopt_long 解析选项,支持短选项和长选项
  while ((c = getopt_long (argc, argv, "acd:fhmr:t:", longopts, NULL)) != -1)
    {
      switch (c)
        {
        case 'a':
          change_times |= CH_ATIME;    // 要求修改访问时间
          break;
        case 'c':
          no_create = true;            // 不创建新文件
          break;
        case 'd':
          flex_date = optarg;          // 例如 "yesterday" 或 "2024-01-01"
          break;
        case 'f':                     // 忽略,历史遗留
          break;
        case 'h':                     // 操作符号链接本身
          no_dereference = true;
          break;
        case 'm':
          change_times |= CH_MTIME;    // 要求修改修改时间
          break;
        case 'r':
          use_ref = true;              // 使用参考文件的时间
          ref_file = optarg;
          break;
        case 't':                     // 固定格式 [[CC]YY]MMDDhhmm[.ss]
          if (! posixtime (&newtime[0].tv_sec, optarg,
                           PDS_LEADING_YEAR | PDS_CENTURY | PDS_SECONDS))
            error (EXIT_FAILURE, 0, _("invalid date format %s"),
                   quote (optarg));
          newtime[0].tv_nsec = 0;
          newtime[1] = newtime[0];   // 同时设置 atime 和 mtime
          date_set = true;
          break;
        case TIME_OPTION:            // --time=atime 等
          change_times |= XARGMATCH ("--time", optarg,
                                     time_args, time_masks);
          break;
        // ... 帮助和版本选项
        default:
          usage (EXIT_FAILURE);
        }
    }

  // 如果用户没指定 -a 也没指定 -m,则默认同时修改 atime 和 mtime
  if (change_times == 0)
    change_times = CH_ATIME | CH_MTIME;

  // 三种时间源互斥:-t、-r、-d 不能同时出现
  if (date_set && (use_ref || flex_date))
    {
      error (0, 0, _("cannot specify times from more than one source"));
      usage (EXIT_FAILURE);
    }

关键点注释

  • change_times 是位掩码,取值为 CH_ATIME(1<<0)和 CH_MTIME(1<<1)。后续通过该掩码决定哪些时间需要变更。

  • newtime[2]struct timespec 数组,分别存放目标 atime 和 mtime。tv_sec 是秒数,tv_nsec 是纳秒。

  • posixtime 是 coreutils 内部的日期解析函数,严格遵循 POSIX touch -t 格式。

1.2 从参考文件获取时间戳

使用 -r ref.txt 时,需要先读取参考文件的属性,再把它的 atime/mtime 复制给目标文件。

c

复制代码
  if (use_ref)
    {
      struct stat ref_stats;
      // 根据 no_dereference 决定是否跟随符号链接
      if (no_dereference ? lstat (ref_file, &ref_stats)
          : stat (ref_file, &ref_stats))
        error (EXIT_FAILURE, errno,
               _("failed to get attributes of %s"), quoteaf (ref_file));

      // 提取 atime 和 mtime —— 这里使用宏或内联函数处理不同平台的结构体差异
      newtime[0] = get_stat_atime (&ref_stats);
      newtime[1] = get_stat_mtime (&ref_stats);
      date_set = true;

      // 如果同时给了 -d,则在参考时间基础上加减
      if (flex_date)
        {
          if (change_times & CH_ATIME)
            newtime[0] = date_relative (flex_date, newtime[0]);
          if (change_times & CH_MTIME)
            newtime[1] = date_relative (flex_date, newtime[1]);
        }
    }

get_stat_atimeget_stat_mtime 是 coreutils 定义的宏,用于兼容不同操作系统中 struct stat 的成员名差异(例如有些系统是 st_atim,有些是 st_atimespec)。

1.3 处理 -d 自由日期

如果用户给出 -d "now"-d "next monday"touch 会调用 date_relative 来解析。这里还包含一个微妙的优化:

c

复制代码
  else
    {
      if (flex_date)
        {
          struct timespec now = current_timespec ();   // 获取当前系统时间
          newtime[1] = newtime[0] = date_relative (flex_date, now);
          date_set = true;

          /* 针对 "-d now" 的特殊处理:
             如果 change_times 是同时修改两者,且解析出的时间恰好等于当前时间,
             则将 date_set 改回 false。这样在后续处理中,内核可能因权限不足而无法
             更新时间时,touch 仍可成功退出(因为实际上时间并没有变化)。 */
          if (change_times == (CH_ATIME | CH_MTIME)
              && newtime[0].tv_sec == now.tv_sec
              && newtime[0].tv_nsec == now.tv_nsec)
            {
              struct timespec notnow, notnow1;
              notnow.tv_sec = now.tv_sec ^ 1;
              notnow.tv_nsec = now.tv_nsec;
              notnow1 = date_relative (flex_date, notnow);
              if (notnow1.tv_sec == notnow.tv_sec
                  && notnow1.tv_nsec == notnow.tv_nsec)
                date_set = false;   // 确认为 "-d now",不是巧合
            }
        }
    }

1.4 古老的日期格式兼容

POSIX 标准在 200112 版本之前,允许使用 MMDDhhmm[YY] 这种简洁格式。coreutils 在保持兼容的同时会发出警告:

c

复制代码
  if (!date_set && 2 <= argc - optind && posix2_version () < 200112
      && posixtime (&newtime[0].tv_sec, argv[optind],
                    PDS_TRAILING_YEAR | PDS_PRE_2000))
    {
      // ... 使用 argv[optind] 作为日期,并移动 optind
      if (! getenv ("POSIXLY_CORRECT"))
        error (0, 0, _("warning: 'touch %s' is obsolete; ..."), ...);
      optind++;
    }

1.5 最终时间戳准备

c

复制代码
  if (!date_set)
    {
      // 如果同时修改 atime 和 mtime,则使用特殊标志 amtime_now
      if (change_times == (CH_ATIME | CH_MTIME))
        amtime_now = true;
      else
        // 只修改其中一种时间,另一种使用 UTIME_OMIT(保持不变)
        newtime[1].tv_nsec = newtime[0].tv_nsec = UTIME_NOW;
    }

UTIME_NOWUTIME_OMIT 是 Linux 定义在 <fcntl.h> 中的特殊值:

  • UTIME_NOW:内核会将对应时间设为当前时间。

  • UTIME_OMIT:内核保持该时间不变。

1.6 核心文件处理函数 touch

main 函数最后遍历所有剩余参数(文件路径),对每个文件调用 touch 函数:

c

复制代码
  for (; optind < argc; ++optind)
    ok &= touch (argv[optind]);

  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

touch 函数的实现如下。它负责执行 open(可能带 O_CREAT)和 utimensat 系统调用。

c

复制代码
static bool touch (char const *file)
{
  int fd = -1;
  int open_errno = 0;
  struct timespec const *t = newtime;

  // 特殊文件 "-" 代表标准输出
  if (streq (file, "-"))
    fd = STDOUT_FILENO;
  else if (! (no_create || no_dereference))
    {
      // 尝试打开文件,如果不存在则创建,权限为 0666 & ~umask
      // fd_reopen 试图将打开的文件描述符重定向到 STDIN_FILENO(0)
      fd = fd_reopen (STDIN_FILENO, file,
                      O_WRONLY | O_CREAT | O_NONBLOCK | O_NOCTTY, MODE_RW_UGO);
      if (fd < 0)
        open_errno = errno;
    }

  // 如果只修改一种时间,则另一种时间标记为 UTIME_OMIT
  if (change_times != (CH_ATIME | CH_MTIME))
    {
      if (change_times == CH_MTIME)
        newtime[0].tv_nsec = UTIME_OMIT;   // 忽略 atime
      else
        {
          affirm (change_times == CH_ATIME);
          newtime[1].tv_nsec = UTIME_OMIT; // 忽略 mtime
        }
    }

  // 如果 amtime_now 为 true,向内核传递 NULL 指针
  // 内核会理解成“使用当前时间,且如果文件不可写仍可成功”
  if (amtime_now)
    t = NULL;

  char const *file_opt = fd == STDOUT_FILENO ? NULL : file;
  int atflag = no_dereference ? AT_SYMLINK_NOFOLLOW : 0;
  // 核心系统调用:fdutimensat -> utimensat
  int utime_errno = (fdutimensat (fd, AT_FDCWD, file_opt, t, atflag) == 0
                     ? 0 : errno);

  // 清理临时文件描述符
  if (fd == STDIN_FILENO)
    {
      if (close (STDIN_FILENO) != 0)
        {
          error (0, errno, _("failed to close %s"), quoteaf (file));
          return false;
        }
    }
  else if (fd == STDOUT_FILENO)
    {
      // 特殊场景:标准输出已关闭,EBADF 时若 no_create 则静默成功
      if (utime_errno == EBADF && no_create)
        return true;
    }

  if (utime_errno != 0)
    {
      struct stat st;
      // 如果 open 失败是由于文件是目录(EISDIR)等,则尝试用 stat 判断
      if (open_errno
          && ! (open_errno == EISDIR
                || ((open_errno == EINVAL || open_errno == EEXIST)
                    && stat (file, &st) == 0 && S_ISDIR (st.st_mode))))
        {
          error (0, open_errno, _("cannot touch %s"), quoteaf (file));
        }
      else
        {
          if (no_create && utime_errno == ENOENT)
            return true;   // -c 且文件不存在,静默成功
          error (0, utime_errno, _("setting times of %s"), quoteaf (file));
        }
      return false;
    }

  return true;
}

fd_reopen 的实现相当经典:它打开一个文件,然后通过 dup2 将其复制到期望的描述符号上,再关闭原描述符。

c

复制代码
int fd_reopen (int desired_fd, char const *file, int flags, mode_t mode)
{
  int fd = open (file, flags, mode);
  if (fd == desired_fd || fd < 0)
    return fd;
  else
    {
      int fd2 = dup2 (fd, desired_fd);
      int saved_errno = errno;
      close (fd);
      errno = saved_errno;
      return fd2;
    }
}

至此,用户空间部分结束。接下来我们将跟随 openutimensat 进入内核。


第二部分:内核入口 ------ 系统调用的桥梁

2.1 从 glibc 到 openat

在 Linux 上,open 是 C 库函数,最终都会转化为 openat 系统调用,并传入 AT_FDCWD 表示相对于当前目录。

c

复制代码
// glibc 中的实现(简化)
int __libc_open (const char *file, int oflag, ...)
{
  int mode = 0;
  if (__OPEN_NEEDS_MODE (oflag))
    {
      va_list arg;
      va_start (arg, oflag);
      mode = va_arg (arg, int);
      va_end (arg);
    }
  return SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag, mode);
}
weak_alias (__libc_open, open)

2.2 内核系统调用 openat

内核源码 fs/open.c 中定义了 openat 的系统调用入口:

c

复制代码
SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
		umode_t, mode)
{
	// 如果内核配置了强制大文件,自动添加 O_LARGEFILE
	if (force_o_largefile())
		flags |= O_LARGEFILE;
	return do_sys_open(dfd, filename, flags, mode);
}

do_sys_open 只是简单地将参数打包成 struct open_how,然后调用 do_sys_openat2

c

复制代码
long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_how how = build_open_how(flags, mode);
	return do_sys_openat2(dfd, filename, &how);
}

2.3 do_sys_openat2 ------ 分配 fd 并执行打开

这是文件打开的核心内核入口:

c

复制代码
static long do_sys_openat2(int dfd, const char __user *filename,
			   struct open_how *how)
{
	struct open_flags op;
	int fd = build_open_flags(how, &op);   // 将用户标志转为内核标志,并检查合法性
	struct filename *tmp;

	if (fd)
		return fd;

	tmp = getname(filename);               // 从用户空间安全复制路径名
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	fd = get_unused_fd_flags(how->flags);  // 在当前进程的文件描述符表中分配一个空位
	if (fd >= 0) {
		struct file *f = do_filp_open(dfd, tmp, &op); // 真正打开文件
		if (IS_ERR(f)) {
			put_unused_fd(fd);
			fd = PTR_ERR(f);
		} else {
			fd_install(fd, f);       // 将 fd 与 struct file 绑定
		}
	}
	putname(tmp);
	return fd;
}

关键点

  • build_open_flags 会检查标志组合是否合法(例如 O_CREAT 必须提供 mode)。

  • getname 会分配内核内存并拷贝路径字符串,还会处理 AT_EMPTY_PATH 等特殊情况。

  • get_unused_fd_flags 遍历 current->files->fdt,找到第一个空闲位。

  • fd_installstruct file 指针写入文件描述符表,之后用户态就可以通过整数 fd 访问该文件。


第三部分:VFS 层 ------ 路径查找与 dentry

3.1 do_filp_openpath_openat

do_filp_open 是 VFS 打开文件的入口,它初始化 struct nameidata 并调用 path_openat

c

复制代码
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	struct nameidata nd;
	int flags = op->lookup_flags;
	struct file *filp;

	set_nameidata(&nd, dfd, pathname, NULL);          // 初始化 nameidata
	filp = path_openat(&nd, op, flags | LOOKUP_RCU); // 优先使用 RCU 无锁查找
	if (unlikely(filp == ERR_PTR(-ECHILD)))           // RCU 失败(如需要阻塞)
		filp = path_openat(&nd, op, flags);        // 回退到引用计数模式
	if (unlikely(filp == ERR_PTR(-ESTALE)))           // 缓存失效,强制重新验证
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
	restore_nameidata();
	return filp;
}

path_openat 负责分配 struct file,然后根据不同的打开类型(普通打开、O_PATH、O_TMPFILE)走不同分支。

c

复制代码
static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
	struct file *file;
	int error;

	file = alloc_empty_file(op->open_flag, current_cred());
	if (IS_ERR(file))
		return file;

	if (unlikely(file->f_flags & __O_TMPFILE)) {
		error = do_tmpfile(nd, flags, op, file);
	} else if (unlikely(file->f_flags & O_PATH)) {
		error = do_o_path(nd, flags, file);
	} else {
		const char *s = path_init(nd, flags);          // 初始化路径查找
		// 循环解析路径分量,直到处理完最后一部分
		while (!(error = link_path_walk(s, nd)) &&
		       (s = open_last_lookups(nd, file, op)) != NULL)
			;
		if (!error)
			error = do_open(nd, file, op);         // 完成打开
		terminate_walk(nd);
	}
	// 错误处理和返回 ...
}

3.2 解析最后分量:open_last_lookups

当路径只剩最后一个分量(文件名)时,open_last_lookups 决定是直接查找已有 dentry,还是进入创建流程。

c

复制代码
static const char *open_last_lookups(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool got_write = false;
	struct dentry *dentry;
	const char *res;

	nd->flags |= op->intent;   // 设置查找意图,如 LOOKUP_OPEN, LOOKUP_CREATE

	// 处理 "." 和 ".."
	if (nd->last_type != LAST_NORM) {
		if (nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}

	// 如果没有 O_CREAT,尝试快速查找(可能在 dcache 中命中)
	if (!(open_flag & O_CREAT)) {
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		dentry = lookup_fast(nd);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
		if (likely(dentry))
			goto finish_lookup;
		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
			return ERR_PTR(-ECHILD);
	} else {
		// O_CREAT 分支:需要确保不在 RCU 模式,且不能有尾部斜杠
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
		if (unlikely(nd->last.name[nd->last.len]))
			return ERR_PTR(-EISDIR);
	}

	// 如果需要写操作(创建、截断、写打开),先获取挂载点的写权限
	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		got_write = !mnt_want_write(nd->path.mnt);
		// 注意:即使 mnt_want_write 失败,这里也不立即失败,
		// 因为文件可能已存在,不需要写操作。lookup_open 会处理。
	}

	// 根据 O_CREAT 决定锁类型
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);

	dentry = lookup_open(nd, file, op, got_write);   // 核心:查找或创建

	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
		fsnotify_create(dir->d_inode, dentry);

	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);

	if (got_write)
		mnt_drop_write(nd->path.mnt);

	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
		return NULL;
	}

finish_lookup:
	if (nd->depth)
		put_link(nd);
	res = step_into(nd, WALK_TRAILING, dentry);
	if (unlikely(res))
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
	return res;
}

3.3 lookup_open ------ 真正的查找与创建

lookup_open 负责在目录的 dcache 中查找 dentry,如果不存在且允许创建,则调用文件系统的 create 方法。

c

复制代码
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
{
	struct mnt_idmap *idmap;
	struct dentry *dir = nd->path.dentry;
	struct inode *dir_inode = dir->d_inode;
	int open_flag = op->open_flag;
	struct dentry *dentry;
	int error, create_error = 0;
	umode_t mode = op->mode;

	if (unlikely(IS_DEADDIR(dir_inode)))
		return ERR_PTR(-ENOENT);

	file->f_mode &= ~FMODE_CREATED;
	dentry = d_lookup(dir, &nd->last);   // 在 dcache 中查找
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return dentry;
		}
		if (d_in_lookup(dentry))
			break;

		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}

	// 如果找到的 dentry 已经有 inode(positive dentry),直接返回
	if (dentry->d_inode)
		return dentry;

	// 以下为 negative dentry(文件不存在)的处理
	idmap = mnt_idmap(nd->path.mnt);
	if (open_flag & O_CREAT) {
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;
		mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
		if (likely(got_write))
			create_error = may_o_create(idmap, &nd->path, dentry, mode);
		else
			create_error = -EROFS;
	}
	if (create_error)
		open_flag &= ~O_CREAT;

	// 如果文件系统实现了 atomic_open,优先使用(可以合并 lookup+create)
	if (dir_inode->i_op->atomic_open) {
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
	}

	// 没有 atomic_open,需要手动调用 lookup 填充
	if (d_in_lookup(dentry)) {
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry, nd->flags);
		d_lookup_done(dentry);
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
	}

	// 如果仍是 negative 且设置了 O_CREAT,则创建文件
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
		file->f_mode |= FMODE_CREATED;
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
			goto out_dput;
		}
		// 调用具体文件系统的 create 方法(对于 ext4 是 ext4_create)
		error = dir_inode->i_op->create(idmap, dir_inode, dentry,
						mode, open_flag & O_EXCL);
		if (error)
			goto out_dput;
	}
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
	}
	return dentry;

out_dput:
	dput(dentry);
	return ERR_PTR(error);
}

要点

  • d_lookup 在目录的哈希表中搜索。如果未命中,会分配一个新的 dentry(d_alloc_parallel)并进入并行查找状态。

  • d_revalidate 允许文件系统检查 dentry 是否仍然有效(例如网络文件系统)。

  • open_flag 包含 O_CREAT 且 dentry 为 negative 时,最终调用父目录 inode 操作中的 create 方法。


第四部分:走进 ext4 ------ 磁盘 inode 的诞生

4.1 ext4 目录操作表

ext4 文件系统在 fs/ext4/namei.c 中定义了其目录的 inode 操作:

c

复制代码
const struct inode_operations ext4_dir_inode_operations = {
	.create		= ext4_create,
	.lookup		= ext4_lookup,
	.link		= ext4_link,
	.unlink		= ext4_unlink,
	.symlink	= ext4_symlink,
	.mkdir		= ext4_mkdir,
	.rmdir		= ext4_rmdir,
	.mknod		= ext4_mknod,
	.tmpfile	= ext4_tmpfile,
	.rename		= ext4_rename2,
	.setattr	= ext4_setattr,
	.getattr	= ext4_getattr,
	.listxattr	= ext4_listxattr,
	.get_inode_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,
	.fiemap         = ext4_fiemap,
	.fileattr_get	= ext4_fileattr_get,
	.fileattr_set	= ext4_fileattr_set,
};

当 VFS 调用 create 时,实际执行的是 ext4_create

4.2 ext4_create ------ 日志事务的开始

c

复制代码
static int ext4_create(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode, bool excl)
{
	handle_t *handle;
	struct inode *inode;
	int err, credits, retries = 0;

	err = dquot_initialize(dir);
	if (err)
		return err;

	// 计算创建文件所需的日志块数(预留空间)
	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
	// 分配新的 inode 并启动日志事务
	inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name,
					    0, NULL, EXT4_HT_DIR, credits);
	handle = ext4_journal_current_handle();
	err = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
		// 设置 inode 的操作函数为普通文件操作(而非目录)
		inode->i_op = &ext4_file_inode_operations;
		inode->i_fop = &ext4_file_operations;
		ext4_set_aops(inode);
		// 将新 inode 添加到目录项
		err = ext4_add_nondir(handle, dentry, &inode);
		if (!err)
			ext4_fc_track_create(handle, dentry);  // 快速提交跟踪
	}
	if (handle)
		ext4_journal_stop(handle);
	if (!IS_ERR_OR_NULL(inode))
		iput(inode);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

ext4_new_inode_start_handle 是一个宏,展开后调用 __ext4_new_inode ------ 这是 ext4 分配新 inode 的核心函数。

4.3 __ext4_new_inode ------ 寻找空闲的 inode 位

c

复制代码
struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
			       handle_t *handle, struct inode *dir,
			       umode_t mode, const struct qstr *qstr,
			       __u32 goal, uid_t *owner, __u32 i_flags,
			       int handle_type, unsigned int line_no,
			       int nblocks)
{
	struct super_block *sb;
	struct buffer_head *inode_bitmap_bh = NULL;
	ext4_group_t ngroups, group = 0;
	unsigned long ino = 0;
	struct inode *inode;
	struct ext4_group_desc *gdp = NULL;
	struct ext4_sb_info *sbi;
	int err;

	sb = dir->i_sb;
	sbi = EXT4_SB(sb);

	// 分配 VFS inode 对象
	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);

	// 设置文件属主和权限(继承父目录或使用进程的 uid/gid)
	if (owner) {
		inode->i_mode = mode;
		i_uid_write(inode, owner[0]);
		i_gid_write(inode, owner[1]);
	} else if (test_opt(sb, GRPID)) {
		inode->i_mode = mode;
		inode_fsuid_set(inode, idmap);
		inode->i_gid = dir->i_gid;
	} else
		inode_init_owner(idmap, inode, dir, mode);

	// 选择分配组:目录使用 Orlov 算法,普通文件使用父目录所在组
	if (S_ISDIR(mode))
		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
	else
		ret2 = find_group_other(sb, dir, &group, mode);
	if (ret2 == -1) {
		err = -ENOSPC;
		goto out;
	}

	ngroups = ext4_get_groups_count(sb);
	// 遍历所有块组,寻找空闲 inode
	for (i = 0; i < ngroups; i++, ino = 0) {
		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
		if (!gdp) {
			err = -EIO;
			goto out;
		}
		if (ext4_free_inodes_count(sb, gdp) == 0)
			goto next_group;

		// 读取该组的 inode 位图
		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
		if (IS_ERR(inode_bitmap_bh)) {
			inode_bitmap_bh = NULL;
			goto next_group;
		}

repeat_in_this_group:
		// 在位图中找到一个空闲位
		ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
		if (!ret2)
			goto next_group;

		// 保留 inode 保护(0-10 等)
		if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
			ext4_error(sb, "reserved inode found cleared");
			ext4_mark_group_bitmap_corrupted(sb, group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
			goto next_group;
		}

		// 获取位图块的写访问(日志)
		err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
						    EXT4_JTR_NONE);
		if (err)
			goto out;

		ext4_lock_group(sb, group);
		// 原子地测试并设置位
		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
		if (ret2) {
			// 如果位已被其他进程设置,重新查找
			ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
			if (ret2) {
				ext4_set_bit(ino, inode_bitmap_bh->b_data);
				ret2 = 0;
			} else {
				ret2 = 1;
			}
		}
		ext4_unlock_group(sb, group);
		ino++;   // inode 编号从 1 开始,位图是从 0 开始计数的
		if (!ret2)
			goto got;   // 成功获取 inode

		if (ino < EXT4_INODES_PER_GROUP(sb))
			goto repeat_in_this_group;

next_group:
		if (++group == ngroups)
			group = 0;
	}
	err = -ENOSPC;
	goto out;

got:
	// 标记位图块为脏,将写入日志
	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
	if (err)
		goto out;

	// 更新块组描述符中的空闲 inode 计数和已用目录计数
	ext4_lock_group(sb, group);
	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
	if (S_ISDIR(mode)) {
		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
		// 更新 flex_bg 统计
		if (sbi->s_log_groups_per_flex) {
			ext4_group_t f = ext4_flex_group(sbi, group);
			atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups, f)->used_dirs);
		}
	}
	ext4_group_desc_csum_set(sb, group, gdp);
	ext4_unlock_group(sb, group);

	// 设置 inode 号(全局唯一)
	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
	inode->i_blocks = 0;
	simple_inode_init_ts(inode);                // 初始化 atime/mtime/ctime
	EXT4_I(inode)->i_crtime = inode_get_mtime(inode); // 创建时间

	// 继承父目录的文件系统标志(如 extents, inline data 等)
	ei->i_flags = ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
	ei->i_flags |= i_flags;
	ei->i_block_group = group;

	// 将 inode 插入 VFS inode 哈希表,防止重复
	if (insert_inode_locked(inode) < 0) {
		err = -EIO;
		ext4_error(sb, "doubly allocated inode");
		goto out;
	}
	inode->i_generation = get_random_u32();

	// 如果启用了 metadata checksum,计算校验和种子
	if (ext4_has_metadata_csum(sb)) {
		__u32 csum;
		__le32 inum = cpu_to_le32(inode->i_ino);
		__le32 gen = cpu_to_le32(inode->i_generation);
		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
	}

	ext4_set_inode_state(inode, EXT4_STATE_NEW);
	// 设置扩展属性、ACL、加密上下文等(省略部分代码)
	// ...

	// 将 inode 标记为脏,等待事务提交时写入磁盘
	err = ext4_mark_inode_dirty(handle, inode);
	if (err)
		goto fail_free_drop;

	brelse(inode_bitmap_bh);
	return inode;

out:
	iput(inode);
	brelse(inode_bitmap_bh);
	return ERR_PTR(err);
}

这段代码展示了 ext4 如何从位图中找到空闲 inode、更新块组描述符、初始化 VFS inode,最后将其标记为脏。脏 inode 会在日志事务提交时由 jbd2 线程写入磁盘。

4.4 添加目录项:ext4_add_nondir

有了 inode 之后,需要将它和文件名关联起来,即创建目录项。

c

复制代码
static int ext4_add_nondir(handle_t *handle,
		struct dentry *dentry, struct inode **inodep)
{
	struct inode *dir = d_inode(dentry->d_parent);
	struct inode *inode = *inodep;
	int err = ext4_add_entry(handle, dentry, inode);
	if (!err) {
		err = ext4_mark_inode_dirty(handle, inode);
		if (IS_DIRSYNC(dir))
			ext4_handle_sync(handle);
		d_instantiate_new(dentry, inode);   // 将 dentry 与 inode 绑定
		*inodep = NULL;   // 所有权转移,调用者不再需要 iput
		return err;
	}
	// 失败清理
	drop_nlink(inode);
	ext4_mark_inode_dirty(handle, inode);
	ext4_orphan_add(handle, inode);
	unlock_new_inode(inode);
	return err;
}

ext4_add_entry 负责在父目录的数据块中添加一个 ext4_dir_entry_2 结构,包含文件名、inode 号和文件类型。如果目录开启了 htree 索引,还会更新索引树。

d_instantiate_new 将新建的 dentry 与 inode 关联,并将 dentry 状态从 negative 转为 positive,同时将其插入 dcache 哈希表,以便后续路径查找能够快速命中。


第五部分:完成打开与时间戳更新

5.1 do_openvfs_open

回到 path_openat,当 open_last_lookups 返回后,会调用 do_open 完成最后的打开操作。

c

复制代码
static int do_open(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	int open_flag = op->open_flag;
	bool do_truncate;
	int acc_mode;
	int error;

	// 确保路径解析完成(例如跟随符号链接)
	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
		error = complete_walk(nd);
		if (error)
			return error;
	}
	if (!(file->f_mode & FMODE_CREATED))
		audit_inode(nd->name, nd->path.dentry, 0);

	// O_EXCL 与已存在文件的冲突检查
	if (open_flag & O_CREAT) {
		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
			return -EEXIST;
		if (d_is_dir(nd->path.dentry))
			return -EISDIR;
		error = may_create_in_sticky(idmap, nd, d_backing_inode(nd->path.dentry));
		if (unlikely(error))
			return error;
	}
	// ... 

	do_truncate = false;
	acc_mode = op->acc_mode;
	if (file->f_mode & FMODE_CREATED) {
		// 新创建的文件,不需要截断,也不需要检查写权限
		open_flag &= ~O_TRUNC;
		acc_mode = 0;
	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
			return error;
		do_truncate = true;
	}
	error = may_open(idmap, &nd->path, acc_mode, open_flag);
	if (!error && !(file->f_mode & FMODE_OPENED))
		error = vfs_open(&nd->path, file);
	if (!error && do_truncate)
		error = handle_truncate(idmap, file);
	if (do_truncate)
		mnt_drop_write(nd->path.mnt);
	return error;
}

vfs_open 调用 do_dentry_open,后者会调用文件系统的 open 方法(对于 ext4 是 ext4_file_open,通常只是做一些检查),并设置 file->f_op,最后标记 FMODE_OPENED

5.2 时间戳更新 ------ utimensat 简述

touch 函数中,创建文件后(或文件已存在时),会调用 fdutimensat -> utimensat。虽然本文没有提供 utimensat 的内核源码,但我们可以概括其流程:

  1. 通过 fd 或路径找到对应的 struct path

  2. 调用 vfs_utimes,它会检查调用者是否有权限修改时间戳(拥有文件或具有 CAP_FOWNER)。

  3. 对于常规文件,最终调用 inode->i_op->update_time 或直接调用 generic_update_time,将 timespec 写入 inode->i_atimei_mtimei_ctime

  4. 标记 inode 为脏,由文件系统负责写入磁盘。

touch 传递的 t 参数为 NULL 时,内核会将其视为 UTIME_NOW 同时作用于 atime 和 mtime。但有一个重要的语义差异:如果文件不可写(只读),NULL 仍然允许调用成功,而显式传递 UTIME_NOW 则可能失败(取决于具体实现)。这正是 touchamtime_now 优化的原因。


第六部分:从磁盘到缓存 ------ 整个系统的协作

通过以上分析,我们已经完整走通了 touch 命令从用户输入到磁盘 inode 分配的全过程。为了更直观地理解,下面用文字概括整个链条:

  1. 用户态 coreutils main :解析选项,确定时间源,构造 newtime 数组。

  2. 对每个文件调用 touch :根据需要先 open(可能带 O_CREAT),然后 utimensat

  3. glibc 系统调用封装open -> openatutimensat 直接进入内核。

  4. 内核系统调用入口SYSCALL_DEFINE4(openat, ...) -> do_sys_open -> do_sys_openat2

  5. VFS 层 do_filp_open :初始化 nameidata,调用 path_openat

  6. 路径解析path_initlink_path_walk 逐级解析目录,open_last_lookups 处理最后一个分量。

  7. lookup_open :在 dcache 中查找 dentry;如果不存在且需要创建,则调用文件系统的 create

  8. ext4 文件系统ext4_create 启动事务,__ext4_new_inode 选择块组、分配 inode 位、初始化 VFS inode,ext4_add_nondir 添加目录项。

  9. 返回 VFSdo_open 完成权限检查和打开,vfs_openstruct file 与 inode 关联。

  10. 返回用户态fd_install 将文件描述符与 struct file 关联,用户态获得 fd。

  11. utimensat 调用:类似地经过 VFS,最终更新 inode 的时间字段。

  12. 日志提交:ext4 的日志机制(jbd2)在事务提交时将位图、inode 表、目录块写入磁盘。

所有这一切,都是在几十毫秒内完成的,而用户只感受到了一个空文件的诞生。


结语

touch 命令看似简单,但其背后涉及了操作系统最核心的概念:系统调用、VFS、dentry 缓存、inode 管理、文件系统日志、磁盘块分配等。理解这一过程,不仅有助于更好地使用命令行,更能为深入理解 Linux 内核打下坚实的基础。

希望这篇博客能够帮助你在脑海中建立起一个从用户空间到物理磁盘的完整模型。下一次当你随手敲下 touch newfile.txt 时,也许会想起那趟穿越 kernel 的漫长旅程。

##源码

cpp 复制代码
int
main (int argc, char **argv)
{
  int c;
  bool date_set = false;
  bool ok = true;
  char const *flex_date = NULL;

  initialize_main (&argc, &argv);
  set_program_name (argv[0]);
  setlocale (LC_ALL, "");
  bindtextdomain (PACKAGE, LOCALEDIR);
  textdomain (PACKAGE);

  atexit (close_stdout);

  change_times = 0;
  no_create = use_ref = false;

  while ((c = getopt_long (argc, argv, "acd:fhmr:t:", longopts, NULL)) != -1)
    {
      switch (c)
        {
        case 'a':
          change_times |= CH_ATIME;
          break;

        case 'c':
          no_create = true;
          break;

        case 'd':
          flex_date = optarg;
          break;

        case 'f':
          break;

        case 'h':
          no_dereference = true;
          break;

        case 'm':
          change_times |= CH_MTIME;
          break;

        case 'r':
          use_ref = true;
          ref_file = optarg;
          break;

        case 't':
          if (! posixtime (&newtime[0].tv_sec, optarg,
                           PDS_LEADING_YEAR | PDS_CENTURY | PDS_SECONDS))
            error (EXIT_FAILURE, 0, _("invalid date format %s"),
                   quote (optarg));
          newtime[0].tv_nsec = 0;
          newtime[1] = newtime[0];
          date_set = true;
          break;

        case TIME_OPTION:	/* --time */
          change_times |= XARGMATCH ("--time", optarg,
                                     time_args, time_masks);
          break;

        case_GETOPT_HELP_CHAR;

        case_GETOPT_VERSION_CHAR (PROGRAM_NAME, AUTHORS);

        default:
          usage (EXIT_FAILURE);
        }
    }

  if (change_times == 0)
    change_times = CH_ATIME | CH_MTIME;

  if (date_set && (use_ref || flex_date))
    {
      error (0, 0, _("cannot specify times from more than one source"));
      usage (EXIT_FAILURE);
    }

  if (use_ref)
    {
      struct stat ref_stats;
      /* Don't use (no_dereference?lstat:stat) (args), since stat
         might be an object-like macro.  */
      if (no_dereference ? lstat (ref_file, &ref_stats)
          : stat (ref_file, &ref_stats))
        error (EXIT_FAILURE, errno,
               _("failed to get attributes of %s"), quoteaf (ref_file));
      newtime[0] = get_stat_atime (&ref_stats);
      newtime[1] = get_stat_mtime (&ref_stats);
      date_set = true;
      if (flex_date)
        {
          if (change_times & CH_ATIME)
            newtime[0] = date_relative (flex_date, newtime[0]);
          if (change_times & CH_MTIME)
            newtime[1] = date_relative (flex_date, newtime[1]);
        }
    }
  else
    {
      if (flex_date)
        {
          struct timespec now = current_timespec ();
          newtime[1] = newtime[0] = date_relative (flex_date, now);
          date_set = true;

          /* If neither -a nor -m is specified, treat "-d now" as if
             it were absent; this lets "touch" succeed more often in
             the presence of restrictive permissions.  */
          if (change_times == (CH_ATIME | CH_MTIME)
              && newtime[0].tv_sec == now.tv_sec
              && newtime[0].tv_nsec == now.tv_nsec)
            {
              /* Check that it really was "-d now", and not a timestamp
                 that just happens to be the current time.  */
              struct timespec notnow, notnow1;
              notnow.tv_sec = now.tv_sec ^ 1;
              notnow.tv_nsec = now.tv_nsec;
              notnow1 = date_relative (flex_date, notnow);
              if (notnow1.tv_sec == notnow.tv_sec
                  && notnow1.tv_nsec == notnow.tv_nsec)
                date_set = false;
            }
        }
    }

  /* The obsolete 'MMDDhhmm[YY]' form is valid IFF there are
     two or more non-option arguments.  */
  if (!date_set && 2 <= argc - optind && posix2_version () < 200112
      && posixtime (&newtime[0].tv_sec, argv[optind],
                    PDS_TRAILING_YEAR | PDS_PRE_2000))
    {
      newtime[0].tv_nsec = 0;
      newtime[1] = newtime[0];
      date_set = true;

      if (! getenv ("POSIXLY_CORRECT"))
        {
          struct tm const *tm = localtime (&newtime[0].tv_sec);

          /* Technically, it appears that even a deliberate attempt to cause
             the above localtime to return NULL will always fail because our
             posixtime implementation rejects all dates for which localtime
             would fail.  However, skip the warning if it ever fails.  */
          if (tm)
            error (0, 0,
                   _("warning: 'touch %s' is obsolete; use "
                     "'touch -t %04ld%02d%02d%02d%02d.%02d'"),
                   argv[optind],
                   tm->tm_year + 1900L, tm->tm_mon + 1, tm->tm_mday,
                   tm->tm_hour, tm->tm_min, tm->tm_sec);
        }

      optind++;
    }

  if (!date_set)
    {
      if (change_times == (CH_ATIME | CH_MTIME))
        amtime_now = true;
      else
        newtime[1].tv_nsec = newtime[0].tv_nsec = UTIME_NOW;
    }

  if (optind == argc)
    {
      error (0, 0, _("missing file operand"));
      usage (EXIT_FAILURE);
    }

  for (; optind < argc; ++optind)
    ok &= touch (argv[optind]);

  return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}

static bool
touch (char const *file)
{
  int fd = -1;
  int open_errno = 0;
  struct timespec const *t = newtime;

  if (streq (file, "-"))
    fd = STDOUT_FILENO;
  else if (! (no_create || no_dereference))
    {
      /* Try to open FILE, creating it if necessary.  */
      fd = fd_reopen (STDIN_FILENO, file,
                      O_WRONLY | O_CREAT | O_NONBLOCK | O_NOCTTY, MODE_RW_UGO);
      if (fd < 0)
        open_errno = errno;
    }

  if (change_times != (CH_ATIME | CH_MTIME))
    {
      /* We're setting only one of the time values.  */
      if (change_times == CH_MTIME)
        newtime[0].tv_nsec = UTIME_OMIT;
      else
        {
          affirm (change_times == CH_ATIME);
          newtime[1].tv_nsec = UTIME_OMIT;
        }
    }

  if (amtime_now)
    {
      /* Pass NULL to futimens so it will not fail if we have
         write access to the file, but don't own it.  */
      t = NULL;
    }

  char const *file_opt = fd == STDOUT_FILENO ? NULL : file;
  int atflag = no_dereference ? AT_SYMLINK_NOFOLLOW : 0;
  int utime_errno = (fdutimensat (fd, AT_FDCWD, file_opt, t, atflag) == 0
                     ? 0 : errno);

  if (fd == STDIN_FILENO)
    {
      if (close (STDIN_FILENO) != 0)
        {
          error (0, errno, _("failed to close %s"), quoteaf (file));
          return false;
        }
    }
  else if (fd == STDOUT_FILENO)
    {
      /* Do not diagnose "touch -c - >&-".  */
      if (utime_errno == EBADF && no_create)
        return true;
    }

  if (utime_errno != 0)
    {
      /* Don't diagnose with open_errno if FILE is a directory, as that
         would give a bogus diagnostic for e.g., 'touch /' (assuming we
         don't own / or have write access).  On Solaris 10 and probably
         other systems, opening a directory like "." fails with EINVAL.
         (On SunOS 4 it was EPERM but that's obsolete.)  On macOS 26
         opening "/" fails with EEXIST.  */
      struct stat st;
      if (open_errno
          && ! (open_errno == EISDIR
                || ((open_errno == EINVAL || open_errno == EEXIST)
                    && stat (file, &st) == 0 && S_ISDIR (st.st_mode))))
        {
          /* The wording of this diagnostic should cover at least two cases:
             - the file does not exist, but the parent directory is unwritable
             - the file exists, but it isn't writable
             I think it's not worth trying to distinguish them.  */
          error (0, open_errno, _("cannot touch %s"), quoteaf (file));
        }
      else
        {
          if (no_create && utime_errno == ENOENT)
            return true;
          error (0, utime_errno, _("setting times of %s"), quoteaf (file));
        }
      return false;
    }

  return true;
}

/* Open a file to a particular file descriptor.  This is like standard
   'open', except it always returns DESIRED_FD if successful.  */

int
fd_reopen (int desired_fd, char const *file, int flags, mode_t mode)
{
  int fd = open (file, flags, mode);

  if (fd == desired_fd || fd < 0)
    return fd;
  else
    {
      int fd2 = dup2 (fd, desired_fd);
      int saved_errno = errno;
      close (fd);
      errno = saved_errno;
      return fd2;
    }
}

257	common	openat			sys_openat

#ifndef __OFF_T_MATCHES_OFF64_T

/* Open FILE with access OFLAG.  If O_CREAT or O_TMPFILE is in OFLAG,
   a third argument is the file protection.  */
int
__libc_open (const char *file, int oflag, ...)
{
  int mode = 0;

  if (__OPEN_NEEDS_MODE (oflag))
    {
      va_list arg;
      va_start (arg, oflag);
      mode = va_arg (arg, int);
      va_end (arg);
    }

  return SYSCALL_CANCEL (openat, AT_FDCWD, file, oflag, mode);
}
libc_hidden_def (__libc_open)

weak_alias (__libc_open, __open)
libc_hidden_weak (__open)
weak_alias (__libc_open, open)
#endif

SYSCALL_DEFINE4(openat, int, dfd, const char __user *, filename, int, flags,
		umode_t, mode)
{
	if (force_o_largefile())
		flags |= O_LARGEFILE;
	return do_sys_open(dfd, filename, flags, mode);
}

long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
	struct open_how how = build_open_how(flags, mode);
	return do_sys_openat2(dfd, filename, &how);
}

static long do_sys_openat2(int dfd, const char __user *filename,
			   struct open_how *how)
{
	struct open_flags op;
	int fd = build_open_flags(how, &op);
	struct filename *tmp;

	if (fd)
		return fd;

	tmp = getname(filename);
	if (IS_ERR(tmp))
		return PTR_ERR(tmp);

	fd = get_unused_fd_flags(how->flags);
	if (fd >= 0) {
		struct file *f = do_filp_open(dfd, tmp, &op);
		if (IS_ERR(f)) {
			put_unused_fd(fd);
			fd = PTR_ERR(f);
		} else {
			fd_install(fd, f);
		}
	}
	putname(tmp);
	return fd;
}

extern struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op);
		
struct file *do_filp_open(int dfd, struct filename *pathname,
		const struct open_flags *op)
{
	struct nameidata nd;
	int flags = op->lookup_flags;
	struct file *filp;

	set_nameidata(&nd, dfd, pathname, NULL);
	filp = path_openat(&nd, op, flags | LOOKUP_RCU);
	if (unlikely(filp == ERR_PTR(-ECHILD)))
		filp = path_openat(&nd, op, flags);
	if (unlikely(filp == ERR_PTR(-ESTALE)))
		filp = path_openat(&nd, op, flags | LOOKUP_REVAL);
	restore_nameidata();
	return filp;
}

static struct file *path_openat(struct nameidata *nd,
			const struct open_flags *op, unsigned flags)
{
	struct file *file;
	int error;

	file = alloc_empty_file(op->open_flag, current_cred());
	if (IS_ERR(file))
		return file;

	if (unlikely(file->f_flags & __O_TMPFILE)) {
		error = do_tmpfile(nd, flags, op, file);
	} else if (unlikely(file->f_flags & O_PATH)) {
		error = do_o_path(nd, flags, file);
	} else {
		const char *s = path_init(nd, flags);
		while (!(error = link_path_walk(s, nd)) &&
		       (s = open_last_lookups(nd, file, op)) != NULL)
			;
		if (!error)
			error = do_open(nd, file, op);
		terminate_walk(nd);
	}
	if (likely(!error)) {
		if (likely(file->f_mode & FMODE_OPENED))
			return file;
		WARN_ON(1);
		error = -EINVAL;
	}
	fput(file);
	if (error == -EOPENSTALE) {
		if (flags & LOOKUP_RCU)
			error = -ECHILD;
		else
			error = -ESTALE;
	}
	return ERR_PTR(error);
}

static const char *open_last_lookups(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	struct dentry *dir = nd->path.dentry;
	int open_flag = op->open_flag;
	bool got_write = false;
	struct dentry *dentry;
	const char *res;

	nd->flags |= op->intent;

	if (nd->last_type != LAST_NORM) {
		if (nd->depth)
			put_link(nd);
		return handle_dots(nd, nd->last_type);
	}

	if (!(open_flag & O_CREAT)) {
		if (nd->last.name[nd->last.len])
			nd->flags |= LOOKUP_FOLLOW | LOOKUP_DIRECTORY;
		/* we _can_ be in RCU mode here */
		dentry = lookup_fast(nd);
		if (IS_ERR(dentry))
			return ERR_CAST(dentry);
		if (likely(dentry))
			goto finish_lookup;

		if (WARN_ON_ONCE(nd->flags & LOOKUP_RCU))
			return ERR_PTR(-ECHILD);
	} else {
		/* create side of things */
		if (nd->flags & LOOKUP_RCU) {
			if (!try_to_unlazy(nd))
				return ERR_PTR(-ECHILD);
		}
		audit_inode(nd->name, dir, AUDIT_INODE_PARENT);
		/* trailing slashes? */
		if (unlikely(nd->last.name[nd->last.len]))
			return ERR_PTR(-EISDIR);
	}

	if (open_flag & (O_CREAT | O_TRUNC | O_WRONLY | O_RDWR)) {
		got_write = !mnt_want_write(nd->path.mnt);
		/*
		 * do _not_ fail yet - we might not need that or fail with
		 * a different error; let lookup_open() decide; we'll be
		 * dropping this one anyway.
		 */
	}
	if (open_flag & O_CREAT)
		inode_lock(dir->d_inode);
	else
		inode_lock_shared(dir->d_inode);
	dentry = lookup_open(nd, file, op, got_write);
	if (!IS_ERR(dentry) && (file->f_mode & FMODE_CREATED))
		fsnotify_create(dir->d_inode, dentry);
	if (open_flag & O_CREAT)
		inode_unlock(dir->d_inode);
	else
		inode_unlock_shared(dir->d_inode);

	if (got_write)
		mnt_drop_write(nd->path.mnt);

	if (IS_ERR(dentry))
		return ERR_CAST(dentry);

	if (file->f_mode & (FMODE_OPENED | FMODE_CREATED)) {
		dput(nd->path.dentry);
		nd->path.dentry = dentry;
		return NULL;
	}

finish_lookup:
	if (nd->depth)
		put_link(nd);
	res = step_into(nd, WALK_TRAILING, dentry);
	if (unlikely(res))
		nd->flags &= ~(LOOKUP_OPEN|LOOKUP_CREATE|LOOKUP_EXCL);
	return res;
}


/*
 * Look up and maybe create and open the last component.
 *
 * Must be called with parent locked (exclusive in O_CREAT case).
 *
 * Returns 0 on success, that is, if
 *  the file was successfully atomically created (if necessary) and opened, or
 *  the file was not completely opened at this time, though lookups and
 *  creations were performed.
 * These case are distinguished by presence of FMODE_OPENED on file->f_mode.
 * In the latter case dentry returned in @path might be negative if O_CREAT
 * hadn't been specified.
 *
 * An error code is returned on failure.
 */
static struct dentry *lookup_open(struct nameidata *nd, struct file *file,
				  const struct open_flags *op,
				  bool got_write)
{
	struct mnt_idmap *idmap;
	struct dentry *dir = nd->path.dentry;
	struct inode *dir_inode = dir->d_inode;
	int open_flag = op->open_flag;
	struct dentry *dentry;
	int error, create_error = 0;
	umode_t mode = op->mode;
	DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq);

	if (unlikely(IS_DEADDIR(dir_inode)))
		return ERR_PTR(-ENOENT);

	file->f_mode &= ~FMODE_CREATED;
	dentry = d_lookup(dir, &nd->last);
	for (;;) {
		if (!dentry) {
			dentry = d_alloc_parallel(dir, &nd->last, &wq);
			if (IS_ERR(dentry))
				return dentry;
		}
		if (d_in_lookup(dentry))
			break;

		error = d_revalidate(dentry, nd->flags);
		if (likely(error > 0))
			break;
		if (error)
			goto out_dput;
		d_invalidate(dentry);
		dput(dentry);
		dentry = NULL;
	}
	if (dentry->d_inode) {
		/* Cached positive dentry: will open in f_op->open */
		return dentry;
	}

	/*
	 * Checking write permission is tricky, bacuse we don't know if we are
	 * going to actually need it: O_CREAT opens should work as long as the
	 * file exists.  But checking existence breaks atomicity.  The trick is
	 * to check access and if not granted clear O_CREAT from the flags.
	 *
	 * Another problem is returing the "right" error value (e.g. for an
	 * O_EXCL open we want to return EEXIST not EROFS).
	 */
	if (unlikely(!got_write))
		open_flag &= ~O_TRUNC;
	idmap = mnt_idmap(nd->path.mnt);
	if (open_flag & O_CREAT) {
		if (open_flag & O_EXCL)
			open_flag &= ~O_TRUNC;
		mode = vfs_prepare_mode(idmap, dir->d_inode, mode, mode, mode);
		if (likely(got_write))
			create_error = may_o_create(idmap, &nd->path,
						    dentry, mode);
		else
			create_error = -EROFS;
	}
	if (create_error)
		open_flag &= ~O_CREAT;
	if (dir_inode->i_op->atomic_open) {
		dentry = atomic_open(nd, dentry, file, open_flag, mode);
		if (unlikely(create_error) && dentry == ERR_PTR(-ENOENT))
			dentry = ERR_PTR(create_error);
		return dentry;
	}

	if (d_in_lookup(dentry)) {
		struct dentry *res = dir_inode->i_op->lookup(dir_inode, dentry,
							     nd->flags);
		d_lookup_done(dentry);
		if (unlikely(res)) {
			if (IS_ERR(res)) {
				error = PTR_ERR(res);
				goto out_dput;
			}
			dput(dentry);
			dentry = res;
		}
	}

	/* Negative dentry, just create the file */
	if (!dentry->d_inode && (open_flag & O_CREAT)) {
		file->f_mode |= FMODE_CREATED;
		audit_inode_child(dir_inode, dentry, AUDIT_TYPE_CHILD_CREATE);
		if (!dir_inode->i_op->create) {
			error = -EACCES;
			goto out_dput;
		}

		error = dir_inode->i_op->create(idmap, dir_inode, dentry,
						mode, open_flag & O_EXCL);
		if (error)
			goto out_dput;
	}
	if (unlikely(create_error) && !dentry->d_inode) {
		error = create_error;
		goto out_dput;
	}
	return dentry;

out_dput:
	dput(dentry);
	return ERR_PTR(error);
}

/*
 * directories can handle most operations...
 */
const struct inode_operations ext4_dir_inode_operations = {
	.create		= ext4_create,
	.lookup		= ext4_lookup,
	.link		= ext4_link,
	.unlink		= ext4_unlink,
	.symlink	= ext4_symlink,
	.mkdir		= ext4_mkdir,
	.rmdir		= ext4_rmdir,
	.mknod		= ext4_mknod,
	.tmpfile	= ext4_tmpfile,
	.rename		= ext4_rename2,
	.setattr	= ext4_setattr,
	.getattr	= ext4_getattr,
	.listxattr	= ext4_listxattr,
	.get_inode_acl	= ext4_get_acl,
	.set_acl	= ext4_set_acl,
	.fiemap         = ext4_fiemap,
	.fileattr_get	= ext4_fileattr_get,
	.fileattr_set	= ext4_fileattr_set,
};


/*
 * By the time this is called, we already have created
 * the directory cache entry for the new file, but it
 * is so far negative - it has no inode.
 *
 * If the create succeeds, we fill in the inode information
 * with d_instantiate().
 */
static int ext4_create(struct mnt_idmap *idmap, struct inode *dir,
		       struct dentry *dentry, umode_t mode, bool excl)
{
	handle_t *handle;
	struct inode *inode;
	int err, credits, retries = 0;

	err = dquot_initialize(dir);
	if (err)
		return err;

	credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) +
		   EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3);
retry:
	inode = ext4_new_inode_start_handle(idmap, dir, mode, &dentry->d_name,
					    0, NULL, EXT4_HT_DIR, credits);
	handle = ext4_journal_current_handle();
	err = PTR_ERR(inode);
	if (!IS_ERR(inode)) {
		inode->i_op = &ext4_file_inode_operations;
		inode->i_fop = &ext4_file_operations;
		ext4_set_aops(inode);
		err = ext4_add_nondir(handle, dentry, &inode);
		if (!err)
			ext4_fc_track_create(handle, dentry);
	}
	if (handle)
		ext4_journal_stop(handle);
	if (!IS_ERR_OR_NULL(inode))
		iput(inode);
	if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries))
		goto retry;
	return err;
}

#define ext4_new_inode_start_handle(idmap, dir, mode, qstr, goal, owner, \
				    type, nblocks)		    \
	__ext4_new_inode((idmap), NULL, (dir), (mode), (qstr), (goal), (owner), \
			 0, (type), __LINE__, (nblocks))


/*
 * There are two policies for allocating an inode.  If the new inode is
 * a directory, then a forward search is made for a block group with both
 * free space and a low directory-to-inode ratio; if that fails, then of
 * the groups with above-average free space, that group with the fewest
 * directories already is chosen.
 *
 * For other inodes, search forward from the parent directory's block
 * group to find a free inode.
 */
struct inode *__ext4_new_inode(struct mnt_idmap *idmap,
			       handle_t *handle, struct inode *dir,
			       umode_t mode, const struct qstr *qstr,
			       __u32 goal, uid_t *owner, __u32 i_flags,
			       int handle_type, unsigned int line_no,
			       int nblocks)
{
	struct super_block *sb;
	struct buffer_head *inode_bitmap_bh = NULL;
	struct buffer_head *group_desc_bh;
	ext4_group_t ngroups, group = 0;
	unsigned long ino = 0;
	struct inode *inode;
	struct ext4_group_desc *gdp = NULL;
	struct ext4_inode_info *ei;
	struct ext4_sb_info *sbi;
	int ret2, err;
	struct inode *ret;
	ext4_group_t i;
	ext4_group_t flex_group;
	struct ext4_group_info *grp = NULL;
	bool encrypt = false;

	/* Cannot create files in a deleted directory */
	if (!dir || !dir->i_nlink)
		return ERR_PTR(-EPERM);

	sb = dir->i_sb;
	sbi = EXT4_SB(sb);

	if (unlikely(ext4_forced_shutdown(sb)))
		return ERR_PTR(-EIO);

	ngroups = ext4_get_groups_count(sb);
	trace_ext4_request_inode(dir, mode);
	inode = new_inode(sb);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	ei = EXT4_I(inode);

	/*
	 * Initialize owners and quota early so that we don't have to account
	 * for quota initialization worst case in standard inode creating
	 * transaction
	 */
	if (owner) {
		inode->i_mode = mode;
		i_uid_write(inode, owner[0]);
		i_gid_write(inode, owner[1]);
	} else if (test_opt(sb, GRPID)) {
		inode->i_mode = mode;
		inode_fsuid_set(inode, idmap);
		inode->i_gid = dir->i_gid;
	} else
		inode_init_owner(idmap, inode, dir, mode);

	if (ext4_has_feature_project(sb) &&
	    ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT))
		ei->i_projid = EXT4_I(dir)->i_projid;
	else
		ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID);

	if (!(i_flags & EXT4_EA_INODE_FL)) {
		err = fscrypt_prepare_new_inode(dir, inode, &encrypt);
		if (err)
			goto out;
	}

	err = dquot_initialize(inode);
	if (err)
		goto out;

	if (!handle && sbi->s_journal && !(i_flags & EXT4_EA_INODE_FL)) {
		ret2 = ext4_xattr_credits_for_new_inode(dir, mode, encrypt);
		if (ret2 < 0) {
			err = ret2;
			goto out;
		}
		nblocks += ret2;
	}

	if (!goal)
		goal = sbi->s_inode_goal;

	if (goal && goal <= le32_to_cpu(sbi->s_es->s_inodes_count)) {
		group = (goal - 1) / EXT4_INODES_PER_GROUP(sb);
		ino = (goal - 1) % EXT4_INODES_PER_GROUP(sb);
		ret2 = 0;
		goto got_group;
	}

	if (S_ISDIR(mode))
		ret2 = find_group_orlov(sb, dir, &group, mode, qstr);
	else
		ret2 = find_group_other(sb, dir, &group, mode);

got_group:
	EXT4_I(dir)->i_last_alloc_group = group;
	err = -ENOSPC;
	if (ret2 == -1)
		goto out;

	/*
	 * Normally we will only go through one pass of this loop,
	 * unless we get unlucky and it turns out the group we selected
	 * had its last inode grabbed by someone else.
	 */
	for (i = 0; i < ngroups; i++, ino = 0) {
		err = -EIO;

		gdp = ext4_get_group_desc(sb, group, &group_desc_bh);
		if (!gdp)
			goto out;

		/*
		 * Check free inodes count before loading bitmap.
		 */
		if (ext4_free_inodes_count(sb, gdp) == 0)
			goto next_group;

		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
			grp = ext4_get_group_info(sb, group);
			/*
			 * Skip groups with already-known suspicious inode
			 * tables
			 */
			if (!grp || EXT4_MB_GRP_IBITMAP_CORRUPT(grp))
				goto next_group;
		}

		brelse(inode_bitmap_bh);
		inode_bitmap_bh = ext4_read_inode_bitmap(sb, group);
		/* Skip groups with suspicious inode tables */
		if (((!(sbi->s_mount_state & EXT4_FC_REPLAY))
		     && EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) ||
		    IS_ERR(inode_bitmap_bh)) {
			inode_bitmap_bh = NULL;
			goto next_group;
		}

repeat_in_this_group:
		ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
		if (!ret2)
			goto next_group;

		if (group == 0 && (ino + 1) < EXT4_FIRST_INO(sb)) {
			ext4_error(sb, "reserved inode found cleared - "
				   "inode=%lu", ino + 1);
			ext4_mark_group_bitmap_corrupted(sb, group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
			goto next_group;
		}

		if ((!(sbi->s_mount_state & EXT4_FC_REPLAY)) && !handle) {
			BUG_ON(nblocks <= 0);
			handle = __ext4_journal_start_sb(NULL, dir->i_sb,
				 line_no, handle_type, nblocks, 0,
				 ext4_trans_default_revoke_credits(sb));
			if (IS_ERR(handle)) {
				err = PTR_ERR(handle);
				ext4_std_error(sb, err);
				goto out;
			}
		}
		BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
		err = ext4_journal_get_write_access(handle, sb, inode_bitmap_bh,
						    EXT4_JTR_NONE);
		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
		ext4_lock_group(sb, group);
		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
		if (ret2) {
			/* Someone already took the bit. Repeat the search
			 * with lock held.
			 */
			ret2 = find_inode_bit(sb, group, inode_bitmap_bh, &ino);
			if (ret2) {
				ext4_set_bit(ino, inode_bitmap_bh->b_data);
				ret2 = 0;
			} else {
				ret2 = 1; /* we didn't grab the inode */
			}
		}
		ext4_unlock_group(sb, group);
		ino++;		/* the inode bitmap is zero-based */
		if (!ret2)
			goto got; /* we grabbed the inode! */

		if (ino < EXT4_INODES_PER_GROUP(sb))
			goto repeat_in_this_group;
next_group:
		if (++group == ngroups)
			group = 0;
	}
	err = -ENOSPC;
	goto out;

got:
	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	BUFFER_TRACE(group_desc_bh, "get_write_access");
	err = ext4_journal_get_write_access(handle, sb, group_desc_bh,
					    EXT4_JTR_NONE);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	/* We may have to initialize the block bitmap if it isn't already */
	if (ext4_has_group_desc_csum(sb) &&
	    gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
		struct buffer_head *block_bitmap_bh;

		block_bitmap_bh = ext4_read_block_bitmap(sb, group);
		if (IS_ERR(block_bitmap_bh)) {
			err = PTR_ERR(block_bitmap_bh);
			goto out;
		}
		BUFFER_TRACE(block_bitmap_bh, "get block bitmap access");
		err = ext4_journal_get_write_access(handle, sb, block_bitmap_bh,
						    EXT4_JTR_NONE);
		if (err) {
			brelse(block_bitmap_bh);
			ext4_std_error(sb, err);
			goto out;
		}

		BUFFER_TRACE(block_bitmap_bh, "dirty block bitmap");
		err = ext4_handle_dirty_metadata(handle, NULL, block_bitmap_bh);

		/* recheck and clear flag under lock if we still need to */
		ext4_lock_group(sb, group);
		if (ext4_has_group_desc_csum(sb) &&
		    (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
			ext4_free_group_clusters_set(sb, gdp,
				ext4_free_clusters_after_init(sb, group, gdp));
			ext4_block_bitmap_csum_set(sb, gdp, block_bitmap_bh);
			ext4_group_desc_csum_set(sb, group, gdp);
		}
		ext4_unlock_group(sb, group);
		brelse(block_bitmap_bh);

		if (err) {
			ext4_std_error(sb, err);
			goto out;
		}
	}

	/* Update the relevant bg descriptor fields */
	if (ext4_has_group_desc_csum(sb)) {
		int free;
		struct ext4_group_info *grp = NULL;

		if (!(sbi->s_mount_state & EXT4_FC_REPLAY)) {
			grp = ext4_get_group_info(sb, group);
			if (!grp) {
				err = -EFSCORRUPTED;
				goto out;
			}
			down_read(&grp->alloc_sem); /*
						     * protect vs itable
						     * lazyinit
						     */
		}
		ext4_lock_group(sb, group); /* while we modify the bg desc */
		free = EXT4_INODES_PER_GROUP(sb) -
			ext4_itable_unused_count(sb, gdp);
		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
			free = 0;
		}
		/*
		 * Check the relative inode number against the last used
		 * relative inode number in this group. if it is greater
		 * we need to update the bg_itable_unused count
		 */
		if (ino > free)
			ext4_itable_unused_set(sb, gdp,
					(EXT4_INODES_PER_GROUP(sb) - ino));
		if (!(sbi->s_mount_state & EXT4_FC_REPLAY))
			up_read(&grp->alloc_sem);
	} else {
		ext4_lock_group(sb, group);
	}

	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
	if (S_ISDIR(mode)) {
		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
		if (sbi->s_log_groups_per_flex) {
			ext4_group_t f = ext4_flex_group(sbi, group);

			atomic_inc(&sbi_array_rcu_deref(sbi, s_flex_groups,
							f)->used_dirs);
		}
	}
	if (ext4_has_group_desc_csum(sb)) {
		ext4_inode_bitmap_csum_set(sb, gdp, inode_bitmap_bh,
					   EXT4_INODES_PER_GROUP(sb) / 8);
		ext4_group_desc_csum_set(sb, group, gdp);
	}
	ext4_unlock_group(sb, group);

	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
	if (err) {
		ext4_std_error(sb, err);
		goto out;
	}

	percpu_counter_dec(&sbi->s_freeinodes_counter);
	if (S_ISDIR(mode))
		percpu_counter_inc(&sbi->s_dirs_counter);

	if (sbi->s_log_groups_per_flex) {
		flex_group = ext4_flex_group(sbi, group);
		atomic_dec(&sbi_array_rcu_deref(sbi, s_flex_groups,
						flex_group)->free_inodes);
	}

	inode->i_ino = ino + group * EXT4_INODES_PER_GROUP(sb);
	/* This is the optimal IO size (for stat), not the fs block size */
	inode->i_blocks = 0;
	simple_inode_init_ts(inode);
	ei->i_crtime = inode_get_mtime(inode);

	memset(ei->i_data, 0, sizeof(ei->i_data));
	ei->i_dir_start_lookup = 0;
	ei->i_disksize = 0;

	/* Don't inherit extent flag from directory, amongst others. */
	ei->i_flags =
		ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
	ei->i_flags |= i_flags;
	ei->i_file_acl = 0;
	ei->i_dtime = 0;
	ei->i_block_group = group;
	ei->i_last_alloc_group = ~0;

	ext4_set_inode_flags(inode, true);
	if (IS_DIRSYNC(inode))
		ext4_handle_sync(handle);
	if (insert_inode_locked(inode) < 0) {
		/*
		 * Likely a bitmap corruption causing inode to be allocated
		 * twice.
		 */
		err = -EIO;
		ext4_error(sb, "failed to insert inode %lu: doubly allocated?",
			   inode->i_ino);
		ext4_mark_group_bitmap_corrupted(sb, group,
					EXT4_GROUP_INFO_IBITMAP_CORRUPT);
		goto out;
	}
	inode->i_generation = get_random_u32();

	/* Precompute checksum seed for inode metadata */
	if (ext4_has_metadata_csum(sb)) {
		__u32 csum;
		__le32 inum = cpu_to_le32(inode->i_ino);
		__le32 gen = cpu_to_le32(inode->i_generation);
		csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
				   sizeof(inum));
		ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
					      sizeof(gen));
	}

	ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
	ext4_set_inode_state(inode, EXT4_STATE_NEW);

	ei->i_extra_isize = sbi->s_want_extra_isize;
	ei->i_inline_off = 0;
	if (ext4_has_feature_inline_data(sb) &&
	    (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode)))
		ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
	ret = inode;
	err = dquot_alloc_inode(inode);
	if (err)
		goto fail_drop;

	/*
	 * Since the encryption xattr will always be unique, create it first so
	 * that it's less likely to end up in an external xattr block and
	 * prevent its deduplication.
	 */
	if (encrypt) {
		err = fscrypt_set_context(inode, handle);
		if (err)
			goto fail_free_drop;
	}

	if (!(ei->i_flags & EXT4_EA_INODE_FL)) {
		err = ext4_init_acl(handle, inode, dir);
		if (err)
			goto fail_free_drop;

		err = ext4_init_security(handle, inode, dir, qstr);
		if (err)
			goto fail_free_drop;
	}

	if (ext4_has_feature_extents(sb)) {
		/* set extent flag only for directory, file and normal symlink*/
		if (S_ISDIR(mode) || S_ISREG(mode) || S_ISLNK(mode)) {
			ext4_set_inode_flag(inode, EXT4_INODE_EXTENTS);
			ext4_ext_tree_init(handle, inode);
		}
	}

	if (ext4_handle_valid(handle)) {
		ei->i_sync_tid = handle->h_transaction->t_tid;
		ei->i_datasync_tid = handle->h_transaction->t_tid;
	}

	err = ext4_mark_inode_dirty(handle, inode);
	if (err) {
		ext4_std_error(sb, err);
		goto fail_free_drop;
	}

	ext4_debug("allocating inode %lu\n", inode->i_ino);
	trace_ext4_allocate_inode(inode, dir, mode);
	brelse(inode_bitmap_bh);
	return ret;

fail_free_drop:
	dquot_free_inode(inode);
fail_drop:
	clear_nlink(inode);
	unlock_new_inode(inode);
out:
	dquot_drop(inode);
	inode->i_flags |= S_NOQUOTA;
	iput(inode);
	brelse(inode_bitmap_bh);
	return ERR_PTR(err);
}

/*
 * Add non-directory inode to a directory. On success, the inode reference is
 * consumed by dentry is instantiation. This is also indicated by clearing of
 * *inodep pointer. On failure, the caller is responsible for dropping the
 * inode reference in the safe context.
 */
static int ext4_add_nondir(handle_t *handle,
		struct dentry *dentry, struct inode **inodep)
{
	struct inode *dir = d_inode(dentry->d_parent);
	struct inode *inode = *inodep;
	int err = ext4_add_entry(handle, dentry, inode);
	if (!err) {
		err = ext4_mark_inode_dirty(handle, inode);
		if (IS_DIRSYNC(dir))
			ext4_handle_sync(handle);
		d_instantiate_new(dentry, inode);
		*inodep = NULL;
		return err;
	}
	drop_nlink(inode);
	ext4_mark_inode_dirty(handle, inode);
	ext4_orphan_add(handle, inode);
	unlock_new_inode(inode);
	return err;
}


/*
 * Handle the last step of open()
 */
static int do_open(struct nameidata *nd,
		   struct file *file, const struct open_flags *op)
{
	struct mnt_idmap *idmap;
	int open_flag = op->open_flag;
	bool do_truncate;
	int acc_mode;
	int error;

	if (!(file->f_mode & (FMODE_OPENED | FMODE_CREATED))) {
		error = complete_walk(nd);
		if (error)
			return error;
	}
	if (!(file->f_mode & FMODE_CREATED))
		audit_inode(nd->name, nd->path.dentry, 0);
	idmap = mnt_idmap(nd->path.mnt);
	if (open_flag & O_CREAT) {
		if ((open_flag & O_EXCL) && !(file->f_mode & FMODE_CREATED))
			return -EEXIST;
		if (d_is_dir(nd->path.dentry))
			return -EISDIR;
		error = may_create_in_sticky(idmap, nd,
					     d_backing_inode(nd->path.dentry));
		if (unlikely(error))
			return error;
	}
	if ((nd->flags & LOOKUP_DIRECTORY) && !d_can_lookup(nd->path.dentry))
		return -ENOTDIR;

	do_truncate = false;
	acc_mode = op->acc_mode;
	if (file->f_mode & FMODE_CREATED) {
		/* Don't check for write permission, don't truncate */
		open_flag &= ~O_TRUNC;
		acc_mode = 0;
	} else if (d_is_reg(nd->path.dentry) && open_flag & O_TRUNC) {
		error = mnt_want_write(nd->path.mnt);
		if (error)
			return error;
		do_truncate = true;
	}
	error = may_open(idmap, &nd->path, acc_mode, open_flag);
	if (!error && !(file->f_mode & FMODE_OPENED))
		error = vfs_open(&nd->path, file);
	if (!error)
		error = ima_file_check(file, op->acc_mode);
	if (!error && do_truncate)
		error = handle_truncate(idmap, file);
	if (unlikely(error > 0)) {
		WARN_ON(1);
		error = -EINVAL;
	}
	if (do_truncate)
		mnt_drop_write(nd->path.mnt);
	return error;
}

/**
 * vfs_open - open the file at the given path
 * @path: path to open
 * @file: newly allocated file with f_flag initialized
 */
int vfs_open(const struct path *path, struct file *file)
{
	file->f_path = *path;
	return do_dentry_open(file, d_backing_inode(path->dentry), NULL);
}


static int do_dentry_open(struct file *f,
			  struct inode *inode,
			  int (*open)(struct inode *, struct file *))
{
	static const struct file_operations empty_fops = {};
	int error;

	path_get(&f->f_path);
	f->f_inode = inode;
	f->f_mapping = inode->i_mapping;
	f->f_wb_err = filemap_sample_wb_err(f->f_mapping);
	f->f_sb_err = file_sample_sb_err(f);

	if (unlikely(f->f_flags & O_PATH)) {
		f->f_mode = FMODE_PATH | FMODE_OPENED;
		f->f_op = &empty_fops;
		return 0;
	}

	if ((f->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) {
		i_readcount_inc(inode);
	} else if (f->f_mode & FMODE_WRITE && !special_file(inode->i_mode)) {
		error = file_get_write_access(f);
		if (unlikely(error))
			goto cleanup_file;
		f->f_mode |= FMODE_WRITER;
	}

	/* POSIX.1-2008/SUSv4 Section XSI 2.9.7 */
	if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))
		f->f_mode |= FMODE_ATOMIC_POS;

	f->f_op = fops_get(inode->i_fop);
	if (WARN_ON(!f->f_op)) {
		error = -ENODEV;
		goto cleanup_all;
	}

	error = security_file_open(f);
	if (error)
		goto cleanup_all;

	error = break_lease(file_inode(f), f->f_flags);
	if (error)
		goto cleanup_all;

	/* normally all 3 are set; ->open() can clear them if needed */
	f->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE;
	if (!open)
		open = f->f_op->open;
	if (open) {
		error = open(inode, f);
		if (error)
			goto cleanup_all;
	}
	f->f_mode |= FMODE_OPENED;
	if ((f->f_mode & FMODE_READ) &&
	     likely(f->f_op->read || f->f_op->read_iter))
		f->f_mode |= FMODE_CAN_READ;
	if ((f->f_mode & FMODE_WRITE) &&
	     likely(f->f_op->write || f->f_op->write_iter))
		f->f_mode |= FMODE_CAN_WRITE;
	if ((f->f_mode & FMODE_LSEEK) && !f->f_op->llseek)
		f->f_mode &= ~FMODE_LSEEK;
	if (f->f_mapping->a_ops && f->f_mapping->a_ops->direct_IO)
		f->f_mode |= FMODE_CAN_ODIRECT;

	f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
	f->f_iocb_flags = iocb_flags(f);

	file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);

	if ((f->f_flags & O_DIRECT) && !(f->f_mode & FMODE_CAN_ODIRECT))
		return -EINVAL;

	/*
	 * XXX: Huge page cache doesn't support writing yet. Drop all page
	 * cache for this file before processing writes.
	 */
	if (f->f_mode & FMODE_WRITE) {
		/*
		 * Paired with smp_mb() in collapse_file() to ensure nr_thps
		 * is up to date and the update to i_writecount by
		 * get_write_access() is visible. Ensures subsequent insertion
		 * of THPs into the page cache will fail.
		 */
		smp_mb();
		if (filemap_nr_thps(inode->i_mapping)) {
			struct address_space *mapping = inode->i_mapping;

			filemap_invalidate_lock(inode->i_mapping);
			/*
			 * unmap_mapping_range just need to be called once
			 * here, because the private pages is not need to be
			 * unmapped mapping (e.g. data segment of dynamic
			 * shared libraries here).
			 */
			unmap_mapping_range(mapping, 0, 0, 0);
			truncate_inode_pages(mapping, 0);
			filemap_invalidate_unlock(inode->i_mapping);
		}
	}

	/*
	 * Once we return a file with FMODE_OPENED, __fput() will call
	 * fsnotify_close(), so we need fsnotify_open() here for symmetry.
	 */
	fsnotify_open(f);
	return 0;

cleanup_all:
	if (WARN_ON_ONCE(error > 0))
		error = -EINVAL;
	fops_put(f->f_op);
	put_file_access(f);
cleanup_file:
	path_put(&f->f_path);
	f->f_path.mnt = NULL;
	f->f_path.dentry = NULL;
	f->f_inode = NULL;
	return error;
}

		
相关推荐
feng_you_ying_li1 小时前
liunx之可重入函数,volatite,和线程的基本介绍(1)
linux
因_崔斯汀1 小时前
用 AI 编程助手从零生成 3D 智慧校园数据大屏 —— Claude Code 实战全记录
前端
zzqssliu1 小时前
taocarts 跨境独立站 SEO 优化实践(多语言 + 反向海淘场景)
java·javascript·php
前端Hardy1 小时前
CSS 动画真的比 JS 快?Josh Comeau 做了组实验,结果跟直觉不一样
前端·javascript·后端
兮山与1 小时前
Linux
linux·javaee进阶
自进化Agent智能体1 小时前
MCP与Hooks:让AI Agent安全连接一切的治理框架
前端
在繁华处1 小时前
Java从零到熟练(十一):Spring框架入门
java·开发语言·spring
明天一点1 小时前
Cloudflare 通知转发钉钉机器人
前端·后端
小锋java12341 小时前
【技术专题】LangChain4j 开发Java Agent智能体 - 整合SpringBoot4
java·人工智能