Linux驱动开发进阶 - 文件系统

文章目录

1、前言
2、学习目标
3、VFS虚拟文件系统
- [3.1、超级块（Super Block）](#3.1、超级块（Super Block）)
- 3.2、dentry
- 3.3、inode
- 3.4、file
4、文件系统的挂载
5、文件系统的注册
6、实现一个虚拟文件系统

1、前言

学习参考书籍：李文山的《Linux驱动开发进阶》
本文是为了学习上述书籍时，不能囫囵吞枣，才写的。等实际遇到问题了，我也只会去回看原书籍。所以本文不太具备教学功能。

2、学习目标

在Linux中，文件系统可以分为两大类：虚拟文件系统（如sysfs、procfs、devtmpfs）和实际物理存储设备的文件系统（如ext2、ext3、ext4、vfat、fat32）。那Linux如何管理这些文件系统呢？同时本文将在最后编写一个虚拟的文件系统驱动程序。

3、VFS虚拟文件系统

Linux内核的设计哲学非常注重抽象和模块化，这种设计使得系统更加灵活、可扩展且易于维护。Linux为了管理各种类型的文件系统（sysfs、procfs、ext2、ext3、ext4、fat32、...），抽象出了 VFS（Virtual File System Switch，虚拟文件系统切换层）。

一个功能的诞生肯定是为了解决实际问题，那VFS虚拟文件系统的诞生也是为了管理众多不同的文件系统。在Linux的VFS虚拟文件系统中，有四个核心对象是理解和实现文件系统的关键，分别是：超级块（Super Block）、索引节点（Inode）、目录项（Dentry）和文件对象（File）。

3.1、超级块（Super Block）

在Linux文件系统中，许多文件系统本身就存在显示的超级块，如ext2、ext3、ext4。它们在分区的第一个块（或前几个块）中存储了超级块。超级块包含了文件系统的元数据，如总块数、总Inode数、块大小、文件系统状态等。

但不是所有的文件系统都显示存在超级块，如Windows下常用的文件系统，如ntfs、fat32等，它们有自己的结构来存储文件系统的元数据和管理信息，这些结构在功能上类似于超级块，但名称和具体实现不同。

在Linux下，无论该文件系统是否存在超级块，挂载时必须初始化超级块数据结构。

c 复制代码

struct super_block {
	struct list_head	s_list;		/* Keep this first */
	dev_t			s_dev;		/* search index; _not_ kdev_t */
	unsigned char		s_blocksize_bits;
	unsigned long		s_blocksize;
	loff_t			s_maxbytes;	/* Max file size */
	struct file_system_type	*s_type;
	const struct super_operations	*s_op;
	const struct dquot_operations	*dq_op;
	const struct quotactl_ops	*s_qcop;
	const struct export_operations *s_export_op;
	unsigned long		s_flags;
	unsigned long		s_iflags;	/* internal SB_I_* flags */
	unsigned long		s_magic;
	struct dentry		*s_root;
	struct rw_semaphore	s_umount;
	int			s_count;
	atomic_t		s_active;
#ifdef CONFIG_SECURITY
	void                    *s_security;
#endif
	const struct xattr_handler **s_xattr;
#ifdef CONFIG_FS_ENCRYPTION
	const struct fscrypt_operations	*s_cop;
#ifdef __GENKSYMS__
	/*
	 * Android ABI CRC preservation due to commit 391cceee6d43 ("fscrypt:
	 * stop using keyrings subsystem for fscrypt_master_key") changing this
	 * type.  Size is the same, this is a private field.
	 */
	struct key		*s_master_keys; /* master crypto keys in use */
#else
	struct fscrypt_keyring	*s_master_keys; /* master crypto keys in use */
#endif
#endif
#ifdef CONFIG_FS_VERITY
	const struct fsverity_operations *s_vop;
#endif
#ifdef CONFIG_UNICODE
	struct unicode_map *s_encoding;
	__u16 s_encoding_flags;
#endif
	struct hlist_bl_head	s_roots;	/* alternate root dentries for NFS */
	struct list_head	s_mounts;	/* list of mounts; _not_ for fs use */
	struct block_device	*s_bdev;
	struct backing_dev_info *s_bdi;
	struct mtd_info		*s_mtd;
	struct hlist_node	s_instances;
	unsigned int		s_quota_types;	/* Bitmask of supported quota types */
	struct quota_info	s_dquot;	/* Diskquota specific options */

	struct sb_writers	s_writers;

	/*
	 * Keep s_fs_info, s_time_gran, s_fsnotify_mask, and
	 * s_fsnotify_marks together for cache efficiency. They are frequently
	 * accessed and rarely modified.
	 */
	void			*s_fs_info;	/* Filesystem private info */

	/* Granularity of c/m/atime in ns (cannot be worse than a second) */
	u32			s_time_gran;
	/* Time limits for c/m/atime in seconds */
	time64_t		   s_time_min;
	time64_t		   s_time_max;
#ifdef CONFIG_FSNOTIFY
	__u32			s_fsnotify_mask;
	struct fsnotify_mark_connector __rcu	*s_fsnotify_marks;
#endif

	char			s_id[32];	/* Informational name */
	uuid_t			s_uuid;		/* UUID */

	unsigned int		s_max_links;
	fmode_t			s_mode;

	/*
	 * The next field is for VFS *only*. No filesystems have any business
	 * even looking at it. You had been warned.
	 */
	struct mutex s_vfs_rename_mutex;	/* Kludge */

	/*
	 * Filesystem subtype.  If non-empty the filesystem type field
	 * in /proc/mounts will be "type.subtype"
	 */
	const char *s_subtype;

	const struct dentry_operations *s_d_op; /* default d_op for dentries */

	/*
	 * Saved pool identifier for cleancache (-1 means none)
	 */
	int cleancache_poolid;

	struct shrinker s_shrink;	/* per-sb shrinker handle */

	/* Number of inodes with nlink == 0 but still referenced */
	atomic_long_t s_remove_count;

	/* Pending fsnotify inode refs */
	atomic_long_t s_fsnotify_inode_refs;

	/* Being remounted read-only */
	int s_readonly_remount;

	/* per-sb errseq_t for reporting writeback errors via syncfs */
	errseq_t s_wb_err;

	/* AIO completions deferred from interrupt context */
	struct workqueue_struct *s_dio_done_wq;
	struct hlist_head s_pins;

	/*
	 * Owning user namespace and default context in which to
	 * interpret filesystem uids, gids, quotas, device nodes,
	 * xattrs and security labels.
	 */
	struct user_namespace *s_user_ns;

	/*
	 * The list_lru structure is essentially just a pointer to a table
	 * of per-node lru lists, each of which has its own spinlock.
	 * There is no need to put them into separate cachelines.
	 */
	struct list_lru		s_dentry_lru;
	struct list_lru		s_inode_lru;
	struct rcu_head		rcu;
	struct work_struct	destroy_work;

	struct mutex		s_sync_lock;	/* sync serialisation lock */

	/*
	 * Indicates how deep in a filesystem stack this SB is
	 */
	int s_stack_depth;

	/* s_inode_list_lock protects s_inodes */
	spinlock_t		s_inode_list_lock ____cacheline_aligned_in_smp;
	struct list_head	s_inodes;	/* all inodes */

	spinlock_t		s_inode_wblist_lock;
	struct list_head	s_inodes_wb;	/* writeback inodes */

	ANDROID_KABI_RESERVE(1);
	ANDROID_KABI_RESERVE(2);
	ANDROID_KABI_RESERVE(3);
	ANDROID_KABI_RESERVE(4);
} __randomize_layout;

s_list：链表结构体，包含prev和next指针，用来连接前驱和后继节点
s_dev：块设备标识符，例如/dev/sda、/dev/nvme
s_blocksize_bits：块大小占用的位数，例如块大小为4096则该值为12
s_blocksize：数据块大小，单位为字节
s_maxbytes：文件的最大长度，单位字节
s_type：文件系统类型
s_op：超级块操作方法，指向具体的文件系统
dp_op：和s_op一样，但用于特定操作方法
s_qcop：和s_op一样，但用于配置磁盘的特定的操作方法
s_flags：文件系统是否安装标志
s_magic：文件系统魔数，每个文件系统都有各自的魔数
s_root：文件系统的根目录文件
s_umount：用于文件系统对文件进行读写同步
s_count：超级块的使用计数
s_active：超级块的引用计数
s_security：用于安全的私有指针
s_d_op：dentry操作方法集合

其中超级块的操作函数结构体struct super_operations内容如下：

c 复制代码

struct super_operations {
   	struct inode *(*alloc_inode)(struct super_block *sb);
	void (*destroy_inode)(struct inode *);
	void (*free_inode)(struct inode *);

   	void (*dirty_inode) (struct inode *, int flags);
	int (*write_inode) (struct inode *, struct writeback_control *wbc);
	int (*drop_inode) (struct inode *);
	void (*evict_inode) (struct inode *);
	void (*put_super) (struct super_block *);
	int (*sync_fs)(struct super_block *sb, int wait);
	int (*freeze_super) (struct super_block *);
	int (*freeze_fs) (struct super_block *);
	int (*thaw_super) (struct super_block *);
	int (*unfreeze_fs) (struct super_block *);
	int (*statfs) (struct dentry *, struct kstatfs *);
	int (*remount_fs) (struct super_block *, int *, char *);
	void (*umount_begin) (struct super_block *);

	int (*show_options)(struct seq_file *, struct dentry *);
	int (*show_devname)(struct seq_file *, struct dentry *);
	int (*show_path)(struct seq_file *, struct dentry *);
	int (*show_stats)(struct seq_file *, struct dentry *);
#ifdef CONFIG_QUOTA
	ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
	struct dquot **(*get_dquots)(struct inode *);
#endif
	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
	long (*nr_cached_objects)(struct super_block *,
				  struct shrink_control *);
	long (*free_cached_objects)(struct super_block *,
				    struct shrink_control *);

	ANDROID_KABI_RESERVE(1);
	ANDROID_KABI_RESERVE(2);
	ANDROID_KABI_RESERVE(3);
	ANDROID_KABI_RESERVE(4);
};

这些操作方法不用全部实现，下面对部分成员进行说明：

alloc_inode：分配一个inode
destroy_inode：释放一个硬盘上的inode
free_inode：释放内存中的inode
dirty_inode：用于将"脏"块标记的方法
write_inode：写入数据到inode中
drop_inode：当最后一个用户释放inode时调用该函数
put_super：释放超级块，卸载文件系统时调用此函数
sync_fs：同步文件系统，当有"脏页"时，更新数据
statfs：查看文件系统信息，例如魔数、文件名最长多少、页大小

3.2、dentry

dentry翻译过来叫"目录项"。在上面的struct super_block结构体里有一个成员s_root就是struct dentry类型：

c 复制代码

struct dentry {
    /* RCU lookup touched fields */
    unsigned int d_flags;		/* protected by d_lock */
    seqcount_spinlock_t d_seq;	/* per dentry seqlock */
    struct hlist_bl_node d_hash;	/* lookup hash list */
    struct dentry *d_parent;	/* parent directory */
    struct qstr d_name;
    struct inode *d_inode;		/* Where the name belongs to - NULL is
					 * negative */
    unsigned char d_iname[DNAME_INLINE_LEN];	/* small names */

    /* Ref lookup also touches following */
    struct lockref d_lockref;	/* per-dentry lock and refcount */
    const struct dentry_operations *d_op;
    struct super_block *d_sb;	/* The root of the dentry tree */
    unsigned long d_time;		/* used by d_revalidate */
    void *d_fsdata;			/* fs-specific data */

    union {
        struct list_head d_lru;		/* LRU list */
        wait_queue_head_t *d_wait;	/* in-lookup ones only */
    };
    struct list_head d_child;	/* child of parent list */
    struct list_head d_subdirs;	/* our children */
    /*
	 * d_alias and d_rcu can share memory
	 */
    union {
        struct hlist_node d_alias;	/* inode alias list */
        struct hlist_bl_node d_in_lookup_hash;	/* only for in-lookup ones */
        struct rcu_head d_rcu;
    } d_u;

    ANDROID_KABI_RESERVE(1);
    ANDROID_KABI_RESERVE(2);
} __randomize_layout;

d_flags：目录项标志
d_hash：哈希表表项链表
d_parent：父目录
d_name：目录名称
d_inode：指向目录或文件的inode
d_iname：短文件名，当文件名小于DNAME_INLINE_LEN时，文件名存储在数组中
d_op：目录项操作方法集合
d_sb：指向该目录项的超级块指针
d_child：同级目录链表
d_subdirs：子目录项链表

对于某些文件系统（如 ext2/ext3/ext4、XFS等），它们在磁盘上有一个物理的超级块。dentry并不存在于磁盘中，它是vfs虚拟文件系统抽象出来的一个对象，它只存在于内存中。目录项是文件系统中用于将文件名与inode号关联起来的数据结构，作用是快速定位文件路径，减少路径解析的时间。

struct dentry中的d_op操作方法集合如下：

c 复制代码

struct dentry_operations {
	int (*d_revalidate)(struct dentry *, unsigned int);
	int (*d_weak_revalidate)(struct dentry *, unsigned int);
	int (*d_hash)(const struct dentry *, struct qstr *);
	int (*d_compare)(const struct dentry *,
			unsigned int, const char *, const struct qstr *);
	int (*d_delete)(const struct dentry *);
	int (*d_init)(struct dentry *);
	void (*d_release)(struct dentry *);
	void (*d_prune)(struct dentry *);
	void (*d_iput)(struct dentry *, struct inode *);
	char *(*d_dname)(struct dentry *, char *, int);
	struct vfsmount *(*d_automount)(struct path *);
	int (*d_manage)(const struct path *, bool);
	struct dentry *(*d_real)(struct dentry *, const struct inode *);
	void (*d_canonical_path)(const struct path *, struct path *);
	ANDROID_KABI_RESERVE(1);
	ANDROID_KABI_RESERVE(2);
	ANDROID_KABI_RESERVE(3);
	ANDROID_KABI_RESERVE(4);
} ____cacheline_aligned;

d_revalidate：使一个目录项重新生效
d_hash：生成一个哈希值，用于VFS向哈希表中加入一个目录项
d_compare：比较两个目录项名称
d_delete：删除目录项
d_init：初始化目录项
d_release：释放目录项
d_iput：当目录项的inode为NULL时，此时会调用该函数
d_dname：设置目录项名称

上面有例举到，在struct dentry中有一个成员是d_inode，这里d_inode就是我们将要介绍的第三个核心对象。

3.3、inode

inode描述了磁盘上的文件信息，将所有文件的索引拿出来组成一个表，即inode表（inode table）。下图展示inode和dentry的关系（图片来自作者李文山的《Linux驱动开发进阶》）：

当文件系统需要访问一个文件时，以下步骤会发生：

路径解析：
- 从根目录开始，逐级解析路径中的每个目录项，找到目标文件的 dentry。
- 每个目录项在目录文件中存储了文件名和对应的 inode 号。
找到 inode：
- 通过 dentry 的 d_inode 字段，找到与该文件名关联的 inode 对象。
- 如果 inode 对象尚未加载到内存中，文件系统会从磁盘读取对应的 inode 数据，并将其加载到内存中的 VFS inode 结构中。
访问文件数据：
- 通过 inode 中的数据块指针，找到文件数据在磁盘上的实际存储位置。
- 文件系统通过这些数据块指针读取或写入文件的实际数据。

我们通过struct inode来看看inode存储了什么：

c 复制代码

struct inode {
	umode_t			i_mode;
	unsigned short		i_opflags;
	kuid_t			i_uid;
	kgid_t			i_gid;
	unsigned int		i_flags;

#ifdef CONFIG_FS_POSIX_ACL
	struct posix_acl	*i_acl;
	struct posix_acl	*i_default_acl;
#endif

	const struct inode_operations	*i_op;
	struct super_block	*i_sb;
	struct address_space	*i_mapping;

#ifdef CONFIG_SECURITY
	void			*i_security;
#endif

	/* Stat data, not accessed from path walking */
	unsigned long		i_ino;
	/*
	 * Filesystems may only read i_nlink directly.  They shall use the
	 * following functions for modification:
	 *
	 *    (set|clear|inc|drop)_nlink
	 *    inode_(inc|dec)_link_count
	 */
	union {
		const unsigned int i_nlink;
		unsigned int __i_nlink;
	};
	dev_t			i_rdev;
	loff_t			i_size;
	struct timespec64	i_atime;
	struct timespec64	i_mtime;
	struct timespec64	i_ctime;
	spinlock_t		i_lock;	/* i_blocks, i_bytes, maybe i_size */
	unsigned short          i_bytes;
	u8			i_blkbits;
	u8			i_write_hint;
	blkcnt_t		i_blocks;

#ifdef __NEED_I_SIZE_ORDERED
	seqcount_t		i_size_seqcount;
#endif

	/* Misc */
	unsigned long		i_state;
	struct rw_semaphore	i_rwsem;

	unsigned long		dirtied_when;	/* jiffies of first dirtying */
	unsigned long		dirtied_time_when;

	struct hlist_node	i_hash;
	struct list_head	i_io_list;	/* backing dev IO list */
#ifdef CONFIG_CGROUP_WRITEBACK
	struct bdi_writeback	*i_wb;		/* the associated cgroup wb */

	/* foreign inode detection, see wbc_detach_inode() */
	int			i_wb_frn_winner;
	u16			i_wb_frn_avg_time;
	u16			i_wb_frn_history;
#endif
	struct list_head	i_lru;		/* inode LRU list */
	struct list_head	i_sb_list;
	struct list_head	i_wb_list;	/* backing dev writeback list */
	union {
		struct hlist_head	i_dentry;
		struct rcu_head		i_rcu;
	};
	atomic64_t		i_version;
	atomic64_t		i_sequence; /* see futex */
	atomic_t		i_count;
	atomic_t		i_dio_count;
	atomic_t		i_writecount;
#if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING)
	atomic_t		i_readcount; /* struct files open RO */
#endif
	union {
		const struct file_operations	*i_fop;	/* former ->i_op->default_file_ops */
		void (*free_inode)(struct inode *);
	};
	struct file_lock_context	*i_flctx;
	struct address_space	i_data;
	struct list_head	i_devices;
	union {
		struct pipe_inode_info	*i_pipe;
		struct block_device	*i_bdev;
		struct cdev		*i_cdev;
		char			*i_link;
		unsigned		i_dir_seq;
	};

	__u32			i_generation;

#ifdef CONFIG_FSNOTIFY
	__u32			i_fsnotify_mask; /* all events this inode cares about */
	struct fsnotify_mark_connector __rcu	*i_fsnotify_marks;
#endif

#ifdef CONFIG_FS_ENCRYPTION
	struct fscrypt_info	*i_crypt_info;
#endif

#ifdef CONFIG_FS_VERITY
	struct fsverity_info	*i_verity_info;
#endif

	void			*i_private; /* fs or device private pointer */

	ANDROID_KABI_RESERVE(1);
	ANDROID_KABI_RESERVE(2);
} __randomize_layout;

i_mode：文件的访问权限
i_op：inode操作函数集合
i_sb：超级块指针，指向文件系统的超级块
i_mapping：地址映射描述
i_nlink：硬连接数目
i_rdev：设备号，在Linux中，所有的设备即是文件
i_size：文件大小，单位字节
i_atime：最后访问时间
i_mtime：最后修改时间
i_ctime：最后改变时间
i_blkbits：块大小表示位数，例如块为4096字节时，此时值为12
i_blocks：文件所占用的block数量
i_hash：哈希表
i_dentry：目录项链表
i_fop：文件操作方法集合
i_data：设备数据地址映射
i_devices：块设备链表
i_pipe：管道文件
i_cdev：字符设备文件
i_link：连接文件
i_private：私有指针，一般用来存放数据块的首地址

其中成员i_op为inode的操作集合：

c 复制代码

struct inode_operations {
	struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
	const char * (*get_link) (struct dentry *, struct inode *, struct delayed_call *);
	int (*permission) (struct inode *, int);
	struct posix_acl * (*get_acl)(struct inode *, int);

	int (*readlink) (struct dentry *, char __user *,int);

	int (*create) (struct inode *,struct dentry *, umode_t, bool);
	int (*link) (struct dentry *,struct inode *,struct dentry *);
	int (*unlink) (struct inode *,struct dentry *);
	int (*symlink) (struct inode *,struct dentry *,const char *);
	int (*mkdir) (struct inode *,struct dentry *,umode_t);
	int (*rmdir) (struct inode *,struct dentry *);
	int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
	int (*rename) (struct inode *, struct dentry *,
			struct inode *, struct dentry *, unsigned int);
	int (*setattr) (struct dentry *, struct iattr *);
	int (*getattr) (const struct path *, struct kstat *, u32, unsigned int);
	ssize_t (*listxattr) (struct dentry *, char *, size_t);
	int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start,
		      u64 len);
	int (*update_time)(struct inode *, struct timespec64 *, int);
	int (*atomic_open)(struct inode *, struct dentry *,
			   struct file *, unsigned open_flag,
			   umode_t create_mode);
	int (*tmpfile) (struct inode *, struct dentry *, umode_t);
	int (*set_acl)(struct inode *, struct posix_acl *, int);

	ANDROID_KABI_RESERVE(1);
	ANDROID_KABI_RESERVE(2);
	ANDROID_KABI_RESERVE(3);
	ANDROID_KABI_RESERVE(4);
} ____cacheline_aligned;

lookup：在dentry下查找inode
create：在dentry下创建一个inode
link：为一个indoe创建一个inode
unlink：删除一个连接文件
mkdir：在dentry下创建一个目录inode
rmdir：在dentry下删除一个inode
mknod：创建设备节点
update_time：更新文件时间

3.4、file

file对象是与进程相关的文件描述符的内核表示。它是文件系统和进程之间交互的核心数据结构之一。file对象是Linux内核中用于表示打开文件的结构体。当进程通过系统调用（如 open()）打开文件时，内核会创建一个file对象，并将其添加到进程的文件描述符表中。当进程通过系统调用（如 close()）关闭文件时，内核会释放对应的file对象。file结构体如下：

c 复制代码

struct file {
	union {
		struct llist_node	fu_llist;
		struct rcu_head 	fu_rcuhead;
	} f_u;
	struct path		f_path;
	struct inode		*f_inode;	/* cached value */
	const struct file_operations	*f_op;

	/*
	 * Protects f_ep_links, f_flags.
	 * Must not be taken from IRQ context.
	 */
	spinlock_t		f_lock;
	enum rw_hint		f_write_hint;
	atomic_long_t		f_count;
	unsigned int 		f_flags;
	fmode_t			f_mode;
	struct mutex		f_pos_lock;
	loff_t			f_pos;
	struct fown_struct	f_owner;
	const struct cred	*f_cred;
	struct file_ra_state	f_ra;

	u64			f_version;
#ifdef CONFIG_SECURITY
	void			*f_security;
#endif
	/* needed for tty driver, and maybe others */
	void			*private_data;

#ifdef CONFIG_EPOLL
	/* Used by fs/eventpoll.c to link all the hooks to this file */
	struct list_head	f_ep_links;
	struct list_head	f_tfile_llink;
#endif /* #ifdef CONFIG_EPOLL */
	struct address_space	*f_mapping;
	errseq_t		f_wb_err;
	errseq_t		f_sb_err; /* for syncfs */

	ANDROID_KABI_RESERVE(1);
	ANDROID_KABI_RESERVE(2);
	ANDROID_OEM_DATA(1);
} __randomize_layout
  __attribute__((aligned(4)));	/* lest something weird decides that 2 is OK */

f_path：文件路径，包含了dentry和vfsmout
f_inode：执行inode指针
f_op：文件操作方法集合
f_count：文件对象使用计数
f_flags：文件被打开时指定的标志，例如O_RDONLY，O_WRONLY
f_mode：文件读写权限
f_pos：文件当前的偏移量，即当前读写的位置相对于文件开始地址的偏移
f_owner：文件所有者
private_data：私有数据指针，较为常用
f_mapping：文件的页缓冲映射地址

4、文件系统的挂载

当一个磁盘上的分区被挂载时，此时Linux内核会扫描该磁盘上对应的分区的所有索引节点(inode)，然后创建struct mount结构体和dentry对象，并将所有的超级块信息保存在struct superblock结构体中，并将所有的inode信息以链表的形式保存在struct inode结构体中，整个过程就建立了从struct mount到inode之间的关系。

5、文件系统的注册

内核维护一个全局链表 file_systems，用于存储所有已注册的文件系统类型。每个文件系统类型通过 file_system_type 结构体注册到这个链表中。

5.1、文件系统的注册过程

5.1.2、定义文件系统类型

文件系统开发者需要定义一个 file_system_type 结构体实例，并实现必要的操作函数（如挂载和卸载函数）。

c 复制代码

static struct file_system_type myfs_type = {
    .owner = THIS_MODULE,
    .name = "myfs",
    .mount = myfs_mount,
    .kill_sb = myfs_kill_sb,
    .fs_flags = FS_REQUIRES_DEV,
};

5.1.3、注册文件系统

使用 register_filesystem() 函数将文件系统类型注册到内核中。这通常在文件系统模块加载时完成。

c 复制代码

static int __init myfs_init(void) {
    return register_filesystem(&myfs_type);
}

c 复制代码

int register_filesystem(struct file_system_type * fs)
{
	int res = 0;
	struct file_system_type ** p;

	if (fs->parameters &&
	    !fs_validate_description(fs->name, fs->parameters))
		return -EINVAL;

	BUG_ON(strchr(fs->name, '.'));
	if (fs->next)
		return -EBUSY;
	write_lock(&file_systems_lock);
	p = find_filesystem(fs->name, strlen(fs->name));
	if (*p)
		res = -EBUSY;
	else
		*p = fs;
	write_unlock(&file_systems_lock);
	return res;
}

5.1.4、注销文件系统

使用 unregister_filesystem() 函数将文件系统从内核中注销。这通常在文件系统模块卸载时完成。

c 复制代码

static void __exit myfs_exit(void) {
    unregister_filesystem(&myfs_type);
}

c 复制代码

int unregister_filesystem(struct file_system_type * fs)
{
	struct file_system_type ** tmp;

	write_lock(&file_systems_lock);
	tmp = &file_systems;
	while (*tmp) {
		if (fs == *tmp) {
			*tmp = fs->next;
			fs->next = NULL;
			write_unlock(&file_systems_lock);
			synchronize_rcu();
			return 0;
		}
		tmp = &(*tmp)->next;
	}
	write_unlock(&file_systems_lock);

	return -EINVAL;
}

5.2、文件系统的挂载与注册的关系

挂载前的注册：在文件系统被挂载之前，它必须先注册到内核中。只有注册过的文件系统类型才能被挂载。
挂载时的识别：当用户尝试挂载一个文件系统时（如通过 mount 命令），内核会遍历 file_systems 链表，查找匹配的文件系统类型。
动态加载：某些文件系统（如通过模块加载的文件系统）可以在运行时动态注册和注销。例如，ntfs 文件系统可以通过加载 ntfs.ko 模块动态注册。

6、实现一个虚拟文件系统

注：程序源码一样来自李文山的《Linux驱动开发进阶》：https://gitee.com/li-shan-asked/linux-advanced-development-code/blob/master/part1/memfs/memfs.c

但上述源码是基于6.1的kernel，我实验的环境是5.x的kernel，部分接口函数会不一样。所以下面展示的源码是略有修改的。

6.1、定义文件系统结构

一般需要自定义开发文件系统时，可能才需要编写文件系统驱动程序。下图是本次要实现的虚拟文件系统结构（图片来自作者李文山的《Linux驱动开发进阶》）：

6.2、完整的驱动程序

c 复制代码

#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <asm/atomic.h>
#include <asm/uaccess.h>
#include <linux/slab.h>

#define MEMFS_INVALID          0xFFFFFFFF
#define MEMFS_FILE_NAME_MAX    16
#define MEMFS_INODE_MAX        128
#define MEMFS_BLK_MAX          128
#define MEMFS_FILE_SIZE_MAX    1024

struct memfs_sb {
    uint32_t blk_size_bit;	// 块大小占用的位数
    uint32_t block_size;	// 数据块大小
    uint32_t magic;			// 文件系统魔数
    uint32_t private;		// 
};

struct memfs_inode {
    char file_name[MEMFS_FILE_NAME_MAX];	// 文件名称
    uint32_t mode;			// 记录文件或者文件夹的读写权限
    uint32_t idx; 			// 记录文件或者文件夹在inode bitmap的索引节点号
    uint32_t child; 		// 记录当前文件夹下的目录或者文件的第一个(=MEMFS_INVALID: no child)
    uint32_t brother; 		// 记录当前文件或目录的同级目录或者文件(=MEMFS_INVALID: no brother)
    uint32_t file_size;		// 
    uint32_t data; 			// =MEMFS_INVALID: dir    0~127: file
};

struct memfs_block {
	char data[1024];
};

struct memfs_sb g_mf_sb = {
    .blk_size_bit = 10,
    .block_size = 1024,
    .magic = 0x20221001,
};

char g_inode_bitmap[16]={0};	// 128bit空间的一个inode bitmap, 每个bit都是一个开关量，标记着inode池中对应的inode是否已使用
char g_block_bitmap[16]={0};	// 128bit空间的一个block bitmap

static struct memfs_inode *g_mf_inode;	// 指向inode池
static struct memfs_block *g_mf_block;	// 指向block池

static struct inode_operations memfs_inode_ops;

void set_bitmap(char *bitmap, uint32_t index)
{
    *(bitmap + (index>>3)) |= (1<< (index%8));     
}

void reset_bitmap(char *bitmap, uint32_t index)
{
    *(bitmap + (index>>3)) &= ~(1<< (index%8));
}

uint32_t get_idle_index(char *bitmap)
{
    uint8_t tmp;
	
    for(int i = 0; i < 16; i++)						// 循环检查16个字节
	{
        if(bitmap[i] != 0xFF) 						// 如果该字节(8个bit)存在空闲bit
		{
            tmp = bitmap[i];
            for(int j = 0; j<8; j++) 				// 逐位检查8个bit
			{
                if((tmp & 0x1) == 0) 				// 找到空闲bit
				{
                    set_bitmap(bitmap, i*8 + j);	// 设置改bit为非空闲
                    return i*8 + j;					// 返回下标
                }
                else 
				{
                    tmp >>= 1;						
                }
            }
        }
    }
	
    return MEMFS_INVALID;
}

void put_used_index(char *bitmap, uint32_t index) 
{
    reset_bitmap(bitmap, index);
}

int memfs_alloc_mem(void)
{
	/* 分配inode池，大小5120bytes */
	g_mf_inode = kzalloc(5120, GFP_KERNEL);
	if(!g_mf_inode)
		return -1;

	/* 分配block池，大小128KB */
	g_mf_block = kzalloc(128*1024, GFP_KERNEL);
	if(!g_mf_block)
		return -1;

	/* 初始化inode的brother和child属性 */
	for(int i = 0; i < MEMFS_INODE_MAX; i++)
	{
		g_mf_inode[i].brother = MEMFS_INVALID;
		g_mf_inode[i].child = MEMFS_INVALID;
	}

	return 0;
}

void memfs_free_mem(void)
{
	kfree(g_mf_inode);	
	kfree(g_mf_block);
}

static int memfs_readdir(struct file *filp, struct dir_context *ctx)
{
	struct memfs_inode *mf_inode, *child_inode;
    if (ctx->pos) 
        return 0;
    mf_inode = &g_mf_inode[filp->f_path.dentry->d_inode->i_ino];	
    if (!S_ISDIR(mf_inode->mode)) 
	{
        return -ENOTDIR;
    }
    if(mf_inode->child != MEMFS_INVALID) 
	{
        child_inode = &g_mf_inode[mf_inode->child];
    }
    else 
	{
        return 0;
    }
	
    while(child_inode->idx != MEMFS_INVALID) 
	{
        if (!dir_emit(ctx, child_inode->file_name, MEMFS_FILE_NAME_MAX, child_inode->idx, DT_UNKNOWN)) 
		{
            return 0;
        }
        ctx->pos += sizeof(struct memfs_inode);
        if(child_inode->brother != MEMFS_INVALID)
            child_inode = &g_mf_inode[child_inode->brother];
        else
            break;
    }
    return 0;
}

ssize_t memfs_read_file(struct file * filp, char __user * buf, size_t len, loff_t *ppos)
{
	struct memfs_inode *inode;
    char *buffer;
    inode = &g_mf_inode[filp->f_path.dentry->d_inode->i_ino];	// 获取实际要操作的inode
    if (*ppos >= inode->file_size)
        return 0;
    buffer = (char*)&g_mf_block[inode->data];					// 获取block池中对应位置的首地址
    buffer += *ppos;											// 
    len = min((size_t)(inode->file_size - *ppos), len);
    if (copy_to_user(buf, buffer, len)) 						// 拷贝到用户态
	{
        return -EFAULT;
    }
    *ppos += len;												// 更新偏移
    return len;
}

ssize_t memfs_write_file(struct file *filp, const char __user *buf, size_t len, loff_t *ppos)
{
	struct memfs_inode *inode;
    char *buffer;
    inode = &g_mf_inode[filp->f_path.dentry->d_inode->i_ino];	// 获取实际要操作的inode
    if (*ppos + len > MEMFS_FILE_SIZE_MAX )
        return 0;
    buffer = (char*)&g_mf_block[inode->data];					// 获取block池中对应位置的首地址
    buffer += *ppos;											// 
    if (copy_from_user(buffer, buf, len)) 
	{
        return -EFAULT;
    }
    *ppos += len;												// 更新偏移
    inode->file_size = *ppos;									// 更新文件大小
    return len;
}

const struct file_operations memfs_file_operations = {
    .read = memfs_read_file,
    .write = memfs_write_file,
};
 
const struct file_operations memfs_dir_operations = {
    .owner = THIS_MODULE,
    .iterate_shared = memfs_readdir,
};

//dir: 当前目录的inode
//dentry：要创建的文件的dentry
//mode：要创建的文件的mode
static int memfs_do_create(struct inode *dir, struct dentry *dentry, umode_t mode)
{
    struct inode *inode;
    struct super_block *sb;
    struct memfs_inode *mf_inode, *p_mf_inode, *tmp_mf_inode;
    uint32_t idx_inode;

	/* 获取sb指针 */
    sb = dir->i_sb;
	
    /* 判断是否是目录和常规文件，如果不是，返回错误 */
    if (!S_ISDIR(mode) && !S_ISREG(mode)) 
	{
        return -EINVAL;
    }
    if (strlen(dentry->d_name.name) > MEMFS_FILE_NAME_MAX) 
	{
        return -ENAMETOOLONG;
    }
    inode = new_inode(sb);
    if (!inode) 
	{
        return -ENOMEM;
    }
	
    /* 初始化现在要创建的inode的sb */
	idx_inode = get_idle_index(g_inode_bitmap);	// 获取一个空闲的inode，用于保存当前创建的目录或者文件的inode信息
    if (idx_inode == MEMFS_INVALID) 
	{
        return -ENOSPC;
    }
	
    inode->i_sb = sb;
    inode->i_op = &memfs_inode_ops; 			// 初始化当前的inode的ops
    inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode);	// 初始化创建时间和修改时间为当前时间
    inode->i_ino = idx_inode;
	
    mf_inode = &g_mf_inode[idx_inode];			// 
    mf_inode->idx = idx_inode;					// 
    mf_inode->mode = mode;
	
    /* 接下来都是inode的初始化 */
    if (S_ISDIR(mode)) 							// 如果创建的是一个文件，则分配一个block，如果是一个目录则不用分配block
	{
        mf_inode->data = MEMFS_INVALID;
        inode->i_fop = &memfs_dir_operations;
    } 
	else if (S_ISREG(mode)) 
	{
        mf_inode->child = MEMFS_INVALID;
        mf_inode->file_size = 0;
        inode->i_fop = &memfs_file_operations;
        mf_inode->data = get_idle_index(g_block_bitmap);
        if(mf_inode->data == MEMFS_INVALID) 
		{
            return -ENOSPC;
        }
    }
	
    p_mf_inode = &g_mf_inode[dir->i_ino];		// 获取当前新创建的父目录节点
    if(p_mf_inode->child == MEMFS_INVALID) 		// 当前目录为空目录
	{ 
        p_mf_inode->child = mf_inode->idx;		// 父目录节点的child直接指向现在要创建的inode
    }
    else 										// 非空目录，找到最后一个child
	{ 
        tmp_mf_inode = &g_mf_inode[p_mf_inode->child];			// 第一个child
        while(tmp_mf_inode->brother != MEMFS_INVALID) 
		{
            tmp_mf_inode = &g_mf_inode[tmp_mf_inode->brother];	// 找到父目录最后一个child
        }
        tmp_mf_inode->brother = mf_inode->idx;					// 最后一个child，并设置brother
    }
    
    strcpy(mf_inode->file_name, dentry->d_name.name);			// 初始化内核的dentry名称
    inode_init_owner(inode, dir, mode);							// 添加inode到dir中
    d_add(dentry, inode);										// 绑定内核dentry与inode
    
    return 0;
}

static int memfs_inode_mkdir(struct inode *dir, struct dentry *direntry, umode_t mode)
{
    return memfs_do_create(dir, direntry, S_IFDIR | mode);
}

static int memfs_inode_create(struct inode *dir, struct dentry *direntry, umode_t mode,
		bool excl)
{
    return memfs_do_create(dir, direntry, mode);
}

//parent_inode: 父目录节点
//find_dentry: 要查找的dentry
static struct dentry *memfs_inode_lookup(struct inode *parent_inode, struct dentry *find_dentry, unsigned int flags)
{
    return NULL;
}

//删除空目录
//dentry: 待删除空目录的dentry
int memfs_inode_rmdir(struct inode *dir, struct dentry *dentry)
{
    uint32_t index = dentry->d_inode->i_ino;					// 待删除的空目录的inode下标
    struct memfs_inode *p_mf_inode, *child_mf_inode, *tmp_mf_inode;
    
    if(g_mf_inode[index].child != MEMFS_INVALID)				// 如果是非空目录，返回错误
        return -ENOTEMPTY;
    p_mf_inode = &g_mf_inode[dir->i_ino];						// 获取当前目录inode
    
    child_mf_inode = &g_mf_inode[p_mf_inode->child];			// 获取父目录inode的第一个child
    put_used_index(g_inode_bitmap, index);
    if(p_mf_inode->child == index) 								// 如果要删除的空目录是父目录的第一个child
	{
        if(child_mf_inode->brother == MEMFS_INVALID)			// 如果当前node没有brother了
            p_mf_inode->child = MEMFS_INVALID;					// 那么父目录inode也不会有child
        else
            p_mf_inode->child = child_mf_inode->brother;		// 否则父目录inode指向当前inode的brother
    }
    else 														// 如果要删除的空目录不是父目录的第一个child
	{
        while(child_mf_inode->idx != MEMFS_INVALID) 
		{
            if(child_mf_inode->brother != MEMFS_INVALID) 
			{
                tmp_mf_inode = child_mf_inode;
                child_mf_inode = &g_mf_inode[child_mf_inode->brother];			// 获取brother
                if(child_mf_inode->idx == index) 								// 找到待删除的空目录了
				{	
                    if(child_mf_inode->brother != MEMFS_INVALID)				
                        tmp_mf_inode->brother = child_mf_inode->brother;		// 在链表关系中，移除了待删除的这个空目录
                    else
                        tmp_mf_inode->brother = MEMFS_INVALID;
                    break;
                }
            }
        }
    }
    g_mf_inode[index].idx = MEMFS_INVALID;
    g_mf_inode[index].brother = MEMFS_INVALID;
    return simple_unlink(dir, dentry);
}

//删除文件操作
int memfs_inode_unlink(struct inode *dir, struct dentry *dentry) 
{
    uint32_t index = dentry->d_inode->i_ino;
    struct memfs_inode *p_mf_inode, *child_mf_inode, *tmp_mf_inode;
    p_mf_inode = &g_mf_inode[dir->i_ino];
    //获取第一个child
    child_mf_inode = &g_mf_inode[p_mf_inode->child];
    put_used_index(g_inode_bitmap, index);
    put_used_index(g_block_bitmap, g_mf_inode[index].data);
    if(p_mf_inode->child == index) {
        if(child_mf_inode->brother == MEMFS_INVALID)
            p_mf_inode->child = MEMFS_INVALID;
        else
            p_mf_inode->child = child_mf_inode->brother;
    }
    else {
        while(child_mf_inode->idx != MEMFS_INVALID) {
            if(child_mf_inode->brother != MEMFS_INVALID) {
                tmp_mf_inode = child_mf_inode;
                child_mf_inode = &g_mf_inode[child_mf_inode->brother];
                if(child_mf_inode->idx == index) {
                    if(child_mf_inode->brother != MEMFS_INVALID)
                        tmp_mf_inode->brother = child_mf_inode->brother;
                    else
                        tmp_mf_inode->brother = MEMFS_INVALID;
                    break;
                }
            }
        }
    }
    g_mf_inode[index].idx = MEMFS_INVALID;
    g_mf_inode[index].brother = MEMFS_INVALID;
    return simple_unlink(dir, dentry);
}

static struct inode_operations memfs_inode_ops = {
    .create = memfs_inode_create,		// 在dentry下创建一个inode
    .lookup = memfs_inode_lookup,		// 在dentry下查找inode
    .mkdir = memfs_inode_mkdir,			// 在dentry下创建一个目录inode
    .rmdir = memfs_inode_rmdir,			// 在dentry下删除一个inode
    .unlink = memfs_inode_unlink,		// 删除文件
};

/* 每挂载一个块设备，都需要初始化相应的超级块，
 * 该函数就是初始化超级块。
 */
static int memfs_demo_fill_super(struct super_block *sb, void *data, int silent)
{
	struct inode *root_inode;
	int mode = S_IFDIR | 0755;
	root_inode = new_inode(sb);						// 新建inode，用于保存根节点
    root_inode->i_ino = 0; 							// 设置根节点的编号为0
    root_inode->i_mode = mode;						// 初始化根节点权限
    root_inode->i_sb = sb;							// 设置根节点的超级块
    root_inode->i_op = &memfs_inode_ops;			// 设置根节点的节点操作集合
    root_inode->i_fop = &memfs_dir_operations;		// 设置根节点目录操作集合
    root_inode->i_atime = root_inode->i_mtime = root_inode->i_ctime = current_time(root_inode); //设置根节点的创建修改时间为当前时间

	/* 初始化inode池中的第0个inode，第0个inode就是根节点 */
	strcpy(g_mf_inode[0].file_name, "memfs");		 
    g_mf_inode[0].mode = mode;
    g_mf_inode[0].idx = 0;
    g_mf_inode[0].child = MEMFS_INVALID;
    g_mf_inode[0].brother = MEMFS_INVALID;
	set_bitmap(g_inode_bitmap, g_mf_inode[0].idx);	// 置inode bitmap的第0位为1
	
	root_inode->i_private = &g_mf_inode[0]; 		// 将根节点保存到root_inode的私有数据中

	/* 初始化磁盘的描述信息(super block) */
    sb->s_root = d_make_root(root_inode);			// 设置上面分配的inode为根目录	
    sb->s_magic = g_mf_sb.magic;					// 文件系统魔数
    sb->s_blocksize_bits = g_mf_sb.blk_size_bit;	// 页大小所占的位数为12
    sb->s_blocksize = g_mf_sb.blk_size_bit;			// 页大小为1024Bytes

	return 0;
}

static struct dentry *memfs_demo_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data)
{
	return mount_nodev(fs_type, flags, data, memfs_demo_fill_super);
}

static void memfs_kill_sb(struct super_block *sb) 
{
	kill_anon_super(sb);
}

/* 文件系统类型 */
static struct file_system_type memfs_type = {
	.owner 		= THIS_MODULE,
	.name		= "memfs",				// 文件系统名字
	.mount      = memfs_demo_mount,		// 挂载文件系统时，所执行的函数
	.kill_sb	= memfs_kill_sb,		// 卸载文件系统时，所执行的函数
};

static int __init memfs_demo_init(void)
{
	/* 分配inode池和block池 */
	if(memfs_alloc_mem())
	{
		printk(KERN_ERR "alloc memory failed\n");
        return -ENOMEM;
	}
	/* 注册文件系统 */
	return register_filesystem(&memfs_type);
}

static void __exit memfs_demo_exit(void)
{
	/* 释放inode池和block池 */
	memfs_free_mem();
	/* 卸载文件系统 */
	unregister_filesystem(&memfs_type);
}

module_init(memfs_demo_init);
module_exit(memfs_demo_exit);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("1477153217@qq.com");
MODULE_DESCRIPTION("memory fs demo");
MODULE_ALIAS("fs:memfs");
MODULE_IMPORT_NS(VFS_internal_I_am_really_a_filesystem_and_am_NOT_a_driver);

Makefile如下：

c 复制代码

KERN_DIR = /home/cohen/sdk/docker/rk356x-sdk/kernel/

all:
	make -C $(KERN_DIR) M=$(PWD) modules

clean:
	make -C $(KERN_DIR) M=$(PWD) clean
	rm -f modules.order 

obj-m	+= my_memfs.o

ccflags-y += -std=gnu99

6.3、测试

将编译好的驱动程序拷贝到板卡，安装驱动：

shell 复制代码

insmod my_memfs.ko

此时，已经将一个名为memfs的虚拟文件系统注册进了内核。

挂载文件系统：

shell 复制代码

# -t 表示文件系统的类型
# none 表示没有IO设备
# /mnt 为挂载点
mount -t memfs none /mnt

进入/mnt，创建文件：

删除文件：