Linux:文件 mmap 读写流程简析

文章目录

  • [1. 前言](#1. 前言)
  • [2. 文件 mmap 读写流程](#2. 文件 mmap 读写流程)
    • [2.1 分配 mmap 映射虚拟地址区间](#2.1 分配 mmap 映射虚拟地址区间)
    • [2.2 页表 和 page cache 分配](#2.2 页表 和 page cache 分配)
  • [3. 推荐阅读](#3. 推荐阅读)

1. 前言

限于作者能力水平,本文可能存在谬误,因此而给读者带来的损失,作者不做任何承诺。

2. 文件 mmap 读写流程

文件 read() / write() 操作,需要在用户空间缓冲区和内核空间文件 page cache 之间进行数据拷贝:

bash 复制代码
         读/写
用户缓冲 <----> page cache <-----> 磁盘
         拷贝

而通过文件的 mmap() 映射地址对文件进行读写操作,是直接作用于内核空间文件 page cache:

bash 复制代码
         读/写
映射地址 <----> page cache <-----> 磁盘

这是文件的 mmap() 读写操作在较大数据量读写场合上性能优于普通 read() / write() 操作的原因。当然,也不是所有的场合 mmap() 的性能都优于 read() / write(),譬如小文件的场景。

同时,在 32 位架构下,由于(1G/3G 布局下)进程地址空间中只有不到 2G 的空间可用于 mmap(),所以能映射的文件大小也受到了限制。下图是一个 32 位架构下,典型的 1G/3G 布局下的进程地址空间分布:

bash 复制代码
0xFFFF_FFFF  -------------  \
            |   Kernel    |  |
            |             |   } 1
            |   Space     |  |  G
            |-------------| /
            |   modules   | \
            |-------------| |
            |             | |
            |   STACK     | |
            |             | |
            |-------------| |
            |             | |
            |             |  } 2G
            |    MMAP     | |
            |             | |
            |             | |
            |-------------| |
            |             | |
            |     HEAP    | |
            |             | |
            |-------------|/
            |   PROGRAM   |\
            |  CODE/DATA  | |
            |-------------|  } 1G
            |   reserved  | |
0x0000_0000  ------------- /

不同于 32 位系统,64 位系统下可用的 mmap 虚拟地址空间很大,大概接近 128T

另外,和普通的 read() / write() 一样,mmap() 写入的数据仍然先落在 page cache,而不是直接落入磁盘,可以在调用 munmap() 之前,调用 msync() 来将数据同步到磁盘。

文件的 mmap() 读写操作,主要分为如下 2 步:

bash 复制代码
1. 从进程虚拟地址空间(mmap 区间),找出一段可以建立映射的空间,然后用 vm_area_struct 数据记录相关信息,并返
   回映射区间虚拟地址到用户空间;
2. 用户空间进行读写,将产生 page fault,在 page fault 处理中为映射区间建立页表,并分配文件的 page cache 
   页面,然后用户空间的读写将落到 page cache 页面中。

也可以通过 MAP_POPULATE 标志位在第 1 步就进行页表和 page cache 页面的预分配。

下面分别分析上面 2 步的代码实现细节。

2.1 分配 mmap 映射虚拟地址区间

从系统调用 mmap() 开始:

c 复制代码
/* mm/mmap.c */

/*
 * mmap() system call entry: resolve @fd to a file object (for file
 * mappings), then delegate to vm_mmap_pgoff() to create the mapping.
 */
SYSCALL_DEFINE6(mmap_pgoff, unsigned long, addr, unsigned long, len,
		unsigned long, prot, unsigned long, flags,
		unsigned long, fd, unsigned long, pgoff)
{
	if (!(flags & MAP_ANONYMOUS)) { /* not anonymous, i.e. a file-backed mapping */
		...
		file = fget(fd); /* look up the file object @file for @fd */
		if (!file) /* no file object found for @fd */
			return -EBADF;
		...
	} else if (flags & MAP_HUGETLB) {
		...
	}

	flags &= ~(MAP_EXECUTABLE | MAP_DENYWRITE);

	retval = vm_mmap_pgoff(file, addr, len, prot, flags, pgoff);
	...
	return retval; /* return the start address of the mmap region, or an error code */
}
c 复制代码
/* mm/util.c */

/*
 * Perform the mmap under the process's mmap_sem write lock; optionally
 * pre-populate page tables and pages if MAP_POPULATE was requested.
 */
unsigned long vm_mmap_pgoff(struct file *file, unsigned long addr,
	unsigned long len, unsigned long prot,
	unsigned long flag, unsigned long pgoff)
{
	unsigned long ret;
	struct mm_struct *mm = current->mm; /* address-space descriptor of the current process */
	unsigned long populate;
	LIST_HEAD(uf);

	ret = security_mmap_file(file, prot, flag);
	if (!ret) {
		if (down_write_killable(&mm->mmap_sem)) /* lock the current process's address space */
			return -EINTR;
		ret = do_mmap_pgoff(file, addr, len, prot, flag, pgoff,
				    &populate, &uf);
		up_write(&mm->mmap_sem); /* unlock the current process's address space */
		userfaultfd_unmap_complete(mm, &uf);
		if (populate) /* (flags & (MAP_POPULATE | MAP_NONBLOCK)) == MAP_POPULATE) */
			mm_populate(ret, populate); /* build page tables and allocate physical pages before returning */
	}
	return ret;
}
c 复制代码
do_mmap_pgoff()
	do_mmap()
c 复制代码
/* mm/mmap.c */

/*
 * The caller must hold down_write(&current->mm->mmap_sem).
 */
unsigned long do_mmap(struct file *file, unsigned long addr,
			unsigned long len, unsigned long prot,
			unsigned long flags, vm_flags_t vm_flags,
			unsigned long pgoff, unsigned long *populate,
			struct list_head *uf)
{
	struct mm_struct *mm = current->mm;
	int pkey = 0;
	
	...
	
	/* Obtain the address to map to. we verify (or select) it and ensure
	 * that it represents a valid section of the address space.
	 */
	/* find a free virtual address range in the process's mmap area */
	addr = get_unmapped_area(file, addr, len, pgoff, flags);

	...

	addr = mmap_region(file, addr, len, vm_flags, pgoff, uf);
	...
	return addr;
}

/*
 * Create and initialize the vm_area_struct describing the new mapping,
 * invoke the filesystem's ->mmap() for file mappings, and link the VMA
 * into the process's address space.
 */
unsigned long mmap_region(struct file *file, unsigned long addr,
		unsigned long len, vm_flags_t vm_flags, unsigned long pgoff,
		struct list_head *uf)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma, *prev;
	int error;
	struct rb_node **rb_link, *rb_parent;
	unsigned long charged = 0;

	...

	/* allocate a new vm_area_struct to describe the mmap region */
	vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
	if (!vma) {
		error = -ENOMEM;
		goto unacct_error;
	}

	vma->vm_mm = mm;
	vma->vm_start = addr;
	vma->vm_end = addr + len;
	vma->vm_flags = vm_flags;
	vma->vm_page_prot = vm_get_page_prot(vm_flags);
	vma->vm_pgoff = pgoff;
	INIT_LIST_HEAD(&vma->anon_vma_chain);

	if (file) { /* file-backed mmap mapping */
		...
		/* ->mmap() can change vma->vm_file, but must guarantee that
		 * vma_link() below can deny write-access if VM_DENYWRITE is set
		 * and map writably if VM_SHARED is set. This usually means the
		 * new file must not have been exposed to user-space, yet.
		 */
		vma->vm_file = get_file(file);
		error = call_mmap(file, vma); /* filesystem-specific mmap operation */
		...
	
		addr = vma->vm_start; /* the virtual address of the allocated mmap region */
		...
	} else if (vm_flags & VM_SHARED) {
		...
	}
	
	/* insert the new VMA into the process address space's VMA red-black tree */
	vma_link(mm, vma, prev, rb_link, rb_parent);

	...

	return addr; /* return the virtual address of the allocated mmap region */

	...
}
c 复制代码
/* include/ */

/* invoke the filesystem-specific mmap operation */
static inline int call_mmap(struct file *file, struct vm_area_struct *vma)
{
	/*
	 * ext4: fs/ext4/file.c, ext4_file_mmap()
	 * ...
	 */
	return file->f_op->mmap(file, vma);
}

每个文件系统的 mmap 操作不同,这里以 ext4 文件系统为例进行分析:

c 复制代码
/* fs/ext4/file.c */

/*
 * ext4's ->mmap(): install the ext4 vm_ops on the VMA so that later
 * page faults on the mapping are handled by ext4 (ext4_filemap_fault()).
 */
static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct inode *inode = file->f_mapping->host;

	if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
		return -EIO;

	file_accessed(file);
	if (IS_DAX(file_inode(file))) {
		vma->vm_ops = &ext4_dax_vm_ops;
		vma->vm_flags |= VM_MIXEDMAP | VM_HUGEPAGE;
	} else {
		vma->vm_ops = &ext4_file_vm_ops;
	}
	return 0;
}

到此,mmap 的第 1 步分配 mmap 虚拟地址空间已经完成,接下来看读写时 page fault 中 mmap 相关的处理细节。

2.2 页表 和 page cache 分配

当对 未建立页表 和 未分配 page cache 的文件 mmap 地址进行读写时,将产生 page fault,这里以 ARM32 的处理流程为例进行分析:

c 复制代码
@ arch/arm/kernel/entry-armv.S

vector_stub	dabt, ABT_MODE, 8
	.long	__dabt_usr			@  0  (USR_26 / USR_32)
	...
	.long	__dabt_svc			@  3  (SVC_26 / SVC_32)
	...
	.globl	vector_fiq

	/* exception vector table entries for each CPU mode */
	.section .vectors, "ax", %progbits
.L__vectors_start:
	...
	W(b)	vector_dabt /* vector entry for Data Abort exceptions */
	...

	/* a page fault (Data Abort) can be raised in both SVC and user mode */
__dabt_usr: /* user-mode page-fault (Data Abort) entry */
	...
	dabt_helper // bl	CPU_DABORT_HANDLER -> bl v7_early_abort
	...

  .align	5
__dabt_svc: /* SVC-mode page-fault (Data Abort) entry */
	...
	dabt_helper // bl	CPU_DABORT_HANDLER  -> bl v7_early_abort
	...
c 复制代码
@ arch/arm/mm/abort-ev7.S

.align	5
/* ARMv7 data-abort entry: read fault status/address registers, then dispatch */
ENTRY(v7_early_abort)
	mrc	p15, 0, r1, c5, c0, 0		@ get FSR
	mrc	p15, 0, r0, c6, c0, 0		@ get FAR
	uaccess_disable ip			@ disable userspace access
	...
	b	do_DataAbort
ENDPROC(v7_early_abort)
c 复制代码
/*
 * 以 3 级页表举例。
 * arch/arm/mm/fsr-3level.c 
 */
static struct fsr_info fsr_info[] = {
	...
	/* page-fault handler entries, indexed by fault status (FSR) */
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 1 translation fault"	}, /* level-1 page-directory translation fault */
	{ do_translation_fault,	SIGSEGV, SEGV_MAPERR,	"level 2 translation fault"	}, /* level-2 page-directory translation fault */
	{ do_page_fault,	SIGSEGV, SEGV_MAPERR,	"level 3 translation fault"	}, /* level-3 page-table-entry translation fault */
	...
};

/*
 * arch/arm/mm/fault.c 
 */

/* C-level Data Abort handler: dispatch on the fault status register */
asmlinkage void __exception
do_DataAbort(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
	const struct fsr_info *inf = fsr_info + fsr_fs(fsr);
	struct siginfo info;

	/* call the entry for this fault type: do_translation_fault() or do_page_fault() */
	if (!inf->fn(addr, fsr & ~FSR_LNX_PF, regs))
		return;
	
	...
}

#ifdef CONFIG_MMU
static int __kprobes
do_translation_fault(unsigned long addr, unsigned int fsr,
		     struct pt_regs *regs)
{
	...

	if (addr < TASK_SIZE) /* user-space address */
		return do_page_fault(addr, fsr, regs);

	...
}

static int __kprobes
do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs)
{
	struct task_struct *tsk;
	struct mm_struct *mm;

	...

	tsk = current;
	mm  = tsk->mm;

	/* Enable interrupts if they were enabled in the parent context. */
	if (interrupts_enabled(regs))
		local_irq_enable();

	...

	if (fsr & FSR_WRITE) /* Data Abort caused by a write (e.g. mmap file write, COW: Copy-On-Write) */
		flags |= FAULT_FLAG_WRITE;

	...

	fault = __do_page_fault(mm, addr, fsr, flags, tsk);

	...
}

static int __kprobes
__do_page_fault(struct mm_struct *mm, unsigned long addr, unsigned int fsr,
		unsigned int flags, struct task_struct *tsk)
{
	struct vm_area_struct *vma;
	int fault;

	/*
	 * Is there a VMA covering @addr?
	 * If not, this is an invalid address access.
	 */
	vma = find_vma(mm, addr);
	fault = VM_FAULT_BADMAP;
	if (unlikely(!vma))
		goto out;
	...

	/*
	 * Ok, we have a good vm_area for this
	 * memory access, so we can handle it.
	 */
	/* check whether this access (read/write/execute) to @addr is permitted */ 
good_area:
	if (access_error(fsr, vma)) {
		fault = VM_FAULT_BADACCESS;
		goto out;
	}

	return handle_mm_fault(vma, addr & PAGE_MASK, flags);

check_stack:
	...
out:
	...
}
c 复制代码
/*
 * mm/memory.c
 */

/* build the page tables and allocate the page cache page */
int handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	int ret;

	...

	if (unlikely(is_vm_hugetlb_page(vma)))
		...
	else
		ret = __handle_mm_fault(vma, address, flags);

	...

	return ret;
}

/*
 * Walk (and allocate as needed) the upper page-table levels
 * (PGD -> P4D -> PUD -> PMD) for @address, then handle the PTE level.
 */
static int __handle_mm_fault(struct vm_area_struct *vma, unsigned long address,
		unsigned int flags)
{
	struct vm_fault vmf = {
		.vma = vma,
		.address = address & PAGE_MASK,
		.flags = flags,
		.pgoff = linear_page_index(vma, address),
		.gfp_mask = __get_fault_gfp_mask(vma),
	};
	unsigned int dirty = flags & FAULT_FLAG_WRITE;
	struct mm_struct *mm = vma->vm_mm;
	pgd_t *pgd;
	p4d_t *p4d;
	int ret;

	pgd = pgd_offset(mm, address);
	p4d = p4d_alloc(mm, pgd, address);
	if (!p4d)
		return VM_FAULT_OOM;

	vmf.pud = pud_alloc(mm, p4d, address);
	if (!vmf.pud)
		return VM_FAULT_OOM;
	...

	vmf.pmd = pmd_alloc(mm, vmf.pud, address);
	if (!vmf.pmd)
		return VM_FAULT_OOM;
	...

	return handle_pte_fault(&vmf);
}

static int handle_pte_fault(struct vm_fault *vmf)
{
	pte_t entry;

	if (unlikely(pmd_none(*vmf->pmd))) {
		/*
		 * Leave __pte_alloc() until later: because vm_ops->fault may
		 * want to allocate huge page, and if we expose page table
		 * for an instant, it will be difficult to retract from
		 * concurrent faults and from rmap lookups.
		 */
		vmf->pte = NULL;
	} else {
		...
	}

	if (!vmf->pte) {
		if (vma_is_anonymous(vmf->vma))
			return do_anonymous_page(vmf); /* anonymous mapping */
		else
			return do_fault(vmf); /* our scenario: file-backed mmap mapping */
	}

	...
}

/* dispatch a file-backed fault: read fault, COW fault, or shared-write fault */
static int do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	/*
	 * The VMA was not fully populated on mmap() or missing VM_DONTEXPAND
	 */
	if (!vma->vm_ops->fault) {
		...
	} else if (!(vmf->flags & FAULT_FLAG_WRITE))
		ret = do_read_fault(vmf);
	else if (!(vma->vm_flags & VM_SHARED))
		ret = do_cow_fault(vmf); /* COW fault */
	else
		ret = do_shared_fault(vmf);

	/* preallocated pagetable is unused: free it */
	if (vmf->prealloc_pte) {
		pte_free(vma->vm_mm, vmf->prealloc_pte);
		vmf->prealloc_pte = NULL;
	}
	return ret;
}

static int do_read_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret = 0;

	...

	/*
	 * mmap case:
	 * allocate the page cache page, read the file data into it,
	 * then return the page cache page through @vmf.
	 */
	ret = __do_fault(vmf);
	...
	
	/*
	 * Install the new page into the PTE entry of @vmf's virtual
	 * address range, i.e. map the faulting virtual address to the page.
	 */
	ret |= finish_fault(vmf);
	...
	
	return ret;
}

/* call the VMA's fault handler installed at mmap() time */
static int __do_fault(struct vm_fault *vmf)
{
	struct vm_area_struct *vma = vmf->vma;
	int ret;

	...

	ret = vma->vm_ops->fault(vmf); /* ext4: ext4_filemap_fault() */

	...

	return ret;
}
c 复制代码
/* fs/ext4/inode.c */

/* ext4 fault handler: delegate to the generic filemap_fault() under i_mmap_sem */
int ext4_filemap_fault(struct vm_fault *vmf)
{
	struct inode *inode = file_inode(vmf->vma->vm_file);
	int err;

	down_read(&EXT4_I(inode)->i_mmap_sem);
	err = filemap_fault(vmf);
	up_read(&EXT4_I(inode)->i_mmap_sem);

	return err;
}
c 复制代码
/* mm/filemap.c */

/**
 * filemap_fault - read in file data for page fault handling
 * @vmf:	struct vm_fault containing details of the fault
 * ......
 */
int filemap_fault(struct vm_fault *vmf)
{
	int error;
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	struct file_ra_state *ra = &file->f_ra;
	struct inode *inode = mapping->host;
	pgoff_t offset = vmf->pgoff;
	pgoff_t max_off;
	struct page *page;
	int ret = 0;

	...

	/*
	 * Do we have something in the page cache already?
	 */
	page = find_get_page(mapping, offset);
	if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) { /* found the page cache page of @file */
		/*
		 * We found the page, so try async readahead before
		 * waiting for the lock.
		 */
		do_async_mmap_readahead(vmf->vma, ra, file, page, offset);
	} else if (!page) { /* no page cache page for @file: allocate one and read ahead the contents at @offset into the page cache */
		/* No page in the page cache at all */
		do_sync_mmap_readahead(vmf->vma, ra, file, offset);
		count_vm_event(PGMAJFAULT);
		count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT);
		ret = VM_FAULT_MAJOR;
retry_find:
		page = find_get_page(mapping, offset);
		if (!page)
			goto no_cached_page;
	}

	...

	/*
	 * Return the page cache page.
	 * Later, finish_fault() installs it into the PTE entry of the
	 * mmap'ed virtual address range, i.e. maps the mmap'ed range
	 * to the page cache page.
	 */
	vmf->page = page;
	return ret | VM_FAULT_LOCKED;

no_cached_page:
	/*
	 * We're only likely to ever get here if MADV_RANDOM is in
	 * effect.
	 */
	error = page_cache_read(file, offset, vmf->gfp_mask);
	...
}

/**
 * page_cache_read - adds requested page to the page cache if not already there
 * @file:	file to read
 * @offset:	page index
 * @gfp_mask:	memory allocation flags
 *
 * This adds the requested page to the page cache if it isn't already there,
 * and schedules an I/O to read in its contents from disk.
 */
static int page_cache_read(struct file *file, pgoff_t offset, gfp_t gfp_mask)
{
	struct address_space *mapping = file->f_mapping;
	struct page *page;
	int ret;

	do {
		/* allocate a page cache page */
		page = __page_cache_alloc(gfp_mask|__GFP_COLD);
		if (!page)
			return -ENOMEM;

		/* add the new page cache page to the LRU list of address space @mapping */
		ret = add_to_page_cache_lru(page, mapping, offset, gfp_mask);
		if (ret == 0)
			/*
			 * issue a read from disk into the page cache
			 * ext4: ext4_readpage()
			 * ...
			 */
			ret = mapping->a_ops->readpage(file, page);
		else if (ret == -EEXIST)
			ret = 0; /* losing race to add is OK */

		put_page(page);

	} while (ret == AOP_TRUNCATED_PAGE);

	return ret;
}

最后,finish_fault() 将 mmap 虚拟地址区间映射到分配的 page cache 页面:

c 复制代码
/* install the faulted-in page into the PTE for the mmap'ed virtual address */
int finish_fault(struct vm_fault *vmf)
{
	struct page *page;
	int ret = 0;
	
	/* Did we COW the page? */
	if ((vmf->flags & FAULT_FLAG_WRITE) &&
	    !(vmf->vma->vm_flags & VM_SHARED))
		page = vmf->cow_page;
	else
		page = vmf->page;

	/*
	 * check even for read faults because we might have lost our CoWed
	 * page
	 */
	if (!(vmf->vma->vm_flags & VM_SHARED))
		ret = check_stable_address_space(vmf->vma->vm_mm);
	if (!ret)
		ret = alloc_set_pte(vmf, vmf->memcg, page); /* map the PTE entry to @page */
	if (vmf->pte)
		pte_unmap_unlock(vmf->pte, vmf->ptl);
	return ret;
}

到此,mmap 读写操作 page fault 过程中建立 mmap 映射区间页表以及 page cache 的过程已经分析完毕。

3. 推荐阅读

从内核世界透视 mmap 内存映射的本质(源码实现篇)

相关推荐
洋芋土豆3 小时前
linux用户及权限管理
linux·运维·服务器
wheeldown4 小时前
【Linux】Linux内存管理与线程控制核心解析
linux·运维·服务器
努力努力再努力wz4 小时前
【Linux进阶系列】:线程(下)
linux·运维·服务器·c语言·数据结构·c++·算法
LCG元5 小时前
Linux 防火墙双雄:iptables 与 firewalld 配置案例详解
linux
KV_T5 小时前
centos运维常用命令
linux·运维·centos
dessler5 小时前
MYSQL-主键(Primary Key)
linux·运维·mysql
LCG元5 小时前
Nginx 配置入门到实战:从静态网站到负载均衡
linux
代码程序猿RIP5 小时前
【Linux面经】OSI七层模型和TCP/IP四层体系结构
linux·网络·面试·面经
我什么都学不会5 小时前
DNS主从服务器练习
linux·运维·服务器