ext4 - mballoc块分配机制

概述

ext4为了尽量避免block管理的碎片化有如此措施:

1.mballoc多块分配器。

  • buddy算法管理每个block group
  • 采用prellocation机制,氛围per-cpu local preallocation和per inode preallocation
    • 小文件和大文件采用不同的策略
    • 小文件(具体怎么算小文件可配置)尽量保持在一起,默认应该是512 blocks的一块区域, 采用的是per_cpu locality group,为每个cpu都配置这么一块存放小文件的区域。
    • 大文件采用per-inode preallocation方式。
  • block分配时,会比请求的分配数量更多,多余的空间会放入preallocation space,这样给write多留些空间,避免concurrent write时候碎片化。
  • 计算目标goal phsycial block,尽量保持块分配的连续性。

2.delay allocation。

  • delay allocation可以尽可能将连续的申请组织成extent,配置mballoc一次分配连续的多个phsycial block,降低cpu使用率/碎片化。

3.data block优先和其inode在同一个block group中

4.磁盘分成128M的block group

5.同一个目录下的inode优先保存期该目录所在的block group(具体源码在哪里尚未找到,不太确认ext4是否实现)

6.defrag反碎片化工具。

ext4_mb_new_blocks

ext4 mballoc执行phsycial block分配的入口点是ext4_mb_new_blocks:

复制代码
/*
 * Main entry point into mballoc to allocate blocks
 * it tries to use preallocation first, then falls back
 * to usual allocation
 */
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
				struct ext4_allocation_request *ar, int *errp)
{
	struct ext4_allocation_context *ac = NULL;
	struct ext4_sb_info *sbi;
	struct super_block *sb;
	ext4_fsblk_t block = 0;
	unsigned int inquota = 0;
	unsigned int reserv_clstrs = 0;
	u64 seq;

	might_sleep();
	sb = ar->inode->i_sb;
	sbi = EXT4_SB(sb);

	trace_ext4_request_blocks(ar);
    .../*主要是检查是否有足够的空间满足分配*/
      
    
    //创建一个allocation context
	ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
	if (!ac) {
		ar->len = 0;
		*errp = -ENOMEM;
		goto out;
	}
    //初始化context
	*errp = ext4_mb_initialize_context(ac, ar);
	if (*errp) {
		ar->len = 0;
		goto out;
	}

	ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
	seq = this_cpu_read(discard_pa_seq);
    //优先使用prellcation space分配
	if (!ext4_mb_use_preallocated(ac)) {
		ac->ac_op = EXT4_MB_HISTORY_ALLOC;
        //所谓规范化本质是分配比请求量更大的空间
		ext4_mb_normalize_request(ac, ar);
        
        //初始化ac->pa
		*errp = ext4_mb_pa_alloc(ac);
		if (*errp)
			goto errout;
repeat:
		/* allocate space in core */
        //预分配失败,进入常规的分配逻辑
		*errp = ext4_mb_regular_allocator(ac);
		/*
		 * pa allocated above is added to grp->bb_prealloc_list only
		 * when we were able to allocate some block i.e. when
		 * ac->ac_status == AC_STATUS_FOUND.
		 * And error from above mean ac->ac_status != AC_STATUS_FOUND
		 * So we have to free this pa here itself.
		 */
		if (*errp) {
			ext4_mb_pa_free(ac);
			ext4_discard_allocated_blocks(ac);
			goto errout;
		}
		if (ac->ac_status == AC_STATUS_FOUND &&
			ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
			ext4_mb_pa_free(ac);
	}
	if (likely(ac->ac_status == AC_STATUS_FOUND)) {
		*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
		if (*errp) {
			ext4_discard_allocated_blocks(ac);
			goto errout;
		} else {
			block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
			ar->len = ac->ac_b_ex.fe_len;
		}
	} else {
		if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
			goto repeat;
		/*
		 * If block allocation fails then the pa allocated above
		 * needs to be freed here itself.
		 */
		ext4_mb_pa_free(ac);
		*errp = -ENOSPC;
	}
    ...

	return block;
}
ext4_allocation_context结构体
复制代码
struct ext4_allocation_context {
	struct inode *ac_inode;
	struct super_block *ac_sb;

	/* original request */
	struct ext4_free_extent ac_o_ex;

	/* goal request (normalized ac_o_ex) */
	struct ext4_free_extent ac_g_ex;

	/* the best found extent */
	struct ext4_free_extent ac_b_ex;

	/* copy of the best found extent taken before preallocation efforts */
	struct ext4_free_extent ac_f_ex;

	__u16 ac_groups_scanned;
	__u16 ac_found;
	__u16 ac_tail;
	__u16 ac_buddy;
	__u16 ac_flags;		/* allocation hints */
	__u8 ac_status;
	__u8 ac_criteria;
	__u8 ac_2order;		/* if request is to allocate 2^N blocks and
				 * N > 0, the field stores N, otherwise 0 */
	__u8 ac_op;		/* operation, for history only */
	struct page *ac_bitmap_page;
	struct page *ac_buddy_page;
	struct ext4_prealloc_space *ac_pa;
	struct ext4_locality_group *ac_lg;
};

上面注释写的非常清晰:

ac_o_ex: 原始请求

ac_g_ex :目标请求,可以跟ac_o_ex不同,比如如注释中说明,ac_g_ex是ac_o_ex经过normalized(对应mballoc::ext4_mb_normalize_request函数处理之后即为ac_g_ex)的结果,**ac_b_ex:**最终的分配结果,因为ac_g_ex未必能被100%满足

ac_f_ex: ac_b_ex的一份拷贝。

**ac_2order:**申请物理block数量如果正好是2的N次方,那么ac_2order = N,否则为0

ac_bitmap_page/ac_buddy_page : 跟mballoc相关的bit位信息,参考ext4 mballoc之buddy算法_nginux的博客-CSDN博客

**ac_pa:**per-inode预分配

**ac_lg:**per-cpu预分配,给小文件准备的。

ext4_mb_initialize_context函数
复制代码
static noinline_for_stack int
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
				struct ext4_allocation_request *ar)
{
	struct super_block *sb = ar->inode->i_sb;
	struct ext4_sb_info *sbi = EXT4_SB(sb);
	struct ext4_super_block *es = sbi->s_es;
	ext4_group_t group;
	unsigned int len;
	ext4_fsblk_t goal;
	ext4_grpblk_t block;

	/* we can't allocate > group size */
	len = ar->len;

	/* just a dirty hack to filter too big requests  */
	if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
		len = EXT4_CLUSTERS_PER_GROUP(sb);

	/* start searching from the goal */
	goal = ar->goal;
	if (goal < le32_to_cpu(es->s_first_data_block) ||
			goal >= ext4_blocks_count(es))
		goal = le32_to_cpu(es->s_first_data_block);
	ext4_get_group_no_and_offset(sb, goal, &group, &block);

	/* set up allocation goals */
	ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
	ac->ac_status = AC_STATUS_CONTINUE;
	ac->ac_sb = sb;
	ac->ac_inode = ar->inode;
	ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
	ac->ac_o_ex.fe_group = group;
	ac->ac_o_ex.fe_start = block;
	ac->ac_o_ex.fe_len = len;
    //可以看到normalized前ac_g_ex跟ac_o_ex相同
	ac->ac_g_ex = ac->ac_o_ex;
	ac->ac_flags = ar->flags;

	/* we have to define context: we'll work with a file or
	 * locality group. this is a policy, actually */
	ext4_mb_group_or_file(ac);

    ...
	return 0;

}

首先完成ac_o_ex和 ac_g_ex的赋值工作;然后,ext4_mb_group_or_file函数决定是一个文件到底是小文件和大文件,如概述中描述,ext4针对这两种文件策略不同。

ext4_mb_group_or_file函数
复制代码
/*
 * We use locality group preallocation for small size file. The size of the
 * file is determined by the current size or the resulting size after
 * allocation which ever is larger
 *
 * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
 */
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int bsbits = ac->ac_sb->s_blocksize_bits;
	loff_t size, isize;

	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return;

	if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
		return;

	size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
	isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
		>> bsbits;

	if ((size == isize) && !ext4_fs_is_busy(sbi) &&
	    !inode_is_open_for_write(ac->ac_inode)) {
		ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
		return;
	}
    //s_mb_group_prealloc是给小文分配的per-cpu local group空间大小,如果<=0
    //就设置EXT4_MB_STREAM_ALLOC,不适用小文件分配策略
	if (sbi->s_mb_group_prealloc <= 0) {
		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
		return;
	}

	/* don't use group allocation for large files */
    //s_mb_stream_request值来自于/sys/fs/ext4/xxx/mb_stream_req,文件大小大于了该值
    //为大文件,否则为小文件
	size = max(size, isize);
	if (size > sbi->s_mb_stream_request) {
		ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
		return;
	}

	BUG_ON(ac->ac_lg != NULL);
	/*
	 * locality group prealloc space are per cpu. The reason for having
	 * per cpu locality group is to reduce the contention between block
	 * request from multiple CPUs.
	 */
	ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);

	/* we're going to use group allocation */
    //如果进行到这里,说明是小文件
	ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;

	/* serialize all allocations in the group */
	mutex_lock(&ac->ac_lg->lg_mutex);
}
ext4_prealloc_space结构体
复制代码
struct ext4_prealloc_space {
    //如果是per-inode preallocation挂在ext4_inode_info的i_prealloc_list
    //如果是per_cpu locality group预分配空间挂在ext4_locality_group的lg_prealloc_list链表上
	struct list_head	pa_inode_list;

    //预分配空间同时也会挂在ext4_group_info的bb_prealloc_list链表上,
    //用于初始化buddy bitmap的之      前给block bitmap置上对应的已使用标记
	struct list_head	pa_group_list;
	union {
		struct list_head pa_tmp_list;
		struct rcu_head	pa_rcu;
	} u;
	spinlock_t		pa_lock;
	atomic_t		pa_count;
    //预分配空间是否已删除
	unsigned		pa_deleted;
    //起始物理块号
	ext4_fsblk_t		pa_pstart;	/* phys. block */
    //起始逻辑块号(相对于文件)
	ext4_lblk_t		pa_lstart;	/* log. block */
    //预分配空间长度(单位是block)
	ext4_grpblk_t		pa_len;		/* len of preallocated chunk */
    //空间的可用长度
	ext4_grpblk_t		pa_free;	/* how many blocks are free */
    //类型,indode or group
	unsigned short		pa_type;	/* pa type. inode or group */
	spinlock_t		*pa_obj_lock;
	struct inode		*pa_inode;	/* hack, for history only */
};
ext4_locality_group结构体
复制代码
/*
 * Locality group:
 *   we try to group all related changes together
 *   so that writeback can flush/allocate them together as well
 *   Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
 *   (512). We store prealloc space into the hash based on the pa_free blocks
 *   order value.ie, fls(pa_free)-1;
 */
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
	/* for allocator */
	/* to serialize allocates */
	struct mutex		lg_mutex;
	/* list of preallocations */
    // 挂ext4_prealloc_space的链表,按照预分配空间的可用长度进行分组
	struct list_head	lg_prealloc_list[PREALLOC_TB_SIZE];
	spinlock_t		lg_prealloc_lock;
};
ext4_mb_use_preallocated函数

函数判定能否使用preallocation space分配block,优先使用per-inode preallocation预分配空间;如果失败,再判断是否是小文件能使用per-cpu local group preallocation预分配空间;如果任何一个预分配空间分配成功,return true;否者return false代表无法使用预分配空间。

复制代码
/*
 * search goal blocks in preallocated space
 */
static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
	int order, i;
	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
	struct ext4_locality_group *lg;
	struct ext4_prealloc_space *pa, *cpa = NULL;
	ext4_fsblk_t goal_block;

	/* only data can be preallocated */
    //linux一切皆文件,只要普通文件才使用预分配,EXT4_MB_HINT_DATA是ext4_ext_map_blocks
    //中根据如下条件设置    if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA; 
	if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
		return false;

	/* first, try per-file preallocation */
	rcu_read_lock();
	list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {

		/* all fields in this condition don't change,
		 * so we can skip locking for them */
        //不在这个预分配空间范围内,跳到下一个预分配空间
		if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
		    ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
					       EXT4_C2B(sbi, pa->pa_len)))
			continue;

		/* non-extent files can't have physical blocks past 2^32 */
		if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
		    (pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
		     EXT4_MAX_BLOCK_FILE_PHYS))
			continue;

		/* found preallocated blocks, use them */
        //找到了合适的预分配空间
		spin_lock(&pa->pa_lock);
		if (pa->pa_deleted == 0 && pa->pa_free) {
			atomic_inc(&pa->pa_count);
			ext4_mb_use_inode_pa(ac, pa);
			spin_unlock(&pa->pa_lock);
			ac->ac_criteria = 10;
			rcu_read_unlock();
			return true;
		}
		spin_unlock(&pa->pa_lock);
	}
	rcu_read_unlock();
    //走到这里说明per-inode没有分配成功,需要判定能否是小文件走per-cpu local group分配
	/* can we use group allocation? */
	if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
		return false;

	/* inode may have no locality group for some reason */
	lg = ac->ac_lg;
	if (lg == NULL)
		return false;
	order  = fls(ac->ac_o_ex.fe_len) - 1;
	if (order > PREALLOC_TB_SIZE - 1)
		/* The max size of hash table is PREALLOC_TB_SIZE */
		order = PREALLOC_TB_SIZE - 1;

	goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
	/*
	 * search for the prealloc space that is having
	 * minimal distance from the goal block.
	 */
	for (i = order; i < PREALLOC_TB_SIZE; i++) {
		rcu_read_lock();
		list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
					pa_inode_list) {
			spin_lock(&pa->pa_lock);
			if (pa->pa_deleted == 0 &&
					pa->pa_free >= ac->ac_o_ex.fe_len) {

				cpa = ext4_mb_check_group_pa(goal_block,
								pa, cpa);
			}
			spin_unlock(&pa->pa_lock);
		}
		rcu_read_unlock();
	}
	if (cpa) {
        //小文件预分配空间分配成功
		ext4_mb_use_group_pa(ac, cpa);
		ac->ac_criteria = 20;
		return true;
	}
	return false;
}
ext4_mb_normalize_request

预分配空间分配失败就会进入ext4_mb_normalize_request,如代码注释所谓的normalize是考虑申请更合适的大小,一般会大于等于request size.

ext4_mb_regular_allocator

这个函数是mballoc buddy分配算法的核心函数,涉及的内容非常多,后面专门放到一篇文章分析

参考文章:

https://www.cnblogs.com/kanie/p/15359346.html

相关推荐
眠修11 分钟前
Kuberrnetes 服务发布
linux·运维·服务器
即将头秃的程序媛3 小时前
centos 7.9安装tomcat,并实现开机自启
linux·运维·centos
fangeqin3 小时前
ubuntu源码安装python3.13遇到Could not build the ssl module!解决方法
linux·python·ubuntu·openssl
爱奥尼欧5 小时前
【Linux 系统】基础IO——Linux中对文件的理解
linux·服务器·microsoft
超喜欢下雨天5 小时前
服务器安装 ros2时遇到底层库依赖冲突的问题
linux·运维·服务器·ros2
tan77º6 小时前
【Linux网络编程】网络基础
linux·服务器·网络
笑衬人心。6 小时前
Ubuntu 22.04 + MySQL 8 无密码登录问题与 root 密码重置指南
linux·mysql·ubuntu
chanalbert8 小时前
CentOS系统新手指导手册
linux·运维·centos
星宸追风8 小时前
Ubuntu更换Home目录所在硬盘的过程
linux·运维·ubuntu
热爱生活的猴子9 小时前
Poetry 在 Linux 和 Windows 系统中的安装步骤
linux·运维·windows