概述
ext4为了尽量避免block管理的碎片化有如此措施:
1.mballoc多块分配器。
- buddy算法管理每个block group
- 采用prellocation机制,氛围per-cpu local preallocation和per inode preallocation
- 小文件和大文件采用不同的策略
- 小文件(具体怎么算小文件可配置)尽量保持在一起,默认应该是512 blocks的一块区域, 采用的是per_cpu locality group,为每个cpu都配置这么一块存放小文件的区域。
- 大文件采用per-inode preallocation方式。
- block分配时,会比请求的分配数量更多,多余的空间会放入preallocation space,这样给write多留些空间,避免concurrent write时候碎片化。
- 计算目标goal phsycial block,尽量保持块分配的连续性。
2.delay allocation。
- delay allocation可以尽可能将连续的申请组织成extent,配置mballoc一次分配连续的多个phsycial block,降低cpu使用率/碎片化。
3.data block优先和其inode在同一个block group中
4.磁盘分成128M的block group
5.同一个目录下的inode优先保存期该目录所在的block group(具体源码在哪里尚未找到,不太确认ext4是否实现)
6.defrag反碎片化工具。
ext4_mb_new_blocks
ext4 mballoc执行phsycial block分配的入口点是ext4_mb_new_blocks:
/*
* Main entry point into mballoc to allocate blocks
* it tries to use preallocation first, then falls back
* to usual allocation
*/
ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle,
struct ext4_allocation_request *ar, int *errp)
{
struct ext4_allocation_context *ac = NULL;
struct ext4_sb_info *sbi;
struct super_block *sb;
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
u64 seq;
might_sleep();
sb = ar->inode->i_sb;
sbi = EXT4_SB(sb);
trace_ext4_request_blocks(ar);
.../*主要是检查是否有足够的空间满足分配*/
//创建一个allocation context
ac = kmem_cache_zalloc(ext4_ac_cachep, GFP_NOFS);
if (!ac) {
ar->len = 0;
*errp = -ENOMEM;
goto out;
}
//初始化context
*errp = ext4_mb_initialize_context(ac, ar);
if (*errp) {
ar->len = 0;
goto out;
}
ac->ac_op = EXT4_MB_HISTORY_PREALLOC;
seq = this_cpu_read(discard_pa_seq);
//优先使用prellcation space分配
if (!ext4_mb_use_preallocated(ac)) {
ac->ac_op = EXT4_MB_HISTORY_ALLOC;
//所谓规范化本质是分配比请求量更大的空间
ext4_mb_normalize_request(ac, ar);
//初始化ac->pa
*errp = ext4_mb_pa_alloc(ac);
if (*errp)
goto errout;
repeat:
/* allocate space in core */
//预分配失败,进入常规的分配逻辑
*errp = ext4_mb_regular_allocator(ac);
/*
* pa allocated above is added to grp->bb_prealloc_list only
* when we were able to allocate some block i.e. when
* ac->ac_status == AC_STATUS_FOUND.
* And error from above mean ac->ac_status != AC_STATUS_FOUND
* So we have to free this pa here itself.
*/
if (*errp) {
ext4_mb_pa_free(ac);
ext4_discard_allocated_blocks(ac);
goto errout;
}
if (ac->ac_status == AC_STATUS_FOUND &&
ac->ac_o_ex.fe_len >= ac->ac_f_ex.fe_len)
ext4_mb_pa_free(ac);
}
if (likely(ac->ac_status == AC_STATUS_FOUND)) {
*errp = ext4_mb_mark_diskspace_used(ac, handle, reserv_clstrs);
if (*errp) {
ext4_discard_allocated_blocks(ac);
goto errout;
} else {
block = ext4_grp_offs_to_block(sb, &ac->ac_b_ex);
ar->len = ac->ac_b_ex.fe_len;
}
} else {
if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
goto repeat;
/*
* If block allocation fails then the pa allocated above
* needs to be freed here itself.
*/
ext4_mb_pa_free(ac);
*errp = -ENOSPC;
}
...
return block;
}
ext4_allocation_context结构体
struct ext4_allocation_context {
struct inode *ac_inode;
struct super_block *ac_sb;
/* original request */
struct ext4_free_extent ac_o_ex;
/* goal request (normalized ac_o_ex) */
struct ext4_free_extent ac_g_ex;
/* the best found extent */
struct ext4_free_extent ac_b_ex;
/* copy of the best found extent taken before preallocation efforts */
struct ext4_free_extent ac_f_ex;
__u16 ac_groups_scanned;
__u16 ac_found;
__u16 ac_tail;
__u16 ac_buddy;
__u16 ac_flags; /* allocation hints */
__u8 ac_status;
__u8 ac_criteria;
__u8 ac_2order; /* if request is to allocate 2^N blocks and
* N > 0, the field stores N, otherwise 0 */
__u8 ac_op; /* operation, for history only */
struct page *ac_bitmap_page;
struct page *ac_buddy_page;
struct ext4_prealloc_space *ac_pa;
struct ext4_locality_group *ac_lg;
};
上面注释写的非常清晰:
ac_o_ex: 原始请求
ac_g_ex :目标请求,可以跟ac_o_ex不同,比如如注释中说明,ac_g_ex是ac_o_ex经过normalized(对应mballoc::ext4_mb_normalize_request函数处理之后即为ac_g_ex)的结果,**ac_b_ex:**最终的分配结果,因为ac_g_ex未必能被100%满足
ac_f_ex: ac_b_ex的一份拷贝。
**ac_2order:**申请物理block数量如果正好是2的N次方,那么ac_2order = N,否则为0
ac_bitmap_page/ac_buddy_page : 跟mballoc相关的bit位信息,参考ext4 mballoc之buddy算法_nginux的博客-CSDN博客
**ac_pa:**per-inode预分配
**ac_lg:**per-cpu预分配,给小文件准备的。
ext4_mb_initialize_context函数
static noinline_for_stack int
ext4_mb_initialize_context(struct ext4_allocation_context *ac,
struct ext4_allocation_request *ar)
{
struct super_block *sb = ar->inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_super_block *es = sbi->s_es;
ext4_group_t group;
unsigned int len;
ext4_fsblk_t goal;
ext4_grpblk_t block;
/* we can't allocate > group size */
len = ar->len;
/* just a dirty hack to filter too big requests */
if (len >= EXT4_CLUSTERS_PER_GROUP(sb))
len = EXT4_CLUSTERS_PER_GROUP(sb);
/* start searching from the goal */
goal = ar->goal;
if (goal < le32_to_cpu(es->s_first_data_block) ||
goal >= ext4_blocks_count(es))
goal = le32_to_cpu(es->s_first_data_block);
ext4_get_group_no_and_offset(sb, goal, &group, &block);
/* set up allocation goals */
ac->ac_b_ex.fe_logical = EXT4_LBLK_CMASK(sbi, ar->logical);
ac->ac_status = AC_STATUS_CONTINUE;
ac->ac_sb = sb;
ac->ac_inode = ar->inode;
ac->ac_o_ex.fe_logical = ac->ac_b_ex.fe_logical;
ac->ac_o_ex.fe_group = group;
ac->ac_o_ex.fe_start = block;
ac->ac_o_ex.fe_len = len;
//可以看到normalized前ac_g_ex跟ac_o_ex相同
ac->ac_g_ex = ac->ac_o_ex;
ac->ac_flags = ar->flags;
/* we have to define context: we'll work with a file or
* locality group. this is a policy, actually */
ext4_mb_group_or_file(ac);
...
return 0;
}
首先完成ac_o_ex和 ac_g_ex的赋值工作;然后,ext4_mb_group_or_file函数决定是一个文件到底是小文件和大文件,如概述中描述,ext4针对这两种文件策略不同。
ext4_mb_group_or_file函数
/*
* We use locality group preallocation for small size file. The size of the
* file is determined by the current size or the resulting size after
* allocation which ever is larger
*
* One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
*/
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int bsbits = ac->ac_sb->s_blocksize_bits;
loff_t size, isize;
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return;
if (unlikely(ac->ac_flags & EXT4_MB_HINT_GOAL_ONLY))
return;
size = ac->ac_o_ex.fe_logical + EXT4_C2B(sbi, ac->ac_o_ex.fe_len);
isize = (i_size_read(ac->ac_inode) + ac->ac_sb->s_blocksize - 1)
>> bsbits;
if ((size == isize) && !ext4_fs_is_busy(sbi) &&
!inode_is_open_for_write(ac->ac_inode)) {
ac->ac_flags |= EXT4_MB_HINT_NOPREALLOC;
return;
}
//s_mb_group_prealloc是给小文分配的per-cpu local group空间大小,如果<=0
//就设置EXT4_MB_STREAM_ALLOC,不适用小文件分配策略
if (sbi->s_mb_group_prealloc <= 0) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
}
/* don't use group allocation for large files */
//s_mb_stream_request值来自于/sys/fs/ext4/xxx/mb_stream_req,文件大小大于了该值
//为大文件,否则为小文件
size = max(size, isize);
if (size > sbi->s_mb_stream_request) {
ac->ac_flags |= EXT4_MB_STREAM_ALLOC;
return;
}
BUG_ON(ac->ac_lg != NULL);
/*
* locality group prealloc space are per cpu. The reason for having
* per cpu locality group is to reduce the contention between block
* request from multiple CPUs.
*/
ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups);
/* we're going to use group allocation */
//如果进行到这里,说明是小文件
ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC;
/* serialize all allocations in the group */
mutex_lock(&ac->ac_lg->lg_mutex);
}
ext4_prealloc_space结构体
struct ext4_prealloc_space {
//如果是per-inode preallocation挂在ext4_inode_info的i_prealloc_list
//如果是per_cpu locality group预分配空间挂在ext4_locality_group的lg_prealloc_list链表上
struct list_head pa_inode_list;
//预分配空间同时也会挂在ext4_group_info的bb_prealloc_list链表上,
//用于初始化buddy bitmap的之 前给block bitmap置上对应的已使用标记
struct list_head pa_group_list;
union {
struct list_head pa_tmp_list;
struct rcu_head pa_rcu;
} u;
spinlock_t pa_lock;
atomic_t pa_count;
//预分配空间是否已删除
unsigned pa_deleted;
//起始物理块号
ext4_fsblk_t pa_pstart; /* phys. block */
//起始逻辑块号(相对于文件)
ext4_lblk_t pa_lstart; /* log. block */
//预分配空间长度(单位是block)
ext4_grpblk_t pa_len; /* len of preallocated chunk */
//空间的可用长度
ext4_grpblk_t pa_free; /* how many blocks are free */
//类型,indode or group
unsigned short pa_type; /* pa type. inode or group */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
};
ext4_locality_group结构体
/*
* Locality group:
* we try to group all related changes together
* so that writeback can flush/allocate them together as well
* Size of lg_prealloc_list hash is determined by MB_DEFAULT_GROUP_PREALLOC
* (512). We store prealloc space into the hash based on the pa_free blocks
* order value.ie, fls(pa_free)-1;
*/
#define PREALLOC_TB_SIZE 10
struct ext4_locality_group {
/* for allocator */
/* to serialize allocates */
struct mutex lg_mutex;
/* list of preallocations */
// 挂ext4_prealloc_space的链表,按照预分配空间的可用长度进行分组
struct list_head lg_prealloc_list[PREALLOC_TB_SIZE];
spinlock_t lg_prealloc_lock;
};
ext4_mb_use_preallocated函数
函数判定能否使用preallocation space分配block,优先使用per-inode preallocation预分配空间;如果失败,再判断是否是小文件能使用per-cpu local group preallocation预分配空间;如果任何一个预分配空间分配成功,return true;否者return false代表无法使用预分配空间。
/*
* search goal blocks in preallocated space
*/
static noinline_for_stack bool
ext4_mb_use_preallocated(struct ext4_allocation_context *ac)
{
struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
int order, i;
struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
struct ext4_locality_group *lg;
struct ext4_prealloc_space *pa, *cpa = NULL;
ext4_fsblk_t goal_block;
/* only data can be preallocated */
//linux一切皆文件,只要普通文件才使用预分配,EXT4_MB_HINT_DATA是ext4_ext_map_blocks
//中根据如下条件设置 if (S_ISREG(inode->i_mode)) ar.flags = EXT4_MB_HINT_DATA;
if (!(ac->ac_flags & EXT4_MB_HINT_DATA))
return false;
/* first, try per-file preallocation */
rcu_read_lock();
list_for_each_entry_rcu(pa, &ei->i_prealloc_list, pa_inode_list) {
/* all fields in this condition don't change,
* so we can skip locking for them */
//不在这个预分配空间范围内,跳到下一个预分配空间
if (ac->ac_o_ex.fe_logical < pa->pa_lstart ||
ac->ac_o_ex.fe_logical >= (pa->pa_lstart +
EXT4_C2B(sbi, pa->pa_len)))
continue;
/* non-extent files can't have physical blocks past 2^32 */
if (!(ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) &&
(pa->pa_pstart + EXT4_C2B(sbi, pa->pa_len) >
EXT4_MAX_BLOCK_FILE_PHYS))
continue;
/* found preallocated blocks, use them */
//找到了合适的预分配空间
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 && pa->pa_free) {
atomic_inc(&pa->pa_count);
ext4_mb_use_inode_pa(ac, pa);
spin_unlock(&pa->pa_lock);
ac->ac_criteria = 10;
rcu_read_unlock();
return true;
}
spin_unlock(&pa->pa_lock);
}
rcu_read_unlock();
//走到这里说明per-inode没有分配成功,需要判定能否是小文件走per-cpu local group分配
/* can we use group allocation? */
if (!(ac->ac_flags & EXT4_MB_HINT_GROUP_ALLOC))
return false;
/* inode may have no locality group for some reason */
lg = ac->ac_lg;
if (lg == NULL)
return false;
order = fls(ac->ac_o_ex.fe_len) - 1;
if (order > PREALLOC_TB_SIZE - 1)
/* The max size of hash table is PREALLOC_TB_SIZE */
order = PREALLOC_TB_SIZE - 1;
goal_block = ext4_grp_offs_to_block(ac->ac_sb, &ac->ac_g_ex);
/*
* search for the prealloc space that is having
* minimal distance from the goal block.
*/
for (i = order; i < PREALLOC_TB_SIZE; i++) {
rcu_read_lock();
list_for_each_entry_rcu(pa, &lg->lg_prealloc_list[i],
pa_inode_list) {
spin_lock(&pa->pa_lock);
if (pa->pa_deleted == 0 &&
pa->pa_free >= ac->ac_o_ex.fe_len) {
cpa = ext4_mb_check_group_pa(goal_block,
pa, cpa);
}
spin_unlock(&pa->pa_lock);
}
rcu_read_unlock();
}
if (cpa) {
//小文件预分配空间分配成功
ext4_mb_use_group_pa(ac, cpa);
ac->ac_criteria = 20;
return true;
}
return false;
}
ext4_mb_normalize_request
预分配空间分配失败就会进入ext4_mb_normalize_request,如代码注释所谓的normalize是考虑申请更合适的大小,一般会大于等于request size.
ext4_mb_regular_allocator
这个函数是mballoc buddy分配算法的核心函数,涉及的内容非常多,后面专门放到一篇文章分析
参考文章: