Linux 软raid - - Barrier

什么是Barriers

在linux软raid中，用来处理正常IO和同步IO的并发问题，可以简单理解为专用于软raid的锁。

软raid在做resync/recovery，或者进行重新配置操作时需要raise屏障，与此同时必须暂停正常IO。

barrier是一个可以被多次raise的计数器，用来统计当前有多少个会阻止（排斥）正常IO的活动正在进行。

raise 屏障的条件是没有pending的IO即nr_pending=0。

只有在没有人等待barrier down的情况下，才会选择raise barrier。这意味着，一旦IO请求准备就绪，在IO请求有机会之前，不会启动其他需要屏障的操作。

常规IO调用wait_barrier。当它返回时，没有后台IO（resync/recovery）在进行；常规IO必须保证在自身IO完成后调用allow_barrier。

后台IO必须调用raise_barrier。一旦它返回，就没有正常IO在进行；后台IO必须保证在这次后台IO完成时调用lower_barrier。

c 复制代码

/* Barriers....
 * Sometimes we need to suspend IO while we do something else,
 * either some resync/recovery, or reconfigure the array.
 * To do this we raise a 'barrier'.
 * The 'barrier' is a counter that can be raised multiple times
 * to count how many activities are happening which preclude
 * normal IO.
 * We can only raise the barrier if there is no pending IO.
 * i.e. if nr_pending == 0.
 * We choose only to raise the barrier if no-one is waiting for the
 * barrier to go down.  This means that as soon as an IO request
 * is ready, no other operations which require a barrier will start
 * until the IO request has had a chance.
 *
 * So: regular IO calls 'wait_barrier'.  When that returns there
 *    is no background IO happening.  It must arrange to call
 *    allow_barrier when it has finished its IO.
 * background IO calls must call raise_barrier.  Once that returns
 *    there is no normal IO happening.  It must arrange to call
 *    lower_barrier when the particular background IO completes.
 */

相关数据结构

用来描述软raid配置相关的所有信息。

c 复制代码

/*
 * Per-array RAID1 configuration and runtime state.
 *
 * Besides the mirror table and the work lists, this structure holds the
 * I/O barrier state used by raise_barrier()/lower_barrier() and
 * _wait_barrier()/wait_read_barrier() to keep regular I/O and
 * resync/recovery I/O from running on the same barrier bucket at once.
 */
struct r1conf {
	struct mddev		*mddev;
	struct raid1_info	*mirrors;	/* twice 'raid_disks' to
						 * allow for replacements.
						 */
	int			raid_disks;

	spinlock_t		device_lock;

	/* list of 'struct r1bio' that need to be processed by raid1d,
	 * whether to retry a read, writeout a resync or recovery
	 * block, or anything else.
	 */
	struct list_head	retry_list;
	/* A separate list of r1bio which just need raid_end_bio_io called.
	 * This mustn't happen for writes which had any errors if the superblock
	 * needs to be written.
	 */
	struct list_head	bio_end_io_list;

	/* queue pending writes to be submitted on unplug */
	struct bio_list		pending_bio_list;
	int			pending_count;

	/* for use when syncing mirrors:
	 * We don't allow both normal IO and resync/recovery IO at
	 * the same time - resync/recovery can only happen when there
	 * is no other IO.  So when either is active, the other has to wait.
	 * See the detailed description in raid1.c near raise_barrier().
	 *
	 * nr_pending, nr_waiting, nr_queued and barrier are arrays of
	 * counters indexed by the barrier bucket index that
	 * sector_to_idx() derives from a sector number.
	 */
	/* Wait queue shared by everyone sleeping on the barrier state. */
	wait_queue_head_t	wait_barrier;
	/* Held around the sleeping paths that wait on the barrier state. */
	spinlock_t		resync_lock;
	/* Incremented when raise_barrier() succeeds, decremented by
	 * lower_barrier().
	 */
	atomic_t		nr_sync_pending;
	/* Per bucket: regular I/O currently admitted past the barrier. */
	atomic_t		*nr_pending;
	/* Per bucket: regular I/O sleeping until the barrier drops or the
	 * array is unfrozen.
	 */
	atomic_t		*nr_waiting;
	/* NOTE(review): not touched by the barrier functions shown here;
	 * presumably counts regular I/O queued for raid1d - confirm.
	 */
	atomic_t		*nr_queued;
	/* Per bucket: number of barriers currently raised; raise_barrier()
	 * waits while it is >= RESYNC_DEPTH.
	 */
	atomic_t		*barrier;
	/* Non-zero while the array is frozen; both regular I/O and
	 * raise_barrier() wait until it is cleared.
	 */
	int			array_frozen;

	/* Set to 1 if a full sync is needed, (fresh device added).
	 * Cleared when a sync completes.
	 */
	int			fullsync;

	/* When the same as mddev->recovery_disabled we don't allow
	 * recovery to be attempted as we expect a read error.
	 */
	int			recovery_disabled;

	/* poolinfo contains information about the content of the
	 * mempools - it changes when the array grows or shrinks
	 */
	struct pool_info	*poolinfo;
	mempool_t		r1bio_pool;
	mempool_t		r1buf_pool;

	struct bio_set		bio_split;

	/* temporary buffer to synchronous IO when attempting to repair
	 * a read error.
	 */
	struct page		*tmppage;

	/* When taking over an array from a different personality, we store
	 * the new thread here until we fully activate the array.
	 */
	struct md_thread	*thread;

	/* Keep track of cluster resync window to send to other
	 * nodes.
	 */
	sector_t		cluster_sync_low;
	sector_t		cluster_sync_high;

};

在当前的例子中，我们需要关注3个成员。

nr_pending
正在处理的正常IO
nr_waiting
等待同步完成的正常IO
barrier
正在处理的同步IO

相关内核函数

raise_barrier

raise_barrier只在同步IO的场景下由raid1_sync_request调用，这就意味着，只有等对应bucket上的正常IO全部完成之后，屏障才能真正加起来。

c 复制代码

/*
 * raise_barrier() - raise the resync barrier on the bucket covering @sector_nr.
 * @conf:      RAID1 configuration holding the per-bucket barrier counters.
 * @sector_nr: sector whose barrier bucket (sector_to_idx()) is to be blocked.
 *
 * Sleeps until no regular I/O is waiting on the bucket, bumps the bucket's
 * barrier count so that new regular I/O is held off, then sleeps again until
 * the array is not frozen, the bucket has no pending regular I/O and the
 * barrier depth is below RESYNC_DEPTH.
 *
 * Return: 0 once the barrier is raised (nr_sync_pending has been
 * incremented and the caller must later call lower_barrier()), or -EINTR
 * if MD_RECOVERY_INTR is set, in which case the barrier count has been
 * dropped again and waiters have been woken.
 *
 * The return type is int because the function returns either 0 or a
 * negative errno; sector_t is a sector-number type (unsigned in the
 * kernel) and is not meant to carry error codes. Callers that only test
 * the result for non-zero are unaffected.
 */
static int raise_barrier(struct r1conf *conf, sector_t sector_nr)
{
        /* Index of the barrier bucket this sector falls into. */
        int idx = sector_to_idx(sector_nr);

        spin_lock_irq(&conf->resync_lock);

        /* Wait until no block IO is waiting */
        wait_event_lock_irq(conf->wait_barrier,
                            !atomic_read(&conf->nr_waiting[idx]),
                            conf->resync_lock);

        /* block any new IO from starting */
        atomic_inc(&conf->barrier[idx]);
        /*
         * In raise_barrier() we firstly increase conf->barrier[idx] then
         * check conf->nr_pending[idx]. In _wait_barrier() we firstly
         * increase conf->nr_pending[idx] then check conf->barrier[idx].
         * A memory barrier here to make sure conf->nr_pending[idx] won't
         * be fetched before conf->barrier[idx] is increased. Otherwise
         * there will be a race between raise_barrier() and _wait_barrier().
         */
        smp_mb__after_atomic();

        /* For these conditions we must wait:
         * A: while the array is in frozen state
         * B: while conf->nr_pending[idx] is not 0, meaning regular I/O
         *    existing in corresponding I/O barrier bucket.
         * C: while conf->barrier[idx] >= RESYNC_DEPTH, meaning reaches
         *    max resync count which allowed on current I/O barrier bucket.
         * The wait also ends as soon as MD_RECOVERY_INTR is set.
         */
        wait_event_lock_irq(conf->wait_barrier,
                            (!conf->array_frozen &&
                             !atomic_read(&conf->nr_pending[idx]) &&
                             atomic_read(&conf->barrier[idx]) < RESYNC_DEPTH) ||
                                test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery),
                            conf->resync_lock);

        if (test_bit(MD_RECOVERY_INTR, &conf->mddev->recovery)) {
                /* Interrupted: undo our barrier and let waiters re-check. */
                atomic_dec(&conf->barrier[idx]);
                spin_unlock_irq(&conf->resync_lock);
                wake_up(&conf->wait_barrier);
                return -EINTR;
        }

        atomic_inc(&conf->nr_sync_pending);
        spin_unlock_irq(&conf->resync_lock);

        return 0;
}

wait_barrier

wait_barrier只在下发写请求（raid1_write_request）时被调用。如果此时对应的barrier bucket上已有barrier被raise，nr_waiting会加1，表示该写请求正在等待同一bucket上的同步IO完成。

c 复制代码

/*
 * _wait_barrier() - admit one regular I/O into barrier bucket @idx.
 * @conf: RAID1 configuration holding the per-bucket counters.
 * @idx:  barrier bucket index.
 *
 * Fast path: count the I/O in nr_pending[idx] and return at once when the
 * array is not frozen and no barrier is raised on the bucket. Otherwise the
 * I/O is moved from nr_pending[idx] to nr_waiting[idx] under resync_lock,
 * sleeps until the array is unfrozen and the bucket's barrier is 0, and is
 * then moved back to nr_pending[idx]. On return nr_pending[idx] has been
 * incremented on behalf of the caller.
 */
static void _wait_barrier(struct r1conf *conf, int idx)
{
        /*
         * We need to increase conf->nr_pending[idx] very early here,
         * then raise_barrier() can be blocked when it waits for
         * conf->nr_pending[idx] to be 0. Then we can avoid holding
         * conf->resync_lock when there is no barrier raised in same
         * barrier unit bucket. Also if the array is frozen, I/O
         * should be blocked until array is unfrozen.
         */
        atomic_inc(&conf->nr_pending[idx]);
        /*
         * In _wait_barrier() we firstly increase conf->nr_pending[idx], then
         * check conf->barrier[idx]. In raise_barrier() we firstly increase
         * conf->barrier[idx], then check conf->nr_pending[idx]. A memory
         * barrier is necessary here to make sure conf->barrier[idx] won't be
         * fetched before conf->nr_pending[idx] is increased. Otherwise there
         * will be a race between _wait_barrier() and raise_barrier().
         */
        smp_mb__after_atomic();
        
        /*
         * Don't worry about checking two atomic_t variables at same time
         * here. If during we check conf->barrier[idx], the array is
         * frozen (conf->array_frozen is 1), and conf->barrier[idx] is
         * 0, it is safe to return and make the I/O continue. Because the
         * array is frozen, all I/O returned here will eventually complete
         * or be queued, no race will happen. See code comment in
         * freeze_array().
         */
        if (!READ_ONCE(conf->array_frozen) &&
            !atomic_read(&conf->barrier[idx]))
                return;

        /*
         * After holding conf->resync_lock, conf->nr_pending[idx]
         * should be decreased before waiting for barrier to drop.
         * Otherwise, we may encounter a race condition because
         * raise_barrier() might be waiting for conf->nr_pending[idx]
         * to be 0 at same time.
         */
        spin_lock_irq(&conf->resync_lock);
        atomic_inc(&conf->nr_waiting[idx]);
        atomic_dec(&conf->nr_pending[idx]);
        /*
         * In case freeze_array() is waiting for
         * get_unqueued_pending() == extra
         */
        wake_up(&conf->wait_barrier);
        /* Wait for the barrier in same barrier unit bucket to drop. */
        wait_event_lock_irq(conf->wait_barrier,
                            !conf->array_frozen &&
                             !atomic_read(&conf->barrier[idx]),
                            conf->resync_lock);
        /* Barrier dropped: count this I/O as pending again, not waiting. */
        atomic_inc(&conf->nr_pending[idx]);
        atomic_dec(&conf->nr_waiting[idx]);
        spin_unlock_irq(&conf->resync_lock);
}

/*
 * wait_barrier() - sector-based wrapper around _wait_barrier().
 * @conf:      RAID1 configuration.
 * @sector_nr: sector of the regular I/O about to be issued.
 *
 * Maps @sector_nr to its barrier bucket with sector_to_idx() and waits on
 * that bucket.
 */
static void wait_barrier(struct r1conf *conf, sector_t sector_nr)
{
        _wait_barrier(conf, sector_to_idx(sector_nr));
}

wait_read_barrier

wait_read_barrier只在下发读请求时由raid1_read_request调用：它先把对应bucket的nr_pending加1，把这次读IO计为pending；如果raid处于非frozen状态，则直接返回，否则等待阵列解除frozen状态。

c 复制代码

/*
 * wait_read_barrier() - admit one read I/O for the bucket covering @sector_nr.
 * @conf:      RAID1 configuration holding the per-bucket counters.
 * @sector_nr: sector of the read; mapped to a bucket by sector_to_idx().
 *
 * Counts the read in nr_pending[idx] and returns immediately unless the
 * array is frozen. Unlike _wait_barrier(), conf->barrier[idx] is not
 * checked here. When the array is frozen the read is moved to
 * nr_waiting[idx] under resync_lock, sleeps until array_frozen is cleared,
 * and is then moved back to nr_pending[idx].
 */
static void wait_read_barrier(struct r1conf *conf, sector_t sector_nr)
{
        int idx = sector_to_idx(sector_nr);

        /*
         * Very similar to _wait_barrier(). The difference is, for read
         * I/O we don't need wait for sync I/O, but if the whole array
         * is frozen, the read I/O still has to wait until the array is
         * unfrozen. Since there is no ordering requirement with
         * conf->barrier[idx] here, memory barrier is unnecessary as well.
         */
        atomic_inc(&conf->nr_pending[idx]);

        if (!READ_ONCE(conf->array_frozen))
                return;

        spin_lock_irq(&conf->resync_lock);
        atomic_inc(&conf->nr_waiting[idx]);
        atomic_dec(&conf->nr_pending[idx]);
        /*
         * In case freeze_array() is waiting for
         * get_unqueued_pending() == extra
         */
        wake_up(&conf->wait_barrier);
        /* Wait for array to be unfrozen */
        wait_event_lock_irq(conf->wait_barrier,
                            !conf->array_frozen,
                            conf->resync_lock);
        /* Unfrozen: count this read as pending again, not waiting. */
        atomic_inc(&conf->nr_pending[idx]);
        atomic_dec(&conf->nr_waiting[idx]);
        spin_unlock_irq(&conf->resync_lock);
}

lower_barrier

c 复制代码

/*
 * lower_barrier() - drop one barrier raised on the bucket covering @sector_nr.
 * @conf:      RAID1 configuration holding the per-bucket counters.
 * @sector_nr: sector whose barrier bucket is released.
 *
 * Decrements the bucket's barrier count and nr_sync_pending, then wakes
 * everyone sleeping on conf->wait_barrier so they can re-check their
 * conditions. Triggers BUG_ON() if the bucket's barrier count is not
 * positive on entry.
 */
static void lower_barrier(struct r1conf *conf, sector_t sector_nr)
{
        atomic_t *bucket_barrier = &conf->barrier[sector_to_idx(sector_nr)];

        BUG_ON(atomic_read(bucket_barrier) <= 0);

        atomic_dec(bucket_barrier);
        atomic_dec(&conf->nr_sync_pending);
        wake_up(&conf->wait_barrier);
}