From 5e3b8a8d59b2064f68ab349182f5dc5f5bb4fd30 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:54:40 +0900 Subject: [PATCH 01/14] md: Fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the if statement around the calls to sysfs_link_rdev() to avoid the compilation warnings: warning: suggest braces around empty body in an ‘if’ statement when compiling with W=1. For the call to sysfs_create_link() generating the same warning, use the err variable to store the function result, avoiding triggering another warning as the function is declared as 'warn_unused_result'. Signed-off-by: Damien Le Moal Signed-off-by: Song Liu --- drivers/md/md.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index fc33f2f8a415..9d740e4181ff 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2469,8 +2469,8 @@ static int bind_rdev_to_array(struct md_rdev *rdev, struct mddev *mddev) goto fail; ko = &part_to_dev(rdev->bdev->bd_part)->kobj; - if (sysfs_create_link(&rdev->kobj, ko, "block")) - /* failure here is OK */; + /* failure here is OK */ + err = sysfs_create_link(&rdev->kobj, ko, "block"); rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state"); rdev->sysfs_unack_badblocks = sysfs_get_dirent_safe(rdev->kobj.sd, "unacknowledged_bad_blocks"); @@ -3238,8 +3238,8 @@ slot_store(struct md_rdev *rdev, const char *buf, size_t len) return err; } else sysfs_notify_dirent_safe(rdev->sysfs_state); - if (sysfs_link_rdev(rdev->mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */; + sysfs_link_rdev(rdev->mddev, rdev); /* don't wakeup anyone, leave that to userspace. 
*/ } else { if (slot >= rdev->mddev->raid_disks && @@ -9113,8 +9113,8 @@ static int remove_and_add_spares(struct mddev *mddev, rdev->recovery_offset = 0; } if (mddev->pers->hot_add_disk(mddev, rdev) == 0) { - if (sysfs_link_rdev(mddev, rdev)) - /* failure here is OK */; + /* failure here is OK */ + sysfs_link_rdev(mddev, rdev); if (!test_bit(Journal, &rdev->flags)) spares++; md_new_event(mddev); From 52923083b54e7c6eeec8d78dbb9d209f4b2e9b9e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:54:41 +0900 Subject: [PATCH 02/14] md: raid5-cache: Remove set but unused variable Remove the variable offset in r5c_tree_index() to avoid a "set but not used" compilation warning when compiling with W=1. Signed-off-by: Damien Le Moal Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 0bea21d81697..34fd942dad83 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -195,9 +195,7 @@ struct r5l_log { static inline sector_t r5c_tree_index(struct r5conf *conf, sector_t sect) { - sector_t offset; - - offset = sector_div(sect, conf->chunk_sectors); + sector_div(sect, conf->chunk_sectors); return sect; } From 2aada5b14bd0db76fb28885cbb22ce893b0654c8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:54:42 +0900 Subject: [PATCH 03/14] md: raid5: Fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the if statement around the calls to sysfs_link_rdev() to avoid the compilation warning "suggest braces around empty body in an ‘if’ statement" when compiling with W=1. Also fix function description comments to avoid kdoc format warnings. 
Signed-off-by: Damien Le Moal Signed-off-by: Song Liu --- drivers/md/raid5.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index a4fbafa5b8f8..2dad541a60da 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2217,9 +2217,9 @@ static int grow_stripes(struct r5conf *conf, int num) /** * scribble_alloc - allocate percpu scribble buffer for required size * of the scribble region - * @percpu - from for_each_present_cpu() of the caller - * @num - total number of disks in the array - * @cnt - scribble objs count for required size of the scribble region + * @percpu: from for_each_present_cpu() of the caller + * @num: total number of disks in the array + * @cnt: scribble objs count for required size of the scribble region * * The scribble buffer size must be enough to contain: * 1/ a struct page pointer for each device in the array +2 @@ -3710,7 +3710,7 @@ static int fetch_block(struct stripe_head *sh, struct stripe_head_state *s, return 0; } -/** +/* * handle_stripe_fill - read or compute data to satisfy pending requests. */ static void handle_stripe_fill(struct stripe_head *sh, @@ -7944,8 +7944,8 @@ static int raid5_start_reshape(struct mddev *mddev) else rdev->recovery_offset = 0; - if (sysfs_link_rdev(mddev, rdev)) - /* Failure here is OK */; + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); } } else if (rdev->raid_disk >= conf->previous_raid_disks && !test_bit(Faulty, &rdev->flags)) { From 38ffc01f38cc844ad2b81cdc9b0a087c4fc4c2b8 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Thu, 16 Jul 2020 13:54:43 +0900 Subject: [PATCH 04/14] md: raid10: Fix compilation warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove the if statement around the call to sysfs_link_rdev() in raid10_start_reshape() to avoid the compilation warning: warning: suggest braces around empty body in an ‘if’ statement when compiling with W=1. 
Signed-off-by: Damien Le Moal Signed-off-by: Song Liu --- drivers/md/raid10.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 2c47474fa69d..14b1ba732cd7 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -4307,8 +4307,8 @@ out: else rdev->recovery_offset = 0; - if (sysfs_link_rdev(mddev, rdev)) - /* Failure here is OK */; + /* Failure here is OK */ + sysfs_link_rdev(mddev, rdev); } } else if (rdev->raid_disk >= conf->prev.raid_disks && !test_bit(Faulty, &rdev->flags)) { From a377a472b9bca8f904f1e1bdf1b472bada35ac37 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 16 Jun 2020 11:25:50 +0200 Subject: [PATCH 05/14] raid5: call clear_batch_ready before set STRIPE_ACTIVE We tried to only put the head sh of batch list to handle_list, then the handle_stripe doesn't handle other members in the batch list. However, we still got the calltrace in break_stripe_batch_list. [593764.644269] stripe state: 2003 kernel: [593764.644299] ------------[ cut here ]------------ kernel: [593764.644308] WARNING: CPU: 12 PID: 856 at drivers/md/raid5.c:4625 break_stripe_batch_list+0x203/0x240 [raid456] [...] kernel: [593764.644363] Call Trace: kernel: [593764.644370] handle_stripe+0x907/0x20c0 [raid456] kernel: [593764.644376] ? __wake_up_common_lock+0x89/0xc0 kernel: [593764.644379] handle_active_stripes.isra.57+0x35f/0x570 [raid456] kernel: [593764.644382] ? raid5_wakeup_stripe_thread+0x96/0x1f0 [raid456] kernel: [593764.644385] raid5d+0x480/0x6a0 [raid456] kernel: [593764.644390] ? md_thread+0x11f/0x160 kernel: [593764.644392] md_thread+0x11f/0x160 kernel: [593764.644394] ? wait_woken+0x80/0x80 kernel: [593764.644396] kthread+0xfc/0x130 kernel: [593764.644398] ? find_pers+0x70/0x70 kernel: [593764.644399] ? 
kthread_create_on_node+0x70/0x70 kernel: [593764.644401] ret_from_fork+0x1f/0x30 As we can see, the stripe was set with STRIPE_ACTIVE and STRIPE_HANDLE, and only handle_stripe could set those flags then return. And since the stripe was already in the batch list, we need to return earlier, before setting the two flags. And after digging a little into the git history, especially commit 3664847d95e6 ("md/raid5: fix a race condition in stripe batch"), it seems the batched stripe still could be handled by handle_stripe, so handle_stripe needs to return earlier if clear_batch_ready returns true. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 2dad541a60da..29dfd91f5095 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4682,6 +4682,16 @@ static void handle_stripe(struct stripe_head *sh) struct r5dev *pdev, *qdev; clear_bit(STRIPE_HANDLE, &sh->state); + + /* + * handle_stripe should not continue handle the batched stripe, only + * the head of batch list or lone stripe can continue. Otherwise we + * could see break_stripe_batch_list warns about the STRIPE_ACTIVE + * is set for the batched stripe. 
+ */ + if (clear_batch_ready(sh)) + return; + if (test_and_set_bit_lock(STRIPE_ACTIVE, &sh->state)) { /* already being handled, ensure it gets handled * again when current action finishes */ @@ -4689,11 +4699,6 @@ static void handle_stripe(struct stripe_head *sh) return; } - if (clear_batch_ready(sh) ) { - clear_bit_unlock(STRIPE_ACTIVE, &sh->state); - return; - } - if (test_and_clear_bit(STRIPE_BATCH_ERR, &sh->state)) break_stripe_batch_list(sh, 0); From cb9902db3827135bfd6ae70bcb65b653418bb775 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 16 Jun 2020 11:25:51 +0200 Subject: [PATCH 06/14] raid5: put the comment of clear_batch_ready to the right place To make people understand the function well, let's put the comment to the right place. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 29dfd91f5095..3519560d35b0 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4573,12 +4573,12 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) rcu_read_unlock(); } +/* + * Return '1' if this is a member of batch, or '0' if it is a lone stripe or + * a head which can now be handled. + */ static int clear_batch_ready(struct stripe_head *sh) { - /* Return '1' if this is a member of batch, or - * '0' if it is a lone stripe or a head which can now be - * handled. - */ struct stripe_head *tmp; if (!test_and_clear_bit(STRIPE_BATCH_READY, &sh->state)) return (sh->batch_head && sh->batch_head != sh); From 1684e97538b9484c72bcaff2961569c3b0021473 Mon Sep 17 00:00:00 2001 From: Guoqing Jiang Date: Tue, 16 Jun 2020 11:25:52 +0200 Subject: [PATCH 07/14] raid5: remove the meaningless check in raid5_make_request We can't guarantee the batched stripe to be set with STRIPE_HANDLE since there are lots of functions that could set the flag, such as sync_request, ops_complete_* and end_{read,write}_request etc. 
Also, clear_batch_ready called in handle_stripe ensures the batched list can't continue to be handled by handle_stripe. Signed-off-by: Guoqing Jiang Signed-off-by: Song Liu --- drivers/md/raid5.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 3519560d35b0..73128f46924c 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5738,8 +5738,7 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) do_flush = false; } - if (!sh->batch_head || sh == sh->batch_head) - set_bit(STRIPE_HANDLE, &sh->state); + set_bit(STRIPE_HANDLE, &sh->state); clear_bit(STRIPE_DELAYED, &sh->state); if ((!sh->batch_head || sh == sh->batch_head) && (bi->bi_opf & REQ_SYNC) && From c911c46c017c745e0f5ece9626d0fbfaff5a1f97 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Sat, 18 Jul 2020 05:29:07 -0400 Subject: [PATCH 08/14] md/raid456: convert macro STRIPE_* to RAID5_STRIPE_* Convert the macros STRIPE_SIZE, STRIPE_SECTORS and STRIPE_SHIFT to RAID5_STRIPE_SIZE(), RAID5_STRIPE_SECTORS() and RAID5_STRIPE_SHIFT(). This patch prepares for the following adjustable stripe_size. It will not change any existing functionality. 
Signed-off-by: Yufen Yu Signed-off-by: Song Liu --- drivers/md/raid5-cache.c | 8 +- drivers/md/raid5-ppl.c | 11 +- drivers/md/raid5.c | 225 +++++++++++++++++++++------------------ drivers/md/raid5.h | 37 ++++--- 4 files changed, 153 insertions(+), 128 deletions(-) diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 34fd942dad83..82eb4a906e31 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -296,8 +296,8 @@ r5c_return_dev_pending_writes(struct r5conf *conf, struct r5dev *dev) wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); md_write_end(conf->mddev); bio_endio(wbi); wbi = wbi2; @@ -314,7 +314,7 @@ void r5c_handle_cached_data_endio(struct r5conf *conf, set_bit(R5_UPTODATE, &sh->dev[i].flags); r5c_return_dev_pending_writes(conf, &sh->dev[i]); md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), !test_bit(STRIPE_DEGRADED, &sh->state), 0); } @@ -362,7 +362,7 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) */ if (atomic_read(&conf->r5c_cached_full_stripes) >= min(R5C_FULL_STRIPE_FLUSH_BATCH(conf), - conf->chunk_sectors >> STRIPE_SHIFT)) + conf->chunk_sectors >> RAID5_STRIPE_SHIFT(conf))) r5l_wake_reclaim(conf->log, 0); } diff --git a/drivers/md/raid5-ppl.c b/drivers/md/raid5-ppl.c index a750f4bbb5d9..d0f540296fe9 100644 --- a/drivers/md/raid5-ppl.c +++ b/drivers/md/raid5-ppl.c @@ -324,7 +324,7 @@ static int ppl_log_stripe(struct ppl_log *log, struct stripe_head *sh) * be just after the last logged stripe and write to the same * disks. Use bit shift and logarithm to avoid 64-bit division. 
*/ - if ((sh->sector == sh_last->sector + STRIPE_SECTORS) && + if ((sh->sector == sh_last->sector + RAID5_STRIPE_SECTORS(conf)) && (data_sector >> ilog2(conf->chunk_sectors) == data_sector_last >> ilog2(conf->chunk_sectors)) && ((data_sector - data_sector_last) * data_disks == @@ -844,9 +844,9 @@ static int ppl_recover_entry(struct ppl_log *log, struct ppl_header_entry *e, /* if start and end is 4k aligned, use a 4k block */ if (block_size == 512 && - (r_sector_first & (STRIPE_SECTORS - 1)) == 0 && - (r_sector_last & (STRIPE_SECTORS - 1)) == 0) - block_size = STRIPE_SIZE; + (r_sector_first & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0 && + (r_sector_last & (RAID5_STRIPE_SECTORS(conf) - 1)) == 0) + block_size = RAID5_STRIPE_SIZE(conf); /* iterate through blocks in strip */ for (i = 0; i < strip_sectors; i += (block_size >> 9)) { @@ -1274,7 +1274,8 @@ static int ppl_validate_rdev(struct md_rdev *rdev) ppl_data_sectors = rdev->ppl.size - (PPL_HEADER_SIZE >> 9); if (ppl_data_sectors > 0) - ppl_data_sectors = rounddown(ppl_data_sectors, STRIPE_SECTORS); + ppl_data_sectors = rounddown(ppl_data_sectors, + RAID5_STRIPE_SECTORS((struct r5conf *)rdev->mddev->private)); if (ppl_data_sectors <= 0) { pr_warn("md/raid:%s: PPL space too small on %s\n", diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 73128f46924c..18f20f3d9664 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -69,13 +69,13 @@ static struct workqueue_struct *raid5_wq; static inline struct hlist_head *stripe_hash(struct r5conf *conf, sector_t sect) { - int hash = (sect >> STRIPE_SHIFT) & HASH_MASK; + int hash = (sect >> RAID5_STRIPE_SHIFT(conf)) & HASH_MASK; return &conf->stripe_hashtbl[hash]; } -static inline int stripe_hash_locks_hash(sector_t sect) +static inline int stripe_hash_locks_hash(struct r5conf *conf, sector_t sect) { - return (sect >> STRIPE_SHIFT) & STRIPE_HASH_LOCKS_MASK; + return (sect >> RAID5_STRIPE_SHIFT(conf)) & STRIPE_HASH_LOCKS_MASK; } static inline void 
lock_device_hash_lock(struct r5conf *conf, int hash) @@ -627,7 +627,7 @@ raid5_get_active_stripe(struct r5conf *conf, sector_t sector, int previous, int noblock, int noquiesce) { struct stripe_head *sh; - int hash = stripe_hash_locks_hash(sector); + int hash = stripe_hash_locks_hash(conf, sector); int inc_empty_inactive_list_flag; pr_debug("get_stripe, sector %llu\n", (unsigned long long)sector); @@ -748,9 +748,9 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh tmp_sec = sh->sector; if (!sector_div(tmp_sec, conf->chunk_sectors)) return; - head_sector = sh->sector - STRIPE_SECTORS; + head_sector = sh->sector - RAID5_STRIPE_SECTORS(conf); - hash = stripe_hash_locks_hash(head_sector); + hash = stripe_hash_locks_hash(conf, head_sector); spin_lock_irq(conf->hash_locks + hash); head = __find_stripe(conf, head_sector, conf->generation); if (head && !atomic_inc_not_zero(&head->count)) { @@ -1057,7 +1057,7 @@ again: test_bit(WriteErrorSeen, &rdev->flags)) { sector_t first_bad; int bad_sectors; - int bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + int bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors); if (!bad) break; @@ -1089,7 +1089,7 @@ again: if (rdev) { if (s->syncing || s->expanding || s->expanded || s->replacing) - md_sync_acct(rdev->bdev, STRIPE_SECTORS); + md_sync_acct(rdev->bdev, RAID5_STRIPE_SECTORS(conf)); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -1129,9 +1129,9 @@ again: else sh->dev[i].vec.bv_page = sh->dev[i].page; bi->bi_vcnt = 1; - bi->bi_io_vec[0].bv_len = STRIPE_SIZE; + bi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); bi->bi_io_vec[0].bv_offset = 0; - bi->bi_iter.bi_size = STRIPE_SIZE; + bi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); bi->bi_write_hint = sh->dev[i].write_hint; if (!rrdev) sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; @@ -1156,7 +1156,7 @@ again: if (rrdev) { if (s->syncing || s->expanding || s->expanded || s->replacing) - md_sync_acct(rrdev->bdev, 
STRIPE_SECTORS); + md_sync_acct(rrdev->bdev, RAID5_STRIPE_SECTORS(conf)); set_bit(STRIPE_IO_STARTED, &sh->state); @@ -1183,9 +1183,9 @@ again: WARN_ON(test_bit(R5_UPTODATE, &sh->dev[i].flags)); sh->dev[i].rvec.bv_page = sh->dev[i].page; rbi->bi_vcnt = 1; - rbi->bi_io_vec[0].bv_len = STRIPE_SIZE; + rbi->bi_io_vec[0].bv_len = RAID5_STRIPE_SIZE(conf); rbi->bi_io_vec[0].bv_offset = 0; - rbi->bi_iter.bi_size = STRIPE_SIZE; + rbi->bi_iter.bi_size = RAID5_STRIPE_SIZE(conf); rbi->bi_write_hint = sh->dev[i].write_hint; sh->dev[i].write_hint = RWH_WRITE_LIFE_NOT_SET; /* @@ -1235,6 +1235,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, int page_offset; struct async_submit_ctl submit; enum async_tx_flags flags = 0; + struct r5conf *conf = sh->raid_conf; if (bio->bi_iter.bi_sector >= sector) page_offset = (signed)(bio->bi_iter.bi_sector - sector) * 512; @@ -1256,8 +1257,8 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, len -= b_offset; } - if (len > 0 && page_offset + len > STRIPE_SIZE) - clen = STRIPE_SIZE - page_offset; + if (len > 0 && page_offset + len > RAID5_STRIPE_SIZE(conf)) + clen = RAID5_STRIPE_SIZE(conf) - page_offset; else clen = len; @@ -1265,9 +1266,9 @@ async_copy_data(int frombio, struct bio *bio, struct page **page, b_offset += bvl.bv_offset; bio_page = bvl.bv_page; if (frombio) { - if (sh->raid_conf->skip_copy && + if (conf->skip_copy && b_offset == 0 && page_offset == 0 && - clen == STRIPE_SIZE && + clen == RAID5_STRIPE_SIZE(conf) && !no_skipcopy) *page = bio_page; else @@ -1292,6 +1293,7 @@ static void ops_complete_biofill(void *stripe_head_ref) { struct stripe_head *sh = stripe_head_ref; int i; + struct r5conf *conf = sh->raid_conf; pr_debug("%s: stripe %llu\n", __func__, (unsigned long long)sh->sector); @@ -1312,8 +1314,8 @@ static void ops_complete_biofill(void *stripe_head_ref) rbi = dev->read; dev->read = NULL; while (rbi && rbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - rbi2 = r5_next_bio(rbi, 
dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + rbi2 = r5_next_bio(conf, rbi, dev->sector); bio_endio(rbi); rbi = rbi2; } @@ -1330,6 +1332,7 @@ static void ops_run_biofill(struct stripe_head *sh) struct dma_async_tx_descriptor *tx = NULL; struct async_submit_ctl submit; int i; + struct r5conf *conf = sh->raid_conf; BUG_ON(sh->batch_head); pr_debug("%s: stripe %llu\n", __func__, @@ -1344,10 +1347,10 @@ static void ops_run_biofill(struct stripe_head *sh) dev->toread = NULL; spin_unlock_irq(&sh->stripe_lock); while (rbi && rbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { + dev->sector + RAID5_STRIPE_SECTORS(conf)) { tx = async_copy_data(0, rbi, &dev->page, dev->sector, tx, sh, 0); - rbi = r5_next_bio(rbi, dev->sector); + rbi = r5_next_bio(conf, rbi, dev->sector); } } } @@ -1429,9 +1432,11 @@ ops_run_compute5(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1522,7 +1527,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE, NULL, ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } else { /* Compute any data- or p-drive using XOR */ count = 0; @@ -1535,7 +1541,8 @@ ops_run_compute6_1(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, 
ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(dest, blocks, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); } return tx; @@ -1598,7 +1605,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); return async_gen_syndrome(blocks, 0, syndrome_disks+2, - STRIPE_SIZE, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); } else { struct page *dest; int data_target; @@ -1621,7 +1629,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) ASYNC_TX_FENCE|ASYNC_TX_XOR_ZERO_DST, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor(dest, blocks, 0, count, STRIPE_SIZE, + tx = async_xor(dest, blocks, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); count = set_syndrome_sources(blocks, sh, SYNDROME_SRC_ALL); @@ -1629,7 +1638,8 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) ops_complete_compute, sh, to_addr_conv(sh, percpu, 0)); return async_gen_syndrome(blocks, 0, count+2, - STRIPE_SIZE, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + &submit); } } else { init_async_submit(&submit, ASYNC_TX_FENCE, NULL, @@ -1638,13 +1648,15 @@ ops_run_compute6_2(struct stripe_head *sh, struct raid5_percpu *percpu) if (failb == syndrome_disks) { /* We're missing D+P. */ return async_raid6_datap_recov(syndrome_disks+2, - STRIPE_SIZE, faila, - blocks, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, + blocks, &submit); } else { /* We're missing D+D. 
*/ return async_raid6_2data_recov(syndrome_disks+2, - STRIPE_SIZE, faila, failb, - blocks, &submit); + RAID5_STRIPE_SIZE(sh->raid_conf), + faila, failb, + blocks, &submit); } } } @@ -1691,7 +1703,8 @@ ops_run_prexor5(struct stripe_head *sh, struct raid5_percpu *percpu, init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_XOR_DROP_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1711,7 +1724,8 @@ ops_run_prexor6(struct stripe_head *sh, struct raid5_percpu *percpu, init_async_submit(&submit, ASYNC_TX_FENCE|ASYNC_TX_PQ_XOR_DST, tx, ops_complete_prexor, sh, to_addr_conv(sh, percpu, 0)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); return tx; } @@ -1752,7 +1766,7 @@ again: WARN_ON(dev->page != dev->orig_page); while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { + dev->sector + RAID5_STRIPE_SECTORS(conf)) { if (wbi->bi_opf & REQ_FUA) set_bit(R5_WantFUA, &dev->flags); if (wbi->bi_opf & REQ_SYNC) @@ -1770,7 +1784,7 @@ again: clear_bit(R5_OVERWRITE, &dev->flags); } } - wbi = r5_next_bio(wbi, dev->sector); + wbi = r5_next_bio(conf, wbi, dev->sector); } if (head_sh->batch_head) { @@ -1910,9 +1924,11 @@ again: } if (unlikely(count == 1)) - tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, STRIPE_SIZE, &submit); + tx = async_memcpy(xor_dest, xor_srcs[0], 0, 0, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); else - tx = async_xor(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, &submit); + tx = async_xor(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -1972,7 +1988,8 @@ again: } else init_async_submit(&submit, 0, tx, NULL, NULL, to_addr_conv(sh, percpu, 
j)); - tx = async_gen_syndrome(blocks, 0, count+2, STRIPE_SIZE, &submit); + tx = async_gen_syndrome(blocks, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &submit); if (!last_stripe) { j++; sh = list_first_entry(&sh->batch_list, struct stripe_head, @@ -2020,7 +2037,8 @@ static void ops_run_check_p(struct stripe_head *sh, struct raid5_percpu *percpu) init_async_submit(&submit, 0, NULL, NULL, NULL, to_addr_conv(sh, percpu, 0)); - tx = async_xor_val(xor_dest, xor_srcs, 0, count, STRIPE_SIZE, + tx = async_xor_val(xor_dest, xor_srcs, 0, count, + RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, &submit); atomic_inc(&sh->count); @@ -2045,7 +2063,8 @@ static void ops_run_check_pq(struct stripe_head *sh, struct raid5_percpu *percpu atomic_inc(&sh->count); init_async_submit(&submit, ASYNC_TX_ACK, NULL, ops_complete_check, sh, to_addr_conv(sh, percpu, 0)); - async_syndrome_val(srcs, 0, count+2, STRIPE_SIZE, + async_syndrome_val(srcs, 0, count+2, + RAID5_STRIPE_SIZE(sh->raid_conf), &sh->ops.zero_sum_result, percpu->spare_page, &submit); } @@ -2275,7 +2294,7 @@ static int resize_chunks(struct r5conf *conf, int new_disks, int new_sectors) percpu = per_cpu_ptr(conf->percpu, cpu); err = scribble_alloc(percpu, new_disks, - new_sectors / STRIPE_SECTORS); + new_sectors / RAID5_STRIPE_SECTORS(conf)); if (err) break; } @@ -2509,10 +2528,10 @@ static void raid5_end_read_request(struct bio * bi) */ pr_info_ratelimited( "md/raid:%s: read error corrected (%lu sectors at %llu on %s)\n", - mdname(conf->mddev), STRIPE_SECTORS, + mdname(conf->mddev), RAID5_STRIPE_SECTORS(conf), (unsigned long long)s, bdevname(rdev->bdev, b)); - atomic_add(STRIPE_SECTORS, &rdev->corrected_errors); + atomic_add(RAID5_STRIPE_SECTORS(conf), &rdev->corrected_errors); clear_bit(R5_ReadError, &sh->dev[i].flags); clear_bit(R5_ReWrite, &sh->dev[i].flags); } else if (test_bit(R5_ReadNoMerge, &sh->dev[i].flags)) @@ -2585,7 +2604,7 @@ static void raid5_end_read_request(struct bio * bi) if (!(set_bad && 
test_bit(In_sync, &rdev->flags) && rdev_set_badblocks( - rdev, sh->sector, STRIPE_SECTORS, 0))) + rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), 0))) md_error(conf->mddev, rdev); } } @@ -2637,7 +2656,7 @@ static void raid5_end_write_request(struct bio *bi) if (bi->bi_status) md_error(conf->mddev, rdev); else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) set_bit(R5_MadeGoodRepl, &sh->dev[i].flags); } else { @@ -2649,7 +2668,7 @@ static void raid5_end_write_request(struct bio *bi) set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery); } else if (is_badblock(rdev, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) { set_bit(R5_MadeGood, &sh->dev[i].flags); if (test_bit(R5_ReadError, &sh->dev[i].flags)) @@ -3283,13 +3302,13 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, /* check if page is covered */ sector_t sector = sh->dev[dd_idx].sector; for (bi=sh->dev[dd_idx].towrite; - sector < sh->dev[dd_idx].sector + STRIPE_SECTORS && + sector < sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf) && bi && bi->bi_iter.bi_sector <= sector; - bi = r5_next_bio(bi, sh->dev[dd_idx].sector)) { + bi = r5_next_bio(conf, bi, sh->dev[dd_idx].sector)) { if (bio_end_sector(bi) >= sector) sector = bio_end_sector(bi); } - if (sector >= sh->dev[dd_idx].sector + STRIPE_SECTORS) + if (sector >= sh->dev[dd_idx].sector + RAID5_STRIPE_SECTORS(conf)) if (!test_and_set_bit(R5_OVERWRITE, &sh->dev[dd_idx].flags)) sh->overwrite_disks++; } @@ -3314,7 +3333,7 @@ static int add_stripe_bio(struct stripe_head *sh, struct bio *bi, int dd_idx, set_bit(STRIPE_BITMAP_PENDING, &sh->state); spin_unlock_irq(&sh->stripe_lock); md_bitmap_startwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); spin_lock_irq(&sh->stripe_lock); clear_bit(STRIPE_BITMAP_PENDING, &sh->state); if (!sh->batch_head) { @@ -3376,7 +3395,7 @@ handle_failed_stripe(struct 
r5conf *conf, struct stripe_head *sh, if (!rdev_set_badblocks( rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } @@ -3396,8 +3415,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, wake_up(&conf->wait_for_overlap); while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *nextbi = r5_next_bio(bi, sh->dev[i].sector); + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *nextbi = r5_next_bio(conf, bi, sh->dev[i].sector); md_write_end(conf->mddev); bio_io_error(bi); @@ -3405,7 +3424,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } if (bitmap_end) md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + RAID5_STRIPE_SECTORS(conf), 0, 0); bitmap_end = 0; /* and fail all 'written' */ bi = sh->dev[i].written; @@ -3417,8 +3436,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) bitmap_end = 1; while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { - struct bio *bi2 = r5_next_bio(bi, sh->dev[i].sector); + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { + struct bio *bi2 = r5_next_bio(conf, bi, sh->dev[i].sector); md_write_end(conf->mddev); bio_io_error(bi); @@ -3441,9 +3460,9 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, if (bi) s->to_read--; while (bi && bi->bi_iter.bi_sector < - sh->dev[i].sector + STRIPE_SECTORS) { + sh->dev[i].sector + RAID5_STRIPE_SECTORS(conf)) { struct bio *nextbi = - r5_next_bio(bi, sh->dev[i].sector); + r5_next_bio(conf, bi, sh->dev[i].sector); bio_io_error(bi); bi = nextbi; @@ -3451,7 +3470,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh, } if (bitmap_end) md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, 0, 0); + RAID5_STRIPE_SECTORS(conf), 0, 0); /* If we were in the middle of a write the parity block might * still be locked - so 
just clear all R5_LOCKED flags */ @@ -3496,14 +3515,14 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && !test_bit(In_sync, &rdev->flags) && !rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) abort = 1; } rcu_read_unlock(); @@ -3511,7 +3530,7 @@ handle_failed_sync(struct r5conf *conf, struct stripe_head *sh, conf->recovery_disabled = conf->mddev->recovery_disabled; } - md_done_sync(conf->mddev, STRIPE_SECTORS, !abort); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), !abort); } static int want_replace(struct stripe_head *sh, int disk_idx) @@ -3785,14 +3804,14 @@ returnbi: wbi = dev->written; dev->written = NULL; while (wbi && wbi->bi_iter.bi_sector < - dev->sector + STRIPE_SECTORS) { - wbi2 = r5_next_bio(wbi, dev->sector); + dev->sector + RAID5_STRIPE_SECTORS(conf)) { + wbi2 = r5_next_bio(conf, wbi, dev->sector); md_write_end(conf->mddev); bio_endio(wbi); wbi = wbi2; } md_bitmap_endwrite(conf->mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), !test_bit(STRIPE_DEGRADED, &sh->state), 0); if (head_sh->batch_head) { @@ -4099,7 +4118,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, */ set_bit(STRIPE_INSYNC, &sh->state); else { - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! 
*/ set_bit(STRIPE_INSYNC, &sh->state); @@ -4107,7 +4126,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, "%llu-%llu\n", mdname(conf->mddev), (unsigned long long) sh->sector, (unsigned long long) sh->sector + - STRIPE_SECTORS); + RAID5_STRIPE_SECTORS(conf)); } else { sh->check_state = check_state_compute_run; set_bit(STRIPE_COMPUTE_RUN, &sh->state); @@ -4264,7 +4283,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, */ } } else { - atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); + atomic64_add(RAID5_STRIPE_SECTORS(conf), &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) { /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -4272,7 +4291,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, "%llu-%llu\n", mdname(conf->mddev), (unsigned long long) sh->sector, (unsigned long long) sh->sector + - STRIPE_SECTORS); + RAID5_STRIPE_SECTORS(conf)); } else { int *target = &sh->ops.target; @@ -4343,7 +4362,7 @@ static void handle_stripe_expansion(struct r5conf *conf, struct stripe_head *sh) /* place all the copies on one channel */ init_async_submit(&submit, 0, tx, NULL, NULL, NULL); tx = async_memcpy(sh2->dev[dd_idx].page, - sh->dev[i].page, 0, 0, STRIPE_SIZE, + sh->dev[i].page, 0, 0, RAID5_STRIPE_SIZE(conf), &submit); set_bit(R5_Expanded, &sh2->dev[dd_idx].flags); @@ -4442,8 +4461,8 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) */ rdev = rcu_dereference(conf->disks[i].replacement); if (rdev && !test_bit(Faulty, &rdev->flags) && - rdev->recovery_offset >= sh->sector + STRIPE_SECTORS && - !is_badblock(rdev, sh->sector, STRIPE_SECTORS, + rdev->recovery_offset >= sh->sector + RAID5_STRIPE_SECTORS(conf) && + !is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors)) set_bit(R5_ReadRepl, &dev->flags); else { @@ -4457,7 +4476,7 @@ static void 
analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) if (rdev && test_bit(Faulty, &rdev->flags)) rdev = NULL; if (rdev) { - is_bad = is_badblock(rdev, sh->sector, STRIPE_SECTORS, + is_bad = is_badblock(rdev, sh->sector, RAID5_STRIPE_SECTORS(conf), &first_bad, &bad_sectors); if (s->blocked_rdev == NULL && (test_bit(Blocked, &rdev->flags) @@ -4484,7 +4503,7 @@ static void analyse_stripe(struct stripe_head *sh, struct stripe_head_state *s) } } else if (test_bit(In_sync, &rdev->flags)) set_bit(R5_Insync, &dev->flags); - else if (sh->sector + STRIPE_SECTORS <= rdev->recovery_offset) + else if (sh->sector + RAID5_STRIPE_SECTORS(conf) <= rdev->recovery_offset) /* in sync if before recovery_offset */ set_bit(R5_Insync, &dev->flags); else if (test_bit(R5_UPTODATE, &dev->flags) && @@ -4932,7 +4951,7 @@ static void handle_stripe(struct stripe_head *sh) if ((s.syncing || s.replacing) && s.locked == 0 && !test_bit(STRIPE_COMPUTE_RUN, &sh->state) && test_bit(STRIPE_INSYNC, &sh->state)) { - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); clear_bit(STRIPE_SYNCING, &sh->state); if (test_and_clear_bit(R5_Overlap, &sh->dev[sh->pd_idx].flags)) wake_up(&conf->wait_for_overlap); @@ -5000,7 +5019,7 @@ static void handle_stripe(struct stripe_head *sh) clear_bit(STRIPE_EXPAND_READY, &sh->state); atomic_dec(&conf->reshape_stripes); wake_up(&conf->wait_for_overlap); - md_done_sync(conf->mddev, STRIPE_SECTORS, 1); + md_done_sync(conf->mddev, RAID5_STRIPE_SECTORS(conf), 1); } if (s.expanding && s.locked == 0 && @@ -5030,14 +5049,14 @@ finish: /* We own a safe reference to the rdev */ rdev = conf->disks[i].rdev; if (!rdev_set_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0)) + RAID5_STRIPE_SECTORS(conf), 0)) md_error(conf->mddev, rdev); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGood, &dev->flags)) { rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0); + 
RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } if (test_and_clear_bit(R5_MadeGoodRepl, &dev->flags)) { @@ -5046,7 +5065,7 @@ finish: /* rdev have been moved down */ rdev = conf->disks[i].rdev; rdev_clear_badblocks(rdev, sh->sector, - STRIPE_SECTORS, 0); + RAID5_STRIPE_SECTORS(conf), 0); rdev_dec_pending(rdev, conf->mddev); } } @@ -5510,7 +5529,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) /* Skip discard while reshape is happening */ return; - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; @@ -5525,7 +5544,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) last_sector *= conf->chunk_sectors; for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS) { + logical_sector += RAID5_STRIPE_SECTORS(conf)) { DEFINE_WAIT(w); int d; again: @@ -5570,7 +5589,7 @@ static void make_discard_request(struct mddev *mddev, struct bio *bi) d++) md_bitmap_startwrite(mddev->bitmap, sh->sector, - STRIPE_SECTORS, + RAID5_STRIPE_SECTORS(conf), 0); sh->bm_seq = conf->seq_flush + 1; set_bit(STRIPE_BIT_DELAY, &sh->state); @@ -5635,12 +5654,12 @@ static bool raid5_make_request(struct mddev *mddev, struct bio * bi) return true; } - logical_sector = bi->bi_iter.bi_sector & ~((sector_t)STRIPE_SECTORS-1); + logical_sector = bi->bi_iter.bi_sector & ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); last_sector = bio_end_sector(bi); bi->bi_next = NULL; prepare_to_wait(&conf->wait_for_overlap, &w, TASK_UNINTERRUPTIBLE); - for (;logical_sector < last_sector; logical_sector += STRIPE_SECTORS) { + for (; logical_sector < last_sector; logical_sector += RAID5_STRIPE_SECTORS(conf)) { int previous; int seq; @@ -5921,7 +5940,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk } INIT_LIST_HEAD(&stripes); - for (i = 0; i < 
reshape_sectors; i += STRIPE_SECTORS) { + for (i = 0; i < reshape_sectors; i += RAID5_STRIPE_SECTORS(conf)) { int j; int skipped_disk = 0; sh = raid5_get_active_stripe(conf, stripe_addr+i, 0, 0, 1); @@ -5942,7 +5961,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk skipped_disk = 1; continue; } - memset(page_address(sh->dev[j].page), 0, STRIPE_SIZE); + memset(page_address(sh->dev[j].page), 0, RAID5_STRIPE_SIZE(conf)); set_bit(R5_Expanded, &sh->dev[j].flags); set_bit(R5_UPTODATE, &sh->dev[j].flags); } @@ -5977,7 +5996,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk set_bit(STRIPE_EXPAND_SOURCE, &sh->state); set_bit(STRIPE_HANDLE, &sh->state); raid5_release_stripe(sh); - first_sector += STRIPE_SECTORS; + first_sector += RAID5_STRIPE_SECTORS(conf); } /* Now that the sources are clearly marked, we can release * the destination stripes @@ -6083,11 +6102,12 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n if (!test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && !conf->fullsync && !md_bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && - sync_blocks >= STRIPE_SECTORS) { + sync_blocks >= RAID5_STRIPE_SECTORS(conf)) { /* we can skip this block, and probably more */ - sync_blocks /= STRIPE_SECTORS; + sync_blocks /= RAID5_STRIPE_SECTORS(conf); *skipped = 1; - return sync_blocks * STRIPE_SECTORS; /* keep things rounded to whole stripes */ + /* keep things rounded to whole stripes */ + return sync_blocks * RAID5_STRIPE_SECTORS(conf); } md_bitmap_cond_end_sync(mddev->bitmap, sector_nr, false); @@ -6120,7 +6140,7 @@ static inline sector_t raid5_sync_request(struct mddev *mddev, sector_t sector_n raid5_release_stripe(sh); - return STRIPE_SECTORS; + return RAID5_STRIPE_SECTORS(conf); } static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, @@ -6143,14 +6163,14 @@ static int retry_aligned_read(struct r5conf *conf, struct bio *raid_bio, int 
handled = 0; logical_sector = raid_bio->bi_iter.bi_sector & - ~((sector_t)STRIPE_SECTORS-1); + ~((sector_t)RAID5_STRIPE_SECTORS(conf)-1); sector = raid5_compute_sector(conf, logical_sector, 0, &dd_idx, NULL); last_sector = bio_end_sector(raid_bio); for (; logical_sector < last_sector; - logical_sector += STRIPE_SECTORS, - sector += STRIPE_SECTORS, + logical_sector += RAID5_STRIPE_SECTORS(conf), + sector += RAID5_STRIPE_SECTORS(conf), scnt++) { if (scnt < offset) @@ -6770,7 +6790,7 @@ static int alloc_scratch_buffer(struct r5conf *conf, struct raid5_percpu *percpu conf->previous_raid_disks), max(conf->chunk_sectors, conf->prev_chunk_sectors) - / STRIPE_SECTORS)) { + / RAID5_STRIPE_SECTORS(conf))) { free_scratch_buffer(conf, percpu); return -ENOMEM; } @@ -6922,6 +6942,7 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf = kzalloc(sizeof(struct r5conf), GFP_KERNEL); if (conf == NULL) goto abort; + INIT_LIST_HEAD(&conf->free_list); INIT_LIST_HEAD(&conf->pending_list); conf->pending_data = kcalloc(PENDING_IO_MAX, @@ -7073,8 +7094,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) conf->min_nr_stripes = NR_STRIPES; if (mddev->reshape_position != MaxSector) { int stripes = max_t(int, - ((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4, - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4); + ((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4, + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4); conf->min_nr_stripes = max(NR_STRIPES, stripes); if (conf->min_nr_stripes != NR_STRIPES) pr_info("md/raid:%s: force stripe size %d for reshape\n", @@ -7805,14 +7826,14 @@ static int check_stripe_cache(struct mddev *mddev) * stripe_heads first. 
*/ struct r5conf *conf = mddev->private; - if (((mddev->chunk_sectors << 9) / STRIPE_SIZE) * 4 + if (((mddev->chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 > conf->min_nr_stripes || - ((mddev->new_chunk_sectors << 9) / STRIPE_SIZE) * 4 + ((mddev->new_chunk_sectors << 9) / RAID5_STRIPE_SIZE(conf)) * 4 > conf->min_nr_stripes) { pr_warn("md/raid:%s: reshape: not enough stripes. Needed %lu\n", mdname(mddev), ((max(mddev->chunk_sectors, mddev->new_chunk_sectors) << 9) - / STRIPE_SIZE)*4); + / RAID5_STRIPE_SIZE(conf))*4); return 0; } return 1; @@ -8144,7 +8165,7 @@ static void *raid5_takeover_raid1(struct mddev *mddev) while (chunksect && (mddev->array_sectors & (chunksect-1))) chunksect >>= 1; - if ((chunksect<<9) < STRIPE_SIZE) + if ((chunksect<<9) < RAID5_STRIPE_SIZE((struct r5conf *)mddev->private)) /* array size does not allow a suitable chunk size */ return ERR_PTR(-EINVAL); diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index f90e0704bed9..ca21f30e31da 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -481,23 +481,6 @@ struct disk_info { #define HASH_MASK (NR_HASH - 1) #define MAX_STRIPE_BATCH 8 -/* bio's attached to a stripe+device for I/O are linked together in bi_sector - * order without overlap. There may be several bio's per stripe+device, and - * a bio could span several devices. - * When walking this list for a particular stripe+device, we must never proceed - * beyond a bio that extends past this device, as the next bio might no longer - * be valid. - * This function is used to determine the 'next' bio in the list, given the - * sector of the current stripe+device - */ -static inline struct bio *r5_next_bio(struct bio *bio, sector_t sector) -{ - if (bio_end_sector(bio) < sector + STRIPE_SECTORS) - return bio->bi_next; - else - return NULL; -} - /* NOTE NR_STRIPE_HASH_LOCKS must remain below 64. 
* This is because we sometimes take all the spinlocks * and creating that much locking depth can cause @@ -690,6 +673,26 @@ struct r5conf { struct r5pending_data *next_pending_data; }; +#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE +#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT +#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS + +/* bio's attached to a stripe+device for I/O are linked together in bi_sector + * order without overlap. There may be several bio's per stripe+device, and + * a bio could span several devices. + * When walking this list for a particular stripe+device, we must never proceed + * beyond a bio that extends past this device, as the next bio might no longer + * be valid. + * This function is used to determine the 'next' bio in the list, given the + * sector of the current stripe+device + */ +static inline struct bio *r5_next_bio(struct r5conf *conf, struct bio *bio, sector_t sector) +{ + if (bio_end_sector(bio) < sector + RAID5_STRIPE_SECTORS(conf)) + return bio->bi_next; + else + return NULL; +} /* * Our supported algorithms From e236858243d7a8e0ac60972d2f9522146088a736 Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Sat, 18 Jul 2020 05:29:08 -0400 Subject: [PATCH 09/14] md/raid5: set default stripe_size as 4096 In RAID5, if the issued bio size is bigger than stripe_size, it will be split in units of stripe_size and processed one by one. Even for sizes less than stripe_size, RAID5 also requests at least stripe_size of data from disk. Nowadays, stripe_size is equal to the value of PAGE_SIZE. Since filesystems usually issue bios in units of 4KB, there is no problem when PAGE_SIZE is 4KB. But, for 64KB PAGE_SIZE, a bio from the filesystem requests 4KB of data while RAID5 issues IO of at least stripe_size (64KB) each time. That wastes disk bandwidth and xor computation. To avoid the waste, we want to make stripe_size configurable. This patch just sets the default stripe_size to 4096.
Users can also set the value bigger than 4KB for some special requirements, such as when we know the issued io size is more than 4KB. To evaluate the new feature, we create raid5 device '/dev/md5' with 4 SSD disks and test it on an arm64 machine with 64KB PAGE_SIZE. 1) We format /dev/md5 with mkfs.ext4 and mount ext4 with the default configuration on the /mnt directory. Then we test it by dbench with the command: dbench -D /mnt -t 1000 10. The results are shown below: 'stripe_size = 64KB' Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 9805011 0.021 64.728 Close 7202525 0.001 0.120 Rename 415213 0.051 44.681 Unlink 1980066 0.079 93.147 Deltree 240 1.793 6.516 Mkdir 120 0.004 0.007 Qpathinfo 8887512 0.007 37.114 Qfileinfo 1557262 0.001 0.030 Qfsinfo 1629582 0.012 0.152 Sfileinfo 798756 0.040 57.641 Find 3436004 0.019 57.782 WriteX 4887239 0.021 57.638 ReadX 15370483 0.005 37.818 LockX 31934 0.003 0.022 UnlockX 31933 0.001 0.021 Flush 687205 13.302 530.088 Throughput 307.799 MB/sec 10 clients 10 procs max_latency=530.091 ms ------------------------------------------------------- 'stripe_size = 4KB' Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 11999166 0.021 36.380 Close 8814128 0.001 0.122 Rename 508113 0.051 29.169 Unlink 2423242 0.070 38.141 Deltree 300 1.885 7.155 Mkdir 150 0.004 0.006 Qpathinfo 10875921 0.007 35.485 Qfileinfo 1905837 0.001 0.032 Qfsinfo 1994304 0.012 0.125 Sfileinfo 977450 0.029 26.489 Find 4204952 0.019 9.361 WriteX 5981890 0.019 27.804 ReadX 18809742 0.004 33.491 LockX 39074 0.003 0.025 UnlockX 39074 0.001 0.014 Flush 841022 10.712 458.848 Throughput 376.777 MB/sec 10 clients 10 procs max_latency=458.852 ms ------------------------------------------------------- It shows that setting stripe_size to 4KB gives higher throughput (376.777 vs 307.799 MB/sec) and smaller latency than setting it to 64KB.
2) We try to evaluate IO throughput for /dev/md5 by fio with config: [4KB randwrite] direct=1 numjob=2 iodepth=64 ioengine=libaio filename=/dev/md5 bs=4KB rw=randwrite [64KB write] direct=1 numjob=2 iodepth=64 ioengine=libaio filename=/dev/md5 bs=1MB rw=write The results are as follows: + + | stripe_size(64KB) | stripe_size(4KB) +----------------------------------------------------+ 4KB randwrite | 15MB/s | 100MB/s +----------------------------------------------------+ 1MB write | 1000MB/s | 700MB/s The results show that when the size of io is bigger than 4KB (64KB), 64KB stripe_size has much higher throughput. But for 4KB randwrite, that is, when the size of io issued to the device is smaller, 4KB stripe_size has better performance. Normally, the default value (4096) can get relatively good performance. But if each issued io is bigger than 4096, setting a value larger than 4096 may get better performance. Here, we just set the default stripe_size to 4096, and we will try to support setting a different stripe_size via a sysfs interface in the following patch.
Signed-off-by: Yufen Yu Signed-off-by: Song Liu --- drivers/md/raid5.c | 5 +++++ drivers/md/raid5.h | 22 +++++++++++++++++++--- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 18f20f3d9664..60cfa397f2d6 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6943,6 +6943,11 @@ static struct r5conf *setup_conf(struct mddev *mddev) if (conf == NULL) goto abort; +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + conf->stripe_size = DEFAULT_STRIPE_SIZE; + conf->stripe_shift = ilog2(DEFAULT_STRIPE_SIZE) - 9; + conf->stripe_sectors = DEFAULT_STRIPE_SIZE >> 9; +#endif INIT_LIST_HEAD(&conf->free_list); INIT_LIST_HEAD(&conf->pending_list); conf->pending_data = kcalloc(PENDING_IO_MAX, diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index ca21f30e31da..7fb3b26a181a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -472,9 +472,14 @@ struct disk_info { */ #define NR_STRIPES 256 +#define DEFAULT_STRIPE_SIZE 4096 + +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE #define STRIPE_SIZE PAGE_SIZE #define STRIPE_SHIFT (PAGE_SHIFT - 9) #define STRIPE_SECTORS (STRIPE_SIZE>>9) +#endif + #define IO_THRESHOLD 1 #define BYPASS_THRESHOLD 1 #define NR_HASH (PAGE_SIZE / sizeof(struct hlist_head)) @@ -557,6 +562,11 @@ struct r5conf { int raid_disks; int max_nr_stripes; int min_nr_stripes; +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE + unsigned long stripe_size; + unsigned int stripe_shift; + unsigned long stripe_sectors; +#endif /* reshape_progress is the leading edge of a 'reshape' * It has value MaxSector when no reshape is happening @@ -673,9 +683,15 @@ struct r5conf { struct r5pending_data *next_pending_data; }; -#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE -#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT -#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS +#if PAGE_SIZE == DEFAULT_STRIPE_SIZE +#define RAID5_STRIPE_SIZE(conf) STRIPE_SIZE +#define RAID5_STRIPE_SHIFT(conf) STRIPE_SHIFT +#define RAID5_STRIPE_SECTORS(conf) STRIPE_SECTORS +#else 
+#define RAID5_STRIPE_SIZE(conf) ((conf)->stripe_size) +#define RAID5_STRIPE_SHIFT(conf) ((conf)->stripe_shift) +#define RAID5_STRIPE_SECTORS(conf) ((conf)->stripe_sectors) +#endif /* bio's attached to a stripe+device for I/O are linked together in bi_sector * order without overlap. There may be several bio's per stripe+device, and From 3b5408b98e4db62b322f8516a0d08f95f197c42f Mon Sep 17 00:00:00 2001 From: Yufen Yu Date: Sat, 18 Jul 2020 05:29:09 -0400 Subject: [PATCH 10/14] md/raid5: support config stripe_size by sysfs entry Adding a new 'stripe_size' sysfs entry to set and show stripe_size. stripe_size should not be bigger than PAGE_SIZE, and it requires to be multiple of 4096. We can adjust stripe_size by writing value into sysfs entry, likely, set stripe_size as 16KB: echo 16384 > /sys/block/md1/md/stripe_size Show current stripe_size value: cat /sys/block/md1/md/stripe_size For PAGE_SIZE is equal to 4096, 'stripe_size' can just be read. Signed-off-by: Yufen Yu Signed-off-by: Song Liu --- drivers/md/raid5.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 60cfa397f2d6..40961dd1777b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -6503,6 +6503,77 @@ raid5_rmw_level = __ATTR(rmw_level, S_IRUGO | S_IWUSR, raid5_show_rmw_level, raid5_store_rmw_level); +static ssize_t +raid5_show_stripe_size(struct mddev *mddev, char *page) +{ + struct r5conf *conf; + int ret = 0; + + spin_lock(&mddev->lock); + conf = mddev->private; + if (conf) + ret = sprintf(page, "%lu\n", RAID5_STRIPE_SIZE(conf)); + spin_unlock(&mddev->lock); + return ret; +} + +#if PAGE_SIZE != DEFAULT_STRIPE_SIZE +static ssize_t +raid5_store_stripe_size(struct mddev *mddev, const char *page, size_t len) +{ + struct r5conf *conf; + unsigned long new; + int err; + + if (len >= PAGE_SIZE) + return -EINVAL; + if (kstrtoul(page, 10, &new)) + return -EINVAL; + + /* + * The value should not be bigger than 
PAGE_SIZE. It requires to + * be multiple of DEFAULT_STRIPE_SIZE. + */ + if (new % DEFAULT_STRIPE_SIZE != 0 || new > PAGE_SIZE || new == 0) + return -EINVAL; + + err = mddev_lock(mddev); + if (err) + return err; + + conf = mddev->private; + if (!conf) { + err = -ENODEV; + goto out_unlock; + } + + if (new == conf->stripe_size) + goto out_unlock; + + pr_debug("md/raid: change stripe_size from %lu to %lu\n", + conf->stripe_size, new); + + mddev_suspend(mddev); + conf->stripe_size = new; + conf->stripe_shift = ilog2(new) - 9; + conf->stripe_sectors = new >> 9; + mddev_resume(mddev); + +out_unlock: + mddev_unlock(mddev); + return err ?: len; +} + +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0644, + raid5_show_stripe_size, + raid5_store_stripe_size); +#else +static struct md_sysfs_entry +raid5_stripe_size = __ATTR(stripe_size, 0444, + raid5_show_stripe_size, + NULL); +#endif static ssize_t raid5_show_preread_threshold(struct mddev *mddev, char *page) @@ -6691,6 +6762,7 @@ static struct attribute *raid5_attrs[] = { &raid5_group_thread_cnt.attr, &raid5_skip_copy.attr, &raid5_rmw_level.attr, + &raid5_stripe_size.attr, &r5c_journal_mode.attr, &ppl_write_hint.attr, NULL, From 7c9d5c54fb72d92fe3fd7d3a3eb58b5200e454c4 Mon Sep 17 00:00:00 2001 From: Zhao Heming Date: Tue, 21 Jul 2020 02:08:52 +0800 Subject: [PATCH 11/14] md-cluster: fix safemode_delay value when converting to clustered bitmap When array convert to clustered bitmap, the safe_mode_delay doesn't clean and vice versa. the /sys/block/mdX/md/safe_mode_delay keep original value after changing bitmap type. In safe_delay_store(), the code forbids setting mddev->safemode_delay when array is clustered. So in cluster-md env, the expected safemode_delay value should be 0. 
Reproducible steps: ``` node1 # mdadm --zero-superblock /dev/sd{b,c,d} node1 # mdadm -C /dev/md0 -b internal -e 1.2 -n 2 -l mirror /dev/sdb /dev/sdc node1 # cat /sys/block/md0/md/safe_mode_delay 0.204 node1 # mdadm -G /dev/md0 -b none node1 # mdadm --grow /dev/md0 --bitmap=clustered node1 # cat /sys/block/md0/md/safe_mode_delay 0.204 <== doesn't change, should ZERO for cluster-md node1 # mdadm --zero-superblock /dev/sd{b,c,d} node1 # mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sdb /dev/sdc node1 # cat /sys/block/md0/md/safe_mode_delay 0.000 node1 # mdadm -G /dev/md0 -b none node1 # cat /sys/block/md0/md/safe_mode_delay 0.000 <== doesn't change, should default value ``` Reviewed-by: NeilBrown Signed-off-by: Zhao Heming Signed-off-by: Song Liu --- drivers/md/md.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index 9d740e4181ff..79a6b2f423dd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -101,6 +101,8 @@ static void mddev_detach(struct mddev *mddev); * count by 2 for every hour elapsed between read errors. */ #define MD_DEFAULT_MAX_CORRECTED_READ_ERRORS 20 +/* Default safemode delay: 200 msec */ +#define DEFAULT_SAFEMODE_DELAY ((200 * HZ)/1000 +1) /* * Current RAID-1,4,5 parallel reconstruction 'guaranteed speed limit' * is 1000 KB/sec, so the extra system load does not show up that much. 
@@ -6034,7 +6036,7 @@ int md_run(struct mddev *mddev) if (mddev_is_clustered(mddev)) mddev->safemode_delay = 0; else - mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */ + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; mddev->in_sync = 1; smp_wmb(); spin_lock(&mddev->lock); @@ -7413,6 +7415,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); + mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } mddev_suspend(mddev); md_bitmap_destroy(mddev); @@ -8408,6 +8411,7 @@ EXPORT_SYMBOL(unregister_md_cluster_operations); int md_setup_cluster(struct mddev *mddev, int nodes) { + int ret; if (!md_cluster_ops) request_module("md-cluster"); spin_lock(&pers_lock); @@ -8419,7 +8423,10 @@ int md_setup_cluster(struct mddev *mddev, int nodes) } spin_unlock(&pers_lock); - return md_cluster_ops->join(mddev, nodes); + ret = md_cluster_ops->join(mddev, nodes); + if (!ret) + mddev->safemode_delay = 0; + return ret; } void md_cluster_stop(struct mddev *mddev) From edee9dfe51b77ad6532a1014a7be91b2a423bc37 Mon Sep 17 00:00:00 2001 From: Zhao Heming Date: Tue, 21 Jul 2020 02:08:53 +0800 Subject: [PATCH 12/14] md-cluster: fix rmmod issue when md_cluster convert bitmap to none update_array_info misses calling module_put when removing cluster bitmap. steps to reproduce: ``` node1 # mdadm -C /dev/md0 -b clustered -e 1.2 -n 2 -l mirror /dev/sda /dev/sdb mdadm: array /dev/md0 started. 
node1 # lsmod | egrep "dlm|md_|raid1" md_cluster 28672 1 dlm 212992 14 md_cluster configfs 57344 2 dlm raid1 53248 1 md_mod 176128 2 raid1,md_cluster node1 # mdadm -G /dev/md0 -b none node1 # lsmod | egrep "dlm|md_|raid1" md_cluster 28672 1 <== should be zero dlm 212992 9 md_cluster configfs 57344 2 dlm raid1 53248 1 md_mod 176128 2 raid1,md_cluster node1 # mdadm -G /dev/md0 -b clustered node1 # lsmod | egrep "dlm|md_|raid1" md_cluster 28672 2 <== increase dlm 212992 14 md_cluster configfs 57344 2 dlm raid1 53248 1 md_mod 176128 2 raid1,md_cluster node1 # mdadm -G /dev/md0 -b none node1 # mdadm -G /dev/md0 -b clustered node1 # lsmod | egrep "dlm|md_|raid1" md_cluster 28672 3 <== increase dlm 212992 14 md_cluster configfs 57344 2 dlm raid1 53248 1 md_mod 176128 2 raid1,md_cluster ``` Reviewed-by: NeilBrown Signed-off-by: Zhao Heming Signed-off-by: Song Liu --- drivers/md/md.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/md/md.c b/drivers/md/md.c index 79a6b2f423dd..ea48bc25cce1 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -7415,6 +7415,7 @@ static int update_array_info(struct mddev *mddev, mdu_array_info_t *info) mddev->bitmap_info.nodes = 0; md_cluster_ops->leave(mddev); + module_put(md_cluster_mod); mddev->safemode_delay = DEFAULT_SAFEMODE_DELAY; } mddev_suspend(mddev); From c333f9495c451d958c6f4a41e5de2d8f80f79496 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 17 Jul 2020 16:37:13 -0700 Subject: [PATCH 13/14] raid: md_p.h: drop duplicated word in a comment Drop the doubled word "the" in a comment. 
Signed-off-by: Randy Dunlap Cc: Song Liu Cc: linux-raid@vger.kernel.org Signed-off-by: Song Liu --- include/uapi/linux/raid/md_p.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/uapi/linux/raid/md_p.h b/include/uapi/linux/raid/md_p.h index 1f2d8c81f0e0..e5a98a16f9b0 100644 --- a/include/uapi/linux/raid/md_p.h +++ b/include/uapi/linux/raid/md_p.h @@ -123,7 +123,7 @@ typedef struct mdp_device_descriptor_s { /* * Notes: - * - if an array is being reshaped (restriped) in order to change the + * - if an array is being reshaped (restriped) in order to change * the number of active devices in the array, 'raid_disks' will be * the larger of the old and new numbers. 'delta_disks' will * be the "new - old". So if +ve, raid_disks is the new value, and From fe630de009d0729584d79c78f43121e07c745fdc Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Tue, 3 Mar 2020 13:14:40 -0500 Subject: [PATCH 14/14] md/raid10: avoid deadlock on recovery. When disk failure happens and the array has a spare drive, resync thread kicks in and starts to refill the spare. However it may get blocked by a retry thread that resubmits failed IO to a mirror and itself can get blocked on a barrier raised by the resync thread. Acked-by: Nigel Croxon Signed-off-by: Vitaly Mayatskikh Signed-off-by: Song Liu --- drivers/md/raid10.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 14b1ba732cd7..cefda2abd34f 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -980,6 +980,7 @@ static void wait_barrier(struct r10conf *conf) { spin_lock_irq(&conf->resync_lock); if (conf->barrier) { + struct bio_list *bio_list = current->bio_list; conf->nr_waiting++; /* Wait for the barrier to drop. 
* However if there are already pending @@ -994,9 +995,16 @@ static void wait_barrier(struct r10conf *conf) wait_event_lock_irq(conf->wait_barrier, !conf->barrier || (atomic_read(&conf->nr_pending) && - current->bio_list && - (!bio_list_empty(&current->bio_list[0]) || - !bio_list_empty(&current->bio_list[1]))), + bio_list && + (!bio_list_empty(&bio_list[0]) || + !bio_list_empty(&bio_list[1]))) || + /* move on if recovery thread is + * blocked by us + */ + (conf->mddev->thread->tsk == current && + test_bit(MD_RECOVERY_RUNNING, + &conf->mddev->recovery) && + conf->nr_queued > 0), conf->resync_lock); conf->nr_waiting--; if (!conf->nr_waiting)