diff --git a/Makefile b/Makefile index 502b397496ef..48389320ae04 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 5 PATCHLEVEL = 10 SUBLEVEL = 0 -EXTRAVERSION = -rc7 +EXTRAVERSION = NAME = Kleptomaniac Octopus # *DOCUMENTATION* diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 816b31c68550..394757ee030a 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -155,6 +155,7 @@ enum page_cache_mode { #define _PAGE_ENC (_AT(pteval_t, sme_me_mask)) #define _PAGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT) +#define _PAGE_LARGE_CACHE_MASK (_PAGE_PWT | _PAGE_PCD | _PAGE_PAT_LARGE) #define _PAGE_NOCACHE (cachemode2protval(_PAGE_CACHE_MODE_UC)) #define _PAGE_CACHE_WP (cachemode2protval(_PAGE_CACHE_MODE_WP)) diff --git a/arch/x86/include/asm/sync_core.h b/arch/x86/include/asm/sync_core.h index 0fd4a9dfb29c..ab7382f92aff 100644 --- a/arch/x86/include/asm/sync_core.h +++ b/arch/x86/include/asm/sync_core.h @@ -98,12 +98,13 @@ static inline void sync_core_before_usermode(void) /* With PTI, we unconditionally serialize before running user code. */ if (static_cpu_has(X86_FEATURE_PTI)) return; + /* - * Return from interrupt and NMI is done through iret, which is core - * serializing. + * Even if we're in an interrupt, we might reschedule before returning, + * in which case we could switch to a different thread in the same mm + * and return using SYSRET or SYSEXIT. Instead of trying to keep + * track of our need to sync the core, just sync right away. */ - if (in_irq() || in_nmi()) - return; sync_core(); } diff --git a/arch/x86/kernel/apic/vector.c b/arch/x86/kernel/apic/vector.c index 1eac53632786..758bbf25ef74 100644 --- a/arch/x86/kernel/apic/vector.c +++ b/arch/x86/kernel/apic/vector.c @@ -273,20 +273,24 @@ static int assign_irq_vector_any_locked(struct irq_data *irqd) const struct cpumask *affmsk = irq_data_get_affinity_mask(irqd); int node = irq_data_get_node(irqd); - if (node == NUMA_NO_NODE) - goto all; - /* Try the intersection of @affmsk and node mask */ - cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); - if (!assign_vector_locked(irqd, vector_searchmask)) - return 0; - /* Try the node mask */ - if (!assign_vector_locked(irqd, cpumask_of_node(node))) - return 0; -all: + if (node != NUMA_NO_NODE) { + /* Try the intersection of @affmsk and node mask */ + cpumask_and(vector_searchmask, cpumask_of_node(node), affmsk); + if (!assign_vector_locked(irqd, vector_searchmask)) + return 0; + } + /* Try the full affinity mask */ cpumask_and(vector_searchmask, affmsk, cpu_online_mask); if (!assign_vector_locked(irqd, vector_searchmask)) return 0; + + if (node != NUMA_NO_NODE) { + /* Try the node mask */ + if (!assign_vector_locked(irqd, cpumask_of_node(node))) + return 0; + } + /* Try the full online mask */ return assign_vector_locked(irqd, cpu_online_mask); } diff --git a/arch/x86/kernel/cpu/resctrl/monitor.c b/arch/x86/kernel/cpu/resctrl/monitor.c index 54dffe574e67..a98519a3a2e6 100644 --- a/arch/x86/kernel/cpu/resctrl/monitor.c +++ b/arch/x86/kernel/cpu/resctrl/monitor.c @@ -279,7 +279,6 @@ static void mbm_bw_count(u32 rmid, struct rmid_read *rr) return; chunks = mbm_overflow_count(m->prev_bw_msr, tval, rr->r->mbm_width); - m->chunks += chunks; cur_bw = (chunks * r->mon_scale) >> 20; if (m->delta_comp) @@ -450,15 +449,14 @@ static void mbm_update(struct rdt_resource *r, struct rdt_domain *d, int rmid) } if (is_mbm_local_enabled()) { rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); /* * Call the MBA software controller only for the * control groups and when user has enabled * the software controller explicitly. */ - if (!is_mba_sc(NULL)) - __mon_event_count(rmid, &rr); - else + if (is_mba_sc(NULL)) mbm_bw_count(rmid, &rr); } } diff --git a/arch/x86/kernel/kprobes/opt.c b/arch/x86/kernel/kprobes/opt.c index 041f0b50bc27..08eb23074f92 100644 --- a/arch/x86/kernel/kprobes/opt.c +++ b/arch/x86/kernel/kprobes/opt.c @@ -272,6 +272,19 @@ static int insn_is_indirect_jump(struct insn *insn) return ret; } +static bool is_padding_int3(unsigned long addr, unsigned long eaddr) +{ + unsigned char ops; + + for (; addr < eaddr; addr++) { + if (get_kernel_nofault(ops, (void *)addr) < 0 || + ops != INT3_INSN_OPCODE) + return false; + } + + return true; +} + /* Decode whole function to ensure any instructions don't jump into target */ static int can_optimize(unsigned long paddr) { @@ -310,9 +323,14 @@ static int can_optimize(unsigned long paddr) return 0; kernel_insn_init(&insn, (void *)recovered_insn, MAX_INSN_SIZE); insn_get_length(&insn); - /* Another subsystem puts a breakpoint */ + /* + * In the case of detecting unknown breakpoint, this could be + * a padding INT3 between functions. Let's check that all the + * rest of the bytes are also INT3. + */ if (insn.opcode.bytes[0] == INT3_INSN_OPCODE) - return 0; + return is_padding_int3(addr, paddr - offset + size) ? 1 : 0; + /* Recover address */ insn.kaddr = (void *)addr; insn.next_byte = (void *)(addr + insn.length); diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 733b983f3a89..6c5eb6f3f14f 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -45,8 +45,8 @@ #define PMD_FLAGS_LARGE (__PAGE_KERNEL_LARGE_EXEC & ~_PAGE_GLOBAL) #define PMD_FLAGS_DEC PMD_FLAGS_LARGE -#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_CACHE_MASK) | \ - (_PAGE_PAT | _PAGE_PWT)) +#define PMD_FLAGS_DEC_WP ((PMD_FLAGS_DEC & ~_PAGE_LARGE_CACHE_MASK) | \ + (_PAGE_PAT_LARGE | _PAGE_PWT)) #define PMD_FLAGS_ENC (PMD_FLAGS_LARGE | _PAGE_ENC) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 11666ba19b62..569ac1d57f55 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -474,8 +474,14 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, /* * The membarrier system call requires a full memory barrier and * core serialization before returning to user-space, after - * storing to rq->curr. Writing to CR3 provides that full - * memory barrier and core serializing instruction. + * storing to rq->curr, when changing mm. This is because + * membarrier() sends IPIs to all CPUs that are in the target mm + * to make them issue memory barriers. However, if another CPU + * switches to/from the target mm concurrently with + * membarrier(), it can cause that CPU not to receive an IPI + * when it really should issue a memory barrier. Writing to CR3 + * provides that full memory barrier and core serializing + * instruction. */ if (real_prev == next) { VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != diff --git a/drivers/md/dm-raid.c b/drivers/md/dm-raid.c index 9c1f7c4de65b..dc8568ab96f2 100644 --- a/drivers/md/dm-raid.c +++ b/drivers/md/dm-raid.c @@ -3728,6 +3728,17 @@ static void raid_io_hints(struct dm_target *ti, struct queue_limits *limits) blk_limits_io_min(limits, chunk_size_bytes); blk_limits_io_opt(limits, chunk_size_bytes * mddev_data_stripes(rs)); + + /* + * RAID10 personality requires bio splitting, + * RAID0/1/4/5/6 don't and process large discard bios properly. + */ + if (rs_is_raid10(rs)) { + limits->discard_granularity = max(chunk_size_bytes, + limits->discard_granularity); + limits->max_discard_sectors = min_not_zero(rs->md.chunk_sectors, + limits->max_discard_sectors); + } } static void raid_postsuspend(struct dm_target *ti) diff --git a/drivers/md/md.c b/drivers/md/md.c index 98bac4f304ae..0037c6ecab65 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8582,26 +8582,6 @@ void md_write_end(struct mddev *mddev) EXPORT_SYMBOL(md_write_end); -/* This is used by raid0 and raid10 */ -void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, - struct bio *bio, sector_t start, sector_t size) -{ - struct bio *discard_bio = NULL; - - if (__blkdev_issue_discard(rdev->bdev, start, size, - GFP_NOIO, 0, &discard_bio) || !discard_bio) - return; - - bio_chain(discard_bio, bio); - bio_clone_blkg_association(discard_bio, bio); - if (mddev->gendisk) - trace_block_bio_remap(bdev_get_queue(rdev->bdev), - discard_bio, disk_devt(mddev->gendisk), - bio->bi_iter.bi_sector); - submit_bio_noacct(discard_bio); -} -EXPORT_SYMBOL(md_submit_discard_bio); - /* md_allow_write(mddev) * Calling this ensures that the array is marked 'active' so that writes * may proceed without blocking. It is important to call this before diff --git a/drivers/md/md.h b/drivers/md/md.h index ccfb69868c2e..bb645bc3ba6d 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -311,7 +311,7 @@ struct mddev { int external; /* metadata is * managed externally */ char metadata_type[17]; /* externally set*/ - int chunk_sectors; + unsigned int chunk_sectors; time64_t ctime, utime; int level, layout; char clevel[16]; @@ -339,7 +339,7 @@ struct mddev { */ sector_t reshape_position; int delta_disks, new_level, new_layout; - int new_chunk_sectors; + unsigned int new_chunk_sectors; int reshape_backwards; struct md_thread *thread; /* management thread */ @@ -713,8 +713,6 @@ extern void md_write_end(struct mddev *mddev); extern void md_done_sync(struct mddev *mddev, int blocks, int ok); extern void md_error(struct mddev *mddev, struct md_rdev *rdev); extern void md_finish_reshape(struct mddev *mddev); -extern void md_submit_discard_bio(struct mddev *mddev, struct md_rdev *rdev, - struct bio *bio, sector_t start, sector_t size); extern bool __must_check md_flush_request(struct mddev *mddev, struct bio *bio); extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 6f44177593a5..35843df15b5e 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -477,6 +477,7 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) for (disk = 0; disk < zone->nb_dev; disk++) { sector_t dev_start, dev_end; + struct bio *discard_bio = NULL; struct md_rdev *rdev; if (disk < start_disk_index) @@ -499,9 +500,18 @@ static void raid0_handle_discard(struct mddev *mddev, struct bio *bio) rdev = conf->devlist[(zone - conf->strip_zone) * conf->strip_zone[0].nb_dev + disk]; - md_submit_discard_bio(mddev, rdev, bio, + if (__blkdev_issue_discard(rdev->bdev, dev_start + zone->dev_start + rdev->data_offset, - dev_end - dev_start); + dev_end - dev_start, GFP_NOIO, 0, &discard_bio) || + !discard_bio) + continue; + bio_chain(discard_bio, bio); + bio_clone_blkg_association(discard_bio, bio); + if (mddev->gendisk) + trace_block_bio_remap(bdev_get_queue(rdev->bdev), + discard_bio, disk_devt(mddev->gendisk), + bio->bi_iter.bi_sector); + submit_bio_noacct(discard_bio); } bio_endio(bio); } diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index b7bca6703df8..3b598a3cb462 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -91,7 +91,7 @@ static inline struct r10bio *get_resync_r10bio(struct bio *bio) static void * r10bio_pool_alloc(gfp_t gfp_flags, void *data) { struct r10conf *conf = data; - int size = offsetof(struct r10bio, devs[conf->geo.raid_disks]); + int size = offsetof(struct r10bio, devs[conf->copies]); /* allocate a r10bio with room for raid_disks entries in the * bios array */ @@ -238,7 +238,7 @@ static void put_all_bios(struct r10conf *conf, struct r10bio *r10_bio) { int i; - for (i = 0; i < conf->geo.raid_disks; i++) { + for (i = 0; i < conf->copies; i++) { struct bio **bio = & r10_bio->devs[i].bio; if (!BIO_SPECIAL(*bio)) bio_put(*bio); @@ -327,7 +327,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, int slot; int repl = 0; - for (slot = 0; slot < conf->geo.raid_disks; slot++) { + for (slot = 0; slot < conf->copies; slot++) { if (r10_bio->devs[slot].bio == bio) break; if (r10_bio->devs[slot].repl_bio == bio) { @@ -336,6 +336,7 @@ static int find_bio_disk(struct r10conf *conf, struct r10bio *r10_bio, } } + BUG_ON(slot == conf->copies); update_head_pos(slot, r10_bio); if (slotp) @@ -1275,75 +1276,12 @@ static void raid10_write_one_disk(struct mddev *mddev, struct r10bio *r10_bio, } } -static void wait_blocked_dev(struct mddev *mddev, struct r10bio *r10_bio) -{ - int i; - struct r10conf *conf = mddev->private; - struct md_rdev *blocked_rdev; - -retry_wait: - blocked_rdev = NULL; - rcu_read_lock(); - for (i = 0; i < conf->copies; i++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[i].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[i].replacement); - if (rdev == rrdev) - rrdev = NULL; - if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { - atomic_inc(&rdev->nr_pending); - blocked_rdev = rdev; - break; - } - if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { - atomic_inc(&rrdev->nr_pending); - blocked_rdev = rrdev; - break; - } - - if (rdev && test_bit(WriteErrorSeen, &rdev->flags)) { - sector_t first_bad; - sector_t dev_sector = r10_bio->devs[i].addr; - int bad_sectors; - int is_bad; - - /* Discard request doesn't care the write result - * so it doesn't need to wait blocked disk here. - */ - if (!r10_bio->sectors) - continue; - - is_bad = is_badblock(rdev, dev_sector, r10_bio->sectors, - &first_bad, &bad_sectors); - if (is_bad < 0) { - /* Mustn't write here until the bad block - * is acknowledged - */ - atomic_inc(&rdev->nr_pending); - set_bit(BlockedBadBlocks, &rdev->flags); - blocked_rdev = rdev; - break; - } - } - } - rcu_read_unlock(); - - if (unlikely(blocked_rdev)) { - /* Have to wait for this device to get unblocked, then retry */ - allow_barrier(conf); - raid10_log(conf->mddev, "%s wait rdev %d blocked", - __func__, blocked_rdev->raid_disk); - md_wait_for_blocked_rdev(blocked_rdev, mddev); - wait_barrier(conf); - goto retry_wait; - } -} - static void raid10_write_request(struct mddev *mddev, struct bio *bio, struct r10bio *r10_bio) { struct r10conf *conf = mddev->private; int i; + struct md_rdev *blocked_rdev; sector_t sectors; int max_sectors; @@ -1401,9 +1339,8 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, r10_bio->read_slot = -1; /* make sure repl_bio gets freed */ raid10_find_phys(conf, r10_bio); - - wait_blocked_dev(mddev, r10_bio); - +retry_write: + blocked_rdev = NULL; rcu_read_lock(); max_sectors = r10_bio->sectors; @@ -1414,6 +1351,16 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, conf->mirrors[d].replacement); if (rdev == rrdev) rrdev = NULL; + if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) { + atomic_inc(&rdev->nr_pending); + blocked_rdev = rdev; + break; + } + if (rrdev && unlikely(test_bit(Blocked, &rrdev->flags))) { + atomic_inc(&rrdev->nr_pending); + blocked_rdev = rrdev; + break; + } if (rdev && (test_bit(Faulty, &rdev->flags))) rdev = NULL; if (rrdev && (test_bit(Faulty, &rrdev->flags))) @@ -1434,6 +1381,15 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, is_bad = is_badblock(rdev, dev_sector, max_sectors, &first_bad, &bad_sectors); + if (is_bad < 0) { + /* Mustn't write here until the bad block + * is acknowledged + */ + atomic_inc(&rdev->nr_pending); + set_bit(BlockedBadBlocks, &rdev->flags); + blocked_rdev = rdev; + break; + } if (is_bad && first_bad <= dev_sector) { /* Cannot write here at all */ bad_sectors -= (dev_sector - first_bad); @@ -1469,6 +1425,35 @@ static void raid10_write_request(struct mddev *mddev, struct bio *bio, } rcu_read_unlock(); + if (unlikely(blocked_rdev)) { + /* Have to wait for this device to get unblocked, then retry */ + int j; + int d; + + for (j = 0; j < i; j++) { + if (r10_bio->devs[j].bio) { + d = r10_bio->devs[j].devnum; + rdev_dec_pending(conf->mirrors[d].rdev, mddev); + } + if (r10_bio->devs[j].repl_bio) { + struct md_rdev *rdev; + d = r10_bio->devs[j].devnum; + rdev = conf->mirrors[d].replacement; + if (!rdev) { + /* Race with remove_disk */ + smp_mb(); + rdev = conf->mirrors[d].rdev; + } + rdev_dec_pending(rdev, mddev); + } + } + allow_barrier(conf); + raid10_log(conf->mddev, "wait rdev %d blocked", blocked_rdev->raid_disk); + md_wait_for_blocked_rdev(blocked_rdev, mddev); + wait_barrier(conf); + goto retry_write; + } + if (max_sectors < r10_bio->sectors) r10_bio->sectors = max_sectors; @@ -1508,7 +1493,7 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) r10_bio->mddev = mddev; r10_bio->sector = bio->bi_iter.bi_sector; r10_bio->state = 0; - memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->geo.raid_disks); + memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * conf->copies); if (bio_data_dir(bio) == READ) raid10_read_request(mddev, bio, r10_bio); @@ -1516,296 +1501,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio, int sectors) raid10_write_request(mddev, bio, r10_bio); } -static struct bio *raid10_split_bio(struct r10conf *conf, - struct bio *bio, sector_t sectors, bool want_first) -{ - struct bio *split; - - split = bio_split(bio, sectors, GFP_NOIO, &conf->bio_split); - bio_chain(split, bio); - allow_barrier(conf); - if (want_first) { - submit_bio_noacct(bio); - bio = split; - } else - submit_bio_noacct(split); - wait_barrier(conf); - - return bio; -} - -static void raid_end_discard_bio(struct r10bio *r10bio) -{ - struct r10conf *conf = r10bio->mddev->private; - struct r10bio *first_r10bio; - - while (atomic_dec_and_test(&r10bio->remaining)) { - - allow_barrier(conf); - - if (!test_bit(R10BIO_Discard, &r10bio->state)) { - first_r10bio = (struct r10bio *)r10bio->master_bio; - free_r10bio(r10bio); - r10bio = first_r10bio; - } else { - md_write_end(r10bio->mddev); - bio_endio(r10bio->master_bio); - free_r10bio(r10bio); - break; - } - } -} - -static void raid10_end_discard_request(struct bio *bio) -{ - struct r10bio *r10_bio = bio->bi_private; - struct r10conf *conf = r10_bio->mddev->private; - struct md_rdev *rdev = NULL; - int dev; - int slot, repl; - - /* - * We don't care the return value of discard bio - */ - if (!test_bit(R10BIO_Uptodate, &r10_bio->state)) - set_bit(R10BIO_Uptodate, &r10_bio->state); - - dev = find_bio_disk(conf, r10_bio, bio, &slot, &repl); - if (repl) - rdev = conf->mirrors[dev].replacement; - if (!rdev) { - /* raid10_remove_disk uses smp_mb to make sure rdev is set to - * replacement before setting replacement to NULL. It can read - * rdev first without barrier protect even replacment is NULL - */ - smp_rmb(); - rdev = conf->mirrors[dev].rdev; - } - - raid_end_discard_bio(r10_bio); - rdev_dec_pending(rdev, conf->mddev); -} - -/* There are some limitations to handle discard bio - * 1st, the discard size is bigger than stripe_size*2. - * 2st, if the discard bio spans reshape progress, we use the old way to - * handle discard bio - */ -static int raid10_handle_discard(struct mddev *mddev, struct bio *bio) -{ - struct r10conf *conf = mddev->private; - struct geom *geo = &conf->geo; - struct r10bio *r10_bio, *first_r10bio; - int far_copies = geo->far_copies; - bool first_copy = true; - - int disk; - sector_t chunk; - unsigned int stripe_size; - sector_t split_size; - - sector_t bio_start, bio_end; - sector_t first_stripe_index, last_stripe_index; - sector_t start_disk_offset; - unsigned int start_disk_index; - sector_t end_disk_offset; - unsigned int end_disk_index; - unsigned int remainder; - - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - return -EAGAIN; - - wait_barrier(conf); - - /* Check reshape again to avoid reshape happens after checking - * MD_RECOVERY_RESHAPE and before wait_barrier - */ - if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) - goto out; - - stripe_size = geo->raid_disks << geo->chunk_shift; - bio_start = bio->bi_iter.bi_sector; - bio_end = bio_end_sector(bio); - - /* Maybe one discard bio is smaller than strip size or across one stripe - * and discard region is larger than one stripe size. For far offset layout, - * if the discard region is not aligned with stripe size, there is hole - * when we submit discard bio to member disk. For simplicity, we only - * handle discard bio which discard region is bigger than stripe_size*2 - */ - if (bio_sectors(bio) < stripe_size*2) - goto out; - - /* For far and far offset layout, if bio is not aligned with stripe size, - * it splits the part that is not aligned with strip size. - */ - div_u64_rem(bio_start, stripe_size, &remainder); - if ((far_copies > 1) && remainder) { - split_size = stripe_size - remainder; - bio = raid10_split_bio(conf, bio, split_size, false); - } - div_u64_rem(bio_end, stripe_size, &remainder); - if ((far_copies > 1) && remainder) { - split_size = bio_sectors(bio) - remainder; - bio = raid10_split_bio(conf, bio, split_size, true); - } - - bio_start = bio->bi_iter.bi_sector; - bio_end = bio_end_sector(bio); - - /* raid10 uses chunk as the unit to store data. It's similar like raid0. - * One stripe contains the chunks from all member disk (one chunk from - * one disk at the same HBA address). For layout detail, see 'man md 4' - */ - chunk = bio_start >> geo->chunk_shift; - chunk *= geo->near_copies; - first_stripe_index = chunk; - start_disk_index = sector_div(first_stripe_index, geo->raid_disks); - if (geo->far_offset) - first_stripe_index *= geo->far_copies; - start_disk_offset = (bio_start & geo->chunk_mask) + - (first_stripe_index << geo->chunk_shift); - - chunk = bio_end >> geo->chunk_shift; - chunk *= geo->near_copies; - last_stripe_index = chunk; - end_disk_index = sector_div(last_stripe_index, geo->raid_disks); - if (geo->far_offset) - last_stripe_index *= geo->far_copies; - end_disk_offset = (bio_end & geo->chunk_mask) + - (last_stripe_index << geo->chunk_shift); - -retry_discard: - r10_bio = mempool_alloc(&conf->r10bio_pool, GFP_NOIO); - r10_bio->mddev = mddev; - r10_bio->state = 0; - r10_bio->sectors = 0; - memset(r10_bio->devs, 0, sizeof(r10_bio->devs[0]) * geo->raid_disks); - wait_blocked_dev(mddev, r10_bio); - - /* For far layout it needs more than one r10bio to cover all regions. - * Inspired by raid10_sync_request, we can use the first r10bio->master_bio - * to record the discard bio. Other r10bio->master_bio record the first - * r10bio. The first r10bio only release after all other r10bios finish. - * The discard bio returns only first r10bio finishes - */ - if (first_copy) { - r10_bio->master_bio = bio; - set_bit(R10BIO_Discard, &r10_bio->state); - first_copy = false; - first_r10bio = r10_bio; - } else - r10_bio->master_bio = (struct bio *)first_r10bio; - - rcu_read_lock(); - for (disk = 0; disk < geo->raid_disks; disk++) { - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); - - r10_bio->devs[disk].bio = NULL; - r10_bio->devs[disk].repl_bio = NULL; - - if (rdev && (test_bit(Faulty, &rdev->flags))) - rdev = NULL; - if (rrdev && (test_bit(Faulty, &rrdev->flags))) - rrdev = NULL; - if (!rdev && !rrdev) - continue; - - if (rdev) { - r10_bio->devs[disk].bio = bio; - atomic_inc(&rdev->nr_pending); - } - if (rrdev) { - r10_bio->devs[disk].repl_bio = bio; - atomic_inc(&rrdev->nr_pending); - } - } - rcu_read_unlock(); - - atomic_set(&r10_bio->remaining, 1); - for (disk = 0; disk < geo->raid_disks; disk++) { - sector_t dev_start, dev_end; - struct bio *mbio, *rbio = NULL; - struct md_rdev *rdev = rcu_dereference(conf->mirrors[disk].rdev); - struct md_rdev *rrdev = rcu_dereference( - conf->mirrors[disk].replacement); - - /* - * Now start to calculate the start and end address for each disk. - * The space between dev_start and dev_end is the discard region. - * - * For dev_start, it needs to consider three conditions: - * 1st, the disk is before start_disk, you can imagine the disk in - * the next stripe. So the dev_start is the start address of next - * stripe. - * 2st, the disk is after start_disk, it means the disk is at the - * same stripe of first disk - * 3st, the first disk itself, we can use start_disk_offset directly - */ - if (disk < start_disk_index) - dev_start = (first_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > start_disk_index) - dev_start = first_stripe_index * mddev->chunk_sectors; - else - dev_start = start_disk_offset; - - if (disk < end_disk_index) - dev_end = (last_stripe_index + 1) * mddev->chunk_sectors; - else if (disk > end_disk_index) - dev_end = last_stripe_index * mddev->chunk_sectors; - else - dev_end = end_disk_offset; - - /* It only handles discard bio which size is >= stripe size, so - * dev_end > dev_start all the time - */ - if (r10_bio->devs[disk].bio) { - mbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); - mbio->bi_end_io = raid10_end_discard_request; - mbio->bi_private = r10_bio; - r10_bio->devs[disk].bio = mbio; - r10_bio->devs[disk].devnum = disk; - atomic_inc(&r10_bio->remaining); - md_submit_discard_bio(mddev, rdev, mbio, - dev_start + choose_data_offset(r10_bio, rdev), - dev_end - dev_start); - bio_endio(mbio); - } - if (r10_bio->devs[disk].repl_bio) { - rbio = bio_clone_fast(bio, GFP_NOIO, &mddev->bio_set); - rbio->bi_end_io = raid10_end_discard_request; - rbio->bi_private = r10_bio; - r10_bio->devs[disk].repl_bio = rbio; - r10_bio->devs[disk].devnum = disk; - atomic_inc(&r10_bio->remaining); - md_submit_discard_bio(mddev, rrdev, rbio, - dev_start + choose_data_offset(r10_bio, rrdev), - dev_end - dev_start); - bio_endio(rbio); - } - } - - if (!geo->far_offset && --far_copies) { - first_stripe_index += geo->stride >> geo->chunk_shift; - start_disk_offset += geo->stride; - last_stripe_index += geo->stride >> geo->chunk_shift; - end_disk_offset += geo->stride; - atomic_inc(&first_r10bio->remaining); - raid_end_discard_bio(r10_bio); - wait_barrier(conf); - goto retry_discard; - } - - raid_end_discard_bio(r10_bio); - - return 0; -out: - allow_barrier(conf); - return -EAGAIN; -} - static bool raid10_make_request(struct mddev *mddev, struct bio *bio) { struct r10conf *conf = mddev->private; @@ -1820,10 +1515,6 @@ static bool raid10_make_request(struct mddev *mddev, struct bio *bio) if (!md_write_start(mddev, bio)) return false; - if (unlikely(bio_op(bio) == REQ_OP_DISCARD)) - if (!raid10_handle_discard(mddev, bio)) - return true; - /* * If this request crosses a chunk boundary, we need to split * it. @@ -4063,7 +3754,7 @@ static int raid10_run(struct mddev *mddev) if (mddev->queue) { blk_queue_max_discard_sectors(mddev->queue, - UINT_MAX); + mddev->chunk_sectors); blk_queue_max_write_same_sectors(mddev->queue, 0); blk_queue_max_write_zeroes_sectors(mddev->queue, 0); blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9); diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h index 1461fd55311b..79cd2b7d3128 100644 --- a/drivers/md/raid10.h +++ b/drivers/md/raid10.h @@ -179,6 +179,5 @@ enum r10bio_state { R10BIO_Previous, /* failfast devices did receive failfast requests. */ R10BIO_FailFast, - R10BIO_Discard, }; #endif diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index e23e74d52db5..9d8df34bea75 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -38,8 +38,33 @@ static void ipi_mb(void *info) smp_mb(); /* IPIs should be serializing but paranoid. */ } +static void ipi_sync_core(void *info) +{ + /* + * The smp_mb() in membarrier after all the IPIs is supposed to + * ensure that memory on remote CPUs that occur before the IPI + * become visible to membarrier()'s caller -- see scenario B in + * the big comment at the top of this file. + * + * A sync_core() would provide this guarantee, but + * sync_core_before_usermode() might end up being deferred until + * after membarrier()'s smp_mb(). + */ + smp_mb(); /* IPIs should be serializing but paranoid. */ + + sync_core_before_usermode(); +} + static void ipi_rseq(void *info) { + /* + * Ensure that all stores done by the calling thread are visible + * to the current task before the current task resumes. We could + * probably optimize this away on most architectures, but by the + * time we've already sent an IPI, the cost of the extra smp_mb() + * is negligible. + */ + smp_mb(); rseq_preempt(current); } @@ -154,6 +179,7 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (!(atomic_read(&mm->membarrier_state) & MEMBARRIER_STATE_PRIVATE_EXPEDITED_SYNC_CORE_READY)) return -EPERM; + ipi_func = ipi_sync_core; } else if (flags == MEMBARRIER_FLAG_RSEQ) { if (!IS_ENABLED(CONFIG_RSEQ)) return -EINVAL; @@ -168,7 +194,8 @@ static int membarrier_private_expedited(int flags, int cpu_id) return -EPERM; } - if (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1) + if (flags != MEMBARRIER_FLAG_SYNC_CORE && + (atomic_read(&mm->mm_users) == 1 || num_online_cpus() == 1)) return 0; /* @@ -187,8 +214,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) if (cpu_id >= nr_cpu_ids || !cpu_online(cpu_id)) goto out; - if (cpu_id == raw_smp_processor_id()) - goto out; rcu_read_lock(); p = rcu_dereference(cpu_rq(cpu_id)->curr); if (!p || p->mm != mm) { @@ -203,16 +228,6 @@ static int membarrier_private_expedited(int flags, int cpu_id) for_each_online_cpu(cpu) { struct task_struct *p; - /* - * Skipping the current CPU is OK even through we can be - * migrated at any point. The current CPU, at the point - * where we read raw_smp_processor_id(), is ensured to - * be in program order with respect to the caller - * thread. Therefore, we can skip this CPU from the - * iteration. - */ - if (cpu == raw_smp_processor_id()) - continue; p = rcu_dereference(cpu_rq(cpu)->curr); if (p && p->mm == mm) __cpumask_set_cpu(cpu, tmpmask); @@ -220,12 +235,38 @@ static int membarrier_private_expedited(int flags, int cpu_id) rcu_read_unlock(); } - preempt_disable(); - if (cpu_id >= 0) + if (cpu_id >= 0) { + /* + * smp_call_function_single() will call ipi_func() if cpu_id + * is the calling CPU. + */ smp_call_function_single(cpu_id, ipi_func, NULL, 1); - else - smp_call_function_many(tmpmask, ipi_func, NULL, 1); - preempt_enable(); + } else { + /* + * For regular membarrier, we can save a few cycles by + * skipping the current cpu -- we're about to do smp_mb() + * below, and if we migrate to a different cpu, this cpu + * and the new cpu will execute a full barrier in the + * scheduler. + * + * For SYNC_CORE, we do need a barrier on the current cpu -- + * otherwise, if we are migrated and replaced by a different + * task in the same mm just before, during, or after + * membarrier, we will end up with some thread in the mm + * running without a core sync. + * + * For RSEQ, don't rseq_preempt() the caller. User code + * is not supposed to issue syscalls at all from inside an + * rseq critical section. + */ + if (flags != MEMBARRIER_FLAG_SYNC_CORE) { + preempt_disable(); + smp_call_function_many(tmpmask, ipi_func, NULL, true); + preempt_enable(); + } else { + on_each_cpu_mask(tmpmask, ipi_func, NULL, true); + } + } out: if (cpu_id < 0)