diff --git a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers index 45985e411f13..721a05b90109 100644 --- a/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers +++ b/Documentation/ABI/testing/sysfs-kernel-mm-memory-tiers @@ -10,7 +10,7 @@ Description: A collection of all the memory tiers allocated. What: /sys/devices/virtual/memory_tiering/memory_tierN/ - /sys/devices/virtual/memory_tiering/memory_tierN/nodes + /sys/devices/virtual/memory_tiering/memory_tierN/nodelist Date: August 2022 Contact: Linux memory management mailing list Description: Directory with details of a specific memory tier @@ -21,5 +21,5 @@ Description: Directory with details of a specific memory tier A smaller value of N implies a higher (faster) memory tier in the hierarchy. - nodes: NUMA nodes that are part of this memory tier. + nodelist: NUMA nodes that are part of this memory tier. diff --git a/MAINTAINERS b/MAINTAINERS index 31ccadd0f8de..7a7e83165395 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14530,7 +14530,7 @@ L: linux-nilfs@vger.kernel.org S: Supported W: https://nilfs.sourceforge.io/ W: https://nilfs.osdn.jp/ -T: git git://github.com/konis/nilfs2.git +T: git https://github.com/konis/nilfs2.git F: Documentation/filesystems/nilfs2.rst F: fs/nilfs2/ F: include/trace/events/nilfs2.h diff --git a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h index fab8332fe1ad..751921f6db46 100644 --- a/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h +++ b/arch/powerpc/include/asm/book3s/64/tlbflush-hash.h @@ -32,6 +32,11 @@ static inline void arch_enter_lazy_mmu_mode(void) if (radix_enabled()) return; + /* + * apply_to_page_range can call us this preempt enabled when + * operating on kernel page tables. + */ + preempt_disable(); batch = this_cpu_ptr(&ppc64_tlb_batch); batch->active = 1; } @@ -47,6 +52,7 @@ static inline void arch_leave_lazy_mmu_mode(void) if (batch->index) __flush_tlb_pending(batch); batch->active = 0; + preempt_enable(); } #define arch_flush_lazy_mmu_mode() do {} while (0) diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index 930e36099015..2f68fb2ee4fc 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -813,6 +813,13 @@ kernel_dbg_exc: EXCEPTION_COMMON(0x260) CHECK_NAPPING() addi r3,r1,STACK_FRAME_OVERHEAD + /* + * XXX: Returning from performance_monitor_exception taken as a + * soft-NMI (Linux irqs disabled) may be risky to use interrupt_return + * and could cause bugs in return or elsewhere. That case should just + * restore registers and return. There is a workaround for one known + * problem in interrupt_exit_kernel_prepare(). + */ bl performance_monitor_exception b interrupt_return diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index 5381a43e50fe..651c36b056bd 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -2357,9 +2357,21 @@ EXC_VIRT_END(performance_monitor, 0x4f00, 0x20) EXC_COMMON_BEGIN(performance_monitor_common) GEN_COMMON performance_monitor addi r3,r1,STACK_FRAME_OVERHEAD - bl performance_monitor_exception + lbz r4,PACAIRQSOFTMASK(r13) + cmpdi r4,IRQS_ENABLED + bne 1f + bl performance_monitor_exception_async b interrupt_return_srr +1: + bl performance_monitor_exception_nmi + /* Clear MSR_RI before setting SRR0 and SRR1. */ + li r9,0 + mtmsrd r9,1 + kuap_kernel_restore r9, r10 + + EXCEPTION_RESTORE_REGS hsrr=0 + RFI_TO_KERNEL /** * Interrupt 0xf20 - Vector Unavailable Interrupt. diff --git a/arch/powerpc/kernel/interrupt.c b/arch/powerpc/kernel/interrupt.c index f9db0a172401..fc6631a80527 100644 --- a/arch/powerpc/kernel/interrupt.c +++ b/arch/powerpc/kernel/interrupt.c @@ -374,10 +374,18 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs) if (regs_is_unrecoverable(regs)) unrecoverable_exception(regs); /* - * CT_WARN_ON comes here via program_check_exception, - * so avoid recursion. + * CT_WARN_ON comes here via program_check_exception, so avoid + * recursion. + * + * Skip the assertion on PMIs on 64e to work around a problem caused + * by NMI PMIs incorrectly taking this interrupt return path, it's + * possible for this to hit after interrupt exit to user switches + * context to user. See also the comment in the performance monitor + * handler in exceptions-64e.S */ - if (TRAP(regs) != INTERRUPT_PROGRAM) + if (!IS_ENABLED(CONFIG_PPC_BOOK3E_64) && + TRAP(regs) != INTERRUPT_PROGRAM && + TRAP(regs) != INTERRUPT_PERFMON) CT_WARN_ON(ct_state() == CONTEXT_USER); kuap = kuap_get_and_assert_locked(); diff --git a/arch/powerpc/kernel/interrupt_64.S b/arch/powerpc/kernel/interrupt_64.S index 978a173eb339..a019ed6fc839 100644 --- a/arch/powerpc/kernel/interrupt_64.S +++ b/arch/powerpc/kernel/interrupt_64.S @@ -532,15 +532,24 @@ _ASM_NOKPROBE_SYMBOL(interrupt_return_\srr\()_kernel) * Returning to soft-disabled context. * Check if a MUST_HARD_MASK interrupt has become pending, in which * case we need to disable MSR[EE] in the return context. + * + * The MSR[EE] check catches among other things the short incoherency + * in hard_irq_disable() between clearing MSR[EE] and setting + * PACA_IRQ_HARD_DIS. */ ld r12,_MSR(r1) andi. r10,r12,MSR_EE beq .Lfast_kernel_interrupt_return_\srr\() // EE already disabled lbz r11,PACAIRQHAPPENED(r13) andi. r10,r11,PACA_IRQ_MUST_HARD_MASK - beq .Lfast_kernel_interrupt_return_\srr\() // No HARD_MASK pending + bne 1f // HARD_MASK is pending + // No HARD_MASK pending, clear possible HARD_DIS set by interrupt + andi. r11,r11,(~PACA_IRQ_HARD_DIS)@l + stb r11,PACAIRQHAPPENED(r13) + b .Lfast_kernel_interrupt_return_\srr\() - /* Must clear MSR_EE from _MSR */ + +1: /* Must clear MSR_EE from _MSR */ #ifdef CONFIG_PPC_BOOK3S li r10,0 /* Clear valid before changing _MSR */ diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 61cdd782d3c5..a9f57dad6d91 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -51,6 +51,7 @@ config KVM_BOOK3S_HV_POSSIBLE config KVM_BOOK3S_32 tristate "KVM support for PowerPC book3s_32 processors" depends on PPC_BOOK3S_32 && !SMP && !PTE_64BIT + depends on !CONTEXT_TRACKING_USER select KVM select KVM_BOOK3S_32_HANDLER select KVM_BOOK3S_PR_POSSIBLE @@ -105,6 +106,7 @@ config KVM_BOOK3S_64_HV config KVM_BOOK3S_64_PR tristate "KVM support without using hypervisor mode in host" depends on KVM_BOOK3S_64 + depends on !CONTEXT_TRACKING_USER select KVM_BOOK3S_PR_POSSIBLE help Support running guest kernels in virtual machines on processors @@ -190,6 +192,7 @@ config KVM_EXIT_TIMING config KVM_E500V2 bool "KVM support for PowerPC E500v2 processors" depends on PPC_E500 && !PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select MMU_NOTIFIER @@ -205,6 +208,7 @@ config KVM_E500V2 config KVM_E500MC bool "KVM support for PowerPC E500MC/E5500/E6500 processors" depends on PPC_E500MC + depends on !CONTEXT_TRACKING_USER select KVM select KVM_MMIO select KVM_BOOKE_HV diff --git a/arch/powerpc/lib/vmx-helper.c b/arch/powerpc/lib/vmx-helper.c index f76a50291fd7..d491da8d1838 100644 --- a/arch/powerpc/lib/vmx-helper.c +++ b/arch/powerpc/lib/vmx-helper.c @@ -36,7 +36,17 @@ int exit_vmx_usercopy(void) { disable_kernel_altivec(); pagefault_enable(); - preempt_enable(); + preempt_enable_no_resched(); + /* + * Must never explicitly call schedule (including preempt_enable()) + * while in a kuap-unlocked user copy, because the AMR register will + * not be saved and restored across context switch. However preempt + * kernels need to be preempted as soon as possible if need_resched is + * set and we are preemptible. The hack here is to schedule a + * decrementer to fire here and reschedule for us if necessary. + */ + if (IS_ENABLED(CONFIG_PREEMPT) && need_resched()) + set_dec(1); return 0; } diff --git a/arch/powerpc/mm/book3s64/hash_native.c b/arch/powerpc/mm/book3s64/hash_native.c index 623a7b7ab38b..9342e79870df 100644 --- a/arch/powerpc/mm/book3s64/hash_native.c +++ b/arch/powerpc/mm/book3s64/hash_native.c @@ -43,6 +43,29 @@ static DEFINE_RAW_SPINLOCK(native_tlbie_lock); +#ifdef CONFIG_LOCKDEP +static struct lockdep_map hpte_lock_map = + STATIC_LOCKDEP_MAP_INIT("hpte_lock", &hpte_lock_map); + +static void acquire_hpte_lock(void) +{ + lock_map_acquire(&hpte_lock_map); +} + +static void release_hpte_lock(void) +{ + lock_map_release(&hpte_lock_map); +} +#else +static void acquire_hpte_lock(void) +{ +} + +static void release_hpte_lock(void) +{ +} +#endif + static inline unsigned long ___tlbie(unsigned long vpn, int psize, int apsize, int ssize) { @@ -220,6 +243,7 @@ static inline void native_lock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + acquire_hpte_lock(); while (1) { if (!test_and_set_bit_lock(HPTE_LOCK_BIT, word)) break; @@ -234,6 +258,7 @@ static inline void native_unlock_hpte(struct hash_pte *hptep) { unsigned long *word = (unsigned long *)&hptep->v; + release_hpte_lock(); clear_bit_unlock(HPTE_LOCK_BIT, word); } @@ -243,8 +268,11 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, { struct hash_pte *hptep = htab_address + hpte_group; unsigned long hpte_v, hpte_r; + unsigned long flags; int i; + local_irq_save(flags); + if (!(vflags & HPTE_V_BOLTED)) { DBG_LOW(" insert(group=%lx, vpn=%016lx, pa=%016lx," " rflags=%lx, vflags=%lx, psize=%d)\n", @@ -263,8 +291,10 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, hptep++; } - if (i == HPTES_PER_GROUP) + if (i == HPTES_PER_GROUP) { + local_irq_restore(flags); return -1; + } hpte_v = hpte_encode_v(vpn, psize, apsize, ssize) | vflags | HPTE_V_VALID; hpte_r = hpte_encode_r(pa, psize, apsize) | rflags; @@ -286,10 +316,13 @@ static long native_hpte_insert(unsigned long hpte_group, unsigned long vpn, * Now set the first dword including the valid bit * NOTE: this also unlocks the hpte */ + release_hpte_lock(); hptep->v = cpu_to_be64(hpte_v); __asm__ __volatile__ ("ptesync" : : : "memory"); + local_irq_restore(flags); + return i | (!!(vflags & HPTE_V_SECONDARY) << 3); } @@ -327,6 +360,7 @@ static long native_hpte_remove(unsigned long hpte_group) return -1; /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; return i; @@ -339,6 +373,9 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, struct hash_pte *hptep = htab_address + slot; unsigned long hpte_v, want_v; int ret = 0, local = 0; + unsigned long irqflags; + + local_irq_save(irqflags); want_v = hpte_encode_avpn(vpn, bpsize, ssize); @@ -382,6 +419,8 @@ static long native_hpte_updatepp(unsigned long slot, unsigned long newpp, if (!(flags & HPTE_NOHPTE_UPDATE)) tlbie(vpn, bpsize, apsize, ssize, local); + local_irq_restore(irqflags); + return ret; } @@ -445,6 +484,9 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -463,6 +505,8 @@ static void native_hpte_updateboltedpp(unsigned long newpp, unsigned long ea, * actual page size will be same. */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); } /* @@ -476,6 +520,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) unsigned long vsid; long slot; struct hash_pte *hptep; + unsigned long flags; + + local_irq_save(flags); vsid = get_kernel_vsid(ea, ssize); vpn = hpt_vpn(ea, vsid, ssize); @@ -493,6 +540,9 @@ static int native_hpte_removebolted(unsigned long ea, int psize, int ssize) /* Invalidate the TLB */ tlbie(vpn, psize, psize, ssize, 0); + + local_irq_restore(flags); + return 0; } @@ -517,10 +567,11 @@ static void native_hpte_invalidate(unsigned long slot, unsigned long vpn, /* recheck with locks held */ hpte_v = hpte_get_old_v(hptep); - if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) + if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; - else + } else native_unlock_hpte(hptep); } /* @@ -580,10 +631,8 @@ static void native_hugepage_invalidate(unsigned long vsid, hpte_v = hpte_get_old_v(hptep); if (HPTE_V_COMPARE(hpte_v, want_v) && (hpte_v & HPTE_V_VALID)) { - /* - * Invalidate the hpte. NOTE: this also unlocks it - */ - + /* Invalidate the hpte. NOTE: this also unlocks it */ + release_hpte_lock(); hptep->v = 0; } else native_unlock_hpte(hptep); @@ -765,8 +814,10 @@ static void native_flush_hash_range(unsigned long number, int local) if (!HPTE_V_COMPARE(hpte_v, want_v) || !(hpte_v & HPTE_V_VALID)) native_unlock_hpte(hptep); - else + else { + release_hpte_lock(); hptep->v = 0; + } } pte_iterate_hashed_end(); } diff --git a/arch/powerpc/mm/book3s64/hash_pgtable.c b/arch/powerpc/mm/book3s64/hash_pgtable.c index 747492edb75a..51f48984abca 100644 --- a/arch/powerpc/mm/book3s64/hash_pgtable.c +++ b/arch/powerpc/mm/book3s64/hash_pgtable.c @@ -404,7 +404,8 @@ EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); struct change_memory_parms { unsigned long start, end, newpp; - unsigned int step, nr_cpus, master_cpu; + unsigned int step, nr_cpus; + atomic_t master_cpu; atomic_t cpu_counter; }; @@ -478,7 +479,8 @@ static int change_memory_range_fn(void *data) { struct change_memory_parms *parms = data; - if (parms->master_cpu != smp_processor_id()) + // First CPU goes through, all others wait. + if (atomic_xchg(&parms->master_cpu, 1) == 1) return chmem_secondary_loop(parms); // Wait for all but one CPU (this one) to call-in @@ -516,7 +518,7 @@ static bool hash__change_memory_range(unsigned long start, unsigned long end, chmem_parms.end = end; chmem_parms.step = step; chmem_parms.newpp = newpp; - chmem_parms.master_cpu = smp_processor_id(); + atomic_set(&chmem_parms.master_cpu, 0); cpus_read_lock(); diff --git a/arch/powerpc/mm/book3s64/hash_utils.c b/arch/powerpc/mm/book3s64/hash_utils.c index df008edf7be0..6df4c6d38b66 100644 --- a/arch/powerpc/mm/book3s64/hash_utils.c +++ b/arch/powerpc/mm/book3s64/hash_utils.c @@ -1981,7 +1981,7 @@ repeat: } #if defined(CONFIG_DEBUG_PAGEALLOC) || defined(CONFIG_KFENCE) -static DEFINE_SPINLOCK(linear_map_hash_lock); +static DEFINE_RAW_SPINLOCK(linear_map_hash_lock); static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) { @@ -2005,10 +2005,10 @@ static void kernel_map_linear_page(unsigned long vaddr, unsigned long lmi) mmu_linear_psize, mmu_kernel_ssize); BUG_ON (ret < 0); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); BUG_ON(linear_map_hash_slots[lmi] & 0x80); linear_map_hash_slots[lmi] = ret | 0x80; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); } static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) @@ -2018,14 +2018,14 @@ static void kernel_unmap_linear_page(unsigned long vaddr, unsigned long lmi) unsigned long vpn = hpt_vpn(vaddr, vsid, mmu_kernel_ssize); hash = hpt_hash(vpn, PAGE_SHIFT, mmu_kernel_ssize); - spin_lock(&linear_map_hash_lock); + raw_spin_lock(&linear_map_hash_lock); if (!(linear_map_hash_slots[lmi] & 0x80)) { - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); return; } hidx = linear_map_hash_slots[lmi] & 0x7f; linear_map_hash_slots[lmi] = 0; - spin_unlock(&linear_map_hash_lock); + raw_spin_unlock(&linear_map_hash_lock); if (hidx & _PTEIDX_SECONDARY) hash = ~hash; slot = (hash & htab_hash_mask) * HPTES_PER_GROUP; diff --git a/arch/powerpc/platforms/pseries/lparcfg.c b/arch/powerpc/platforms/pseries/lparcfg.c index 507dc0b5987d..63fd925ccbb8 100644 --- a/arch/powerpc/platforms/pseries/lparcfg.c +++ b/arch/powerpc/platforms/pseries/lparcfg.c @@ -35,6 +35,7 @@ #include #include "pseries.h" +#include "vas.h" /* pseries_vas_dlpar_cpu() */ /* * This isn't a module but we expose that to userspace @@ -748,6 +749,16 @@ static ssize_t lparcfg_write(struct file *file, const char __user * buf, return -EINVAL; retval = update_ppp(new_entitled_ptr, NULL); + + if (retval == H_SUCCESS || retval == H_CONSTRAINED) { + /* + * The hypervisor assigns VAS resources based + * on entitled capacity for shared mode. + * Reconfig VAS windows based on DLPAR CPU events. + */ + if (pseries_vas_dlpar_cpu() != 0) + retval = H_HARDWARE; + } } else if (!strcmp(kbuf, "capacity_weight")) { char *endp; *new_weight_ptr = (u8) simple_strtoul(tmp, &endp, 10); diff --git a/arch/powerpc/platforms/pseries/vas.c b/arch/powerpc/platforms/pseries/vas.c index 0e0524cbe20c..4ad6e510d405 100644 --- a/arch/powerpc/platforms/pseries/vas.c +++ b/arch/powerpc/platforms/pseries/vas.c @@ -200,16 +200,41 @@ static irqreturn_t pseries_vas_fault_thread_fn(int irq, void *data) struct vas_user_win_ref *tsk_ref; int rc; - rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); - if (!rc) { - tsk_ref = &txwin->vas_win.task_ref; - vas_dump_crb(&crb); - vas_update_csb(&crb, tsk_ref); + while (atomic_read(&txwin->pending_faults)) { + rc = h_get_nx_fault(txwin->vas_win.winid, (u64)virt_to_phys(&crb)); + if (!rc) { + tsk_ref = &txwin->vas_win.task_ref; + vas_dump_crb(&crb); + vas_update_csb(&crb, tsk_ref); + } + atomic_dec(&txwin->pending_faults); } return IRQ_HANDLED; } +/* + * irq_default_primary_handler() can be used only with IRQF_ONESHOT + * which disables IRQ before executing the thread handler and enables + * it after. But this disabling interrupt sets the VAS IRQ OFF + * state in the hypervisor. If the NX generates fault interrupt + * during this window, the hypervisor will not deliver this + * interrupt to the LPAR. So use VAS specific IRQ handler instead + * of calling the default primary handler. + */ +static irqreturn_t pseries_vas_irq_handler(int irq, void *data) +{ + struct pseries_vas_window *txwin = data; + + /* + * The thread hanlder will process this interrupt if it is + * already running. + */ + atomic_inc(&txwin->pending_faults); + + return IRQ_WAKE_THREAD; +} + /* * Allocate window and setup IRQ mapping. */ @@ -240,8 +265,9 @@ static int allocate_setup_window(struct pseries_vas_window *txwin, goto out_irq; } - rc = request_threaded_irq(txwin->fault_virq, NULL, - pseries_vas_fault_thread_fn, IRQF_ONESHOT, + rc = request_threaded_irq(txwin->fault_virq, + pseries_vas_irq_handler, + pseries_vas_fault_thread_fn, 0, txwin->name, txwin); if (rc) { pr_err("VAS-Window[%d]: Request IRQ(%u) failed with %d\n", @@ -826,6 +852,25 @@ int vas_reconfig_capabilties(u8 type, int new_nr_creds) mutex_unlock(&vas_pseries_mutex); return rc; } + +int pseries_vas_dlpar_cpu(void) +{ + int new_nr_creds, rc; + + rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, + vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, + (u64)virt_to_phys(&hv_cop_caps)); + if (!rc) { + new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); + rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, new_nr_creds); + } + + if (rc) + pr_err("Failed reconfig VAS capabilities with DLPAR\n"); + + return rc; +} + /* * Total number of default credits available (target_credits) * in LPAR depends on number of cores configured. It varies based on @@ -840,7 +885,15 @@ static int pseries_vas_notifier(struct notifier_block *nb, struct of_reconfig_data *rd = data; struct device_node *dn = rd->dn; const __be32 *intserv = NULL; - int new_nr_creds, len, rc = 0; + int len; + + /* + * For shared CPU partition, the hypervisor assigns total credits + * based on entitled core capacity. So updating VAS windows will + * be called from lparcfg_write(). + */ + if (is_shared_processor()) + return NOTIFY_OK; if ((action == OF_RECONFIG_ATTACH_NODE) || (action == OF_RECONFIG_DETACH_NODE)) @@ -852,19 +905,7 @@ static int pseries_vas_notifier(struct notifier_block *nb, if (!intserv) return NOTIFY_OK; - rc = h_query_vas_capabilities(H_QUERY_VAS_CAPABILITIES, - vascaps[VAS_GZIP_DEF_FEAT_TYPE].feat, - (u64)virt_to_phys(&hv_cop_caps)); - if (!rc) { - new_nr_creds = be16_to_cpu(hv_cop_caps.target_lpar_creds); - rc = vas_reconfig_capabilties(VAS_GZIP_DEF_FEAT_TYPE, - new_nr_creds); - } - - if (rc) - pr_err("Failed reconfig VAS capabilities with DLPAR\n"); - - return rc; + return pseries_vas_dlpar_cpu(); } static struct notifier_block pseries_vas_nb = { diff --git a/arch/powerpc/platforms/pseries/vas.h b/arch/powerpc/platforms/pseries/vas.h index 333ffa2f9f42..7115043ec488 100644 --- a/arch/powerpc/platforms/pseries/vas.h +++ b/arch/powerpc/platforms/pseries/vas.h @@ -132,6 +132,7 @@ struct pseries_vas_window { u64 flags; char *name; int fault_virq; + atomic_t pending_faults; /* Number of pending faults */ }; int sysfs_add_vas_caps(struct vas_cop_feat_caps *caps); @@ -140,10 +141,15 @@ int __init sysfs_pseries_vas_init(struct vas_all_caps *vas_caps); #ifdef CONFIG_PPC_VAS int vas_migration_handler(int action); +int pseries_vas_dlpar_cpu(void); #else static inline int vas_migration_handler(int action) { return 0; } +static inline int pseries_vas_dlpar_cpu(void) +{ + return 0; +} #endif #endif /* _VAS_H */ diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 3b87d889b6e1..888731ccf1f6 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -10,10 +10,13 @@ /* Even with __builtin_ the compiler may decide to use the out of line function. */ +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) +#include +#endif + #define __HAVE_ARCH_MEMCPY 1 -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memcpy -void *__msan_memcpy(void *dst, const void *src, size_t size); #define memcpy __msan_memcpy #else extern void *memcpy(void *to, const void *from, size_t len); @@ -21,7 +24,7 @@ extern void *memcpy(void *to, const void *from, size_t len); extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) extern void *__msan_memset(void *s, int c, size_t n); #undef memset #define memset __msan_memset @@ -67,7 +70,7 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE -#if defined(__SANITIZE_MEMORY__) +#if defined(__SANITIZE_MEMORY__) && defined(__NO_FORTIFY) #undef memmove void *__msan_memmove(void *dest, const void *src, size_t len); #define memmove __msan_memmove diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 8bc614cfe21b..1cc756eafa44 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -254,24 +254,25 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ __typeof__(*(ptr)) __x = (x); /* eval x once */ \ - __chk_user_ptr(ptr); \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ switch (size) { \ case 1: \ - __put_user_goto(__x, ptr, "b", "iq", label); \ + __put_user_goto(__x, __ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(__x, ptr, "w", "ir", label); \ + __put_user_goto(__x, __ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(__x, ptr, "l", "ir", label); \ + __put_user_goto(__x, __ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(__x, ptr, label); \ + __put_user_goto_u64(__x, __ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ - instrument_put_user(__x, ptr, size); \ + instrument_put_user(__x, __ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT diff --git a/arch/x86/purgatory/Makefile b/arch/x86/purgatory/Makefile index 58a200dc762d..17f09dc26381 100644 --- a/arch/x86/purgatory/Makefile +++ b/arch/x86/purgatory/Makefile @@ -26,6 +26,7 @@ GCOV_PROFILE := n KASAN_SANITIZE := n UBSAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n KCOV_INSTRUMENT := n # These are adjustments to the compiler flags used for objects that diff --git a/block/blk-mq.c b/block/blk-mq.c index 33292c01875d..75c8296b6feb 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -611,6 +611,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, .nr_tags = 1, }; u64 alloc_time_ns = 0; + struct request *rq; unsigned int cpu; unsigned int tag; int ret; @@ -660,8 +661,12 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - return blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, + rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, alloc_time_ns); + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + return rq; out_queue_exit: blk_queue_exit(q); diff --git a/block/genhd.c b/block/genhd.c index 17b33c62423d..fee90eb98b4a 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -410,9 +410,10 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, * Otherwise just allocate the device numbers for both the whole device * and all partitions from the extended dev_t space. */ + ret = -EINVAL; if (disk->major) { if (WARN_ON(!disk->minors)) - return -EINVAL; + goto out_exit_elevator; if (disk->minors > DISK_MAX_PARTS) { pr_err("block: can't allocate more than %d partitions\n", @@ -420,14 +421,14 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, disk->minors = DISK_MAX_PARTS; } if (disk->first_minor + disk->minors > MINORMASK + 1) - return -EINVAL; + goto out_exit_elevator; } else { if (WARN_ON(disk->minors)) - return -EINVAL; + goto out_exit_elevator; ret = blk_alloc_ext_minor(); if (ret < 0) - return ret; + goto out_exit_elevator; disk->major = BLOCK_EXT_MAJOR; disk->first_minor = ret; } @@ -540,6 +541,9 @@ out_device_del: out_free_ext_minor: if (disk->major == BLOCK_EXT_MAJOR) blk_free_ext_minor(disk->first_minor); +out_exit_elevator: + if (disk->queue->elevator) + elevator_exit(disk->queue); return ret; } EXPORT_SYMBOL(device_add_disk); diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index f9e39301c4af..04453f4a319c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -7222,8 +7222,10 @@ static int __init rbd_sysfs_init(void) int ret; ret = device_register(&rbd_root_dev); - if (ret < 0) + if (ret < 0) { + put_device(&rbd_root_dev); return ret; + } ret = bus_register(&rbd_bus_type); if (ret < 0) diff --git a/drivers/char/random.c b/drivers/char/random.c index 2fe28eeb2f38..69754155300e 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -791,13 +791,13 @@ void __init random_init_early(const char *command_line) #endif for (i = 0, arch_bits = sizeof(entropy) * 8; i < ARRAY_SIZE(entropy);) { - longs = arch_get_random_seed_longs(entropy, ARRAY_SIZE(entropy) - i); + longs = arch_get_random_seed_longs_early(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; continue; } - longs = arch_get_random_longs(entropy, ARRAY_SIZE(entropy) - i); + longs = arch_get_random_longs_early(entropy, ARRAY_SIZE(entropy) - i); if (longs) { _mix_pool_bytes(entropy, sizeof(*entropy) * longs); i += longs; diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c index 0ea7e441e080..93e2138a8b42 100644 --- a/drivers/nvme/host/multipath.c +++ b/drivers/nvme/host/multipath.c @@ -516,6 +516,7 @@ int nvme_mpath_alloc_disk(struct nvme_ctrl *ctrl, struct nvme_ns_head *head) /* set to a default value of 512 until the disk is validated */ blk_queue_logical_block_size(head->disk->queue, 512); blk_set_stacking_limits(&head->disk->queue->limits); + blk_queue_dma_alignment(head->disk->queue, 3); /* we need to propagate up the VMC settings */ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 1eed0fc26b3a..9b47dcb2a7d9 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -387,7 +387,7 @@ static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, { struct scatterlist sg; - sg_init_marker(&sg, 1); + sg_init_table(&sg, 1); sg_set_page(&sg, page, len, off); ahash_request_set_crypt(hash, &sg, NULL, len); crypto_ahash_update(hash); @@ -1141,6 +1141,7 @@ static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req) static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) { struct nvme_tcp_request *req; + unsigned int noreclaim_flag; int ret = 1; if (!queue->request) { @@ -1150,12 +1151,13 @@ static int nvme_tcp_try_send(struct nvme_tcp_queue *queue) } req = queue->request; + noreclaim_flag = memalloc_noreclaim_save(); if (req->state == NVME_TCP_SEND_CMD_PDU) { ret = nvme_tcp_try_send_cmd_pdu(req); if (ret <= 0) goto done; if (!nvme_tcp_has_inline_data(req)) - return ret; + goto out; } if (req->state == NVME_TCP_SEND_H2C_PDU) { @@ -1181,6 +1183,8 @@ done: nvme_tcp_fail_request(queue->request); nvme_tcp_done_send_req(queue); } +out: + memalloc_noreclaim_restore(noreclaim_flag); return ret; } @@ -1296,6 +1300,7 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) struct page *page; struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl); struct nvme_tcp_queue *queue = &ctrl->queues[qid]; + unsigned int noreclaim_flag; if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) return; @@ -1308,7 +1313,11 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) __page_frag_cache_drain(page, queue->pf_cache.pagecnt_bias); queue->pf_cache.va = NULL; } + + noreclaim_flag = memalloc_noreclaim_save(); sock_release(queue->sock); + memalloc_noreclaim_restore(noreclaim_flag); + kfree(queue->pdu); mutex_destroy(&queue->send_mutex); mutex_destroy(&queue->queue_lock); diff --git a/drivers/scsi/lpfc/lpfc_bsg.c b/drivers/scsi/lpfc/lpfc_bsg.c index ac0c7ccf2eae..852b025e2fec 100644 --- a/drivers/scsi/lpfc/lpfc_bsg.c +++ b/drivers/scsi/lpfc/lpfc_bsg.c @@ -2582,7 +2582,7 @@ static int lpfcdiag_loop_self_unreg(struct lpfc_hba *phba, uint16_t rpi) * * This function obtains the transmit and receive ids required to send * an unsolicited ct command with a payload. A special lpfc FsType and CmdRsp - * flags are used to the unsolicted response handler is able to process + * flags are used to the unsolicited response handler is able to process * the ct command sent on the same port. **/ static int lpfcdiag_loop_get_xri(struct lpfc_hba *phba, uint16_t rpi, @@ -2874,7 +2874,7 @@ out: * @len: Number of data bytes * * This function allocates and posts a data buffer of sufficient size to receive - * an unsolicted CT command. + * an unsolicited CT command. **/ static int lpfcdiag_sli3_loop_post_rxbufs(struct lpfc_hba *phba, uint16_t rxxri, size_t len) diff --git a/drivers/scsi/lpfc/lpfc_ct.c b/drivers/scsi/lpfc/lpfc_ct.c index 75fd2bfc212b..e941a99aa965 100644 --- a/drivers/scsi/lpfc/lpfc_ct.c +++ b/drivers/scsi/lpfc/lpfc_ct.c @@ -90,7 +90,7 @@ lpfc_ct_ignore_hbq_buffer(struct lpfc_hba *phba, struct lpfc_iocbq *piocbq, get_job_ulpstatus(phba, piocbq)); } lpfc_printf_log(phba, KERN_INFO, LOG_ELS, - "0145 Ignoring unsolicted CT HBQ Size:%d " + "0145 Ignoring unsolicited CT HBQ Size:%d " "status = x%x\n", size, get_job_ulpstatus(phba, piocbq)); } diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 9be4ba61a076..d265a2d9d082 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5874,10 +5874,6 @@ fallback: static int megasas_get_device_list(struct megasas_instance *instance) { - memset(instance->pd_list, 0, - (MEGASAS_MAX_PD * sizeof(struct megasas_pd_list))); - memset(instance->ld_ids, 0xff, MEGASAS_MAX_LD_IDS); - if (instance->enable_fw_dev_list) { if (megasas_host_device_list_query(instance, true)) return FAILED; @@ -7220,7 +7216,7 @@ int megasas_alloc_ctrl_dma_buffers(struct megasas_instance *instance) if (!fusion->ioc_init_request) { dev_err(&pdev->dev, - "Failed to allocate PD list buffer\n"); + "Failed to allocate ioc init request\n"); return -ENOMEM; } @@ -7439,7 +7435,6 @@ static inline void megasas_init_ctrl_params(struct megasas_instance *instance) (instance->pdev->device == PCI_DEVICE_ID_LSI_SAS0071SKINNY)) instance->flag_ieee = 1; - megasas_dbg_lvl = 0; instance->flag = 0; instance->unload = 1; instance->last_time = 0; @@ -8762,33 +8757,26 @@ static int megasas_update_device_list(struct megasas_instance *instance, int event_type) { - int dcmd_ret = DCMD_SUCCESS; + int dcmd_ret; if (instance->enable_fw_dev_list) { - dcmd_ret = megasas_host_device_list_query(instance, false); - if (dcmd_ret != DCMD_SUCCESS) - goto out; + return megasas_host_device_list_query(instance, false); } else { if (event_type & SCAN_PD_CHANNEL) { dcmd_ret = megasas_get_pd_list(instance); - if (dcmd_ret != DCMD_SUCCESS) - goto out; + return dcmd_ret; } if (event_type & SCAN_VD_CHANNEL) { if (!instance->requestorId || megasas_get_ld_vf_affiliation(instance, 0)) { - dcmd_ret = megasas_ld_list_query(instance, + return megasas_ld_list_query(instance, MR_LD_QUERY_TYPE_EXPOSED_TO_HOST); - if (dcmd_ret != DCMD_SUCCESS) - goto out; } } } - -out: - return dcmd_ret; + return DCMD_SUCCESS; } /** @@ -8918,7 +8906,7 @@ megasas_aen_polling(struct work_struct *work) sdev1 = scsi_device_lookup(instance->host, MEGASAS_MAX_PD_CHANNELS + (ld_target_id / MEGASAS_MAX_DEV_PER_CHANNEL), - (ld_target_id - MEGASAS_MAX_DEV_PER_CHANNEL), + (ld_target_id % MEGASAS_MAX_DEV_PER_CHANNEL), 0); if (sdev1) megasas_remove_scsi_device(sdev1); @@ -9016,6 +9004,7 @@ static int __init megasas_init(void) */ pr_info("megasas: %s\n", MEGASAS_VERSION); + megasas_dbg_lvl = 0; support_poll_for_event = 2; support_device_change = 1; support_nvme_encapsulation = true; diff --git a/drivers/scsi/mpi3mr/Kconfig b/drivers/scsi/mpi3mr/Kconfig index 8997531940c2..f48740cd5b95 100644 --- a/drivers/scsi/mpi3mr/Kconfig +++ b/drivers/scsi/mpi3mr/Kconfig @@ -4,5 +4,6 @@ config SCSI_MPI3MR tristate "Broadcom MPI3 Storage Controller Device Driver" depends on PCI && SCSI select BLK_DEV_BSGLIB + select SCSI_SAS_ATTRS help MPI3 based Storage & RAID Controllers Driver. diff --git a/drivers/scsi/pm8001/pm8001_init.c b/drivers/scsi/pm8001/pm8001_init.c index 2ff2fac1e403..7a7d63aa90e2 100644 --- a/drivers/scsi/pm8001/pm8001_init.c +++ b/drivers/scsi/pm8001/pm8001_init.c @@ -99,6 +99,7 @@ static void pm8001_map_queues(struct Scsi_Host *shost) static struct scsi_host_template pm8001_sht = { .module = THIS_MODULE, .name = DRV_NAME, + .proc_name = DRV_NAME, .queuecommand = sas_queuecommand, .dma_need_drain = ata_scsi_dma_need_drain, .target_alloc = sas_target_alloc, diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index fa1fcbfb946f..b67ad30d56e6 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -951,9 +951,9 @@ qla2x00_sysfs_read_dcbx_tlv(struct file *filp, struct kobject *kobj, if (!capable(CAP_SYS_ADMIN) || off != 0 || count > DCBX_TLV_DATA_SIZE) return 0; + mutex_lock(&vha->hw->optrom_mutex); if (ha->dcbx_tlv) goto do_read; - mutex_lock(&vha->hw->optrom_mutex); if (qla2x00_chip_is_down(vha)) { mutex_unlock(&vha->hw->optrom_mutex); return 0; @@ -3330,11 +3330,34 @@ struct fc_function_template qla2xxx_transport_vport_functions = { .bsg_timeout = qla24xx_bsg_timeout, }; +static uint +qla2x00_get_host_supported_speeds(scsi_qla_host_t *vha, uint speeds) +{ + uint supported_speeds = FC_PORTSPEED_UNKNOWN; + + if (speeds & FDMI_PORT_SPEED_64GB) + supported_speeds |= FC_PORTSPEED_64GBIT; + if (speeds & FDMI_PORT_SPEED_32GB) + supported_speeds |= FC_PORTSPEED_32GBIT; + if (speeds & FDMI_PORT_SPEED_16GB) + supported_speeds |= FC_PORTSPEED_16GBIT; + if (speeds & FDMI_PORT_SPEED_8GB) + supported_speeds |= FC_PORTSPEED_8GBIT; + if (speeds & FDMI_PORT_SPEED_4GB) + supported_speeds |= FC_PORTSPEED_4GBIT; + if (speeds & FDMI_PORT_SPEED_2GB) + supported_speeds |= FC_PORTSPEED_2GBIT; + if (speeds & FDMI_PORT_SPEED_1GB) + supported_speeds |= FC_PORTSPEED_1GBIT; + + return supported_speeds; +} + void qla2x00_init_host_attr(scsi_qla_host_t *vha) { struct qla_hw_data *ha = vha->hw; - u32 speeds = FC_PORTSPEED_UNKNOWN; + u32 speeds = 0, fdmi_speed = 0; fc_host_dev_loss_tmo(vha->host) = ha->port_down_retry_count; fc_host_node_name(vha->host) = wwn_to_u64(vha->node_name); @@ -3344,7 +3367,8 @@ qla2x00_init_host_attr(scsi_qla_host_t *vha) fc_host_max_npiv_vports(vha->host) = ha->max_npiv_vports; fc_host_npiv_vports_inuse(vha->host) = ha->cur_vport_count; - speeds = qla25xx_fdmi_port_speed_capability(ha); + fdmi_speed = qla25xx_fdmi_port_speed_capability(ha); + speeds = qla2x00_get_host_supported_speeds(vha, fdmi_speed); fc_host_supported_speeds(vha->host) = speeds; } diff --git a/drivers/target/target_core_device.c b/drivers/target/target_core_device.c index b7f16ee8aa0e..cb4f7cc02f8f 100644 --- a/drivers/target/target_core_device.c +++ b/drivers/target/target_core_device.c @@ -284,6 +284,25 @@ void target_pr_kref_release(struct kref *kref) complete(&deve->pr_comp); } +/* + * Establish UA condition on SCSI device - all LUNs + */ +void target_dev_ua_allocate(struct se_device *dev, u8 asc, u8 ascq) +{ + struct se_dev_entry *se_deve; + struct se_lun *lun; + + spin_lock(&dev->se_port_lock); + list_for_each_entry(lun, &dev->dev_sep_list, lun_dev_link) { + + spin_lock(&lun->lun_deve_lock); + list_for_each_entry(se_deve, &lun->lun_deve_list, lun_link) + core_scsi3_ua_allocate(se_deve, asc, ascq); + spin_unlock(&lun->lun_deve_lock); + } + spin_unlock(&dev->se_port_lock); +} + static void target_luns_data_has_changed(struct se_node_acl *nacl, struct se_dev_entry *new, bool skip_new) diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index 8351c974cee3..d9266cf558dc 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -230,14 +230,12 @@ static void iblock_unplug_device(struct se_dev_plug *se_plug) clear_bit(IBD_PLUGF_PLUGGED, &ib_dev_plug->flags); } -static unsigned long long iblock_emulate_read_cap_with_block_size( - struct se_device *dev, - struct block_device *bd, - struct request_queue *q) +static sector_t iblock_get_blocks(struct se_device *dev) { - u32 block_size = bdev_logical_block_size(bd); + struct iblock_dev *ib_dev = IBLOCK_DEV(dev); + u32 block_size = bdev_logical_block_size(ib_dev->ibd_bd); unsigned long long blocks_long = - div_u64(bdev_nr_bytes(bd), block_size) - 1; + div_u64(bdev_nr_bytes(ib_dev->ibd_bd), block_size) - 1; if (block_size == dev->dev_attrib.block_size) return blocks_long; @@ -829,15 +827,6 @@ fail: return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE; } -static sector_t iblock_get_blocks(struct se_device *dev) -{ - struct iblock_dev *ib_dev = IBLOCK_DEV(dev); - struct block_device *bd = ib_dev->ibd_bd; - struct request_queue *q = bdev_get_queue(bd); - - return iblock_emulate_read_cap_with_block_size(dev, bd, q); -} - static sector_t iblock_get_alignment_offset_lbas(struct se_device *dev) { struct iblock_dev *ib_dev = IBLOCK_DEV(dev); diff --git a/drivers/target/target_core_internal.h b/drivers/target/target_core_internal.h index 30fcf69e1a1d..38a6d08f75b3 100644 --- a/drivers/target/target_core_internal.h +++ b/drivers/target/target_core_internal.h @@ -89,6 +89,7 @@ int target_configure_device(struct se_device *dev); void target_free_device(struct se_device *); int target_for_each_device(int (*fn)(struct se_device *dev, void *data), void *data); +void target_dev_ua_allocate(struct se_device *dev, u8 asc, u8 ascq); /* target_core_configfs.c */ extern struct configfs_item_operations target_core_dev_item_ops; diff --git a/drivers/target/target_core_pr.c b/drivers/target/target_core_pr.c index a1d67554709f..1493b1d01194 100644 --- a/drivers/target/target_core_pr.c +++ b/drivers/target/target_core_pr.c @@ -2956,13 +2956,28 @@ core_scsi3_pro_preempt(struct se_cmd *cmd, int type, int scope, u64 res_key, __core_scsi3_complete_pro_preempt(dev, pr_reg_n, (preempt_type == PREEMPT_AND_ABORT) ? &preempt_and_abort_list : NULL, type, scope, preempt_type); - - if (preempt_type == PREEMPT_AND_ABORT) - core_scsi3_release_preempt_and_abort( - &preempt_and_abort_list, pr_reg_n); } + spin_unlock(&dev->dev_reservation_lock); + /* + * SPC-4 5.12.11.2.6 Preempting and aborting + * The actions described in this subclause shall be performed + * for all I_T nexuses that are registered with the non-zero + * SERVICE ACTION RESERVATION KEY value, without regard for + * whether the preempted I_T nexuses hold the persistent + * reservation. If the SERVICE ACTION RESERVATION KEY field is + * set to zero and an all registrants persistent reservation is + * present, the device server shall abort all commands for all + * registered I_T nexuses. + */ + if (preempt_type == PREEMPT_AND_ABORT) { + core_tmr_lun_reset(dev, NULL, &preempt_and_abort_list, + cmd); + core_scsi3_release_preempt_and_abort( + &preempt_and_abort_list, pr_reg_n); + } + if (pr_tmpl->pr_aptpl_active) core_scsi3_update_and_write_aptpl(cmd->se_dev, true); @@ -3022,7 +3037,7 @@ core_scsi3_pro_preempt(struct se_cmd *cmd, int type, int scope, u64 res_key, if (calling_it_nexus) continue; - if (pr_reg->pr_res_key != sa_res_key) + if (sa_res_key && pr_reg->pr_res_key != sa_res_key) continue; pr_reg_nacl = pr_reg->pr_reg_nacl; @@ -3425,8 +3440,6 @@ after_iport_check: * transport protocols where port names are not required; * d) Register the reservation key specified in the SERVICE ACTION * RESERVATION KEY field; - * e) Retain the reservation key specified in the SERVICE ACTION - * RESERVATION KEY field and associated information; * * Also, It is not an error for a REGISTER AND MOVE service action to * register an I_T nexus that is already registered with the same @@ -3448,6 +3461,12 @@ after_iport_check: dest_pr_reg = __core_scsi3_locate_pr_reg(dev, dest_node_acl, iport_ptr); new_reg = 1; + } else { + /* + * e) Retain the reservation key specified in the SERVICE ACTION + * RESERVATION KEY field and associated information; + */ + dest_pr_reg->pr_res_key = sa_res_key; } /* * f) Release the persistent reservation for the persistent reservation diff --git a/drivers/target/target_core_transport.c b/drivers/target/target_core_transport.c index 7838dc20f713..5926316252eb 100644 --- a/drivers/target/target_core_transport.c +++ b/drivers/target/target_core_transport.c @@ -3531,8 +3531,7 @@ static void target_tmr_work(struct work_struct *work) tmr->response = (!ret) ? TMR_FUNCTION_COMPLETE : TMR_FUNCTION_REJECTED; if (tmr->response == TMR_FUNCTION_COMPLETE) { - target_ua_allocate_lun(cmd->se_sess->se_node_acl, - cmd->orig_fe_lun, 0x29, + target_dev_ua_allocate(dev, 0x29, ASCQ_29H_BUS_DEVICE_RESET_FUNCTION_OCCURRED); } break; diff --git a/drivers/ufs/core/ufshcd.c b/drivers/ufs/core/ufshcd.c index d50fe7558cf8..ad93e3868e47 100644 --- a/drivers/ufs/core/ufshcd.c +++ b/drivers/ufs/core/ufshcd.c @@ -779,7 +779,7 @@ static inline void ufshcd_utrl_clear(struct ufs_hba *hba, u32 mask) } /** - * ufshcd_utmrl_clear - Clear a bit in UTRMLCLR register + * ufshcd_utmrl_clear - Clear a bit in UTMRLCLR register * @hba: per adapter instance * @pos: position of the bit to be cleared */ @@ -3118,7 +3118,7 @@ int ufshcd_query_flag_retry(struct ufs_hba *hba, if (ret) dev_err(hba->dev, - "%s: query attribute, opcode %d, idn %d, failed with error %d after %d retries\n", + "%s: query flag, opcode %d, idn %d, failed with error %d after %d retries\n", __func__, opcode, idn, ret, retries); return ret; } diff --git a/drivers/ufs/core/ufshpb.c b/drivers/ufs/core/ufshpb.c index 3d69a81c5b17..b7f412d0f301 100644 --- a/drivers/ufs/core/ufshpb.c +++ b/drivers/ufs/core/ufshpb.c @@ -383,7 +383,7 @@ int ufshpb_prep(struct ufs_hba *hba, struct ufshcd_lrb *lrbp) rgn = hpb->rgn_tbl + rgn_idx; srgn = rgn->srgn_tbl + srgn_idx; - /* If command type is WRITE or DISCARD, set bitmap as drity */ + /* If command type is WRITE or DISCARD, set bitmap as dirty */ if (ufshpb_is_write_or_discard(cmd)) { ufshpb_iterate_rgn(hpb, rgn_idx, srgn_idx, srgn_offset, transfer_len, true); @@ -616,7 +616,7 @@ static void ufshpb_activate_subregion(struct ufshpb_lu *hpb, static enum rq_end_io_ret ufshpb_umap_req_compl_fn(struct request *req, blk_status_t error) { - struct ufshpb_req *umap_req = (struct ufshpb_req *)req->end_io_data; + struct ufshpb_req *umap_req = req->end_io_data; ufshpb_put_req(umap_req->hpb, umap_req); return RQ_END_IO_NONE; @@ -625,7 +625,7 @@ static enum rq_end_io_ret ufshpb_umap_req_compl_fn(struct request *req, static enum rq_end_io_ret ufshpb_map_req_compl_fn(struct request *req, blk_status_t error) { - struct ufshpb_req *map_req = (struct ufshpb_req *) req->end_io_data; + struct ufshpb_req *map_req = req->end_io_data; struct ufshpb_lu *hpb = map_req->hpb; struct ufshpb_subregion *srgn; unsigned long flags; diff --git a/drivers/ufs/host/ufs-qcom-ice.c b/drivers/ufs/host/ufs-qcom-ice.c index 745e48ec598f..62387ccd5b30 100644 --- a/drivers/ufs/host/ufs-qcom-ice.c +++ b/drivers/ufs/host/ufs-qcom-ice.c @@ -118,7 +118,6 @@ int ufs_qcom_ice_init(struct ufs_qcom_host *host) host->ice_mmio = devm_ioremap_resource(dev, res); if (IS_ERR(host->ice_mmio)) { err = PTR_ERR(host->ice_mmio); - dev_err(dev, "Failed to map ICE registers; err=%d\n", err); return err; } diff --git a/fs/exec.c b/fs/exec.c index 32dc8cf5fceb..a0b1f0337a62 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1012,7 +1012,6 @@ static int exec_mmap(struct mm_struct *mm) active_mm = tsk->active_mm; tsk->active_mm = mm; tsk->mm = mm; - lru_gen_add_mm(mm); /* * This prevents preemption while active_mm is being loaded and * it and mm are being updated, which could cause problems for @@ -1025,6 +1024,7 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); + lru_gen_add_mm(mm); task_unlock(tsk); lru_gen_use_mm(mm); if (old_mm) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 989365b878a6..7950904fbf04 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1741,10 +1741,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) -static const char deprecated_msg[] = - "Mount option \"%s\" will be removed by %s\n" - "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; - #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 #define MOPT_NOSUPPORT 0x0004 diff --git a/fs/squashfs/file.c b/fs/squashfs/file.c index e56510964b22..8ba8c4c50770 100644 --- a/fs/squashfs/file.c +++ b/fs/squashfs/file.c @@ -506,8 +506,9 @@ static int squashfs_readahead_fragment(struct page **page, squashfs_i(inode)->fragment_size); struct squashfs_sb_info *msblk = inode->i_sb->s_fs_info; unsigned int n, mask = (1 << (msblk->block_log - PAGE_SHIFT)) - 1; + int error = buffer->error; - if (buffer->error) + if (error) goto out; expected += squashfs_i(inode)->fragment_offset; @@ -529,7 +530,7 @@ static int squashfs_readahead_fragment(struct page **page, out: squashfs_cache_put(buffer); - return buffer->error; + return error; } static void squashfs_readahead(struct readahead_control *ractl) @@ -557,6 +558,13 @@ static void squashfs_readahead(struct readahead_control *ractl) int res, bsize; u64 block = 0; unsigned int expected; + struct page *last_page; + + expected = start >> msblk->block_log == file_end ? + (i_size_read(inode) & (msblk->block_size - 1)) : + msblk->block_size; + + max_pages = (expected + PAGE_SIZE - 1) >> PAGE_SHIFT; nr_pages = __readahead_batch(ractl, pages, max_pages); if (!nr_pages) @@ -566,13 +574,10 @@ static void squashfs_readahead(struct readahead_control *ractl) goto skip_pages; index = pages[0]->index >> shift; + if ((pages[nr_pages - 1]->index >> shift) != index) goto skip_pages; - expected = index == file_end ? - (i_size_read(inode) & (msblk->block_size - 1)) : - msblk->block_size; - if (index == file_end && squashfs_i(inode)->fragment_block != SQUASHFS_INVALID_BLK) { res = squashfs_readahead_fragment(pages, nr_pages, @@ -593,15 +598,15 @@ static void squashfs_readahead(struct readahead_control *ractl) res = squashfs_read_data(inode->i_sb, block, bsize, NULL, actor); - squashfs_page_actor_free(actor); + last_page = squashfs_page_actor_free(actor); if (res == expected) { int bytes; /* Last page (if present) may have trailing bytes not filled */ bytes = res % PAGE_SIZE; - if (pages[nr_pages - 1]->index == file_end && bytes) - memzero_page(pages[nr_pages - 1], bytes, + if (index == file_end && bytes && last_page) + memzero_page(last_page, bytes, PAGE_SIZE - bytes); for (i = 0; i < nr_pages; i++) { diff --git a/fs/squashfs/page_actor.c b/fs/squashfs/page_actor.c index 54b93bf4a25c..81af6c4ca115 100644 --- a/fs/squashfs/page_actor.c +++ b/fs/squashfs/page_actor.c @@ -71,11 +71,13 @@ static void *handle_next_page(struct squashfs_page_actor *actor) (actor->next_index != actor->page[actor->next_page]->index)) { actor->next_index++; actor->returned_pages++; + actor->last_page = NULL; return actor->alloc_buffer ? actor->tmp_buffer : ERR_PTR(-ENOMEM); } actor->next_index++; actor->returned_pages++; + actor->last_page = actor->page[actor->next_page]; return actor->pageaddr = kmap_local_page(actor->page[actor->next_page++]); } @@ -125,6 +127,7 @@ struct squashfs_page_actor *squashfs_page_actor_init_special(struct squashfs_sb_ actor->returned_pages = 0; actor->next_index = page[0]->index & ~((1 << (msblk->block_log - PAGE_SHIFT)) - 1); actor->pageaddr = NULL; + actor->last_page = NULL; actor->alloc_buffer = msblk->decompressor->alloc_buffer; actor->squashfs_first_page = direct_first_page; actor->squashfs_next_page = direct_next_page; diff --git a/fs/squashfs/page_actor.h b/fs/squashfs/page_actor.h index 95ffbb543d91..97d4983559b1 100644 --- a/fs/squashfs/page_actor.h +++ b/fs/squashfs/page_actor.h @@ -16,6 +16,7 @@ struct squashfs_page_actor { void *(*squashfs_first_page)(struct squashfs_page_actor *); void *(*squashfs_next_page)(struct squashfs_page_actor *); void (*squashfs_finish_page)(struct squashfs_page_actor *); + struct page *last_page; int pages; int length; int next_page; @@ -29,10 +30,13 @@ extern struct squashfs_page_actor *squashfs_page_actor_init(void **buffer, extern struct squashfs_page_actor *squashfs_page_actor_init_special( struct squashfs_sb_info *msblk, struct page **page, int pages, int length); -static inline void squashfs_page_actor_free(struct squashfs_page_actor *actor) +static inline struct page *squashfs_page_actor_free(struct squashfs_page_actor *actor) { + struct page *last_page = actor->last_page; + kfree(actor->tmp_buffer); kfree(actor); + return last_page; } static inline void *squashfs_first_page(struct squashfs_page_actor *actor) { diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index ba18e9bdb799..d6119c5d1069 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -853,7 +853,8 @@ static inline bool blk_mq_add_to_batch(struct request *req, struct io_comp_batch *iob, int ioerror, void (*complete)(struct io_comp_batch *)) { - if (!iob || (req->rq_flags & RQF_ELV) || ioerror) + if (!iob || (req->rq_flags & RQF_ELV) || ioerror || + (req->end_io && !blk_rq_is_passthrough(req))) return false; if (!iob->complete) diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 4029fe368a4f..18a31b125f9d 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -43,11 +43,24 @@ extern __kernel_size_t __underlying_strlen(const char *p) __RENAME(strlen); extern char *__underlying_strncat(char *p, const char *q, __kernel_size_t count) __RENAME(strncat); extern char *__underlying_strncpy(char *p, const char *q, __kernel_size_t size) __RENAME(strncpy); #else -#define __underlying_memchr __builtin_memchr -#define __underlying_memcmp __builtin_memcmp + +#if defined(__SANITIZE_MEMORY__) +/* + * For KMSAN builds all memcpy/memset/memmove calls should be replaced by the + * corresponding __msan_XXX functions. + */ +#include +#define __underlying_memcpy __msan_memcpy +#define __underlying_memmove __msan_memmove +#define __underlying_memset __msan_memset +#else #define __underlying_memcpy __builtin_memcpy #define __underlying_memmove __builtin_memmove #define __underlying_memset __builtin_memset +#endif + +#define __underlying_memchr __builtin_memchr +#define __underlying_memcmp __builtin_memcmp #define __underlying_strcat __builtin_strcat #define __underlying_strcpy __builtin_strcpy #define __underlying_strlen __builtin_strlen diff --git a/include/linux/kmsan_string.h b/include/linux/kmsan_string.h new file mode 100644 index 000000000000..7287da6f52ef --- /dev/null +++ b/include/linux/kmsan_string.h @@ -0,0 +1,21 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN string functions API used in other headers. + * + * Copyright (C) 2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_STRING_H +#define _LINUX_KMSAN_STRING_H + +/* + * KMSAN overrides the default memcpy/memset/memmove implementations in the + * kernel, which requires having __msan_XXX function prototypes in several other + * headers. Keep them in one place instead of open-coding. + */ +void *__msan_memcpy(void *dst, const void *src, size_t size); +void *__msan_memset(void *s, int c, size_t n); +void *__msan_memmove(void *dest, const void *src, size_t len); + +#endif /* _LINUX_KMSAN_STRING_H */ diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f07e6998bb68..9df0b9a762cc 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -146,9 +146,9 @@ static inline bool userfaultfd_armed(struct vm_area_struct *vma) static inline bool vma_can_userfault(struct vm_area_struct *vma, unsigned long vm_flags) { - if (vm_flags & VM_UFFD_MINOR) - return is_vm_hugetlb_page(vma) || vma_is_shmem(vma); - + if ((vm_flags & VM_UFFD_MINOR) && + (!is_vm_hugetlb_page(vma) && !vma_is_shmem(vma))) + return false; #ifndef CONFIG_PTE_MARKER_UFFD_WP /* * If user requested uffd-wp but not enabled pte markers for diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 6cc16e39b27f..ac8c488e3077 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -1173,7 +1173,7 @@ static void __cold io_move_task_work_from_local(struct io_ring_ctx *ctx) } } -int __io_run_local_work(struct io_ring_ctx *ctx, bool locked) +int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked) { struct llist_node *node; struct llist_node fake; @@ -1192,7 +1192,7 @@ again: struct io_kiocb *req = container_of(node, struct io_kiocb, io_task_work.node); prefetch(container_of(next, struct io_kiocb, io_task_work.node)); - req->io_task_work.func(req, &locked); + req->io_task_work.func(req, locked); ret++; node = next; } @@ -1208,7 +1208,7 @@ again: goto again; } - if (locked) + if (*locked) io_submit_flush_completions(ctx); trace_io_uring_local_work_run(ctx, ret, loops); return ret; @@ -1225,7 +1225,7 @@ int io_run_local_work(struct io_ring_ctx *ctx) __set_current_state(TASK_RUNNING); locked = mutex_trylock(&ctx->uring_lock); - ret = __io_run_local_work(ctx, locked); + ret = __io_run_local_work(ctx, &locked); if (locked) mutex_unlock(&ctx->uring_lock); @@ -1446,8 +1446,7 @@ static int io_iopoll_check(struct io_ring_ctx *ctx, long min) io_task_work_pending(ctx)) { u32 tail = ctx->cached_cq_tail; - if (!llist_empty(&ctx->work_llist)) - __io_run_local_work(ctx, true); + (void) io_run_local_work_locked(ctx); if (task_work_pending(current) || wq_list_empty(&ctx->iopoll_list)) { diff --git a/io_uring/io_uring.h b/io_uring/io_uring.h index ef77d2aa3172..e99a79f2df9b 100644 --- a/io_uring/io_uring.h +++ b/io_uring/io_uring.h @@ -27,7 +27,7 @@ enum { struct io_uring_cqe *__io_get_cqe(struct io_ring_ctx *ctx, bool overflow); bool io_req_cqe_overflow(struct io_kiocb *req); int io_run_task_work_sig(struct io_ring_ctx *ctx); -int __io_run_local_work(struct io_ring_ctx *ctx, bool locked); +int __io_run_local_work(struct io_ring_ctx *ctx, bool *locked); int io_run_local_work(struct io_ring_ctx *ctx); void io_req_complete_failed(struct io_kiocb *req, s32 res); void __io_req_complete(struct io_kiocb *req, unsigned issue_flags); @@ -277,9 +277,18 @@ static inline int io_run_task_work_ctx(struct io_ring_ctx *ctx) static inline int io_run_local_work_locked(struct io_ring_ctx *ctx) { + bool locked; + int ret; + if (llist_empty(&ctx->work_llist)) return 0; - return __io_run_local_work(ctx, true); + + locked = true; + ret = __io_run_local_work(ctx, &locked); + /* shouldn't happen! */ + if (WARN_ON_ONCE(!locked)) + mutex_lock(&ctx->uring_lock); + return ret; } static inline void io_tw_lock(struct io_ring_ctx *ctx, bool *locked) diff --git a/ipc/msg.c b/ipc/msg.c index e4e0990e08f7..fd08b3cb36d7 100644 --- a/ipc/msg.c +++ b/ipc/msg.c @@ -1329,11 +1329,11 @@ fail_msg_bytes: #ifdef CONFIG_IPC_NS void msg_exit_ns(struct ipc_namespace *ns) { - percpu_counter_destroy(&ns->percpu_msg_bytes); - percpu_counter_destroy(&ns->percpu_msg_hdrs); free_ipcs(ns, &msg_ids(ns), freeque); idr_destroy(&ns->ids[IPC_MSG_IDS].ipcs_idr); rhashtable_destroy(&ns->ids[IPC_MSG_IDS].key_ht); + percpu_counter_destroy(&ns->percpu_msg_bytes); + percpu_counter_destroy(&ns->percpu_msg_hdrs); } #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 3fc7abffc7aa..29280072dc0e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -400,8 +400,9 @@ config FRAME_WARN default 1536 if (!64BIT && XTENSA) default 1024 if !64BIT default 2048 if 64BIT + default 0 if KMSAN help - Tell gcc to warn at build time for stack frames larger than this. + Tell the compiler to warn at build time for stack frames larger than this. Setting this too low will cause a lot of warnings. Setting it to 0 disables the warning. diff --git a/lib/maple_tree.c b/lib/maple_tree.c index e1743803c851..fbde494444b8 100644 --- a/lib/maple_tree.c +++ b/lib/maple_tree.c @@ -2903,8 +2903,8 @@ static inline void *mtree_range_walk(struct ma_state *mas) unsigned long max, min; unsigned long prev_max, prev_min; - last = next = mas->node; - prev_min = min = mas->min; + next = mas->node; + min = mas->min; max = mas->max; do { offset = 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 03fc7e5edf07..561a42567477 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2462,7 +2462,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * Fix up and warn once if private is unexpectedly set. */ if (!folio_test_swapcache(page_folio(head))) { - VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, head); + VM_WARN_ON_ONCE_PAGE(page_tail->private != 0, page_tail); page_tail->private = 0; } diff --git a/mm/kmemleak.c b/mm/kmemleak.c index 37af2dc8dac9..646e2979641f 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -1460,6 +1460,27 @@ static void scan_gray_list(void) WARN_ON(!list_empty(&gray_list)); } +/* + * Conditionally call resched() in a object iteration loop while making sure + * that the given object won't go away without RCU read lock by performing a + * get_object() if !pinned. + * + * Return: false if can't do a cond_resched() due to get_object() failure + * true otherwise + */ +static bool kmemleak_cond_resched(struct kmemleak_object *object, bool pinned) +{ + if (!pinned && !get_object(object)) + return false; + + rcu_read_unlock(); + cond_resched(); + rcu_read_lock(); + if (!pinned) + put_object(object); + return true; +} + /* * Scan data sections and all the referenced memory blocks allocated via the * kernel's standard allocators. This function must be called with the @@ -1471,7 +1492,7 @@ static void kmemleak_scan(void) struct zone *zone; int __maybe_unused i; int new_leaks = 0; - int loop1_cnt = 0; + int loop_cnt = 0; jiffies_last_scan = jiffies; @@ -1480,7 +1501,6 @@ static void kmemleak_scan(void) list_for_each_entry_rcu(object, &object_list, object_list) { bool obj_pinned = false; - loop1_cnt++; raw_spin_lock_irq(&object->lock); #ifdef DEBUG /* @@ -1514,24 +1534,11 @@ static void kmemleak_scan(void) raw_spin_unlock_irq(&object->lock); /* - * Do a cond_resched() to avoid soft lockup every 64k objects. - * Make sure a reference has been taken so that the object - * won't go away without RCU read lock. + * Do a cond_resched() every 64k objects to avoid soft lockup. */ - if (!(loop1_cnt & 0xffff)) { - if (!obj_pinned && !get_object(object)) { - /* Try the next object instead */ - loop1_cnt--; - continue; - } - - rcu_read_unlock(); - cond_resched(); - rcu_read_lock(); - - if (!obj_pinned) - put_object(object); - } + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, obj_pinned)) + loop_cnt--; /* Try again on next object */ } rcu_read_unlock(); @@ -1598,7 +1605,15 @@ static void kmemleak_scan(void) * scan and color them gray until the next scan. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in @@ -1632,7 +1647,15 @@ static void kmemleak_scan(void) * Scanning result reporting. */ rcu_read_lock(); + loop_cnt = 0; list_for_each_entry_rcu(object, &object_list, object_list) { + /* + * Do a cond_resched() every 64k objects to avoid soft lockup. + */ + if (!(++loop_cnt & 0xffff) && + !kmemleak_cond_resched(object, false)) + loop_cnt--; /* Try again on next object */ + /* * This is racy but we can save the overhead of lock/unlock * calls. The missed objects, if any, should be caught in diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c index 280d15413268..271f135f97a1 100644 --- a/mm/kmsan/instrumentation.c +++ b/mm/kmsan/instrumentation.c @@ -14,6 +14,7 @@ #include "kmsan.h" #include +#include #include #include diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 21e3e196ec3c..a787c04e9583 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -167,6 +167,7 @@ void kmsan_copy_page_meta(struct page *dst, struct page *src) __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); kmsan_leave_runtime(); } +EXPORT_SYMBOL(kmsan_copy_page_meta); void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { diff --git a/mm/madvise.c b/mm/madvise.c index 2baa93ca2310..c7105ec6d08c 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -813,7 +813,14 @@ static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, if (start & ~huge_page_mask(hstate_vma(vma))) return false; - *end = ALIGN(*end, huge_page_size(hstate_vma(vma))); + /* + * Madvise callers expect the length to be rounded up to PAGE_SIZE + * boundaries, and may be unaware that this VMA uses huge pages. + * Avoid unexpected data loss by rounding down the number of + * huge pages freed. + */ + *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); + return true; } @@ -828,6 +835,9 @@ static long madvise_dontneed_free(struct vm_area_struct *vma, if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior)) return -EINVAL; + if (start == end) + return 0; + if (!userfaultfd_remove(vma, start, end)) { *prev = NULL; /* mmap_lock has been dropped, prev is stale */ diff --git a/mm/memory-tiers.c b/mm/memory-tiers.c index f116b7b6333e..fa8c9d07f9ce 100644 --- a/mm/memory-tiers.c +++ b/mm/memory-tiers.c @@ -131,8 +131,8 @@ static void memory_tier_device_release(struct device *dev) kfree(tier); } -static ssize_t nodes_show(struct device *dev, - struct device_attribute *attr, char *buf) +static ssize_t nodelist_show(struct device *dev, + struct device_attribute *attr, char *buf) { int ret; nodemask_t nmask; @@ -143,10 +143,10 @@ static ssize_t nodes_show(struct device *dev, mutex_unlock(&memory_tier_lock); return ret; } -static DEVICE_ATTR_RO(nodes); +static DEVICE_ATTR_RO(nodelist); static struct attribute *memtier_dev_attrs[] = { - &dev_attr_nodes.attr, + &dev_attr_nodelist.attr, NULL }; diff --git a/mm/migrate.c b/mm/migrate.c index 1379e1912772..dff333593a8a 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1582,6 +1582,13 @@ out: */ list_splice(&ret_pages, from); + /* + * Return 0 in case all subpages of fail-to-migrate THPs are + * migrated successfully. + */ + if (list_empty(from)) + rc = 0; + count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); diff --git a/mm/mmap.c b/mm/mmap.c index 3696b1909199..69c2f3523102 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2853,6 +2853,9 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; + if (start + size <= next->vm_end) + break; + prev = next; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b5a6c815ae28..218b28ee49ed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -807,6 +807,7 @@ static void prep_compound_tail(struct page *head, int tail_idx) p->mapping = TAIL_MAPPING; set_compound_head(p, head); + set_page_private(p, 0); } void prep_compound_page(struct page *page, unsigned int order) diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 04141a9bea70..47fbc1696466 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -330,7 +330,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, zone->zone_start_pfn); if (skip_isolation) { - int mt = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); + int mt __maybe_unused = get_pageblock_migratetype(pfn_to_page(isolate_pageblock)); VM_BUG_ON(!is_migrate_isolate(mt)); } else { diff --git a/mm/shmem.c b/mm/shmem.c index 8280a5cb48df..c1d8b8a1aa3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2424,9 +2424,26 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!zeropage) { /* COPY */ page_kaddr = kmap_local_folio(folio, 0); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); + pagefault_enable(); kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e24e8a47ce8a..3d0fef3980b3 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -157,11 +157,28 @@ static int mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_page(page); + /* + * The read mmap_lock is held here. Despite the + * mmap_lock being read recursive a deadlock is still + * possible if a writer has taken a lock. For example: + * + * process A thread 1 takes read lock on own mmap_lock + * process A thread 2 calls mmap, blocks taking write lock + * process B thread 1 takes page fault, read lock on own mmap lock + * process B thread 2 calls mmap, blocks taking write lock + * process A thread 1 blocks taking read lock on process B + * process B thread 1 blocks taking read lock on process A + * + * Disable page faults to prevent potential deadlock + * and retry the copy outside the mmap_lock. + */ + pagefault_disable(); ret = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + pagefault_enable(); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { @@ -646,11 +663,11 @@ retry: mmap_read_unlock(dst_mm); BUG_ON(!page); - page_kaddr = kmap(page); + page_kaddr = kmap_local_page(page); err = copy_from_user(page_kaddr, (const void __user *) src_addr, PAGE_SIZE); - kunmap(page); + kunmap_local(page_kaddr); if (unlikely(err)) { err = -EFAULT; goto out;