From 73c007575b6b6321e0d34207044349f4ac4283a6 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Apr 2024 12:10:17 +0200 Subject: [PATCH 01/23] pps: remove usage of the deprecated ida_simple_xx() API [ Upstream commit 55dbc5b5174d0e7d1fa397d05aa4cb145e8b887e ] ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Link: https://lkml.kernel.org/r/9f681747d446b874952a892491387d79ffe565a9.1713089394.git.christophe.jaillet@wanadoo.fr Signed-off-by: Christophe JAILLET Cc: Rodolfo Giometti Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Stable-dep-of: 62c5a01a5711 ("pps: add an error check in parport_attach") Signed-off-by: Sasha Levin --- drivers/pps/clients/pps_parport.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pps/clients/pps_parport.c b/drivers/pps/clients/pps_parport.c index 42f93d4c6ee3..af972cdc04b5 100644 --- a/drivers/pps/clients/pps_parport.c +++ b/drivers/pps/clients/pps_parport.c @@ -148,7 +148,7 @@ static void parport_attach(struct parport *port) return; } - index = ida_simple_get(&pps_client_index, 0, 0, GFP_KERNEL); + index = ida_alloc(&pps_client_index, GFP_KERNEL); memset(&pps_client_cb, 0, sizeof(pps_client_cb)); pps_client_cb.private = device; pps_client_cb.irq_func = parport_irq; @@ -188,7 +188,7 @@ err_release_dev: err_unregister_dev: parport_unregister_device(device->pardev); err_free: - ida_simple_remove(&pps_client_index, index); + ida_free(&pps_client_index, index); kfree(device); } @@ -208,7 +208,7 @@ static void parport_detach(struct parport *port) pps_unregister_source(device->pps); parport_release(pardev); parport_unregister_device(pardev); - ida_simple_remove(&pps_client_index, device->index); + ida_free(&pps_client_index, device->index); kfree(device); } From 8f925510ce3d283ef7f6a1f8d4df46270a5d0356 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Wed, 28 Aug 2024 21:18:14 +0800 Subject: [PATCH 02/23] pps: add an error check in parport_attach [ Upstream commit 62c5a01a5711c8e4be8ae7b6f0db663094615d48 ] In parport_attach, the return value of ida_alloc is unchecked, witch leads to the use of an invalid index value. To address this issue, index should be checked. When the index value is abnormal, the device should be freed. Found by code review, compile tested only. Cc: stable@vger.kernel.org Fixes: fb56d97df70e ("pps: client: use new parport device model") Signed-off-by: Ma Ke Acked-by: Rodolfo Giometti Link: https://lore.kernel.org/r/20240828131814.3034338-1-make24@iscas.ac.cn Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/pps/clients/pps_parport.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/pps/clients/pps_parport.c b/drivers/pps/clients/pps_parport.c index af972cdc04b5..53e9c304ae0a 100644 --- a/drivers/pps/clients/pps_parport.c +++ b/drivers/pps/clients/pps_parport.c @@ -149,6 +149,9 @@ static void parport_attach(struct parport *port) } index = ida_alloc(&pps_client_index, GFP_KERNEL); + if (index < 0) + goto err_free_device; + memset(&pps_client_cb, 0, sizeof(pps_client_cb)); pps_client_cb.private = device; pps_client_cb.irq_func = parport_irq; @@ -159,7 +162,7 @@ static void parport_attach(struct parport *port) index); if (!device->pardev) { pr_err("couldn't register with %s\n", port->name); - goto err_free; + goto err_free_ida; } if (parport_claim_or_block(device->pardev) < 0) { @@ -187,8 +190,9 @@ err_release_dev: parport_release(device->pardev); err_unregister_dev: parport_unregister_device(device->pardev); -err_free: +err_free_ida: ida_free(&pps_client_index, index); +err_free_device: kfree(device); } From 970d6010f85c02b9f35b39f36549578d53cdf003 Mon Sep 17 00:00:00 2001 From: Xin Li Date: Tue, 5 Dec 2023 02:50:11 -0800 Subject: [PATCH 03/23] x86/idtentry: Incorporate definitions/declarations of the FRED entries [ Upstream commit 90f357208200a941e90e75757123326684d715d0 ] FRED and IDT can share most of the definitions and declarations so that in the majority of cases the actual handler implementation is the same. The differences are the exceptions where FRED stores exception related information on the stack and the sysvec implementations as FRED can handle irqentry/exit() in the dispatcher instead of having it in each handler. Also add stub defines for vectors which are not used due to Kconfig decisions to spare the ifdeffery in the actual FRED dispatch code. Suggested-by: Thomas Gleixner Signed-off-by: Xin Li Signed-off-by: Thomas Gleixner Signed-off-by: Borislav Petkov (AMD) Tested-by: Shan Kang Link: https://lore.kernel.org/r/20231205105030.8698-23-xin3.li@intel.com Stable-dep-of: 477d81a1c47a ("x86/entry: Remove unwanted instrumentation in common_interrupt()") Signed-off-by: Sasha Levin --- arch/x86/include/asm/idtentry.h | 71 +++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 8 deletions(-) diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index fca710a93eb9..0c8823965952 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -13,15 +13,18 @@ #include +typedef void (*idtentry_t)(struct pt_regs *regs); + /** * DECLARE_IDTENTRY - Declare functions for simple IDT entry points * No error code pushed by hardware * @vector: Vector number (ignored for C) * @func: Function name of the entry point * - * Declares three functions: + * Declares four functions: * - The ASM entry point: asm_##func * - The XEN PV trap entry point: xen_##func (maybe unused) + * - The C handler called from the FRED event dispatcher (maybe unused) * - The C handler called from the ASM entry point * * Note: This is the C variant of DECLARE_IDTENTRY(). As the name says it @@ -31,6 +34,7 @@ #define DECLARE_IDTENTRY(vector, func) \ asmlinkage void asm_##func(void); \ asmlinkage void xen_asm_##func(void); \ + void fred_##func(struct pt_regs *regs); \ __visible void func(struct pt_regs *regs) /** @@ -137,6 +141,17 @@ static __always_inline void __##func(struct pt_regs *regs, \ #define DEFINE_IDTENTRY_RAW(func) \ __visible noinstr void func(struct pt_regs *regs) +/** + * DEFINE_FREDENTRY_RAW - Emit code for raw FRED entry points + * @func: Function name of the entry point + * + * @func is called from the FRED event dispatcher with interrupts disabled. + * + * See @DEFINE_IDTENTRY_RAW for further details. + */ +#define DEFINE_FREDENTRY_RAW(func) \ +noinstr void fred_##func(struct pt_regs *regs) + /** * DECLARE_IDTENTRY_RAW_ERRORCODE - Declare functions for raw IDT entry points * Error code pushed by hardware @@ -233,17 +248,27 @@ static noinline void __##func(struct pt_regs *regs, u32 vector) #define DEFINE_IDTENTRY_SYSVEC(func) \ static void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + kvm_set_cpu_l1tf_flush_l1d(); \ + run_sysvec_on_irqstack_cond(__##func, regs); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - run_sysvec_on_irqstack_cond(__##func, regs); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static noinline void __##func(struct pt_regs *regs) /** @@ -260,19 +285,29 @@ static noinline void __##func(struct pt_regs *regs) #define DEFINE_IDTENTRY_SYSVEC_SIMPLE(func) \ static __always_inline void __##func(struct pt_regs *regs); \ \ +static __always_inline void instr_##func(struct pt_regs *regs) \ +{ \ + __irq_enter_raw(); \ + kvm_set_cpu_l1tf_flush_l1d(); \ + __##func (regs); \ + __irq_exit_raw(); \ +} \ + \ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ instrumentation_begin(); \ - __irq_enter_raw(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ - __##func (regs); \ - __irq_exit_raw(); \ + instr_##func (regs); \ instrumentation_end(); \ irqentry_exit(regs, state); \ } \ \ +void fred_##func(struct pt_regs *regs) \ +{ \ + instr_##func (regs); \ +} \ + \ static __always_inline void __##func(struct pt_regs *regs) /** @@ -410,15 +445,18 @@ __visible noinstr void func(struct pt_regs *regs, \ /* C-Code mapping */ #define DECLARE_IDTENTRY_NMI DECLARE_IDTENTRY_RAW #define DEFINE_IDTENTRY_NMI DEFINE_IDTENTRY_RAW +#define DEFINE_FREDENTRY_NMI DEFINE_FREDENTRY_RAW #ifdef CONFIG_X86_64 #define DECLARE_IDTENTRY_MCE DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_MCE_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_MCE DEFINE_FREDENTRY_RAW #define DECLARE_IDTENTRY_DEBUG DECLARE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG DEFINE_IDTENTRY_IST #define DEFINE_IDTENTRY_DEBUG_USER DEFINE_IDTENTRY_NOIST +#define DEFINE_FREDENTRY_DEBUG DEFINE_FREDENTRY_RAW #endif #else /* !__ASSEMBLY__ */ @@ -660,23 +698,36 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_MOVE_CLEANUP_VECTOR, sysvec_irq_move_cleanup); DECLARE_IDTENTRY_SYSVEC(REBOOT_VECTOR, sysvec_reboot); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_SINGLE_VECTOR, sysvec_call_function_single); DECLARE_IDTENTRY_SYSVEC(CALL_FUNCTION_VECTOR, sysvec_call_function); +#else +# define fred_sysvec_reschedule_ipi NULL +# define fred_sysvec_reboot NULL +# define fred_sysvec_call_function_single NULL +# define fred_sysvec_call_function NULL #endif #ifdef CONFIG_X86_LOCAL_APIC # ifdef CONFIG_X86_MCE_THRESHOLD DECLARE_IDTENTRY_SYSVEC(THRESHOLD_APIC_VECTOR, sysvec_threshold); +# else +# define fred_sysvec_threshold NULL # endif # ifdef CONFIG_X86_MCE_AMD DECLARE_IDTENTRY_SYSVEC(DEFERRED_ERROR_VECTOR, sysvec_deferred_error); +# else +# define fred_sysvec_deferred_error NULL # endif # ifdef CONFIG_X86_THERMAL_VECTOR DECLARE_IDTENTRY_SYSVEC(THERMAL_APIC_VECTOR, sysvec_thermal); +# else +# define fred_sysvec_thermal NULL # endif # ifdef CONFIG_IRQ_WORK DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); +# else +# define fred_sysvec_irq_work NULL # endif #endif @@ -684,12 +735,16 @@ DECLARE_IDTENTRY_SYSVEC(IRQ_WORK_VECTOR, sysvec_irq_work); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_VECTOR, sysvec_kvm_posted_intr_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_WAKEUP_VECTOR, sysvec_kvm_posted_intr_wakeup_ipi); DECLARE_IDTENTRY_SYSVEC(POSTED_INTR_NESTED_VECTOR, sysvec_kvm_posted_intr_nested_ipi); +#else +# define fred_sysvec_kvm_posted_intr_ipi NULL +# define fred_sysvec_kvm_posted_intr_wakeup_ipi NULL +# define fred_sysvec_kvm_posted_intr_nested_ipi NULL #endif #if IS_ENABLED(CONFIG_HYPERV) DECLARE_IDTENTRY_SYSVEC(HYPERVISOR_CALLBACK_VECTOR, sysvec_hyperv_callback); DECLARE_IDTENTRY_SYSVEC(HYPERV_REENLIGHTENMENT_VECTOR, sysvec_hyperv_reenlightenment); -DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); +DECLARE_IDTENTRY_SYSVEC(HYPERV_STIMER0_VECTOR, sysvec_hyperv_stimer0); #endif #if IS_ENABLED(CONFIG_ACRN_GUEST) From e01c8c1d395c570d05f3f71864882b62eaace82f Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 11 Jun 2024 09:50:30 +0200 Subject: [PATCH 04/23] x86/entry: Remove unwanted instrumentation in common_interrupt() [ Upstream commit 477d81a1c47a1b79b9c08fc92b5dea3c5143800b ] common_interrupt() and related variants call kvm_set_cpu_l1tf_flush_l1d(), which is neither marked noinstr nor __always_inline. So compiler puts it out of line and adds instrumentation to it. Since the call is inside of instrumentation_begin/end(), objtool does not warn about it. The manifestation is that KCOV produces spurious coverage in kvm_set_cpu_l1tf_flush_l1d() in random places because the call happens when preempt count is not yet updated to say that the kernel is in an interrupt. Mark kvm_set_cpu_l1tf_flush_l1d() as __always_inline and move it out of the instrumentation_begin/end() section. It only calls __this_cpu_write() which is already safe to call in noinstr contexts. Fixes: 6368558c3710 ("x86/entry: Provide IDTENTRY_SYSVEC") Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Potapenko Acked-by: Peter Zijlstra (Intel) Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/3f9a1de9e415fcb53d07dc9e19fa8481bb021b1b.1718092070.git.dvyukov@google.com Signed-off-by: Sasha Levin --- arch/x86/include/asm/hardirq.h | 8 ++++++-- arch/x86/include/asm/idtentry.h | 6 +++--- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 275e7fd20310..a18df4191699 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -62,7 +62,11 @@ extern u64 arch_irq_stat(void); #if IS_ENABLED(CONFIG_KVM_INTEL) -static inline void kvm_set_cpu_l1tf_flush_l1d(void) +/* + * This function is called from noinstr interrupt contexts + * and must be inlined to not get instrumentation. + */ +static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { __this_cpu_write(irq_stat.kvm_cpu_l1tf_flush_l1d, 1); } @@ -77,7 +81,7 @@ static __always_inline bool kvm_get_cpu_l1tf_flush_l1d(void) return __this_cpu_read(irq_stat.kvm_cpu_l1tf_flush_l1d); } #else /* !IS_ENABLED(CONFIG_KVM_INTEL) */ -static inline void kvm_set_cpu_l1tf_flush_l1d(void) { } +static __always_inline void kvm_set_cpu_l1tf_flush_l1d(void) { } #endif /* IS_ENABLED(CONFIG_KVM_INTEL) */ #endif /* _ASM_X86_HARDIRQ_H */ diff --git a/arch/x86/include/asm/idtentry.h b/arch/x86/include/asm/idtentry.h index 0c8823965952..9c04f3c0ff18 100644 --- a/arch/x86/include/asm/idtentry.h +++ b/arch/x86/include/asm/idtentry.h @@ -212,8 +212,8 @@ __visible noinstr void func(struct pt_regs *regs, \ irqentry_state_t state = irqentry_enter(regs); \ u32 vector = (u32)(u8)error_code; \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ run_irq_on_irqstack_cond(__##func, regs, vector); \ instrumentation_end(); \ irqentry_exit(regs, state); \ @@ -250,7 +250,6 @@ static void __##func(struct pt_regs *regs); \ \ static __always_inline void instr_##func(struct pt_regs *regs) \ { \ - kvm_set_cpu_l1tf_flush_l1d(); \ run_sysvec_on_irqstack_cond(__##func, regs); \ } \ \ @@ -258,6 +257,7 @@ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ instr_##func (regs); \ instrumentation_end(); \ @@ -288,7 +288,6 @@ static __always_inline void __##func(struct pt_regs *regs); \ static __always_inline void instr_##func(struct pt_regs *regs) \ { \ __irq_enter_raw(); \ - kvm_set_cpu_l1tf_flush_l1d(); \ __##func (regs); \ __irq_exit_raw(); \ } \ @@ -297,6 +296,7 @@ __visible noinstr void func(struct pt_regs *regs) \ { \ irqentry_state_t state = irqentry_enter(regs); \ \ + kvm_set_cpu_l1tf_flush_l1d(); \ instrumentation_begin(); \ instr_##func (regs); \ instrumentation_end(); \ From 0e7877d1bb5129e63fa9f932eae95d3d7b33c520 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 2 Oct 2024 05:06:23 +0800 Subject: [PATCH 05/23] mm/filemap: return early if failed to allocate memory for split commit de60fd8ddeda2b41fbe11df11733838c5f684616 upstream. xas_split_alloc could fail with NOMEM, and in such case, it should abort early instead of keep going and fail the xas_split below. Link: https://lkml.kernel.org/r/20240416071722.45997-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20240415171857.19244-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20240415171857.19244-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Stable-dep-of: 6758c1128ceb ("mm/filemap: optimize filemap folio adding") Signed-off-by: Greg Kroah-Hartman --- mm/filemap.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index 2809b1174f04..f85c13a1b739 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -867,9 +867,12 @@ noinline int __filemap_add_folio(struct address_space *mapping, unsigned int order = xa_get_order(xas.xa, xas.xa_index); void *entry, *old = NULL; - if (order > folio_order(folio)) + if (order > folio_order(folio)) { xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), order, gfp); + if (xas_error(&xas)) + goto error; + } xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; From 3714f62ecb2b5f82ca61453d1f51a9f7d2a3392f Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 2 Oct 2024 05:06:24 +0800 Subject: [PATCH 06/23] lib/xarray: introduce a new helper xas_get_order commit a4864671ca0bf51c8e78242951741df52c06766f upstream. It can be used after xas_load to check the order of loaded entries. Compared to xa_get_order, it saves an XA_STATE and avoid a rewalk. Added new test for xas_get_order, to make the test work, we have to export xas_get_order with EXPORT_SYMBOL_GPL. Also fix a sparse warning by checking the slot value with xa_entry instead of accessing it directly, as suggested by Matthew Wilcox. [kasong@tencent.com: simplify comment, sparse warning fix, per Matthew Wilcox] Link: https://lkml.kernel.org/r/20240416071722.45997-4-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20240415171857.19244-4-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Stable-dep-of: 6758c1128ceb ("mm/filemap: optimize filemap folio adding") Signed-off-by: Greg Kroah-Hartman --- include/linux/xarray.h | 6 +++++ lib/test_xarray.c | 34 +++++++++++++++++++++++++++ lib/xarray.c | 53 ++++++++++++++++++++++++++---------------- 3 files changed, 73 insertions(+), 20 deletions(-) diff --git a/include/linux/xarray.h b/include/linux/xarray.h index 44dd6d6e01bc..0e2feb72e9e5 100644 --- a/include/linux/xarray.h +++ b/include/linux/xarray.h @@ -1530,6 +1530,7 @@ void xas_create_range(struct xa_state *); #ifdef CONFIG_XARRAY_MULTI int xa_get_order(struct xarray *, unsigned long index); +int xas_get_order(struct xa_state *xas); void xas_split(struct xa_state *, void *entry, unsigned int order); void xas_split_alloc(struct xa_state *, void *entry, unsigned int order, gfp_t); #else @@ -1538,6 +1539,11 @@ static inline int xa_get_order(struct xarray *xa, unsigned long index) return 0; } +static inline int xas_get_order(struct xa_state *xas) +{ + return 0; +} + static inline void xas_split(struct xa_state *xas, void *entry, unsigned int order) { diff --git a/lib/test_xarray.c b/lib/test_xarray.c index e77d4856442c..2e229012920b 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1756,6 +1756,39 @@ static noinline void check_get_order(struct xarray *xa) } } +static noinline void check_xas_get_order(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + unsigned int order; + unsigned long i, j; + + for (order = 0; order < max_order; order++) { + for (i = 0; i < 10; i++) { + xas_set_order(&xas, i << order, order); + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(i)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + for (j = i << order; j < (i + 1) << order; j++) { + xas_set_order(&xas, j, 0); + rcu_read_lock(); + xas_load(&xas); + XA_BUG_ON(xa, xas_get_order(&xas) != order); + rcu_read_unlock(); + } + + xas_lock(&xas); + xas_set_order(&xas, i << order, order); + xas_store(&xas, NULL); + xas_unlock(&xas); + } + } +} + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -1805,6 +1838,7 @@ static int xarray_checks(void) check_reserve(&xa0); check_multi_store(&array); check_get_order(&array); + check_xas_get_order(&array); check_xa_alloc(); check_find(&array); check_find_entry(&array); diff --git a/lib/xarray.c b/lib/xarray.c index e9bd29826e8b..341878f98c5b 100644 --- a/lib/xarray.c +++ b/lib/xarray.c @@ -1751,6 +1751,36 @@ unlock: } EXPORT_SYMBOL(xa_store_range); +/** + * xas_get_order() - Get the order of an entry. + * @xas: XArray operation state. + * + * Called after xas_load, the xas should not be in an error state. + * + * Return: A number between 0 and 63 indicating the order of the entry. + */ +int xas_get_order(struct xa_state *xas) +{ + int order = 0; + + if (!xas->xa_node) + return 0; + + for (;;) { + unsigned int slot = xas->xa_offset + (1 << order); + + if (slot >= XA_CHUNK_SIZE) + break; + if (!xa_is_sibling(xa_entry(xas->xa, xas->xa_node, slot))) + break; + order++; + } + + order += xas->xa_node->shift; + return order; +} +EXPORT_SYMBOL_GPL(xas_get_order); + /** * xa_get_order() - Get the order of an entry. * @xa: XArray. @@ -1761,30 +1791,13 @@ EXPORT_SYMBOL(xa_store_range); int xa_get_order(struct xarray *xa, unsigned long index) { XA_STATE(xas, xa, index); - void *entry; int order = 0; + void *entry; rcu_read_lock(); entry = xas_load(&xas); - - if (!entry) - goto unlock; - - if (!xas.xa_node) - goto unlock; - - for (;;) { - unsigned int slot = xas.xa_offset + (1 << order); - - if (slot >= XA_CHUNK_SIZE) - break; - if (!xa_is_sibling(xas.xa_node->slots[slot])) - break; - order++; - } - - order += xas.xa_node->shift; -unlock: + if (entry) + order = xas_get_order(&xas); rcu_read_unlock(); return order; From fa47f96ee750c9591cb547595dcbd2955d87a924 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Wed, 2 Oct 2024 05:06:25 +0800 Subject: [PATCH 07/23] mm/filemap: optimize filemap folio adding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6758c1128ceb45d1a35298912b974eb4895b7dd9 upstream. Instead of doing multiple tree walks, do one optimism range check with lock hold, and exit if raced with another insertion. If a shadow exists, check it with a new xas_get_order helper before releasing the lock to avoid redundant tree walks for getting its order. Drop the lock and do the allocation only if a split is needed. In the best case, it only need to walk the tree once. If it needs to alloc and split, 3 walks are issued (One for first ranged conflict check and order retrieving, one for the second check after allocation, one for the insert after split). Testing with 4K pages, in an 8G cgroup, with 16G brd as block device: echo 3 > /proc/sys/vm/drop_caches fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap --rw=randread --time_based \ --ramp_time=30s --runtime=5m --group_reporting Before: bw ( MiB/s): min= 1027, max= 3520, per=100.00%, avg=2445.02, stdev=18.90, samples=8691 iops : min=263001, max=901288, avg=625924.36, stdev=4837.28, samples=8691 After (+7.3%): bw ( MiB/s): min= 493, max= 3947, per=100.00%, avg=2625.56, stdev=25.74, samples=8651 iops : min=126454, max=1010681, avg=672142.61, stdev=6590.48, samples=8651 Test result with THP (do a THP randread then switch to 4K page in hope it issues a lot of splitting): echo 3 > /proc/sys/vm/drop_caches fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap -thp=1 --readonly \ --rw=randread --time_based --ramp_time=30s --runtime=10m \ --group_reporting fio -name=cached --numjobs=16 --filename=/mnt/test.img \ --buffered=1 --ioengine=mmap \ --rw=randread --time_based --runtime=5s --group_reporting Before: bw ( KiB/s): min= 4141, max=14202, per=100.00%, avg=7935.51, stdev=96.85, samples=18976 iops : min= 1029, max= 3548, avg=1979.52, stdev=24.23, samples=18976· READ: bw=4545B/s (4545B/s), 4545B/s-4545B/s (4545B/s-4545B/s), io=64.0KiB (65.5kB), run=14419-14419msec After (+12.5%): bw ( KiB/s): min= 4611, max=15370, per=100.00%, avg=8928.74, stdev=105.17, samples=19146 iops : min= 1151, max= 3842, avg=2231.27, stdev=26.29, samples=19146 READ: bw=4635B/s (4635B/s), 4635B/s-4635B/s (4635B/s-4635B/s), io=64.0KiB (65.5kB), run=14137-14137msec The performance is better for both 4K (+7.5%) and THP (+12.5%) cached read. Link: https://lkml.kernel.org/r/20240415171857.19244-5-ryncsn@gmail.com Signed-off-by: Kairui Song Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton Closes: https://lore.kernel.org/linux-mm/A5A976CB-DB57-4513-A700-656580488AB6@flyingcircus.io/ [ kasong@tencent.com: minor adjustment of variable declarations ] Signed-off-by: Greg Kroah-Hartman --- lib/test_xarray.c | 59 +++++++++++++++++++++++++++++++++++++++++++++++ mm/filemap.c | 53 +++++++++++++++++++++++++++++++----------- 2 files changed, 98 insertions(+), 14 deletions(-) diff --git a/lib/test_xarray.c b/lib/test_xarray.c index 2e229012920b..542926da61a3 100644 --- a/lib/test_xarray.c +++ b/lib/test_xarray.c @@ -1789,6 +1789,64 @@ static noinline void check_xas_get_order(struct xarray *xa) } } +static noinline void check_xas_conflict_get_order(struct xarray *xa) +{ + XA_STATE(xas, xa, 0); + + void *entry; + int only_once; + unsigned int max_order = IS_ENABLED(CONFIG_XARRAY_MULTI) ? 20 : 1; + unsigned int order; + unsigned long i, j, k; + + for (order = 0; order < max_order; order++) { + for (i = 0; i < 10; i++) { + xas_set_order(&xas, i << order, order); + do { + xas_lock(&xas); + xas_store(&xas, xa_mk_value(i)); + xas_unlock(&xas); + } while (xas_nomem(&xas, GFP_KERNEL)); + + /* + * Ensure xas_get_order works with xas_for_each_conflict. + */ + j = i << order; + for (k = 0; k < order; k++) { + only_once = 0; + xas_set_order(&xas, j + (1 << k), k); + xas_lock(&xas); + xas_for_each_conflict(&xas, entry) { + XA_BUG_ON(xa, entry != xa_mk_value(i)); + XA_BUG_ON(xa, xas_get_order(&xas) != order); + only_once++; + } + XA_BUG_ON(xa, only_once != 1); + xas_unlock(&xas); + } + + if (order < max_order - 1) { + only_once = 0; + xas_set_order(&xas, (i & ~1UL) << order, order + 1); + xas_lock(&xas); + xas_for_each_conflict(&xas, entry) { + XA_BUG_ON(xa, entry != xa_mk_value(i)); + XA_BUG_ON(xa, xas_get_order(&xas) != order); + only_once++; + } + XA_BUG_ON(xa, only_once != 1); + xas_unlock(&xas); + } + + xas_set_order(&xas, i << order, order); + xas_lock(&xas); + xas_store(&xas, NULL); + xas_unlock(&xas); + } + } +} + + static noinline void check_destroy(struct xarray *xa) { unsigned long index; @@ -1839,6 +1897,7 @@ static int xarray_checks(void) check_multi_store(&array); check_get_order(&array); check_xas_get_order(&array); + check_xas_conflict_get_order(&array); check_xa_alloc(); check_find(&array); check_find_entry(&array); diff --git a/mm/filemap.c b/mm/filemap.c index f85c13a1b739..d3b925232a59 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -841,6 +841,8 @@ noinline int __filemap_add_folio(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, index); int huge = folio_test_hugetlb(folio); + void *alloced_shadow = NULL; + int alloced_order = 0; bool charged = false; long nr = 1; @@ -863,16 +865,10 @@ noinline int __filemap_add_folio(struct address_space *mapping, folio->mapping = mapping; folio->index = xas.xa_index; - do { - unsigned int order = xa_get_order(xas.xa, xas.xa_index); + for (;;) { + int order = -1, split_order = 0; void *entry, *old = NULL; - if (order > folio_order(folio)) { - xas_split_alloc(&xas, xa_load(xas.xa, xas.xa_index), - order, gfp); - if (xas_error(&xas)) - goto error; - } xas_lock_irq(&xas); xas_for_each_conflict(&xas, entry) { old = entry; @@ -880,19 +876,33 @@ noinline int __filemap_add_folio(struct address_space *mapping, xas_set_err(&xas, -EEXIST); goto unlock; } + /* + * If a larger entry exists, + * it will be the first and only entry iterated. + */ + if (order == -1) + order = xas_get_order(&xas); + } + + /* entry may have changed before we re-acquire the lock */ + if (alloced_order && (old != alloced_shadow || order != alloced_order)) { + xas_destroy(&xas); + alloced_order = 0; } if (old) { - if (shadowp) - *shadowp = old; - /* entry may have been split before we acquired lock */ - order = xa_get_order(xas.xa, xas.xa_index); - if (order > folio_order(folio)) { + if (order > 0 && order > folio_order(folio)) { /* How to handle large swap entries? */ BUG_ON(shmem_mapping(mapping)); + if (!alloced_order) { + split_order = order; + goto unlock; + } xas_split(&xas, old, order); xas_reset(&xas); } + if (shadowp) + *shadowp = old; } xas_store(&xas, folio); @@ -908,9 +918,24 @@ noinline int __filemap_add_folio(struct address_space *mapping, __lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr); } + unlock: xas_unlock_irq(&xas); - } while (xas_nomem(&xas, gfp)); + + /* split needed, alloc here and retry. */ + if (split_order) { + xas_split_alloc(&xas, old, split_order, gfp); + if (xas_error(&xas)) + goto error; + alloced_shadow = old; + alloced_order = split_order; + xas_reset(&xas); + continue; + } + + if (!xas_nomem(&xas, gfp)) + break; + } if (xas_error(&xas)) goto error; From 3ff50bc627aa309e256f30ae17ac7c69cbc2e94d Mon Sep 17 00:00:00 2001 From: Jamie Bainbridge Date: Tue, 1 Oct 2024 15:04:03 +0000 Subject: [PATCH 08/23] icmp: Add counters for rate limits commit d0941130c93515411c8d66fc22bdae407b509a6d upstream. There are multiple ICMP rate limiting mechanisms: * Global limits: net.ipv4.icmp_msgs_burst/icmp_msgs_per_sec * v4 per-host limits: net.ipv4.icmp_ratelimit/ratemask * v6 per-host limits: net.ipv6.icmp_ratelimit/ratemask However, when ICMP output is limited, there is no way to tell which limit has been hit or even if the limits are responsible for the lack of ICMP output. Add counters for each of the cases above. As we are within local_bh_disable(), use the __INC stats variant. Example output: # nstat -sz "*RateLimit*" IcmpOutRateLimitGlobal 134 0.0 IcmpOutRateLimitHost 770 0.0 Icmp6OutRateLimitHost 84 0.0 Signed-off-by: Jamie Bainbridge Suggested-by: Abhishek Rawal Link: https://lore.kernel.org/r/273b32241e6b7fdc5c609e6f5ebc68caf3994342.1674605770.git.jamie.bainbridge@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Eric Dumazet Signed-off-by: Greg Kroah-Hartman --- include/uapi/linux/snmp.h | 3 +++ net/ipv4/icmp.c | 3 +++ net/ipv4/proc.c | 8 +++++--- net/ipv6/icmp.c | 4 ++++ net/ipv6/proc.c | 1 + 5 files changed, 16 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/snmp.h b/include/uapi/linux/snmp.h index 4d7470036a8b..8ecb509a84d3 100644 --- a/include/uapi/linux/snmp.h +++ b/include/uapi/linux/snmp.h @@ -95,6 +95,8 @@ enum ICMP_MIB_OUTADDRMASKS, /* OutAddrMasks */ ICMP_MIB_OUTADDRMASKREPS, /* OutAddrMaskReps */ ICMP_MIB_CSUMERRORS, /* InCsumErrors */ + ICMP_MIB_RATELIMITGLOBAL, /* OutRateLimitGlobal */ + ICMP_MIB_RATELIMITHOST, /* OutRateLimitHost */ __ICMP_MIB_MAX }; @@ -112,6 +114,7 @@ enum ICMP6_MIB_OUTMSGS, /* OutMsgs */ ICMP6_MIB_OUTERRORS, /* OutErrors */ ICMP6_MIB_CSUMERRORS, /* InCsumErrors */ + ICMP6_MIB_RATELIMITHOST, /* OutRateLimitHost */ __ICMP6_MIB_MAX }; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 31051b327e53..190988bfa3e2 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -297,6 +297,7 @@ static bool icmpv4_global_allow(struct net *net, int type, int code) if (icmp_global_allow()) return true; + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -326,6 +327,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, if (peer) inet_putpeer(peer); out: + if (!rc) + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); return rc; } diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 5386f460bd20..1f52c5f2d347 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -352,7 +352,7 @@ static void icmp_put(struct seq_file *seq) seq_puts(seq, "\nIcmp: InMsgs InErrors InCsumErrors"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " In%s", icmpmibmap[i].name); - seq_puts(seq, " OutMsgs OutErrors"); + seq_puts(seq, " OutMsgs OutErrors OutRateLimitGlobal OutRateLimitHost"); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " Out%s", icmpmibmap[i].name); seq_printf(seq, "\nIcmp: %lu %lu %lu", @@ -362,9 +362,11 @@ static void icmp_put(struct seq_file *seq) for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + icmpmibmap[i].index)); - seq_printf(seq, " %lu %lu", + seq_printf(seq, " %lu %lu %lu %lu", snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTMSGS), - snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS)); + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_OUTERRORS), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITGLOBAL), + snmp_fold_field(net->mib.icmp_statistics, ICMP_MIB_RATELIMITHOST)); for (i = 0; icmpmibmap[i].name; i++) seq_printf(seq, " %lu", atomic_long_read(ptr + (icmpmibmap[i].index | 0x100))); diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index e2af7ab99282..7bea1d82e178 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -183,6 +183,7 @@ static bool icmpv6_global_allow(struct net *net, int type) if (icmp_global_allow()) return true; + __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -224,6 +225,9 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (peer) inet_putpeer(peer); } + if (!res) + __ICMP6_INC_STATS(net, ip6_dst_idev(dst), + ICMP6_MIB_RATELIMITHOST); dst_release(dst); return res; } diff --git a/net/ipv6/proc.c b/net/ipv6/proc.c index d6306aa46bb1..e20b3705c2d2 100644 --- a/net/ipv6/proc.c +++ b/net/ipv6/proc.c @@ -94,6 +94,7 @@ static const struct snmp_mib snmp6_icmp6_list[] = { SNMP_MIB_ITEM("Icmp6OutMsgs", ICMP6_MIB_OUTMSGS), SNMP_MIB_ITEM("Icmp6OutErrors", ICMP6_MIB_OUTERRORS), SNMP_MIB_ITEM("Icmp6InCsumErrors", ICMP6_MIB_CSUMERRORS), + SNMP_MIB_ITEM("Icmp6OutRateLimitHost", ICMP6_MIB_RATELIMITHOST), SNMP_MIB_SENTINEL }; From 997ba8889611891f91e8ad83583466aeab6239a3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 1 Oct 2024 15:04:04 +0000 Subject: [PATCH 09/23] icmp: change the order of rate limits commit 8c2bd38b95f75f3d2a08c93e35303e26d480d24e upstream. ICMP messages are ratelimited : After the blamed commits, the two rate limiters are applied in this order: 1) host wide ratelimit (icmp_global_allow()) 2) Per destination ratelimit (inetpeer based) In order to avoid side-channels attacks, we need to apply the per destination check first. This patch makes the following change : 1) icmp_global_allow() checks if the host wide limit is reached. But credits are not yet consumed. This is deferred to 3) 2) The per destination limit is checked/updated. This might add a new node in inetpeer tree. 3) icmp_global_consume() consumes tokens if prior operations succeeded. This means that host wide ratelimit is still effective in keeping inetpeer tree small even under DDOS. As a bonus, I removed icmp_global.lock as the fast path can use a lock-free operation. Fixes: c0303efeab73 ("net: reduce cycles spend on ICMP replies that gets rate limited") Fixes: 4cdf507d5452 ("icmp: add a global rate limitation") Reported-by: Keyu Man Signed-off-by: Eric Dumazet Reviewed-by: David Ahern Cc: Jesper Dangaard Brouer Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20240829144641.3880376-2-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Eric Dumazet Signed-off-by: Greg Kroah-Hartman --- include/net/ip.h | 2 + net/ipv4/icmp.c | 103 ++++++++++++++++++++++++++--------------------- net/ipv6/icmp.c | 28 ++++++++----- 3 files changed, 76 insertions(+), 57 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index 4f11f7df7dd6..9d754c4a5300 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -781,6 +781,8 @@ static inline void ip_cmsg_recv(struct msghdr *msg, struct sk_buff *skb) } bool icmp_global_allow(void); +void icmp_global_consume(void); + extern int sysctl_icmp_msgs_per_sec; extern int sysctl_icmp_msgs_burst; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 190988bfa3e2..9dffdd876fef 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -222,57 +222,59 @@ int sysctl_icmp_msgs_per_sec __read_mostly = 1000; int sysctl_icmp_msgs_burst __read_mostly = 50; static struct { - spinlock_t lock; - u32 credit; + atomic_t credit; u32 stamp; -} icmp_global = { - .lock = __SPIN_LOCK_UNLOCKED(icmp_global.lock), -}; +} icmp_global; /** * icmp_global_allow - Are we allowed to send one more ICMP message ? * * Uses a token bucket to limit our ICMP messages to ~sysctl_icmp_msgs_per_sec. * Returns false if we reached the limit and can not send another packet. - * Note: called with BH disabled + * Works in tandem with icmp_global_consume(). */ bool icmp_global_allow(void) { - u32 credit, delta, incr = 0, now = (u32)jiffies; - bool rc = false; + u32 delta, now, oldstamp; + int incr, new, old; - /* Check if token bucket is empty and cannot be refilled - * without taking the spinlock. The READ_ONCE() are paired - * with the following WRITE_ONCE() in this same function. + /* Note: many cpus could find this condition true. + * Then later icmp_global_consume() could consume more credits, + * this is an acceptable race. */ - if (!READ_ONCE(icmp_global.credit)) { - delta = min_t(u32, now - READ_ONCE(icmp_global.stamp), HZ); - if (delta < HZ / 50) - return false; - } + if (atomic_read(&icmp_global.credit) > 0) + return true; - spin_lock(&icmp_global.lock); - delta = min_t(u32, now - icmp_global.stamp, HZ); - if (delta >= HZ / 50) { - incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ; - if (incr) - WRITE_ONCE(icmp_global.stamp, now); + now = jiffies; + oldstamp = READ_ONCE(icmp_global.stamp); + delta = min_t(u32, now - oldstamp, HZ); + if (delta < HZ / 50) + return false; + + incr = READ_ONCE(sysctl_icmp_msgs_per_sec) * delta / HZ; + if (!incr) + return false; + + if (cmpxchg(&icmp_global.stamp, oldstamp, now) == oldstamp) { + old = atomic_read(&icmp_global.credit); + do { + new = min(old + incr, READ_ONCE(sysctl_icmp_msgs_burst)); + } while (!atomic_try_cmpxchg(&icmp_global.credit, &old, new)); } - credit = min_t(u32, icmp_global.credit + incr, - READ_ONCE(sysctl_icmp_msgs_burst)); - if (credit) { - /* We want to use a credit of one in average, but need to randomize - * it for security reasons. - */ - credit = max_t(int, credit - prandom_u32_max(3), 0); - rc = true; - } - WRITE_ONCE(icmp_global.credit, credit); - spin_unlock(&icmp_global.lock); - return rc; + return true; } EXPORT_SYMBOL(icmp_global_allow); +void icmp_global_consume(void) +{ + int credits = get_random_u32_below(3); + + /* Note: this might make icmp_global.credit negative. */ + if (credits) + atomic_sub(credits, &icmp_global.credit); +} +EXPORT_SYMBOL(icmp_global_consume); + static bool icmpv4_mask_allow(struct net *net, int type, int code) { if (type > NR_ICMP_TYPES) @@ -289,14 +291,16 @@ static bool icmpv4_mask_allow(struct net *net, int type, int code) return false; } -static bool icmpv4_global_allow(struct net *net, int type, int code) +static bool icmpv4_global_allow(struct net *net, int type, int code, + bool *apply_ratelimit) { if (icmpv4_mask_allow(net, type, code)) return true; - if (icmp_global_allow()) + if (icmp_global_allow()) { + *apply_ratelimit = true; return true; - + } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -306,15 +310,16 @@ static bool icmpv4_global_allow(struct net *net, int type, int code) */ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, - struct flowi4 *fl4, int type, int code) + struct flowi4 *fl4, int type, int code, + bool apply_ratelimit) { struct dst_entry *dst = &rt->dst; struct inet_peer *peer; bool rc = true; int vif; - if (icmpv4_mask_allow(net, type, code)) - goto out; + if (!apply_ratelimit) + return true; /* No rate limit on loopback */ if (dst->dev && (dst->dev->flags&IFF_LOOPBACK)) @@ -329,6 +334,8 @@ static bool icmpv4_xrlim_allow(struct net *net, struct rtable *rt, out: if (!rc) __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITHOST); + else + icmp_global_consume(); return rc; } @@ -400,6 +407,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) struct ipcm_cookie ipc; struct rtable *rt = skb_rtable(skb); struct net *net = dev_net(rt->dst.dev); + bool apply_ratelimit = false; struct flowi4 fl4; struct sock *sk; struct inet_sock *inet; @@ -411,11 +419,11 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) if (ip_options_echo(net, &icmp_param->replyopts.opt.opt, skb)) return; - /* Needed by both icmp_global_allow and icmp_xmit_lock */ + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); - /* global icmp_msgs_per_sec */ - if (!icmpv4_global_allow(net, type, code)) + /* is global icmp_msgs_per_sec exhausted ? */ + if (!icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); @@ -448,7 +456,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) goto out_unlock; - if (icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + if (icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) icmp_push_reply(sk, icmp_param, &fl4, &ipc, &rt); ip_rt_put(rt); out_unlock: @@ -592,6 +600,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, int room; struct icmp_bxm icmp_param; struct rtable *rt = skb_rtable(skb_in); + bool apply_ratelimit = false; struct ipcm_cookie ipc; struct flowi4 fl4; __be32 saddr; @@ -673,7 +682,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, } } - /* Needed by both icmp_global_allow and icmp_xmit_lock */ + /* Needed by both icmpv4_global_allow and icmp_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit, unless @@ -681,7 +690,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, * loopback, then peer ratelimit still work (in icmpv4_xrlim_allow) */ if (!(skb_in->dev && (skb_in->dev->flags&IFF_LOOPBACK)) && - !icmpv4_global_allow(net, type, code)) + !icmpv4_global_allow(net, type, code, &apply_ratelimit)) goto out_bh_enable; sk = icmp_xmit_lock(net); @@ -740,7 +749,7 @@ void __icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info, goto out_unlock; /* peer icmp_ratelimit */ - if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code)) + if (!icmpv4_xrlim_allow(net, rt, &fl4, type, code, apply_ratelimit)) goto ende; /* RFC says return as much as we can without exceeding 576 bytes. */ diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index 7bea1d82e178..ed8cdf7b8b11 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -175,14 +175,16 @@ static bool icmpv6_mask_allow(struct net *net, int type) return false; } -static bool icmpv6_global_allow(struct net *net, int type) +static bool icmpv6_global_allow(struct net *net, int type, + bool *apply_ratelimit) { if (icmpv6_mask_allow(net, type)) return true; - if (icmp_global_allow()) + if (icmp_global_allow()) { + *apply_ratelimit = true; return true; - + } __ICMP_INC_STATS(net, ICMP_MIB_RATELIMITGLOBAL); return false; } @@ -191,13 +193,13 @@ static bool icmpv6_global_allow(struct net *net, int type) * Check the ICMP output rate limit */ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, - struct flowi6 *fl6) + struct flowi6 *fl6, bool apply_ratelimit) { struct net *net = sock_net(sk); struct dst_entry *dst; bool res = false; - if (icmpv6_mask_allow(net, type)) + if (!apply_ratelimit) return true; /* @@ -228,6 +230,8 @@ static bool icmpv6_xrlim_allow(struct sock *sk, u8 type, if (!res) __ICMP6_INC_STATS(net, ip6_dst_idev(dst), ICMP6_MIB_RATELIMITHOST); + else + icmp_global_consume(); dst_release(dst); return res; } @@ -454,6 +458,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, struct net *net; struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; + bool apply_ratelimit = false; struct dst_entry *dst; struct icmp6hdr tmp_hdr; struct flowi6 fl6; @@ -535,11 +540,12 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, return; } - /* Needed by both icmp_global_allow and icmpv6_xmit_lock */ + /* Needed by both icmpv6_global_allow and icmpv6_xmit_lock */ local_bh_disable(); /* Check global sysctl_icmp_msgs_per_sec ratelimit */ - if (!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, type)) + if (!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, type, &apply_ratelimit)) goto out_bh_enable; mip6_addr_swap(skb, parm); @@ -577,7 +583,7 @@ void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, np = inet6_sk(sk); - if (!icmpv6_xrlim_allow(sk, type, &fl6)) + if (!icmpv6_xrlim_allow(sk, type, &fl6, apply_ratelimit)) goto out; tmp_hdr.icmp6_type = type; @@ -719,6 +725,7 @@ static void icmpv6_echo_reply(struct sk_buff *skb) struct ipv6_pinfo *np; const struct in6_addr *saddr = NULL; struct icmp6hdr *icmph = icmp6_hdr(skb); + bool apply_ratelimit = false; struct icmp6hdr tmp_hdr; struct flowi6 fl6; struct icmpv6_msg msg; @@ -782,8 +789,9 @@ static void icmpv6_echo_reply(struct sk_buff *skb) goto out; /* Check the ratelimit */ - if ((!(skb->dev->flags & IFF_LOOPBACK) && !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY)) || - !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6)) + if ((!(skb->dev->flags & IFF_LOOPBACK) && + !icmpv6_global_allow(net, ICMPV6_ECHO_REPLY, &apply_ratelimit)) || + !icmpv6_xrlim_allow(sk, ICMPV6_ECHO_REPLY, &fl6, apply_ratelimit)) goto out_dst_release; idev = __in6_dev_get(skb->dev); From f6633a3e1e9b9a6b00c6c268258f2d42812ab508 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Tue, 10 Sep 2024 22:55:08 -0700 Subject: [PATCH 10/23] bpf: lsm: Set bpf_lsm_blob_sizes.lbs_task to 0 commit 300a90b2cb5d442879e6398920c49aebbd5c8e40 upstream. bpf task local storage is now using task_struct->bpf_storage, so bpf_lsm_blob_sizes.lbs_task is no longer needed. Remove it to save some memory. Fixes: a10787e6d58c ("bpf: Enable task local storage for tracing programs") Cc: stable@vger.kernel.org Cc: KP Singh Cc: Matt Bobrowski Signed-off-by: Song Liu Acked-by: Matt Bobrowski Link: https://lore.kernel.org/r/20240911055508.9588-1-song@kernel.org Signed-off-by: Alexei Starovoitov Signed-off-by: Greg Kroah-Hartman --- security/bpf/hooks.c | 1 - 1 file changed, 1 deletion(-) diff --git a/security/bpf/hooks.c b/security/bpf/hooks.c index e5971fa74fd7..d2925f431216 100644 --- a/security/bpf/hooks.c +++ b/security/bpf/hooks.c @@ -24,7 +24,6 @@ static int __init bpf_lsm_init(void) struct lsm_blob_sizes bpf_lsm_blob_sizes __lsm_ro_after_init = { .lbs_inode = sizeof(struct bpf_storage_blob), - .lbs_task = sizeof(struct bpf_storage_blob), }; DEFINE_LSM(bpf) = { From a28711c53e3b1aaaa62738764c045938a1535c24 Mon Sep 17 00:00:00 2001 From: Zhiguo Niu Date: Thu, 20 Jun 2024 22:54:34 +0000 Subject: [PATCH 11/23] lockdep: fix deadlock issue between lockdep and rcu commit a6f88ac32c6e63e69c595bfae220d8641704c9b7 upstream. There is a deadlock scenario between lockdep and rcu when rcu nocb feature is enabled, just as following call stack: rcuop/x -000|queued_spin_lock_slowpath(lock = 0xFFFFFF817F2A8A80, val = ?) -001|queued_spin_lock(inline) // try to hold nocb_gp_lock -001|do_raw_spin_lock(lock = 0xFFFFFF817F2A8A80) -002|__raw_spin_lock_irqsave(inline) -002|_raw_spin_lock_irqsave(lock = 0xFFFFFF817F2A8A80) -003|wake_nocb_gp_defer(inline) -003|__call_rcu_nocb_wake(rdp = 0xFFFFFF817F30B680) -004|__call_rcu_common(inline) -004|call_rcu(head = 0xFFFFFFC082EECC28, func = ?) -005|call_rcu_zapped(inline) -005|free_zapped_rcu(ch = ?)// hold graph lock -006|rcu_do_batch(rdp = 0xFFFFFF817F245680) -007|nocb_cb_wait(inline) -007|rcu_nocb_cb_kthread(arg = 0xFFFFFF817F245680) -008|kthread(_create = 0xFFFFFF80803122C0) -009|ret_from_fork(asm) rcuop/y -000|queued_spin_lock_slowpath(lock = 0xFFFFFFC08291BBC8, val = 0) -001|queued_spin_lock() -001|lockdep_lock() -001|graph_lock() // try to hold graph lock -002|lookup_chain_cache_add() -002|validate_chain() -003|lock_acquire -004|_raw_spin_lock_irqsave(lock = 0xFFFFFF817F211D80) -005|lock_timer_base(inline) -006|mod_timer(inline) -006|wake_nocb_gp_defer(inline)// hold nocb_gp_lock -006|__call_rcu_nocb_wake(rdp = 0xFFFFFF817F2A8680) -007|__call_rcu_common(inline) -007|call_rcu(head = 0xFFFFFFC0822E0B58, func = ?) -008|call_rcu_hurry(inline) -008|rcu_sync_call(inline) -008|rcu_sync_func(rhp = 0xFFFFFFC0822E0B58) -009|rcu_do_batch(rdp = 0xFFFFFF817F266680) -010|nocb_cb_wait(inline) -010|rcu_nocb_cb_kthread(arg = 0xFFFFFF817F266680) -011|kthread(_create = 0xFFFFFF8080363740) -012|ret_from_fork(asm) rcuop/x and rcuop/y are rcu nocb threads with the same nocb gp thread. This patch release the graph lock before lockdep call_rcu. Fixes: a0b0fd53e1e6 ("locking/lockdep: Free lock classes that are no longer in use") Cc: stable@vger.kernel.org Cc: Boqun Feng Cc: Waiman Long Cc: Carlos Llamas Cc: Bart Van Assche Signed-off-by: Zhiguo Niu Signed-off-by: Xuewen Yan Reviewed-by: Waiman Long Reviewed-by: Carlos Llamas Reviewed-by: Bart Van Assche Signed-off-by: Carlos Llamas Acked-by: Paul E. McKenney Signed-off-by: Boqun Feng Link: https://lore.kernel.org/r/20240620225436.3127927-1-cmllamas@google.com Signed-off-by: Greg Kroah-Hartman --- kernel/locking/lockdep.c | 50 ++++++++++++++++++++++++++-------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 3b38303ed27b..7fb9731bc5f5 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -6064,25 +6064,27 @@ static struct pending_free *get_pending_free(void) static void free_zapped_rcu(struct rcu_head *cb); /* - * Schedule an RCU callback if no RCU callback is pending. Must be called with - * the graph lock held. - */ -static void call_rcu_zapped(struct pending_free *pf) +* See if we need to queue an RCU callback, must called with +* the lockdep lock held, returns false if either we don't have +* any pending free or the callback is already scheduled. +* Otherwise, a call_rcu() must follow this function call. +*/ +static bool prepare_call_rcu_zapped(struct pending_free *pf) { WARN_ON_ONCE(inside_selftest()); if (list_empty(&pf->zapped)) - return; + return false; if (delayed_free.scheduled) - return; + return false; delayed_free.scheduled = true; WARN_ON_ONCE(delayed_free.pf + delayed_free.index != pf); delayed_free.index ^= 1; - call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + return true; } /* The caller must hold the graph lock. May be called from RCU context. */ @@ -6108,6 +6110,7 @@ static void free_zapped_rcu(struct rcu_head *ch) { struct pending_free *pf; unsigned long flags; + bool need_callback; if (WARN_ON_ONCE(ch != &delayed_free.rcu_head)) return; @@ -6119,14 +6122,18 @@ static void free_zapped_rcu(struct rcu_head *ch) pf = delayed_free.pf + (delayed_free.index ^ 1); __free_zapped_classes(pf); delayed_free.scheduled = false; - - /* - * If there's anything on the open list, close and start a new callback. - */ - call_rcu_zapped(delayed_free.pf + delayed_free.index); - + need_callback = + prepare_call_rcu_zapped(delayed_free.pf + delayed_free.index); lockdep_unlock(); raw_local_irq_restore(flags); + + /* + * If there's pending free and its callback has not been scheduled, + * queue an RCU callback. + */ + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + } /* @@ -6166,6 +6173,7 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) { struct pending_free *pf; unsigned long flags; + bool need_callback; init_data_structures_once(); @@ -6173,10 +6181,11 @@ static void lockdep_free_key_range_reg(void *start, unsigned long size) lockdep_lock(); pf = get_pending_free(); __lockdep_free_key_range(pf, start, size); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); lockdep_unlock(); raw_local_irq_restore(flags); - + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); /* * Wait for any possible iterators from look_up_lock_class() to pass * before continuing to free the memory they refer to. @@ -6270,6 +6279,7 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock) struct pending_free *pf; unsigned long flags; int locked; + bool need_callback = false; raw_local_irq_save(flags); locked = graph_lock(); @@ -6278,11 +6288,13 @@ static void lockdep_reset_lock_reg(struct lockdep_map *lock) pf = get_pending_free(); __lockdep_reset_lock(pf, lock); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); graph_unlock(); out_irq: raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); } /* @@ -6326,6 +6338,7 @@ void lockdep_unregister_key(struct lock_class_key *key) struct pending_free *pf; unsigned long flags; bool found = false; + bool need_callback = false; might_sleep(); @@ -6346,11 +6359,14 @@ void lockdep_unregister_key(struct lock_class_key *key) if (found) { pf = get_pending_free(); __lockdep_free_key_range(pf, key, 1); - call_rcu_zapped(pf); + need_callback = prepare_call_rcu_zapped(pf); } lockdep_unlock(); raw_local_irq_restore(flags); + if (need_callback) + call_rcu(&delayed_free.rcu_head, free_zapped_rcu); + /* Wait until is_dynamic_key() has finished accessing k->hash_entry. */ synchronize_rcu(); } From 1c19a8ae09f92b02507a53d522c118c39b74cc03 Mon Sep 17 00:00:00 2001 From: David Gow Date: Sat, 3 Aug 2024 15:46:41 +0800 Subject: [PATCH 12/23] mm: only enforce minimum stack gap size if it's sensible commit 69b50d4351ed924f29e3d46b159e28f70dfc707f upstream. The generic mmap_base code tries to leave a gap between the top of the stack and the mmap base address, but enforces a minimum gap size (MIN_GAP) of 128MB, which is too large on some setups. In particular, on arm tasks without ADDR_LIMIT_32BIT, the STACK_TOP value is less than 128MB, so it's impossible to fit such a gap in. Only enforce this minimum if MIN_GAP < MAX_GAP, as we'd prefer to honour MAX_GAP, which is defined proportionally, so scales better and always leaves us with both _some_ stack space and some room for mmap. This fixes the usercopy KUnit test suite on 32-bit arm, as it doesn't set any personality flags so gets the default (in this case 26-bit) task size. This test can be run with: ./tools/testing/kunit/kunit.py run --arch arm usercopy --make_options LLVM=1 Link: https://lkml.kernel.org/r/20240803074642.1849623-2-davidgow@google.com Fixes: dba79c3df4a2 ("arm: use generic mmap top-down layout and brk randomization") Signed-off-by: David Gow Reviewed-by: Kees Cook Cc: Alexandre Ghiti Cc: Linus Walleij Cc: Luis Chamberlain Cc: Mark Rutland Cc: Russell King Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/util.c b/mm/util.c index aa4f8a45dd56..94fff247831b 100644 --- a/mm/util.c +++ b/mm/util.c @@ -399,7 +399,7 @@ static unsigned long mmap_base(unsigned long rnd, struct rlimit *rlim_stack) if (gap + pad > gap) gap += pad; - if (gap < MIN_GAP) + if (gap < MIN_GAP && MIN_GAP < MAX_GAP) gap = MIN_GAP; else if (gap > MAX_GAP) gap = MAX_GAP; From f7f78d7b0a5f8fd6234bb3703d20a717bdff253e Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Tue, 11 Jun 2024 09:50:32 +0200 Subject: [PATCH 13/23] module: Fix KCOV-ignored file name commit f34d086fb7102fec895fd58b9e816b981b284c17 upstream. module.c was renamed to main.c, but the Makefile directive was copy-pasted verbatim with the old file name. Fix up the file name. Fixes: cfc1d277891e ("module: Move all into module/") Signed-off-by: Dmitry Vyukov Signed-off-by: Thomas Gleixner Reviewed-by: Alexander Potapenko Reviewed-by: Marco Elver Reviewed-by: Andrey Konovalov Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/bc0cf790b4839c5e38e2fafc64271f620568a39e.1718092070.git.dvyukov@google.com Signed-off-by: Greg Kroah-Hartman --- kernel/module/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/module/Makefile b/kernel/module/Makefile index 948efea81e85..bdd32929994c 100644 --- a/kernel/module/Makefile +++ b/kernel/module/Makefile @@ -5,7 +5,7 @@ # These are called from save_stack_trace() on slub debug path, # and produce insane amounts of uninteresting coverage. -KCOV_INSTRUMENT_module.o := n +KCOV_INSTRUMENT_main.o := n obj-y += main.o strict_rwx.o obj-$(CONFIG_MODULE_DECOMPRESS) += decompress.o From 417a4714540c2eeea2b54423774f19877d715db0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Wed, 4 Sep 2024 17:12:04 -0700 Subject: [PATCH 14/23] mm/damon/vaddr: protect vma traversal in __damon_va_thre_regions() with rcu read lock commit fb497d6db7c19c797cbd694b52d1af87c4eebcc6 upstream. Traversing VMAs of a given maple tree should be protected by rcu read lock. However, __damon_va_three_regions() is not doing the protection. Hold the lock. Link: https://lkml.kernel.org/r/20240905001204.1481-1-sj@kernel.org Fixes: d0cf3dd47f0d ("damon: convert __damon_va_three_regions to use the VMA iterator") Signed-off-by: Liam R. Howlett Signed-off-by: SeongJae Park Reported-by: Guenter Roeck Closes: https://lore.kernel.org/b83651a0-5b24-4206-b860-cb54ffdf209b@roeck-us.net Tested-by: Guenter Roeck Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/damon/vaddr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 26d561af74e4..260f0b775bfa 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -126,6 +126,7 @@ static int __damon_va_three_regions(struct mm_struct *mm, * If this is too slow, it can be optimised to examine the maple * tree gaps. */ + rcu_read_lock(); for_each_vma(vmi, vma) { unsigned long gap; @@ -146,6 +147,7 @@ static int __damon_va_three_regions(struct mm_struct *mm, next: prev = vma; } + rcu_read_unlock(); if (!sz_range(&second_gap) || !sz_range(&first_gap)) return -EINVAL; From f0649991642a6fe345e0593c7d828c6f9d772313 Mon Sep 17 00:00:00 2001 From: Tommy Huang Date: Wed, 11 Sep 2024 17:39:51 +0800 Subject: [PATCH 15/23] i2c: aspeed: Update the stop sw state when the bus recovery occurs commit 93701d3b84ac5f3ea07259d4ced405c53d757985 upstream. When the i2c bus recovery occurs, driver will send i2c stop command in the scl low condition. In this case the sw state will still keep original situation. Under multi-master usage, i2c bus recovery will be called when i2c transfer timeout occurs. Update the stop command calling with aspeed_i2c_do_stop function to update master_state. Fixes: f327c686d3ba ("i2c: aspeed: added driver for Aspeed I2C") Cc: stable@vger.kernel.org # v4.13+ Signed-off-by: Tommy Huang Signed-off-by: Andi Shyti Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-aspeed.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/i2c/busses/i2c-aspeed.c b/drivers/i2c/busses/i2c-aspeed.c index 86daf791aa27..d057974340b7 100644 --- a/drivers/i2c/busses/i2c-aspeed.c +++ b/drivers/i2c/busses/i2c-aspeed.c @@ -170,6 +170,13 @@ struct aspeed_i2c_bus { static int aspeed_i2c_reset(struct aspeed_i2c_bus *bus); +/* precondition: bus.lock has been acquired. */ +static void aspeed_i2c_do_stop(struct aspeed_i2c_bus *bus) +{ + bus->master_state = ASPEED_I2C_MASTER_STOP; + writel(ASPEED_I2CD_M_STOP_CMD, bus->base + ASPEED_I2C_CMD_REG); +} + static int aspeed_i2c_recover_bus(struct aspeed_i2c_bus *bus) { unsigned long time_left, flags; @@ -187,7 +194,7 @@ static int aspeed_i2c_recover_bus(struct aspeed_i2c_bus *bus) command); reinit_completion(&bus->cmd_complete); - writel(ASPEED_I2CD_M_STOP_CMD, bus->base + ASPEED_I2C_CMD_REG); + aspeed_i2c_do_stop(bus); spin_unlock_irqrestore(&bus->lock, flags); time_left = wait_for_completion_timeout( @@ -390,13 +397,6 @@ static void aspeed_i2c_do_start(struct aspeed_i2c_bus *bus) writel(command, bus->base + ASPEED_I2C_CMD_REG); } -/* precondition: bus.lock has been acquired. */ -static void aspeed_i2c_do_stop(struct aspeed_i2c_bus *bus) -{ - bus->master_state = ASPEED_I2C_MASTER_STOP; - writel(ASPEED_I2CD_M_STOP_CMD, bus->base + ASPEED_I2C_CMD_REG); -} - /* precondition: bus.lock has been acquired. */ static void aspeed_i2c_next_msg_or_stop(struct aspeed_i2c_bus *bus) { From 5d96aca2862627573a60e3e5eda48fee80642969 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 11 Sep 2024 18:39:14 +0300 Subject: [PATCH 16/23] i2c: isch: Add missed 'else' commit 1db4da55070d6a2754efeb3743f5312fc32f5961 upstream. In accordance with the existing comment and code analysis it is quite likely that there is a missed 'else' when adapter times out. Add it. Fixes: 5bc1200852c3 ("i2c: Add Intel SCH SMBus support") Signed-off-by: Andy Shevchenko Cc: # v2.6.27+ Signed-off-by: Andi Shyti Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-isch.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-isch.c b/drivers/i2c/busses/i2c-isch.c index 2dc7ada06ac5..d283e6bb1ff8 100644 --- a/drivers/i2c/busses/i2c-isch.c +++ b/drivers/i2c/busses/i2c-isch.c @@ -99,8 +99,7 @@ static int sch_transaction(void) if (retries > MAX_RETRIES) { dev_err(&sch_adapter.dev, "SMBus Timeout!\n"); result = -ETIMEDOUT; - } - if (temp & 0x04) { + } else if (temp & 0x04) { result = -EIO; dev_dbg(&sch_adapter.dev, "Bus collision! SMBus may be " "locked until next hard reset. (sorry!)\n"); From 0cf462ae2a0286a5adf937682791ef38b22d84e3 Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 18 Dec 2023 22:36:35 -0800 Subject: [PATCH 17/23] usb: yurex: Fix inconsistent locking bug in yurex_read() commit e7d3b9f28654dbfce7e09f8028210489adaf6a33 upstream. Unlock before returning on the error path. Fixes: 86b20af11e84 ("usb: yurex: Replace snprintf() with the safer scnprintf() variant") Reported-by: Dan Carpenter Reported-by: kernel test robot Closes: https://lore.kernel.org/r/202312170252.3udgrIcP-lkp@intel.com/ Signed-off-by: Harshit Mogalapalli Link: https://lore.kernel.org/r/20231219063639.450994-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/misc/yurex.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/misc/yurex.c b/drivers/usb/misc/yurex.c index 44136989f6c6..6adaaf66c14d 100644 --- a/drivers/usb/misc/yurex.c +++ b/drivers/usb/misc/yurex.c @@ -413,8 +413,10 @@ static ssize_t yurex_read(struct file *file, char __user *buffer, size_t count, return -ENODEV; } - if (WARN_ON_ONCE(dev->bbu > S64_MAX || dev->bbu < S64_MIN)) + if (WARN_ON_ONCE(dev->bbu > S64_MAX || dev->bbu < S64_MIN)) { + mutex_unlock(&dev->io_mutex); return -EIO; + } spin_lock_irq(&dev->lock); scnprintf(in_buffer, MAX_S64_STRLEN, "%lld\n", dev->bbu); From 85045e51c730e541d83b222d52659caaf43d23ff Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 11 Dec 2023 19:27:28 +0000 Subject: [PATCH 18/23] perf/arm-cmn: Fail DTC counter allocation correctly commit 1892fe103c3a20fced306c8dafa74f7f6d4ea0a3 upstream. Calling arm_cmn_event_clear() before all DTC indices are allocated is wrong, and can lead to arm_cmn_event_add() erroneously clearing live counters from full DTCs where allocation fails. Since the DTC counters are only updated by arm_cmn_init_counter() after all DTC and DTM allocations succeed, nothing actually needs cleaning up in this case anyway, and it should just return directly as it did before. Fixes: 7633ec2c262f ("perf/arm-cmn: Rework DTC counters (again)") Signed-off-by: Robin Murphy Reviewed-by: Ilkka Koskinen Acked-by: Will Deacon Link: https://lore.kernel.org/r/ed589c0d8e4130dc68b8ad1625226d28bdc185d4.1702322847.git.robin.murphy@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 0d68da9dc358..b8983f42e749 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -1671,7 +1671,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags) idx = 0; while (cmn->dtc[j].counters[idx]) if (++idx == CMN_DT_NUM_COUNTERS) - goto free_dtms; + return -ENOSPC; } hw->dtc_idx[j] = idx; } From 97ab36524367e4e0b2bd242dd69657708a7995d3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20Apitzsch?= Date: Sun, 1 Oct 2023 18:09:56 +0200 Subject: [PATCH 19/23] iio: magnetometer: ak8975: Fix 'Unexpected device' error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 848f68c760ab1e14a9046ea6e45e3304ab9fa50b upstream. Explicity specify array indices to fix mapping between asahi_compass_chipset and ak_def_array. While at it, remove unneeded AKXXXX. Fixes: 4f9ea93afde1 ("iio: magnetometer: ak8975: Convert enum->pointer for data in the match tables") Signed-off-by: André Apitzsch Reviewed-by: Biju Das Reviewed-by: Andy Shevchenko Link: https://lore.kernel.org/r/20231001-ak_magnetometer-v1-1-09bf3b8798a3@apitzsch.eu Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/magnetometer/ak8975.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/drivers/iio/magnetometer/ak8975.c b/drivers/iio/magnetometer/ak8975.c index d2b606bf13ea..2a7995f3643b 100644 --- a/drivers/iio/magnetometer/ak8975.c +++ b/drivers/iio/magnetometer/ak8975.c @@ -204,7 +204,6 @@ static long ak09912_raw_to_gauss(u16 data) /* Compatible Asahi Kasei Compass parts */ enum asahi_compass_chipset { - AKXXXX = 0, AK8975, AK8963, AK09911, @@ -248,7 +247,7 @@ struct ak_def { }; static const struct ak_def ak_def_array[] = { - { + [AK8975] = { .type = AK8975, .raw_to_gauss = ak8975_raw_to_gauss, .range = 4096, @@ -273,7 +272,7 @@ static const struct ak_def ak_def_array[] = { AK8975_REG_HYL, AK8975_REG_HZL}, }, - { + [AK8963] = { .type = AK8963, .raw_to_gauss = ak8963_09911_raw_to_gauss, .range = 8190, @@ -298,7 +297,7 @@ static const struct ak_def ak_def_array[] = { AK8975_REG_HYL, AK8975_REG_HZL}, }, - { + [AK09911] = { .type = AK09911, .raw_to_gauss = ak8963_09911_raw_to_gauss, .range = 8192, @@ -323,7 +322,7 @@ static const struct ak_def ak_def_array[] = { AK09912_REG_HYL, AK09912_REG_HZL}, }, - { + [AK09912] = { .type = AK09912, .raw_to_gauss = ak09912_raw_to_gauss, .range = 32752, @@ -348,7 +347,7 @@ static const struct ak_def ak_def_array[] = { AK09912_REG_HYL, AK09912_REG_HZL}, }, - { + [AK09916] = { .type = AK09916, .raw_to_gauss = ak09912_raw_to_gauss, .range = 32752, From 6d5124a2b14ad22fee84a30a6060c6db1f08343e Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 15 Feb 2023 11:41:17 -0700 Subject: [PATCH 20/23] powerpc: Allow CONFIG_PPC64_BIG_ENDIAN_ELF_ABI_V2 with ld.lld 15+ commit a11334d8327b3fd7987cbfb38e956a44c722d88f upstream. Commit 5017b4594672 ("powerpc/64: Option to build big-endian with ELFv2 ABI") restricted the ELFv2 ABI configuration such that it can only be selected when linking with ld.bfd, due to lack of testing with LLVM. ld.lld can link ELFv2 kernels without any issues; in fact, it is the only ABI that ld.lld supports, as ELFv1 is not supported in ld.lld. As this has not seen a ton of real world testing yet, be conservative and only allow this option to be selected with the latest stable release of LLVM (15.x) and newer. While in the area, remove 'default n', as it is unnecessary to specify it explicitly since all boolean/tristate configuration symbols default to n. Tested-by: "Erhard F." Signed-off-by: Nathan Chancellor Signed-off-by: Michael Ellerman Link: https://msgid.link/20230118-ppc64-elfv2-llvm-v1-3-b9e2ec9da11d@kernel.org Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/Kconfig | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 2fa9e87b06dc..46ae22cd8bcc 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -593,8 +593,7 @@ config PPC64_BIG_ENDIAN_ELF_ABI_V2 bool "Build big-endian kernel using ELF ABI V2 (EXPERIMENTAL)" depends on PPC64 && CPU_BIG_ENDIAN depends on CC_HAS_ELFV2 - depends on LD_IS_BFD && LD_VERSION >= 22400 - default n + depends on LD_VERSION >= 22400 || LLD_VERSION >= 150000 help This builds the kernel image using the "Power Architecture 64-Bit ELF V2 ABI Specification", which has a reduced stack overhead and faster From 04ca17fbc809fc6efb5a5de4dcec377de88906b1 Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Mon, 18 Sep 2023 08:30:41 +0300 Subject: [PATCH 21/23] PCI/PM: Mark devices disconnected if upstream PCIe link is down on resume commit c82458101d5490230d735caecce14c9c27b1010c upstream. Mark Blakeney reported that when suspending system with a Thunderbolt dock connected and then unplugging the dock before resume (which is pretty normal flow with laptops), resuming takes long time. What happens is that the PCIe link from the root port to the PCIe switch inside the Thunderbolt device does not train (as expected, the link is unplugged): pcieport 0000:00:07.2: restoring config space at offset 0x24 (was 0x3bf12001, writing 0x3bf12001) pcieport 0000:00:07.0: waiting 100 ms for downstream link pcieport 0000:01:00.0: not ready 1023ms after resume; giving up However, at this point we still try to resume the devices below that unplugged link: pcieport 0000:01:00.0: Unable to change power state from D3cold to D0, device inaccessible ... pcieport 0000:01:00.0: restoring config space at offset 0x38 (was 0xffffffff, writing 0x0) ... pcieport 0000:02:02.0: waiting 100 ms for downstream link, after activation And this is the link from PCIe switch downstream port to the xHCI on the dock: xhci_hcd 0000:03:00.0: not ready 65535ms after resume; giving up xhci_hcd 0000:03:00.0: Unable to change power state from D3cold to D0, device inaccessible xhci_hcd 0000:03:00.0: restoring config space at offset 0x3c (was 0xffffffff, writing 0x1ff) This ends up slowing down the resume time considerably. For this reason mark these devices as disconnected if the link above them did not train properly. Fixes: e8b908146d44 ("PCI/PM: Increase wait time after resume") Link: https://lore.kernel.org/r/20230918053041.1018876-1-mika.westerberg@linux.intel.com Reported-by: Mark Blakeney Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217915 Signed-off-by: Mika Westerberg Signed-off-by: Bjorn Helgaas Reviewed-by: Lukas Wunner Cc: stable@vger.kernel.org # v6.4+ Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-driver.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c index cbe47fad4714..18e973a91a97 100644 --- a/drivers/pci/pci-driver.c +++ b/drivers/pci/pci-driver.c @@ -579,7 +579,19 @@ static void pci_pm_default_resume_early(struct pci_dev *pci_dev) static void pci_pm_bridge_power_up_actions(struct pci_dev *pci_dev) { - pci_bridge_wait_for_secondary_bus(pci_dev, "resume"); + int ret; + + ret = pci_bridge_wait_for_secondary_bus(pci_dev, "resume"); + if (ret) { + /* + * The downstream link failed to come up, so mark the + * devices below as disconnected to make sure we don't + * attempt to resume them. + */ + pci_walk_bus(pci_dev->subordinate, pci_dev_set_disconnected, + NULL); + return; + } /* * When powering on a bridge from D3cold, the whole hierarchy may be From 25703a3c980e21548774eea8c8a87a75c5c8f58c Mon Sep 17 00:00:00 2001 From: "Alexey Gladkov (Intel)" Date: Fri, 13 Sep 2024 19:05:56 +0200 Subject: [PATCH 22/23] x86/tdx: Fix "in-kernel MMIO" check commit d4fc4d01471528da8a9797a065982e05090e1d81 upstream. TDX only supports kernel-initiated MMIO operations. The handle_mmio() function checks if the #VE exception occurred in the kernel and rejects the operation if it did not. However, userspace can deceive the kernel into performing MMIO on its behalf. For example, if userspace can point a syscall to an MMIO address, syscall does get_user() or put_user() on it, triggering MMIO #VE. The kernel will treat the #VE as in-kernel MMIO. Ensure that the target MMIO address is within the kernel before decoding instruction. Fixes: 31d58c4e557d ("x86/tdx: Handle in-kernel MMIO") Signed-off-by: Alexey Gladkov (Intel) Signed-off-by: Dave Hansen Reviewed-by: Kirill A. Shutemov Acked-by: Dave Hansen Cc:stable@vger.kernel.org Link: https://lore.kernel.org/all/565a804b80387970460a4ebc67c88d1380f61ad1.1726237595.git.legion%40kernel.org Signed-off-by: Alexey Gladkov (Intel) Signed-off-by: Greg Kroah-Hartman --- arch/x86/coco/tdx/tdx.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/coco/tdx/tdx.c b/arch/x86/coco/tdx/tdx.c index b9da467bd222..9032fea50219 100644 --- a/arch/x86/coco/tdx/tdx.c +++ b/arch/x86/coco/tdx/tdx.c @@ -12,6 +12,7 @@ #include #include #include +#include /* TDX module Call Leaf IDs */ #define TDX_GET_INFO 1 @@ -371,6 +372,11 @@ static int handle_mmio(struct pt_regs *regs, struct ve_info *ve) return -EINVAL; } + if (!fault_in_kernel_space(ve->gla)) { + WARN_ONCE(1, "Access to userspace address is not supported"); + return -EINVAL; + } + /* * Reject EPT violation #VEs that split pages. * From dffe86df26aee01a5fc56a175b7a7f157961e370 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Thu, 23 Feb 2023 00:10:25 +0100 Subject: [PATCH 23/23] wifi: mt76: do not run mt76_unregister_device() on unregistered hw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 41130c32f3a18fcc930316da17f3a5f3bc326aa1 upstream. Trying to probe a mt7921e pci card without firmware results in a successful probe where ieee80211_register_hw hasn't been called. When removing the driver, ieee802111_unregister_hw is called unconditionally leading to a kernel NULL pointer dereference. Fix the issue running mt76_unregister_device routine just for registered hw. Link: https://bugs.debian.org/1029116 Link: https://bugs.kali.org/view.php?id=8140 Reported-by: Stuart Hayhurst Fixes: 1c71e03afe4b ("mt76: mt7921: move mt7921_init_hw in a dedicated work") Tested-by: Helmut Grohne Signed-off-by: Lorenzo Bianconi Signed-off-by: Kalle Valo Link: https://lore.kernel.org/r/be3457d82f4e44bb71a22b2b5db27b644a37b1e1.1677107277.git.lorenzo@kernel.org Signed-off-by: Georg Müller Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/mediatek/mt76/mac80211.c | 8 ++++++++ drivers/net/wireless/mediatek/mt76/mt76.h | 1 + 2 files changed, 9 insertions(+) diff --git a/drivers/net/wireless/mediatek/mt76/mac80211.c b/drivers/net/wireless/mediatek/mt76/mac80211.c index 6de13d641438..82fce4b1d581 100644 --- a/drivers/net/wireless/mediatek/mt76/mac80211.c +++ b/drivers/net/wireless/mediatek/mt76/mac80211.c @@ -522,6 +522,7 @@ int mt76_register_phy(struct mt76_phy *phy, bool vht, if (ret) return ret; + set_bit(MT76_STATE_REGISTERED, &phy->state); phy->dev->phys[phy->band_idx] = phy; return 0; @@ -532,6 +533,9 @@ void mt76_unregister_phy(struct mt76_phy *phy) { struct mt76_dev *dev = phy->dev; + if (!test_bit(MT76_STATE_REGISTERED, &phy->state)) + return; + mt76_tx_status_check(dev, true); ieee80211_unregister_hw(phy->hw); dev->phys[phy->band_idx] = NULL; @@ -654,6 +658,7 @@ int mt76_register_device(struct mt76_dev *dev, bool vht, return ret; WARN_ON(mt76_worker_setup(hw, &dev->tx_worker, NULL, "tx")); + set_bit(MT76_STATE_REGISTERED, &phy->state); sched_set_fifo_low(dev->tx_worker.task); return 0; @@ -664,6 +669,9 @@ void mt76_unregister_device(struct mt76_dev *dev) { struct ieee80211_hw *hw = dev->hw; + if (!test_bit(MT76_STATE_REGISTERED, &dev->phy.state)) + return; + if (IS_ENABLED(CONFIG_MT76_LEDS)) mt76_led_cleanup(dev); mt76_tx_status_check(dev, true); diff --git a/drivers/net/wireless/mediatek/mt76/mt76.h b/drivers/net/wireless/mediatek/mt76/mt76.h index 60c9f9c56a4f..5b03e3b33d54 100644 --- a/drivers/net/wireless/mediatek/mt76/mt76.h +++ b/drivers/net/wireless/mediatek/mt76/mt76.h @@ -388,6 +388,7 @@ struct mt76_tx_cb { enum { MT76_STATE_INITIALIZED, + MT76_STATE_REGISTERED, MT76_STATE_RUNNING, MT76_STATE_MCU_RUNNING, MT76_SCANNING,