From 8cf587aab9ce58622082ebbeb1f435141b25faae Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Mon, 6 Jan 2025 15:46:39 +0000 Subject: [PATCH 01/51] ASoC: wm8994: Add depends on MFD core [ Upstream commit 5ed01155cea69801f1f0c908954a56a5a3474bed ] The ASoC driver should not be used without the MFD component. This was causing randconfig issues with regmap IRQ which is selected by the MFD part of the wm8994 driver. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501061337.R0DlBUoD-lkp@intel.com/ Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250106154639.3999553-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/Kconfig b/sound/soc/codecs/Kconfig index 0904827e2f3d..627e9fc92f0b 100644 --- a/sound/soc/codecs/Kconfig +++ b/sound/soc/codecs/Kconfig @@ -1982,6 +1982,7 @@ config SND_SOC_WM8993 config SND_SOC_WM8994 tristate + depends on MFD_WM8994 config SND_SOC_WM8995 tristate From b873c88bfcf2cf6f09000c0b746e06c1b1334fa4 Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Tue, 7 Jan 2025 10:41:34 +0000 Subject: [PATCH 02/51] ASoC: samsung: Add missing selects for MFD_WM8994 [ Upstream commit fd55c6065bec5268740e944a1800e6fad00974d9 ] Anything selecting SND_SOC_WM8994 should also select MFD_WM8994, as SND_SOC_WM8994 does not automatically do so. Add the missing selects. Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501071530.UwIXs7OL-lkp@intel.com/ Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250107104134.12147-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/samsung/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/soc/samsung/Kconfig b/sound/soc/samsung/Kconfig index 2a61e620cd3b..6bf9fd720d4b 100644 --- a/sound/soc/samsung/Kconfig +++ b/sound/soc/samsung/Kconfig @@ -220,8 +220,9 @@ config SND_SOC_SAMSUNG_TM2_WM5110 config SND_SOC_SAMSUNG_ARIES_WM8994 tristate "SoC I2S Audio support for WM8994 on Aries" - depends on SND_SOC_SAMSUNG && MFD_WM8994 && IIO && EXTCON + depends on SND_SOC_SAMSUNG && IIO && EXTCON select SND_SOC_BT_SCO + select MFD_WM8994 select SND_SOC_WM8994 select SND_SAMSUNG_I2S help @@ -235,6 +236,7 @@ config SND_SOC_SAMSUNG_MIDAS_WM1811 tristate "SoC I2S Audio support for Midas boards" depends on SND_SOC_SAMSUNG select SND_SAMSUNG_I2S + select MFD_WM8994 select SND_SOC_WM8994 help Say Y if you want to add support for SoC audio on the Midas boards. From 18cb5798df8bf475f5cae2362e26780d10248e7a Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Wed, 8 Jan 2025 23:44:45 +0100 Subject: [PATCH 03/51] seccomp: Stub for !CONFIG_SECCOMP [ Upstream commit f90877dd7fb5085dd9abd6399daf63dd2969fc90 ] When using !CONFIG_SECCOMP with CONFIG_GENERIC_ENTRY, the randconfig bots found the following snag: kernel/entry/common.c: In function 'syscall_trace_enter': >> kernel/entry/common.c:52:23: error: implicit declaration of function '__secure_computing' [-Wimplicit-function-declaration] 52 | ret = __secure_computing(NULL); | ^~~~~~~~~~~~~~~~~~ Since generic entry calls __secure_computing() unconditionally, fix this by moving the stub out of the ifdef clause for CONFIG_HAVE_ARCH_SECCOMP_FILTER so it's always available. Link: https://lore.kernel.org/oe-kbuild-all/202501061240.Fzk9qiFZ-lkp@intel.com/ Signed-off-by: Linus Walleij Link: https://lore.kernel.org/r/20250108-seccomp-stub-2-v2-1-74523d49420f@linaro.org Signed-off-by: Kees Cook Signed-off-by: Sasha Levin --- include/linux/seccomp.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/seccomp.h b/include/linux/seccomp.h index d31d76be4982..91ff537c6246 100644 --- a/include/linux/seccomp.h +++ b/include/linux/seccomp.h @@ -69,10 +69,10 @@ struct seccomp_data; #ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER static inline int secure_computing(void) { return 0; } -static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } #else static inline void secure_computing_strict(int this_syscall) { return; } #endif +static inline int __secure_computing(const struct seccomp_data *sd) { return 0; } static inline long prctl_get_seccomp(void) { From 2104ad719366d4772f6f1254f1b9df0cbe0fc138 Mon Sep 17 00:00:00 2001 From: Xiang Zhang Date: Tue, 7 Jan 2025 10:24:31 +0800 Subject: [PATCH 04/51] scsi: iscsi: Fix redundant response for ISCSI_UEVENT_GET_HOST_STATS request [ Upstream commit 63ca02221cc5aa0731fe2b0cc28158aaa4b84982 ] The ISCSI_UEVENT_GET_HOST_STATS request is already handled in iscsi_get_host_stats(). This fix ensures that redundant responses are skipped in iscsi_if_rx(). - On success: send reply and stats from iscsi_get_host_stats() within if_recv_msg(). - On error: fall through. Signed-off-by: Xiang Zhang Link: https://lore.kernel.org/r/20250107022432.65390-1-hawkxiang.cpp@gmail.com Reviewed-by: Mike Christie Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/scsi_transport_iscsi.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 49dbcd67579a..687487ea4fd3 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -4102,7 +4102,7 @@ iscsi_if_rx(struct sk_buff *skb) } do { /* - * special case for GET_STATS: + * special case for GET_STATS, GET_CHAP and GET_HOST_STATS: * on success - sending reply and stats from * inside of if_recv_msg(), * on error - fall through. @@ -4111,6 +4111,8 @@ iscsi_if_rx(struct sk_buff *skb) break; if (ev->type == ISCSI_UEVENT_GET_CHAP && !err) break; + if (ev->type == ISCSI_UEVENT_GET_HOST_STATS && !err) + break; err = iscsi_if_send_reply(portid, nlh->nlmsg_type, ev, sizeof(*ev)); if (err == -EAGAIN && --retries < 0) { From b7d2461858ac75c9d6bc4ab8af1a738d0814b716 Mon Sep 17 00:00:00 2001 From: Tom Chung Date: Tue, 1 Oct 2024 17:13:07 +0800 Subject: [PATCH 05/51] drm/amd/display: Use HW lock mgr for PSR1 [ Upstream commit b5c764d6ed556c4e81fbe3fd976da77ec450c08e ] [Why] Without the dmub hw lock, it may cause the lock timeout issue while do modeset on PSR1 eDP panel. [How] Allow dmub hw lock for PSR1. Reviewed-by: Sun peng Li Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit a2b5a9956269f4c1a09537177f18ab0229fe79f7) Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c index 3f32e9c3fbaf..8d7b2eee8c7c 100644 --- a/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/dce/dmub_hw_lock_mgr.c @@ -65,7 +65,8 @@ void dmub_hw_lock_mgr_inbox0_cmd(struct dc_dmub_srv *dmub_srv, bool should_use_dmub_lock(struct dc_link *link) { - if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1) + if (link->psr_settings.psr_version == DC_PSR_VERSION_SU_1 || + link->psr_settings.psr_version == DC_PSR_VERSION_1) return true; return false; } From d2b4b39b75ad35f0bb661ae56163a03c5091fae2 Mon Sep 17 00:00:00 2001 From: Philippe Simons Date: Sun, 12 Jan 2025 13:34:02 +0100 Subject: [PATCH 06/51] irqchip/sunxi-nmi: Add missing SKIP_WAKE flag [ Upstream commit 3a748d483d80f066ca4b26abe45cdc0c367d13e9 ] Some boards with Allwinner SoCs connect the PMIC's IRQ pin to the SoC's NMI pin instead of a normal GPIO. Since the power key is connected to the PMIC, and people expect to wake up a suspended system via this key, the NMI IRQ controller must stay alive when the system goes into suspend. Add the SKIP_WAKE flag to prevent the sunxi NMI controller from going to sleep, so that the power key can wake up those systems. [ tglx: Fixed up coding style ] Signed-off-by: Philippe Simons Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/all/20250112123402.388520-1-simons.philippe@gmail.com Signed-off-by: Sasha Levin --- drivers/irqchip/irq-sunxi-nmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-sunxi-nmi.c b/drivers/irqchip/irq-sunxi-nmi.c index 21d49791f855..83c7417611fa 100644 --- a/drivers/irqchip/irq-sunxi-nmi.c +++ b/drivers/irqchip/irq-sunxi-nmi.c @@ -187,7 +187,8 @@ static int __init sunxi_sc_nmi_irq_init(struct device_node *node, gc->chip_types[0].chip.irq_unmask = irq_gc_mask_set_bit; gc->chip_types[0].chip.irq_eoi = irq_gc_ack_set_bit; gc->chip_types[0].chip.irq_set_type = sunxi_sc_nmi_set_type; - gc->chip_types[0].chip.flags = IRQCHIP_EOI_THREADED | IRQCHIP_EOI_IF_HANDLED; + gc->chip_types[0].chip.flags = IRQCHIP_EOI_THREADED | IRQCHIP_EOI_IF_HANDLED | + IRQCHIP_SKIP_SET_WAKE; gc->chip_types[0].regs.ack = reg_offs->pend; gc->chip_types[0].regs.mask = reg_offs->enable; gc->chip_types[0].regs.type = reg_offs->ctrl; From 656100f6566620ee7f765cc6ac67f49772a3b971 Mon Sep 17 00:00:00 2001 From: Alper Nebi Yasak Date: Wed, 2 Aug 2023 20:57:37 +0300 Subject: [PATCH 07/51] ASoC: samsung: midas_wm1811: Map missing jack kcontrols [ Upstream commit d27224a45e5457ad89195d92decdd57596253428 ] This driver does not map jack pins to kcontrols that PulseAudio/PipeWire need to handle jack detection events. The WM1811 codec used here seems to support detecting Headphone and Headset Mic connections. Expose each to userspace as a kcontrol and add the necessary widgets. Signed-off-by: Alper Nebi Yasak Link: https://lore.kernel.org/r/20230802175737.263412-28-alpernebiyasak@gmail.com Signed-off-by: Mark Brown Stable-dep-of: 704dbe97a681 ("ASoC: samsung: Add missing depends on I2C") Signed-off-by: Sasha Levin --- sound/soc/samsung/midas_wm1811.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/sound/soc/samsung/midas_wm1811.c b/sound/soc/samsung/midas_wm1811.c index 6931b9a45b3e..44b32f5cddca 100644 --- a/sound/soc/samsung/midas_wm1811.c +++ b/sound/soc/samsung/midas_wm1811.c @@ -38,6 +38,17 @@ struct midas_priv { struct snd_soc_jack headset_jack; }; +static struct snd_soc_jack_pin headset_jack_pins[] = { + { + .pin = "Headphone", + .mask = SND_JACK_HEADPHONE, + }, + { + .pin = "Headset Mic", + .mask = SND_JACK_MICROPHONE, + }, +}; + static int midas_start_fll1(struct snd_soc_pcm_runtime *rtd, unsigned int rate) { struct snd_soc_card *card = rtd->card; @@ -246,6 +257,7 @@ static const struct snd_kcontrol_new midas_controls[] = { SOC_DAPM_PIN_SWITCH("Main Mic"), SOC_DAPM_PIN_SWITCH("Sub Mic"), SOC_DAPM_PIN_SWITCH("Headset Mic"), + SOC_DAPM_PIN_SWITCH("Headphone"), SOC_DAPM_PIN_SWITCH("FM In"), }; @@ -261,6 +273,7 @@ static const struct snd_soc_dapm_widget midas_dapm_widgets[] = { SND_SOC_DAPM_LINE("HDMI", NULL), SND_SOC_DAPM_LINE("FM In", midas_fm_set), + SND_SOC_DAPM_HP("Headphone", NULL), SND_SOC_DAPM_MIC("Headset Mic", NULL), SND_SOC_DAPM_MIC("Main Mic", midas_mic_bias), SND_SOC_DAPM_MIC("Sub Mic", midas_submic_bias), @@ -305,11 +318,13 @@ static int midas_late_probe(struct snd_soc_card *card) return ret; } - ret = snd_soc_card_jack_new(card, "Headset", - SND_JACK_HEADSET | SND_JACK_MECHANICAL | - SND_JACK_BTN_0 | SND_JACK_BTN_1 | SND_JACK_BTN_2 | - SND_JACK_BTN_3 | SND_JACK_BTN_4 | SND_JACK_BTN_5, - &priv->headset_jack); + ret = snd_soc_card_jack_new_pins(card, "Headset", + SND_JACK_HEADSET | SND_JACK_MECHANICAL | + SND_JACK_BTN_0 | SND_JACK_BTN_1 | SND_JACK_BTN_2 | + SND_JACK_BTN_3 | SND_JACK_BTN_4 | SND_JACK_BTN_5, + &priv->headset_jack, + headset_jack_pins, + ARRAY_SIZE(headset_jack_pins)); if (ret) return ret; From 87d69690814fbf217761419d9a5101c42e77076a Mon Sep 17 00:00:00 2001 From: Charles Keepax Date: Wed, 8 Jan 2025 13:48:28 +0000 Subject: [PATCH 08/51] ASoC: samsung: Add missing depends on I2C [ Upstream commit 704dbe97a68153a84319ad63f526e12ba868b88e ] When switching to selects for MFD_WM8994 a dependency should have also been added for I2C, as the dependency on MFD_WM8994 will not be considered by the select. Fixes: fd55c6065bec ("ASoC: samsung: Add missing selects for MFD_WM8994") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202501082020.2bpGGVTW-lkp@intel.com/ Signed-off-by: Charles Keepax Link: https://patch.msgid.link/20250108134828.246570-1-ckeepax@opensource.cirrus.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/samsung/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/soc/samsung/Kconfig b/sound/soc/samsung/Kconfig index 6bf9fd720d4b..a529852144f9 100644 --- a/sound/soc/samsung/Kconfig +++ b/sound/soc/samsung/Kconfig @@ -220,7 +220,7 @@ config SND_SOC_SAMSUNG_TM2_WM5110 config SND_SOC_SAMSUNG_ARIES_WM8994 tristate "SoC I2S Audio support for WM8994 on Aries" - depends on SND_SOC_SAMSUNG && IIO && EXTCON + depends on SND_SOC_SAMSUNG && I2C && IIO && EXTCON select SND_SOC_BT_SCO select MFD_WM8994 select SND_SOC_WM8994 @@ -234,7 +234,7 @@ config SND_SOC_SAMSUNG_ARIES_WM8994 config SND_SOC_SAMSUNG_MIDAS_WM1811 tristate "SoC I2S Audio support for Midas boards" - depends on SND_SOC_SAMSUNG + depends on SND_SOC_SAMSUNG && I2C select SND_SAMSUNG_I2S select MFD_WM8994 select SND_SOC_WM8994 From 9fdec4786690a648f1bbfbfc0ff42eb44e542732 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Thu, 28 Nov 2024 15:16:23 +0200 Subject: [PATCH 09/51] regmap: detach regmap from dev on regmap_exit commit 3061e170381af96d1e66799d34264e6414d428a7 upstream. At the end of __regmap_init(), if dev is not NULL, regmap_attach_dev() is called, which adds a devres reference to the regmap, to be able to retrieve a dev's regmap by name using dev_get_regmap(). When calling regmap_exit, the opposite does not happen, and the reference is kept until the dev is detached. Add a regmap_detach_dev() function and call it in regmap_exit() to make sure that the devres reference is not kept. Cc: stable@vger.kernel.org Fixes: 72b39f6f2b5a ("regmap: Implement dev_get_regmap()") Signed-off-by: Cosmin Tanislav Rule: add Link: https://lore.kernel.org/stable/20241128130554.362486-1-demonsingur%40gmail.com Link: https://patch.msgid.link/20241128131625.363835-1-demonsingur@gmail.com Signed-off-by: Mark Brown Link: https://lore.kernel.org/r/20250115033244.2540522-1-tzungbi@kernel.org Signed-off-by: Tzung-Bi Shih Signed-off-by: Greg Kroah-Hartman --- drivers/base/regmap/regmap.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/base/regmap/regmap.c b/drivers/base/regmap/regmap.c index 15b37a4163d3..f0e314abcafc 100644 --- a/drivers/base/regmap/regmap.c +++ b/drivers/base/regmap/regmap.c @@ -652,6 +652,17 @@ int regmap_attach_dev(struct device *dev, struct regmap *map, } EXPORT_SYMBOL_GPL(regmap_attach_dev); +static int dev_get_regmap_match(struct device *dev, void *res, void *data); + +static int regmap_detach_dev(struct device *dev, struct regmap *map) +{ + if (!dev) + return 0; + + return devres_release(dev, dev_get_regmap_release, + dev_get_regmap_match, (void *)map->name); +} + static enum regmap_endian regmap_get_reg_endian(const struct regmap_bus *bus, const struct regmap_config *config) { @@ -1536,6 +1547,7 @@ void regmap_exit(struct regmap *map) { struct regmap_async *async; + regmap_detach_dev(map->dev, map); regcache_exit(map); regmap_debugfs_exit(map); regmap_range_exit(map); From d0ec61c9f3583b76aebdbb271f5c0d3fcccd48b2 Mon Sep 17 00:00:00 2001 From: Omid Ehtemam-Haghighi Date: Tue, 5 Nov 2024 17:02:36 -0800 Subject: [PATCH 10/51] ipv6: Fix soft lockups in fib6_select_path under high next hop churn MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit d9ccb18f83ea2bb654289b6ecf014fd267cc988b upstream. Soft lockups have been observed on a cluster of Linux-based edge routers located in a highly dynamic environment. Using the `bird` service, these routers continuously update BGP-advertised routes due to frequently changing nexthop destinations, while also managing significant IPv6 traffic. The lockups occur during the traversal of the multipath circular linked-list in the `fib6_select_path` function, particularly while iterating through the siblings in the list. The issue typically arises when the nodes of the linked list are unexpectedly deleted concurrently on a different core—indicated by their 'next' and 'previous' elements pointing back to the node itself and their reference count dropping to zero. This results in an infinite loop, leading to a soft lockup that triggers a system panic via the watchdog timer. Apply RCU primitives in the problematic code sections to resolve the issue. Where necessary, update the references to fib6_siblings to annotate or use the RCU APIs. Include a test script that reproduces the issue. The script periodically updates the routing table while generating a heavy load of outgoing IPv6 traffic through multiple iperf3 clients. It consistently induces infinite soft lockups within a couple of minutes. Kernel log: 0 [ffffbd13003e8d30] machine_kexec at ffffffff8ceaf3eb 1 [ffffbd13003e8d90] __crash_kexec at ffffffff8d0120e3 2 [ffffbd13003e8e58] panic at ffffffff8cef65d4 3 [ffffbd13003e8ed8] watchdog_timer_fn at ffffffff8d05cb03 4 [ffffbd13003e8f08] __hrtimer_run_queues at ffffffff8cfec62f 5 [ffffbd13003e8f70] hrtimer_interrupt at ffffffff8cfed756 6 [ffffbd13003e8fd0] __sysvec_apic_timer_interrupt at ffffffff8cea01af 7 [ffffbd13003e8ff0] sysvec_apic_timer_interrupt at ffffffff8df1b83d -- -- 8 [ffffbd13003d3708] asm_sysvec_apic_timer_interrupt at ffffffff8e000ecb [exception RIP: fib6_select_path+299] RIP: ffffffff8ddafe7b RSP: ffffbd13003d37b8 RFLAGS: 00000287 RAX: ffff975850b43600 RBX: ffff975850b40200 RCX: 0000000000000000 RDX: 000000003fffffff RSI: 0000000051d383e4 RDI: ffff975850b43618 RBP: ffffbd13003d3800 R8: 0000000000000000 R9: ffff975850b40200 R10: 0000000000000000 R11: 0000000000000000 R12: ffffbd13003d3830 R13: ffff975850b436a8 R14: ffff975850b43600 R15: 0000000000000007 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 9 [ffffbd13003d3808] ip6_pol_route at ffffffff8ddb030c 10 [ffffbd13003d3888] ip6_pol_route_input at ffffffff8ddb068c 11 [ffffbd13003d3898] fib6_rule_lookup at ffffffff8ddf02b5 12 [ffffbd13003d3928] ip6_route_input at ffffffff8ddb0f47 13 [ffffbd13003d3a18] ip6_rcv_finish_core.constprop.0 at ffffffff8dd950d0 14 [ffffbd13003d3a30] ip6_list_rcv_finish.constprop.0 at ffffffff8dd96274 15 [ffffbd13003d3a98] ip6_sublist_rcv at ffffffff8dd96474 16 [ffffbd13003d3af8] ipv6_list_rcv at ffffffff8dd96615 17 [ffffbd13003d3b60] __netif_receive_skb_list_core at ffffffff8dc16fec 18 [ffffbd13003d3be0] netif_receive_skb_list_internal at ffffffff8dc176b3 19 [ffffbd13003d3c50] napi_gro_receive at ffffffff8dc565b9 20 [ffffbd13003d3c80] ice_receive_skb at ffffffffc087e4f5 [ice] 21 [ffffbd13003d3c90] ice_clean_rx_irq at ffffffffc0881b80 [ice] 22 [ffffbd13003d3d20] ice_napi_poll at ffffffffc088232f [ice] 23 [ffffbd13003d3d80] __napi_poll at ffffffff8dc18000 24 [ffffbd13003d3db8] net_rx_action at ffffffff8dc18581 25 [ffffbd13003d3e40] __do_softirq at ffffffff8df352e9 26 [ffffbd13003d3eb0] run_ksoftirqd at ffffffff8ceffe47 27 [ffffbd13003d3ec0] smpboot_thread_fn at ffffffff8cf36a30 28 [ffffbd13003d3ee8] kthread at ffffffff8cf2b39f 29 [ffffbd13003d3f28] ret_from_fork at ffffffff8ce5fa64 30 [ffffbd13003d3f50] ret_from_fork_asm at ffffffff8ce03cbb Fixes: 66f5d6ce53e6 ("ipv6: replace rwlock with rcu and spinlock in fib6_table") Reported-by: Adrian Oliver Signed-off-by: Omid Ehtemam-Haghighi Cc: Shuah Khan Cc: Ido Schimmel Cc: Kuniyuki Iwashima Cc: Simon Horman Reviewed-by: David Ahern Link: https://patch.msgid.link/20241106010236.1239299-1-omid.ehtemamhaghighi@menlosecurity.com Signed-off-by: Jakub Kicinski Signed-off-by: Rajani Kantha Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_fib.c | 8 +- net/ipv6/route.c | 45 ++- tools/testing/selftests/net/Makefile | 1 + .../net/ipv6_route_update_soft_lockup.sh | 262 ++++++++++++++++++ 4 files changed, 297 insertions(+), 19 deletions(-) create mode 100644 tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 0b45ef8b7ee2..b6a7cbd6bee0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1180,8 +1180,8 @@ next_iter: while (sibling) { if (sibling->fib6_metric == rt->fib6_metric && rt6_qualify_for_ecmp(sibling)) { - list_add_tail(&rt->fib6_siblings, - &sibling->fib6_siblings); + list_add_tail_rcu(&rt->fib6_siblings, + &sibling->fib6_siblings); break; } sibling = rcu_dereference_protected(sibling->fib6_next, @@ -1242,7 +1242,7 @@ add: fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); return err; } @@ -1955,7 +1955,7 @@ static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn, &rt->fib6_siblings, fib6_siblings) sibling->fib6_nsiblings--; rt->fib6_nsiblings = 0; - list_del_init(&rt->fib6_siblings); + list_del_rcu(&rt->fib6_siblings); rt6_multipath_rebalance(next_sibling); } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 5ae3ff6ffb7e..f3268bac9f19 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -420,8 +420,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, struct flowi6 *fl6, int oif, bool have_oif_match, const struct sk_buff *skb, int strict) { - struct fib6_info *sibling, *next_sibling; struct fib6_info *match = res->f6i; + struct fib6_info *sibling; if (!match->nh && (!match->fib6_nsiblings || have_oif_match)) goto out; @@ -447,8 +447,8 @@ void fib6_select_path(const struct net *net, struct fib6_result *res, if (fl6->mp_hash <= atomic_read(&match->fib6_nh->fib_nh_upper_bound)) goto out; - list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings, - fib6_siblings) { + list_for_each_entry_rcu(sibling, &match->fib6_siblings, + fib6_siblings) { const struct fib6_nh *nh = sibling->fib6_nh; int nh_upper_bound; @@ -5189,14 +5189,18 @@ static void ip6_route_mpath_notify(struct fib6_info *rt, * nexthop. Since sibling routes are always added at the end of * the list, find the first sibling of the last route appended */ + rcu_read_lock(); + if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) { - rt = list_first_entry(&rt_last->fib6_siblings, - struct fib6_info, - fib6_siblings); + rt = list_first_or_null_rcu(&rt_last->fib6_siblings, + struct fib6_info, + fib6_siblings); } if (rt) inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags); + + rcu_read_unlock(); } static bool ip6_route_mpath_should_notify(const struct fib6_info *rt) @@ -5541,17 +5545,21 @@ static size_t rt6_nlmsg_size(struct fib6_info *f6i) nexthop_for_each_fib6_nh(f6i->nh, rt6_nh_nlmsg_size, &nexthop_len); } else { - struct fib6_info *sibling, *next_sibling; struct fib6_nh *nh = f6i->fib6_nh; + struct fib6_info *sibling; nexthop_len = 0; if (f6i->fib6_nsiblings) { rt6_nh_nlmsg_size(nh, &nexthop_len); - list_for_each_entry_safe(sibling, next_sibling, - &f6i->fib6_siblings, fib6_siblings) { + rcu_read_lock(); + + list_for_each_entry_rcu(sibling, &f6i->fib6_siblings, + fib6_siblings) { rt6_nh_nlmsg_size(sibling->fib6_nh, &nexthop_len); } + + rcu_read_unlock(); } nexthop_len += lwtunnel_get_encap_size(nh->fib_nh_lws); } @@ -5715,7 +5723,7 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, lwtunnel_fill_encap(skb, dst->lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0) goto nla_put_failure; } else if (rt->fib6_nsiblings) { - struct fib6_info *sibling, *next_sibling; + struct fib6_info *sibling; struct nlattr *mp; mp = nla_nest_start_noflag(skb, RTA_MULTIPATH); @@ -5727,14 +5735,21 @@ static int rt6_fill_node(struct net *net, struct sk_buff *skb, 0) < 0) goto nla_put_failure; - list_for_each_entry_safe(sibling, next_sibling, - &rt->fib6_siblings, fib6_siblings) { + rcu_read_lock(); + + list_for_each_entry_rcu(sibling, &rt->fib6_siblings, + fib6_siblings) { if (fib_add_nexthop(skb, &sibling->fib6_nh->nh_common, sibling->fib6_nh->fib_nh_weight, - AF_INET6, 0) < 0) + AF_INET6, 0) < 0) { + rcu_read_unlock(); + goto nla_put_failure; + } } + rcu_read_unlock(); + nla_nest_end(skb, mp); } else if (rt->nh) { if (nla_put_u32(skb, RTA_NH_ID, rt->nh->id)) @@ -6171,7 +6186,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, err = -ENOBUFS; seq = info->nlh ? info->nlh->nlmsg_seq : 0; - skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any()); + skb = nlmsg_new(rt6_nlmsg_size(rt), GFP_ATOMIC); if (!skb) goto errout; @@ -6184,7 +6199,7 @@ void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info, goto errout; } rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE, - info->nlh, gfp_any()); + info->nlh, GFP_ATOMIC); return; errout: if (err < 0) diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 48d1a68be1d5..080860a8826b 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -72,6 +72,7 @@ TEST_GEN_PROGS += sk_bind_sendto_listen TEST_GEN_PROGS += sk_connect_zero_addr TEST_PROGS += test_ingress_egress_chaining.sh TEST_GEN_FILES += nat6to4.o +TEST_PROGS += ipv6_route_update_soft_lockup.sh TEST_FILES := settings diff --git a/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh new file mode 100644 index 000000000000..a6b2b1f9c641 --- /dev/null +++ b/tools/testing/selftests/net/ipv6_route_update_soft_lockup.sh @@ -0,0 +1,262 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 +# +# Testing for potential kernel soft lockup during IPv6 routing table +# refresh under heavy outgoing IPv6 traffic. If a kernel soft lockup +# occurs, a kernel panic will be triggered to prevent associated issues. +# +# +# Test Environment Layout +# +# ┌----------------┐ ┌----------------┐ +# | SOURCE_NS | | SINK_NS | +# | NAMESPACE | | NAMESPACE | +# |(iperf3 clients)| |(iperf3 servers)| +# | | | | +# | | | | +# | ┌-----------| nexthops |---------┐ | +# | |veth_source|<--------------------------------------->|veth_sink|<┐ | +# | └-----------|2001:0DB8:1::0:1/96 2001:0DB8:1::1:1/96 |---------┘ | | +# | | ^ 2001:0DB8:1::1:2/96 | | | +# | | . . | fwd | | +# | ┌---------┐ | . . | | | +# | | IPv6 | | . . | V | +# | | routing | | . 2001:0DB8:1::1:80/96| ┌-----┐ | +# | | table | | . | | lo | | +# | | nexthop | | . └--------┴-----┴-┘ +# | | update | | ............................> 2001:0DB8:2::1:1/128 +# | └-------- ┘ | +# └----------------┘ +# +# The test script sets up two network namespaces, source_ns and sink_ns, +# connected via a veth link. Within source_ns, it continuously updates the +# IPv6 routing table by flushing and inserting IPV6_NEXTHOP_ADDR_COUNT nexthop +# IPs destined for SINK_LOOPBACK_IP_ADDR in sink_ns. This refresh occurs at a +# rate of 1/ROUTING_TABLE_REFRESH_PERIOD per second for TEST_DURATION seconds. +# +# Simultaneously, multiple iperf3 clients within source_ns generate heavy +# outgoing IPv6 traffic. Each client is assigned a unique port number starting +# at 5000 and incrementing sequentially. Each client targets a unique iperf3 +# server running in sink_ns, connected to the SINK_LOOPBACK_IFACE interface +# using the same port number. +# +# The number of iperf3 servers and clients is set to half of the total +# available cores on each machine. +# +# NOTE: We have tested this script on machines with various CPU specifications, +# ranging from lower to higher performance as listed below. The test script +# effectively triggered a kernel soft lockup on machines running an unpatched +# kernel in under a minute: +# +# - 1x Intel Xeon E-2278G 8-Core Processor @ 3.40GHz +# - 1x Intel Xeon E-2378G Processor 8-Core @ 2.80GHz +# - 1x AMD EPYC 7401P 24-Core Processor @ 2.00GHz +# - 1x AMD EPYC 7402P 24-Core Processor @ 2.80GHz +# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz +# - 1x Ampere Altra Q80-30 80-Core Processor @ 3.00GHz +# - 2x Intel Xeon Gold 5120 14-Core Processor @ 2.20GHz +# - 2x Intel Xeon Silver 4214 24-Core Processor @ 2.20GHz +# - 1x AMD EPYC 7502P 32-Core @ 2.50GHz +# - 1x Intel Xeon Gold 6314U 32-Core Processor @ 2.30GHz +# - 2x Intel Xeon Gold 6338 32-Core Processor @ 2.00GHz +# +# On less performant machines, you may need to increase the TEST_DURATION +# parameter to enhance the likelihood of encountering a race condition leading +# to a kernel soft lockup and avoid a false negative result. +# +# NOTE: The test may not produce the expected result in virtualized +# environments (e.g., qemu) due to differences in timing and CPU handling, +# which can affect the conditions needed to trigger a soft lockup. + +source lib.sh +source net_helper.sh + +TEST_DURATION=300 +ROUTING_TABLE_REFRESH_PERIOD=0.01 + +IPERF3_BITRATE="300m" + + +IPV6_NEXTHOP_ADDR_COUNT="128" +IPV6_NEXTHOP_ADDR_MASK="96" +IPV6_NEXTHOP_PREFIX="2001:0DB8:1" + + +SOURCE_TEST_IFACE="veth_source" +SOURCE_TEST_IP_ADDR="2001:0DB8:1::0:1/96" + +SINK_TEST_IFACE="veth_sink" +# ${SINK_TEST_IFACE} is populated with the following range of IPv6 addresses: +# 2001:0DB8:1::1:1 to 2001:0DB8:1::1:${IPV6_NEXTHOP_ADDR_COUNT} +SINK_LOOPBACK_IFACE="lo" +SINK_LOOPBACK_IP_MASK="128" +SINK_LOOPBACK_IP_ADDR="2001:0DB8:2::1:1" + +nexthop_ip_list="" +termination_signal="" +kernel_softlokup_panic_prev_val="" + +terminate_ns_processes_by_pattern() { + local ns=$1 + local pattern=$2 + + for pid in $(ip netns pids ${ns}); do + [ -e /proc/$pid/cmdline ] && grep -qe "${pattern}" /proc/$pid/cmdline && kill -9 $pid + done +} + +cleanup() { + echo "info: cleaning up namespaces and terminating all processes within them..." + + + # Terminate iperf3 instances running in the source_ns. To avoid race + # conditions, first iterate over the PIDs and terminate those + # associated with the bash shells running the + # `while true; do iperf3 -c ...; done` loops. In a second iteration, + # terminate the individual `iperf3 -c ...` instances. + terminate_ns_processes_by_pattern ${source_ns} while + terminate_ns_processes_by_pattern ${source_ns} iperf3 + + # Repeat the same process for sink_ns + terminate_ns_processes_by_pattern ${sink_ns} while + terminate_ns_processes_by_pattern ${sink_ns} iperf3 + + # Check if any iperf3 instances are still running. This could happen + # if a core has entered an infinite loop and the timeout for detecting + # the soft lockup has not expired, but either the test interval has + # already elapsed or the test was terminated manually (e.g., with ^C) + for pid in $(ip netns pids ${source_ns}); do + if [ -e /proc/$pid/cmdline ] && grep -qe 'iperf3' /proc/$pid/cmdline; then + echo "FAIL: unable to terminate some iperf3 instances. Soft lockup is underway. A kernel panic is on the way!" + exit ${ksft_fail} + fi + done + + if [ "$termination_signal" == "SIGINT" ]; then + echo "SKIP: Termination due to ^C (SIGINT)" + elif [ "$termination_signal" == "SIGALRM" ]; then + echo "PASS: No kernel soft lockup occurred during this ${TEST_DURATION} second test" + fi + + cleanup_ns ${source_ns} ${sink_ns} + + sysctl -qw kernel.softlockup_panic=${kernel_softlokup_panic_prev_val} +} + +setup_prepare() { + setup_ns source_ns sink_ns + + ip -n ${source_ns} link add name ${SOURCE_TEST_IFACE} type veth peer name ${SINK_TEST_IFACE} netns ${sink_ns} + + # Setting up the Source namespace + ip -n ${source_ns} addr add ${SOURCE_TEST_IP_ADDR} dev ${SOURCE_TEST_IFACE} + ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} qlen 10000 + ip -n ${source_ns} link set dev ${SOURCE_TEST_IFACE} up + ip netns exec ${source_ns} sysctl -qw net.ipv6.fib_multipath_hash_policy=1 + + # Setting up the Sink namespace + ip -n ${sink_ns} addr add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} dev ${SINK_LOOPBACK_IFACE} + ip -n ${sink_ns} link set dev ${SINK_LOOPBACK_IFACE} up + ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_LOOPBACK_IFACE}.forwarding=1 + + ip -n ${sink_ns} link set ${SINK_TEST_IFACE} up + ip netns exec ${sink_ns} sysctl -qw net.ipv6.conf.${SINK_TEST_IFACE}.forwarding=1 + + + # Populate nexthop IPv6 addresses on the test interface in the sink_ns + echo "info: populating ${IPV6_NEXTHOP_ADDR_COUNT} IPv6 addresses on the ${SINK_TEST_IFACE} interface ..." + for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do + ip -n ${sink_ns} addr add ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" "${IP}")/${IPV6_NEXTHOP_ADDR_MASK} dev ${SINK_TEST_IFACE}; + done + + # Preparing list of nexthops + for IP in $(seq 1 ${IPV6_NEXTHOP_ADDR_COUNT}); do + nexthop_ip_list=$nexthop_ip_list" nexthop via ${IPV6_NEXTHOP_PREFIX}::$(printf "1:%x" $IP) dev ${SOURCE_TEST_IFACE} weight 1" + done +} + + +test_soft_lockup_during_routing_table_refresh() { + # Start num_of_iperf_servers iperf3 servers in the sink_ns namespace, + # each listening on ports starting at 5001 and incrementing + # sequentially. Since iperf3 instances may terminate unexpectedly, a + # while loop is used to automatically restart them in such cases. + echo "info: starting ${num_of_iperf_servers} iperf3 servers in the sink_ns namespace ..." + for i in $(seq 1 ${num_of_iperf_servers}); do + cmd="iperf3 --bind ${SINK_LOOPBACK_IP_ADDR} -s -p $(printf '5%03d' ${i}) --rcv-timeout 200 &>/dev/null" + ip netns exec ${sink_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null + done + + # Wait for the iperf3 servers to be ready + for i in $(seq ${num_of_iperf_servers}); do + port=$(printf '5%03d' ${i}); + wait_local_port_listen ${sink_ns} ${port} tcp + done + + # Continuously refresh the routing table in the background within + # the source_ns namespace + ip netns exec ${source_ns} bash -c " + while \$(ip netns list | grep -q ${source_ns}); do + ip -6 route add ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK} ${nexthop_ip_list}; + sleep ${ROUTING_TABLE_REFRESH_PERIOD}; + ip -6 route delete ${SINK_LOOPBACK_IP_ADDR}/${SINK_LOOPBACK_IP_MASK}; + done &" + + # Start num_of_iperf_servers iperf3 clients in the source_ns namespace, + # each sending TCP traffic on sequential ports starting at 5001. + # Since iperf3 instances may terminate unexpectedly (e.g., if the route + # to the server is deleted in the background during a route refresh), a + # while loop is used to automatically restart them in such cases. + echo "info: starting ${num_of_iperf_servers} iperf3 clients in the source_ns namespace ..." + for i in $(seq 1 ${num_of_iperf_servers}); do + cmd="iperf3 -c ${SINK_LOOPBACK_IP_ADDR} -p $(printf '5%03d' ${i}) --length 64 --bitrate ${IPERF3_BITRATE} -t 0 --connect-timeout 150 &>/dev/null" + ip netns exec ${source_ns} bash -c "while true; do ${cmd}; done &" &>/dev/null + done + + echo "info: IPv6 routing table is being updated at the rate of $(echo "1/${ROUTING_TABLE_REFRESH_PERIOD}" | bc)/s for ${TEST_DURATION} seconds ..." + echo "info: A kernel soft lockup, if detected, results in a kernel panic!" + + wait +} + +# Make sure 'iperf3' is installed, skip the test otherwise +if [ ! -x "$(command -v "iperf3")" ]; then + echo "SKIP: 'iperf3' is not installed. Skipping the test." + exit ${ksft_skip} +fi + +# Determine the number of cores on the machine +num_of_iperf_servers=$(( $(nproc)/2 )) + +# Check if we are running on a multi-core machine, skip the test otherwise +if [ "${num_of_iperf_servers}" -eq 0 ]; then + echo "SKIP: This test is not valid on a single core machine!" + exit ${ksft_skip} +fi + +# Since the kernel soft lockup we're testing causes at least one core to enter +# an infinite loop, destabilizing the host and likely affecting subsequent +# tests, we trigger a kernel panic instead of reporting a failure and +# continuing +kernel_softlokup_panic_prev_val=$(sysctl -n kernel.softlockup_panic) +sysctl -qw kernel.softlockup_panic=1 + +handle_sigint() { + termination_signal="SIGINT" + cleanup + exit ${ksft_skip} +} + +handle_sigalrm() { + termination_signal="SIGALRM" + cleanup + exit ${ksft_pass} +} + +trap handle_sigint SIGINT +trap handle_sigalrm SIGALRM + +(sleep ${TEST_DURATION} && kill -s SIGALRM $$)& + +setup_prepare +test_soft_lockup_during_routing_table_refresh From f1bc570611d56176ece78c79abcb0ef3fa25b8a4 Mon Sep 17 00:00:00 2001 From: K Prateek Nayak Date: Tue, 19 Nov 2024 05:44:29 +0000 Subject: [PATCH 11/51] softirq: Allow raising SCHED_SOFTIRQ from SMP-call-function on RT kernel commit 6675ce20046d149e1e1ffe7e9577947dee17aad5 upstream. do_softirq_post_smp_call_flush() on PREEMPT_RT kernels carries a WARN_ON_ONCE() for any SOFTIRQ being raised from an SMP-call-function. Since do_softirq_post_smp_call_flush() is called with preempt disabled, raising a SOFTIRQ during flush_smp_call_function_queue() can lead to longer preempt disabled sections. Since commit b2a02fc43a1f ("smp: Optimize send_call_function_single_ipi()") IPIs to an idle CPU in TIF_POLLING_NRFLAG mode can be optimized out by instead setting TIF_NEED_RESCHED bit in idle task's thread_info and relying on the flush_smp_call_function_queue() in the idle-exit path to run the SMP-call-function. To trigger an idle load balancing, the scheduler queues nohz_csd_function() responsible for triggering an idle load balancing on a target nohz idle CPU and sends an IPI. Only now, this IPI is optimized out and the SMP-call-function is executed from flush_smp_call_function_queue() in do_idle() which can raise a SCHED_SOFTIRQ to trigger the balancing. So far, this went undetected since, the need_resched() check in nohz_csd_function() would make it bail out of idle load balancing early as the idle thread does not clear TIF_POLLING_NRFLAG before calling flush_smp_call_function_queue(). The need_resched() check was added with the intent to catch a new task wakeup, however, it has recently discovered to be unnecessary and will be removed in the subsequent commit after which nohz_csd_function() can raise a SCHED_SOFTIRQ from flush_smp_call_function_queue() to trigger an idle load balance on an idle target in TIF_POLLING_NRFLAG mode. nohz_csd_function() bails out early if "idle_cpu()" check for the target CPU, and does not lock the target CPU's rq until the very end, once it has found tasks to run on the CPU and will not inhibit the wakeup of, or running of a newly woken up higher priority task. Account for this and prevent a WARN_ON_ONCE() when SCHED_SOFTIRQ is raised from flush_smp_call_function_queue(). Signed-off-by: K Prateek Nayak Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20241119054432.6405-2-kprateek.nayak@amd.com Tested-by: Felix Moessbauer Signed-off-by: Florian Bezdeka Signed-off-by: Greg Kroah-Hartman --- kernel/softirq.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/kernel/softirq.c b/kernel/softirq.c index a47396161843..6665f5cd60cb 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -294,17 +294,24 @@ static inline void invoke_softirq(void) wakeup_softirqd(); } +#define SCHED_SOFTIRQ_MASK BIT(SCHED_SOFTIRQ) + /* * flush_smp_call_function_queue() can raise a soft interrupt in a function - * call. On RT kernels this is undesired and the only known functionality - * in the block layer which does this is disabled on RT. If soft interrupts - * get raised which haven't been raised before the flush, warn so it can be + * call. On RT kernels this is undesired and the only known functionalities + * are in the block layer which is disabled on RT, and in the scheduler for + * idle load balancing. If soft interrupts get raised which haven't been + * raised before the flush, warn if it is not a SCHED_SOFTIRQ so it can be * investigated. */ void do_softirq_post_smp_call_flush(unsigned int was_pending) { - if (WARN_ON_ONCE(was_pending != local_softirq_pending())) + unsigned int is_pending = local_softirq_pending(); + + if (unlikely(was_pending != is_pending)) { + WARN_ON_ONCE(was_pending != (is_pending & ~SCHED_SOFTIRQ_MASK)); invoke_softirq(); + } } #else /* CONFIG_PREEMPT_RT */ From 79bdab54b66a3b175e0b5989da82d4c43d68500d Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 29 Jan 2025 10:46:59 -0800 Subject: [PATCH 12/51] xfs: bump max fsgeom struct version [ Upstream commit 9488062805943c2d63350d3ef9e4dc093799789a ] The latest version of the fs geometry structure is v5. Bump this constant so that xfs_db and mkfs calls to libxfs_fs_geometry will fill out all the fields. IOWs, this commit is a no-op for the kernel, but will be useful for userspace reporting in later changes. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_sb.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_sb.h b/fs/xfs/libxfs/xfs_sb.h index a5e14740ec9a..19134b23c10b 100644 --- a/fs/xfs/libxfs/xfs_sb.h +++ b/fs/xfs/libxfs/xfs_sb.h @@ -25,7 +25,7 @@ extern uint64_t xfs_sb_version_to_features(struct xfs_sb *sbp); extern int xfs_update_secondary_sbs(struct xfs_mount *mp); -#define XFS_FS_GEOM_MAX_STRUCT_VER (4) +#define XFS_FS_GEOM_MAX_STRUCT_VER (5) extern void xfs_fs_geometry(struct xfs_mount *mp, struct xfs_fsop_geom *geo, int struct_version); extern int xfs_sb_read_secondary(struct xfs_mount *mp, From 8ee604ac136afe290331f96a9667c78e78a44dd6 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 29 Jan 2025 10:47:00 -0800 Subject: [PATCH 13/51] xfs: hoist freeing of rt data fork extent mappings [ Upstream commit 6c664484337b37fa0cf6e958f4019623e30d40f7 ] Currently, xfs_bmap_del_extent_real contains a bunch of code to convert the physical extent of a data fork mapping for a realtime file into rt extents and pass that to the rt extent freeing function. Since the details of this aren't needed when CONFIG_XFS_REALTIME=n, move it to xfs_rtbitmap.c to reduce code size when realtime isn't enabled. This will (one day) enable realtime EFIs to reuse the same unit-converting call with less code duplication. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 19 +++---------------- fs/xfs/libxfs/xfs_rtbitmap.c | 33 +++++++++++++++++++++++++++++++++ fs/xfs/xfs_rtalloc.h | 5 +++++ 3 files changed, 41 insertions(+), 16 deletions(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 9dc33cdc2ab9..d45a2e681f93 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5037,33 +5037,20 @@ xfs_bmap_del_extent_real( flags = XFS_ILOG_CORE; if (whichfork == XFS_DATA_FORK && XFS_IS_REALTIME_INODE(ip)) { - xfs_filblks_t len; - xfs_extlen_t mod; - - len = div_u64_rem(del->br_blockcount, mp->m_sb.sb_rextsize, - &mod); - ASSERT(mod == 0); - if (!(bflags & XFS_BMAPI_REMAP)) { - xfs_fsblock_t bno; - - bno = div_u64_rem(del->br_startblock, - mp->m_sb.sb_rextsize, &mod); - ASSERT(mod == 0); - - error = xfs_rtfree_extent(tp, bno, (xfs_extlen_t)len); + error = xfs_rtfree_blocks(tp, del->br_startblock, + del->br_blockcount); if (error) goto done; } do_fx = 0; - nblks = len * mp->m_sb.sb_rextsize; qfield = XFS_TRANS_DQ_RTBCOUNT; } else { do_fx = 1; - nblks = del->br_blockcount; qfield = XFS_TRANS_DQ_BCOUNT; } + nblks = del->br_blockcount; del_endblock = del->br_startblock + del->br_blockcount; if (cur) { diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index fa180ab66b73..655108a4cd05 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1005,6 +1005,39 @@ xfs_rtfree_extent( return 0; } +/* + * Free some blocks in the realtime subvolume. rtbno and rtlen are in units of + * rt blocks, not rt extents; must be aligned to the rt extent size; and rtlen + * cannot exceed XFS_MAX_BMBT_EXTLEN. + */ +int +xfs_rtfree_blocks( + struct xfs_trans *tp, + xfs_fsblock_t rtbno, + xfs_filblks_t rtlen) +{ + struct xfs_mount *mp = tp->t_mountp; + xfs_rtblock_t bno; + xfs_filblks_t len; + xfs_extlen_t mod; + + ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); + + len = div_u64_rem(rtlen, mp->m_sb.sb_rextsize, &mod); + if (mod) { + ASSERT(mod == 0); + return -EIO; + } + + bno = div_u64_rem(rtbno, mp->m_sb.sb_rextsize, &mod); + if (mod) { + ASSERT(mod == 0); + return -EIO; + } + + return xfs_rtfree_extent(tp, bno, len); +} + /* Find all the free records within a given range. */ int xfs_rtalloc_query_range( diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 62c7ad79cbb6..3b2f1b499a11 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -58,6 +58,10 @@ xfs_rtfree_extent( xfs_rtblock_t bno, /* starting block number to free */ xfs_extlen_t len); /* length of extent freed */ +/* Same as above, but in units of rt blocks. */ +int xfs_rtfree_blocks(struct xfs_trans *tp, xfs_fsblock_t rtbno, + xfs_filblks_t rtlen); + /* * Initialize realtime fields in the mount structure. */ @@ -139,6 +143,7 @@ int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else # define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) # define xfs_rtfree_extent(t,b,l) (ENOSYS) +# define xfs_rtfree_blocks(t,rb,rl) (ENOSYS) # define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) # define xfs_growfs_rt(mp,in) (ENOSYS) # define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) From 9670abd18c33b20f0797d442c4331905b9679a20 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 29 Jan 2025 10:47:01 -0800 Subject: [PATCH 14/51] xfs: prevent rt growfs when quota is enabled [ Upstream commit b73494fa9a304ab95b59f07845e8d7d36e4d23e0 ] Quotas aren't (yet) supported with realtime, so we shouldn't allow userspace to set up a realtime section when quotas are enabled, even if they attached one via mount options. IOWS, you shouldn't be able to do: # mkfs.xfs -f /dev/sda # mount /dev/sda /mnt -o rtdev=/dev/sdb,usrquota # xfs_growfs -r /mnt Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_rtalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 292d5e54a92c..34980d7c2dd6 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -954,7 +954,7 @@ xfs_growfs_rt( return -EINVAL; /* Unsupported realtime features. */ - if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp)) + if (xfs_has_rmapbt(mp) || xfs_has_reflink(mp) || xfs_has_quota(mp)) return -EOPNOTSUPP; nrblocks = in->newblocks; From 91536449343fcec5a8aeadb14a0a6e522703a27c Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 29 Jan 2025 10:47:02 -0800 Subject: [PATCH 15/51] xfs: rt stubs should return negative errnos when rt disabled [ Upstream commit c2988eb5cff75c02bc57e02c323154aa08f55b78 ] When realtime support is not compiled into the kernel, these functions should return negative errnos, not positive errnos. While we're at it, fix a broken macro declaration. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_rtalloc.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 3b2f1b499a11..65c284e9d33e 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -141,17 +141,17 @@ int xfs_rtalloc_extent_is_free(struct xfs_mount *mp, struct xfs_trans *tp, bool *is_free); int xfs_rtalloc_reinit_frextents(struct xfs_mount *mp); #else -# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (ENOSYS) -# define xfs_rtfree_extent(t,b,l) (ENOSYS) -# define xfs_rtfree_blocks(t,rb,rl) (ENOSYS) -# define xfs_rtpick_extent(m,t,l,rb) (ENOSYS) -# define xfs_growfs_rt(mp,in) (ENOSYS) -# define xfs_rtalloc_query_range(t,l,h,f,p) (ENOSYS) -# define xfs_rtalloc_query_all(m,t,f,p) (ENOSYS) -# define xfs_rtbuf_get(m,t,b,i,p) (ENOSYS) -# define xfs_verify_rtbno(m, r) (false) -# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (ENOSYS) -# define xfs_rtalloc_reinit_frextents(m) (0) +# define xfs_rtallocate_extent(t,b,min,max,l,f,p,rb) (-ENOSYS) +# define xfs_rtfree_extent(t,b,l) (-ENOSYS) +# define xfs_rtfree_blocks(t,rb,rl) (-ENOSYS) +# define xfs_rtpick_extent(m,t,l,rb) (-ENOSYS) +# define xfs_growfs_rt(mp,in) (-ENOSYS) +# define xfs_rtalloc_query_range(m,t,l,h,f,p) (-ENOSYS) +# define xfs_rtalloc_query_all(m,t,f,p) (-ENOSYS) +# define xfs_rtbuf_get(m,t,b,i,p) (-ENOSYS) +# define xfs_verify_rtbno(m, r) (false) +# define xfs_rtalloc_extent_is_free(m,t,s,l,i) (-ENOSYS) +# define xfs_rtalloc_reinit_frextents(m) (0) static inline int /* error */ xfs_rtmount_init( xfs_mount_t *mp) /* file system mount structure */ @@ -162,7 +162,7 @@ xfs_rtmount_init( xfs_warn(mp, "Not built with CONFIG_XFS_RT"); return -ENOSYS; } -# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (ENOSYS)) +# define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) #endif /* CONFIG_XFS_RT */ From 34167d02161e68c0122effc591626d4bf8132819 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 29 Jan 2025 10:47:03 -0800 Subject: [PATCH 16/51] xfs: fix units conversion error in xfs_bmap_del_extent_delay [ Upstream commit ddd98076d5c075c8a6c49d9e6e8ee12844137f23 ] The unit conversions in this function do not make sense. First we convert a block count to bytes, then divide that bytes value by rextsize, which is in blocks, to get an rt extent count. You can't divide bytes by blocks to get a (possibly multiblock) extent value. Fortunately nobody uses delalloc on the rt volume so this hasn't mattered. Fixes: fa5c836ca8eb5 ("xfs: refactor xfs_bunmapi_cow") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_bmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index d45a2e681f93..27d3121e6da9 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -4807,7 +4807,7 @@ xfs_bmap_del_extent_delay( ASSERT(got_endoff >= del_endoff); if (isrt) { - uint64_t rtexts = XFS_FSB_TO_B(mp, del->br_blockcount); + uint64_t rtexts = del->br_blockcount; do_div(rtexts, mp->m_sb.sb_rextsize); xfs_mod_frextents(mp, rtexts); From 4d607041cf9c332c1aa014671350ddc562ad1ef7 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 29 Jan 2025 10:47:04 -0800 Subject: [PATCH 17/51] xfs: make sure maxlen is still congruent with prod when rounding down [ Upstream commit f6a2dae2a1f52ea23f649c02615d073beba4cc35 ] In commit 2a6ca4baed62, we tried to fix an overflow problem in the realtime allocator that was caused by an overly large maxlen value causing xfs_rtcheck_range to run off the end of the realtime bitmap. Unfortunately, there is a subtle bug here -- maxlen (and minlen) both have to be aligned with @prod, but @prod can be larger than 1 if the user has set an extent size hint on the file, and that extent size hint is larger than the realtime extent size. If the rt free space extents are not aligned to this file's extszhint because other files without extent size hints allocated space (or the number of rt extents is similarly not aligned), then it's possible that maxlen after clamping to sb_rextents will no longer be aligned to prod. The allocation will succeed just fine, but we still trip the assertion. Fix the problem by reducing maxlen by any misalignment with prod. While we're at it, split the assertions into two so that we can tell which value had the bad alignment. Fixes: 2a6ca4baed62 ("xfs: make sure the rt allocator doesn't run off the end") Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_rtalloc.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 34980d7c2dd6..0bfbbc1dd0da 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -211,6 +211,23 @@ xfs_rtallocate_range( return error; } +/* + * Make sure we don't run off the end of the rt volume. Be careful that + * adjusting maxlen downwards doesn't cause us to fail the alignment checks. + */ +static inline xfs_extlen_t +xfs_rtallocate_clamp_len( + struct xfs_mount *mp, + xfs_rtblock_t startrtx, + xfs_extlen_t rtxlen, + xfs_extlen_t prod) +{ + xfs_extlen_t ret; + + ret = min(mp->m_sb.sb_rextents, startrtx + rtxlen) - startrtx; + return rounddown(ret, prod); +} + /* * Attempt to allocate an extent minlen<=len<=maxlen starting from * bitmap block bbno. If we don't get maxlen then use prod to trim @@ -248,7 +265,7 @@ xfs_rtallocate_extent_block( i <= end; i++) { /* Make sure we don't scan off the end of the rt volume. */ - maxlen = min(mp->m_sb.sb_rextents, i + maxlen) - i; + maxlen = xfs_rtallocate_clamp_len(mp, i, maxlen, prod); /* * See if there's a free extent of maxlen starting at i. @@ -355,7 +372,8 @@ xfs_rtallocate_extent_exact( int isfree; /* extent is free */ xfs_rtblock_t next; /* next block to try (dummy) */ - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); /* * Check if the range in question (for maxlen) is free. */ @@ -438,7 +456,9 @@ xfs_rtallocate_extent_near( xfs_rtblock_t n; /* next block to try */ xfs_rtblock_t r; /* result block */ - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); + /* * If the block number given is off the end, silently set it to * the last block. @@ -447,7 +467,7 @@ xfs_rtallocate_extent_near( bno = mp->m_sb.sb_rextents - 1; /* Make sure we don't run off the end of the rt volume. */ - maxlen = min(mp->m_sb.sb_rextents, bno + maxlen) - bno; + maxlen = xfs_rtallocate_clamp_len(mp, bno, maxlen, prod); if (maxlen < minlen) { *rtblock = NULLRTBLOCK; return 0; @@ -638,7 +658,8 @@ xfs_rtallocate_extent_size( xfs_rtblock_t r; /* result block number */ xfs_suminfo_t sum; /* summary information for extents */ - ASSERT(minlen % prod == 0 && maxlen % prod == 0); + ASSERT(minlen % prod == 0); + ASSERT(maxlen % prod == 0); ASSERT(maxlen != 0); /* From feb30fe49552a07e8369d99d4181f74e45a0cfea Mon Sep 17 00:00:00 2001 From: Cheng Lin Date: Wed, 29 Jan 2025 10:47:05 -0800 Subject: [PATCH 18/51] xfs: introduce protection for drop nlink [ Upstream commit 2b99e410b28f5a75ae417e6389e767c7745d6fce ] When abnormal drop_nlink are detected on the inode, return error, to avoid corruption propagation. Signed-off-by: Cheng Lin Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_inode.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 909085269227..1d32823d5099 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -919,6 +919,13 @@ xfs_droplink( xfs_trans_t *tp, xfs_inode_t *ip) { + if (VFS_I(ip)->i_nlink == 0) { + xfs_alert(ip->i_mount, + "%s: Attempt to drop inode (%llu) with nlink zero.", + __func__, ip->i_ino); + return -EFSCORRUPTED; + } + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); drop_nlink(VFS_I(ip)); From 4eb3b579b4e29eb76caa7a7c54ec88d3d75d0df9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jan 2025 10:47:06 -0800 Subject: [PATCH 19/51] xfs: handle nimaps=0 from xfs_bmapi_write in xfs_alloc_file_space [ Upstream commit 35dc55b9e80cb9ec4bcb969302000b002b2ed850 ] If xfs_bmapi_write finds a delalloc extent at the requested range, it tries to convert the entire delalloc extent to a real allocation. But if the allocator cannot find a single free extent large enough to cover the start block of the requested range, xfs_bmapi_write will return 0 but leave *nimaps set to 0. In that case we simply need to keep looping with the same startoffset_fsb so that one of the following allocations will eventually reach the requested range. Note that this could affect any caller of xfs_bmapi_write that covers an existing delayed allocation. As far as I can tell we do not have any other such caller, though - the regular writeback path uses xfs_bmapi_convert_delalloc to convert delayed allocations to real ones, and direct I/O invalidates the page cache first. Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_bmap_util.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index ce8e17ab5434..468bb61a5e46 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -780,12 +780,10 @@ xfs_alloc_file_space( { xfs_mount_t *mp = ip->i_mount; xfs_off_t count; - xfs_filblks_t allocated_fsb; xfs_filblks_t allocatesize_fsb; xfs_extlen_t extsz, temp; xfs_fileoff_t startoffset_fsb; xfs_fileoff_t endoffset_fsb; - int nimaps; int rt; xfs_trans_t *tp; xfs_bmbt_irec_t imaps[1], *imapp; @@ -808,7 +806,6 @@ xfs_alloc_file_space( count = len; imapp = &imaps[0]; - nimaps = 1; startoffset_fsb = XFS_B_TO_FSBT(mp, offset); endoffset_fsb = XFS_B_TO_FSB(mp, offset + count); allocatesize_fsb = endoffset_fsb - startoffset_fsb; @@ -819,6 +816,7 @@ xfs_alloc_file_space( while (allocatesize_fsb && !error) { xfs_fileoff_t s, e; unsigned int dblocks, rblocks, resblks; + int nimaps = 1; /* * Determine space reservations for data/realtime. @@ -884,15 +882,19 @@ xfs_alloc_file_space( if (error) break; - allocated_fsb = imapp->br_blockcount; - - if (nimaps == 0) { - error = -ENOSPC; - break; + /* + * If the allocator cannot find a single free extent large + * enough to cover the start block of the requested range, + * xfs_bmapi_write will return 0 but leave *nimaps set to 0. + * + * In that case we simply need to keep looping with the same + * startoffset_fsb so that one of the following allocations + * will eventually reach the requested range. + */ + if (nimaps) { + startoffset_fsb += imapp->br_blockcount; + allocatesize_fsb -= imapp->br_blockcount; } - - startoffset_fsb += allocated_fsb; - allocatesize_fsb -= allocated_fsb; } return error; From 6685b88514dd54fa9850723853b621e17f8fc7fe Mon Sep 17 00:00:00 2001 From: Catherine Hoang Date: Wed, 29 Jan 2025 10:47:07 -0800 Subject: [PATCH 20/51] xfs: allow read IO and FICLONE to run concurrently [ Upstream commit 14a537983b228cb050ceca3a5b743d01315dc4aa ] One of our VM cluster management products needs to snapshot KVM image files so that they can be restored in case of failure. Snapshotting is done by redirecting VM disk writes to a sidecar file and using reflink on the disk image, specifically the FICLONE ioctl as used by "cp --reflink". Reflink locks the source and destination files while it operates, which means that reads from the main vm disk image are blocked, causing the vm to stall. When an image file is heavily fragmented, the copy process could take several minutes. Some of the vm image files have 50-100 million extent records, and duplicating that much metadata locks the file for 30 minutes or more. Having activities suspended for such a long time in a cluster node could result in node eviction. Clone operations and read IO do not change any data in the source file, so they should be able to run concurrently. Demote the exclusive locks taken by FICLONE to shared locks to allow reads while cloning. While a clone is in progress, writes will take the IOLOCK_EXCL, so they block until the clone completes. Link: https://lore.kernel.org/linux-xfs/8911B94D-DD29-4D6E-B5BC-32EAF1866245@oracle.com/ Signed-off-by: Catherine Hoang Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_file.c | 63 +++++++++++++++++++++++++++++++++++--------- fs/xfs/xfs_inode.c | 17 ++++++++++++ fs/xfs/xfs_inode.h | 9 +++++++ fs/xfs/xfs_reflink.c | 4 +++ 4 files changed, 80 insertions(+), 13 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index 8de40cf63a5b..821cb86a83bd 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -214,6 +214,43 @@ xfs_ilock_iocb( return 0; } +static int +xfs_ilock_iocb_for_write( + struct kiocb *iocb, + unsigned int *lock_mode) +{ + ssize_t ret; + struct xfs_inode *ip = XFS_I(file_inode(iocb->ki_filp)); + + ret = xfs_ilock_iocb(iocb, *lock_mode); + if (ret) + return ret; + + if (*lock_mode == XFS_IOLOCK_EXCL) + return 0; + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) + return 0; + + xfs_iunlock(ip, *lock_mode); + *lock_mode = XFS_IOLOCK_EXCL; + return xfs_ilock_iocb(iocb, *lock_mode); +} + +static unsigned int +xfs_ilock_for_write_fault( + struct xfs_inode *ip) +{ + /* get a shared lock if no remapping in progress */ + xfs_ilock(ip, XFS_MMAPLOCK_SHARED); + if (!xfs_iflags_test(ip, XFS_IREMAPPING)) + return XFS_MMAPLOCK_SHARED; + + /* wait for remapping to complete */ + xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); + xfs_ilock(ip, XFS_MMAPLOCK_EXCL); + return XFS_MMAPLOCK_EXCL; +} + STATIC ssize_t xfs_file_dio_read( struct kiocb *iocb, @@ -523,7 +560,7 @@ xfs_file_dio_write_aligned( unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; - ret = xfs_ilock_iocb(iocb, iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; ret = xfs_file_write_checks(iocb, from, &iolock); @@ -590,7 +627,7 @@ retry_exclusive: flags = IOMAP_DIO_FORCE_WAIT; } - ret = xfs_ilock_iocb(iocb, iolock); + ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; @@ -1158,7 +1195,7 @@ xfs_file_remap_range( if (xfs_file_sync_writes(file_in) || xfs_file_sync_writes(file_out)) xfs_log_force_inode(dest); out_unlock: - xfs_iunlock2_io_mmap(src, dest); + xfs_iunlock2_remapping(src, dest); if (ret) trace_xfs_reflink_remap_range_error(dest, ret, _RET_IP_); /* @@ -1313,6 +1350,7 @@ __xfs_filemap_fault( struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); vm_fault_t ret; + unsigned int lock_mode = 0; trace_xfs_filemap_fault(ip, pe_size, write_fault); @@ -1321,25 +1359,24 @@ __xfs_filemap_fault( file_update_time(vmf->vma->vm_file); } + if (IS_DAX(inode) || write_fault) + lock_mode = xfs_ilock_for_write_fault(XFS_I(inode)); + if (IS_DAX(inode)) { pfn_t pfn; - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); ret = xfs_dax_fault(vmf, pe_size, write_fault, &pfn); if (ret & VM_FAULT_NEEDDSYNC) ret = dax_finish_sync_fault(vmf, pe_size, pfn); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); + } else if (write_fault) { + ret = iomap_page_mkwrite(vmf, &xfs_page_mkwrite_iomap_ops); } else { - if (write_fault) { - xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - ret = iomap_page_mkwrite(vmf, - &xfs_page_mkwrite_iomap_ops); - xfs_iunlock(XFS_I(inode), XFS_MMAPLOCK_SHARED); - } else { - ret = filemap_fault(vmf); - } + ret = filemap_fault(vmf); } + if (lock_mode) + xfs_iunlock(XFS_I(inode), lock_mode); + if (write_fault) sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 1d32823d5099..dc84c75be852 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -3644,6 +3644,23 @@ xfs_iunlock2_io_mmap( inode_unlock(VFS_I(ip1)); } +/* Drop the MMAPLOCK and the IOLOCK after a remap completes. */ +void +xfs_iunlock2_remapping( + struct xfs_inode *ip1, + struct xfs_inode *ip2) +{ + xfs_iflags_clear(ip1, XFS_IREMAPPING); + + if (ip1 != ip2) + xfs_iunlock(ip1, XFS_MMAPLOCK_SHARED); + xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); + + if (ip1 != ip2) + inode_unlock_shared(VFS_I(ip1)); + inode_unlock(VFS_I(ip2)); +} + /* * Reload the incore inode list for this inode. Caller should ensure that * the link count cannot change, either by taking ILOCK_SHARED or otherwise diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 85395ad2859c..3a81477c7797 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -347,6 +347,14 @@ static inline bool xfs_inode_has_large_extent_counts(struct xfs_inode *ip) /* Quotacheck is running but inode has not been added to quota counts. */ #define XFS_IQUOTAUNCHECKED (1 << 14) +/* + * Remap in progress. Callers that wish to update file data while + * holding a shared IOLOCK or MMAPLOCK must drop the lock and retake + * the lock in exclusive mode. Relocking the file will block until + * IREMAPPING is cleared. + */ +#define XFS_IREMAPPING (1U << 15) + /* All inode state flags related to inode reclaim. */ #define XFS_ALL_IRECLAIM_FLAGS (XFS_IRECLAIMABLE | \ XFS_IRECLAIM | \ @@ -595,6 +603,7 @@ void xfs_end_io(struct work_struct *work); int xfs_ilock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); void xfs_iunlock2_io_mmap(struct xfs_inode *ip1, struct xfs_inode *ip2); +void xfs_iunlock2_remapping(struct xfs_inode *ip1, struct xfs_inode *ip2); static inline bool xfs_inode_unlinked_incomplete( diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index fe46bce8cae6..004f5a0444be 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1539,6 +1539,10 @@ xfs_reflink_remap_prep( if (ret) goto out_unlock; + xfs_iflags_set(src, XFS_IREMAPPING); + if (inode_in != inode_out) + xfs_ilock_demote(src, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); + return 0; out_unlock: xfs_iunlock2_io_mmap(src, dest); From 16cf312bf0416fff83aadb0d04a35f0b969d503f Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 29 Jan 2025 10:47:08 -0800 Subject: [PATCH 21/51] xfs: factor out xfs_defer_pending_abort [ Upstream commit 2a5db859c6825b5d50377dda9c3cc729c20cad43 ] Factor out xfs_defer_pending_abort() from xfs_defer_trans_abort(), which not use transaction parameter, so it can be used after the transaction life cycle. Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_defer.c | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 5a321b783398..c9adb649e9b3 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -245,26 +245,33 @@ xfs_defer_create_intents( return ret; } +STATIC void +xfs_defer_pending_abort( + struct xfs_mount *mp, + struct list_head *dop_list) +{ + struct xfs_defer_pending *dfp; + const struct xfs_defer_op_type *ops; + + /* Abort intent items that don't have a done item. */ + list_for_each_entry(dfp, dop_list, dfp_list) { + ops = defer_op_types[dfp->dfp_type]; + trace_xfs_defer_pending_abort(mp, dfp); + if (dfp->dfp_intent && !dfp->dfp_done) { + ops->abort_intent(dfp->dfp_intent); + dfp->dfp_intent = NULL; + } + } +} + /* Abort all the intents that were committed. */ STATIC void xfs_defer_trans_abort( struct xfs_trans *tp, struct list_head *dop_pending) { - struct xfs_defer_pending *dfp; - const struct xfs_defer_op_type *ops; - trace_xfs_defer_trans_abort(tp, _RET_IP_); - - /* Abort intent items that don't have a done item. */ - list_for_each_entry(dfp, dop_pending, dfp_list) { - ops = defer_op_types[dfp->dfp_type]; - trace_xfs_defer_pending_abort(tp->t_mountp, dfp); - if (dfp->dfp_intent && !dfp->dfp_done) { - ops->abort_intent(dfp->dfp_intent); - dfp->dfp_intent = NULL; - } - } + xfs_defer_pending_abort(tp->t_mountp, dop_pending); } /* From b655ee7d1a245b98a87dfd8d7c89f871a588f825 Mon Sep 17 00:00:00 2001 From: Long Li Date: Wed, 29 Jan 2025 10:47:09 -0800 Subject: [PATCH 22/51] xfs: abort intent items when recovery intents fail [ Upstream commit f8f9d952e42dd49ae534f61f2fa7ca0876cb9848 ] When recovering intents, we capture newly created intent items as part of committing recovered intent items. If intent recovery fails at a later point, we forget to remove those newly created intent items from the AIL and hang: [root@localhost ~]# cat /proc/539/stack [<0>] xfs_ail_push_all_sync+0x174/0x230 [<0>] xfs_unmount_flush_inodes+0x8d/0xd0 [<0>] xfs_mountfs+0x15f7/0x1e70 [<0>] xfs_fs_fill_super+0x10ec/0x1b20 [<0>] get_tree_bdev+0x3c8/0x730 [<0>] vfs_get_tree+0x89/0x2c0 [<0>] path_mount+0xecf/0x1800 [<0>] do_mount+0xf3/0x110 [<0>] __x64_sys_mount+0x154/0x1f0 [<0>] do_syscall_64+0x39/0x80 [<0>] entry_SYSCALL_64_after_hwframe+0x63/0xcd When newly created intent items fail to commit via transaction, intent recovery hasn't created done items for these newly created intent items, so the capture structure is the sole owner of the captured intent items. We must release them explicitly or else they leak: unreferenced object 0xffff888016719108 (size 432): comm "mount", pid 529, jiffies 4294706839 (age 144.463s) hex dump (first 32 bytes): 08 91 71 16 80 88 ff ff 08 91 71 16 80 88 ff ff ..q.......q..... 18 91 71 16 80 88 ff ff 18 91 71 16 80 88 ff ff ..q.......q..... backtrace: [] xfs_efi_init+0x18f/0x1d0 [] xfs_extent_free_create_intent+0x50/0x150 [] xfs_defer_create_intents+0x16a/0x340 [] xfs_defer_ops_capture_and_commit+0x8e/0xad0 [] xfs_cui_item_recover+0x819/0x980 [] xlog_recover_process_intents+0x246/0xb70 [] xlog_recover_finish+0x8a/0x9a0 [] xfs_log_mount_finish+0x2bb/0x4a0 [] xfs_mountfs+0x14bf/0x1e70 [] xfs_fs_fill_super+0x10d0/0x1b20 [] get_tree_bdev+0x3d2/0x6d0 [] vfs_get_tree+0x89/0x2c0 [] path_mount+0xecf/0x1800 [] do_mount+0xf3/0x110 [] __x64_sys_mount+0x154/0x1f0 [] do_syscall_64+0x39/0x80 Fix the problem above by abort intent items that don't have a done item when recovery intents fail. Fixes: e6fff81e4870 ("xfs: proper replay of deferred ops queued during log recovery") Signed-off-by: Long Li Reviewed-by: Darrick J. Wong Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_defer.c | 5 +++-- fs/xfs/libxfs/xfs_defer.h | 2 +- fs/xfs/xfs_log_recover.c | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index c9adb649e9b3..92470ed3fcbd 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -761,12 +761,13 @@ xfs_defer_ops_capture( /* Release all resources that we used to capture deferred ops. */ void -xfs_defer_ops_capture_free( +xfs_defer_ops_capture_abort( struct xfs_mount *mp, struct xfs_defer_capture *dfc) { unsigned short i; + xfs_defer_pending_abort(mp, &dfc->dfc_dfops); xfs_defer_cancel_list(mp, &dfc->dfc_dfops); for (i = 0; i < dfc->dfc_held.dr_bufs; i++) @@ -807,7 +808,7 @@ xfs_defer_ops_capture_and_commit( /* Commit the transaction and add the capture structure to the list. */ error = xfs_trans_commit(tp); if (error) { - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); return error; } diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index 114a3a4930a3..8788ad5f6a73 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -121,7 +121,7 @@ int xfs_defer_ops_capture_and_commit(struct xfs_trans *tp, struct list_head *capture_list); void xfs_defer_ops_continue(struct xfs_defer_capture *d, struct xfs_trans *tp, struct xfs_defer_resources *dres); -void xfs_defer_ops_capture_free(struct xfs_mount *mp, +void xfs_defer_ops_capture_abort(struct xfs_mount *mp, struct xfs_defer_capture *d); void xfs_defer_resources_rele(struct xfs_defer_resources *dres); diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 006a376c34b2..e009bb23d8a2 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -2514,7 +2514,7 @@ xlog_abort_defer_ops( list_for_each_entry_safe(dfc, next, capture_list, dfc_list) { list_del_init(&dfc->dfc_list); - xfs_defer_ops_capture_free(mp, dfc); + xfs_defer_ops_capture_abort(mp, dfc); } } From 67c362b810aa63979a6816435c56b1a0cf14103f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jan 2025 10:47:10 -0800 Subject: [PATCH 23/51] xfs: only remap the written blocks in xfs_reflink_end_cow_extent [ Upstream commit 55f669f34184ecb25b8353f29c7f6f1ae5b313d1 ] xfs_reflink_end_cow_extent looks up the COW extent and the data fork extent at offset_fsb, and then proceeds to remap the common subset between the two. It does however not limit the remapped extent to the passed in [*offset_fsbm end_fsb] range and thus potentially remaps more blocks than the one handled by the current I/O completion. This means that with sufficiently large data and COW extents we could be remapping COW fork mappings that have not been written to, leading to a stale data exposure on a powerfail event. We use to have a xfs_trim_range to make the remap fit the I/O completion range, but that got (apparently accidentally) removed in commit df2fd88f8ac7 ("xfs: rewrite xfs_reflink_end_cow to use intents"). Note that I've only found this by code inspection, and a test case would probably require very specific delay and error injection. Fixes: df2fd88f8ac7 ("xfs: rewrite xfs_reflink_end_cow to use intents") Signed-off-by: Christoph Hellwig Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_reflink.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 004f5a0444be..cbdc23217a42 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -783,6 +783,7 @@ xfs_reflink_end_cow_extent( } } del = got; + xfs_trim_extent(&del, *offset_fsb, end_fsb - *offset_fsb); /* Grab the corresponding mapping in the data fork. */ nmaps = 1; From 323a707978dc9eb81501a4fd5f66019bf5a9a5e2 Mon Sep 17 00:00:00 2001 From: Leah Rumancik Date: Wed, 29 Jan 2025 10:47:11 -0800 Subject: [PATCH 24/51] xfs: up(ic_sema) if flushing data device fails [ Upstream commit 471de20303dda0b67981e06d59cc6c4a83fd2a3c ] We flush the data device cache before we issue external log IO. If the flush fails, we shut down the log immediately and return. However, the iclog->ic_sema is left in a decremented state so let's add an up(). Prior to this patch, xfs/438 would fail consistently when running with an external log device: sync -> xfs_log_force -> xlog_write_iclog -> down(&iclog->ic_sema) -> blkdev_issue_flush (fail causes us to intiate shutdown) -> xlog_force_shutdown -> return unmount -> xfs_log_umount -> xlog_wait_iclog_completion -> down(&iclog->ic_sema) --------> HANG There is a second early return / shutdown. Make sure the up() happens for it as well. Also make sure we cleanup the iclog state, xlog_state_done_syncing, before dropping the iclog lock. Fixes: b5d721eaae47 ("xfs: external logs need to flush data device") Fixes: 842a42d126b4 ("xfs: shutdown on failure to add page to log bio") Fixes: 7d839e325af2 ("xfs: check return codes when flushing block devices") Signed-off-by: Leah Rumancik Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_log.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 59c982297503..ce6b303484cf 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1891,9 +1891,7 @@ xlog_write_iclog( * the buffer manually, the code needs to be kept in sync * with the I/O completion path. */ - xlog_state_done_syncing(iclog); - up(&iclog->ic_sema); - return; + goto sync; } /* @@ -1923,20 +1921,17 @@ xlog_write_iclog( * avoid shutdown re-entering this path and erroring out again. */ if (log->l_targ != log->l_mp->m_ddev_targp && - blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + blkdev_issue_flush(log->l_mp->m_ddev_targp->bt_bdev)) + goto shutdown; } if (iclog->ic_flags & XLOG_ICL_NEED_FUA) iclog->ic_bio.bi_opf |= REQ_FUA; iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) { - xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); - return; - } + if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) + goto shutdown; + if (is_vmalloc_addr(iclog->ic_data)) flush_kernel_vmap_range(iclog->ic_data, count); @@ -1957,6 +1952,12 @@ xlog_write_iclog( } submit_bio(&iclog->ic_bio); + return; +shutdown: + xlog_force_shutdown(log, SHUTDOWN_LOG_IO_ERROR); +sync: + xlog_state_done_syncing(iclog); + up(&iclog->ic_sema); } /* From 6e7826272e85583f1c913687d0c97a8f68def4da Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Wed, 29 Jan 2025 10:47:12 -0800 Subject: [PATCH 25/51] xfs: fix internal error from AGFL exhaustion [ Upstream commit f63a5b3769ad7659da4c0420751d78958ab97675 ] We've been seeing XFS errors like the following: XFS: Internal error i != 1 at line 3526 of file fs/xfs/libxfs/xfs_btree.c. Caller xfs_btree_insert+0x1ec/0x280 ... Call Trace: xfs_corruption_error+0x94/0xa0 xfs_btree_insert+0x221/0x280 xfs_alloc_fixup_trees+0x104/0x3e0 xfs_alloc_ag_vextent_size+0x667/0x820 xfs_alloc_fix_freelist+0x5d9/0x750 xfs_free_extent_fix_freelist+0x65/0xa0 __xfs_free_extent+0x57/0x180 ... This is the XFS_IS_CORRUPT() check in xfs_btree_insert() when xfs_btree_insrec() fails. After converting this into a panic and dissecting the core dump, I found that xfs_btree_insrec() is failing because it's trying to split a leaf node in the cntbt when the AG free list is empty. In particular, it's failing to get a block from the AGFL _while trying to refill the AGFL_. If a single operation splits every level of the bnobt and the cntbt (and the rmapbt if it is enabled) at once, the free list will be empty. Then, when the next operation tries to refill the free list, it allocates space. If the allocation does not use a full extent, it will need to insert records for the remaining space in the bnobt and cntbt. And if those new records go in full leaves, the leaves (and potentially more nodes up to the old root) need to be split. Fix it by accounting for the additional splits that may be required to refill the free list in the calculation for the minimum free list size. P.S. As far as I can tell, this bug has existed for a long time -- maybe back to xfs-history commit afdf80ae7405 ("Add XFS_AG_MAXLEVELS macros ...") in April 1994! It requires a very unlucky sequence of events, and in fact we didn't hit it until a particular sparse mmap workload updated from 5.12 to 5.19. But this bug existed in 5.12, so it must've been exposed by some other change in allocation or writeback patterns. It's also much less likely to be hit with the rmapbt enabled, since that increases the minimum free list size and is unlikely to split at the same time as the bnobt and cntbt. Reviewed-by: "Darrick J. Wong" Reviewed-by: Dave Chinner Signed-off-by: Omar Sandoval Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_alloc.c | 27 ++++++++++++++++++++++++--- 1 file changed, 24 insertions(+), 3 deletions(-) diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 8bb024b06b95..74d039bdc9f7 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2273,16 +2273,37 @@ xfs_alloc_min_freelist( ASSERT(mp->m_alloc_maxlevels > 0); + /* + * For a btree shorter than the maximum height, the worst case is that + * every level gets split and a new level is added, then while inserting + * another entry to refill the AGFL, every level under the old root gets + * split again. This is: + * + * (full height split reservation) + (AGFL refill split height) + * = (current height + 1) + (current height - 1) + * = (new height) + (new height - 2) + * = 2 * new height - 2 + * + * For a btree of maximum height, the worst case is that every level + * under the root gets split, then while inserting another entry to + * refill the AGFL, every level under the root gets split again. This is + * also: + * + * 2 * (current height - 1) + * = 2 * (new height - 1) + * = 2 * new height - 2 + */ + /* space needed by-bno freespace btree */ min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed by-size freespace btree */ min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1, - mp->m_alloc_maxlevels); + mp->m_alloc_maxlevels) * 2 - 2; /* space needed reverse mapping used space btree */ if (xfs_has_rmapbt(mp)) min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1, - mp->m_rmap_maxlevels); + mp->m_rmap_maxlevels) * 2 - 2; return min_free; } From 318cac2b98dba55bf35f7c72bbeefdf63f5d91dd Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 29 Jan 2025 10:47:13 -0800 Subject: [PATCH 26/51] xfs: inode recovery does not validate the recovered inode [ Upstream commit 038ca189c0d2c1570b4d922f25b524007c85cf94 ] Discovered when trying to track down a weird recovery corruption issue that wasn't detected at recovery time. The specific corruption was a zero extent count field when big extent counts are in use, and it turns out the dinode verifier doesn't detect that specific corruption case, either. So fix it too. Signed-off-by: Dave Chinner Reviewed-by: "Darrick J. Wong" Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_inode_buf.c | 3 +++ fs/xfs/xfs_inode_item_recover.c | 14 +++++++++++++- 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index 758aacd8166b..601b05ca5fc2 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -507,6 +507,9 @@ xfs_dinode_verify( if (mode && nextents + naextents > nblocks) return __this_address; + if (nextents + naextents == 0 && nblocks != 0) + return __this_address; + if (S_ISDIR(mode) && nextents > mp->m_dir_geo->max_extents) return __this_address; diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index e6609067ef26..144198a6b270 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -286,6 +286,7 @@ xlog_recover_inode_commit_pass2( struct xfs_log_dinode *ldip; uint isize; int need_free = 0; + xfs_failaddr_t fa; if (item->ri_buf[0].i_len == sizeof(struct xfs_inode_log_format)) { in_f = item->ri_buf[0].i_addr; @@ -530,8 +531,19 @@ out_owner_change: (dip->di_mode != 0)) error = xfs_recover_inode_owner_change(mp, dip, in_f, buffer_list); - /* re-generate the checksum. */ + /* re-generate the checksum and validate the recovered inode. */ xfs_dinode_calc_crc(log->l_mp, dip); + fa = xfs_dinode_verify(log->l_mp, in_f->ilf_ino, dip); + if (fa) { + XFS_CORRUPTION_ERROR( + "Bad dinode after recovery", + XFS_ERRLEVEL_LOW, mp, dip, sizeof(*dip)); + xfs_alert(mp, + "Metadata corruption detected at %pS, inode 0x%llx", + fa, in_f->ilf_ino); + error = -EFSCORRUPTED; + goto out_release; + } ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; From df716416db1f2d72abfc0bf66a6ae8b058069511 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 29 Jan 2025 10:47:14 -0800 Subject: [PATCH 27/51] xfs: clean up dqblk extraction [ Upstream commit ed17f7da5f0c8b65b7b5f7c98beb0aadbc0546ee ] Since the introduction of xfs_dqblk in V5, xfs really ought to find the dqblk pointer from the dquot buffer, then compute the xfs_disk_dquot pointer from the dqblk pointer. Fix the open-coded xfs_buf_offset calls and do the type checking in the correct order. Note that this has made no practical difference since the start of the xfs_disk_dquot is coincident with the start of the xfs_dqblk. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_dquot.c | 5 +++-- fs/xfs/xfs_dquot_item_recover.c | 7 ++++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index 7f071757f278..a8b2f3b278ea 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -562,7 +562,8 @@ xfs_dquot_from_disk( struct xfs_dquot *dqp, struct xfs_buf *bp) { - struct xfs_disk_dquot *ddqp = bp->b_addr + dqp->q_bufoffset; + struct xfs_dqblk *dqb = xfs_buf_offset(bp, dqp->q_bufoffset); + struct xfs_disk_dquot *ddqp = &dqb->dd_diskdq; /* * Ensure that we got the type and ID we were looking for. @@ -1250,7 +1251,7 @@ xfs_qm_dqflush( } /* Flush the incore dquot to the ondisk buffer. */ - dqblk = bp->b_addr + dqp->q_bufoffset; + dqblk = xfs_buf_offset(bp, dqp->q_bufoffset); xfs_dquot_to_disk(&dqblk->dd_diskdq, dqp); /* diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index 8966ba842395..db2cb5e4197b 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -65,6 +65,7 @@ xlog_recover_dquot_commit_pass2( { struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; + struct xfs_dqblk *dqb; struct xfs_disk_dquot *ddq, *recddq; struct xfs_dq_logformat *dq_f; xfs_failaddr_t fa; @@ -130,14 +131,14 @@ xlog_recover_dquot_commit_pass2( return error; ASSERT(bp); - ddq = xfs_buf_offset(bp, dq_f->qlf_boffset); + dqb = xfs_buf_offset(bp, dq_f->qlf_boffset); + ddq = &dqb->dd_diskdq; /* * If the dquot has an LSN in it, recover the dquot only if it's less * than the lsn of the transaction we are replaying. */ if (xfs_has_crc(mp)) { - struct xfs_dqblk *dqb = (struct xfs_dqblk *)ddq; xfs_lsn_t lsn = be64_to_cpu(dqb->dd_lsn); if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) { @@ -147,7 +148,7 @@ xlog_recover_dquot_commit_pass2( memcpy(ddq, recddq, item->ri_buf[1].i_len); if (xfs_has_crc(mp)) { - xfs_update_cksum((char *)ddq, sizeof(struct xfs_dqblk), + xfs_update_cksum((char *)dqb, sizeof(struct xfs_dqblk), XFS_DQUOT_CRC_OFF); } From 7880b1f0adef4d363f42d6bb42aab3211291351b Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Wed, 29 Jan 2025 10:47:15 -0800 Subject: [PATCH 28/51] xfs: dquot recovery does not validate the recovered dquot [ Upstream commit 9c235dfc3d3f901fe22acb20f2ab37ff39f2ce02 ] When we're recovering ondisk quota records from the log, we need to validate the recovered buffer contents before writing them to disk. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Chandan Babu R Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_dquot_item_recover.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/xfs/xfs_dquot_item_recover.c b/fs/xfs/xfs_dquot_item_recover.c index db2cb5e4197b..2c2720ce6923 100644 --- a/fs/xfs/xfs_dquot_item_recover.c +++ b/fs/xfs/xfs_dquot_item_recover.c @@ -19,6 +19,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_log_recover.h" +#include "xfs_error.h" STATIC void xlog_recover_dquot_ra_pass2( @@ -152,6 +153,19 @@ xlog_recover_dquot_commit_pass2( XFS_DQUOT_CRC_OFF); } + /* Validate the recovered dquot. */ + fa = xfs_dqblk_verify(log->l_mp, dqb, dq_f->qlf_id); + if (fa) { + XFS_CORRUPTION_ERROR("Bad dquot after recovery", + XFS_ERRLEVEL_LOW, mp, dqb, + sizeof(struct xfs_dqblk)); + xfs_alert(mp, + "Metadata corruption detected at %pS, dquot 0x%x", + fa, dq_f->qlf_id); + error = -EFSCORRUPTED; + goto out_release; + } + ASSERT(dq_f->qlf_size == 2); ASSERT(bp->b_mount == mp); bp->b_flags |= _XBF_LOGRECOVERY; From bc4ad69947a2aa4d1103227f3ebc32655927cf79 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jan 2025 10:47:16 -0800 Subject: [PATCH 29/51] xfs: clean up FS_XFLAG_REALTIME handling in xfs_ioctl_setattr_xflags [ Upstream commit c421df0b19430417a04f68919fc3d1943d20ac04 ] Introduce a local boolean variable if FS_XFLAG_REALTIME to make the checks for it more obvious, and de-densify a few of the conditionals using it to make them more readable while at it. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231025141020.192413-4-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_ioctl.c | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 85fbb3b71d1c..f6aa9e6138ae 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1120,23 +1120,25 @@ xfs_ioctl_setattr_xflags( struct fileattr *fa) { struct xfs_mount *mp = ip->i_mount; + bool rtflag = (fa->fsx_xflags & FS_XFLAG_REALTIME); uint64_t i_flags2; - /* Can't change realtime flag if any extents are allocated. */ - if ((ip->i_df.if_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) - return -EINVAL; - - /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & FS_XFLAG_REALTIME) { - if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || - (ip->i_extsize % mp->m_sb.sb_rextsize)) + if (rtflag != XFS_IS_REALTIME_INODE(ip)) { + /* Can't change realtime flag if any extents are allocated. */ + if (ip->i_df.if_nextents || ip->i_delayed_blks) return -EINVAL; } - /* Clear reflink if we are actually able to set the rt flag. */ - if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip)) - ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + if (rtflag) { + /* If realtime flag is set then must have realtime device */ + if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || + (ip->i_extsize % mp->m_sb.sb_rextsize)) + return -EINVAL; + + /* Clear reflink if we are actually able to set the rt flag. */ + if (xfs_is_reflink_inode(ip)) + ip->i_diflags2 &= ~XFS_DIFLAG2_REFLINK; + } /* Don't allow us to set DAX mode for a reflinked file for now. */ if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip)) From 754df8c9b7d3e36bef506de1e3bd71ac08e7e9d3 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 29 Jan 2025 10:47:17 -0800 Subject: [PATCH 30/51] xfs: respect the stable writes flag on the RT device [ Upstream commit 9c04138414c00ae61421f36ada002712c4bac94a ] Update the per-folio stable writes flag dependening on which device an inode resides on. Signed-off-by: Christoph Hellwig Link: https://lore.kernel.org/r/20231025141020.192413-5-hch@lst.de Reviewed-by: Darrick J. Wong Signed-off-by: Christian Brauner Signed-off-by: Leah Rumancik Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_inode.h | 8 ++++++++ fs/xfs/xfs_ioctl.c | 8 ++++++++ fs/xfs/xfs_iops.c | 7 +++++++ 3 files changed, 23 insertions(+) diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 3a81477c7797..c177c92f3aa5 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -569,6 +569,14 @@ extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +static inline void xfs_update_stable_writes(struct xfs_inode *ip) +{ + if (bdev_stable_writes(xfs_inode_buftarg(ip)->bt_bdev)) + mapping_set_stable_writes(VFS_I(ip)->i_mapping); + else + mapping_clear_stable_writes(VFS_I(ip)->i_mapping); +} + /* * When setting up a newly allocated inode, we need to call * xfs_finish_inode_setup() once the inode is fully instantiated at diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index f6aa9e6138ae..c7cb496dc345 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1153,6 +1153,14 @@ xfs_ioctl_setattr_xflags( ip->i_diflags2 = i_flags2; xfs_diflags_to_iflags(ip, false); + + /* + * Make the stable writes flag match that of the device the inode + * resides on when flipping the RT flag. + */ + if (rtflag != XFS_IS_REALTIME_INODE(ip) && S_ISREG(VFS_I(ip)->i_mode)) + xfs_update_stable_writes(ip); + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); XFS_STATS_INC(mp, xs_ig_attrchg); diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 2e10e1c66ad6..6fbdc0a19e54 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1291,6 +1291,13 @@ xfs_setup_inode( gfp_mask = mapping_gfp_mask(inode->i_mapping); mapping_set_gfp_mask(inode->i_mapping, (gfp_mask & ~(__GFP_FS))); + /* + * For real-time inodes update the stable write flags to that of the RT + * device instead of the data device. + */ + if (S_ISREG(inode->i_mode) && XFS_IS_REALTIME_INODE(ip)) + xfs_update_stable_writes(ip); + /* * If there is no attribute fork no ACL can exist on this inode, * and it can't have any file capabilities attached to it either. From 2a40a140e11fec699e128170ccaa98b6b82cb503 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 13 Jan 2025 19:31:28 +0100 Subject: [PATCH 31/51] gfs2: Truncate address space when flipping GFS2_DIF_JDATA flag commit 7c9d9223802fbed4dee1ae301661bf346964c9d2 upstream. Truncate an inode's address space when flipping the GFS2_DIF_JDATA flag: depending on that flag, the pages in the address space will either use buffer heads or iomap_folio_state structs, and we cannot mix the two. Reported-by: Kun Hu , Jiaji Qin Signed-off-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/file.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c367f1678d5d..8b564c685871 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -250,6 +250,7 @@ static int do_gfs2_set_flags(struct inode *inode, u32 reqflags, u32 mask) error = filemap_fdatawait(inode->i_mapping); if (error) goto out; + truncate_inode_pages(inode->i_mapping, 0); if (new_flags & GFS2_DIF_JDATA) gfs2_ordered_del_inode(ip); } From 74a37ce697f3fa393324990c872a3a8933344f64 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Mon, 9 Jan 2023 14:46:10 +0000 Subject: [PATCH 32/51] io_uring: fix waiters missing wake ups There are reports of mariadb hangs, which is caused by a missing barrier in the waking code resulting in waiters losing events. The problem was introduced in a backport 3ab9326f93ec4 ("io_uring: wake up optimisations"), and the change restores the barrier present in the original commit 3ab9326f93ec4 ("io_uring: wake up optimisations") Reported by: Xan Charbonnet Fixes: 3ab9326f93ec4 ("io_uring: wake up optimisations") Link: https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=1093243#99 Reviewed-by: Li Zetao Signed-off-by: Pavel Begunkov Signed-off-by: Greg Kroah-Hartman --- io_uring/io_uring.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 480752fc3eb6..64502323be5e 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -597,8 +597,10 @@ static inline void __io_cq_unlock_post_flush(struct io_ring_ctx *ctx) io_commit_cqring(ctx); spin_unlock(&ctx->completion_lock); io_commit_cqring_flush(ctx); - if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) + if (!(ctx->flags & IORING_SETUP_DEFER_TASKRUN)) { + smp_mb(); __io_cqring_wake(ctx); + } } void io_cq_unlock_post(struct io_ring_ctx *ctx) From 1332c6ed446be787f901ed1064ec6a3c694f028a Mon Sep 17 00:00:00 2001 From: Jamal Hadi Salim Date: Sat, 11 Jan 2025 09:57:39 -0500 Subject: [PATCH 33/51] net: sched: fix ets qdisc OOB Indexing commit d62b04fca4340a0d468d7853bd66e511935a18cb upstream. Haowei Yan found that ets_class_from_arg() can index an Out-Of-Bound class in ets_class_from_arg() when passed clid of 0. The overflow may cause local privilege escalation. [ 18.852298] ------------[ cut here ]------------ [ 18.853271] UBSAN: array-index-out-of-bounds in net/sched/sch_ets.c:93:20 [ 18.853743] index 18446744073709551615 is out of range for type 'ets_class [16]' [ 18.854254] CPU: 0 UID: 0 PID: 1275 Comm: poc Not tainted 6.12.6-dirty #17 [ 18.854821] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 [ 18.856532] Call Trace: [ 18.857441] [ 18.858227] dump_stack_lvl+0xc2/0xf0 [ 18.859607] dump_stack+0x10/0x20 [ 18.860908] __ubsan_handle_out_of_bounds+0xa7/0xf0 [ 18.864022] ets_class_change+0x3d6/0x3f0 [ 18.864322] tc_ctl_tclass+0x251/0x910 [ 18.864587] ? lock_acquire+0x5e/0x140 [ 18.865113] ? __mutex_lock+0x9c/0xe70 [ 18.866009] ? __mutex_lock+0xa34/0xe70 [ 18.866401] rtnetlink_rcv_msg+0x170/0x6f0 [ 18.866806] ? __lock_acquire+0x578/0xc10 [ 18.867184] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 18.867503] netlink_rcv_skb+0x59/0x110 [ 18.867776] rtnetlink_rcv+0x15/0x30 [ 18.868159] netlink_unicast+0x1c3/0x2b0 [ 18.868440] netlink_sendmsg+0x239/0x4b0 [ 18.868721] ____sys_sendmsg+0x3e2/0x410 [ 18.869012] ___sys_sendmsg+0x88/0xe0 [ 18.869276] ? rseq_ip_fixup+0x198/0x260 [ 18.869563] ? rseq_update_cpu_node_id+0x10a/0x190 [ 18.869900] ? trace_hardirqs_off+0x5a/0xd0 [ 18.870196] ? syscall_exit_to_user_mode+0xcc/0x220 [ 18.870547] ? do_syscall_64+0x93/0x150 [ 18.870821] ? __memcg_slab_free_hook+0x69/0x290 [ 18.871157] __sys_sendmsg+0x69/0xd0 [ 18.871416] __x64_sys_sendmsg+0x1d/0x30 [ 18.871699] x64_sys_call+0x9e2/0x2670 [ 18.871979] do_syscall_64+0x87/0x150 [ 18.873280] ? do_syscall_64+0x93/0x150 [ 18.874742] ? lock_release+0x7b/0x160 [ 18.876157] ? do_user_addr_fault+0x5ce/0x8f0 [ 18.877833] ? irqentry_exit_to_user_mode+0xc2/0x210 [ 18.879608] ? irqentry_exit+0x77/0xb0 [ 18.879808] ? clear_bhb_loop+0x15/0x70 [ 18.880023] ? clear_bhb_loop+0x15/0x70 [ 18.880223] ? clear_bhb_loop+0x15/0x70 [ 18.880426] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 18.880683] RIP: 0033:0x44a957 [ 18.880851] Code: ff ff e8 fc 00 00 00 66 2e 0f 1f 84 00 00 00 00 00 66 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 8974 24 10 [ 18.881766] RSP: 002b:00007ffcdd00fad8 EFLAGS: 00000246 ORIG_RAX: 000000000000002e [ 18.882149] RAX: ffffffffffffffda RBX: 00007ffcdd010db8 RCX: 000000000044a957 [ 18.882507] RDX: 0000000000000000 RSI: 00007ffcdd00fb70 RDI: 0000000000000003 [ 18.885037] RBP: 00007ffcdd010bc0 R08: 000000000703c770 R09: 000000000703c7c0 [ 18.887203] R10: 0000000000000080 R11: 0000000000000246 R12: 0000000000000001 [ 18.888026] R13: 00007ffcdd010da8 R14: 00000000004ca7d0 R15: 0000000000000001 [ 18.888395] [ 18.888610] ---[ end trace ]--- Fixes: dcc68b4d8084 ("net: sch_ets: Add a new Qdisc") Reported-by: Haowei Yan Suggested-by: Haowei Yan Signed-off-by: Jamal Hadi Salim Reviewed-by: Eric Dumazet Reviewed-by: Petr Machata Link: https://patch.msgid.link/20250111145740.74755-1-jhs@mojatatu.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_ets.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index b10efeaf0629..9fd70462b41d 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -91,6 +91,8 @@ ets_class_from_arg(struct Qdisc *sch, unsigned long arg) { struct ets_sched *q = qdisc_priv(sch); + if (arg == 0 || arg > q->nbands) + return NULL; return &q->classes[arg - 1]; } From 8476f8428e8b48fd7a0e4258fa2a96a8f4468239 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 3 Sep 2024 22:48:19 +0300 Subject: [PATCH 34/51] block: fix integer overflow in BLKSECDISCARD commit 697ba0b6ec4ae04afb67d3911799b5e2043b4455 upstream. I independently rediscovered commit 22d24a544b0d49bbcbd61c8c0eaf77d3c9297155 block: fix overflow in blk_ioctl_discard() but for secure erase. Same problem: uint64_t r[2] = {512, 18446744073709551104ULL}; ioctl(fd, BLKSECDISCARD, r); will enter near infinite loop inside blkdev_issue_secure_erase(): a.out: attempt to access beyond end of device loop0: rw=5, sector=3399043073, nr_sectors = 1024 limit=2048 bio_check_eod: 3286214 callbacks suppressed Signed-off-by: Alexey Dobriyan Link: https://lore.kernel.org/r/9e64057f-650a-46d1-b9f7-34af391536ef@p183 Signed-off-by: Jens Axboe Signed-off-by: Rajani Kantha Signed-off-by: Greg Kroah-Hartman --- block/ioctl.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/block/ioctl.c b/block/ioctl.c index c7390d8c9fc7..552da0ccbec0 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -115,7 +115,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, return -EINVAL; filemap_invalidate_lock(inode->i_mapping); - err = truncate_bdev_range(bdev, mode, start, start + len - 1); + err = truncate_bdev_range(bdev, mode, start, end - 1); if (err) goto fail; err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); @@ -127,7 +127,7 @@ fail: static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, void __user *argp) { - uint64_t start, len; + uint64_t start, len, end; uint64_t range[2]; int err; @@ -142,11 +142,12 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, len = range[1]; if ((start & 511) || (len & 511)) return -EINVAL; - if (start + len > bdev_nr_bytes(bdev)) + if (check_add_overflow(start, len, &end) || + end > bdev_nr_bytes(bdev)) return -EINVAL; filemap_invalidate_lock(bdev->bd_inode->i_mapping); - err = truncate_bdev_range(bdev, mode, start, start + len - 1); + err = truncate_bdev_range(bdev, mode, start, end - 1); if (!err) err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, GFP_KERNEL); From 1a1b2b8c28987fd769cc9cc0616e9eb9ad599e2a Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 12 Dec 2024 09:53:10 +0100 Subject: [PATCH 35/51] Revert "HID: multitouch: Add support for lenovo Y9000P Touchpad" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3d88ba86ba6f35a0467f25a88c38aa5639190d04 upstream. This reverts commit 251efae73bd46b097deec4f9986d926813aed744. Quoting Wang Yuli: "The 27C6:01E0 touchpad doesn't require the workaround and applying it would actually break functionality. The initial report came from a BBS forum, but we suspect the information provided by the forum user may be incorrect which could happen sometimes. [1] Further investigation showed that the Lenovo Y9000P 2024 doesn't even use a Goodix touchpad. [2] For the broader issue of 27c6:01e0 being unusable on some devices, it just need to address it with a libinput quirk. In conclusion, we should revert this commit, which is the best solution." Reported-by: Ulrich Müller Reported-by: WangYuli Link: https://lore.kernel.org/all/uikt4wwpw@gentoo.org/ Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/hid-ids.h | 1 - drivers/hid/hid-multitouch.c | 8 ++------ 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 790966e5b6ec..d8c5e24e7d44 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -507,7 +507,6 @@ #define USB_DEVICE_ID_GENERAL_TOUCH_WIN8_PIT_E100 0xe100 #define I2C_VENDOR_ID_GOODIX 0x27c6 -#define I2C_DEVICE_ID_GOODIX_01E0 0x01e0 #define I2C_DEVICE_ID_GOODIX_01E8 0x01e8 #define I2C_DEVICE_ID_GOODIX_01E9 0x01e9 #define I2C_DEVICE_ID_GOODIX_01F0 0x01f0 diff --git a/drivers/hid/hid-multitouch.c b/drivers/hid/hid-multitouch.c index bf9cad711259..e62104e1a603 100644 --- a/drivers/hid/hid-multitouch.c +++ b/drivers/hid/hid-multitouch.c @@ -1447,8 +1447,7 @@ static __u8 *mt_report_fixup(struct hid_device *hdev, __u8 *rdesc, { if (hdev->vendor == I2C_VENDOR_ID_GOODIX && (hdev->product == I2C_DEVICE_ID_GOODIX_01E8 || - hdev->product == I2C_DEVICE_ID_GOODIX_01E9 || - hdev->product == I2C_DEVICE_ID_GOODIX_01E0)) { + hdev->product == I2C_DEVICE_ID_GOODIX_01E9)) { if (rdesc[607] == 0x15) { rdesc[607] = 0x25; dev_info( @@ -2073,10 +2072,7 @@ static const struct hid_device_id mt_devices[] = { I2C_DEVICE_ID_GOODIX_01E8) }, { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT_NSMU, HID_DEVICE(BUS_I2C, HID_GROUP_ANY, I2C_VENDOR_ID_GOODIX, - I2C_DEVICE_ID_GOODIX_01E9) }, - { .driver_data = MT_CLS_WIN_8_FORCE_MULTI_INPUT_NSMU, - HID_DEVICE(BUS_I2C, HID_GROUP_ANY, I2C_VENDOR_ID_GOODIX, - I2C_DEVICE_ID_GOODIX_01E0) }, + I2C_DEVICE_ID_GOODIX_01E8) }, /* GoodTouch panels */ { .driver_data = MT_CLS_NSMU, From 6bcb8a5b70b80143db9bf12dfa7d53636f824d53 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Wed, 22 Jan 2025 10:38:30 -0700 Subject: [PATCH 36/51] vfio/platform: check the bounds of read/write syscalls MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit ce9ff21ea89d191e477a02ad7eabf4f996b80a69 upstream. count and offset are passed from user space and not checked, only offset is capped to 40 bits, which can be used to read/write out of bounds of the device. Fixes: 6e3f26456009 (“vfio/platform: read and write support for the device fd”) Cc: stable@vger.kernel.org Reported-by: Mostafa Saleh Reviewed-by: Eric Auger Reviewed-by: Mostafa Saleh Tested-by: Mostafa Saleh Signed-off-by: Alex Williamson Signed-off-by: Greg Kroah-Hartman --- drivers/vfio/platform/vfio_platform_common.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/vfio/platform/vfio_platform_common.c b/drivers/vfio/platform/vfio_platform_common.c index 1a0a238ffa35..af432b5b11ef 100644 --- a/drivers/vfio/platform/vfio_platform_common.c +++ b/drivers/vfio/platform/vfio_platform_common.c @@ -391,6 +391,11 @@ static ssize_t vfio_platform_read_mmio(struct vfio_platform_region *reg, { unsigned int done = 0; + if (off >= reg->size) + return -EINVAL; + + count = min_t(size_t, count, reg->size - off); + if (!reg->ioaddr) { reg->ioaddr = ioremap(reg->addr, reg->size); @@ -470,6 +475,11 @@ static ssize_t vfio_platform_write_mmio(struct vfio_platform_region *reg, { unsigned int done = 0; + if (off >= reg->size) + return -EINVAL; + + count = min_t(size_t, count, reg->size - off); + if (!reg->ioaddr) { reg->ioaddr = ioremap(reg->addr, reg->size); From 13ea9547763a0488a90ff37cdf52ec85e36ea344 Mon Sep 17 00:00:00 2001 From: "Luis Henriques (SUSE)" Date: Thu, 18 Jul 2024 10:43:56 +0100 Subject: [PATCH 37/51] ext4: fix access to uninitialised lock in fc replay path commit 23dfdb56581ad92a9967bcd720c8c23356af74c1 upstream. The following kernel trace can be triggered with fstest generic/629 when executed against a filesystem with fast-commit feature enabled: INFO: trying to register non-static key. The code is fine but needs lockdep annotation, or maybe you didn't initialize this object before use? turning off the locking correctness validator. CPU: 0 PID: 866 Comm: mount Not tainted 6.10.0+ #11 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-prebuilt.qemu.org 04/01/2014 Call Trace: dump_stack_lvl+0x66/0x90 register_lock_class+0x759/0x7d0 __lock_acquire+0x85/0x2630 ? __find_get_block+0xb4/0x380 lock_acquire+0xd1/0x2d0 ? __ext4_journal_get_write_access+0xd5/0x160 _raw_spin_lock+0x33/0x40 ? __ext4_journal_get_write_access+0xd5/0x160 __ext4_journal_get_write_access+0xd5/0x160 ext4_reserve_inode_write+0x61/0xb0 __ext4_mark_inode_dirty+0x79/0x270 ? ext4_ext_replay_set_iblocks+0x2f8/0x450 ext4_ext_replay_set_iblocks+0x330/0x450 ext4_fc_replay+0x14c8/0x1540 ? jread+0x88/0x2e0 ? rcu_is_watching+0x11/0x40 do_one_pass+0x447/0xd00 jbd2_journal_recover+0x139/0x1b0 jbd2_journal_load+0x96/0x390 ext4_load_and_init_journal+0x253/0xd40 ext4_fill_super+0x2cc6/0x3180 ... In the replay path there's an attempt to lock sbi->s_bdev_wb_lock in function ext4_check_bdev_write_error(). Unfortunately, at this point this spinlock has not been initialized yet. Moving it's initialization to an earlier point in __ext4_fill_super() fixes this splat. Signed-off-by: Luis Henriques (SUSE) Link: https://patch.msgid.link/20240718094356.7863-1-luis.henriques@linux.dev Signed-off-by: Theodore Ts'o Cc: stable@kernel.org Signed-off-by: Bruno VERNAY Signed-off-by: Victor Giraud Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0b2591c07166..53f1deb049ec 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -5264,6 +5264,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + spin_lock_init(&sbi->s_bdev_wb_lock); + ext4_fast_commit_init(sb); sb->s_root = NULL; @@ -5514,7 +5516,6 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) * Save the original bdev mapping's wb_err value which could be * used to detect the metadata async write error. */ - spin_lock_init(&sbi->s_bdev_wb_lock); errseq_check_and_advance(&sb->s_bdev->bd_inode->i_mapping->wb_err, &sbi->s_bdev_wb_err); sb->s_bdev->bd_super = sb; From ce11424026cbf87d5861b09e5e33565ff7f2ec8d Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Wed, 23 Oct 2024 15:30:09 +0300 Subject: [PATCH 38/51] ipv4: ip_tunnel: Fix suspicious RCU usage warning in ip_tunnel_find() commit 90e0569dd3d32f4f4d2ca691d3fa5a8a14a13c12 upstream. The per-netns IP tunnel hash table is protected by the RTNL mutex and ip_tunnel_find() is only called from the control path where the mutex is taken. Add a lockdep expression to hlist_for_each_entry_rcu() in ip_tunnel_find() in order to validate that the mutex is held and to silence the suspicious RCU usage warning [1]. [1] WARNING: suspicious RCU usage 6.12.0-rc3-custom-gd95d9a31aceb #139 Not tainted ----------------------------- net/ipv4/ip_tunnel.c:221 RCU-list traversed in non-reader section!! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by ip/362: #0: ffffffff86fc7cb0 (rtnl_mutex){+.+.}-{3:3}, at: rtnetlink_rcv_msg+0x377/0xf60 stack backtrace: CPU: 12 UID: 0 PID: 362 Comm: ip Not tainted 6.12.0-rc3-custom-gd95d9a31aceb #139 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack_lvl+0xba/0x110 lockdep_rcu_suspicious.cold+0x4f/0xd6 ip_tunnel_find+0x435/0x4d0 ip_tunnel_newlink+0x517/0x7a0 ipgre_newlink+0x14c/0x170 __rtnl_newlink+0x1173/0x19c0 rtnl_newlink+0x6c/0xa0 rtnetlink_rcv_msg+0x3cc/0xf60 netlink_rcv_skb+0x171/0x450 netlink_unicast+0x539/0x7f0 netlink_sendmsg+0x8c1/0xd80 ____sys_sendmsg+0x8f9/0xc20 ___sys_sendmsg+0x197/0x1e0 __sys_sendmsg+0x122/0x1f0 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Fixes: c54419321455 ("GRE: Refactor GRE tunneling code.") Suggested-by: Eric Dumazet Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241023123009.749764-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Alva Lan Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_tunnel.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c index 67cabc40f1dc..73d7372afb43 100644 --- a/net/ipv4/ip_tunnel.c +++ b/net/ipv4/ip_tunnel.c @@ -218,7 +218,7 @@ static struct ip_tunnel *ip_tunnel_find(struct ip_tunnel_net *itn, struct ip_tunnel *t = NULL; struct hlist_head *head = ip_bucket(itn, parms); - hlist_for_each_entry_rcu(t, head, hash_node) { + hlist_for_each_entry_rcu(t, head, hash_node, lockdep_rtnl_is_held()) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && link == READ_ONCE(t->parms.link) && From 182a4b7c731e95c08cb47f14b87a272b6ab2b2da Mon Sep 17 00:00:00 2001 From: Easwar Hariharan Date: Tue, 7 Jan 2025 17:28:40 +0000 Subject: [PATCH 39/51] scsi: storvsc: Ratelimit warning logs to prevent VM denial of service commit d2138eab8cde61e0e6f62d0713e45202e8457d6d upstream. If there's a persistent error in the hypervisor, the SCSI warning for failed I/O can flood the kernel log and max out CPU utilization, preventing troubleshooting from the VM side. Ratelimit the warning so it doesn't DoS the VM. Closes: https://github.com/microsoft/WSL/issues/9173 Signed-off-by: Easwar Hariharan Link: https://lore.kernel.org/r/20250107-eahariha-ratelimit-storvsc-v1-1-7fc193d1f2b0@linux.microsoft.com Reviewed-by: Michael Kelley Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/storvsc_drv.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 0685cbe7f0eb..d47adab00f04 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -171,6 +171,12 @@ do { \ dev_warn(&(dev)->device, fmt, ##__VA_ARGS__); \ } while (0) +#define storvsc_log_ratelimited(dev, level, fmt, ...) \ +do { \ + if (do_logging(level)) \ + dev_warn_ratelimited(&(dev)->device, fmt, ##__VA_ARGS__); \ +} while (0) + struct vmscsi_request { u16 length; u8 srb_status; @@ -1168,7 +1174,7 @@ static void storvsc_on_io_completion(struct storvsc_device *stor_device, int loglevel = (stor_pkt->vm_srb.cdb[0] == TEST_UNIT_READY) ? STORVSC_LOGGING_WARN : STORVSC_LOGGING_ERROR; - storvsc_log(device, loglevel, + storvsc_log_ratelimited(device, loglevel, "tag#%d cmd 0x%x status: scsi 0x%x srb 0x%x hv 0x%x\n", scsi_cmd_to_rq(request->cmd)->tag, stor_pkt->vm_srb.cdb[0], From 49a27ee475a72ea554812080341539d54005e840 Mon Sep 17 00:00:00 2001 From: Anjaneyulu Date: Wed, 14 Jun 2023 12:41:37 +0300 Subject: [PATCH 40/51] wifi: iwlwifi: add a few rate index validity checks commit efbe8f81952fe469d38655744627d860879dcde8 upstream. Validate index before access iwl_rate_mcs to keep rate->index inside the valid boundaries. Use MCS_0_INDEX if index is less than MCS_0_INDEX and MCS_9_INDEX if index is greater then MCS_9_INDEX. Signed-off-by: Anjaneyulu Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20230614123447.79f16b3aef32.If1137f894775d6d07b78cbf3a6163ffce6399507@changeid Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/intel/iwlwifi/dvm/rs.c | 9 ++++++--- drivers/net/wireless/intel/iwlwifi/mvm/rs.c | 9 ++++++--- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/dvm/rs.c b/drivers/net/wireless/intel/iwlwifi/dvm/rs.c index 687c906a9d72..4b1f006c105b 100644 --- a/drivers/net/wireless/intel/iwlwifi/dvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/dvm/rs.c @@ -2,7 +2,7 @@ /****************************************************************************** * * Copyright(c) 2005 - 2014 Intel Corporation. All rights reserved. - * Copyright (C) 2019 - 2020, 2022 Intel Corporation + * Copyright (C) 2019 - 2020, 2022 - 2023 Intel Corporation *****************************************************************************/ #include #include @@ -125,7 +125,7 @@ static int iwl_hwrate_to_plcp_idx(u32 rate_n_flags) return idx; } - return -1; + return IWL_RATE_INVALID; } static void rs_rate_scale_perform(struct iwl_priv *priv, @@ -3146,7 +3146,10 @@ static ssize_t rs_sta_dbgfs_scale_table_read(struct file *file, for (i = 0; i < LINK_QUAL_MAX_RETRY_NUM; i++) { index = iwl_hwrate_to_plcp_idx( le32_to_cpu(lq_sta->lq.rs_table[i].rate_n_flags)); - if (is_legacy(tbl->lq_type)) { + if (index == IWL_RATE_INVALID) { + desc += sprintf(buff + desc, " rate[%d] 0x%X invalid rate\n", + i, le32_to_cpu(lq_sta->lq.rs_table[i].rate_n_flags)); + } else if (is_legacy(tbl->lq_type)) { desc += sprintf(buff+desc, " rate[%d] 0x%X %smbps\n", i, le32_to_cpu(lq_sta->lq.rs_table[i].rate_n_flags), iwl_rate_mcs[index].mbps); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c index 2be6801d48ac..c5c24c6ffde3 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rs.c @@ -1072,10 +1072,13 @@ static void rs_get_lower_rate_down_column(struct iwl_lq_sta *lq_sta, rate->bw = RATE_MCS_CHAN_WIDTH_20; - WARN_ON_ONCE(rate->index < IWL_RATE_MCS_0_INDEX || - rate->index > IWL_RATE_MCS_9_INDEX); + if (WARN_ON_ONCE(rate->index < IWL_RATE_MCS_0_INDEX)) + rate->index = rs_ht_to_legacy[IWL_RATE_MCS_0_INDEX]; + else if (WARN_ON_ONCE(rate->index > IWL_RATE_MCS_9_INDEX)) + rate->index = rs_ht_to_legacy[IWL_RATE_MCS_9_INDEX]; + else + rate->index = rs_ht_to_legacy[rate->index]; - rate->index = rs_ht_to_legacy[rate->index]; rate->ldpc = false; } else { /* Downgrade to SISO with same MCS if in MIMO */ From bce966530fd5542bbb422cb45ecb775f7a1a6bc3 Mon Sep 17 00:00:00 2001 From: Enzo Matsumiya Date: Thu, 26 Sep 2024 14:46:13 -0300 Subject: [PATCH 41/51] smb: client: fix UAF in async decryption commit b0abcd65ec545701b8793e12bc27dc98042b151a upstream. Doing an async decryption (large read) crashes with a slab-use-after-free way down in the crypto API. Reproducer: # mount.cifs -o ...,seal,esize=1 //srv/share /mnt # dd if=/mnt/largefile of=/dev/null ... [ 194.196391] ================================================================== [ 194.196844] BUG: KASAN: slab-use-after-free in gf128mul_4k_lle+0xc1/0x110 [ 194.197269] Read of size 8 at addr ffff888112bd0448 by task kworker/u77:2/899 [ 194.197707] [ 194.197818] CPU: 12 UID: 0 PID: 899 Comm: kworker/u77:2 Not tainted 6.11.0-lku-00028-gfca3ca14a17a-dirty #43 [ 194.198400] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-3-gd478f380-prebuilt.qemu.org 04/01/2014 [ 194.199046] Workqueue: smb3decryptd smb2_decrypt_offload [cifs] [ 194.200032] Call Trace: [ 194.200191] [ 194.200327] dump_stack_lvl+0x4e/0x70 [ 194.200558] ? gf128mul_4k_lle+0xc1/0x110 [ 194.200809] print_report+0x174/0x505 [ 194.201040] ? __pfx__raw_spin_lock_irqsave+0x10/0x10 [ 194.201352] ? srso_return_thunk+0x5/0x5f [ 194.201604] ? __virt_addr_valid+0xdf/0x1c0 [ 194.201868] ? gf128mul_4k_lle+0xc1/0x110 [ 194.202128] kasan_report+0xc8/0x150 [ 194.202361] ? gf128mul_4k_lle+0xc1/0x110 [ 194.202616] gf128mul_4k_lle+0xc1/0x110 [ 194.202863] ghash_update+0x184/0x210 [ 194.203103] shash_ahash_update+0x184/0x2a0 [ 194.203377] ? __pfx_shash_ahash_update+0x10/0x10 [ 194.203651] ? srso_return_thunk+0x5/0x5f [ 194.203877] ? crypto_gcm_init_common+0x1ba/0x340 [ 194.204142] gcm_hash_assoc_remain_continue+0x10a/0x140 [ 194.204434] crypt_message+0xec1/0x10a0 [cifs] [ 194.206489] ? __pfx_crypt_message+0x10/0x10 [cifs] [ 194.208507] ? srso_return_thunk+0x5/0x5f [ 194.209205] ? srso_return_thunk+0x5/0x5f [ 194.209925] ? srso_return_thunk+0x5/0x5f [ 194.210443] ? srso_return_thunk+0x5/0x5f [ 194.211037] decrypt_raw_data+0x15f/0x250 [cifs] [ 194.212906] ? __pfx_decrypt_raw_data+0x10/0x10 [cifs] [ 194.214670] ? srso_return_thunk+0x5/0x5f [ 194.215193] smb2_decrypt_offload+0x12a/0x6c0 [cifs] This is because TFM is being used in parallel. Fix this by allocating a new AEAD TFM for async decryption, but keep the existing one for synchronous READ cases (similar to what is done in smb3_calc_signature()). Also remove the calls to aead_request_set_callback() and crypto_wait_req() since it's always going to be a synchronous operation. Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/client/smb2ops.c | 47 ++++++++++++++++++++++++----------------- fs/smb/client/smb2pdu.c | 6 ++++++ 2 files changed, 34 insertions(+), 19 deletions(-) diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 62935d61192a..4b9cd9893ac6 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -4488,7 +4488,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) */ static int crypt_message(struct TCP_Server_Info *server, int num_rqst, - struct smb_rqst *rqst, int enc) + struct smb_rqst *rqst, int enc, struct crypto_aead *tfm) { struct smb2_transform_hdr *tr_hdr = (struct smb2_transform_hdr *)rqst[0].rq_iov[0].iov_base; @@ -4499,8 +4499,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, u8 key[SMB3_ENC_DEC_KEY_SIZE]; struct aead_request *req; u8 *iv; - DECLARE_CRYPTO_WAIT(wait); - struct crypto_aead *tfm; unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; @@ -4511,14 +4509,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - rc = smb3_crypto_aead_allocate(server); - if (rc) { - cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__); - return rc; - } - - tfm = enc ? server->secmech.enc : server->secmech.dec; - if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) rc = crypto_aead_setkey(tfm, key, SMB3_GCM256_CRYPTKEY_SIZE); @@ -4557,11 +4547,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, aead_request_set_crypt(req, sg, sg, crypt_len, iv); aead_request_set_ad(req, assoc_data_len); - aead_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, - crypto_req_done, &wait); - - rc = crypto_wait_req(enc ? crypto_aead_encrypt(req) - : crypto_aead_decrypt(req), &wait); + rc = enc ? crypto_aead_encrypt(req) : crypto_aead_decrypt(req); if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); @@ -4650,7 +4636,7 @@ smb3_init_transform_rq(struct TCP_Server_Info *server, int num_rqst, /* fill the 1st iov with a transform header */ fill_transform_hdr(tr_hdr, orig_len, old_rq, server->cipher_type); - rc = crypt_message(server, num_rqst, new_rq, 1); + rc = crypt_message(server, num_rqst, new_rq, 1, server->secmech.enc); cifs_dbg(FYI, "Encrypt message returned %d\n", rc); if (rc) goto err_free; @@ -4676,8 +4662,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, unsigned int npages, unsigned int page_data_size, bool is_offloaded) { - struct kvec iov[2]; + struct crypto_aead *tfm; struct smb_rqst rqst = {NULL}; + struct kvec iov[2]; int rc; iov[0].iov_base = buf; @@ -4692,9 +4679,31 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, rqst.rq_pagesz = PAGE_SIZE; rqst.rq_tailsz = (page_data_size % PAGE_SIZE) ? : PAGE_SIZE; - rc = crypt_message(server, 1, &rqst, 0); + if (is_offloaded) { + if ((server->cipher_type == SMB2_ENCRYPTION_AES128_GCM) || + (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) + tfm = crypto_alloc_aead("gcm(aes)", 0, 0); + else + tfm = crypto_alloc_aead("ccm(aes)", 0, 0); + if (IS_ERR(tfm)) { + rc = PTR_ERR(tfm); + cifs_server_dbg(VFS, "%s: Failed alloc decrypt TFM, rc=%d\n", __func__, rc); + + return rc; + } + } else { + if (unlikely(!server->secmech.dec)) + return -EIO; + + tfm = server->secmech.dec; + } + + rc = crypt_message(server, 1, &rqst, 0, tfm); cifs_dbg(FYI, "Decrypt message returned %d\n", rc); + if (is_offloaded) + crypto_free_aead(tfm); + if (rc) return rc; diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 9975711236b2..ae38ba7f1966 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1105,6 +1105,12 @@ SMB2_negotiate(const unsigned int xid, else cifs_server_dbg(VFS, "Missing expected negotiate contexts\n"); } + + if (server->cipher_type && !rc) { + rc = smb3_crypto_aead_allocate(server); + if (rc) + cifs_server_dbg(VFS, "%s: crypto alloc failed, rc=%d\n", __func__, rc); + } neg_exit: free_rsp_buf(resp_buftype, rsp); return rc; From 4b9b41fabcd38990f69ef0cee9c631d954a2b530 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Mon, 13 Jan 2025 18:00:34 +0000 Subject: [PATCH 42/51] USB: serial: quatech2: fix null-ptr-deref in qt2_process_read_urb() commit 575a5adf48b06a2980c9eeffedf699ed5534fade upstream. This patch addresses a null-ptr-deref in qt2_process_read_urb() due to an incorrect bounds check in the following: if (newport > serial->num_ports) { dev_err(&port->dev, "%s - port change to invalid port: %i\n", __func__, newport); break; } The condition doesn't account for the valid range of the serial->port buffer, which is from 0 to serial->num_ports - 1. When newport is equal to serial->num_ports, the assignment of "port" in the following code is out-of-bounds and NULL: serial_priv->current_port = newport; port = serial->port[serial_priv->current_port]; The fix checks if newport is greater than or equal to serial->num_ports indicating it is out-of-bounds. Reported-by: syzbot Closes: https://syzkaller.appspot.com/bug?extid=506479ebf12fe435d01a Fixes: f7a33e608d9a ("USB: serial: add quatech2 usb to serial driver") Cc: # 3.5 Signed-off-by: Qasim Ijaz Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/quatech2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/quatech2.c b/drivers/usb/serial/quatech2.c index 6fca40ace83a..995313f4687a 100644 --- a/drivers/usb/serial/quatech2.c +++ b/drivers/usb/serial/quatech2.c @@ -511,7 +511,7 @@ static void qt2_process_read_urb(struct urb *urb) newport = *(ch + 3); - if (newport > serial->num_ports) { + if (newport >= serial->num_ports) { dev_err(&port->dev, "%s - port change to invalid port: %i\n", __func__, newport); From 33233b06ad15730d0463e8f152db2eca15c7f498 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 17 Jan 2025 09:17:12 +0100 Subject: [PATCH 43/51] Revert "usb: gadget: u_serial: Disable ep before setting port to null to fix the crash caused by port being null" commit 086fd062bc3883ae1ce4166cff5355db315ad879 upstream. This reverts commit 13014969cbf07f18d62ceea40bd8ca8ec9d36cec. It is reported to cause crashes on Tegra systems, so revert it for now. Link: https://lore.kernel.org/r/1037c1ad-9230-4181-b9c3-167dbaa47644@nvidia.com Reported-by: Jon Hunter Cc: stable Cc: Lianqin Hu Link: https://lore.kernel.org/r/2025011711-yippee-fever-a737@gregkh Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/u_serial.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/function/u_serial.c b/drivers/usb/gadget/function/u_serial.c index 35f1b8a0b506..8802dd53fbea 100644 --- a/drivers/usb/gadget/function/u_serial.c +++ b/drivers/usb/gadget/function/u_serial.c @@ -1393,10 +1393,6 @@ void gserial_disconnect(struct gserial *gser) /* REVISIT as above: how best to track this? */ port->port_line_coding = gser->port_line_coding; - /* disable endpoints, aborting down any active I/O */ - usb_ep_disable(gser->out); - usb_ep_disable(gser->in); - port->port_usb = NULL; gser->ioport = NULL; if (port->port.count > 0) { @@ -1408,6 +1404,10 @@ void gserial_disconnect(struct gserial *gser) spin_unlock(&port->port_lock); spin_unlock_irqrestore(&serial_port_lock, flags); + /* disable endpoints, aborting down any active I/O */ + usb_ep_disable(gser->out); + usb_ep_disable(gser->in); + /* finally, free any unused/unusable I/O buffers */ spin_lock_irqsave(&port->port_lock, flags); if (port->port.count == 0) From 4631653d8da880582a43a86e79ad5f95dd8bf392 Mon Sep 17 00:00:00 2001 From: Lianqin Hu Date: Wed, 15 Jan 2025 09:32:35 +0000 Subject: [PATCH 44/51] ALSA: usb-audio: Add delay quirk for USB Audio Device commit ad5b205f9e022b407d91f952faddd05718be2866 upstream. Audio control requests that sets sampling frequency sometimes fail on this card. Adding delay between control messages eliminates that problem. usb 1-1: New USB device found, idVendor=0d8c, idProduct=0014 usb 1-1: New USB device strings: Mfr=1, Product=2, SerialNumber=0 usb 1-1: Product: USB Audio Device usb 1-1: Manufacturer: C-Media Electronics Inc. Signed-off-by: Lianqin Hu Cc: Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/TYUPR06MB6217E94D922B9BF422A73F32D2192@TYUPR06MB6217.apcprd06.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 6525b02af1b0..219fa6ff14bd 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2137,6 +2137,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0c45, 0x6340, /* Sonix HD USB Camera */ QUIRK_FLAG_GET_SAMPLE_RATE), + DEVICE_FLG(0x0d8c, 0x0014, /* USB Audio Device */ + QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */ QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0ecb, 0x2069, /* JBL Quantum810 Wireless */ From dd00051871f942b4b418bbdd0aceb6eaa0425f90 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Mon, 20 Jan 2025 20:24:08 -0800 Subject: [PATCH 45/51] Input: atkbd - map F23 key to support default copilot shortcut commit 907bc9268a5a9f823ffa751957a5c1dd59f83f42 upstream. Microsoft defined Meta+Shift+F23 as the Copilot shortcut instead of a dedicated keycode, and multiple vendors have their keyboards emit this sequence in response to users pressing a dedicated "Copilot" key. Unfortunately the default keymap table in atkbd does not map scancode 0x6e (F23) and so the key combination does not work even if userspace is ready to handle it. Because this behavior is common between multiple vendors and the scancode is currently unused map 0x6e to keycode 193 (KEY_F23) so that key sequence is generated properly. MS documentation for the scan code: https://learn.microsoft.com/en-us/windows/win32/inputdev/about-keyboard-input#scan-codes Confirmed on Lenovo, HP and Dell machines by Canonical. Tested on Lenovo T14s G6 AMD. Signed-off-by: Mark Pearson Link: https://lore.kernel.org/r/20250107034554.25843-1-mpearson-lenovo@squebb.ca Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/keyboard/atkbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/keyboard/atkbd.c b/drivers/input/keyboard/atkbd.c index 661d6c8b059b..b3a856333d4e 100644 --- a/drivers/input/keyboard/atkbd.c +++ b/drivers/input/keyboard/atkbd.c @@ -89,7 +89,7 @@ static const unsigned short atkbd_set2_keycode[ATKBD_KEYMAP_SIZE] = { 0, 46, 45, 32, 18, 5, 4, 95, 0, 57, 47, 33, 20, 19, 6,183, 0, 49, 48, 35, 34, 21, 7,184, 0, 0, 50, 36, 22, 8, 9,185, 0, 51, 37, 23, 24, 11, 10, 0, 0, 52, 53, 38, 39, 25, 12, 0, - 0, 89, 40, 0, 26, 13, 0, 0, 58, 54, 28, 27, 0, 43, 0, 85, + 0, 89, 40, 0, 26, 13, 0,193, 58, 54, 28, 27, 0, 43, 0, 85, 0, 86, 91, 90, 92, 0, 14, 94, 0, 79,124, 75, 71,121, 0, 0, 82, 83, 80, 76, 77, 72, 1, 69, 87, 78, 81, 74, 55, 73, 70, 99, From b336f5832634c66ba8a7af27c77964f451b09a34 Mon Sep 17 00:00:00 2001 From: Nilton Perim Neto Date: Fri, 17 Jan 2025 09:34:18 -0800 Subject: [PATCH 46/51] Input: xpad - add unofficial Xbox 360 wireless receiver clone commit e4940fe6322c851659c17852b671c6e7b1aa9f56 upstream. Although it mimics the Microsoft's VendorID, it is in fact a clone. Taking into account that the original Microsoft Receiver is not being manufactured anymore, this drive can solve dpad issues encontered by those who still use the original 360 Wireless controller but are using a receiver clone. Signed-off-by: Nilton Perim Neto Signed-off-by: Pavel Rojtberg Link: https://lore.kernel.org/r/20250107192830.414709-12-rojtberg@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 13c36f51b935..49ba09241313 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -145,6 +145,7 @@ static const struct xpad_device { { 0x045e, 0x028e, "Microsoft X-Box 360 pad", 0, XTYPE_XBOX360 }, { 0x045e, 0x028f, "Microsoft X-Box 360 pad v2", 0, XTYPE_XBOX360 }, { 0x045e, 0x0291, "Xbox 360 Wireless Receiver (XBOX)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, + { 0x045e, 0x02a9, "Xbox 360 Wireless Receiver (Unofficial)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, { 0x045e, 0x02d1, "Microsoft X-Box One pad", 0, XTYPE_XBOXONE }, { 0x045e, 0x02dd, "Microsoft X-Box One pad (Firmware 2015)", 0, XTYPE_XBOXONE }, { 0x045e, 0x02e3, "Microsoft X-Box One Elite pad", MAP_PADDLES, XTYPE_XBOXONE }, From 4982cc83dac301f3d544fbaaf8afced38e1d99f3 Mon Sep 17 00:00:00 2001 From: Jack Greiner Date: Fri, 17 Jan 2025 16:51:58 -0800 Subject: [PATCH 47/51] Input: xpad - add support for wooting two he (arm) commit 222f3390c15c4452a9f7e26f5b7d9138e75d00d5 upstream. Add Wooting Two HE (ARM) to the list of supported devices. Signed-off-by: Jack Greiner Signed-off-by: Pavel Rojtberg Link: https://lore.kernel.org/r/20250107192830.414709-3-rojtberg@gmail.com Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/joystick/xpad.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index 49ba09241313..872381221e75 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -367,6 +367,7 @@ static const struct xpad_device { { 0x31e3, 0x1200, "Wooting Two", 0, XTYPE_XBOX360 }, { 0x31e3, 0x1210, "Wooting Lekker", 0, XTYPE_XBOX360 }, { 0x31e3, 0x1220, "Wooting Two HE", 0, XTYPE_XBOX360 }, + { 0x31e3, 0x1230, "Wooting Two HE (ARM)", 0, XTYPE_XBOX360 }, { 0x31e3, 0x1300, "Wooting 60HE (AVR)", 0, XTYPE_XBOX360 }, { 0x31e3, 0x1310, "Wooting 60HE (ARM)", 0, XTYPE_XBOX360 }, { 0x3285, 0x0607, "Nacon GC-100", 0, XTYPE_XBOX360 }, From 44c495818d9c4a741ab9e6bc9203ccc9f55f6f40 Mon Sep 17 00:00:00 2001 From: Paulo Alcantara Date: Mon, 25 Nov 2024 17:17:23 -0300 Subject: [PATCH 48/51] smb: client: fix NULL ptr deref in crypto_aead_setkey() commit 4bdec0d1f658f7c98749bd2c5a486e6cfa8565d2 upstream. Neither SMB3.0 or SMB3.02 supports encryption negotiate context, so when SMB2_GLOBAL_CAP_ENCRYPTION flag is set in the negotiate response, the client uses AES-128-CCM as the default cipher. See MS-SMB2 3.3.5.4. Commit b0abcd65ec54 ("smb: client: fix UAF in async decryption") added a @server->cipher_type check to conditionally call smb3_crypto_aead_allocate(), but that check would always be false as @server->cipher_type is unset for SMB3.02. Fix the following KASAN splat by setting @server->cipher_type for SMB3.02 as well. mount.cifs //srv/share /mnt -o vers=3.02,seal,... BUG: KASAN: null-ptr-deref in crypto_aead_setkey+0x2c/0x130 Read of size 8 at addr 0000000000000020 by task mount.cifs/1095 CPU: 1 UID: 0 PID: 1095 Comm: mount.cifs Not tainted 6.12.0 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-3.fc41 04/01/2014 Call Trace: dump_stack_lvl+0x5d/0x80 ? crypto_aead_setkey+0x2c/0x130 kasan_report+0xda/0x110 ? crypto_aead_setkey+0x2c/0x130 crypto_aead_setkey+0x2c/0x130 crypt_message+0x258/0xec0 [cifs] ? __asan_memset+0x23/0x50 ? __pfx_crypt_message+0x10/0x10 [cifs] ? mark_lock+0xb0/0x6a0 ? hlock_class+0x32/0xb0 ? mark_lock+0xb0/0x6a0 smb3_init_transform_rq+0x352/0x3f0 [cifs] ? lock_acquire.part.0+0xf4/0x2a0 smb_send_rqst+0x144/0x230 [cifs] ? __pfx_smb_send_rqst+0x10/0x10 [cifs] ? hlock_class+0x32/0xb0 ? smb2_setup_request+0x225/0x3a0 [cifs] ? __pfx_cifs_compound_last_callback+0x10/0x10 [cifs] compound_send_recv+0x59b/0x1140 [cifs] ? __pfx_compound_send_recv+0x10/0x10 [cifs] ? __create_object+0x5e/0x90 ? hlock_class+0x32/0xb0 ? do_raw_spin_unlock+0x9a/0xf0 cifs_send_recv+0x23/0x30 [cifs] SMB2_tcon+0x3ec/0xb30 [cifs] ? __pfx_SMB2_tcon+0x10/0x10 [cifs] ? lock_acquire.part.0+0xf4/0x2a0 ? __pfx_lock_release+0x10/0x10 ? do_raw_spin_trylock+0xc6/0x120 ? lock_acquire+0x3f/0x90 ? _get_xid+0x16/0xd0 [cifs] ? __pfx_SMB2_tcon+0x10/0x10 [cifs] ? cifs_get_smb_ses+0xcdd/0x10a0 [cifs] cifs_get_smb_ses+0xcdd/0x10a0 [cifs] ? __pfx_cifs_get_smb_ses+0x10/0x10 [cifs] ? cifs_get_tcp_session+0xaa0/0xca0 [cifs] cifs_mount_get_session+0x8a/0x210 [cifs] dfs_mount_share+0x1b0/0x11d0 [cifs] ? __pfx___lock_acquire+0x10/0x10 ? __pfx_dfs_mount_share+0x10/0x10 [cifs] ? lock_acquire.part.0+0xf4/0x2a0 ? find_held_lock+0x8a/0xa0 ? hlock_class+0x32/0xb0 ? lock_release+0x203/0x5d0 cifs_mount+0xb3/0x3d0 [cifs] ? do_raw_spin_trylock+0xc6/0x120 ? __pfx_cifs_mount+0x10/0x10 [cifs] ? lock_acquire+0x3f/0x90 ? find_nls+0x16/0xa0 ? smb3_update_mnt_flags+0x372/0x3b0 [cifs] cifs_smb3_do_mount+0x1e2/0xc80 [cifs] ? __pfx_vfs_parse_fs_string+0x10/0x10 ? __pfx_cifs_smb3_do_mount+0x10/0x10 [cifs] smb3_get_tree+0x1bf/0x330 [cifs] vfs_get_tree+0x4a/0x160 path_mount+0x3c1/0xfb0 ? kasan_quarantine_put+0xc7/0x1d0 ? __pfx_path_mount+0x10/0x10 ? kmem_cache_free+0x118/0x3e0 ? user_path_at+0x74/0xa0 __x64_sys_mount+0x1a6/0x1e0 ? __pfx___x64_sys_mount+0x10/0x10 ? mark_held_locks+0x1a/0x90 do_syscall_64+0xbb/0x1d0 entry_SYSCALL_64_after_hwframe+0x77/0x7f Cc: Tom Talpey Reported-by: Jianhong Yin Cc: stable@vger.kernel.org # v6.12 Fixes: b0abcd65ec54 ("smb: client: fix UAF in async decryption") Signed-off-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/client/smb2pdu.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index ae38ba7f1966..bfe7b03307d4 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -1070,7 +1070,9 @@ SMB2_negotiate(const unsigned int xid, * SMB3.0 supports only 1 cipher and doesn't have a encryption neg context * Set the cipher type manually. */ - if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + if ((server->dialect == SMB30_PROT_ID || + server->dialect == SMB302_PROT_ID) && + (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, From 7d06d97e235f0315db6a4bd668352fc3247c5a83 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Wed, 9 Aug 2023 12:04:46 +0200 Subject: [PATCH 49/51] ASoC: samsung: midas_wm1811: Fix 'Headphone Switch' control creation commit 48c6253fefa38556e0c5c2942edd9181529407e4 upstream. 'Headphone Switch' control is already registered from sound/soc/codecs/wm_hubs.c:479, so duplicating it in midas_wm1811 causes following probe failure: midas-audio sound: control 2:0:0:Headphone Switch:0 is already present midas-audio sound: ASoC: Failed to add Headphone Switch: -16 midas-audio sound: Failed to register card: -16 midas-audio: probe of sound failed with error -16 Fix this by dropping duplicated control. Fixes: d27224a45e54 ("ASoC: samsung: midas_wm1811: Map missing jack kcontrols") Signed-off-by: Marek Szyprowski Link: https://lore.kernel.org/r/20230809100446.2105825-1-m.szyprowski@samsung.com Signed-off-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- sound/soc/samsung/midas_wm1811.c | 1 - 1 file changed, 1 deletion(-) diff --git a/sound/soc/samsung/midas_wm1811.c b/sound/soc/samsung/midas_wm1811.c index 44b32f5cddca..126098fdcf1b 100644 --- a/sound/soc/samsung/midas_wm1811.c +++ b/sound/soc/samsung/midas_wm1811.c @@ -257,7 +257,6 @@ static const struct snd_kcontrol_new midas_controls[] = { SOC_DAPM_PIN_SWITCH("Main Mic"), SOC_DAPM_PIN_SWITCH("Sub Mic"), SOC_DAPM_PIN_SWITCH("Headset Mic"), - SOC_DAPM_PIN_SWITCH("Headphone"), SOC_DAPM_PIN_SWITCH("FM In"), }; From a9401cd5d1bb5a0b8d2bef09623ca43551cd6e8a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Wed, 22 Jan 2025 22:24:03 -0300 Subject: [PATCH 50/51] drm/v3d: Assign job pointer to NULL before signaling the fence MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6e64d6b3a3c39655de56682ec83e894978d23412 upstream. In commit e4b5ccd392b9 ("drm/v3d: Ensure job pointer is set to NULL after job completion"), we introduced a change to assign the job pointer to NULL after completing a job, indicating job completion. However, this approach created a race condition between the DRM scheduler workqueue and the IRQ execution thread. As soon as the fence is signaled in the IRQ execution thread, a new job starts to be executed. This results in a race condition where the IRQ execution thread sets the job pointer to NULL simultaneously as the `run_job()` function assigns a new job to the pointer. This race condition can lead to a NULL pointer dereference if the IRQ execution thread sets the job pointer to NULL after `run_job()` assigns it to the new job. When the new job completes and the GPU emits an interrupt, `v3d_irq()` is triggered, potentially causing a crash. [ 466.310099] Unable to handle kernel NULL pointer dereference at virtual address 00000000000000c0 [ 466.318928] Mem abort info: [ 466.321723] ESR = 0x0000000096000005 [ 466.325479] EC = 0x25: DABT (current EL), IL = 32 bits [ 466.330807] SET = 0, FnV = 0 [ 466.333864] EA = 0, S1PTW = 0 [ 466.337010] FSC = 0x05: level 1 translation fault [ 466.341900] Data abort info: [ 466.344783] ISV = 0, ISS = 0x00000005, ISS2 = 0x00000000 [ 466.350285] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 466.355350] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 466.360677] user pgtable: 4k pages, 39-bit VAs, pgdp=0000000089772000 [ 466.367140] [00000000000000c0] pgd=0000000000000000, p4d=0000000000000000, pud=0000000000000000 [ 466.375875] Internal error: Oops: 0000000096000005 [#1] PREEMPT SMP [ 466.382163] Modules linked in: rfcomm snd_seq_dummy snd_hrtimer snd_seq snd_seq_device algif_hash algif_skcipher af_alg bnep binfmt_misc vc4 snd_soc_hdmi_codec drm_display_helper cec brcmfmac_wcc spidev rpivid_hevc(C) drm_client_lib brcmfmac hci_uart drm_dma_helper pisp_be btbcm brcmutil snd_soc_core aes_ce_blk v4l2_mem2mem bluetooth aes_ce_cipher snd_compress videobuf2_dma_contig ghash_ce cfg80211 gf128mul snd_pcm_dmaengine videobuf2_memops ecdh_generic sha2_ce ecc videobuf2_v4l2 snd_pcm v3d sha256_arm64 rfkill videodev snd_timer sha1_ce libaes gpu_sched snd videobuf2_common sha1_generic drm_shmem_helper mc rp1_pio drm_kms_helper raspberrypi_hwmon spi_bcm2835 gpio_keys i2c_brcmstb rp1 raspberrypi_gpiomem rp1_mailbox rp1_adc nvmem_rmem uio_pdrv_genirq uio i2c_dev drm ledtrig_pattern drm_panel_orientation_quirks backlight fuse dm_mod ip_tables x_tables ipv6 [ 466.458429] CPU: 0 UID: 1000 PID: 2008 Comm: chromium Tainted: G C 6.13.0-v8+ #18 [ 466.467336] Tainted: [C]=CRAP [ 466.470306] Hardware name: Raspberry Pi 5 Model B Rev 1.0 (DT) [ 466.476157] pstate: 404000c9 (nZcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 466.483143] pc : v3d_irq+0x118/0x2e0 [v3d] [ 466.487258] lr : __handle_irq_event_percpu+0x60/0x228 [ 466.492327] sp : ffffffc080003ea0 [ 466.495646] x29: ffffffc080003ea0 x28: ffffff80c0c94200 x27: 0000000000000000 [ 466.502807] x26: ffffffd08dd81d7b x25: ffffff80c0c94200 x24: ffffff8003bdc200 [ 466.509969] x23: 0000000000000001 x22: 00000000000000a7 x21: 0000000000000000 [ 466.517130] x20: ffffff8041bb0000 x19: 0000000000000001 x18: 0000000000000000 [ 466.524291] x17: ffffffafadfb0000 x16: ffffffc080000000 x15: 0000000000000000 [ 466.531452] x14: 0000000000000000 x13: 0000000000000000 x12: 0000000000000000 [ 466.538613] x11: 0000000000000000 x10: 0000000000000000 x9 : ffffffd08c527eb0 [ 466.545777] x8 : 0000000000000000 x7 : 0000000000000000 x6 : 0000000000000000 [ 466.552941] x5 : ffffffd08c4100d0 x4 : ffffffafadfb0000 x3 : ffffffc080003f70 [ 466.560102] x2 : ffffffc0829e8058 x1 : 0000000000000001 x0 : 0000000000000000 [ 466.567263] Call trace: [ 466.569711] v3d_irq+0x118/0x2e0 [v3d] (P) [ 466.573826] __handle_irq_event_percpu+0x60/0x228 [ 466.578546] handle_irq_event+0x54/0xb8 [ 466.582391] handle_fasteoi_irq+0xac/0x240 [ 466.586498] generic_handle_domain_irq+0x34/0x58 [ 466.591128] gic_handle_irq+0x48/0xd8 [ 466.594798] call_on_irq_stack+0x24/0x58 [ 466.598730] do_interrupt_handler+0x88/0x98 [ 466.602923] el0_interrupt+0x44/0xc0 [ 466.606508] __el0_irq_handler_common+0x18/0x28 [ 466.611050] el0t_64_irq_handler+0x10/0x20 [ 466.615156] el0t_64_irq+0x198/0x1a0 [ 466.618740] Code: 52800035 3607faf3 f9442e80 52800021 (f9406018) [ 466.624853] ---[ end trace 0000000000000000 ]--- [ 466.629483] Kernel panic - not syncing: Oops: Fatal exception in interrupt [ 466.636384] SMP: stopping secondary CPUs [ 466.640320] Kernel Offset: 0x100c400000 from 0xffffffc080000000 [ 466.646259] PHYS_OFFSET: 0x0 [ 466.649141] CPU features: 0x100,00000170,00901250,0200720b [ 466.654644] Memory Limit: none [ 466.657706] ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]--- Fix the crash by assigning the job pointer to NULL before signaling the fence. This ensures that the job pointer is cleared before any new job starts execution, preventing the race condition and the NULL pointer dereference crash. Cc: stable@vger.kernel.org Fixes: e4b5ccd392b9 ("drm/v3d: Ensure job pointer is set to NULL after job completion") Signed-off-by: Maíra Canal Reviewed-by: Jose Maria Casanova Crespo Reviewed-by: Iago Toral Quiroga Tested-by: Phil Elwell Link: https://patchwork.freedesktop.org/patch/msgid/20250123012403.20447-1-mcanal@igalia.com Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/v3d/v3d_irq.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_irq.c b/drivers/gpu/drm/v3d/v3d_irq.c index 76806039691a..b2d59a168697 100644 --- a/drivers/gpu/drm/v3d/v3d_irq.c +++ b/drivers/gpu/drm/v3d/v3d_irq.c @@ -102,8 +102,10 @@ v3d_irq(int irq, void *arg) to_v3d_fence(v3d->bin_job->base.irq_fence); trace_v3d_bcl_irq(&v3d->drm, fence->seqno); - dma_fence_signal(&fence->base); + v3d->bin_job = NULL; + dma_fence_signal(&fence->base); + status = IRQ_HANDLED; } @@ -112,8 +114,10 @@ v3d_irq(int irq, void *arg) to_v3d_fence(v3d->render_job->base.irq_fence); trace_v3d_rcl_irq(&v3d->drm, fence->seqno); - dma_fence_signal(&fence->base); + v3d->render_job = NULL; + dma_fence_signal(&fence->base); + status = IRQ_HANDLED; } @@ -122,8 +126,10 @@ v3d_irq(int irq, void *arg) to_v3d_fence(v3d->csd_job->base.irq_fence); trace_v3d_csd_irq(&v3d->drm, fence->seqno); - dma_fence_signal(&fence->base); + v3d->csd_job = NULL; + dma_fence_signal(&fence->base); + status = IRQ_HANDLED; } @@ -159,8 +165,10 @@ v3d_hub_irq(int irq, void *arg) to_v3d_fence(v3d->tfu_job->base.irq_fence); trace_v3d_tfu_irq(&v3d->drm, fence->seqno); - dma_fence_signal(&fence->base); + v3d->tfu_job = NULL; + dma_fence_signal(&fence->base); + status = IRQ_HANDLED; } From 0cbb5f65e52f3e66410a7fe0edf75e1b2bf41e80 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 1 Feb 2025 18:30:12 +0100 Subject: [PATCH 51/51] Linux 6.1.128 Link: https://lore.kernel.org/r/20250130140133.825446496@linuxfoundation.org Tested-by: Mark Brown Tested-by: Florian Fainelli Tested-by: Salvatore Bonaccorso Tested-by: Jon Hunter Tested-by: Pavel Machek (CIP) Tested-by: Ron Economos Tested-by: Linux Kernel Functional Testing Tested-by: kernelci.org bot Tested-by: Peter Schneider Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9c703cff00bb..1fc69d1f5b8c 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 127 +SUBLEVEL = 128 EXTRAVERSION = NAME = Curry Ramen