From b173204aadffe62d9654b4da0d99d9d2e39a9789 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 30 Apr 2025 11:05:54 +0300 Subject: [PATCH 001/529] dm: add missing unlock on in dm_keyslot_evict() commit 650266ac4c7230c89bcd1307acf5c9c92cfa85e2 upstream. We need to call dm_put_live_table() even if dm_get_live_table() returns NULL. Fixes: 9355a9eb21a5 ("dm: support key eviction from keyslot managers of underlying devices") Cc: stable@vger.kernel.org # v5.12+ Signed-off-by: Dan Carpenter Signed-off-by: Mikulas Patocka Signed-off-by: Greg Kroah-Hartman --- drivers/md/dm-table.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 492a3b38f552..a20cf54d12dc 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -1243,7 +1243,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, t = dm_get_live_table(md, &srcu_idx); if (!t) - return 0; + goto put_live_table; for (unsigned int i = 0; i < t->num_targets; i++) { struct dm_target *ti = dm_table_get_target(t, i); @@ -1254,6 +1254,7 @@ static int dm_keyslot_evict(struct blk_crypto_profile *profile, (void *)key); } +put_live_table: dm_put_live_table(md, srcu_idx); return 0; } From 9f9d1ddee895799194b52d1ae614c990b4c7f276 Mon Sep 17 00:00:00 2001 From: Wojciech Dubowik Date: Thu, 24 Apr 2025 11:59:14 +0200 Subject: [PATCH 002/529] arm64: dts: imx8mm-verdin: Link reg_usdhc2_vqmmc to usdhc2 commit 5591ce0069ddda97cdbbea596bed53e698f399c2 upstream. Define vqmmc regulator-gpio for usdhc2 with vin-supply coming from LDO5. Without this definition LDO5 will be powered down, disabling SD card after bootup. This has been introduced in commit f5aab0438ef1 ("regulator: pca9450: Fix enable register for LDO5"). Fixes: 6a57f224f734 ("arm64: dts: freescale: add initial support for verdin imx8m mini") Fixes: f5aab0438ef1 ("regulator: pca9450: Fix enable register for LDO5") Tested-by: Manuel Traut Reviewed-by: Philippe Schenker Tested-by: Francesco Dolcini Reviewed-by: Francesco Dolcini Cc: stable@vger.kernel.org Signed-off-by: Wojciech Dubowik Signed-off-by: Shawn Guo Signed-off-by: Greg Kroah-Hartman --- .../boot/dts/freescale/imx8mm-verdin.dtsi | 25 +++++++++++++++---- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi index a9bfcdf378ce..fb75800e8999 100644 --- a/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi +++ b/arch/arm64/boot/dts/freescale/imx8mm-verdin.dtsi @@ -144,6 +144,19 @@ startup-delay-us = <20000>; }; + reg_usdhc2_vqmmc: regulator-usdhc2-vqmmc { + compatible = "regulator-gpio"; + pinctrl-names = "default"; + pinctrl-0 = <&pinctrl_usdhc2_vsel>; + gpios = <&gpio1 4 GPIO_ACTIVE_HIGH>; + regulator-max-microvolt = <3300000>; + regulator-min-microvolt = <1800000>; + states = <1800000 0x1>, + <3300000 0x0>; + regulator-name = "PMIC_USDHC_VSELECT"; + vin-supply = <®_nvcc_sd>; + }; + reserved-memory { #address-cells = <2>; #size-cells = <2>; @@ -262,7 +275,7 @@ "SODIMM_19", "", "", - "", + "PMIC_USDHC_VSELECT", "", "", "", @@ -788,6 +801,7 @@ pinctrl-2 = <&pinctrl_usdhc2_200mhz>, <&pinctrl_usdhc2_cd>; pinctrl-3 = <&pinctrl_usdhc2_sleep>, <&pinctrl_usdhc2_cd_sleep>; vmmc-supply = <®_usdhc2_vmmc>; + vqmmc-supply = <®_usdhc2_vqmmc>; }; &wdog1 { @@ -1210,13 +1224,17 @@ ; /* SODIMM 76 */ }; + pinctrl_usdhc2_vsel: usdhc2vselgrp { + fsl,pins = + ; /* PMIC_USDHC_VSELECT */ + }; + /* * Note: Due to ERR050080 we use discrete external on-module resistors pulling-up to the * on-module +V3.3_1.8_SD (LDO5) rail and explicitly disable the internal pull-ups here. */ pinctrl_usdhc2: usdhc2grp { fsl,pins = - , , /* SODIMM 78 */ , /* SODIMM 74 */ , /* SODIMM 80 */ @@ -1227,7 +1245,6 @@ pinctrl_usdhc2_100mhz: usdhc2-100mhzgrp { fsl,pins = - , , , , @@ -1238,7 +1255,6 @@ pinctrl_usdhc2_200mhz: usdhc2-200mhzgrp { fsl,pins = - , , , , @@ -1250,7 +1266,6 @@ /* Avoid backfeeding with removed card power */ pinctrl_usdhc2_sleep: usdhc2slpgrp { fsl,pins = - , , , , From 0de5d055d4a881a23e3a4b20ceab5ffd7d202509 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 2 May 2025 16:13:46 +0200 Subject: [PATCH 003/529] can: mcan: m_can_class_unregister(): fix order of unregistration calls commit 0713a1b3276b98c7dafbeefef00d7bc3a9119a84 upstream. If a driver is removed, the driver framework invokes the driver's remove callback. A CAN driver's remove function calls unregister_candev(), which calls net_device_ops::ndo_stop further down in the call stack for interfaces which are in the "up" state. The removal of the module causes a warning, as can_rx_offload_del() deletes the NAPI, while it is still active, because the interface is still up. To fix the warning, first unregister the network interface, which calls net_device_ops::ndo_stop, which disables the NAPI, and then call can_rx_offload_del(). Fixes: 1be37d3b0414 ("can: m_can: fix periph RX path: use rx-offload to ensure skbs are sent from softirq context") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20250502-can-rx-offload-del-v1-3-59a9b131589d@pengutronix.de Reviewed-by: Markus Schneider-Pargmann Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/m_can/m_can.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/m_can/m_can.c b/drivers/net/can/m_can/m_can.c index f28bdb5badd0..56b0f49c8116 100644 --- a/drivers/net/can/m_can/m_can.c +++ b/drivers/net/can/m_can/m_can.c @@ -2047,9 +2047,9 @@ EXPORT_SYMBOL_GPL(m_can_class_register); void m_can_class_unregister(struct m_can_classdev *cdev) { + unregister_candev(cdev->net); if (cdev->is_peripheral) can_rx_offload_del(&cdev->offload); - unregister_candev(cdev->net); } EXPORT_SYMBOL_GPL(m_can_class_unregister); From 7fbb439ee2295fc62462c6f747c461b9203f14c6 Mon Sep 17 00:00:00 2001 From: Marc Kleine-Budde Date: Fri, 2 May 2025 16:13:44 +0200 Subject: [PATCH 004/529] can: mcp251xfd: mcp251xfd_remove(): fix order of unregistration calls commit 84f5eb833f53ae192baed4cfb8d9eaab43481fc9 upstream. If a driver is removed, the driver framework invokes the driver's remove callback. A CAN driver's remove function calls unregister_candev(), which calls net_device_ops::ndo_stop further down in the call stack for interfaces which are in the "up" state. With the mcp251xfd driver the removal of the module causes the following warning: | WARNING: CPU: 0 PID: 352 at net/core/dev.c:7342 __netif_napi_del_locked+0xc8/0xd8 as can_rx_offload_del() deletes the NAPI, while it is still active, because the interface is still up. To fix the warning, first unregister the network interface, which calls net_device_ops::ndo_stop, which disables the NAPI, and then call can_rx_offload_del(). Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN") Cc: stable@vger.kernel.org Link: https://patch.msgid.link/20250502-can-rx-offload-del-v1-1-59a9b131589d@pengutronix.de Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index 6fecfe4cd080..a48996265172 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -2179,8 +2179,8 @@ static void mcp251xfd_remove(struct spi_device *spi) struct mcp251xfd_priv *priv = spi_get_drvdata(spi); struct net_device *ndev = priv->ndev; - can_rx_offload_del(&priv->offload); mcp251xfd_unregister(priv); + can_rx_offload_del(&priv->offload); spi->max_speed_hz = priv->spi_max_speed_hz_orig; free_candev(ndev); } From 7f61da79df86fd140c7768e668ad846bfa7ec8e1 Mon Sep 17 00:00:00 2001 From: Norbert Szetei Date: Fri, 2 May 2025 08:21:58 +0900 Subject: [PATCH 005/529] ksmbd: prevent out-of-bounds stream writes by validating *pos commit 0ca6df4f40cf4c32487944aaf48319cb6c25accc upstream. ksmbd_vfs_stream_write() did not validate whether the write offset (*pos) was within the bounds of the existing stream data length (v_len). If *pos was greater than or equal to v_len, this could lead to an out-of-bounds memory write. This patch adds a check to ensure *pos is less than v_len before proceeding. If the condition fails, -EINVAL is returned. Cc: stable@vger.kernel.org Signed-off-by: Norbert Szetei Acked-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/server/vfs.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 25e8004a5905..cf1b241a1578 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -440,6 +440,13 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, goto out; } + if (v_len <= *pos) { + pr_err("stream write position %lld is out of bounds (stream length: %zd)\n", + *pos, v_len); + err = -EINVAL; + goto out; + } + if (v_len < size) { wbuf = kvzalloc(size, GFP_KERNEL); if (!wbuf) { From bca8df998cce1fead8cbc69144862eadc2e34c87 Mon Sep 17 00:00:00 2001 From: Eelco Chaudron Date: Tue, 6 May 2025 16:28:54 +0200 Subject: [PATCH 006/529] openvswitch: Fix unsafe attribute parsing in output_userspace() commit 6beb6835c1fbb3f676aebb51a5fee6b77fed9308 upstream. This patch replaces the manual Netlink attribute iteration in output_userspace() with nla_for_each_nested(), which ensures that only well-formed attributes are processed. Fixes: ccb1352e76cf ("net: Add Open vSwitch kernel components.") Signed-off-by: Eelco Chaudron Acked-by: Ilya Maximets Acked-by: Aaron Conole Link: https://patch.msgid.link/0bd65949df61591d9171c0dc13e42cea8941da10.1746541734.git.echaudro@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- net/openvswitch/actions.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index c517b24b3093..a87c25e06baf 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -954,8 +954,7 @@ static int output_userspace(struct datapath *dp, struct sk_buff *skb, upcall.cmd = OVS_PACKET_CMD_ACTION; upcall.mru = OVS_CB(skb)->mru; - for (a = nla_data(attr), rem = nla_len(attr); rem > 0; - a = nla_next(a, &rem)) { + nla_for_each_nested(a, attr, rem) { switch (nla_type(a)) { case OVS_USERSPACE_ATTR_USERDATA: upcall.userdata = a; From facf22c1a394c1e023dab5daf9a494f722771e1c Mon Sep 17 00:00:00 2001 From: Wang Zhaolong Date: Wed, 30 Apr 2025 11:16:23 +0800 Subject: [PATCH 007/529] ksmbd: fix memory leak in parse_lease_state() [ Upstream commit eb4447bcce915b43b691123118893fca4f372a8f ] The previous patch that added bounds check for create lease context introduced a memory leak. When the bounds check fails, the function returns NULL without freeing the previously allocated lease_ctx_info structure. This patch fixes the issue by adding kfree(lreq) before returning NULL in both boundary check cases. Fixes: bab703ed8472 ("ksmbd: add bounds check for create lease context") Signed-off-by: Wang Zhaolong Acked-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/server/oplock.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 2fcfabc35b06..258ed9978b90 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -1536,7 +1536,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir) if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < sizeof(struct create_lease_v2) - 4) - return NULL; + goto err_out; memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); if (is_dir) { @@ -1557,7 +1557,7 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir) if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < sizeof(struct create_lease)) - return NULL; + goto err_out; memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; @@ -1566,6 +1566,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req, bool is_dir) lreq->version = 1; } return lreq; +err_out: + kfree(lreq); + return NULL; } /** From c928dd4f6bf0c25c72b11824a1e9ac9bd37296a0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Mon, 28 Apr 2025 16:29:54 -0700 Subject: [PATCH 008/529] sch_htb: make htb_deactivate() idempotent [ Upstream commit 3769478610135e82b262640252d90f6efb05be71 ] Alan reported a NULL pointer dereference in htb_next_rb_node() after we made htb_qlen_notify() idempotent. It turns out in the following case it introduced some regression: htb_dequeue_tree(): |-> fq_codel_dequeue() |-> qdisc_tree_reduce_backlog() |-> htb_qlen_notify() |-> htb_deactivate() |-> htb_next_rb_node() |-> htb_deactivate() For htb_next_rb_node(), after calling the 1st htb_deactivate(), the clprio[prio]->ptr could be already set to NULL, which means htb_next_rb_node() is vulnerable here. For htb_deactivate(), although we checked qlen before calling it, in case of qlen==0 after qdisc_tree_reduce_backlog(), we may call it again which triggers the warning inside. To fix the issues here, we need to: 1) Make htb_deactivate() idempotent, that is, simply return if we already call it before. 2) Make htb_next_rb_node() safe against ptr==NULL. Many thanks to Alan for testing and for the reproducer. Fixes: 5ba8b837b522 ("sch_htb: make htb_qlen_notify() idempotent") Reported-by: Alan J. Wylie Signed-off-by: Cong Wang Link: https://patch.msgid.link/20250428232955.1740419-2-xiyou.wangcong@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sched/sch_htb.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index fb0fb8825574..29f394fe3998 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -345,7 +345,8 @@ static void htb_add_to_wait_tree(struct htb_sched *q, */ static inline void htb_next_rb_node(struct rb_node **n) { - *n = rb_next(*n); + if (*n) + *n = rb_next(*n); } /** @@ -606,8 +607,8 @@ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) */ static inline void htb_deactivate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(!cl->prio_activity); - + if (!cl->prio_activity) + return; htb_deactivate_prios(q, cl); cl->prio_activity = 0; } @@ -1482,8 +1483,6 @@ static void htb_qlen_notify(struct Qdisc *sch, unsigned long arg) { struct htb_class *cl = (struct htb_class *)arg; - if (!cl->prio_activity) - return; htb_deactivate(qdisc_priv(sch), cl); } @@ -1735,8 +1734,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg, if (cl->parent) cl->parent->children--; - if (cl->prio_activity) - htb_deactivate(q, cl); + htb_deactivate(q, cl); if (cl->cmode != HTB_CAN_SEND) htb_safe_rb_erase(&cl->pq_node, @@ -1948,8 +1946,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* turn parent into inner node */ qdisc_purge_queue(parent->leaf.q); parent_qdisc = parent->leaf.q; - if (parent->prio_activity) - htb_deactivate(q, parent); + htb_deactivate(q, parent); /* remove from evt list because of level change */ if (parent->cmode != HTB_CAN_SEND) { From a51dc9669ff8f1e10c798ff1e6ea6f1eca5dde28 Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Sat, 3 May 2025 00:57:52 +0200 Subject: [PATCH 009/529] gre: Fix again IPv6 link-local address generation. [ Upstream commit 3e6a0243ff002ddbd7ee18a8974ae61d2e6ed00d ] Use addrconf_addr_gen() to generate IPv6 link-local addresses on GRE devices in most cases and fall back to using add_v4_addrs() only in case the GRE configuration is incompatible with addrconf_addr_gen(). GRE used to use addrconf_addr_gen() until commit e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address") restricted this use to gretap and ip6gretap devices, and created add_v4_addrs() (borrowed from SIT) for non-Ethernet GRE ones. The original problem came when commit 9af28511be10 ("addrconf: refuse isatap eui64 for INADDR_ANY") made __ipv6_isatap_ifid() fail when its addr parameter was 0. The commit says that this would create an invalid address, however, I couldn't find any RFC saying that the generated interface identifier would be wrong. Anyway, since gre over IPv4 devices pass their local tunnel address to __ipv6_isatap_ifid(), that commit broke their IPv6 link-local address generation when the local address was unspecified. Then commit e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address") tried to fix that case by defining add_v4_addrs() and calling it to generate the IPv6 link-local address instead of using addrconf_addr_gen() (apart for gretap and ip6gretap devices, which would still use the regular addrconf_addr_gen(), since they have a MAC address). That broke several use cases because add_v4_addrs() isn't properly integrated into the rest of IPv6 Neighbor Discovery code. Several of these shortcomings have been fixed over time, but add_v4_addrs() remains broken on several aspects. In particular, it doesn't send any Router Sollicitations, so the SLAAC process doesn't start until the interface receives a Router Advertisement. Also, add_v4_addrs() mostly ignores the address generation mode of the interface (/proc/sys/net/ipv6/conf/*/addr_gen_mode), thus breaking the IN6_ADDR_GEN_MODE_RANDOM and IN6_ADDR_GEN_MODE_STABLE_PRIVACY cases. Fix the situation by using add_v4_addrs() only in the specific scenario where the normal method would fail. That is, for interfaces that have all of the following characteristics: * run over IPv4, * transport IP packets directly, not Ethernet (that is, not gretap interfaces), * tunnel endpoint is INADDR_ANY (that is, 0), * device address generation mode is EUI64. In all other cases, revert back to the regular addrconf_addr_gen(). Also, remove the special case for ip6gre interfaces in add_v4_addrs(), since ip6gre devices now always use addrconf_addr_gen() instead. Note: This patch was originally applied as commit 183185a18ff9 ("gre: Fix IPv6 link-local address generation."). However, it was then reverted by commit fc486c2d060f ("Revert "gre: Fix IPv6 link-local address generation."") because it uncovered another bug that ended up breaking net/forwarding/ip6gre_custom_multipath_hash.sh. That other bug has now been fixed by commit 4d0ab3a6885e ("ipv6: Start path selection from the first nexthop"). Therefore we can now revive this GRE patch (no changes since original commit 183185a18ff9 ("gre: Fix IPv6 link-local address generation."). Fixes: e5dd729460ca ("ip/ip6_gre: use the same logic as SIT interfaces when computing v6LL address") Signed-off-by: Guillaume Nault Reviewed-by: Ido Schimmel Link: https://patch.msgid.link/a88cc5c4811af36007645d610c95102dccb360a6.1746225214.git.gnault@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv6/addrconf.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 0811cc8956a3..40ee2d6ef229 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3162,16 +3162,13 @@ static void add_v4_addrs(struct inet6_dev *idev) struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen, offset = 0; + int scope, plen; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ - if (idev->dev->addr_len == sizeof(struct in6_addr)) - offset = sizeof(struct in6_addr) - 4; - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); if (!(idev->dev->flags & IFF_POINTOPOINT) && idev->dev->type == ARPHRD_SIT) { scope = IPV6_ADDR_COMPATv4; @@ -3481,7 +3478,13 @@ static void addrconf_gre_config(struct net_device *dev) return; } - if (dev->type == ARPHRD_ETHER) { + /* Generate the IPv6 link-local address using addrconf_addr_gen(), + * unless we have an IPv4 GRE device not bound to an IP address and + * which is in EUI64 mode (as __ipv6_isatap_ifid() would fail in this + * case). Such devices fall back to add_v4_addrs() instead. + */ + if (!(dev->type == ARPHRD_IPGRE && *(__be32 *)dev->dev_addr == 0 && + idev->cnf.addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)) { addrconf_addr_gen(idev, true); return; } From e50ee08fafadce9c333975ba2431a4acb2ce69f3 Mon Sep 17 00:00:00 2001 From: Kelsey Maes Date: Wed, 30 Apr 2025 09:15:01 -0700 Subject: [PATCH 010/529] can: mcp251xfd: fix TDC setting for low data bit rates [ Upstream commit 5e1663810e11c64956aa7e280cf74b2f3284d816 ] The TDC is currently hardcoded enabled. This means that even for lower CAN-FD data bitrates (with a DBRP (data bitrate prescaler) > 2) a TDC is configured. This leads to a bus-off condition. ISO 11898-1 section 11.3.3 says "Transmitter delay compensation" (TDC) is only applicable if DBRP is 1 or 2. To fix the problem, switch the driver to use the TDC calculation provided by the CAN driver framework (which respects ISO 11898-1 section 11.3.3). This has the positive side effect that userspace can control TDC as needed. Demonstration of the feature in action: | $ ip link set can0 up type can bitrate 125000 dbitrate 500000 fd on | $ ip -details link show can0 | 3: can0: mtu 72 qdisc pfifo_fast state UP mode DEFAULT group default qlen 10 | link/can promiscuity 0 allmulti 0 minmtu 0 maxmtu 0 | can state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0 | bitrate 125000 sample-point 0.875 | tq 50 prop-seg 69 phase-seg1 70 phase-seg2 20 sjw 10 brp 2 | mcp251xfd: tseg1 2..256 tseg2 1..128 sjw 1..128 brp 1..256 brp_inc 1 | dbitrate 500000 dsample-point 0.875 | dtq 125 dprop-seg 6 dphase-seg1 7 dphase-seg2 2 dsjw 1 dbrp 5 | mcp251xfd: dtseg1 1..32 dtseg2 1..16 dsjw 1..16 dbrp 1..256 dbrp_inc 1 | tdcv 0..63 tdco 0..63 | clock 40000000 numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 tso_max_size 65536 tso_max_segs 65535 gro_max_size 65536 parentbus spi parentdev spi0.0 | $ ip link set can0 up type can bitrate 1000000 dbitrate 4000000 fd on | $ ip -details link show can0 | 3: can0: mtu 72 qdisc pfifo_fast state UP mode DEFAULT group default qlen 10 | link/can promiscuity 0 allmulti 0 minmtu 0 maxmtu 0 | can state ERROR-ACTIVE (berr-counter tx 0 rx 0) restart-ms 0 | bitrate 1000000 sample-point 0.750 | tq 25 prop-seg 14 phase-seg1 15 phase-seg2 10 sjw 5 brp 1 | mcp251xfd: tseg1 2..256 tseg2 1..128 sjw 1..128 brp 1..256 brp_inc 1 | dbitrate 4000000 dsample-point 0.700 | dtq 25 dprop-seg 3 dphase-seg1 3 dphase-seg2 3 dsjw 1 dbrp 1 | tdco 7 | mcp251xfd: dtseg1 1..32 dtseg2 1..16 dsjw 1..16 dbrp 1..256 dbrp_inc 1 | tdcv 0..63 tdco 0..63 | clock 40000000 numtxqueues 1 numrxqueues 1 gso_max_size 65536 gso_max_segs 65535 tso_max_size 65536 tso_max_segs 65535 gro_max_size 65536 parentbus spi parentdev spi0.0 There has been some confusion about the MCP2518FD using a relative or absolute TDCO due to the datasheet specifying a range of [-64,63]. I have a custom board with a 40 MHz clock and an estimated loop delay of 100 to 216 ns. During testing at a data bit rate of 4 Mbit/s I found that using can_get_relative_tdco() resulted in bus-off errors. The final TDCO value was 1 which corresponds to a 10% SSP in an absolute configuration. This behavior is expected if the TDCO value is really absolute and not relative. Using priv->can.tdc.tdco instead results in a final TDCO of 8, setting the SSP at exactly 80%. This configuration works. The automatic, manual, and off TDC modes were tested at speeds up to, and including, 8 Mbit/s on real hardware and behave as expected. Fixes: 55e5b97f003e ("can: mcp25xxfd: add driver for Microchip MCP25xxFD SPI CAN") Reported-by: Kelsey Maes Closes: https://lore.kernel.org/all/C2121586-C87F-4B23-A933-845362C29CA1@vpprocess.com Reviewed-by: Vincent Mailhol Signed-off-by: Kelsey Maes Link: https://patch.msgid.link/20250430161501.79370-1-kelsey@vpprocess.com [mkl: add comment] Signed-off-by: Marc Kleine-Budde Signed-off-by: Sasha Levin --- .../net/can/spi/mcp251xfd/mcp251xfd-core.c | 40 +++++++++++++++---- 1 file changed, 32 insertions(+), 8 deletions(-) diff --git a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c index a48996265172..21ae3a89924e 100644 --- a/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c +++ b/drivers/net/can/spi/mcp251xfd/mcp251xfd-core.c @@ -75,6 +75,24 @@ static const struct can_bittiming_const mcp251xfd_data_bittiming_const = { .brp_inc = 1, }; +/* The datasheet of the mcp2518fd (DS20006027B) specifies a range of + * [-64,63] for TDCO, indicating a relative TDCO. + * + * Manual tests have shown, that using a relative TDCO configuration + * results in bus off, while an absolute configuration works. + * + * For TDCO use the max value (63) from the data sheet, but 0 as the + * minimum. + */ +static const struct can_tdc_const mcp251xfd_tdc_const = { + .tdcv_min = 0, + .tdcv_max = 63, + .tdco_min = 0, + .tdco_max = 63, + .tdcf_min = 0, + .tdcf_max = 0, +}; + static const char *__mcp251xfd_get_model_str(enum mcp251xfd_model model) { switch (model) { @@ -510,8 +528,7 @@ static int mcp251xfd_set_bittiming(const struct mcp251xfd_priv *priv) { const struct can_bittiming *bt = &priv->can.bittiming; const struct can_bittiming *dbt = &priv->can.data_bittiming; - u32 val = 0; - s8 tdco; + u32 tdcmod, val = 0; int err; /* CAN Control Register @@ -575,11 +592,16 @@ static int mcp251xfd_set_bittiming(const struct mcp251xfd_priv *priv) return err; /* Transmitter Delay Compensation */ - tdco = clamp_t(int, dbt->brp * (dbt->prop_seg + dbt->phase_seg1), - -64, 63); - val = FIELD_PREP(MCP251XFD_REG_TDC_TDCMOD_MASK, - MCP251XFD_REG_TDC_TDCMOD_AUTO) | - FIELD_PREP(MCP251XFD_REG_TDC_TDCO_MASK, tdco); + if (priv->can.ctrlmode & CAN_CTRLMODE_TDC_AUTO) + tdcmod = MCP251XFD_REG_TDC_TDCMOD_AUTO; + else if (priv->can.ctrlmode & CAN_CTRLMODE_TDC_MANUAL) + tdcmod = MCP251XFD_REG_TDC_TDCMOD_MANUAL; + else + tdcmod = MCP251XFD_REG_TDC_TDCMOD_DISABLED; + + val = FIELD_PREP(MCP251XFD_REG_TDC_TDCMOD_MASK, tdcmod) | + FIELD_PREP(MCP251XFD_REG_TDC_TDCV_MASK, priv->can.tdc.tdcv) | + FIELD_PREP(MCP251XFD_REG_TDC_TDCO_MASK, priv->can.tdc.tdco); return regmap_write(priv->map_reg, MCP251XFD_REG_TDC, val); } @@ -2083,10 +2105,12 @@ static int mcp251xfd_probe(struct spi_device *spi) priv->can.do_get_berr_counter = mcp251xfd_get_berr_counter; priv->can.bittiming_const = &mcp251xfd_bittiming_const; priv->can.data_bittiming_const = &mcp251xfd_data_bittiming_const; + priv->can.tdc_const = &mcp251xfd_tdc_const; priv->can.ctrlmode_supported = CAN_CTRLMODE_LOOPBACK | CAN_CTRLMODE_LISTENONLY | CAN_CTRLMODE_BERR_REPORTING | CAN_CTRLMODE_FD | CAN_CTRLMODE_FD_NON_ISO | - CAN_CTRLMODE_CC_LEN8_DLC; + CAN_CTRLMODE_CC_LEN8_DLC | CAN_CTRLMODE_TDC_AUTO | + CAN_CTRLMODE_TDC_MANUAL; set_bit(MCP251XFD_FLAGS_DOWN, priv->flags); priv->ndev = ndev; priv->spi = spi; From 3df065841cd7bdb63fcc8a7e7d1bc960524a46ee Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Wed, 1 Feb 2023 16:08:07 +0100 Subject: [PATCH 011/529] rcu/kvfree: Add kvfree_rcu_mightsleep() and kfree_rcu_mightsleep() [ Upstream commit 608723c41cd951fb32ade2f8371e61c270816175 ] The kvfree_rcu() and kfree_rcu() APIs are hazardous in that if you forget the second argument, it works, but might sleep. This sleeping can be a correctness bug from atomic contexts, and even in non-atomic contexts it might introduce unacceptable latencies. This commit therefore adds kvfree_rcu_mightsleep() and kfree_rcu_mightsleep(), which will replace the single-argument kvfree_rcu() and kfree_rcu(), respectively. This commit enables a series of commits that switch from single-argument kvfree_rcu() and kfree_rcu() to their _mightsleep() counterparts. Once all of these commits land, the single-argument versions will be removed. Signed-off-by: Uladzislau Rezki (Sony) Signed-off-by: Paul E. McKenney Stable-dep-of: 511e64e13d8c ("can: gw: fix RCU/BH usage in cgw_create_job()") Signed-off-by: Sasha Levin --- include/linux/rcupdate.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index d001a69fcb7d..aef8c7304d45 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1031,6 +1031,9 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kvfree_rcu(...) KVFREE_GET_MACRO(__VA_ARGS__, \ kvfree_rcu_arg_2, kvfree_rcu_arg_1)(__VA_ARGS__) +#define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) +#define kfree_rcu_mightsleep(ptr) kvfree_rcu_mightsleep(ptr) + #define KVFREE_GET_MACRO(_1, _2, NAME, ...) NAME #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ From 21988c712a1f858b5f7e7c6bad36a84f5dda89fd Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 29 Apr 2025 09:05:55 +0200 Subject: [PATCH 012/529] can: gw: fix RCU/BH usage in cgw_create_job() [ Upstream commit 511e64e13d8cc72853275832e3f372607466c18c ] As reported by Sebastian Andrzej Siewior the use of local_bh_disable() is only feasible in uni processor systems to update the modification rules. The usual use-case to update the modification rules is to update the data of the modifications but not the modification types (AND/OR/XOR/SET) or the checksum functions itself. To omit additional memory allocations to maintain fast modification switching times, the modification description space is doubled at gw-job creation time so that only the reference to the active modification description is changed under rcu protection. Rename cgw_job::mod to cf_mod and make it a RCU pointer. Allocate in cgw_create_job() and free it together with cgw_job in cgw_job_free_rcu(). Update all users to dereference cgw_job::cf_mod with a RCU accessor and if possible once. [bigeasy: Replace mod1/mod2 from the Oliver's original patch with dynamic allocation, use RCU annotation and accessor] Reported-by: Sebastian Andrzej Siewior Closes: https://lore.kernel.org/linux-can/20231031112349.y0aLoBrz@linutronix.de/ Fixes: dd895d7f21b2 ("can: cangw: introduce optional uid to reference created routing jobs") Tested-by: Oliver Hartkopp Signed-off-by: Oliver Hartkopp Signed-off-by: Sebastian Andrzej Siewior Link: https://patch.msgid.link/20250429070555.cs-7b_eZ@linutronix.de Signed-off-by: Marc Kleine-Budde Signed-off-by: Sasha Levin --- net/can/gw.c | 151 +++++++++++++++++++++++++++++++-------------------- 1 file changed, 91 insertions(+), 60 deletions(-) diff --git a/net/can/gw.c b/net/can/gw.c index 23a3d89cad81..5933423663cd 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -130,7 +130,7 @@ struct cgw_job { u32 handled_frames; u32 dropped_frames; u32 deleted_frames; - struct cf_mod mod; + struct cf_mod __rcu *cf_mod; union { /* CAN frame data source */ struct net_device *dev; @@ -459,6 +459,7 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) struct cgw_job *gwj = (struct cgw_job *)data; struct canfd_frame *cf; struct sk_buff *nskb; + struct cf_mod *mod; int modidx = 0; /* process strictly Classic CAN or CAN FD frames */ @@ -506,7 +507,8 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) * When there is at least one modification function activated, * we need to copy the skb as we want to modify skb->data. */ - if (gwj->mod.modfunc[0]) + mod = rcu_dereference(gwj->cf_mod); + if (mod->modfunc[0]) nskb = skb_copy(skb, GFP_ATOMIC); else nskb = skb_clone(skb, GFP_ATOMIC); @@ -529,8 +531,8 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) cf = (struct canfd_frame *)nskb->data; /* perform preprocessed modification functions if there are any */ - while (modidx < MAX_MODFUNCTIONS && gwj->mod.modfunc[modidx]) - (*gwj->mod.modfunc[modidx++])(cf, &gwj->mod); + while (modidx < MAX_MODFUNCTIONS && mod->modfunc[modidx]) + (*mod->modfunc[modidx++])(cf, mod); /* Has the CAN frame been modified? */ if (modidx) { @@ -546,11 +548,11 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) } /* check for checksum updates */ - if (gwj->mod.csumfunc.crc8) - (*gwj->mod.csumfunc.crc8)(cf, &gwj->mod.csum.crc8); + if (mod->csumfunc.crc8) + (*mod->csumfunc.crc8)(cf, &mod->csum.crc8); - if (gwj->mod.csumfunc.xor) - (*gwj->mod.csumfunc.xor)(cf, &gwj->mod.csum.xor); + if (mod->csumfunc.xor) + (*mod->csumfunc.xor)(cf, &mod->csum.xor); } /* clear the skb timestamp if not configured the other way */ @@ -581,9 +583,20 @@ static void cgw_job_free_rcu(struct rcu_head *rcu_head) { struct cgw_job *gwj = container_of(rcu_head, struct cgw_job, rcu); + /* cgw_job::cf_mod is always accessed from the same cgw_job object within + * the same RCU read section. Once cgw_job is scheduled for removal, + * cf_mod can also be removed without mandating an additional grace period. + */ + kfree(rcu_access_pointer(gwj->cf_mod)); kmem_cache_free(cgw_cache, gwj); } +/* Return cgw_job::cf_mod with RTNL protected section */ +static struct cf_mod *cgw_job_cf_mod(struct cgw_job *gwj) +{ + return rcu_dereference_protected(gwj->cf_mod, rtnl_is_locked()); +} + static int cgw_notifier(struct notifier_block *nb, unsigned long msg, void *ptr) { @@ -616,6 +629,7 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, { struct rtcanmsg *rtcan; struct nlmsghdr *nlh; + struct cf_mod *mod; nlh = nlmsg_put(skb, pid, seq, type, sizeof(*rtcan), flags); if (!nlh) @@ -650,82 +664,83 @@ static int cgw_put_job(struct sk_buff *skb, struct cgw_job *gwj, int type, goto cancel; } + mod = cgw_job_cf_mod(gwj); if (gwj->flags & CGW_FLAGS_CAN_FD) { struct cgw_fdframe_mod mb; - if (gwj->mod.modtype.and) { - memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.and; + if (mod->modtype.and) { + memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf)); + mb.modtype = mod->modtype.and; if (nla_put(skb, CGW_FDMOD_AND, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.or) { - memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.or; + if (mod->modtype.or) { + memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf)); + mb.modtype = mod->modtype.or; if (nla_put(skb, CGW_FDMOD_OR, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.xor) { - memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.xor; + if (mod->modtype.xor) { + memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf)); + mb.modtype = mod->modtype.xor; if (nla_put(skb, CGW_FDMOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.set) { - memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.set; + if (mod->modtype.set) { + memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf)); + mb.modtype = mod->modtype.set; if (nla_put(skb, CGW_FDMOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } else { struct cgw_frame_mod mb; - if (gwj->mod.modtype.and) { - memcpy(&mb.cf, &gwj->mod.modframe.and, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.and; + if (mod->modtype.and) { + memcpy(&mb.cf, &mod->modframe.and, sizeof(mb.cf)); + mb.modtype = mod->modtype.and; if (nla_put(skb, CGW_MOD_AND, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.or) { - memcpy(&mb.cf, &gwj->mod.modframe.or, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.or; + if (mod->modtype.or) { + memcpy(&mb.cf, &mod->modframe.or, sizeof(mb.cf)); + mb.modtype = mod->modtype.or; if (nla_put(skb, CGW_MOD_OR, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.xor) { - memcpy(&mb.cf, &gwj->mod.modframe.xor, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.xor; + if (mod->modtype.xor) { + memcpy(&mb.cf, &mod->modframe.xor, sizeof(mb.cf)); + mb.modtype = mod->modtype.xor; if (nla_put(skb, CGW_MOD_XOR, sizeof(mb), &mb) < 0) goto cancel; } - if (gwj->mod.modtype.set) { - memcpy(&mb.cf, &gwj->mod.modframe.set, sizeof(mb.cf)); - mb.modtype = gwj->mod.modtype.set; + if (mod->modtype.set) { + memcpy(&mb.cf, &mod->modframe.set, sizeof(mb.cf)); + mb.modtype = mod->modtype.set; if (nla_put(skb, CGW_MOD_SET, sizeof(mb), &mb) < 0) goto cancel; } } - if (gwj->mod.uid) { - if (nla_put_u32(skb, CGW_MOD_UID, gwj->mod.uid) < 0) + if (mod->uid) { + if (nla_put_u32(skb, CGW_MOD_UID, mod->uid) < 0) goto cancel; } - if (gwj->mod.csumfunc.crc8) { + if (mod->csumfunc.crc8) { if (nla_put(skb, CGW_CS_CRC8, CGW_CS_CRC8_LEN, - &gwj->mod.csum.crc8) < 0) + &mod->csum.crc8) < 0) goto cancel; } - if (gwj->mod.csumfunc.xor) { + if (mod->csumfunc.xor) { if (nla_put(skb, CGW_CS_XOR, CGW_CS_XOR_LEN, - &gwj->mod.csum.xor) < 0) + &mod->csum.xor) < 0) goto cancel; } @@ -1059,7 +1074,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, struct net *net = sock_net(skb->sk); struct rtcanmsg *r; struct cgw_job *gwj; - struct cf_mod mod; + struct cf_mod *mod; struct can_can_gw ccgw; u8 limhops = 0; int err = 0; @@ -1078,37 +1093,48 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, if (r->gwtype != CGW_TYPE_CAN_CAN) return -EINVAL; - err = cgw_parse_attr(nlh, &mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); - if (err < 0) - return err; + mod = kmalloc(sizeof(*mod), GFP_KERNEL); + if (!mod) + return -ENOMEM; - if (mod.uid) { + err = cgw_parse_attr(nlh, mod, CGW_TYPE_CAN_CAN, &ccgw, &limhops); + if (err < 0) + goto out_free_cf; + + if (mod->uid) { ASSERT_RTNL(); /* check for updating an existing job with identical uid */ hlist_for_each_entry(gwj, &net->can.cgw_list, list) { - if (gwj->mod.uid != mod.uid) + struct cf_mod *old_cf; + + old_cf = cgw_job_cf_mod(gwj); + if (old_cf->uid != mod->uid) continue; /* interfaces & filters must be identical */ - if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) - return -EINVAL; + if (memcmp(&gwj->ccgw, &ccgw, sizeof(ccgw))) { + err = -EINVAL; + goto out_free_cf; + } - /* update modifications with disabled softirq & quit */ - local_bh_disable(); - memcpy(&gwj->mod, &mod, sizeof(mod)); - local_bh_enable(); + rcu_assign_pointer(gwj->cf_mod, mod); + kfree_rcu_mightsleep(old_cf); return 0; } } /* ifindex == 0 is not allowed for job creation */ - if (!ccgw.src_idx || !ccgw.dst_idx) - return -ENODEV; + if (!ccgw.src_idx || !ccgw.dst_idx) { + err = -ENODEV; + goto out_free_cf; + } gwj = kmem_cache_alloc(cgw_cache, GFP_KERNEL); - if (!gwj) - return -ENOMEM; + if (!gwj) { + err = -ENOMEM; + goto out_free_cf; + } gwj->handled_frames = 0; gwj->dropped_frames = 0; @@ -1118,7 +1144,7 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, gwj->limit_hops = limhops; /* insert already parsed information */ - memcpy(&gwj->mod, &mod, sizeof(mod)); + RCU_INIT_POINTER(gwj->cf_mod, mod); memcpy(&gwj->ccgw, &ccgw, sizeof(ccgw)); err = -ENODEV; @@ -1145,9 +1171,11 @@ static int cgw_create_job(struct sk_buff *skb, struct nlmsghdr *nlh, if (!err) hlist_add_head_rcu(&gwj->list, &net->can.cgw_list); out: - if (err) + if (err) { kmem_cache_free(cgw_cache, gwj); - +out_free_cf: + kfree(mod); + } return err; } @@ -1207,19 +1235,22 @@ static int cgw_remove_job(struct sk_buff *skb, struct nlmsghdr *nlh, /* remove only the first matching entry */ hlist_for_each_entry_safe(gwj, nx, &net->can.cgw_list, list) { + struct cf_mod *cf_mod; + if (gwj->flags != r->flags) continue; if (gwj->limit_hops != limhops) continue; + cf_mod = cgw_job_cf_mod(gwj); /* we have a match when uid is enabled and identical */ - if (gwj->mod.uid || mod.uid) { - if (gwj->mod.uid != mod.uid) + if (cf_mod->uid || mod.uid) { + if (cf_mod->uid != mod.uid) continue; } else { /* no uid => check for identical modifications */ - if (memcmp(&gwj->mod, &mod, sizeof(mod))) + if (memcmp(cf_mod, &mod, sizeof(mod))) continue; } From 6e67f25c118f0784046e3427571609fdfd23121e Mon Sep 17 00:00:00 2001 From: Guillaume Nault Date: Thu, 1 Jun 2023 18:37:46 +0200 Subject: [PATCH 013/529] ipv4: Drop tos parameter from flowi4_update_output() [ Upstream commit 3f06760c00f56c5fe6c7f3361c2cf64becee1174 ] Callers of flowi4_update_output() never try to update ->flowi4_tos: * ip_route_connect() updates ->flowi4_tos with its own current value. * ip_route_newports() has two users: tcp_v4_connect() and dccp_v4_connect. Both initialise fl4 with ip_route_connect(), which in turn sets ->flowi4_tos with RT_TOS(inet_sk(sk)->tos) and ->flowi4_scope based on SOCK_LOCALROUTE. Then ip_route_newports() updates ->flowi4_tos with RT_CONN_FLAGS(sk), which is the same as RT_TOS(inet_sk(sk)->tos), unless SOCK_LOCALROUTE is set on the socket. In that case, the lowest order bit is set to 1, to eventually inform ip_route_output_key_hash() to restrict the scope to RT_SCOPE_LINK. This is equivalent to properly setting ->flowi4_scope as ip_route_connect() did. * ip_vs_xmit.c initialises ->flowi4_tos with memset(0), then calls flowi4_update_output() with tos=0. * sctp_v4_get_dst() uses the same RT_CONN_FLAGS_TOS() when initialising ->flowi4_tos and when calling flowi4_update_output(). In the end, ->flowi4_tos never changes. So let's just drop the tos parameter. This will simplify the conversion of ->flowi4_tos from __u8 to dscp_t. Signed-off-by: Guillaume Nault Signed-off-by: David S. Miller Stable-dep-of: e34090d7214e ("ipvs: fix uninit-value for saddr in do_output_route4") Signed-off-by: Sasha Levin --- include/net/flow.h | 3 +-- include/net/route.h | 6 ++---- net/netfilter/ipvs/ip_vs_xmit.c | 4 ++-- net/sctp/protocol.c | 4 +--- 4 files changed, 6 insertions(+), 11 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 079cc493fe67..5a17fa6e016f 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -115,11 +115,10 @@ static inline void flowi4_init_output(struct flowi4 *fl4, int oif, } /* Reset some input parameters after previous lookup */ -static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __u8 tos, +static inline void flowi4_update_output(struct flowi4 *fl4, int oif, __be32 daddr, __be32 saddr) { fl4->flowi4_oif = oif; - fl4->flowi4_tos = tos; fl4->daddr = daddr; fl4->saddr = saddr; } diff --git a/include/net/route.h b/include/net/route.h index 4185e6da9ef8..cdca622c5c6f 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -325,8 +325,7 @@ static inline struct rtable *ip_route_connect(struct flowi4 *fl4, __be32 dst, if (IS_ERR(rt)) return rt; ip_rt_put(rt); - flowi4_update_output(fl4, oif, fl4->flowi4_tos, fl4->daddr, - fl4->saddr); + flowi4_update_output(fl4, oif, fl4->daddr, fl4->saddr); } security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(net, fl4, sk); @@ -341,8 +340,7 @@ static inline struct rtable *ip_route_newports(struct flowi4 *fl4, struct rtable fl4->fl4_dport = dport; fl4->fl4_sport = sport; ip_rt_put(rt); - flowi4_update_output(fl4, sk->sk_bound_dev_if, - RT_CONN_FLAGS(sk), fl4->daddr, + flowi4_update_output(fl4, sk->sk_bound_dev_if, fl4->daddr, fl4->saddr); security_sk_classify_flow(sk, flowi4_to_flowi_common(fl4)); return ip_route_output_flow(sock_net(sk), fl4, sk); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index d40a4ca2b27f..f9ae72122e34 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -139,7 +139,7 @@ retry: if (PTR_ERR(rt) == -EINVAL && *saddr && rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { *saddr = 0; - flowi4_update_output(&fl4, 0, 0, daddr, 0); + flowi4_update_output(&fl4, 0, daddr, 0); goto retry; } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); @@ -147,7 +147,7 @@ retry: } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); *saddr = fl4.saddr; - flowi4_update_output(&fl4, 0, 0, daddr, fl4.saddr); + flowi4_update_output(&fl4, 0, daddr, fl4.saddr); loop = true; goto retry; } diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index bcd3384ab07a..036dc574af4f 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -497,9 +497,7 @@ static void sctp_v4_get_dst(struct sctp_transport *t, union sctp_addr *saddr, continue; fl4->fl4_sport = laddr->a.v4.sin_port; - flowi4_update_output(fl4, - asoc->base.sk->sk_bound_dev_if, - RT_CONN_FLAGS_TOS(asoc->base.sk, tos), + flowi4_update_output(fl4, asoc->base.sk->sk_bound_dev_if, daddr->v4.sin_addr.s_addr, laddr->a.v4.sin_addr.s_addr); From 7d0032112a0380d0b8d7c9005f621928a9b9fc76 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sat, 3 May 2025 01:01:18 +0300 Subject: [PATCH 014/529] ipvs: fix uninit-value for saddr in do_output_route4 [ Upstream commit e34090d7214e0516eb8722aee295cb2507317c07 ] syzbot reports for uninit-value for the saddr argument [1]. commit 4754957f04f5 ("ipvs: do not use random local source address for tunnels") already implies that the input value of saddr should be ignored but the code is still reading it which can prevent to connect the route. Fix it by changing the argument to ret_saddr. [1] BUG: KMSAN: uninit-value in do_output_route4+0x42c/0x4d0 net/netfilter/ipvs/ip_vs_xmit.c:147 do_output_route4+0x42c/0x4d0 net/netfilter/ipvs/ip_vs_xmit.c:147 __ip_vs_get_out_rt+0x403/0x21d0 net/netfilter/ipvs/ip_vs_xmit.c:330 ip_vs_tunnel_xmit+0x205/0x2380 net/netfilter/ipvs/ip_vs_xmit.c:1136 ip_vs_in_hook+0x1aa5/0x35b0 net/netfilter/ipvs/ip_vs_core.c:2063 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xf7/0x400 net/netfilter/core.c:626 nf_hook include/linux/netfilter.h:269 [inline] __ip_local_out+0x758/0x7e0 net/ipv4/ip_output.c:118 ip_local_out net/ipv4/ip_output.c:127 [inline] ip_send_skb+0x6a/0x3c0 net/ipv4/ip_output.c:1501 udp_send_skb+0xfda/0x1b70 net/ipv4/udp.c:1195 udp_sendmsg+0x2fe3/0x33c0 net/ipv4/udp.c:1483 inet_sendmsg+0x1fc/0x280 net/ipv4/af_inet.c:851 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x267/0x380 net/socket.c:727 ____sys_sendmsg+0x91b/0xda0 net/socket.c:2566 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2620 __sys_sendmmsg+0x41d/0x880 net/socket.c:2702 __compat_sys_sendmmsg net/compat.c:360 [inline] __do_compat_sys_sendmmsg net/compat.c:367 [inline] __se_compat_sys_sendmmsg net/compat.c:364 [inline] __ia32_compat_sys_sendmmsg+0xc8/0x140 net/compat.c:364 ia32_sys_call+0x3ffa/0x41f0 arch/x86/include/generated/asm/syscalls_32.h:346 do_syscall_32_irqs_on arch/x86/entry/syscall_32.c:83 [inline] __do_fast_syscall_32+0xb0/0x110 arch/x86/entry/syscall_32.c:306 do_fast_syscall_32+0x38/0x80 arch/x86/entry/syscall_32.c:331 do_SYSENTER_32+0x1f/0x30 arch/x86/entry/syscall_32.c:369 entry_SYSENTER_compat_after_hwframe+0x84/0x8e Uninit was created at: slab_post_alloc_hook mm/slub.c:4167 [inline] slab_alloc_node mm/slub.c:4210 [inline] __kmalloc_cache_noprof+0x8fa/0xe00 mm/slub.c:4367 kmalloc_noprof include/linux/slab.h:905 [inline] ip_vs_dest_dst_alloc net/netfilter/ipvs/ip_vs_xmit.c:61 [inline] __ip_vs_get_out_rt+0x35d/0x21d0 net/netfilter/ipvs/ip_vs_xmit.c:323 ip_vs_tunnel_xmit+0x205/0x2380 net/netfilter/ipvs/ip_vs_xmit.c:1136 ip_vs_in_hook+0x1aa5/0x35b0 net/netfilter/ipvs/ip_vs_core.c:2063 nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline] nf_hook_slow+0xf7/0x400 net/netfilter/core.c:626 nf_hook include/linux/netfilter.h:269 [inline] __ip_local_out+0x758/0x7e0 net/ipv4/ip_output.c:118 ip_local_out net/ipv4/ip_output.c:127 [inline] ip_send_skb+0x6a/0x3c0 net/ipv4/ip_output.c:1501 udp_send_skb+0xfda/0x1b70 net/ipv4/udp.c:1195 udp_sendmsg+0x2fe3/0x33c0 net/ipv4/udp.c:1483 inet_sendmsg+0x1fc/0x280 net/ipv4/af_inet.c:851 sock_sendmsg_nosec net/socket.c:712 [inline] __sock_sendmsg+0x267/0x380 net/socket.c:727 ____sys_sendmsg+0x91b/0xda0 net/socket.c:2566 ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2620 __sys_sendmmsg+0x41d/0x880 net/socket.c:2702 __compat_sys_sendmmsg net/compat.c:360 [inline] __do_compat_sys_sendmmsg net/compat.c:367 [inline] __se_compat_sys_sendmmsg net/compat.c:364 [inline] __ia32_compat_sys_sendmmsg+0xc8/0x140 net/compat.c:364 ia32_sys_call+0x3ffa/0x41f0 arch/x86/include/generated/asm/syscalls_32.h:346 do_syscall_32_irqs_on arch/x86/entry/syscall_32.c:83 [inline] __do_fast_syscall_32+0xb0/0x110 arch/x86/entry/syscall_32.c:306 do_fast_syscall_32+0x38/0x80 arch/x86/entry/syscall_32.c:331 do_SYSENTER_32+0x1f/0x30 arch/x86/entry/syscall_32.c:369 entry_SYSENTER_compat_after_hwframe+0x84/0x8e CPU: 0 UID: 0 PID: 22408 Comm: syz.4.5165 Not tainted 6.15.0-rc3-syzkaller-00019-gbc3372351d0c #0 PREEMPT(undef) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 Reported-by: syzbot+04b9a82855c8aed20860@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/68138dfa.050a0220.14dd7d.0017.GAE@google.com/ Fixes: 4754957f04f5 ("ipvs: do not use random local source address for tunnels") Signed-off-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/ipvs/ip_vs_xmit.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index f9ae72122e34..e1437c72ca6e 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -119,13 +119,12 @@ __mtu_check_toobig_v6(const struct sk_buff *skb, u32 mtu) return false; } -/* Get route to daddr, update *saddr, optionally bind route to saddr */ +/* Get route to daddr, optionally bind route to saddr */ static struct rtable *do_output_route4(struct net *net, __be32 daddr, - int rt_mode, __be32 *saddr) + int rt_mode, __be32 *ret_saddr) { struct flowi4 fl4; struct rtable *rt; - bool loop = false; memset(&fl4, 0, sizeof(fl4)); fl4.daddr = daddr; @@ -135,23 +134,17 @@ static struct rtable *do_output_route4(struct net *net, __be32 daddr, retry: rt = ip_route_output_key(net, &fl4); if (IS_ERR(rt)) { - /* Invalid saddr ? */ - if (PTR_ERR(rt) == -EINVAL && *saddr && - rt_mode & IP_VS_RT_MODE_CONNECT && !loop) { - *saddr = 0; - flowi4_update_output(&fl4, 0, daddr, 0); - goto retry; - } IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &daddr); return NULL; - } else if (!*saddr && rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { + } + if (rt_mode & IP_VS_RT_MODE_CONNECT && fl4.saddr) { ip_rt_put(rt); - *saddr = fl4.saddr; flowi4_update_output(&fl4, 0, daddr, fl4.saddr); - loop = true; + rt_mode = 0; goto retry; } - *saddr = fl4.saddr; + if (ret_saddr) + *ret_saddr = fl4.saddr; return rt; } @@ -344,19 +337,15 @@ __ip_vs_get_out_rt(struct netns_ipvs *ipvs, int skb_af, struct sk_buff *skb, if (ret_saddr) *ret_saddr = dest_dst->dst_saddr.ip; } else { - __be32 saddr = htonl(INADDR_ANY); - noref = 0; /* For such unconfigured boxes avoid many route lookups * for performance reasons because we do not remember saddr */ rt_mode &= ~IP_VS_RT_MODE_CONNECT; - rt = do_output_route4(net, daddr, rt_mode, &saddr); + rt = do_output_route4(net, daddr, rt_mode, ret_saddr); if (!rt) goto err_unreach; - if (ret_saddr) - *ret_saddr = saddr; } local = (rt->rt_flags & RTCF_LOCAL) ? 1 : 0; From aa77294b0f73bb8265987591460cd25b8722c3df Mon Sep 17 00:00:00 2001 From: Jozsef Kadlecsik Date: Wed, 7 May 2025 17:01:59 +0200 Subject: [PATCH 015/529] netfilter: ipset: fix region locking in hash types [ Upstream commit 8478a729c0462273188263136880480729e9efca ] Region locking introduced in v5.6-rc4 contained three macros to handle the region locks: ahash_bucket_start(), ahash_bucket_end() which gave back the start and end hash bucket values belonging to a given region lock and ahash_region() which should give back the region lock belonging to a given hash bucket. The latter was incorrect which can lead to a race condition between the garbage collector and adding new elements when a hash type of set is defined with timeouts. Fixes: f66ee0410b1c ("netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports") Reported-by: Kota Toda Signed-off-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index ef04e556aadb..0bd6bf46f05f 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -63,7 +63,7 @@ struct hbucket { #define ahash_sizeof_regions(htable_bits) \ (ahash_numof_locks(htable_bits) * sizeof(struct ip_set_region)) #define ahash_region(n, htable_bits) \ - ((n) % ahash_numof_locks(htable_bits)) + ((n) / jhash_size(HTABLE_REGION_BITS)) #define ahash_bucket_start(h, htable_bits) \ ((htable_bits) < HTABLE_REGION_BITS ? 0 \ : (h) * jhash_size(HTABLE_REGION_BITS)) From de1067cc8cf0e8c11ae20cbe5c467aef19d04ded Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Mon, 5 May 2025 21:58:04 +0200 Subject: [PATCH 016/529] bpf: Scrub packet on bpf_redirect_peer [ Upstream commit c4327229948879814229b46aa26a750718888503 ] When bpf_redirect_peer is used to redirect packets to a device in another network namespace, the skb isn't scrubbed. That can lead skb information from one namespace to be "misused" in another namespace. As one example, this is causing Cilium to drop traffic when using bpf_redirect_peer to redirect packets that just went through IPsec decryption to a container namespace. The following pwru trace shows (1) the packet path from the host's XFRM layer to the container's XFRM layer where it's dropped and (2) the number of active skb extensions at each function. NETNS MARK IFACE TUPLE FUNC 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 xfrm_rcv_cb .active_extensions = (__u8)2, 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 xfrm4_rcv_cb .active_extensions = (__u8)2, 4026533547 d00 eth0 10.244.3.124:35473->10.244.2.158:53 gro_cells_receive .active_extensions = (__u8)2, [...] 4026533547 0 eth0 10.244.3.124:35473->10.244.2.158:53 skb_do_redirect .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 ip_rcv .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 ip_rcv_core .active_extensions = (__u8)2, [...] 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 udp_queue_rcv_one_skb .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 __xfrm_policy_check .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 __xfrm_decode_session .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 security_xfrm_decode_session .active_extensions = (__u8)2, 4026534999 0 eth0 10.244.3.124:35473->10.244.2.158:53 kfree_skb_reason(SKB_DROP_REASON_XFRM_POLICY) .active_extensions = (__u8)2, In this case, there are no XFRM policies in the container's network namespace so the drop is unexpected. When we decrypt the IPsec packet, the XFRM state used for decryption is set in the skb extensions. This information is preserved across the netns switch. When we reach the XFRM policy check in the container's netns, __xfrm_policy_check drops the packet with LINUX_MIB_XFRMINNOPOLS because a (container-side) XFRM policy can't be found that matches the (host-side) XFRM state used for decryption. This patch fixes this by scrubbing the packet when using bpf_redirect_peer, as is done on typical netns switches via veth devices except skb->mark and skb->tstamp are not zeroed. Fixes: 9aa1206e8f482 ("bpf: Add redirect_peer helper") Signed-off-by: Paul Chaignon Acked-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://patch.msgid.link/1728ead5e0fe45e7a6542c36bd4e3ca07a73b7d6.1746460653.git.paul.chaignon@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/core/filter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/filter.c b/net/core/filter.c index d713696d0832..497b41ac399d 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -2500,6 +2500,7 @@ int skb_do_redirect(struct sk_buff *skb) goto out_drop; skb->dev = dev; dev_sw_netstats_rx_add(dev, skb->len); + skb_scrub_packet(skb, false); return -EAGAIN; } return flags & BPF_F_NEIGH ? From aad87c94be4366b6d81b4e5efd4bd9a29f400156 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:00 +0200 Subject: [PATCH 017/529] net: dsa: b53: allow leaky reserved multicast [ Upstream commit 5f93185a757ff38b36f849c659aeef368db15a68 ] Allow reserved multicast to ignore VLAN membership so STP and other management protocols work without a PVID VLAN configured when using a vlan aware bridge. Fixes: 967dd82ffc52 ("net: dsa: b53: Add support for Broadcom RoboSwitch") Signed-off-by: Jonas Gorski Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250429201710.330937-2-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index ea7d2c01e672..afaecc92f8a8 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -373,9 +373,11 @@ static void b53_enable_vlan(struct b53_device *dev, int port, bool enable, b53_read8(dev, B53_VLAN_PAGE, B53_VLAN_CTRL5, &vc5); } + vc1 &= ~VC1_RX_MCST_FWD_EN; + if (enable) { vc0 |= VC0_VLAN_EN | VC0_VID_CHK_EN | VC0_VID_HASH_VID; - vc1 |= VC1_RX_MCST_UNTAG_EN | VC1_RX_MCST_FWD_EN; + vc1 |= VC1_RX_MCST_UNTAG_EN; vc4 &= ~VC4_ING_VID_CHECK_MASK; if (enable_filtering) { vc4 |= VC4_ING_VID_VIO_DROP << VC4_ING_VID_CHECK_S; @@ -393,7 +395,7 @@ static void b53_enable_vlan(struct b53_device *dev, int port, bool enable, } else { vc0 &= ~(VC0_VLAN_EN | VC0_VID_CHK_EN | VC0_VID_HASH_VID); - vc1 &= ~(VC1_RX_MCST_UNTAG_EN | VC1_RX_MCST_FWD_EN); + vc1 &= ~VC1_RX_MCST_UNTAG_EN; vc4 &= ~VC4_ING_VID_CHECK_MASK; vc5 &= ~VC5_DROP_VTABLE_MISS; From 7f622ae738fa6f68b6909878a2bca7bf6e6a0b37 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:02 +0200 Subject: [PATCH 018/529] net: dsa: b53: fix clearing PVID of a port [ Upstream commit f480851981043d9bb6447ca9883ade9247b9a0ad ] Currently the PVID of ports are only set when adding/updating VLANs with PVID set or removing VLANs, but not when clearing the PVID flag of a VLAN. E.g. the following flow $ ip link add br0 type bridge vlan_filtering 1 $ ip link set sw1p1 master bridge $ bridge vlan add dev sw1p1 vid 10 pvid untagged $ bridge vlan add dev sw1p1 vid 10 untagged Would keep the PVID set as 10, despite the flag being cleared. Fix this by checking if we need to unset the PVID on vlan updates. Fixes: a2482d2ce349 ("net: dsa: b53: Plug in VLAN support") Signed-off-by: Jonas Gorski Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250429201710.330937-4-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index afaecc92f8a8..9e33e713b9c8 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1477,12 +1477,21 @@ int b53_vlan_add(struct dsa_switch *ds, int port, bool untagged = vlan->flags & BRIDGE_VLAN_INFO_UNTAGGED; bool pvid = vlan->flags & BRIDGE_VLAN_INFO_PVID; struct b53_vlan *vl; + u16 old_pvid, new_pvid; int err; err = b53_vlan_prepare(ds, port, vlan); if (err) return err; + b53_read16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), &old_pvid); + if (pvid) + new_pvid = vlan->vid; + else if (!pvid && vlan->vid == old_pvid) + new_pvid = b53_default_pvid(dev); + else + new_pvid = old_pvid; + vl = &dev->vlans[vlan->vid]; b53_get_vlan_entry(dev, vlan->vid, vl); @@ -1499,9 +1508,9 @@ int b53_vlan_add(struct dsa_switch *ds, int port, b53_set_vlan_entry(dev, vlan->vid, vl); b53_fast_age_vlan(dev, vlan->vid); - if (pvid && !dsa_is_cpu_port(ds, port)) { + if (!dsa_is_cpu_port(ds, port) && new_pvid != old_pvid) { b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), - vlan->vid); + new_pvid); b53_fast_age_vlan(dev, vlan->vid); } From 55c845e76fa7f8cfb06e42ec13ea93209acef095 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:03 +0200 Subject: [PATCH 019/529] net: dsa: b53: fix flushing old pvid VLAN on pvid change [ Upstream commit 083c6b28c0cbcd83b6af1a10f2c82937129b3438 ] Presumably the intention here was to flush the VLAN of the old pvid, not the added VLAN again, which we already flushed before. Fixes: a2482d2ce349 ("net: dsa: b53: Plug in VLAN support") Signed-off-by: Jonas Gorski Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250429201710.330937-5-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 9e33e713b9c8..cff0e1ecaca5 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1511,7 +1511,7 @@ int b53_vlan_add(struct dsa_switch *ds, int port, if (!dsa_is_cpu_port(ds, port) && new_pvid != old_pvid) { b53_write16(dev, B53_VLAN_PAGE, B53_VLAN_PORT_DEF_TAG(port), new_pvid); - b53_fast_age_vlan(dev, vlan->vid); + b53_fast_age_vlan(dev, old_pvid); } return 0; From 4dc610122c09f0bfdb5ee1331e6cf59c5c2b3a79 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:04 +0200 Subject: [PATCH 020/529] net: dsa: b53: fix VLAN ID for untagged vlan on bridge leave [ Upstream commit a1c1901c5cc881425cc45992ab6c5418174e9e5a ] The untagged default VLAN is added to the default vlan, which may be one, but we modify the VLAN 0 entry on bridge leave. Fix this to use the correct VLAN entry for the default pvid. Fixes: fea83353177a ("net: dsa: b53: Fix default VLAN ID") Signed-off-by: Jonas Gorski Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250429201710.330937-6-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index cff0e1ecaca5..bffd23ff6e13 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1923,7 +1923,7 @@ EXPORT_SYMBOL(b53_br_join); void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge) { struct b53_device *dev = ds->priv; - struct b53_vlan *vl = &dev->vlans[0]; + struct b53_vlan *vl; s8 cpu_port = dsa_to_port(ds, port)->cpu_dp->index; unsigned int i; u16 pvlan, reg, pvid; @@ -1949,6 +1949,7 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge) dev->ports[port].vlan_ctl_mask = pvlan; pvid = b53_default_pvid(dev); + vl = &dev->vlans[pvid]; /* Make this port join all VLANs without VLAN entries */ if (is58xx(dev)) { From 3379b3ca2c65d79ebb260ff4369404a95924821c Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:05 +0200 Subject: [PATCH 021/529] net: dsa: b53: always rejoin default untagged VLAN on bridge leave [ Upstream commit 13b152ae40495966501697693f048f47430c50fd ] While JOIN_ALL_VLAN allows to join all VLANs, we still need to keep the default VLAN enabled so that untagged traffic stays untagged. So rejoin the default VLAN even for switches with JOIN_ALL_VLAN support. Fixes: 48aea33a77ab ("net: dsa: b53: Add JOIN_ALL_VLAN support") Signed-off-by: Jonas Gorski Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250429201710.330937-7-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index bffd23ff6e13..9d410c21416f 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1958,12 +1958,12 @@ void b53_br_leave(struct dsa_switch *ds, int port, struct dsa_bridge bridge) if (!(reg & BIT(cpu_port))) reg |= BIT(cpu_port); b53_write16(dev, B53_VLAN_PAGE, B53_JOIN_ALL_VLAN_EN, reg); - } else { - b53_get_vlan_entry(dev, pvid, vl); - vl->members |= BIT(port) | BIT(cpu_port); - vl->untag |= BIT(port) | BIT(cpu_port); - b53_set_vlan_entry(dev, pvid, vl); } + + b53_get_vlan_entry(dev, pvid, vl); + vl->members |= BIT(port) | BIT(cpu_port); + vl->untag |= BIT(port) | BIT(cpu_port); + b53_set_vlan_entry(dev, pvid, vl); } EXPORT_SYMBOL(b53_br_leave); From 03d71e1b2488c1318b51e1db06324c6e4730c4f3 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Tue, 29 Apr 2025 22:17:09 +0200 Subject: [PATCH 022/529] net: dsa: b53: fix learning on VLAN unaware bridges [ Upstream commit 9f34ad89bcf0e6df6f8b01f1bdab211493fc66d1 ] When VLAN filtering is off, we configure the switch to forward, but not learn on VLAN table misses. This effectively disables learning while not filtering. Fix this by switching to forward and learn. Setting the learning disable register will still control whether learning actually happens. Fixes: dad8d7c6452b ("net: dsa: b53: Properly account for VLAN filtering") Signed-off-by: Jonas Gorski Tested-by: Florian Fainelli Reviewed-by: Florian Fainelli Link: https://patch.msgid.link/20250429201710.330937-11-jonas.gorski@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 9d410c21416f..1a23fcc0445c 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -383,7 +383,7 @@ static void b53_enable_vlan(struct b53_device *dev, int port, bool enable, vc4 |= VC4_ING_VID_VIO_DROP << VC4_ING_VID_CHECK_S; vc5 |= VC5_DROP_VTABLE_MISS; } else { - vc4 |= VC4_ING_VID_VIO_FWD << VC4_ING_VID_CHECK_S; + vc4 |= VC4_NO_ING_VID_CHK << VC4_ING_VID_CHECK_S; vc5 &= ~VC5_DROP_VTABLE_MISS; } From 334d74a798463ceec02a41eb0e2354aaac0d6249 Mon Sep 17 00:00:00 2001 From: Gary Bisson Date: Tue, 29 Apr 2025 09:16:29 -0700 Subject: [PATCH 023/529] Input: mtk-pmic-keys - fix possible null pointer dereference commit 11cdb506d0fbf5ac05bf55f5afcb3a215c316490 upstream. In mtk_pmic_keys_probe, the regs parameter is only set if the button is parsed in the device tree. However, on hardware where the button is left floating, that node will most likely be removed not to enable that input. In that case the code will try to dereference a null pointer. Let's use the regs struct instead as it is defined for all supported platforms. Note that it is ok setting the key reg even if that latter is disabled as the interrupt won't be enabled anyway. Fixes: b581acb49aec ("Input: mtk-pmic-keys - transfer per-key bit in mtk_pmic_keys_regs") Signed-off-by: Gary Bisson Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/keyboard/mtk-pmic-keys.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/keyboard/mtk-pmic-keys.c b/drivers/input/keyboard/mtk-pmic-keys.c index 9b34da0ec260..2d8999023468 100644 --- a/drivers/input/keyboard/mtk-pmic-keys.c +++ b/drivers/input/keyboard/mtk-pmic-keys.c @@ -133,8 +133,8 @@ static void mtk_pmic_keys_lp_reset_setup(struct mtk_pmic_keys *keys, u32 value, mask; int error; - kregs_home = keys->keys[MTK_PMIC_HOMEKEY_INDEX].regs; - kregs_pwr = keys->keys[MTK_PMIC_PWRKEY_INDEX].regs; + kregs_home = ®s->keys_regs[MTK_PMIC_HOMEKEY_INDEX]; + kregs_pwr = ®s->keys_regs[MTK_PMIC_PWRKEY_INDEX]; error = of_property_read_u32(keys->dev->of_node, "power-off-time-sec", &long_press_debounce); From f69db59eec3e2de1cfa94f608a5570e4853b7b8b Mon Sep 17 00:00:00 2001 From: Manuel Fombuena Date: Wed, 7 May 2025 12:05:26 -0700 Subject: [PATCH 024/529] Input: synaptics - enable InterTouch on Dynabook Portege X30-D commit 6d7ea0881000966607772451b789b5fb5766f11d upstream. [ 5.989588] psmouse serio1: synaptics: Your touchpad (PNP: TOS0213 PNP0f03) says it can support a different bus. If i2c-hid and hid-rmi are not used, you might want to try setting psmouse.synaptics_intertouch to 1 and report this to linux-input@vger.kernel.org. [ 6.039923] psmouse serio1: synaptics: Touchpad model: 1, fw: 9.32, id: 0x1e2a1, caps: 0xf00223/0x840300/0x12e800/0x52d884, board id: 3322, fw id: 2658004 The board is labelled TM3322. Present on the Toshiba / Dynabook Portege X30-D and possibly others. Confirmed working well with psmouse.synaptics_intertouch=1 and local build. Signed-off-by: Manuel Fombuena Signed-off-by: Aditya Garg Link: https://lore.kernel.org/r/PN3PR01MB9597711E7933A08389FEC31DB888A@PN3PR01MB9597.INDPRD01.PROD.OUTLOOK.COM Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index d8c90a23a101..f2d4182a629a 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -194,6 +194,7 @@ static const char * const smbus_pnp_ids[] = { "SYN3221", /* HP 15-ay000 */ "SYN323d", /* HP Spectre X360 13-w013dx */ "SYN3257", /* HP Envy 13-ad105ng */ + "TOS0213", /* Dynabook Portege X30-D */ NULL }; From ed506876c5fcaaa720f8d48f4c0d935c04c52fbe Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Wed, 7 May 2025 12:06:32 -0700 Subject: [PATCH 025/529] Input: synaptics - enable InterTouch on Dynabook Portege X30L-G commit 47d768b32e644b56901bb4bbbdb1feb01ea86c85 upstream. Enable InterTouch mode on Dynabook Portege X30L-G by adding "TOS01f6" to the list of SMBus-enabled variants. Reported-by: Xuntao Chi Tested-by: Xuntao Chi Signed-off-by: Aditya Garg Link: https://lore.kernel.org/r/PN3PR01MB959786E4AC797160CDA93012B888A@PN3PR01MB9597.INDPRD01.PROD.OUTLOOK.COM Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index f2d4182a629a..9654b7537679 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -194,6 +194,7 @@ static const char * const smbus_pnp_ids[] = { "SYN3221", /* HP 15-ay000 */ "SYN323d", /* HP Spectre X360 13-w013dx */ "SYN3257", /* HP Envy 13-ad105ng */ + "TOS01f6", /* Dynabook Portege X30L-G */ "TOS0213", /* Dynabook Portege X30-D */ NULL }; From 7905a5fd76ff227cecd6f835760546dcf6b2f4ed Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Wed, 7 May 2025 12:12:15 -0700 Subject: [PATCH 026/529] Input: synaptics - enable InterTouch on Dell Precision M3800 commit a609cb4cc07aa9ab8f50466622814356c06f2c17 upstream. Enable InterTouch mode on Dell Precision M3800 by adding "DLL060d" to the list of SMBus-enabled variants. Reported-by: Markus Rathgeb Signed-off-by: Aditya Garg Link: https://lore.kernel.org/r/PN3PR01MB959789DD6D574E16141E5DC4B888A@PN3PR01MB9597.INDPRD01.PROD.OUTLOOK.COM Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 9654b7537679..17dfcc83e579 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -163,6 +163,7 @@ static const char * const topbuttonpad_pnp_ids[] = { static const char * const smbus_pnp_ids[] = { /* all of the topbuttonpad_pnp_ids are valid, we just add some extras */ + "DLL060d", /* Dell Precision M3800 */ "LEN0048", /* X1 Carbon 3 */ "LEN0046", /* X250 */ "LEN0049", /* Yoga 11e */ From 4e904ea41f54fe64ff39ff35a0d5b86f6cb46620 Mon Sep 17 00:00:00 2001 From: Dmitry Torokhov Date: Wed, 7 May 2025 14:52:55 -0700 Subject: [PATCH 027/529] Input: synaptics - enable SMBus for HP Elitebook 850 G1 commit f04f03d3e99bc8f89b6af5debf07ff67d961bc23 upstream. The kernel reports that the touchpad for this device can support SMBus mode. Reported-by: jt Link: https://lore.kernel.org/r/iys5dbv3ldddsgobfkxldazxyp54kay4bozzmagga6emy45jop@2ebvuxgaui4u Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 17dfcc83e579..9baacebf095b 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -190,6 +190,7 @@ static const char * const smbus_pnp_ids[] = { "LEN2054", /* E480 */ "LEN2055", /* E580 */ "LEN2068", /* T14 Gen 1 */ + "SYN3003", /* HP EliteBook 850 G1 */ "SYN3015", /* HP EliteBook 840 G2 */ "SYN3052", /* HP EliteBook 840 G4 */ "SYN3221", /* HP 15-ay000 */ From a92a9a4a33d7c55bec16708082b07c633ac8a009 Mon Sep 17 00:00:00 2001 From: Aditya Garg Date: Wed, 7 May 2025 12:09:00 -0700 Subject: [PATCH 028/529] Input: synaptics - enable InterTouch on TUXEDO InfinityBook Pro 14 v5 commit 2abc698ac77314e0de5b33a6d96a39c5159d88e4 upstream. Enable InterTouch mode on TUXEDO InfinityBook Pro 14 v5 by adding "SYN1221" to the list of SMBus-enabled variants. Add support for InterTouch on SYN1221 by adding it to the list of SMBus-enabled variants. Reported-by: Matthias Eilert Tested-by: Matthias Eilert Signed-off-by: Aditya Garg Link: https://lore.kernel.org/r/PN3PR01MB9597C033C4BC20EE2A0C4543B888A@PN3PR01MB9597.INDPRD01.PROD.OUTLOOK.COM Cc: stable@vger.kernel.org Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/synaptics.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/synaptics.c b/drivers/input/mouse/synaptics.c index 9baacebf095b..6cc66774c107 100644 --- a/drivers/input/mouse/synaptics.c +++ b/drivers/input/mouse/synaptics.c @@ -190,6 +190,7 @@ static const char * const smbus_pnp_ids[] = { "LEN2054", /* E480 */ "LEN2055", /* E580 */ "LEN2068", /* T14 Gen 1 */ + "SYN1221", /* TUXEDO InfinityBook Pro 14 v5 */ "SYN3003", /* HP EliteBook 850 G1 */ "SYN3015", /* HP EliteBook 840 G2 */ "SYN3052", /* HP EliteBook 840 G4 */ From 0f2c03bc1de6419775b3802f69c86000f3772c82 Mon Sep 17 00:00:00 2001 From: Gabriel Shahrouzi Date: Mon, 14 Apr 2025 11:40:49 -0400 Subject: [PATCH 029/529] staging: iio: adc: ad7816: Correct conditional logic for store mode MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2e922956277187655ed9bedf7b5c28906e51708f upstream. The mode setting logic in ad7816_store_mode was reversed due to incorrect handling of the strcmp return value. strcmp returns 0 on match, so the `if (strcmp(buf, "full"))` block executed when the input was not "full". This resulted in "full" setting the mode to AD7816_PD (power-down) and other inputs setting it to AD7816_FULL. Fix this by checking it against 0 to correctly check for "full" and "power-down", mapping them to AD7816_FULL and AD7816_PD respectively. Fixes: 7924425db04a ("staging: iio: adc: new driver for AD7816 devices") Cc: stable@vger.kernel.org Signed-off-by: Gabriel Shahrouzi Acked-by: Nuno Sá Link: https://lore.kernel.org/stable/20250414152920.467505-1-gshahrouzi%40gmail.com Link: https://patch.msgid.link/20250414154050.469482-1-gshahrouzi@gmail.com Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/staging/iio/adc/ad7816.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/iio/adc/ad7816.c b/drivers/staging/iio/adc/ad7816.c index 6c14d7bcdd67..081b17f49863 100644 --- a/drivers/staging/iio/adc/ad7816.c +++ b/drivers/staging/iio/adc/ad7816.c @@ -136,7 +136,7 @@ static ssize_t ad7816_store_mode(struct device *dev, struct iio_dev *indio_dev = dev_to_iio_dev(dev); struct ad7816_chip_info *chip = iio_priv(indio_dev); - if (strcmp(buf, "full")) { + if (strcmp(buf, "full") == 0) { gpiod_set_value(chip->rdwr_pin, 1); chip->mode = AD7816_FULL; } else { From 0a73a6ac5ffc4988027c88df61adddf3582fffbf Mon Sep 17 00:00:00 2001 From: Gabriel Shahrouzi Date: Fri, 18 Apr 2025 20:43:06 -0400 Subject: [PATCH 030/529] staging: axis-fifo: Remove hardware resets for user errors commit c6e8d85fafa7193613db37da29c0e8d6e2515b13 upstream. The axis-fifo driver performs a full hardware reset (via reset_ip_core()) in several error paths within the read and write functions. This reset flushes both TX and RX FIFOs and resets the AXI-Stream links. Allow the user to handle the error without causing hardware disruption or data loss in other FIFO paths. Fixes: 4a965c5f89de ("staging: add driver for Xilinx AXI-Stream FIFO v4.1 IP core") Cc: stable@vger.kernel.org Signed-off-by: Gabriel Shahrouzi Link: https://lore.kernel.org/r/20250419004306.669605-1-gshahrouzi@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/axis-fifo/axis-fifo.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/staging/axis-fifo/axis-fifo.c b/drivers/staging/axis-fifo/axis-fifo.c index 0a85ea667a1b..847d7ee8c084 100644 --- a/drivers/staging/axis-fifo/axis-fifo.c +++ b/drivers/staging/axis-fifo/axis-fifo.c @@ -400,16 +400,14 @@ static ssize_t axis_fifo_read(struct file *f, char __user *buf, bytes_available = ioread32(fifo->base_addr + XLLF_RLR_OFFSET); if (!bytes_available) { - dev_err(fifo->dt_device, "received a packet of length 0 - fifo core will be reset\n"); - reset_ip_core(fifo); + dev_err(fifo->dt_device, "received a packet of length 0\n"); ret = -EIO; goto end_unlock; } if (bytes_available > len) { - dev_err(fifo->dt_device, "user read buffer too small (available bytes=%zu user buffer bytes=%zu) - fifo core will be reset\n", + dev_err(fifo->dt_device, "user read buffer too small (available bytes=%zu user buffer bytes=%zu)\n", bytes_available, len); - reset_ip_core(fifo); ret = -EINVAL; goto end_unlock; } @@ -418,8 +416,7 @@ static ssize_t axis_fifo_read(struct file *f, char __user *buf, /* this probably can't happen unless IP * registers were previously mishandled */ - dev_err(fifo->dt_device, "received a packet that isn't word-aligned - fifo core will be reset\n"); - reset_ip_core(fifo); + dev_err(fifo->dt_device, "received a packet that isn't word-aligned\n"); ret = -EIO; goto end_unlock; } @@ -440,7 +437,6 @@ static ssize_t axis_fifo_read(struct file *f, char __user *buf, if (copy_to_user(buf + copied * sizeof(u32), tmp_buf, copy * sizeof(u32))) { - reset_ip_core(fifo); ret = -EFAULT; goto end_unlock; } @@ -549,7 +545,6 @@ static ssize_t axis_fifo_write(struct file *f, const char __user *buf, if (copy_from_user(tmp_buf, buf + copied * sizeof(u32), copy * sizeof(u32))) { - reset_ip_core(fifo); ret = -EFAULT; goto end_unlock; } From 651d9b7d478a17738267e835d431704fe5d545e3 Mon Sep 17 00:00:00 2001 From: Gabriel Shahrouzi Date: Fri, 18 Apr 2025 21:29:37 -0400 Subject: [PATCH 031/529] staging: axis-fifo: Correct handling of tx_fifo_depth for size validation commit 2ca34b508774aaa590fc3698a54204706ecca4ba upstream. Remove erroneous subtraction of 4 from the total FIFO depth read from device tree. The stored depth is for checking against total capacity, not initial vacancy. This prevented writes near the FIFO's full size. The check performed just before data transfer, which uses live reads of the TDFV register to determine current vacancy, correctly handles the initial Depth - 4 hardware state and subsequent FIFO fullness. Fixes: 4a965c5f89de ("staging: add driver for Xilinx AXI-Stream FIFO v4.1 IP core") Cc: stable@vger.kernel.org Signed-off-by: Gabriel Shahrouzi Link: https://lore.kernel.org/r/20250419012937.674924-1-gshahrouzi@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/staging/axis-fifo/axis-fifo.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/staging/axis-fifo/axis-fifo.c b/drivers/staging/axis-fifo/axis-fifo.c index 847d7ee8c084..3369892eacf4 100644 --- a/drivers/staging/axis-fifo/axis-fifo.c +++ b/drivers/staging/axis-fifo/axis-fifo.c @@ -777,9 +777,6 @@ static int axis_fifo_parse_dt(struct axis_fifo *fifo) goto end; } - /* IP sets TDFV to fifo depth - 4 so we will do the same */ - fifo->tx_fifo_depth -= 4; - ret = get_dts_property(fifo, "xlnx,use-rx-data", &fifo->has_rx_fifo); if (ret) { dev_err(fifo->dt_device, "missing xlnx,use-rx-data property\n"); From 02ad4ce144bd27f71f583f667fdf3b3ba0753477 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 8 May 2025 15:41:32 -0700 Subject: [PATCH 032/529] x86/mm: Eliminate window where TLB flushes may be inadvertently skipped commit fea4e317f9e7e1f449ce90dedc27a2d2a95bee5a upstream. tl;dr: There is a window in the mm switching code where the new CR3 is set and the CPU should be getting TLB flushes for the new mm. But should_flush_tlb() has a bug and suppresses the flush. Fix it by widening the window where should_flush_tlb() sends an IPI. Long Version: === History === There were a few things leading up to this. First, updating mm_cpumask() was observed to be too expensive, so it was made lazier. But being lazy caused too many unnecessary IPIs to CPUs due to the now-lazy mm_cpumask(). So code was added to cull mm_cpumask() periodically[2]. But that culling was a bit too aggressive and skipped sending TLB flushes to CPUs that need them. So here we are again. === Problem === The too-aggressive code in should_flush_tlb() strikes in this window: // Turn on IPIs for this CPU/mm combination, but only // if should_flush_tlb() agrees: cpumask_set_cpu(cpu, mm_cpumask(next)); next_tlb_gen = atomic64_read(&next->context.tlb_gen); choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); load_new_mm_cr3(need_flush); // ^ After 'need_flush' is set to false, IPIs *MUST* // be sent to this CPU and not be ignored. this_cpu_write(cpu_tlbstate.loaded_mm, next); // ^ Not until this point does should_flush_tlb() // become true! should_flush_tlb() will suppress TLB flushes between load_new_mm_cr3() and writing to 'loaded_mm', which is a window where they should not be suppressed. Whoops. === Solution === Thankfully, the fuzzy "just about to write CR3" window is already marked with loaded_mm==LOADED_MM_SWITCHING. Simply checking for that state in should_flush_tlb() is sufficient to ensure that the CPU is targeted with an IPI. This will cause more TLB flush IPIs. But the window is relatively small and I do not expect this to cause any kind of measurable performance impact. Update the comment where LOADED_MM_SWITCHING is written since it grew yet another user. Peter Z also raised a concern that should_flush_tlb() might not observe 'loaded_mm' and 'is_lazy' in the same order that switch_mm_irqs_off() writes them. Add a barrier to ensure that they are observed in the order they are written. Signed-off-by: Dave Hansen Acked-by: Rik van Riel Link: https://lore.kernel.org/oe-lkp/202411282207.6bd28eae-lkp@intel.com/ [1] Fixes: 6db2526c1d69 ("x86/mm/tlb: Only trim the mm_cpumask once a second") [2] Reported-by: Stephen Dolan Cc: stable@vger.kernel.org Acked-by: Ingo Molnar Acked-by: Peter Zijlstra (Intel) Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/tlb.c | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 8d46b9c0e920..4918268d2007 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -617,7 +617,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next, choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); - /* Let nmi_uaccess_okay() know that we're changing CR3. */ + /* + * Indicate that CR3 is about to change. nmi_uaccess_okay() + * and others are sensitive to the window where mm_cpumask(), + * CR3 and cpu_tlbstate.loaded_mm are not all in sync. + */ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); barrier(); } @@ -880,8 +884,16 @@ done: static bool should_flush_tlb(int cpu, void *data) { + struct mm_struct *loaded_mm = per_cpu(cpu_tlbstate.loaded_mm, cpu); struct flush_tlb_info *info = data; + /* + * Order the 'loaded_mm' and 'is_lazy' against their + * write ordering in switch_mm_irqs_off(). Ensure + * 'is_lazy' is at least as new as 'loaded_mm'. + */ + smp_rmb(); + /* Lazy TLB will get flushed at the next context switch. */ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) return false; @@ -890,8 +902,15 @@ static bool should_flush_tlb(int cpu, void *data) if (!info->mm) return true; + /* + * While switching, the remote CPU could have state from + * either the prev or next mm. Assume the worst and flush. + */ + if (loaded_mm == LOADED_MM_SWITCHING) + return true; + /* The target mm is loaded, and the CPU is not lazy. */ - if (per_cpu(cpu_tlbstate.loaded_mm, cpu) == info->mm) + if (loaded_mm == info->mm) return true; /* In cpumask, but not the loaded mm? Periodically remove by flushing. */ From 30a4efc0671d83d9c2d19409532357ceafd505cc Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Fri, 18 Apr 2025 16:31:59 +0800 Subject: [PATCH 033/529] drm/amd/display: Shift DMUB AUX reply command if necessary commit 5a3846648c0523fd850b7f0aec78c0139453ab8b upstream. [Why] Defined value of dmub AUX reply command field get updated but didn't adjust dm receiving side accordingly. [How] Check the received reply command value to see if it's updated version or not. Adjust it if necessary. Fixes: ead08b95fa50 ("drm/amd/display: Fix race condition in DPIA AUX transfer") Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Ray Wu Signed-off-by: Wayne Lin Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit d5c9ade755a9afa210840708a12a8f44c0d532f4) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7dee02e8ba6f..7b8b560138cd 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -10702,8 +10702,11 @@ int amdgpu_dm_process_dmub_aux_transfer_sync( goto out; } + payload->reply[0] = adev->dm.dmub_notify->aux_reply.command & 0xF; + if (adev->dm.dmub_notify->aux_reply.command & 0xF0) + /* The reply is stored in the top nibble of the command. */ + payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF; - payload->reply[0] = adev->dm.dmub_notify->aux_reply.command; if (!payload->write && p_notify->aux_reply.length && (payload->reply[0] == AUX_TRANSACTION_REPLY_AUX_ACK)) { From 06acabd59bd987c54d7467335704e8069c32846b Mon Sep 17 00:00:00 2001 From: Angelo Dureghello Date: Fri, 18 Apr 2025 20:37:53 +0200 Subject: [PATCH 034/529] iio: adc: ad7606: fix serial register access commit f083f8a21cc785ebe3a33f756a3fa3660611f8db upstream. Fix register read/write routine as per datasheet. When reading multiple consecutive registers, only the first one is read properly. This is due to missing chip select deassert and assert again between first and second 16bit transfer, as shown in the datasheet AD7606C-16, rev 0, figure 110. Fixes: f2a22e1e172f ("iio: adc: ad7606: Add support for software mode for ad7616") Reviewed-by: David Lechner Signed-off-by: Angelo Dureghello Link: https://patch.msgid.link/20250418-wip-bl-ad7606-fix-reg-access-v3-1-d5eeb440c738@baylibre.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/ad7606_spi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/ad7606_spi.c b/drivers/iio/adc/ad7606_spi.c index 287a0591533b..67c96572cecc 100644 --- a/drivers/iio/adc/ad7606_spi.c +++ b/drivers/iio/adc/ad7606_spi.c @@ -127,7 +127,7 @@ static int ad7606_spi_reg_read(struct ad7606_state *st, unsigned int addr) { .tx_buf = &st->d16[0], .len = 2, - .cs_change = 0, + .cs_change = 1, }, { .rx_buf = &st->d16[1], .len = 2, From 2c66239cbf911849eb7c5ddd19bc75694b73ca41 Mon Sep 17 00:00:00 2001 From: Gabriel Shahrouzi Date: Mon, 21 Apr 2025 09:15:39 -0400 Subject: [PATCH 035/529] iio: adis16201: Correct inclinometer channel resolution commit 609bc31eca06c7408e6860d8b46311ebe45c1fef upstream. The inclinometer channels were previously defined with 14 realbits. However, the ADIS16201 datasheet states the resolution for these output channels is 12 bits (Page 14, text description; Page 15, table 7). Correct the realbits value to 12 to accurately reflect the hardware. Fixes: f7fe1d1dd5a5 ("staging: iio: new adis16201 driver") Cc: stable@vger.kernel.org Signed-off-by: Gabriel Shahrouzi Reviewed-by: Marcelo Schmitt Link: https://patch.msgid.link/20250421131539.912966-1-gshahrouzi@gmail.com Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/accel/adis16201.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/accel/adis16201.c b/drivers/iio/accel/adis16201.c index dfb8e2e5bdf5..f063226a42f3 100644 --- a/drivers/iio/accel/adis16201.c +++ b/drivers/iio/accel/adis16201.c @@ -211,9 +211,9 @@ static const struct iio_chan_spec adis16201_channels[] = { BIT(IIO_CHAN_INFO_CALIBBIAS), 0, 14), ADIS_AUX_ADC_CHAN(ADIS16201_AUX_ADC_REG, ADIS16201_SCAN_AUX_ADC, 0, 12), ADIS_INCLI_CHAN(X, ADIS16201_XINCL_OUT_REG, ADIS16201_SCAN_INCLI_X, - BIT(IIO_CHAN_INFO_CALIBBIAS), 0, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 0, 12), ADIS_INCLI_CHAN(Y, ADIS16201_YINCL_OUT_REG, ADIS16201_SCAN_INCLI_Y, - BIT(IIO_CHAN_INFO_CALIBBIAS), 0, 14), + BIT(IIO_CHAN_INFO_CALIBBIAS), 0, 12), IIO_CHAN_SOFT_TIMESTAMP(7) }; From 6c4a5000618a8c44200d455c92e2f2a4db264717 Mon Sep 17 00:00:00 2001 From: Silvano Seva Date: Tue, 11 Mar 2025 09:49:47 +0100 Subject: [PATCH 036/529] iio: imu: st_lsm6dsx: fix possible lockup in st_lsm6dsx_read_fifo commit 159ca7f18129834b6f4c7eae67de48e96c752fc9 upstream. Prevent st_lsm6dsx_read_fifo from falling in an infinite loop in case pattern_len is equal to zero and the device FIFO is not empty. Fixes: 290a6ce11d93 ("iio: imu: add support to lsm6dsx driver") Signed-off-by: Silvano Seva Acked-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250311085030.3593-2-s.seva@4sigma.it Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c index e49f2d120ed3..df70b06f815a 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c @@ -370,6 +370,9 @@ int st_lsm6dsx_read_fifo(struct st_lsm6dsx_hw *hw) if (fifo_status & cpu_to_le16(ST_LSM6DSX_FIFO_EMPTY_MASK)) return 0; + if (!pattern_len) + pattern_len = ST_LSM6DSX_SAMPLE_SIZE; + fifo_len = (le16_to_cpu(fifo_status) & fifo_diff_mask) * ST_LSM6DSX_CHAN_SIZE; fifo_len = (fifo_len / pattern_len) * pattern_len; From 16857370b3a30663515956b3bd27f3def6a2cf06 Mon Sep 17 00:00:00 2001 From: Silvano Seva Date: Tue, 11 Mar 2025 09:49:49 +0100 Subject: [PATCH 037/529] iio: imu: st_lsm6dsx: fix possible lockup in st_lsm6dsx_read_tagged_fifo commit 8114ef86e2058e2554111b793596f17bee23fa15 upstream. Prevent st_lsm6dsx_read_tagged_fifo from falling in an infinite loop in case pattern_len is equal to zero and the device FIFO is not empty. Fixes: 801a6e0af0c6 ("iio: imu: st_lsm6dsx: add support to LSM6DSO") Signed-off-by: Silvano Seva Acked-by: Lorenzo Bianconi Link: https://patch.msgid.link/20250311085030.3593-4-s.seva@4sigma.it Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c index df70b06f815a..f5a68059f0a6 100644 --- a/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c +++ b/drivers/iio/imu/st_lsm6dsx/st_lsm6dsx_buffer.c @@ -590,6 +590,9 @@ int st_lsm6dsx_read_tagged_fifo(struct st_lsm6dsx_hw *hw) if (!fifo_len) return 0; + if (!pattern_len) + pattern_len = ST_LSM6DSX_TAGGED_SAMPLE_SIZE; + for (read_len = 0; read_len < fifo_len; read_len += pattern_len) { err = st_lsm6dsx_read_block(hw, ST_LSM6DSX_REG_FIFO_OUT_TAG_ADDR, From 5235b56b7e5449d990d21d78723b1a5e7bb5738e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ma=C3=ADra=20Canal?= Date: Wed, 30 Apr 2025 17:51:52 -0300 Subject: [PATCH 038/529] drm/v3d: Add job to pending list if the reset was skipped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 35e4079bf1a2570abffce6ababa631afcf8ea0e5 upstream. When a CL/CSD job times out, we check if the GPU has made any progress since the last timeout. If so, instead of resetting the hardware, we skip the reset and let the timer get rearmed. This gives long-running jobs a chance to complete. However, when `timedout_job()` is called, the job in question is removed from the pending list, which means it won't be automatically freed through `free_job()`. Consequently, when we skip the reset and keep the job running, the job won't be freed when it finally completes. This situation leads to a memory leak, as exposed in [1] and [2]. Similarly to commit 704d3d60fec4 ("drm/etnaviv: don't block scheduler when GPU is still active"), this patch ensures the job is put back on the pending list when extending the timeout. Cc: stable@vger.kernel.org # 6.0 Reported-by: Daivik Bhatia Closes: https://gitlab.freedesktop.org/mesa/mesa/-/issues/12227 [1] Closes: https://github.com/raspberrypi/linux/issues/6817 [2] Reviewed-by: Iago Toral Quiroga Acked-by: Tvrtko Ursulin Link: https://lore.kernel.org/r/20250430210643.57924-1-mcanal@igalia.com Signed-off-by: Maíra Canal Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/v3d/v3d_sched.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/drivers/gpu/drm/v3d/v3d_sched.c b/drivers/gpu/drm/v3d/v3d_sched.c index 5b729013fd26..41493cf3d03b 100644 --- a/drivers/gpu/drm/v3d/v3d_sched.c +++ b/drivers/gpu/drm/v3d/v3d_sched.c @@ -289,11 +289,16 @@ v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job) return DRM_GPU_SCHED_STAT_NOMINAL; } -/* If the current address or return address have changed, then the GPU - * has probably made progress and we should delay the reset. This - * could fail if the GPU got in an infinite loop in the CL, but that - * is pretty unlikely outside of an i-g-t testcase. - */ +static void +v3d_sched_skip_reset(struct drm_sched_job *sched_job) +{ + struct drm_gpu_scheduler *sched = sched_job->sched; + + spin_lock(&sched->job_list_lock); + list_add(&sched_job->list, &sched->pending_list); + spin_unlock(&sched->job_list_lock); +} + static enum drm_gpu_sched_stat v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q, u32 *timedout_ctca, u32 *timedout_ctra) @@ -303,9 +308,16 @@ v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q, u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q)); u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q)); + /* If the current address or return address have changed, then the GPU + * has probably made progress and we should delay the reset. This + * could fail if the GPU got in an infinite loop in the CL, but that + * is pretty unlikely outside of an i-g-t testcase. + */ if (*timedout_ctca != ctca || *timedout_ctra != ctra) { *timedout_ctca = ctca; *timedout_ctra = ctra; + + v3d_sched_skip_reset(sched_job); return DRM_GPU_SCHED_STAT_NOMINAL; } @@ -345,11 +357,13 @@ v3d_csd_job_timedout(struct drm_sched_job *sched_job) struct v3d_dev *v3d = job->base.v3d; u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4); - /* If we've made progress, skip reset and let the timer get - * rearmed. + /* If we've made progress, skip reset, add the job to the pending + * list, and let the timer get rearmed. */ if (job->timedout_batches != batches) { job->timedout_batches = batches; + + v3d_sched_skip_reset(sched_job); return DRM_GPU_SCHED_STAT_NOMINAL; } From f3385a056a0fb9e4a44d818b0ca70d72b4c49593 Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Sun, 20 Apr 2025 16:29:07 +0800 Subject: [PATCH 039/529] drm/amd/display: Fix the checking condition in dmub aux handling commit bc70e11b550d37fbd9eaed0f113ba560894f1609 upstream. [Why & How] Fix the checking condition for detecting AUX_RET_ERROR_PROTOCOL_ERROR. It was wrongly checking by "not equals to" Reviewed-by: Ray Wu Signed-off-by: Wayne Lin Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 1db6c9e9b62e1a8912f0a281c941099fca678da3) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 7b8b560138cd..3bbf0e83ab2c 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -10693,7 +10693,7 @@ int amdgpu_dm_process_dmub_aux_transfer_sync( * Transient states before tunneling is enabled could * lead to this error. We can ignore this for now. */ - if (p_notify->result != AUX_RET_ERROR_PROTOCOL_ERROR) { + if (p_notify->result == AUX_RET_ERROR_PROTOCOL_ERROR) { DRM_WARN("DPIA AUX failed on 0x%x(%d), error %d\n", payload->address, payload->length, p_notify->result); From 2cca631283d2f787e151c756fe2f4e97618405b8 Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Sun, 20 Apr 2025 16:56:54 +0800 Subject: [PATCH 040/529] drm/amd/display: Remove incorrect checking in dmub aux handler commit 396dc51b3b7ea524bf8061f478332d0039e96d5d upstream. [Why & How] "Request length != reply length" is expected behavior defined in spec. It's not an invalid reply. Besides, replied data handling logic is not designed to be written in amdgpu_dm_process_dmub_aux_transfer_sync(). Remove the incorrectly handling section. Fixes: ead08b95fa50 ("drm/amd/display: Fix race condition in DPIA AUX transfer") Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Ray Wu Signed-off-by: Wayne Lin Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 81b5c6fa62af62fe89ae9576f41aae37830b94cb) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 3bbf0e83ab2c..069d45dbbc22 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -10708,19 +10708,9 @@ int amdgpu_dm_process_dmub_aux_transfer_sync( payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF; if (!payload->write && p_notify->aux_reply.length && - (payload->reply[0] == AUX_TRANSACTION_REPLY_AUX_ACK)) { - - if (payload->length != p_notify->aux_reply.length) { - DRM_WARN("invalid read length %d from DPIA AUX 0x%x(%d)!\n", - p_notify->aux_reply.length, - payload->address, payload->length); - *operation_result = AUX_RET_ERROR_INVALID_REPLY; - goto out; - } - + (payload->reply[0] == AUX_TRANSACTION_REPLY_AUX_ACK)) memcpy(payload->data, p_notify->aux_reply.data, p_notify->aux_reply.length); - } /* success */ ret = p_notify->aux_reply.length; From 9e83c84de3216a7560723d43eb10b4cce2e5c822 Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Sun, 20 Apr 2025 19:22:14 +0800 Subject: [PATCH 041/529] drm/amd/display: Fix wrong handling for AUX_DEFER case commit 65924ec69b29296845c7f628112353438e63ea56 upstream. [Why] We incorrectly ack all bytes get written when the reply actually is defer. When it's defer, means sink is not ready for the request. We should retry the request. [How] Only reply all data get written when receive I2C_ACK|AUX_ACK. Otherwise, reply the number of actual written bytes received from the sink. Add some messages to facilitate debugging as well. Fixes: ad6756b4d773 ("drm/amd/display: Shift dc link aux to aux_payload") Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Ray Wu Signed-off-by: Wayne Lin Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 3637e457eb0000bc37d8bbbec95964aad2fb29fd) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- .../display/amdgpu_dm/amdgpu_dm_mst_types.c | 28 ++++++++++++++++--- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 5eb994ed5471..ade5dd5d8231 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -55,6 +55,9 @@ bool is_timing_changed(struct dc_stream_state *cur_stream, struct dc_stream_state *new_stream); +/* + * This function handles both native AUX and I2C-Over-AUX transactions. + */ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg) { @@ -91,15 +94,25 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, if (adev->dm.aux_hpd_discon_quirk) { if (msg->address == DP_SIDEBAND_MSG_DOWN_REQ_BASE && operation_result == AUX_RET_ERROR_HPD_DISCON) { - result = 0; + result = msg->size; operation_result = AUX_RET_SUCCESS; } } - if (payload.write && result >= 0) - result = msg->size; + /* + * result equals to 0 includes the cases of AUX_DEFER/I2C_DEFER + */ + if (payload.write && result >= 0) { + if (result) { + /*one byte indicating partially written bytes. Force 0 to retry*/ + drm_info(adev_to_drm(adev), "amdgpu: AUX partially written\n"); + result = 0; + } else if (!payload.reply[0]) + /*I2C_ACK|AUX_ACK*/ + result = msg->size; + } - if (result < 0) + if (result < 0) { switch (operation_result) { case AUX_RET_SUCCESS: break; @@ -118,6 +131,13 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, break; } + drm_info(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result); + } + + if (payload.reply[0]) + drm_info(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.", + payload.reply[0]); + return result; } From 470f56fc351ae1c4d5d90d080218e9be0ca64ffb Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Sun, 20 Apr 2025 17:50:03 +0800 Subject: [PATCH 042/529] drm/amd/display: Copy AUX read reply data whenever length > 0 commit 3924f45d4de7250a603fd7b50379237a6a0e5adf upstream. [Why] amdgpu_dm_process_dmub_aux_transfer_sync() should return all exact data reply from the sink side. Don't do the analysis job in it. [How] Remove unnecessary check condition AUX_TRANSACTION_REPLY_AUX_ACK. Fixes: ead08b95fa50 ("drm/amd/display: Fix race condition in DPIA AUX transfer") Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Ray Wu Signed-off-by: Wayne Lin Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 9b540e3fe6796fec4fb1344f3be8952fc2f084d4) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 069d45dbbc22..43aaf3a42b2e 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -10707,8 +10707,7 @@ int amdgpu_dm_process_dmub_aux_transfer_sync( /* The reply is stored in the top nibble of the command. */ payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF; - if (!payload->write && p_notify->aux_reply.length && - (payload->reply[0] == AUX_TRANSACTION_REPLY_AUX_ACK)) + if (!payload->write && p_notify->aux_reply.length) memcpy(payload->data, p_notify->aux_reply.data, p_notify->aux_reply.length); From b04cfc229af7e5fc59515b1f35ef7729952ad616 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 30 Apr 2025 12:47:37 -0400 Subject: [PATCH 043/529] drm/amdgpu/hdp5.2: use memcfg register to post the write for HDP flush commit dbc988c689333faeeed44d5561f372ff20395304 upstream. Reading back the remapped HDP flush register seems to cause problems on some platforms. All we need is a read, so read back the memcfg register. Fixes: f756dbac1ce1 ("drm/amdgpu/hdp5.2: do a posting read when flushing HDP") Reported-by: Alexey Klimov Link: https://lists.freedesktop.org/archives/amd-gfx/2025-April/123150.html Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4119 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3908 Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher (cherry picked from commit 4a89b7698e771914b4d5b571600c76e2fdcbe2a9) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c b/drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c index f52552c5fa27..6b9f2e1d9d69 100644 --- a/drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c +++ b/drivers/gpu/drm/amd/amdgpu/hdp_v5_2.c @@ -34,7 +34,17 @@ static void hdp_v5_2_flush_hdp(struct amdgpu_device *adev, if (!ring || !ring->funcs->emit_wreg) { WREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, 0); - RREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2); + if (amdgpu_sriov_vf(adev)) { + /* this is fine because SR_IOV doesn't remap the register */ + RREG32_NO_KIQ((adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2); + } else { + /* We just need to read back a register to post the write. + * Reading back the remapped register causes problems on + * some platforms so just read back the memory size register. + */ + if (adev->nbio.funcs->get_memsize) + adev->nbio.funcs->get_memsize(adev); + } } else { amdgpu_ring_emit_wreg(ring, (adev->rmmio_remap.reg_offset + KFD_MMIO_REMAP_HDP_MEM_FLUSH_CNTL) >> 2, From cf61669c50ce9c3b3a50205bd12139b0358c991b Mon Sep 17 00:00:00 2001 From: Alexey Charkov Date: Fri, 25 Apr 2025 18:11:11 +0400 Subject: [PATCH 044/529] usb: uhci-platform: Make the clock really optional commit a5c7973539b010874a37a0e846e62ac6f00553ba upstream. Device tree bindings state that the clock is optional for UHCI platform controllers, and some existing device trees don't provide those - such as those for VIA/WonderMedia devices. The driver however fails to probe now if no clock is provided, because devm_clk_get returns an error pointer in such case. Switch to devm_clk_get_optional instead, so that it could probe again on those platforms where no clocks are given. Cc: stable Fixes: 26c502701c52 ("usb: uhci: Add clk support to uhci-platform") Signed-off-by: Alexey Charkov Link: https://lore.kernel.org/r/20250425-uhci-clock-optional-v1-1-a1d462592f29@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/uhci-platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/uhci-platform.c b/drivers/usb/host/uhci-platform.c index b2049b47a08d..53d2fbee76e2 100644 --- a/drivers/usb/host/uhci-platform.c +++ b/drivers/usb/host/uhci-platform.c @@ -122,7 +122,7 @@ static int uhci_hcd_platform_probe(struct platform_device *pdev) } /* Get and enable clock if any specified */ - uhci->clk = devm_clk_get(&pdev->dev, NULL); + uhci->clk = devm_clk_get_optional(&pdev->dev, NULL); if (IS_ERR(uhci->clk)) { ret = PTR_ERR(uhci->clk); goto err_rmr; From 8b02f85e84dc6f7c150cef40ddb69af5a25659e5 Mon Sep 17 00:00:00 2001 From: Jason Andryuk Date: Tue, 6 May 2025 17:09:33 -0400 Subject: [PATCH 045/529] xenbus: Use kref to track req lifetime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 1f0304dfd9d217c2f8b04a9ef4b3258a66eedd27 upstream. Marek reported seeing a NULL pointer fault in the xenbus_thread callstack: BUG: kernel NULL pointer dereference, address: 0000000000000000 RIP: e030:__wake_up_common+0x4c/0x180 Call Trace: __wake_up_common_lock+0x82/0xd0 process_msg+0x18e/0x2f0 xenbus_thread+0x165/0x1c0 process_msg+0x18e is req->cb(req). req->cb is set to xs_wake_up(), a thin wrapper around wake_up(), or xenbus_dev_queue_reply(). It seems like it was xs_wake_up() in this case. It seems like req may have woken up the xs_wait_for_reply(), which kfree()ed the req. When xenbus_thread resumes, it faults on the zero-ed data. Linux Device Drivers 2nd edition states: "Normally, a wake_up call can cause an immediate reschedule to happen, meaning that other processes might run before wake_up returns." ... which would match the behaviour observed. Change to keeping two krefs on each request. One for the caller, and one for xenbus_thread. Each will kref_put() when finished, and the last will free it. This use of kref matches the description in Documentation/core-api/kref.rst Link: https://lore.kernel.org/xen-devel/ZO0WrR5J0xuwDIxW@mail-itl/ Reported-by: Marek Marczykowski-Górecki Fixes: fd8aa9095a95 ("xen: optimize xenbus driver for multiple concurrent xenstore accesses") Cc: stable@vger.kernel.org Signed-off-by: Jason Andryuk Reviewed-by: Juergen Gross Signed-off-by: Juergen Gross Message-ID: <20250506210935.5607-1-jason.andryuk@amd.com> Signed-off-by: Greg Kroah-Hartman --- drivers/xen/xenbus/xenbus.h | 2 ++ drivers/xen/xenbus/xenbus_comms.c | 9 ++++----- drivers/xen/xenbus/xenbus_dev_frontend.c | 2 +- drivers/xen/xenbus/xenbus_xs.c | 18 ++++++++++++++++-- 4 files changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/xen/xenbus/xenbus.h b/drivers/xen/xenbus/xenbus.h index 2754bdfadcb8..4ba73320694a 100644 --- a/drivers/xen/xenbus/xenbus.h +++ b/drivers/xen/xenbus/xenbus.h @@ -77,6 +77,7 @@ enum xb_req_state { struct xb_req_data { struct list_head list; wait_queue_head_t wq; + struct kref kref; struct xsd_sockmsg msg; uint32_t caller_req_id; enum xsd_sockmsg_type type; @@ -103,6 +104,7 @@ int xb_init_comms(void); void xb_deinit_comms(void); int xs_watch_msg(struct xs_watch_event *event); void xs_request_exit(struct xb_req_data *req); +void xs_free_req(struct kref *kref); int xenbus_match(struct device *_dev, struct device_driver *_drv); int xenbus_dev_probe(struct device *_dev); diff --git a/drivers/xen/xenbus/xenbus_comms.c b/drivers/xen/xenbus/xenbus_comms.c index e5fda0256feb..82df2da1b880 100644 --- a/drivers/xen/xenbus/xenbus_comms.c +++ b/drivers/xen/xenbus/xenbus_comms.c @@ -309,8 +309,8 @@ static int process_msg(void) virt_wmb(); req->state = xb_req_state_got_reply; req->cb(req); - } else - kfree(req); + } + kref_put(&req->kref, xs_free_req); } mutex_unlock(&xs_response_mutex); @@ -386,14 +386,13 @@ static int process_writes(void) state.req->msg.type = XS_ERROR; state.req->err = err; list_del(&state.req->list); - if (state.req->state == xb_req_state_aborted) - kfree(state.req); - else { + if (state.req->state != xb_req_state_aborted) { /* write err, then update state */ virt_wmb(); state.req->state = xb_req_state_got_reply; wake_up(&state.req->wq); } + kref_put(&state.req->kref, xs_free_req); mutex_unlock(&xb_write_mutex); diff --git a/drivers/xen/xenbus/xenbus_dev_frontend.c b/drivers/xen/xenbus/xenbus_dev_frontend.c index 0792fda49a15..c495cff3da30 100644 --- a/drivers/xen/xenbus/xenbus_dev_frontend.c +++ b/drivers/xen/xenbus/xenbus_dev_frontend.c @@ -406,7 +406,7 @@ void xenbus_dev_queue_reply(struct xb_req_data *req) mutex_unlock(&u->reply_mutex); kfree(req->body); - kfree(req); + kref_put(&req->kref, xs_free_req); kref_put(&u->kref, xenbus_file_free); diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index 12e02eb01f59..a4dd92719e7a 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -112,6 +112,12 @@ static void xs_suspend_exit(void) wake_up_all(&xs_state_enter_wq); } +void xs_free_req(struct kref *kref) +{ + struct xb_req_data *req = container_of(kref, struct xb_req_data, kref); + kfree(req); +} + static uint32_t xs_request_enter(struct xb_req_data *req) { uint32_t rq_id; @@ -237,6 +243,12 @@ static void xs_send(struct xb_req_data *req, struct xsd_sockmsg *msg) req->caller_req_id = req->msg.req_id; req->msg.req_id = xs_request_enter(req); + /* + * Take 2nd ref. One for this thread, and the second for the + * xenbus_thread. + */ + kref_get(&req->kref); + mutex_lock(&xb_write_mutex); list_add_tail(&req->list, &xb_write_list); notify = list_is_singular(&xb_write_list); @@ -261,8 +273,8 @@ static void *xs_wait_for_reply(struct xb_req_data *req, struct xsd_sockmsg *msg) if (req->state == xb_req_state_queued || req->state == xb_req_state_wait_reply) req->state = xb_req_state_aborted; - else - kfree(req); + + kref_put(&req->kref, xs_free_req); mutex_unlock(&xb_write_mutex); return ret; @@ -291,6 +303,7 @@ int xenbus_dev_request_and_reply(struct xsd_sockmsg *msg, void *par) req->cb = xenbus_dev_queue_reply; req->par = par; req->user_req = true; + kref_init(&req->kref); xs_send(req, msg); @@ -319,6 +332,7 @@ static void *xs_talkv(struct xenbus_transaction t, req->num_vecs = num_vecs; req->cb = xs_wake_up; req->user_req = false; + kref_init(&req->kref); msg.req_id = 0; msg.tx_id = t.id; From 9e7b49ce4f9d0cb5b6e87db9e07a2fb9e754b0dd Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 7 May 2025 09:50:44 +0300 Subject: [PATCH 046/529] module: ensure that kobject_put() is safe for module type kobjects commit a6aeb739974ec73e5217c75a7c008a688d3d5cf1 upstream. In 'lookup_or_create_module_kobject()', an internal kobject is created using 'module_ktype'. So call to 'kobject_put()' on error handling path causes an attempt to use an uninitialized completion pointer in 'module_kobject_release()'. In this scenario, we just want to release kobject without an extra synchronization required for a regular module unloading process, so adding an extra check whether 'complete()' is actually required makes 'kobject_put()' safe. Reported-by: syzbot+7fb8a372e1f6add936dd@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7fb8a372e1f6add936dd Fixes: 942e443127e9 ("module: Fix mod->mkobj.kobj potentially freed too early") Cc: stable@vger.kernel.org Suggested-by: Petr Pavlu Signed-off-by: Dmitry Antipov Link: https://lore.kernel.org/r/20250507065044.86529-1-dmantipov@yandex.ru Signed-off-by: Petr Pavlu Signed-off-by: Greg Kroah-Hartman --- kernel/params.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/params.c b/kernel/params.c index 5b92310425c5..27954dfe3d20 100644 --- a/kernel/params.c +++ b/kernel/params.c @@ -945,7 +945,9 @@ int module_sysfs_initialized; static void module_kobj_release(struct kobject *kobj) { struct module_kobject *mk = to_module_kobject(kobj); - complete(mk->kobj_completion); + + if (mk->kobj_completion) + complete(mk->kobj_completion); } struct kobj_type module_ktype = { From c77a473d6e40b1e84727e39aa0d613f292b53855 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 24 Apr 2025 15:45:11 +0200 Subject: [PATCH 047/529] ocfs2: switch osb->disable_recovery to enum commit c0fb83088f0cc4ee4706e0495ee8b06f49daa716 upstream. Patch series "ocfs2: Fix deadlocks in quota recovery", v3. This implements another approach to fixing quota recovery deadlocks. We avoid grabbing sb->s_umount semaphore from ocfs2_finish_quota_recovery() and instead stop quota recovery early in ocfs2_dismount_volume(). This patch (of 3): We will need more recovery states than just pure enable / disable to fix deadlocks with quota recovery. Switch osb->disable_recovery to enum. Link: https://lkml.kernel.org/r/20250424134301.1392-1-jack@suse.cz Link: https://lkml.kernel.org/r/20250424134515.18933-4-jack@suse.cz Fixes: 5f530de63cfc ("ocfs2: Use s_umount for quota recovery protection") Signed-off-by: Jan Kara Reviewed-by: Heming Zhao Tested-by: Heming Zhao Acked-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Murad Masimov Cc: Shichangkuo Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/journal.c | 14 ++++++++------ fs/ocfs2/ocfs2.h | 7 ++++++- 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 775766856a54..5c2bd578b728 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -173,7 +173,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) struct ocfs2_recovery_map *rm; mutex_init(&osb->recovery_lock); - osb->disable_recovery = 0; + osb->recovery_state = OCFS2_REC_ENABLED; osb->recovery_thread_task = NULL; init_waitqueue_head(&osb->recovery_event); @@ -208,7 +208,7 @@ void ocfs2_recovery_exit(struct ocfs2_super *osb) /* disable any new recovery threads and wait for any currently * running ones to exit. Do this before setting the vol_state. */ mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; + osb->recovery_state = OCFS2_REC_DISABLED; mutex_unlock(&osb->recovery_lock); wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); @@ -1549,14 +1549,16 @@ bail: void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) { + int was_set = -1; + mutex_lock(&osb->recovery_lock); + if (osb->recovery_state < OCFS2_REC_DISABLED) + was_set = ocfs2_recovery_map_set(osb, node_num); trace_ocfs2_recovery_thread(node_num, osb->node_num, - osb->disable_recovery, osb->recovery_thread_task, - osb->disable_recovery ? - -1 : ocfs2_recovery_map_set(osb, node_num)); + osb->recovery_state, osb->recovery_thread_task, was_set); - if (osb->disable_recovery) + if (osb->recovery_state == OCFS2_REC_DISABLED) goto out; if (osb->recovery_thread_task) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 740b64238312..fc111d2a0c2d 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -284,6 +284,11 @@ enum ocfs2_mount_options #define OCFS2_OSB_ERROR_FS 0x0004 #define OCFS2_DEFAULT_ATIME_QUANTUM 60 +enum ocfs2_recovery_state { + OCFS2_REC_ENABLED = 0, + OCFS2_REC_DISABLED, +}; + struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; @@ -346,7 +351,7 @@ struct ocfs2_super struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; - int disable_recovery; + enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; From 6e5c3d9f296445888c02c43bd8bdd3412bb35cef Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 24 Apr 2025 15:45:12 +0200 Subject: [PATCH 048/529] ocfs2: implement handshaking with ocfs2 recovery thread commit 8f947e0fd595951460f5a6e1ac29baa82fa02eab upstream. We will need ocfs2 recovery thread to acknowledge transitions of recovery_state when disabling particular types of recovery. This is similar to what currently happens when disabling recovery completely, just more general. Implement the handshake and use it for exit from recovery. Link: https://lkml.kernel.org/r/20250424134515.18933-5-jack@suse.cz Fixes: 5f530de63cfc ("ocfs2: Use s_umount for quota recovery protection") Signed-off-by: Jan Kara Reviewed-by: Heming Zhao Tested-by: Heming Zhao Acked-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Murad Masimov Cc: Shichangkuo Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/journal.c | 52 +++++++++++++++++++++++++++++++--------------- fs/ocfs2/ocfs2.h | 4 ++++ 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 5c2bd578b728..f4cd58b5c6fe 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -192,31 +192,48 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) return 0; } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * memory barriers to make sure that we'll see the null task before - * being woken up */ static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) { - mb(); return osb->recovery_thread_task != NULL; } +static void ocfs2_recovery_disable(struct ocfs2_super *osb, + enum ocfs2_recovery_state state) +{ + mutex_lock(&osb->recovery_lock); + /* + * If recovery thread is not running, we can directly transition to + * final state. + */ + if (!ocfs2_recovery_thread_running(osb)) { + osb->recovery_state = state + 1; + goto out_lock; + } + osb->recovery_state = state; + /* Wait for recovery thread to acknowledge state transition */ + wait_event_cmd(osb->recovery_event, + !ocfs2_recovery_thread_running(osb) || + osb->recovery_state >= state + 1, + mutex_unlock(&osb->recovery_lock), + mutex_lock(&osb->recovery_lock)); +out_lock: + mutex_unlock(&osb->recovery_lock); + + /* + * At this point we know that no more recovery work can be queued so + * wait for any recovery completion work to complete. + */ + if (osb->ocfs2_wq) + flush_workqueue(osb->ocfs2_wq); +} + void ocfs2_recovery_exit(struct ocfs2_super *osb) { struct ocfs2_recovery_map *rm; /* disable any new recovery threads and wait for any currently * running ones to exit. Do this before setting the vol_state. */ - mutex_lock(&osb->recovery_lock); - osb->recovery_state = OCFS2_REC_DISABLED; - mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. */ - if (osb->ocfs2_wq) - flush_workqueue(osb->ocfs2_wq); + ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE); /* * Now that recovery is shut down, and the osb is about to be @@ -1536,7 +1553,8 @@ bail: ocfs2_free_replay_slots(osb); osb->recovery_thread_task = NULL; - mb(); /* sync with ocfs2_recovery_thread_running */ + if (osb->recovery_state == OCFS2_REC_WANT_DISABLE) + osb->recovery_state = OCFS2_REC_DISABLED; wake_up(&osb->recovery_event); mutex_unlock(&osb->recovery_lock); @@ -1552,13 +1570,13 @@ void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) int was_set = -1; mutex_lock(&osb->recovery_lock); - if (osb->recovery_state < OCFS2_REC_DISABLED) + if (osb->recovery_state < OCFS2_REC_WANT_DISABLE) was_set = ocfs2_recovery_map_set(osb, node_num); trace_ocfs2_recovery_thread(node_num, osb->node_num, osb->recovery_state, osb->recovery_thread_task, was_set); - if (osb->recovery_state == OCFS2_REC_DISABLED) + if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE) goto out; if (osb->recovery_thread_task) diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index fc111d2a0c2d..610e22daa2ad 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -286,6 +286,10 @@ enum ocfs2_mount_options enum ocfs2_recovery_state { OCFS2_REC_ENABLED = 0, + OCFS2_REC_WANT_DISABLE, + /* + * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work + */ OCFS2_REC_DISABLED, }; From 4c3a0b0b23dd9639739732556830a1d2fe14dc60 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 24 Apr 2025 15:45:13 +0200 Subject: [PATCH 049/529] ocfs2: stop quota recovery before disabling quotas commit fcaf3b2683b05a9684acdebda706a12025a6927a upstream. Currently quota recovery is synchronized with unmount using sb->s_umount semaphore. That is however prone to deadlocks because flush_workqueue(osb->ocfs2_wq) called from umount code can wait for quota recovery to complete while ocfs2_finish_quota_recovery() waits for sb->s_umount semaphore. Grabbing of sb->s_umount semaphore in ocfs2_finish_quota_recovery() is only needed to protect that function from disabling of quotas from ocfs2_dismount_volume(). Handle this problem by disabling quota recovery early during unmount in ocfs2_dismount_volume() instead so that we can drop acquisition of sb->s_umount from ocfs2_finish_quota_recovery(). Link: https://lkml.kernel.org/r/20250424134515.18933-6-jack@suse.cz Fixes: 5f530de63cfc ("ocfs2: Use s_umount for quota recovery protection") Signed-off-by: Jan Kara Reported-by: Shichangkuo Reported-by: Murad Masimov Reviewed-by: Heming Zhao Tested-by: Heming Zhao Acked-by: Joseph Qi Cc: Changwei Ge Cc: Joel Becker Cc: Jun Piao Cc: Junxiao Bi Cc: Mark Fasheh Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/journal.c | 20 ++++++++++++++++++-- fs/ocfs2/journal.h | 1 + fs/ocfs2/ocfs2.h | 6 ++++++ fs/ocfs2/quota_local.c | 9 ++------- fs/ocfs2/super.c | 3 +++ 5 files changed, 30 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f4cd58b5c6fe..7f5fd16d3740 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -227,6 +227,11 @@ out_lock: flush_workqueue(osb->ocfs2_wq); } +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb) +{ + ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE); +} + void ocfs2_recovery_exit(struct ocfs2_super *osb) { struct ocfs2_recovery_map *rm; @@ -1456,6 +1461,18 @@ static int __ocfs2_recovery_thread(void *arg) } } restart: + if (quota_enabled) { + mutex_lock(&osb->recovery_lock); + /* Confirm that recovery thread will no longer recover quotas */ + if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) { + osb->recovery_state = OCFS2_REC_QUOTA_DISABLED; + wake_up(&osb->recovery_event); + } + if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED) + quota_enabled = 0; + mutex_unlock(&osb->recovery_lock); + } + status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); @@ -1559,8 +1576,7 @@ bail: mutex_unlock(&osb->recovery_lock); - if (quota_enabled) - kfree(rm_quota); + kfree(rm_quota); return status; } diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 689c340c6363..5ebde794a653 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -148,6 +148,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); void ocfs2_free_replay_slots(struct ocfs2_super *osb); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 610e22daa2ad..22882c636bfb 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -286,6 +286,12 @@ enum ocfs2_mount_options enum ocfs2_recovery_state { OCFS2_REC_ENABLED = 0, + OCFS2_REC_QUOTA_WANT_DISABLE, + /* + * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for + * ocfs2_recovery_disable_quota() to work. + */ + OCFS2_REC_QUOTA_DISABLED, OCFS2_REC_WANT_DISABLE, /* * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 4b4fa58cd32f..0ca8975a1df4 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -453,8 +453,7 @@ out: /* Sync changes in local quota file into global quota file and * reinitialize local quota file. - * The function expects local quota file to be already locked and - * s_umount locked in shared mode. */ + * The function expects local quota file to be already locked. */ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) @@ -585,7 +584,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, { unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; - struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; handle_t *handle; @@ -597,7 +595,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); - down_read(&sb->s_umount); for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; @@ -674,7 +671,6 @@ out_put: break; } out: - up_read(&sb->s_umount); kfree(rec); return status; } @@ -840,8 +836,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); /* - * s_umount held in exclusive mode protects us against racing with - * recovery thread... + * ocfs2_dismount_volume() has already aborted quota recovery... */ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8f7bb76d9cde..b30f9d71a35d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1869,6 +1869,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); + /* Stop quota recovery so that we can disable quotas */ + ocfs2_recovery_disable_quota(osb); + ocfs2_disable_quotas(osb); /* All dquots should be freed by now */ From 0554dade57dd6d657eba7303b63a8ed93498063d Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Fri, 18 Apr 2025 04:55:16 +0000 Subject: [PATCH 050/529] usb: cdnsp: Fix issue with resuming from L1 commit 241e2ce88e5a494be7a5d44c0697592f1632fbee upstream. In very rare cases after resuming controller from L1 to L0 it reads registers before the clock UTMI have been enabled and as the result driver reads incorrect value. Most of registers are in APB domain clock but some of them (e.g. PORTSC) are in UTMI domain clock. After entering to L1 state the UTMI clock can be disabled. When controller transition from L1 to L0 the port status change event is reported and in interrupt runtime function driver reads PORTSC. During this read operation controller synchronize UTMI and APB domain but UTMI clock is still disabled and in result it reads 0xFFFFFFFF value. To fix this issue driver increases APB timeout value. The issue is platform specific and if the default value of APB timeout is not sufficient then this time should be set Individually for each platform. Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver") Cc: stable Signed-off-by: Pawel Laszczak Acked-by: Peter Chen Link: https://lore.kernel.org/r/PH7PR07MB953846C57973E4DB134CAA71DDBF2@PH7PR07MB9538.namprd07.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdnsp-gadget.c | 29 +++++++++++++++++++++++++++++ drivers/usb/cdns3/cdnsp-gadget.h | 3 +++ drivers/usb/cdns3/cdnsp-pci.c | 12 ++++++++++-- drivers/usb/cdns3/core.h | 3 +++ 4 files changed, 45 insertions(+), 2 deletions(-) diff --git a/drivers/usb/cdns3/cdnsp-gadget.c b/drivers/usb/cdns3/cdnsp-gadget.c index 0044897ee800..181138f27fb3 100644 --- a/drivers/usb/cdns3/cdnsp-gadget.c +++ b/drivers/usb/cdns3/cdnsp-gadget.c @@ -138,6 +138,26 @@ static void cdnsp_clear_port_change_bit(struct cdnsp_device *pdev, (portsc & PORT_CHANGE_BITS), port_regs); } +static void cdnsp_set_apb_timeout_value(struct cdnsp_device *pdev) +{ + struct cdns *cdns = dev_get_drvdata(pdev->dev); + __le32 __iomem *reg; + void __iomem *base; + u32 offset = 0; + u32 val; + + if (!cdns->override_apb_timeout) + return; + + base = &pdev->cap_regs->hc_capbase; + offset = cdnsp_find_next_ext_cap(base, offset, D_XEC_PRE_REGS_CAP); + reg = base + offset + REG_CHICKEN_BITS_3_OFFSET; + + val = le32_to_cpu(readl(reg)); + val = CHICKEN_APB_TIMEOUT_SET(val, cdns->override_apb_timeout); + writel(cpu_to_le32(val), reg); +} + static void cdnsp_set_chicken_bits_2(struct cdnsp_device *pdev, u32 bit) { __le32 __iomem *reg; @@ -1801,6 +1821,15 @@ static int cdnsp_gen_setup(struct cdnsp_device *pdev) pdev->hci_version = HC_VERSION(pdev->hcc_params); pdev->hcc_params = readl(&pdev->cap_regs->hcc_params); + /* + * Override the APB timeout value to give the controller more time for + * enabling UTMI clock and synchronizing APB and UTMI clock domains. + * This fix is platform specific and is required to fixes issue with + * reading incorrect value from PORTSC register after resuming + * from L1 state. + */ + cdnsp_set_apb_timeout_value(pdev); + cdnsp_get_rev_cap(pdev); /* Make sure the Device Controller is halted. */ diff --git a/drivers/usb/cdns3/cdnsp-gadget.h b/drivers/usb/cdns3/cdnsp-gadget.h index a61aef0dc273..79376b1e3419 100644 --- a/drivers/usb/cdns3/cdnsp-gadget.h +++ b/drivers/usb/cdns3/cdnsp-gadget.h @@ -520,6 +520,9 @@ struct cdnsp_rev_cap { #define REG_CHICKEN_BITS_2_OFFSET 0x48 #define CHICKEN_XDMA_2_TP_CACHE_DIS BIT(28) +#define REG_CHICKEN_BITS_3_OFFSET 0x4C +#define CHICKEN_APB_TIMEOUT_SET(p, val) (((p) & ~GENMASK(21, 0)) | (val)) + /* XBUF Extended Capability ID. */ #define XBUF_CAP_ID 0xCB #define XBUF_RX_TAG_MASK_0_OFFSET 0x1C diff --git a/drivers/usb/cdns3/cdnsp-pci.c b/drivers/usb/cdns3/cdnsp-pci.c index a85db23fa19f..b7a1f28faa1f 100644 --- a/drivers/usb/cdns3/cdnsp-pci.c +++ b/drivers/usb/cdns3/cdnsp-pci.c @@ -33,6 +33,8 @@ #define CDNS_DRD_ID 0x0100 #define CDNS_DRD_IF (PCI_CLASS_SERIAL_USB << 8 | 0x80) +#define CHICKEN_APB_TIMEOUT_VALUE 0x1C20 + static struct pci_dev *cdnsp_get_second_fun(struct pci_dev *pdev) { /* @@ -144,6 +146,14 @@ static int cdnsp_pci_probe(struct pci_dev *pdev, cdnsp->otg_irq = pdev->irq; } + /* + * Cadence PCI based platform require some longer timeout for APB + * to fixes domain clock synchronization issue after resuming + * controller from L1 state. + */ + cdnsp->override_apb_timeout = CHICKEN_APB_TIMEOUT_VALUE; + pci_set_drvdata(pdev, cdnsp); + if (pci_is_enabled(func)) { cdnsp->dev = dev; cdnsp->gadget_init = cdnsp_gadget_init; @@ -153,8 +163,6 @@ static int cdnsp_pci_probe(struct pci_dev *pdev, goto free_cdnsp; } - pci_set_drvdata(pdev, cdnsp); - device_wakeup_enable(&pdev->dev); if (pci_dev_run_wake(pdev)) pm_runtime_put_noidle(&pdev->dev); diff --git a/drivers/usb/cdns3/core.h b/drivers/usb/cdns3/core.h index 57d47348dc19..ac30ee21309d 100644 --- a/drivers/usb/cdns3/core.h +++ b/drivers/usb/cdns3/core.h @@ -79,6 +79,8 @@ struct cdns3_platform_data { * @pdata: platform data from glue layer * @lock: spinlock structure * @xhci_plat_data: xhci private data structure pointer + * @override_apb_timeout: hold value of APB timeout. For value 0 the default + * value in CHICKEN_BITS_3 will be preserved. * @gadget_init: pointer to gadget initialization function */ struct cdns { @@ -117,6 +119,7 @@ struct cdns { struct cdns3_platform_data *pdata; spinlock_t lock; struct xhci_plat_priv *xhci_plat_data; + u32 override_apb_timeout; int (*gadget_init)(struct cdns *cdns); }; From ee51a5d322c8de0a50e7be372bf502bbb5d8e1a0 Mon Sep 17 00:00:00 2001 From: Pawel Laszczak Date: Fri, 25 Apr 2025 05:55:40 +0000 Subject: [PATCH 051/529] usb: cdnsp: fix L1 resume issue for RTL_REVISION_NEW_LPM version commit 8614ecdb1570e4fffe87ebdc62b613ed66f1f6a6 upstream. The controllers with rtl version larger than RTL_REVISION_NEW_LPM (0x00002700) has bug which causes that controller doesn't resume from L1 state. It happens if after receiving LPM packet controller starts transitioning to L1 and in this moment the driver force resuming by write operation to PORTSC.PLS. It's corner case and happens when write operation to PORTSC occurs during device delay before transitioning to L1 after transmitting ACK time (TL1TokenRetry). Forcing transition from L1->L0 by driver for revision larger than RTL_REVISION_NEW_LPM is not needed, so driver can simply fix this issue through block call of cdnsp_force_l0_go function. Fixes: 3d82904559f4 ("usb: cdnsp: cdns3 Add main part of Cadence USBSSP DRD Driver") Cc: stable Signed-off-by: Pawel Laszczak Acked-by: Peter Chen Link: https://lore.kernel.org/r/PH7PR07MB9538B55C3A6E71F9ED29E980DD842@PH7PR07MB9538.namprd07.prod.outlook.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/cdns3/cdnsp-gadget.c | 2 ++ drivers/usb/cdns3/cdnsp-gadget.h | 3 +++ drivers/usb/cdns3/cdnsp-ring.c | 3 ++- 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/usb/cdns3/cdnsp-gadget.c b/drivers/usb/cdns3/cdnsp-gadget.c index 181138f27fb3..913050230d1a 100644 --- a/drivers/usb/cdns3/cdnsp-gadget.c +++ b/drivers/usb/cdns3/cdnsp-gadget.c @@ -1796,6 +1796,8 @@ static void cdnsp_get_rev_cap(struct cdnsp_device *pdev) reg += cdnsp_find_next_ext_cap(reg, 0, RTL_REV_CAP); pdev->rev_cap = reg; + pdev->rtl_revision = readl(&pdev->rev_cap->rtl_revision); + dev_info(pdev->dev, "Rev: %08x/%08x, eps: %08x, buff: %08x/%08x\n", readl(&pdev->rev_cap->ctrl_revision), readl(&pdev->rev_cap->rtl_revision), diff --git a/drivers/usb/cdns3/cdnsp-gadget.h b/drivers/usb/cdns3/cdnsp-gadget.h index 79376b1e3419..5cffc1444d3a 100644 --- a/drivers/usb/cdns3/cdnsp-gadget.h +++ b/drivers/usb/cdns3/cdnsp-gadget.h @@ -1362,6 +1362,7 @@ struct cdnsp_port { * @rev_cap: Controller Capabilities Registers. * @hcs_params1: Cached register copies of read-only HCSPARAMS1 * @hcc_params: Cached register copies of read-only HCCPARAMS1 + * @rtl_revision: Cached controller rtl revision. * @setup: Temporary buffer for setup packet. * @ep0_preq: Internal allocated request used during enumeration. * @ep0_stage: ep0 stage during enumeration process. @@ -1416,6 +1417,8 @@ struct cdnsp_device { __u32 hcs_params1; __u32 hcs_params3; __u32 hcc_params; + #define RTL_REVISION_NEW_LPM 0x2700 + __u32 rtl_revision; /* Lock used in interrupt thread context. */ spinlock_t lock; struct usb_ctrlrequest setup; diff --git a/drivers/usb/cdns3/cdnsp-ring.c b/drivers/usb/cdns3/cdnsp-ring.c index 47096b8e3179..6247584cb939 100644 --- a/drivers/usb/cdns3/cdnsp-ring.c +++ b/drivers/usb/cdns3/cdnsp-ring.c @@ -308,7 +308,8 @@ static bool cdnsp_ring_ep_doorbell(struct cdnsp_device *pdev, writel(db_value, reg_addr); - cdnsp_force_l0_go(pdev); + if (pdev->rtl_revision < RTL_REVISION_NEW_LPM) + cdnsp_force_l0_go(pdev); /* Doorbell was set. */ return true; From 6fe2677bcf24d624bfc95a42960494082a977b1e Mon Sep 17 00:00:00 2001 From: Wayne Chang Date: Fri, 18 Apr 2025 16:12:28 +0800 Subject: [PATCH 052/529] usb: gadget: tegra-xudc: ACK ST_RC after clearing CTRL_RUN commit 59820fde001500c167342257650541280c622b73 upstream. We identified a bug where the ST_RC bit in the status register was not being acknowledged after clearing the CTRL_RUN bit in the control register. This could lead to unexpected behavior in the USB gadget drivers. This patch resolves the issue by adding the necessary code to explicitly acknowledge ST_RC after clearing CTRL_RUN based on the programming sequence, ensuring proper state transition. Fixes: 49db427232fe ("usb: gadget: Add UDC driver for tegra XUSB device mode controller") Cc: stable Signed-off-by: Wayne Chang Link: https://lore.kernel.org/r/20250418081228.1194779-1-waynec@nvidia.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/tegra-xudc.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/gadget/udc/tegra-xudc.c b/drivers/usb/gadget/udc/tegra-xudc.c index fd7a9535973e..771466cdf385 100644 --- a/drivers/usb/gadget/udc/tegra-xudc.c +++ b/drivers/usb/gadget/udc/tegra-xudc.c @@ -1743,6 +1743,10 @@ static int __tegra_xudc_ep_disable(struct tegra_xudc_ep *ep) val = xudc_readl(xudc, CTRL); val &= ~CTRL_RUN; xudc_writel(xudc, val, CTRL); + + val = xudc_readl(xudc, ST); + if (val & ST_RC) + xudc_writel(xudc, ST_RC, ST); } dev_info(xudc->dev, "ep %u disabled\n", ep->index); From 7d6224d1cf57775349ac89b8e72d6ae8718e1858 Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Tue, 22 Apr 2025 19:40:01 +0800 Subject: [PATCH 053/529] usb: host: tegra: Prevent host controller crash when OTG port is used commit 732f35cf8bdfece582f6e4a9c659119036577308 upstream. When a USB device is connected to the OTG port, the tegra_xhci_id_work() routine transitions the PHY to host mode and calls xhci_hub_control() with the SetPortFeature command to enable port power. In certain cases, the XHCI controller may be in a low-power state when this operation occurs. If xhci_hub_control() is invoked while the controller is suspended, the PORTSC register may return 0xFFFFFFFF, indicating a read failure. This causes xhci_hc_died() to be triggered, leading to host controller shutdown. Example backtrace: [ 105.445736] Workqueue: events tegra_xhci_id_work [ 105.445747] dump_backtrace+0x0/0x1e8 [ 105.445759] xhci_hc_died.part.48+0x40/0x270 [ 105.445769] tegra_xhci_set_port_power+0xc0/0x240 [ 105.445774] tegra_xhci_id_work+0x130/0x240 To prevent this, ensure the controller is fully resumed before interacting with hardware registers by calling pm_runtime_get_sync() prior to the host mode transition and xhci_hub_control(). Fixes: f836e7843036 ("usb: xhci-tegra: Add OTG support") Cc: stable Signed-off-by: Jim Lin Signed-off-by: Wayne Chang Link: https://lore.kernel.org/r/20250422114001.126367-1-waynec@nvidia.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci-tegra.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/host/xhci-tegra.c b/drivers/usb/host/xhci-tegra.c index 51eabc5e8770..14a772feab79 100644 --- a/drivers/usb/host/xhci-tegra.c +++ b/drivers/usb/host/xhci-tegra.c @@ -1228,6 +1228,7 @@ static void tegra_xhci_id_work(struct work_struct *work) tegra->otg_usb3_port = tegra_xusb_padctl_get_usb3_companion(tegra->padctl, tegra->otg_usb2_port); + pm_runtime_get_sync(tegra->dev); if (tegra->host_mode) { /* switch to host mode */ if (tegra->otg_usb3_port >= 0) { @@ -1257,6 +1258,7 @@ static void tegra_xhci_id_work(struct work_struct *work) } tegra_xhci_set_port_power(tegra, true, true); + pm_runtime_mark_last_busy(tegra->dev); } else { if (tegra->otg_usb3_port >= 0) @@ -1264,6 +1266,7 @@ static void tegra_xhci_id_work(struct work_struct *work) tegra_xhci_set_port_power(tegra, true, false); } + pm_runtime_put_autosuspend(tegra->dev); } #if IS_ENABLED(CONFIG_PM) || IS_ENABLED(CONFIG_PM_SLEEP) From 2e89025609df2fd8542f4504824ba6827969b987 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Tue, 29 Apr 2025 23:47:01 +0000 Subject: [PATCH 054/529] usb: typec: tcpm: delay SNK_TRY_WAIT_DEBOUNCE to SRC_TRYWAIT transition commit e918d3959b5ae0e793b8f815ce62240e10ba03a4 upstream. This patch fixes Type-C Compliance Test TD 4.7.6 - Try.SNK DRP Connect SNKAS. The compliance tester moves into SNK_UNATTACHED during toggling and expects the PUT to apply Rp after tPDDebounce of detection. If the port is in SNK_TRY_WAIT_DEBOUNCE, it will move into SRC_TRYWAIT immediately and apply Rp. This violates TD 4.7.5.V.3, where the tester confirms that the PUT attaches Rp after the transitions to Unattached.SNK for tPDDebounce. Change the tcpm_set_state delay between SNK_TRY_WAIT_DEBOUNCE and SRC_TRYWAIT to tPDDebounce. Fixes: a0a3e04e6b2c ("staging: typec: tcpm: Check for Rp for tPDDebounce") Cc: stable Signed-off-by: RD Babiera Reviewed-by: Badhri Jagan Sridharan Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250429234703.3748506-2-rdbabiera@google.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/tcpm/tcpm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/typec/tcpm/tcpm.c b/drivers/usb/typec/tcpm/tcpm.c index 4291093f110d..f40eabb7e24f 100644 --- a/drivers/usb/typec/tcpm/tcpm.c +++ b/drivers/usb/typec/tcpm/tcpm.c @@ -5136,7 +5136,7 @@ static void _tcpm_cc_change(struct tcpm_port *port, enum typec_cc_status cc1, case SNK_TRY_WAIT_DEBOUNCE: if (!tcpm_port_is_sink(port)) { port->max_wait = 0; - tcpm_set_state(port, SRC_TRYWAIT, 0); + tcpm_set_state(port, SRC_TRYWAIT, PD_T_PD_DEBOUNCE); } break; case SRC_TRY_WAIT: From 076ab0631ed4928905736f1701e25f1e722bc086 Mon Sep 17 00:00:00 2001 From: Andrei Kuchynski Date: Thu, 24 Apr 2025 08:44:29 +0000 Subject: [PATCH 055/529] usb: typec: ucsi: displayport: Fix NULL pointer access commit 312d79669e71283d05c05cc49a1a31e59e3d9e0e upstream. This patch ensures that the UCSI driver waits for all pending tasks in the ucsi_displayport_work workqueue to finish executing before proceeding with the partner removal. Cc: stable Fixes: af8622f6a585 ("usb: typec: ucsi: Support for DisplayPort alt mode") Signed-off-by: Andrei Kuchynski Reviewed-by: Heikki Krogerus Reviewed-by: Benson Leung Link: https://lore.kernel.org/r/20250424084429.3220757-3-akuchynski@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/displayport.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/typec/ucsi/displayport.c b/drivers/usb/typec/ucsi/displayport.c index 2431febc4615..8c19081c3255 100644 --- a/drivers/usb/typec/ucsi/displayport.c +++ b/drivers/usb/typec/ucsi/displayport.c @@ -296,6 +296,8 @@ void ucsi_displayport_remove_partner(struct typec_altmode *alt) if (!dp) return; + cancel_work_sync(&dp->work); + dp->data.conf = 0; dp->data.status = 0; dp->initialized = false; From 1956c3d8783a1a3c102127a0b1a22e4ef7615b59 Mon Sep 17 00:00:00 2001 From: Oliver Neukum Date: Wed, 30 Apr 2025 15:48:10 +0200 Subject: [PATCH 056/529] USB: usbtmc: use interruptible sleep in usbtmc_read commit 054c5145540e5ad5b80adf23a5e3e2fc281fb8aa upstream. usbtmc_read() calls usbtmc_generic_read() which uses interruptible sleep, but usbtmc_read() itself uses uninterruptble sleep for mutual exclusion between threads. That makes no sense. Both should use interruptible sleep. Fixes: 5b775f672cc99 ("USB: add USB test and measurement class driver") Cc: stable Signed-off-by: Oliver Neukum Link: https://lore.kernel.org/r/20250430134810.226015-1-oneukum@suse.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/usbtmc.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c index c2e666e82857..e483832b8d3a 100644 --- a/drivers/usb/class/usbtmc.c +++ b/drivers/usb/class/usbtmc.c @@ -1380,7 +1380,10 @@ static ssize_t usbtmc_read(struct file *filp, char __user *buf, if (!buffer) return -ENOMEM; - mutex_lock(&data->io_mutex); + retval = mutex_lock_interruptible(&data->io_mutex); + if (retval < 0) + goto exit_nolock; + if (data->zombie) { retval = -ENODEV; goto exit; @@ -1503,6 +1506,7 @@ static ssize_t usbtmc_read(struct file *filp, char __user *buf, exit: mutex_unlock(&data->io_mutex); +exit_nolock: kfree(buffer); return retval; } From a647d960fbb89ca15e263958421bcb0485369b13 Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Fri, 2 May 2025 09:09:39 +0200 Subject: [PATCH 057/529] usb: usbtmc: Fix erroneous get_stb ioctl error returns commit cac01bd178d6a2a23727f138d647ce1a0e8a73a1 upstream. wait_event_interruptible_timeout returns a long The return was being assigned to an int causing an integer overflow when the remaining jiffies > INT_MAX resulting in random error returns. Use a long return value and convert to int ioctl return only on error. When the return value of wait_event_interruptible_timeout was <= INT_MAX the number of remaining jiffies was returned which has no meaning for the user. Return 0 on success. Reported-by: Michael Katzmann Fixes: dbf3e7f654c0 ("Implement an ioctl to support the USMTMC-USB488 READ_STATUS_BYTE operation.") Cc: stable@vger.kernel.org Signed-off-by: Dave Penkler Link: https://lore.kernel.org/r/20250502070941.31819-2-dpenkler@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/usbtmc.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c index e483832b8d3a..23d69b79357b 100644 --- a/drivers/usb/class/usbtmc.c +++ b/drivers/usb/class/usbtmc.c @@ -482,6 +482,7 @@ static int usbtmc_get_stb(struct usbtmc_file_data *file_data, __u8 *stb) u8 *buffer; u8 tag; int rv; + long wait_rv; dev_dbg(dev, "Enter ioctl_read_stb iin_ep_present: %d\n", data->iin_ep_present); @@ -511,16 +512,17 @@ static int usbtmc_get_stb(struct usbtmc_file_data *file_data, __u8 *stb) } if (data->iin_ep_present) { - rv = wait_event_interruptible_timeout( + wait_rv = wait_event_interruptible_timeout( data->waitq, atomic_read(&data->iin_data_valid) != 0, file_data->timeout); - if (rv < 0) { - dev_dbg(dev, "wait interrupted %d\n", rv); + if (wait_rv < 0) { + dev_dbg(dev, "wait interrupted %ld\n", wait_rv); + rv = wait_rv; goto exit; } - if (rv == 0) { + if (wait_rv == 0) { dev_dbg(dev, "wait timed out\n"); rv = -ETIMEDOUT; goto exit; @@ -539,6 +541,8 @@ static int usbtmc_get_stb(struct usbtmc_file_data *file_data, __u8 *stb) dev_dbg(dev, "stb:0x%02x received %d\n", (unsigned int)*stb, rv); + rv = 0; + exit: /* bump interrupt bTag */ data->iin_bTag += 1; From 5f72912d35b678823f8068cc60d358c7cb9e97e1 Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Fri, 2 May 2025 09:09:40 +0200 Subject: [PATCH 058/529] usb: usbtmc: Fix erroneous wait_srq ioctl return commit a9747c9b8b59ab4207effd20eb91a890acb44e16 upstream. wait_event_interruptible_timeout returns a long The return was being assigned to an int causing an integer overflow when the remaining jiffies > INT_MAX resulting in random error returns. Use a long return value, converting to the int ioctl return only on error. Fixes: 739240a9f6ac ("usb: usbtmc: Add ioctl USBTMC488_IOCTL_WAIT_SRQ") Cc: stable@vger.kernel.org Signed-off-by: Dave Penkler Link: https://lore.kernel.org/r/20250502070941.31819-3-dpenkler@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/usbtmc.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c index 23d69b79357b..d23de1bc7eed 100644 --- a/drivers/usb/class/usbtmc.c +++ b/drivers/usb/class/usbtmc.c @@ -606,9 +606,9 @@ static int usbtmc488_ioctl_wait_srq(struct usbtmc_file_data *file_data, { struct usbtmc_device_data *data = file_data->data; struct device *dev = &data->intf->dev; - int rv; u32 timeout; unsigned long expire; + long wait_rv; if (!data->iin_ep_present) { dev_dbg(dev, "no interrupt endpoint present\n"); @@ -622,25 +622,24 @@ static int usbtmc488_ioctl_wait_srq(struct usbtmc_file_data *file_data, mutex_unlock(&data->io_mutex); - rv = wait_event_interruptible_timeout( - data->waitq, - atomic_read(&file_data->srq_asserted) != 0 || - atomic_read(&file_data->closing), - expire); + wait_rv = wait_event_interruptible_timeout( + data->waitq, + atomic_read(&file_data->srq_asserted) != 0 || + atomic_read(&file_data->closing), + expire); mutex_lock(&data->io_mutex); /* Note! disconnect or close could be called in the meantime */ if (atomic_read(&file_data->closing) || data->zombie) - rv = -ENODEV; + return -ENODEV; - if (rv < 0) { - /* dev can be invalid now! */ - pr_debug("%s - wait interrupted %d\n", __func__, rv); - return rv; + if (wait_rv < 0) { + dev_dbg(dev, "%s - wait interrupted %ld\n", __func__, wait_rv); + return wait_rv; } - if (rv == 0) { + if (wait_rv == 0) { dev_dbg(dev, "%s - wait timed out\n", __func__); return -ETIMEDOUT; } From 7591a2e6c0ac39d1321e0d972e6daf3a0cbb0358 Mon Sep 17 00:00:00 2001 From: Dave Penkler Date: Fri, 2 May 2025 09:09:41 +0200 Subject: [PATCH 059/529] usb: usbtmc: Fix erroneous generic_read ioctl return commit 4e77d3ec7c7c0d9535ccf1138827cb9bb5480b9b upstream. wait_event_interruptible_timeout returns a long The return value was being assigned to an int causing an integer overflow when the remaining jiffies > INT_MAX which resulted in random error returns. Use a long return value, converting to the int ioctl return only on error. Fixes: bb99794a4792 ("usb: usbtmc: Add ioctl for vendor specific read") Cc: stable@vger.kernel.org Signed-off-by: Dave Penkler Link: https://lore.kernel.org/r/20250502070941.31819-4-dpenkler@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/usb/class/usbtmc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/usb/class/usbtmc.c b/drivers/usb/class/usbtmc.c index d23de1bc7eed..2f92905e05ca 100644 --- a/drivers/usb/class/usbtmc.c +++ b/drivers/usb/class/usbtmc.c @@ -833,6 +833,7 @@ static ssize_t usbtmc_generic_read(struct usbtmc_file_data *file_data, unsigned long expire; int bufcount = 1; int again = 0; + long wait_rv; /* mutex already locked */ @@ -945,19 +946,24 @@ static ssize_t usbtmc_generic_read(struct usbtmc_file_data *file_data, if (!(flags & USBTMC_FLAG_ASYNC)) { dev_dbg(dev, "%s: before wait time %lu\n", __func__, expire); - retval = wait_event_interruptible_timeout( + wait_rv = wait_event_interruptible_timeout( file_data->wait_bulk_in, usbtmc_do_transfer(file_data), expire); - dev_dbg(dev, "%s: wait returned %d\n", - __func__, retval); + dev_dbg(dev, "%s: wait returned %ld\n", + __func__, wait_rv); - if (retval <= 0) { - if (retval == 0) - retval = -ETIMEDOUT; + if (wait_rv < 0) { + retval = wait_rv; goto error; } + + if (wait_rv == 0) { + retval = -ETIMEDOUT; + goto error; + } + } urb = usb_get_from_anchor(&file_data->in_anchor); From 6f371b751be017ec1da2f2076aa1a9213a51dc74 Mon Sep 17 00:00:00 2001 From: Lothar Rubusch Date: Sun, 9 Mar 2025 19:35:15 +0000 Subject: [PATCH 060/529] iio: accel: adxl367: fix setting odr for activity time update [ Upstream commit 38f67d0264929762e54ae5948703a21f841fe706 ] Fix setting the odr value to update activity time based on frequency derrived by recent odr, and not by obsolete odr value. The [small] bug: When _adxl367_set_odr() is called with a new odr value, it first writes the new odr value to the hardware register ADXL367_REG_FILTER_CTL. Second, it calls _adxl367_set_act_time_ms(), which calls adxl367_time_ms_to_samples(). Here st->odr still holds the old odr value. This st->odr member is used to derrive a frequency value, which is applied to update ADXL367_REG_TIME_ACT. Hence, the idea is to update activity time, based on possibilities and power consumption by the current ODR rate. Finally, when the function calls return, again in _adxl367_set_odr() the new ODR is assigned to st->odr. The fix: When setting a new ODR value is set to ADXL367_REG_FILTER_CTL, also ADXL367_REG_TIME_ACT should probably be updated with a frequency based on the recent ODR value and not the old one. Changing the location of the assignment to st->odr fixes this. Fixes: cbab791c5e2a5 ("iio: accel: add ADXL367 driver") Signed-off-by: Lothar Rubusch Reviewed-by: Marcelo Schmitt Link: https://patch.msgid.link/20250309193515.2974-1-l.rubusch@gmail.com Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/iio/accel/adxl367.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/iio/accel/adxl367.c b/drivers/iio/accel/adxl367.c index f1a41b92543a..af0ad4e5e4a5 100644 --- a/drivers/iio/accel/adxl367.c +++ b/drivers/iio/accel/adxl367.c @@ -622,18 +622,14 @@ static int _adxl367_set_odr(struct adxl367_state *st, enum adxl367_odr odr) if (ret) return ret; + st->odr = odr; + /* Activity timers depend on ODR */ ret = _adxl367_set_act_time_ms(st, st->act_time_ms); if (ret) return ret; - ret = _adxl367_set_inact_time_ms(st, st->inact_time_ms); - if (ret) - return ret; - - st->odr = odr; - - return 0; + return _adxl367_set_inact_time_ms(st, st->inact_time_ms); } static int adxl367_set_odr(struct iio_dev *indio_dev, enum adxl367_odr odr) From 35061dc5f64371124e2183e49277406dfe4121a0 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 13 Apr 2025 11:34:36 +0100 Subject: [PATCH 061/529] iio: temp: maxim-thermocouple: Fix potential lack of DMA safe buffer. [ Upstream commit f79aeb6c631b57395f37acbfbe59727e355a714c ] The trick of using __aligned(IIO_DMA_MINALIGN) ensures that there is no overlap between buffers used for DMA and those used for driver state storage that are before the marking. It doesn't ensure anything above state variables found after the marking. Hence move this particular bit of state earlier in the structure. Fixes: 10897f34309b ("iio: temp: maxim_thermocouple: Fix alignment for DMA safety") Reviewed-by: David Lechner Link: https://patch.msgid.link/20250413103443.2420727-14-jic23@kernel.org Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/iio/temperature/maxim_thermocouple.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/temperature/maxim_thermocouple.c b/drivers/iio/temperature/maxim_thermocouple.c index c28a7a6dea5f..555a61e2f3fd 100644 --- a/drivers/iio/temperature/maxim_thermocouple.c +++ b/drivers/iio/temperature/maxim_thermocouple.c @@ -121,9 +121,9 @@ static const struct maxim_thermocouple_chip maxim_thermocouple_chips[] = { struct maxim_thermocouple_data { struct spi_device *spi; const struct maxim_thermocouple_chip *chip; + char tc_type; u8 buffer[16] __aligned(IIO_DMA_MINALIGN); - char tc_type; }; static int maxim_thermocouple_read(struct maxim_thermocouple_data *data, From 8f0064eb224efbe28a0513e18b380b21e1381cf5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 3 Sep 2024 20:59:04 +0300 Subject: [PATCH 062/529] types: Complement the aligned types with signed 64-bit one [ Upstream commit e4ca0e59c39442546866f3dd514a3a5956577daf ] Some user may want to use aligned signed 64-bit type. Provide it for them. Signed-off-by: Andy Shevchenko Link: https://patch.msgid.link/20240903180218.3640501-2-andriy.shevchenko@linux.intel.com Signed-off-by: Jonathan Cameron Stable-dep-of: 1bb942287e05 ("iio: accel: adxl355: Make timestamp 64-bit aligned using aligned_s64") Signed-off-by: Sasha Levin --- include/linux/types.h | 3 ++- include/uapi/linux/types.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/types.h b/include/linux/types.h index ea8cf60a8a79..f1df5d8df2ef 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -109,8 +109,9 @@ typedef u64 u_int64_t; typedef s64 int64_t; #endif -/* this is a special 64bit data type that is 8-byte aligned */ +/* These are the special 64-bit data types that are 8-byte aligned */ #define aligned_u64 __aligned_u64 +#define aligned_s64 __aligned_s64 #define aligned_be64 __aligned_be64 #define aligned_le64 __aligned_le64 diff --git a/include/uapi/linux/types.h b/include/uapi/linux/types.h index 308433be33c2..c39147653094 100644 --- a/include/uapi/linux/types.h +++ b/include/uapi/linux/types.h @@ -49,6 +49,7 @@ typedef __u32 __bitwise __wsum; * No conversions are necessary between 32-bit user-space and a 64-bit kernel. */ #define __aligned_u64 __u64 __attribute__((aligned(8))) +#define __aligned_s64 __s64 __attribute__((aligned(8))) #define __aligned_be64 __be64 __attribute__((aligned(8))) #define __aligned_le64 __le64 __attribute__((aligned(8))) From b6c984f4eb91cc6fca7b314d33687d8d2ecffb1b Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 13 Apr 2025 11:34:27 +0100 Subject: [PATCH 063/529] iio: accel: adxl355: Make timestamp 64-bit aligned using aligned_s64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1bb942287e05dc4c304a003ea85e6dd9a5e7db39 ] The IIO ABI requires 64-bit aligned timestamps. In this case insufficient padding would have been added on architectures where an s64 is only 32-bit aligned. Use aligned_s64 to enforce the correct alignment. Fixes: 327a0eaf19d5 ("iio: accel: adxl355: Add triggered buffer support") Reported-by: David Lechner Reviewed-by: Nuno Sá Reviewed-by: David Lechner Link: https://patch.msgid.link/20250413103443.2420727-5-jic23@kernel.org Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/iio/accel/adxl355_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/accel/adxl355_core.c b/drivers/iio/accel/adxl355_core.c index 4bc648eac8b2..e73573cd773a 100644 --- a/drivers/iio/accel/adxl355_core.c +++ b/drivers/iio/accel/adxl355_core.c @@ -175,7 +175,7 @@ struct adxl355_data { u8 transf_buf[3]; struct { u8 buf[14]; - s64 ts; + aligned_s64 ts; } buffer; } __aligned(IIO_DMA_MINALIGN); }; From ea25ee0bb42f65b5a53d93f890c72135a1f25a1a Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 13 Apr 2025 11:34:26 +0100 Subject: [PATCH 064/529] iio: adc: dln2: Use aligned_s64 for timestamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 5097eaae98e53f9ab9d35801c70da819b92ca907 ] Here the lack of marking allows the overall structure to not be sufficiently aligned resulting in misplacement of the timestamp in iio_push_to_buffers_with_timestamp(). Use aligned_s64 to force the alignment on all architectures. Fixes: 7c0299e879dd ("iio: adc: Add support for DLN2 ADC") Reported-by: David Lechner Reviewed-by: Andy Shevchenko Reviewed-by: Nuno Sá Reviewed-by: David Lechner Link: https://patch.msgid.link/20250413103443.2420727-4-jic23@kernel.org Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/iio/adc/dln2-adc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/dln2-adc.c b/drivers/iio/adc/dln2-adc.c index 97d162a3cba4..49a2588e7431 100644 --- a/drivers/iio/adc/dln2-adc.c +++ b/drivers/iio/adc/dln2-adc.c @@ -483,7 +483,7 @@ static irqreturn_t dln2_adc_trigger_h(int irq, void *p) struct iio_dev *indio_dev = pf->indio_dev; struct { __le16 values[DLN2_ADC_MAX_CHANNELS]; - int64_t timestamp_space; + aligned_s64 timestamp_space; } data; struct dln2_adc_get_all_vals dev_data; struct dln2_adc *dln2 = iio_priv(indio_dev); From d189b461d5101a1cec7df65f8a6c9e9bba6d9540 Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Sun, 27 Apr 2025 13:34:24 +0200 Subject: [PATCH 065/529] MIPS: Fix MAX_REG_OFFSET [ Upstream commit c44572e0cc13c9afff83fd333135a0aa9b27ba26 ] Fix MAX_REG_OFFSET to point to the last register in 'pt_regs' and not to the marker itself, which could allow regs_get_register() to return an invalid offset. Fixes: 40e084a506eb ("MIPS: Add uprobes support.") Suggested-by: Maciej W. Rozycki Signed-off-by: Thorsten Blum Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/include/asm/ptrace.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/mips/include/asm/ptrace.h b/arch/mips/include/asm/ptrace.h index 4a2b40ce39e0..841612913f0d 100644 --- a/arch/mips/include/asm/ptrace.h +++ b/arch/mips/include/asm/ptrace.h @@ -65,7 +65,8 @@ static inline void instruction_pointer_set(struct pt_regs *regs, /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); -#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last)) +#define MAX_REG_OFFSET \ + (offsetof(struct pt_regs, __last) - sizeof(unsigned long)) /** * regs_get_register() - get register value from its offset From bd68de433fa0c144f44f0936c87445c82901a8d7 Mon Sep 17 00:00:00 2001 From: Kevin Baker Date: Mon, 5 May 2025 12:02:56 -0500 Subject: [PATCH 066/529] drm/panel: simple: Update timings for AUO G101EVN010 [ Upstream commit 7c6fa1797a725732981f2d77711c867166737719 ] Switch to panel timings based on datasheet for the AUO G101EVN01.0 LVDS panel. Default timings were tested on the panel. Previous mode-based timings resulted in horizontal display shift. Signed-off-by: Kevin Baker Fixes: 4fb86404a977 ("drm/panel: simple: Add AUO G101EVN010 panel support") Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20250505170256.1385113-1-kevinb@ventureresearch.com Signed-off-by: Neil Armstrong Link: https://lore.kernel.org/r/20250505170256.1385113-1-kevinb@ventureresearch.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/panel/panel-simple.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/panel/panel-simple.c b/drivers/gpu/drm/panel/panel-simple.c index a35e94e2ffd0..8a86f5d5c708 100644 --- a/drivers/gpu/drm/panel/panel-simple.c +++ b/drivers/gpu/drm/panel/panel-simple.c @@ -921,27 +921,28 @@ static const struct panel_desc auo_g070vvn01 = { }, }; -static const struct drm_display_mode auo_g101evn010_mode = { - .clock = 68930, - .hdisplay = 1280, - .hsync_start = 1280 + 82, - .hsync_end = 1280 + 82 + 2, - .htotal = 1280 + 82 + 2 + 84, - .vdisplay = 800, - .vsync_start = 800 + 8, - .vsync_end = 800 + 8 + 2, - .vtotal = 800 + 8 + 2 + 6, +static const struct display_timing auo_g101evn010_timing = { + .pixelclock = { 64000000, 68930000, 85000000 }, + .hactive = { 1280, 1280, 1280 }, + .hfront_porch = { 8, 64, 256 }, + .hback_porch = { 8, 64, 256 }, + .hsync_len = { 40, 168, 767 }, + .vactive = { 800, 800, 800 }, + .vfront_porch = { 4, 8, 100 }, + .vback_porch = { 4, 8, 100 }, + .vsync_len = { 8, 16, 223 }, }; static const struct panel_desc auo_g101evn010 = { - .modes = &auo_g101evn010_mode, - .num_modes = 1, + .timings = &auo_g101evn010_timing, + .num_timings = 1, .bpc = 6, .size = { .width = 216, .height = 135, }, .bus_format = MEDIA_BUS_FMT_RGB666_1X7X3_SPWG, + .bus_flags = DRM_BUS_FLAG_DE_HIGH, .connector_type = DRM_MODE_CONNECTOR_LVDS, }; From 8642cbf11eeff6a60049e4dd6e3f4cab61625d8f Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 2 May 2025 10:58:00 +0200 Subject: [PATCH 067/529] nvme: unblock ctrl state transition for firmware update [ Upstream commit 650415fca0a97472fdd79725e35152614d1aad76 ] The original nvme subsystem design didn't have a CONNECTING state; the state machine allowed transitions from RESETTING to LIVE directly. With the introduction of nvme fabrics the CONNECTING state was introduce. Over time the nvme-pci started to use the CONNECTING state as well. Eventually, a bug fix for the nvme-fc started to depend that the only valid transition to LIVE was from CONNECTING. Though this change didn't update the firmware update handler which was still depending on RESETTING to LIVE transition. The simplest way to address it for the time being is to switch into CONNECTING state before going to LIVE state. Fixes: d2fe192348f9 ("nvme: only allow entering LIVE from CONNECTING state") Reported-by: Guenter Roeck Signed-off-by: Daniel Wagner Closes: https://lore.kernel.org/all/0134ea15-8d5f-41f7-9e9a-d7e6d82accaa@roeck-us.net Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/nvme/host/core.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4c40ebb9503d..fbe3fb4fbe95 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -4881,7 +4881,8 @@ static void nvme_fw_act_work(struct work_struct *work) msleep(100); } - if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) + if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING) || + !nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) return; nvme_start_queues(ctrl); From 0e42a14899ca8982d20599965c7b5704be27f67f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 28 Apr 2025 23:56:14 -0400 Subject: [PATCH 068/529] do_umount(): add missing barrier before refcount checks in sync case [ Upstream commit 65781e19dcfcb4aed1167d87a3ffcc2a0c071d47 ] do_umount() analogue of the race fixed in 119e1ef80ecf "fix __legitimize_mnt()/mntput() race". Here we want to make sure that if __legitimize_mnt() doesn't notice our lock_mount_hash(), we will notice their refcount increment. Harder to hit than mntput_no_expire() one, fortunately, and consequences are milder (sync umount acting like umount -l on a rare race with RCU pathwalk hitting at just the wrong time instead of use-after-free galore mntput_no_expire() counterpart used to be hit). Still a bug... Fixes: 48a066e72d97 ("RCU'd vfsmounts") Reviewed-by: Christian Brauner Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/namespace.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/namespace.c b/fs/namespace.c index 57166cc7e511..0dcd57a75ad4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -629,7 +629,7 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); - smp_mb(); // see mntput_no_expire() + smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { @@ -1707,6 +1707,7 @@ static int do_umount(struct mount *mnt, int flags) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { + smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { From 95b9acb0bb4ef616f4a8cfdfd416547cdd95d58d Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 5 May 2025 08:34:39 -0600 Subject: [PATCH 069/529] io_uring: always arm linked timeouts prior to issue Commit b53e523261bf058ea4a518b482222e7a277b186b upstream. There are a few spots where linked timeouts are armed, and not all of them adhere to the pre-arm, attempt issue, post-arm pattern. This can be problematic if the linked request returns that it will trigger a callback later, and does so before the linked timeout is fully armed. Consolidate all the linked timeout handling into __io_issue_sqe(), rather than have it spread throughout the various issue entry points. Cc: stable@vger.kernel.org Link: https://github.com/axboe/liburing/issues/1390 Reported-by: Chase Hiltz Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/io_uring.c | 53 ++++++++++++++------------------------------- 1 file changed, 16 insertions(+), 37 deletions(-) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index ceac15a6bf3b..488793b119d0 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -372,24 +372,6 @@ static struct io_kiocb *__io_prep_linked_timeout(struct io_kiocb *req) return req->link; } -static inline struct io_kiocb *io_prep_linked_timeout(struct io_kiocb *req) -{ - if (likely(!(req->flags & REQ_F_ARM_LTIMEOUT))) - return NULL; - return __io_prep_linked_timeout(req); -} - -static noinline void __io_arm_ltimeout(struct io_kiocb *req) -{ - io_queue_linked_timeout(__io_prep_linked_timeout(req)); -} - -static inline void io_arm_ltimeout(struct io_kiocb *req) -{ - if (unlikely(req->flags & REQ_F_ARM_LTIMEOUT)) - __io_arm_ltimeout(req); -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -437,7 +419,6 @@ static void io_prep_async_link(struct io_kiocb *req) static void io_queue_iowq(struct io_kiocb *req) { - struct io_kiocb *link = io_prep_linked_timeout(req); struct io_uring_task *tctx = req->task->io_uring; BUG_ON(!tctx); @@ -462,8 +443,6 @@ static void io_queue_iowq(struct io_kiocb *req) trace_io_uring_queue_async_work(req, io_wq_is_hashed(&req->work)); io_wq_enqueue(tctx->io_wq, &req->work); - if (link) - io_queue_linked_timeout(link); } static __cold void io_queue_deferred(struct io_ring_ctx *ctx) @@ -1741,17 +1720,24 @@ static bool io_assign_file(struct io_kiocb *req, unsigned int issue_flags) return !!req->file; } +#define REQ_ISSUE_SLOW_FLAGS (REQ_F_CREDS | REQ_F_ARM_LTIMEOUT) + static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) { const struct io_op_def *def = &io_op_defs[req->opcode]; const struct cred *creds = NULL; + struct io_kiocb *link = NULL; int ret; if (unlikely(!io_assign_file(req, issue_flags))) return -EBADF; - if (unlikely((req->flags & REQ_F_CREDS) && req->creds != current_cred())) - creds = override_creds(req->creds); + if (unlikely(req->flags & REQ_ISSUE_SLOW_FLAGS)) { + if ((req->flags & REQ_F_CREDS) && req->creds != current_cred()) + creds = override_creds(req->creds); + if (req->flags & REQ_F_ARM_LTIMEOUT) + link = __io_prep_linked_timeout(req); + } if (!def->audit_skip) audit_uring_entry(req->opcode); @@ -1761,8 +1747,12 @@ static int io_issue_sqe(struct io_kiocb *req, unsigned int issue_flags) if (!def->audit_skip) audit_uring_exit(!ret, ret); - if (creds) - revert_creds(creds); + if (unlikely(creds || link)) { + if (creds) + revert_creds(creds); + if (link) + io_queue_linked_timeout(link); + } if (ret == IOU_OK) { if (issue_flags & IO_URING_F_COMPLETE_DEFER) @@ -1809,8 +1799,6 @@ void io_wq_submit_work(struct io_wq_work *work) else req_ref_get(req); - io_arm_ltimeout(req); - /* either cancelled or io-wq is dying, so don't touch tctx->iowq */ if (work->flags & IO_WQ_WORK_CANCEL) { fail: @@ -1908,15 +1896,11 @@ struct file *io_file_get_normal(struct io_kiocb *req, int fd) static void io_queue_async(struct io_kiocb *req, int ret) __must_hold(&req->ctx->uring_lock) { - struct io_kiocb *linked_timeout; - if (ret != -EAGAIN || (req->flags & REQ_F_NOWAIT)) { io_req_complete_failed(req, ret); return; } - linked_timeout = io_prep_linked_timeout(req); - switch (io_arm_poll_handler(req, 0)) { case IO_APOLL_READY: io_kbuf_recycle(req, 0); @@ -1929,9 +1913,6 @@ static void io_queue_async(struct io_kiocb *req, int ret) case IO_APOLL_OK: break; } - - if (linked_timeout) - io_queue_linked_timeout(linked_timeout); } static inline void io_queue_sqe(struct io_kiocb *req) @@ -1945,9 +1926,7 @@ static inline void io_queue_sqe(struct io_kiocb *req) * We async punt it if the file wasn't marked NOWAIT, or if the file * doesn't support non-blocking read/write attempts */ - if (likely(!ret)) - io_arm_ltimeout(req); - else + if (unlikely(ret)) io_queue_async(req, ret); } From b82c386898f7b00cb49abe3fbd622017aaa61230 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 7 May 2025 08:07:09 -0600 Subject: [PATCH 070/529] io_uring: ensure deferred completions are posted for multishot Commit 687b2bae0efff9b25e071737d6af5004e6e35af5 upstream. Multishot normally uses io_req_post_cqe() to post completions, but when stopping it, it may finish up with a deferred completion. This is fine, except if another multishot event triggers before the deferred completions get flushed. If this occurs, then CQEs may get reordered in the CQ ring, and cause confusion on the application side. When multishot posting via io_req_post_cqe(), flush any pending deferred completions first, if any. Cc: stable@vger.kernel.org # 6.1+ Reported-by: Norman Maurer Reported-by: Christian Mazakas Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- io_uring/io_uring.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index 488793b119d0..f39d66589180 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -819,6 +819,14 @@ bool io_post_aux_cqe(struct io_ring_ctx *ctx, { bool filled; + /* + * If multishot has already posted deferred completions, ensure that + * those are flushed first before posting this one. If not, CQEs + * could get reordered. + */ + if (!wq_list_empty(&ctx->submit_state.compl_reqs)) + __io_submit_flush_completions(ctx); + io_cq_lock(ctx); filled = io_fill_cqe_aux(ctx, user_data, res, cflags, allow_overflow); io_cq_unlock_post(ctx); From 361dfa7f5c94bdaa51b637b2784d01d70a35bfaf Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Mon, 12 May 2025 16:01:41 +0200 Subject: [PATCH 071/529] Revert "net: phy: microchip: force IRQ polling mode for lan88xx" This reverts commit 9b89102fbb8fc5393e2a0f981aafdb3cf43591ee which is commit 30a41ed32d3088cd0d682a13d7f30b23baed7e93 upstream. It is reported to cause NFS boot problems on a Raspberry Pi 3b so revert it from this branch for now. Cc: Fiona Klute Cc: Andrew Lunn Cc: Paolo Abeni Cc: Sasha Levin Link: https://lore.kernel.org/r/aB6uurX99AZWM9I1@finisterre.sirena.org.uk Reported-by: Mark Brown Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/microchip.c | 46 ++++++++++++++++++++++++++++++++++--- 1 file changed, 43 insertions(+), 3 deletions(-) diff --git a/drivers/net/phy/microchip.c b/drivers/net/phy/microchip.c index 623607fd2cef..0b88635f4fbc 100644 --- a/drivers/net/phy/microchip.c +++ b/drivers/net/phy/microchip.c @@ -31,6 +31,47 @@ static int lan88xx_write_page(struct phy_device *phydev, int page) return __phy_write(phydev, LAN88XX_EXT_PAGE_ACCESS, page); } +static int lan88xx_phy_config_intr(struct phy_device *phydev) +{ + int rc; + + if (phydev->interrupts == PHY_INTERRUPT_ENABLED) { + /* unmask all source and clear them before enable */ + rc = phy_write(phydev, LAN88XX_INT_MASK, 0x7FFF); + rc = phy_read(phydev, LAN88XX_INT_STS); + rc = phy_write(phydev, LAN88XX_INT_MASK, + LAN88XX_INT_MASK_MDINTPIN_EN_ | + LAN88XX_INT_MASK_LINK_CHANGE_); + } else { + rc = phy_write(phydev, LAN88XX_INT_MASK, 0); + if (rc) + return rc; + + /* Ack interrupts after they have been disabled */ + rc = phy_read(phydev, LAN88XX_INT_STS); + } + + return rc < 0 ? rc : 0; +} + +static irqreturn_t lan88xx_handle_interrupt(struct phy_device *phydev) +{ + int irq_status; + + irq_status = phy_read(phydev, LAN88XX_INT_STS); + if (irq_status < 0) { + phy_error(phydev); + return IRQ_NONE; + } + + if (!(irq_status & LAN88XX_INT_STS_LINK_CHANGE_)) + return IRQ_NONE; + + phy_trigger_machine(phydev); + + return IRQ_HANDLED; +} + static int lan88xx_suspend(struct phy_device *phydev) { struct lan88xx_priv *priv = phydev->priv; @@ -351,9 +392,8 @@ static struct phy_driver microchip_phy_driver[] = { .config_aneg = lan88xx_config_aneg, .link_change_notify = lan88xx_link_change_notify, - /* Interrupt handling is broken, do not define related - * functions to force polling. - */ + .config_intr = lan88xx_phy_config_intr, + .handle_interrupt = lan88xx_handle_interrupt, .suspend = lan88xx_suspend, .resume = genphy_resume, From cc0b8e148c334c4e8ea60446dfd78b72f7a70d70 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 9 Dec 2021 15:12:19 +0000 Subject: [PATCH 072/529] arm64: insn: Add support for encoding DSB commit 63de8abd97ddb9b758bd8f915ecbd18e1f1a87a0 upstream. To generate code in the eBPF epilogue that uses the DSB instruction, insn.c needs a heler to encode the type and domain. Re-use the crm encoding logic from the DMB instruction. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/insn.h | 1 + arch/arm64/lib/insn.c | 76 +++++++++++++++++++++-------------- 2 files changed, 46 insertions(+), 31 deletions(-) diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 834bff720582..0075dc5d8238 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -619,6 +619,7 @@ u32 aarch64_insn_gen_cas(enum aarch64_insn_register result, } #endif u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type); +u32 aarch64_insn_gen_dsb(enum aarch64_insn_mb_type type); s32 aarch64_get_branch_offset(u32 insn); u32 aarch64_set_branch_offset(u32 insn, s32 offset); diff --git a/arch/arm64/lib/insn.c b/arch/arm64/lib/insn.c index 49e972beeac7..44bb90ee2f41 100644 --- a/arch/arm64/lib/insn.c +++ b/arch/arm64/lib/insn.c @@ -5,6 +5,7 @@ * * Copyright (C) 2014-2016 Zi Shen Lim */ +#include #include #include #include @@ -1630,43 +1631,41 @@ u32 aarch64_insn_gen_extr(enum aarch64_insn_variant variant, return aarch64_insn_encode_register(AARCH64_INSN_REGTYPE_RM, insn, Rm); } +static u32 __get_barrier_crm_val(enum aarch64_insn_mb_type type) +{ + switch (type) { + case AARCH64_INSN_MB_SY: + return 0xf; + case AARCH64_INSN_MB_ST: + return 0xe; + case AARCH64_INSN_MB_LD: + return 0xd; + case AARCH64_INSN_MB_ISH: + return 0xb; + case AARCH64_INSN_MB_ISHST: + return 0xa; + case AARCH64_INSN_MB_ISHLD: + return 0x9; + case AARCH64_INSN_MB_NSH: + return 0x7; + case AARCH64_INSN_MB_NSHST: + return 0x6; + case AARCH64_INSN_MB_NSHLD: + return 0x5; + default: + pr_err("%s: unknown barrier type %d\n", __func__, type); + return AARCH64_BREAK_FAULT; + } +} + u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) { u32 opt; u32 insn; - switch (type) { - case AARCH64_INSN_MB_SY: - opt = 0xf; - break; - case AARCH64_INSN_MB_ST: - opt = 0xe; - break; - case AARCH64_INSN_MB_LD: - opt = 0xd; - break; - case AARCH64_INSN_MB_ISH: - opt = 0xb; - break; - case AARCH64_INSN_MB_ISHST: - opt = 0xa; - break; - case AARCH64_INSN_MB_ISHLD: - opt = 0x9; - break; - case AARCH64_INSN_MB_NSH: - opt = 0x7; - break; - case AARCH64_INSN_MB_NSHST: - opt = 0x6; - break; - case AARCH64_INSN_MB_NSHLD: - opt = 0x5; - break; - default: - pr_err("%s: unknown dmb type %d\n", __func__, type); + opt = __get_barrier_crm_val(type); + if (opt == AARCH64_BREAK_FAULT) return AARCH64_BREAK_FAULT; - } insn = aarch64_insn_get_dmb_value(); insn &= ~GENMASK(11, 8); @@ -1674,3 +1673,18 @@ u32 aarch64_insn_gen_dmb(enum aarch64_insn_mb_type type) return insn; } + +u32 aarch64_insn_gen_dsb(enum aarch64_insn_mb_type type) +{ + u32 opt, insn; + + opt = __get_barrier_crm_val(type); + if (opt == AARCH64_BREAK_FAULT) + return AARCH64_BREAK_FAULT; + + insn = aarch64_insn_get_dsb_base_value(); + insn &= ~GENMASK(11, 8); + insn |= (opt << 8); + + return insn; +} From 351a505eb4785168b3cb66eec652c6cbec46de10 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 19 Aug 2024 14:15:53 +0100 Subject: [PATCH 073/529] arm64: proton-pack: Expose whether the platform is mitigated by firmware commit e7956c92f396a44eeeb6eaf7a5b5e1ad24db6748 upstream. is_spectre_bhb_fw_affected() allows the caller to determine if the CPU is known to need a firmware mitigation. CPUs are either on the list of CPUs we know about, or firmware has been queried and reported that the platform is affected - and mitigated by firmware. This helper is not useful to determine if the platform is mitigated by firmware. A CPU could be on the know list, but the firmware may not be implemented. Its affected but not mitigated. spectre_bhb_enable_mitigation() handles this distinction by checking the firmware state before enabling the mitigation. Add a helper to expose this state. This will be used by the BPF JIT to determine if calling firmware for a mitigation is necessary and supported. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/spectre.h | 1 + arch/arm64/kernel/proton-pack.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 6f9f4f68b9e2..a107a87aa119 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -96,6 +96,7 @@ enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +bool is_spectre_bhb_fw_mitigated(void); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); #endif /* __ASSEMBLY__ */ #endif /* __ASM_SPECTRE_H */ diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 3d12fc5af87b..a1e5e1f9cbe0 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -1108,6 +1108,11 @@ void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *entry) update_mitigation_state(&spectre_bhb_state, state); } +bool is_spectre_bhb_fw_mitigated(void) +{ + return test_bit(BHB_FW, &system_bhb_mitigations); +} + /* Patched to NOP when enabled */ void noinstr spectre_bhb_patch_loop_mitigation_enable(struct alt_instr *alt, __le32 *origptr, From 49777123413337aa07069ac6b845617aa0cf9514 Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 29 Apr 2025 13:55:17 +0100 Subject: [PATCH 074/529] arm64: proton-pack: Expose whether the branchy loop k value commit a1152be30a043d2d4dcb1683415f328bf3c51978 upstream. Add a helper to expose the k value of the branchy loop. This is needed by the BPF JIT to generate the mitigation sequence in BPF programs. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/spectre.h | 1 + arch/arm64/kernel/proton-pack.c | 5 +++++ 2 files changed, 6 insertions(+) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index a107a87aa119..5b4b9854b2bc 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -96,6 +96,7 @@ enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +u8 get_spectre_bhb_loop_value(void); bool is_spectre_bhb_fw_mitigated(void); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); #endif /* __ASSEMBLY__ */ diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index a1e5e1f9cbe0..8c7b93fd778c 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -1010,6 +1010,11 @@ bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, return true; } +u8 get_spectre_bhb_loop_value(void) +{ + return max_bhb_k; +} + static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) { const char *v = arm64_get_bp_hardening_vector(slot); From 8fe5c37b0e08a97cf0210bb75970e945aaaeebab Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 9 Dec 2021 15:13:24 +0000 Subject: [PATCH 075/529] arm64: bpf: Add BHB mitigation to the epilogue for cBPF programs commit 0dfefc2ea2f29ced2416017d7e5b1253a54c2735 upstream. A malicious BPF program may manipulate the branch history to influence what the hardware speculates will happen next. On exit from a BPF program, emit the BHB mititgation sequence. This is only applied for 'classic' cBPF programs that are loaded by seccomp. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Acked-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/spectre.h | 1 + arch/arm64/kernel/proton-pack.c | 2 +- arch/arm64/net/bpf_jit_comp.c | 54 +++++++++++++++++++++++++++++--- 3 files changed, 52 insertions(+), 5 deletions(-) diff --git a/arch/arm64/include/asm/spectre.h b/arch/arm64/include/asm/spectre.h index 5b4b9854b2bc..9de17d4bb036 100644 --- a/arch/arm64/include/asm/spectre.h +++ b/arch/arm64/include/asm/spectre.h @@ -96,6 +96,7 @@ enum mitigation_state arm64_get_meltdown_state(void); enum mitigation_state arm64_get_spectre_bhb_state(void); bool is_spectre_bhb_affected(const struct arm64_cpu_capabilities *entry, int scope); +extern bool __nospectre_bhb; u8 get_spectre_bhb_loop_value(void); bool is_spectre_bhb_fw_mitigated(void); void spectre_bhb_enable_mitigation(const struct arm64_cpu_capabilities *__unused); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 8c7b93fd778c..57422af2a714 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -1035,7 +1035,7 @@ static void this_cpu_set_vectors(enum arm64_bp_harden_el1_vectors slot) isb(); } -static bool __read_mostly __nospectre_bhb; +bool __read_mostly __nospectre_bhb; static int __init parse_spectre_bhb_param(char *str) { __nospectre_bhb = true; diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index c04ace8f4843..3eca30605b8a 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -7,6 +7,7 @@ #define pr_fmt(fmt) "bpf_jit: " fmt +#include #include #include #include @@ -17,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -653,7 +655,48 @@ static void build_plt(struct jit_ctx *ctx) plt->target = (u64)&dummy_tramp; } -static void build_epilogue(struct jit_ctx *ctx) +/* Clobbers BPF registers 1-4, aka x0-x3 */ +static void __maybe_unused build_bhb_mitigation(struct jit_ctx *ctx) +{ + const u8 r1 = bpf2a64[BPF_REG_1]; /* aka x0 */ + u8 k = get_spectre_bhb_loop_value(); + + if (!IS_ENABLED(CONFIG_MITIGATE_SPECTRE_BRANCH_HISTORY) || + cpu_mitigations_off() || __nospectre_bhb || + arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) + return; + + if (supports_clearbhb(SCOPE_SYSTEM)) { + emit(aarch64_insn_gen_hint(AARCH64_INSN_HINT_CLEARBHB), ctx); + return; + } + + if (k) { + emit_a64_mov_i64(r1, k, ctx); + emit(A64_B(1), ctx); + emit(A64_SUBS_I(true, r1, r1, 1), ctx); + emit(A64_B_(A64_COND_NE, -2), ctx); + emit(aarch64_insn_gen_dsb(AARCH64_INSN_MB_ISH), ctx); + emit(aarch64_insn_get_isb_value(), ctx); + } + + if (is_spectre_bhb_fw_mitigated()) { + emit(A64_ORR_I(false, r1, AARCH64_INSN_REG_ZR, + ARM_SMCCC_ARCH_WORKAROUND_3), ctx); + switch (arm_smccc_1_1_get_conduit()) { + case SMCCC_CONDUIT_HVC: + emit(aarch64_insn_get_hvc_value(), ctx); + break; + case SMCCC_CONDUIT_SMC: + emit(aarch64_insn_get_smc_value(), ctx); + break; + default: + pr_err_once("Firmware mitigation enabled with unknown conduit\n"); + } + } +} + +static void build_epilogue(struct jit_ctx *ctx, bool was_classic) { const u8 r0 = bpf2a64[BPF_REG_0]; const u8 r6 = bpf2a64[BPF_REG_6]; @@ -675,10 +718,13 @@ static void build_epilogue(struct jit_ctx *ctx) emit(A64_POP(r8, r9, A64_SP), ctx); emit(A64_POP(r6, r7, A64_SP), ctx); + if (was_classic) + build_bhb_mitigation(ctx); + /* Restore FP/LR registers */ emit(A64_POP(A64_FP, A64_LR, A64_SP), ctx); - /* Set return value */ + /* Move the return value from bpf:r0 (aka x7) to x0 */ emit(A64_MOV(1, A64_R(0), r0), ctx); /* Authenticate lr */ @@ -1527,7 +1573,7 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *prog) } ctx.epilogue_offset = ctx.idx; - build_epilogue(&ctx); + build_epilogue(&ctx, was_classic); build_plt(&ctx); extable_align = __alignof__(struct exception_table_entry); @@ -1563,7 +1609,7 @@ skip_init_ctx: goto out_off; } - build_epilogue(&ctx); + build_epilogue(&ctx, was_classic); build_plt(&ctx); /* 3. Extra pass to validate JITed code. */ From 6e52d043f7dbf1839a24a3fab2b12b0d3839de7a Mon Sep 17 00:00:00 2001 From: James Morse Date: Tue, 29 Apr 2025 16:03:38 +0100 Subject: [PATCH 076/529] arm64: bpf: Only mitigate cBPF programs loaded by unprivileged users commit f300769ead032513a68e4a02e806393402e626f8 upstream. Support for eBPF programs loaded by unprivileged users is typically disabled. This means only cBPF programs need to be mitigated for BHB. In addition, only mitigate cBPF programs that were loaded by an unprivileged user. Privileged users can also load the same program via eBPF, making the mitigation pointless. Signed-off-by: James Morse Reviewed-by: Catalin Marinas Acked-by: Daniel Borkmann Signed-off-by: Greg Kroah-Hartman --- arch/arm64/net/bpf_jit_comp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 3eca30605b8a..09b70a9b5c39 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -666,6 +666,9 @@ static void __maybe_unused build_bhb_mitigation(struct jit_ctx *ctx) arm64_get_spectre_v2_state() == SPECTRE_VULNERABLE) return; + if (capable(CAP_SYS_ADMIN)) + return; + if (supports_clearbhb(SCOPE_SYSTEM)) { emit(aarch64_insn_gen_hint(AARCH64_INSN_HINT_CLEARBHB), ctx); return; From 9fc1391552ad2b46c66e912bcc793874b24aa731 Mon Sep 17 00:00:00 2001 From: James Morse Date: Mon, 12 Aug 2024 17:50:22 +0100 Subject: [PATCH 077/529] arm64: proton-pack: Add new CPUs 'k' values for branch mitigation commit efe676a1a7554219eae0b0dcfe1e0cdcc9ef9aef upstream. Update the list of 'k' values for the branch mitigation from arm's website. Add the values for Cortex-X1C. The MIDR_EL1 value can be found here: https://developer.arm.com/documentation/101968/0002/Register-descriptions/AArch> Link: https://developer.arm.com/documentation/110280/2-0/?lang=en Signed-off-by: James Morse Reviewed-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/proton-pack.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index cdb024dd33f5..fe022fe2d4f6 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -81,6 +81,7 @@ #define ARM_CPU_PART_CORTEX_A78AE 0xD42 #define ARM_CPU_PART_CORTEX_X1 0xD44 #define ARM_CPU_PART_CORTEX_A510 0xD46 +#define ARM_CPU_PART_CORTEX_X1C 0xD4C #define ARM_CPU_PART_CORTEX_A520 0xD80 #define ARM_CPU_PART_CORTEX_A710 0xD47 #define ARM_CPU_PART_CORTEX_A715 0xD4D @@ -159,6 +160,7 @@ #define MIDR_CORTEX_A78AE MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A78AE) #define MIDR_CORTEX_X1 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1) #define MIDR_CORTEX_A510 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A510) +#define MIDR_CORTEX_X1C MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_X1C) #define MIDR_CORTEX_A520 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A520) #define MIDR_CORTEX_A710 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A710) #define MIDR_CORTEX_A715 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A715) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index 57422af2a714..fcc641f30c93 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -903,6 +903,7 @@ static u8 spectre_bhb_loop_affected(void) MIDR_ALL_VERSIONS(MIDR_CORTEX_A78AE), MIDR_ALL_VERSIONS(MIDR_CORTEX_A78C), MIDR_ALL_VERSIONS(MIDR_CORTEX_X1), + MIDR_ALL_VERSIONS(MIDR_CORTEX_X1C), MIDR_ALL_VERSIONS(MIDR_CORTEX_A710), MIDR_ALL_VERSIONS(MIDR_CORTEX_X2), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N2), From 845c707b802c9a79d237f7b5fc02d06f1217bf2b Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Mon, 5 May 2025 14:35:12 -0700 Subject: [PATCH 078/529] x86/bpf: Call branch history clearing sequence on exit commit d4e89d212d401672e9cdfe825d947ee3a9fbe3f5 upstream. Classic BPF programs have been identified as potential vectors for intra-mode Branch Target Injection (BTI) attacks. Classic BPF programs can be run by unprivileged users. They allow unprivileged code to execute inside the kernel. Attackers can use unprivileged cBPF to craft branch history in kernel mode that can influence the target of indirect branches. Introduce a branch history buffer (BHB) clearing sequence during the JIT compilation of classic BPF programs. The clearing sequence is the same as is used in previous mitigations to protect syscalls. Since eBPF programs already have their own mitigations in place, only insert the call on classic programs that aren't run by privileged users. Signed-off-by: Daniel Sneddon Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Acked-by: Daniel Borkmann Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/net/bpf_jit_comp.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 92db785a0a8e..cdcca2ac9521 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -947,6 +947,29 @@ static void emit_nops(u8 **pprog, int len) #define RESTORE_TAIL_CALL_CNT(stack) \ EMIT3_off32(0x48, 0x8B, 0x85, -round_up(stack, 8) - 8) +static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, + struct bpf_prog *bpf_prog) +{ + u8 *prog = *pprog; + u8 *func; + + if (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP)) { + /* The clearing sequence clobbers eax and ecx. */ + EMIT1(0x50); /* push rax */ + EMIT1(0x51); /* push rcx */ + ip += 2; + + func = (u8 *)clear_bhb_loop; + + if (emit_call(&prog, func, ip)) + return -EINVAL; + EMIT1(0x59); /* pop rcx */ + EMIT1(0x58); /* pop rax */ + } + *pprog = prog; + return 0; +} + static int do_jit(struct bpf_prog *bpf_prog, int *addrs, u8 *image, u8 *rw_image, int oldproglen, struct jit_context *ctx, bool jmp_padding) { @@ -1760,6 +1783,15 @@ emit_jmp: seen_exit = true; /* Update cleanup_addr */ ctx->cleanup_addr = proglen; + + if (bpf_prog_was_classic(bpf_prog) && + !capable(CAP_SYS_ADMIN)) { + u8 *ip = image + addrs[i - 1]; + + if (emit_spectre_bhb_barrier(&prog, ip, bpf_prog)) + return -EINVAL; + } + pop_callee_regs(&prog, callee_regs_used); EMIT1(0xC9); /* leave */ emit_return(&prog, image + addrs[i - 1] + (prog - temp)); From cebc238b026de63479e7a43b9a7fc46896e1ce5a Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Mon, 5 May 2025 14:35:12 -0700 Subject: [PATCH 079/529] x86/bpf: Add IBHF call at end of classic BPF commit 9f725eec8fc0b39bdc07dcc8897283c367c1a163 upstream. Classic BPF programs can be run by unprivileged users, allowing unprivileged code to execute inside the kernel. Attackers can use this to craft branch history in kernel mode that can influence the target of indirect branches. BHI_DIS_S provides user-kernel isolation of branch history, but cBPF can be used to bypass this protection by crafting branch history in kernel mode. To stop intra-mode attacks via cBPF programs, Intel created a new instruction Indirect Branch History Fence (IBHF). IBHF prevents the predicted targets of subsequent indirect branches from being influenced by branch history prior to the IBHF. IBHF is only effective while BHI_DIS_S is enabled. Add the IBHF instruction to cBPF jitted code's exit path. Add the new fence when the hardware mitigation is enabled (i.e., X86_FEATURE_CLEAR_BHB_HW is set) or after the software sequence (X86_FEATURE_CLEAR_BHB_LOOP) is being used in a virtual machine. Note that X86_FEATURE_CLEAR_BHB_HW and X86_FEATURE_CLEAR_BHB_LOOP are mutually exclusive, so the JIT compiler will only emit the new fence, not the SW sequence, when X86_FEATURE_CLEAR_BHB_HW is set. Hardware that enumerates BHI_NO basically has BHI_DIS_S protections always enabled, regardless of the value of BHI_DIS_S. Since BHI_DIS_S doesn't protect against intra-mode attacks, enumerate BHI bug on BHI_NO hardware as well. Signed-off-by: Daniel Sneddon Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Acked-by: Daniel Borkmann Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/common.c | 9 ++++++--- arch/x86/net/bpf_jit_comp.c | 19 +++++++++++++++++++ 2 files changed, 25 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 62aaabea7e33..def8af31139b 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1455,9 +1455,12 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (vulnerable_to_rfds(x86_arch_cap_msr)) setup_force_cpu_bug(X86_BUG_RFDS); - /* When virtualized, eIBRS could be hidden, assume vulnerable */ - if (!(x86_arch_cap_msr & ARCH_CAP_BHI_NO) && - !cpu_matches(cpu_vuln_whitelist, NO_BHI) && + /* + * Intel parts with eIBRS are vulnerable to BHI attacks. Parts with + * BHI_NO still need to use the BHI mitigation to prevent Intra-mode + * attacks. When virtualized, eIBRS could be hidden, assume vulnerable. + */ + if (!cpu_matches(cpu_vuln_whitelist, NO_BHI) && (boot_cpu_has(X86_FEATURE_IBRS_ENHANCED) || boot_cpu_has(X86_FEATURE_HYPERVISOR))) setup_force_cpu_bug(X86_BUG_BHI); diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index cdcca2ac9521..e4ccc4136695 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -36,6 +36,8 @@ static u8 *emit_code(u8 *ptr, u32 bytes, unsigned int len) #define EMIT2(b1, b2) EMIT((b1) + ((b2) << 8), 2) #define EMIT3(b1, b2, b3) EMIT((b1) + ((b2) << 8) + ((b3) << 16), 3) #define EMIT4(b1, b2, b3, b4) EMIT((b1) + ((b2) << 8) + ((b3) << 16) + ((b4) << 24), 4) +#define EMIT5(b1, b2, b3, b4, b5) \ + do { EMIT1(b1); EMIT4(b2, b3, b4, b5); } while (0) #define EMIT1_off32(b1, off) \ do { EMIT1(b1); EMIT(off, 4); } while (0) @@ -966,6 +968,23 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, EMIT1(0x59); /* pop rcx */ EMIT1(0x58); /* pop rax */ } + /* Insert IBHF instruction */ + if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) && + cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) || + (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW) && + IS_ENABLED(CONFIG_X86_64))) { + /* + * Add an Indirect Branch History Fence (IBHF). IBHF acts as a + * fence preventing branch history from before the fence from + * affecting indirect branches after the fence. This is + * specifically used in cBPF jitted code to prevent Intra-mode + * BHI attacks. The IBHF instruction is designed to be a NOP on + * hardware that doesn't need or support it. The REP and REX.W + * prefixes are required by the microcode, and they also ensure + * that the NOP is unlikely to be used in existing code. + */ + EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */ + } *pprog = prog; return 0; } From db734ba733daf336e5815032244ff5ed96ec41c4 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 5 May 2025 14:35:12 -0700 Subject: [PATCH 080/529] x86/bhi: Do not set BHI_DIS_S in 32-bit mode commit 073fdbe02c69c43fb7c0d547ec265c7747d4a646 upstream. With the possibility of intra-mode BHI via cBPF, complete mitigation for BHI is to use IBHF (history fence) instruction with BHI_DIS_S set. Since this new instruction is only available in 64-bit mode, setting BHI_DIS_S in 32-bit mode is only a partial mitigation. Do not set BHI_DIS_S in 32-bit mode so as to avoid reporting misleading mitigated status. With this change IBHF won't be used in 32-bit mode, also remove the CONFIG_X86_64 check from emit_spectre_bhb_barrier(). Suggested-by: Josh Poimboeuf Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 5 +++-- arch/x86/net/bpf_jit_comp.c | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 0be0edb07a2a..b22f72a177ba 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1656,10 +1656,11 @@ static void __init bhi_select_mitigation(void) return; } - if (spec_ctrl_bhi_dis()) + if (!IS_ENABLED(CONFIG_X86_64)) return; - if (!IS_ENABLED(CONFIG_X86_64)) + /* Mitigate in hardware if supported */ + if (spec_ctrl_bhi_dis()) return; /* Mitigate KVM by default */ diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index e4ccc4136695..44b49a1fe579 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -971,8 +971,7 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, /* Insert IBHF instruction */ if ((cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_LOOP) && cpu_feature_enabled(X86_FEATURE_HYPERVISOR)) || - (cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW) && - IS_ENABLED(CONFIG_X86_64))) { + cpu_feature_enabled(X86_FEATURE_CLEAR_BHB_HW)) { /* * Add an Indirect Branch History Fence (IBHF). IBHF acts as a * fence preventing branch history from before the fence from @@ -982,6 +981,8 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, * hardware that doesn't need or support it. The REP and REX.W * prefixes are required by the microcode, and they also ensure * that the NOP is unlikely to be used in existing code. + * + * IBHF is not a valid instruction in 32-bit mode. */ EMIT5(0xF3, 0x48, 0x0F, 0x1E, 0xF8); /* ibhf */ } From 4bc1033dff8ca441be4a5737a0bbad4d1ed45767 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:43 -0800 Subject: [PATCH 081/529] x86/speculation: Simplify and make CALL_NOSPEC consistent commit cfceff8526a426948b53445c02bcb98453c7330d upstream. CALL_NOSPEC macro is used to generate Spectre-v2 mitigation friendly indirect branches. At compile time the macro defaults to indirect branch, and at runtime those can be patched to thunk based mitigations. This approach is opposite of what is done for the rest of the kernel, where the compile time default is to replace indirect calls with retpoline thunk calls. Make CALL_NOSPEC consistent with the rest of the kernel, default to retpoline thunk at compile time when CONFIG_RETPOLINE is enabled. [ pawan: s/CONFIG_MITIGATION_RETPOLINE/CONFIG_RETPOLINE/ ] Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-1-96599fed0f33@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index daf58a96e0a7..7b5b93da50f5 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -284,16 +284,11 @@ extern void (*x86_return_thunk)(void); * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_RETPOLINE is defined. */ -# define CALL_NOSPEC \ - ALTERNATIVE_2( \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - "call __x86_indirect_thunk_%V[thunk_target]\n", \ - X86_FEATURE_RETPOLINE, \ - "lfence;\n" \ - ANNOTATE_RETPOLINE_SAFE \ - "call *%[thunk_target]\n", \ - X86_FEATURE_RETPOLINE_LFENCE) +#ifdef CONFIG_RETPOLINE +#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#else +#define CALL_NOSPEC "call *%[thunk_target]\n" +#endif # define THUNK_TARGET(addr) [thunk_target] "r" (addr) From fb3768004e83c534d1e7e8888f92f358660ee0f9 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 28 Feb 2025 18:35:58 -0800 Subject: [PATCH 082/529] x86/speculation: Add a conditional CS prefix to CALL_NOSPEC commit 052040e34c08428a5a388b85787e8531970c0c67 upstream. Retpoline mitigation for spectre-v2 uses thunks for indirect branches. To support this mitigation compilers add a CS prefix with -mindirect-branch-cs-prefix. For an indirect branch in asm, this needs to be added manually. CS prefix is already being added to indirect branches in asm files, but not in inline asm. Add CS prefix to CALL_NOSPEC for inline asm as well. There is no JMP_NOSPEC for inline asm. Reported-by: Josh Poimboeuf Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Cc: Andrew Cooper Cc: Peter Zijlstra Link: https://lore.kernel.org/r/20250228-call-nospec-v3-2-96599fed0f33@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 7b5b93da50f5..dab6e9ef01b6 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -119,9 +119,8 @@ .endm /* - * Equivalent to -mindirect-branch-cs-prefix; emit the 5 byte jmp/call - * to the retpoline thunk with a CS prefix when the register requires - * a RAX prefix byte to encode. Also see apply_retpolines(). + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. */ .macro __CS_PREFIX reg:req .irp rs,r8,r9,r10,r11,r12,r13,r14,r15 @@ -280,12 +279,24 @@ extern void (*x86_return_thunk)(void); #ifdef CONFIG_X86_64 +/* + * Emits a conditional CS prefix that is compatible with + * -mindirect-branch-cs-prefix. + */ +#define __CS_PREFIX(reg) \ + ".irp rs,r8,r9,r10,r11,r12,r13,r14,r15\n" \ + ".ifc \\rs," reg "\n" \ + ".byte 0x2e\n" \ + ".endif\n" \ + ".endr\n" + /* * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_RETPOLINE is defined. */ #ifdef CONFIG_RETPOLINE -#define CALL_NOSPEC "call __x86_indirect_thunk_%V[thunk_target]\n" +#define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ + "call __x86_indirect_thunk_%V[thunk_target]\n" #else #define CALL_NOSPEC "call *%[thunk_target]\n" #endif From b1ef84b0ffe31a3a6bcdb9eb5ffa4410360d0bd3 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Thu, 20 Mar 2025 11:13:15 -0700 Subject: [PATCH 083/529] x86/speculation: Remove the extra #ifdef around CALL_NOSPEC commit c8c81458863ab686cda4fe1e603fccaae0f12460 upstream. Commit: 010c4a461c1d ("x86/speculation: Simplify and make CALL_NOSPEC consistent") added an #ifdef CONFIG_RETPOLINE around the CALL_NOSPEC definition. This is not required as this code is already under a larger #ifdef. Remove the extra #ifdef, no functional change. vmlinux size remains same before and after this change: CONFIG_RETPOLINE=y: text data bss dec hex filename 25434752 7342290 2301212 35078254 217406e vmlinux.before 25434752 7342290 2301212 35078254 217406e vmlinux.after # CONFIG_RETPOLINE is not set: text data bss dec hex filename 22943094 6214994 1550152 30708240 1d49210 vmlinux.before 22943094 6214994 1550152 30708240 1d49210 vmlinux.after [ pawan: s/CONFIG_MITIGATION_RETPOLINE/CONFIG_RETPOLINE/ ] Signed-off-by: Pawan Gupta Signed-off-by: Ingo Molnar Reviewed-by: Josh Poimboeuf Link: https://lore.kernel.org/r/20250320-call-nospec-extra-ifdef-v1-1-d9b084d24820@linux.intel.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index dab6e9ef01b6..95730ba530c8 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -294,12 +294,8 @@ extern void (*x86_return_thunk)(void); * Inline asm uses the %V modifier which is only in newer GCC * which is ensured when CONFIG_RETPOLINE is defined. */ -#ifdef CONFIG_RETPOLINE #define CALL_NOSPEC __CS_PREFIX("%V[thunk_target]") \ "call __x86_indirect_thunk_%V[thunk_target]\n" -#else -#define CALL_NOSPEC "call *%[thunk_target]\n" -#endif # define THUNK_TARGET(addr) [thunk_target] "r" (addr) From ed2e894a7645ac3cd22e0a8e8897496a5e5c7d0a Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 11 Apr 2025 15:36:38 -0700 Subject: [PATCH 084/529] Documentation: x86/bugs/its: Add ITS documentation commit 1ac116ce6468670eeda39345a5585df308243dca upstream. Add the admin-guide for Indirect Target Selection (ITS). Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/hw-vuln/index.rst | 1 + .../hw-vuln/indirect-target-selection.rst | 156 ++++++++++++++++++ 2 files changed, 157 insertions(+) create mode 100644 Documentation/admin-guide/hw-vuln/indirect-target-selection.rst diff --git a/Documentation/admin-guide/hw-vuln/index.rst b/Documentation/admin-guide/hw-vuln/index.rst index 3e4a14e38b49..dc69ba0b05e4 100644 --- a/Documentation/admin-guide/hw-vuln/index.rst +++ b/Documentation/admin-guide/hw-vuln/index.rst @@ -22,3 +22,4 @@ are configurable at compile, boot or run time. gather_data_sampling.rst srso reg-file-data-sampling + indirect-target-selection diff --git a/Documentation/admin-guide/hw-vuln/indirect-target-selection.rst b/Documentation/admin-guide/hw-vuln/indirect-target-selection.rst new file mode 100644 index 000000000000..4788e14ebce0 --- /dev/null +++ b/Documentation/admin-guide/hw-vuln/indirect-target-selection.rst @@ -0,0 +1,156 @@ +.. SPDX-License-Identifier: GPL-2.0 + +Indirect Target Selection (ITS) +=============================== + +ITS is a vulnerability in some Intel CPUs that support Enhanced IBRS and were +released before Alder Lake. ITS may allow an attacker to control the prediction +of indirect branches and RETs located in the lower half of a cacheline. + +ITS is assigned CVE-2024-28956 with a CVSS score of 4.7 (Medium). + +Scope of Impact +--------------- +- **eIBRS Guest/Host Isolation**: Indirect branches in KVM/kernel may still be + predicted with unintended target corresponding to a branch in the guest. + +- **Intra-Mode BTI**: In-kernel training such as through cBPF or other native + gadgets. + +- **Indirect Branch Prediction Barrier (IBPB)**: After an IBPB, indirect + branches may still be predicted with targets corresponding to direct branches + executed prior to the IBPB. This is fixed by the IPU 2025.1 microcode, which + should be available via distro updates. Alternatively microcode can be + obtained from Intel's github repository [#f1]_. + +Affected CPUs +------------- +Below is the list of ITS affected CPUs [#f2]_ [#f3]_: + + ======================== ============ ==================== =============== + Common name Family_Model eIBRS Intra-mode BTI + Guest/Host Isolation + ======================== ============ ==================== =============== + SKYLAKE_X (step >= 6) 06_55H Affected Affected + ICELAKE_X 06_6AH Not affected Affected + ICELAKE_D 06_6CH Not affected Affected + ICELAKE_L 06_7EH Not affected Affected + TIGERLAKE_L 06_8CH Not affected Affected + TIGERLAKE 06_8DH Not affected Affected + KABYLAKE_L (step >= 12) 06_8EH Affected Affected + KABYLAKE (step >= 13) 06_9EH Affected Affected + COMETLAKE 06_A5H Affected Affected + COMETLAKE_L 06_A6H Affected Affected + ROCKETLAKE 06_A7H Not affected Affected + ======================== ============ ==================== =============== + +- All affected CPUs enumerate Enhanced IBRS feature. +- IBPB isolation is affected on all ITS affected CPUs, and need a microcode + update for mitigation. +- None of the affected CPUs enumerate BHI_CTRL which was introduced in Golden + Cove (Alder Lake and Sapphire Rapids). This can help guests to determine the + host's affected status. +- Intel Atom CPUs are not affected by ITS. + +Mitigation +---------- +As only the indirect branches and RETs that have their last byte of instruction +in the lower half of the cacheline are vulnerable to ITS, the basic idea behind +the mitigation is to not allow indirect branches in the lower half. + +This is achieved by relying on existing retpoline support in the kernel, and in +compilers. ITS-vulnerable retpoline sites are runtime patched to point to newly +added ITS-safe thunks. These safe thunks consists of indirect branch in the +second half of the cacheline. Not all retpoline sites are patched to thunks, if +a retpoline site is evaluated to be ITS-safe, it is replaced with an inline +indirect branch. + +Dynamic thunks +~~~~~~~~~~~~~~ +From a dynamically allocated pool of safe-thunks, each vulnerable site is +replaced with a new thunk, such that they get a unique address. This could +improve the branch prediction accuracy. Also, it is a defense-in-depth measure +against aliasing. + +Note, for simplicity, indirect branches in eBPF programs are always replaced +with a jump to a static thunk in __x86_indirect_its_thunk_array. If required, +in future this can be changed to use dynamic thunks. + +All vulnerable RETs are replaced with a static thunk, they do not use dynamic +thunks. This is because RETs get their prediction from RSB mostly that does not +depend on source address. RETs that underflow RSB may benefit from dynamic +thunks. But, RETs significantly outnumber indirect branches, and any benefit +from a unique source address could be outweighed by the increased icache +footprint and iTLB pressure. + +Retpoline +~~~~~~~~~ +Retpoline sequence also mitigates ITS-unsafe indirect branches. For this +reason, when retpoline is enabled, ITS mitigation only relocates the RETs to +safe thunks. Unless user requested the RSB-stuffing mitigation. + +Mitigation in guests +^^^^^^^^^^^^^^^^^^^^ +All guests deploy ITS mitigation by default, irrespective of eIBRS enumeration +and Family/Model of the guest. This is because eIBRS feature could be hidden +from a guest. One exception to this is when a guest enumerates BHI_DIS_S, which +indicates that the guest is running on an unaffected host. + +To prevent guests from unnecessarily deploying the mitigation on unaffected +platforms, Intel has defined ITS_NO bit(62) in MSR IA32_ARCH_CAPABILITIES. When +a guest sees this bit set, it should not enumerate the ITS bug. Note, this bit +is not set by any hardware, but is **intended for VMMs to synthesize** it for +guests as per the host's affected status. + +Mitigation options +^^^^^^^^^^^^^^^^^^ +The ITS mitigation can be controlled using the "indirect_target_selection" +kernel parameter. The available options are: + + ======== =================================================================== + on (default) Deploy the "Aligned branch/return thunks" mitigation. + If spectre_v2 mitigation enables retpoline, aligned-thunks are only + deployed for the affected RET instructions. Retpoline mitigates + indirect branches. + + off Disable ITS mitigation. + + vmexit Equivalent to "=on" if the CPU is affected by guest/host isolation + part of ITS. Otherwise, mitigation is not deployed. This option is + useful when host userspace is not in the threat model, and only + attacks from guest to host are considered. + + force Force the ITS bug and deploy the default mitigation. + ======== =================================================================== + +Sysfs reporting +--------------- + +The sysfs file showing ITS mitigation status is: + + /sys/devices/system/cpu/vulnerabilities/indirect_target_selection + +Note, microcode mitigation status is not reported in this file. + +The possible values in this file are: + +.. list-table:: + + * - Not affected + - The processor is not vulnerable. + * - Vulnerable + - System is vulnerable and no mitigation has been applied. + * - Vulnerable, KVM: Not affected + - System is vulnerable to intra-mode BTI, but not affected by eIBRS + guest/host isolation. + * - Mitigation: Aligned branch/return thunks + - The mitigation is enabled, affected indirect branches and RETs are + relocated to safe thunks. + +References +---------- +.. [#f1] Microcode repository - https://github.com/intel/Intel-Linux-Processor-Microcode-Data-Files + +.. [#f2] Affected Processors list - https://www.intel.com/content/www/us/en/developer/topic-technology/software-security-guidance/processors-affected-consolidated-product-cpu-model.html + +.. [#f3] Affected Processors list (machine readable) - https://github.com/intel/Intel-affected-processor-list From 0eda20c29e12ae38ff9767be802c16e06063a775 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 17:40:41 -0700 Subject: [PATCH 085/529] x86/its: Enumerate Indirect Target Selection (ITS) bug commit 159013a7ca18c271ff64192deb62a689b622d860 upstream. ITS bug in some pre-Alderlake Intel CPUs may allow indirect branches in the first half of a cache line get predicted to a target of a branch located in the second half of the cache line. Set X86_BUG_ITS on affected CPUs. Mitigation to follow in later commits. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/msr-index.h | 8 +++++ arch/x86/kernel/cpu/common.c | 58 +++++++++++++++++++++++------- arch/x86/kvm/x86.c | 4 ++- 4 files changed, 58 insertions(+), 13 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 5709cbba28e7..2c8b7305c294 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -495,4 +495,5 @@ #define X86_BUG_RFDS X86_BUG(1*32 + 2) /* CPU is vulnerable to Register File Data Sampling */ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ #define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ +#define X86_BUG_ITS X86_BUG(1*32 + 5) /* CPU is affected by Indirect Target Selection */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 5ffcfcc41282..727947ed5e5e 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -185,6 +185,14 @@ * VERW clears CPU Register * File. */ +#define ARCH_CAP_ITS_NO BIT_ULL(62) /* + * Not susceptible to + * Indirect Target Selection. + * This bit is not set by + * HW, but is synthesized by + * VMMs for guests to know + * their affected status. + */ #define ARCH_CAP_XAPIC_DISABLE BIT(21) /* * IA32_XAPIC_DISABLE_STATUS MSR diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index def8af31139b..491069bfed42 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1251,6 +1251,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define GDS BIT(6) /* CPU is affected by Register File Data Sampling */ #define RFDS BIT(7) +/* CPU is affected by Indirect Target Selection */ +#define ITS BIT(8) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1262,22 +1264,25 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(BROADWELL_G, X86_STEPPING_ANY, SRBDS), VULNBL_INTEL_STEPPINGS(BROADWELL_X, X86_STEPPING_ANY, MMIO), VULNBL_INTEL_STEPPINGS(BROADWELL, X86_STEPPING_ANY, SRBDS), - VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPINGS(0x0, 0x5), MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(SKYLAKE_X, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | ITS), VULNBL_INTEL_STEPPINGS(SKYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPPINGS(SKYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), - VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPINGS(0x0, 0xb), MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE_L, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | ITS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xc), MMIO | RETBLEED | GDS | SRBDS), + VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | ITS), VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED), - VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS | ITS), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS | ITS), + VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED | ITS), + VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS | ITS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS | ITS), VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | ITS), VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS), VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS), VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS), @@ -1341,6 +1346,32 @@ static bool __init vulnerable_to_rfds(u64 x86_arch_cap_msr) return cpu_matches(cpu_vuln_blacklist, RFDS); } +static bool __init vulnerable_to_its(u64 x86_arch_cap_msr) +{ + /* The "immunity" bit trumps everything else: */ + if (x86_arch_cap_msr & ARCH_CAP_ITS_NO) + return false; + if (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) + return false; + + /* None of the affected CPUs have BHI_CTRL */ + if (boot_cpu_has(X86_FEATURE_BHI_CTRL)) + return false; + + /* + * If a VMM did not expose ITS_NO, assume that a guest could + * be running on a vulnerable hardware or may migrate to such + * hardware. + */ + if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return true; + + if (cpu_matches(cpu_vuln_blacklist, ITS)) + return true; + + return false; +} + static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) { u64 x86_arch_cap_msr = x86_read_arch_cap_msr(); @@ -1468,6 +1499,9 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); + if (vulnerable_to_its(x86_arch_cap_msr)) + setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 752fa116a261..a6dc8f662fa4 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1614,7 +1614,7 @@ static unsigned int num_msr_based_features; ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ - ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) + ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO | ARCH_CAP_ITS_NO) static u64 kvm_get_arch_capabilities(void) { @@ -1653,6 +1653,8 @@ static u64 kvm_get_arch_capabilities(void) data |= ARCH_CAP_MDS_NO; if (!boot_cpu_has_bug(X86_BUG_RFDS)) data |= ARCH_CAP_RFDS_NO; + if (!boot_cpu_has_bug(X86_BUG_ITS)) + data |= ARCH_CAP_ITS_NO; if (!boot_cpu_has(X86_FEATURE_RTM)) { /* From 5e7d4f2aceb216f458f24a22d9915e3968ed0fbb Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 21:17:21 -0700 Subject: [PATCH 086/529] x86/its: Add support for ITS-safe indirect thunk commit 8754e67ad4ac692c67ff1f99c0d07156f04ae40c upstream. Due to ITS, indirect branches in the lower half of a cacheline may be vulnerable to branch target injection attack. Introduce ITS-safe thunks to patch indirect branches in the lower half of cacheline with the thunk. Also thunk any eBPF generated indirect branches in emit_indirect_jump(). Below category of indirect branches are not mitigated: - Indirect branches in the .init section are not mitigated because they are discarded after boot. - Indirect branches that are explicitly marked retpoline-safe. Note that retpoline also mitigates the indirect branches against ITS. This is because the retpoline sequence fills an RSB entry before RET, and it does not suffer from RSB-underflow part of the ITS. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 11 ++++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/include/asm/nospec-branch.h | 4 ++ arch/x86/kernel/alternative.c | 77 ++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux.lds.S | 6 +++ arch/x86/lib/retpoline.S | 28 ++++++++++ arch/x86/net/bpf_jit_comp.c | 6 ++- 7 files changed, 132 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 48ab6e8f2d1d..8e66bb443351 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2575,6 +2575,17 @@ config MITIGATION_SPECTRE_BHI indirect branches. See +config MITIGATION_ITS + bool "Enable Indirect Target Selection mitigation" + depends on CPU_SUP_INTEL && X86_64 + depends on RETPOLINE && RETHUNK + default y + help + Enable Indirect Target Selection (ITS) mitigation. ITS is a bug in + BPU on some Intel CPUs that may allow Spectre V2 style attacks. If + disabled, mitigation cannot be enabled via cmdline. + See + endif config ARCH_HAS_ADD_PAGES diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 2c8b7305c294..fae83e82fe38 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -445,6 +445,7 @@ #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* "" BHI_DIS_S HW control available */ #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* "" BHI_DIS_S HW control enabled */ #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* "" Clear branch history at vmexit using SW loop */ +#define X86_FEATURE_INDIRECT_THUNK_ITS (21*32 + 5) /* "" Use thunk for indirect branches in lower half of cacheline */ /* * BUG word(s) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 95730ba530c8..b5ad7f5f7318 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -244,8 +244,12 @@ _ASM_PTR " 999b\n\t" \ ".popsection\n\t" +#define ITS_THUNK_SIZE 64 + typedef u8 retpoline_thunk_t[RETPOLINE_THUNK_SIZE]; +typedef u8 its_thunk_t[ITS_THUNK_SIZE]; extern retpoline_thunk_t __x86_indirect_thunk_array[]; +extern its_thunk_t __x86_indirect_its_thunk_array[]; #ifdef CONFIG_RETHUNK extern void __x86_return_thunk(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 69f85e274611..0cb30cefb8ed 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -396,6 +396,74 @@ static int emit_indirect(int op, int reg, u8 *bytes) return i; } +#ifdef CONFIG_MITIGATION_ITS + +static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, + void *call_dest, void *jmp_dest) +{ + u8 op = insn->opcode.bytes[0]; + int i = 0; + + /* + * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional + * tail-calls. Deal with them. + */ + if (is_jcc32(insn)) { + bytes[i++] = op; + op = insn->opcode.bytes[1]; + goto clang_jcc; + } + + if (insn->length == 6) + bytes[i++] = 0x2e; /* CS-prefix */ + + switch (op) { + case CALL_INSN_OPCODE: + __text_gen_insn(bytes+i, op, addr+i, + call_dest, + CALL_INSN_SIZE); + i += CALL_INSN_SIZE; + break; + + case JMP32_INSN_OPCODE: +clang_jcc: + __text_gen_insn(bytes+i, op, addr+i, + jmp_dest, + JMP32_INSN_SIZE); + i += JMP32_INSN_SIZE; + break; + + default: + WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr); + return -1; + } + + WARN_ON_ONCE(i != insn->length); + + return i; +} + +static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) +{ + return __emit_trampoline(addr, insn, bytes, + __x86_indirect_its_thunk_array[reg], + __x86_indirect_its_thunk_array[reg]); +} + +/* Check if an indirect branch is at ITS-unsafe address */ +static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return false; + + /* Indirect branch opcode is 2 or 3 bytes depending on reg */ + addr += 1 + reg / 8; + + /* Lower-half of the cacheline? */ + return !(addr & 0x20); +} +#endif + /* * Rewrite the compiler generated retpoline thunk calls. * @@ -466,6 +534,15 @@ static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes) bytes[i++] = 0xe8; /* LFENCE */ } +#ifdef CONFIG_MITIGATION_ITS + /* + * Check if the address of last byte of emitted-indirect is in + * lower-half of the cacheline. Such branches need ITS mitigation. + */ + if (cpu_wants_indirect_its_thunk_at((unsigned long)addr + i, reg)) + return emit_its_trampoline(addr, insn, reg, bytes); +#endif + ret = emit_indirect(op, reg, bytes + i); if (ret < 0) return ret; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 78ccb5ec3c0e..778ea89b737c 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -528,6 +528,12 @@ INIT_PER_CPU(irq_stack_backing_store); "SRSO function pair won't alias"); #endif +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(__x86_indirect_its_thunk_rax & 0x20, "__x86_indirect_thunk_rax not in second half of cacheline"); +. = ASSERT(((__x86_indirect_its_thunk_rcx - __x86_indirect_its_thunk_rax) % 64) == 0, "Indirect thunks are not cacheline apart"); +. = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array"); +#endif + #endif /* CONFIG_X86_64 */ #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index 7880e2a7ec6a..be948f44fc24 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -258,6 +258,34 @@ SYM_FUNC_START(entry_untrain_ret) SYM_FUNC_END(entry_untrain_ret) __EXPORT_THUNK(entry_untrain_ret) +#ifdef CONFIG_MITIGATION_ITS + +.macro ITS_THUNK reg + +SYM_INNER_LABEL(__x86_indirect_its_thunk_\reg, SYM_L_GLOBAL) + UNWIND_HINT_EMPTY + ANNOTATE_NOENDBR + ANNOTATE_RETPOLINE_SAFE + jmp *%\reg + int3 + .align 32, 0xcc /* fill to the end of the line */ + .skip 32, 0xcc /* skip to the next upper half */ +.endm + +/* ITS mitigation requires thunks be aligned to upper half of cacheline */ +.align 64, 0xcc +.skip 32, 0xcc +SYM_CODE_START(__x86_indirect_its_thunk_array) + +#define GEN(reg) ITS_THUNK reg +#include +#undef GEN + + .align 64, 0xcc +SYM_CODE_END(__x86_indirect_its_thunk_array) + +#endif + SYM_CODE_START(__x86_return_thunk) UNWIND_HINT_FUNC ANNOTATE_NOENDBR diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index 44b49a1fe579..c73d85255bbc 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -464,7 +464,11 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { + if (IS_ENABLED(CONFIG_MITIGATION_ITS) && + cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { + OPTIMIZER_HIDE_VAR(reg); + emit_jump(&prog, &__x86_indirect_its_thunk_array[reg], ip); + } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE)) { From dbd8f170af4731d1251289a9419dae6f15a06308 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 21:17:21 -0700 Subject: [PATCH 087/529] x86/its: Add support for ITS-safe return thunk commit a75bf27fe41abe658c53276a0c486c4bf9adecfc upstream. RETs in the lower half of cacheline may be affected by ITS bug, specifically when the RSB-underflows. Use ITS-safe return thunk for such RETs. RETs that are not patched: - RET in retpoline sequence does not need to be patched, because the sequence itself fills an RSB before RET. - RET in Call Depth Tracking (CDT) thunks __x86_indirect_{call|jump}_thunk and call_depth_return_thunk are not patched because CDT by design prevents RSB-underflow. - RETs in .init section are not reachable after init. - RETs that are explicitly marked safe with ANNOTATE_UNRET_SAFE. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative.h | 14 ++++++++++++++ arch/x86/include/asm/nospec-branch.h | 6 ++++++ arch/x86/kernel/alternative.c | 17 ++++++++++++++++- arch/x86/kernel/ftrace.c | 2 +- arch/x86/kernel/static_call.c | 2 +- arch/x86/kernel/vmlinux.lds.S | 4 ++++ arch/x86/lib/retpoline.S | 13 ++++++++++++- arch/x86/net/bpf_jit_comp.c | 2 +- 8 files changed, 55 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 9542c582d546..211293946085 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -81,6 +81,20 @@ extern void apply_ibt_endbr(s32 *start, s32 *end); struct module; +#if defined(CONFIG_RETHUNK) && defined(CONFIG_OBJTOOL) +extern bool cpu_wants_rethunk(void); +extern bool cpu_wants_rethunk_at(void *addr); +#else +static __always_inline bool cpu_wants_rethunk(void) +{ + return false; +} +static __always_inline bool cpu_wants_rethunk_at(void *addr) +{ + return false; +} +#endif + #ifdef CONFIG_SMP extern void alternatives_smp_module_add(struct module *mod, char *name, void *locks, void *locks_end, diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index b5ad7f5f7318..cccd5072e09f 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -257,6 +257,12 @@ extern void __x86_return_thunk(void); static inline void __x86_return_thunk(void) {} #endif +#ifdef CONFIG_MITIGATION_ITS +extern void its_return_thunk(void); +#else +static inline void its_return_thunk(void) {} +#endif + extern void retbleed_return_thunk(void); extern void srso_return_thunk(void); extern void srso_alias_return_thunk(void); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 0cb30cefb8ed..d70129eea923 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -614,6 +614,21 @@ void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) #ifdef CONFIG_RETHUNK +bool cpu_wants_rethunk(void) +{ + return cpu_feature_enabled(X86_FEATURE_RETHUNK); +} + +bool cpu_wants_rethunk_at(void *addr) +{ + if (!cpu_feature_enabled(X86_FEATURE_RETHUNK)) + return false; + if (x86_return_thunk != its_return_thunk) + return true; + + return !((unsigned long)addr & 0x20); +} + /* * Rewrite the compiler generated return thunk tail-calls. * @@ -629,7 +644,7 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk_at(addr)) { if (x86_return_thunk == __x86_return_thunk) return -1; diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index ec51ce713dea..e73743701530 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -360,7 +360,7 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) goto fail; ip = trampoline + size; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(ip)) __text_gen_insn(ip, JMP32_INSN_OPCODE, ip, x86_return_thunk, JMP32_INSN_SIZE); else memcpy(ip, retq, sizeof(retq)); diff --git a/arch/x86/kernel/static_call.c b/arch/x86/kernel/static_call.c index ca8dfc0c9edb..93809a4dc793 100644 --- a/arch/x86/kernel/static_call.c +++ b/arch/x86/kernel/static_call.c @@ -79,7 +79,7 @@ static void __ref __static_call_transform(void *insn, enum insn_type type, break; case RET: - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) + if (cpu_wants_rethunk_at(insn)) code = text_gen_insn(JMP32_INSN_OPCODE, insn, x86_return_thunk); else code = &retinsn; diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 778ea89b737c..35aeec9d2267 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -534,6 +534,10 @@ INIT_PER_CPU(irq_stack_backing_store); . = ASSERT(__x86_indirect_its_thunk_array == __x86_indirect_its_thunk_rax, "Gap in ITS thunk array"); #endif +#if defined(CONFIG_MITIGATION_ITS) && !defined(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B) +. = ASSERT(its_return_thunk & 0x20, "its_return_thunk not in second half of cacheline"); +#endif + #endif /* CONFIG_X86_64 */ #ifdef CONFIG_KEXEC_CORE diff --git a/arch/x86/lib/retpoline.S b/arch/x86/lib/retpoline.S index be948f44fc24..e42661500094 100644 --- a/arch/x86/lib/retpoline.S +++ b/arch/x86/lib/retpoline.S @@ -284,7 +284,18 @@ SYM_CODE_START(__x86_indirect_its_thunk_array) .align 64, 0xcc SYM_CODE_END(__x86_indirect_its_thunk_array) -#endif +.align 64, 0xcc +.skip 32, 0xcc +SYM_CODE_START(its_return_thunk) + UNWIND_HINT_FUNC + ANNOTATE_NOENDBR + ANNOTATE_UNRET_SAFE + ret + int3 +SYM_CODE_END(its_return_thunk) +EXPORT_SYMBOL(its_return_thunk) + +#endif /* CONFIG_MITIGATION_ITS */ SYM_CODE_START(__x86_return_thunk) UNWIND_HINT_FUNC diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index c73d85255bbc..b5fd521a7fa0 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -487,7 +487,7 @@ static void emit_return(u8 **pprog, u8 *ip) { u8 *prog = *pprog; - if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) { + if (cpu_wants_rethunk()) { emit_jump(&prog, x86_return_thunk, ip); } else { EMIT1(0xC3); /* ret */ From b1701fee52d1d68d1e12ad140b3551435a8e9fe7 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 21 Jun 2024 20:23:23 -0700 Subject: [PATCH 088/529] x86/its: Enable Indirect Target Selection mitigation commit f4818881c47fd91fcb6d62373c57c7844e3de1c0 upstream. Indirect Target Selection (ITS) is a bug in some pre-ADL Intel CPUs with eIBRS. It affects prediction of indirect branch and RETs in the lower half of cacheline. Due to ITS such branches may get wrongly predicted to a target of (direct or indirect) branch that is located in the upper half of the cacheline. Scope of impact =============== Guest/host isolation -------------------- When eIBRS is used for guest/host isolation, the indirect branches in the VMM may still be predicted with targets corresponding to branches in the guest. Intra-mode ---------- cBPF or other native gadgets can be used for intra-mode training and disclosure using ITS. User/kernel isolation --------------------- When eIBRS is enabled user/kernel isolation is not impacted. Indirect Branch Prediction Barrier (IBPB) ----------------------------------------- After an IBPB, indirect branches may be predicted with targets corresponding to direct branches which were executed prior to IBPB. This is mitigated by a microcode update. Add cmdline parameter indirect_target_selection=off|on|force to control the mitigation to relocate the affected branches to an ITS-safe thunk i.e. located in the upper half of cacheline. Also add the sysfs reporting. When retpoline mitigation is deployed, ITS safe-thunks are not needed, because retpoline sequence is already ITS-safe. Similarly, when call depth tracking (CDT) mitigation is deployed (retbleed=stuff), ITS safe return thunk is not used, as CDT prevents RSB-underflow. To not overcomplicate things, ITS mitigation is not supported with spectre-v2 lfence;jmp mitigation. Moreover, it is less practical to deploy lfence;jmp mitigation on ITS affected parts anyways. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- .../ABI/testing/sysfs-devices-system-cpu | 1 + .../admin-guide/kernel-parameters.txt | 13 ++ arch/x86/kernel/cpu/bugs.c | 128 +++++++++++++++++- drivers/base/cpu.c | 8 ++ include/linux/cpu.h | 2 + 5 files changed, 149 insertions(+), 3 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index 78c26280c473..1468609052c7 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -514,6 +514,7 @@ Description: information about CPUs heterogeneity. What: /sys/devices/system/cpu/vulnerabilities /sys/devices/system/cpu/vulnerabilities/gather_data_sampling + /sys/devices/system/cpu/vulnerabilities/indirect_target_selection /sys/devices/system/cpu/vulnerabilities/itlb_multihit /sys/devices/system/cpu/vulnerabilities/l1tf /sys/devices/system/cpu/vulnerabilities/mds diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 18c8fc60db93..6384244080d4 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2025,6 +2025,18 @@ different crypto accelerators. This option can be used to achieve best performance for particular HW. + indirect_target_selection= [X86,Intel] Mitigation control for Indirect + Target Selection(ITS) bug in Intel CPUs. Updated + microcode is also required for a fix in IBPB. + + on: Enable mitigation (default). + off: Disable mitigation. + force: Force the ITS bug and deploy default + mitigation. + + For details see: + Documentation/admin-guide/hw-vuln/indirect-target-selection.rst + init= [KNL] Format: Run specified binary instead of /sbin/init as init @@ -3263,6 +3275,7 @@ expose users to several CPU vulnerabilities. Equivalent to: if nokaslr then kpti=0 [ARM64] gather_data_sampling=off [X86] + indirect_target_selection=off [X86] kvm.nx_huge_pages=off [X86] l1tf=off [X86] mds=off [X86] diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index b22f72a177ba..30fbe241da42 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -48,6 +48,7 @@ static void __init srbds_select_mitigation(void); static void __init l1d_flush_select_mitigation(void); static void __init gds_select_mitigation(void); static void __init srso_select_mitigation(void); +static void __init its_select_mitigation(void); /* The base value of the SPEC_CTRL MSR without task-specific bits set */ u64 x86_spec_ctrl_base; @@ -66,6 +67,14 @@ static DEFINE_MUTEX(spec_ctrl_mutex); void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk; +static void __init set_return_thunk(void *thunk) +{ + if (x86_return_thunk != __x86_return_thunk) + pr_warn("x86/bugs: return thunk changed\n"); + + x86_return_thunk = thunk; +} + /* Update SPEC_CTRL MSR and its cached copy unconditionally */ static void update_spec_ctrl(u64 val) { @@ -174,6 +183,7 @@ void __init cpu_select_mitigations(void) */ srso_select_mitigation(); gds_select_mitigation(); + its_select_mitigation(); } /* @@ -1081,7 +1091,7 @@ do_cmd_auto: setup_force_cpu_cap(X86_FEATURE_UNRET); if (IS_ENABLED(CONFIG_RETHUNK)) - x86_return_thunk = retbleed_return_thunk; + set_return_thunk(retbleed_return_thunk); if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD && boot_cpu_data.x86_vendor != X86_VENDOR_HYGON) @@ -1142,6 +1152,105 @@ do_cmd_auto: pr_info("%s\n", retbleed_strings[retbleed_mitigation]); } +#undef pr_fmt +#define pr_fmt(fmt) "ITS: " fmt + +enum its_mitigation_cmd { + ITS_CMD_OFF, + ITS_CMD_ON, +}; + +enum its_mitigation { + ITS_MITIGATION_OFF, + ITS_MITIGATION_ALIGNED_THUNKS, +}; + +static const char * const its_strings[] = { + [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", +}; + +static enum its_mitigation its_mitigation __ro_after_init = ITS_MITIGATION_ALIGNED_THUNKS; + +static enum its_mitigation_cmd its_cmd __ro_after_init = + IS_ENABLED(CONFIG_MITIGATION_ITS) ? ITS_CMD_ON : ITS_CMD_OFF; + +static int __init its_parse_cmdline(char *str) +{ + if (!str) + return -EINVAL; + + if (!IS_ENABLED(CONFIG_MITIGATION_ITS)) { + pr_err("Mitigation disabled at compile time, ignoring option (%s)", str); + return 0; + } + + if (!strcmp(str, "off")) { + its_cmd = ITS_CMD_OFF; + } else if (!strcmp(str, "on")) { + its_cmd = ITS_CMD_ON; + } else if (!strcmp(str, "force")) { + its_cmd = ITS_CMD_ON; + setup_force_cpu_bug(X86_BUG_ITS); + } else { + pr_err("Ignoring unknown indirect_target_selection option (%s).", str); + } + + return 0; +} +early_param("indirect_target_selection", its_parse_cmdline); + +static void __init its_select_mitigation(void) +{ + enum its_mitigation_cmd cmd = its_cmd; + + if (!boot_cpu_has_bug(X86_BUG_ITS) || cpu_mitigations_off()) { + its_mitigation = ITS_MITIGATION_OFF; + return; + } + + /* Exit early to avoid irrelevant warnings */ + if (cmd == ITS_CMD_OFF) { + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (spectre_v2_enabled == SPECTRE_V2_NONE) { + pr_err("WARNING: Spectre-v2 mitigation is off, disabling ITS\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (!IS_ENABLED(CONFIG_RETPOLINE) || !IS_ENABLED(CONFIG_RETHUNK)) { + pr_err("WARNING: ITS mitigation depends on retpoline and rethunk support\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (IS_ENABLED(CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B)) { + pr_err("WARNING: ITS mitigation is not compatible with CONFIG_DEBUG_FORCE_FUNCTION_ALIGN_64B\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + if (boot_cpu_has(X86_FEATURE_RETPOLINE_LFENCE)) { + pr_err("WARNING: ITS mitigation is not compatible with lfence mitigation\n"); + its_mitigation = ITS_MITIGATION_OFF; + goto out; + } + + switch (cmd) { + case ITS_CMD_OFF: + its_mitigation = ITS_MITIGATION_OFF; + break; + case ITS_CMD_ON: + its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; + if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) + setup_force_cpu_cap(X86_FEATURE_INDIRECT_THUNK_ITS); + setup_force_cpu_cap(X86_FEATURE_RETHUNK); + set_return_thunk(its_return_thunk); + break; + } +out: + pr_info("%s\n", its_strings[its_mitigation]); +} + #undef pr_fmt #define pr_fmt(fmt) "Spectre V2 : " fmt @@ -2592,10 +2701,10 @@ static void __init srso_select_mitigation(void) if (boot_cpu_data.x86 == 0x19) { setup_force_cpu_cap(X86_FEATURE_SRSO_ALIAS); - x86_return_thunk = srso_alias_return_thunk; + set_return_thunk(srso_alias_return_thunk); } else { setup_force_cpu_cap(X86_FEATURE_SRSO); - x86_return_thunk = srso_return_thunk; + set_return_thunk(srso_return_thunk); } srso_mitigation = SRSO_MITIGATION_SAFE_RET; } else { @@ -2775,6 +2884,11 @@ static ssize_t rfds_show_state(char *buf) return sysfs_emit(buf, "%s\n", rfds_strings[rfds_mitigation]); } +static ssize_t its_show_state(char *buf) +{ + return sysfs_emit(buf, "%s\n", its_strings[its_mitigation]); +} + static char *stibp_state(void) { if (spectre_v2_in_eibrs_mode(spectre_v2_enabled) && @@ -2959,6 +3073,9 @@ static ssize_t cpu_show_common(struct device *dev, struct device_attribute *attr case X86_BUG_RFDS: return rfds_show_state(buf); + case X86_BUG_ITS: + return its_show_state(buf); + default: break; } @@ -3038,4 +3155,9 @@ ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attrib { return cpu_show_common(dev, attr, buf, X86_BUG_RFDS); } + +ssize_t cpu_show_indirect_target_selection(struct device *dev, struct device_attribute *attr, char *buf) +{ + return cpu_show_common(dev, attr, buf, X86_BUG_ITS); +} #endif diff --git a/drivers/base/cpu.c b/drivers/base/cpu.c index 31da94afe4f3..27aff2503765 100644 --- a/drivers/base/cpu.c +++ b/drivers/base/cpu.c @@ -595,6 +595,12 @@ ssize_t __weak cpu_show_reg_file_data_sampling(struct device *dev, return sysfs_emit(buf, "Not affected\n"); } +ssize_t __weak cpu_show_indirect_target_selection(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sysfs_emit(buf, "Not affected\n"); +} + static DEVICE_ATTR(meltdown, 0444, cpu_show_meltdown, NULL); static DEVICE_ATTR(spectre_v1, 0444, cpu_show_spectre_v1, NULL); static DEVICE_ATTR(spectre_v2, 0444, cpu_show_spectre_v2, NULL); @@ -609,6 +615,7 @@ static DEVICE_ATTR(retbleed, 0444, cpu_show_retbleed, NULL); static DEVICE_ATTR(gather_data_sampling, 0444, cpu_show_gds, NULL); static DEVICE_ATTR(spec_rstack_overflow, 0444, cpu_show_spec_rstack_overflow, NULL); static DEVICE_ATTR(reg_file_data_sampling, 0444, cpu_show_reg_file_data_sampling, NULL); +static DEVICE_ATTR(indirect_target_selection, 0444, cpu_show_indirect_target_selection, NULL); static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_meltdown.attr, @@ -625,6 +632,7 @@ static struct attribute *cpu_root_vulnerabilities_attrs[] = { &dev_attr_gather_data_sampling.attr, &dev_attr_spec_rstack_overflow.attr, &dev_attr_reg_file_data_sampling.attr, + &dev_attr_indirect_target_selection.attr, NULL }; diff --git a/include/linux/cpu.h b/include/linux/cpu.h index 4b06b1f1e267..186e0e0f2e40 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -76,6 +76,8 @@ extern ssize_t cpu_show_gds(struct device *dev, struct device_attribute *attr, char *buf); extern ssize_t cpu_show_reg_file_data_sampling(struct device *dev, struct device_attribute *attr, char *buf); +extern ssize_t cpu_show_indirect_target_selection(struct device *dev, + struct device_attribute *attr, char *buf); extern __printf(4, 5) struct device *cpu_device_create(struct device *parent, void *drvdata, From 139c0b8318c2ea8afd2f12b6401466c6010286da Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Mon, 18 Nov 2024 09:53:12 -0800 Subject: [PATCH 089/529] x86/its: Add "vmexit" option to skip mitigation on some CPUs commit 2665281a07e19550944e8354a2024635a7b2714a upstream. Ice Lake generation CPUs are not affected by guest/host isolation part of ITS. If a user is only concerned about KVM guests, they can now choose a new cmdline option "vmexit" that will not deploy the ITS mitigation when CPU is not affected by guest/host isolation. This saves the performance overhead of ITS mitigation on Ice Lake gen CPUs. When "vmexit" option selected, if the CPU is affected by ITS guest/host isolation, the default ITS mitigation is deployed. Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Josh Poimboeuf Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- .../admin-guide/kernel-parameters.txt | 2 ++ arch/x86/include/asm/cpufeatures.h | 1 + arch/x86/kernel/cpu/bugs.c | 11 +++++++++++ arch/x86/kernel/cpu/common.c | 19 ++++++++++++------- 4 files changed, 26 insertions(+), 7 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6384244080d4..6938c8cd7a6f 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -2033,6 +2033,8 @@ off: Disable mitigation. force: Force the ITS bug and deploy default mitigation. + vmexit: Only deploy mitigation if CPU is affected by + guest/host isolation part of ITS. For details see: Documentation/admin-guide/hw-vuln/indirect-target-selection.rst diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index fae83e82fe38..28edef597282 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -497,4 +497,5 @@ #define X86_BUG_BHI X86_BUG(1*32 + 3) /* CPU is affected by Branch History Injection */ #define X86_BUG_IBPB_NO_RET X86_BUG(1*32 + 4) /* "ibpb_no_ret" IBPB omits return target predictions */ #define X86_BUG_ITS X86_BUG(1*32 + 5) /* CPU is affected by Indirect Target Selection */ +#define X86_BUG_ITS_NATIVE_ONLY X86_BUG(1*32 + 6) /* CPU is affected by ITS, VMX is not affected */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 30fbe241da42..766cee7fa905 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1158,15 +1158,18 @@ do_cmd_auto: enum its_mitigation_cmd { ITS_CMD_OFF, ITS_CMD_ON, + ITS_CMD_VMEXIT, }; enum its_mitigation { ITS_MITIGATION_OFF, + ITS_MITIGATION_VMEXIT_ONLY, ITS_MITIGATION_ALIGNED_THUNKS, }; static const char * const its_strings[] = { [ITS_MITIGATION_OFF] = "Vulnerable", + [ITS_MITIGATION_VMEXIT_ONLY] = "Mitigation: Vulnerable, KVM: Not affected", [ITS_MITIGATION_ALIGNED_THUNKS] = "Mitigation: Aligned branch/return thunks", }; @@ -1192,6 +1195,8 @@ static int __init its_parse_cmdline(char *str) } else if (!strcmp(str, "force")) { its_cmd = ITS_CMD_ON; setup_force_cpu_bug(X86_BUG_ITS); + } else if (!strcmp(str, "vmexit")) { + its_cmd = ITS_CMD_VMEXIT; } else { pr_err("Ignoring unknown indirect_target_selection option (%s).", str); } @@ -1239,6 +1244,12 @@ static void __init its_select_mitigation(void) case ITS_CMD_OFF: its_mitigation = ITS_MITIGATION_OFF; break; + case ITS_CMD_VMEXIT: + if (boot_cpu_has_bug(X86_BUG_ITS_NATIVE_ONLY)) { + its_mitigation = ITS_MITIGATION_VMEXIT_ONLY; + goto out; + } + fallthrough; case ITS_CMD_ON: its_mitigation = ITS_MITIGATION_ALIGNED_THUNKS; if (!boot_cpu_has(X86_FEATURE_RETPOLINE)) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 491069bfed42..48cc1612df49 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1253,6 +1253,8 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { #define RFDS BIT(7) /* CPU is affected by Indirect Target Selection */ #define ITS BIT(8) +/* CPU is affected by Indirect Target Selection, but guest-host isolation is not affected */ +#define ITS_NATIVE_ONLY BIT(9) static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(IVYBRIDGE, X86_STEPPING_ANY, SRBDS), @@ -1273,16 +1275,16 @@ static const struct x86_cpu_id cpu_vuln_blacklist[] __initconst = { VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPINGS(0x0, 0xc), MMIO | RETBLEED | GDS | SRBDS), VULNBL_INTEL_STEPPINGS(KABYLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | SRBDS | ITS), VULNBL_INTEL_STEPPINGS(CANNONLAKE_L, X86_STEPPING_ANY, RETBLEED), - VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS | ITS), - VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS | ITS), + VULNBL_INTEL_STEPPINGS(ICELAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPPINGS(ICELAKE_D, X86_STEPPING_ANY, MMIO | GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPPINGS(ICELAKE_X, X86_STEPPING_ANY, MMIO | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPPINGS(COMETLAKE, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPINGS(0x0, 0x0), MMIO | RETBLEED | ITS), VULNBL_INTEL_STEPPINGS(COMETLAKE_L, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED | GDS | ITS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS | ITS), - VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS | ITS), + VULNBL_INTEL_STEPPINGS(TIGERLAKE_L, X86_STEPPING_ANY, GDS | ITS | ITS_NATIVE_ONLY), + VULNBL_INTEL_STEPPINGS(TIGERLAKE, X86_STEPPING_ANY, GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPPINGS(LAKEFIELD, X86_STEPPING_ANY, MMIO | MMIO_SBDS | RETBLEED), - VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | ITS), + VULNBL_INTEL_STEPPINGS(ROCKETLAKE, X86_STEPPING_ANY, MMIO | RETBLEED | GDS | ITS | ITS_NATIVE_ONLY), VULNBL_INTEL_STEPPINGS(ALDERLAKE, X86_STEPPING_ANY, RFDS), VULNBL_INTEL_STEPPINGS(ALDERLAKE_L, X86_STEPPING_ANY, RFDS), VULNBL_INTEL_STEPPINGS(RAPTORLAKE, X86_STEPPING_ANY, RFDS), @@ -1499,8 +1501,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) if (cpu_has(c, X86_FEATURE_AMD_IBPB) && !cpu_has(c, X86_FEATURE_AMD_IBPB_RET)) setup_force_cpu_bug(X86_BUG_IBPB_NO_RET); - if (vulnerable_to_its(x86_arch_cap_msr)) + if (vulnerable_to_its(x86_arch_cap_msr)) { setup_force_cpu_bug(X86_BUG_ITS); + if (cpu_matches(cpu_vuln_blacklist, ITS_NATIVE_ONLY)) + setup_force_cpu_bug(X86_BUG_ITS_NATIVE_ONLY); + } if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; From e6da4a83e3689e963f115c0a8cff5da6f5b51256 Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Fri, 2 May 2025 06:25:19 -0700 Subject: [PATCH 090/529] x86/its: Align RETs in BHB clear sequence to avoid thunking commit f0cd7091cc5a032c8870b4285305d9172569d126 upstream. The software mitigation for BHI is to execute BHB clear sequence at syscall entry, and possibly after a cBPF program. ITS mitigation thunks RETs in the lower half of the cacheline. This causes the RETs in the BHB clear sequence to be thunked as well, adding unnecessary branches to the BHB clear sequence. Since the sequence is in hot path, align the RET instructions in the sequence to avoid thunking. This is how disassembly clear_bhb_loop() looks like after this change: 0x44 <+4>: mov $0x5,%ecx 0x49 <+9>: call 0xffffffff81001d9b 0x4e <+14>: jmp 0xffffffff81001de5 0x53 <+19>: int3 ... 0x9b <+91>: call 0xffffffff81001dce 0xa0 <+96>: ret 0xa1 <+97>: int3 ... 0xce <+142>: mov $0x5,%eax 0xd3 <+147>: jmp 0xffffffff81001dd6 0xd5 <+149>: nop 0xd6 <+150>: sub $0x1,%eax 0xd9 <+153>: jne 0xffffffff81001dd3 0xdb <+155>: sub $0x1,%ecx 0xde <+158>: jne 0xffffffff81001d9b 0xe0 <+160>: ret 0xe1 <+161>: int3 0xe2 <+162>: int3 0xe3 <+163>: int3 0xe4 <+164>: int3 0xe5 <+165>: lfence 0xe8 <+168>: pop %rbp 0xe9 <+169>: ret Suggested-by: Andrew Cooper Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a114338380a6..4f5968810450 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1559,7 +1559,9 @@ SYM_CODE_END(rewind_stack_and_make_dead) * ORC to unwind properly. * * The alignment is for performance and not for safety, and may be safely - * refactored in the future if needed. + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. */ SYM_FUNC_START(clear_bhb_loop) push %rbp @@ -1569,10 +1571,22 @@ SYM_FUNC_START(clear_bhb_loop) call 1f jmp 5f .align 64, 0xcc + /* + * Shift instructions so that the RET is in the upper half of the + * cacheline and don't take the slowpath to its_return_thunk. + */ + .skip 32 - (.Lret1 - 1f), 0xcc ANNOTATE_INTRA_FUNCTION_CALL 1: call 2f - RET +.Lret1: RET .align 64, 0xcc + /* + * As above shift instructions for RET at .Lret2 as well. + * + * This should be ideally be: .skip 32 - (.Lret2 - 2f), 0xcc + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc 2: movl $5, %eax 3: jmp 4f nop @@ -1580,7 +1594,7 @@ SYM_FUNC_START(clear_bhb_loop) jnz 3b sub $1, %ecx jnz 1b - RET +.Lret2: RET 5: lfence pop %rbp RET From 9502e83c22bef5ecfb2386aaa837d8afcf1fcece Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Sat, 3 May 2025 09:46:31 -0700 Subject: [PATCH 091/529] x86/ibt: Keep IBT disabled during alternative patching commit ebebe30794d38c51f71fe4951ba6af4159d9837d upstream. cfi_rewrite_callers() updates the fineIBT hash matching at the caller side, but except for paranoid-mode it relies on apply_retpoline() and friends for any ENDBR relocation. This could temporarily cause an indirect branch to land on a poisoned ENDBR. For instance, with para-virtualization enabled, a simple wrmsrl() could have an indirect branch pointing to native_write_msr() who's ENDBR has been relocated due to fineIBT: : push %rbp mov %rsp,%rbp mov %esi,%eax mov %rsi,%rdx shr $0x20,%rdx mov %edi,%edi mov %rax,%rsi call *0x21e65d0(%rip) # ^^^^^^^^^^^^^^^^^^^^^^^ Such an indirect call during the alternative patching could #CP if the caller is not *yet* adjusted for the new target ENDBR. To prevent a false #CP, keep CET-IBT disabled until all callers are patched. Patching during the module load does not need to be guarded by IBT-disable because the module code is not executed until the patching is complete. [ pawan: Since apply_paravirt() happens before __apply_fineibt() relocates the ENDBR, pv_ops in the example above is not relevant. It is still safer to keep this commit because missing an ENDBR means an oops. ] Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/alternative.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index d70129eea923..f46db81b55b0 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -30,6 +30,7 @@ #include #include #include +#include int __read_mostly alternatives_patched; @@ -1006,6 +1007,8 @@ static noinline void __init int3_selftest(void) void __init alternative_instructions(void) { + u64 ibt; + int3_selftest(); /* @@ -1043,6 +1046,9 @@ void __init alternative_instructions(void) */ paravirt_set_cap(); + /* Keep CET-IBT disabled until caller/callee are patched */ + ibt = ibt_save(); + /* * First patch paravirt functions, such that we overwrite the indirect * call with the direct call. @@ -1064,6 +1070,8 @@ void __init alternative_instructions(void) apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end); + ibt_restore(ibt); + #ifdef CONFIG_SMP /* Patch to UP if other cpus not imminent. */ if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) { From 959cadf09dbae7b304f03e039b8d8e13c529e2dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Oct 2024 10:05:48 -0700 Subject: [PATCH 092/529] x86/its: Use dynamic thunks for indirect branches commit 872df34d7c51a79523820ea6a14860398c639b87 upstream. ITS mitigation moves the unsafe indirect branches to a safe thunk. This could degrade the prediction accuracy as the source address of indirect branches becomes same for different execution paths. To improve the predictions, and hence the performance, assign a separate thunk for each indirect callsite. This is also a defense-in-depth measure to avoid indirect branches aliasing with each other. As an example, 5000 dynamic thunks would utilize around 16 bits of the address space, thereby gaining entropy. For a BTB that uses 32 bits for indexing, dynamic thunks could provide better prediction accuracy over fixed thunks. Have ITS thunks be variable sized and use EXECMEM_MODULE_TEXT such that they are both more flexible (got to extend them later) and live in 2M TLBs, just like kernel code, avoiding undue TLB pressure. [ pawan: CONFIG_EXECMEM and CONFIG_EXECMEM_ROX are not supported on backport kernel, made changes to use module_alloc() and set_memory_*() for dynamic thunks. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative.h | 10 +++ arch/x86/kernel/alternative.c | 130 ++++++++++++++++++++++++++++- arch/x86/kernel/module.c | 7 ++ include/linux/module.h | 5 ++ 4 files changed, 149 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 211293946085..4463cf38c169 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -81,6 +81,16 @@ extern void apply_ibt_endbr(s32 *start, s32 *end); struct module; +#ifdef CONFIG_MITIGATION_ITS +extern void its_init_mod(struct module *mod); +extern void its_fini_mod(struct module *mod); +extern void its_free_mod(struct module *mod); +#else /* CONFIG_MITIGATION_ITS */ +static inline void its_init_mod(struct module *mod) { } +static inline void its_fini_mod(struct module *mod) { } +static inline void its_free_mod(struct module *mod) { } +#endif + #if defined(CONFIG_RETHUNK) && defined(CONFIG_OBJTOOL) extern bool cpu_wants_rethunk(void); extern bool cpu_wants_rethunk_at(void *addr); diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index f46db81b55b0..44bff34f7d10 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -31,6 +33,7 @@ #include #include #include +#include int __read_mostly alternatives_patched; @@ -399,6 +402,123 @@ static int emit_indirect(int op, int reg, u8 *bytes) #ifdef CONFIG_MITIGATION_ITS +static struct module *its_mod; +static void *its_page; +static unsigned int its_offset; + +/* Initialize a thunk with the "jmp *reg; int3" instructions. */ +static void *its_init_thunk(void *thunk, int reg) +{ + u8 *bytes = thunk; + int i = 0; + + if (reg >= 8) { + bytes[i++] = 0x41; /* REX.B prefix */ + reg -= 8; + } + bytes[i++] = 0xff; + bytes[i++] = 0xe0 + reg; /* jmp *reg */ + bytes[i++] = 0xcc; + + return thunk; +} + +void its_init_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + mutex_lock(&text_mutex); + its_mod = mod; + its_page = NULL; +} + +void its_fini_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + WARN_ON_ONCE(its_mod != mod); + + its_mod = NULL; + its_page = NULL; + mutex_unlock(&text_mutex); + + for (int i = 0; i < mod->its_num_pages; i++) { + void *page = mod->its_page_array[i]; + set_memory_ro((unsigned long)page, 1); + set_memory_x((unsigned long)page, 1); + } +} + +void its_free_mod(struct module *mod) +{ + if (!cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) + return; + + for (int i = 0; i < mod->its_num_pages; i++) { + void *page = mod->its_page_array[i]; + module_memfree(page); + } + kfree(mod->its_page_array); +} + +DEFINE_FREE(its_execmem, void *, if (_T) module_memfree(_T)); + +static void *its_alloc(void) +{ + void *page __free(its_execmem) = module_alloc(PAGE_SIZE); + + if (!page) + return NULL; + + if (its_mod) { + void *tmp = krealloc(its_mod->its_page_array, + (its_mod->its_num_pages+1) * sizeof(void *), + GFP_KERNEL); + if (!tmp) + return NULL; + + its_mod->its_page_array = tmp; + its_mod->its_page_array[its_mod->its_num_pages++] = page; + } + + return no_free_ptr(page); +} + +static void *its_allocate_thunk(int reg) +{ + int size = 3 + (reg / 8); + void *thunk; + + if (!its_page || (its_offset + size - 1) >= PAGE_SIZE) { + its_page = its_alloc(); + if (!its_page) { + pr_err("ITS page allocation failed\n"); + return NULL; + } + memset(its_page, INT3_INSN_OPCODE, PAGE_SIZE); + its_offset = 32; + } + + /* + * If the indirect branch instruction will be in the lower half + * of a cacheline, then update the offset to reach the upper half. + */ + if ((its_offset + size - 1) % 64 < 32) + its_offset = ((its_offset - 1) | 0x3F) + 33; + + thunk = its_page + its_offset; + its_offset += size; + + set_memory_rw((unsigned long)its_page, 1); + thunk = its_init_thunk(thunk, reg); + set_memory_ro((unsigned long)its_page, 1); + set_memory_x((unsigned long)its_page, 1); + + return thunk; +} + static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, void *call_dest, void *jmp_dest) { @@ -446,9 +566,13 @@ clang_jcc: static int emit_its_trampoline(void *addr, struct insn *insn, int reg, u8 *bytes) { - return __emit_trampoline(addr, insn, bytes, - __x86_indirect_its_thunk_array[reg], - __x86_indirect_its_thunk_array[reg]); + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + u8 *tmp = its_allocate_thunk(reg); + + if (tmp) + thunk = tmp; + + return __emit_trampoline(addr, insn, bytes, thunk, thunk); } /* Check if an indirect branch is at ITS-unsafe address */ diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index c032edcd3d95..7728060b640c 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -285,10 +285,16 @@ int module_finalize(const Elf_Ehdr *hdr, void *pseg = (void *)para->sh_addr; apply_paravirt(pseg, pseg + para->sh_size); } + + its_init_mod(me); + if (retpolines) { void *rseg = (void *)retpolines->sh_addr; apply_retpolines(rseg, rseg + retpolines->sh_size); } + + its_fini_mod(me); + if (returns) { void *rseg = (void *)returns->sh_addr; apply_returns(rseg, rseg + returns->sh_size); @@ -320,4 +326,5 @@ int module_finalize(const Elf_Ehdr *hdr, void module_arch_cleanup(struct module *mod) { alternatives_smp_module_del(mod); + its_free_mod(mod); } diff --git a/include/linux/module.h b/include/linux/module.h index 35876e89eb93..41c4c472ed17 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -536,6 +536,11 @@ struct module { atomic_t refcnt; #endif +#ifdef CONFIG_MITIGATION_ITS + int its_num_pages; + void **its_page_array; +#endif + #ifdef CONFIG_CONSTRUCTORS /* Constructor functions. */ ctor_fn_t *ctors; From c2bece04baf819b9614a35852e4573ee402e059d Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 12 May 2025 19:58:39 -0700 Subject: [PATCH 093/529] x86/its: Fix build errors when CONFIG_MODULES=n commit 9f35e33144ae5377d6a8de86dd3bd4d995c6ac65 upstream. Fix several build errors when CONFIG_MODULES=n, including the following: ../arch/x86/kernel/alternative.c:195:25: error: incomplete definition of type 'struct module' 195 | for (int i = 0; i < mod->its_num_pages; i++) { Fixes: 872df34d7c51 ("x86/its: Use dynamic thunks for indirect branches") Cc: stable@vger.kernel.org Signed-off-by: Eric Biggers Acked-by: Dave Hansen Tested-by: Steven Rostedt (Google) Reviewed-by: Alexandre Chartre Signed-off-by: Linus Torvalds [ pawan: backport: Bring ITS dynamic thunk code under CONFIG_MODULES ] Signed-off-by: Pawan Gupta Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/alternative.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 44bff34f7d10..acf6fce287dc 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -402,6 +402,7 @@ static int emit_indirect(int op, int reg, u8 *bytes) #ifdef CONFIG_MITIGATION_ITS +#ifdef CONFIG_MODULES static struct module *its_mod; static void *its_page; static unsigned int its_offset; @@ -518,6 +519,14 @@ static void *its_allocate_thunk(int reg) return thunk; } +#else /* CONFIG_MODULES */ + +static void *its_allocate_thunk(int reg) +{ + return NULL; +} + +#endif /* CONFIG_MODULES */ static int __emit_trampoline(void *addr, struct insn *insn, u8 *bytes, void *call_dest, void *jmp_dest) From 7e00c01ff80fd32f3cdfbebc29ddd6af1c404282 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 12 May 2023 14:05:11 +0200 Subject: [PATCH 094/529] x86/alternative: Optimize returns patching commit d2408e043e7296017420aa5929b3bba4d5e61013 upstream. Instead of decoding each instruction in the return sites range only to realize that that return site is a jump to the default return thunk which is needed - X86_FEATURE_RETHUNK is enabled - lift that check before the loop and get rid of that loop overhead. Add comments about what gets patched, while at it. Signed-off-by: Borislav Petkov (AMD) Acked-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20230512120952.7924-1-bp@alien8.de Signed-off-by: Pawan Gupta Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/alternative.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index acf6fce287dc..b42f4740cb7b 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -778,13 +778,12 @@ static int patch_return(void *addr, struct insn *insn, u8 *bytes) { int i = 0; + /* Patch the custom return thunks... */ if (cpu_wants_rethunk_at(addr)) { - if (x86_return_thunk == __x86_return_thunk) - return -1; - i = JMP32_INSN_SIZE; __text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i); } else { + /* ... or patch them out if not needed. */ bytes[i++] = RET_INSN_OPCODE; } @@ -797,6 +796,14 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; + /* + * Do not patch out the default return thunks if those needed are the + * ones generated by the compiler. + */ + if (cpu_feature_enabled(X86_FEATURE_RETHUNK) && + (x86_return_thunk == __x86_return_thunk)) + return; + for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; From 73c71762fe985e50d5fb07b6dec325ebada7256f Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 4 Sep 2023 22:04:54 -0700 Subject: [PATCH 095/529] x86/alternatives: Remove faulty optimization commit 4ba89dd6ddeca2a733bdaed7c9a5cbe4e19d9124 upstream. The following commit 095b8303f383 ("x86/alternative: Make custom return thunk unconditional") made '__x86_return_thunk' a placeholder value. All code setting X86_FEATURE_RETHUNK also changes the value of 'x86_return_thunk'. So the optimization at the beginning of apply_returns() is dead code. Also, before the above-mentioned commit, the optimization actually had a bug It bypassed __static_call_fixup(), causing some raw returns to remain unpatched in static call trampolines. Thus the 'Fixes' tag. Fixes: d2408e043e72 ("x86/alternative: Optimize returns patching") Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Signed-off-by: Borislav Petkov (AMD) Acked-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/16d19d2249d4485d8380fb215ffaae81e6b8119e.1693889988.git.jpoimboe@kernel.org Signed-off-by: Pawan Gupta Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/alternative.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index b42f4740cb7b..7d61f8c7ec38 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -796,14 +796,6 @@ void __init_or_module noinline apply_returns(s32 *start, s32 *end) { s32 *s; - /* - * Do not patch out the default return thunks if those needed are the - * ones generated by the compiler. - */ - if (cpu_feature_enabled(X86_FEATURE_RETHUNK) && - (x86_return_thunk == __x86_return_thunk)) - return; - for (s = start; s < end; s++) { void *dest = NULL, *addr = (void *)s + *s; struct insn insn; From 69afd82670d85120723f5d3a1443e910ec6f58a7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 23 Apr 2025 09:57:31 +0200 Subject: [PATCH 096/529] x86/its: FineIBT-paranoid vs ITS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit e52c1dc7455d32c8a55f9949d300e5e87d011fa6 upstream. FineIBT-paranoid was using the retpoline bytes for the paranoid check, disabling retpolines, because all parts that have IBT also have eIBRS and thus don't need no stinking retpolines. Except... ITS needs the retpolines for indirect calls must not be in the first half of a cacheline :-/ So what was the paranoid call sequence: : 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d a: 4d 8d 5b lea -0x10(%r11), %r11 e: 75 fd jne d 10: 41 ff d3 call *%r11 13: 90 nop Now becomes: : 0: 41 ba 78 56 34 12 mov $0x12345678, %r10d 6: 45 3b 53 f7 cmp -0x9(%r11), %r10d a: 4d 8d 5b f0 lea -0x10(%r11), %r11 e: 2e e8 XX XX XX XX cs call __x86_indirect_paranoid_thunk_r11 Where the paranoid_thunk looks like: 1d: (bad) __x86_indirect_paranoid_thunk_r11: 1e: 75 fd jne 1d __x86_indirect_its_thunk_r11: 20: 41 ff eb jmp *%r11 23: cc int3 [ dhansen: remove initialization to false ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Pawan Gupta Signed-off-by: Dave Hansen Reviewed-by: Alexandre Chartre [ Just a portion of the original commit, in order to fix a build issue in stable kernels due to backports ] Tested-by: Holger Hoffstätte Link: https://lore.kernel.org/r/20250514113952.GB16434@noisy.programming.kicks-ass.net Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative.h | 8 ++++++++ arch/x86/kernel/alternative.c | 8 ++++++++ arch/x86/net/bpf_jit_comp.c | 2 +- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 4463cf38c169..fd3b73017738 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -5,6 +5,7 @@ #include #include #include +#include #define ALTINSTR_FLAG_INV (1 << 15) #define ALT_NOT(feat) ((feat) | ALTINSTR_FLAG_INV) @@ -85,10 +86,17 @@ struct module; extern void its_init_mod(struct module *mod); extern void its_fini_mod(struct module *mod); extern void its_free_mod(struct module *mod); +extern u8 *its_static_thunk(int reg); #else /* CONFIG_MITIGATION_ITS */ static inline void its_init_mod(struct module *mod) { } static inline void its_fini_mod(struct module *mod) { } static inline void its_free_mod(struct module *mod) { } +static inline u8 *its_static_thunk(int reg) +{ + WARN_ONCE(1, "ITS not compiled in"); + + return NULL; +} #endif #if defined(CONFIG_RETHUNK) && defined(CONFIG_OBJTOOL) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 7d61f8c7ec38..3bc6be6404d2 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -596,6 +596,14 @@ static bool cpu_wants_indirect_its_thunk_at(unsigned long addr, int reg) /* Lower-half of the cacheline? */ return !(addr & 0x20); } + +u8 *its_static_thunk(int reg) +{ + u8 *thunk = __x86_indirect_its_thunk_array[reg]; + + return thunk; +} + #endif /* diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c index b5fd521a7fa0..f3068bb53c4d 100644 --- a/arch/x86/net/bpf_jit_comp.c +++ b/arch/x86/net/bpf_jit_comp.c @@ -467,7 +467,7 @@ static void emit_indirect_jump(u8 **pprog, int reg, u8 *ip) if (IS_ENABLED(CONFIG_MITIGATION_ITS) && cpu_feature_enabled(X86_FEATURE_INDIRECT_THUNK_ITS)) { OPTIMIZER_HIDE_VAR(reg); - emit_jump(&prog, &__x86_indirect_its_thunk_array[reg], ip); + emit_jump(&prog, its_static_thunk(reg), ip); } else if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) { EMIT_LFENCE(); EMIT2(0xFF, 0xE0 + reg); From 325285d9fc869c02400a697e6729a2994a81a1aa Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 18 May 2025 08:21:27 +0200 Subject: [PATCH 097/529] Linux 6.1.139 Link: https://lore.kernel.org/r/20250512172023.126467649@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Salvatore Bonaccorso Tested-by: Florian Fainelli Tested-by: Mark Brown Tested-by: Ron Economos Tested-by: Peter Schneider Tested-by: Linux Kernel Functional Testing Tested-by: Shuah Khan Tested-by: Hardik Garg Link: https://lore.kernel.org/r/20250514125614.705014741@linuxfoundation.org Tested-by: Jon Hunter Tested-by: Florian Fainelli Tested-by: Mark Brown Tested-by: Linux Kernel Functional Testing Tested-by: Peter Schneider Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d8d39fe5ef70..80748b7c3540 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 138 +SUBLEVEL = 139 EXTRAVERSION = NAME = Curry Ramen From e20878d4ebfe0eb981b591bcd022f7bb78e9fba7 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 18 Oct 2022 00:14:20 -0700 Subject: [PATCH 098/529] binfmt: Fix whitespace issues [ Upstream commit 8f6e3f9e5a0f58e458a348b7e36af11d0e9702af ] Fix the annoying whitespace issues that have been following these files around for years. Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20221018071350.never.230-kees@kernel.org Stable-dep-of: 11854fe263eb ("binfmt_elf: Move brk for static PIE even if ASLR disabled") Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 14 +++++++------- fs/binfmt_elf_fdpic.c | 2 +- fs/exec.c | 2 +- include/uapi/linux/elf.h | 2 +- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 89e7e4826efc..d1cf3d25da4b 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -248,7 +248,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, } while (0) #ifdef ARCH_DLINFO - /* + /* * ARCH_DLINFO must come first so PPC can do its special alignment of * AUXV. * update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT() in @@ -1021,7 +1021,7 @@ out_free_interp: executable_stack); if (retval < 0) goto out_free_dentry; - + elf_bss = 0; elf_brk = 0; @@ -1044,7 +1044,7 @@ out_free_interp: if (unlikely (elf_brk > elf_bss)) { unsigned long nbyte; - + /* There was a PT_LOAD segment with p_memsz > p_filesz before this one. Map anonymous pages, if needed, and clear the area. */ @@ -1522,7 +1522,7 @@ static void fill_elf_note_phdr(struct elf_phdr *phdr, int sz, loff_t offset) phdr->p_align = 0; } -static void fill_note(struct memelfnote *note, const char *name, int type, +static void fill_note(struct memelfnote *note, const char *name, int type, unsigned int sz, void *data) { note->name = name; @@ -2005,8 +2005,8 @@ static int elf_dump_thread_status(long signr, struct elf_thread_status *t) t->num_notes = 0; fill_prstatus(&t->prstatus.common, p, signr); - elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - + elf_core_copy_task_regs(p, &t->prstatus.pr_reg); + fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); t->num_notes++; @@ -2296,7 +2296,7 @@ static int elf_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!write_note_info(&info, cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c71a40927315..b2d3b6e43bb5 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1603,7 +1603,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!elf_core_write_extra_phdrs(cprm, offset)) goto end_coredump; - /* write out the notes section */ + /* write out the notes section */ if (!writenote(thread_list->notes, cprm)) goto end_coredump; if (!writenote(&psinfo_note, cprm)) diff --git a/fs/exec.c b/fs/exec.c index 2039414cc662..b65af8f9a4f9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -169,7 +169,7 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) exit: fput(file); out: - return error; + return error; } #endif /* #ifdef CONFIG_USELIB */ diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index c7b056af9ef0..3e5f07be26fc 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -91,7 +91,7 @@ typedef __s64 Elf64_Sxword; #define DT_INIT 12 #define DT_FINI 13 #define DT_SONAME 14 -#define DT_RPATH 15 +#define DT_RPATH 15 #define DT_SYMBOLIC 16 #define DT_REL 17 #define DT_RELSZ 18 From 86811e8778098529b92bc5873f409cf7b2fee468 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Thu, 28 Sep 2023 20:24:29 -0700 Subject: [PATCH 099/529] binfmt_elf: Support segments with 0 filesz and misaligned starts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 585a018627b4d7ed37387211f667916840b5c5ea ] Implement a helper elf_load() that wraps elf_map() and performs all of the necessary work to ensure that when "memsz > filesz" the bytes described by "memsz > filesz" are zeroed. An outstanding issue is if the first segment has filesz 0, and has a randomized location. But that is the same as today. In this change I replaced an open coded padzero() that did not clear all of the way to the end of the page, with padzero() that does. I also stopped checking the return of padzero() as there is at least one known case where testing for failure is the wrong thing to do. It looks like binfmt_elf_fdpic may have the proper set of tests for when error handling can be safely completed. I found a couple of commits in the old history https://git.kernel.org/pub/scm/linux/kernel/git/tglx/history.git, that look very interesting in understanding this code. commit 39b56d902bf3 ("[PATCH] binfmt_elf: clearing bss may fail") commit c6e2227e4a3e ("[SPARC64]: Missing user access return value checks in fs/binfmt_elf.c and fs/compat.c") commit 5bf3be033f50 ("v2.4.10.1 -> v2.4.10.2") Looking at commit 39b56d902bf3 ("[PATCH] binfmt_elf: clearing bss may fail"): > commit 39b56d902bf35241e7cba6cc30b828ed937175ad > Author: Pavel Machek > Date: Wed Feb 9 22:40:30 2005 -0800 > > [PATCH] binfmt_elf: clearing bss may fail > > So we discover that Borland's Kylix application builder emits weird elf > files which describe a non-writeable bss segment. > > So remove the clear_user() check at the place where we zero out the bss. I > don't _think_ there are any security implications here (plus we've never > checked that clear_user() return value, so whoops if it is a problem). > > Signed-off-by: Pavel Machek > Signed-off-by: Andrew Morton > Signed-off-by: Linus Torvalds It seems pretty clear that binfmt_elf_fdpic with skipping clear_user() for non-writable segments and otherwise calling clear_user(), aka padzero(), and checking it's return code is the right thing to do. I just skipped the error checking as that avoids breaking things. And notably, it looks like Borland's Kylix died in 2005 so it might be safe to just consider read-only segments with memsz > filesz an error. Reported-by: Sebastian Ott Reported-by: Thomas Weißschuh Closes: https://lkml.kernel.org/r/20230914-bss-alloc-v1-1-78de67d2c6dd@weissschuh.net Signed-off-by: "Eric W. Biederman" Link: https://lore.kernel.org/r/87sf71f123.fsf@email.froward.int.ebiederm.org Tested-by: Pedro Falcato Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/r/20230929032435.2391507-1-keescook@chromium.org Signed-off-by: Kees Cook Stable-dep-of: 11854fe263eb ("binfmt_elf: Move brk for static PIE even if ASLR disabled") Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 111 +++++++++++++++++++++--------------------------- 1 file changed, 48 insertions(+), 63 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d1cf3d25da4b..ea2968d343bb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -109,25 +109,6 @@ static struct linux_binfmt elf_format = { #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) -static int set_brk(unsigned long start, unsigned long end, int prot) -{ - start = ELF_PAGEALIGN(start); - end = ELF_PAGEALIGN(end); - if (end > start) { - /* - * Map the last of the bss segment. - * If the header is requesting these pages to be - * executable, honour that (ppc32 needs this). - */ - int error = vm_brk_flags(start, end - start, - prot & PROT_EXEC ? VM_EXEC : 0); - if (error) - return error; - } - current->mm->start_brk = current->mm->brk = end; - return 0; -} - /* We need to explicitly zero any fractional pages after the data section (i.e. bss). This would contain the junk from the file that should not @@ -401,6 +382,51 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } +static unsigned long elf_load(struct file *filep, unsigned long addr, + const struct elf_phdr *eppnt, int prot, int type, + unsigned long total_size) +{ + unsigned long zero_start, zero_end; + unsigned long map_addr; + + if (eppnt->p_filesz) { + map_addr = elf_map(filep, addr, eppnt, prot, type, total_size); + if (BAD_ADDR(map_addr)) + return map_addr; + if (eppnt->p_memsz > eppnt->p_filesz) { + zero_start = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_filesz; + zero_end = map_addr + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_memsz; + + /* Zero the end of the last mapped page */ + padzero(zero_start); + } + } else { + map_addr = zero_start = ELF_PAGESTART(addr); + zero_end = zero_start + ELF_PAGEOFFSET(eppnt->p_vaddr) + + eppnt->p_memsz; + } + if (eppnt->p_memsz > eppnt->p_filesz) { + /* + * Map the last of the segment. + * If the header is requesting these pages to be + * executable, honour that (ppc32 needs this). + */ + int error; + + zero_start = ELF_PAGEALIGN(zero_start); + zero_end = ELF_PAGEALIGN(zero_end); + + error = vm_brk_flags(zero_start, zero_end - zero_start, + prot & PROT_EXEC ? VM_EXEC : 0); + if (error) + map_addr = error; + } + return map_addr; +} + + static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { elf_addr_t min_addr = -1; @@ -830,7 +856,6 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_bss, elf_brk; - int bss_prot = 0; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1042,33 +1067,6 @@ out_free_interp: if (elf_ppnt->p_type != PT_LOAD) continue; - if (unlikely (elf_brk > elf_bss)) { - unsigned long nbyte; - - /* There was a PT_LOAD segment with p_memsz > p_filesz - before this one. Map anonymous pages, if needed, - and clear the area. */ - retval = set_brk(elf_bss + load_bias, - elf_brk + load_bias, - bss_prot); - if (retval) - goto out_free_dentry; - nbyte = ELF_PAGEOFFSET(elf_bss); - if (nbyte) { - nbyte = ELF_MIN_ALIGN - nbyte; - if (nbyte > elf_brk - elf_bss) - nbyte = elf_brk - elf_bss; - if (clear_user((void __user *)elf_bss + - load_bias, nbyte)) { - /* - * This bss-zeroing can fail if the ELF - * file specifies odd protections. So - * we don't check the return value - */ - } - } - } - elf_prot = make_prot(elf_ppnt->p_flags, &arch_state, !!interpreter, false); @@ -1164,7 +1162,7 @@ out_free_interp: } } - error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, + error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, total_size); if (BAD_ADDR(error)) { retval = IS_ERR((void *)error) ? @@ -1219,10 +1217,8 @@ out_free_interp: if (end_data < k) end_data = k; k = elf_ppnt->p_vaddr + elf_ppnt->p_memsz; - if (k > elf_brk) { - bss_prot = elf_prot; + if (k > elf_brk) elf_brk = k; - } } e_entry = elf_ex->e_entry + load_bias; @@ -1234,18 +1230,7 @@ out_free_interp: start_data += load_bias; end_data += load_bias; - /* Calling set_brk effectively mmaps the pages that we need - * for the bss and break sections. We must do this before - * mapping in the interpreter, to make sure it doesn't wind - * up getting placed where the bss needs to go. - */ - retval = set_brk(elf_bss, elf_brk, bss_prot); - if (retval) - goto out_free_dentry; - if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { - retval = -EFAULT; /* Nobody gets to see this, but.. */ - goto out_free_dentry; - } + current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, From 1707053766b48e1639bc3bbb6c0bdd49c33f366f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 28 Sep 2023 20:24:30 -0700 Subject: [PATCH 100/529] binfmt_elf: elf_bss no longer used by load_elf_binary() [ Upstream commit 8ed2ef21ff564cf4a25c098ace510ee6513c9836 ] With the BSS handled generically via the new filesz/memsz mismatch handling logic in elf_load(), elf_bss no longer needs to be tracked. Drop the variable. Cc: Eric Biederman Cc: Alexander Viro Cc: Christian Brauner Cc: linux-fsdevel@vger.kernel.org Cc: linux-mm@kvack.org Suggested-by: Eric Biederman Tested-by: Pedro Falcato Signed-off-by: Sebastian Ott Link: https://lore.kernel.org/r/20230929032435.2391507-2-keescook@chromium.org Signed-off-by: Kees Cook Stable-dep-of: 11854fe263eb ("binfmt_elf: Move brk for static PIE even if ASLR disabled") Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ea2968d343bb..2672b9dca1af 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -855,7 +855,7 @@ static int load_elf_binary(struct linux_binprm *bprm) unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; - unsigned long elf_bss, elf_brk; + unsigned long elf_brk; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1047,7 +1047,6 @@ out_free_interp: if (retval < 0) goto out_free_dentry; - elf_bss = 0; elf_brk = 0; start_code = ~0UL; @@ -1210,8 +1209,6 @@ out_free_interp: k = elf_ppnt->p_vaddr + elf_ppnt->p_filesz; - if (k > elf_bss) - elf_bss = k; if ((elf_ppnt->p_flags & PF_X) && end_code < k) end_code = k; if (end_data < k) @@ -1223,7 +1220,6 @@ out_free_interp: e_entry = elf_ex->e_entry + load_bias; phdr_addr += load_bias; - elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; end_code += load_bias; From fea22a3e06e521030312f6dd0c340710e7ec2b0c Mon Sep 17 00:00:00 2001 From: Muhammad Usama Anjum Date: Mon, 4 Mar 2024 20:59:24 +0500 Subject: [PATCH 101/529] selftests/exec: load_address: conform test to TAP format output [ Upstream commit c4095067736b7ed50316a2bc7c9577941e87ad45 ] Conform the layout, informational and status messages to TAP. No functional change is intended other than the layout of output messages. Signed-off-by: Muhammad Usama Anjum Link: https://lore.kernel.org/r/20240304155928.1818928-2-usama.anjum@collabora.com Signed-off-by: Kees Cook Stable-dep-of: 11854fe263eb ("binfmt_elf: Move brk for static PIE even if ASLR disabled") Signed-off-by: Sasha Levin --- tools/testing/selftests/exec/load_address.c | 36 +++++++++------------ 1 file changed, 16 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c index d487c2f6a615..17e3207d34ae 100644 --- a/tools/testing/selftests/exec/load_address.c +++ b/tools/testing/selftests/exec/load_address.c @@ -5,6 +5,7 @@ #include #include #include +#include "../kselftest.h" struct Statistics { unsigned long long load_address; @@ -41,28 +42,23 @@ int main(int argc, char **argv) unsigned long long misalign; int ret; - ret = dl_iterate_phdr(ExtractStatistics, &extracted); - if (ret != 1) { - fprintf(stderr, "FAILED\n"); - return 1; - } + ksft_print_header(); + ksft_set_plan(1); - if (extracted.alignment == 0) { - fprintf(stderr, "No alignment found\n"); - return 1; - } else if (extracted.alignment & (extracted.alignment - 1)) { - fprintf(stderr, "Alignment is not a power of 2\n"); - return 1; - } + ret = dl_iterate_phdr(ExtractStatistics, &extracted); + if (ret != 1) + ksft_exit_fail_msg("FAILED: dl_iterate_phdr\n"); + + if (extracted.alignment == 0) + ksft_exit_fail_msg("FAILED: No alignment found\n"); + else if (extracted.alignment & (extracted.alignment - 1)) + ksft_exit_fail_msg("FAILED: Alignment is not a power of 2\n"); misalign = extracted.load_address & (extracted.alignment - 1); - if (misalign) { - printf("alignment = %llu, load_address = %llu\n", - extracted.alignment, extracted.load_address); - fprintf(stderr, "FAILED\n"); - return 1; - } + if (misalign) + ksft_exit_fail_msg("FAILED: alignment = %llu, load_address = %llu\n", + extracted.alignment, extracted.load_address); - fprintf(stderr, "PASS\n"); - return 0; + ksft_test_result_pass("Completed\n"); + ksft_finished(); } From 7a60eba05aafa9c711fb5d6f2f853a80fc21ffdb Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 16 Feb 2024 22:25:44 -0800 Subject: [PATCH 102/529] binfmt_elf: Leave a gap between .bss and brk [ Upstream commit 2a5eb9995528441447d33838727f6ec1caf08139 ] Currently the brk starts its randomization immediately after .bss, which means there is a chance that when the random offset is 0, linear overflows from .bss can reach into the brk area. Leave at least a single page gap between .bss and brk (when it has not already been explicitly relocated into the mmap range). Reported-by: Closes: https://lore.kernel.org/linux-hardening/CA+2EKTVLvc8hDZc+2Yhwmus=dzOUG5E4gV7ayCbu0MPJTZzWkw@mail.gmail.com/ Link: https://lore.kernel.org/r/20240217062545.1631668-2-keescook@chromium.org Signed-off-by: Kees Cook Stable-dep-of: 11854fe263eb ("binfmt_elf: Move brk for static PIE even if ASLR disabled") Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 2672b9dca1af..ff796910265d 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1294,6 +1294,9 @@ out_free_interp: if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && elf_ex->e_type == ET_DYN && !interpreter) { mm->brk = mm->start_brk = ELF_ET_DYN_BASE; + } else { + /* Otherwise leave a gap between .bss and brk. */ + mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; } mm->brk = mm->start_brk = arch_randomize_brk(mm); From add3a49ae91a1f6390b3fbf14a7ec9c6231bfbd6 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 May 2024 10:31:46 -0700 Subject: [PATCH 103/529] selftests/exec: Build both static and non-static load_address tests [ Upstream commit b57a2907c9d96c56494ef25f8ec821cd0b355dd6 ] After commit 4d1cd3b2c5c1 ("tools/testing/selftests/exec: fix link error"), the load address alignment tests tried to build statically. This was silently ignored in some cases. However, after attempting to further fix the build by switching to "-static-pie", the test started failing. This appears to be due to non-PT_INTERP ET_DYN execs ("static PIE") not doing alignment correctly, which remains unfixed[1]. See commit aeb7923733d1 ("revert "fs/binfmt_elf: use PT_LOAD p_align values for static PIE"") for more details. Provide rules to build both static and non-static PIE binaries, improve debug reporting, and perform several test steps instead of a single all-or-nothing test. However, do not actually enable static-pie tests; alignment specification is only supported for ET_DYN with PT_INTERP ("regular PIE"). Link: https://bugzilla.kernel.org/show_bug.cgi?id=215275 [1] Link: https://lore.kernel.org/r/20240508173149.677910-1-keescook@chromium.org Signed-off-by: Kees Cook Stable-dep-of: 11854fe263eb ("binfmt_elf: Move brk for static PIE even if ASLR disabled") Signed-off-by: Sasha Levin --- tools/testing/selftests/exec/Makefile | 19 +++--- tools/testing/selftests/exec/load_address.c | 67 +++++++++++++++++---- 2 files changed, 66 insertions(+), 20 deletions(-) diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index a0b8688b0836..b54986078d7e 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -3,8 +3,13 @@ CFLAGS = -Wall CFLAGS += -Wno-nonnull CFLAGS += -D_GNU_SOURCE +ALIGNS := 0x1000 0x200000 0x1000000 +ALIGN_PIES := $(patsubst %,load_address.%,$(ALIGNS)) +ALIGN_STATIC_PIES := $(patsubst %,load_address.static.%,$(ALIGNS)) +ALIGNMENT_TESTS := $(ALIGN_PIES) + TEST_PROGS := binfmt_script.py -TEST_GEN_PROGS := execveat load_address_4096 load_address_2097152 load_address_16777216 non-regular +TEST_GEN_PROGS := execveat non-regular $(ALIGNMENT_TESTS) TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir # Makefile is a run-time dependency, since it's accessed by the execveat test TEST_FILES := Makefile @@ -28,9 +33,9 @@ $(OUTPUT)/execveat.symlink: $(OUTPUT)/execveat $(OUTPUT)/execveat.denatured: $(OUTPUT)/execveat cp $< $@ chmod -x $@ -$(OUTPUT)/load_address_4096: load_address.c - $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000 -pie -static $< -o $@ -$(OUTPUT)/load_address_2097152: load_address.c - $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x200000 -pie -static $< -o $@ -$(OUTPUT)/load_address_16777216: load_address.c - $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=0x1000000 -pie -static $< -o $@ +$(OUTPUT)/load_address.0x%: load_address.c + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=$(lastword $(subst ., ,$@)) \ + -fPIE -pie $< -o $@ +$(OUTPUT)/load_address.static.0x%: load_address.c + $(CC) $(CFLAGS) $(LDFLAGS) -Wl,-z,max-page-size=$(lastword $(subst ., ,$@)) \ + -fPIE -static-pie $< -o $@ diff --git a/tools/testing/selftests/exec/load_address.c b/tools/testing/selftests/exec/load_address.c index 17e3207d34ae..8257fddba8c8 100644 --- a/tools/testing/selftests/exec/load_address.c +++ b/tools/testing/selftests/exec/load_address.c @@ -5,11 +5,13 @@ #include #include #include +#include #include "../kselftest.h" struct Statistics { unsigned long long load_address; unsigned long long alignment; + bool interp; }; int ExtractStatistics(struct dl_phdr_info *info, size_t size, void *data) @@ -26,11 +28,20 @@ int ExtractStatistics(struct dl_phdr_info *info, size_t size, void *data) stats->alignment = 0; for (i = 0; i < info->dlpi_phnum; i++) { + unsigned long long align; + + if (info->dlpi_phdr[i].p_type == PT_INTERP) { + stats->interp = true; + continue; + } + if (info->dlpi_phdr[i].p_type != PT_LOAD) continue; - if (info->dlpi_phdr[i].p_align > stats->alignment) - stats->alignment = info->dlpi_phdr[i].p_align; + align = info->dlpi_phdr[i].p_align; + + if (align > stats->alignment) + stats->alignment = align; } return 1; // Terminate dl_iterate_phdr. @@ -38,27 +49,57 @@ int ExtractStatistics(struct dl_phdr_info *info, size_t size, void *data) int main(int argc, char **argv) { - struct Statistics extracted; - unsigned long long misalign; + struct Statistics extracted = { }; + unsigned long long misalign, pow2; + bool interp_needed; + char buf[1024]; + FILE *maps; int ret; ksft_print_header(); - ksft_set_plan(1); + ksft_set_plan(4); + /* Dump maps file for debugging reference. */ + maps = fopen("/proc/self/maps", "r"); + if (!maps) + ksft_exit_fail_msg("FAILED: /proc/self/maps: %s\n", strerror(errno)); + while (fgets(buf, sizeof(buf), maps)) { + ksft_print_msg("%s", buf); + } + fclose(maps); + + /* Walk the program headers. */ ret = dl_iterate_phdr(ExtractStatistics, &extracted); if (ret != 1) ksft_exit_fail_msg("FAILED: dl_iterate_phdr\n"); - if (extracted.alignment == 0) - ksft_exit_fail_msg("FAILED: No alignment found\n"); - else if (extracted.alignment & (extracted.alignment - 1)) - ksft_exit_fail_msg("FAILED: Alignment is not a power of 2\n"); + /* Report our findings. */ + ksft_print_msg("load_address=%#llx alignment=%#llx\n", + extracted.load_address, extracted.alignment); + /* If we're named with ".static." we expect no INTERP. */ + interp_needed = strstr(argv[0], ".static.") == NULL; + + /* Were we built as expected? */ + ksft_test_result(interp_needed == extracted.interp, + "%s INTERP program header %s\n", + interp_needed ? "Wanted" : "Unwanted", + extracted.interp ? "seen" : "missing"); + + /* Did we find an alignment? */ + ksft_test_result(extracted.alignment != 0, + "Alignment%s found\n", extracted.alignment ? "" : " NOT"); + + /* Is the alignment sane? */ + pow2 = extracted.alignment & (extracted.alignment - 1); + ksft_test_result(pow2 == 0, + "Alignment is%s a power of 2: %#llx\n", + pow2 == 0 ? "" : " NOT", extracted.alignment); + + /* Is the load address aligned? */ misalign = extracted.load_address & (extracted.alignment - 1); - if (misalign) - ksft_exit_fail_msg("FAILED: alignment = %llu, load_address = %llu\n", - extracted.alignment, extracted.load_address); + ksft_test_result(misalign == 0, "Load Address is %saligned (%#llx)\n", + misalign ? "MIS" : "", misalign); - ksft_test_result_pass("Completed\n"); ksft_finished(); } From 2fb38e1a0164cd41dfed57da36e95ecbb0f39391 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 May 2024 10:31:47 -0700 Subject: [PATCH 104/529] binfmt_elf: Calculate total_size earlier [ Upstream commit 2d4cf7b190bbfadd4986bf5c34da17c1a88adf8e ] In preparation to support PT_LOAD with large p_align values on non-PT_INTERP ET_DYN executables (i.e. "static pie"), we'll need to use the total_size details earlier. Move this separately now to make the next patch more readable. As total_size and load_bias are currently calculated separately, this has no behavioral impact. Link: https://lore.kernel.org/r/20240508173149.677910-2-keescook@chromium.org Signed-off-by: Kees Cook Stable-dep-of: 11854fe263eb ("binfmt_elf: Move brk for static PIE even if ASLR disabled") Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 52 +++++++++++++++++++++++++------------------------ 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ff796910265d..26abe256e91f 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1093,7 +1093,34 @@ out_free_interp: * Header for ET_DYN binaries to calculate the * randomization (load_bias) for all the LOAD * Program Headers. + */ + + /* + * Calculate the entire size of the ELF mapping + * (total_size), used for the initial mapping, + * due to load_addr_set which is set to true later + * once the initial mapping is performed. * + * Note that this is only sensible when the LOAD + * segments are contiguous (or overlapping). If + * used for LOADs that are far apart, this would + * cause the holes between LOADs to be mapped, + * running the risk of having the mapping fail, + * as it would be larger than the ELF file itself. + * + * As a result, only ET_DYN does this, since + * some ET_EXEC (e.g. ia64) may have large virtual + * memory holes between LOADs. + * + */ + total_size = total_mapping_size(elf_phdata, + elf_ex->e_phnum); + if (!total_size) { + retval = -EINVAL; + goto out_free_dentry; + } + + /* * There are effectively two types of ET_DYN * binaries: programs (i.e. PIE: ET_DYN with INTERP) * and loaders (ET_DYN without INTERP, since they @@ -1134,31 +1161,6 @@ out_free_interp: * is then page aligned. */ load_bias = ELF_PAGESTART(load_bias - vaddr); - - /* - * Calculate the entire size of the ELF mapping - * (total_size), used for the initial mapping, - * due to load_addr_set which is set to true later - * once the initial mapping is performed. - * - * Note that this is only sensible when the LOAD - * segments are contiguous (or overlapping). If - * used for LOADs that are far apart, this would - * cause the holes between LOADs to be mapped, - * running the risk of having the mapping fail, - * as it would be larger than the ELF file itself. - * - * As a result, only ET_DYN does this, since - * some ET_EXEC (e.g. ia64) may have large virtual - * memory holes between LOADs. - * - */ - total_size = total_mapping_size(elf_phdata, - elf_ex->e_phnum); - if (!total_size) { - retval = -EINVAL; - goto out_free_dentry; - } } error = elf_load(bprm->file, load_bias + vaddr, elf_ppnt, From af66f1d950cb064462d83b7250e431c435f54633 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 8 May 2024 10:31:48 -0700 Subject: [PATCH 105/529] binfmt_elf: Honor PT_LOAD alignment for static PIE [ Upstream commit 3545deff0ec7a37de7ed9632e262598582b140e9 ] The p_align values in PT_LOAD were ignored for static PIE executables (i.e. ET_DYN without PT_INTERP). This is because there is no way to request a non-fixed mmap region with a specific alignment. ET_DYN with PT_INTERP uses a separate base address (ELF_ET_DYN_BASE) and binfmt_elf performs the ASLR itself, which means it can also apply alignment. For the mmap region, the address selection happens deep within the vm_mmap() implementation (when the requested address is 0). The earlier attempt to implement this: commit 9630f0d60fec ("fs/binfmt_elf: use PT_LOAD p_align values for static PIE") commit 925346c129da ("fs/binfmt_elf: fix PT_LOAD p_align values for loaders") did not take into account the different base address origins, and were eventually reverted: aeb7923733d1 ("revert "fs/binfmt_elf: use PT_LOAD p_align values for static PIE"") In order to get the correct alignment from an mmap base, binfmt_elf must perform a 0-address load first, then tear down the mapping and perform alignment on the resulting address. Since this is slightly more overhead, only do this when it is needed (i.e. the alignment is not the default ELF alignment). This does, however, have the benefit of being able to use MAP_FIXED_NOREPLACE, to avoid potential collisions. With this fixed, enable the static PIE self tests again. Reported-by: H.J. Lu Closes: https://bugzilla.kernel.org/show_bug.cgi?id=215275 Link: https://lore.kernel.org/r/20240508173149.677910-3-keescook@chromium.org Signed-off-by: Kees Cook Stable-dep-of: 11854fe263eb ("binfmt_elf: Move brk for static PIE even if ASLR disabled") Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 42 +++++++++++++++++++++++---- tools/testing/selftests/exec/Makefile | 2 +- 2 files changed, 38 insertions(+), 6 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 26abe256e91f..fd3d800f69c1 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1120,10 +1120,13 @@ out_free_interp: goto out_free_dentry; } + /* Calculate any requested alignment. */ + alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + /* * There are effectively two types of ET_DYN - * binaries: programs (i.e. PIE: ET_DYN with INTERP) - * and loaders (ET_DYN without INTERP, since they + * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP) + * and loaders (ET_DYN without PT_INTERP, since they * _are_ the ELF interpreter). The loaders must * be loaded away from programs since the program * may otherwise collide with the loader (especially @@ -1143,15 +1146,44 @@ out_free_interp: * without MAP_FIXED nor MAP_FIXED_NOREPLACE). */ if (interpreter) { + /* On ET_DYN with PT_INTERP, we do the ASLR. */ load_bias = ELF_ET_DYN_BASE; if (current->flags & PF_RANDOMIZE) load_bias += arch_mmap_rnd(); - alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); + /* Adjust alignment as requested. */ if (alignment) load_bias &= ~(alignment - 1); elf_flags |= MAP_FIXED_NOREPLACE; - } else - load_bias = 0; + } else { + /* + * For ET_DYN without PT_INTERP, we rely on + * the architectures's (potentially ASLR) mmap + * base address (via a load_bias of 0). + * + * When a large alignment is requested, we + * must do the allocation at address "0" right + * now to discover where things will load so + * that we can adjust the resulting alignment. + * In this case (load_bias != 0), we can use + * MAP_FIXED_NOREPLACE to make sure the mapping + * doesn't collide with anything. + */ + if (alignment > ELF_MIN_ALIGN) { + load_bias = elf_load(bprm->file, 0, elf_ppnt, + elf_prot, elf_flags, total_size); + if (BAD_ADDR(load_bias)) { + retval = IS_ERR_VALUE(load_bias) ? + PTR_ERR((void*)load_bias) : -EINVAL; + goto out_free_dentry; + } + vm_munmap(load_bias, total_size); + /* Adjust alignment as requested. */ + if (alignment) + load_bias &= ~(alignment - 1); + elf_flags |= MAP_FIXED_NOREPLACE; + } else + load_bias = 0; + } /* * Since load_bias is used for all subsequent loading diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index b54986078d7e..a705493c04bb 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -6,7 +6,7 @@ CFLAGS += -D_GNU_SOURCE ALIGNS := 0x1000 0x200000 0x1000000 ALIGN_PIES := $(patsubst %,load_address.%,$(ALIGNS)) ALIGN_STATIC_PIES := $(patsubst %,load_address.static.%,$(ALIGNS)) -ALIGNMENT_TESTS := $(ALIGN_PIES) +ALIGNMENT_TESTS := $(ALIGN_PIES) $(ALIGN_STATIC_PIES) TEST_PROGS := binfmt_script.py TEST_GEN_PROGS := execveat non-regular $(ALIGNMENT_TESTS) From 44f3f9205339540e37223bdb8ec63d532dbb1a78 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 25 Apr 2025 15:45:06 -0700 Subject: [PATCH 106/529] binfmt_elf: Move brk for static PIE even if ASLR disabled [ Upstream commit 11854fe263eb1b9a8efa33b0c087add7719ea9b4 ] In commit bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec"), the brk was moved out of the mmap region when loading static PIE binaries (ET_DYN without INTERP). The common case for these binaries was testing new ELF loaders, so the brk needed to be away from mmap to avoid colliding with stack, future mmaps (of the loader-loaded binary), etc. But this was only done when ASLR was enabled, in an attempt to minimize changes to memory layouts. After adding support to respect alignment requirements for static PIE binaries in commit 3545deff0ec7 ("binfmt_elf: Honor PT_LOAD alignment for static PIE"), it became possible to have a large gap after the final PT_LOAD segment and the top of the mmap region. This means that future mmap allocations might go after the last PT_LOAD segment (where brk might be if ASLR was disabled) instead of before them (where they traditionally ended up). On arm64, running with ASLR disabled, Ubuntu 22.04's "ldconfig" binary, a static PIE, has alignment requirements that leaves a gap large enough after the last PT_LOAD segment to fit the vdso and vvar, but still leave enough space for the brk (which immediately follows the last PT_LOAD segment) to be allocated by the binary. fffff7f20000-fffff7fde000 r-xp 00000000 fe:02 8110426 /sbin/ldconfig.real fffff7fee000-fffff7ff5000 rw-p 000be000 fe:02 8110426 /sbin/ldconfig.real fffff7ff5000-fffff7ffa000 rw-p 00000000 00:00 0 ***[brk will go here at fffff7ffa000]*** fffff7ffc000-fffff7ffe000 r--p 00000000 00:00 0 [vvar] fffff7ffe000-fffff8000000 r-xp 00000000 00:00 0 [vdso] fffffffdf000-1000000000000 rw-p 00000000 00:00 0 [stack] After commit 0b3bc3354eb9 ("arm64: vdso: Switch to generic storage implementation"), the arm64 vvar grew slightly, and suddenly the brk collided with the allocation. fffff7f20000-fffff7fde000 r-xp 00000000 fe:02 8110426 /sbin/ldconfig.real fffff7fee000-fffff7ff5000 rw-p 000be000 fe:02 8110426 /sbin/ldconfig.real fffff7ff5000-fffff7ffa000 rw-p 00000000 00:00 0 ***[oops, no room any more, vvar is at fffff7ffa000!]*** fffff7ffa000-fffff7ffe000 r--p 00000000 00:00 0 [vvar] fffff7ffe000-fffff8000000 r-xp 00000000 00:00 0 [vdso] fffffffdf000-1000000000000 rw-p 00000000 00:00 0 [stack] The solution is to unconditionally move the brk out of the mmap region for static PIE binaries. Whether ASLR is enabled or not does not change if there may be future mmap allocation collisions with a growing brk region. Update memory layout comments (with kernel-doc headings), consolidate the setting of mm->brk to later (it isn't needed early), move static PIE brk out of mmap unconditionally, and make sure brk(2) knows to base brk position off of mm->start_brk not mm->end_data no matter what the cause of moving it is (via current->brk_randomized). For the CONFIG_COMPAT_BRK case, though, leave the logic unchanged, as we can never safely move the brk. These systems, however, are not using specially aligned static PIE binaries. Reported-by: Ryan Roberts Closes: https://lore.kernel.org/lkml/f93db308-4a0e-4806-9faf-98f890f5a5e6@arm.com/ Fixes: bbdc6076d2e5 ("binfmt_elf: move brk out of mmap when doing direct loader exec") Link: https://lore.kernel.org/r/20250425224502.work.520-kees@kernel.org Reviewed-by: Ryan Roberts Tested-by: Ryan Roberts Signed-off-by: Kees Cook Signed-off-by: Sasha Levin --- fs/binfmt_elf.c | 71 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 47 insertions(+), 24 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index fd3d800f69c1..762704eed9ce 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -856,6 +856,7 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_brk; + bool brk_moved = false; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1123,15 +1124,19 @@ out_free_interp: /* Calculate any requested alignment. */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - /* - * There are effectively two types of ET_DYN - * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP) - * and loaders (ET_DYN without PT_INTERP, since they - * _are_ the ELF interpreter). The loaders must - * be loaded away from programs since the program - * may otherwise collide with the loader (especially - * for ET_EXEC which does not have a randomized - * position). For example to handle invocations of + /** + * DOC: PIE handling + * + * There are effectively two types of ET_DYN ELF + * binaries: programs (i.e. PIE: ET_DYN with + * PT_INTERP) and loaders (i.e. static PIE: ET_DYN + * without PT_INTERP, usually the ELF interpreter + * itself). Loaders must be loaded away from programs + * since the program may otherwise collide with the + * loader (especially for ET_EXEC which does not have + * a randomized position). + * + * For example, to handle invocations of * "./ld.so someprog" to test out a new version of * the loader, the subsequent program that the * loader loads must avoid the loader itself, so @@ -1144,6 +1149,9 @@ out_free_interp: * ELF_ET_DYN_BASE and loaders are loaded into the * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). + * + * See below for "brk" handling details, which is + * also affected by program vs loader and ASLR. */ if (interpreter) { /* On ET_DYN with PT_INTERP, we do the ASLR. */ @@ -1260,8 +1268,6 @@ out_free_interp: start_data += load_bias; end_data += load_bias; - current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); - if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, @@ -1317,27 +1323,44 @@ out_free_interp: mm->end_data = end_data; mm->start_stack = bprm->p; - if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) { + /** + * DOC: "brk" handling + * + * For architectures with ELF randomization, when executing a + * loader directly (i.e. static PIE: ET_DYN without PT_INTERP), + * move the brk area out of the mmap region and into the unused + * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide + * early with the stack growing down or other regions being put + * into the mmap region by the kernel (e.g. vdso). + * + * In the CONFIG_COMPAT_BRK case, though, everything is turned + * off because we're not allowed to move the brk at all. + */ + if (!IS_ENABLED(CONFIG_COMPAT_BRK) && + IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + elf_ex->e_type == ET_DYN && !interpreter) { + elf_brk = ELF_ET_DYN_BASE; + /* This counts as moving the brk, so let brk(2) know. */ + brk_moved = true; + } + mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk); + + if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) { /* - * For architectures with ELF randomization, when executing - * a loader directly (i.e. no interpreter listed in ELF - * headers), move the brk area out of the mmap region - * (since it grows up, and may collide early with the stack - * growing down), and into the unused ELF_ET_DYN_BASE region. + * If we didn't move the brk to ELF_ET_DYN_BASE (above), + * leave a gap between .bss and brk. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - elf_ex->e_type == ET_DYN && !interpreter) { - mm->brk = mm->start_brk = ELF_ET_DYN_BASE; - } else { - /* Otherwise leave a gap between .bss and brk. */ + if (!brk_moved) mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; - } mm->brk = mm->start_brk = arch_randomize_brk(mm); + brk_moved = true; + } + #ifdef compat_brk_randomized + if (brk_moved) current->brk_randomized = 1; #endif - } if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, From 5eadacf806e31f717ef8bba783ed68e88b9d81ef Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Thu, 1 May 2025 15:17:02 +0200 Subject: [PATCH 107/529] platform/x86: asus-wmi: Fix wlan_ctrl_by_user detection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit bfcfe6d335a967f8ea0c1980960e6f0205b5de6e ] The wlan_ctrl_by_user detection was introduced by commit a50bd128f28c ("asus-wmi: record wlan status while controlled by userapp"). Quoting from that commit's commit message: """ When you call WMIMethod(DSTS, 0x00010011) to get WLAN status, it may return (1) 0x00050001 (On) (2) 0x00050000 (Off) (3) 0x00030001 (On) (4) 0x00030000 (Off) (5) 0x00000002 (Unknown) (1), (2) means that the model has hardware GPIO for WLAN, you can call WMIMethod(DEVS, 0x00010011, 1 or 0) to turn WLAN on/off. (3), (4) means that the model doesn’t have hardware GPIO, you need to use API or driver library to turn WLAN on/off, and call WMIMethod(DEVS, 0x00010012, 1 or 0) to set WLAN LED status. After you set WLAN LED status, you can see the WLAN status is changed with WMIMethod(DSTS, 0x00010011). Because the status is recorded lastly (ex: Windows), you can use it for synchronization. (5) means that the model doesn’t have WLAN device. WLAN is the ONLY special case with upper rule. """ The wlan_ctrl_by_user flag should be set on 0x0003000? ((3), (4) above) return values, but the flag mistakenly also gets set on laptops with 0x0005000? ((1), (2)) return values. This is causing rfkill problems on laptops where 0x0005000? is returned. Fix the check to only set the wlan_ctrl_by_user flag for 0x0003000? return values. Fixes: a50bd128f28c ("asus-wmi: record wlan status while controlled by userapp") Link: https://bugzilla.kernel.org/show_bug.cgi?id=219786 Signed-off-by: Hans de Goede Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20250501131702.103360-2-hdegoede@redhat.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Signed-off-by: Sasha Levin --- drivers/platform/x86/asus-wmi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c index 296150eaef92..33eacb4fc4c4 100644 --- a/drivers/platform/x86/asus-wmi.c +++ b/drivers/platform/x86/asus-wmi.c @@ -3804,7 +3804,8 @@ static int asus_wmi_add(struct platform_device *pdev) goto fail_leds; asus_wmi_get_devstate(asus, ASUS_WMI_DEVID_WLAN, &result); - if (result & (ASUS_WMI_DSTS_PRESENCE_BIT | ASUS_WMI_DSTS_USER_BIT)) + if ((result & (ASUS_WMI_DSTS_PRESENCE_BIT | ASUS_WMI_DSTS_USER_BIT)) == + (ASUS_WMI_DSTS_PRESENCE_BIT | ASUS_WMI_DSTS_USER_BIT)) asus->driver->wlan_ctrl_by_user = 1; if (!(asus->driver->wlan_ctrl_by_user && ashs_present())) { From 2f81039276017f800a5f68d9e279d1ef69c549b9 Mon Sep 17 00:00:00 2001 From: "Masami Hiramatsu (Google)" Date: Sat, 10 May 2025 12:44:41 +0900 Subject: [PATCH 108/529] tracing: probes: Fix a possible race in trace_probe_log APIs [ Upstream commit fd837de3c9cb1a162c69bc1fb1f438467fe7f2f5 ] Since the shared trace_probe_log variable can be accessed and modified via probe event create operation of kprobe_events, uprobe_events, and dynamic_events, it should be protected. In the dynamic_events, all operations are serialized by `dyn_event_ops_mutex`. But kprobe_events and uprobe_events interfaces are not serialized. To solve this issue, introduces dyn_event_create(), which runs create() operation under the mutex, for kprobe_events and uprobe_events. This also uses lockdep to check the mutex is held when using trace_probe_log* APIs. Link: https://lore.kernel.org/all/174684868120.551552.3068655787654268804.stgit@devnote2/ Reported-by: Paul Cacheux Closes: https://lore.kernel.org/all/20250510074456.805a16872b591e2971a4d221@kernel.org/ Fixes: ab105a4fb894 ("tracing: Use tracing error_log with probe events") Signed-off-by: Masami Hiramatsu (Google) Signed-off-by: Sasha Levin --- kernel/trace/trace_dynevent.c | 16 +++++++++++++++- kernel/trace/trace_dynevent.h | 1 + kernel/trace/trace_kprobe.c | 2 +- kernel/trace/trace_probe.c | 9 +++++++++ kernel/trace/trace_uprobe.c | 2 +- 5 files changed, 27 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_dynevent.c b/kernel/trace/trace_dynevent.c index 4376887e0d8a..c9b0533407ed 100644 --- a/kernel/trace/trace_dynevent.c +++ b/kernel/trace/trace_dynevent.c @@ -16,7 +16,7 @@ #include "trace_output.h" /* for trace_event_sem */ #include "trace_dynevent.h" -static DEFINE_MUTEX(dyn_event_ops_mutex); +DEFINE_MUTEX(dyn_event_ops_mutex); static LIST_HEAD(dyn_event_ops_list); bool trace_event_dyn_try_get_ref(struct trace_event_call *dyn_call) @@ -125,6 +125,20 @@ out: return ret; } +/* + * Locked version of event creation. The event creation must be protected by + * dyn_event_ops_mutex because of protecting trace_probe_log. + */ +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type) +{ + int ret; + + mutex_lock(&dyn_event_ops_mutex); + ret = type->create(raw_command); + mutex_unlock(&dyn_event_ops_mutex); + return ret; +} + static int create_dyn_event(const char *raw_command) { struct dyn_event_operations *ops; diff --git a/kernel/trace/trace_dynevent.h b/kernel/trace/trace_dynevent.h index 936477a111d3..beee3f8d7544 100644 --- a/kernel/trace/trace_dynevent.h +++ b/kernel/trace/trace_dynevent.h @@ -100,6 +100,7 @@ void *dyn_event_seq_next(struct seq_file *m, void *v, loff_t *pos); void dyn_event_seq_stop(struct seq_file *m, void *v); int dyn_events_release_all(struct dyn_event_operations *type); int dyn_event_release(const char *raw_command, struct dyn_event_operations *type); +int dyn_event_create(const char *raw_command, struct dyn_event_operations *type); /* * for_each_dyn_event - iterate over the dyn_event list diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index 72655d81b37d..cc155c411768 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -975,7 +975,7 @@ static int create_or_delete_trace_kprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_kprobe_ops); - ret = trace_kprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_kprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } diff --git a/kernel/trace/trace_probe.c b/kernel/trace/trace_probe.c index ba48b5e270e1..3888a59c9dfe 100644 --- a/kernel/trace/trace_probe.c +++ b/kernel/trace/trace_probe.c @@ -143,9 +143,12 @@ fail: } static struct trace_probe_log trace_probe_log; +extern struct mutex dyn_event_ops_mutex; void trace_probe_log_init(const char *subsystem, int argc, const char **argv) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.subsystem = subsystem; trace_probe_log.argc = argc; trace_probe_log.argv = argv; @@ -154,11 +157,15 @@ void trace_probe_log_init(const char *subsystem, int argc, const char **argv) void trace_probe_log_clear(void) { + lockdep_assert_held(&dyn_event_ops_mutex); + memset(&trace_probe_log, 0, sizeof(trace_probe_log)); } void trace_probe_log_set_index(int index) { + lockdep_assert_held(&dyn_event_ops_mutex); + trace_probe_log.index = index; } @@ -167,6 +174,8 @@ void __trace_probe_log_err(int offset, int err_type) char *command, *p; int i, len = 0, pos = 0; + lockdep_assert_held(&dyn_event_ops_mutex); + if (!trace_probe_log.argv) return; diff --git a/kernel/trace/trace_uprobe.c b/kernel/trace/trace_uprobe.c index a6a3ff2a441e..53ef3cb65098 100644 --- a/kernel/trace/trace_uprobe.c +++ b/kernel/trace/trace_uprobe.c @@ -732,7 +732,7 @@ static int create_or_delete_trace_uprobe(const char *raw_command) if (raw_command[0] == '-') return dyn_event_release(raw_command, &trace_uprobe_ops); - ret = trace_uprobe_create(raw_command); + ret = dyn_event_create(raw_command, &trace_uprobe_ops); return ret == -ECANCELED ? -EINVAL : ret; } From 22bed5bd0d1b9bc6409a87096e9434301d134414 Mon Sep 17 00:00:00 2001 From: Michal Suchanek Date: Fri, 4 Apr 2025 10:23:14 +0200 Subject: [PATCH 109/529] tpm: tis: Double the timeout B to 4s [ Upstream commit 2f661f71fda1fc0c42b7746ca5b7da529eb6b5be ] With some Infineon chips the timeouts in tpm_tis_send_data (both B and C) can reach up to about 2250 ms. Timeout C is retried since commit de9e33df7762 ("tpm, tpm_tis: Workaround failed command reception on Infineon devices") Timeout B still needs to be extended. The problem is most commonly encountered with context related operation such as load context/save context. These are issued directly by the kernel, and there is no retry logic for them. When a filesystem is set up to use the TPM for unlocking the boot fails, and restarting the userspace service is ineffective. This is likely because ignoring a load context/save context result puts the real TPM state and the TPM state expected by the kernel out of sync. Chips known to be affected: tpm_tis IFX1522:00: 2.0 TPM (device-id 0x1D, rev-id 54) Description: SLB9672 Firmware Revision: 15.22 tpm_tis MSFT0101:00: 2.0 TPM (device-id 0x1B, rev-id 22) Firmware Revision: 7.83 tpm_tis MSFT0101:00: 2.0 TPM (device-id 0x1A, rev-id 16) Firmware Revision: 5.63 Link: https://lore.kernel.org/linux-integrity/Z5pI07m0Muapyu9w@kitsune.suse.cz/ Signed-off-by: Michal Suchanek Reviewed-by: Jarkko Sakkinen Signed-off-by: Jarkko Sakkinen Signed-off-by: Sasha Levin --- drivers/char/tpm/tpm_tis_core.h | 2 +- include/linux/tpm.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/char/tpm/tpm_tis_core.h b/drivers/char/tpm/tpm_tis_core.h index be72681ab8ea..5f29eebef52b 100644 --- a/drivers/char/tpm/tpm_tis_core.h +++ b/drivers/char/tpm/tpm_tis_core.h @@ -53,7 +53,7 @@ enum tis_int_flags { enum tis_defaults { TIS_MEM_LEN = 0x5000, TIS_SHORT_TIMEOUT = 750, /* ms */ - TIS_LONG_TIMEOUT = 2000, /* 2 sec */ + TIS_LONG_TIMEOUT = 4000, /* 4 secs */ TIS_TIMEOUT_MIN_ATML = 14700, /* usecs */ TIS_TIMEOUT_MAX_ATML = 15000, /* usecs */ }; diff --git a/include/linux/tpm.h b/include/linux/tpm.h index dd0784a6e07d..4a4112bb1d1b 100644 --- a/include/linux/tpm.h +++ b/include/linux/tpm.h @@ -181,7 +181,7 @@ enum tpm2_const { enum tpm2_timeouts { TPM2_TIMEOUT_A = 750, - TPM2_TIMEOUT_B = 2000, + TPM2_TIMEOUT_B = 4000, TPM2_TIMEOUT_C = 200, TPM2_TIMEOUT_D = 30, TPM2_DURATION_SHORT = 20, From d7b0db1246f1cb83093ffa96b417825bf01cbd8b Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 13 Apr 2025 11:34:24 +0100 Subject: [PATCH 110/529] iio: adc: ad7266: Fix potential timestamp alignment issue. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 52d349884738c346961e153f195f4c7fe186fcf4 ] On architectures where an s64 is only 32-bit aligned insufficient padding would be left between the earlier elements and the timestamp. Use aligned_s64 to enforce the correct placement and ensure the storage is large enough. Fixes: 54e018da3141 ("iio:ad7266: Mark transfer buffer as __be16") # aligned_s64 is much newer. Reported-by: David Lechner Reviewed-by: Nuno Sá Reviewed-by: David Lechner Link: https://patch.msgid.link/20250413103443.2420727-2-jic23@kernel.org Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/iio/adc/ad7266.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/ad7266.c b/drivers/iio/adc/ad7266.c index 98648c679a55..2ace3aafe497 100644 --- a/drivers/iio/adc/ad7266.c +++ b/drivers/iio/adc/ad7266.c @@ -44,7 +44,7 @@ struct ad7266_state { */ struct { __be16 sample[2]; - s64 timestamp; + aligned_s64 timestamp; } data __aligned(IIO_DMA_MINALIGN); }; From 6b9418c825d7282a63c7fef5a7d6c063db720824 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 7 Feb 2024 23:52:55 -0600 Subject: [PATCH 111/529] drm/amd: Stop evicting resources on APUs in suspend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 226db36032c61d8717dfdd052adac351b22d3e83 ] commit 5095d5418193 ("drm/amd: Evict resources during PM ops prepare() callback") intentionally moved the eviction of resources to earlier in the suspend process, but this introduced a subtle change that it occurs before adev->in_s0ix or adev->in_s3 are set. This meant that APUs actually started to evict resources at suspend time as well. Explicitly set s0ix or s3 in the prepare() stage, and unset them if the prepare() stage failed. v2: squash in warning fix from Stephen Rothwell Reported-by: Jürg Billeter Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3132#note_2271038 Fixes: 5095d5418193 ("drm/amd: Evict resources during PM ops prepare() callback") Acked-by: Alex Deucher Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher Stable-dep-of: d0ce1aaa8531 ("Revert "drm/amd: Stop evicting resources on APUs in suspend"") Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 ++ drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 15 +++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 +++++++++-- 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index dd22d2559720..705703bab2fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1408,9 +1408,11 @@ static inline int amdgpu_acpi_smart_shift_update(struct drm_device *dev, #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev); bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); +void amdgpu_choose_low_power_state(struct amdgpu_device *adev); #else static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; } +static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { } #endif #if defined(CONFIG_DRM_AMD_DC) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index 5fa7f6d8aa30..70d761f79770 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1118,4 +1118,19 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) #endif /* CONFIG_AMD_PMC */ } +/** + * amdgpu_choose_low_power_state + * + * @adev: amdgpu_device_pointer + * + * Choose the target low power state for the GPU + */ +void amdgpu_choose_low_power_state(struct amdgpu_device *adev) +{ + if (amdgpu_acpi_is_s0ix_active(adev)) + adev->in_s0ix = true; + else if (amdgpu_acpi_is_s3_active(adev)) + adev->in_s3 = true; +} + #endif /* CONFIG_SUSPEND */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index fcd0c61499f8..52f4ef4e8b4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4195,13 +4195,15 @@ int amdgpu_device_prepare(struct drm_device *dev) struct amdgpu_device *adev = drm_to_adev(dev); int i, r; + amdgpu_choose_low_power_state(adev); + if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; /* Evict the majority of BOs before starting suspend sequence */ r = amdgpu_device_evict_resources(adev); if (r) - return r; + goto unprepare; flush_delayed_work(&adev->gfx.gfx_off_delay_work); @@ -4212,10 +4214,15 @@ int amdgpu_device_prepare(struct drm_device *dev) continue; r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); if (r) - return r; + goto unprepare; } return 0; + +unprepare: + adev->in_s0ix = adev->in_s3 = false; + + return r; } /** From c3408b49e32d35be00c628071d8c3fb1243d0615 Mon Sep 17 00:00:00 2001 From: Ma Jun Date: Wed, 21 Feb 2024 17:16:49 +0800 Subject: [PATCH 112/529] drm/amdgpu: Fix the runtime resume failure issue [ Upstream commit bbfaf2aea7164db59739728d62d9cc91d64ff856 ] Don't set power state flag when system enter runtime suspend, or it may cause runtime resume failure issue. Fixes: 3a9626c816db ("drm/amd: Stop evicting resources on APUs in suspend") Signed-off-by: Ma Jun Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher Cc: stable@vger.kernel.org Stable-dep-of: d0ce1aaa8531 ("Revert "drm/amd: Stop evicting resources on APUs in suspend"") Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index 70d761f79770..46916680f044 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1127,6 +1127,9 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) */ void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { + if (adev->in_runpm) + return; + if (amdgpu_acpi_is_s0ix_active(adev)) adev->in_s0ix = true; else if (amdgpu_acpi_is_s3_active(adev)) From 43b8b33b81ecafccd8a66eff8d883b12123d55d2 Mon Sep 17 00:00:00 2001 From: Zhigang Luo Date: Thu, 29 Feb 2024 16:04:35 -0500 Subject: [PATCH 113/529] drm/amdgpu: trigger flr_work if reading pf2vf data failed [ Upstream commit ab66c832847fcdffc97d4591ba5547e3990d9d33 ] if reading pf2vf data failed 30 times continuously, it means something is wrong. Need to trigger flr_work to recover the issue. also use dev_err to print the error message to get which device has issue and add warning message if waiting IDH_FLR_NOTIFICATION_CMPL timeout. Signed-off-by: Zhigang Luo Acked-by: Hawking Zhang Signed-off-by: Alex Deucher Stable-dep-of: d0ce1aaa8531 ("Revert "drm/amd: Stop evicting resources on APUs in suspend"") Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 15 +++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c | 29 ++++++++++++++++++---- drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h | 3 +++ drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c | 2 ++ drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c | 2 ++ 5 files changed, 41 insertions(+), 10 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 52f4ef4e8b4b..aac0e49f77c7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -139,6 +139,8 @@ const char *amdgpu_asic_name[] = { "LAST", }; +static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); + /** * DOC: pcie_replay_count * @@ -4638,6 +4640,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, retry: amdgpu_amdkfd_pre_reset(adev); + amdgpu_device_stop_pending_resets(adev); + if (from_hypervisor) r = amdgpu_virt_request_full_gpu(adev, true); else @@ -5509,11 +5513,12 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ tmp_adev->asic_reset_res = r; } - /* - * Drop all pending non scheduler resets. Scheduler resets - * were already dropped during drm_sched_stop - */ - amdgpu_device_stop_pending_resets(tmp_adev); + if (!amdgpu_sriov_vf(tmp_adev)) + /* + * Drop all pending non scheduler resets. Scheduler resets + * were already dropped during drm_sched_stop + */ + amdgpu_device_stop_pending_resets(tmp_adev); } tmp_vram_lost_counter = atomic_read(&((adev)->vram_lost_counter)); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c index d7b76a3d2d55..6174bef0ecb8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.c @@ -32,6 +32,7 @@ #include "amdgpu.h" #include "amdgpu_ras.h" +#include "amdgpu_reset.h" #include "vi.h" #include "soc15.h" #include "nv.h" @@ -456,7 +457,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) return -EINVAL; if (pf2vf_info->size > 1024) { - DRM_ERROR("invalid pf2vf message size\n"); + dev_err(adev->dev, "invalid pf2vf message size: 0x%x\n", pf2vf_info->size); return -EINVAL; } @@ -467,7 +468,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, adev->virt.fw_reserve.checksum_key, checksum); if (checksum != checkval) { - DRM_ERROR("invalid pf2vf message\n"); + dev_err(adev->dev, + "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", + checksum, checkval); return -EINVAL; } @@ -481,7 +484,9 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) adev->virt.fw_reserve.p_pf2vf, pf2vf_info->size, 0, checksum); if (checksum != checkval) { - DRM_ERROR("invalid pf2vf message\n"); + dev_err(adev->dev, + "invalid pf2vf message: header checksum=0x%x calculated checksum=0x%x\n", + checksum, checkval); return -EINVAL; } @@ -517,7 +522,7 @@ static int amdgpu_virt_read_pf2vf_data(struct amdgpu_device *adev) ((struct amd_sriov_msg_pf2vf_info *)pf2vf_info)->uuid; break; default: - DRM_ERROR("invalid pf2vf version\n"); + dev_err(adev->dev, "invalid pf2vf version: 0x%x\n", pf2vf_info->version); return -EINVAL; } @@ -617,8 +622,21 @@ static void amdgpu_virt_update_vf2pf_work_item(struct work_struct *work) int ret; ret = amdgpu_virt_read_pf2vf_data(adev); - if (ret) + if (ret) { + adev->virt.vf2pf_update_retry_cnt++; + if ((adev->virt.vf2pf_update_retry_cnt >= AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT) && + amdgpu_sriov_runtime(adev) && !amdgpu_in_reset(adev)) { + if (amdgpu_reset_domain_schedule(adev->reset_domain, + &adev->virt.flr_work)) + return; + else + dev_err(adev->dev, "Failed to queue work! at %s", __func__); + } + goto out; + } + + adev->virt.vf2pf_update_retry_cnt = 0; amdgpu_virt_write_vf2pf_data(adev); out: @@ -639,6 +657,7 @@ void amdgpu_virt_init_data_exchange(struct amdgpu_device *adev) adev->virt.fw_reserve.p_pf2vf = NULL; adev->virt.fw_reserve.p_vf2pf = NULL; adev->virt.vf2pf_update_interval_ms = 0; + adev->virt.vf2pf_update_retry_cnt = 0; if (adev->mman.fw_vram_usage_va != NULL) { /* go through this logic in ip_init and reset to init workqueue*/ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h index dc6aaa4d67be..fc2859726f0a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_virt.h @@ -51,6 +51,8 @@ /* tonga/fiji use this offset */ #define mmBIF_IOV_FUNC_IDENTIFIER 0x1503 +#define AMDGPU_VF2PF_UPDATE_MAX_RETRY_LIMIT 30 + enum amdgpu_sriov_vf_mode { SRIOV_VF_MODE_BARE_METAL = 0, SRIOV_VF_MODE_ONE_VF, @@ -250,6 +252,7 @@ struct amdgpu_virt { /* vf2pf message */ struct delayed_work vf2pf_work; uint32_t vf2pf_update_interval_ms; + int vf2pf_update_retry_cnt; /* multimedia bandwidth config */ bool is_mm_bw_enabled; diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c index 12906ba74462..d39c6baae007 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_ai.c @@ -276,6 +276,8 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) timeout -= 10; } while (timeout > 1); + dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n"); + flr_done: atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); diff --git a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c index e07757eea7ad..a311a2425ec0 100644 --- a/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c +++ b/drivers/gpu/drm/amd/amdgpu/mxgpu_nv.c @@ -300,6 +300,8 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) timeout -= 10; } while (timeout > 1); + dev_warn(adev->dev, "waiting IDH_FLR_NOTIFICATION_CMPL timeout\n"); + flr_done: atomic_set(&adev->reset_domain->in_gpu_reset, 0); up_write(&adev->reset_domain->sem); From a2419fa7fe0bab33643fe78f10d2038449d90d7e Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Wed, 27 Nov 2024 21:26:56 -0600 Subject: [PATCH 114/529] drm/amd: Add Suspend/Hibernate notification callback support [ Upstream commit 2965e6355dcdf157b5fafa25a2715f00064da8bf ] As part of the suspend sequence VRAM needs to be evicted on dGPUs. In order to make suspend/resume more reliable we moved this into the pmops prepare() callback so that the suspend sequence would fail but the system could remain operational under high memory usage suspend. Another class of issues exist though where due to memory fragementation there isn't a large enough contiguous space and swap isn't accessible. Add support for a suspend/hibernate notification callback that could evict VRAM before tasks are frozen. This should allow paging out to swap if necessary. Link: https://github.com/ROCm/ROCK-Kernel-Driver/issues/174 Link: https://gitlab.freedesktop.org/drm/amd/-/issues/3476 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/2362 Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3781 Reviewed-by: Lijo Lazar Link: https://lore.kernel.org/r/20241128032656.2090059-2-superm1@kernel.org Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher Stable-dep-of: d0ce1aaa8531 ("Revert "drm/amd: Stop evicting resources on APUs in suspend"") Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 46 +++++++++++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 1 - 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 705703bab2fd..019a83c7170d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -781,6 +781,7 @@ struct amdgpu_device { bool need_swiotlb; bool accel_working; struct notifier_block acpi_nb; + struct notifier_block pm_nb; struct amdgpu_i2c_chan *i2c_bus[AMDGPU_MAX_I2C_BUS]; struct debugfs_blob_wrapper debugfs_vbios_blob; struct debugfs_blob_wrapper debugfs_discovery_blob; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index aac0e49f77c7..32c571e9288a 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -140,6 +140,8 @@ const char *amdgpu_asic_name[] = { }; static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); +static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, + void *data); /** * DOC: pcie_replay_count @@ -3992,6 +3994,11 @@ fence_driver_init: amdgpu_device_check_iommu_direct_map(adev); + adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; + r = register_pm_notifier(&adev->pm_nb); + if (r) + goto failed; + return 0; release_ras_con: @@ -4053,6 +4060,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) flush_delayed_work(&adev->delayed_init_work); adev->shutdown = true; + unregister_pm_notifier(&adev->pm_nb); + /* make sure IB test finished before entering exclusive mode * to avoid preemption on IB test * */ @@ -4183,6 +4192,41 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) /* * Suspend & resume. */ +/** + * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events + * @nb: notifier block + * @mode: suspend mode + * @data: data + * + * This function is called when the system is about to suspend or hibernate. + * It is used to evict resources from the device before the system goes to + * sleep while there is still access to swap. + */ +static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, + void *data) +{ + struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); + int r; + + switch (mode) { + case PM_HIBERNATION_PREPARE: + adev->in_s4 = true; + fallthrough; + case PM_SUSPEND_PREPARE: + r = amdgpu_device_evict_resources(adev); + /* + * This is considered non-fatal at this time because + * amdgpu_device_prepare() will also fatally evict resources. + * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 + */ + if (r) + drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); + break; + } + + return NOTIFY_DONE; +} + /** * amdgpu_device_prepare - prepare for device suspend * @@ -4222,7 +4266,7 @@ int amdgpu_device_prepare(struct drm_device *dev) return 0; unprepare: - adev->in_s0ix = adev->in_s3 = false; + adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; return r; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 0a85a59519ca..06958608c984 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2468,7 +2468,6 @@ static int amdgpu_pmops_freeze(struct device *dev) struct amdgpu_device *adev = drm_to_adev(drm_dev); int r; - adev->in_s4 = true; r = amdgpu_device_suspend(drm_dev, true); if (r) return r; From ced7c789e3aba2889712b48533919d01d601219a Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 1 May 2025 13:00:16 -0400 Subject: [PATCH 115/529] Revert "drm/amd: Stop evicting resources on APUs in suspend" [ Upstream commit d0ce1aaa8531a4a4707711cab5721374751c51b0 ] This reverts commit 3a9626c816db901def438dc2513622e281186d39. This breaks S4 because we end up setting the s3/s0ix flags even when we are entering s4 since prepare is used by both flows. The causes both the S3/s0ix and s4 flags to be set which breaks several checks in the driver which assume they are mutually exclusive. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/3634 Cc: Mario Limonciello Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit ce8f7d95899c2869b47ea6ce0b3e5bf304b2fff4) Cc: stable@vger.kernel.org Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 -- drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c | 18 ------------------ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 11 ++--------- 3 files changed, 2 insertions(+), 29 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 019a83c7170d..af86402c70a9 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1409,11 +1409,9 @@ static inline int amdgpu_acpi_smart_shift_update(struct drm_device *dev, #if defined(CONFIG_ACPI) && defined(CONFIG_SUSPEND) bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev); bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev); -void amdgpu_choose_low_power_state(struct amdgpu_device *adev); #else static inline bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) { return false; } static inline bool amdgpu_acpi_is_s3_active(struct amdgpu_device *adev) { return false; } -static inline void amdgpu_choose_low_power_state(struct amdgpu_device *adev) { } #endif #if defined(CONFIG_DRM_AMD_DC) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c index 46916680f044..5fa7f6d8aa30 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_acpi.c @@ -1118,22 +1118,4 @@ bool amdgpu_acpi_is_s0ix_active(struct amdgpu_device *adev) #endif /* CONFIG_AMD_PMC */ } -/** - * amdgpu_choose_low_power_state - * - * @adev: amdgpu_device_pointer - * - * Choose the target low power state for the GPU - */ -void amdgpu_choose_low_power_state(struct amdgpu_device *adev) -{ - if (adev->in_runpm) - return; - - if (amdgpu_acpi_is_s0ix_active(adev)) - adev->in_s0ix = true; - else if (amdgpu_acpi_is_s3_active(adev)) - adev->in_s3 = true; -} - #endif /* CONFIG_SUSPEND */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 32c571e9288a..57a7f72f366f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4241,15 +4241,13 @@ int amdgpu_device_prepare(struct drm_device *dev) struct amdgpu_device *adev = drm_to_adev(dev); int i, r; - amdgpu_choose_low_power_state(adev); - if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; /* Evict the majority of BOs before starting suspend sequence */ r = amdgpu_device_evict_resources(adev); if (r) - goto unprepare; + return r; flush_delayed_work(&adev->gfx.gfx_off_delay_work); @@ -4260,15 +4258,10 @@ int amdgpu_device_prepare(struct drm_device *dev) continue; r = adev->ip_blocks[i].version->funcs->prepare_suspend((void *)adev); if (r) - goto unprepare; + return r; } return 0; - -unprepare: - adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; - - return r; } /** From 39d30f8ecc98eba52dd0dc82562c0d9282e5f802 Mon Sep 17 00:00:00 2001 From: Jonathan Cameron Date: Sun, 13 Apr 2025 11:34:25 +0100 Subject: [PATCH 116/529] iio: adc: ad7768-1: Fix insufficient alignment of timestamp. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ffbc26bc91c1f1eb3dcf5d8776e74cbae21ee13a ] On architectures where an s64 is not 64-bit aligned, this may result insufficient alignment of the timestamp and the structure being too small. Use aligned_s64 to force the alignment. Fixes: a1caeebab07e ("iio: adc: ad7768-1: Fix too small buffer passed to iio_push_to_buffers_with_timestamp()") # aligned_s64 newer Reported-by: David Lechner Reviewed-by: Nuno Sá Reviewed-by: David Lechner Link: https://patch.msgid.link/20250413103443.2420727-3-jic23@kernel.org Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/iio/adc/ad7768-1.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/adc/ad7768-1.c b/drivers/iio/adc/ad7768-1.c index 74b0c85944bd..967f06cd3f94 100644 --- a/drivers/iio/adc/ad7768-1.c +++ b/drivers/iio/adc/ad7768-1.c @@ -169,7 +169,7 @@ struct ad7768_state { union { struct { __be32 chan; - s64 timestamp; + aligned_s64 timestamp; } scan; __be32 d32; u8 d8[2]; From c4a550e0bae6b476c0ee5b1f53c93ed404b93193 Mon Sep 17 00:00:00 2001 From: David Lechner Date: Thu, 17 Apr 2025 11:52:37 -0500 Subject: [PATCH 117/529] iio: chemical: sps30: use aligned_s64 for timestamp MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit bb49d940344bcb8e2b19e69d7ac86f567887ea9a ] Follow the pattern of other drivers and use aligned_s64 for the timestamp. This will ensure that the timestamp is correctly aligned on all architectures. Fixes: a5bf6fdd19c3 ("iio:chemical:sps30: Fix timestamp alignment") Signed-off-by: David Lechner Reviewed-by: Nuno Sá Link: https://patch.msgid.link/20250417-iio-more-timestamp-alignment-v1-5-eafac1e22318@baylibre.com Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin --- drivers/iio/chemical/sps30.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/chemical/sps30.c b/drivers/iio/chemical/sps30.c index 814ce0aad1cc..4085a36cd1db 100644 --- a/drivers/iio/chemical/sps30.c +++ b/drivers/iio/chemical/sps30.c @@ -108,7 +108,7 @@ static irqreturn_t sps30_trigger_handler(int irq, void *p) int ret; struct { s32 data[4]; /* PM1, PM2P5, PM4, PM10 */ - s64 ts; + aligned_s64 ts; } scan; mutex_lock(&state->lock); From 718df1494811db9d467e5a93c41f85ed0fdbc85e Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Fri, 4 Apr 2025 15:31:16 +0200 Subject: [PATCH 118/529] clocksource/i8253: Use raw_spinlock_irqsave() in clockevent_i8253_disable() [ Upstream commit 94cff94634e506a4a44684bee1875d2dbf782722 ] On x86 during boot, clockevent_i8253_disable() can be invoked via x86_late_time_init -> hpet_time_init() -> pit_timer_init() which happens with enabled interrupts. If some of the old i8253 hardware is actually used then lockdep will notice that i8253_lock is used in hard interrupt context. This causes lockdep to complain because it observed the lock being acquired with interrupts enabled and in hard interrupt context. Make clockevent_i8253_disable() acquire the lock with raw_spinlock_irqsave() to cure this. [ tglx: Massage change log and use guard() ] Fixes: c8c4076723dac ("x86/timer: Skip PIT initialization on modern chipsets") Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lore.kernel.org/all/20250404133116.p-XRWJXf@linutronix.de Signed-off-by: Sasha Levin --- drivers/clocksource/i8253.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/clocksource/i8253.c b/drivers/clocksource/i8253.c index 39f7c2d736d1..b603c25f3dfa 100644 --- a/drivers/clocksource/i8253.c +++ b/drivers/clocksource/i8253.c @@ -103,7 +103,7 @@ int __init clocksource_i8253_init(void) #ifdef CONFIG_CLKEVT_I8253 void clockevent_i8253_disable(void) { - raw_spin_lock(&i8253_lock); + guard(raw_spinlock_irqsave)(&i8253_lock); /* * Writing the MODE register should stop the counter, according to @@ -132,8 +132,6 @@ void clockevent_i8253_disable(void) outb_p(0, PIT_CH0); outb_p(0x30, PIT_MODE); - - raw_spin_unlock(&i8253_lock); } static int pit_shutdown(struct clock_event_device *evt) From 52daccfc3fa68ee1902d52124921453d7a335591 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Sat, 12 Apr 2025 09:57:14 +0200 Subject: [PATCH 119/529] RDMA/rxe: Fix slab-use-after-free Read in rxe_queue_cleanup bug [ Upstream commit f81b33582f9339d2dc17c69b92040d3650bb4bae ] Call Trace: __dump_stack lib/dump_stack.c:94 [inline] dump_stack_lvl+0x7d/0xa0 lib/dump_stack.c:120 print_address_description mm/kasan/report.c:378 [inline] print_report+0xcf/0x610 mm/kasan/report.c:489 kasan_report+0xb5/0xe0 mm/kasan/report.c:602 rxe_queue_cleanup+0xd0/0xe0 drivers/infiniband/sw/rxe/rxe_queue.c:195 rxe_cq_cleanup+0x3f/0x50 drivers/infiniband/sw/rxe/rxe_cq.c:132 __rxe_cleanup+0x168/0x300 drivers/infiniband/sw/rxe/rxe_pool.c:232 rxe_create_cq+0x22e/0x3a0 drivers/infiniband/sw/rxe/rxe_verbs.c:1109 create_cq+0x658/0xb90 drivers/infiniband/core/uverbs_cmd.c:1052 ib_uverbs_create_cq+0xc7/0x120 drivers/infiniband/core/uverbs_cmd.c:1095 ib_uverbs_write+0x969/0xc90 drivers/infiniband/core/uverbs_main.c:679 vfs_write fs/read_write.c:677 [inline] vfs_write+0x26a/0xcc0 fs/read_write.c:659 ksys_write+0x1b8/0x200 fs/read_write.c:731 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xaa/0x1b0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f In the function rxe_create_cq, when rxe_cq_from_init fails, the function rxe_cleanup will be called to handle the allocated resources. In fact, some memory resources have already been freed in the function rxe_cq_from_init. Thus, this problem will occur. The solution is to let rxe_cleanup do all the work. Fixes: 8700e3e7c485 ("Soft RoCE driver") Link: https://paste.ubuntu.com/p/tJgC42wDf6/ Tested-by: liuyi Signed-off-by: Zhu Yanjun Link: https://patch.msgid.link/20250412075714.3257358-1-yanjun.zhu@linux.dev Reviewed-by: Daisuke Matsuda Signed-off-by: Leon Romanovsky Signed-off-by: Sasha Levin --- drivers/infiniband/sw/rxe/rxe_cq.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/infiniband/sw/rxe/rxe_cq.c b/drivers/infiniband/sw/rxe/rxe_cq.c index b1a0ab3cd4bd..43dfc6fd8a3e 100644 --- a/drivers/infiniband/sw/rxe/rxe_cq.c +++ b/drivers/infiniband/sw/rxe/rxe_cq.c @@ -71,11 +71,8 @@ int rxe_cq_from_init(struct rxe_dev *rxe, struct rxe_cq *cq, int cqe, err = do_mmap_info(rxe, uresp ? &uresp->mi : NULL, udata, cq->queue->buf, cq->queue->buf_size, &cq->queue->ip); - if (err) { - vfree(cq->queue->buf); - kfree(cq->queue); + if (err) return err; - } cq->is_user = uresp; From de9b6d0635cc887d2c71f57a3421b6e25715cc54 Mon Sep 17 00:00:00 2001 From: Qasim Ijaz Date: Thu, 27 Mar 2025 23:11:46 +0000 Subject: [PATCH 120/529] HID: thrustmaster: fix memory leak in thrustmaster_interrupts() [ Upstream commit 09d546303b370113323bfff456c4e8cff8756005 ] In thrustmaster_interrupts(), the allocated send_buf is not freed if the usb_check_int_endpoints() check fails, leading to a memory leak. Fix this by ensuring send_buf is freed before returning in the error path. Fixes: 50420d7c79c3 ("HID: hid-thrustmaster: Fix warning in thrustmaster_probe by adding endpoint check") Signed-off-by: Qasim Ijaz Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-thrustmaster.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/hid/hid-thrustmaster.c b/drivers/hid/hid-thrustmaster.c index 3b81468a1df2..0bf70664c35e 100644 --- a/drivers/hid/hid-thrustmaster.c +++ b/drivers/hid/hid-thrustmaster.c @@ -174,6 +174,7 @@ static void thrustmaster_interrupts(struct hid_device *hdev) u8 ep_addr[2] = {b_ep, 0}; if (!usb_check_int_endpoints(usbif, ep_addr)) { + kfree(send_buf); hid_err(hdev, "Unexpected non-int endpoint\n"); return; } From 01b76cc8ca243fc3376b035aa326bbc4f03d384b Mon Sep 17 00:00:00 2001 From: Henry Martin Date: Tue, 1 Apr 2025 17:48:53 +0800 Subject: [PATCH 121/529] HID: uclogic: Add NULL check in uclogic_input_configured() [ Upstream commit bd07f751208ba190f9b0db5e5b7f35d5bb4a8a1e ] devm_kasprintf() returns NULL when memory allocation fails. Currently, uclogic_input_configured() does not check for this case, which results in a NULL pointer dereference. Add NULL check after devm_kasprintf() to prevent this issue. Fixes: dd613a4e45f8 ("HID: uclogic: Correct devm device reference for hidinput input_dev name") Signed-off-by: Henry Martin Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-uclogic-core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hid/hid-uclogic-core.c b/drivers/hid/hid-uclogic-core.c index 39114d5c55a0..5b35f9f321d4 100644 --- a/drivers/hid/hid-uclogic-core.c +++ b/drivers/hid/hid-uclogic-core.c @@ -142,11 +142,12 @@ static int uclogic_input_configured(struct hid_device *hdev, suffix = "System Control"; break; } - } - - if (suffix) + } else { hi->input->name = devm_kasprintf(&hdev->dev, GFP_KERNEL, "%s %s", hdev->name, suffix); + if (!hi->input->name) + return -ENOMEM; + } return 0; } From 85fb7f8ca5f8c138579fdfc9b97b3083e6077d40 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 17 Apr 2025 15:25:08 +0800 Subject: [PATCH 122/529] nfs: handle failure of nfs_get_lock_context in unlock path [ Upstream commit c457dc1ec770a22636b473ce5d35614adfe97636 ] When memory is insufficient, the allocation of nfs_lock_context in nfs_get_lock_context() fails and returns -ENOMEM. If we mistakenly treat an nfs4_unlockdata structure (whose l_ctx member has been set to -ENOMEM) as valid and proceed to execute rpc_run_task(), this will trigger a NULL pointer dereference in nfs4_locku_prepare. For example: BUG: kernel NULL pointer dereference, address: 000000000000000c PGD 0 P4D 0 Oops: Oops: 0000 [#1] SMP PTI CPU: 15 UID: 0 PID: 12 Comm: kworker/u64:0 Not tainted 6.15.0-rc2-dirty #60 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 Workqueue: rpciod rpc_async_schedule RIP: 0010:nfs4_locku_prepare+0x35/0xc2 Code: 89 f2 48 89 fd 48 c7 c7 68 69 ef b5 53 48 8b 8e 90 00 00 00 48 89 f3 RSP: 0018:ffffbbafc006bdb8 EFLAGS: 00010246 RAX: 000000000000004b RBX: ffff9b964fc1fa00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: fffffffffffffff4 RDI: ffff9ba53fddbf40 RBP: ffff9ba539934000 R08: 0000000000000000 R09: ffffbbafc006bc38 R10: ffffffffb6b689c8 R11: 0000000000000003 R12: ffff9ba539934030 R13: 0000000000000001 R14: 0000000004248060 R15: ffffffffb56d1c30 FS: 0000000000000000(0000) GS:ffff9ba5881f0000(0000) knlGS:00000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000000000000c CR3: 000000093f244000 CR4: 00000000000006f0 Call Trace: __rpc_execute+0xbc/0x480 rpc_async_schedule+0x2f/0x40 process_one_work+0x232/0x5d0 worker_thread+0x1da/0x3d0 ? __pfx_worker_thread+0x10/0x10 kthread+0x10d/0x240 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x34/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Modules linked in: CR2: 000000000000000c ---[ end trace 0000000000000000 ]--- Free the allocated nfs4_unlockdata when nfs_get_lock_context() fails and return NULL to terminate subsequent rpc_run_task, preventing NULL pointer dereference. Fixes: f30cb757f680 ("NFS: Always wait for I/O completion before unlock") Signed-off-by: Li Lingfeng Reviewed-by: Jeff Layton Link: https://lore.kernel.org/r/20250417072508.3850532-1-lilingfeng3@huawei.com Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs4proc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 5b06b8d4e014..acef50824d1a 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -6886,10 +6886,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, struct nfs4_unlockdata *p; struct nfs4_state *state = lsp->ls_state; struct inode *inode = state->inode; + struct nfs_lock_context *l_ctx; p = kzalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) return NULL; + l_ctx = nfs_get_lock_context(ctx); + if (!IS_ERR(l_ctx)) { + p->l_ctx = l_ctx; + } else { + kfree(p); + return NULL; + } p->arg.fh = NFS_FH(inode); p->arg.fl = &p->fl; p->arg.seqid = seqid; @@ -6897,7 +6905,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, p->lsp = lsp; /* Ensure we don't close file until we're done freeing locks! */ p->ctx = get_nfs_open_context(ctx); - p->l_ctx = nfs_get_lock_context(ctx); locks_init_lock(&p->fl); locks_copy_lock(&p->fl, fl); p->server = NFS_SERVER(inode); From 915c3de392678f805f06af9371afc103f4f7a1b9 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Fri, 2 May 2025 13:10:35 +0200 Subject: [PATCH 123/529] spi: loopback-test: Do not split 1024-byte hexdumps [ Upstream commit a73fa3690a1f3014d6677e368dce4e70767a6ba2 ] spi_test_print_hex_dump() prints buffers holding less than 1024 bytes in full. Larger buffers are truncated: only the first 512 and the last 512 bytes are printed, separated by a truncation message. The latter is confusing in case the buffer holds exactly 1024 bytes, as all data is printed anyway. Fix this by printing buffers holding up to and including 1024 bytes in full. Fixes: 84e0c4e5e2c4ef42 ("spi: add loopback test driver to allow for spi_master regression tests") Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/37ee1bc90c6554c9347040adabf04188c8f704aa.1746184171.git.geert+renesas@glider.be Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-loopback-test.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/spi/spi-loopback-test.c b/drivers/spi/spi-loopback-test.c index dd7de8fa37d0..ab29ae463f67 100644 --- a/drivers/spi/spi-loopback-test.c +++ b/drivers/spi/spi-loopback-test.c @@ -410,7 +410,7 @@ MODULE_LICENSE("GPL"); static void spi_test_print_hex_dump(char *pre, const void *ptr, size_t len) { /* limit the hex_dump */ - if (len < 1024) { + if (len <= 1024) { print_hex_dump(KERN_INFO, pre, DUMP_PREFIX_OFFSET, 16, 1, ptr, len, 0); From d38939ebe0d992d581acb6885c1723fa83c1fb2c Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Tue, 6 May 2025 21:35:58 -0700 Subject: [PATCH 124/529] net_sched: Flush gso_skb list too during ->change() [ Upstream commit 2d3cbfd6d54a2c39ce3244f33f85c595844bd7b8 ] Previously, when reducing a qdisc's limit via the ->change() operation, only the main skb queue was trimmed, potentially leaving packets in the gso_skb list. This could result in NULL pointer dereference when we only check sch->limit against sch->q.qlen. This patch introduces a new helper, qdisc_dequeue_internal(), which ensures both the gso_skb list and the main queue are properly flushed when trimming excess packets. All relevant qdiscs (codel, fq, fq_codel, fq_pie, hhf, pie) are updated to use this helper in their ->change() routines. Fixes: 76e3cc126bb2 ("codel: Controlled Delay AQM") Fixes: 4b549a2ef4be ("fq_codel: Fair Queue Codel AQM") Fixes: afe4fd062416 ("pkt_sched: fq: Fair Queue packet scheduler") Fixes: ec97ecf1ebe4 ("net: sched: add Flow Queue PIE packet scheduler") Fixes: 10239edf86f1 ("net-qdisc-hhf: Heavy-Hitter Filter (HHF) qdisc") Fixes: d4b36210c2e6 ("net: pkt_sched: PIE AQM scheme") Reported-by: Will Reported-by: Savy Signed-off-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/sch_generic.h | 15 +++++++++++++++ net/sched/sch_codel.c | 2 +- net/sched/sch_fq.c | 2 +- net/sched/sch_fq_codel.c | 2 +- net/sched/sch_fq_pie.c | 2 +- net/sched/sch_hhf.c | 2 +- net/sched/sch_pie.c | 2 +- 7 files changed, 21 insertions(+), 6 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 80f657bf2e04..b34e9e93a146 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -997,6 +997,21 @@ static inline struct sk_buff *__qdisc_dequeue_head(struct qdisc_skb_head *qh) return skb; } +static inline struct sk_buff *qdisc_dequeue_internal(struct Qdisc *sch, bool direct) +{ + struct sk_buff *skb; + + skb = __skb_dequeue(&sch->gso_skb); + if (skb) { + sch->q.qlen--; + return skb; + } + if (direct) + return __qdisc_dequeue_head(&sch->q); + else + return sch->dequeue(sch); +} + static inline struct sk_buff *qdisc_dequeue_head(struct Qdisc *sch) { struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c index 5f2e06815745..63c02040b426 100644 --- a/net/sched/sch_codel.c +++ b/net/sched/sch_codel.c @@ -168,7 +168,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt, qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c index f59a2cb2c803..91f5ef6be0f2 100644 --- a/net/sched/sch_fq.c +++ b/net/sched/sch_fq.c @@ -901,7 +901,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt, sch_tree_lock(sch); } while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); if (!skb) break; diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index 9330923a624c..47b5a056165c 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -431,7 +431,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, while (sch->q.qlen > sch->limit || q->memory_usage > q->memory_limit) { - struct sk_buff *skb = fq_codel_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); q->cstats.drop_len += qdisc_pkt_len(skb); rtnl_kfree_skbs(skb, skb); diff --git a/net/sched/sch_fq_pie.c b/net/sched/sch_fq_pie.c index 68e6acd0f130..607c580d75e4 100644 --- a/net/sched/sch_fq_pie.c +++ b/net/sched/sch_fq_pie.c @@ -357,7 +357,7 @@ static int fq_pie_change(struct Qdisc *sch, struct nlattr *opt, /* Drop excess packets if new limit is lower */ while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = fq_pie_qdisc_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); len_dropped += qdisc_pkt_len(skb); num_dropped += 1; diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c index d26cd436cbe3..83fc44f20e31 100644 --- a/net/sched/sch_hhf.c +++ b/net/sched/sch_hhf.c @@ -560,7 +560,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt, qlen = sch->q.qlen; prev_backlog = sch->qstats.backlog; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = hhf_dequeue(sch); + struct sk_buff *skb = qdisc_dequeue_internal(sch, false); rtnl_kfree_skbs(skb, skb); } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index b60b31ef71cc..e1bb151a9719 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -190,7 +190,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt, /* Drop excess packets if new limit is lower */ qlen = sch->q.qlen; while (sch->q.qlen > sch->limit) { - struct sk_buff *skb = __qdisc_dequeue_head(&sch->q); + struct sk_buff *skb = qdisc_dequeue_internal(sch, true); dropped += qdisc_pkt_len(skb); qdisc_qstats_backlog_dec(sch, skb); From 1cb9a891cf16cc91c42ea3a6da1bfcd724e4e623 Mon Sep 17 00:00:00 2001 From: Andrew Jeffery Date: Thu, 8 May 2025 14:16:00 +0930 Subject: [PATCH 125/529] net: mctp: Ensure keys maintain only one ref to corresponding dev [ Upstream commit e4f349bd6e58051df698b82f94721f18a02a293d ] mctp_flow_prepare_output() is called in mctp_route_output(), which places outbound packets onto a given interface. The packet may represent a message fragment, in which case we provoke an unbalanced reference count to the underlying device. This causes trouble if we ever attempt to remove the interface: [ 48.702195] usb 1-1: USB disconnect, device number 2 [ 58.883056] unregister_netdevice: waiting for mctpusb0 to become free. Usage count = 2 [ 69.022548] unregister_netdevice: waiting for mctpusb0 to become free. Usage count = 2 [ 79.172568] unregister_netdevice: waiting for mctpusb0 to become free. Usage count = 2 ... Predicate the invocation of mctp_dev_set_key() in mctp_flow_prepare_output() on not already having associated the device with the key. It's not yet realistic to uphold the property that the key maintains only one device reference earlier in the transmission sequence as the route (and therefore the device) may not be known at the time the key is associated with the socket. Fixes: 67737c457281 ("mctp: Pass flow data & flow release events to drivers") Acked-by: Jeremy Kerr Signed-off-by: Andrew Jeffery Link: https://patch.msgid.link/20250508-mctp-dev-refcount-v1-1-d4f965c67bb5@codeconstruct.com.au Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/mctp/route.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/mctp/route.c b/net/mctp/route.c index e72cdd4ce588..62952ad5cb63 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -274,8 +274,10 @@ static void mctp_flow_prepare_output(struct sk_buff *skb, struct mctp_dev *dev) key = flow->key; - if (WARN_ON(key->dev && key->dev != dev)) + if (key->dev) { + WARN_ON(key->dev != dev); return; + } mctp_dev_set_key(dev, key); } From aace6b63892ce8307e502a60fe2f5a4bc6e1cfe7 Mon Sep 17 00:00:00 2001 From: Mathieu Othacehe Date: Fri, 9 May 2025 14:19:35 +0200 Subject: [PATCH 126/529] net: cadence: macb: Fix a possible deadlock in macb_halt_tx. [ Upstream commit c92d6089d8ad7d4d815ebcedee3f3907b539ff1f ] There is a situation where after THALT is set high, TGO stays high as well. Because jiffies are never updated, as we are in a context with interrupts disabled, we never exit that loop and have a deadlock. That deadlock was noticed on a sama5d4 device that stayed locked for days. Use retries instead of jiffies so that the timeout really works and we do not have a deadlock anymore. Fixes: e86cd53afc590 ("net/macb: better manage tx errors") Signed-off-by: Mathieu Othacehe Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250509121935.16282-1-othacehe@gnu.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/cadence/macb_main.c | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb_main.c b/drivers/net/ethernet/cadence/macb_main.c index fc3342944dbc..d2f4709dee0d 100644 --- a/drivers/net/ethernet/cadence/macb_main.c +++ b/drivers/net/ethernet/cadence/macb_main.c @@ -962,22 +962,15 @@ static void macb_update_stats(struct macb *bp) static int macb_halt_tx(struct macb *bp) { - unsigned long halt_time, timeout; - u32 status; + u32 status; macb_writel(bp, NCR, macb_readl(bp, NCR) | MACB_BIT(THALT)); - timeout = jiffies + usecs_to_jiffies(MACB_HALT_TIMEOUT); - do { - halt_time = jiffies; - status = macb_readl(bp, TSR); - if (!(status & MACB_BIT(TGO))) - return 0; - - udelay(250); - } while (time_before(halt_time, timeout)); - - return -ETIMEDOUT; + /* Poll TSR until TGO is cleared or timeout. */ + return read_poll_timeout_atomic(macb_readl, status, + !(status & MACB_BIT(TGO)), + 250, MACB_HALT_TIMEOUT, false, + bp, TSR); } static void macb_tx_unmap(struct macb *bp, struct macb_tx_skb *tx_skb, int budget) From 4626234ca316386beb707abaed53564c77b459bb Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Fri, 9 May 2025 14:38:16 +0300 Subject: [PATCH 127/529] net: dsa: sja1105: discard incoming frames in BR_STATE_LISTENING [ Upstream commit 498625a8ab2c8e1c9ab5105744310e8d6952cc01 ] It has been reported that when under a bridge with stp_state=1, the logs get spammed with this message: [ 251.734607] fsl_dpaa2_eth dpni.5 eth0: Couldn't decode source port Further debugging shows the following info associated with packets: source_port=-1, switch_id=-1, vid=-1, vbid=1 In other words, they are data plane packets which are supposed to be decoded by dsa_tag_8021q_find_port_by_vbid(), but the latter (correctly) refuses to do so, because no switch port is currently in BR_STATE_LEARNING or BR_STATE_FORWARDING - so the packet is effectively unexpected. The error goes away after the port progresses to BR_STATE_LEARNING in 15 seconds (the default forward_time of the bridge), because then, dsa_tag_8021q_find_port_by_vbid() can correctly associate the data plane packets with a plausible bridge port in a plausible STP state. Re-reading IEEE 802.1D-1990, I see the following: "4.4.2 Learning: (...) The Forwarding Process shall discard received frames." IEEE 802.1D-2004 further clarifies: "DISABLED, BLOCKING, LISTENING, and BROKEN all correspond to the DISCARDING port state. While those dot1dStpPortStates serve to distinguish reasons for discarding frames, the operation of the Forwarding and Learning processes is the same for all of them. (...) LISTENING represents a port that the spanning tree algorithm has selected to be part of the active topology (computing a Root Port or Designated Port role) but is temporarily discarding frames to guard against loops or incorrect learning." Well, this is not what the driver does - instead it sets mac[port].ingress = true. To get rid of the log spam, prevent unexpected data plane packets to be received by software by discarding them on ingress in the LISTENING state. In terms of blame attribution: the prints only date back to commit d7f9787a763f ("net: dsa: tag_8021q: add support for imprecise RX based on the VBID"). However, the settings would permit a LISTENING port to forward to a FORWARDING port, and the standard suggests that's not OK. Fixes: 640f763f98c2 ("net: dsa: sja1105: Add support for Spanning Tree Protocol") Signed-off-by: Vladimir Oltean Link: https://patch.msgid.link/20250509113816.2221992-1-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/dsa/sja1105/sja1105_main.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/net/dsa/sja1105/sja1105_main.c b/drivers/net/dsa/sja1105/sja1105_main.c index f1f1368e8146..d51d982c4bc0 100644 --- a/drivers/net/dsa/sja1105/sja1105_main.c +++ b/drivers/net/dsa/sja1105/sja1105_main.c @@ -2083,6 +2083,7 @@ static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port, switch (state) { case BR_STATE_DISABLED: case BR_STATE_BLOCKING: + case BR_STATE_LISTENING: /* From UM10944 description of DRPDTAG (why put this there?): * "Management traffic flows to the port regardless of the state * of the INGRESS flag". So BPDUs are still be allowed to pass. @@ -2092,11 +2093,6 @@ static void sja1105_bridge_stp_state_set(struct dsa_switch *ds, int port, mac[port].egress = false; mac[port].dyn_learn = false; break; - case BR_STATE_LISTENING: - mac[port].ingress = true; - mac[port].egress = false; - mac[port].dyn_learn = false; - break; case BR_STATE_LEARNING: mac[port].ingress = true; mac[port].egress = false; From 6fc167d45fc3c871a3fd20c0cb8a647a46336799 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 6 May 2025 20:35:40 -0700 Subject: [PATCH 128/529] nvme-pci: make nvme_pci_npages_prp() __always_inline [ Upstream commit 40696426b8c8c4f13cf6ac52f0470eed144be4b2 ] The only reason nvme_pci_npages_prp() could be used as a compile-time known result in BUILD_BUG_ON() is because the compiler was always choosing to inline the function. Under special circumstances (sanitizer coverage functions disabled for __init functions on ARCH=um), the compiler decided to stop inlining it: drivers/nvme/host/pci.c: In function 'nvme_init': include/linux/compiler_types.h:557:45: error: call to '__compiletime_assert_678' declared with attribute error: BUILD_BUG_ON failed: nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS 557 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^ include/linux/compiler_types.h:538:25: note: in definition of macro '__compiletime_assert' 538 | prefix ## suffix(); \ | ^~~~~~ include/linux/compiler_types.h:557:9: note: in expansion of macro '_compiletime_assert' 557 | _compiletime_assert(condition, msg, __compiletime_assert_, __COUNTER__) | ^~~~~~~~~~~~~~~~~~~ include/linux/build_bug.h:39:37: note: in expansion of macro 'compiletime_assert' 39 | #define BUILD_BUG_ON_MSG(cond, msg) compiletime_assert(!(cond), msg) | ^~~~~~~~~~~~~~~~~~ include/linux/build_bug.h:50:9: note: in expansion of macro 'BUILD_BUG_ON_MSG' 50 | BUILD_BUG_ON_MSG(condition, "BUILD_BUG_ON failed: " #condition) | ^~~~~~~~~~~~~~~~ drivers/nvme/host/pci.c:3804:9: note: in expansion of macro 'BUILD_BUG_ON' 3804 | BUILD_BUG_ON(nvme_pci_npages_prp() > NVME_MAX_NR_ALLOCATIONS); | ^~~~~~~~~~~~ Force it to be __always_inline to make sure it is always available for use with BUILD_BUG_ON(). Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505061846.12FMyRjj-lkp@intel.com/ Fixes: c372cdd1efdf ("nvme-pci: iod npages fits in s8") Signed-off-by: Kees Cook Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index da858463b255..553e6bb461b3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -376,7 +376,7 @@ static bool nvme_dbbuf_update_and_check_event(u16 value, __le32 *dbbuf_db, * as it only leads to a small amount of wasted memory for the lifetime of * the I/O. */ -static int nvme_pci_npages_prp(void) +static __always_inline int nvme_pci_npages_prp(void) { unsigned max_bytes = (NVME_MAX_KB_SZ * 1024) + NVME_CTRL_PAGE_SIZE; unsigned nprps = DIV_ROUND_UP(max_bytes, NVME_CTRL_PAGE_SIZE); From 960946915988c5ffb9becb35771a9254ecc95527 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 8 May 2025 16:57:06 +0200 Subject: [PATCH 129/529] nvme-pci: acquire cq_poll_lock in nvme_poll_irqdisable [ Upstream commit 3d8932133dcecbd9bef1559533c1089601006f45 ] We need to lock this queue for that condition because the timeout work executes per-namespace and can poll the poll CQ. Reported-by: Hannes Reinecke Closes: https://lore.kernel.org/all/20240902130728.1999-1-hare@kernel.org/ Fixes: a0fa9647a54e ("NVMe: add blk polling support") Signed-off-by: Keith Busch Signed-off-by: Daniel Wagner Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 553e6bb461b3..49a3cb8f1f10 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1154,7 +1154,9 @@ static void nvme_poll_irqdisable(struct nvme_queue *nvmeq) WARN_ON_ONCE(test_bit(NVMEQ_POLLED, &nvmeq->flags)); disable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); + spin_lock(&nvmeq->cq_poll_lock); nvme_poll_cq(nvmeq, NULL); + spin_unlock(&nvmeq->cq_poll_lock); enable_irq(pci_irq_vector(pdev, nvmeq->cq_vector)); } From 6fbcfa3691f7500e51e8193f5283e1eb98a1a34c Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 13 May 2025 09:31:04 +0200 Subject: [PATCH 130/529] ALSA: sh: SND_AICA should depend on SH_DMA_API [ Upstream commit 66e48ef6ef506c89ec1b3851c6f9f5f80b5835ff ] If CONFIG_SH_DMA_API=n: WARNING: unmet direct dependencies detected for G2_DMA Depends on [n]: SH_DREAMCAST [=y] && SH_DMA_API [=n] Selected by [y]: - SND_AICA [=y] && SOUND [=y] && SND [=y] && SND_SUPERH [=y] && SH_DREAMCAST [=y] SND_AICA selects G2_DMA. As the latter depends on SH_DMA_API, the former should depend on SH_DMA_API, too. Fixes: f477a538c14d07f8 ("sh: dma: fix kconfig dependency for G2_DMA") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505131320.PzgTtl9H-lkp@intel.com/ Signed-off-by: Geert Uytterhoeven Link: https://patch.msgid.link/b90625f8a9078d0d304bafe862cbe3a3fab40082.1747121335.git.geert+renesas@glider.be Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/sh/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/sh/Kconfig b/sound/sh/Kconfig index b75fbb3236a7..f5fa09d740b4 100644 --- a/sound/sh/Kconfig +++ b/sound/sh/Kconfig @@ -14,7 +14,7 @@ if SND_SUPERH config SND_AICA tristate "Dreamcast Yamaha AICA sound" - depends on SH_DREAMCAST + depends on SH_DREAMCAST && SH_DMA_API select SND_PCM select G2_DMA help From 1e577aeb51e9deba4f2c10edfcb07cb3cb406598 Mon Sep 17 00:00:00 2001 From: Carolina Jubran Date: Sun, 11 May 2025 13:15:52 +0300 Subject: [PATCH 131/529] net/mlx5e: Disable MACsec offload for uplink representor profile [ Upstream commit 588431474eb7572e57a927fa8558c9ba2f8af143 ] MACsec offload is not supported in switchdev mode for uplink representors. When switching to the uplink representor profile, the MACsec offload feature must be cleared from the netdevice's features. If left enabled, attempts to add offloads result in a null pointer dereference, as the uplink representor does not support MACsec offload even though the feature bit remains set. Clear NETIF_F_HW_MACSEC in mlx5e_fix_uplink_rep_features(). Kernel log: Oops: general protection fault, probably for non-canonical address 0xdffffc000000000f: 0000 [#1] SMP KASAN KASAN: null-ptr-deref in range [0x0000000000000078-0x000000000000007f] CPU: 29 UID: 0 PID: 4714 Comm: ip Not tainted 6.14.0-rc4_for_upstream_debug_2025_03_02_17_35 #1 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 RIP: 0010:__mutex_lock+0x128/0x1dd0 Code: d0 7c 08 84 d2 0f 85 ad 15 00 00 8b 35 91 5c fe 03 85 f6 75 29 49 8d 7e 60 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 a6 15 00 00 4d 3b 76 60 0f 85 fd 0b 00 00 65 ff RSP: 0018:ffff888147a4f160 EFLAGS: 00010206 RAX: dffffc0000000000 RBX: 0000000000000000 RCX: 0000000000000001 RDX: 000000000000000f RSI: 0000000000000000 RDI: 0000000000000078 RBP: ffff888147a4f2e0 R08: ffffffffa05d2c19 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: dffffc0000000000 R14: 0000000000000018 R15: ffff888152de0000 FS: 00007f855e27d800(0000) GS:ffff88881ee80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004e5768 CR3: 000000013ae7c005 CR4: 0000000000372eb0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe07f0 DR7: 0000000000000400 Call Trace: ? die_addr+0x3d/0xa0 ? exc_general_protection+0x144/0x220 ? asm_exc_general_protection+0x22/0x30 ? mlx5e_macsec_add_secy+0xf9/0x700 [mlx5_core] ? __mutex_lock+0x128/0x1dd0 ? lockdep_set_lock_cmp_fn+0x190/0x190 ? mlx5e_macsec_add_secy+0xf9/0x700 [mlx5_core] ? mutex_lock_io_nested+0x1ae0/0x1ae0 ? lock_acquire+0x1c2/0x530 ? macsec_upd_offload+0x145/0x380 ? lockdep_hardirqs_on_prepare+0x400/0x400 ? kasan_save_stack+0x30/0x40 ? kasan_save_stack+0x20/0x40 ? kasan_save_track+0x10/0x30 ? __kasan_kmalloc+0x77/0x90 ? __kmalloc_noprof+0x249/0x6b0 ? genl_family_rcv_msg_attrs_parse.constprop.0+0xb5/0x240 ? mlx5e_macsec_add_secy+0xf9/0x700 [mlx5_core] mlx5e_macsec_add_secy+0xf9/0x700 [mlx5_core] ? mlx5e_macsec_add_rxsa+0x11a0/0x11a0 [mlx5_core] macsec_update_offload+0x26c/0x820 ? macsec_set_mac_address+0x4b0/0x4b0 ? lockdep_hardirqs_on_prepare+0x284/0x400 ? _raw_spin_unlock_irqrestore+0x47/0x50 macsec_upd_offload+0x2c8/0x380 ? macsec_update_offload+0x820/0x820 ? __nla_parse+0x22/0x30 ? genl_family_rcv_msg_attrs_parse.constprop.0+0x15e/0x240 genl_family_rcv_msg_doit+0x1cc/0x2a0 ? genl_family_rcv_msg_attrs_parse.constprop.0+0x240/0x240 ? cap_capable+0xd4/0x330 genl_rcv_msg+0x3ea/0x670 ? genl_family_rcv_msg_dumpit+0x2a0/0x2a0 ? lockdep_set_lock_cmp_fn+0x190/0x190 ? macsec_update_offload+0x820/0x820 netlink_rcv_skb+0x12b/0x390 ? genl_family_rcv_msg_dumpit+0x2a0/0x2a0 ? netlink_ack+0xd80/0xd80 ? rwsem_down_read_slowpath+0xf90/0xf90 ? netlink_deliver_tap+0xcd/0xac0 ? netlink_deliver_tap+0x155/0xac0 ? _copy_from_iter+0x1bb/0x12c0 genl_rcv+0x24/0x40 netlink_unicast+0x440/0x700 ? netlink_attachskb+0x760/0x760 ? lock_acquire+0x1c2/0x530 ? __might_fault+0xbb/0x170 netlink_sendmsg+0x749/0xc10 ? netlink_unicast+0x700/0x700 ? __might_fault+0xbb/0x170 ? netlink_unicast+0x700/0x700 __sock_sendmsg+0xc5/0x190 ____sys_sendmsg+0x53f/0x760 ? import_iovec+0x7/0x10 ? kernel_sendmsg+0x30/0x30 ? __copy_msghdr+0x3c0/0x3c0 ? filter_irq_stacks+0x90/0x90 ? stack_depot_save_flags+0x28/0xa30 ___sys_sendmsg+0xeb/0x170 ? kasan_save_stack+0x30/0x40 ? copy_msghdr_from_user+0x110/0x110 ? do_syscall_64+0x6d/0x140 ? lock_acquire+0x1c2/0x530 ? __virt_addr_valid+0x116/0x3b0 ? __virt_addr_valid+0x1da/0x3b0 ? lock_downgrade+0x680/0x680 ? __delete_object+0x21/0x50 __sys_sendmsg+0xf7/0x180 ? __sys_sendmsg_sock+0x20/0x20 ? kmem_cache_free+0x14c/0x4e0 ? __x64_sys_close+0x78/0xd0 do_syscall_64+0x6d/0x140 entry_SYSCALL_64_after_hwframe+0x4b/0x53 RIP: 0033:0x7f855e113367 Code: 0e 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb b9 0f 1f 00 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 28 89 54 24 1c 48 89 74 24 10 RSP: 002b:00007ffd15e90c88 EFLAGS: 00000246 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007f855e113367 RDX: 0000000000000000 RSI: 00007ffd15e90cf0 RDI: 0000000000000004 RBP: 00007ffd15e90dbc R08: 0000000000000028 R09: 000000000045d100 R10: 00007f855e011dd8 R11: 0000000000000246 R12: 0000000000000019 R13: 0000000067c6b785 R14: 00000000004a1e80 R15: 0000000000000000 Modules linked in: 8021q garp mrp sch_ingress openvswitch nsh mlx5_ib mlx5_fwctl mlx5_dpll mlx5_core rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay zram zsmalloc fuse [last unloaded: mlx5_core] ---[ end trace 0000000000000000 ]--- Fixes: 8ff0ac5be144 ("net/mlx5: Add MACsec offload Tx command support") Signed-off-by: Carolina Jubran Reviewed-by: Shahar Shitrit Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Reviewed-by: Simon Horman Link: https://patch.msgid.link/1746958552-561295-1-git-send-email-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f520949b2024..887d44635400 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -4023,6 +4023,10 @@ static netdev_features_t mlx5e_fix_uplink_rep_features(struct net_device *netdev if (netdev->features & NETIF_F_HW_VLAN_CTAG_FILTER) netdev_warn(netdev, "Disabling HW_VLAN CTAG FILTERING, not supported in switchdev mode\n"); + features &= ~NETIF_F_HW_MACSEC; + if (netdev->features & NETIF_F_HW_MACSEC) + netdev_warn(netdev, "Disabling HW MACsec offload, not supported in switchdev mode\n"); + return features; } From 10ea81e478be4b2bbd86a2876473b0991258066f Mon Sep 17 00:00:00 2001 From: Abdun Nihaal Date: Mon, 12 May 2025 10:18:27 +0530 Subject: [PATCH 132/529] qlcnic: fix memory leak in qlcnic_sriov_channel_cfg_cmd() [ Upstream commit 9d8a99c5a7c7f4f7eca2c168a4ec254409670035 ] In one of the error paths in qlcnic_sriov_channel_cfg_cmd(), the memory allocated in qlcnic_sriov_alloc_bc_mbx_args() for mailbox arguments is not freed. Fix that by jumping to the error path that frees them, by calling qlcnic_free_mbx_args(). This was found using static analysis. Fixes: f197a7aa6288 ("qlcnic: VF-PF communication channel implementation") Signed-off-by: Abdun Nihaal Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250512044829.36400-1-abdun.nihaal@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c index 28d24d59efb8..d57b976b9040 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_sriov_common.c @@ -1484,8 +1484,11 @@ static int qlcnic_sriov_channel_cfg_cmd(struct qlcnic_adapter *adapter, u8 cmd_o } cmd_op = (cmd.rsp.arg[0] & 0xff); - if (cmd.rsp.arg[0] >> 25 == 2) - return 2; + if (cmd.rsp.arg[0] >> 25 == 2) { + ret = 2; + goto out; + } + if (cmd_op == QLCNIC_BC_CMD_CHANNEL_INIT) set_bit(QLC_BC_VF_STATE, &vf->state); else From 6ba30f7aa2c550b2ac04f16b81a19a8c045b8660 Mon Sep 17 00:00:00 2001 From: Cosmin Tanislav Date: Thu, 8 May 2025 09:49:43 +0300 Subject: [PATCH 133/529] regulator: max20086: fix invalid memory access [ Upstream commit 6b0cd72757c69bc2d45da42b41023e288d02e772 ] max20086_parse_regulators_dt() calls of_regulator_match() using an array of struct of_regulator_match allocated on the stack for the matches argument. of_regulator_match() calls devm_of_regulator_put_matches(), which calls devres_alloc() to allocate a struct devm_of_regulator_matches which will be de-allocated using devm_of_regulator_put_matches(). struct devm_of_regulator_matches is populated with the stack allocated matches array. If the device fails to probe, devm_of_regulator_put_matches() will be called and will try to call of_node_put() on that stack pointer, generating the following dmesg entries: max20086 6-0028: Failed to read DEVICE_ID reg: -121 kobject: '\xc0$\xa5\x03' (000000002cebcb7a): is not initialized, yet kobject_put() is being called. Followed by a stack trace matching the call flow described above. Switch to allocating the matches array using devm_kcalloc() to avoid accessing the stack pointer long after it's out of scope. This also has the advantage of allowing multiple max20086 to probe without overriding the data stored inside the global of_regulator_match. Fixes: bfff546aae50 ("regulator: Add MAX20086-MAX20089 driver") Signed-off-by: Cosmin Tanislav Link: https://patch.msgid.link/20250508064947.2567255-1-demonsingur@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/regulator/max20086-regulator.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/drivers/regulator/max20086-regulator.c b/drivers/regulator/max20086-regulator.c index b8bf76c170fe..332fb58f9095 100644 --- a/drivers/regulator/max20086-regulator.c +++ b/drivers/regulator/max20086-regulator.c @@ -133,7 +133,7 @@ static int max20086_regulators_register(struct max20086 *chip) static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on) { - struct of_regulator_match matches[MAX20086_MAX_REGULATORS] = { }; + struct of_regulator_match *matches; struct device_node *node; unsigned int i; int ret; @@ -144,6 +144,11 @@ static int max20086_parse_regulators_dt(struct max20086 *chip, bool *boot_on) return -ENODEV; } + matches = devm_kcalloc(chip->dev, chip->info->num_outputs, + sizeof(*matches), GFP_KERNEL); + if (!matches) + return -ENOMEM; + for (i = 0; i < chip->info->num_outputs; ++i) matches[i].name = max20086_output_names[i]; From 2c09d6460fe9deac3e2262331f52b1d89646bd8c Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Mon, 12 May 2025 18:12:36 +0530 Subject: [PATCH 134/529] octeontx2-pf: macsec: Fix incorrect max transmit size in TX secy [ Upstream commit 865ab2461375e3a5a2526f91f9a9f17b8931bc9e ] MASCEC hardware block has a field called maximum transmit size for TX secy. Max packet size going out of MCS block has be programmed taking into account full packet size which has L2 header,SecTag and ICV. MACSEC offload driver is configuring max transmit size as macsec interface MTU which is incorrect. Say with 1500 MTU of real device, macsec interface created on top of real device will have MTU of 1468(1500 - (SecTag + ICV)). This is causing packets from macsec interface of size greater than or equal to 1468 are not getting transmitted out because driver programmed max transmit size as 1468 instead of 1514(1500 + ETH_HDR_LEN). Fixes: c54ffc73601c ("octeontx2-pf: mcs: Introduce MACSEC hardware offloading") Signed-off-by: Subbaraya Sundeep Reviewed-by: Simon Horman Link: https://patch.msgid.link/1747053756-4529-1-git-send-email-sbhatta@marvell.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c index a487a98eac88..6da8d8f2a870 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k_macsec.c @@ -428,7 +428,8 @@ static int cn10k_mcs_write_tx_secy(struct otx2_nic *pfvf, if (sw_tx_sc->encrypt) sectag_tci |= (MCS_TCI_E | MCS_TCI_C); - policy = FIELD_PREP(MCS_TX_SECY_PLCY_MTU, secy->netdev->mtu); + policy = FIELD_PREP(MCS_TX_SECY_PLCY_MTU, + pfvf->netdev->mtu + OTX2_ETH_HLEN); /* Write SecTag excluding AN bits(1..0) */ policy |= FIELD_PREP(MCS_TX_SECY_PLCY_ST_TCI, sectag_tci >> 2); policy |= FIELD_PREP(MCS_TX_SECY_PLCY_ST_OFFSET, tag_offset); From 8f7f96549bc55e4ef3a6b499bc5011e5de2f46c4 Mon Sep 17 00:00:00 2001 From: Pengtao He Date: Wed, 14 May 2025 21:20:13 +0800 Subject: [PATCH 135/529] net/tls: fix kernel panic when alloc_page failed [ Upstream commit 491deb9b8c4ad12fe51d554a69b8165b9ef9429f ] We cannot set frag_list to NULL pointer when alloc_page failed. It will be used in tls_strp_check_queue_ok when the next time tls_strp_read_sock is called. This is because we don't reset full_len in tls_strp_flush_anchor_copy() so the recv path will try to continue handling the partial record on the next call but we dettached the rcvq from the frag list. Alternative fix would be to reset full_len. Unable to handle kernel NULL pointer dereference at virtual address 0000000000000028 Call trace: tls_strp_check_rcv+0x128/0x27c tls_strp_data_ready+0x34/0x44 tls_data_ready+0x3c/0x1f0 tcp_data_ready+0x9c/0xe4 tcp_data_queue+0xf6c/0x12d0 tcp_rcv_established+0x52c/0x798 Fixes: 84c61fe1a75b ("tls: rx: do not use the standard strparser") Signed-off-by: Pengtao He Link: https://patch.msgid.link/20250514132013.17274-1-hept.hept.hept@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/tls/tls_strp.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index f37f4a0fcd3c..44d7f1aef9f1 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -396,7 +396,6 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) return 0; shinfo = skb_shinfo(strp->anchor); - shinfo->frag_list = NULL; /* If we don't know the length go max plus page for cipher overhead */ need_spc = strp->stm.full_len ?: TLS_MAX_PAYLOAD_SIZE + PAGE_SIZE; @@ -412,6 +411,8 @@ static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) page, 0, 0); } + shinfo->frag_list = NULL; + strp->copy_mode = 1; strp->stm.offset = 0; From 8f2eb3adb39498ec7171d24a42d4dcbfaed71cb6 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 10 May 2025 10:50:13 -0400 Subject: [PATCH 136/529] NFSv4/pnfs: Reset the layout state after a layoutreturn [ Upstream commit 6d6d7f91cc8c111d40416ac9240a3bb9396c5235 ] If there are still layout segments in the layout plh_return_lsegs list after a layout return, we should be resetting the state to ensure they eventually get returned as well. Fixes: 68f744797edd ("pNFS: Do not free layout segments that are marked for return") Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/pnfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index f68286932019..fe0ddbce3bcb 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -732,6 +732,14 @@ pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo, return remaining; } +static void pnfs_reset_return_info(struct pnfs_layout_hdr *lo) +{ + struct pnfs_layout_segment *lseg; + + list_for_each_entry(lseg, &lo->plh_return_segs, pls_list) + pnfs_set_plh_return_info(lo, lseg->pls_range.iomode, 0); +} + static void pnfs_free_returned_lsegs(struct pnfs_layout_hdr *lo, struct list_head *free_me, @@ -1180,6 +1188,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo, pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq); pnfs_free_returned_lsegs(lo, &freeme, range, seq); pnfs_set_layout_stateid(lo, stateid, NULL, true); + pnfs_reset_return_info(lo); } else pnfs_mark_layout_stateid_invalid(lo, &freeme); out_unlock: From 02a11d863864065cfc25c4a71995917c278ac41d Mon Sep 17 00:00:00 2001 From: Nathan Lynch Date: Thu, 3 Apr 2025 11:24:19 -0500 Subject: [PATCH 137/529] dmaengine: Revert "dmaengine: dmatest: Fix dmatest waiting less when interrupted" commit df180e65305f8c1e020d54bfc2132349fd693de1 upstream. Several issues with this change: * The analysis is flawed and it's unclear what problem is being fixed. There is no difference between wait_event_freezable_timeout() and wait_event_timeout() with respect to device interrupts. And of course "the interrupt notifying the finish of an operation happens during wait_event_freezable_timeout()" -- that's how it's supposed to work. * The link at the "Closes:" tag appears to be an unrelated use-after-free in idxd. * It introduces a regression: dmatest threads are meant to be freezable and this change breaks that. See discussion here: https://lore.kernel.org/dmaengine/878qpa13fe.fsf@AUSNATLYNCH.amd.com/ Fixes: e87ca16e9911 ("dmaengine: dmatest: Fix dmatest waiting less when interrupted") Signed-off-by: Nathan Lynch Link: https://lore.kernel.org/r/20250403-dmaengine-dmatest-revert-waiting-less-v1-1-8227c5a3d7c8@amd.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/dmatest.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index 78b8a97b2363..ffe621695e47 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -827,9 +827,9 @@ static int dmatest_func(void *data) } else { dma_async_issue_pending(chan); - wait_event_timeout(thread->done_wait, - done->done, - msecs_to_jiffies(params->timeout)); + wait_event_freezable_timeout(thread->done_wait, + done->done, + msecs_to_jiffies(params->timeout)); status = dma_async_is_tx_complete(chan, cookie, NULL, NULL); From 0567e7951fbd0c75c94223ff0f05427e90687339 Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Wed, 14 May 2025 22:17:43 +0800 Subject: [PATCH 138/529] LoongArch: Fix MAX_REG_OFFSET calculation commit 90436d234230e9a950ccd87831108b688b27a234 upstream. Fix MAX_REG_OFFSET calculation, make it point to the last register in 'struct pt_regs' and not to the marker itself, which could allow regs_get_register() to return an invalid offset. Cc: stable@vger.kernel.org Fixes: 803b0fc5c3f2baa6e5 ("LoongArch: Add process management") Signed-off-by: Huacai Chen Signed-off-by: Greg Kroah-Hartman --- arch/loongarch/include/asm/ptrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/include/asm/ptrace.h b/arch/loongarch/include/asm/ptrace.h index f5d9096d18aa..af86241ae08f 100644 --- a/arch/loongarch/include/asm/ptrace.h +++ b/arch/loongarch/include/asm/ptrace.h @@ -54,7 +54,7 @@ static inline void instruction_pointer_set(struct pt_regs *regs, unsigned long v /* Query offset/name of register from its name/offset */ extern int regs_query_register_offset(const char *name); -#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last)) +#define MAX_REG_OFFSET (offsetof(struct pt_regs, __last) - sizeof(unsigned long)) /** * regs_get_register() - get register value from its offset From 0f035835b48c149ec19a0dfd7e3cb0a2800ea874 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 5 May 2025 16:03:16 +0100 Subject: [PATCH 139/529] btrfs: fix discard worker infinite loop after disabling discard commit 54db6d1bdd71fa90172a2a6aca3308bbf7fa7eb5 upstream. If the discard worker is running and there's currently only one block group, that block group is a data block group, it's in the unused block groups discard list and is being used (it got an extent allocated from it after becoming unused), the worker can end up in an infinite loop if a transaction abort happens or the async discard is disabled (during remount or unmount for example). This happens like this: 1) Task A, the discard worker, is at peek_discard_list() and find_next_block_group() returns block group X; 2) Block group X is in the unused block groups discard list (its discard index is BTRFS_DISCARD_INDEX_UNUSED) since at some point in the past it become an unused block group and was added to that list, but then later it got an extent allocated from it, so its ->used counter is not zero anymore; 3) The current transaction is aborted by task B and we end up at __btrfs_handle_fs_error() in the transaction abort path, where we call btrfs_discard_stop(), which clears BTRFS_FS_DISCARD_RUNNING from fs_info, and then at __btrfs_handle_fs_error() we set the fs to RO mode (setting SB_RDONLY in the super block's s_flags field); 4) Task A calls __add_to_discard_list() with the goal of moving the block group from the unused block groups discard list into another discard list, but at __add_to_discard_list() we end up doing nothing because btrfs_run_discard_work() returns false, since the super block has SB_RDONLY set in its flags and BTRFS_FS_DISCARD_RUNNING is not set anymore in fs_info->flags. So block group X remains in the unused block groups discard list; 5) Task A then does a goto into the 'again' label, calls find_next_block_group() again we gets block group X again. Then it repeats the previous steps over and over since there are not other block groups in the discard lists and block group X is never moved out of the unused block groups discard list since btrfs_run_discard_work() keeps returning false and therefore __add_to_discard_list() doesn't move block group X out of that discard list. When this happens we can get a soft lockup report like this: [71.957] watchdog: BUG: soft lockup - CPU#0 stuck for 27s! [kworker/u4:3:97] [71.957] Modules linked in: xfs af_packet rfkill (...) [71.957] CPU: 0 UID: 0 PID: 97 Comm: kworker/u4:3 Tainted: G W 6.14.2-1-default #1 openSUSE Tumbleweed 968795ef2b1407352128b466fe887416c33af6fa [71.957] Tainted: [W]=WARN [71.957] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 [71.957] Workqueue: btrfs_discard btrfs_discard_workfn [btrfs] [71.957] RIP: 0010:btrfs_discard_workfn+0xc4/0x400 [btrfs] [71.957] Code: c1 01 48 83 (...) [71.957] RSP: 0018:ffffafaec03efe08 EFLAGS: 00000246 [71.957] RAX: ffff897045500000 RBX: ffff8970413ed8d0 RCX: 0000000000000000 [71.957] RDX: 0000000000000001 RSI: ffff8970413ed8d0 RDI: 0000000a8f1272ad [71.957] RBP: 0000000a9d61c60e R08: ffff897045500140 R09: 8080808080808080 [71.957] R10: ffff897040276800 R11: fefefefefefefeff R12: ffff8970413ed860 [71.957] R13: ffff897045500000 R14: ffff8970413ed868 R15: 0000000000000000 [71.957] FS: 0000000000000000(0000) GS:ffff89707bc00000(0000) knlGS:0000000000000000 [71.957] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [71.957] CR2: 00005605bcc8d2f0 CR3: 000000010376a001 CR4: 0000000000770ef0 [71.957] PKRU: 55555554 [71.957] Call Trace: [71.957] [71.957] process_one_work+0x17e/0x330 [71.957] worker_thread+0x2ce/0x3f0 [71.957] ? __pfx_worker_thread+0x10/0x10 [71.957] kthread+0xef/0x220 [71.957] ? __pfx_kthread+0x10/0x10 [71.957] ret_from_fork+0x34/0x50 [71.957] ? __pfx_kthread+0x10/0x10 [71.957] ret_from_fork_asm+0x1a/0x30 [71.957] [71.957] Kernel panic - not syncing: softlockup: hung tasks [71.987] CPU: 0 UID: 0 PID: 97 Comm: kworker/u4:3 Tainted: G W L 6.14.2-1-default #1 openSUSE Tumbleweed 968795ef2b1407352128b466fe887416c33af6fa [71.989] Tainted: [W]=WARN, [L]=SOFTLOCKUP [71.989] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.16.2-3-gd478f380-rebuilt.opensuse.org 04/01/2014 [71.991] Workqueue: btrfs_discard btrfs_discard_workfn [btrfs] [71.992] Call Trace: [71.993] [71.994] dump_stack_lvl+0x5a/0x80 [71.994] panic+0x10b/0x2da [71.995] watchdog_timer_fn.cold+0x9a/0xa1 [71.996] ? __pfx_watchdog_timer_fn+0x10/0x10 [71.997] __hrtimer_run_queues+0x132/0x2a0 [71.997] hrtimer_interrupt+0xff/0x230 [71.998] __sysvec_apic_timer_interrupt+0x55/0x100 [71.999] sysvec_apic_timer_interrupt+0x6c/0x90 [72.000] [72.000] [72.001] asm_sysvec_apic_timer_interrupt+0x1a/0x20 [72.002] RIP: 0010:btrfs_discard_workfn+0xc4/0x400 [btrfs] [72.002] Code: c1 01 48 83 (...) [72.005] RSP: 0018:ffffafaec03efe08 EFLAGS: 00000246 [72.006] RAX: ffff897045500000 RBX: ffff8970413ed8d0 RCX: 0000000000000000 [72.006] RDX: 0000000000000001 RSI: ffff8970413ed8d0 RDI: 0000000a8f1272ad [72.007] RBP: 0000000a9d61c60e R08: ffff897045500140 R09: 8080808080808080 [72.008] R10: ffff897040276800 R11: fefefefefefefeff R12: ffff8970413ed860 [72.009] R13: ffff897045500000 R14: ffff8970413ed868 R15: 0000000000000000 [72.010] ? btrfs_discard_workfn+0x51/0x400 [btrfs 23b01089228eb964071fb7ca156eee8cd3bf996f] [72.011] process_one_work+0x17e/0x330 [72.012] worker_thread+0x2ce/0x3f0 [72.013] ? __pfx_worker_thread+0x10/0x10 [72.014] kthread+0xef/0x220 [72.014] ? __pfx_kthread+0x10/0x10 [72.015] ret_from_fork+0x34/0x50 [72.015] ? __pfx_kthread+0x10/0x10 [72.016] ret_from_fork_asm+0x1a/0x30 [72.017] [72.017] Kernel Offset: 0x15000000 from 0xffffffff81000000 (relocation range: 0xffffffff80000000-0xffffffffbfffffff) [72.019] Rebooting in 90 seconds.. So fix this by making sure we move a block group out of the unused block groups discard list when calling __add_to_discard_list(). Fixes: 2bee7eb8bb81 ("btrfs: discard one region at a time in async discard") Link: https://bugzilla.suse.com/show_bug.cgi?id=1242012 CC: stable@vger.kernel.org # 5.10+ Reviewed-by: Boris Burkov Reviewed-by: Daniel Vacek Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/discard.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index bd9dde374e5d..7b2f77a8aa98 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -78,8 +78,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { @@ -102,6 +100,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, if (!btrfs_is_block_group_data_only(block_group)) return; + if (!btrfs_run_discard_work(discard_ctl)) + return; + spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); @@ -233,6 +234,18 @@ again: block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); + /* + * The block group must have been moved to other + * discard list even if discard was disabled in + * the meantime or a transaction abort happened, + * otherwise we can end up in an infinite loop, + * always jumping into the 'again' label and + * keep getting this block group over and over + * in case there are no other block groups in + * the discard lists. + */ + ASSERT(block_group->discard_index != + BTRFS_DISCARD_INDEX_UNUSED); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); From d9632f4aae23547ce4b5ebd7498a4e9dfff530c2 Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Fri, 25 Apr 2025 14:44:02 +0800 Subject: [PATCH 140/529] drm/amd/display: Correct the reply value when AUX write incomplete commit d433981385c62c72080e26f1c00a961d18b233be upstream. [Why] Now forcing aux->transfer to return 0 when incomplete AUX write is inappropriate. It should return bytes have been transferred. [How] aux->transfer is asked not to change original msg except reply field of drm_dp_aux_msg structure. Copy the msg->buffer when it's write request, and overwrite the first byte when sink reply 1 byte indicating partially written byte number. Then we can return the correct value without changing the original msg. Fixes: 3637e457eb00 ("drm/amd/display: Fix wrong handling for AUX_DEFER case") Cc: Mario Limonciello Cc: Alex Deucher Reviewed-by: Ray Wu Signed-off-by: Wayne Lin Signed-off-by: Ray Wu Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher (cherry picked from commit 7ac37f0dcd2e0b729fa7b5513908dc8ab802b540) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 3 ++- .../drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 10 ++++++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 43aaf3a42b2e..0d8c020cd121 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -10707,7 +10707,8 @@ int amdgpu_dm_process_dmub_aux_transfer_sync( /* The reply is stored in the top nibble of the command. */ payload->reply[0] = (adev->dm.dmub_notify->aux_reply.command >> 4) & 0xF; - if (!payload->write && p_notify->aux_reply.length) + /*write req may receive a byte indicating partially written number as well*/ + if (p_notify->aux_reply.length) memcpy(payload->data, p_notify->aux_reply.data, p_notify->aux_reply.length); diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index ade5dd5d8231..60171bd4fe08 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -66,6 +66,7 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, enum aux_return_code_type operation_result; struct amdgpu_device *adev; struct ddc_service *ddc; + uint8_t copy[16]; if (WARN_ON(msg->size > 16)) return -E2BIG; @@ -81,6 +82,11 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, (msg->request & DP_AUX_I2C_WRITE_STATUS_UPDATE) != 0; payload.defer_delay = 0; + if (payload.write) { + memcpy(copy, msg->buffer, msg->size); + payload.data = copy; + } + result = dc_link_aux_transfer_raw(TO_DM_AUX(aux)->ddc_service, &payload, &operation_result); @@ -104,9 +110,9 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, */ if (payload.write && result >= 0) { if (result) { - /*one byte indicating partially written bytes. Force 0 to retry*/ + /*one byte indicating partially written bytes*/ drm_info(adev_to_drm(adev), "amdgpu: AUX partially written\n"); - result = 0; + result = payload.data[0]; } else if (!payload.reply[0]) /*I2C_ACK|AUX_ACK*/ result = msg->size; From 0638bad18d49c5da0b80ffb8c0e9172597422302 Mon Sep 17 00:00:00 2001 From: Wayne Lin Date: Tue, 13 May 2025 11:20:24 +0800 Subject: [PATCH 141/529] drm/amd/display: Avoid flooding unnecessary info messages commit d33724ffb743d3d2698bd969e29253ae0cff9739 upstream. It's expected that we'll encounter temporary exceptions during aux transactions. Adjust logging from drm_info to drm_dbg_dp to prevent flooding with unnecessary log messages. Fixes: 3637e457eb00 ("drm/amd/display: Fix wrong handling for AUX_DEFER case") Cc: Mario Limonciello Cc: Alex Deucher Signed-off-by: Wayne Lin Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20250513032026.838036-1-Wayne.Lin@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 9a9c3e1fe5256da14a0a307dff0478f90c55fc8c) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c index 60171bd4fe08..495491decec1 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_mst_types.c @@ -111,7 +111,7 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, if (payload.write && result >= 0) { if (result) { /*one byte indicating partially written bytes*/ - drm_info(adev_to_drm(adev), "amdgpu: AUX partially written\n"); + drm_dbg_dp(adev_to_drm(adev), "amdgpu: AUX partially written\n"); result = payload.data[0]; } else if (!payload.reply[0]) /*I2C_ACK|AUX_ACK*/ @@ -137,11 +137,11 @@ static ssize_t dm_dp_aux_transfer(struct drm_dp_aux *aux, break; } - drm_info(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result); + drm_dbg_dp(adev_to_drm(adev), "amdgpu: DP AUX transfer fail:%d\n", operation_result); } if (payload.reply[0]) - drm_info(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.", + drm_dbg_dp(adev_to_drm(adev), "amdgpu: AUX reply command not ACK: 0x%02x.", payload.reply[0]); return result; From b71a04bae287b6009fb8c7573a5a85c64ec07e07 Mon Sep 17 00:00:00 2001 From: Jeremy Linton Date: Wed, 7 May 2025 21:30:25 -0500 Subject: [PATCH 142/529] ACPI: PPTT: Fix processor subtable walk commit adfab6b39202481bb43286fff94def4953793fdb upstream. The original PPTT code had a bug where the processor subtable length was not correctly validated when encountering a truncated acpi_pptt_processor node. Commit 7ab4f0e37a0f4 ("ACPI PPTT: Fix coding mistakes in a couple of sizeof() calls") attempted to fix this by validating the size is as large as the acpi_pptt_processor node structure. This introduced a regression where the last processor node in the PPTT table is ignored if it doesn't contain any private resources. That results errors like: ACPI PPTT: PPTT table found, but unable to locate core XX (XX) ACPI: SPE must be homogeneous Furthermore, it fails in a common case where the node length isn't equal to the acpi_pptt_processor structure size, leaving the original bug in a modified form. Correct the regression by adjusting the loop termination conditions as suggested by the bug reporters. An additional check performed after the subtable node type is detected, validates the acpi_pptt_processor node is fully contained in the PPTT table. Repeating the check in acpi_pptt_leaf_node() is largely redundant as the node is already known to be fully contained in the table. The case where a final truncated node's parent property is accepted, but the node itself is rejected should not be considered a bug. Fixes: 7ab4f0e37a0f4 ("ACPI PPTT: Fix coding mistakes in a couple of sizeof() calls") Reported-by: Maximilian Heyne Closes: https://lore.kernel.org/linux-acpi/20250506-draco-taped-15f475cd@mheyne-amazon/ Reported-by: Yicong Yang Closes: https://lore.kernel.org/linux-acpi/20250507035124.28071-1-yangyicong@huawei.com/ Signed-off-by: Jeremy Linton Tested-by: Yicong Yang Reviewed-by: Sudeep Holla Tested-by: Maximilian Heyne Cc: All applicable # 7ab4f0e37a0f4: ACPI PPTT: Fix coding mistakes ... Link: https://patch.msgid.link/20250508023025.1301030-1-jeremy.linton@arm.com Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/pptt.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/acpi/pptt.c b/drivers/acpi/pptt.c index 79a83d8236cb..1938e4778725 100644 --- a/drivers/acpi/pptt.c +++ b/drivers/acpi/pptt.c @@ -219,16 +219,18 @@ static int acpi_pptt_leaf_node(struct acpi_table_header *table_hdr, sizeof(struct acpi_table_pptt)); proc_sz = sizeof(struct acpi_pptt_processor); - while ((unsigned long)entry + proc_sz < table_end) { + /* ignore subtable types that are smaller than a processor node */ + while ((unsigned long)entry + proc_sz <= table_end) { cpu_node = (struct acpi_pptt_processor *)entry; + if (entry->type == ACPI_PPTT_TYPE_PROCESSOR && cpu_node->parent == node_entry) return 0; if (entry->length == 0) return 0; + entry = ACPI_ADD_PTR(struct acpi_subtable_header, entry, entry->length); - } return 1; } @@ -261,15 +263,18 @@ static struct acpi_pptt_processor *acpi_find_processor_node(struct acpi_table_he proc_sz = sizeof(struct acpi_pptt_processor); /* find the processor structure associated with this cpuid */ - while ((unsigned long)entry + proc_sz < table_end) { + while ((unsigned long)entry + proc_sz <= table_end) { cpu_node = (struct acpi_pptt_processor *)entry; if (entry->length == 0) { pr_warn("Invalid zero length subtable\n"); break; } + /* entry->length may not equal proc_sz, revalidate the processor structure length */ if (entry->type == ACPI_PPTT_TYPE_PROCESSOR && acpi_cpu_id == cpu_node->acpi_processor_id && + (unsigned long)entry + entry->length <= table_end && + entry->length == proc_sz + cpu_node->number_of_priv_resources * sizeof(u32) && acpi_pptt_leaf_node(table_hdr, cpu_node)) { return (struct acpi_pptt_processor *)entry; } From f5abc1344f9e511495377a93453a8828f0797e98 Mon Sep 17 00:00:00 2001 From: Wentao Liang Date: Wed, 14 May 2025 17:24:44 +0800 Subject: [PATCH 143/529] ALSA: es1968: Add error handling for snd_pcm_hw_constraint_pow2() commit 9e000f1b7f31684cc5927e034360b87ac7919593 upstream. The function snd_es1968_capture_open() calls the function snd_pcm_hw_constraint_pow2(), but does not check its return value. A proper implementation can be found in snd_cx25821_pcm_open(). Add error handling for snd_pcm_hw_constraint_pow2() and propagate its error code. Fixes: b942cf815b57 ("[ALSA] es1968 - Fix stuttering capture") Cc: stable@vger.kernel.org # v2.6.22 Signed-off-by: Wentao Liang Link: https://patch.msgid.link/20250514092444.331-1-vulab@iscas.ac.cn Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/es1968.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/pci/es1968.c b/sound/pci/es1968.c index 4a7e20bb11bc..802c5ed4a5ba 100644 --- a/sound/pci/es1968.c +++ b/sound/pci/es1968.c @@ -1569,7 +1569,7 @@ static int snd_es1968_capture_open(struct snd_pcm_substream *substream) struct snd_pcm_runtime *runtime = substream->runtime; struct es1968 *chip = snd_pcm_substream_chip(substream); struct esschan *es; - int apu1, apu2; + int err, apu1, apu2; apu1 = snd_es1968_alloc_apu_pair(chip, ESM_APU_PCM_CAPTURE); if (apu1 < 0) @@ -1613,7 +1613,9 @@ static int snd_es1968_capture_open(struct snd_pcm_substream *substream) runtime->hw = snd_es1968_capture; runtime->hw.buffer_bytes_max = runtime->hw.period_bytes_max = calc_available_memory_size(chip) - 1024; /* keep MIXBUF size */ - snd_pcm_hw_constraint_pow2(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES); + err = snd_pcm_hw_constraint_pow2(runtime, 0, SNDRV_PCM_HW_PARAM_BUFFER_BYTES); + if (err < 0) + return err; spin_lock_irq(&chip->substream_lock); list_add(&es->list, &chip->substream_list); From 7ba07e109ff6a0834438e5ed1d1a9781faaa10d2 Mon Sep 17 00:00:00 2001 From: Christian Heusel Date: Mon, 12 May 2025 22:23:37 +0200 Subject: [PATCH 144/529] ALSA: usb-audio: Add sample rate quirk for Audioengine D1 commit 2b24eb060c2bb9ef79e1d3bcf633ba1bc95215d6 upstream. A user reported on the Arch Linux Forums that their device is emitting the following message in the kernel journal, which is fixed by adding the quirk as submitted in this patch: > kernel: usb 1-2: current rate 8436480 is different from the runtime rate 48000 There also is an entry for this product line added long time ago. Their specific device has the following ID: $ lsusb | grep Audio Bus 001 Device 002: ID 1101:0003 EasyPass Industrial Co., Ltd Audioengine D1 Link: https://bbs.archlinux.org/viewtopic.php?id=305494 Fixes: 93f9d1a4ac593 ("ALSA: usb-audio: Apply sample rate quirk for Audioengine D1") Cc: stable@vger.kernel.org Signed-off-by: Christian Heusel Link: https://patch.msgid.link/20250512-audioengine-quirk-addition-v1-1-4c370af6eff7@heusel.eu Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 42302c7812c7..4dc632cf57bc 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2146,6 +2146,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_FIXED_RATE), DEVICE_FLG(0x0fd9, 0x0008, /* Hauppauge HVR-950Q */ QUIRK_FLAG_SHARE_MEDIA_DEVICE | QUIRK_FLAG_ALIGN_TRANSFER), + DEVICE_FLG(0x1101, 0x0003, /* Audioengine D1 */ + QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x1224, 0x2a25, /* Jieli Technology USB PHY 2.0 */ QUIRK_FLAG_GET_SAMPLE_RATE | QUIRK_FLAG_MIC_RES_16), DEVICE_FLG(0x1395, 0x740a, /* Sennheiser DECT */ From 01dfc57326f83c5dc225497fb326c33f7abead3f Mon Sep 17 00:00:00 2001 From: Nicolas Chauvet Date: Thu, 15 May 2025 12:21:32 +0200 Subject: [PATCH 145/529] ALSA: usb-audio: Add sample rate quirk for Microdia JP001 USB Camera commit 7b9938a14460e8ec7649ca2e80ac0aae9815bf02 upstream. Microdia JP001 does not support reading the sample rate which leads to many lines of "cannot get freq at ep 0x84". This patch adds the USB ID to quirks.c and avoids those error messages. usb 7-4: New USB device found, idVendor=0c45, idProduct=636b, bcdDevice= 1.00 usb 7-4: New USB device strings: Mfr=2, Product=1, SerialNumber=3 usb 7-4: Product: JP001 usb 7-4: Manufacturer: JP001 usb 7-4: SerialNumber: JP001 usb 7-4: 3:1: cannot get freq at ep 0x84 Cc: Signed-off-by: Nicolas Chauvet Link: https://patch.msgid.link/20250515102132.73062-1-kwizart@gmail.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 4dc632cf57bc..b2a612c5b299 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -2138,6 +2138,8 @@ static const struct usb_audio_quirk_flags_table quirk_flags_table[] = { QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0c45, 0x6340, /* Sonix HD USB Camera */ QUIRK_FLAG_GET_SAMPLE_RATE), + DEVICE_FLG(0x0c45, 0x636b, /* Microdia JP001 USB Camera */ + QUIRK_FLAG_GET_SAMPLE_RATE), DEVICE_FLG(0x0d8c, 0x0014, /* USB Audio Device */ QUIRK_FLAG_CTL_MSG_DELAY_1M), DEVICE_FLG(0x0ecb, 0x205c, /* JBL Quantum610 Wireless */ From 3becc659f9cb76b481ad1fb71f54d5c8d6332d3f Mon Sep 17 00:00:00 2001 From: Hyejeong Choi Date: Mon, 12 May 2025 21:06:38 -0500 Subject: [PATCH 146/529] dma-buf: insert memory barrier before updating num_fences MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 72c7d62583ebce7baeb61acce6057c361f73be4a upstream. smp_store_mb() inserts memory barrier after storing operation. It is different with what the comment is originally aiming so Null pointer dereference can be happened if memory update is reordered. Signed-off-by: Hyejeong Choi Fixes: a590d0fdbaa5 ("dma-buf: Update reservation shared_count after adding the new fence") CC: stable@vger.kernel.org Reviewed-by: Christian König Link: https://lore.kernel.org/r/20250513020638.GA2329653@au1-maretx-p37.eng.sarc.samsung.com Signed-off-by: Christian König Signed-off-by: Greg Kroah-Hartman --- drivers/dma-buf/dma-resv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/dma-buf/dma-resv.c b/drivers/dma-buf/dma-resv.c index e78ff9333c7a..759b4f80baee 100644 --- a/drivers/dma-buf/dma-resv.c +++ b/drivers/dma-buf/dma-resv.c @@ -308,8 +308,9 @@ void dma_resv_add_fence(struct dma_resv *obj, struct dma_fence *fence, count++; dma_resv_list_set(fobj, i, fence, usage); - /* pointer update must be visible before we extend the num_fences */ - smp_store_mb(fobj->num_fences, count); + /* fence update must be visible before we extend the num_fences */ + smp_wmb(); + fobj->num_fences = count; } EXPORT_SYMBOL(dma_resv_add_fence); From 3e5210def3d06441c0f2725eb05a823614260327 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:01 -0700 Subject: [PATCH 147/529] hv_netvsc: Use vmbus_sendpacket_mpb_desc() to send VMBus messages commit 4f98616b855cb0e3b5917918bb07b44728eb96ea upstream. netvsc currently uses vmbus_sendpacket_pagebuffer() to send VMBus messages. This function creates a series of GPA ranges, each of which contains a single PFN. However, if the rndis header in the VMBus message crosses a page boundary, the netvsc protocol with the host requires that both PFNs for the rndis header must be in a single "GPA range" data structure, which isn't possible with vmbus_sendpacket_pagebuffer(). As the first step in fixing this, add a new function netvsc_build_mpb_array() to build a VMBus message with multiple GPA ranges, each of which may contain multiple PFNs. Use vmbus_sendpacket_mpb_desc() to send this VMBus message to the host. There's no functional change since higher levels of netvsc don't maintain or propagate knowledge of contiguous PFNs. Based on its input, netvsc_build_mpb_array() still produces a separate GPA range for each PFN and the behavior is the same as with vmbus_sendpacket_pagebuffer(). But the groundwork is laid for a subsequent patch to provide the necessary grouping. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-3-mhklinux@outlook.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/hyperv/netvsc.c | 50 +++++++++++++++++++++++++++++++++---- 1 file changed, 45 insertions(+), 5 deletions(-) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 3a834d4e1c84..7df291ffbff1 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1082,6 +1082,42 @@ static int netvsc_dma_map(struct hv_device *hv_dev, return 0; } +/* Build an "array" of mpb entries describing the data to be transferred + * over VMBus. After the desc header fields, each "array" entry is variable + * size, and each entry starts after the end of the previous entry. The + * "offset" and "len" fields for each entry imply the size of the entry. + * + * The pfns are in HV_HYP_PAGE_SIZE, because all communication with Hyper-V + * uses that granularity, even if the system page size of the guest is larger. + * Each entry in the input "pb" array must describe a contiguous range of + * guest physical memory so that the pfns are sequential if the range crosses + * a page boundary. The offset field must be < HV_HYP_PAGE_SIZE. + */ +static inline void netvsc_build_mpb_array(struct hv_page_buffer *pb, + u32 page_buffer_count, + struct vmbus_packet_mpb_array *desc, + u32 *desc_size) +{ + struct hv_mpb_array *mpb_entry = &desc->range; + int i, j; + + for (i = 0; i < page_buffer_count; i++) { + u32 offset = pb[i].offset; + u32 len = pb[i].len; + + mpb_entry->offset = offset; + mpb_entry->len = len; + + for (j = 0; j < HVPFN_UP(offset + len); j++) + mpb_entry->pfn_array[j] = pb[i].pfn + j; + + mpb_entry = (struct hv_mpb_array *)&mpb_entry->pfn_array[j]; + } + + desc->rangecount = page_buffer_count; + *desc_size = (char *)mpb_entry - (char *)desc; +} + static inline int netvsc_send_pkt( struct hv_device *device, struct hv_netvsc_packet *packet, @@ -1124,6 +1160,9 @@ static inline int netvsc_send_pkt( packet->dma_range = NULL; if (packet->page_buf_cnt) { + struct vmbus_channel_packet_page_buffer desc; + u32 desc_size; + if (packet->cp_partial) pb += packet->rmsg_pgcnt; @@ -1133,11 +1172,12 @@ static inline int netvsc_send_pkt( goto exit; } - ret = vmbus_sendpacket_pagebuffer(out_channel, - pb, packet->page_buf_cnt, - &nvmsg, sizeof(nvmsg), - req_id); - + netvsc_build_mpb_array(pb, packet->page_buf_cnt, + (struct vmbus_packet_mpb_array *)&desc, + &desc_size); + ret = vmbus_sendpacket_mpb_desc(out_channel, + (struct vmbus_packet_mpb_array *)&desc, + desc_size, &nvmsg, sizeof(nvmsg), req_id); if (ret) netvsc_dma_unmap(ndev_ctx->device_ctx, packet); } else { From 86b05e14c4cfdd8ce73fc7cdbb3d2527012db20b Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:02 -0700 Subject: [PATCH 148/529] hv_netvsc: Preserve contiguous PFN grouping in the page buffer array commit 41a6328b2c55276f89ea3812069fd7521e348bbf upstream. Starting with commit dca5161f9bd0 ("hv_netvsc: Check status in SEND_RNDIS_PKT completion message") in the 6.3 kernel, the Linux driver for Hyper-V synthetic networking (netvsc) occasionally reports "nvsp_rndis_pkt_complete error status: 2".[1] This error indicates that Hyper-V has rejected a network packet transmit request from the guest, and the outgoing network packet is dropped. Higher level network protocols presumably recover and resend the packet so there is no functional error, but performance is slightly impacted. Commit dca5161f9bd0 is not the cause of the error -- it only added reporting of an error that was already happening without any notice. The error has presumably been present since the netvsc driver was originally introduced into Linux. The root cause of the problem is that the netvsc driver in Linux may send an incorrectly formatted VMBus message to Hyper-V when transmitting the network packet. The incorrect formatting occurs when the rndis header of the VMBus message crosses a page boundary due to how the Linux skb head memory is aligned. In such a case, two PFNs are required to describe the location of the rndis header, even though they are contiguous in guest physical address (GPA) space. Hyper-V requires that two rndis header PFNs be in a single "GPA range" data struture, but current netvsc code puts each PFN in its own GPA range, which Hyper-V rejects as an error. The incorrect formatting occurs only for larger packets that netvsc must transmit via a VMBus "GPA Direct" message. There's no problem when netvsc transmits a smaller packet by copying it into a pre- allocated send buffer slot because the pre-allocated slots don't have page crossing issues. After commit 14ad6ed30a10 ("net: allow small head cache usage with large MAX_SKB_FRAGS values") in the 6.14-rc4 kernel, the error occurs much more frequently in VMs with 16 or more vCPUs. It may occur every few seconds, or even more frequently, in an ssh session that outputs a lot of text. Commit 14ad6ed30a10 subtly changes how skb head memory is allocated, making it much more likely that the rndis header will cross a page boundary when the vCPU count is 16 or more. The changes in commit 14ad6ed30a10 are perfectly valid -- they just had the side effect of making the netvsc bug more prominent. Current code in init_page_array() creates a separate page buffer array entry for each PFN required to identify the data to be transmitted. Contiguous PFNs get separate entries in the page buffer array, and any information about contiguity is lost. Fix the core issue by having init_page_array() construct the page buffer array to represent contiguous ranges rather than individual pages. When these ranges are subsequently passed to netvsc_build_mpb_array(), it can build GPA ranges that contain multiple PFNs, as required to avoid the error "nvsp_rndis_pkt_complete error status: 2". If instead the network packet is sent by copying into a pre-allocated send buffer slot, the copy proceeds using the contiguous ranges rather than individual pages, but the result of the copying is the same. Also fix rndis_filter_send_request() to construct a contiguous range, since it has its own page buffer array. This change has a side benefit in CoCo VMs in that netvsc_dma_map() calls dma_map_single() on each contiguous range instead of on each page. This results in fewer calls to dma_map_single() but on larger chunks of memory, which should reduce contention on the swiotlb. Since the page buffer array now contains one entry for each contiguous range instead of for each individual page, the number of entries in the array can be reduced, saving 208 bytes of stack space in netvsc_xmit() when MAX_SKG_FRAGS has the default value of 17. [1] https://bugzilla.kernel.org/show_bug.cgi?id=217503 Closes: https://bugzilla.kernel.org/show_bug.cgi?id=217503 Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-4-mhklinux@outlook.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/hyperv/hyperv_net.h | 12 ++++++ drivers/net/hyperv/netvsc_drv.c | 63 ++++++++----------------------- drivers/net/hyperv/rndis_filter.c | 24 +++--------- 3 files changed, 32 insertions(+), 67 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index dd5919ec408b..08f26679ba58 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -891,6 +891,18 @@ struct nvsp_message { sizeof(struct nvsp_message)) #define NETVSC_MIN_IN_MSG_SIZE sizeof(struct vmpacket_descriptor) +/* Maximum # of contiguous data ranges that can make up a trasmitted packet. + * Typically it's the max SKB fragments plus 2 for the rndis packet and the + * linear portion of the SKB. But if MAX_SKB_FRAGS is large, the value may + * need to be limited to MAX_PAGE_BUFFER_COUNT, which is the max # of entries + * in a GPA direct packet sent to netvsp over VMBus. + */ +#if MAX_SKB_FRAGS + 2 < MAX_PAGE_BUFFER_COUNT +#define MAX_DATA_RANGES (MAX_SKB_FRAGS + 2) +#else +#define MAX_DATA_RANGES MAX_PAGE_BUFFER_COUNT +#endif + /* Estimated requestor size: * out_ring_size/min_out_msg_size + in_ring_size/min_in_msg_size */ diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 2fba5dfd9cd0..07e91fb45d9e 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -325,43 +325,10 @@ static u16 netvsc_select_queue(struct net_device *ndev, struct sk_buff *skb, return txq; } -static u32 fill_pg_buf(unsigned long hvpfn, u32 offset, u32 len, - struct hv_page_buffer *pb) -{ - int j = 0; - - hvpfn += offset >> HV_HYP_PAGE_SHIFT; - offset = offset & ~HV_HYP_PAGE_MASK; - - while (len > 0) { - unsigned long bytes; - - bytes = HV_HYP_PAGE_SIZE - offset; - if (bytes > len) - bytes = len; - pb[j].pfn = hvpfn; - pb[j].offset = offset; - pb[j].len = bytes; - - offset += bytes; - len -= bytes; - - if (offset == HV_HYP_PAGE_SIZE && len) { - hvpfn++; - offset = 0; - j++; - } - } - - return j + 1; -} - static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, struct hv_netvsc_packet *packet, struct hv_page_buffer *pb) { - u32 slots_used = 0; - char *data = skb->data; int frags = skb_shinfo(skb)->nr_frags; int i; @@ -370,28 +337,28 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, * 2. skb linear data * 3. skb fragment data */ - slots_used += fill_pg_buf(virt_to_hvpfn(hdr), - offset_in_hvpage(hdr), - len, - &pb[slots_used]); + pb[0].offset = offset_in_hvpage(hdr); + pb[0].len = len; + pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = slots_used; + packet->rmsg_pgcnt = 1; - slots_used += fill_pg_buf(virt_to_hvpfn(data), - offset_in_hvpage(data), - skb_headlen(skb), - &pb[slots_used]); + pb[1].offset = offset_in_hvpage(skb->data); + pb[1].len = skb_headlen(skb); + pb[1].pfn = virt_to_hvpfn(skb->data); for (i = 0; i < frags; i++) { skb_frag_t *frag = skb_shinfo(skb)->frags + i; + struct hv_page_buffer *cur_pb = &pb[i + 2]; + u64 pfn = page_to_hvpfn(skb_frag_page(frag)); + u32 offset = skb_frag_off(frag); - slots_used += fill_pg_buf(page_to_hvpfn(skb_frag_page(frag)), - skb_frag_off(frag), - skb_frag_size(frag), - &pb[slots_used]); + cur_pb->offset = offset_in_hvpage(offset); + cur_pb->len = skb_frag_size(frag); + cur_pb->pfn = pfn + (offset >> HV_HYP_PAGE_SHIFT); } - return slots_used; + return frags + 2; } static int count_skb_frag_slots(struct sk_buff *skb) @@ -482,7 +449,7 @@ static int netvsc_xmit(struct sk_buff *skb, struct net_device *net, bool xdp_tx) struct net_device *vf_netdev; u32 rndis_msg_size; u32 hash; - struct hv_page_buffer pb[MAX_PAGE_BUFFER_COUNT]; + struct hv_page_buffer pb[MAX_DATA_RANGES]; /* If VF is present and up then redirect packets to it. * Skip the VF if it is marked down or has no carrier. diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index eea777ec2541..bb656ea09773 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -225,8 +225,7 @@ static int rndis_filter_send_request(struct rndis_device *dev, struct rndis_request *req) { struct hv_netvsc_packet *packet; - struct hv_page_buffer page_buf[2]; - struct hv_page_buffer *pb = page_buf; + struct hv_page_buffer pb; int ret; /* Setup the packet to send it */ @@ -235,27 +234,14 @@ static int rndis_filter_send_request(struct rndis_device *dev, packet->total_data_buflen = req->request_msg.msg_len; packet->page_buf_cnt = 1; - pb[0].pfn = virt_to_phys(&req->request_msg) >> - HV_HYP_PAGE_SHIFT; - pb[0].len = req->request_msg.msg_len; - pb[0].offset = offset_in_hvpage(&req->request_msg); - - /* Add one page_buf when request_msg crossing page boundary */ - if (pb[0].offset + pb[0].len > HV_HYP_PAGE_SIZE) { - packet->page_buf_cnt++; - pb[0].len = HV_HYP_PAGE_SIZE - - pb[0].offset; - pb[1].pfn = virt_to_phys((void *)&req->request_msg - + pb[0].len) >> HV_HYP_PAGE_SHIFT; - pb[1].offset = 0; - pb[1].len = req->request_msg.msg_len - - pb[0].len; - } + pb.pfn = virt_to_phys(&req->request_msg) >> HV_HYP_PAGE_SHIFT; + pb.len = req->request_msg.msg_len; + pb.offset = offset_in_hvpage(&req->request_msg); trace_rndis_send(dev->ndev, 0, &req->request_msg); rcu_read_lock_bh(); - ret = netvsc_send(dev->ndev, packet, NULL, pb, NULL, false); + ret = netvsc_send(dev->ndev, packet, NULL, &pb, NULL, false); rcu_read_unlock_bh(); return ret; From b47a984faf5c0f1b7092e4b4faf2ccd2b7142444 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:03 -0700 Subject: [PATCH 149/529] hv_netvsc: Remove rmsg_pgcnt commit 5bbc644bbf4e97a05bc0cb052189004588ff8a09 upstream. init_page_array() now always creates a single page buffer array entry for the rndis message, even if the rndis message crosses a page boundary. As such, the number of page buffer array entries used for the rndis message must no longer be tracked -- it is always just 1. Remove the rmsg_pgcnt field and use "1" where the value is needed. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-5-mhklinux@outlook.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/hyperv/hyperv_net.h | 1 - drivers/net/hyperv/netvsc.c | 7 +++---- drivers/net/hyperv/netvsc_drv.c | 1 - 3 files changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h index 08f26679ba58..e7fd72f57e36 100644 --- a/drivers/net/hyperv/hyperv_net.h +++ b/drivers/net/hyperv/hyperv_net.h @@ -156,7 +156,6 @@ struct hv_netvsc_packet { u8 cp_partial; /* partial copy into send buffer */ u8 rmsg_size; /* RNDIS header and PPI size */ - u8 rmsg_pgcnt; /* page count of RNDIS header and PPI */ u8 page_buf_cnt; u16 q_idx; diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 7df291ffbff1..ac2d706ef2d0 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -980,8 +980,7 @@ static void netvsc_copy_to_send_buf(struct netvsc_device *net_device, + pend_size; int i; u32 padding = 0; - u32 page_count = packet->cp_partial ? packet->rmsg_pgcnt : - packet->page_buf_cnt; + u32 page_count = packet->cp_partial ? 1 : packet->page_buf_cnt; u32 remain; /* Add padding */ @@ -1164,7 +1163,7 @@ static inline int netvsc_send_pkt( u32 desc_size; if (packet->cp_partial) - pb += packet->rmsg_pgcnt; + pb++; ret = netvsc_dma_map(ndev_ctx->device_ctx, packet, pb); if (ret) { @@ -1326,7 +1325,7 @@ int netvsc_send(struct net_device *ndev, packet->send_buf_index = section_index; if (packet->cp_partial) { - packet->page_buf_cnt -= packet->rmsg_pgcnt; + packet->page_buf_cnt--; packet->total_data_buflen = msd_len + packet->rmsg_size; } else { packet->page_buf_cnt = 0; diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c index 07e91fb45d9e..96a1767281f7 100644 --- a/drivers/net/hyperv/netvsc_drv.c +++ b/drivers/net/hyperv/netvsc_drv.c @@ -342,7 +342,6 @@ static u32 init_page_array(void *hdr, u32 len, struct sk_buff *skb, pb[0].len = len; pb[0].pfn = virt_to_hvpfn(hdr); packet->rmsg_size = len; - packet->rmsg_pgcnt = 1; pb[1].offset = offset_in_hvpage(skb->data); pb[1].len = skb_headlen(skb); From 6ef4c72213abdc4ba56bb3b998962acadd592c24 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:00 -0700 Subject: [PATCH 150/529] Drivers: hv: Allow vmbus_sendpacket_mpb_desc() to create multiple ranges commit 380b75d3078626aadd0817de61f3143f5db6e393 upstream. vmbus_sendpacket_mpb_desc() is currently used only by the storvsc driver and is hardcoded to create a single GPA range. To allow it to also be used by the netvsc driver to create multiple GPA ranges, no longer hardcode as having a single GPA range. Allow the calling driver to specify the rangecount in the supplied descriptor. Update the storvsc driver to reflect this new approach. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-2-mhklinux@outlook.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 6 +++--- drivers/scsi/storvsc_drv.c | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 47e1bd8de9fc..50f44ceb6621 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1172,9 +1172,10 @@ int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); /* - * vmbus_sendpacket_multipagebuffer - Send a multi-page buffer packet + * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. - * The buffer includes the vmbus descriptor. + * The desc argument must include space for the VMBus descriptor. The + * rangecount field must already be set. */ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *desc, @@ -1196,7 +1197,6 @@ int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, desc->length8 = (u16)(packetlen_aligned >> 3); desc->transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ desc->reserved = 0; - desc->rangecount = 1; bufferlist[0].iov_base = desc; bufferlist[0].iov_len = desc_size; diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 6ab3fdb06965..591186daf46f 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1810,6 +1810,7 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) return SCSI_MLQUEUE_DEVICE_BUSY; } + payload->rangecount = 1; payload->range.len = length; payload->range.offset = offset_in_hvpg; From 763db1b8b003695d91f732c353e44b2bb6429199 Mon Sep 17 00:00:00 2001 From: Michael Kelley Date: Mon, 12 May 2025 17:06:04 -0700 Subject: [PATCH 151/529] Drivers: hv: vmbus: Remove vmbus_sendpacket_pagebuffer() commit 45a442fe369e6c4e0b4aa9f63b31c3f2f9e2090e upstream. With the netvsc driver changed to use vmbus_sendpacket_mpb_desc() instead of vmbus_sendpacket_pagebuffer(), the latter has no remaining callers. Remove it. Cc: # 6.1.x Signed-off-by: Michael Kelley Link: https://patch.msgid.link/20250513000604.1396-6-mhklinux@outlook.com Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/hv/channel.c | 59 ------------------------------------------ include/linux/hyperv.h | 7 ----- 2 files changed, 66 deletions(-) diff --git a/drivers/hv/channel.c b/drivers/hv/channel.c index 50f44ceb6621..53026356475a 100644 --- a/drivers/hv/channel.c +++ b/drivers/hv/channel.c @@ -1112,65 +1112,6 @@ int vmbus_sendpacket(struct vmbus_channel *channel, void *buffer, } EXPORT_SYMBOL(vmbus_sendpacket); -/* - * vmbus_sendpacket_pagebuffer - Send a range of single-page buffer - * packets using a GPADL Direct packet type. This interface allows you - * to control notifying the host. This will be useful for sending - * batched data. Also the sender can control the send flags - * explicitly. - */ -int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, void *buffer, u32 bufferlen, - u64 requestid) -{ - int i; - struct vmbus_channel_packet_page_buffer desc; - u32 descsize; - u32 packetlen; - u32 packetlen_aligned; - struct kvec bufferlist[3]; - u64 aligned_data = 0; - - if (pagecount > MAX_PAGE_BUFFER_COUNT) - return -EINVAL; - - /* - * Adjust the size down since vmbus_channel_packet_page_buffer is the - * largest size we support - */ - descsize = sizeof(struct vmbus_channel_packet_page_buffer) - - ((MAX_PAGE_BUFFER_COUNT - pagecount) * - sizeof(struct hv_page_buffer)); - packetlen = descsize + bufferlen; - packetlen_aligned = ALIGN(packetlen, sizeof(u64)); - - /* Setup the descriptor */ - desc.type = VM_PKT_DATA_USING_GPA_DIRECT; - desc.flags = VMBUS_DATA_PACKET_FLAG_COMPLETION_REQUESTED; - desc.dataoffset8 = descsize >> 3; /* in 8-bytes granularity */ - desc.length8 = (u16)(packetlen_aligned >> 3); - desc.transactionid = VMBUS_RQST_ERROR; /* will be updated in hv_ringbuffer_write() */ - desc.reserved = 0; - desc.rangecount = pagecount; - - for (i = 0; i < pagecount; i++) { - desc.range[i].len = pagebuffers[i].len; - desc.range[i].offset = pagebuffers[i].offset; - desc.range[i].pfn = pagebuffers[i].pfn; - } - - bufferlist[0].iov_base = &desc; - bufferlist[0].iov_len = descsize; - bufferlist[1].iov_base = buffer; - bufferlist[1].iov_len = bufferlen; - bufferlist[2].iov_base = &aligned_data; - bufferlist[2].iov_len = (packetlen_aligned - packetlen); - - return hv_ringbuffer_write(channel, bufferlist, 3, requestid, NULL); -} -EXPORT_SYMBOL_GPL(vmbus_sendpacket_pagebuffer); - /* * vmbus_sendpacket_mpb_desc - Send one or more multi-page buffer packets * using a GPADL Direct packet type. diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h index afc204370901..80d876593caa 100644 --- a/include/linux/hyperv.h +++ b/include/linux/hyperv.h @@ -1224,13 +1224,6 @@ extern int vmbus_sendpacket(struct vmbus_channel *channel, enum vmbus_packet_type type, u32 flags); -extern int vmbus_sendpacket_pagebuffer(struct vmbus_channel *channel, - struct hv_page_buffer pagebuffers[], - u32 pagecount, - void *buffer, - u32 bufferlen, - u64 requestid); - extern int vmbus_sendpacket_mpb_desc(struct vmbus_channel *channel, struct vmbus_packet_mpb_array *mpb, u32 desc_size, From c4edc834d2c0bba168cc5dbd0e0943f47856f7c6 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Mon, 12 May 2025 17:42:45 +0800 Subject: [PATCH 152/529] ftrace: Fix preemption accounting for stacktrace trigger command commit e333332657f615ac2b55aa35565c4a882018bbe9 upstream. When using the stacktrace trigger command to trace syscalls, the preemption count was consistently reported as 1 when the system call event itself had 0 ("."). For example: root@ubuntu22-vm:/sys/kernel/tracing/events/syscalls/sys_enter_read $ echo stacktrace > trigger $ echo 1 > enable sshd-416 [002] ..... 232.864910: sys_read(fd: a, buf: 556b1f3221d0, count: 8000) sshd-416 [002] ...1. 232.864913: => ftrace_syscall_enter => syscall_trace_enter => do_syscall_64 => entry_SYSCALL_64_after_hwframe The root cause is that the trace framework disables preemption in __DO_TRACE before invoking the trigger callback. Use the tracing_gen_ctx_dec() that will accommodate for the increase of the preemption count in __DO_TRACE when calling the callback. The result is the accurate reporting of: sshd-410 [004] ..... 210.117660: sys_read(fd: 4, buf: 559b725ba130, count: 40000) sshd-410 [004] ..... 210.117662: => ftrace_syscall_enter => syscall_trace_enter => do_syscall_64 => entry_SYSCALL_64_after_hwframe Cc: stable@vger.kernel.org Fixes: ce33c845b030c ("tracing: Dump stacktrace trigger to the corresponding instance") Link: https://lore.kernel.org/20250512094246.1167956-1-dolinux.peng@gmail.com Signed-off-by: pengdonglin Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_events_trigger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_trigger.c b/kernel/trace/trace_events_trigger.c index f941ce01ee35..afdbad16d00a 100644 --- a/kernel/trace/trace_events_trigger.c +++ b/kernel/trace/trace_events_trigger.c @@ -1539,7 +1539,7 @@ stacktrace_trigger(struct event_trigger_data *data, struct trace_event_file *file = data->private_data; if (file) - __trace_stack(file->tr, tracing_gen_ctx(), STACK_SKIP); + __trace_stack(file->tr, tracing_gen_ctx_dec(), STACK_SKIP); else trace_dump_stack(STACK_SKIP); } From cbe20c2c83313ecc48b3011866b2bfe002a74f79 Mon Sep 17 00:00:00 2001 From: pengdonglin Date: Mon, 12 May 2025 17:42:46 +0800 Subject: [PATCH 153/529] ftrace: Fix preemption accounting for stacktrace filter command commit 11aff32439df6ca5b3b891b43032faf88f4a6a29 upstream. The preemption count of the stacktrace filter command to trace ksys_read is consistently incorrect: $ echo ksys_read:stacktrace > set_ftrace_filter <...>-453 [004] ...1. 38.308956: => ksys_read => do_syscall_64 => entry_SYSCALL_64_after_hwframe The root cause is that the trace framework disables preemption when invoking the filter command callback in function_trace_probe_call: preempt_disable_notrace(); probe_ops->func(ip, parent_ip, probe_opsbe->tr, probe_ops, probe->data); preempt_enable_notrace(); Use tracing_gen_ctx_dec() to account for the preempt_disable_notrace(), which will output the correct preemption count: $ echo ksys_read:stacktrace > set_ftrace_filter <...>-410 [006] ..... 31.420396: => ksys_read => do_syscall_64 => entry_SYSCALL_64_after_hwframe Cc: stable@vger.kernel.org Fixes: 36590c50b2d07 ("tracing: Merge irqflags + preempt counter.") Link: https://lore.kernel.org/20250512094246.1167956-2-dolinux.peng@gmail.com Signed-off-by: pengdonglin Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_functions.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/kernel/trace/trace_functions.c b/kernel/trace/trace_functions.c index 69e92c7359fb..44ae51d1dc45 100644 --- a/kernel/trace/trace_functions.c +++ b/kernel/trace/trace_functions.c @@ -561,11 +561,7 @@ ftrace_traceoff(unsigned long ip, unsigned long parent_ip, static __always_inline void trace_stack(struct trace_array *tr) { - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - __trace_stack(tr, trace_ctx, FTRACE_STACK_SKIP); + __trace_stack(tr, tracing_gen_ctx_dec(), FTRACE_STACK_SKIP); } static void From 12ba469abe81c0d00b6cc5b31fee186f01c0a0b9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 May 2025 15:26:57 -0400 Subject: [PATCH 154/529] tracing: samples: Initialize trace_array_printk() with the correct function commit 1b0c192c92ea1fe2dcb178f84adf15fe37c3e7c8 upstream. When using trace_array_printk() on a created instance, the correct function to use to initialize it is: trace_array_init_printk() Not trace_printk_init_buffer() The former is a proper function to use, the latter is for initializing trace_printk() and causes the NOTICE banner to be displayed. Cc: stable@vger.kernel.org Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Divya Indi Link: https://lore.kernel.org/20250509152657.0f6744d9@gandalf.local.home Fixes: 89ed42495ef4a ("tracing: Sample module to demonstrate kernel access to Ftrace instances.") Fixes: 38ce2a9e33db6 ("tracing: Add trace_array_init_printk() to initialize instance trace_printk() buffers") Signed-off-by: Steven Rostedt (Google) Signed-off-by: Greg Kroah-Hartman --- samples/ftrace/sample-trace-array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/ftrace/sample-trace-array.c b/samples/ftrace/sample-trace-array.c index 6aba02a31c96..77685a7eb767 100644 --- a/samples/ftrace/sample-trace-array.c +++ b/samples/ftrace/sample-trace-array.c @@ -112,7 +112,7 @@ static int __init sample_trace_array_init(void) /* * If context specific per-cpu buffers havent already been allocated. */ - trace_printk_init_buffers(); + trace_array_init_printk(tr); simple_tsk = kthread_run(simple_thread, NULL, "sample-instance"); if (IS_ERR(simple_tsk)) { From f774628bc1602a2b7395f1d5c9afc0de972ec7b9 Mon Sep 17 00:00:00 2001 From: Ma Ke Date: Mon, 3 Mar 2025 15:27:39 +0800 Subject: [PATCH 155/529] phy: Fix error handling in tegra_xusb_port_init commit b2ea5f49580c0762d17d80d8083cb89bc3acf74f upstream. If device_add() fails, do not use device_unregister() for error handling. device_unregister() consists two functions: device_del() and put_device(). device_unregister() should only be called after device_add() succeeded because device_del() undoes what device_add() does if successful. Change device_unregister() to put_device() call before returning from the function. As comment of device_add() says, 'if device_add() succeeds, you should call device_del() when you want to get rid of it. If device_add() has not succeeded, use only put_device() to drop the reference count'. Found by code review. Cc: stable@vger.kernel.org Fixes: 53d2a715c240 ("phy: Add Tegra XUSB pad controller support") Signed-off-by: Ma Ke Acked-by: Thierry Reding Link: https://lore.kernel.org/r/20250303072739.3874987-1-make24@iscas.ac.cn Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/phy/tegra/xusb.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/phy/tegra/xusb.c b/drivers/phy/tegra/xusb.c index dc22b1dd2c8b..e3acd40e7cbc 100644 --- a/drivers/phy/tegra/xusb.c +++ b/drivers/phy/tegra/xusb.c @@ -542,16 +542,16 @@ static int tegra_xusb_port_init(struct tegra_xusb_port *port, err = dev_set_name(&port->dev, "%s-%u", name, index); if (err < 0) - goto unregister; + goto put_device; err = device_add(&port->dev); if (err < 0) - goto unregister; + goto put_device; return 0; -unregister: - device_unregister(&port->dev); +put_device: + put_device(&port->dev); return err; } From 9b85a453ea69a27ada7c6216b042ea6d4f821e02 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:28 +0300 Subject: [PATCH 156/529] phy: renesas: rcar-gen3-usb2: Fix role detection on unbind/bind commit 54c4c58713aaff76c2422ff5750e557ab3b100d7 upstream. It has been observed on the Renesas RZ/G3S SoC that unbinding and binding the PHY driver leads to role autodetection failures. This issue occurs when PHY 3 is the first initialized PHY. PHY 3 does not have an interrupt associated with the USB2_INT_ENABLE register (as rcar_gen3_int_enable[3] = 0). As a result, rcar_gen3_init_otg() is called to initialize OTG without enabling PHY interrupts. To resolve this, add rcar_gen3_is_any_otg_rphy_initialized() and call it in role_store(), role_show(), and rcar_gen3_init_otg(). At the same time, rcar_gen3_init_otg() is only called when initialization for a PHY with interrupt bits is in progress. As a result, the struct rcar_gen3_phy::otg_initialized is no longer needed. Fixes: 549b6b55b005 ("phy: renesas: rcar-gen3-usb2: enable/disable independent irqs") Cc: stable@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-2-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 33 ++++++++++-------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 7e61c6b278a7..64513a7921c7 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -103,7 +103,6 @@ struct rcar_gen3_phy { struct rcar_gen3_chan *ch; u32 int_enable_bits; bool initialized; - bool otg_initialized; bool powered; }; @@ -311,16 +310,15 @@ static bool rcar_gen3_is_any_rphy_initialized(struct rcar_gen3_chan *ch) return false; } -static bool rcar_gen3_needs_init_otg(struct rcar_gen3_chan *ch) +static bool rcar_gen3_is_any_otg_rphy_initialized(struct rcar_gen3_chan *ch) { - int i; - - for (i = 0; i < NUM_OF_PHYS; i++) { - if (ch->rphys[i].otg_initialized) - return false; + for (enum rcar_gen3_phy_index i = PHY_INDEX_BOTH_HC; i <= PHY_INDEX_EHCI; + i++) { + if (ch->rphys[i].initialized) + return true; } - return true; + return false; } static bool rcar_gen3_are_all_rphys_power_off(struct rcar_gen3_chan *ch) @@ -342,7 +340,7 @@ static ssize_t role_store(struct device *dev, struct device_attribute *attr, bool is_b_device; enum phy_mode cur_mode, new_mode; - if (!ch->is_otg_channel || !rcar_gen3_is_any_rphy_initialized(ch)) + if (!ch->is_otg_channel || !rcar_gen3_is_any_otg_rphy_initialized(ch)) return -EIO; if (sysfs_streq(buf, "host")) @@ -380,7 +378,7 @@ static ssize_t role_show(struct device *dev, struct device_attribute *attr, { struct rcar_gen3_chan *ch = dev_get_drvdata(dev); - if (!ch->is_otg_channel || !rcar_gen3_is_any_rphy_initialized(ch)) + if (!ch->is_otg_channel || !rcar_gen3_is_any_otg_rphy_initialized(ch)) return -EIO; return sprintf(buf, "%s\n", rcar_gen3_is_host(ch) ? "host" : @@ -393,6 +391,9 @@ static void rcar_gen3_init_otg(struct rcar_gen3_chan *ch) void __iomem *usb2_base = ch->base; u32 val; + if (!ch->is_otg_channel || rcar_gen3_is_any_otg_rphy_initialized(ch)) + return; + /* Should not use functions of read-modify-write a register */ val = readl(usb2_base + USB2_LINECTRL1); val = (val & ~USB2_LINECTRL1_DP_RPD) | USB2_LINECTRL1_DPRPD_EN | @@ -456,12 +457,9 @@ static int rcar_gen3_phy_usb2_init(struct phy *p) writel(USB2_SPD_RSM_TIMSET_INIT, usb2_base + USB2_SPD_RSM_TIMSET); writel(USB2_OC_TIMSET_INIT, usb2_base + USB2_OC_TIMSET); - /* Initialize otg part */ - if (channel->is_otg_channel) { - if (rcar_gen3_needs_init_otg(channel)) - rcar_gen3_init_otg(channel); - rphy->otg_initialized = true; - } + /* Initialize otg part (only if we initialize a PHY with IRQs). */ + if (rphy->int_enable_bits) + rcar_gen3_init_otg(channel); rphy->initialized = true; @@ -477,9 +475,6 @@ static int rcar_gen3_phy_usb2_exit(struct phy *p) rphy->initialized = false; - if (channel->is_otg_channel) - rphy->otg_initialized = false; - val = readl(usb2_base + USB2_INT_ENABLE); val &= ~rphy->int_enable_bits; if (!rcar_gen3_is_any_rphy_initialized(channel)) From 4861b3d2a4789f7c25ad08a8cc517256f3da15ee Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:32 +0300 Subject: [PATCH 157/529] phy: renesas: rcar-gen3-usb2: Set timing registers only once commit 86e70849f4b2b4597ac9f7c7931f2a363774be25 upstream. phy-rcar-gen3-usb2 driver exports 4 PHYs. The timing registers are common to all PHYs. There is no need to set them every time a PHY is initialized. Set timing register only when the 1st PHY is initialized. Fixes: f3b5a8d9b50d ("phy: rcar-gen3-usb2: Add R-Car Gen3 USB2 PHY driver") Cc: stable@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-6-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 64513a7921c7..3824e338b61e 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -454,8 +454,11 @@ static int rcar_gen3_phy_usb2_init(struct phy *p) val = readl(usb2_base + USB2_INT_ENABLE); val |= USB2_INT_ENABLE_UCOM_INTEN | rphy->int_enable_bits; writel(val, usb2_base + USB2_INT_ENABLE); - writel(USB2_SPD_RSM_TIMSET_INIT, usb2_base + USB2_SPD_RSM_TIMSET); - writel(USB2_OC_TIMSET_INIT, usb2_base + USB2_OC_TIMSET); + + if (!rcar_gen3_is_any_rphy_initialized(channel)) { + writel(USB2_SPD_RSM_TIMSET_INIT, usb2_base + USB2_SPD_RSM_TIMSET); + writel(USB2_OC_TIMSET_INIT, usb2_base + USB2_OC_TIMSET); + } /* Initialize otg part (only if we initialize a PHY with IRQs). */ if (rphy->int_enable_bits) From d5b4310993947b0c561d4d648ca96cd464c58f57 Mon Sep 17 00:00:00 2001 From: Steve Siwinski Date: Thu, 8 May 2025 16:01:22 -0400 Subject: [PATCH 158/529] scsi: sd_zbc: block: Respect bio vector limits for REPORT ZONES buffer commit e8007fad5457ea547ca63bb011fdb03213571c7e upstream. The REPORT ZONES buffer size is currently limited by the HBA's maximum segment count to ensure the buffer can be mapped. However, the block layer further limits the number of iovec entries to 1024 when allocating a bio. To avoid allocation of buffers too large to be mapped, further restrict the maximum buffer size to BIO_MAX_INLINE_VECS. Replace the UIO_MAXIOV symbolic name with the more contextually appropriate BIO_MAX_INLINE_VECS. Fixes: b091ac616846 ("sd_zbc: Fix report zones buffer allocation") Cc: stable@vger.kernel.org Signed-off-by: Steve Siwinski Link: https://lore.kernel.org/r/20250508200122.243129-1-ssiwinski@atto.com Reviewed-by: Damien Le Moal Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- block/bio.c | 2 +- drivers/scsi/sd_zbc.c | 6 +++++- include/linux/bio.h | 1 + 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/block/bio.c b/block/bio.c index a7323d567c79..7e2ce61bf247 100644 --- a/block/bio.c +++ b/block/bio.c @@ -576,7 +576,7 @@ struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { struct bio *bio; - if (nr_vecs > UIO_MAXIOV) + if (nr_vecs > BIO_MAX_INLINE_VECS) return NULL; return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } diff --git a/drivers/scsi/sd_zbc.c b/drivers/scsi/sd_zbc.c index 4c78288ffa72..5fc6bf95258c 100644 --- a/drivers/scsi/sd_zbc.c +++ b/drivers/scsi/sd_zbc.c @@ -197,6 +197,7 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, unsigned int nr_zones, size_t *buflen) { struct request_queue *q = sdkp->disk->queue; + unsigned int max_segments; size_t bufsize; void *buf; @@ -208,12 +209,15 @@ static void *sd_zbc_alloc_report_buffer(struct scsi_disk *sdkp, * Furthermore, since the report zone command cannot be split, make * sure that the allocated buffer can always be mapped by limiting the * number of pages allocated to the HBA max segments limit. + * Since max segments can be larger than the max inline bio vectors, + * further limit the allocated buffer to BIO_MAX_INLINE_VECS. */ nr_zones = min(nr_zones, sdkp->zone_info.nr_zones); bufsize = roundup((nr_zones + 1) * 64, SECTOR_SIZE); bufsize = min_t(size_t, bufsize, queue_max_hw_sectors(q) << SECTOR_SHIFT); - bufsize = min_t(size_t, bufsize, queue_max_segments(q) << PAGE_SHIFT); + max_segments = min(BIO_MAX_INLINE_VECS, queue_max_segments(q)); + bufsize = min_t(size_t, bufsize, max_segments << PAGE_SHIFT); while (bufsize >= SECTOR_SIZE) { buf = kvzalloc(bufsize, GFP_KERNEL | __GFP_NORETRY); diff --git a/include/linux/bio.h b/include/linux/bio.h index 2b9fc5edf49d..00ab98ce1d43 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -11,6 +11,7 @@ #include #define BIO_MAX_VECS 256U +#define BIO_MAX_INLINE_VECS UIO_MAXIOV static inline unsigned int bio_max_segs(unsigned int nr_segs) { From e424894340abe0a1034829b635e1646be473aab4 Mon Sep 17 00:00:00 2001 From: Jethro Donaldson Date: Thu, 15 May 2025 01:23:23 +1200 Subject: [PATCH 159/529] smb: client: fix memory leak during error handling for POSIX mkdir commit 1fe4a44b7fa3955bcb7b4067c07b778fe90d8ee7 upstream. The response buffer for the CREATE request handled by smb311_posix_mkdir() is leaked on the error path (goto err_free_rsp_buf) because the structure pointer *rsp passed to free_rsp_buf() is not assigned until *after* the error condition is checked. As *rsp is initialised to NULL, free_rsp_buf() becomes a no-op and the leak is instead reported by __kmem_cache_shutdown() upon subsequent rmmod of cifs.ko if (and only if) the error path has been hit. Pass rsp_iov.iov_base to free_rsp_buf() instead, similar to the code in other functions in smb2pdu.c for which *rsp is assigned late. Cc: stable@vger.kernel.org Signed-off-by: Jethro Donaldson Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/client/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 96faa22b9cb6..c28e39c67c4f 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2826,7 +2826,7 @@ int smb311_posix_mkdir(const unsigned int xid, struct inode *inode, /* Eventually save off posix specific response info and timestaps */ err_free_rsp_buf: - free_rsp_buf(resp_buftype, rsp); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); kfree(pc_buf); err_free_req: cifs_small_buf_release(req); From b892e830d1ea8c5475254b98827771f7366f1039 Mon Sep 17 00:00:00 2001 From: Fedor Pchelkin Date: Tue, 6 May 2025 14:55:39 +0300 Subject: [PATCH 160/529] wifi: mt76: disable napi on driver removal commit 78ab4be549533432d97ea8989d2f00b508fa68d8 upstream. A warning on driver removal started occurring after commit 9dd05df8403b ("net: warn if NAPI instance wasn't shut down"). Disable tx napi before deleting it in mt76_dma_cleanup(). WARNING: CPU: 4 PID: 18828 at net/core/dev.c:7288 __netif_napi_del_locked+0xf0/0x100 CPU: 4 UID: 0 PID: 18828 Comm: modprobe Not tainted 6.15.0-rc4 #4 PREEMPT(lazy) Hardware name: ASUS System Product Name/PRIME X670E-PRO WIFI, BIOS 3035 09/05/2024 RIP: 0010:__netif_napi_del_locked+0xf0/0x100 Call Trace: mt76_dma_cleanup+0x54/0x2f0 [mt76] mt7921_pci_remove+0xd5/0x190 [mt7921e] pci_device_remove+0x47/0xc0 device_release_driver_internal+0x19e/0x200 driver_detach+0x48/0x90 bus_remove_driver+0x6d/0xf0 pci_unregister_driver+0x2e/0xb0 __do_sys_delete_module.isra.0+0x197/0x2e0 do_syscall_64+0x7b/0x160 entry_SYSCALL_64_after_hwframe+0x76/0x7e Tested with mt7921e but the same pattern can be actually applied to other mt76 drivers calling mt76_dma_cleanup() during removal. Tx napi is enabled in their *_dma_init() functions and only toggled off and on again inside their suspend/resume/reset paths. So it should be okay to disable tx napi in such a generic way. Found by Linux Verification Center (linuxtesting.org). Fixes: 2ac515a5d74f ("mt76: mt76x02: use napi polling for tx cleanup") Cc: stable@vger.kernel.org Signed-off-by: Fedor Pchelkin Tested-by: Ming Yen Hsieh Link: https://patch.msgid.link/20250506115540.19045-1-pchelkin@ispras.ru Signed-off-by: Felix Fietkau Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/mediatek/mt76/dma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/mediatek/mt76/dma.c b/drivers/net/wireless/mediatek/mt76/dma.c index c406cb1a102f..e5c72a7d6e92 100644 --- a/drivers/net/wireless/mediatek/mt76/dma.c +++ b/drivers/net/wireless/mediatek/mt76/dma.c @@ -789,6 +789,7 @@ void mt76_dma_cleanup(struct mt76_dev *dev) int i; mt76_worker_disable(&dev->tx_worker); + napi_disable(&dev->tx_napi); netif_napi_del(&dev->tx_napi); for (i = 0; i < ARRAY_SIZE(dev->phys); i++) { From 8e460b77b7cc20e6b4af9759cf8abf2de10c4a74 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 7 May 2025 21:47:45 +0100 Subject: [PATCH 161/529] net: qede: Initialize qede_ll_ops with designated initializer commit 6b3ab7f2cbfaeb6580709cd8ef4d72cfd01bfde4 upstream. After a recent change [1] in clang's randstruct implementation to randomize structures that only contain function pointers, there is an error because qede_ll_ops get randomized but does not use a designated initializer for the first member: drivers/net/ethernet/qlogic/qede/qede_main.c:206:2: error: a randomized struct can only be initialized with a designated initializer 206 | { | ^ Explicitly initialize the common member using a designated initializer to fix the build. Cc: stable@vger.kernel.org Fixes: 035f7f87b729 ("randstruct: Enable Clang support") Link: https://github.com/llvm/llvm-project/commit/04364fb888eea6db9811510607bed4b200bcb082 [1] Signed-off-by: Nathan Chancellor Link: https://patch.msgid.link/20250507-qede-fix-clang-randstruct-v1-1-5ccc15626fba@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qede/qede_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index dc43e74147fb..4bc950d36607 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -204,7 +204,7 @@ static struct pci_driver qede_pci_driver = { }; static struct qed_eth_cb_ops qede_ll_ops = { - { + .common = { #ifdef CONFIG_RFS_ACCEL .arfs_filter_op = qede_arfs_filter_op, #endif From df5987e76a4ae4cbd705d81ab4b15ed232250a4a Mon Sep 17 00:00:00 2001 From: Ronald Wahl Date: Mon, 14 Apr 2025 19:31:13 +0200 Subject: [PATCH 162/529] dmaengine: ti: k3-udma: Add missing locking commit fca280992af8c2fbd511bc43f65abb4a17363f2f upstream. Recent kernels complain about a missing lock in k3-udma.c when the lock validator is enabled: [ 4.128073] WARNING: CPU: 0 PID: 746 at drivers/dma/ti/../virt-dma.h:169 udma_start.isra.0+0x34/0x238 [ 4.137352] CPU: 0 UID: 0 PID: 746 Comm: kworker/0:3 Not tainted 6.12.9-arm64 #28 [ 4.144867] Hardware name: pp-v12 (DT) [ 4.148648] Workqueue: events udma_check_tx_completion [ 4.153841] pstate: 60000005 (nZCv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 4.160834] pc : udma_start.isra.0+0x34/0x238 [ 4.165227] lr : udma_start.isra.0+0x30/0x238 [ 4.169618] sp : ffffffc083cabcf0 [ 4.172963] x29: ffffffc083cabcf0 x28: 0000000000000000 x27: ffffff800001b005 [ 4.180167] x26: ffffffc0812f0000 x25: 0000000000000000 x24: 0000000000000000 [ 4.187370] x23: 0000000000000001 x22: 00000000e21eabe9 x21: ffffff8000fa0670 [ 4.194571] x20: ffffff8001b6bf00 x19: ffffff8000fa0430 x18: ffffffc083b95030 [ 4.201773] x17: 0000000000000000 x16: 00000000f0000000 x15: 0000000000000048 [ 4.208976] x14: 0000000000000048 x13: 0000000000000000 x12: 0000000000000001 [ 4.216179] x11: ffffffc08151a240 x10: 0000000000003ea1 x9 : ffffffc08046ab68 [ 4.223381] x8 : ffffffc083cabac0 x7 : ffffffc081df3718 x6 : 0000000000029fc8 [ 4.230583] x5 : ffffffc0817ee6d8 x4 : 0000000000000bc0 x3 : 0000000000000000 [ 4.237784] x2 : 0000000000000000 x1 : 00000000001fffff x0 : 0000000000000000 [ 4.244986] Call trace: [ 4.247463] udma_start.isra.0+0x34/0x238 [ 4.251509] udma_check_tx_completion+0xd0/0xdc [ 4.256076] process_one_work+0x244/0x3fc [ 4.260129] process_scheduled_works+0x6c/0x74 [ 4.264610] worker_thread+0x150/0x1dc [ 4.268398] kthread+0xd8/0xe8 [ 4.271492] ret_from_fork+0x10/0x20 [ 4.275107] irq event stamp: 220 [ 4.278363] hardirqs last enabled at (219): [] _raw_spin_unlock_irq+0x38/0x50 [ 4.287183] hardirqs last disabled at (220): [] el1_dbg+0x24/0x50 [ 4.294879] softirqs last enabled at (182): [] handle_softirqs+0x1c0/0x3cc [ 4.303437] softirqs last disabled at (177): [] __do_softirq+0x1c/0x28 [ 4.311559] ---[ end trace 0000000000000000 ]--- This commit adds the missing locking. Fixes: 25dcb5dd7b7c ("dmaengine: ti: New driver for K3 UDMA") Cc: Peter Ujfalusi Cc: Vignesh Raghavendra Cc: Vinod Koul Cc: dmaengine@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Ronald Wahl Acked-by: Peter Ujfalusi Link: https://lore.kernel.org/r/20250414173113.80677-1-rwahl@gmx.de Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/ti/k3-udma.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index e323e1a5f20f..18db49fbcc00 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -1088,8 +1088,11 @@ static void udma_check_tx_completion(struct work_struct *work) u32 residue_diff; ktime_t time_diff; unsigned long delay; + unsigned long flags; while (1) { + spin_lock_irqsave(&uc->vc.lock, flags); + if (uc->desc) { /* Get previous residue and time stamp */ residue_diff = uc->tx_drain.residue; @@ -1124,6 +1127,8 @@ static void udma_check_tx_completion(struct work_struct *work) break; } + spin_unlock_irqrestore(&uc->vc.lock, flags); + usleep_range(ktime_to_us(delay), ktime_to_us(delay) + 10); continue; @@ -1140,6 +1145,8 @@ static void udma_check_tx_completion(struct work_struct *work) break; } + + spin_unlock_irqrestore(&uc->vc.lock, flags); } static irqreturn_t udma_ring_irq_handler(int irq, void *data) From c70ec7dc880cf23f70f0745d384267810f289878 Mon Sep 17 00:00:00 2001 From: Yemike Abhilash Chandra Date: Thu, 17 Apr 2025 13:25:21 +0530 Subject: [PATCH 163/529] dmaengine: ti: k3-udma: Use cap_mask directly from dma_device structure instead of a local copy commit 8ca9590c39b69b55a8de63d2b21b0d44f523b43a upstream. Currently, a local dma_cap_mask_t variable is used to store device cap_mask within udma_of_xlate(). However, the DMA_PRIVATE flag in the device cap_mask can get cleared when the last channel is released. This can happen right after storing the cap_mask locally in udma_of_xlate(), and subsequent dma_request_channel() can fail due to mismatch in the cap_mask. Fix this by removing the local dma_cap_mask_t variable and directly using the one from the dma_device structure. Fixes: 25dcb5dd7b7c ("dmaengine: ti: New driver for K3 UDMA") Cc: stable@vger.kernel.org Signed-off-by: Vaishnav Achath Acked-by: Peter Ujfalusi Reviewed-by: Udit Kumar Signed-off-by: Yemike Abhilash Chandra Link: https://lore.kernel.org/r/20250417075521.623651-1-y-abhilashchandra@ti.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/ti/k3-udma.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/dma/ti/k3-udma.c b/drivers/dma/ti/k3-udma.c index 18db49fbcc00..edad538928dd 100644 --- a/drivers/dma/ti/k3-udma.c +++ b/drivers/dma/ti/k3-udma.c @@ -4216,7 +4216,6 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec, struct of_dma *ofdma) { struct udma_dev *ud = ofdma->of_dma_data; - dma_cap_mask_t mask = ud->ddev.cap_mask; struct udma_filter_param filter_param; struct dma_chan *chan; @@ -4248,7 +4247,7 @@ static struct dma_chan *udma_of_xlate(struct of_phandle_args *dma_spec, } } - chan = __dma_request_channel(&mask, udma_dma_filter_fn, &filter_param, + chan = __dma_request_channel(&ud->ddev.cap_mask, udma_dma_filter_fn, &filter_param, ofdma->of_node); if (!chan) { dev_err(ud->dev, "get channel fail in %s.\n", __func__); From d584acdf54f409cb7eae1359ae6c12aaabedeed8 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:09 +0800 Subject: [PATCH 164/529] dmaengine: idxd: fix memory leak in error handling path of idxd_setup_wqs commit 3fd2f4bc010cdfbc07dd21018dc65bd9370eb7a4 upstream. Memory allocated for wqs is not freed if an error occurs during idxd_setup_wqs(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: 7c5dd23e57c1 ("dmaengine: idxd: fix wq conf_dev 'struct device' lifetime") Fixes: 700af3a0a26c ("dmaengine: idxd: add 'struct idxd_dev' as wrapper for conf_dev") Fixes: de5819b99489 ("dmaengine: idxd: track enabled workqueues in bitmap") Fixes: b0325aefd398 ("dmaengine: idxd: add WQ operation cap restriction support") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-2-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/init.c | 30 +++++++++++++++++++++--------- 1 file changed, 21 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 30193195c813..9763769b301c 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -155,8 +155,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd) idxd->wq_enable_map = bitmap_zalloc_node(idxd->max_wqs, GFP_KERNEL, dev_to_node(dev)); if (!idxd->wq_enable_map) { - kfree(idxd->wqs); - return -ENOMEM; + rc = -ENOMEM; + goto err_bitmap; } for (i = 0; i < idxd->max_wqs; i++) { @@ -175,10 +175,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd) conf_dev->bus = &dsa_bus_type; conf_dev->type = &idxd_wq_device_type; rc = dev_set_name(conf_dev, "wq%d.%d", idxd->id, wq->id); - if (rc < 0) { - put_device(conf_dev); + if (rc < 0) goto err; - } mutex_init(&wq->wq_lock); init_waitqueue_head(&wq->err_queue); @@ -189,7 +187,6 @@ static int idxd_setup_wqs(struct idxd_device *idxd) wq->enqcmds_retries = IDXD_ENQCMDS_RETRIES; wq->wqcfg = kzalloc_node(idxd->wqcfg_size, GFP_KERNEL, dev_to_node(dev)); if (!wq->wqcfg) { - put_device(conf_dev); rc = -ENOMEM; goto err; } @@ -197,9 +194,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd) if (idxd->hw.wq_cap.op_config) { wq->opcap_bmap = bitmap_zalloc(IDXD_MAX_OPCAP_BITS, GFP_KERNEL); if (!wq->opcap_bmap) { - put_device(conf_dev); rc = -ENOMEM; - goto err; + goto err_opcap_bmap; } bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS); } @@ -208,12 +204,28 @@ static int idxd_setup_wqs(struct idxd_device *idxd) return 0; - err: +err_opcap_bmap: + kfree(wq->wqcfg); + +err: + put_device(conf_dev); + kfree(wq); + while (--i >= 0) { wq = idxd->wqs[i]; + if (idxd->hw.wq_cap.op_config) + bitmap_free(wq->opcap_bmap); + kfree(wq->wqcfg); conf_dev = wq_confdev(wq); put_device(conf_dev); + kfree(wq); + } + bitmap_free(idxd->wq_enable_map); + +err_bitmap: + kfree(idxd->wqs); + return rc; } From 11fd63ea08b9ff18a9020bf48be2aee34920590d Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:10 +0800 Subject: [PATCH 165/529] dmaengine: idxd: fix memory leak in error handling path of idxd_setup_engines commit 817bced19d1dbdd0b473580d026dc0983e30e17b upstream. Memory allocated for engines is not freed if an error occurs during idxd_setup_engines(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: 75b911309060 ("dmaengine: idxd: fix engine conf_dev lifetime") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-3-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/init.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 9763769b301c..fef609cde83e 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -259,6 +259,7 @@ static int idxd_setup_engines(struct idxd_device *idxd) rc = dev_set_name(conf_dev, "engine%d.%d", idxd->id, engine->id); if (rc < 0) { put_device(conf_dev); + kfree(engine); goto err; } @@ -272,7 +273,10 @@ static int idxd_setup_engines(struct idxd_device *idxd) engine = idxd->engines[i]; conf_dev = engine_confdev(engine); put_device(conf_dev); + kfree(engine); } + kfree(idxd->engines); + return rc; } From 50f23001780a52c0ea490bdd65ebd842b02df78a Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:11 +0800 Subject: [PATCH 166/529] dmaengine: idxd: fix memory leak in error handling path of idxd_setup_groups commit aa6f4f945b10eac57aed46154ae7d6fada7fccc7 upstream. Memory allocated for groups is not freed if an error occurs during idxd_setup_groups(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: defe49f96012 ("dmaengine: idxd: fix group conf_dev lifetime") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-4-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/init.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index fef609cde83e..31158466f1ed 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -310,6 +310,7 @@ static int idxd_setup_groups(struct idxd_device *idxd) rc = dev_set_name(conf_dev, "group%d.%d", idxd->id, group->id); if (rc < 0) { put_device(conf_dev); + kfree(group); goto err; } @@ -329,7 +330,10 @@ static int idxd_setup_groups(struct idxd_device *idxd) while (--i >= 0) { group = idxd->groups[i]; put_device(group_confdev(group)); + kfree(group); } + kfree(idxd->groups); + return rc; } From 9b4947544bbd0968c8eef1d8de7a9e752e87505d Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:12 +0800 Subject: [PATCH 167/529] dmaengine: idxd: Add missing cleanup for early error out in idxd_setup_internals commit 61259fb96e023f7299c442c48b13e72c441fc0f2 upstream. The idxd_setup_internals() is missing some cleanup when things fail in the middle. Add the appropriate cleanup routines: - cleanup groups - cleanup enginces - cleanup wqs to make sure it exits gracefully. Fixes: defe49f96012 ("dmaengine: idxd: fix group conf_dev lifetime") Cc: stable@vger.kernel.org Suggested-by: Fenghua Yu Signed-off-by: Shuai Xue Reviewed-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20250404120217.48772-5-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/init.c | 58 ++++++++++++++++++++++++++++++++++++----- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 31158466f1ed..3e89891781e3 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -141,6 +141,25 @@ static void idxd_cleanup_interrupts(struct idxd_device *idxd) pci_free_irq_vectors(pdev); } +static void idxd_clean_wqs(struct idxd_device *idxd) +{ + struct idxd_wq *wq; + struct device *conf_dev; + int i; + + for (i = 0; i < idxd->max_wqs; i++) { + wq = idxd->wqs[i]; + if (idxd->hw.wq_cap.op_config) + bitmap_free(wq->opcap_bmap); + kfree(wq->wqcfg); + conf_dev = wq_confdev(wq); + put_device(conf_dev); + kfree(wq); + } + bitmap_free(idxd->wq_enable_map); + kfree(idxd->wqs); +} + static int idxd_setup_wqs(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -229,6 +248,21 @@ err_bitmap: return rc; } +static void idxd_clean_engines(struct idxd_device *idxd) +{ + struct idxd_engine *engine; + struct device *conf_dev; + int i; + + for (i = 0; i < idxd->max_engines; i++) { + engine = idxd->engines[i]; + conf_dev = engine_confdev(engine); + put_device(conf_dev); + kfree(engine); + } + kfree(idxd->engines); +} + static int idxd_setup_engines(struct idxd_device *idxd) { struct idxd_engine *engine; @@ -280,6 +314,19 @@ static int idxd_setup_engines(struct idxd_device *idxd) return rc; } +static void idxd_clean_groups(struct idxd_device *idxd) +{ + struct idxd_group *group; + int i; + + for (i = 0; i < idxd->max_groups; i++) { + group = idxd->groups[i]; + put_device(group_confdev(group)); + kfree(group); + } + kfree(idxd->groups); +} + static int idxd_setup_groups(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; @@ -353,7 +400,7 @@ static void idxd_cleanup_internals(struct idxd_device *idxd) static int idxd_setup_internals(struct idxd_device *idxd) { struct device *dev = &idxd->pdev->dev; - int rc, i; + int rc; init_waitqueue_head(&idxd->cmd_waitq); @@ -378,14 +425,11 @@ static int idxd_setup_internals(struct idxd_device *idxd) return 0; err_wkq_create: - for (i = 0; i < idxd->max_groups; i++) - put_device(group_confdev(idxd->groups[i])); + idxd_clean_groups(idxd); err_group: - for (i = 0; i < idxd->max_engines; i++) - put_device(engine_confdev(idxd->engines[i])); + idxd_clean_engines(idxd); err_engine: - for (i = 0; i < idxd->max_wqs; i++) - put_device(wq_confdev(idxd->wqs[i])); + idxd_clean_wqs(idxd); err_wqs: return rc; } From 24d9c14fdc63b6df552d5bbe71f9fc113690a1e5 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:13 +0800 Subject: [PATCH 168/529] dmaengine: idxd: Add missing cleanups in cleanup internals commit 61d651572b6c4fe50c7b39a390760f3a910c7ccf upstream. The idxd_cleanup_internals() function only decreases the reference count of groups, engines, and wqs but is missing the step to release memory resources. To fix this, use the cleanup helper to properly release the memory resources. Fixes: ddf742d4f3f1 ("dmaengine: idxd: Add missing cleanup for early error out in probe call") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20250404120217.48772-6-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/init.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 3e89891781e3..6b5c1e803793 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -386,14 +386,9 @@ static int idxd_setup_groups(struct idxd_device *idxd) static void idxd_cleanup_internals(struct idxd_device *idxd) { - int i; - - for (i = 0; i < idxd->max_groups; i++) - put_device(group_confdev(idxd->groups[i])); - for (i = 0; i < idxd->max_engines; i++) - put_device(engine_confdev(idxd->engines[i])); - for (i = 0; i < idxd->max_wqs; i++) - put_device(wq_confdev(idxd->wqs[i])); + idxd_clean_groups(idxd); + idxd_clean_engines(idxd); + idxd_clean_wqs(idxd); destroy_workqueue(idxd->wq); } From 68ac5a01f635b3791196fd1c39bc48497252c36f Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:16 +0800 Subject: [PATCH 169/529] dmaengine: idxd: Add missing idxd cleanup to fix memory leak in remove call commit d5449ff1b04dfe9ed8e455769aa01e4c2ccf6805 upstream. The remove call stack is missing idxd cleanup to free bitmap, ida and the idxd_device. Call idxd_free() helper routines to make sure we exit gracefully. Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Cc: stable@vger.kernel.org Suggested-by: Vinicius Costa Gomes Signed-off-by: Shuai Xue Reviewed-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20250404120217.48772-9-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/init.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 6b5c1e803793..a84f5a676666 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -796,6 +796,7 @@ static void idxd_remove(struct pci_dev *pdev) destroy_workqueue(idxd->wq); perfmon_pmu_remove(idxd); put_device(idxd_confdev(idxd)); + idxd_free(idxd); } static struct pci_driver idxd_pci_driver = { From 64afd9a1f644b27661420257dcc007d5009c99dd Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:14 +0800 Subject: [PATCH 170/529] dmaengine: idxd: fix memory leak in error handling path of idxd_alloc commit 46a5cca76c76c86063000a12936f8e7875295838 upstream. Memory allocated for idxd is not freed if an error occurs during idxd_alloc(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: a8563a33a5e2 ("dmanegine: idxd: reformat opcap output to match bitmap_parse() input") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-7-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/init.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index a84f5a676666..d29178dfae3f 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -537,28 +537,34 @@ static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_d idxd_dev_set_type(&idxd->idxd_dev, idxd->data->type); idxd->id = ida_alloc(&idxd_ida, GFP_KERNEL); if (idxd->id < 0) - return NULL; + goto err_ida; idxd->opcap_bmap = bitmap_zalloc_node(IDXD_MAX_OPCAP_BITS, GFP_KERNEL, dev_to_node(dev)); - if (!idxd->opcap_bmap) { - ida_free(&idxd_ida, idxd->id); - return NULL; - } + if (!idxd->opcap_bmap) + goto err_opcap; device_initialize(conf_dev); conf_dev->parent = dev; conf_dev->bus = &dsa_bus_type; conf_dev->type = idxd->data->dev_type; rc = dev_set_name(conf_dev, "%s%d", idxd->data->name_prefix, idxd->id); - if (rc < 0) { - put_device(conf_dev); - return NULL; - } + if (rc < 0) + goto err_name; spin_lock_init(&idxd->dev_lock); spin_lock_init(&idxd->cmd_lock); return idxd; + +err_name: + put_device(conf_dev); + bitmap_free(idxd->opcap_bmap); +err_opcap: + ida_free(&idxd_ida, idxd->id); +err_ida: + kfree(idxd); + + return NULL; } static int idxd_enable_system_pasid(struct idxd_device *idxd) From d8ef6140fe9e6c4062203ae1a2ec95cf1f861775 Mon Sep 17 00:00:00 2001 From: Shuai Xue Date: Fri, 4 Apr 2025 20:02:15 +0800 Subject: [PATCH 171/529] dmaengine: idxd: fix memory leak in error handling path of idxd_pci_probe commit 90022b3a6981ec234902be5dbf0f983a12c759fc upstream. Memory allocated for idxd is not freed if an error occurs during idxd_pci_probe(). To fix it, free the allocated memory in the reverse order of allocation before exiting the function in case of an error. Fixes: bfe1d56091c1 ("dmaengine: idxd: Init and probe for Intel data accelerators") Cc: stable@vger.kernel.org Signed-off-by: Shuai Xue Reviewed-by: Dave Jiang Reviewed-by: Fenghua Yu Link: https://lore.kernel.org/r/20250404120217.48772-8-xueshuai@linux.alibaba.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/init.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index d29178dfae3f..7cb76db5ad60 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -520,6 +520,17 @@ static void idxd_read_caps(struct idxd_device *idxd) multi_u64_to_bmap(idxd->opcap_bmap, &idxd->hw.opcap.bits[0], 4); } +static void idxd_free(struct idxd_device *idxd) +{ + if (!idxd) + return; + + put_device(idxd_confdev(idxd)); + bitmap_free(idxd->opcap_bmap); + ida_free(&idxd_ida, idxd->id); + kfree(idxd); +} + static struct idxd_device *idxd_alloc(struct pci_dev *pdev, struct idxd_driver_data *data) { struct device *dev = &pdev->dev; @@ -739,7 +750,7 @@ static int idxd_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) err: pci_iounmap(pdev, idxd->reg_base); err_iomap: - put_device(idxd_confdev(idxd)); + idxd_free(idxd); err_idxd_alloc: pci_disable_device(pdev); return rc; From f32451ca4cb7dc53f2a0e2e66b84d34162747eb7 Mon Sep 17 00:00:00 2001 From: Andrei Kuchynski Date: Thu, 24 Apr 2025 08:44:28 +0000 Subject: [PATCH 172/529] usb: typec: ucsi: displayport: Fix deadlock commit 364618c89d4c57c85e5fc51a2446cd939bf57802 upstream. This patch introduces the ucsi_con_mutex_lock / ucsi_con_mutex_unlock functions to the UCSI driver. ucsi_con_mutex_lock ensures the connector mutex is only locked if a connection is established and the partner pointer is valid. This resolves a deadlock scenario where ucsi_displayport_remove_partner holds con->mutex waiting for dp_altmode_work to complete while dp_altmode_work attempts to acquire it. Cc: stable Fixes: af8622f6a585 ("usb: typec: ucsi: Support for DisplayPort alt mode") Signed-off-by: Andrei Kuchynski Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250424084429.3220757-2-akuchynski@chromium.org Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/displayport.c | 19 +++++++++------- drivers/usb/typec/ucsi/ucsi.c | 34 ++++++++++++++++++++++++++++ drivers/usb/typec/ucsi/ucsi.h | 3 +++ 3 files changed, 48 insertions(+), 8 deletions(-) diff --git a/drivers/usb/typec/ucsi/displayport.c b/drivers/usb/typec/ucsi/displayport.c index 8c19081c3255..e3b5fa3b5f95 100644 --- a/drivers/usb/typec/ucsi/displayport.c +++ b/drivers/usb/typec/ucsi/displayport.c @@ -54,7 +54,8 @@ static int ucsi_displayport_enter(struct typec_altmode *alt, u32 *vdo) u8 cur = 0; int ret; - mutex_lock(&dp->con->lock); + if (!ucsi_con_mutex_lock(dp->con)) + return -ENOTCONN; if (!dp->override && dp->initialized) { const struct typec_altmode *p = typec_altmode_get_partner(alt); @@ -100,7 +101,7 @@ static int ucsi_displayport_enter(struct typec_altmode *alt, u32 *vdo) schedule_work(&dp->work); ret = 0; err_unlock: - mutex_unlock(&dp->con->lock); + ucsi_con_mutex_unlock(dp->con); return ret; } @@ -112,7 +113,8 @@ static int ucsi_displayport_exit(struct typec_altmode *alt) u64 command; int ret = 0; - mutex_lock(&dp->con->lock); + if (!ucsi_con_mutex_lock(dp->con)) + return -ENOTCONN; if (!dp->override) { const struct typec_altmode *p = typec_altmode_get_partner(alt); @@ -144,7 +146,7 @@ static int ucsi_displayport_exit(struct typec_altmode *alt) schedule_work(&dp->work); out_unlock: - mutex_unlock(&dp->con->lock); + ucsi_con_mutex_unlock(dp->con); return ret; } @@ -202,20 +204,21 @@ static int ucsi_displayport_vdm(struct typec_altmode *alt, int cmd = PD_VDO_CMD(header); int svdm_version; - mutex_lock(&dp->con->lock); + if (!ucsi_con_mutex_lock(dp->con)) + return -ENOTCONN; if (!dp->override && dp->initialized) { const struct typec_altmode *p = typec_altmode_get_partner(alt); dev_warn(&p->dev, "firmware doesn't support alternate mode overriding\n"); - mutex_unlock(&dp->con->lock); + ucsi_con_mutex_unlock(dp->con); return -EOPNOTSUPP; } svdm_version = typec_altmode_get_svdm_version(alt); if (svdm_version < 0) { - mutex_unlock(&dp->con->lock); + ucsi_con_mutex_unlock(dp->con); return svdm_version; } @@ -259,7 +262,7 @@ static int ucsi_displayport_vdm(struct typec_altmode *alt, break; } - mutex_unlock(&dp->con->lock); + ucsi_con_mutex_unlock(dp->con); return 0; } diff --git a/drivers/usb/typec/ucsi/ucsi.c b/drivers/usb/typec/ucsi/ucsi.c index 2adf5fdc0c56..2a03bb992806 100644 --- a/drivers/usb/typec/ucsi/ucsi.c +++ b/drivers/usb/typec/ucsi/ucsi.c @@ -1398,6 +1398,40 @@ void ucsi_set_drvdata(struct ucsi *ucsi, void *data) } EXPORT_SYMBOL_GPL(ucsi_set_drvdata); +/** + * ucsi_con_mutex_lock - Acquire the connector mutex + * @con: The connector interface to lock + * + * Returns true on success, false if the connector is disconnected + */ +bool ucsi_con_mutex_lock(struct ucsi_connector *con) +{ + bool mutex_locked = false; + bool connected = true; + + while (connected && !mutex_locked) { + mutex_locked = mutex_trylock(&con->lock) != 0; + connected = con->status.flags & UCSI_CONSTAT_CONNECTED; + if (connected && !mutex_locked) + msleep(20); + } + + connected = connected && con->partner; + if (!connected && mutex_locked) + mutex_unlock(&con->lock); + + return connected; +} + +/** + * ucsi_con_mutex_unlock - Release the connector mutex + * @con: The connector interface to unlock + */ +void ucsi_con_mutex_unlock(struct ucsi_connector *con) +{ + mutex_unlock(&con->lock); +} + /** * ucsi_create - Allocate UCSI instance * @dev: Device interface to the PPM (Platform Policy Manager) diff --git a/drivers/usb/typec/ucsi/ucsi.h b/drivers/usb/typec/ucsi/ucsi.h index 4a1a86e37fd5..793a8307dded 100644 --- a/drivers/usb/typec/ucsi/ucsi.h +++ b/drivers/usb/typec/ucsi/ucsi.h @@ -15,6 +15,7 @@ struct ucsi; struct ucsi_altmode; +struct ucsi_connector; /* UCSI offsets (Bytes) */ #define UCSI_VERSION 0 @@ -62,6 +63,8 @@ int ucsi_register(struct ucsi *ucsi); void ucsi_unregister(struct ucsi *ucsi); void *ucsi_get_drvdata(struct ucsi *ucsi); void ucsi_set_drvdata(struct ucsi *ucsi, void *data); +bool ucsi_con_mutex_lock(struct ucsi_connector *con); +void ucsi_con_mutex_unlock(struct ucsi_connector *con); void ucsi_connector_change(struct ucsi *ucsi, u8 num); From f1c5ddaef506e3517dce338c08a60663b1521920 Mon Sep 17 00:00:00 2001 From: RD Babiera Date: Thu, 29 Feb 2024 00:11:02 +0000 Subject: [PATCH 173/529] usb: typec: altmodes/displayport: create sysfs nodes as driver's default device attribute group commit 165376f6b23e9a779850e750fb2eb06622e5a531 upstream. The DisplayPort driver's sysfs nodes may be present to the userspace before typec_altmode_set_drvdata() completes in dp_altmode_probe. This means that a sysfs read can trigger a NULL pointer error by deferencing dp->hpd in hpd_show or dp->lock in pin_assignment_show, as dev_get_drvdata() returns NULL in those cases. Remove manual sysfs node creation in favor of adding attribute group as default for devices bound to the driver. The ATTRIBUTE_GROUPS() macro is not used here otherwise the path to the sysfs nodes is no longer compliant with the ABI. Fixes: 0e3bb7d6894d ("usb: typec: Add driver for DisplayPort alternate mode") Cc: stable@vger.kernel.org Signed-off-by: RD Babiera Link: https://lore.kernel.org/r/20240229001101.3889432-2-rdbabiera@google.com [Minor conflict resolved due to code context change.] Signed-off-by: Jianqi Ren Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/altmodes/displayport.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/usb/typec/altmodes/displayport.c b/drivers/usb/typec/altmodes/displayport.c index f564d0d471bb..f80124102328 100644 --- a/drivers/usb/typec/altmodes/displayport.c +++ b/drivers/usb/typec/altmodes/displayport.c @@ -543,15 +543,20 @@ static ssize_t pin_assignment_show(struct device *dev, } static DEVICE_ATTR_RW(pin_assignment); -static struct attribute *dp_altmode_attrs[] = { +static struct attribute *displayport_attrs[] = { &dev_attr_configuration.attr, &dev_attr_pin_assignment.attr, NULL }; -static const struct attribute_group dp_altmode_group = { +static const struct attribute_group displayport_group = { .name = "displayport", - .attrs = dp_altmode_attrs, + .attrs = displayport_attrs, +}; + +static const struct attribute_group *displayport_groups[] = { + &displayport_group, + NULL, }; int dp_altmode_probe(struct typec_altmode *alt) @@ -559,7 +564,6 @@ int dp_altmode_probe(struct typec_altmode *alt) const struct typec_altmode *port = typec_altmode_get_partner(alt); struct fwnode_handle *fwnode; struct dp_altmode *dp; - int ret; /* FIXME: Port can only be DFP_U. */ @@ -570,10 +574,6 @@ int dp_altmode_probe(struct typec_altmode *alt) DP_CAP_PIN_ASSIGN_DFP_D(alt->vdo))) return -ENODEV; - ret = sysfs_create_group(&alt->dev.kobj, &dp_altmode_group); - if (ret) - return ret; - dp = devm_kzalloc(&alt->dev, sizeof(*dp), GFP_KERNEL); if (!dp) return -ENOMEM; @@ -604,7 +604,6 @@ void dp_altmode_remove(struct typec_altmode *alt) { struct dp_altmode *dp = typec_altmode_get_drvdata(alt); - sysfs_remove_group(&alt->dev.kobj, &dp_altmode_group); cancel_work_sync(&dp->work); if (dp->connector_fwnode) { @@ -629,6 +628,7 @@ static struct typec_altmode_driver dp_altmode_driver = { .driver = { .name = "typec_displayport", .owner = THIS_MODULE, + .dev_groups = displayport_groups, }, }; module_typec_altmode_driver(dp_altmode_driver); From e44189455c62469eb91d383ce9103d54c1f807a3 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Mon, 11 Nov 2024 14:08:06 +0300 Subject: [PATCH 174/529] usb: typec: fix potential array underflow in ucsi_ccg_sync_control() commit e56aac6e5a25630645607b6856d4b2a17b2311a5 upstream. The "command" variable can be controlled by the user via debugfs. The worry is that if con_index is zero then "&uc->ucsi->connector[con_index - 1]" would be an array underflow. Fixes: 170a6726d0e2 ("usb: typec: ucsi: add support for separate DP altmode devices") Signed-off-by: Dan Carpenter Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/c69ef0b3-61b0-4dde-98dd-97b97f81d912@stanley.mountain Signed-off-by: Greg Kroah-Hartman [ The function ucsi_ccg_sync_write() is renamed to ucsi_ccg_sync_control() in commit 13f2ec3115c8 ("usb: typec: ucsi:simplify command sending API"). Apply this patch to ucsi_ccg_sync_write() in 6.1.y accordingly. ] Signed-off-by: Bin Lan Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_ccg.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c index 8e500fe41e78..4801d783bd0c 100644 --- a/drivers/usb/typec/ucsi/ucsi_ccg.c +++ b/drivers/usb/typec/ucsi/ucsi_ccg.c @@ -585,6 +585,10 @@ static int ucsi_ccg_sync_write(struct ucsi *ucsi, unsigned int offset, uc->has_multiple_dp) { con_index = (uc->last_cmd_sent >> 16) & UCSI_CMD_CONNECTOR_MASK; + if (con_index == 0) { + ret = -EINVAL; + goto unlock; + } con = &uc->ucsi->connector[con_index - 1]; ucsi_ccg_update_set_new_cam_cmd(uc, con, (u64 *)val); } @@ -600,6 +604,7 @@ static int ucsi_ccg_sync_write(struct ucsi *ucsi, unsigned int offset, err_clear_bit: clear_bit(DEV_CMD_PENDING, &uc->flags); pm_runtime_put_sync(uc->dev); +unlock: mutex_unlock(&uc->lock); return ret; From 0529646acdfadc48a131b3956f3820abd805e744 Mon Sep 17 00:00:00 2001 From: GONG Ruiqi Date: Tue, 7 Jan 2025 09:57:50 +0800 Subject: [PATCH 175/529] usb: typec: fix pm usage counter imbalance in ucsi_ccg_sync_control() commit b0e525d7a22ea350e75e2aec22e47fcfafa4cacd upstream. The error handling for the case `con_index == 0` should involve dropping the pm usage counter, as ucsi_ccg_sync_control() gets it at the beginning. Fix it. Cc: stable Fixes: e56aac6e5a25 ("usb: typec: fix potential array underflow in ucsi_ccg_sync_control()") Signed-off-by: GONG Ruiqi Reviewed-by: Dan Carpenter Reviewed-by: Heikki Krogerus Link: https://lore.kernel.org/r/20250107015750.2778646-1-gongruiqi1@huawei.com Signed-off-by: Greg Kroah-Hartman [Minor context change fixed.] Signed-off-by: Bin Lan Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- drivers/usb/typec/ucsi/ucsi_ccg.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/typec/ucsi/ucsi_ccg.c b/drivers/usb/typec/ucsi/ucsi_ccg.c index 4801d783bd0c..e690b6e53480 100644 --- a/drivers/usb/typec/ucsi/ucsi_ccg.c +++ b/drivers/usb/typec/ucsi/ucsi_ccg.c @@ -587,7 +587,7 @@ static int ucsi_ccg_sync_write(struct ucsi *ucsi, unsigned int offset, UCSI_CMD_CONNECTOR_MASK; if (con_index == 0) { ret = -EINVAL; - goto unlock; + goto err_put; } con = &uc->ucsi->connector[con_index - 1]; ucsi_ccg_update_set_new_cam_cmd(uc, con, (u64 *)val); @@ -603,8 +603,8 @@ static int ucsi_ccg_sync_write(struct ucsi *ucsi, unsigned int offset, err_clear_bit: clear_bit(DEV_CMD_PENDING, &uc->flags); +err_put: pm_runtime_put_sync(uc->dev); -unlock: mutex_unlock(&uc->lock); return ret; From 09b18c24350330a25cfc3eb64c410cb4f67f5445 Mon Sep 17 00:00:00 2001 From: Feng Tang Date: Wed, 23 Apr 2025 18:36:45 +0800 Subject: [PATCH 176/529] selftests/mm: compaction_test: support platform with huge mount of memory commit ab00ddd802f80e31fc9639c652d736fe3913feae upstream. When running mm selftest to verify mm patches, 'compaction_test' case failed on an x86 server with 1TB memory. And the root cause is that it has too much free memory than what the test supports. The test case tries to allocate 100000 huge pages, which is about 200 GB for that x86 server, and when it succeeds, it expects it's large than 1/3 of 80% of the free memory in system. This logic only works for platform with 750 GB ( 200 / (1/3) / 80% ) or less free memory, and may raise false alarm for others. Fix it by changing the fixed page number to self-adjustable number according to the real number of free memory. Link: https://lkml.kernel.org/r/20250423103645.2758-1-feng.tang@linux.alibaba.com Fixes: bd67d5c15cc1 ("Test compaction of mlocked memory") Signed-off-by: Feng Tang Acked-by: Dev Jain Reviewed-by: Baolin Wang Tested-by: Baolin Wang Cc: Shuah Khan Cc: Sri Jayaramappa Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/vm/compaction_test.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/vm/compaction_test.c b/tools/testing/selftests/vm/compaction_test.c index 309b3750e57e..38fec412206b 100644 --- a/tools/testing/selftests/vm/compaction_test.c +++ b/tools/testing/selftests/vm/compaction_test.c @@ -89,6 +89,8 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size) int compaction_index = 0; char initial_nr_hugepages[20] = {0}; char nr_hugepages[20] = {0}; + char target_nr_hugepages[24] = {0}; + int slen; /* We want to test with 80% of available memory. Else, OOM killer comes in to play */ @@ -119,11 +121,18 @@ int check_compaction(unsigned long mem_free, unsigned long hugepage_size) lseek(fd, 0, SEEK_SET); - /* Request a large number of huge pages. The Kernel will allocate - as much as it can */ - if (write(fd, "100000", (6*sizeof(char))) != (6*sizeof(char))) { - ksft_print_msg("Failed to write 100000 to /proc/sys/vm/nr_hugepages: %s\n", - strerror(errno)); + /* + * Request huge pages for about half of the free memory. The Kernel + * will allocate as much as it can, and we expect it will get at least 1/3 + */ + nr_hugepages_ul = mem_free / hugepage_size / 2; + snprintf(target_nr_hugepages, sizeof(target_nr_hugepages), + "%lu", nr_hugepages_ul); + + slen = strlen(target_nr_hugepages); + if (write(fd, target_nr_hugepages, slen) != slen) { + ksft_print_msg("Failed to write %lu to /proc/sys/vm/nr_hugepages: %s\n", + nr_hugepages_ul, strerror(errno)); goto close_fd; } From e5ec1c24e71dbf144677a975d6ba91043c2193db Mon Sep 17 00:00:00 2001 From: Byungchul Park Date: Fri, 16 Feb 2024 20:15:02 +0900 Subject: [PATCH 177/529] mm/vmscan: fix a bug calling wakeup_kswapd() with a wrong zone index commit 2774f256e7c0219e2b0a0894af1c76bdabc4f974 upstream. With numa balancing on, when a numa system is running where a numa node doesn't have its local memory so it has no managed zones, the following oops has been observed. It's because wakeup_kswapd() is called with a wrong zone index, -1. Fixed it by checking the index before calling wakeup_kswapd(). > BUG: unable to handle page fault for address: 00000000000033f3 > #PF: supervisor read access in kernel mode > #PF: error_code(0x0000) - not-present page > PGD 0 P4D 0 > Oops: 0000 [#1] PREEMPT SMP NOPTI > CPU: 2 PID: 895 Comm: masim Not tainted 6.6.0-dirty #255 > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS > rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 > RIP: 0010:wakeup_kswapd (./linux/mm/vmscan.c:7812) > Code: (omitted) > RSP: 0000:ffffc90004257d58 EFLAGS: 00010286 > RAX: ffffffffffffffff RBX: ffff88883fff0480 RCX: 0000000000000003 > RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff88883fff0480 > RBP: ffffffffffffffff R08: ff0003ffffffffff R09: ffffffffffffffff > R10: ffff888106c95540 R11: 0000000055555554 R12: 0000000000000003 > R13: 0000000000000000 R14: 0000000000000000 R15: ffff88883fff0940 > FS: 00007fc4b8124740(0000) GS:ffff888827c00000(0000) knlGS:0000000000000000 > CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 > CR2: 00000000000033f3 CR3: 000000026cc08004 CR4: 0000000000770ee0 > DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 > DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 > PKRU: 55555554 > Call Trace: > > ? __die > ? page_fault_oops > ? __pte_offset_map_lock > ? exc_page_fault > ? asm_exc_page_fault > ? wakeup_kswapd > migrate_misplaced_page > __handle_mm_fault > handle_mm_fault > do_user_addr_fault > exc_page_fault > asm_exc_page_fault > RIP: 0033:0x55b897ba0808 > Code: (omitted) > RSP: 002b:00007ffeefa821a0 EFLAGS: 00010287 > RAX: 000055b89983acd0 RBX: 00007ffeefa823f8 RCX: 000055b89983acd0 > RDX: 00007fc2f8122010 RSI: 0000000000020000 RDI: 000055b89983acd0 > RBP: 00007ffeefa821a0 R08: 0000000000000037 R09: 0000000000000075 > R10: 0000000000000000 R11: 0000000000000202 R12: 0000000000000000 > R13: 00007ffeefa82410 R14: 000055b897ba5dd8 R15: 00007fc4b8340000 > Link: https://lkml.kernel.org/r/20240216111502.79759-1-byungchul@sk.com Signed-off-by: Byungchul Park Reported-by: Hyeongtak Ji Fixes: c574bbe917036 ("NUMA balancing: optimize page placement for memory tiering system") Reviewed-by: Oscar Salvador Cc: Baolin Wang Cc: "Huang, Ying" Cc: Johannes Weiner Cc: Signed-off-by: Andrew Morton Signed-off-by: Jianqi Ren Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- mm/migrate.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/migrate.c b/mm/migrate.c index e37b18376714..7b986c9f4032 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -2422,6 +2422,14 @@ static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page) if (managed_zone(pgdat->node_zones + z)) break; } + + /* + * If there are no managed zones, it should not proceed + * further. + */ + if (z < 0) + return 0; + wakeup_kswapd(pgdat->node_zones + z, 0, order, ZONE_MOVABLE); return 0; } From 92f08673d3f1893191323572f60e3c62f2e57c2f Mon Sep 17 00:00:00 2001 From: Xu Lu Date: Mon, 9 Dec 2024 20:26:17 +0800 Subject: [PATCH 178/529] riscv: mm: Fix the out of bound issue of vmemmap address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f754f27e98f88428aaf6be6e00f5cbce97f62d4b upstream. In sparse vmemmap model, the virtual address of vmemmap is calculated as: ((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT)). And the struct page's va can be calculated with an offset: (vmemmap + (pfn)). However, when initializing struct pages, kernel actually starts from the first page from the same section that phys_ram_base belongs to. If the first page's physical address is not (phys_ram_base >> PAGE_SHIFT), then we get an va below VMEMMAP_START when calculating va for it's struct page. For example, if phys_ram_base starts from 0x82000000 with pfn 0x82000, the first page in the same section is actually pfn 0x80000. During init_unavailable_range(), we will initialize struct page for pfn 0x80000 with virtual address ((struct page *)VMEMMAP_START - 0x2000), which is below VMEMMAP_START as well as PCI_IO_END. This commit fixes this bug by introducing a new variable 'vmemmap_start_pfn' which is aligned with memory section size and using it to calculate vmemmap address instead of phys_ram_base. Fixes: a11dd49dcb93 ("riscv: Sparse-Memory/vmemmap out-of-bounds fix") Signed-off-by: Xu Lu Reviewed-by: Alexandre Ghiti Tested-by: Björn Töpel Reviewed-by: Björn Töpel Link: https://lore.kernel.org/r/20241209122617.53341-1-luxu.kernel@bytedance.com Signed-off-by: Palmer Dabbelt Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- arch/riscv/include/asm/page.h | 1 + arch/riscv/include/asm/pgtable.h | 2 +- arch/riscv/mm/init.c | 17 ++++++++++++++++- 3 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 86048c60f700..fd861ac47a89 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -115,6 +115,7 @@ struct kernel_mapping { extern struct kernel_mapping kernel_map; extern phys_addr_t phys_ram_base; +extern unsigned long vmemmap_start_pfn; #define is_kernel_mapping(x) \ ((x) >= kernel_map.virt_addr && (x) < (kernel_map.virt_addr + kernel_map.size)) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 7d1688f850c3..bb19a643c5c2 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -79,7 +79,7 @@ * Define vmemmap for pfn_to_page & page_to_pfn calls. Needed if kernel * is configured with CONFIG_SPARSEMEM_VMEMMAP enabled. */ -#define vmemmap ((struct page *)VMEMMAP_START - (phys_ram_base >> PAGE_SHIFT)) +#define vmemmap ((struct page *)VMEMMAP_START - vmemmap_start_pfn) #define PCI_IO_SIZE SZ_16M #define PCI_IO_END VMEMMAP_START diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index ba2210b553f9..72b3462babbf 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -22,6 +22,7 @@ #include #include +#include #include #include #include @@ -52,6 +53,13 @@ EXPORT_SYMBOL(pgtable_l5_enabled); phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); +#ifdef CONFIG_SPARSEMEM_VMEMMAP +#define VMEMMAP_ADDR_ALIGN (1ULL << SECTION_SIZE_BITS) + +unsigned long vmemmap_start_pfn __ro_after_init; +EXPORT_SYMBOL(vmemmap_start_pfn); +#endif + unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] __page_aligned_bss; EXPORT_SYMBOL(empty_zero_page); @@ -210,8 +218,12 @@ static void __init setup_bootmem(void) memblock_reserve(vmlinux_start, vmlinux_end - vmlinux_start); phys_ram_end = memblock_end_of_DRAM(); - if (!IS_ENABLED(CONFIG_XIP_KERNEL)) + if (!IS_ENABLED(CONFIG_XIP_KERNEL)) { phys_ram_base = memblock_start_of_DRAM(); +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT; +#endif +} /* * Reserve physical address space that would be mapped to virtual * addresses greater than (void *)(-PAGE_SIZE) because: @@ -946,6 +958,9 @@ asmlinkage void __init setup_vm(uintptr_t dtb_pa) kernel_map.xiprom_sz = (uintptr_t)(&_exiprom) - (uintptr_t)(&_xiprom); phys_ram_base = CONFIG_PHYS_RAM_BASE; +#ifdef CONFIG_SPARSEMEM_VMEMMAP + vmemmap_start_pfn = round_down(phys_ram_base, VMEMMAP_ADDR_ALIGN) >> PAGE_SHIFT; +#endif kernel_map.phys_addr = (uintptr_t)CONFIG_PHYS_RAM_BASE; kernel_map.size = (uintptr_t)(&_end) - (uintptr_t)(&_start); From 077149478497b2f00ff4fd9da2c892defa6418d8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Thu, 11 Jul 2024 15:18:38 +0000 Subject: [PATCH 179/529] bpf, arm64: Fix trampoline for BPF_TRAMP_F_CALL_ORIG commit 19d3c179a37730caf600a97fed3794feac2b197b upstream. When BPF_TRAMP_F_CALL_ORIG is set, the trampoline calls __bpf_tramp_enter() and __bpf_tramp_exit() functions, passing them the struct bpf_tramp_image *im pointer as an argument in R0. The trampoline generation code uses emit_addr_mov_i64() to emit instructions for moving the bpf_tramp_image address into R0, but emit_addr_mov_i64() assumes the address to be in the vmalloc() space and uses only 48 bits. Because bpf_tramp_image is allocated using kzalloc(), its address can use more than 48-bits, in this case the trampoline will pass an invalid address to __bpf_tramp_enter/exit() causing a kernel crash. Fix this by using emit_a64_mov_i64() in place of emit_addr_mov_i64() as it can work with addresses that are greater than 48-bits. Fixes: efc9909fdce0 ("bpf, arm64: Add bpf trampoline for arm64") Signed-off-by: Puranjay Mohan Signed-off-by: Daniel Borkmann Closes: https://lore.kernel.org/all/SJ0PR15MB461564D3F7E7A763498CA6A8CBDB2@SJ0PR15MB4615.namprd15.prod.outlook.com/ Link: https://lore.kernel.org/bpf/20240711151838.43469-1-puranjay@kernel.org [Minor context change fixed.] Signed-off-by: Bin Lan Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- arch/arm64/net/bpf_jit_comp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 09b70a9b5c39..3309dc33a658 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1942,7 +1942,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, emit(A64_STR64I(A64_R(20), A64_SP, regs_off + 8), ctx); if (flags & BPF_TRAMP_F_CALL_ORIG) { - emit_addr_mov_i64(A64_R(0), (const u64)im, ctx); + emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); emit_call((const u64)__bpf_tramp_enter, ctx); } @@ -1986,7 +1986,7 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, if (flags & BPF_TRAMP_F_CALL_ORIG) { im->ip_epilogue = ctx->image + ctx->idx; - emit_addr_mov_i64(A64_R(0), (const u64)im, ctx); + emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); emit_call((const u64)__bpf_tramp_exit, ctx); } From 9e80f366ebfdfafc685fe83a84c34f7ef01cbe88 Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Fri, 18 Oct 2024 15:16:43 -0700 Subject: [PATCH 180/529] bpf, arm64: Fix address emission with tag-based KASAN enabled commit a552e2ef5fd1a6c78267cd4ec5a9b49aa11bbb1c upstream. When BPF_TRAMP_F_CALL_ORIG is enabled, the address of a bpf_tramp_image struct on the stack is passed during the size calculation pass and an address on the heap is passed during code generation. This may cause a heap buffer overflow if the heap address is tagged because emit_a64_mov_i64() will emit longer code than it did during the size calculation pass. The same problem could occur without tag-based KASAN if one of the 16-bit words of the stack address happened to be all-ones during the size calculation pass. Fix the problem by assuming the worst case (4 instructions) when calculating the size of the bpf_tramp_image address emission. Fixes: 19d3c179a377 ("bpf, arm64: Fix trampoline for BPF_TRAMP_F_CALL_ORIG") Signed-off-by: Peter Collingbourne Signed-off-by: Daniel Borkmann Acked-by: Xu Kuohai Link: https://linux-review.googlesource.com/id/I1496f2bc24fba7a1d492e16e2b94cf43714f2d3c Link: https://lore.kernel.org/bpf/20241018221644.3240898-1-pcc@google.com [Minor context change fixed.] Signed-off-by: Bin Lan Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- arch/arm64/net/bpf_jit_comp.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 3309dc33a658..3dd23050a6c8 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1942,7 +1942,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, emit(A64_STR64I(A64_R(20), A64_SP, regs_off + 8), ctx); if (flags & BPF_TRAMP_F_CALL_ORIG) { - emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); + /* for the first pass, assume the worst case */ + if (!ctx->image) + ctx->idx += 4; + else + emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); emit_call((const u64)__bpf_tramp_enter, ctx); } @@ -1986,7 +1990,11 @@ static int prepare_trampoline(struct jit_ctx *ctx, struct bpf_tramp_image *im, if (flags & BPF_TRAMP_F_CALL_ORIG) { im->ip_epilogue = ctx->image + ctx->idx; - emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); + /* for the first pass, assume the worst case */ + if (!ctx->image) + ctx->idx += 4; + else + emit_a64_mov_i64(A64_R(0), (const u64)im, ctx); emit_call((const u64)__bpf_tramp_exit, ctx); } From 091a7f20d5d12b66a58f3a1e462a4619a4b0af3a Mon Sep 17 00:00:00 2001 From: Huacai Chen Date: Fri, 22 Nov 2024 15:47:47 +0800 Subject: [PATCH 181/529] LoongArch: Explicitly specify code model in Makefile commit e67e0eb6a98b261caf45048f9eb95fd7609289c0 upstream. LoongArch's toolchain may change the default code model from normal to medium. This is unnecessary for kernel, and generates some relocations which cannot be handled by the module loader. So explicitly specify the code model to normal in Makefile (for Rust 'normal' is 'small'). Cc: stable@vger.kernel.org Tested-by: Haiyong Sun Signed-off-by: Huacai Chen Signed-off-by: Greg Kroah-Hartman --- arch/loongarch/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/loongarch/Makefile b/arch/loongarch/Makefile index 275d4d5260c7..9d4a6ab7bad9 100644 --- a/arch/loongarch/Makefile +++ b/arch/loongarch/Makefile @@ -38,7 +38,7 @@ endif ifdef CONFIG_64BIT ld-emul = $(64bit-emul) -cflags-y += -mabi=lp64s +cflags-y += -mabi=lp64s -mcmodel=normal endif cflags-y += -pipe -msoft-float From 3926b572fd073491bde13ec42ee08ac1b337bf4d Mon Sep 17 00:00:00 2001 From: Ma Wupeng Date: Mon, 17 Feb 2025 09:43:29 +0800 Subject: [PATCH 182/529] hwpoison, memory_hotplug: lock folio before unmap hwpoisoned folio commit af288a426c3e3552b62595c6138ec6371a17dbba upstream. Commit b15c87263a69 ("hwpoison, memory_hotplug: allow hwpoisoned pages to be offlined) add page poison checks in do_migrate_range in order to make offline hwpoisoned page possible by introducing isolate_lru_page and try_to_unmap for hwpoisoned page. However folio lock must be held before calling try_to_unmap. Add it to fix this problem. Warning will be produced if folio is not locked during unmap: ------------[ cut here ]------------ kernel BUG at ./include/linux/swapops.h:400! Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP Modules linked in: CPU: 4 UID: 0 PID: 411 Comm: bash Tainted: G W 6.13.0-rc1-00016-g3c434c7ee82a-dirty #41 Tainted: [W]=WARN Hardware name: QEMU QEMU Virtual Machine, BIOS 0.0.0 02/06/2015 pstate: 40400005 (nZcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : try_to_unmap_one+0xb08/0xd3c lr : try_to_unmap_one+0x3dc/0xd3c Call trace: try_to_unmap_one+0xb08/0xd3c (P) try_to_unmap_one+0x3dc/0xd3c (L) rmap_walk_anon+0xdc/0x1f8 rmap_walk+0x3c/0x58 try_to_unmap+0x88/0x90 unmap_poisoned_folio+0x30/0xa8 do_migrate_range+0x4a0/0x568 offline_pages+0x5a4/0x670 memory_block_action+0x17c/0x374 memory_subsys_offline+0x3c/0x78 device_offline+0xa4/0xd0 state_store+0x8c/0xf0 dev_attr_store+0x18/0x2c sysfs_kf_write+0x44/0x54 kernfs_fop_write_iter+0x118/0x1a8 vfs_write+0x3a8/0x4bc ksys_write+0x6c/0xf8 __arm64_sys_write+0x1c/0x28 invoke_syscall+0x44/0x100 el0_svc_common.constprop.0+0x40/0xe0 do_el0_svc+0x1c/0x28 el0_svc+0x30/0xd0 el0t_64_sync_handler+0xc8/0xcc el0t_64_sync+0x198/0x19c Code: f9407be0 b5fff320 d4210000 17ffff97 (d4210000) ---[ end trace 0000000000000000 ]--- Link: https://lkml.kernel.org/r/20250217014329.3610326-4-mawupeng1@huawei.com Fixes: b15c87263a69 ("hwpoison, memory_hotplug: allow hwpoisoned pages to be offlined") Signed-off-by: Ma Wupeng Acked-by: David Hildenbrand Acked-by: Miaohe Lin Cc: Michal Hocko Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Signed-off-by: Andrew Morton Signed-off-by: Xiangyu Chen Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- mm/memory_hotplug.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index dc17618bad8b..c8cc2f63c3ea 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1656,8 +1656,12 @@ do_migrate_range(unsigned long start_pfn, unsigned long end_pfn) if (PageHWPoison(page)) { if (WARN_ON(folio_test_lru(folio))) folio_isolate_lru(folio); - if (folio_mapped(folio)) + if (folio_mapped(folio)) { + folio_lock(folio); try_to_unmap(folio, TTU_IGNORE_MLOCK); + folio_unlock(folio); + } + continue; } From 386507cb6fb7cdef598ddcb3f0fa37e6ca9e789d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 31 Mar 2025 09:15:32 +0000 Subject: [PATCH 183/529] sctp: add mutual exclusion in proc_sctp_do_udp_port() commit 10206302af856791fbcc27a33ed3c3eb09b2793d upstream. We must serialize calls to sctp_udp_sock_stop() and sctp_udp_sock_start() or risk a crash as syzbot reported: Oops: general protection fault, probably for non-canonical address 0xdffffc000000000d: 0000 [#1] SMP KASAN PTI KASAN: null-ptr-deref in range [0x0000000000000068-0x000000000000006f] CPU: 1 UID: 0 PID: 6551 Comm: syz.1.44 Not tainted 6.14.0-syzkaller-g7f2ff7b62617 #0 PREEMPT(full) Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 02/12/2025 RIP: 0010:kernel_sock_shutdown+0x47/0x70 net/socket.c:3653 Call Trace: udp_tunnel_sock_release+0x68/0x80 net/ipv4/udp_tunnel_core.c:181 sctp_udp_sock_stop+0x71/0x160 net/sctp/protocol.c:930 proc_sctp_do_udp_port+0x264/0x450 net/sctp/sysctl.c:553 proc_sys_call_handler+0x3d0/0x5b0 fs/proc/proc_sysctl.c:601 iter_file_splice_write+0x91c/0x1150 fs/splice.c:738 do_splice_from fs/splice.c:935 [inline] direct_splice_actor+0x18f/0x6c0 fs/splice.c:1158 splice_direct_to_actor+0x342/0xa30 fs/splice.c:1102 do_splice_direct_actor fs/splice.c:1201 [inline] do_splice_direct+0x174/0x240 fs/splice.c:1227 do_sendfile+0xafd/0xe50 fs/read_write.c:1368 __do_sys_sendfile64 fs/read_write.c:1429 [inline] __se_sys_sendfile64 fs/read_write.c:1415 [inline] __x64_sys_sendfile64+0x1d8/0x220 fs/read_write.c:1415 do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] Fixes: 046c052b475e ("sctp: enable udp tunneling socks") Reported-by: syzbot+fae49d997eb56fa7c74d@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/67ea5c01.050a0220.1547ec.012b.GAE@google.com/T/#u Signed-off-by: Eric Dumazet Cc: Marcelo Ricardo Leitner Acked-by: Xin Long Link: https://patch.msgid.link/20250331091532.224982-1-edumazet@google.com Signed-off-by: Jakub Kicinski [Minor conflict resolved due to code context change.] Signed-off-by: Jianqi Ren Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- net/sctp/sysctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c index 916dc2e81e42..f3d09998c24d 100644 --- a/net/sctp/sysctl.c +++ b/net/sctp/sysctl.c @@ -518,6 +518,8 @@ static int proc_sctp_do_auth(struct ctl_table *ctl, int write, return ret; } +static DEFINE_MUTEX(sctp_sysctl_mutex); + static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, void *buffer, size_t *lenp, loff_t *ppos) { @@ -542,6 +544,7 @@ static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, if (new_value > max || new_value < min) return -EINVAL; + mutex_lock(&sctp_sysctl_mutex); net->sctp.udp_port = new_value; sctp_udp_sock_stop(net); if (new_value) { @@ -554,6 +557,7 @@ static int proc_sctp_do_udp_port(struct ctl_table *ctl, int write, lock_sock(sk); sctp_sk(sk)->udp_port = htons(net->sctp.udp_port); release_sock(sk); + mutex_unlock(&sctp_sysctl_mutex); } return ret; From 18eb53a2734ff61b9a72c4fef5db7b38cb48ae16 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 18 Jun 2024 12:15:01 +0100 Subject: [PATCH 184/529] btrfs: don't BUG_ON() when 0 reference count at btrfs_lookup_extent_info() commit 28cb13f29faf6290597b24b728dc3100c019356f upstream. Instead of doing a BUG_ON() handle the error by returning -EUCLEAN, aborting the transaction and logging an error message. Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba [Minor conflict resolved due to code context change.] Signed-off-by: Jianqi Ren Signed-off-by: He Zhe Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent-tree.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9040108eda64..5395e27f9e89 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -179,6 +179,14 @@ search_again: ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); num_refs = btrfs_extent_refs(leaf, ei); + if (unlikely(num_refs == 0)) { + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected zero reference count for extent item (%llu %u %llu)", + key.objectid, key.type, key.offset); + btrfs_abort_transaction(trans, ret); + goto out_free; + } extent_flags = btrfs_extent_flags(leaf, ei); } else { ret = -EINVAL; @@ -190,8 +198,6 @@ search_again: goto out_free; } - - BUG_ON(num_refs == 0); } else { num_refs = 0; extent_flags = 0; @@ -221,10 +227,19 @@ search_again: goto search_again; } spin_lock(&head->lock); - if (head->extent_op && head->extent_op->update_flags) + if (head->extent_op && head->extent_op->update_flags) { extent_flags |= head->extent_op->flags_to_set; - else - BUG_ON(num_refs == 0); + } else if (unlikely(num_refs == 0)) { + spin_unlock(&head->lock); + mutex_unlock(&head->mutex); + spin_unlock(&delayed_refs->lock); + ret = -EUCLEAN; + btrfs_err(fs_info, + "unexpected zero reference count for extent %llu (%s)", + bytenr, metadata ? "metadata" : "data"); + btrfs_abort_transaction(trans, ret); + goto out_free; + } num_refs += head->ref_mod; spin_unlock(&head->lock); From bbd68196ac9503ec623a242bd2074b53dfd3f86d Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 20 May 2025 01:34:36 +0200 Subject: [PATCH 185/529] netfilter: nf_tables: pass nft_chain to destroy function, not nft_ctx commit 8965d42bcf54d42cbc72fe34a9d0ec3f8527debd upstream. It would be better to not store nft_ctx inside nft_trans object, the netlink ctx strucutre is huge and most of its information is never needed in places that use trans->ctx. Avoid/reduce its usage if possible, no runtime behaviour change intended. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Stable-dep-of: c03d278fdf35 ("netfilter: nf_tables: wait for rcu grace period on net_device removal") Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/net/netfilter/nf_tables.h | 2 +- net/netfilter/nf_tables_api.c | 17 ++++++++--------- net/netfilter/nft_immediate.c | 2 +- 3 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index d11398aa642e..41abb5982348 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1121,7 +1121,7 @@ static inline bool nft_chain_is_bound(struct nft_chain *chain) int nft_chain_add(struct nft_table *table, struct nft_chain *chain); void nft_chain_del(struct nft_chain *chain); -void nf_tables_chain_destroy(struct nft_ctx *ctx); +void nf_tables_chain_destroy(struct nft_chain *chain); struct nft_stats { u64 bytes; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 656c4fb76773..dc1e11c0b168 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2034,9 +2034,9 @@ static void nf_tables_chain_free_chain_rules(struct nft_chain *chain) kvfree(chain->blob_next); } -void nf_tables_chain_destroy(struct nft_ctx *ctx) +void nf_tables_chain_destroy(struct nft_chain *chain) { - struct nft_chain *chain = ctx->chain; + const struct nft_table *table = chain->table; struct nft_hook *hook, *next; if (WARN_ON(chain->use > 0)) @@ -2048,7 +2048,7 @@ void nf_tables_chain_destroy(struct nft_ctx *ctx) if (nft_is_base_chain(chain)) { struct nft_base_chain *basechain = nft_base_chain(chain); - if (nft_base_chain_netdev(ctx->family, basechain->ops.hooknum)) { + if (nft_base_chain_netdev(table->family, basechain->ops.hooknum)) { list_for_each_entry_safe(hook, next, &basechain->hook_list, list) { list_del_rcu(&hook->list); @@ -2515,7 +2515,7 @@ err_chain_add: err_trans: nft_use_dec_restore(&table->use); err_destroy_chain: - nf_tables_chain_destroy(ctx); + nf_tables_chain_destroy(chain); return err; } @@ -8994,7 +8994,7 @@ static void nft_commit_release(struct nft_trans *trans) kfree(nft_trans_chain_name(trans)); break; case NFT_MSG_DELCHAIN: - nf_tables_chain_destroy(&trans->ctx); + nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_DELRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); @@ -9955,7 +9955,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) nf_tables_table_destroy(&trans->ctx); break; case NFT_MSG_NEWCHAIN: - nf_tables_chain_destroy(&trans->ctx); + nf_tables_chain_destroy(nft_trans_chain(trans)); break; case NFT_MSG_NEWRULE: nf_tables_rule_destroy(&trans->ctx, nft_trans_rule(trans)); @@ -10677,7 +10677,7 @@ int __nft_release_basechain(struct nft_ctx *ctx) } nft_chain_del(ctx->chain); nft_use_dec(&ctx->table->use); - nf_tables_chain_destroy(ctx); + nf_tables_chain_destroy(ctx->chain); return 0; } @@ -10753,10 +10753,9 @@ static void __nft_release_table(struct net *net, struct nft_table *table) nft_obj_destroy(&ctx, obj); } list_for_each_entry_safe(chain, nc, &table->chains, list) { - ctx.chain = chain; nft_chain_del(chain); nft_use_dec(&table->use); - nf_tables_chain_destroy(&ctx); + nf_tables_chain_destroy(chain); } nf_tables_table_destroy(&ctx); } diff --git a/net/netfilter/nft_immediate.c b/net/netfilter/nft_immediate.c index 55fcf0280c5c..731511d58b7c 100644 --- a/net/netfilter/nft_immediate.c +++ b/net/netfilter/nft_immediate.c @@ -221,7 +221,7 @@ static void nft_immediate_destroy(const struct nft_ctx *ctx, list_del(&rule->list); nf_tables_rule_destroy(&chain_ctx, rule); } - nf_tables_chain_destroy(&chain_ctx); + nf_tables_chain_destroy(chain); break; default: break; From e6c32a64d61184c2bdf89442b3d31ef530afba34 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 20 May 2025 01:34:37 +0200 Subject: [PATCH 186/529] netfilter: nf_tables: wait for rcu grace period on net_device removal commit c03d278fdf35e73dd0ec543b9b556876b9d9a8dc upstream. 8c873e219970 ("netfilter: core: free hooks with call_rcu") removed synchronize_net() call when unregistering basechain hook, however, net_device removal event handler for the NFPROTO_NETDEV was not updated to wait for RCU grace period. Note that 835b803377f5 ("netfilter: nf_tables_netdev: unregister hooks on net_device removal") does not remove basechain rules on device removal, I was hinted to remove rules on net_device removal later, see 5ebe0b0eec9d ("netfilter: nf_tables: destroy basechain and rules on netdevice removal"). Although NETDEV_UNREGISTER event is guaranteed to be handled after synchronize_net() call, this path needs to wait for rcu grace period via rcu callback to release basechain hooks if netns is alive because an ongoing netlink dump could be in progress (sockets hold a reference on the netns). Note that nf_tables_pre_exit_net() unregisters and releases basechain hooks but it is possible to see NETDEV_UNREGISTER at a later stage in the netns exit path, eg. veth peer device in another netns: cleanup_net() default_device_exit_batch() unregister_netdevice_many_notify() notifier_call_chain() nf_tables_netdev_event() __nft_release_basechain() In this particular case, same rule of thumb applies: if netns is alive, then wait for rcu grace period because netlink dump in the other netns could be in progress. Otherwise, if the other netns is going away then no netlink dump can be in progress and basechain hooks can be released inmediately. While at it, turn WARN_ON() into WARN_ON_ONCE() for the basechain validation, which should not ever happen. Fixes: 835b803377f5 ("netfilter: nf_tables_netdev: unregister hooks on net_device removal") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- include/net/netfilter/nf_tables.h | 4 +++ net/netfilter/nf_tables_api.c | 41 +++++++++++++++++++++++++------ 2 files changed, 38 insertions(+), 7 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 41abb5982348..76a51ed432ca 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1045,6 +1045,7 @@ struct nft_rule_blob { * @use: number of jump references to this chain * @flags: bitmask of enum nft_chain_flags * @name: name of the chain + * @rcu_head: rcu head for deferred release */ struct nft_chain { struct nft_rule_blob __rcu *blob_gen_0; @@ -1061,6 +1062,7 @@ struct nft_chain { char *name; u16 udlen; u8 *udata; + struct rcu_head rcu_head; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; @@ -1203,6 +1205,7 @@ static inline void nft_use_inc_restore(u32 *use) * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table + * @net: netnamespace this table belongs to * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table @@ -1218,6 +1221,7 @@ struct nft_table { struct list_head sets; struct list_head objects; struct list_head flowtables; + possible_net_t net; u64 hgenerator; u64 handle; u32 use; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index dc1e11c0b168..aa1a85eff61b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1413,6 +1413,7 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); + write_pnet(&table->net, net); table->family = family; table->flags = flags; table->handle = ++nft_net->table_handle; @@ -10662,22 +10663,48 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data, } EXPORT_SYMBOL_GPL(nft_data_dump); -int __nft_release_basechain(struct nft_ctx *ctx) +static void __nft_release_basechain_now(struct nft_ctx *ctx) { struct nft_rule *rule, *nr; - if (WARN_ON(!nft_is_base_chain(ctx->chain))) + list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { + list_del(&rule->list); + nf_tables_rule_release(ctx, rule); + } + nf_tables_chain_destroy(ctx->chain); +} + +static void nft_release_basechain_rcu(struct rcu_head *head) +{ + struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); + struct nft_ctx ctx = { + .family = chain->table->family, + .chain = chain, + .net = read_pnet(&chain->table->net), + }; + + __nft_release_basechain_now(&ctx); + put_net(ctx.net); +} + +int __nft_release_basechain(struct nft_ctx *ctx) +{ + struct nft_rule *rule; + + if (WARN_ON_ONCE(!nft_is_base_chain(ctx->chain))) return 0; nf_tables_unregister_hook(ctx->net, ctx->chain->table, ctx->chain); - list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) { - list_del(&rule->list); + list_for_each_entry(rule, &ctx->chain->rules, list) nft_use_dec(&ctx->chain->use); - nf_tables_rule_release(ctx, rule); - } + nft_chain_del(ctx->chain); nft_use_dec(&ctx->table->use); - nf_tables_chain_destroy(ctx->chain); + + if (maybe_get_net(ctx->net)) + call_rcu(&ctx->chain->rcu_head, nft_release_basechain_rcu); + else + __nft_release_basechain_now(ctx); return 0; } From b0f013bebf94fe7ae75e5a53be2f2bd1cc1841e3 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 20 May 2025 01:34:38 +0200 Subject: [PATCH 187/529] netfilter: nf_tables: do not defer rule destruction via call_rcu commit b04df3da1b5c6f6dc7cdccc37941740c078c4043 upstream. nf_tables_chain_destroy can sleep, it can't be used from call_rcu callbacks. Moreover, nf_tables_rule_release() is only safe for error unwinding, while transaction mutex is held and the to-be-desroyed rule was not exposed to either dataplane or dumps, as it deactives+frees without the required synchronize_rcu() in-between. nft_rule_expr_deactivate() callbacks will change ->use counters of other chains/sets, see e.g. nft_lookup .deactivate callback, these must be serialized via transaction mutex. Also add a few lockdep asserts to make this more explicit. Calling synchronize_rcu() isn't ideal, but fixing this without is hard and way more intrusive. As-is, we can get: WARNING: .. net/netfilter/nf_tables_api.c:5515 nft_set_destroy+0x.. Workqueue: events nf_tables_trans_destroy_work RIP: 0010:nft_set_destroy+0x3fe/0x5c0 Call Trace: nf_tables_trans_destroy_work+0x6b7/0xad0 process_one_work+0x64a/0xce0 worker_thread+0x613/0x10d0 In case the synchronize_rcu becomes an issue, we can explore alternatives. One way would be to allocate nft_trans_rule objects + one nft_trans_chain object, deactivate the rules + the chain and then defer the freeing to the nft destroy workqueue. We'd still need to keep the synchronize_rcu path as a fallback to handle -ENOMEM corner cases though. Reported-by: syzbot+b26935466701e56cfdc2@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67478d92.050a0220.253251.0062.GAE@google.com/T/ Fixes: c03d278fdf35 ("netfilter: nf_tables: wait for rcu grace period on net_device removal") Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Greg Kroah-Hartman --- include/net/netfilter/nf_tables.h | 3 --- net/netfilter/nf_tables_api.c | 32 +++++++++++++++---------------- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 76a51ed432ca..7252a5aae069 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -1062,7 +1062,6 @@ struct nft_chain { char *name; u16 udlen; u8 *udata; - struct rcu_head rcu_head; /* Only used during control plane commit phase: */ struct nft_rule_blob *blob_next; @@ -1205,7 +1204,6 @@ static inline void nft_use_inc_restore(u32 *use) * @sets: sets in the table * @objects: stateful objects in the table * @flowtables: flow tables in the table - * @net: netnamespace this table belongs to * @hgenerator: handle generator state * @handle: table handle * @use: number of chain references to this table @@ -1221,7 +1219,6 @@ struct nft_table { struct list_head sets; struct list_head objects; struct list_head flowtables; - possible_net_t net; u64 hgenerator; u64 handle; u32 use; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index aa1a85eff61b..0bf347a0a1dd 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1413,7 +1413,6 @@ static int nf_tables_newtable(struct sk_buff *skb, const struct nfnl_info *info, INIT_LIST_HEAD(&table->sets); INIT_LIST_HEAD(&table->objects); INIT_LIST_HEAD(&table->flowtables); - write_pnet(&table->net, net); table->family = family; table->flags = flags; table->handle = ++nft_net->table_handle; @@ -3511,8 +3510,11 @@ void nf_tables_rule_destroy(const struct nft_ctx *ctx, struct nft_rule *rule) kfree(rule); } +/* can only be used if rule is no longer visible to dumps */ static void nf_tables_rule_release(const struct nft_ctx *ctx, struct nft_rule *rule) { + lockdep_commit_lock_is_held(ctx->net); + nft_rule_expr_deactivate(ctx, rule, NFT_TRANS_RELEASE); nf_tables_rule_destroy(ctx, rule); } @@ -5248,6 +5250,8 @@ void nf_tables_deactivate_set(const struct nft_ctx *ctx, struct nft_set *set, struct nft_set_binding *binding, enum nft_trans_phase phase) { + lockdep_commit_lock_is_held(ctx->net); + switch (phase) { case NFT_TRANS_PREPARE_ERROR: nft_set_trans_unbind(ctx, set); @@ -10674,19 +10678,6 @@ static void __nft_release_basechain_now(struct nft_ctx *ctx) nf_tables_chain_destroy(ctx->chain); } -static void nft_release_basechain_rcu(struct rcu_head *head) -{ - struct nft_chain *chain = container_of(head, struct nft_chain, rcu_head); - struct nft_ctx ctx = { - .family = chain->table->family, - .chain = chain, - .net = read_pnet(&chain->table->net), - }; - - __nft_release_basechain_now(&ctx); - put_net(ctx.net); -} - int __nft_release_basechain(struct nft_ctx *ctx) { struct nft_rule *rule; @@ -10701,11 +10692,18 @@ int __nft_release_basechain(struct nft_ctx *ctx) nft_chain_del(ctx->chain); nft_use_dec(&ctx->table->use); - if (maybe_get_net(ctx->net)) - call_rcu(&ctx->chain->rcu_head, nft_release_basechain_rcu); - else + if (!maybe_get_net(ctx->net)) { __nft_release_basechain_now(ctx); + return 0; + } + /* wait for ruleset dumps to complete. Owning chain is no longer in + * lists, so new dumps can't find any of these rules anymore. + */ + synchronize_rcu(); + + __nft_release_basechain_now(ctx); + put_net(ctx->net); return 0; } EXPORT_SYMBOL_GPL(__nft_release_basechain); From f6421555dbd7cb3d4d70b69f33f998aaeca1e3b5 Mon Sep 17 00:00:00 2001 From: Mark Brown Date: Mon, 15 Jan 2024 20:15:46 +0000 Subject: [PATCH 188/529] arm64/sme: Always exit sme_alloc() early with existing storage commit dc7eb8755797ed41a0d1b5c0c39df3c8f401b3d9 upstream. When sme_alloc() is called with existing storage and we are not flushing we will always allocate new storage, both leaking the existing storage and corrupting the state. Fix this by separating the checks for flushing and for existing storage as we do for SVE. Callers that reallocate (eg, due to changing the vector length) should call sme_free() themselves. Fixes: 5d0a8d2fba50 ("arm64/ptrace: Ensure that SME is set up for target when writing SSVE state") Signed-off-by: Mark Brown Cc: Link: https://lore.kernel.org/r/20240115-arm64-sme-flush-v1-1-7472bd3459b7@kernel.org Signed-off-by: Will Deacon Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/fpsimd.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 47425311acc5..b3e101a7d04f 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -1259,8 +1259,10 @@ void fpsimd_release_task(struct task_struct *dead_task) */ void sme_alloc(struct task_struct *task, bool flush) { - if (task->thread.za_state && flush) { - memset(task->thread.za_state, 0, za_state_size(task)); + if (task->thread.za_state) { + if (flush) + memset(task->thread.za_state, 0, + za_state_size(task)); return; } From ab47d72b736e78d3c2370b26e0bfc46eb0918391 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Mon, 6 Jan 2025 18:40:34 +0100 Subject: [PATCH 189/529] platform/x86/amd/pmc: Only disable IRQ1 wakeup where i8042 actually enabled it MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit dd410d784402c5775f66faf8b624e85e41c38aaf upstream. Wakeup for IRQ1 should be disabled only in cases where i8042 had actually enabled it, otherwise "wake_depth" for this IRQ will try to drop below zero and there will be an unpleasant WARN() logged: kernel: atkbd serio0: Disabling IRQ1 wakeup source to avoid platform firmware bug kernel: ------------[ cut here ]------------ kernel: Unbalanced IRQ 1 wake disable kernel: WARNING: CPU: 10 PID: 6431 at kernel/irq/manage.c:920 irq_set_irq_wake+0x147/0x1a0 The PMC driver uses DEFINE_SIMPLE_DEV_PM_OPS() to define its dev_pm_ops which sets amd_pmc_suspend_handler() to the .suspend, .freeze, and .poweroff handlers. i8042_pm_suspend(), however, is only set as the .suspend handler. Fix the issue by call PMC suspend handler only from the same set of dev_pm_ops handlers as i8042_pm_suspend(), which currently means just the .suspend handler. To reproduce this issue try hibernating (S4) the machine after a fresh boot without putting it into s2idle first. Fixes: 8e60615e8932 ("platform/x86/amd: pmc: Disable IRQ1 wakeup for RN/CZN") Reviewed-by: Mario Limonciello Signed-off-by: Maciej S. Szmigiero Link: https://lore.kernel.org/r/c8f28c002ca3c66fbeeb850904a1f43118e17200.1736184606.git.mail@maciej.szmigiero.name [ij: edited the commit message.] Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/amd/pmc.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/platform/x86/amd/pmc.c b/drivers/platform/x86/amd/pmc.c index f237c1ea8d35..8eaeb1e8f975 100644 --- a/drivers/platform/x86/amd/pmc.c +++ b/drivers/platform/x86/amd/pmc.c @@ -834,6 +834,10 @@ static int __maybe_unused amd_pmc_suspend_handler(struct device *dev) { struct amd_pmc_dev *pdev = dev_get_drvdata(dev); + /* + * Must be called only from the same set of dev_pm_ops handlers + * as i8042_pm_suspend() is called: currently just from .suspend. + */ if (pdev->cpu_id == AMD_CPU_ID_CZN) { int rc = amd_pmc_czn_wa_irq1(pdev); @@ -846,7 +850,9 @@ static int __maybe_unused amd_pmc_suspend_handler(struct device *dev) return 0; } -static SIMPLE_DEV_PM_OPS(amd_pmc_pm, amd_pmc_suspend_handler, NULL); +static const struct dev_pm_ops amd_pmc_pm = { + .suspend = amd_pmc_suspend_handler, +}; #endif From b7fd784d7c6a1bd927a23e0d06f09a776ee3889b Mon Sep 17 00:00:00 2001 From: Shravya KN Date: Fri, 22 Nov 2024 14:45:44 -0800 Subject: [PATCH 190/529] bnxt_en: Fix receive ring space parameters when XDP is active commit 3051a77a09dfe3022aa012071346937fdf059033 upstream. The MTU setting at the time an XDP multi-buffer is attached determines whether the aggregation ring will be used and the rx_skb_func handler. This is done in bnxt_set_rx_skb_mode(). If the MTU is later changed, the aggregation ring setting may need to be changed and it may become out-of-sync with the settings initially done in bnxt_set_rx_skb_mode(). This may result in random memory corruption and crashes as the HW may DMA data larger than the allocated buffer size, such as: BUG: kernel NULL pointer dereference, address: 00000000000003c0 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 17 PID: 0 Comm: swapper/17 Kdump: loaded Tainted: G S OE 6.1.0-226bf9805506 #1 Hardware name: Wiwynn Delta Lake PVT BZA.02601.0150/Delta Lake-Class1, BIOS F0E_3A12 08/26/2021 RIP: 0010:bnxt_rx_pkt+0xe97/0x1ae0 [bnxt_en] Code: 8b 95 70 ff ff ff 4c 8b 9d 48 ff ff ff 66 41 89 87 b4 00 00 00 e9 0b f7 ff ff 0f b7 43 0a 49 8b 95 a8 04 00 00 25 ff 0f 00 00 <0f> b7 14 42 48 c1 e2 06 49 03 95 a0 04 00 00 0f b6 42 33f RSP: 0018:ffffa19f40cc0d18 EFLAGS: 00010202 RAX: 00000000000001e0 RBX: ffff8e2c805c6100 RCX: 00000000000007ff RDX: 0000000000000000 RSI: ffff8e2c271ab990 RDI: ffff8e2c84f12380 RBP: ffffa19f40cc0e48 R08: 000000000001000d R09: 974ea2fcddfa4cbf R10: 0000000000000000 R11: ffffa19f40cc0ff8 R12: ffff8e2c94b58980 R13: ffff8e2c952d6600 R14: 0000000000000016 R15: ffff8e2c271ab990 FS: 0000000000000000(0000) GS:ffff8e3b3f840000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000000003c0 CR3: 0000000e8580a004 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __bnxt_poll_work+0x1c2/0x3e0 [bnxt_en] To address the issue, we now call bnxt_set_rx_skb_mode() within bnxt_change_mtu() to properly set the AGG rings configuration and update rx_skb_func based on the new MTU value. Additionally, BNXT_FLAG_NO_AGG_RINGS is cleared at the beginning of bnxt_set_rx_skb_mode() to make sure it gets set or cleared based on the current MTU. Fixes: 08450ea98ae9 ("bnxt_en: Fix max_mtu setting for multi-buf XDP") Co-developed-by: Somnath Kotur Signed-off-by: Somnath Kotur Signed-off-by: Shravya KN Signed-off-by: Michael Chan Signed-off-by: Paolo Abeni Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/broadcom/bnxt/bnxt.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 393a983f6d69..6b1245a3ab4b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -4041,7 +4041,7 @@ int bnxt_set_rx_skb_mode(struct bnxt *bp, bool page_mode) struct net_device *dev = bp->dev; if (page_mode) { - bp->flags &= ~BNXT_FLAG_AGG_RINGS; + bp->flags &= ~(BNXT_FLAG_AGG_RINGS | BNXT_FLAG_NO_AGG_RINGS); bp->flags |= BNXT_FLAG_RX_PAGE_MODE; if (bp->xdp_prog->aux->xdp_has_frags) @@ -12799,6 +12799,14 @@ static int bnxt_change_mtu(struct net_device *dev, int new_mtu) bnxt_close_nic(bp, true, false); dev->mtu = new_mtu; + + /* MTU change may change the AGG ring settings if an XDP multi-buffer + * program is attached. We need to set the AGG rings settings and + * rx_skb_func accordingly. + */ + if (READ_ONCE(bp->xdp_prog)) + bnxt_set_rx_skb_mode(bp, true); + bnxt_set_ring_params(bp); if (netif_running(dev)) From a05c1ede50e9656f0752e523c7b54f3a3489e9a8 Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Mon, 6 May 2024 23:11:29 +0900 Subject: [PATCH 191/529] ipv6: Fix potential uninit-value access in __ip6_make_skb() commit 4e13d3a9c25b7080f8a619f961e943fe08c2672c upstream. As it was done in commit fc1092f51567 ("ipv4: Fix uninit-value access in __ip_make_skb()") for IPv4, check FLOWI_FLAG_KNOWN_NH on fl6->flowi6_flags instead of testing HDRINCL on the socket to avoid a race condition which causes uninit-value access. Fixes: ea30388baebc ("ipv6: Fix an uninit variable access bug in __ip6_make_skb()") Signed-off-by: Shigeru Yoshida Signed-off-by: David S. Miller Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_output.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d7f7a714bd23..f7a225da8525 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1985,7 +1985,8 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, struct inet6_dev *idev = ip6_dst_idev(skb_dst(skb)); u8 icmp6_type; - if (sk->sk_socket->type == SOCK_RAW && !inet_sk(sk)->hdrincl) + if (sk->sk_socket->type == SOCK_RAW && + !(fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH)) icmp6_type = fl6->fl6_icmp_type; else icmp6_type = icmp6_hdr(skb)->icmp6_type; From 55bf541e018b76b3750cb6c6ea18c46e1ac5562e Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Tue, 30 Apr 2024 21:39:45 +0900 Subject: [PATCH 192/529] ipv4: Fix uninit-value access in __ip_make_skb() commit fc1092f51567277509563800a3c56732070b6aa4 upstream. KMSAN reported uninit-value access in __ip_make_skb() [1]. __ip_make_skb() tests HDRINCL to know if the skb has icmphdr. However, HDRINCL can cause a race condition. If calling setsockopt(2) with IP_HDRINCL changes HDRINCL while __ip_make_skb() is running, the function will access icmphdr in the skb even if it is not included. This causes the issue reported by KMSAN. Check FLOWI_FLAG_KNOWN_NH on fl4->flowi4_flags instead of testing HDRINCL on the socket. Also, fl4->fl4_icmp_type and fl4->fl4_icmp_code are not initialized. These are union in struct flowi4 and are implicitly initialized by flowi4_init_output(), but we should not rely on specific union layout. Initialize these explicitly in raw_sendmsg(). [1] BUG: KMSAN: uninit-value in __ip_make_skb+0x2b74/0x2d20 net/ipv4/ip_output.c:1481 __ip_make_skb+0x2b74/0x2d20 net/ipv4/ip_output.c:1481 ip_finish_skb include/net/ip.h:243 [inline] ip_push_pending_frames+0x4c/0x5c0 net/ipv4/ip_output.c:1508 raw_sendmsg+0x2381/0x2690 net/ipv4/raw.c:654 inet_sendmsg+0x27b/0x2a0 net/ipv4/af_inet.c:851 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x274/0x3c0 net/socket.c:745 __sys_sendto+0x62c/0x7b0 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x130/0x200 net/socket.c:2199 do_syscall_64+0xd8/0x1f0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x6d/0x75 Uninit was created at: slab_post_alloc_hook mm/slub.c:3804 [inline] slab_alloc_node mm/slub.c:3845 [inline] kmem_cache_alloc_node+0x5f6/0xc50 mm/slub.c:3888 kmalloc_reserve+0x13c/0x4a0 net/core/skbuff.c:577 __alloc_skb+0x35a/0x7c0 net/core/skbuff.c:668 alloc_skb include/linux/skbuff.h:1318 [inline] __ip_append_data+0x49ab/0x68c0 net/ipv4/ip_output.c:1128 ip_append_data+0x1e7/0x260 net/ipv4/ip_output.c:1365 raw_sendmsg+0x22b1/0x2690 net/ipv4/raw.c:648 inet_sendmsg+0x27b/0x2a0 net/ipv4/af_inet.c:851 sock_sendmsg_nosec net/socket.c:730 [inline] __sock_sendmsg+0x274/0x3c0 net/socket.c:745 __sys_sendto+0x62c/0x7b0 net/socket.c:2191 __do_sys_sendto net/socket.c:2203 [inline] __se_sys_sendto net/socket.c:2199 [inline] __x64_sys_sendto+0x130/0x200 net/socket.c:2199 do_syscall_64+0xd8/0x1f0 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x6d/0x75 CPU: 1 PID: 15709 Comm: syz-executor.7 Not tainted 6.8.0-11567-gb3603fcb79b1 #25 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-1.fc39 04/01/2014 Fixes: 99e5acae193e ("ipv4: Fix potential uninit variable access bug in __ip_make_skb()") Reported-by: syzkaller Signed-off-by: Shigeru Yoshida Link: https://lore.kernel.org/r/20240430123945.2057348-1-syoshida@redhat.com Signed-off-by: Paolo Abeni Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- net/ipv4/ip_output.c | 3 ++- net/ipv4/raw.c | 3 +++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index c82107bbd981..543d029102cf 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -1580,7 +1580,8 @@ struct sk_buff *__ip_make_skb(struct sock *sk, * so icmphdr does not in skb linear region and can not get icmp_type * by icmp_hdr(skb)->type. */ - if (sk->sk_type == SOCK_RAW && !inet_sk(sk)->hdrincl) + if (sk->sk_type == SOCK_RAW && + !(fl4->flowi4_flags & FLOWI_FLAG_KNOWN_NH)) icmp_type = fl4->fl4_icmp_type; else icmp_type = icmp_hdr(skb)->type; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index ee0efd0efec4..c109bc376cc5 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -608,6 +608,9 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) (hdrincl ? FLOWI_FLAG_KNOWN_NH : 0), daddr, saddr, 0, 0, sk->sk_uid); + fl4.fl4_icmp_type = 0; + fl4.fl4_icmp_code = 0; + if (!hdrincl) { rfv.msg = msg; rfv.hlen = 0; From 2c914aac9522f6e93822c18dff233d3e92399c81 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Th=C3=A9o=20Lebrun?= Date: Thu, 22 Feb 2024 11:12:29 +0100 Subject: [PATCH 193/529] spi: cadence-qspi: fix pointer reference in runtime PM hooks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 32ce3bb57b6b402de2aec1012511e7ac4e7449dc upstream. dev_get_drvdata() gets used to acquire the pointer to cqspi and the SPI controller. Neither embed the other; this lead to memory corruption. On a given platform (Mobileye EyeQ5) the memory corruption is hidden inside cqspi->f_pdata. Also, this uninitialised memory is used as a mutex (ctlr->bus_lock_mutex) by spi_controller_suspend(). Fixes: 2087e85bb66e ("spi: cadence-quadspi: fix suspend-resume implementations") Reviewed-by: Dhruva Gole Signed-off-by: Théo Lebrun Link: https://msgid.link/r/20240222-cdns-qspi-pm-fix-v4-1-6b6af8bcbf59@bootlin.com Signed-off-by: Mark Brown Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- drivers/spi/spi-cadence-quadspi.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/spi/spi-cadence-quadspi.c b/drivers/spi/spi-cadence-quadspi.c index 42861bc45073..02c5f00c8a57 100644 --- a/drivers/spi/spi-cadence-quadspi.c +++ b/drivers/spi/spi-cadence-quadspi.c @@ -1775,10 +1775,9 @@ static int cqspi_remove(struct platform_device *pdev) static int cqspi_suspend(struct device *dev) { struct cqspi_st *cqspi = dev_get_drvdata(dev); - struct spi_master *master = dev_get_drvdata(dev); int ret; - ret = spi_master_suspend(master); + ret = spi_master_suspend(cqspi->master); cqspi_controller_enable(cqspi, 0); clk_disable_unprepare(cqspi->clk); @@ -1789,7 +1788,6 @@ static int cqspi_suspend(struct device *dev) static int cqspi_resume(struct device *dev) { struct cqspi_st *cqspi = dev_get_drvdata(dev); - struct spi_master *master = dev_get_drvdata(dev); clk_prepare_enable(cqspi->clk); cqspi_wait_idle(cqspi); @@ -1798,7 +1796,7 @@ static int cqspi_resume(struct device *dev) cqspi->current_cs = -1; cqspi->sclk = 0; - return spi_master_resume(master); + return spi_master_resume(cqspi->master); } static DEFINE_SIMPLE_DEV_PM_OPS(cqspi_dev_pm_ops, cqspi_suspend, cqspi_resume); From 4e6310e8d471441f9c1327c5f554a08993a1df94 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 1 May 2025 13:46:46 -0400 Subject: [PATCH 194/529] drm/amdgpu: fix pm notifier handling commit 4aaffc85751da5722e858e4333e8cf0aa4b6c78f upstream. Set the s3/s0ix and s4 flags in the pm notifier so that we can skip the resource evictions properly in pm prepare based on whether we are suspending or hibernating. Drop the eviction as processes are not frozen at this time, we we can end up getting stuck trying to evict VRAM while applications continue to submit work which causes the buffers to get pulled back into VRAM. v2: Move suspend flags out of pm notifier (Mario) Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4178 Fixes: 2965e6355dcd ("drm/amd: Add Suspend/Hibernate notification callback support") Cc: Mario Limonciello Reviewed-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit 06f2dcc241e7e5c681f81fbc46cacdf4bfd7d6d7) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 18 +++++------------- drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 +--------- 2 files changed, 6 insertions(+), 22 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 57a7f72f366f..079cf3292f63 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -4199,28 +4199,20 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) * @data: data * * This function is called when the system is about to suspend or hibernate. - * It is used to evict resources from the device before the system goes to - * sleep while there is still access to swap. + * It is used to set the appropriate flags so that eviction can be optimized + * in the pm prepare callback. */ static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, void *data) { struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); - int r; switch (mode) { case PM_HIBERNATION_PREPARE: adev->in_s4 = true; - fallthrough; - case PM_SUSPEND_PREPARE: - r = amdgpu_device_evict_resources(adev); - /* - * This is considered non-fatal at this time because - * amdgpu_device_prepare() will also fatally evict resources. - * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 - */ - if (r) - drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); + break; + case PM_POST_HIBERNATION: + adev->in_s4 = false; break; } diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 06958608c984..1fbb73b2691d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -2480,13 +2480,8 @@ static int amdgpu_pmops_freeze(struct device *dev) static int amdgpu_pmops_thaw(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(drm_dev); - int r; - r = amdgpu_device_resume(drm_dev, true); - adev->in_s4 = false; - - return r; + return amdgpu_device_resume(drm_dev, true); } static int amdgpu_pmops_poweroff(struct device *dev) @@ -2499,9 +2494,6 @@ static int amdgpu_pmops_poweroff(struct device *dev) static int amdgpu_pmops_restore(struct device *dev) { struct drm_device *drm_dev = dev_get_drvdata(dev); - struct amdgpu_device *adev = drm_to_adev(drm_dev); - - adev->in_s4 = false; return amdgpu_device_resume(drm_dev, true); } From 05e85d3767200f2825f7b26200e44c8ec93f30eb Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 15 Sep 2022 13:10:44 +0200 Subject: [PATCH 195/529] x86/modules: Set VM_FLUSH_RESET_PERMS in module_alloc() commit 4c4eb3ecc91f4fee6d6bf7cfbc1e21f2e38d19ff upstream. Instead of resetting permissions all over the place when freeing module memory tell the vmalloc code to do so. Avoids the exercise for the next upcoming user. Signed-off-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Link: https://lore.kernel.org/r/20220915111143.406703869@infradead.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/ftrace.c | 2 -- arch/x86/kernel/kprobes/core.c | 1 - arch/x86/kernel/module.c | 9 +++++---- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/ftrace.c b/arch/x86/kernel/ftrace.c index e73743701530..48cf91625a16 100644 --- a/arch/x86/kernel/ftrace.c +++ b/arch/x86/kernel/ftrace.c @@ -415,8 +415,6 @@ create_trampoline(struct ftrace_ops *ops, unsigned int *tramp_size) /* ALLOC_TRAMP flags lets us know we created it */ ops->flags |= FTRACE_OPS_FL_ALLOC_TRAMP; - set_vm_flush_reset_perms(trampoline); - if (likely(system_state != SYSTEM_BOOTING)) set_memory_ro((unsigned long)trampoline, npages); set_memory_x((unsigned long)trampoline, npages); diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 991f00c817e6..180c708879d2 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -427,7 +427,6 @@ void *alloc_insn_page(void) if (!page) return NULL; - set_vm_flush_reset_perms(page); /* * First make the page read-only, and only then make it executable to * prevent it from being W+X in between. diff --git a/arch/x86/kernel/module.c b/arch/x86/kernel/module.c index 7728060b640c..c34ea5e028c4 100644 --- a/arch/x86/kernel/module.c +++ b/arch/x86/kernel/module.c @@ -74,10 +74,11 @@ void *module_alloc(unsigned long size) return NULL; p = __vmalloc_node_range(size, MODULE_ALIGN, - MODULES_VADDR + get_module_load_offset(), - MODULES_END, gfp_mask, - PAGE_KERNEL, VM_DEFER_KMEMLEAK, NUMA_NO_NODE, - __builtin_return_address(0)); + MODULES_VADDR + get_module_load_offset(), + MODULES_END, gfp_mask, PAGE_KERNEL, + VM_FLUSH_RESET_PERMS | VM_DEFER_KMEMLEAK, + NUMA_NO_NODE, __builtin_return_address(0)); + if (p && (kasan_alloc_module_shadow(p, size, gfp_mask) < 0)) { vfree(p); return NULL; From da3c5173c55f7a0cf65c967d864386c79dcba3f7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 22 May 2025 14:10:11 +0200 Subject: [PATCH 196/529] Linux 6.1.140 Link: https://lore.kernel.org/r/20250520125800.653047540@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Salvatore Bonaccorso Tested-by: Shuah Khan Tested-by: Ron Economos Tested-by: Jon Hunter Tested-by: Linux Kernel Functional Testing Tested-by: Mark Brown Tested-by: Peter Schneider Tested-by: Hardik Garg Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 80748b7c3540..f86e26fa0b31 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 139 +SUBLEVEL = 140 EXTRAVERSION = NAME = Curry Ramen From 59047be46c7d04c00980342dbd01ed21b22e3702 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 7 Oct 2022 16:44:44 +0300 Subject: [PATCH 197/529] gpio: pca953x: Add missing header(s) [ Upstream commit c20a395f9bf939ef0587ce5fa14316ac26252e9b ] Do not imply that some of the generic headers may be always included. Instead, include explicitly what we are direct user of. While at it, sort headers alphabetically. Signed-off-by: Andy Shevchenko Stable-dep-of: 3e38f946062b ("gpio: pca953x: fix IRQ storm on system wake up") Signed-off-by: Sasha Levin --- drivers/gpio/gpio-pca953x.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index 262b3d276df7..caf3bb6cb6b9 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -10,8 +10,8 @@ #include #include -#include #include +#include #include #include #include @@ -20,6 +20,7 @@ #include #include #include +#include #include #include From af7488d114a92412c804d6d46e246faf409e0ae4 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Sep 2023 16:40:35 +0300 Subject: [PATCH 198/529] gpio: pca953x: Split pca953x_restore_context() and pca953x_save_context() [ Upstream commit ec5bde62019b0a5300c67bd81b9864a8ea12274e ] Split regcache handling to the respective helpers. It will allow to have further refactoring with ease. Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Stable-dep-of: 3e38f946062b ("gpio: pca953x: fix IRQ storm on system wake up") Signed-off-by: Sasha Levin --- drivers/gpio/gpio-pca953x.c | 46 ++++++++++++++++++++++++------------- 1 file changed, 30 insertions(+), 16 deletions(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index caf3bb6cb6b9..db4a48558c67 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -1200,9 +1200,9 @@ static void pca953x_remove(struct i2c_client *client) } #ifdef CONFIG_PM_SLEEP -static int pca953x_regcache_sync(struct device *dev) +static int pca953x_regcache_sync(struct pca953x_chip *chip) { - struct pca953x_chip *chip = dev_get_drvdata(dev); + struct device *dev = &chip->client->dev; int ret; u8 regaddr; @@ -1249,13 +1249,37 @@ static int pca953x_regcache_sync(struct device *dev) return 0; } +static int pca953x_restore_context(struct pca953x_chip *chip) +{ + int ret; + + mutex_lock(&chip->i2c_lock); + + regcache_cache_only(chip->regmap, false); + regcache_mark_dirty(chip->regmap); + ret = pca953x_regcache_sync(chip); + if (ret) { + mutex_unlock(&chip->i2c_lock); + return ret; + } + + ret = regcache_sync(chip->regmap); + mutex_unlock(&chip->i2c_lock); + return ret; +} + +static void pca953x_save_context(struct pca953x_chip *chip) +{ + mutex_lock(&chip->i2c_lock); + regcache_cache_only(chip->regmap, true); + mutex_unlock(&chip->i2c_lock); +} + static int pca953x_suspend(struct device *dev) { struct pca953x_chip *chip = dev_get_drvdata(dev); - mutex_lock(&chip->i2c_lock); - regcache_cache_only(chip->regmap, true); - mutex_unlock(&chip->i2c_lock); + pca953x_save_context(chip); if (atomic_read(&chip->wakeup_path)) device_set_wakeup_path(dev); @@ -1278,17 +1302,7 @@ static int pca953x_resume(struct device *dev) } } - mutex_lock(&chip->i2c_lock); - regcache_cache_only(chip->regmap, false); - regcache_mark_dirty(chip->regmap); - ret = pca953x_regcache_sync(dev); - if (ret) { - mutex_unlock(&chip->i2c_lock); - return ret; - } - - ret = regcache_sync(chip->regmap); - mutex_unlock(&chip->i2c_lock); + ret = pca953x_restore_context(chip); if (ret) { dev_err(dev, "Failed to restore register map: %d\n", ret); return ret; From aa34c055d34a7dcd9d305f423804ced56e1018ba Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 1 Sep 2023 16:40:36 +0300 Subject: [PATCH 199/529] gpio: pca953x: Simplify code with cleanup helpers [ Upstream commit 8e471b784a720f6f34f9fb449ba0744359dcaccb ] Use macros defined in linux/cleanup.h to automate resource lifetime control in gpio-pca953x. Signed-off-by: Andy Shevchenko Signed-off-by: Bartosz Golaszewski Stable-dep-of: 3e38f946062b ("gpio: pca953x: fix IRQ storm on system wake up") Signed-off-by: Sasha Levin --- drivers/gpio/gpio-pca953x.c | 77 ++++++++++++++----------------------- 1 file changed, 29 insertions(+), 48 deletions(-) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index db4a48558c67..a5be47a916d3 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -10,6 +10,7 @@ #include #include +#include #include #include #include @@ -523,12 +524,10 @@ static int pca953x_gpio_direction_input(struct gpio_chip *gc, unsigned off) struct pca953x_chip *chip = gpiochip_get_data(gc); u8 dirreg = chip->recalc_addr(chip, chip->regs->direction, off); u8 bit = BIT(off % BANK_SZ); - int ret; - mutex_lock(&chip->i2c_lock); - ret = regmap_write_bits(chip->regmap, dirreg, bit, bit); - mutex_unlock(&chip->i2c_lock); - return ret; + guard(mutex)(&chip->i2c_lock); + + return regmap_write_bits(chip->regmap, dirreg, bit, bit); } static int pca953x_gpio_direction_output(struct gpio_chip *gc, @@ -540,17 +539,15 @@ static int pca953x_gpio_direction_output(struct gpio_chip *gc, u8 bit = BIT(off % BANK_SZ); int ret; - mutex_lock(&chip->i2c_lock); + guard(mutex)(&chip->i2c_lock); + /* set output level */ ret = regmap_write_bits(chip->regmap, outreg, bit, val ? bit : 0); if (ret) - goto exit; + return ret; /* then direction */ - ret = regmap_write_bits(chip->regmap, dirreg, bit, 0); -exit: - mutex_unlock(&chip->i2c_lock); - return ret; + return regmap_write_bits(chip->regmap, dirreg, bit, 0); } static int pca953x_gpio_get_value(struct gpio_chip *gc, unsigned off) @@ -561,9 +558,8 @@ static int pca953x_gpio_get_value(struct gpio_chip *gc, unsigned off) u32 reg_val; int ret; - mutex_lock(&chip->i2c_lock); - ret = regmap_read(chip->regmap, inreg, ®_val); - mutex_unlock(&chip->i2c_lock); + scoped_guard(mutex, &chip->i2c_lock) + ret = regmap_read(chip->regmap, inreg, ®_val); if (ret < 0) return ret; @@ -576,9 +572,9 @@ static void pca953x_gpio_set_value(struct gpio_chip *gc, unsigned off, int val) u8 outreg = chip->recalc_addr(chip, chip->regs->output, off); u8 bit = BIT(off % BANK_SZ); - mutex_lock(&chip->i2c_lock); + guard(mutex)(&chip->i2c_lock); + regmap_write_bits(chip->regmap, outreg, bit, val ? bit : 0); - mutex_unlock(&chip->i2c_lock); } static int pca953x_gpio_get_direction(struct gpio_chip *gc, unsigned off) @@ -589,9 +585,8 @@ static int pca953x_gpio_get_direction(struct gpio_chip *gc, unsigned off) u32 reg_val; int ret; - mutex_lock(&chip->i2c_lock); - ret = regmap_read(chip->regmap, dirreg, ®_val); - mutex_unlock(&chip->i2c_lock); + scoped_guard(mutex, &chip->i2c_lock) + ret = regmap_read(chip->regmap, dirreg, ®_val); if (ret < 0) return ret; @@ -608,9 +603,8 @@ static int pca953x_gpio_get_multiple(struct gpio_chip *gc, DECLARE_BITMAP(reg_val, MAX_LINE); int ret; - mutex_lock(&chip->i2c_lock); - ret = pca953x_read_regs(chip, chip->regs->input, reg_val); - mutex_unlock(&chip->i2c_lock); + scoped_guard(mutex, &chip->i2c_lock) + ret = pca953x_read_regs(chip, chip->regs->input, reg_val); if (ret) return ret; @@ -625,16 +619,15 @@ static void pca953x_gpio_set_multiple(struct gpio_chip *gc, DECLARE_BITMAP(reg_val, MAX_LINE); int ret; - mutex_lock(&chip->i2c_lock); + guard(mutex)(&chip->i2c_lock); + ret = pca953x_read_regs(chip, chip->regs->output, reg_val); if (ret) - goto exit; + return; bitmap_replace(reg_val, reg_val, bits, mask, gc->ngpio); pca953x_write_regs(chip, chip->regs->output, reg_val); -exit: - mutex_unlock(&chip->i2c_lock); } static int pca953x_gpio_set_pull_up_down(struct pca953x_chip *chip, @@ -642,7 +635,6 @@ static int pca953x_gpio_set_pull_up_down(struct pca953x_chip *chip, unsigned long config) { enum pin_config_param param = pinconf_to_config_param(config); - u8 pull_en_reg = chip->recalc_addr(chip, PCAL953X_PULL_EN, offset); u8 pull_sel_reg = chip->recalc_addr(chip, PCAL953X_PULL_SEL, offset); u8 bit = BIT(offset % BANK_SZ); @@ -655,7 +647,7 @@ static int pca953x_gpio_set_pull_up_down(struct pca953x_chip *chip, if (!(chip->driver_data & PCA_PCAL)) return -ENOTSUPP; - mutex_lock(&chip->i2c_lock); + guard(mutex)(&chip->i2c_lock); /* Configure pull-up/pull-down */ if (param == PIN_CONFIG_BIAS_PULL_UP) @@ -665,17 +657,13 @@ static int pca953x_gpio_set_pull_up_down(struct pca953x_chip *chip, else ret = 0; if (ret) - goto exit; + return ret; /* Disable/Enable pull-up/pull-down */ if (param == PIN_CONFIG_BIAS_DISABLE) - ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, 0); + return regmap_write_bits(chip->regmap, pull_en_reg, bit, 0); else - ret = regmap_write_bits(chip->regmap, pull_en_reg, bit, bit); - -exit: - mutex_unlock(&chip->i2c_lock); - return ret; + return regmap_write_bits(chip->regmap, pull_en_reg, bit, bit); } static int pca953x_gpio_set_config(struct gpio_chip *gc, unsigned int offset, @@ -888,10 +876,8 @@ static irqreturn_t pca953x_irq_handler(int irq, void *devid) bitmap_zero(pending, MAX_LINE); - mutex_lock(&chip->i2c_lock); - ret = pca953x_irq_pending(chip, pending); - mutex_unlock(&chip->i2c_lock); - + scoped_guard(mutex, &chip->i2c_lock) + ret = pca953x_irq_pending(chip, pending); if (ret) { ret = 0; @@ -1253,26 +1239,21 @@ static int pca953x_restore_context(struct pca953x_chip *chip) { int ret; - mutex_lock(&chip->i2c_lock); + guard(mutex)(&chip->i2c_lock); regcache_cache_only(chip->regmap, false); regcache_mark_dirty(chip->regmap); ret = pca953x_regcache_sync(chip); - if (ret) { - mutex_unlock(&chip->i2c_lock); + if (ret) return ret; - } - ret = regcache_sync(chip->regmap); - mutex_unlock(&chip->i2c_lock); - return ret; + return regcache_sync(chip->regmap); } static void pca953x_save_context(struct pca953x_chip *chip) { - mutex_lock(&chip->i2c_lock); + guard(mutex)(&chip->i2c_lock); regcache_cache_only(chip->regmap, true); - mutex_unlock(&chip->i2c_lock); } static int pca953x_suspend(struct device *dev) From 262e32568acc2a2fbdc4322fb71f602dcaa929b5 Mon Sep 17 00:00:00 2001 From: Emanuele Ghidoli Date: Mon, 12 May 2025 11:54:41 +0200 Subject: [PATCH 200/529] gpio: pca953x: fix IRQ storm on system wake up [ Upstream commit 3e38f946062b4845961ab86b726651b4457b2af8 ] If an input changes state during wake-up and is used as an interrupt source, the IRQ handler reads the volatile input register to clear the interrupt mask and deassert the IRQ line. However, the IRQ handler is triggered before access to the register is granted, causing the read operation to fail. As a result, the IRQ handler enters a loop, repeatedly printing the "failed reading register" message, until `pca953x_resume()` is eventually called, which restores the driver context and enables access to registers. Fix by disabling the IRQ line before entering suspend mode, and re-enabling it after the driver context is restored in `pca953x_resume()`. An IRQ can be disabled with disable_irq() and still wake the system as long as the IRQ has wake enabled, so the wake-up functionality is preserved. Fixes: b76574300504 ("gpio: pca953x: Restore registers after suspend/resume cycle") Cc: stable@vger.kernel.org Signed-off-by: Emanuele Ghidoli Signed-off-by: Francesco Dolcini Reviewed-by: Andy Shevchenko Tested-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250512095441.31645-1-francesco@dolcini.it Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpio-pca953x.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpio/gpio-pca953x.c b/drivers/gpio/gpio-pca953x.c index a5be47a916d3..f81d79a297a5 100644 --- a/drivers/gpio/gpio-pca953x.c +++ b/drivers/gpio/gpio-pca953x.c @@ -1241,6 +1241,8 @@ static int pca953x_restore_context(struct pca953x_chip *chip) guard(mutex)(&chip->i2c_lock); + if (chip->client->irq > 0) + enable_irq(chip->client->irq); regcache_cache_only(chip->regmap, false); regcache_mark_dirty(chip->regmap); ret = pca953x_regcache_sync(chip); @@ -1253,6 +1255,10 @@ static int pca953x_restore_context(struct pca953x_chip *chip) static void pca953x_save_context(struct pca953x_chip *chip) { guard(mutex)(&chip->i2c_lock); + + /* Disable IRQ to prevent early triggering while regmap "cache only" is on */ + if (chip->client->irq > 0) + disable_irq(chip->client->irq); regcache_cache_only(chip->regmap, true); } From ba7694f61c3eef02e1a2fb169bd74da948f7a6dc Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Thu, 22 Aug 2024 18:27:55 +0300 Subject: [PATCH 201/529] phy: renesas: rcar-gen3-usb2: Add support to initialize the bus [ Upstream commit 4eae16375357a2a7e8501be5469532f7636064b3 ] The Renesas RZ/G3S need to initialize the USB BUS before transferring data due to hardware limitation. As the register that need to be touched for this is in the address space of the USB PHY, and the UBS PHY need to be initialized before any other USB drivers handling data transfer, add support to initialize the USB BUS. As the USB PHY is probed before any other USB drivers that enables clocks and de-assert the reset signals and the BUS initialization is done in the probe phase, we need to add code to de-assert reset signal and runtime resume the device (which enables its clocks) before accessing the registers. As the reset signals are not required by the USB PHY driver for the other USB PHY hardware variants, the reset signals and runtime PM was handled only in the function that initialize the USB BUS. The PHY initialization was done right after runtime PM enable to have all in place when the PHYs are registered. Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20240822152801.602318-11-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul Stable-dep-of: 9ce71e85b29e ("phy: renesas: rcar-gen3-usb2: Assert PLL reset on PHY power off") Signed-off-by: Sasha Levin --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 50 ++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 3 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 3824e338b61e..d13083f60d89 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -21,12 +21,14 @@ #include #include #include +#include #include #include #include /******* USB2.0 Host registers (original offset is +0x200) *******/ #define USB2_INT_ENABLE 0x000 +#define USB2_AHB_BUS_CTR 0x008 #define USB2_USBCTR 0x00c #define USB2_SPD_RSM_TIMSET 0x10c #define USB2_OC_TIMSET 0x110 @@ -42,6 +44,10 @@ #define USB2_INT_ENABLE_USBH_INTB_EN BIT(2) /* For EHCI */ #define USB2_INT_ENABLE_USBH_INTA_EN BIT(1) /* For OHCI */ +/* AHB_BUS_CTR */ +#define USB2_AHB_BUS_CTR_MBL_MASK GENMASK(1, 0) +#define USB2_AHB_BUS_CTR_MBL_INCR4 2 + /* USBCTR */ #define USB2_USBCTR_DIRPD BIT(2) #define USB2_USBCTR_PLL_RST BIT(1) @@ -112,6 +118,7 @@ struct rcar_gen3_chan { struct extcon_dev *extcon; struct rcar_gen3_phy rphys[NUM_OF_PHYS]; struct regulator *vbus; + struct reset_control *rstc; struct work_struct work; struct mutex lock; /* protects rphys[...].powered */ enum usb_dr_mode dr_mode; @@ -126,6 +133,7 @@ struct rcar_gen3_chan { struct rcar_gen3_phy_drv_data { const struct phy_ops *phy_usb2_ops; bool no_adp_ctrl; + bool init_bus; }; /* @@ -647,6 +655,35 @@ static enum usb_dr_mode rcar_gen3_get_dr_mode(struct device_node *np) return candidate; } +static int rcar_gen3_phy_usb2_init_bus(struct rcar_gen3_chan *channel) +{ + struct device *dev = channel->dev; + int ret; + u32 val; + + channel->rstc = devm_reset_control_array_get_shared(dev); + if (IS_ERR(channel->rstc)) + return PTR_ERR(channel->rstc); + + ret = pm_runtime_resume_and_get(dev); + if (ret) + return ret; + + ret = reset_control_deassert(channel->rstc); + if (ret) + goto rpm_put; + + val = readl(channel->base + USB2_AHB_BUS_CTR); + val &= ~USB2_AHB_BUS_CTR_MBL_MASK; + val |= USB2_AHB_BUS_CTR_MBL_INCR4; + writel(val, channel->base + USB2_AHB_BUS_CTR); + +rpm_put: + pm_runtime_put(dev); + + return ret; +} + static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) { const struct rcar_gen3_phy_drv_data *phy_data; @@ -700,6 +737,15 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) goto error; } + platform_set_drvdata(pdev, channel); + channel->dev = dev; + + if (phy_data->init_bus) { + ret = rcar_gen3_phy_usb2_init_bus(channel); + if (ret) + goto error; + } + channel->soc_no_adp_ctrl = phy_data->no_adp_ctrl; if (phy_data->no_adp_ctrl) channel->obint_enable_bits = USB2_OBINT_IDCHG_EN; @@ -727,9 +773,6 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) channel->vbus = NULL; } - platform_set_drvdata(pdev, channel); - channel->dev = dev; - provider = devm_of_phy_provider_register(dev, rcar_gen3_phy_usb2_xlate); if (IS_ERR(provider)) { dev_err(dev, "Failed to register PHY provider\n"); @@ -756,6 +799,7 @@ static int rcar_gen3_phy_usb2_remove(struct platform_device *pdev) if (channel->is_otg_channel) device_remove_file(&pdev->dev, &dev_attr_role); + reset_control_assert(channel->rstc); pm_runtime_disable(&pdev->dev); return 0; From 6dbb6f00dcac1f680e2e7f1b3ad2debfab429047 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:29 +0300 Subject: [PATCH 202/529] phy: renesas: rcar-gen3-usb2: Move IRQ request in probe [ Upstream commit de76809f60cc938d3580bbbd5b04b7d12af6ce3a ] Commit 08b0ad375ca6 ("phy: renesas: rcar-gen3-usb2: move IRQ registration to init") moved the IRQ request operation from probe to struct phy_ops::phy_init API to avoid triggering interrupts (which lead to register accesses) while the PHY clocks (enabled through runtime PM APIs) are not active. If this happens, it results in a synchronous abort. One way to reproduce this issue is by enabling CONFIG_DEBUG_SHIRQ, which calls free_irq() on driver removal. Move the IRQ request and free operations back to probe, and take the runtime PM state into account in IRQ handler. This commit is preparatory for the subsequent fixes in this series. Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-3-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul Stable-dep-of: 9ce71e85b29e ("phy: renesas: rcar-gen3-usb2: Assert PLL reset on PHY power off") Signed-off-by: Sasha Levin --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 46 +++++++++++++----------- 1 file changed, 26 insertions(+), 20 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index d13083f60d89..69cc99c60f58 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -122,7 +122,6 @@ struct rcar_gen3_chan { struct work_struct work; struct mutex lock; /* protects rphys[...].powered */ enum usb_dr_mode dr_mode; - int irq; u32 obint_enable_bits; bool extcon_host; bool is_otg_channel; @@ -427,16 +426,25 @@ static irqreturn_t rcar_gen3_phy_usb2_irq(int irq, void *_ch) { struct rcar_gen3_chan *ch = _ch; void __iomem *usb2_base = ch->base; - u32 status = readl(usb2_base + USB2_OBINTSTA); + struct device *dev = ch->dev; irqreturn_t ret = IRQ_NONE; + u32 status; + pm_runtime_get_noresume(dev); + + if (pm_runtime_suspended(dev)) + goto rpm_put; + + status = readl(usb2_base + USB2_OBINTSTA); if (status & ch->obint_enable_bits) { - dev_vdbg(ch->dev, "%s: %08x\n", __func__, status); + dev_vdbg(dev, "%s: %08x\n", __func__, status); writel(ch->obint_enable_bits, usb2_base + USB2_OBINTSTA); rcar_gen3_device_recognition(ch); ret = IRQ_HANDLED; } +rpm_put: + pm_runtime_put_noidle(dev); return ret; } @@ -446,17 +454,6 @@ static int rcar_gen3_phy_usb2_init(struct phy *p) struct rcar_gen3_chan *channel = rphy->ch; void __iomem *usb2_base = channel->base; u32 val; - int ret; - - if (!rcar_gen3_is_any_rphy_initialized(channel) && channel->irq >= 0) { - INIT_WORK(&channel->work, rcar_gen3_phy_usb2_work); - ret = request_irq(channel->irq, rcar_gen3_phy_usb2_irq, - IRQF_SHARED, dev_name(channel->dev), channel); - if (ret < 0) { - dev_err(channel->dev, "No irq handler (%d)\n", channel->irq); - return ret; - } - } /* Initialize USB2 part */ val = readl(usb2_base + USB2_INT_ENABLE); @@ -492,9 +489,6 @@ static int rcar_gen3_phy_usb2_exit(struct phy *p) val &= ~USB2_INT_ENABLE_UCOM_INTEN; writel(val, usb2_base + USB2_INT_ENABLE); - if (channel->irq >= 0 && !rcar_gen3_is_any_rphy_initialized(channel)) - free_irq(channel->irq, channel); - return 0; } @@ -690,7 +684,7 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct rcar_gen3_chan *channel; struct phy_provider *provider; - int ret = 0, i; + int ret = 0, i, irq; if (!dev->of_node) { dev_err(dev, "This driver needs device tree\n"); @@ -706,8 +700,6 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) return PTR_ERR(channel->base); channel->obint_enable_bits = USB2_OBINT_BITS; - /* get irq number here and request_irq for OTG in phy_init */ - channel->irq = platform_get_irq_optional(pdev, 0); channel->dr_mode = rcar_gen3_get_dr_mode(dev->of_node); if (channel->dr_mode != USB_DR_MODE_UNKNOWN) { channel->is_otg_channel = true; @@ -773,6 +765,20 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) channel->vbus = NULL; } + irq = platform_get_irq_optional(pdev, 0); + if (irq < 0 && irq != -ENXIO) { + ret = irq; + goto error; + } else if (irq > 0) { + INIT_WORK(&channel->work, rcar_gen3_phy_usb2_work); + ret = devm_request_irq(dev, irq, rcar_gen3_phy_usb2_irq, + IRQF_SHARED, dev_name(dev), channel); + if (ret < 0) { + dev_err(dev, "Failed to request irq (%d)\n", irq); + goto error; + } + } + provider = devm_of_phy_provider_register(dev, rcar_gen3_phy_usb2_xlate); if (IS_ERR(provider)) { dev_err(dev, "Failed to register PHY provider\n"); From a640e906d92698fb4fcb5447a2600d0f36dcecf5 Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:30 +0300 Subject: [PATCH 203/529] phy: renesas: rcar-gen3-usb2: Lock around hardware registers and driver data [ Upstream commit 55a387ebb9219cbe4edfa8ba9996ccb0e7ad4932 ] The phy-rcar-gen3-usb2 driver exposes four individual PHYs that are requested and configured by PHY users. The struct phy_ops APIs access the same set of registers to configure all PHYs. Additionally, PHY settings can be modified through sysfs or an IRQ handler. While some struct phy_ops APIs are protected by a driver-wide mutex, others rely on individual PHY-specific mutexes. This approach can lead to various issues, including: 1/ the IRQ handler may interrupt PHY settings in progress, racing with hardware configuration protected by a mutex lock 2/ due to msleep(20) in rcar_gen3_init_otg(), while a configuration thread suspends to wait for the delay, another thread may try to configure another PHY (with phy_init() + phy_power_on()); re-running the phy_init() goes to the exact same configuration code, re-running the same hardware configuration on the same set of registers (and bits) which might impact the result of the msleep for the 1st configuring thread 3/ sysfs can configure the hardware (though role_store()) and it can still race with the phy_init()/phy_power_on() APIs calling into the drivers struct phy_ops To address these issues, add a spinlock to protect hardware register access and driver private data structures (e.g., calls to rcar_gen3_is_any_rphy_initialized()). Checking driver-specific data remains necessary as all PHY instances share common settings. With this change, the existing mutex protection is removed and the cleanup.h helpers are used. While at it, to keep the code simpler, do not skip regulator_enable()/regulator_disable() APIs in rcar_gen3_phy_usb2_power_on()/rcar_gen3_phy_usb2_power_off() as the regulators enable/disable operations are reference counted anyway. Fixes: f3b5a8d9b50d ("phy: rcar-gen3-usb2: Add R-Car Gen3 USB2 PHY driver") Cc: stable@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-4-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul Stable-dep-of: 9ce71e85b29e ("phy: renesas: rcar-gen3-usb2: Assert PLL reset on PHY power off") Signed-off-by: Sasha Levin --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 49 +++++++++++++----------- 1 file changed, 26 insertions(+), 23 deletions(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 69cc99c60f58..8b1280cdbcef 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -9,6 +9,7 @@ * Copyright (C) 2014 Cogent Embedded, Inc. */ +#include #include #include #include @@ -120,7 +121,7 @@ struct rcar_gen3_chan { struct regulator *vbus; struct reset_control *rstc; struct work_struct work; - struct mutex lock; /* protects rphys[...].powered */ + spinlock_t lock; /* protects access to hardware and driver data structure. */ enum usb_dr_mode dr_mode; u32 obint_enable_bits; bool extcon_host; @@ -347,6 +348,8 @@ static ssize_t role_store(struct device *dev, struct device_attribute *attr, bool is_b_device; enum phy_mode cur_mode, new_mode; + guard(spinlock_irqsave)(&ch->lock); + if (!ch->is_otg_channel || !rcar_gen3_is_any_otg_rphy_initialized(ch)) return -EIO; @@ -414,7 +417,7 @@ static void rcar_gen3_init_otg(struct rcar_gen3_chan *ch) val = readl(usb2_base + USB2_ADPCTRL); writel(val | USB2_ADPCTRL_IDPULLUP, usb2_base + USB2_ADPCTRL); } - msleep(20); + mdelay(20); writel(0xffffffff, usb2_base + USB2_OBINTSTA); writel(ch->obint_enable_bits, usb2_base + USB2_OBINTEN); @@ -435,12 +438,14 @@ static irqreturn_t rcar_gen3_phy_usb2_irq(int irq, void *_ch) if (pm_runtime_suspended(dev)) goto rpm_put; - status = readl(usb2_base + USB2_OBINTSTA); - if (status & ch->obint_enable_bits) { - dev_vdbg(dev, "%s: %08x\n", __func__, status); - writel(ch->obint_enable_bits, usb2_base + USB2_OBINTSTA); - rcar_gen3_device_recognition(ch); - ret = IRQ_HANDLED; + scoped_guard(spinlock, &ch->lock) { + status = readl(usb2_base + USB2_OBINTSTA); + if (status & ch->obint_enable_bits) { + dev_vdbg(dev, "%s: %08x\n", __func__, status); + writel(ch->obint_enable_bits, usb2_base + USB2_OBINTSTA); + rcar_gen3_device_recognition(ch); + ret = IRQ_HANDLED; + } } rpm_put: @@ -455,6 +460,8 @@ static int rcar_gen3_phy_usb2_init(struct phy *p) void __iomem *usb2_base = channel->base; u32 val; + guard(spinlock_irqsave)(&channel->lock); + /* Initialize USB2 part */ val = readl(usb2_base + USB2_INT_ENABLE); val |= USB2_INT_ENABLE_UCOM_INTEN | rphy->int_enable_bits; @@ -481,6 +488,8 @@ static int rcar_gen3_phy_usb2_exit(struct phy *p) void __iomem *usb2_base = channel->base; u32 val; + guard(spinlock_irqsave)(&channel->lock); + rphy->initialized = false; val = readl(usb2_base + USB2_INT_ENABLE); @@ -500,16 +509,17 @@ static int rcar_gen3_phy_usb2_power_on(struct phy *p) u32 val; int ret = 0; - mutex_lock(&channel->lock); - if (!rcar_gen3_are_all_rphys_power_off(channel)) - goto out; - if (channel->vbus) { ret = regulator_enable(channel->vbus); if (ret) - goto out; + return ret; } + guard(spinlock_irqsave)(&channel->lock); + + if (!rcar_gen3_are_all_rphys_power_off(channel)) + goto out; + val = readl(usb2_base + USB2_USBCTR); val |= USB2_USBCTR_PLL_RST; writel(val, usb2_base + USB2_USBCTR); @@ -519,7 +529,6 @@ static int rcar_gen3_phy_usb2_power_on(struct phy *p) out: /* The powered flag should be set for any other phys anyway */ rphy->powered = true; - mutex_unlock(&channel->lock); return 0; } @@ -530,18 +539,12 @@ static int rcar_gen3_phy_usb2_power_off(struct phy *p) struct rcar_gen3_chan *channel = rphy->ch; int ret = 0; - mutex_lock(&channel->lock); - rphy->powered = false; - - if (!rcar_gen3_are_all_rphys_power_off(channel)) - goto out; + scoped_guard(spinlock_irqsave, &channel->lock) + rphy->powered = false; if (channel->vbus) ret = regulator_disable(channel->vbus); -out: - mutex_unlock(&channel->lock); - return ret; } @@ -742,7 +745,7 @@ static int rcar_gen3_phy_usb2_probe(struct platform_device *pdev) if (phy_data->no_adp_ctrl) channel->obint_enable_bits = USB2_OBINT_IDCHG_EN; - mutex_init(&channel->lock); + spin_lock_init(&channel->lock); for (i = 0; i < NUM_OF_PHYS; i++) { channel->rphys[i].phy = devm_phy_create(dev, NULL, phy_data->phy_usb2_ops); From ee963a98477d65fba479823e51873994e5ddcb0b Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Wed, 7 May 2025 15:50:31 +0300 Subject: [PATCH 204/529] phy: renesas: rcar-gen3-usb2: Assert PLL reset on PHY power off [ Upstream commit 9ce71e85b29eb63e48e294479742e670513f03a0 ] Assert PLL reset on PHY power off. This saves power. Fixes: f3b5a8d9b50d ("phy: rcar-gen3-usb2: Add R-Car Gen3 USB2 PHY driver") Cc: stable@vger.kernel.org Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Reviewed-by: Lad Prabhakar Signed-off-by: Claudiu Beznea Link: https://lore.kernel.org/r/20250507125032.565017-5-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/phy/renesas/phy-rcar-gen3-usb2.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/phy/renesas/phy-rcar-gen3-usb2.c b/drivers/phy/renesas/phy-rcar-gen3-usb2.c index 8b1280cdbcef..024cc5ce68a3 100644 --- a/drivers/phy/renesas/phy-rcar-gen3-usb2.c +++ b/drivers/phy/renesas/phy-rcar-gen3-usb2.c @@ -539,9 +539,17 @@ static int rcar_gen3_phy_usb2_power_off(struct phy *p) struct rcar_gen3_chan *channel = rphy->ch; int ret = 0; - scoped_guard(spinlock_irqsave, &channel->lock) + scoped_guard(spinlock_irqsave, &channel->lock) { rphy->powered = false; + if (rcar_gen3_are_all_rphys_power_off(channel)) { + u32 val = readl(channel->base + USB2_USBCTR); + + val |= USB2_USBCTR_PLL_RST; + writel(val, channel->base + USB2_USBCTR); + } + } + if (channel->vbus) ret = regulator_disable(channel->vbus); From 6815846e0c3a62116a7da9740e3a7c10edc5c7e9 Mon Sep 17 00:00:00 2001 From: Dmitry Bogdanov Date: Tue, 24 Dec 2024 13:17:57 +0300 Subject: [PATCH 205/529] scsi: target: iscsi: Fix timeout on deleted connection [ Upstream commit 7f533cc5ee4c4436cee51dc58e81dfd9c3384418 ] NOPIN response timer may expire on a deleted connection and crash with such logs: Did not receive response to NOPIN on CID: 0, failing connection for I_T Nexus (null),i,0x00023d000125,iqn.2017-01.com.iscsi.target,t,0x3d BUG: Kernel NULL pointer dereference on read at 0x00000000 NIP strlcpy+0x8/0xb0 LR iscsit_fill_cxn_timeout_err_stats+0x5c/0xc0 [iscsi_target_mod] Call Trace: iscsit_handle_nopin_response_timeout+0xfc/0x120 [iscsi_target_mod] call_timer_fn+0x58/0x1f0 run_timer_softirq+0x740/0x860 __do_softirq+0x16c/0x420 irq_exit+0x188/0x1c0 timer_interrupt+0x184/0x410 That is because nopin response timer may be re-started on nopin timer expiration. Stop nopin timer before stopping the nopin response timer to be sure that no one of them will be re-started. Signed-off-by: Dmitry Bogdanov Link: https://lore.kernel.org/r/20241224101757.32300-1-d.bogdanov@yadro.com Reviewed-by: Maurizio Lombardi Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/target/iscsi/iscsi_target.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/target/iscsi/iscsi_target.c b/drivers/target/iscsi/iscsi_target.c index 07e196b44b91..04d40e76772b 100644 --- a/drivers/target/iscsi/iscsi_target.c +++ b/drivers/target/iscsi/iscsi_target.c @@ -4314,8 +4314,8 @@ int iscsit_close_connection( spin_unlock(&iscsit_global->ts_bitmap_lock); iscsit_stop_timers_for_cmds(conn); - iscsit_stop_nopin_response_timer(conn); iscsit_stop_nopin_timer(conn); + iscsit_stop_nopin_response_timer(conn); if (conn->conn_transport->iscsit_wait_conn) conn->conn_transport->iscsit_wait_conn(conn); From b6d6419548286b2b9d2b90df824d3cab797f6ae8 Mon Sep 17 00:00:00 2001 From: Zhongqiu Han Date: Wed, 12 Mar 2025 21:04:12 +0800 Subject: [PATCH 206/529] virtio_ring: Fix data race by tagging event_triggered as racy for KCSAN [ Upstream commit 2e2f925fe737576df2373931c95e1a2b66efdfef ] syzbot reports a data-race when accessing the event_triggered, here is the simplified stack when the issue occurred: ================================================================== BUG: KCSAN: data-race in virtqueue_disable_cb / virtqueue_enable_cb_delayed write to 0xffff8881025bc452 of 1 bytes by task 3288 on cpu 0: virtqueue_enable_cb_delayed+0x42/0x3c0 drivers/virtio/virtio_ring.c:2653 start_xmit+0x230/0x1310 drivers/net/virtio_net.c:3264 __netdev_start_xmit include/linux/netdevice.h:5151 [inline] netdev_start_xmit include/linux/netdevice.h:5160 [inline] xmit_one net/core/dev.c:3800 [inline] read to 0xffff8881025bc452 of 1 bytes by interrupt on cpu 1: virtqueue_disable_cb_split drivers/virtio/virtio_ring.c:880 [inline] virtqueue_disable_cb+0x92/0x180 drivers/virtio/virtio_ring.c:2566 skb_xmit_done+0x5f/0x140 drivers/net/virtio_net.c:777 vring_interrupt+0x161/0x190 drivers/virtio/virtio_ring.c:2715 __handle_irq_event_percpu+0x95/0x490 kernel/irq/handle.c:158 handle_irq_event_percpu kernel/irq/handle.c:193 [inline] value changed: 0x01 -> 0x00 ================================================================== When the data race occurs, the function virtqueue_enable_cb_delayed() sets event_triggered to false, and virtqueue_disable_cb_split/packed() reads it as false due to the race condition. Since event_triggered is an unreliable hint used for optimization, this should only cause the driver temporarily suggest that the device not send an interrupt notification when the event index is used. Fix this KCSAN reported data-race issue by explicitly tagging the access as data_racy. Reported-by: syzbot+efe683d57990864b8c8e@syzkaller.appspotmail.com Closes: https://lore.kernel.org/all/67c7761a.050a0220.15b4b9.0018.GAE@google.com/ Signed-off-by: Zhongqiu Han Message-Id: <20250312130412.3516307-1-quic_zhonhan@quicinc.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Sasha Levin --- drivers/virtio/virtio_ring.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 7d320f799ca1..06a64c4adc98 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -2415,7 +2415,7 @@ bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) struct vring_virtqueue *vq = to_vvq(_vq); if (vq->event_triggered) - vq->event_triggered = false; + data_race(vq->event_triggered = false); return vq->packed_ring ? virtqueue_enable_cb_delayed_packed(_vq) : virtqueue_enable_cb_delayed_split(_vq); From 04aa1f6d160a359270fb8715daf62808557fc513 Mon Sep 17 00:00:00 2001 From: Marek Szyprowski Date: Tue, 15 Apr 2025 09:56:59 +0200 Subject: [PATCH 207/529] dma-mapping: avoid potential unused data compilation warning [ Upstream commit c9b19ea63036fc537a69265acea1b18dabd1cbd3 ] When CONFIG_NEED_DMA_MAP_STATE is not defined, dma-mapping clients might report unused data compilation warnings for dma_unmap_*() calls arguments. Redefine macros for those calls to let compiler to notice that it is okay when the provided arguments are not used. Reported-by: Andy Shevchenko Suggested-by: Jakub Kicinski Signed-off-by: Marek Szyprowski Tested-by: Andy Shevchenko Link: https://lore.kernel.org/r/20250415075659.428549-1-m.szyprowski@samsung.com Signed-off-by: Sasha Levin --- include/linux/dma-mapping.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index e13050eb9777..af3f39ecc1b8 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -598,10 +598,14 @@ static inline int dma_mmap_wc(struct device *dev, #else #define DEFINE_DMA_UNMAP_ADDR(ADDR_NAME) #define DEFINE_DMA_UNMAP_LEN(LEN_NAME) -#define dma_unmap_addr(PTR, ADDR_NAME) (0) -#define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) do { } while (0) -#define dma_unmap_len(PTR, LEN_NAME) (0) -#define dma_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) +#define dma_unmap_addr(PTR, ADDR_NAME) \ + ({ typeof(PTR) __p __maybe_unused = PTR; 0; }) +#define dma_unmap_addr_set(PTR, ADDR_NAME, VAL) \ + do { typeof(PTR) __p __maybe_unused = PTR; } while (0) +#define dma_unmap_len(PTR, LEN_NAME) \ + ({ typeof(PTR) __p __maybe_unused = PTR; 0; }) +#define dma_unmap_len_set(PTR, LEN_NAME, VAL) \ + do { typeof(PTR) __p __maybe_unused = PTR; } while (0) #endif #endif /* _LINUX_DMA_MAPPING_H */ From 91fe35809eb04ae7920512fa90dc666ebd64a936 Mon Sep 17 00:00:00 2001 From: gaoxu Date: Thu, 17 Apr 2025 07:30:00 +0000 Subject: [PATCH 208/529] cgroup: Fix compilation issue due to cgroup_mutex not being exported MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 87c259a7a359e73e6c52c68fcbec79988999b4e6 ] When adding folio_memcg function call in the zram module for Android16-6.12, the following error occurs during compilation: ERROR: modpost: "cgroup_mutex" [../soc-repo/zram.ko] undefined! This error is caused by the indirect call to lockdep_is_held(&cgroup_mutex) within folio_memcg. The export setting for cgroup_mutex is controlled by the CONFIG_PROVE_RCU macro. If CONFIG_LOCKDEP is enabled while CONFIG_PROVE_RCU is not, this compilation error will occur. To resolve this issue, add a parallel macro CONFIG_LOCKDEP control to ensure cgroup_mutex is properly exported when needed. Signed-off-by: gao xu Acked-by: Michal Koutný Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 6e54f0daebef..7997c8021b62 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -90,7 +90,7 @@ DEFINE_MUTEX(cgroup_mutex); DEFINE_SPINLOCK(css_set_lock); -#ifdef CONFIG_PROVE_RCU +#if (defined CONFIG_PROVE_RCU || defined CONFIG_LOCKDEP) EXPORT_SYMBOL_GPL(cgroup_mutex); EXPORT_SYMBOL_GPL(css_set_lock); #endif From 1d44f1511be651831543053f669bd481fcf7d2cf Mon Sep 17 00:00:00 2001 From: Ranjan Kumar Date: Tue, 15 Apr 2025 15:45:46 +0530 Subject: [PATCH 209/529] scsi: mpi3mr: Add level check to control event logging [ Upstream commit b0b7ee3b574a72283399b9232f6190be07f220c0 ] Ensure event logs are only generated when the debug logging level MPI3_DEBUG_EVENT is enabled. This prevents unnecessary logging. Signed-off-by: Ranjan Kumar Link: https://lore.kernel.org/r/20250415101546.204018-1-ranjan.kumar@broadcom.com Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/mpi3mr/mpi3mr_fw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/mpi3mr/mpi3mr_fw.c b/drivers/scsi/mpi3mr/mpi3mr_fw.c index 41636c4c43af..015a875a46a1 100644 --- a/drivers/scsi/mpi3mr/mpi3mr_fw.c +++ b/drivers/scsi/mpi3mr/mpi3mr_fw.c @@ -174,6 +174,9 @@ static void mpi3mr_print_event_data(struct mpi3mr_ioc *mrioc, char *desc = NULL; u16 event; + if (!(mrioc->logging_level & MPI3_DEBUG_EVENT)) + return; + event = event_reply->event; switch (event) { From 496464623411abd50622036c08434b2d86f38507 Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Thu, 17 Apr 2025 15:00:04 +0300 Subject: [PATCH 210/529] net: enetc: refactor bulk flipping of RX buffers to separate function [ Upstream commit 1d587faa5be7e9785b682cc5f58ba8f4100c13ea ] This small snippet of code ensures that we do something with the array of RX software buffer descriptor elements after passing the skb to the stack. In this case, we see if the other half of the page is reusable, and if so, we "turn around" the buffers, making them directly usable by enetc_refill_rx_ring() without going to enetc_new_page(). We will need to perform this kind of buffer flipping from a new code path, i.e. from XDP_PASS. Currently, enetc_build_skb() does it there buffer by buffer, but in a subsequent change we will stop using enetc_build_skb() for XDP_PASS. Signed-off-by: Vladimir Oltean Reviewed-by: Wei Fang Link: https://patch.msgid.link/20250417120005.3288549-3-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/freescale/enetc/enetc.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/freescale/enetc/enetc.c b/drivers/net/ethernet/freescale/enetc/enetc.c index 230b317d93da..bf49c07c8b51 100644 --- a/drivers/net/ethernet/freescale/enetc/enetc.c +++ b/drivers/net/ethernet/freescale/enetc/enetc.c @@ -1536,6 +1536,16 @@ static void enetc_xdp_drop(struct enetc_bdr *rx_ring, int rx_ring_first, } } +static void enetc_bulk_flip_buff(struct enetc_bdr *rx_ring, int rx_ring_first, + int rx_ring_last) +{ + while (rx_ring_first != rx_ring_last) { + enetc_flip_rx_buff(rx_ring, + &rx_ring->rx_swbd[rx_ring_first]); + enetc_bdr_idx_inc(rx_ring, &rx_ring_first); + } +} + static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, struct napi_struct *napi, int work_limit, struct bpf_prog *prog) @@ -1659,11 +1669,7 @@ static int enetc_clean_rx_ring_xdp(struct enetc_bdr *rx_ring, enetc_xdp_drop(rx_ring, orig_i, i); rx_ring->stats.xdp_redirect_failures++; } else { - while (orig_i != i) { - enetc_flip_rx_buff(rx_ring, - &rx_ring->rx_swbd[orig_i]); - enetc_bdr_idx_inc(rx_ring, &orig_i); - } + enetc_bulk_flip_buff(rx_ring, orig_i, i); xdp_redirect_frm_cnt++; rx_ring->stats.xdp_redirect++; } From f238c9c15d5fcb495e753bd6edab8f0b257d1c41 Mon Sep 17 00:00:00 2001 From: Felix Kuehling Date: Wed, 16 Apr 2025 00:19:13 -0400 Subject: [PATCH 211/529] drm/amdgpu: Allow P2P access through XGMI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a92741e72f91b904c1d8c3d409ed8dbe9c1f2b26 ] If peer memory is accessible through XGMI, allow leaving it in VRAM rather than forcing its migration to GTT on DMABuf attachment. Signed-off-by: Felix Kuehling Tested-by: Hao (Claire) Zhou Reviewed-by: Christian König Signed-off-by: Alex Deucher (cherry picked from commit 372c8d72c3680fdea3fbb2d6b089f76b4a6d596a) Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c | 30 ++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c index ab06cb4d7b35..4dcc7de961d0 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_dma_buf.c @@ -42,6 +42,29 @@ #include #include +static const struct dma_buf_attach_ops amdgpu_dma_buf_attach_ops; + +/** + * dma_buf_attach_adev - Helper to get adev of an attachment + * + * @attach: attachment + * + * Returns: + * A struct amdgpu_device * if the attaching device is an amdgpu device or + * partition, NULL otherwise. + */ +static struct amdgpu_device *dma_buf_attach_adev(struct dma_buf_attachment *attach) +{ + if (attach->importer_ops == &amdgpu_dma_buf_attach_ops) { + struct drm_gem_object *obj = attach->importer_priv; + struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); + + return amdgpu_ttm_adev(bo->tbo.bdev); + } + + return NULL; +} + /** * amdgpu_dma_buf_attach - &dma_buf_ops.attach implementation * @@ -53,12 +76,14 @@ static int amdgpu_dma_buf_attach(struct dma_buf *dmabuf, struct dma_buf_attachment *attach) { + struct amdgpu_device *attach_adev = dma_buf_attach_adev(attach); struct drm_gem_object *obj = dmabuf->priv; struct amdgpu_bo *bo = gem_to_amdgpu_bo(obj); struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev); int r; - if (pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) + if (!amdgpu_dmabuf_is_xgmi_accessible(attach_adev, bo) && + pci_p2pdma_distance(adev->pdev, attach->dev, false) < 0) attach->peer2peer = false; r = pm_runtime_get_sync(adev_to_drm(adev)->dev); @@ -479,6 +504,9 @@ bool amdgpu_dmabuf_is_xgmi_accessible(struct amdgpu_device *adev, struct drm_gem_object *obj = &bo->tbo.base; struct drm_gem_object *gobj; + if (!adev) + return false; + if (obj->import_attach) { struct dma_buf *dma_buf = obj->import_attach->dmabuf; From 5f3d693861c711d6bf6c74b6fe083910763fe1f8 Mon Sep 17 00:00:00 2001 From: Ihor Solodrai Date: Wed, 16 Apr 2025 10:02:46 -0700 Subject: [PATCH 212/529] selftests/bpf: Mitigate sockmap_ktls disconnect_after_delete failure [ Upstream commit f2858f308131a09e33afb766cd70119b5b900569 ] "sockmap_ktls disconnect_after_delete" test has been failing on BPF CI after recent merges from netdev: * https://github.com/kernel-patches/bpf/actions/runs/14458537639 * https://github.com/kernel-patches/bpf/actions/runs/14457178732 It happens because disconnect has been disabled for TLS [1], and it renders the test case invalid. Removing all the test code creates a conflict between bpf and bpf-next, so for now only remove the offending assert [2]. The test will be removed later on bpf-next. [1] https://lore.kernel.org/netdev/20250404180334.3224206-1-kuba@kernel.org/ [2] https://lore.kernel.org/bpf/cfc371285323e1a3f3b006bfcf74e6cf7ad65258@linux.dev/ Signed-off-by: Ihor Solodrai Signed-off-by: Andrii Nakryiko Reviewed-by: Jiayuan Chen Link: https://lore.kernel.org/bpf/20250416170246.2438524-1-ihor.solodrai@linux.dev Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c | 1 - 1 file changed, 1 deletion(-) diff --git a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c index 2d0796314862..0a99fd404f6d 100644 --- a/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c +++ b/tools/testing/selftests/bpf/prog_tests/sockmap_ktls.c @@ -68,7 +68,6 @@ static void test_sockmap_ktls_disconnect_after_delete(int family, int map) goto close_cli; err = disconnect(cli); - ASSERT_OK(err, "disconnect"); close_cli: close(cli); From 095335326957a013f7a2db8164abbb0d70d4987a Mon Sep 17 00:00:00 2001 From: Brandon Kammerdiener Date: Thu, 24 Apr 2025 11:32:51 -0400 Subject: [PATCH 213/529] bpf: fix possible endless loop in BPF map iteration [ Upstream commit 75673fda0c557ae26078177dd14d4857afbf128d ] The _safe variant used here gets the next element before running the callback, avoiding the endless loop condition. Signed-off-by: Brandon Kammerdiener Link: https://lore.kernel.org/r/20250424153246.141677-2-brandon.kammerdiener@intel.com Signed-off-by: Alexei Starovoitov Acked-by: Hou Tao Signed-off-by: Sasha Levin --- kernel/bpf/hashtab.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index dae9ed02a75b..06bc7f26be06 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -2172,7 +2172,7 @@ static int bpf_for_each_hash_elem(struct bpf_map *map, bpf_callback_t callback_f b = &htab->buckets[i]; rcu_read_lock(); head = &b->head; - hlist_nulls_for_each_entry_rcu(elem, n, head, hash_node) { + hlist_nulls_for_each_entry_safe(elem, n, head, hash_node) { key = elem->key; if (is_percpu) { /* current cpu value for percpu map */ From aa8687470a08296b214ae971fdb5dd45b35db860 Mon Sep 17 00:00:00 2001 From: Haoran Jiang Date: Fri, 25 Apr 2025 17:50:42 +0800 Subject: [PATCH 214/529] samples/bpf: Fix compilation failure for samples/bpf on LoongArch Fedora [ Upstream commit 548762f05d19c5542db7590bcdfb9be1fb928376 ] When building the latest samples/bpf on LoongArch Fedora make M=samples/bpf There are compilation errors as follows: In file included from ./linux/samples/bpf/sockex2_kern.c:2: In file included from ./include/uapi/linux/in.h:25: In file included from ./include/linux/socket.h:8: In file included from ./include/linux/uio.h:9: In file included from ./include/linux/thread_info.h:60: In file included from ./arch/loongarch/include/asm/thread_info.h:15: In file included from ./arch/loongarch/include/asm/processor.h:13: In file included from ./arch/loongarch/include/asm/cpu-info.h:11: ./arch/loongarch/include/asm/loongarch.h:13:10: fatal error: 'larchintrin.h' file not found ^~~~~~~~~~~~~~~ 1 error generated. larchintrin.h is included in /usr/lib64/clang/14.0.6/include, and the header file location is specified at compile time. Test on LoongArch Fedora: https://github.com/fedora-remix-loongarch/releases-info Signed-off-by: Haoran Jiang Signed-off-by: zhangxi Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250425095042.838824-1-jianghaoran@kylinos.cn Signed-off-by: Sasha Levin --- samples/bpf/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 727da3c5879b..77bf18cfdae7 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -434,7 +434,7 @@ $(obj)/%.o: $(src)/%.c @echo " CLANG-bpf " $@ $(Q)$(CLANG) $(NOSTDINC_FLAGS) $(LINUXINCLUDE) $(BPF_EXTRA_CFLAGS) \ -I$(obj) -I$(srctree)/tools/testing/selftests/bpf/ \ - -I$(LIBBPF_INCLUDE) \ + -I$(LIBBPF_INCLUDE) $(CLANG_SYS_INCLUDES) \ -D__KERNEL__ -D__BPF_TRACING__ -Wno-unused-value -Wno-pointer-sign \ -D__TARGET_ARCH_$(SRCARCH) -Wno-compare-distinct-pointer-types \ -Wno-gnu-variable-sized-type-not-at-end \ From 53dd81d7ba723aaf8cae304eb5474bd6bbda21bd Mon Sep 17 00:00:00 2001 From: Daniel Gomez Date: Fri, 28 Mar 2025 14:28:37 +0000 Subject: [PATCH 215/529] kconfig: merge_config: use an empty file as initfile [ Upstream commit a26fe287eed112b4e21e854f173c8918a6a8596d ] The scripts/kconfig/merge_config.sh script requires an existing $INITFILE (or the $1 argument) as a base file for merging Kconfig fragments. However, an empty $INITFILE can serve as an initial starting point, later referenced by the KCONFIG_ALLCONFIG Makefile variable if -m is not used. This variable can point to any configuration file containing preset config symbols (the merged output) as stated in Documentation/kbuild/kconfig.rst. When -m is used $INITFILE will contain just the merge output requiring the user to run make (i.e. KCONFIG_ALLCONFIG=<$INITFILE> make or make olddefconfig). Instead of failing when `$INITFILE` is missing, create an empty file and use it as the starting point for merges. Signed-off-by: Daniel Gomez Signed-off-by: Masahiro Yamada Signed-off-by: Sasha Levin --- scripts/kconfig/merge_config.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/kconfig/merge_config.sh b/scripts/kconfig/merge_config.sh index 72da3b8d6f30..151f9938abaa 100755 --- a/scripts/kconfig/merge_config.sh +++ b/scripts/kconfig/merge_config.sh @@ -105,8 +105,8 @@ INITFILE=$1 shift; if [ ! -r "$INITFILE" ]; then - echo "The base file '$INITFILE' does not exist. Exit." >&2 - exit 1 + echo "The base file '$INITFILE' does not exist. Creating one..." >&2 + touch "$INITFILE" fi MERGE_LIST=$* From 8696f0e4f6d908f3ace61920a291070b63419419 Mon Sep 17 00:00:00 2001 From: Anthony Krowiak Date: Tue, 11 Mar 2025 06:32:57 -0400 Subject: [PATCH 216/529] s390/vfio-ap: Fix no AP queue sharing allowed message written to kernel log [ Upstream commit d33d729afcc8ad2148d99f9bc499b33fd0c0d73b ] An erroneous message is written to the kernel log when either of the following actions are taken by a user: 1. Assign an adapter or domain to a vfio_ap mediated device via its sysfs assign_adapter or assign_domain attributes that would result in one or more AP queues being assigned that are already assigned to a different mediated device. Sharing of queues between mdevs is not allowed. 2. Reserve an adapter or domain for the host device driver via the AP bus driver's sysfs apmask or aqmask attribute that would result in providing host access to an AP queue that is in use by a vfio_ap mediated device. Reserving a queue for a host driver that is in use by an mdev is not allowed. In both cases, the assignment will return an error; however, a message like the following is written to the kernel log: vfio_ap_mdev e1839397-51a0-4e3c-91e0-c3b9c3d3047d: Userspace may not re-assign queue 00.0028 already assigned to \ e1839397-51a0-4e3c-91e0-c3b9c3d3047d Notice the mdev reporting the error is the same as the mdev identified in the message as the one to which the queue is being assigned. It is perfectly okay to assign a queue to an mdev to which it is already assigned; the assignment is simply ignored by the vfio_ap device driver. This patch logs more descriptive and accurate messages for both 1 and 2 above to the kernel log: Example for 1: vfio_ap_mdev 0fe903a0-a323-44db-9daf-134c68627d61: Userspace may not assign queue 00.0033 to mdev: already assigned to \ 62177883-f1bb-47f0-914d-32a22e3a8804 Example for 2: vfio_ap_mdev 62177883-f1bb-47f0-914d-32a22e3a8804: Can not reserve queue 00.0033 for host driver: in use by mdev Signed-off-by: Anthony Krowiak Link: https://lore.kernel.org/r/20250311103304.1539188-1-akrowiak@linux.ibm.com Signed-off-by: Heiko Carstens Signed-off-by: Vasily Gorbik Signed-off-by: Sasha Levin --- drivers/s390/crypto/vfio_ap_ops.c | 72 ++++++++++++++++++++----------- 1 file changed, 46 insertions(+), 26 deletions(-) diff --git a/drivers/s390/crypto/vfio_ap_ops.c b/drivers/s390/crypto/vfio_ap_ops.c index 86a8bd532489..11fe917fbd9d 100644 --- a/drivers/s390/crypto/vfio_ap_ops.c +++ b/drivers/s390/crypto/vfio_ap_ops.c @@ -789,48 +789,66 @@ static void vfio_ap_mdev_remove(struct mdev_device *mdev) vfio_put_device(&matrix_mdev->vdev); } -#define MDEV_SHARING_ERR "Userspace may not re-assign queue %02lx.%04lx " \ - "already assigned to %s" +#define MDEV_SHARING_ERR "Userspace may not assign queue %02lx.%04lx to mdev: already assigned to %s" -static void vfio_ap_mdev_log_sharing_err(struct ap_matrix_mdev *matrix_mdev, - unsigned long *apm, - unsigned long *aqm) +#define MDEV_IN_USE_ERR "Can not reserve queue %02lx.%04lx for host driver: in use by mdev" + +static void vfio_ap_mdev_log_sharing_err(struct ap_matrix_mdev *assignee, + struct ap_matrix_mdev *assigned_to, + unsigned long *apm, unsigned long *aqm) { unsigned long apid, apqi; - const struct device *dev = mdev_dev(matrix_mdev->mdev); - const char *mdev_name = dev_name(dev); - for_each_set_bit_inv(apid, apm, AP_DEVICES) + for_each_set_bit_inv(apid, apm, AP_DEVICES) { + for_each_set_bit_inv(apqi, aqm, AP_DOMAINS) { + dev_warn(mdev_dev(assignee->mdev), MDEV_SHARING_ERR, + apid, apqi, dev_name(mdev_dev(assigned_to->mdev))); + } + } +} + +static void vfio_ap_mdev_log_in_use_err(struct ap_matrix_mdev *assignee, + unsigned long *apm, unsigned long *aqm) +{ + unsigned long apid, apqi; + + for_each_set_bit_inv(apid, apm, AP_DEVICES) { for_each_set_bit_inv(apqi, aqm, AP_DOMAINS) - dev_warn(dev, MDEV_SHARING_ERR, apid, apqi, mdev_name); + dev_warn(mdev_dev(assignee->mdev), MDEV_IN_USE_ERR, apid, apqi); + } } /** * vfio_ap_mdev_verify_no_sharing - verify APQNs are not shared by matrix mdevs * + * @assignee: the matrix mdev to which @mdev_apm and @mdev_aqm are being + * assigned; or, NULL if this function was called by the AP bus + * driver in_use callback to verify none of the APQNs being reserved + * for the host device driver are in use by a vfio_ap mediated device * @mdev_apm: mask indicating the APIDs of the APQNs to be verified * @mdev_aqm: mask indicating the APQIs of the APQNs to be verified * - * Verifies that each APQN derived from the Cartesian product of a bitmap of - * AP adapter IDs and AP queue indexes is not configured for any matrix - * mediated device. AP queue sharing is not allowed. + * Verifies that each APQN derived from the Cartesian product of APIDs + * represented by the bits set in @mdev_apm and the APQIs of the bits set in + * @mdev_aqm is not assigned to a mediated device other than the mdev to which + * the APQN is being assigned (@assignee). AP queue sharing is not allowed. * * Return: 0 if the APQNs are not shared; otherwise return -EADDRINUSE. */ -static int vfio_ap_mdev_verify_no_sharing(unsigned long *mdev_apm, +static int vfio_ap_mdev_verify_no_sharing(struct ap_matrix_mdev *assignee, + unsigned long *mdev_apm, unsigned long *mdev_aqm) { - struct ap_matrix_mdev *matrix_mdev; + struct ap_matrix_mdev *assigned_to; DECLARE_BITMAP(apm, AP_DEVICES); DECLARE_BITMAP(aqm, AP_DOMAINS); - list_for_each_entry(matrix_mdev, &matrix_dev->mdev_list, node) { + list_for_each_entry(assigned_to, &matrix_dev->mdev_list, node) { /* - * If the input apm and aqm are fields of the matrix_mdev - * object, then move on to the next matrix_mdev. + * If the mdev to which the mdev_apm and mdev_aqm is being + * assigned is the same as the mdev being verified */ - if (mdev_apm == matrix_mdev->matrix.apm && - mdev_aqm == matrix_mdev->matrix.aqm) + if (assignee == assigned_to) continue; memset(apm, 0, sizeof(apm)); @@ -840,15 +858,16 @@ static int vfio_ap_mdev_verify_no_sharing(unsigned long *mdev_apm, * We work on full longs, as we can only exclude the leftover * bits in non-inverse order. The leftover is all zeros. */ - if (!bitmap_and(apm, mdev_apm, matrix_mdev->matrix.apm, - AP_DEVICES)) + if (!bitmap_and(apm, mdev_apm, assigned_to->matrix.apm, AP_DEVICES)) continue; - if (!bitmap_and(aqm, mdev_aqm, matrix_mdev->matrix.aqm, - AP_DOMAINS)) + if (!bitmap_and(aqm, mdev_aqm, assigned_to->matrix.aqm, AP_DOMAINS)) continue; - vfio_ap_mdev_log_sharing_err(matrix_mdev, apm, aqm); + if (assignee) + vfio_ap_mdev_log_sharing_err(assignee, assigned_to, apm, aqm); + else + vfio_ap_mdev_log_in_use_err(assigned_to, apm, aqm); return -EADDRINUSE; } @@ -877,7 +896,8 @@ static int vfio_ap_mdev_validate_masks(struct ap_matrix_mdev *matrix_mdev) matrix_mdev->matrix.aqm)) return -EADDRNOTAVAIL; - return vfio_ap_mdev_verify_no_sharing(matrix_mdev->matrix.apm, + return vfio_ap_mdev_verify_no_sharing(matrix_mdev, + matrix_mdev->matrix.apm, matrix_mdev->matrix.aqm); } @@ -1945,7 +1965,7 @@ int vfio_ap_mdev_resource_in_use(unsigned long *apm, unsigned long *aqm) mutex_lock(&matrix_dev->guests_lock); mutex_lock(&matrix_dev->mdevs_lock); - ret = vfio_ap_mdev_verify_no_sharing(apm, aqm); + ret = vfio_ap_mdev_verify_no_sharing(NULL, apm, aqm); mutex_unlock(&matrix_dev->mdevs_lock); mutex_unlock(&matrix_dev->guests_lock); From 17e53a15e64b65623b8f2b1185d27d7b1cbf69ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 9 Dec 2024 20:44:23 +0100 Subject: [PATCH 217/529] cifs: Add fallback for SMB2 CREATE without FILE_READ_ATTRIBUTES MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e255612b5ed9f179abe8196df7c2ba09dd227900 ] Some operations, like WRITE, does not require FILE_READ_ATTRIBUTES access. So when FILE_READ_ATTRIBUTES is not explicitly requested for smb2_open_file() then first try to do SMB2 CREATE with FILE_READ_ATTRIBUTES access (like it was before) and then fallback to SMB2 CREATE without FILE_READ_ATTRIBUTES access (less common case). This change allows to complete WRITE operation to a file when it does not grant FILE_READ_ATTRIBUTES permission and its parent directory does not grant READ_DATA permission (parent directory READ_DATA is implicit grant of child FILE_READ_ATTRIBUTES permission). Signed-off-by: Pali Rohár Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/client/smb2file.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c index a7475bc05cac..afdc78e92ee9 100644 --- a/fs/smb/client/smb2file.c +++ b/fs/smb/client/smb2file.c @@ -108,16 +108,25 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 int err_buftype = CIFS_NO_BUFFER; struct cifs_fid *fid = oparms->fid; struct network_resiliency_req nr_ioctl_req; + bool retry_without_read_attributes = false; smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb); if (smb2_path == NULL) return -ENOMEM; - oparms->desired_access |= FILE_READ_ATTRIBUTES; + if (!(oparms->desired_access & FILE_READ_ATTRIBUTES)) { + oparms->desired_access |= FILE_READ_ATTRIBUTES; + retry_without_read_attributes = true; + } smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH; rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, &err_buftype); + if (rc == -EACCES && retry_without_read_attributes) { + oparms->desired_access &= ~FILE_READ_ATTRIBUTES; + rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov, + &err_buftype); + } if (rc && data) { struct smb2_hdr *hdr = err_iov.iov_base; From 5d736eee3c488ffd357415d1ab6f8b2f9ffafda8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 28 Dec 2024 21:09:54 +0100 Subject: [PATCH 218/529] cifs: Fix querying and creating MF symlinks over SMB1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 4236ac9fe5b8b42756070d4abfb76fed718e87c2 ] Old SMB1 servers without CAP_NT_SMBS do not support CIFS_open() function and instead SMBLegacyOpen() needs to be used. This logic is already handled in cifs_open_file() function, which is server->ops->open callback function. So for querying and creating MF symlinks use open callback function instead of CIFS_open() function directly. This change fixes querying and creating new MF symlinks on Windows 98. Currently cifs_query_mf_symlink() is not able to detect MF symlink and cifs_create_mf_symlink() is failing with EIO error. Signed-off-by: Pali Rohár Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/client/link.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index c0f101fc1e5d..d71feb3fdbd2 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -269,7 +269,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; int buf_type = CIFS_NO_BUFFER; - FILE_ALL_INFO file_info; + struct cifs_open_info_data query_data; oparms = (struct cifs_open_parms) { .tcon = tcon, @@ -281,11 +281,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, &file_info); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data); if (rc) return rc; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { rc = -ENOENT; /* it's not a symlink */ goto out; @@ -324,7 +324,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, NULL); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; From b413cf7ff5a164537fb3710b25daa0481ee9f135 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Sat, 2 Nov 2024 20:06:50 +0100 Subject: [PATCH 219/529] cifs: Fix negotiate retry functionality MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e94e882a6d69525c07589222cf3a6ff57ad12b5b ] SMB negotiate retry functionality in cifs_negotiate() is currently broken and does not work when doing socket reconnect. Caller of this function, which is cifs_negotiate_protocol() requires that tcpStatus after successful execution of negotiate callback stay in CifsInNegotiate. But if the CIFSSMBNegotiate() called from cifs_negotiate() fails due to connection issues then tcpStatus is changed as so repeated CIFSSMBNegotiate() call does not help. Fix this problem by moving retrying code from negotiate callback (which is either cifs_negotiate() or smb2_negotiate()) to cifs_negotiate_protocol() which is caller of those callbacks. This allows to properly handle and implement correct transistions between tcpStatus states as function cifs_negotiate_protocol() already handles it. With this change, cifs_negotiate_protocol() now handles also -EAGAIN error set by the RFC1002_NEGATIVE_SESSION_RESPONSE processing after reconnecting with NetBIOS session. Signed-off-by: Pali Rohár Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/client/connect.c | 10 ++++++++++ fs/smb/client/smb1ops.c | 7 ------- fs/smb/client/smb2ops.c | 3 --- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 6aeb25006db8..0baa64c48c3d 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -4178,11 +4178,13 @@ int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, struct TCP_Server_Info *server) { + bool in_retry = false; int rc = 0; if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; +retry: /* only send once per connect */ spin_lock(&server->srv_lock); if (server->tcpStatus != CifsGood && @@ -4202,6 +4204,14 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, spin_unlock(&server->srv_lock); rc = server->ops->negotiate(xid, ses, server); + if (rc == -EAGAIN) { + /* Allow one retry attempt */ + if (!in_retry) { + in_retry = true; + goto retry; + } + rc = -EHOSTDOWN; + } if (rc == 0) { spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate) diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index 225cc7e0304c..1489b9d21b60 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -426,13 +426,6 @@ cifs_negotiate(const unsigned int xid, { int rc; rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) { - /* retry only once on 1st time connection */ - set_credits(server, 1); - rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) - rc = -EHOSTDOWN; - } return rc; } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index a62f3e5a7689..c9b9892b510e 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -422,9 +422,6 @@ smb2_negotiate(const unsigned int xid, server->CurrentMid = 0; spin_unlock(&server->mid_lock); rc = SMB2_negotiate(xid, ses, server); - /* BB we probably don't need to retry with modern servers */ - if (rc == -EAGAIN) - rc = -EHOSTDOWN; return rc; } From f706cb4e370322273c20e30f10750f88553af135 Mon Sep 17 00:00:00 2001 From: Matt Johnston Date: Fri, 14 Feb 2025 09:17:53 +0800 Subject: [PATCH 220/529] fuse: Return EPERM rather than ENOSYS from link() [ Upstream commit 8344213571b2ac8caf013cfd3b37bc3467c3a893 ] link() is documented to return EPERM when a filesystem doesn't support the operation, return that instead. Link: https://github.com/libfuse/libfuse/issues/925 Signed-off-by: Matt Johnston Signed-off-by: Miklos Szeredi Signed-off-by: Sasha Levin --- fs/fuse/dir.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index c431abbf48e6..0dbacdd7bb0d 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -1068,6 +1068,8 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, else if (err == -EINTR) fuse_invalidate_attr(inode); + if (err == -ENOSYS) + err = -EPERM; return err; } From 429d487d940b470e81e094b347b59a9a6a7ef54f Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 27 Mar 2025 19:20:53 -0400 Subject: [PATCH 221/529] NFSv4: Check for delegation validity in nfs_start_delegation_return_locked() [ Upstream commit 9e8f324bd44c1fe026b582b75213de4eccfa1163 ] Check that the delegation is still attached after taking the spin lock in nfs_start_delegation_return_locked(). Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/delegation.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 6363bbc37f42..17b38da17288 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -297,7 +297,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi) if (delegation == NULL) goto out; spin_lock(&delegation->lock); - if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { + if (delegation->inode && + !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) { clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags); /* Refcount matched in nfs_end_delegation_return() */ ret = nfs_get_delegation(delegation); From 2bd25a7916a48e0266e82a11f1f0ed9b4500d056 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Mar 2025 13:19:18 -0400 Subject: [PATCH 222/529] NFS: Don't allow waiting for exiting tasks [ Upstream commit 8d3ca331026a7f9700d3747eed59a67b8f828cdc ] Once a task calls exit_signals() it can no longer be signalled. So do not allow it to do killable waits. Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/inode.c | 2 ++ fs/nfs/internal.h | 5 +++++ fs/nfs/nfs3proc.c | 2 +- fs/nfs/nfs4proc.c | 9 +++++++-- 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 964df0725f4c..f2e66b946f4b 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr) int nfs_wait_bit_killable(struct wait_bit_key *key, int mode) { + if (unlikely(nfs_current_task_exiting())) + return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index ece517ebcca0..84361674bffc 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -832,6 +832,11 @@ static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid) NFS4_STATEID_OTHER_SIZE); } +static inline bool nfs_current_task_exiting(void) +{ + return (current->flags & PF_EXITING) != 0; +} + static inline bool nfs_error_is_fatal(int err) { switch (err) { diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 2e7579626cf0..f036d30f7515 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags) __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(NFS_JUKEBOX_RETRY_TIME); res = -ERESTARTSYS; - } while (!fatal_signal_pending(current)); + } while (!fatal_signal_pending(current) && !nfs_current_task_exiting()); return res; } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index acef50824d1a..0f28607c5747 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -422,6 +422,8 @@ static int nfs4_delay_killable(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!__fatal_signal_pending(current)) @@ -433,6 +435,8 @@ static int nfs4_delay_interruptible(long *timeout) { might_sleep(); + if (unlikely(nfs_current_task_exiting())) + return -EINTR; __set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE); schedule_timeout(nfs4_update_delay(timeout)); if (!signal_pending(current)) @@ -1712,7 +1716,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, rcu_read_unlock(); trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0); - if (!fatal_signal_pending(current)) { + if (!fatal_signal_pending(current) && + !nfs_current_task_exiting()) { if (schedule_timeout(5*HZ) == 0) status = -EAGAIN; else @@ -3500,7 +3505,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst, write_sequnlock(&state->seqlock); trace_nfs4_close_stateid_update_wait(state->inode, dst, 0); - if (fatal_signal_pending(current)) + if (fatal_signal_pending(current) || nfs_current_task_exiting()) status = -EINTR; else if (schedule_timeout(5*HZ) != 0) From 0b99bcbd6276d81df7dcfda9fa1a5ff1cd067416 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 28 Mar 2025 12:52:52 -0400 Subject: [PATCH 223/529] SUNRPC: Don't allow waiting for exiting tasks [ Upstream commit 14e41b16e8cb677bb440dca2edba8b041646c742 ] Once a task calls exit_signals() it can no longer be signalled. So do not allow it to do killable waits. Reviewed-by: Jeff Layton Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- net/sunrpc/sched.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 9b45fbdc90ca..73bc39281ef5 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -276,6 +276,8 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { + if (unlikely(current->flags & PF_EXITING)) + return -EINTR; schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; From cdfe09355cfd7befa3d907653cff4b11232524b8 Mon Sep 17 00:00:00 2001 From: Jinqian Yang Date: Tue, 25 Mar 2025 22:19:00 +0800 Subject: [PATCH 224/529] arm64: Add support for HIP09 Spectre-BHB mitigation [ Upstream commit e18c09b204e81702ea63b9f1a81ab003b72e3174 ] The HIP09 processor is vulnerable to the Spectre-BHB (Branch History Buffer) attack, which can be exploited to leak information through branch prediction side channels. This commit adds the MIDR of HIP09 to the list for software mitigation. Signed-off-by: Jinqian Yang Link: https://lore.kernel.org/r/20250325141900.2057314-1-yangjinqian1@huawei.com Signed-off-by: Catalin Marinas Signed-off-by: Sasha Levin --- arch/arm64/include/asm/cputype.h | 2 ++ arch/arm64/kernel/proton-pack.c | 1 + 2 files changed, 3 insertions(+) diff --git a/arch/arm64/include/asm/cputype.h b/arch/arm64/include/asm/cputype.h index fe022fe2d4f6..41612b03af63 100644 --- a/arch/arm64/include/asm/cputype.h +++ b/arch/arm64/include/asm/cputype.h @@ -132,6 +132,7 @@ #define FUJITSU_CPU_PART_A64FX 0x001 #define HISI_CPU_PART_TSV110 0xD01 +#define HISI_CPU_PART_HIP09 0xD02 #define APPLE_CPU_PART_M1_ICESTORM 0x022 #define APPLE_CPU_PART_M1_FIRESTORM 0x023 @@ -201,6 +202,7 @@ #define MIDR_NVIDIA_CARMEL MIDR_CPU_MODEL(ARM_CPU_IMP_NVIDIA, NVIDIA_CPU_PART_CARMEL) #define MIDR_FUJITSU_A64FX MIDR_CPU_MODEL(ARM_CPU_IMP_FUJITSU, FUJITSU_CPU_PART_A64FX) #define MIDR_HISI_TSV110 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_TSV110) +#define MIDR_HISI_HIP09 MIDR_CPU_MODEL(ARM_CPU_IMP_HISI, HISI_CPU_PART_HIP09) #define MIDR_APPLE_M1_ICESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM) #define MIDR_APPLE_M1_FIRESTORM MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_FIRESTORM) #define MIDR_APPLE_M1_ICESTORM_PRO MIDR_CPU_MODEL(ARM_CPU_IMP_APPLE, APPLE_CPU_PART_M1_ICESTORM_PRO) diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index fcc641f30c93..4978c466e325 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -916,6 +916,7 @@ static u8 spectre_bhb_loop_affected(void) MIDR_ALL_VERSIONS(MIDR_CORTEX_A77), MIDR_ALL_VERSIONS(MIDR_NEOVERSE_N1), MIDR_ALL_VERSIONS(MIDR_QCOM_KRYO_4XX_GOLD), + MIDR_ALL_VERSIONS(MIDR_HISI_HIP09), {}, }; static const struct midr_range spectre_bhb_k11_list[] = { From 6707f9749deb9d804cf64b3027480d698424f4fd Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Fri, 21 Mar 2025 16:40:49 +0200 Subject: [PATCH 225/529] tracing: Mark binary printing functions with __printf() attribute MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 196a062641fe68d9bfe0ad36b6cd7628c99ad22c ] Binary printing functions are using printf() type of format, and compiler is not happy about them as is: kernel/trace/trace.c:3292:9: error: function ‘trace_vbprintk’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] kernel/trace/trace_seq.c:182:9: error: function ‘trace_seq_bprintf’ might be a candidate for ‘gnu_printf’ format attribute [-Werror=suggest-attribute=format] Fix the compilation errors by adding __printf() attribute. While at it, move existing __printf() attributes from the implementations to the declarations. IT also fixes incorrect attribute parameters that are used for trace_array_printk(). Signed-off-by: Andy Shevchenko Reviewed-by: Kees Cook Reviewed-by: Petr Mladek Link: https://lore.kernel.org/r/20250321144822.324050-4-andriy.shevchenko@linux.intel.com Signed-off-by: Petr Mladek Signed-off-by: Sasha Levin --- include/linux/trace.h | 4 ++-- include/linux/trace_seq.h | 8 ++++---- kernel/trace/trace.c | 11 +++-------- kernel/trace/trace.h | 16 +++++++++------- 4 files changed, 18 insertions(+), 21 deletions(-) diff --git a/include/linux/trace.h b/include/linux/trace.h index 2a70a447184c..bb4d84f1c58c 100644 --- a/include/linux/trace.h +++ b/include/linux/trace.h @@ -72,8 +72,8 @@ static inline int unregister_ftrace_export(struct trace_export *export) static inline void trace_printk_init_buffers(void) { } -static inline int trace_array_printk(struct trace_array *tr, unsigned long ip, - const char *fmt, ...) +static inline __printf(3, 4) +int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { return 0; } diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 5a2c650d9e1c..c230cbd25aee 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -77,8 +77,8 @@ extern __printf(2, 3) void trace_seq_printf(struct trace_seq *s, const char *fmt, ...); extern __printf(2, 0) void trace_seq_vprintf(struct trace_seq *s, const char *fmt, va_list args); -extern void -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern __printf(2, 0) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int trace_print_seq(struct seq_file *m, struct trace_seq *s); extern int trace_seq_to_user(struct trace_seq *s, char __user *ubuf, int cnt); @@ -100,8 +100,8 @@ extern int trace_seq_hex_dump(struct trace_seq *s, const char *prefix_str, static inline void trace_seq_printf(struct trace_seq *s, const char *fmt, ...) { } -static inline void -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +static inline __printf(2, 0) +void trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9da29583dfbc..9e0b9c9a7dff 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3422,10 +3422,9 @@ out_nobuffer: } EXPORT_SYMBOL_GPL(trace_vbprintk); -__printf(3, 0) -static int -__trace_array_vprintk(struct trace_buffer *buffer, - unsigned long ip, const char *fmt, va_list args) +static __printf(3, 0) +int __trace_array_vprintk(struct trace_buffer *buffer, + unsigned long ip, const char *fmt, va_list args) { struct trace_event_call *call = &event_print; struct ring_buffer_event *event; @@ -3478,7 +3477,6 @@ out_nobuffer: return len; } -__printf(3, 0) int trace_array_vprintk(struct trace_array *tr, unsigned long ip, const char *fmt, va_list args) { @@ -3505,7 +3503,6 @@ int trace_array_vprintk(struct trace_array *tr, * Note, trace_array_init_printk() must be called on @tr before this * can be used. */ -__printf(3, 0) int trace_array_printk(struct trace_array *tr, unsigned long ip, const char *fmt, ...) { @@ -3550,7 +3547,6 @@ int trace_array_init_printk(struct trace_array *tr) } EXPORT_SYMBOL_GPL(trace_array_init_printk); -__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...) { @@ -3566,7 +3562,6 @@ int trace_array_printk_buf(struct trace_buffer *buffer, return ret; } -__printf(2, 0) int trace_vprintk(unsigned long ip, const char *fmt, va_list args) { return trace_array_vprintk(&global_trace, ip, fmt, args); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index aad7fcd84617..49b297ca7fc7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -780,13 +780,15 @@ static inline void __init disable_tracing_selftest(const char *reason) extern void *head_page(struct trace_array_cpu *data); extern unsigned long long ns2usecs(u64 nsec); -extern int -trace_vbprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_vprintk(unsigned long ip, const char *fmt, va_list args); -extern int -trace_array_vprintk(struct trace_array *tr, - unsigned long ip, const char *fmt, va_list args); + +__printf(2, 0) +int trace_vbprintk(unsigned long ip, const char *fmt, va_list args); +__printf(2, 0) +int trace_vprintk(unsigned long ip, const char *fmt, va_list args); +__printf(3, 0) +int trace_array_vprintk(struct trace_array *tr, + unsigned long ip, const char *fmt, va_list args); +__printf(3, 4) int trace_array_printk_buf(struct trace_buffer *buffer, unsigned long ip, const char *fmt, ...); void trace_printk_seq(struct trace_seq *s); From d8aea4921f3fb6baca1c29b55f938f5e2d11e6e7 Mon Sep 17 00:00:00 2001 From: Tudor Ambarus Date: Mon, 24 Feb 2025 08:27:13 +0000 Subject: [PATCH 226/529] mailbox: use error ret code of of_parse_phandle_with_args() [ Upstream commit 24fdd5074b205cfb0ef4cd0751a2d03031455929 ] In case of error, of_parse_phandle_with_args() returns -EINVAL when the passed index is negative, or -ENOENT when the index is for an empty phandle. The mailbox core overwrote the error return code with a less precise -ENODEV. Use the error returned code from of_parse_phandle_with_args(). Signed-off-by: Tudor Ambarus Signed-off-by: Jassi Brar Signed-off-by: Sasha Levin --- drivers/mailbox/mailbox.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/mailbox/mailbox.c b/drivers/mailbox/mailbox.c index 4229b9b5da98..6f54501dc776 100644 --- a/drivers/mailbox/mailbox.c +++ b/drivers/mailbox/mailbox.c @@ -350,11 +350,12 @@ struct mbox_chan *mbox_request_channel(struct mbox_client *cl, int index) mutex_lock(&con_mutex); - if (of_parse_phandle_with_args(dev->of_node, "mboxes", - "#mbox-cells", index, &spec)) { + ret = of_parse_phandle_with_args(dev->of_node, "mboxes", "#mbox-cells", + index, &spec); + if (ret) { dev_dbg(dev, "%s: can't parse \"mboxes\" property\n", __func__); mutex_unlock(&con_mutex); - return ERR_PTR(-ENODEV); + return ERR_PTR(ret); } chan = ERR_PTR(-EPROBE_DEFER); From af6e4ccb08823293c51523c9c6b201a48e0a5753 Mon Sep 17 00:00:00 2001 From: Shixiong Ou Date: Mon, 10 Mar 2025 09:54:31 +0800 Subject: [PATCH 227/529] fbdev: fsl-diu-fb: add missing device_remove_file() [ Upstream commit 86d16cd12efa547ed43d16ba7a782c1251c80ea8 ] Call device_remove_file() when driver remove. Signed-off-by: Shixiong Ou Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/fsl-diu-fb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/video/fbdev/fsl-diu-fb.c b/drivers/video/fbdev/fsl-diu-fb.c index ce3c5b0b8f4e..53be4ab374cc 100644 --- a/drivers/video/fbdev/fsl-diu-fb.c +++ b/drivers/video/fbdev/fsl-diu-fb.c @@ -1829,6 +1829,7 @@ static int fsl_diu_remove(struct platform_device *pdev) int i; data = dev_get_drvdata(&pdev->dev); + device_remove_file(&pdev->dev, &data->dev_attr); disable_lcdc(&data->fsl_diu_info[0]); free_irq(data->irq, data->diu_reg); From 57aa1e8197781dc82c7ca7d6e3d289420baffbf7 Mon Sep 17 00:00:00 2001 From: Zsolt Kajtar Date: Sun, 2 Feb 2025 21:33:46 +0100 Subject: [PATCH 228/529] fbcon: Use correct erase colour for clearing in fbcon [ Upstream commit 892c788d73fe4a94337ed092cb998c49fa8ecaf4 ] The erase colour calculation for fbcon clearing should use get_color instead of attr_col_ec, like everything else. The latter is similar but is not correct. For example it's missing the depth dependent remapping and doesn't care about blanking. The problem can be reproduced by setting up the background colour to grey (vt.color=0x70) and having an fbcon console set to 2bpp (4 shades of gray). Now the background attribute should be 1 (dark gray) on the console. If the screen is scrolled when pressing enter in a shell prompt at the bottom line then the new line is cleared using colour 7 instead of 1. That's not something fillrect likes (at 2bbp it expect 0-3) so the result is interesting. This patch switches to get_color with vc_video_erase_char to determine the erase colour from attr_col_ec. That makes the latter function redundant as no other users were left. Use correct erase colour for clearing in fbcon Signed-off-by: Zsolt Kajtar Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/core/bitblit.c | 5 ++-- drivers/video/fbdev/core/fbcon.c | 10 +++++--- drivers/video/fbdev/core/fbcon.h | 38 +--------------------------- drivers/video/fbdev/core/fbcon_ccw.c | 5 ++-- drivers/video/fbdev/core/fbcon_cw.c | 5 ++-- drivers/video/fbdev/core/fbcon_ud.c | 5 ++-- drivers/video/fbdev/core/tileblit.c | 8 +++--- 7 files changed, 18 insertions(+), 58 deletions(-) diff --git a/drivers/video/fbdev/core/bitblit.c b/drivers/video/fbdev/core/bitblit.c index 8587c9da0670..42e681a78136 100644 --- a/drivers/video/fbdev/core/bitblit.c +++ b/drivers/video/fbdev/core/bitblit.c @@ -59,12 +59,11 @@ static void bit_bmove(struct vc_data *vc, struct fb_info *info, int sy, } static void bit_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) + int sx, int height, int width, int fg, int bg) { - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; struct fb_fillrect region; - region.color = attr_bgcol_ec(bgshift, vc, info); + region.color = bg; region.dx = sx * vc->vc_font.width; region.dy = sy * vc->vc_font.height; region.width = width * vc->vc_font.width; diff --git a/drivers/video/fbdev/core/fbcon.c b/drivers/video/fbdev/core/fbcon.c index e6640edec155..538e932055ca 100644 --- a/drivers/video/fbdev/core/fbcon.c +++ b/drivers/video/fbdev/core/fbcon.c @@ -1240,7 +1240,7 @@ static void fbcon_clear(struct vc_data *vc, int sy, int sx, int height, { struct fb_info *info = fbcon_info_from_console(vc->vc_num); struct fbcon_ops *ops = info->fbcon_par; - + int fg, bg; struct fbcon_display *p = &fb_display[vc->vc_num]; u_int y_break; @@ -1261,16 +1261,18 @@ static void fbcon_clear(struct vc_data *vc, int sy, int sx, int height, fbcon_clear_margins(vc, 0); } + fg = get_color(vc, info, vc->vc_video_erase_char, 1); + bg = get_color(vc, info, vc->vc_video_erase_char, 0); /* Split blits that cross physical y_wrap boundary */ y_break = p->vrows - p->yscroll; if (sy < y_break && sy + height - 1 >= y_break) { u_int b = y_break - sy; - ops->clear(vc, info, real_y(p, sy), sx, b, width); + ops->clear(vc, info, real_y(p, sy), sx, b, width, fg, bg); ops->clear(vc, info, real_y(p, sy + b), sx, height - b, - width); + width, fg, bg); } else - ops->clear(vc, info, real_y(p, sy), sx, height, width); + ops->clear(vc, info, real_y(p, sy), sx, height, width, fg, bg); } static void fbcon_putcs(struct vc_data *vc, const unsigned short *s, diff --git a/drivers/video/fbdev/core/fbcon.h b/drivers/video/fbdev/core/fbcon.h index 0eaf54a21151..25691d4b027b 100644 --- a/drivers/video/fbdev/core/fbcon.h +++ b/drivers/video/fbdev/core/fbcon.h @@ -55,7 +55,7 @@ struct fbcon_ops { void (*bmove)(struct vc_data *vc, struct fb_info *info, int sy, int sx, int dy, int dx, int height, int width); void (*clear)(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width); + int sx, int height, int width, int fb, int bg); void (*putcs)(struct vc_data *vc, struct fb_info *info, const unsigned short *s, int count, int yy, int xx, int fg, int bg); @@ -116,42 +116,6 @@ static inline int mono_col(const struct fb_info *info) return (~(0xfff << max_len)) & 0xff; } -static inline int attr_col_ec(int shift, struct vc_data *vc, - struct fb_info *info, int is_fg) -{ - int is_mono01; - int col; - int fg; - int bg; - - if (!vc) - return 0; - - if (vc->vc_can_do_color) - return is_fg ? attr_fgcol(shift,vc->vc_video_erase_char) - : attr_bgcol(shift,vc->vc_video_erase_char); - - if (!info) - return 0; - - col = mono_col(info); - is_mono01 = info->fix.visual == FB_VISUAL_MONO01; - - if (attr_reverse(vc->vc_video_erase_char)) { - fg = is_mono01 ? col : 0; - bg = is_mono01 ? 0 : col; - } - else { - fg = is_mono01 ? 0 : col; - bg = is_mono01 ? col : 0; - } - - return is_fg ? fg : bg; -} - -#define attr_bgcol_ec(bgshift, vc, info) attr_col_ec(bgshift, vc, info, 0) -#define attr_fgcol_ec(fgshift, vc, info) attr_col_ec(fgshift, vc, info, 1) - /* * Scroll Method */ diff --git a/drivers/video/fbdev/core/fbcon_ccw.c b/drivers/video/fbdev/core/fbcon_ccw.c index 2789ace79634..9f4d65478554 100644 --- a/drivers/video/fbdev/core/fbcon_ccw.c +++ b/drivers/video/fbdev/core/fbcon_ccw.c @@ -78,14 +78,13 @@ static void ccw_bmove(struct vc_data *vc, struct fb_info *info, int sy, } static void ccw_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) + int sx, int height, int width, int fg, int bg) { struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; u32 vyres = GETVYRES(ops->p, info); - region.color = attr_bgcol_ec(bgshift,vc,info); + region.color = bg; region.dx = sy * vc->vc_font.height; region.dy = vyres - ((sx + width) * vc->vc_font.width); region.height = width * vc->vc_font.width; diff --git a/drivers/video/fbdev/core/fbcon_cw.c b/drivers/video/fbdev/core/fbcon_cw.c index 86a254c1b2b7..b18e31886da1 100644 --- a/drivers/video/fbdev/core/fbcon_cw.c +++ b/drivers/video/fbdev/core/fbcon_cw.c @@ -63,14 +63,13 @@ static void cw_bmove(struct vc_data *vc, struct fb_info *info, int sy, } static void cw_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) + int sx, int height, int width, int fg, int bg) { struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; u32 vxres = GETVXRES(ops->p, info); - region.color = attr_bgcol_ec(bgshift,vc,info); + region.color = bg; region.dx = vxres - ((sy + height) * vc->vc_font.height); region.dy = sx * vc->vc_font.width; region.height = width * vc->vc_font.width; diff --git a/drivers/video/fbdev/core/fbcon_ud.c b/drivers/video/fbdev/core/fbcon_ud.c index 23bc045769d0..b6b074cfd9dc 100644 --- a/drivers/video/fbdev/core/fbcon_ud.c +++ b/drivers/video/fbdev/core/fbcon_ud.c @@ -64,15 +64,14 @@ static void ud_bmove(struct vc_data *vc, struct fb_info *info, int sy, } static void ud_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) + int sx, int height, int width, int fg, int bg) { struct fbcon_ops *ops = info->fbcon_par; struct fb_fillrect region; - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; u32 vyres = GETVYRES(ops->p, info); u32 vxres = GETVXRES(ops->p, info); - region.color = attr_bgcol_ec(bgshift,vc,info); + region.color = bg; region.dy = vyres - ((sy + height) * vc->vc_font.height); region.dx = vxres - ((sx + width) * vc->vc_font.width); region.width = width * vc->vc_font.width; diff --git a/drivers/video/fbdev/core/tileblit.c b/drivers/video/fbdev/core/tileblit.c index 2768eff247ba..674ca6a410ec 100644 --- a/drivers/video/fbdev/core/tileblit.c +++ b/drivers/video/fbdev/core/tileblit.c @@ -32,16 +32,14 @@ static void tile_bmove(struct vc_data *vc, struct fb_info *info, int sy, } static void tile_clear(struct vc_data *vc, struct fb_info *info, int sy, - int sx, int height, int width) + int sx, int height, int width, int fg, int bg) { struct fb_tilerect rect; - int bgshift = (vc->vc_hi_font_mask) ? 13 : 12; - int fgshift = (vc->vc_hi_font_mask) ? 9 : 8; rect.index = vc->vc_video_erase_char & ((vc->vc_hi_font_mask) ? 0x1ff : 0xff); - rect.fg = attr_fgcol_ec(fgshift, vc, info); - rect.bg = attr_bgcol_ec(bgshift, vc, info); + rect.fg = fg; + rect.bg = bg; rect.sx = sx; rect.sy = sy; rect.width = width; From 96106d4f3f09524a8fe70abf33c52429dde42dd6 Mon Sep 17 00:00:00 2001 From: Zsolt Kajtar Date: Sat, 1 Feb 2025 09:18:09 +0100 Subject: [PATCH 229/529] fbdev: core: tileblit: Implement missing margin clearing for tileblit [ Upstream commit 76d3ca89981354e1f85a3e0ad9ac4217d351cc72 ] I was wondering why there's garbage at the bottom of the screen when tile blitting is used with an odd mode like 1080, 600 or 200. Sure there's only space for half a tile but the same area is clean when the buffer is bitmap. Then later I found that it's supposed to be cleaned but that's not implemented. So I took what's in bitblit and adapted it for tileblit. This implementation was tested for both the horizontal and vertical case, and now does the same as what's done for bitmap buffers. If anyone is interested to reproduce the problem then I could bet that'd be on a S3 or Ark. Just set up a mode with an odd line count and make sure that the virtual size covers the complete tile at the bottom. E.g. for 600 lines that's 608 virtual lines for a 16 tall tile. Then the bottom area should be cleaned. For the right side it's more difficult as there the drivers won't let an odd size happen, unless the code is modified. But once it reports back a few pixel columns short then fbcon won't use the last column. With the patch that column is now clean. Btw. the virtual size should be rounded up by the driver for both axes (not only the horizontal) so that it's dividable by the tile size. That's a driver bug but correcting it is not in scope for this patch. Implement missing margin clearing for tileblit Signed-off-by: Zsolt Kajtar Signed-off-by: Helge Deller Signed-off-by: Sasha Levin --- drivers/video/fbdev/core/tileblit.c | 37 ++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/drivers/video/fbdev/core/tileblit.c b/drivers/video/fbdev/core/tileblit.c index 674ca6a410ec..b3aa0c6620c7 100644 --- a/drivers/video/fbdev/core/tileblit.c +++ b/drivers/video/fbdev/core/tileblit.c @@ -74,7 +74,42 @@ static void tile_putcs(struct vc_data *vc, struct fb_info *info, static void tile_clear_margins(struct vc_data *vc, struct fb_info *info, int color, int bottom_only) { - return; + unsigned int cw = vc->vc_font.width; + unsigned int ch = vc->vc_font.height; + unsigned int rw = info->var.xres - (vc->vc_cols*cw); + unsigned int bh = info->var.yres - (vc->vc_rows*ch); + unsigned int rs = info->var.xres - rw; + unsigned int bs = info->var.yres - bh; + unsigned int vwt = info->var.xres_virtual / cw; + unsigned int vht = info->var.yres_virtual / ch; + struct fb_tilerect rect; + + rect.index = vc->vc_video_erase_char & + ((vc->vc_hi_font_mask) ? 0x1ff : 0xff); + rect.fg = color; + rect.bg = color; + + if ((int) rw > 0 && !bottom_only) { + rect.sx = (info->var.xoffset + rs + cw - 1) / cw; + rect.sy = 0; + rect.width = (rw + cw - 1) / cw; + rect.height = vht; + if (rect.width + rect.sx > vwt) + rect.width = vwt - rect.sx; + if (rect.sx < vwt) + info->tileops->fb_tilefill(info, &rect); + } + + if ((int) bh > 0) { + rect.sx = info->var.xoffset / cw; + rect.sy = (info->var.yoffset + bs) / ch; + rect.width = rs / cw; + rect.height = (bh + ch - 1) / ch; + if (rect.height + rect.sy > vht) + rect.height = vht - rect.sy; + if (rect.sy < vht) + info->tileops->fb_tilefill(info, &rect); + } } static void tile_cursor(struct vc_data *vc, struct fb_info *info, int mode, From 8e93857d8d5957136e070b2e1680c44bfafdd8dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Wed, 30 Oct 2024 22:46:20 +0100 Subject: [PATCH 230/529] cifs: Fix establishing NetBIOS session for SMB2+ connection MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 781802aa5a5950f99899f13ff9d760f5db81d36d ] Function ip_rfc1001_connect() which establish NetBIOS session for SMB connections, currently uses smb_send() function for sending NetBIOS Session Request packet. This function expects that the passed buffer is SMB packet and for SMB2+ connections it mangles packet header, which breaks prepared NetBIOS Session Request packet. Result is that this function send garbage packet for SMB2+ connection, which SMB2+ server cannot parse. That function is not mangling packets for SMB1 connections, so it somehow works for SMB1. Fix this problem and instead of smb_send(), use smb_send_kvec() function which does not mangle prepared packet, this function send them as is. Just API of this function takes struct msghdr (kvec) instead of packet buffer. [MS-SMB2] specification allows SMB2 protocol to use NetBIOS as a transport protocol. NetBIOS can be used over TCP via port 139. So this is a valid configuration, just not so common. And even recent Windows versions (e.g. Windows Server 2022) still supports this configuration: SMB over TCP port 139, including for modern SMB2 and SMB3 dialects. This change fixes SMB2 and SMB3 connections over TCP port 139 which requires establishing of NetBIOS session. Tested that this change fixes establishing of SMB2 and SMB3 connections with Windows Server 2022. Signed-off-by: Pali Rohár Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/client/cifsproto.h | 3 +++ fs/smb/client/connect.c | 20 +++++++++++++++----- fs/smb/client/transport.c | 2 +- 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index d1fd54fb3cc1..9a30425b75a9 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -30,6 +30,9 @@ extern void cifs_small_buf_release(void *); extern void free_rsp_buf(int, void *); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, unsigned int /* length */); +extern int smb_send_kvec(struct TCP_Server_Info *server, + struct msghdr *msg, + size_t *sent); extern unsigned int _get_xid(void); extern void _free_xid(unsigned int); #define get_xid() \ diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 0baa64c48c3d..76c04c4ed45f 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -2966,8 +2966,10 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet req = {}; - struct smb_hdr *smb_buf = (struct smb_hdr *)&req; + struct msghdr msg = {}; + struct kvec iov = {}; unsigned int len; + size_t sent; req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name); @@ -2996,10 +2998,18 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * As per rfc1002, @len must be the number of bytes that follows the * length field of a rfc1002 session request payload. */ - len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req); + len = sizeof(req.trailer.session_req); + req.type = RFC1002_SESSION_REQUEST; + req.flags = 0; + req.length = cpu_to_be16(len); + len += offsetof(typeof(req), trailer.session_req); + iov.iov_base = &req; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, len); + rc = smb_send_kvec(server, &msg, &sent); + if (rc < 0 || len != sent) + return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED; - smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len); - rc = smb_send(server, smb_buf, len); /* * RFC1001 layer in at least one server requires very short break before * negprot presumably because not expecting negprot to follow so fast. @@ -3008,7 +3018,7 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) */ usleep_range(1000, 2000); - return rc; + return 0; } static int diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 3fdafb9297f1..d2867bd263c5 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -178,7 +178,7 @@ delete_mid(struct mid_q_entry *mid) * Our basic "send data to server" function. Should be called with srv_mutex * held. The caller is responsible for handling the results. */ -static int +int smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, size_t *sent) { From b11ffd1dc37a7931debbe813bbe79dd76db3859c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 24 Mar 2025 20:35:33 -0400 Subject: [PATCH 231/529] NFSv4: Treat ENETUNREACH errors as fatal for state recovery [ Upstream commit 0af5fb5ed3d2fd9e110c6112271f022b744a849a ] If a containerised process is killed and causes an ENETUNREACH or ENETDOWN error to be propagated to the state manager, then mark the nfs_client as being dead so that we don't loop in functions that are expecting recovery to succeed. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/nfs4state.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 48ea40660422..80a7c5bd7a47 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2737,7 +2737,15 @@ out_error: pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s" " with error %d\n", section_sep, section, clp->cl_hostname, -status); - ssleep(1); + switch (status) { + case -ENETDOWN: + case -ENETUNREACH: + nfs_mark_client_ready(clp, -EIO); + break; + default: + ssleep(1); + break; + } out_drain: memalloc_nofs_restore(memflags); nfs4_end_drain_session(clp); From 18dd5c41f619d7dc56ba0c7a9030cae7252feb13 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 24 Mar 2025 19:35:01 -0400 Subject: [PATCH 232/529] SUNRPC: rpc_clnt_set_transport() must not change the autobind setting [ Upstream commit bf9be373b830a3e48117da5d89bb6145a575f880 ] The autobind setting was supposed to be determined in rpc_create(), since commit c2866763b402 ("SUNRPC: use sockaddr + size when creating remote transport endpoints"). Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- net/sunrpc/clnt.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b6529a9d37d3..a390a4e5592f 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -275,9 +275,6 @@ static struct rpc_xprt *rpc_clnt_set_transport(struct rpc_clnt *clnt, old = rcu_dereference_protected(clnt->cl_xprt, lockdep_is_held(&clnt->cl_lock)); - if (!xprt_bound(xprt)) - clnt->cl_autobind = 1; - clnt->cl_timeout = timeout; rcu_assign_pointer(clnt->cl_xprt, xprt); spin_unlock(&clnt->cl_lock); From 61ce7181fb5603328d04d67ecbe7171b36c0d476 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 24 Mar 2025 19:05:48 -0400 Subject: [PATCH 233/529] SUNRPC: rpcbind should never reset the port to the value '0' [ Upstream commit 214c13e380ad7636631279f426387f9c4e3c14d9 ] If we already had a valid port number for the RPC service, then we should not allow the rpcbind client to set it to the invalid value '0'. Reviewed-by: Jeff Layton Reviewed-by: Benjamin Coddington Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- net/sunrpc/rpcb_clnt.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c index 82afb56695f8..1ec20163a0b7 100644 --- a/net/sunrpc/rpcb_clnt.c +++ b/net/sunrpc/rpcb_clnt.c @@ -797,9 +797,10 @@ static void rpcb_getport_done(struct rpc_task *child, void *data) } trace_rpcb_setport(child, map->r_status, map->r_port); - xprt->ops->set_port(xprt, map->r_port); - if (map->r_port) + if (map->r_port) { + xprt->ops->set_port(xprt, map->r_port); xprt_set_bound(xprt); + } } /* From 33aaf2e4d3c1491790aceb21c5b2b5abe38028a8 Mon Sep 17 00:00:00 2001 From: Alice Guo Date: Mon, 9 Dec 2024 11:48:59 -0500 Subject: [PATCH 234/529] thermal/drivers/qoriq: Power down TMU on system suspend [ Upstream commit 229f3feb4b0442835b27d519679168bea2de96c2 ] Enable power-down of TMU (Thermal Management Unit) for TMU version 2 during system suspend to save power. Save approximately 4.3mW on VDD_ANA_1P8 on i.MX93 platforms. Signed-off-by: Alice Guo Signed-off-by: Frank Li Link: https://lore.kernel.org/r/20241209164859.3758906-2-Frank.Li@nxp.com Signed-off-by: Daniel Lezcano Signed-off-by: Sasha Levin --- drivers/thermal/qoriq_thermal.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c index d111e218f362..b33cb1d880b7 100644 --- a/drivers/thermal/qoriq_thermal.c +++ b/drivers/thermal/qoriq_thermal.c @@ -19,6 +19,7 @@ #define SITES_MAX 16 #define TMR_DISABLE 0x0 #define TMR_ME 0x80000000 +#define TMR_CMD BIT(29) #define TMR_ALPF 0x0c000000 #define TMR_ALPF_V2 0x03000000 #define TMTMIR_DEFAULT 0x0000000f @@ -345,6 +346,12 @@ static int __maybe_unused qoriq_tmu_suspend(struct device *dev) if (ret) return ret; + if (data->ver > TMU_VER1) { + ret = regmap_set_bits(data->regmap, REGS_TMR, TMR_CMD); + if (ret) + return ret; + } + clk_disable_unprepare(data->clk); return 0; @@ -359,6 +366,12 @@ static int __maybe_unused qoriq_tmu_resume(struct device *dev) if (ret) return ret; + if (data->ver > TMU_VER1) { + ret = regmap_clear_bits(data->regmap, REGS_TMR, TMR_CMD); + if (ret) + return ret; + } + /* Enable monitoring */ return regmap_update_bits(data->regmap, REGS_TMR, TMR_ME, TMR_ME); } From 2d905fdb7f7271420c393f6386074938b98f3b62 Mon Sep 17 00:00:00 2001 From: Jing Su Date: Wed, 19 Mar 2025 16:57:51 +0800 Subject: [PATCH 235/529] dql: Fix dql->limit value when reset. [ Upstream commit 3a17f23f7c36bac3a3584aaf97d3e3e0b2790396 ] Executing dql_reset after setting a non-zero value for limit_min can lead to an unreasonable situation where dql->limit is less than dql->limit_min. For instance, after setting /sys/class/net/eth*/queues/tx-0/byte_queue_limits/limit_min, an ifconfig down/up operation might cause the ethernet driver to call netdev_tx_reset_queue, which in turn invokes dql_reset. In this case, dql->limit is reset to 0 while dql->limit_min remains non-zero value, which is unexpected. The limit should always be greater than or equal to limit_min. Signed-off-by: Jing Su Link: https://patch.msgid.link/Z9qHD1s/NEuQBdgH@pilot-ThinkCentre-M930t-N000 Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- lib/dynamic_queue_limits.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/dynamic_queue_limits.c b/lib/dynamic_queue_limits.c index fde0aa244148..a75a9ca46b59 100644 --- a/lib/dynamic_queue_limits.c +++ b/lib/dynamic_queue_limits.c @@ -116,7 +116,7 @@ EXPORT_SYMBOL(dql_completed); void dql_reset(struct dql *dql) { /* Reset all dynamic values */ - dql->limit = 0; + dql->limit = dql->min_limit; dql->num_queued = 0; dql->num_completed = 0; dql->last_obj_cnt = 0; From 2896063907d5c8585ef008e23e02d54a76b5efe4 Mon Sep 17 00:00:00 2001 From: Ryo Takakura Date: Fri, 21 Mar 2025 07:33:22 -0700 Subject: [PATCH 236/529] lockdep: Fix wait context check on softirq for PREEMPT_RT [ Upstream commit 61c39d8c83e2077f33e0a2c8980a76a7f323f0ce ] Since: 0c1d7a2c2d32 ("lockdep: Remove softirq accounting on PREEMPT_RT.") the wait context test for mutex usage within "in softirq context" fails as it references @softirq_context: | wait context tests | -------------------------------------------------------------------------- | rcu | raw | spin |mutex | -------------------------------------------------------------------------- in hardirq context: ok | ok | ok | ok | in hardirq context (not threaded): ok | ok | ok | ok | in softirq context: ok | ok | ok |FAILED| As a fix, add lockdep map for BH disabled section. This fixes the issue by letting us catch cases when local_bh_disable() gets called with preemption disabled where local_lock doesn't get acquired. In the case of "in softirq context" selftest, local_bh_disable() was being called with preemption disable as it's early in the boot. [ boqun: Move the lockdep annotations into __local_bh_*() to avoid false positives because of unpaired local_bh_disable() reported by Borislav Petkov and Peter Zijlstra, and make bh_lock_map only exist for PREEMPT_RT. ] [ mingo: Restored authorship and improved the bh_lock_map definition. ] Signed-off-by: Ryo Takakura Signed-off-by: Boqun Feng Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/20250321143322.79651-1-boqun.feng@gmail.com Signed-off-by: Sasha Levin --- kernel/softirq.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/kernel/softirq.c b/kernel/softirq.c index 6665f5cd60cb..9ab5ca399a99 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -140,6 +140,18 @@ static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), }; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +static struct lock_class_key bh_lock_key; +struct lockdep_map bh_lock_map = { + .name = "local_bh", + .key = &bh_lock_key, + .wait_type_outer = LD_WAIT_FREE, + .wait_type_inner = LD_WAIT_CONFIG, /* PREEMPT_RT makes BH preemptible. */ + .lock_type = LD_LOCK_PERCPU, +}; +EXPORT_SYMBOL_GPL(bh_lock_map); +#endif + /** * local_bh_blocked() - Check for idle whether BH processing is blocked * @@ -162,6 +174,8 @@ void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); + lock_map_acquire_read(&bh_lock_map); + /* First entry of a task into a BH disabled section? */ if (!current->softirq_disable_cnt) { if (preemptible()) { @@ -225,6 +239,8 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) WARN_ON_ONCE(in_hardirq()); lockdep_assert_irqs_enabled(); + lock_map_release(&bh_lock_map); + local_irq_save(flags); curcnt = __this_cpu_read(softirq_ctrl.cnt); @@ -275,6 +291,8 @@ static inline void ksoftirqd_run_begin(void) /* Counterpart to ksoftirqd_run_begin() */ static inline void ksoftirqd_run_end(void) { + /* pairs with the lock_map_acquire_read() in ksoftirqd_run_begin() */ + lock_map_release(&bh_lock_map); __local_bh_enable(SOFTIRQ_OFFSET, true); WARN_ON_ONCE(in_interrupt()); local_irq_enable(); From 1fdd7255d5b2e79302f17f4c0f3f03df13f94c7d Mon Sep 17 00:00:00 2001 From: Josh Poimboeuf Date: Mon, 24 Mar 2025 14:55:58 -0700 Subject: [PATCH 237/529] objtool: Properly disable uaccess validation [ Upstream commit e1a9dda74dbffbc3fa2069ff418a1876dc99fb14 ] If opts.uaccess isn't set, the uaccess validation is disabled, but only partially: it doesn't read the uaccess_safe_builtin list but still tries to do the validation. Disable it completely to prevent false warnings. Signed-off-by: Josh Poimboeuf Signed-off-by: Ingo Molnar Cc: Linus Torvalds Link: https://lore.kernel.org/r/0e95581c1d2107fb5f59418edf2b26bba38b0cbb.1742852846.git.jpoimboe@kernel.org Signed-off-by: Sasha Levin --- tools/objtool/check.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 828c91aaf55b..bf75628c5389 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -3083,7 +3083,7 @@ static int handle_insn_ops(struct instruction *insn, if (update_cfi_state(insn, next_insn, &state->cfi, op)) return 1; - if (!insn->alt_group) + if (!opts.uaccess || !insn->alt_group) continue; if (op->dest.type == OP_DEST_PUSHF) { @@ -3535,6 +3535,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 0; case INSN_STAC: + if (!opts.uaccess) + break; + if (state.uaccess) { WARN_FUNC("recursive UACCESS enable", sec, insn->offset); return 1; @@ -3544,6 +3547,9 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, break; case INSN_CLAC: + if (!opts.uaccess) + break; + if (!state.uaccess && func) { WARN_FUNC("redundant UACCESS disable", sec, insn->offset); return 1; @@ -3956,7 +3962,8 @@ static int validate_symbol(struct objtool_file *file, struct section *sec, if (!insn || insn->ignore || insn->visited) return 0; - state->uaccess = sym->uaccess_safe; + if (opts.uaccess) + state->uaccess = sym->uaccess_safe; ret = validate_branch(file, insn->func, insn, *state); if (ret && opts.backtrace) From 1325473aba000c99ad08d14dd9c13514f776ca5d Mon Sep 17 00:00:00 2001 From: Frank Li Date: Sat, 15 Mar 2025 15:15:46 -0500 Subject: [PATCH 238/529] PCI: dwc: ep: Ensure proper iteration over outbound map windows [ Upstream commit f3e1dccba0a0833fc9a05fb838ebeb6ea4ca0e1a ] Most systems' PCIe outbound map windows have non-zero physical addresses, but the possibility of encountering zero increased after following commit ("PCI: dwc: Use parent_bus_offset"). 'ep->outbound_addr[n]', representing 'parent_bus_address', might be 0 on some hardware, which trims high address bits through bus fabric before sending to the PCIe controller. Replace the iteration logic with 'for_each_set_bit()' to ensure only allocated map windows are iterated when determining the ATU index from a given address. Link: https://lore.kernel.org/r/20250315201548.858189-12-helgaas@kernel.org Signed-off-by: Frank Li Signed-off-by: Bjorn Helgaas Signed-off-by: Sasha Levin --- drivers/pci/controller/dwc/pcie-designware-ep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c b/drivers/pci/controller/dwc/pcie-designware-ep.c index 449ad709495d..3b3f079d0d2d 100644 --- a/drivers/pci/controller/dwc/pcie-designware-ep.c +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c @@ -283,7 +283,7 @@ static int dw_pcie_find_index(struct dw_pcie_ep *ep, phys_addr_t addr, u32 index; struct dw_pcie *pci = to_dw_pcie_from_ep(ep); - for (index = 0; index < pci->num_ob_windows; index++) { + for_each_set_bit(index, ep->ob_window_map, pci->num_ob_windows) { if (ep->outbound_addr[index] != addr) continue; *atu_index = index; From 1d5786b3baa831a1c39a0e3f093f5588cff807e3 Mon Sep 17 00:00:00 2001 From: Ian Rogers Date: Tue, 11 Mar 2025 14:36:23 -0700 Subject: [PATCH 239/529] tools/build: Don't pass test log files to linker [ Upstream commit 935e7cb5bb80106ff4f2fe39640f430134ef8cd8 ] Separate test log files from object files. Depend on test log output but don't pass to the linker. Reviewed-by: James Clark Signed-off-by: Ian Rogers Link: https://lore.kernel.org/r/20250311213628.569562-2-irogers@google.com Signed-off-by: Namhyung Kim Signed-off-by: Sasha Levin --- tools/build/Makefile.build | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/build/Makefile.build b/tools/build/Makefile.build index 715092fc6a23..6a043b729b36 100644 --- a/tools/build/Makefile.build +++ b/tools/build/Makefile.build @@ -130,6 +130,10 @@ objprefix := $(subst ./,,$(OUTPUT)$(dir)/) obj-y := $(addprefix $(objprefix),$(obj-y)) subdir-obj-y := $(addprefix $(objprefix),$(subdir-obj-y)) +# Separate out test log files from real build objects. +test-y := $(filter %_log, $(obj-y)) +obj-y := $(filter-out %_log, $(obj-y)) + # Final '$(obj)-in.o' object in-target := $(objprefix)$(obj)-in.o @@ -140,7 +144,7 @@ $(subdir-y): $(sort $(subdir-obj-y)): $(subdir-y) ; -$(in-target): $(obj-y) FORCE +$(in-target): $(obj-y) $(test-y) FORCE $(call rule_mkdir) $(call if_changed,$(host)ld_multi) From 5ad81426671090f30b9bf908865ab219207fe2e0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 20 Mar 2025 12:45:01 -0400 Subject: [PATCH 240/529] pNFS/flexfiles: Report ENETDOWN as a connection error [ Upstream commit aa42add73ce9b9e3714723d385c254b75814e335 ] If the client should see an ENETDOWN when trying to connect to the data server, it might still be able to talk to the metadata server through another NIC. If so, report the error. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Tested-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/nfs/flexfilelayout/flexfilelayout.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 8056b05bd8dc..07e5ea64dcd6 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1255,6 +1255,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case -ECONNRESET: case -EHOSTDOWN: case -EHOSTUNREACH: + case -ENETDOWN: case -ENETUNREACH: case -EADDRINUSE: case -ENOBUFS: From 1671494b7195c030391b162339d5f38997da3285 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Wed, 19 Feb 2025 10:20:56 +0100 Subject: [PATCH 241/529] PCI: vmd: Disable MSI remapping bypass under Xen MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6c4d5aadf5df31ea0ac025980670eee9beaf466b ] MSI remapping bypass (directly configuring MSI entries for devices on the VMD bus) won't work under Xen, as Xen is not aware of devices in such bus, and hence cannot configure the entries using the pIRQ interface in the PV case, and in the PVH case traps won't be setup for MSI entries for such devices. Until Xen is aware of devices in the VMD bus prevent the VMD_FEAT_CAN_BYPASS_MSI_REMAP capability from being used when running as any kind of Xen guest. The MSI remapping bypass is an optional feature of VMD bridges, and hence when running under Xen it will be masked and devices will be forced to redirect its interrupts from the VMD bridge. That mode of operation must always be supported by VMD bridges and works when Xen is not aware of devices behind the VMD bridge. Signed-off-by: Roger Pau Monné Acked-by: Bjorn Helgaas Message-ID: <20250219092059.90850-3-roger.pau@citrix.com> Signed-off-by: Juergen Gross Signed-off-by: Sasha Levin --- drivers/pci/controller/vmd.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/drivers/pci/controller/vmd.c b/drivers/pci/controller/vmd.c index 09995b6e73bc..771ff0f6971f 100644 --- a/drivers/pci/controller/vmd.c +++ b/drivers/pci/controller/vmd.c @@ -17,6 +17,8 @@ #include #include +#include + #include #define VMD_CFGBAR 0 @@ -919,6 +921,24 @@ static int vmd_probe(struct pci_dev *dev, const struct pci_device_id *id) struct vmd_dev *vmd; int err; + if (xen_domain()) { + /* + * Xen doesn't have knowledge about devices in the VMD bus + * because the config space of devices behind the VMD bridge is + * not known to Xen, and hence Xen cannot discover or configure + * them in any way. + * + * Bypass of MSI remapping won't work in that case as direct + * write by Linux to the MSI entries won't result in functional + * interrupts, as Xen is the entity that manages the host + * interrupt controller and must configure interrupts. However + * multiplexing of interrupts by the VMD bridge will work under + * Xen, so force the usage of that mode which must always be + * supported by VMD bridges. + */ + features &= ~VMD_FEAT_CAN_BYPASS_MSI_REMAP; + } + if (resource_size(&dev->resource[VMD_CFGBAR]) < (1 << 20)) return -ENOMEM; From db1aef51b8e66a77f76b1250b914589c31a0a0ed Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Thu, 20 Mar 2025 12:22:22 +0100 Subject: [PATCH 242/529] libnvdimm/labels: Fix divide error in nd_label_data_init() [ Upstream commit ef1d3455bbc1922f94a91ed58d3d7db440652959 ] If a faulty CXL memory device returns a broken zero LSA size in its memory device information (Identify Memory Device (Opcode 4000h), CXL spec. 3.1, 8.2.9.9.1.1), a divide error occurs in the libnvdimm driver: Oops: divide error: 0000 [#1] PREEMPT SMP NOPTI RIP: 0010:nd_label_data_init+0x10e/0x800 [libnvdimm] Code and flow: 1) CXL Command 4000h returns LSA size = 0 2) config_size is assigned to zero LSA size (CXL pmem driver): drivers/cxl/pmem.c: .config_size = mds->lsa_size, 3) max_xfer is set to zero (nvdimm driver): drivers/nvdimm/label.c: max_xfer = min_t(size_t, ndd->nsarea.max_xfer, config_size); 4) A subsequent DIV_ROUND_UP() causes a division by zero: drivers/nvdimm/label.c: /* Make our initial read size a multiple of max_xfer size */ drivers/nvdimm/label.c: read_size = min(DIV_ROUND_UP(read_size, max_xfer) * max_xfer, drivers/nvdimm/label.c- config_size); Fix this by checking the config size parameter by extending an existing check. Signed-off-by: Robert Richter Reviewed-by: Pankaj Gupta Reviewed-by: Ira Weiny Link: https://patch.msgid.link/20250320112223.608320-1-rrichter@amd.com Signed-off-by: Ira Weiny Signed-off-by: Sasha Levin --- drivers/nvdimm/label.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvdimm/label.c b/drivers/nvdimm/label.c index 082253a3a956..04f4a049599a 100644 --- a/drivers/nvdimm/label.c +++ b/drivers/nvdimm/label.c @@ -442,7 +442,8 @@ int nd_label_data_init(struct nvdimm_drvdata *ndd) if (ndd->data) return 0; - if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0) { + if (ndd->nsarea.status || ndd->nsarea.max_xfer == 0 || + ndd->nsarea.config_size == 0) { dev_dbg(ndd->dev, "failed to init config data area: (%u:%u)\n", ndd->nsarea.max_xfer, ndd->nsarea.config_size); return -ENXIO; From fa73abd3599ee5d893aabda5c19c8e7b1b9ec2c8 Mon Sep 17 00:00:00 2001 From: Erick Shepherd Date: Fri, 14 Mar 2025 14:50:21 -0500 Subject: [PATCH 243/529] mmc: host: Wait for Vdd to settle on card power off [ Upstream commit 31e75ed964582257f59156ce6a42860e1ae4cc39 ] The SD spec version 6.0 section 6.4.1.5 requires that Vdd must be lowered to less than 0.5V for a minimum of 1 ms when powering off a card. Increase wait to 15 ms so that voltage has time to drain down to 0.5V and cards can power off correctly. Issues with voltage drain time were only observed on Apollo Lake and Bay Trail host controllers so this fix is limited to those devices. Signed-off-by: Erick Shepherd Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20250314195021.1588090-1-erick.shepherd@ni.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci-pci-core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index 5a5cc40d4bc3..c71d9956b398 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -613,8 +613,12 @@ static void sdhci_intel_set_power(struct sdhci_host *host, unsigned char mode, sdhci_set_power(host, mode, vdd); - if (mode == MMC_POWER_OFF) + if (mode == MMC_POWER_OFF) { + if (slot->chip->pdev->device == PCI_DEVICE_ID_INTEL_APL_SD || + slot->chip->pdev->device == PCI_DEVICE_ID_INTEL_BYT_SD) + usleep_range(15000, 17500); return; + } /* * Bus power might not enable after D3 -> D0 transition due to the From 8c18c904d301ffeb33b071eadc55cd6131e1e9be Mon Sep 17 00:00:00 2001 From: Philip Redkin Date: Fri, 15 Nov 2024 20:36:59 +0300 Subject: [PATCH 244/529] x86/mm: Check return value from memblock_phys_alloc_range() [ Upstream commit 631ca8909fd5c62b9fda9edda93924311a78a9c4 ] At least with CONFIG_PHYSICAL_START=0x100000, if there is < 4 MiB of contiguous free memory available at this point, the kernel will crash and burn because memblock_phys_alloc_range() returns 0 on failure, which leads memblock_phys_free() to throw the first 4 MiB of physical memory to the wolves. At a minimum it should fail gracefully with a meaningful diagnostic, but in fact everything seems to work fine without the weird reserve allocation. Signed-off-by: Philip Redkin Signed-off-by: Ingo Molnar Cc: Dave Hansen Cc: Rik van Riel Cc: "H. Peter Anvin" Link: https://lore.kernel.org/r/94b3e98f-96a7-3560-1f76-349eb95ccf7f@rarity.fan Signed-off-by: Sasha Levin --- arch/x86/mm/init.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index ab697ee64528..446bf7fbc325 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -654,8 +654,13 @@ static void __init memory_map_top_down(unsigned long map_start, */ addr = memblock_phys_alloc_range(PMD_SIZE, PMD_SIZE, map_start, map_end); - memblock_phys_free(addr, PMD_SIZE); - real_end = addr + PMD_SIZE; + if (!addr) { + pr_warn("Failed to release memory for alloc_low_pages()"); + real_end = max(map_start, ALIGN_DOWN(map_end, PMD_SIZE)); + } else { + memblock_phys_free(addr, PMD_SIZE); + real_end = addr + PMD_SIZE; + } /* step_size need to be small so pgt_buf from BRK could cover it */ step_size = PMD_SIZE; From fb6a04c3b76a77176c009bdfef763195f133f87c Mon Sep 17 00:00:00 2001 From: Stephan Gerhold Date: Tue, 28 Nov 2023 10:48:37 +0100 Subject: [PATCH 245/529] i2c: qup: Vote for interconnect bandwidth to DRAM [ Upstream commit d4f35233a6345f62637463ef6e0708f44ffaa583 ] When the I2C QUP controller is used together with a DMA engine it needs to vote for the interconnect path to the DRAM. Otherwise it may be unable to access the memory quickly enough. The requested peak bandwidth is dependent on the I2C core clock. To avoid sending votes too often the bandwidth is always requested when a DMA transfer starts, but dropped only on runtime suspend. Runtime suspend should only happen if no transfer is active. After resumption we can defer the next vote until the first DMA transfer actually happens. The implementation is largely identical to the one introduced for spi-qup in commit ecdaa9473019 ("spi: qup: Vote for interconnect bandwidth to DRAM") since both drivers represent the same hardware block. Signed-off-by: Stephan Gerhold Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20231128-i2c-qup-dvfs-v1-3-59a0e3039111@kernkonzept.com Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-qup.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/drivers/i2c/busses/i2c-qup.c b/drivers/i2c/busses/i2c-qup.c index 78682388e02e..82de4651d18f 100644 --- a/drivers/i2c/busses/i2c-qup.c +++ b/drivers/i2c/busses/i2c-qup.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -150,6 +151,8 @@ /* TAG length for DATA READ in RX FIFO */ #define READ_RX_TAGS_LEN 2 +#define QUP_BUS_WIDTH 8 + static unsigned int scl_freq; module_param_named(scl_freq, scl_freq, uint, 0444); MODULE_PARM_DESC(scl_freq, "SCL frequency override"); @@ -227,6 +230,7 @@ struct qup_i2c_dev { int irq; struct clk *clk; struct clk *pclk; + struct icc_path *icc_path; struct i2c_adapter adap; int clk_ctl; @@ -255,6 +259,10 @@ struct qup_i2c_dev { /* To configure when bus is in run state */ u32 config_run; + /* bandwidth votes */ + u32 src_clk_freq; + u32 cur_bw_clk_freq; + /* dma parameters */ bool is_dma; /* To check if the current transfer is using DMA */ @@ -453,6 +461,23 @@ static int qup_i2c_bus_active(struct qup_i2c_dev *qup, int len) return ret; } +static int qup_i2c_vote_bw(struct qup_i2c_dev *qup, u32 clk_freq) +{ + u32 needed_peak_bw; + int ret; + + if (qup->cur_bw_clk_freq == clk_freq) + return 0; + + needed_peak_bw = Bps_to_icc(clk_freq * QUP_BUS_WIDTH); + ret = icc_set_bw(qup->icc_path, 0, needed_peak_bw); + if (ret) + return ret; + + qup->cur_bw_clk_freq = clk_freq; + return 0; +} + static void qup_i2c_write_tx_fifo_v1(struct qup_i2c_dev *qup) { struct qup_i2c_block *blk = &qup->blk; @@ -840,6 +865,10 @@ static int qup_i2c_bam_xfer(struct i2c_adapter *adap, struct i2c_msg *msg, int ret = 0; int idx = 0; + ret = qup_i2c_vote_bw(qup, qup->src_clk_freq); + if (ret) + return ret; + enable_irq(qup->irq); ret = qup_i2c_req_dma(qup); @@ -1645,6 +1674,7 @@ static void qup_i2c_disable_clocks(struct qup_i2c_dev *qup) config = readl(qup->base + QUP_CONFIG); config |= QUP_CLOCK_AUTO_GATE; writel(config, qup->base + QUP_CONFIG); + qup_i2c_vote_bw(qup, 0); clk_disable_unprepare(qup->pclk); } @@ -1745,6 +1775,11 @@ static int qup_i2c_probe(struct platform_device *pdev) goto fail_dma; } qup->is_dma = true; + + qup->icc_path = devm_of_icc_get(&pdev->dev, NULL); + if (IS_ERR(qup->icc_path)) + return dev_err_probe(&pdev->dev, PTR_ERR(qup->icc_path), + "failed to get interconnect path\n"); } nodma: @@ -1793,6 +1828,7 @@ nodma: qup_i2c_enable_clocks(qup); src_clk_freq = clk_get_rate(qup->clk); } + qup->src_clk_freq = src_clk_freq; /* * Bootloaders might leave a pending interrupt on certain QUP's, From e212c8e9f2f37ee2a94b2bd4ed57e802e039e978 Mon Sep 17 00:00:00 2001 From: Vitalii Mordan Date: Wed, 12 Feb 2025 20:28:03 +0300 Subject: [PATCH 246/529] i2c: pxa: fix call balance of i2c->clk handling routines [ Upstream commit be7113d2e2a6f20cbee99c98d261a1fd6fd7b549 ] If the clock i2c->clk was not enabled in i2c_pxa_probe(), it should not be disabled in any path. Found by Linux Verification Center (linuxtesting.org) with Klever. Signed-off-by: Vitalii Mordan Signed-off-by: Andi Shyti Link: https://lore.kernel.org/r/20250212172803.1422136-1-mordan@ispras.ru Signed-off-by: Sasha Levin --- drivers/i2c/busses/i2c-pxa.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-pxa.c b/drivers/i2c/busses/i2c-pxa.c index ade3f0ea5955..8263e017577d 100644 --- a/drivers/i2c/busses/i2c-pxa.c +++ b/drivers/i2c/busses/i2c-pxa.c @@ -1508,7 +1508,10 @@ static int i2c_pxa_probe(struct platform_device *dev) i2c->adap.name); } - clk_prepare_enable(i2c->clk); + ret = clk_prepare_enable(i2c->clk); + if (ret) + return dev_err_probe(&dev->dev, ret, + "failed to enable clock\n"); if (i2c->use_pio) { i2c->adap.algo = &i2c_pxa_pio_algorithm; From df4af023f6a209106e1ee72d7f3d21f15e8edc8a Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Mon, 3 Mar 2025 15:01:05 -0800 Subject: [PATCH 247/529] btrfs: make btrfs_discard_workfn() block_group ref explicit [ Upstream commit 895c6721d310c036dcfebb5ab845822229fa35eb ] Currently, the async discard machinery owns a ref to the block_group when the block_group is queued on a discard list. However, to handle races with discard cancellation and the discard workfn, we have a specific logic to detect that the block_group is *currently* running in the workfn, to protect the workfn's usage amidst cancellation. As far as I can tell, this doesn't have any overt bugs (though finish_discard_pass() and remove_from_discard_list() racing can have a surprising outcome for the caller of remove_from_discard_list() in that it is again added at the end). But it is needlessly complicated to rely on locking and the nullity of discard_ctl->block_group. Simplify this significantly by just taking a refcount while we are in the workfn and unconditionally drop it in both the remove and workfn paths, regardless of if they race. Reviewed-by: Filipe Manana Signed-off-by: Boris Burkov Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/discard.c | 34 ++++++++++++++++------------------ 1 file changed, 16 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index 7b2f77a8aa98..a90f3cb83c70 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -152,13 +152,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. - */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -256,9 +250,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -482,9 +477,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); + return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -536,15 +542,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. - */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); From 2abb4bb79559639fdffac76a35ad8a50fd652690 Mon Sep 17 00:00:00 2001 From: Mark Harmstone Date: Thu, 6 Mar 2025 10:58:46 +0000 Subject: [PATCH 248/529] btrfs: avoid linker error in btrfs_find_create_tree_block() [ Upstream commit 7ef3cbf17d2734ca66c4ed8573be45f4e461e7ee ] The inline function btrfs_is_testing() is hardcoded to return 0 if CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set. Currently we're relying on the compiler optimizing out the call to alloc_test_extent_buffer() in btrfs_find_create_tree_block(), as it's not been defined (it's behind an #ifdef). Add a stub version of alloc_test_extent_buffer() to avoid linker errors on non-standard optimization levels. This problem was seen on GCC 14 with -O0 and is helps to see symbols that would be otherwise optimized out. Reviewed-by: Qu Wenruo Signed-off-by: Mark Harmstone Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/extent_io.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 72227c0b4b5a..d5552875f872 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4459,10 +4459,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -4498,8 +4498,11 @@ again: free_eb: btrfs_release_extent_buffer(eb); return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer( struct btrfs_fs_info *fs_info, struct page *page) From 013614c23e72927c689f3f4e104e10a0ed4d7e87 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 7 Mar 2025 14:36:10 +1030 Subject: [PATCH 249/529] btrfs: run btrfs_error_commit_super() early [ Upstream commit df94a342efb451deb0e32b495d1d6cd4bb3a1648 ] [BUG] Even after all the error fixes related the "ASSERT(list_empty(&fs_info->delayed_iputs));" in close_ctree(), I can still hit it reliably with my experimental 2K block size. [CAUSE] In my case, all the error is triggered after the fs is already in error status. I find the following call trace to be the cause of race: Main thread | endio_write_workers ---------------------------------------------+--------------------------- close_ctree() | |- btrfs_error_commit_super() | | |- btrfs_cleanup_transaction() | | | |- btrfs_destroy_all_ordered_extents() | | | |- btrfs_wait_ordered_roots() | | |- btrfs_run_delayed_iputs() | | | btrfs_finish_ordered_io() | | |- btrfs_put_ordered_extent() | | |- btrfs_add_delayed_iput() |- ASSERT(list_empty(delayed_iputs)) | !!! Triggered !!! The root cause is that, btrfs_wait_ordered_roots() only wait for ordered extents to finish their IOs, not to wait for them to finish and removed. [FIX] Since btrfs_error_commit_super() will flush and wait for all ordered extents, it should be executed early, before we start flushing the workqueues. And since btrfs_error_commit_super() now runs early, there is no need to run btrfs_run_delayed_iputs() inside it, so just remove the btrfs_run_delayed_iputs() call from btrfs_error_commit_super(). Reviewed-by: Filipe Manana Signed-off-by: Qu Wenruo Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/disk-io.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index de4f590fe30f..6670188b9eb6 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4641,6 +4641,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* clear out the rbtree of defraggable inodes */ btrfs_cleanup_defrag_inodes(fs_info); + /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + /* * Wait for any fixup workers to complete. * If we don't wait for them here and they are still running by the time @@ -4730,9 +4738,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4888,10 +4893,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } From e2965d9d21e5f36cdc5f94bb4cfd82ce52c53da3 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 6 Mar 2025 14:25:38 +0000 Subject: [PATCH 250/529] btrfs: fix non-empty delayed iputs list on unmount due to async workers [ Upstream commit cda76788f8b0f7de3171100e3164ec1ce702292e ] At close_ctree() after we have ran delayed iputs either explicitly through calling btrfs_run_delayed_iputs() or later during the call to btrfs_commit_super() or btrfs_error_commit_super(), we assert that the delayed iputs list is empty. We have (another) race where this assertion might fail because we have queued an async write into the fs_info->workers workqueue. Here's how it happens: 1) We are submitting a data bio for an inode that is not the data relocation inode, so we call btrfs_wq_submit_bio(); 2) btrfs_wq_submit_bio() submits a work for the fs_info->workers queue that will run run_one_async_done(); 3) We enter close_ctree(), flush several work queues except fs_info->workers, explicitly run delayed iputs with a call to btrfs_run_delayed_iputs() and then again shortly after by calling btrfs_commit_super() or btrfs_error_commit_super(), which also run delayed iputs; 4) run_one_async_done() is executed in the work queue, and because there was an IO error (bio->bi_status is not 0) it calls btrfs_bio_end_io(), which drops the final reference on the associated ordered extent by calling btrfs_put_ordered_extent() - and that adds a delayed iput for the inode; 5) At close_ctree() we find that after stopping the cleaner and transaction kthreads the delayed iputs list is not empty, failing the following assertion: ASSERT(list_empty(&fs_info->delayed_iputs)); Fix this by flushing the fs_info->workers workqueue before running delayed iputs at close_ctree(). David reported this when running generic/648, which exercises IO error paths by using the DM error table. Reported-by: David Sterba Reviewed-by: Qu Wenruo Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/disk-io.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 6670188b9eb6..8c0da0025bc7 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4669,6 +4669,19 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) */ btrfs_flush_workqueue(fs_info->delalloc_workers); + /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + /* * When finishing a compressed write bio we schedule a work queue item * to finish an ordered extent - btrfs_finish_compressed_write_work() From 22bb11b3d53403518d728ca10086acfe3ab84a3b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Fri, 21 Feb 2025 16:12:15 +0000 Subject: [PATCH 251/529] btrfs: get zone unusable bytes while holding lock at btrfs_reclaim_bgs_work() [ Upstream commit 1283b8c125a83bf7a7dbe90c33d3472b6d7bf612 ] At btrfs_reclaim_bgs_work(), we are grabbing a block group's zone unusable bytes while not under the protection of the block group's spinlock, so this can trigger race reports from KCSAN (or similar tools) since that field is typically updated while holding the lock, such as at __btrfs_add_free_space_zoned() for example. Fix this by grabbing the zone unusable bytes while we are still in the critical section holding the block group's spinlock, which is right above where we are currently grabbing it. Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/block-group.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 0dcf7fecaf55..91440ef79a26 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1678,6 +1678,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. As soon as the block group is read only it's + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); /* @@ -1693,13 +1704,6 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) From 822c0e09f401a4d716e0a2af2b0ea185788d2834 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 5 Feb 2025 13:09:25 +0000 Subject: [PATCH 252/529] btrfs: send: return -ENAMETOOLONG when attempting a path that is too long [ Upstream commit a77749b3e21813566cea050bbb3414ae74562eba ] When attempting to build a too long path we are currently returning -ENOMEM, which is very odd and misleading. So update fs_path_ensure_buf() to return -ENAMETOOLONG instead. Also, while at it, move the WARN_ON() into the if statement's expression, as it makes it clear what is being tested and also has the effect of adding 'unlikely' to the statement, which allows the compiler to generate better code as this condition is never expected to happen. Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/send.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index a2b95ccb4cf5..0735decec99b 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -431,10 +431,8 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len > PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; path_len = p->end - p->start; old_buf_len = p->buf_len; From 227c253c9ef9dfb9647697b5a6e5e3a3c469d11e Mon Sep 17 00:00:00 2001 From: Jing Zhou Date: Tue, 4 Mar 2025 23:15:56 +0800 Subject: [PATCH 253/529] drm/amd/display: Guard against setting dispclk low for dcn31x [ Upstream commit 9c2f4ae64bb6f6d83a54d88b9ee0f369cdbb9fa8 ] [WHY] We should never apply a minimum dispclk value while in prepare_bandwidth or while displays are active. This is always an optimizaiton for when all displays are disabled. [HOW] Defer dispclk optimization until safe_to_lower = true and display_count reaches 0. Since 0 has a special value in this logic (ie. no dispclk required) we also need adjust the logic that clamps it for the actual request to PMFW. Reviewed-by: Charlene Liu Reviewed-by: Chris Park Reviewed-by: Eric Yang Signed-off-by: Jing Zhou Signed-off-by: Nicholas Kazlauskas Signed-off-by: Alex Hung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- .../dc/clk_mgr/dcn315/dcn315_clk_mgr.c | 20 +++++++++++-------- .../dc/clk_mgr/dcn316/dcn316_clk_mgr.c | 13 +++++++++--- 2 files changed, 22 insertions(+), 11 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_clk_mgr.c index 09eb1bc9aa03..9549f9c15229 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn315/dcn315_clk_mgr.c @@ -116,7 +116,7 @@ static void dcn315_update_clocks(struct clk_mgr *clk_mgr_base, struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); struct dc_clocks *new_clocks = &context->bw_ctx.bw.dcn.clk; struct dc *dc = clk_mgr_base->ctx->dc; - int display_count; + int display_count = 0; bool update_dppclk = false; bool update_dispclk = false; bool dpp_clock_lowered = false; @@ -192,15 +192,19 @@ static void dcn315_update_clocks(struct clk_mgr *clk_mgr_base, update_dppclk = true; } - if (should_set_clock(safe_to_lower, new_clocks->dispclk_khz, clk_mgr_base->clks.dispclk_khz)) { - /* No need to apply the w/a if we haven't taken over from bios yet */ - if (clk_mgr_base->clks.dispclk_khz) - dcn315_disable_otg_wa(clk_mgr_base, context, true); + if (should_set_clock(safe_to_lower, new_clocks->dispclk_khz, clk_mgr_base->clks.dispclk_khz) && + (new_clocks->dispclk_khz > 0 || (safe_to_lower && display_count == 0))) { + int requested_dispclk_khz = new_clocks->dispclk_khz; + dcn315_disable_otg_wa(clk_mgr_base, context, true); + + /* Clamp the requested clock to PMFW based on their limit. */ + if (dc->debug.min_disp_clk_khz > 0 && requested_dispclk_khz < dc->debug.min_disp_clk_khz) + requested_dispclk_khz = dc->debug.min_disp_clk_khz; + + dcn315_smu_set_dispclk(clk_mgr, requested_dispclk_khz); clk_mgr_base->clks.dispclk_khz = new_clocks->dispclk_khz; - dcn315_smu_set_dispclk(clk_mgr, clk_mgr_base->clks.dispclk_khz); - if (clk_mgr_base->clks.dispclk_khz) - dcn315_disable_otg_wa(clk_mgr_base, context, false); + dcn315_disable_otg_wa(clk_mgr_base, context, false); update_dispclk = true; } diff --git a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c index 29d2003fb712..afce15aa2ff1 100644 --- a/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c +++ b/drivers/gpu/drm/amd/display/dc/clk_mgr/dcn316/dcn316_clk_mgr.c @@ -153,7 +153,7 @@ static void dcn316_update_clocks(struct clk_mgr *clk_mgr_base, struct clk_mgr_internal *clk_mgr = TO_CLK_MGR_INTERNAL(clk_mgr_base); struct dc_clocks *new_clocks = &context->bw_ctx.bw.dcn.clk; struct dc *dc = clk_mgr_base->ctx->dc; - int display_count; + int display_count = 0; bool update_dppclk = false; bool update_dispclk = false; bool dpp_clock_lowered = false; @@ -226,11 +226,18 @@ static void dcn316_update_clocks(struct clk_mgr *clk_mgr_base, update_dppclk = true; } - if (should_set_clock(safe_to_lower, new_clocks->dispclk_khz, clk_mgr_base->clks.dispclk_khz)) { + if (should_set_clock(safe_to_lower, new_clocks->dispclk_khz, clk_mgr_base->clks.dispclk_khz) && + (new_clocks->dispclk_khz > 0 || (safe_to_lower && display_count == 0))) { + int requested_dispclk_khz = new_clocks->dispclk_khz; + dcn316_disable_otg_wa(clk_mgr_base, context, safe_to_lower, true); + /* Clamp the requested clock to PMFW based on their limit. */ + if (dc->debug.min_disp_clk_khz > 0 && requested_dispclk_khz < dc->debug.min_disp_clk_khz) + requested_dispclk_khz = dc->debug.min_disp_clk_khz; + + dcn316_smu_set_dispclk(clk_mgr, requested_dispclk_khz); clk_mgr_base->clks.dispclk_khz = new_clocks->dispclk_khz; - dcn316_smu_set_dispclk(clk_mgr, clk_mgr_base->clks.dispclk_khz); dcn316_disable_otg_wa(clk_mgr_base, context, safe_to_lower, false); update_dispclk = true; From f2985a1de6f53abbc42c7a696ee6ba8bdecc9426 Mon Sep 17 00:00:00 2001 From: Stanley Chu Date: Tue, 18 Mar 2025 13:36:06 +0800 Subject: [PATCH 254/529] i3c: master: svc: Fix missing STOP for master request [ Upstream commit 0430bf9bc1ac068c8b8c540eb93e5751872efc51 ] The controller driver nacked the master request but didn't emit a STOP to end the transaction. The driver shall refuse the unsupported requests and return the controller state to IDLE by emitting a STOP. Signed-off-by: Stanley Chu Reviewed-by: Frank Li Link: https://lore.kernel.org/r/20250318053606.3087121-4-yschu@nuvoton.com Signed-off-by: Alexandre Belloni Signed-off-by: Sasha Levin --- drivers/i3c/master/svc-i3c-master.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 38095649ed27..cf0550c6e95f 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -495,6 +495,7 @@ static void svc_i3c_master_ibi_work(struct work_struct *work) queue_work(master->base.wq, &master->hj_work); break; case SVC_I3C_MSTATUS_IBITYPE_MASTER_REQUEST: + svc_i3c_master_emit_stop(master); default: break; } From 75f74c89a8dc474f0d890326c6f5ba08ad206bff Mon Sep 17 00:00:00 2001 From: Heming Zhao Date: Mon, 10 Mar 2025 15:36:21 +0800 Subject: [PATCH 255/529] dlm: make tcp still work in multi-link env [ Upstream commit 03d2b62208a336a3bb984b9465ef6d89a046ea22 ] This patch bypasses multi-link errors in TCP mode, allowing dlm to operate on the first tcp link. Signed-off-by: Heming Zhao Signed-off-by: David Teigland Signed-off-by: Sasha Levin --- fs/dlm/lowcomms.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 2c797eb519da..51a641822d6c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1863,8 +1863,8 @@ static int dlm_tcp_listen_validate(void) { /* We don't support multi-homed hosts */ if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); - return -EINVAL; + log_print("Detect multi-homed hosts but use only the first IP address."); + log_print("Try SCTP, if you want to enable multi-link."); } return 0; From f8ddfaab46e78e50933dfd26ce86e45a2b2316e9 Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Mon, 24 Feb 2025 19:18:19 +0100 Subject: [PATCH 256/529] um: Store full CSGSFS and SS register from mcontext [ Upstream commit cef721e0d53d2b64f2ba177c63a0dfdd7c0daf17 ] Doing this allows using registers as retrieved from an mcontext to be pushed to a process using PTRACE_SETREGS. It is not entirely clear to me why CSGSFS was masked. Doing so creates issues when using the mcontext as process state in seccomp and simply copying the register appears to work perfectly fine for ptrace. Signed-off-by: Benjamin Berg Link: https://patch.msgid.link/20250224181827.647129-2-benjamin@sipsolutions.net Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- arch/x86/um/os-Linux/mcontext.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/x86/um/os-Linux/mcontext.c b/arch/x86/um/os-Linux/mcontext.c index 49c3744cac37..81b9d1f9f4e6 100644 --- a/arch/x86/um/os-Linux/mcontext.c +++ b/arch/x86/um/os-Linux/mcontext.c @@ -26,7 +26,6 @@ void get_regs_from_mc(struct uml_pt_regs *regs, mcontext_t *mc) COPY(RIP); COPY2(EFLAGS, EFL); COPY2(CS, CSGSFS); - regs->gp[CS / sizeof(unsigned long)] &= 0xffff; - regs->gp[CS / sizeof(unsigned long)] |= 3; + regs->gp[SS / sizeof(unsigned long)] = mc->gregs[REG_CSGSFS] >> 48; #endif } From ea4d9e1a6cac9a19fb0200a3930275c0685773a0 Mon Sep 17 00:00:00 2001 From: Tiwei Bie Date: Fri, 21 Feb 2025 12:18:55 +0800 Subject: [PATCH 257/529] um: Update min_low_pfn to match changes in uml_reserved [ Upstream commit e82cf3051e6193f61e03898f8dba035199064d36 ] When uml_reserved is updated, min_low_pfn must also be updated accordingly. Otherwise, min_low_pfn will not accurately reflect the lowest available PFN. Signed-off-by: Tiwei Bie Link: https://patch.msgid.link/20250221041855.1156109-1-tiwei.btw@antgroup.com Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- arch/um/kernel/mem.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/kernel/mem.c b/arch/um/kernel/mem.c index 38d5a71a579b..f6c766b2bdf5 100644 --- a/arch/um/kernel/mem.c +++ b/arch/um/kernel/mem.c @@ -68,6 +68,7 @@ void __init mem_init(void) map_memory(brk_end, __pa(brk_end), uml_reserved - brk_end, 1, 1, 0); memblock_free((void *)brk_end, uml_reserved - brk_end); uml_reserved = brk_end; + min_low_pfn = PFN_UP(__pa(uml_reserved)); /* this will put all low memory onto the freelists */ memblock_free_all(); From 7f778cbffe807f3997426fc6bae92063bb3dd0e5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christian=20G=C3=B6ttsche?= Date: Sun, 2 Mar 2025 17:06:39 +0100 Subject: [PATCH 258/529] ext4: reorder capability check last MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1b419c889c0767a5b66d0a6c566cae491f1cb0f7 ] capable() calls refer to enabled LSMs whether to permit or deny the request. This is relevant in connection with SELinux, where a capability check results in a policy decision and by default a denial message on insufficient permission is issued. It can lead to three undesired cases: 1. A denial message is generated, even in case the operation was an unprivileged one and thus the syscall succeeded, creating noise. 2. To avoid the noise from 1. the policy writer adds a rule to ignore those denial messages, hiding future syscalls, where the task performs an actual privileged operation, leading to hidden limited functionality of that task. 3. To avoid the noise from 1. the policy writer adds a rule to permit the task the requested capability, while it does not need it, violating the principle of least privilege. Signed-off-by: Christian Göttsche Reviewed-by: Serge Hallyn Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250302160657.127253-2-cgoettsche@seltendoof.de Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/balloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index fbd0329cf254..9efe97f3721b 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -638,8 +638,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) From 1f20b8eccf01bbd3be34a4f8d7c3ee3f331ac814 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=A4kisara?= Date: Tue, 11 Mar 2025 13:25:16 +0200 Subject: [PATCH 259/529] scsi: st: Tighten the page format heuristics with MODE SELECT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8db816c6f176321e42254badd5c1a8df8bfcfdb4 ] In the days when SCSI-2 was emerging, some drives did claim SCSI-2 but did not correctly implement it. The st driver first tries MODE SELECT with the page format bit set to set the block descriptor. If not successful, the non-page format is tried. The test only tests the sense code and this triggers also from illegal parameter in the parameter list. The test is limited to "old" devices and made more strict to remove false alarms. Signed-off-by: Kai Mäkisara Link: https://lore.kernel.org/r/20250311112516.5548-4-Kai.Makisara@kolumbus.fi Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/st.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 7f107be34423..284c2cf1ae66 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3074,7 +3074,9 @@ static int st_int_ioctl(struct scsi_tape *STp, unsigned int cmd_in, unsigned lon cmd_in == MTSETDRVBUFFER || cmd_in == SET_DENS_AND_BLK) { if (cmdstatp->sense_hdr.sense_key == ILLEGAL_REQUEST && - !(STp->use_pf & PF_TESTED)) { + cmdstatp->sense_hdr.asc == 0x24 && + (STp->device)->scsi_level <= SCSI_2 && + !(STp->use_pf & PF_TESTED)) { /* Try the other possible state of Page Format if not already tried */ STp->use_pf = (STp->use_pf ^ USE_PF) | PF_TESTED; From 4c92971bede873f45746572c828187a34b3a9caa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=A4kisara?= Date: Tue, 11 Mar 2025 13:25:15 +0200 Subject: [PATCH 260/529] scsi: st: ERASE does not change tape location MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ad77cebf97bd42c93ab4e3bffd09f2b905c1959a ] The SCSI ERASE command erases from the current position onwards. Don't clear the position variables. Signed-off-by: Kai Mäkisara Link: https://lore.kernel.org/r/20250311112516.5548-3-Kai.Makisara@kolumbus.fi Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/st.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 284c2cf1ae66..3ff4e6d44db8 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -2887,7 +2887,6 @@ static int st_int_ioctl(struct scsi_tape *STp, unsigned int cmd_in, unsigned lon timeout = STp->long_timeout * 8; DEBC_printk(STp, "Erasing tape.\n"); - fileno = blkno = at_sm = 0; break; case MTSETBLK: /* Set block length */ case MTSETDENSITY: /* Set tape density */ From 6cd9c9167cee4b5493d96bb1a4ef1e0e3e11764b Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 11 Mar 2025 17:06:21 -0600 Subject: [PATCH 261/529] vfio/pci: Handle INTx IRQ_NOTCONNECTED [ Upstream commit 860be250fc32de9cb24154bf21b4e36f40925707 ] Some systems report INTx as not routed by setting pdev->irq to IRQ_NOTCONNECTED, resulting in a -ENOTCONN error when trying to setup eventfd signaling. Include this in the set of conditions for which the PIN register is virtualized to zero. Additionally consolidate vfio_pci_get_irq_count() to use this virtualized value in reporting INTx support via ioctl and sanity checking ioctl paths since pdev->irq is re-used when the device is in MSI mode. The combination of these results in both the config space of the device and the ioctl interface behaving as if the device does not support INTx. Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20250311230623.1264283-1-alex.williamson@redhat.com Signed-off-by: Alex Williamson Signed-off-by: Sasha Levin --- drivers/vfio/pci/vfio_pci_config.c | 3 ++- drivers/vfio/pci/vfio_pci_core.c | 10 +--------- drivers/vfio/pci/vfio_pci_intrs.c | 2 +- 3 files changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 7902e1ec0fef..105243d83b2d 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -1806,7 +1806,8 @@ int vfio_config_init(struct vfio_pci_core_device *vdev) cpu_to_le16(PCI_COMMAND_MEMORY); } - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx) + if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || vdev->nointx || + vdev->pdev->irq == IRQ_NOTCONNECTED) vconfig[PCI_INTERRUPT_PIN] = 0; ret = vfio_cap_init(vdev); diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index f357fd157e1e..aa362b434413 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -719,15 +719,7 @@ EXPORT_SYMBOL_GPL(vfio_pci_core_finish_enable); static int vfio_pci_get_irq_count(struct vfio_pci_core_device *vdev, int irq_type) { if (irq_type == VFIO_PCI_INTX_IRQ_INDEX) { - u8 pin; - - if (!IS_ENABLED(CONFIG_VFIO_PCI_INTX) || - vdev->nointx || vdev->pdev->is_virtfn) - return 0; - - pci_read_config_byte(vdev->pdev, PCI_INTERRUPT_PIN, &pin); - - return pin ? 1 : 0; + return vdev->vconfig[PCI_INTERRUPT_PIN] ? 1 : 0; } else if (irq_type == VFIO_PCI_MSI_IRQ_INDEX) { u8 pos; u16 flags; diff --git a/drivers/vfio/pci/vfio_pci_intrs.c b/drivers/vfio/pci/vfio_pci_intrs.c index 5cbcde32ff79..64d78944efa5 100644 --- a/drivers/vfio/pci/vfio_pci_intrs.c +++ b/drivers/vfio/pci/vfio_pci_intrs.c @@ -207,7 +207,7 @@ static int vfio_intx_enable(struct vfio_pci_core_device *vdev, if (!is_irq_none(vdev)) return -EINVAL; - if (!pdev->irq) + if (!pdev->irq || pdev->irq == IRQ_NOTCONNECTED) return -ENODEV; name = kasprintf(GFP_KERNEL_ACCOUNT, "vfio-intx(%s)", pci_name(pdev)); From a73f1ba99440a3c7c4d0cff4f3c26b3c3efd987a Mon Sep 17 00:00:00 2001 From: Mykyta Yatsenko Date: Mon, 17 Mar 2025 17:40:37 +0000 Subject: [PATCH 262/529] bpf: Return prog btf_id without capable check [ Upstream commit 07651ccda9ff10a8ca427670cdd06ce2c8e4269c ] Return prog's btf_id from bpf_prog_get_info_by_fd regardless of capable check. This patch enables scenario, when freplace program, running from user namespace, requires to query target prog's btf. Signed-off-by: Mykyta Yatsenko Signed-off-by: Andrii Nakryiko Acked-by: Yonghong Song Link: https://lore.kernel.org/bpf/20250317174039.161275-3-mykyta.yatsenko5@gmail.com Signed-off-by: Sasha Levin --- kernel/bpf/syscall.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 27fdf1b2fc46..b145f3ef3695 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -4005,6 +4005,8 @@ static int bpf_prog_get_info_by_fd(struct file *file, info.recursion_misses = stats.misses; info.verified_insns = prog->aux->verified_insns; + if (prog->aux->btf) + info.btf_id = btf_obj_id(prog->aux->btf); if (!bpf_capable()) { info.jited_prog_len = 0; @@ -4151,8 +4153,6 @@ static int bpf_prog_get_info_by_fd(struct file *file, } } - if (prog->aux->btf) - info.btf_id = btf_obj_id(prog->aux->btf); info.attach_btf_id = prog->aux->attach_btf_id; if (attach_btf) info.attach_btf_obj_id = btf_obj_id(attach_btf); From 9ff6d39f1732fd7d7c72440011c5f3627244f94d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Wed, 5 Mar 2025 23:38:41 +0100 Subject: [PATCH 263/529] tcp: reorganize tcp_in_ack_event() and tcp_count_delivered() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 149dfb31615e22271d2525f078c95ea49bc4db24 ] - Move tcp_count_delivered() earlier and split tcp_count_delivered_ce() out of it - Move tcp_in_ack_event() later - While at it, remove the inline from tcp_in_ack_event() and let the compiler to decide Accurate ECN's heuristics does not know if there is going to be ACE field based CE counter increase or not until after rtx queue has been processed. Only then the number of ACKed bytes/pkts is available. As CE or not affects presence of FLAG_ECE, that information for tcp_in_ack_event is not yet available in the old location of the call to tcp_in_ack_event(). Signed-off-by: Ilpo Järvinen Signed-off-by: Chia-Yu Chang Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv4/tcp_input.c | 56 +++++++++++++++++++++++++------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 3b81f6df829f..db1a99df29d5 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -404,6 +404,20 @@ static bool tcp_ecn_rcv_ecn_echo(const struct tcp_sock *tp, const struct tcphdr return false; } +static void tcp_count_delivered_ce(struct tcp_sock *tp, u32 ecn_count) +{ + tp->delivered_ce += ecn_count; +} + +/* Updates the delivered and delivered_ce counts */ +static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, + bool ece_ack) +{ + tp->delivered += delivered; + if (ece_ack) + tcp_count_delivered_ce(tp, delivered); +} + /* Buffer size and advertised window tuning. * * 1. Tuning sk->sk_sndbuf, when connection enters established state. @@ -1119,15 +1133,6 @@ void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) } } -/* Updates the delivered and delivered_ce counts */ -static void tcp_count_delivered(struct tcp_sock *tp, u32 delivered, - bool ece_ack) -{ - tp->delivered += delivered; - if (ece_ack) - tp->delivered_ce += delivered; -} - /* This procedure tags the retransmission queue when SACKs arrive. * * We have three tag bits: SACKED(S), RETRANS(R) and LOST(L). @@ -3783,12 +3788,23 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) } } -static inline void tcp_in_ack_event(struct sock *sk, u32 flags) +static void tcp_in_ack_event(struct sock *sk, int flag) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (icsk->icsk_ca_ops->in_ack_event) - icsk->icsk_ca_ops->in_ack_event(sk, flags); + if (icsk->icsk_ca_ops->in_ack_event) { + u32 ack_ev_flags = 0; + + if (flag & FLAG_WIN_UPDATE) + ack_ev_flags |= CA_ACK_WIN_UPDATE; + if (flag & FLAG_SLOWPATH) { + ack_ev_flags |= CA_ACK_SLOWPATH; + if (flag & FLAG_ECE) + ack_ev_flags |= CA_ACK_ECE; + } + + icsk->icsk_ca_ops->in_ack_event(sk, ack_ev_flags); + } } /* Congestion control has updated the cwnd already. So if we're in @@ -3905,12 +3921,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_snd_una_update(tp, ack); flag |= FLAG_WIN_UPDATE; - tcp_in_ack_event(sk, CA_ACK_WIN_UPDATE); - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPHPACKS); } else { - u32 ack_ev_flags = CA_ACK_SLOWPATH; - if (ack_seq != TCP_SKB_CB(skb)->end_seq) flag |= FLAG_DATA; else @@ -3922,19 +3934,12 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una, &sack_state); - if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) { + if (tcp_ecn_rcv_ecn_echo(tp, tcp_hdr(skb))) flag |= FLAG_ECE; - ack_ev_flags |= CA_ACK_ECE; - } if (sack_state.sack_delivered) tcp_count_delivered(tp, sack_state.sack_delivered, flag & FLAG_ECE); - - if (flag & FLAG_WIN_UPDATE) - ack_ev_flags |= CA_ACK_WIN_UPDATE; - - tcp_in_ack_event(sk, ack_ev_flags); } /* This is a deviation from RFC3168 since it states that: @@ -3961,6 +3966,8 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); + tcp_in_ack_event(sk, flag); + if (tp->tlp_high_seq) tcp_process_tlp_ack(sk, ack, flag); @@ -3992,6 +3999,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) return 1; no_queue: + tcp_in_ack_event(sk, flag); /* If data was DSACKed, see if we can undo a cwnd reduction. */ if (flag & FLAG_DSACKING_ACK) { tcp_fastretrans_alert(sk, prior_snd_una, num_dupack, &flag, From 7d9682b01519a6e99f6c79a4e8e5c0e77d834913 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Thu, 6 Mar 2025 22:42:41 +0100 Subject: [PATCH 264/529] rtc: rv3032: fix EERD location [ Upstream commit b0f9cb4a0706b0356e84d67e48500b77b343debe ] EERD is bit 2 in CTRL1 Link: https://lore.kernel.org/r/20250306214243.1167692-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni Signed-off-by: Sasha Levin --- drivers/rtc/rtc-rv3032.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/rtc/rtc-rv3032.c b/drivers/rtc/rtc-rv3032.c index c3bee305eacc..9c85ecd9afb8 100644 --- a/drivers/rtc/rtc-rv3032.c +++ b/drivers/rtc/rtc-rv3032.c @@ -69,7 +69,7 @@ #define RV3032_CLKOUT2_FD_MSK GENMASK(6, 5) #define RV3032_CLKOUT2_OS BIT(7) -#define RV3032_CTRL1_EERD BIT(3) +#define RV3032_CTRL1_EERD BIT(2) #define RV3032_CTRL1_WADA BIT(5) #define RV3032_CTRL2_STOP BIT(0) From 669a53eeccc8dfec4d21dceba98593a28f036d3d Mon Sep 17 00:00:00 2001 From: Mika Westerberg Date: Wed, 5 Mar 2025 14:56:20 +0200 Subject: [PATCH 265/529] thunderbolt: Do not add non-active NVM if NVM upgrade is disabled for retimer [ Upstream commit ad79c278e478ca8c1a3bf8e7a0afba8f862a48a1 ] This is only used to write a new NVM in order to upgrade the retimer firmware. It does not make sense to expose it if upgrade is disabled. This also makes it consistent with the router NVM upgrade. Signed-off-by: Mika Westerberg Signed-off-by: Sasha Levin --- drivers/thunderbolt/retimer.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/thunderbolt/retimer.c b/drivers/thunderbolt/retimer.c index 5bd5c22a5085..d2038337ea03 100644 --- a/drivers/thunderbolt/retimer.c +++ b/drivers/thunderbolt/retimer.c @@ -89,9 +89,11 @@ static int tb_retimer_nvm_add(struct tb_retimer *rt) if (ret) goto err_nvm; - ret = tb_nvm_add_non_active(nvm, nvm_write); - if (ret) - goto err_nvm; + if (!rt->no_nvm_upgrade) { + ret = tb_nvm_add_non_active(nvm, nvm_write); + if (ret) + goto err_nvm; + } rt->nvm = nvm; return 0; From e78a34da61ab3d8ac0733aad2e083f0b2e5da45d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=ADcolas=20F=2E=20R=2E=20A=2E=20Prado?= Date: Thu, 6 Mar 2025 16:52:17 -0300 Subject: [PATCH 266/529] ASoC: mediatek: mt6359: Add stub for mt6359_accdet_enable_jack_detect MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 0116a7d84b32537a10d9bea1fd1bfc06577ef527 ] Add a stub for mt6359_accdet_enable_jack_detect() to prevent linker failures in the machine sound drivers calling it when CONFIG_SND_SOC_MT6359_ACCDET is not enabled. Suggested-by: AngeloGioacchino Del Regno Signed-off-by: Nícolas F. R. A. Prado Link: https://patch.msgid.link/20250306-mt8188-accdet-v3-3-7828e835ff4b@collabora.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/mt6359-accdet.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/sound/soc/codecs/mt6359-accdet.h b/sound/soc/codecs/mt6359-accdet.h index c234f2f4276a..78ada3a5bfae 100644 --- a/sound/soc/codecs/mt6359-accdet.h +++ b/sound/soc/codecs/mt6359-accdet.h @@ -123,6 +123,15 @@ struct mt6359_accdet { struct workqueue_struct *jd_workqueue; }; +#if IS_ENABLED(CONFIG_SND_SOC_MT6359_ACCDET) int mt6359_accdet_enable_jack_detect(struct snd_soc_component *component, struct snd_soc_jack *jack); +#else +static inline int +mt6359_accdet_enable_jack_detect(struct snd_soc_component *component, + struct snd_soc_jack *jack) +{ + return -EOPNOTSUPP; +} +#endif #endif From df5a4416c04a94cdda79b40509fdb201dcfdf533 Mon Sep 17 00:00:00 2001 From: Seyediman Seyedarab Date: Sat, 1 Mar 2025 17:21:37 -0500 Subject: [PATCH 267/529] kbuild: fix argument parsing in scripts/config [ Upstream commit f757f6011c92b5a01db742c39149bed9e526478f ] The script previously assumed --file was always the first argument, which caused issues when it appeared later. This patch updates the parsing logic to scan all arguments to find --file, sets the config file correctly, and resets the argument list with the remaining commands. It also fixes --refresh to respect --file by passing KCONFIG_CONFIG=$FN to make oldconfig. Signed-off-by: Seyediman Seyedarab Signed-off-by: Masahiro Yamada Signed-off-by: Sasha Levin --- scripts/config | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/scripts/config b/scripts/config index ff88e2faefd3..ea475c07de28 100755 --- a/scripts/config +++ b/scripts/config @@ -32,6 +32,7 @@ commands: Disable option directly after other option --module-after|-M beforeopt option Turn option into module directly after other option + --refresh Refresh the config using old settings commands can be repeated multiple times @@ -124,16 +125,22 @@ undef_var() { txt_delete "^# $name is not set" "$FN" } -if [ "$1" = "--file" ]; then - FN="$2" - if [ "$FN" = "" ] ; then - usage +FN=.config +CMDS=() +while [[ $# -gt 0 ]]; do + if [ "$1" = "--file" ]; then + if [ "$2" = "" ]; then + usage + fi + FN="$2" + shift 2 + else + CMDS+=("$1") + shift fi - shift 2 -else - FN=.config -fi +done +set -- "${CMDS[@]}" if [ "$1" = "" ] ; then usage fi @@ -217,9 +224,8 @@ while [ "$1" != "" ] ; do set_var "${CONFIG_}$B" "${CONFIG_}$B=m" "${CONFIG_}$A" ;; - # undocumented because it ignores --file (fixme) --refresh) - yes "" | make oldconfig + yes "" | make oldconfig KCONFIG_CONFIG=$FN ;; *) From 3b5cc6e49c69d8e4dfc19ebc6386a1310ff19eb2 Mon Sep 17 00:00:00 2001 From: Shashank Gupta Date: Wed, 5 Mar 2025 13:27:05 +0530 Subject: [PATCH 268/529] crypto: octeontx2 - suppress auth failure screaming due to negative tests [ Upstream commit 64b7871522a4cba99d092e1c849d6f9092868aaa ] This patch addresses an issue where authentication failures were being erroneously reported due to negative test failures in the "ccm(aes)" selftest. pr_debug suppress unnecessary screaming of these tests. Signed-off-by: Shashank Gupta Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/marvell/octeontx2/otx2_cptvf_reqmgr.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/crypto/marvell/octeontx2/otx2_cptvf_reqmgr.c b/drivers/crypto/marvell/octeontx2/otx2_cptvf_reqmgr.c index 811ded72ce5f..798bb40fed68 100644 --- a/drivers/crypto/marvell/octeontx2/otx2_cptvf_reqmgr.c +++ b/drivers/crypto/marvell/octeontx2/otx2_cptvf_reqmgr.c @@ -410,9 +410,10 @@ static int cpt_process_ccode(struct otx2_cptlfs_info *lfs, break; } - dev_err(&pdev->dev, - "Request failed with software error code 0x%x\n", - cpt_status->s.uc_compcode); + pr_debug("Request failed with software error code 0x%x: algo = %s driver = %s\n", + cpt_status->s.uc_compcode, + info->req->areq->tfm->__crt_alg->cra_name, + info->req->areq->tfm->__crt_alg->cra_driver_name); otx2_cpt_dump_sg_list(pdev, info->req); break; } From 6d974bd69249d74899a62f9143f51468a0d18683 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 14 Mar 2025 13:51:32 +0100 Subject: [PATCH 269/529] dm: restrict dm device size to 2^63-512 bytes [ Upstream commit 45fc728515c14f53f6205789de5bfd72a95af3b8 ] The devices with size >= 2^63 bytes can't be used reliably by userspace because the type off_t is a signed 64-bit integer. Therefore, we limit the maximum size of a device mapper device to 2^63-512 bytes. Signed-off-by: Mikulas Patocka Signed-off-by: Sasha Levin --- drivers/md/dm-table.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index a20cf54d12dc..8b23b8bc5a03 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -671,6 +671,10 @@ int dm_table_add_target(struct dm_table *t, const char *type, DMERR("%s: zero-length target", dm_device_name(t->md)); return -EINVAL; } + if (start + len < start || start + len > LLONG_MAX >> SECTOR_SHIFT) { + DMERR("%s: too large device", dm_device_name(t->md)); + return -EINVAL; + } ti->type = dm_get_target_type(type); if (!ti->type) { From 9af7628ee64b18d39c0a31ac260bd3c9e064251d Mon Sep 17 00:00:00 2001 From: Guangguan Wang Date: Tue, 4 Mar 2025 20:43:04 +0800 Subject: [PATCH 270/529] net/smc: use the correct ndev to find pnetid by pnetid table [ Upstream commit bfc6c67ec2d64d0ca4e5cc3e1ac84298a10b8d62 ] When using smc_pnet in SMC, it will only search the pnetid in the base_ndev of the netdev hierarchy(both HW PNETID and User-defined sw pnetid). This may not work for some scenarios when using SMC in container on cloud environment. In container, there have choices of different container network, such as directly using host network, virtual network IPVLAN, veth, etc. Different choices of container network have different netdev hierarchy. Examples of netdev hierarchy show below. (eth0 and eth1 in host below is the netdev directly related to the physical device). _______________________________ | _________________ | | |POD | | | | | | | | eth0_________ | | | |____| |__| | | | | | | | | | | eth1|base_ndev| eth0_______ | | | | | RDMA || | host |_________| |_______|| --------------------------------- netdev hierarchy if directly using host network ________________________________ | _________________ | | |POD __________ | | | | |upper_ndev| | | | |eth0|__________| | | | |_______|_________| | | |lower netdev | | __|______ | | eth1| | eth0_______ | | |base_ndev| | RDMA || | host |_________| |_______|| --------------------------------- netdev hierarchy if using IPVLAN _______________________________ | _____________________ | | |POD _________ | | | | |base_ndev|| | | |eth0(veth)|_________|| | | |____________|________| | | |pairs | | _______|_ | | | | eth0_______ | | veth|base_ndev| | RDMA || | |_________| |_______|| | _________ | | eth1|base_ndev| | | host |_________| | --------------------------------- netdev hierarchy if using veth Due to some reasons, the eth1 in host is not RDMA attached netdevice, pnetid is needed to map the eth1(in host) with RDMA device so that POD can do SMC-R. Because the eth1(in host) is managed by CNI plugin(such as Terway, network management plugin in container environment), and in cloud environment the eth(in host) can dynamically be inserted by CNI when POD create and dynamically be removed by CNI when POD destroy and no POD related to the eth(in host) anymore. It is hard to config the pnetid to the eth1(in host). But it is easy to config the pnetid to the netdevice which can be seen in POD. When do SMC-R, both the container directly using host network and the container using veth network can successfully match the RDMA device, because the configured pnetid netdev is a base_ndev. But the container using IPVLAN can not successfully match the RDMA device and 0x03030000 fallback happens, because the configured pnetid netdev is not a base_ndev. Additionally, if config pnetid to the eth1(in host) also can not work for matching RDMA device when using veth network and doing SMC-R in POD. To resolve the problems list above, this patch extends to search user -defined sw pnetid in the clc handshake ndev when no pnetid can be found in the base_ndev, and the base_ndev take precedence over ndev for backward compatibility. This patch also can unify the pnetid setup of different network choices list above in container(Config user-defined sw pnetid in the netdevice can be seen in POD). Signed-off-by: Guangguan Wang Reviewed-by: Wenjia Zhang Reviewed-by: Halil Pasic Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/smc/smc_pnet.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/net/smc/smc_pnet.c b/net/smc/smc_pnet.c index 399314cfab90..af12e0215274 100644 --- a/net/smc/smc_pnet.c +++ b/net/smc/smc_pnet.c @@ -1080,14 +1080,16 @@ static void smc_pnet_find_roce_by_pnetid(struct net_device *ndev, struct smc_init_info *ini) { u8 ndev_pnetid[SMC_MAX_PNETID_LEN]; + struct net_device *base_ndev; struct net *net; - ndev = pnet_find_base_ndev(ndev); + base_ndev = pnet_find_base_ndev(ndev); net = dev_net(ndev); - if (smc_pnetid_by_dev_port(ndev->dev.parent, ndev->dev_port, + if (smc_pnetid_by_dev_port(base_ndev->dev.parent, base_ndev->dev_port, ndev_pnetid) && + smc_pnet_find_ndev_pnetid_by_table(base_ndev, ndev_pnetid) && smc_pnet_find_ndev_pnetid_by_table(ndev, ndev_pnetid)) { - smc_pnet_find_rdma_dev(ndev, ini); + smc_pnet_find_rdma_dev(base_ndev, ini); return; /* pnetid could not be determined */ } _smc_pnet_find_roce_by_pnetid(ndev_pnetid, ini, NULL, net); From 02d850de9495699f2029886a6a69f0ed07a39b84 Mon Sep 17 00:00:00 2001 From: Frediano Ziglio Date: Thu, 27 Feb 2025 14:50:15 +0000 Subject: [PATCH 271/529] xen: Add support for XenServer 6.1 platform device [ Upstream commit 2356f15caefc0cc63d9cc5122641754f76ef9b25 ] On XenServer on Windows machine a platform device with ID 2 instead of 1 is used. This device is mainly identical to device 1 but due to some Windows update behaviour it was decided to use a device with a different ID. This causes compatibility issues with Linux which expects, if Xen is detected, to find a Xen platform device (5853:0001) otherwise code will crash due to some missing initialization (specifically grant tables). Specifically from dmesg RIP: 0010:gnttab_expand+0x29/0x210 Code: 90 0f 1f 44 00 00 55 31 d2 48 89 e5 41 57 41 56 41 55 41 89 fd 41 54 53 48 83 ec 10 48 8b 05 7e 9a 49 02 44 8b 35 a7 9a 49 02 <8b> 48 04 8d 44 39 ff f7 f1 45 8d 24 06 89 c3 e8 43 fe ff ff 44 39 RSP: 0000:ffffba34c01fbc88 EFLAGS: 00010086 ... The device 2 is presented by Xapi adding device specification to Qemu command line. Signed-off-by: Frediano Ziglio Acked-by: Juergen Gross Message-ID: <20250227145016.25350-1-frediano.ziglio@cloud.com> Signed-off-by: Juergen Gross Signed-off-by: Sasha Levin --- drivers/xen/platform-pci.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/xen/platform-pci.c b/drivers/xen/platform-pci.c index 544d3f9010b9..1db82da56db6 100644 --- a/drivers/xen/platform-pci.c +++ b/drivers/xen/platform-pci.c @@ -26,6 +26,8 @@ #define DRV_NAME "xen-platform-pci" +#define PCI_DEVICE_ID_XEN_PLATFORM_XS61 0x0002 + static unsigned long platform_mmio; static unsigned long platform_mmio_alloc; static unsigned long platform_mmiolen; @@ -174,6 +176,8 @@ pci_out: static const struct pci_device_id platform_pci_tbl[] = { {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM, PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, + {PCI_VENDOR_ID_XEN, PCI_DEVICE_ID_XEN_PLATFORM_XS61, + PCI_ANY_ID, PCI_ANY_ID, 0, 0, 0}, {0,} }; From 7fcfed72a51b8cfce7c9c654f4f8399dffb067c5 Mon Sep 17 00:00:00 2001 From: Prathamesh Shete Date: Wed, 5 Mar 2025 16:19:39 +0530 Subject: [PATCH 272/529] pinctrl-tegra: Restore SFSEL bit when freeing pins [ Upstream commit c12bfa0fee65940b10ff5187349f76c6f6b1df9c ] Each pin can be configured as a Special Function IO (SFIO) or GPIO, where the SFIO enables the pin to operate in alternative modes such as I2C, SPI, etc. The current implementation sets all the pins back to SFIO mode even if they were initially in GPIO mode. This can cause glitches on the pins when pinctrl_gpio_free() is called. Avoid these undesired glitches by storing the pin's SFIO/GPIO state on GPIO request and restoring it on GPIO free. Signed-off-by: Prathamesh Shete Link: https://lore.kernel.org/20250305104939.15168-2-pshete@nvidia.com Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin --- drivers/pinctrl/tegra/pinctrl-tegra.c | 59 +++++++++++++++++++++++---- drivers/pinctrl/tegra/pinctrl-tegra.h | 6 +++ 2 files changed, 57 insertions(+), 8 deletions(-) diff --git a/drivers/pinctrl/tegra/pinctrl-tegra.c b/drivers/pinctrl/tegra/pinctrl-tegra.c index 30341c43da59..ba7bcc876e30 100644 --- a/drivers/pinctrl/tegra/pinctrl-tegra.c +++ b/drivers/pinctrl/tegra/pinctrl-tegra.c @@ -278,8 +278,8 @@ static int tegra_pinctrl_set_mux(struct pinctrl_dev *pctldev, return 0; } -static const struct tegra_pingroup *tegra_pinctrl_get_group(struct pinctrl_dev *pctldev, - unsigned int offset) +static int tegra_pinctrl_get_group_index(struct pinctrl_dev *pctldev, + unsigned int offset) { struct tegra_pmx *pmx = pinctrl_dev_get_drvdata(pctldev); unsigned int group, num_pins, j; @@ -292,12 +292,35 @@ static const struct tegra_pingroup *tegra_pinctrl_get_group(struct pinctrl_dev * continue; for (j = 0; j < num_pins; j++) { if (offset == pins[j]) - return &pmx->soc->groups[group]; + return group; } } - dev_err(pctldev->dev, "Pingroup not found for pin %u\n", offset); - return NULL; + return -EINVAL; +} + +static const struct tegra_pingroup *tegra_pinctrl_get_group(struct pinctrl_dev *pctldev, + unsigned int offset, + int group_index) +{ + struct tegra_pmx *pmx = pinctrl_dev_get_drvdata(pctldev); + + if (group_index < 0 || group_index > pmx->soc->ngroups) + return NULL; + + return &pmx->soc->groups[group_index]; +} + +static struct tegra_pingroup_config *tegra_pinctrl_get_group_config(struct pinctrl_dev *pctldev, + unsigned int offset, + int group_index) +{ + struct tegra_pmx *pmx = pinctrl_dev_get_drvdata(pctldev); + + if (group_index < 0) + return NULL; + + return &pmx->pingroup_configs[group_index]; } static int tegra_pinctrl_gpio_request_enable(struct pinctrl_dev *pctldev, @@ -306,12 +329,15 @@ static int tegra_pinctrl_gpio_request_enable(struct pinctrl_dev *pctldev, { struct tegra_pmx *pmx = pinctrl_dev_get_drvdata(pctldev); const struct tegra_pingroup *group; + struct tegra_pingroup_config *config; + int group_index; u32 value; if (!pmx->soc->sfsel_in_mux) return 0; - group = tegra_pinctrl_get_group(pctldev, offset); + group_index = tegra_pinctrl_get_group_index(pctldev, offset); + group = tegra_pinctrl_get_group(pctldev, offset, group_index); if (!group) return -EINVAL; @@ -319,7 +345,11 @@ static int tegra_pinctrl_gpio_request_enable(struct pinctrl_dev *pctldev, if (group->mux_reg < 0 || group->sfsel_bit < 0) return -EINVAL; + config = tegra_pinctrl_get_group_config(pctldev, offset, group_index); + if (!config) + return -EINVAL; value = pmx_readl(pmx, group->mux_bank, group->mux_reg); + config->is_sfsel = (value & BIT(group->sfsel_bit)) != 0; value &= ~BIT(group->sfsel_bit); pmx_writel(pmx, value, group->mux_bank, group->mux_reg); @@ -332,12 +362,15 @@ static void tegra_pinctrl_gpio_disable_free(struct pinctrl_dev *pctldev, { struct tegra_pmx *pmx = pinctrl_dev_get_drvdata(pctldev); const struct tegra_pingroup *group; + struct tegra_pingroup_config *config; + int group_index; u32 value; if (!pmx->soc->sfsel_in_mux) return; - group = tegra_pinctrl_get_group(pctldev, offset); + group_index = tegra_pinctrl_get_group_index(pctldev, offset); + group = tegra_pinctrl_get_group(pctldev, offset, group_index); if (!group) return; @@ -345,8 +378,12 @@ static void tegra_pinctrl_gpio_disable_free(struct pinctrl_dev *pctldev, if (group->mux_reg < 0 || group->sfsel_bit < 0) return; + config = tegra_pinctrl_get_group_config(pctldev, offset, group_index); + if (!config) + return; value = pmx_readl(pmx, group->mux_bank, group->mux_reg); - value |= BIT(group->sfsel_bit); + if (config->is_sfsel) + value |= BIT(group->sfsel_bit); pmx_writel(pmx, value, group->mux_bank, group->mux_reg); } @@ -799,6 +836,12 @@ int tegra_pinctrl_probe(struct platform_device *pdev, pmx->dev = &pdev->dev; pmx->soc = soc_data; + pmx->pingroup_configs = devm_kcalloc(&pdev->dev, + pmx->soc->ngroups, sizeof(*pmx->pingroup_configs), + GFP_KERNEL); + if (!pmx->pingroup_configs) + return -ENOMEM; + /* * Each mux group will appear in 4 functions' list of groups. * This over-allocates slightly, since not all groups are mux groups. diff --git a/drivers/pinctrl/tegra/pinctrl-tegra.h b/drivers/pinctrl/tegra/pinctrl-tegra.h index f8269858eb78..ec5198d391ea 100644 --- a/drivers/pinctrl/tegra/pinctrl-tegra.h +++ b/drivers/pinctrl/tegra/pinctrl-tegra.h @@ -8,6 +8,10 @@ #ifndef __PINMUX_TEGRA_H__ #define __PINMUX_TEGRA_H__ +struct tegra_pingroup_config { + bool is_sfsel; +}; + struct tegra_pmx { struct device *dev; struct pinctrl_dev *pctl; @@ -18,6 +22,8 @@ struct tegra_pmx { int nbanks; void __iomem **regs; u32 *backup_regs; + /* Array of size soc->ngroups */ + struct tegra_pingroup_config *pingroup_configs; }; enum tegra_pinconf_param { From e472c3d8ad57abe51c4439fc78f63ad7a6a5cde7 Mon Sep 17 00:00:00 2001 From: Ryan Walklin Date: Sat, 15 Feb 2025 11:02:25 +1300 Subject: [PATCH 273/529] ASoC: sun4i-codec: support hp-det-gpios property [ Upstream commit a149377c033afe6557c50892ebbfc0e8b7e2e253 ] Add support for GPIO headphone detection with the hp-det-gpios property. In order for this to properly disable the path upon removal of headphones, the output must be labelled Headphone which is a common sink in the driver. Describe a headphone jack and detection GPIO in the driver, check for a corresponding device tree node, and enable jack detection in a new machine init function if described. Signed-off-by: Chris Morgan Signed-off-by: Ryan Walklin -- Changelog v1..v2: - Separate DAPM changes into separate patch and add rationale. Tested-by: Philippe Simons Link: https://patch.msgid.link/20250214220247.10810-4-ryan@testtoast.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/sunxi/sun4i-codec.c | 53 +++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/sound/soc/sunxi/sun4i-codec.c b/sound/soc/sunxi/sun4i-codec.c index 835dc3404367..1e310958f8c0 100644 --- a/sound/soc/sunxi/sun4i-codec.c +++ b/sound/soc/sunxi/sun4i-codec.c @@ -25,6 +25,7 @@ #include #include +#include #include #include #include @@ -239,6 +240,7 @@ struct sun4i_codec { struct clk *clk_module; struct reset_control *rst; struct gpio_desc *gpio_pa; + struct gpio_desc *gpio_hp; /* ADC_FIFOC register is at different offset on different SoCs */ struct regmap_field *reg_adc_fifoc; @@ -1273,6 +1275,49 @@ static struct snd_soc_dai_driver dummy_cpu_dai = { }, }; +static struct snd_soc_jack sun4i_headphone_jack; + +static struct snd_soc_jack_pin sun4i_headphone_jack_pins[] = { + { .pin = "Headphone", .mask = SND_JACK_HEADPHONE }, +}; + +static struct snd_soc_jack_gpio sun4i_headphone_jack_gpio = { + .name = "hp-det", + .report = SND_JACK_HEADPHONE, + .debounce_time = 150, +}; + +static int sun4i_codec_machine_init(struct snd_soc_pcm_runtime *rtd) +{ + struct snd_soc_card *card = rtd->card; + struct sun4i_codec *scodec = snd_soc_card_get_drvdata(card); + int ret; + + if (scodec->gpio_hp) { + ret = snd_soc_card_jack_new_pins(card, "Headphone Jack", + SND_JACK_HEADPHONE, + &sun4i_headphone_jack, + sun4i_headphone_jack_pins, + ARRAY_SIZE(sun4i_headphone_jack_pins)); + if (ret) { + dev_err(rtd->dev, + "Headphone jack creation failed: %d\n", ret); + return ret; + } + + sun4i_headphone_jack_gpio.desc = scodec->gpio_hp; + ret = snd_soc_jack_add_gpios(&sun4i_headphone_jack, 1, + &sun4i_headphone_jack_gpio); + + if (ret) { + dev_err(rtd->dev, "Headphone GPIO not added: %d\n", ret); + return ret; + } + } + + return 0; +} + static struct snd_soc_dai_link *sun4i_codec_create_link(struct device *dev, int *num_links) { @@ -1298,6 +1343,7 @@ static struct snd_soc_dai_link *sun4i_codec_create_link(struct device *dev, link->codecs->name = dev_name(dev); link->platforms->name = dev_name(dev); link->dai_fmt = SND_SOC_DAIFMT_I2S; + link->init = sun4i_codec_machine_init; *num_links = 1; @@ -1738,6 +1784,13 @@ static int sun4i_codec_probe(struct platform_device *pdev) return ret; } + scodec->gpio_hp = devm_gpiod_get_optional(&pdev->dev, "hp-det", GPIOD_IN); + if (IS_ERR(scodec->gpio_hp)) { + ret = PTR_ERR(scodec->gpio_hp); + dev_err_probe(&pdev->dev, ret, "Failed to get hp-det gpio\n"); + return ret; + } + /* reg_field setup */ scodec->reg_adc_fifoc = devm_regmap_field_alloc(&pdev->dev, scodec->regmap, From 6944537397b31c58b3060f94dbbbe5eaedcf8e75 Mon Sep 17 00:00:00 2001 From: Baokun Li Date: Wed, 22 Jan 2025 19:05:27 +0800 Subject: [PATCH 274/529] ext4: reject the 'data_err=abort' option in nojournal mode [ Upstream commit 26343ca0df715097065b02a6cddb4a029d5b9327 ] data_err=abort aborts the journal on I/O errors. However, this option is meaningless if journal is disabled, so it is rejected in nojournal mode to reduce unnecessary checks. Also, this option is ignored upon remount. Signed-off-by: Baokun Li Reviewed-by: Zhang Yi Reviewed-by: Jan Kara Link: https://patch.msgid.link/20250122110533.4116662-4-libaokun@huaweicloud.com Signed-off-by: Theodore Ts'o Signed-off-by: Sasha Levin --- fs/ext4/super.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7f0231b34905..f829f989f2b5 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2741,6 +2741,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -5318,6 +5325,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) "data=, fs mounted w/o journal"); goto failed_mount3a; } + if (test_opt(sb, DATA_ERR_ABORT)) { + ext4_msg(sb, KERN_ERR, + "can't mount with data_err=abort, fs mounted w/o journal"); + goto failed_mount3a; + } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); From 8d6795397866ddbecb79b2e795721d9d65b89fb4 Mon Sep 17 00:00:00 2001 From: Maher Sanalla Date: Wed, 26 Feb 2025 15:54:13 +0200 Subject: [PATCH 275/529] RDMA/uverbs: Propagate errors from rdma_lookup_get_uobject() [ Upstream commit 81f8f7454ad9e0bf95efdec6542afdc9a6ab1e24 ] Currently, the IB uverbs API calls uobj_get_uobj_read(), which in turn uses the rdma_lookup_get_uobject() helper to retrieve user objects. In case of failure, uobj_get_uobj_read() returns NULL, overriding the error code from rdma_lookup_get_uobject(). The IB uverbs API then translates this NULL to -EINVAL, masking the actual error and complicating debugging. For example, applications calling ibv_modify_qp that fails with EBUSY when retrieving the QP uobject will see the overridden error code EINVAL instead, masking the actual error. Furthermore, based on rdma-core commit: "2a22f1ced5f3 ("Merge pull request #1568 from jakemoroni/master")" Kernel's IB uverbs return values are either ignored and passed on as is to application or overridden with other errnos in a few cases. Thus, to improve error reporting and debuggability, propagate the original error from rdma_lookup_get_uobject() instead of replacing it with EINVAL. Signed-off-by: Maher Sanalla Link: https://patch.msgid.link/64f9d3711b183984e939962c2f83383904f97dfb.1740577869.git.leon@kernel.org Signed-off-by: Leon Romanovsky Signed-off-by: Sasha Levin --- drivers/infiniband/core/uverbs_cmd.c | 144 ++++++++++++++------------- include/rdma/uverbs_std_types.h | 2 +- 2 files changed, 77 insertions(+), 69 deletions(-) diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index c6053e82ecf6..33e2fe0facd5 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -718,8 +718,8 @@ static int ib_uverbs_reg_mr(struct uverbs_attr_bundle *attrs) goto err_free; pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_free; } @@ -809,8 +809,8 @@ static int ib_uverbs_rereg_mr(struct uverbs_attr_bundle *attrs) if (cmd.flags & IB_MR_REREG_PD) { new_pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!new_pd) { - ret = -EINVAL; + if (IS_ERR(new_pd)) { + ret = PTR_ERR(new_pd); goto put_uobjs; } } else { @@ -919,8 +919,8 @@ static int ib_uverbs_alloc_mw(struct uverbs_attr_bundle *attrs) return PTR_ERR(uobj); pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_free; } @@ -1127,8 +1127,8 @@ static int ib_uverbs_resize_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ret = cq->device->ops.resize_cq(cq, cmd.cqe, &attrs->driver_udata); if (ret) @@ -1189,8 +1189,8 @@ static int ib_uverbs_poll_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); /* we copy a struct ib_uverbs_poll_cq_resp to user space */ header_ptr = attrs->ucore.outbuf; @@ -1238,8 +1238,8 @@ static int ib_uverbs_req_notify_cq(struct uverbs_attr_bundle *attrs) return ret; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ib_req_notify_cq(cq, cmd.solicited_only ? IB_CQ_SOLICITED : IB_CQ_NEXT_COMP); @@ -1321,8 +1321,8 @@ static int create_qp(struct uverbs_attr_bundle *attrs, ind_tbl = uobj_get_obj_read(rwq_ind_table, UVERBS_OBJECT_RWQ_IND_TBL, cmd->rwq_ind_tbl_handle, attrs); - if (!ind_tbl) { - ret = -EINVAL; + if (IS_ERR(ind_tbl)) { + ret = PTR_ERR(ind_tbl); goto err_put; } @@ -1360,8 +1360,10 @@ static int create_qp(struct uverbs_attr_bundle *attrs, if (cmd->is_srq) { srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd->srq_handle, attrs); - if (!srq || srq->srq_type == IB_SRQT_XRC) { - ret = -EINVAL; + if (IS_ERR(srq) || + srq->srq_type == IB_SRQT_XRC) { + ret = IS_ERR(srq) ? PTR_ERR(srq) : + -EINVAL; goto err_put; } } @@ -1371,23 +1373,29 @@ static int create_qp(struct uverbs_attr_bundle *attrs, rcq = uobj_get_obj_read( cq, UVERBS_OBJECT_CQ, cmd->recv_cq_handle, attrs); - if (!rcq) { - ret = -EINVAL; + if (IS_ERR(rcq)) { + ret = PTR_ERR(rcq); goto err_put; } } } } - if (has_sq) + if (has_sq) { scq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->send_cq_handle, attrs); + if (IS_ERR(scq)) { + ret = PTR_ERR(scq); + goto err_put; + } + } + if (!ind_tbl && cmd->qp_type != IB_QPT_XRC_INI) rcq = rcq ?: scq; pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); - if (!pd || (!scq && has_sq)) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_put; } @@ -1482,18 +1490,18 @@ static int create_qp(struct uverbs_attr_bundle *attrs, err_put: if (!IS_ERR(xrcd_uobj)) uobj_put_read(xrcd_uobj); - if (pd) + if (!IS_ERR_OR_NULL(pd)) uobj_put_obj_read(pd); - if (scq) + if (!IS_ERR_OR_NULL(scq)) rdma_lookup_put_uobject(&scq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (rcq && rcq != scq) + if (!IS_ERR_OR_NULL(rcq) && rcq != scq) rdma_lookup_put_uobject(&rcq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (srq) + if (!IS_ERR_OR_NULL(srq)) rdma_lookup_put_uobject(&srq->uobject->uevent.uobject, UVERBS_LOOKUP_READ); - if (ind_tbl) + if (!IS_ERR_OR_NULL(ind_tbl)) uobj_put_obj_read(ind_tbl); uobj_alloc_abort(&obj->uevent.uobject, attrs); @@ -1655,8 +1663,8 @@ static int ib_uverbs_query_qp(struct uverbs_attr_bundle *attrs) } qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -1761,8 +1769,8 @@ static int modify_qp(struct uverbs_attr_bundle *attrs, qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd->base.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2027,8 +2035,8 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) return -ENOMEM; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2065,9 +2073,9 @@ static int ib_uverbs_post_send(struct uverbs_attr_bundle *attrs) ud->ah = uobj_get_obj_read(ah, UVERBS_OBJECT_AH, user_wr->wr.ud.ah, attrs); - if (!ud->ah) { + if (IS_ERR(ud->ah)) { + ret = PTR_ERR(ud->ah); kfree(ud); - ret = -EINVAL; goto out_put; } ud->remote_qpn = user_wr->wr.ud.remote_qpn; @@ -2304,8 +2312,8 @@ static int ib_uverbs_post_recv(struct uverbs_attr_bundle *attrs) return PTR_ERR(wr); qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - ret = -EINVAL; + if (IS_ERR(qp)) { + ret = PTR_ERR(qp); goto out; } @@ -2355,8 +2363,8 @@ static int ib_uverbs_post_srq_recv(struct uverbs_attr_bundle *attrs) return PTR_ERR(wr); srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) { - ret = -EINVAL; + if (IS_ERR(srq)) { + ret = PTR_ERR(srq); goto out; } @@ -2412,8 +2420,8 @@ static int ib_uverbs_create_ah(struct uverbs_attr_bundle *attrs) } pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err; } @@ -2482,8 +2490,8 @@ static int ib_uverbs_attach_mcast(struct uverbs_attr_bundle *attrs) return ret; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) - return -EINVAL; + if (IS_ERR(qp)) + return PTR_ERR(qp); obj = qp->uobject; @@ -2532,8 +2540,8 @@ static int ib_uverbs_detach_mcast(struct uverbs_attr_bundle *attrs) return ret; qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) - return -EINVAL; + if (IS_ERR(qp)) + return PTR_ERR(qp); obj = qp->uobject; mutex_lock(&obj->mcast_lock); @@ -2667,8 +2675,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, UVERBS_OBJECT_FLOW_ACTION, kern_spec->action.handle, attrs); - if (!ib_spec->action.act) - return -EINVAL; + if (IS_ERR(ib_spec->action.act)) + return PTR_ERR(ib_spec->action.act); ib_spec->action.size = sizeof(struct ib_flow_spec_action_handle); flow_resources_add(uflow_res, @@ -2685,8 +2693,8 @@ static int kern_spec_to_ib_spec_action(struct uverbs_attr_bundle *attrs, UVERBS_OBJECT_COUNTERS, kern_spec->flow_count.handle, attrs); - if (!ib_spec->flow_count.counters) - return -EINVAL; + if (IS_ERR(ib_spec->flow_count.counters)) + return PTR_ERR(ib_spec->flow_count.counters); ib_spec->flow_count.size = sizeof(struct ib_flow_spec_action_count); flow_resources_add(uflow_res, @@ -2904,14 +2912,14 @@ static int ib_uverbs_ex_create_wq(struct uverbs_attr_bundle *attrs) return PTR_ERR(obj); pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd.pd_handle, attrs); - if (!pd) { - err = -EINVAL; + if (IS_ERR(pd)) { + err = PTR_ERR(pd); goto err_uobj; } cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) { - err = -EINVAL; + if (IS_ERR(cq)) { + err = PTR_ERR(cq); goto err_put_pd; } @@ -3012,8 +3020,8 @@ static int ib_uverbs_ex_modify_wq(struct uverbs_attr_bundle *attrs) return -EINVAL; wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, cmd.wq_handle, attrs); - if (!wq) - return -EINVAL; + if (IS_ERR(wq)) + return PTR_ERR(wq); if (cmd.attr_mask & IB_WQ_FLAGS) { wq_attr.flags = cmd.flags; @@ -3096,8 +3104,8 @@ static int ib_uverbs_ex_create_rwq_ind_table(struct uverbs_attr_bundle *attrs) num_read_wqs++) { wq = uobj_get_obj_read(wq, UVERBS_OBJECT_WQ, wqs_handles[num_read_wqs], attrs); - if (!wq) { - err = -EINVAL; + if (IS_ERR(wq)) { + err = PTR_ERR(wq); goto put_wqs; } @@ -3252,8 +3260,8 @@ static int ib_uverbs_ex_create_flow(struct uverbs_attr_bundle *attrs) } qp = uobj_get_obj_read(qp, UVERBS_OBJECT_QP, cmd.qp_handle, attrs); - if (!qp) { - err = -EINVAL; + if (IS_ERR(qp)) { + err = PTR_ERR(qp); goto err_uobj; } @@ -3399,15 +3407,15 @@ static int __uverbs_create_xsrq(struct uverbs_attr_bundle *attrs, if (ib_srq_has_cq(cmd->srq_type)) { attr.ext.cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd->cq_handle, attrs); - if (!attr.ext.cq) { - ret = -EINVAL; + if (IS_ERR(attr.ext.cq)) { + ret = PTR_ERR(attr.ext.cq); goto err_put_xrcd; } } pd = uobj_get_obj_read(pd, UVERBS_OBJECT_PD, cmd->pd_handle, attrs); - if (!pd) { - ret = -EINVAL; + if (IS_ERR(pd)) { + ret = PTR_ERR(pd); goto err_put_cq; } @@ -3514,8 +3522,8 @@ static int ib_uverbs_modify_srq(struct uverbs_attr_bundle *attrs) return ret; srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) - return -EINVAL; + if (IS_ERR(srq)) + return PTR_ERR(srq); attr.max_wr = cmd.max_wr; attr.srq_limit = cmd.srq_limit; @@ -3542,8 +3550,8 @@ static int ib_uverbs_query_srq(struct uverbs_attr_bundle *attrs) return ret; srq = uobj_get_obj_read(srq, UVERBS_OBJECT_SRQ, cmd.srq_handle, attrs); - if (!srq) - return -EINVAL; + if (IS_ERR(srq)) + return PTR_ERR(srq); ret = ib_query_srq(srq, &attr); @@ -3668,8 +3676,8 @@ static int ib_uverbs_ex_modify_cq(struct uverbs_attr_bundle *attrs) return -EOPNOTSUPP; cq = uobj_get_obj_read(cq, UVERBS_OBJECT_CQ, cmd.cq_handle, attrs); - if (!cq) - return -EINVAL; + if (IS_ERR(cq)) + return PTR_ERR(cq); ret = rdma_set_cq_moderation(cq, cmd.attr.cq_count, cmd.attr.cq_period); diff --git a/include/rdma/uverbs_std_types.h b/include/rdma/uverbs_std_types.h index fe0512116958..555ea3d142a4 100644 --- a/include/rdma/uverbs_std_types.h +++ b/include/rdma/uverbs_std_types.h @@ -34,7 +34,7 @@ static inline void *_uobj_get_obj_read(struct ib_uobject *uobj) { if (IS_ERR(uobj)) - return NULL; + return ERR_CAST(uobj); return uobj->object; } #define uobj_get_obj_read(_object, _type, _id, _attrs) \ From 209f290b4f50006ee9b3a341d396f8152c821d8c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 8 Mar 2025 17:48:17 +0100 Subject: [PATCH 276/529] posix-timers: Add cond_resched() to posix_timer_add() search loop [ Upstream commit 5f2909c6cd13564a07ae692a95457f52295c4f22 ] With a large number of POSIX timers the search for a valid ID might cause a soft lockup on PREEMPT_NONE/VOLUNTARY kernels. Add cond_resched() to the loop to prevent that. [ tglx: Split out from Eric's series ] Signed-off-by: Eric Dumazet Signed-off-by: Thomas Gleixner Reviewed-by: Frederic Weisbecker Link: https://lore.kernel.org/all/20250214135911.2037402-2-edumazet@google.com Link: https://lore.kernel.org/all/20250308155623.635612865@linutronix.de Signed-off-by: Sasha Levin --- kernel/time/posix-timers.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/time/posix-timers.c b/kernel/time/posix-timers.c index 2d6cf93ca370..fc08d4ccdeeb 100644 --- a/kernel/time/posix-timers.c +++ b/kernel/time/posix-timers.c @@ -161,6 +161,7 @@ static int posix_timer_add(struct k_itimer *timer) return id; } spin_unlock(&hash_lock); + cond_resched(); } /* POSIX return code when no timer ID could be allocated */ return -EAGAIN; From e563401934e41804ecd8ff2badba3edc93cb9bc5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Thomas=20Wei=C3=9Fschuh?= Date: Tue, 11 Mar 2025 10:54:47 +0100 Subject: [PATCH 277/529] timer_list: Don't use %pK through printk() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a52067c24ccf6ee4c85acffa0f155e9714f9adce ] This reverts commit f590308536db ("timer debug: Hide kernel addresses via %pK in /proc/timer_list") The timer list helper SEQ_printf() uses either the real seq_printf() for procfs output or vprintk() to print to the kernel log, when invoked from SysRq-q. It uses %pK for printing pointers. In the past %pK was prefered over %p as it would not leak raw pointer values into the kernel log. Since commit ad67b74d2469 ("printk: hash addresses printed with %p") the regular %p has been improved to avoid this issue. Furthermore, restricted pointers ("%pK") were never meant to be used through printk(). They can still unintentionally leak raw pointers or acquire sleeping looks in atomic contexts. Switch to the regular pointer formatting which is safer, easier to reason about and sufficient here. Signed-off-by: Thomas Weißschuh Signed-off-by: Thomas Gleixner Link: https://lore.kernel.org/lkml/20250113171731-dc10e3c1-da64-4af0-b767-7c7070468023@linutronix.de/ Link: https://lore.kernel.org/all/20250311-restricted-pointers-timer-v1-1-6626b91e54ab@linutronix.de Signed-off-by: Sasha Levin --- kernel/time/timer_list.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index ed7d6ad694fb..20a5e6962b69 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -46,7 +46,7 @@ static void print_timer(struct seq_file *m, struct hrtimer *taddr, struct hrtimer *timer, int idx, u64 now) { - SEQ_printf(m, " #%d: <%pK>, %ps", idx, taddr, timer->function); + SEQ_printf(m, " #%d: <%p>, %ps", idx, taddr, timer->function); SEQ_printf(m, ", S:%02x", timer->state); SEQ_printf(m, "\n"); SEQ_printf(m, " # expires at %Lu-%Lu nsecs [in %Ld to %Ld nsecs]\n", @@ -98,7 +98,7 @@ next_one: static void print_base(struct seq_file *m, struct hrtimer_clock_base *base, u64 now) { - SEQ_printf(m, " .base: %pK\n", base); + SEQ_printf(m, " .base: %p\n", base); SEQ_printf(m, " .index: %d\n", base->index); SEQ_printf(m, " .resolution: %u nsecs\n", hrtimer_resolution); From 3bb62cc3131d781ed7bac6a53cb88432409fdf90 Mon Sep 17 00:00:00 2001 From: Nicolas Bouchinet Date: Wed, 29 Jan 2025 18:06:30 +0100 Subject: [PATCH 278/529] netfilter: conntrack: Bound nf_conntrack sysctl writes [ Upstream commit 8b6861390ffee6b8ed78b9395e3776c16fec6579 ] nf_conntrack_max and nf_conntrack_expect_max sysctls were authorized to be written any negative value, which would then be stored in the unsigned int variables nf_conntrack_max and nf_ct_expect_max variables. While the do_proc_dointvec_conv function is supposed to limit writing handled by proc_dointvec proc_handler to INT_MAX. Such a negative value being written in an unsigned int leads to a very high value, exceeding this limit. Moreover, the nf_conntrack_expect_max sysctl documentation specifies the minimum value is 1. The proc_handlers have thus been updated to proc_dointvec_minmax in order to specify the following write bounds : * Bound nf_conntrack_max sysctl writings between SYSCTL_ZERO and SYSCTL_INT_MAX. * Bound nf_conntrack_expect_max sysctl writings between SYSCTL_ONE and SYSCTL_INT_MAX as defined in the sysctl documentation. With this patch applied, sysctl writes outside the defined in the bound will thus lead to a write error : ``` sysctl -w net.netfilter.nf_conntrack_expect_max=-1 sysctl: setting key "net.netfilter.nf_conntrack_expect_max": Invalid argument ``` Signed-off-by: Nicolas Bouchinet Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_conntrack_standalone.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 52245dbfae31..c333132e2079 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -631,7 +631,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_COUNT] = { .procname = "nf_conntrack_count", @@ -667,7 +669,9 @@ static struct ctl_table nf_ct_sysctl_table[] = { .data = &nf_ct_expect_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_INT_MAX, }, [NF_SYSCTL_CT_ACCT] = { .procname = "nf_conntrack_acct", @@ -970,7 +974,9 @@ static struct ctl_table nf_ct_netfilter_table[] = { .data = &nf_conntrack_max, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { } }; From 5cdb89f76e1db8a9323415f8fe7b6e91930c74b3 Mon Sep 17 00:00:00 2001 From: Ryan Roberts Date: Fri, 21 Feb 2025 10:12:25 +0530 Subject: [PATCH 279/529] arm64/mm: Check PUD_TYPE_TABLE in pud_bad() [ Upstream commit bfb1d2b9021c21891427acc86eb848ccedeb274e ] pud_bad() is currently defined in terms of pud_table(). Although for some configs, pud_table() is hard-coded to true i.e. when using 64K base pages or when page table levels are less than 3. pud_bad() is intended to check that the pud is configured correctly. Hence let's open-code the same check that the full version of pud_table() uses into pud_bad(). Then it always performs the check regardless of the config. Cc: Will Deacon Cc: Ard Biesheuvel Cc: Ryan Roberts Cc: Mark Rutland Cc: linux-arm-kernel@lists.infradead.org Cc: linux-kernel@vger.kernel.org Signed-off-by: Ryan Roberts Signed-off-by: Anshuman Khandual Link: https://lore.kernel.org/r/20250221044227.1145393-7-anshuman.khandual@arm.com Signed-off-by: Catalin Marinas Signed-off-by: Sasha Levin --- arch/arm64/include/asm/pgtable.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 1d713cfb0af1..426c3cb3e3bb 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -677,7 +677,8 @@ static inline unsigned long pmd_page_vaddr(pmd_t pmd) pr_err("%s:%d: bad pmd %016llx.\n", __FILE__, __LINE__, pmd_val(e)) #define pud_none(pud) (!pud_val(pud)) -#define pud_bad(pud) (!pud_table(pud)) +#define pud_bad(pud) ((pud_val(pud) & PUD_TYPE_MASK) != \ + PUD_TYPE_TABLE) #define pud_present(pud) pte_present(pud_pte(pud)) #define pud_leaf(pud) (pud_present(pud) && !pud_table(pud)) #define pud_valid(pud) pte_valid(pud_pte(pud)) From 03c9ac48aefbeddc28ac741c56e47d20a6f978cc Mon Sep 17 00:00:00 2001 From: Kaustabh Chakraborty Date: Wed, 19 Feb 2025 00:17:49 +0530 Subject: [PATCH 280/529] mmc: dw_mmc: add exynos7870 DW MMC support [ Upstream commit 7cbe799ac10fd8be85af5e0615c4337f81e575f3 ] Add support for Exynos7870 DW MMC controllers, for both SMU and non-SMU variants. These controllers require a quirk to access 64-bit FIFO in 32-bit accesses (DW_MMC_QUIRK_FIFO64_32). Signed-off-by: Kaustabh Chakraborty Link: https://lore.kernel.org/r/20250219-exynos7870-mmc-v2-3-b4255a3e39ed@disroot.org Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/dw_mmc-exynos.c | 41 +++++++++++++++++++++++++++++++- 1 file changed, 40 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index 9f20ac524c8b..2a5c3c822f6a 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -28,6 +28,8 @@ enum dw_mci_exynos_type { DW_MCI_TYPE_EXYNOS5420_SMU, DW_MCI_TYPE_EXYNOS7, DW_MCI_TYPE_EXYNOS7_SMU, + DW_MCI_TYPE_EXYNOS7870, + DW_MCI_TYPE_EXYNOS7870_SMU, DW_MCI_TYPE_ARTPEC8, }; @@ -70,6 +72,12 @@ static struct dw_mci_exynos_compatible { }, { .compatible = "samsung,exynos7-dw-mshc-smu", .ctrl_type = DW_MCI_TYPE_EXYNOS7_SMU, + }, { + .compatible = "samsung,exynos7870-dw-mshc", + .ctrl_type = DW_MCI_TYPE_EXYNOS7870, + }, { + .compatible = "samsung,exynos7870-dw-mshc-smu", + .ctrl_type = DW_MCI_TYPE_EXYNOS7870_SMU, }, { .compatible = "axis,artpec8-dw-mshc", .ctrl_type = DW_MCI_TYPE_ARTPEC8, @@ -86,6 +94,8 @@ static inline u8 dw_mci_exynos_get_ciu_div(struct dw_mci *host) return EXYNOS4210_FIXED_CIU_CLK_DIV; else if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) return SDMMC_CLKSEL_GET_DIV(mci_readl(host, CLKSEL64)) + 1; else @@ -101,7 +111,8 @@ static void dw_mci_exynos_config_smu(struct dw_mci *host) * set for non-ecryption mode at this time. */ if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS5420_SMU || - priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU) { + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU) { mci_writel(host, MPSBEGIN0, 0); mci_writel(host, MPSEND0, SDMMC_ENDING_SEC_NR_MAX); mci_writel(host, MPSCTRL0, SDMMC_MPSCTRL_SECURE_WRITE_BIT | @@ -127,6 +138,12 @@ static int dw_mci_exynos_priv_init(struct dw_mci *host) DQS_CTRL_GET_RD_DELAY(priv->saved_strobe_ctrl); } + if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU) { + /* Quirk needed for certain Exynos SoCs */ + host->quirks |= DW_MMC_QUIRK_FIFO64_32; + } + if (priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) { /* Quirk needed for the ARTPEC-8 SoC */ host->quirks |= DW_MMC_QUIRK_EXTENDED_TMOUT; @@ -144,6 +161,8 @@ static void dw_mci_exynos_set_clksel_timing(struct dw_mci *host, u32 timing) if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) clksel = mci_readl(host, CLKSEL64); else @@ -153,6 +172,8 @@ static void dw_mci_exynos_set_clksel_timing(struct dw_mci *host, u32 timing) if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) mci_writel(host, CLKSEL64, clksel); else @@ -223,6 +244,8 @@ static int dw_mci_exynos_resume_noirq(struct device *dev) if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) clksel = mci_readl(host, CLKSEL64); else @@ -231,6 +254,8 @@ static int dw_mci_exynos_resume_noirq(struct device *dev) if (clksel & SDMMC_CLKSEL_WAKEUP_INT) { if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) mci_writel(host, CLKSEL64, clksel); else @@ -410,6 +435,8 @@ static inline u8 dw_mci_exynos_get_clksmpl(struct dw_mci *host) if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) return SDMMC_CLKSEL_CCLK_SAMPLE(mci_readl(host, CLKSEL64)); else @@ -423,6 +450,8 @@ static inline void dw_mci_exynos_set_clksmpl(struct dw_mci *host, u8 sample) if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) clksel = mci_readl(host, CLKSEL64); else @@ -430,6 +459,8 @@ static inline void dw_mci_exynos_set_clksmpl(struct dw_mci *host, u8 sample) clksel = SDMMC_CLKSEL_UP_SAMPLE(clksel, sample); if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) mci_writel(host, CLKSEL64, clksel); else @@ -444,6 +475,8 @@ static inline u8 dw_mci_exynos_move_next_clksmpl(struct dw_mci *host) if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) clksel = mci_readl(host, CLKSEL64); else @@ -454,6 +487,8 @@ static inline u8 dw_mci_exynos_move_next_clksmpl(struct dw_mci *host) if (priv->ctrl_type == DW_MCI_TYPE_EXYNOS7 || priv->ctrl_type == DW_MCI_TYPE_EXYNOS7_SMU || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870 || + priv->ctrl_type == DW_MCI_TYPE_EXYNOS7870_SMU || priv->ctrl_type == DW_MCI_TYPE_ARTPEC8) mci_writel(host, CLKSEL64, clksel); else @@ -633,6 +668,10 @@ static const struct of_device_id dw_mci_exynos_match[] = { .data = &exynos_drv_data, }, { .compatible = "samsung,exynos7-dw-mshc-smu", .data = &exynos_drv_data, }, + { .compatible = "samsung,exynos7870-dw-mshc", + .data = &exynos_drv_data, }, + { .compatible = "samsung,exynos7870-dw-mshc-smu", + .data = &exynos_drv_data, }, { .compatible = "axis,artpec8-dw-mshc", .data = &artpec_drv_data, }, {}, From d0306f88b44d051dff95c27ab8686d644a8323c2 Mon Sep 17 00:00:00 2001 From: Erick Shepherd Date: Tue, 11 Feb 2025 15:46:45 -0600 Subject: [PATCH 281/529] mmc: sdhci: Disable SD card clock before changing parameters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit fb3bbc46c94f261b6156ee863c1b06c84cf157dc ] Per the SD Host Controller Simplified Specification v4.20 §3.2.3, change the SD card clock parameters only after first disabling the external card clock. Doing this fixes a spurious clock pulse on Baytrail and Apollo Lake SD controllers which otherwise breaks voltage switching with a specific Swissbit SD card. Signed-off-by: Kyle Roeschley Signed-off-by: Brad Mouring Signed-off-by: Erick Shepherd Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20250211214645.469279-1-erick.shepherd@ni.com Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 536d21028a11..6822a3249286 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -2049,10 +2049,15 @@ void sdhci_set_clock(struct sdhci_host *host, unsigned int clock) host->mmc->actual_clock = 0; - sdhci_writew(host, 0, SDHCI_CLOCK_CONTROL); + clk = sdhci_readw(host, SDHCI_CLOCK_CONTROL); + if (clk & SDHCI_CLOCK_CARD_EN) + sdhci_writew(host, clk & ~SDHCI_CLOCK_CARD_EN, + SDHCI_CLOCK_CONTROL); - if (clock == 0) + if (clock == 0) { + sdhci_writew(host, 0, SDHCI_CLOCK_CONTROL); return; + } clk = sdhci_calc_clk(host, clock, &host->mmc->actual_clock); sdhci_enable_clk(host, clk); From 4a8fcd77ce219cdc32cef9f15616b9287ad9a9f8 Mon Sep 17 00:00:00 2001 From: Kurt Borja Date: Tue, 4 Mar 2025 00:52:50 -0500 Subject: [PATCH 282/529] hwmon: (dell-smm) Increment the number of fans [ Upstream commit dbcfcb239b3b452ef8782842c36fb17dd1b9092f ] Some Alienware laptops that support the SMM interface, may have up to 4 fans. Tested on an Alienware x15 r1. Signed-off-by: Kurt Borja Link: https://lore.kernel.org/r/20250304055249.51940-2-kuurtb@gmail.com Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- Documentation/hwmon/dell-smm-hwmon.rst | 14 +++++++------- drivers/hwmon/dell-smm-hwmon.c | 5 ++++- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/Documentation/hwmon/dell-smm-hwmon.rst b/Documentation/hwmon/dell-smm-hwmon.rst index d8f1d6859b96..1c12fbba440b 100644 --- a/Documentation/hwmon/dell-smm-hwmon.rst +++ b/Documentation/hwmon/dell-smm-hwmon.rst @@ -32,12 +32,12 @@ Temperature sensors and fans can be queried and set via the standard =============================== ======= ======================================= Name Perm Description =============================== ======= ======================================= -fan[1-3]_input RO Fan speed in RPM. -fan[1-3]_label RO Fan label. -fan[1-3]_min RO Minimal Fan speed in RPM -fan[1-3]_max RO Maximal Fan speed in RPM -fan[1-3]_target RO Expected Fan speed in RPM -pwm[1-3] RW Control the fan PWM duty-cycle. +fan[1-4]_input RO Fan speed in RPM. +fan[1-4]_label RO Fan label. +fan[1-4]_min RO Minimal Fan speed in RPM +fan[1-4]_max RO Maximal Fan speed in RPM +fan[1-4]_target RO Expected Fan speed in RPM +pwm[1-4] RW Control the fan PWM duty-cycle. pwm1_enable WO Enable or disable automatic BIOS fan control (not supported on all laptops, see below for details). @@ -93,7 +93,7 @@ Again, when you find new codes, we'd be happy to have your patches! --------------------------- The driver also exports the fans as thermal cooling devices with -``type`` set to ``dell-smm-fan[1-3]``. This allows for easy fan control +``type`` set to ``dell-smm-fan[1-4]``. This allows for easy fan control using one of the thermal governors. Module parameters diff --git a/drivers/hwmon/dell-smm-hwmon.c b/drivers/hwmon/dell-smm-hwmon.c index 1572b5416015..dbcb8f362061 100644 --- a/drivers/hwmon/dell-smm-hwmon.c +++ b/drivers/hwmon/dell-smm-hwmon.c @@ -67,7 +67,7 @@ #define I8K_POWER_BATTERY 0x01 #define DELL_SMM_NO_TEMP 10 -#define DELL_SMM_NO_FANS 3 +#define DELL_SMM_NO_FANS 4 struct dell_smm_data { struct mutex i8k_mutex; /* lock for sensors writes */ @@ -940,11 +940,14 @@ static const struct hwmon_channel_info *dell_smm_info[] = { HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MIN | HWMON_F_MAX | HWMON_F_TARGET, HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MIN | HWMON_F_MAX | + HWMON_F_TARGET, + HWMON_F_INPUT | HWMON_F_LABEL | HWMON_F_MIN | HWMON_F_MAX | HWMON_F_TARGET ), HWMON_CHANNEL_INFO(pwm, HWMON_PWM_INPUT | HWMON_PWM_ENABLE, HWMON_PWM_INPUT, + HWMON_PWM_INPUT, HWMON_PWM_INPUT ), NULL From 4f809be95d9f3db13d31c574b8764c8d429f0c3b Mon Sep 17 00:00:00 2001 From: Willem de Bruijn Date: Thu, 6 Mar 2025 22:34:09 -0500 Subject: [PATCH 283/529] ipv6: save dontfrag in cork [ Upstream commit a18dfa9925b9ef6107ea3aa5814ca3c704d34a8a ] When spanning datagram construction over multiple send calls using MSG_MORE, per datagram settings are configured on the first send. That is when ip(6)_setup_cork stores these settings for subsequent use in __ip(6)_append_data and others. The only flag that escaped this was dontfrag. As a result, a datagram could be constructed with df=0 on the first sendmsg, but df=1 on a next. Which is what cmsg_ip.sh does in an upcoming MSG_MORE test in the "diff" scenario. Changing datagram conditions in the middle of constructing an skb makes this already complex code path even more convoluted. It is here unintentional. Bring this flag in line with expected sockopt/cmsg behavior. And stop passing ipc6 to __ip6_append_data, to avoid such issues in the future. This is already the case for __ip_append_data. inet6_cork had a 6 byte hole, so the 1B flag has no impact. Signed-off-by: Willem de Bruijn Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20250307033620.411611-3-willemdebruijn.kernel@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/linux/ipv6.h | 1 + net/ipv6/ip6_output.c | 9 +++++---- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 9a44de45cc1f..9f27e004127b 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -199,6 +199,7 @@ struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; + u8 dontfrag:1; }; /** diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index f7a225da8525..cfc276e5a249 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1450,6 +1450,7 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; + v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); @@ -1483,7 +1484,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, - unsigned int flags, struct ipcm6_cookie *ipc6) + unsigned int flags) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; @@ -1539,7 +1540,7 @@ static int __ip6_append_data(struct sock *sk, if (headersize + transhdrlen > mtu) goto emsgsize; - if (cork->length + length > mtu - headersize && ipc6->dontfrag && + if (cork->length + length > mtu - headersize && v6_cork->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { @@ -1884,7 +1885,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags, ipc6); + from, length, transhdrlen, flags); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -2089,7 +2090,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, &queue, cork, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags, ipc6); + flags); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); From 0b60d03644711ed4e5cf8c008557d7d5dc19b422 Mon Sep 17 00:00:00 2001 From: Zhikai Zhai Date: Thu, 27 Feb 2025 20:09:14 +0800 Subject: [PATCH 284/529] drm/amd/display: calculate the remain segments for all pipes [ Upstream commit d3069feecdb5542604d29b59acfd1fd213bad95b ] [WHY] In some cases the remain de-tile buffer segments will be greater than zero if we don't add the non-top pipe to calculate, at this time the override de-tile buffer size will be valid and used. But it makes the de-tile buffer segments used finally for all of pipes exceed the maximum. [HOW] Add the non-top pipe to calculate the remain de-tile buffer segments. Don't set override size to use the average according to pipe count if the value exceed the maximum. Reviewed-by: Charlene Liu Signed-off-by: Zhikai Zhai Signed-off-by: Tom Chung Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- .../amd/display/dc/dcn315/dcn315_resource.c | 40 +++++++++---------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c b/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c index 958170fbfece..9d643c79afea 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c +++ b/drivers/gpu/drm/amd/display/dc/dcn315/dcn315_resource.c @@ -1717,7 +1717,7 @@ static int dcn315_populate_dml_pipes_from_context( pipes[pipe_cnt].dout.dsc_input_bpc = 0; DC_FP_START(); dcn31_zero_pipe_dcc_fraction(pipes, pipe_cnt); - if (pixel_rate_crb && !pipe->top_pipe && !pipe->prev_odm_pipe) { + if (pixel_rate_crb) { int bpp = source_format_to_bpp(pipes[pipe_cnt].pipe.src.source_format); /* Ceil to crb segment size */ int approx_det_segs_required_for_pstate = dcn_get_approx_det_segs_required_for_pstate( @@ -1768,28 +1768,26 @@ static int dcn315_populate_dml_pipes_from_context( continue; } - if (!pipe->top_pipe && !pipe->prev_odm_pipe) { - bool split_required = pipe->stream->timing.pix_clk_100hz >= dcn_get_max_non_odm_pix_rate_100hz(&dc->dml.soc) - || (pipe->plane_state && pipe->plane_state->src_rect.width > 5120); + bool split_required = pipe->stream->timing.pix_clk_100hz >= dcn_get_max_non_odm_pix_rate_100hz(&dc->dml.soc) + || (pipe->plane_state && pipe->plane_state->src_rect.width > 5120); - if (remaining_det_segs > MIN_RESERVED_DET_SEGS && crb_pipes != 0) - pipes[pipe_cnt].pipe.src.det_size_override += (remaining_det_segs - MIN_RESERVED_DET_SEGS) / crb_pipes + - (crb_idx < (remaining_det_segs - MIN_RESERVED_DET_SEGS) % crb_pipes ? 1 : 0); - if (pipes[pipe_cnt].pipe.src.det_size_override > 2 * DCN3_15_MAX_DET_SEGS) { - /* Clamp to 2 pipe split max det segments */ - remaining_det_segs += pipes[pipe_cnt].pipe.src.det_size_override - 2 * (DCN3_15_MAX_DET_SEGS); - pipes[pipe_cnt].pipe.src.det_size_override = 2 * DCN3_15_MAX_DET_SEGS; - } - if (pipes[pipe_cnt].pipe.src.det_size_override > DCN3_15_MAX_DET_SEGS || split_required) { - /* If we are splitting we must have an even number of segments */ - remaining_det_segs += pipes[pipe_cnt].pipe.src.det_size_override % 2; - pipes[pipe_cnt].pipe.src.det_size_override -= pipes[pipe_cnt].pipe.src.det_size_override % 2; - } - /* Convert segments into size for DML use */ - pipes[pipe_cnt].pipe.src.det_size_override *= DCN3_15_CRB_SEGMENT_SIZE_KB; - - crb_idx++; + if (remaining_det_segs > MIN_RESERVED_DET_SEGS && crb_pipes != 0) + pipes[pipe_cnt].pipe.src.det_size_override += (remaining_det_segs - MIN_RESERVED_DET_SEGS) / crb_pipes + + (crb_idx < (remaining_det_segs - MIN_RESERVED_DET_SEGS) % crb_pipes ? 1 : 0); + if (pipes[pipe_cnt].pipe.src.det_size_override > 2 * DCN3_15_MAX_DET_SEGS) { + /* Clamp to 2 pipe split max det segments */ + remaining_det_segs += pipes[pipe_cnt].pipe.src.det_size_override - 2 * (DCN3_15_MAX_DET_SEGS); + pipes[pipe_cnt].pipe.src.det_size_override = 2 * DCN3_15_MAX_DET_SEGS; } + if (pipes[pipe_cnt].pipe.src.det_size_override > DCN3_15_MAX_DET_SEGS || split_required) { + /* If we are splitting we must have an even number of segments */ + remaining_det_segs += pipes[pipe_cnt].pipe.src.det_size_override % 2; + pipes[pipe_cnt].pipe.src.det_size_override -= pipes[pipe_cnt].pipe.src.det_size_override % 2; + } + /* Convert segments into size for DML use */ + pipes[pipe_cnt].pipe.src.det_size_override *= DCN3_15_CRB_SEGMENT_SIZE_KB; + + crb_idx++; pipe_cnt++; } } From 9e5228610ff4f805dacf82dd81c903c48537f783 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Thu, 6 Feb 2025 14:58:39 +0100 Subject: [PATCH 285/529] gfs2: Check for empty queue in run_queue [ Upstream commit d838605fea6eabae3746a276fd448f6719eb3926 ] In run_queue(), check if the queue of pending requests is empty instead of blindly assuming that it won't be. Signed-off-by: Andreas Gruenbacher Signed-off-by: Sasha Levin --- fs/gfs2/glock.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 6ba8460f5331..428c1db295fa 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -885,11 +885,12 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { - struct gfs2_holder *gh = NULL; + struct gfs2_holder *gh; if (test_and_set_bit(GLF_LOCK, &gl->gl_flags)) return; + /* While a demote is in progress, the GLF_LOCK flag must be set. */ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && @@ -901,18 +902,22 @@ __acquires(&gl->gl_lockref.lock) set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; + do_xmote(gl, NULL, gl->gl_target); + return; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); if (do_promote(gl) == 0) goto out_unlock; gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; } - do_xmote(gl, gh, gl->gl_target); - return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); From f306821a7b610bb1b296c13e19e85e9ac6d9357a Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 24 Feb 2025 19:27:38 +0200 Subject: [PATCH 286/529] auxdisplay: charlcd: Partially revert "Move hwidth and bwidth to struct hd44780_common" [ Upstream commit 09965a142078080fe7807bab0f6f1890cb5987a4 ] Commit 2545c1c948a6 ("auxdisplay: Move hwidth and bwidth to struct hd44780_common") makes charlcd_alloc() argument-less effectively dropping the single allocation for the struct charlcd_priv object along with the driver specific one. Restore that behaviour here. Signed-off-by: Andy Shevchenko Reviewed-by: Geert Uytterhoeven Signed-off-by: Sasha Levin --- drivers/auxdisplay/charlcd.c | 5 +++-- drivers/auxdisplay/charlcd.h | 5 +++-- drivers/auxdisplay/hd44780.c | 2 +- drivers/auxdisplay/lcd2s.c | 2 +- drivers/auxdisplay/panel.c | 2 +- 5 files changed, 9 insertions(+), 7 deletions(-) diff --git a/drivers/auxdisplay/charlcd.c b/drivers/auxdisplay/charlcd.c index 6d309e4971b6..e243291a7e77 100644 --- a/drivers/auxdisplay/charlcd.c +++ b/drivers/auxdisplay/charlcd.c @@ -594,18 +594,19 @@ static int charlcd_init(struct charlcd *lcd) return 0; } -struct charlcd *charlcd_alloc(void) +struct charlcd *charlcd_alloc(unsigned int drvdata_size) { struct charlcd_priv *priv; struct charlcd *lcd; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); + priv = kzalloc(sizeof(*priv) + drvdata_size, GFP_KERNEL); if (!priv) return NULL; priv->esc_seq.len = -1; lcd = &priv->lcd; + lcd->drvdata = priv->drvdata; return lcd; } diff --git a/drivers/auxdisplay/charlcd.h b/drivers/auxdisplay/charlcd.h index eed80063a6d2..4bbf106b2dd8 100644 --- a/drivers/auxdisplay/charlcd.h +++ b/drivers/auxdisplay/charlcd.h @@ -49,7 +49,7 @@ struct charlcd { unsigned long y; } addr; - void *drvdata; + void *drvdata; /* Set by charlcd_alloc() */ }; /** @@ -93,7 +93,8 @@ struct charlcd_ops { }; void charlcd_backlight(struct charlcd *lcd, enum charlcd_onoff on); -struct charlcd *charlcd_alloc(void); + +struct charlcd *charlcd_alloc(unsigned int drvdata_size); void charlcd_free(struct charlcd *lcd); int charlcd_register(struct charlcd *lcd); diff --git a/drivers/auxdisplay/hd44780.c b/drivers/auxdisplay/hd44780.c index 8b690f59df27..ebaf0ff518f4 100644 --- a/drivers/auxdisplay/hd44780.c +++ b/drivers/auxdisplay/hd44780.c @@ -226,7 +226,7 @@ static int hd44780_probe(struct platform_device *pdev) if (!hdc) return -ENOMEM; - lcd = charlcd_alloc(); + lcd = charlcd_alloc(0); if (!lcd) goto fail1; diff --git a/drivers/auxdisplay/lcd2s.c b/drivers/auxdisplay/lcd2s.c index 135831a16514..2b597f226c0c 100644 --- a/drivers/auxdisplay/lcd2s.c +++ b/drivers/auxdisplay/lcd2s.c @@ -307,7 +307,7 @@ static int lcd2s_i2c_probe(struct i2c_client *i2c) if (err < 0) return err; - lcd = charlcd_alloc(); + lcd = charlcd_alloc(0); if (!lcd) return -ENOMEM; diff --git a/drivers/auxdisplay/panel.c b/drivers/auxdisplay/panel.c index eba04c0de7eb..0f3999b665e7 100644 --- a/drivers/auxdisplay/panel.c +++ b/drivers/auxdisplay/panel.c @@ -835,7 +835,7 @@ static void lcd_init(void) if (!hdc) return; - charlcd = charlcd_alloc(); + charlcd = charlcd_alloc(0); if (!charlcd) { kfree(hdc); return; From 64e88f718e7e66c5ce8be7cc1aa440f188571464 Mon Sep 17 00:00:00 2001 From: Alexey Klimov Date: Fri, 28 Feb 2025 16:14:30 +0000 Subject: [PATCH 287/529] ASoC: qcom: sm8250: explicitly set format in sm8250_be_hw_params_fixup() [ Upstream commit 89be3c15a58b2ccf31e969223c8ac93ca8932d81 ] Setting format to s16le is required for compressed playback on compatible soundcards. Cc: Srinivas Kandagatla Signed-off-by: Alexey Klimov Link: https://patch.msgid.link/20250228161430.373961-1-alexey.klimov@linaro.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/qcom/sm8250.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/soc/qcom/sm8250.c b/sound/soc/qcom/sm8250.c index 41be09a07ca7..65e51b6b46ff 100644 --- a/sound/soc/qcom/sm8250.c +++ b/sound/soc/qcom/sm8250.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -39,9 +40,11 @@ static int sm8250_be_hw_params_fixup(struct snd_soc_pcm_runtime *rtd, SNDRV_PCM_HW_PARAM_RATE); struct snd_interval *channels = hw_param_interval(params, SNDRV_PCM_HW_PARAM_CHANNELS); + struct snd_mask *fmt = hw_param_mask(params, SNDRV_PCM_HW_PARAM_FORMAT); rate->min = rate->max = 48000; channels->min = channels->max = 2; + snd_mask_set_format(fmt, SNDRV_PCM_FORMAT_S16_LE); return 0; } From 239afef07e3c8fb9ca595ae5f28c0664f1416eab Mon Sep 17 00:00:00 2001 From: Vasant Hegde Date: Thu, 27 Feb 2025 16:23:16 +0000 Subject: [PATCH 288/529] iommu/amd/pgtbl_v2: Improve error handling [ Upstream commit 36a1cfd497435ba5e37572fe9463bb62a7b1b984 ] Return -ENOMEM if v2_alloc_pte() fails to allocate memory. Signed-off-by: Vasant Hegde Reviewed-by: Jason Gunthorpe Link: https://lore.kernel.org/r/20250227162320.5805-4-vasant.hegde@amd.com Signed-off-by: Joerg Roedel Signed-off-by: Sasha Levin --- drivers/iommu/amd/io_pgtable_v2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iommu/amd/io_pgtable_v2.c b/drivers/iommu/amd/io_pgtable_v2.c index 232d17bd941f..c86cbbc21e88 100644 --- a/drivers/iommu/amd/io_pgtable_v2.c +++ b/drivers/iommu/amd/io_pgtable_v2.c @@ -264,7 +264,7 @@ static int iommu_v2_map_pages(struct io_pgtable_ops *ops, unsigned long iova, map_size = get_alloc_page_size(pgsize); pte = v2_alloc_pte(pdom->iop.pgd, iova, map_size, &updated); if (!pte) { - ret = -EINVAL; + ret = -ENOMEM; goto out; } From 89172666228de1cefcacf5bc6f61c6281751d2ed Mon Sep 17 00:00:00 2001 From: Aaron Kling Date: Mon, 10 Mar 2025 00:28:48 -0500 Subject: [PATCH 289/529] cpufreq: tegra186: Share policy per cluster [ Upstream commit be4ae8c19492cd6d5de61ccb34ffb3f5ede5eec8 ] This functionally brings tegra186 in line with tegra210 and tegra194, sharing a cpufreq policy between all cores in a cluster. Reviewed-by: Sumit Gupta Acked-by: Thierry Reding Signed-off-by: Aaron Kling Signed-off-by: Viresh Kumar Signed-off-by: Sasha Levin --- drivers/cpufreq/tegra186-cpufreq.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cpufreq/tegra186-cpufreq.c b/drivers/cpufreq/tegra186-cpufreq.c index 6c88827f4e62..1d6b54303723 100644 --- a/drivers/cpufreq/tegra186-cpufreq.c +++ b/drivers/cpufreq/tegra186-cpufreq.c @@ -73,11 +73,18 @@ static int tegra186_cpufreq_init(struct cpufreq_policy *policy) { struct tegra186_cpufreq_data *data = cpufreq_get_driver_data(); unsigned int cluster = data->cpus[policy->cpu].bpmp_cluster_id; + u32 cpu; policy->freq_table = data->clusters[cluster].table; policy->cpuinfo.transition_latency = 300 * 1000; policy->driver_data = NULL; + /* set same policy for all cpus in a cluster */ + for (cpu = 0; cpu < ARRAY_SIZE(tegra186_cpus); cpu++) { + if (data->cpus[cpu].bpmp_cluster_id == cluster) + cpumask_set_cpu(cpu, policy->cpus); + } + return 0; } From a98bd864e16f91c70b2469adf013d713d04d1d13 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Thu, 27 Feb 2025 17:04:46 +0800 Subject: [PATCH 290/529] crypto: lzo - Fix compression buffer overrun [ Upstream commit cc47f07234f72cbd8e2c973cdbf2a6730660a463 ] Unlike the decompression code, the compression code in LZO never checked for output overruns. It instead assumes that the caller always provides enough buffer space, disregarding the buffer length provided by the caller. Add a safe compression interface that checks for the end of buffer before each write. Use the safe interface in crypto/lzo. Signed-off-by: Herbert Xu Reviewed-by: David Sterba Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- crypto/lzo-rle.c | 2 +- crypto/lzo.c | 2 +- include/linux/lzo.h | 8 +++ lib/lzo/Makefile | 2 +- lib/lzo/lzo1x_compress.c | 102 +++++++++++++++++++++++++--------- lib/lzo/lzo1x_compress_safe.c | 18 ++++++ 6 files changed, 106 insertions(+), 28 deletions(-) create mode 100644 lib/lzo/lzo1x_compress_safe.c diff --git a/crypto/lzo-rle.c b/crypto/lzo-rle.c index 0631d975bfac..0abc2d87f042 100644 --- a/crypto/lzo-rle.c +++ b/crypto/lzo-rle.c @@ -55,7 +55,7 @@ static int __lzorle_compress(const u8 *src, unsigned int slen, size_t tmp_len = *dlen; /* size_t(ulong) <-> uint on 64 bit */ int err; - err = lzorle1x_1_compress(src, slen, dst, &tmp_len, ctx); + err = lzorle1x_1_compress_safe(src, slen, dst, &tmp_len, ctx); if (err != LZO_E_OK) return -EINVAL; diff --git a/crypto/lzo.c b/crypto/lzo.c index ebda132dd22b..8338851c7406 100644 --- a/crypto/lzo.c +++ b/crypto/lzo.c @@ -55,7 +55,7 @@ static int __lzo_compress(const u8 *src, unsigned int slen, size_t tmp_len = *dlen; /* size_t(ulong) <-> uint on 64 bit */ int err; - err = lzo1x_1_compress(src, slen, dst, &tmp_len, ctx); + err = lzo1x_1_compress_safe(src, slen, dst, &tmp_len, ctx); if (err != LZO_E_OK) return -EINVAL; diff --git a/include/linux/lzo.h b/include/linux/lzo.h index e95c7d1092b2..4d30e3624acd 100644 --- a/include/linux/lzo.h +++ b/include/linux/lzo.h @@ -24,10 +24,18 @@ int lzo1x_1_compress(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len, void *wrkmem); +/* Same as above but does not write more than dst_len to dst. */ +int lzo1x_1_compress_safe(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + /* This requires 'wrkmem' of size LZO1X_1_MEM_COMPRESS */ int lzorle1x_1_compress(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len, void *wrkmem); +/* Same as above but does not write more than dst_len to dst. */ +int lzorle1x_1_compress_safe(const unsigned char *src, size_t src_len, + unsigned char *dst, size_t *dst_len, void *wrkmem); + /* safe decompression with overrun testing */ int lzo1x_decompress_safe(const unsigned char *src, size_t src_len, unsigned char *dst, size_t *dst_len); diff --git a/lib/lzo/Makefile b/lib/lzo/Makefile index 2f58fafbbddd..fc7b2b7ef4b2 100644 --- a/lib/lzo/Makefile +++ b/lib/lzo/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0-only -lzo_compress-objs := lzo1x_compress.o +lzo_compress-objs := lzo1x_compress.o lzo1x_compress_safe.o lzo_decompress-objs := lzo1x_decompress_safe.o obj-$(CONFIG_LZO_COMPRESS) += lzo_compress.o diff --git a/lib/lzo/lzo1x_compress.c b/lib/lzo/lzo1x_compress.c index 9d31e7126606..f00dff9b9d4e 100644 --- a/lib/lzo/lzo1x_compress.c +++ b/lib/lzo/lzo1x_compress.c @@ -18,11 +18,22 @@ #include #include "lzodefs.h" -static noinline size_t -lzo1x_1_do_compress(const unsigned char *in, size_t in_len, - unsigned char *out, size_t *out_len, - size_t ti, void *wrkmem, signed char *state_offset, - const unsigned char bitstream_version) +#undef LZO_UNSAFE + +#ifndef LZO_SAFE +#define LZO_UNSAFE 1 +#define LZO_SAFE(name) name +#define HAVE_OP(x) 1 +#endif + +#define NEED_OP(x) if (!HAVE_OP(x)) goto output_overrun + +static noinline int +LZO_SAFE(lzo1x_1_do_compress)(const unsigned char *in, size_t in_len, + unsigned char **out, unsigned char *op_end, + size_t *tp, void *wrkmem, + signed char *state_offset, + const unsigned char bitstream_version) { const unsigned char *ip; unsigned char *op; @@ -30,8 +41,9 @@ lzo1x_1_do_compress(const unsigned char *in, size_t in_len, const unsigned char * const ip_end = in + in_len - 20; const unsigned char *ii; lzo_dict_t * const dict = (lzo_dict_t *) wrkmem; + size_t ti = *tp; - op = out; + op = *out; ip = in; ii = ip; ip += ti < 4 ? 4 - ti : 0; @@ -116,25 +128,32 @@ next: if (t != 0) { if (t <= 3) { op[*state_offset] |= t; + NEED_OP(4); COPY4(op, ii); op += t; } else if (t <= 16) { + NEED_OP(17); *op++ = (t - 3); COPY8(op, ii); COPY8(op + 8, ii + 8); op += t; } else { if (t <= 18) { + NEED_OP(1); *op++ = (t - 3); } else { size_t tt = t - 18; + NEED_OP(1); *op++ = 0; while (unlikely(tt > 255)) { tt -= 255; + NEED_OP(1); *op++ = 0; } + NEED_OP(1); *op++ = tt; } + NEED_OP(t); do { COPY8(op, ii); COPY8(op + 8, ii + 8); @@ -151,6 +170,7 @@ next: if (unlikely(run_length)) { ip += run_length; run_length -= MIN_ZERO_RUN_LENGTH; + NEED_OP(4); put_unaligned_le32((run_length << 21) | 0xfffc18 | (run_length & 0x7), op); op += 4; @@ -243,10 +263,12 @@ m_len_done: ip += m_len; if (m_len <= M2_MAX_LEN && m_off <= M2_MAX_OFFSET) { m_off -= 1; + NEED_OP(2); *op++ = (((m_len - 1) << 5) | ((m_off & 7) << 2)); *op++ = (m_off >> 3); } else if (m_off <= M3_MAX_OFFSET) { m_off -= 1; + NEED_OP(1); if (m_len <= M3_MAX_LEN) *op++ = (M3_MARKER | (m_len - 2)); else { @@ -254,14 +276,18 @@ m_len_done: *op++ = M3_MARKER | 0; while (unlikely(m_len > 255)) { m_len -= 255; + NEED_OP(1); *op++ = 0; } + NEED_OP(1); *op++ = (m_len); } + NEED_OP(2); *op++ = (m_off << 2); *op++ = (m_off >> 6); } else { m_off -= 0x4000; + NEED_OP(1); if (m_len <= M4_MAX_LEN) *op++ = (M4_MARKER | ((m_off >> 11) & 8) | (m_len - 2)); @@ -282,11 +308,14 @@ m_len_done: m_len -= M4_MAX_LEN; *op++ = (M4_MARKER | ((m_off >> 11) & 8)); while (unlikely(m_len > 255)) { + NEED_OP(1); m_len -= 255; *op++ = 0; } + NEED_OP(1); *op++ = (m_len); } + NEED_OP(2); *op++ = (m_off << 2); *op++ = (m_off >> 6); } @@ -295,14 +324,20 @@ finished_writing_instruction: ii = ip; goto next; } - *out_len = op - out; - return in_end - (ii - ti); + *out = op; + *tp = in_end - (ii - ti); + return LZO_E_OK; + +output_overrun: + return LZO_E_OUTPUT_OVERRUN; } -static int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, - unsigned char *out, size_t *out_len, - void *wrkmem, const unsigned char bitstream_version) +static int LZO_SAFE(lzogeneric1x_1_compress)( + const unsigned char *in, size_t in_len, + unsigned char *out, size_t *out_len, + void *wrkmem, const unsigned char bitstream_version) { + unsigned char * const op_end = out + *out_len; const unsigned char *ip = in; unsigned char *op = out; unsigned char *data_start; @@ -326,14 +361,18 @@ static int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, while (l > 20) { size_t ll = min_t(size_t, l, m4_max_offset + 1); uintptr_t ll_end = (uintptr_t) ip + ll; + int err; + if ((ll_end + ((t + ll) >> 5)) <= ll_end) break; BUILD_BUG_ON(D_SIZE * sizeof(lzo_dict_t) > LZO1X_1_MEM_COMPRESS); memset(wrkmem, 0, D_SIZE * sizeof(lzo_dict_t)); - t = lzo1x_1_do_compress(ip, ll, op, out_len, t, wrkmem, - &state_offset, bitstream_version); + err = LZO_SAFE(lzo1x_1_do_compress)( + ip, ll, &op, op_end, &t, wrkmem, + &state_offset, bitstream_version); + if (err != LZO_E_OK) + return err; ip += ll; - op += *out_len; l -= ll; } t += l; @@ -342,20 +381,26 @@ static int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, const unsigned char *ii = in + in_len - t; if (op == data_start && t <= 238) { + NEED_OP(1); *op++ = (17 + t); } else if (t <= 3) { op[state_offset] |= t; } else if (t <= 18) { + NEED_OP(1); *op++ = (t - 3); } else { size_t tt = t - 18; + NEED_OP(1); *op++ = 0; while (tt > 255) { tt -= 255; + NEED_OP(1); *op++ = 0; } + NEED_OP(1); *op++ = tt; } + NEED_OP(t); if (t >= 16) do { COPY8(op, ii); COPY8(op + 8, ii + 8); @@ -368,31 +413,38 @@ static int lzogeneric1x_1_compress(const unsigned char *in, size_t in_len, } while (--t > 0); } + NEED_OP(3); *op++ = M4_MARKER | 1; *op++ = 0; *op++ = 0; *out_len = op - out; return LZO_E_OK; + +output_overrun: + return LZO_E_OUTPUT_OVERRUN; } -int lzo1x_1_compress(const unsigned char *in, size_t in_len, - unsigned char *out, size_t *out_len, - void *wrkmem) +int LZO_SAFE(lzo1x_1_compress)(const unsigned char *in, size_t in_len, + unsigned char *out, size_t *out_len, + void *wrkmem) { - return lzogeneric1x_1_compress(in, in_len, out, out_len, wrkmem, 0); + return LZO_SAFE(lzogeneric1x_1_compress)( + in, in_len, out, out_len, wrkmem, 0); } -int lzorle1x_1_compress(const unsigned char *in, size_t in_len, - unsigned char *out, size_t *out_len, - void *wrkmem) +int LZO_SAFE(lzorle1x_1_compress)(const unsigned char *in, size_t in_len, + unsigned char *out, size_t *out_len, + void *wrkmem) { - return lzogeneric1x_1_compress(in, in_len, out, out_len, - wrkmem, LZO_VERSION); + return LZO_SAFE(lzogeneric1x_1_compress)( + in, in_len, out, out_len, wrkmem, LZO_VERSION); } -EXPORT_SYMBOL_GPL(lzo1x_1_compress); -EXPORT_SYMBOL_GPL(lzorle1x_1_compress); +EXPORT_SYMBOL_GPL(LZO_SAFE(lzo1x_1_compress)); +EXPORT_SYMBOL_GPL(LZO_SAFE(lzorle1x_1_compress)); +#ifndef LZO_UNSAFE MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("LZO1X-1 Compressor"); +#endif diff --git a/lib/lzo/lzo1x_compress_safe.c b/lib/lzo/lzo1x_compress_safe.c new file mode 100644 index 000000000000..371c9f849492 --- /dev/null +++ b/lib/lzo/lzo1x_compress_safe.c @@ -0,0 +1,18 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * LZO1X Compressor from LZO + * + * Copyright (C) 1996-2012 Markus F.X.J. Oberhumer + * + * The full LZO package can be found at: + * http://www.oberhumer.com/opensource/lzo/ + * + * Changed for Linux kernel use by: + * Nitin Gupta + * Richard Purdie + */ + +#define LZO_SAFE(name) name##_safe +#define HAVE_OP(x) ((size_t)(op_end - op) >= (size_t)(x)) + +#include "lzo1x_compress.c" From 806d3ee9303b9ecb4219f9161b4236120fd12355 Mon Sep 17 00:00:00 2001 From: Diogo Ivo Date: Mon, 24 Feb 2025 12:17:36 +0000 Subject: [PATCH 291/529] arm64: tegra: p2597: Fix gpio for vdd-1v8-dis regulator [ Upstream commit f34621f31e3be81456c903287f7e4c0609829e29 ] According to the board schematics the enable pin of this regulator is connected to gpio line #9 of the first instance of the TCA9539 GPIO expander, so adjust it. Signed-off-by: Diogo Ivo Link: https://lore.kernel.org/r/20250224-diogo-gpio_exp-v1-1-80fb84ac48c6@tecnico.ulisboa.pt Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi index 634373a423ef..481a88d83a65 100644 --- a/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi @@ -1631,7 +1631,7 @@ regulator-min-microvolt = <1800000>; regulator-max-microvolt = <1800000>; regulator-always-on; - gpio = <&exp1 14 GPIO_ACTIVE_HIGH>; + gpio = <&exp1 9 GPIO_ACTIVE_HIGH>; enable-active-high; vin-supply = <&vdd_1v8>; }; From aba0c5046336c7c715943b2897b6e4c4d881f1fb Mon Sep 17 00:00:00 2001 From: Andreas Schwab Date: Mon, 13 Jan 2025 18:19:09 +0100 Subject: [PATCH 292/529] powerpc/prom_init: Fixup missing #size-cells on PowerBook6,7 [ Upstream commit 7e67ef889c9ab7246547db73d524459f47403a77 ] Similar to the PowerMac3,1, the PowerBook6,7 is missing the #size-cells property on the i2s node. Depends-on: commit 045b14ca5c36 ("of: WARN on deprecated #address-cells/#size-cells handling") Signed-off-by: Andreas Schwab Acked-by: Rob Herring (Arm) [maddy: added "commit" work in depends-on to avoid checkpatch error] Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/875xmizl6a.fsf@igel.home Signed-off-by: Sasha Levin --- arch/powerpc/kernel/prom_init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/prom_init.c b/arch/powerpc/kernel/prom_init.c index a6090896f749..ac669e58e202 100644 --- a/arch/powerpc/kernel/prom_init.c +++ b/arch/powerpc/kernel/prom_init.c @@ -2974,11 +2974,11 @@ static void __init fixup_device_tree_pmac(void) char type[8]; phandle node; - // Some pmacs are missing #size-cells on escc nodes + // Some pmacs are missing #size-cells on escc or i2s nodes for (node = 0; prom_next_node(&node); ) { type[0] = '\0'; prom_getprop(node, "device_type", type, sizeof(type)); - if (prom_strcmp(type, "escc")) + if (prom_strcmp(type, "escc") && prom_strcmp(type, "i2s")) continue; if (prom_getproplen(node, "#size-cells") != PROM_ERROR) From e6ca4d234b05820e88fe9dc87b632d0757b8724c Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 7 Mar 2025 09:42:42 +0100 Subject: [PATCH 293/529] ALSA: seq: Improve data consistency at polling [ Upstream commit e3cd33ab17c33bd8f1a9df66ec83a15dd8f7afbb ] snd_seq_poll() calls snd_seq_write_pool_allocated() that reads out a field in client->pool object, while it can be updated concurrently via ioctls, as reported by syzbot. The data race itself is harmless, as it's merely a poll() call, and the state is volatile. OTOH, the read out of poll object info from the caller side is fragile, and we can leave it better in snd_seq_pool_poll_wait() alone. A similar pattern is seen in snd_seq_kernel_client_write_poll(), too, which is called from the OSS sequencer. This patch drops the pool checks from the caller side and add the pool->lock in snd_seq_pool_poll_wait() for better data consistency. Reported-by: syzbot+2d373c9936c00d7e120c@syzkaller.appspotmail.com Closes: https://lore.kernel.org/67c88903.050a0220.15b4b9.0028.GAE@google.com Link: https://patch.msgid.link/20250307084246.29271-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/core/seq/seq_clientmgr.c | 5 +---- sound/core/seq/seq_memory.c | 1 + 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index 2d707afa1ef1..1252ea7ad55e 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1140,8 +1140,7 @@ static __poll_t snd_seq_poll(struct file *file, poll_table * wait) if (snd_seq_file_flags(file) & SNDRV_SEQ_LFLG_OUTPUT) { /* check if data is available in the pool */ - if (!snd_seq_write_pool_allocated(client) || - snd_seq_pool_poll_wait(client->pool, file, wait)) + if (snd_seq_pool_poll_wait(client->pool, file, wait)) mask |= EPOLLOUT | EPOLLWRNORM; } @@ -2382,8 +2381,6 @@ int snd_seq_kernel_client_write_poll(int clientid, struct file *file, poll_table if (client == NULL) return -ENXIO; - if (! snd_seq_write_pool_allocated(client)) - return 1; if (snd_seq_pool_poll_wait(client->pool, file, wait)) return 1; return 0; diff --git a/sound/core/seq/seq_memory.c b/sound/core/seq/seq_memory.c index 47ef6bc30c0e..e30b92d85079 100644 --- a/sound/core/seq/seq_memory.c +++ b/sound/core/seq/seq_memory.c @@ -366,6 +366,7 @@ int snd_seq_pool_poll_wait(struct snd_seq_pool *pool, struct file *file, poll_table *wait) { poll_wait(file, &pool->output_sleep, wait); + guard(spinlock_irq)(&pool->lock); return snd_seq_output_ok(pool); } From 2c86fafda7e2431adc16779c124926dcfd19a485 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 5 Mar 2025 13:05:50 +0000 Subject: [PATCH 294/529] tcp: bring back NUMA dispersion in inet_ehash_locks_alloc() [ Upstream commit f8ece40786c9342249aa0a1b55e148ee23b2a746 ] We have platforms with 6 NUMA nodes and 480 cpus. inet_ehash_locks_alloc() currently allocates a single 64KB page to hold all ehash spinlocks. This adds more pressure on a single node. Change inet_ehash_locks_alloc() to use vmalloc() to spread the spinlocks on all online nodes, driven by NUMA policies. At boot time, NUMA policy is interleave=all, meaning that tcp_hashinfo.ehash_locks gets hash dispersion on all nodes. Tested: lack5:~# grep inet_ehash_locks_alloc /proc/vmallocinfo 0x00000000d9aec4d1-0x00000000a828b652 69632 inet_ehash_locks_alloc+0x90/0x100 pages=16 vmalloc N0=2 N1=3 N2=3 N3=3 N4=3 N5=2 lack5:~# echo 8192 >/proc/sys/net/ipv4/tcp_child_ehash_entries lack5:~# numactl --interleave=all unshare -n bash -c "grep inet_ehash_locks_alloc /proc/vmallocinfo" 0x000000004e99d30c-0x00000000763f3279 36864 inet_ehash_locks_alloc+0x90/0x100 pages=8 vmalloc N0=1 N1=2 N2=2 N3=1 N4=1 N5=1 0x00000000d9aec4d1-0x00000000a828b652 69632 inet_ehash_locks_alloc+0x90/0x100 pages=16 vmalloc N0=2 N1=3 N2=3 N3=3 N4=3 N5=2 lack5:~# numactl --interleave=0,5 unshare -n bash -c "grep inet_ehash_locks_alloc /proc/vmallocinfo" 0x00000000fd73a33e-0x0000000004b9a177 36864 inet_ehash_locks_alloc+0x90/0x100 pages=8 vmalloc N0=4 N5=4 0x00000000d9aec4d1-0x00000000a828b652 69632 inet_ehash_locks_alloc+0x90/0x100 pages=16 vmalloc N0=2 N1=3 N2=3 N3=3 N4=3 N5=2 lack5:~# echo 1024 >/proc/sys/net/ipv4/tcp_child_ehash_entries lack5:~# numactl --interleave=all unshare -n bash -c "grep inet_ehash_locks_alloc /proc/vmallocinfo" 0x00000000db07d7a2-0x00000000ad697d29 8192 inet_ehash_locks_alloc+0x90/0x100 pages=1 vmalloc N2=1 0x00000000d9aec4d1-0x00000000a828b652 69632 inet_ehash_locks_alloc+0x90/0x100 pages=16 vmalloc N0=2 N1=3 N2=3 N3=3 N4=3 N5=2 Signed-off-by: Eric Dumazet Tested-by: Jason Xing Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250305130550.1865988-1-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/inet_hashtables.c | 37 ++++++++++++++++++++++++++----------- 1 file changed, 26 insertions(+), 11 deletions(-) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 321f509f2347..5e7cdcebd64f 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -1218,22 +1218,37 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) { unsigned int locksz = sizeof(spinlock_t); unsigned int i, nblocks = 1; + spinlock_t *ptr = NULL; - if (locksz != 0) { - /* allocate 2 cache lines or at least one spinlock per cpu */ - nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U); - nblocks = roundup_pow_of_two(nblocks * num_possible_cpus()); + if (locksz == 0) + goto set_mask; - /* no more locks than number of hash buckets */ - nblocks = min(nblocks, hashinfo->ehash_mask + 1); + /* Allocate 2 cache lines or at least one spinlock per cpu. */ + nblocks = max(2U * L1_CACHE_BYTES / locksz, 1U) * num_possible_cpus(); - hashinfo->ehash_locks = kvmalloc_array(nblocks, locksz, GFP_KERNEL); - if (!hashinfo->ehash_locks) - return -ENOMEM; + /* At least one page per NUMA node. */ + nblocks = max(nblocks, num_online_nodes() * PAGE_SIZE / locksz); - for (i = 0; i < nblocks; i++) - spin_lock_init(&hashinfo->ehash_locks[i]); + nblocks = roundup_pow_of_two(nblocks); + + /* No more locks than number of hash buckets. */ + nblocks = min(nblocks, hashinfo->ehash_mask + 1); + + if (num_online_nodes() > 1) { + /* Use vmalloc() to allow NUMA policy to spread pages + * on all available nodes if desired. + */ + ptr = vmalloc_array(nblocks, locksz); } + if (!ptr) { + ptr = kvmalloc_array(nblocks, locksz, GFP_KERNEL); + if (!ptr) + return -ENOMEM; + } + for (i = 0; i < nblocks; i++) + spin_lock_init(&ptr[i]); + hashinfo->ehash_locks = ptr; +set_mask: hashinfo->ehash_locks_mask = nblocks - 1; return 0; } From 9569e3589097ca9f5627ccca01bf04ef34994529 Mon Sep 17 00:00:00 2001 From: Alexandre Belloni Date: Mon, 3 Mar 2025 23:37:44 +0100 Subject: [PATCH 295/529] rtc: ds1307: stop disabling alarms on probe [ Upstream commit dcec12617ee61beed928e889607bf37e145bf86b ] It is a bad practice to disable alarms on probe or remove as this will prevent alarms across reboots. Link: https://lore.kernel.org/r/20250303223744.1135672-1-alexandre.belloni@bootlin.com Signed-off-by: Alexandre Belloni Signed-off-by: Sasha Levin --- drivers/rtc/rtc-ds1307.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/rtc/rtc-ds1307.c b/drivers/rtc/rtc-ds1307.c index d51565bcc189..b7f8b3f9b059 100644 --- a/drivers/rtc/rtc-ds1307.c +++ b/drivers/rtc/rtc-ds1307.c @@ -1802,10 +1802,8 @@ static int ds1307_probe(struct i2c_client *client, * For some variants, be sure alarms can trigger when we're * running on Vbackup (BBSQI/BBSQW) */ - if (want_irq || ds1307_can_wakeup_device) { + if (want_irq || ds1307_can_wakeup_device) regs[0] |= DS1337_BIT_INTCN | chip->bbsqi_bit; - regs[0] &= ~(DS1337_BIT_A2IE | DS1337_BIT_A1IE); - } regmap_write(ds1307->regmap, DS1337_REG_CONTROL, regs[0]); From fd10aa99ef398f791399c3cd075caf292b4ccd3b Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 5 Mar 2025 12:55:34 +0200 Subject: [PATCH 296/529] ieee802154: ca8210: Use proper setters and getters for bitwise types [ Upstream commit 169b2262205836a5d1213ff44dca2962276bece1 ] Sparse complains that the driver doesn't respect the bitwise types: drivers/net/ieee802154/ca8210.c:1796:27: warning: incorrect type in assignment (different base types) drivers/net/ieee802154/ca8210.c:1796:27: expected restricted __le16 [addressable] [assigned] [usertype] pan_id drivers/net/ieee802154/ca8210.c:1796:27: got unsigned short [usertype] drivers/net/ieee802154/ca8210.c:1801:25: warning: incorrect type in assignment (different base types) drivers/net/ieee802154/ca8210.c:1801:25: expected restricted __le16 [addressable] [assigned] [usertype] pan_id drivers/net/ieee802154/ca8210.c:1801:25: got unsigned short [usertype] drivers/net/ieee802154/ca8210.c:1928:28: warning: incorrect type in argument 3 (different base types) drivers/net/ieee802154/ca8210.c:1928:28: expected unsigned short [usertype] dst_pan_id drivers/net/ieee802154/ca8210.c:1928:28: got restricted __le16 [addressable] [usertype] pan_id Use proper setters and getters for bitwise types. Note, in accordance with [1] the protocol is little endian. Link: https://www.cascoda.com/wp-content/uploads/2018/11/CA-8210_datasheet_0418.pdf [1] Reviewed-by: Miquel Raynal Reviewed-by: Linus Walleij Signed-off-by: Andy Shevchenko Link: https://lore.kernel.org/20250305105656.2133487-2-andriy.shevchenko@linux.intel.com Signed-off-by: Stefan Schmidt Signed-off-by: Sasha Levin --- drivers/net/ieee802154/ca8210.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/drivers/net/ieee802154/ca8210.c b/drivers/net/ieee802154/ca8210.c index 1659bbffdb91..463be34a4ca4 100644 --- a/drivers/net/ieee802154/ca8210.c +++ b/drivers/net/ieee802154/ca8210.c @@ -1446,8 +1446,7 @@ static u8 mcps_data_request( command.pdata.data_req.src_addr_mode = src_addr_mode; command.pdata.data_req.dst.mode = dst_address_mode; if (dst_address_mode != MAC_MODE_NO_ADDR) { - command.pdata.data_req.dst.pan_id[0] = LS_BYTE(dst_pan_id); - command.pdata.data_req.dst.pan_id[1] = MS_BYTE(dst_pan_id); + put_unaligned_le16(dst_pan_id, command.pdata.data_req.dst.pan_id); if (dst_address_mode == MAC_MODE_SHORT_ADDR) { command.pdata.data_req.dst.address[0] = LS_BYTE( dst_addr->short_address @@ -1795,12 +1794,12 @@ static int ca8210_skb_rx( } hdr.source.mode = data_ind[0]; dev_dbg(&priv->spi->dev, "srcAddrMode: %#03x\n", hdr.source.mode); - hdr.source.pan_id = *(u16 *)&data_ind[1]; + hdr.source.pan_id = cpu_to_le16(get_unaligned_le16(&data_ind[1])); dev_dbg(&priv->spi->dev, "srcPanId: %#06x\n", hdr.source.pan_id); memcpy(&hdr.source.extended_addr, &data_ind[3], 8); hdr.dest.mode = data_ind[11]; dev_dbg(&priv->spi->dev, "dstAddrMode: %#03x\n", hdr.dest.mode); - hdr.dest.pan_id = *(u16 *)&data_ind[12]; + hdr.dest.pan_id = cpu_to_le16(get_unaligned_le16(&data_ind[12])); dev_dbg(&priv->spi->dev, "dstPanId: %#06x\n", hdr.dest.pan_id); memcpy(&hdr.dest.extended_addr, &data_ind[14], 8); @@ -1927,7 +1926,7 @@ static int ca8210_skb_tx( status = mcps_data_request( header.source.mode, header.dest.mode, - header.dest.pan_id, + le16_to_cpu(header.dest.pan_id), (union macaddr *)&header.dest.extended_addr, skb->len - mac_len, &skb->data[mac_len], From b649481a1c14f64c3bd52cd2cfe5a837ab6d7fc7 Mon Sep 17 00:00:00 2001 From: Svyatoslav Ryhel Date: Wed, 26 Feb 2025 12:56:11 +0200 Subject: [PATCH 297/529] ARM: tegra: Switch DSI-B clock parent to PLLD on Tegra114 [ Upstream commit 2b3db788f2f614b875b257cdb079adadedc060f3 ] PLLD is usually used as parent clock for internal video devices, like DSI for example, while PLLD2 is used as parent for HDMI. Signed-off-by: Svyatoslav Ryhel Link: https://lore.kernel.org/r/20250226105615.61087-3-clamor95@gmail.com Signed-off-by: Thierry Reding Signed-off-by: Sasha Levin --- arch/arm/boot/dts/tegra114.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/boot/dts/tegra114.dtsi b/arch/arm/boot/dts/tegra114.dtsi index 09996acad639..bb23bb39dd5b 100644 --- a/arch/arm/boot/dts/tegra114.dtsi +++ b/arch/arm/boot/dts/tegra114.dtsi @@ -139,7 +139,7 @@ reg = <0x54400000 0x00040000>; clocks = <&tegra_car TEGRA114_CLK_DSIB>, <&tegra_car TEGRA114_CLK_DSIBLP>, - <&tegra_car TEGRA114_CLK_PLL_D2_OUT0>; + <&tegra_car TEGRA114_CLK_PLL_D_OUT0>; clock-names = "dsi", "lp", "parent"; resets = <&tegra_car 82>; reset-names = "dsi"; From 2996d38f2dee64e0141878617a03198b198b721e Mon Sep 17 00:00:00 2001 From: Markus Elfring Date: Fri, 4 Oct 2024 15:50:15 +0200 Subject: [PATCH 298/529] media: c8sectpfe: Call of_node_put(i2c_bus) only once in c8sectpfe_probe() [ Upstream commit b773530a34df0687020520015057075f8b7b4ac4 ] An of_node_put(i2c_bus) call was immediately used after a pointer check for an of_find_i2c_adapter_by_node() call in this function implementation. Thus call such a function only once instead directly before the check. This issue was transformed by using the Coccinelle software. Signed-off-by: Markus Elfring Signed-off-by: Hans Verkuil Signed-off-by: Sasha Levin --- drivers/media/platform/st/sti/c8sectpfe/c8sectpfe-core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/media/platform/st/sti/c8sectpfe/c8sectpfe-core.c b/drivers/media/platform/st/sti/c8sectpfe/c8sectpfe-core.c index 1dbb89f0ddb8..b2a977f1ec18 100644 --- a/drivers/media/platform/st/sti/c8sectpfe/c8sectpfe-core.c +++ b/drivers/media/platform/st/sti/c8sectpfe/c8sectpfe-core.c @@ -802,13 +802,12 @@ static int c8sectpfe_probe(struct platform_device *pdev) } tsin->i2c_adapter = of_find_i2c_adapter_by_node(i2c_bus); + of_node_put(i2c_bus); if (!tsin->i2c_adapter) { dev_err(&pdev->dev, "No i2c adapter found\n"); - of_node_put(i2c_bus); ret = -ENODEV; goto err_node_put; } - of_node_put(i2c_bus); tsin->rst_gpio = of_get_named_gpio(child, "reset-gpios", 0); From 00586b78eeb7c626a14ca13453a1631f88a7cf36 Mon Sep 17 00:00:00 2001 From: Ming-Hung Tsai Date: Thu, 6 Mar 2025 16:41:50 +0800 Subject: [PATCH 299/529] dm cache: prevent BUG_ON by blocking retries on failed device resumes [ Upstream commit 5da692e2262b8f81993baa9592f57d12c2703dea ] A cache device failing to resume due to mapping errors should not be retried, as the failure leaves a partially initialized policy object. Repeating the resume operation risks triggering BUG_ON when reloading cache mappings into the incomplete policy object. Reproduce steps: 1. create a cache metadata consisting of 512 or more cache blocks, with some mappings stored in the first array block of the mapping array. Here we use cache_restore v1.0 to build the metadata. cat <> cmeta.xml EOF dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" cache_restore -i cmeta.xml -o /dev/mapper/cmeta --metadata-version=2 dmsetup remove cmeta 2. wipe the second array block of the mapping array to simulate data degradations. mapping_root=$(dd if=/dev/sdc bs=1c count=8 skip=192 \ 2>/dev/null | hexdump -e '1/8 "%u\n"') ablock=$(dd if=/dev/sdc bs=1c count=8 skip=$((4096*mapping_root+2056)) \ 2>/dev/null | hexdump -e '1/8 "%u\n"') dd if=/dev/zero of=/dev/sdc bs=4k count=1 seek=$ablock 3. try bringing up the cache device. The resume is expected to fail due to the broken array block. dmsetup create cmeta --table "0 8192 linear /dev/sdc 0" dmsetup create cdata --table "0 65536 linear /dev/sdc 8192" dmsetup create corig --table "0 524288 linear /dev/sdc 262144" dmsetup create cache --notable dmsetup load cache --table "0 524288 cache /dev/mapper/cmeta \ /dev/mapper/cdata /dev/mapper/corig 128 2 metadata2 writethrough smq 0" dmsetup resume cache 4. try resuming the cache again. An unexpected BUG_ON is triggered while loading cache mappings. dmsetup resume cache Kernel logs: (snip) ------------[ cut here ]------------ kernel BUG at drivers/md/dm-cache-policy-smq.c:752! Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN NOPTI CPU: 0 UID: 0 PID: 332 Comm: dmsetup Not tainted 6.13.4 #3 RIP: 0010:smq_load_mapping+0x3e5/0x570 Fix by disallowing resume operations for devices that failed the initial attempt. Signed-off-by: Ming-Hung Tsai Signed-off-by: Mikulas Patocka Signed-off-by: Sasha Levin --- drivers/md/dm-cache-target.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/md/dm-cache-target.c b/drivers/md/dm-cache-target.c index e714114d495a..66608b42ee1a 100644 --- a/drivers/md/dm-cache-target.c +++ b/drivers/md/dm-cache-target.c @@ -2875,6 +2875,27 @@ static dm_cblock_t get_cache_dev_size(struct cache *cache) return to_cblock(size); } +static bool can_resume(struct cache *cache) +{ + /* + * Disallow retrying the resume operation for devices that failed the + * first resume attempt, as the failure leaves the policy object partially + * initialized. Retrying could trigger BUG_ON when loading cache mappings + * into the incomplete policy object. + */ + if (cache->sized && !cache->loaded_mappings) { + if (get_cache_mode(cache) != CM_WRITE) + DMERR("%s: unable to resume a failed-loaded cache, please check metadata.", + cache_device_name(cache)); + else + DMERR("%s: unable to resume cache due to missing proper cache table reload", + cache_device_name(cache)); + return false; + } + + return true; +} + static bool can_resize(struct cache *cache, dm_cblock_t new_size) { if (from_cblock(new_size) > from_cblock(cache->cache_size)) { @@ -2923,6 +2944,9 @@ static int cache_preresume(struct dm_target *ti) struct cache *cache = ti->private; dm_cblock_t csize = get_cache_dev_size(cache); + if (!can_resume(cache)) + return -EINVAL; + /* * Check to see if the cache has resized. */ From 15602508ad2f923e228b9521960b4addcd27d9c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Wed, 5 Mar 2025 20:47:25 +0000 Subject: [PATCH 300/529] orangefs: Do not truncate file size [ Upstream commit 062e8093592fb866b8e016641a8b27feb6ac509d ] 'len' is used to store the result of i_size_read(), so making 'len' a size_t results in truncation to 4GiB on 32-bit systems. Signed-off-by: "Matthew Wilcox (Oracle)" Link: https://lore.kernel.org/r/20250305204734.1475264-2-willy@infradead.org Tested-by: Mike Marshall Signed-off-by: Christian Brauner Signed-off-by: Sasha Levin --- fs/orangefs/inode.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index b3bbb5a5787a..cc81ff6ac735 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -23,9 +23,9 @@ static int orangefs_writepage_locked(struct page *page, struct orangefs_write_range *wr = NULL; struct iov_iter iter; struct bio_vec bv; - size_t len, wlen; + size_t wlen; ssize_t ret; - loff_t off; + loff_t len, off; set_page_writeback(page); @@ -94,8 +94,7 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, struct orangefs_write_range *wrp, wr; struct iov_iter iter; ssize_t ret; - size_t len; - loff_t off; + loff_t len, off; int i; len = i_size_read(inode); From fdf85aa795f4c2cebdcdf93cf66eb5c1e5194c0d Mon Sep 17 00:00:00 2001 From: Choong Yong Liang Date: Thu, 27 Feb 2025 20:15:17 +0800 Subject: [PATCH 301/529] net: phylink: use pl->link_interface in phylink_expects_phy() [ Upstream commit b63263555eaafbf9ab1a82f2020bbee872d83759 ] The phylink_expects_phy() function allows MAC drivers to check if they are expecting a PHY to attach. The checking condition in phylink_expects_phy() aims to achieve the same result as the checking condition in phylink_attach_phy(). However, the checking condition in phylink_expects_phy() uses pl->link_config.interface, while phylink_attach_phy() uses pl->link_interface. Initially, both pl->link_interface and pl->link_config.interface are set to SGMII, and pl->cfg_link_an_mode is set to MLO_AN_INBAND. When the interface switches from SGMII to 2500BASE-X, pl->link_config.interface is updated by phylink_major_config(). At this point, pl->cfg_link_an_mode remains MLO_AN_INBAND, and pl->link_config.interface is set to 2500BASE-X. Subsequently, when the STMMAC interface is taken down administratively and brought back up, it is blocked by phylink_expects_phy(). Since phylink_expects_phy() and phylink_attach_phy() aim to achieve the same result, phylink_expects_phy() should check pl->link_interface, which never changes, instead of pl->link_config.interface, which is updated by phylink_major_config(). Reviewed-by: Russell King (Oracle) Signed-off-by: Choong Yong Liang Link: https://patch.msgid.link/20250227121522.1802832-2-yong.liang.choong@linux.intel.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/phy/phylink.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index fc58e4afb38d..3069a7df25d3 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -1566,7 +1566,7 @@ bool phylink_expects_phy(struct phylink *pl) { if (pl->cfg_link_an_mode == MLO_AN_FIXED || (pl->cfg_link_an_mode == MLO_AN_INBAND && - phy_interface_mode_is_8023z(pl->link_config.interface))) + phy_interface_mode_is_8023z(pl->link_interface))) return false; return true; } From 8d0d4c11cace475f1b07f98ee3d2bd334590e17b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matti=20Lehtim=C3=A4ki?= Date: Thu, 6 Feb 2025 20:56:48 +0100 Subject: [PATCH 302/529] remoteproc: qcom_wcnss: Handle platforms with only single power domain MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 65991ea8a6d1e68effdc01d95ebe39f1653f7b71 ] Both MSM8974 and MSM8226 have only CX as power domain with MX & PX being handled as regulators. Handle this case by reodering pd_names to have CX first, and handling that the driver core will already attach a single power domain internally. Signed-off-by: Matti Lehtimäki [luca: minor changes] Signed-off-by: Luca Weiss Link: https://lore.kernel.org/r/20250206-wcnss-singlepd-v2-2-9a53ee953dee@lucaweiss.eu [bjorn: Added missing braces to else after multi-statement if] Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- drivers/remoteproc/qcom_wcnss.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c index 68f37296b151..ce61e0e7cbeb 100644 --- a/drivers/remoteproc/qcom_wcnss.c +++ b/drivers/remoteproc/qcom_wcnss.c @@ -117,10 +117,10 @@ static const struct wcnss_data pronto_v1_data = { .pmu_offset = 0x1004, .spare_offset = 0x1088, - .pd_names = { "mx", "cx" }, + .pd_names = { "cx", "mx" }, .vregs = (struct wcnss_vreg_info[]) { - { "vddmx", 950000, 1150000, 0 }, { "vddcx", .super_turbo = true}, + { "vddmx", 950000, 1150000, 0 }, { "vddpx", 1800000, 1800000, 0 }, }, .num_pd_vregs = 2, @@ -131,10 +131,10 @@ static const struct wcnss_data pronto_v2_data = { .pmu_offset = 0x1004, .spare_offset = 0x1088, - .pd_names = { "mx", "cx" }, + .pd_names = { "cx", "mx" }, .vregs = (struct wcnss_vreg_info[]) { - { "vddmx", 1287500, 1287500, 0 }, { "vddcx", .super_turbo = true }, + { "vddmx", 1287500, 1287500, 0 }, { "vddpx", 1800000, 1800000, 0 }, }, .num_pd_vregs = 2, @@ -386,8 +386,17 @@ static irqreturn_t wcnss_stop_ack_interrupt(int irq, void *dev) static int wcnss_init_pds(struct qcom_wcnss *wcnss, const char * const pd_names[WCNSS_MAX_PDS]) { + struct device *dev = wcnss->dev; int i, ret; + /* Handle single power domain */ + if (dev->pm_domain) { + wcnss->pds[0] = dev; + wcnss->num_pds = 1; + pm_runtime_enable(dev); + return 0; + } + for (i = 0; i < WCNSS_MAX_PDS; i++) { if (!pd_names[i]) break; @@ -407,8 +416,15 @@ static int wcnss_init_pds(struct qcom_wcnss *wcnss, static void wcnss_release_pds(struct qcom_wcnss *wcnss) { + struct device *dev = wcnss->dev; int i; + /* Handle single power domain */ + if (wcnss->num_pds == 1 && dev->pm_domain) { + pm_runtime_disable(dev); + return; + } + for (i = 0; i < wcnss->num_pds; i++) dev_pm_domain_detach(wcnss->pds[i], false); } @@ -426,10 +442,13 @@ static int wcnss_init_regulators(struct qcom_wcnss *wcnss, * the regulators for the power domains. For old device trees we need to * reserve extra space to manage them through the regulator interface. */ - if (wcnss->num_pds) - info += num_pd_vregs; - else + if (wcnss->num_pds) { + info += wcnss->num_pds; + /* Handle single power domain case */ + num_vregs += num_pd_vregs - wcnss->num_pds; + } else { num_vregs += num_pd_vregs; + } bulk = devm_kcalloc(wcnss->dev, num_vregs, sizeof(struct regulator_bulk_data), From 41f654291ba7e440db4a370da16aab8286df1a54 Mon Sep 17 00:00:00 2001 From: Victor Lu Date: Thu, 13 Feb 2025 18:38:28 -0500 Subject: [PATCH 303/529] drm/amdgpu: Do not program AGP BAR regs under SRIOV in gfxhub_v1_0.c [ Upstream commit 057fef20b8401110a7bc1c2fe9d804a8a0bf0d24 ] SRIOV VF does not have write access to AGP BAR regs. Skip the writes to avoid a dmesg warning. Signed-off-by: Victor Lu Acked-by: Alex Deucher Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c index ec4d5e15b766..de74686cb1db 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfxhub_v1_0.c @@ -92,12 +92,12 @@ static void gfxhub_v1_0_init_system_aperture_regs(struct amdgpu_device *adev) { uint64_t value; - /* Program the AGP BAR */ - WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_BASE, 0); - WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_BOT, adev->gmc.agp_start >> 24); - WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_TOP, adev->gmc.agp_end >> 24); - if (!amdgpu_sriov_vf(adev) || adev->asic_type <= CHIP_VEGA10) { + /* Program the AGP BAR */ + WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_BASE, 0); + WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_BOT, adev->gmc.agp_start >> 24); + WREG32_SOC15_RLC(GC, 0, mmMC_VM_AGP_TOP, adev->gmc.agp_end >> 24); + /* Program the system aperture low logical page number. */ WREG32_SOC15_RLC(GC, 0, mmMC_VM_SYSTEM_APERTURE_LOW_ADDR, min(adev->gmc.fb_start, adev->gmc.agp_start) >> 18); From 9d1a5be86dbe074bd8dd6bdd63a99d6bb66d5930 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 24 Feb 2025 14:13:24 +0100 Subject: [PATCH 304/529] media: cx231xx: set device_caps for 417 [ Upstream commit a79efc44b51432490538a55b9753a721f7d3ea42 ] The video_device for the MPEG encoder did not set device_caps. Add this, otherwise the video device can't be registered (you get a WARN_ON instead). Not seen before since currently 417 support is disabled, but I found this while experimenting with it. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/usb/cx231xx/cx231xx-417.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/media/usb/cx231xx/cx231xx-417.c b/drivers/media/usb/cx231xx/cx231xx-417.c index c5e21785fafe..02343e88cc61 100644 --- a/drivers/media/usb/cx231xx/cx231xx-417.c +++ b/drivers/media/usb/cx231xx/cx231xx-417.c @@ -1722,6 +1722,8 @@ static void cx231xx_video_dev_init( vfd->lock = &dev->lock; vfd->release = video_device_release_empty; vfd->ctrl_handler = &dev->mpeg_ctrl_handler.hdl; + vfd->device_caps = V4L2_CAP_READWRITE | V4L2_CAP_STREAMING | + V4L2_CAP_VIDEO_CAPTURE; video_set_drvdata(vfd, dev); if (dev->tuner_type == TUNER_ABSENT) { v4l2_disable_ioctl(vfd, VIDIOC_G_FREQUENCY); From 4473bd0993c4eecee9a66ab8c3138d27b5485a68 Mon Sep 17 00:00:00 2001 From: Artur Weber Date: Mon, 3 Mar 2025 21:54:47 +0100 Subject: [PATCH 305/529] pinctrl: bcm281xx: Use "unsigned int" instead of bare "unsigned" [ Upstream commit 07b5a2a13f4704c5eae3be7277ec54ffdba45f72 ] Replace uses of bare "unsigned" with "unsigned int" to fix checkpatch warnings. No functional change. Signed-off-by: Artur Weber Link: https://lore.kernel.org/20250303-bcm21664-pinctrl-v3-2-5f8b80e4ab51@gmail.com Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin --- drivers/pinctrl/bcm/pinctrl-bcm281xx.c | 44 +++++++++++++------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/pinctrl/bcm/pinctrl-bcm281xx.c b/drivers/pinctrl/bcm/pinctrl-bcm281xx.c index bba5496335ee..c313f0178957 100644 --- a/drivers/pinctrl/bcm/pinctrl-bcm281xx.c +++ b/drivers/pinctrl/bcm/pinctrl-bcm281xx.c @@ -69,7 +69,7 @@ static enum bcm281xx_pin_type hdmi_pin = BCM281XX_PIN_TYPE_HDMI; struct bcm281xx_pin_function { const char *name; const char * const *groups; - const unsigned ngroups; + const unsigned int ngroups; }; /* @@ -81,10 +81,10 @@ struct bcm281xx_pinctrl_data { /* List of all pins */ const struct pinctrl_pin_desc *pins; - const unsigned npins; + const unsigned int npins; const struct bcm281xx_pin_function *functions; - const unsigned nfunctions; + const unsigned int nfunctions; struct regmap *regmap; }; @@ -938,7 +938,7 @@ static struct bcm281xx_pinctrl_data bcm281xx_pinctrl = { }; static inline enum bcm281xx_pin_type pin_type_get(struct pinctrl_dev *pctldev, - unsigned pin) + unsigned int pin) { struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev); @@ -982,7 +982,7 @@ static int bcm281xx_pinctrl_get_groups_count(struct pinctrl_dev *pctldev) } static const char *bcm281xx_pinctrl_get_group_name(struct pinctrl_dev *pctldev, - unsigned group) + unsigned int group) { struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev); @@ -990,9 +990,9 @@ static const char *bcm281xx_pinctrl_get_group_name(struct pinctrl_dev *pctldev, } static int bcm281xx_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, - unsigned group, + unsigned int group, const unsigned **pins, - unsigned *num_pins) + unsigned int *num_pins) { struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev); @@ -1004,7 +1004,7 @@ static int bcm281xx_pinctrl_get_group_pins(struct pinctrl_dev *pctldev, static void bcm281xx_pinctrl_pin_dbg_show(struct pinctrl_dev *pctldev, struct seq_file *s, - unsigned offset) + unsigned int offset) { seq_printf(s, " %s", dev_name(pctldev->dev)); } @@ -1026,7 +1026,7 @@ static int bcm281xx_pinctrl_get_fcns_count(struct pinctrl_dev *pctldev) } static const char *bcm281xx_pinctrl_get_fcn_name(struct pinctrl_dev *pctldev, - unsigned function) + unsigned int function) { struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev); @@ -1034,9 +1034,9 @@ static const char *bcm281xx_pinctrl_get_fcn_name(struct pinctrl_dev *pctldev, } static int bcm281xx_pinctrl_get_fcn_groups(struct pinctrl_dev *pctldev, - unsigned function, + unsigned int function, const char * const **groups, - unsigned * const num_groups) + unsigned int * const num_groups) { struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev); @@ -1047,8 +1047,8 @@ static int bcm281xx_pinctrl_get_fcn_groups(struct pinctrl_dev *pctldev, } static int bcm281xx_pinmux_set(struct pinctrl_dev *pctldev, - unsigned function, - unsigned group) + unsigned int function, + unsigned int group) { struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev); const struct bcm281xx_pin_function *f = &pdata->functions[function]; @@ -1079,7 +1079,7 @@ static const struct pinmux_ops bcm281xx_pinctrl_pinmux_ops = { }; static int bcm281xx_pinctrl_pin_config_get(struct pinctrl_dev *pctldev, - unsigned pin, + unsigned int pin, unsigned long *config) { return -ENOTSUPP; @@ -1088,9 +1088,9 @@ static int bcm281xx_pinctrl_pin_config_get(struct pinctrl_dev *pctldev, /* Goes through the configs and update register val/mask */ static int bcm281xx_std_pin_update(struct pinctrl_dev *pctldev, - unsigned pin, + unsigned int pin, unsigned long *configs, - unsigned num_configs, + unsigned int num_configs, u32 *val, u32 *mask) { @@ -1204,9 +1204,9 @@ static const u16 bcm281xx_pullup_map[] = { /* Goes through the configs and update register val/mask */ static int bcm281xx_i2c_pin_update(struct pinctrl_dev *pctldev, - unsigned pin, + unsigned int pin, unsigned long *configs, - unsigned num_configs, + unsigned int num_configs, u32 *val, u32 *mask) { @@ -1274,9 +1274,9 @@ static int bcm281xx_i2c_pin_update(struct pinctrl_dev *pctldev, /* Goes through the configs and update register val/mask */ static int bcm281xx_hdmi_pin_update(struct pinctrl_dev *pctldev, - unsigned pin, + unsigned int pin, unsigned long *configs, - unsigned num_configs, + unsigned int num_configs, u32 *val, u32 *mask) { @@ -1318,9 +1318,9 @@ static int bcm281xx_hdmi_pin_update(struct pinctrl_dev *pctldev, } static int bcm281xx_pinctrl_pin_config_set(struct pinctrl_dev *pctldev, - unsigned pin, + unsigned int pin, unsigned long *configs, - unsigned num_configs) + unsigned int num_configs) { struct bcm281xx_pinctrl_data *pdata = pinctrl_dev_get_drvdata(pctldev); enum bcm281xx_pin_type pin_type; From d1f95fbdf654fe834fa8e5de1d956dfd9bee78f0 Mon Sep 17 00:00:00 2001 From: Alexander Sverdlin Date: Mon, 3 Mar 2025 08:46:57 +0100 Subject: [PATCH 306/529] net: ethernet: ti: cpsw_new: populate netdev of_node [ Upstream commit 7ff1c88fc89688c27f773ba956f65f0c11367269 ] So that of_find_net_device_by_node() can find CPSW ports and other DSA switches can be stacked downstream. Tested in conjunction with KSZ8873. Reviewed-by: Siddharth Vadapalli Reviewed-by: Andrew Lunn Signed-off-by: Alexander Sverdlin Link: https://patch.msgid.link/20250303074703.1758297-1-alexander.sverdlin@siemens.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/ti/cpsw_new.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/ti/cpsw_new.c b/drivers/net/ethernet/ti/cpsw_new.c index 6e70aa1cc7bf..42684cb83606 100644 --- a/drivers/net/ethernet/ti/cpsw_new.c +++ b/drivers/net/ethernet/ti/cpsw_new.c @@ -1411,6 +1411,7 @@ static int cpsw_create_ports(struct cpsw_common *cpsw) ndev->netdev_ops = &cpsw_netdev_ops; ndev->ethtool_ops = &cpsw_ethtool_ops; SET_NETDEV_DEV(ndev, dev); + ndev->dev.of_node = slave_data->slave_node; if (!napi_ndev) { /* CPSW Host port CPDMA interface is shared between From 48bd9b9d666ee6b14b03476d47d30fc2ac7eab29 Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Thu, 27 Feb 2025 14:56:00 +0100 Subject: [PATCH 307/529] net: pktgen: fix mpls maximum labels list parsing [ Upstream commit 2b15a0693f70d1e8119743ee89edbfb1271b3ea8 ] Fix mpls maximum labels list parsing up to MAX_MPLS_LABELS entries (instead of up to MAX_MPLS_LABELS - 1). Addresses the following: $ echo "mpls 00000f00,00000f01,00000f02,00000f03,00000f04,00000f05,00000f06,00000f07,00000f08,00000f09,00000f0a,00000f0b,00000f0c,00000f0d,00000f0e,00000f0f" > /proc/net/pktgen/lo\@0 -bash: echo: write error: Argument list too long Signed-off-by: Peter Seiderer Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/core/pktgen.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index a2fb951996b8..5917820f92c3 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -897,6 +897,10 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) pkt_dev->nr_labels = 0; do { __u32 tmp; + + if (n >= MAX_MPLS_LABELS) + return -E2BIG; + len = hex32_arg(&buffer[i], 8, &tmp); if (len <= 0) return len; @@ -908,8 +912,6 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) return -EFAULT; i++; n++; - if (n >= MAX_MPLS_LABELS) - return -E2BIG; } while (c == ','); pkt_dev->nr_labels = n; From 628ff556e4ab56e59655939e67034d9e0475f7da Mon Sep 17 00:00:00 2001 From: Saket Kumar Bhaskar Date: Mon, 3 Mar 2025 14:54:51 +0530 Subject: [PATCH 308/529] perf/hw_breakpoint: Return EOPNOTSUPP for unsupported breakpoint type [ Upstream commit 061c991697062f3bf87b72ed553d1d33a0e370dd ] Currently, __reserve_bp_slot() returns -ENOSPC for unsupported breakpoint types on the architecture. For example, powerpc does not support hardware instruction breakpoints. This causes the perf_skip BPF selftest to fail, as neither ENOENT nor EOPNOTSUPP is returned by perf_event_open for unsupported breakpoint types. As a result, the test that should be skipped for this arch is not correctly identified. To resolve this, hw_breakpoint_event_init() should exit early by checking for unsupported breakpoint types using hw_breakpoint_slots_cached() and return the appropriate error (-EOPNOTSUPP). Signed-off-by: Saket Kumar Bhaskar Signed-off-by: Ingo Molnar Cc: Marco Elver Cc: Dmitry Vyukov Cc: Ian Rogers Cc: Frederic Weisbecker Link: https://lore.kernel.org/r/20250303092451.1862862-1-skb99@linux.ibm.com Signed-off-by: Sasha Levin --- kernel/events/hw_breakpoint.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/events/hw_breakpoint.c b/kernel/events/hw_breakpoint.c index c3797701339c..382a3b04f6d3 100644 --- a/kernel/events/hw_breakpoint.c +++ b/kernel/events/hw_breakpoint.c @@ -978,9 +978,10 @@ static int hw_breakpoint_event_init(struct perf_event *bp) return -ENOENT; /* - * no branch sampling for breakpoint events + * Check if breakpoint type is supported before proceeding. + * Also, no branch sampling for breakpoint events. */ - if (has_branch_stack(bp)) + if (!hw_breakpoint_slots_cached(find_slot_idx(bp->attr.bp_type)) || has_branch_stack(bp)) return -EOPNOTSUPP; err = register_perf_hw_breakpoint(bp); From f29dd5afa1f2e085617ca265eb36cead1bdb304a Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Sun, 16 Feb 2025 22:31:03 +0100 Subject: [PATCH 309/529] ALSA: hda/realtek: Enable PC beep passthrough for HP EliteBook 855 G7 [ Upstream commit aa85822c611aef7cd4dc17d27121d43e21bb82f0 ] PC speaker works well on this platform in BIOS and in Linux until sound card drivers are loaded. Then it stops working. There seems to be a beep generator node at 0x1a in this CODEC (ALC269_TYPE_ALC215) but it seems to be only connected to capture mixers at nodes 0x22 and 0x23. If I unmute the mixer input for 0x1a at node 0x23 and start recording from its "ALC285 Analog" capture device I can clearly hear beeps in that recording. So the beep generator is indeed working properly, however I wasn't able to figure out any way to connect it to speakers. However, the bits in the "Passthrough Control" register (0x36) seems to work at least partially: by zeroing "B" and "h" and setting "S" I can at least make the PIT PC speaker output appear either in this laptop speakers or headphones (depending on whether they are connected or not). There are some caveats, however: * If the CODEC gets runtime-suspended the beeps stop so it needs HDA beep device for keeping it awake during beeping. * If the beep generator node is generating any beep the PC beep passthrough seems to be temporarily inhibited, so the HDA beep device has to be prevented from using the actual beep generator node - but the beep device is still necessary due to the previous point. * In contrast with other platforms here beep amplification has to be disabled otherwise the beeps output are WAY louder than they were on pure BIOS setup. Unless someone (from Realtek probably) knows how to make the beep generator node output appear in speakers / headphones using PC beep passthrough seems to be the only way to make PC speaker beeping actually work on this platform. Signed-off-by: Maciej S. Szmigiero Acked-by: kailang@realtek.com Link: https://patch.msgid.link/7461f695b4daed80f2fc4b1463ead47f04f9ad05.1739741254.git.mail@maciej.szmigiero.name Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- include/sound/hda_codec.h | 1 + sound/pci/hda/hda_beep.c | 15 +++++++++------ sound/pci/hda/patch_realtek.c | 34 +++++++++++++++++++++++++++++++++- 3 files changed, 43 insertions(+), 7 deletions(-) diff --git a/include/sound/hda_codec.h b/include/sound/hda_codec.h index bbb7805e85d8..4ca45d5895df 100644 --- a/include/sound/hda_codec.h +++ b/include/sound/hda_codec.h @@ -199,6 +199,7 @@ struct hda_codec { /* beep device */ struct hda_beep *beep; unsigned int beep_mode; + bool beep_just_power_on; /* widget capabilities cache */ u32 *wcaps; diff --git a/sound/pci/hda/hda_beep.c b/sound/pci/hda/hda_beep.c index e63621bcb214..1a684e47d4d1 100644 --- a/sound/pci/hda/hda_beep.c +++ b/sound/pci/hda/hda_beep.c @@ -31,8 +31,9 @@ static void generate_tone(struct hda_beep *beep, int tone) beep->power_hook(beep, true); beep->playing = 1; } - snd_hda_codec_write(codec, beep->nid, 0, - AC_VERB_SET_BEEP_CONTROL, tone); + if (!codec->beep_just_power_on) + snd_hda_codec_write(codec, beep->nid, 0, + AC_VERB_SET_BEEP_CONTROL, tone); if (!tone && beep->playing) { beep->playing = 0; if (beep->power_hook) @@ -212,10 +213,12 @@ int snd_hda_attach_beep_device(struct hda_codec *codec, int nid) struct hda_beep *beep; int err; - if (!snd_hda_get_bool_hint(codec, "beep")) - return 0; /* disabled explicitly by hints */ - if (codec->beep_mode == HDA_BEEP_MODE_OFF) - return 0; /* disabled by module option */ + if (!codec->beep_just_power_on) { + if (!snd_hda_get_bool_hint(codec, "beep")) + return 0; /* disabled explicitly by hints */ + if (codec->beep_mode == HDA_BEEP_MODE_OFF) + return 0; /* disabled by module option */ + } beep = kzalloc(sizeof(*beep), GFP_KERNEL); if (beep == NULL) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 61b48f2418bf..2f67cd955d65 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -24,6 +24,7 @@ #include #include "hda_local.h" #include "hda_auto_parser.h" +#include "hda_beep.h" #include "hda_jack.h" #include "hda_generic.h" #include "hda_component.h" @@ -6858,6 +6859,30 @@ static void alc285_fixup_hp_envy_x360(struct hda_codec *codec, } } +static void alc285_fixup_hp_beep(struct hda_codec *codec, + const struct hda_fixup *fix, int action) +{ + if (action == HDA_FIXUP_ACT_PRE_PROBE) { + codec->beep_just_power_on = true; + } else if (action == HDA_FIXUP_ACT_INIT) { +#ifdef CONFIG_SND_HDA_INPUT_BEEP + /* + * Just enable loopback to internal speaker and headphone jack. + * Disable amplification to get about the same beep volume as + * was on pure BIOS setup before loading the driver. + */ + alc_update_coef_idx(codec, 0x36, 0x7070, BIT(13)); + + snd_hda_enable_beep_device(codec, 1); + +#if !IS_ENABLED(CONFIG_INPUT_PCSPKR) + dev_warn_once(hda_codec_dev(codec), + "enable CONFIG_INPUT_PCSPKR to get PC beeps\n"); +#endif +#endif + } +} + /* for hda_fixup_thinkpad_acpi() */ #include "thinkpad_helper.c" @@ -7400,6 +7425,7 @@ enum { ALC285_FIXUP_HP_GPIO_LED, ALC285_FIXUP_HP_MUTE_LED, ALC285_FIXUP_HP_SPECTRE_X360_MUTE_LED, + ALC285_FIXUP_HP_BEEP_MICMUTE_LED, ALC236_FIXUP_HP_MUTE_LED_COEFBIT2, ALC236_FIXUP_HP_GPIO_LED, ALC236_FIXUP_HP_MUTE_LED, @@ -8947,6 +8973,12 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc285_fixup_hp_spectre_x360_mute_led, }, + [ALC285_FIXUP_HP_BEEP_MICMUTE_LED] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_hp_beep, + .chained = true, + .chain_id = ALC285_FIXUP_HP_MUTE_LED, + }, [ALC236_FIXUP_HP_MUTE_LED_COEFBIT2] = { .type = HDA_FIXUP_FUNC, .v.func = alc236_fixup_hp_mute_led_coefbit2, @@ -9860,7 +9892,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8730, "HP ProBook 445 G7", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8735, "HP ProBook 435 G7", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8736, "HP", ALC285_FIXUP_HP_GPIO_AMP_INIT), - SND_PCI_QUIRK(0x103c, 0x8760, "HP", ALC285_FIXUP_HP_MUTE_LED), + SND_PCI_QUIRK(0x103c, 0x8760, "HP EliteBook 8{4,5}5 G7", ALC285_FIXUP_HP_BEEP_MICMUTE_LED), SND_PCI_QUIRK(0x103c, 0x876e, "HP ENVY x360 Convertible 13-ay0xxx", ALC245_FIXUP_HP_X360_MUTE_LEDS), SND_PCI_QUIRK(0x103c, 0x877a, "HP", ALC285_FIXUP_HP_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x877d, "HP", ALC236_FIXUP_HP_MUTE_LED), From 79f3edb9374f2aeb9c3c30afda9d623b0efafbdb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 27 Feb 2025 20:23:27 -0800 Subject: [PATCH 310/529] ipv4: fib: Move fib_valid_key_len() to rtm_to_fib_config(). [ Upstream commit 254ba7e6032d3fc738050d500b0c1d8197af90ca ] fib_valid_key_len() is called in the beginning of fib_table_insert() or fib_table_delete() to check if the prefix length is valid. fib_table_insert() and fib_table_delete() are called from 3 paths - ip_rt_ioctl() - inet_rtm_newroute() / inet_rtm_delroute() - fib_magic() In the first ioctl() path, rtentry_to_fib_config() checks the prefix length with bad_mask(). Also, fib_magic() always passes the correct prefix: 32 or ifa->ifa_prefixlen, which is already validated. Let's move fib_valid_key_len() to the rtnetlink path, rtm_to_fib_config(). While at it, 2 direct returns in rtm_to_fib_config() are changed to goto to match other places in the same function Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: David Ahern Link: https://patch.msgid.link/20250228042328.96624-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/fib_frontend.c | 18 ++++++++++++++++-- net/ipv4/fib_trie.c | 22 ---------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 90ce87ffed46..7993ff46de23 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -829,19 +829,33 @@ static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, } } + if (cfg->fc_dst_len > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length"); + err = -EINVAL; + goto errout; + } + + if (cfg->fc_dst_len < 32 && (ntohl(cfg->fc_dst) << cfg->fc_dst_len)) { + NL_SET_ERR_MSG(extack, "Invalid prefix for given prefix length"); + err = -EINVAL; + goto errout; + } + if (cfg->fc_nh_id) { if (cfg->fc_oif || cfg->fc_gw_family || cfg->fc_encap || cfg->fc_mp) { NL_SET_ERR_MSG(extack, "Nexthop specification and nexthop id are mutually exclusive"); - return -EINVAL; + err = -EINVAL; + goto errout; } } if (has_gw && has_via) { NL_SET_ERR_MSG(extack, "Nexthop configuration can not contain both GATEWAY and VIA"); - return -EINVAL; + err = -EINVAL; + goto errout; } if (!cfg->fc_table) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 77b97c48da5e..fa54b36b241a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1192,22 +1192,6 @@ static int fib_insert_alias(struct trie *t, struct key_vector *tp, return 0; } -static bool fib_valid_key_len(u32 key, u8 plen, struct netlink_ext_ack *extack) -{ - if (plen > KEYLENGTH) { - NL_SET_ERR_MSG(extack, "Invalid prefix length"); - return false; - } - - if ((plen < KEYLENGTH) && (key << plen)) { - NL_SET_ERR_MSG(extack, - "Invalid prefix for given prefix length"); - return false; - } - - return true; -} - static void fib_remove_alias(struct trie *t, struct key_vector *tp, struct key_vector *l, struct fib_alias *old); @@ -1228,9 +1212,6 @@ int fib_table_insert(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - pr_debug("Insert table=%u %08x/%d\n", tb->tb_id, key, plen); fi = fib_create_info(cfg, extack); @@ -1723,9 +1704,6 @@ int fib_table_delete(struct net *net, struct fib_table *tb, key = ntohl(cfg->fc_dst); - if (!fib_valid_key_len(key, plen, extack)) - return -EINVAL; - l = fib_find_node(t, &tp, key); if (!l) return -ESRCH; From 7d49558be034709534ae4c93450538d51d87bb4b Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Mon, 3 Mar 2025 11:44:17 +0800 Subject: [PATCH 311/529] drm/rockchip: vop2: Add uv swap for cluster window [ Upstream commit e7aae9f6d762139f8d2b86db03793ae0ab3dd802 ] The Cluster windows of upcoming VOP on rk3576 also support linear YUV support, we need to set uv swap bit for it. As the VOP2_WIN_UV_SWA register defined on rk3568/rk3588 is 0xffffffff, so this register will not be touched on these two platforms. Signed-off-by: Andy Yan Tested-by: Michael Riesch # on RK3568 Tested-by: Detlev Casanova Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20250303034436.192400-4-andyshrk@163.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/rockchip/rockchip_drm_vop2.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c index 955ef2caac89..6efa0a51b7d6 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop2.c @@ -1289,10 +1289,8 @@ static void vop2_plane_atomic_update(struct drm_plane *plane, rb_swap = vop2_win_rb_swap(fb->format->format); vop2_win_write(win, VOP2_WIN_RB_SWAP, rb_swap); - if (!vop2_cluster_window(win)) { - uv_swap = vop2_win_uv_swap(fb->format->format); - vop2_win_write(win, VOP2_WIN_UV_SWAP, uv_swap); - } + uv_swap = vop2_win_uv_swap(fb->format->format); + vop2_win_write(win, VOP2_WIN_UV_SWAP, uv_swap); if (fb->format->is_yuv) { vop2_win_write(win, VOP2_WIN_UV_VIR, DIV_ROUND_UP(fb->pitches[1], 4)); From 6782a62c32896ef9dc482fd6b264a390df046747 Mon Sep 17 00:00:00 2001 From: Ricardo Ribalda Date: Mon, 3 Feb 2025 11:55:51 +0000 Subject: [PATCH 312/529] media: uvcvideo: Add sanity check to uvc_ioctl_xu_ctrl_map [ Upstream commit 990262fdfce24d6055df9711424343d94d829e6a ] Do not process unknown data types. Tested-by: Yunke Cao Reviewed-by: Hans de Goede Signed-off-by: Ricardo Ribalda Link: https://lore.kernel.org/r/20250203-uvc-roi-v17-15-5900a9fed613@chromium.org Signed-off-by: Hans de Goede Signed-off-by: Hans Verkuil Signed-off-by: Sasha Levin --- drivers/media/usb/uvc/uvc_v4l2.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/media/usb/uvc/uvc_v4l2.c b/drivers/media/usb/uvc/uvc_v4l2.c index bd4677a6e653..0aaa4fce61da 100644 --- a/drivers/media/usb/uvc/uvc_v4l2.c +++ b/drivers/media/usb/uvc/uvc_v4l2.c @@ -36,6 +36,12 @@ static int uvc_ioctl_ctrl_map(struct uvc_video_chain *chain, unsigned int size; int ret; + if (xmap->data_type > UVC_CTRL_DATA_TYPE_BITMASK) { + uvc_dbg(chain->dev, CONTROL, + "Unsupported UVC data type %u\n", xmap->data_type); + return -EINVAL; + } + map = kzalloc(sizeof(*map), GFP_KERNEL); if (map == NULL) return -ENOMEM; From a5df17f35ba523723b365e34c28cbcd34fcae730 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 18 Feb 2025 19:26:46 +0100 Subject: [PATCH 313/529] clk: imx8mp: inform CCF of maximum frequency of clocks [ Upstream commit 06a61b5cb6a8638fa8823cd09b17233b29696fa2 ] The IMX8MPCEC datasheet lists maximum frequencies allowed for different modules. Some of these limits are universal, but some depend on whether the SoC is operating in nominal or in overdrive mode. The imx8mp.dtsi currently assumes overdrive mode and configures some clocks in accordance with this. Boards wishing to make use of nominal mode will need to override some of the clock rates manually. As operating the clocks outside of their allowed range can lead to difficult to debug issues, it makes sense to register the maximum rates allowed in the driver, so the CCF can take them into account. Reviewed-by: Peng Fan Signed-off-by: Ahmad Fatoum Link: https://lore.kernel.org/r/20250218-imx8m-clk-v4-6-b7697dc2dcd0@pengutronix.de Signed-off-by: Abel Vesa Signed-off-by: Sasha Levin --- drivers/clk/imx/clk-imx8mp.c | 151 +++++++++++++++++++++++++++++++++++ 1 file changed, 151 insertions(+) diff --git a/drivers/clk/imx/clk-imx8mp.c b/drivers/clk/imx/clk-imx8mp.c index 444dfd6adfe6..a74b73cd243e 100644 --- a/drivers/clk/imx/clk-imx8mp.c +++ b/drivers/clk/imx/clk-imx8mp.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -405,11 +406,151 @@ static const char * const imx8mp_clkout_sels[] = {"audio_pll1_out", "audio_pll2_ static struct clk_hw **hws; static struct clk_hw_onecell_data *clk_hw_data; +struct imx8mp_clock_constraints { + unsigned int clkid; + u32 maxrate; +}; + +/* + * Below tables are taken from IMX8MPCEC Rev. 2.1, 07/2023 + * Table 13. Maximum frequency of modules. + * Probable typos fixed are marked with a comment. + */ +static const struct imx8mp_clock_constraints imx8mp_clock_common_constraints[] = { + { IMX8MP_CLK_A53_DIV, 1000 * HZ_PER_MHZ }, + { IMX8MP_CLK_ENET_AXI, 266666667 }, /* Datasheet claims 266MHz */ + { IMX8MP_CLK_NAND_USDHC_BUS, 266666667 }, /* Datasheet claims 266MHz */ + { IMX8MP_CLK_MEDIA_APB, 200 * HZ_PER_MHZ }, + { IMX8MP_CLK_HDMI_APB, 133333333 }, /* Datasheet claims 133MHz */ + { IMX8MP_CLK_ML_AXI, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_AHB, 133333333 }, + { IMX8MP_CLK_IPG_ROOT, 66666667 }, + { IMX8MP_CLK_AUDIO_AHB, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_DISP2_PIX, 170 * HZ_PER_MHZ }, + { IMX8MP_CLK_DRAM_ALT, 666666667 }, + { IMX8MP_CLK_DRAM_APB, 200 * HZ_PER_MHZ }, + { IMX8MP_CLK_CAN1, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_CAN2, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_PCIE_AUX, 10 * HZ_PER_MHZ }, + { IMX8MP_CLK_I2C5, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_I2C6, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_SAI1, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_SAI2, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_SAI3, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_SAI5, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_SAI6, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_ENET_QOS, 125 * HZ_PER_MHZ }, + { IMX8MP_CLK_ENET_QOS_TIMER, 200 * HZ_PER_MHZ }, + { IMX8MP_CLK_ENET_REF, 125 * HZ_PER_MHZ }, + { IMX8MP_CLK_ENET_TIMER, 125 * HZ_PER_MHZ }, + { IMX8MP_CLK_ENET_PHY_REF, 125 * HZ_PER_MHZ }, + { IMX8MP_CLK_NAND, 500 * HZ_PER_MHZ }, + { IMX8MP_CLK_QSPI, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_USDHC1, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_USDHC2, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_I2C1, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_I2C2, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_I2C3, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_I2C4, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_UART1, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_UART2, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_UART3, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_UART4, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_ECSPI1, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_ECSPI2, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_PWM1, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_PWM2, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_PWM3, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_PWM4, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_GPT1, 100 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPT2, 100 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPT3, 100 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPT4, 100 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPT5, 100 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPT6, 100 * HZ_PER_MHZ }, + { IMX8MP_CLK_WDOG, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_IPP_DO_CLKO1, 200 * HZ_PER_MHZ }, + { IMX8MP_CLK_IPP_DO_CLKO2, 200 * HZ_PER_MHZ }, + { IMX8MP_CLK_HDMI_REF_266M, 266 * HZ_PER_MHZ }, + { IMX8MP_CLK_USDHC3, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_MIPI_PHY1_REF, 300 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_DISP1_PIX, 250 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_CAM2_PIX, 277 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_LDB, 595 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_MIPI_TEST_BYTE, 200 * HZ_PER_MHZ }, + { IMX8MP_CLK_ECSPI3, 80 * HZ_PER_MHZ }, + { IMX8MP_CLK_PDM, 200 * HZ_PER_MHZ }, + { IMX8MP_CLK_SAI7, 66666667 }, /* Datasheet claims 66MHz */ + { IMX8MP_CLK_MAIN_AXI, 400 * HZ_PER_MHZ }, + { /* Sentinel */ } +}; + +static const struct imx8mp_clock_constraints imx8mp_clock_nominal_constraints[] = { + { IMX8MP_CLK_M7_CORE, 600 * HZ_PER_MHZ }, + { IMX8MP_CLK_ML_CORE, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU3D_CORE, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU3D_SHADER_CORE, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU2D_CORE, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_AUDIO_AXI_SRC, 600 * HZ_PER_MHZ }, + { IMX8MP_CLK_HSIO_AXI, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_ISP, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_VPU_BUS, 600 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_AXI, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_HDMI_AXI, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU_AXI, 600 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU_AHB, 300 * HZ_PER_MHZ }, + { IMX8MP_CLK_NOC, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_NOC_IO, 600 * HZ_PER_MHZ }, + { IMX8MP_CLK_ML_AHB, 300 * HZ_PER_MHZ }, + { IMX8MP_CLK_VPU_G1, 600 * HZ_PER_MHZ }, + { IMX8MP_CLK_VPU_G2, 500 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_CAM1_PIX, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_VPU_VC8000E, 400 * HZ_PER_MHZ }, /* Datasheet claims 500MHz */ + { IMX8MP_CLK_DRAM_CORE, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_GIC, 400 * HZ_PER_MHZ }, + { /* Sentinel */ } +}; + +static const struct imx8mp_clock_constraints imx8mp_clock_overdrive_constraints[] = { + { IMX8MP_CLK_M7_CORE, 800 * HZ_PER_MHZ}, + { IMX8MP_CLK_ML_CORE, 1000 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU3D_CORE, 1000 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU3D_SHADER_CORE, 1000 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU2D_CORE, 1000 * HZ_PER_MHZ }, + { IMX8MP_CLK_AUDIO_AXI_SRC, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_HSIO_AXI, 500 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_ISP, 500 * HZ_PER_MHZ }, + { IMX8MP_CLK_VPU_BUS, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_AXI, 500 * HZ_PER_MHZ }, + { IMX8MP_CLK_HDMI_AXI, 500 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU_AXI, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_GPU_AHB, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_NOC, 1000 * HZ_PER_MHZ }, + { IMX8MP_CLK_NOC_IO, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_ML_AHB, 400 * HZ_PER_MHZ }, + { IMX8MP_CLK_VPU_G1, 800 * HZ_PER_MHZ }, + { IMX8MP_CLK_VPU_G2, 700 * HZ_PER_MHZ }, + { IMX8MP_CLK_MEDIA_CAM1_PIX, 500 * HZ_PER_MHZ }, + { IMX8MP_CLK_VPU_VC8000E, 500 * HZ_PER_MHZ }, /* Datasheet claims 400MHz */ + { IMX8MP_CLK_DRAM_CORE, 1000 * HZ_PER_MHZ }, + { IMX8MP_CLK_GIC, 500 * HZ_PER_MHZ }, + { /* Sentinel */ } +}; + +static void imx8mp_clocks_apply_constraints(const struct imx8mp_clock_constraints constraints[]) +{ + const struct imx8mp_clock_constraints *constr; + + for (constr = constraints; constr->clkid; constr++) + clk_hw_set_rate_range(hws[constr->clkid], 0, constr->maxrate); +} + static int imx8mp_clocks_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct device_node *np; void __iomem *anatop_base, *ccm_base; + const char *opmode; int err; np = of_find_compatible_node(NULL, NULL, "fsl,imx8mp-anatop"); @@ -704,6 +845,16 @@ static int imx8mp_clocks_probe(struct platform_device *pdev) imx_check_clk_hws(hws, IMX8MP_CLK_END); + imx8mp_clocks_apply_constraints(imx8mp_clock_common_constraints); + + err = of_property_read_string(np, "fsl,operating-mode", &opmode); + if (!err) { + if (!strcmp(opmode, "nominal")) + imx8mp_clocks_apply_constraints(imx8mp_clock_nominal_constraints); + else if (!strcmp(opmode, "overdrive")) + imx8mp_clocks_apply_constraints(imx8mp_clock_overdrive_constraints); + } + err = of_clk_add_hw_provider(np, of_clk_hw_onecell_get, clk_hw_data); if (err < 0) { dev_err(dev, "failed to register hws for i.MX8MP\n"); From 594dbf0a19d607f106ed552332b9b8fecd2b64a3 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Thu, 31 Oct 2024 04:06:17 -0700 Subject: [PATCH 314/529] x86/bugs: Make spectre user default depend on MITIGATION_SPECTRE_V2 [ Upstream commit 98fdaeb296f51ef08e727a7cc72e5b5c864c4f4d ] Change the default value of spectre v2 in user mode to respect the CONFIG_MITIGATION_SPECTRE_V2 config option. Currently, user mode spectre v2 is set to auto (SPECTRE_V2_USER_CMD_AUTO) by default, even if CONFIG_MITIGATION_SPECTRE_V2 is disabled. Set the spectre_v2 value to auto (SPECTRE_V2_USER_CMD_AUTO) if the Spectre v2 config (CONFIG_MITIGATION_SPECTRE_V2) is enabled, otherwise set the value to none (SPECTRE_V2_USER_CMD_NONE). Important to say the command line argument "spectre_v2_user" overwrites the default value in both cases. When CONFIG_MITIGATION_SPECTRE_V2 is not set, users have the flexibility to opt-in for specific mitigations independently. In this scenario, setting spectre_v2= will not enable spectre_v2_user=, and command line options spectre_v2_user and spectre_v2 are independent when CONFIG_MITIGATION_SPECTRE_V2=n. Signed-off-by: Breno Leitao Signed-off-by: Ingo Molnar Reviewed-by: Pawan Gupta Acked-by: Josh Poimboeuf Cc: Peter Zijlstra Cc: David Kaplan Link: https://lore.kernel.org/r/20241031-x86_bugs_last_v2-v2-2-b7ff1dab840e@debian.org Signed-off-by: Sasha Levin --- Documentation/admin-guide/kernel-parameters.txt | 2 ++ arch/x86/kernel/cpu/bugs.c | 10 +++++++--- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 6938c8cd7a6f..15e40774e9bc 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5780,6 +5780,8 @@ Selecting 'on' will also enable the mitigation against user space to user space task attacks. + Selecting specific mitigation does not force enable + user mitigations. Selecting 'off' will disable both the kernel and the user space protections. diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index 766cee7fa905..7233474c798f 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1382,9 +1382,13 @@ static __ro_after_init enum spectre_v2_mitigation_cmd spectre_v2_cmd; static enum spectre_v2_user_cmd __init spectre_v2_parse_user_cmdline(void) { + enum spectre_v2_user_cmd mode; char arg[20]; int ret, i; + mode = IS_ENABLED(CONFIG_MITIGATION_SPECTRE_V2) ? + SPECTRE_V2_USER_CMD_AUTO : SPECTRE_V2_USER_CMD_NONE; + switch (spectre_v2_cmd) { case SPECTRE_V2_CMD_NONE: return SPECTRE_V2_USER_CMD_NONE; @@ -1397,7 +1401,7 @@ spectre_v2_parse_user_cmdline(void) ret = cmdline_find_option(boot_command_line, "spectre_v2_user", arg, sizeof(arg)); if (ret < 0) - return SPECTRE_V2_USER_CMD_AUTO; + return mode; for (i = 0; i < ARRAY_SIZE(v2_user_options); i++) { if (match_option(arg, ret, v2_user_options[i].option)) { @@ -1407,8 +1411,8 @@ spectre_v2_parse_user_cmdline(void) } } - pr_err("Unknown user space protection option (%s). Switching to AUTO select\n", arg); - return SPECTRE_V2_USER_CMD_AUTO; + pr_err("Unknown user space protection option (%s). Switching to default\n", arg); + return mode; } static inline bool spectre_v2_in_eibrs_mode(enum spectre_v2_mitigation mode) From 05315e1507fcc33c825f72f83574d9e4fc2df0cb Mon Sep 17 00:00:00 2001 From: Alexander Stein Date: Mon, 10 Feb 2025 15:59:30 +0100 Subject: [PATCH 315/529] hwmon: (gpio-fan) Add missing mutex locks [ Upstream commit 9fee7d19bab635f89223cc40dfd2c8797fdc4988 ] set_fan_speed() is expected to be called with fan_data->lock being locked. Add locking for proper synchronization. Signed-off-by: Alexander Stein Link: https://lore.kernel.org/r/20250210145934.761280-3-alexander.stein@ew.tq-group.com Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/gpio-fan.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/drivers/hwmon/gpio-fan.c b/drivers/hwmon/gpio-fan.c index ba408942dbe7..f1926b9171e0 100644 --- a/drivers/hwmon/gpio-fan.c +++ b/drivers/hwmon/gpio-fan.c @@ -392,7 +392,12 @@ static int gpio_fan_set_cur_state(struct thermal_cooling_device *cdev, if (state >= fan_data->num_speed) return -EINVAL; + mutex_lock(&fan_data->lock); + set_fan_speed(fan_data, state); + + mutex_unlock(&fan_data->lock); + return 0; } @@ -488,7 +493,11 @@ MODULE_DEVICE_TABLE(of, of_gpio_fan_match); static void gpio_fan_stop(void *data) { + struct gpio_fan_data *fan_data = data; + + mutex_lock(&fan_data->lock); set_fan_speed(data, 0); + mutex_unlock(&fan_data->lock); } static int gpio_fan_probe(struct platform_device *pdev) @@ -561,7 +570,9 @@ static int gpio_fan_suspend(struct device *dev) if (fan_data->gpios) { fan_data->resume_speed = fan_data->speed_index; + mutex_lock(&fan_data->lock); set_fan_speed(fan_data, 0); + mutex_unlock(&fan_data->lock); } return 0; @@ -571,8 +582,11 @@ static int gpio_fan_resume(struct device *dev) { struct gpio_fan_data *fan_data = dev_get_drvdata(dev); - if (fan_data->gpios) + if (fan_data->gpios) { + mutex_lock(&fan_data->lock); set_fan_speed(fan_data, fan_data->resume_speed); + mutex_unlock(&fan_data->lock); + } return 0; } From eeb808ce1e5f36975a8e4838d798ed12f0861888 Mon Sep 17 00:00:00 2001 From: Li Bin Date: Thu, 27 Feb 2025 08:51:56 -0700 Subject: [PATCH 316/529] ARM: at91: pm: fix at91_suspend_finish for ZQ calibration [ Upstream commit bc4722c3598d0e2c2dbf9609a3d3198993093e2b ] For sama7g5 and sama7d65 backup mode, we encountered a "ZQ calibrate error" during recalibrating the impedance in BootStrap. We found that the impedance value saved in at91_suspend_finish() before the DDR entered self-refresh mode did not match the resistor values. The ZDATA field in the DDR3PHY_ZQ0CR0 register uses a modified gray code to select the different impedance setting. But these gray code are incorrect, a workaournd from design team fixed the bug in the calibration logic. The ZDATA contains four independent impedance elements, but the algorithm combined the four elements into one. The elements were fixed using properly shifted offsets. Signed-off-by: Li Bin [nicolas.ferre@microchip.com: fix indentation and combine 2 patches] Signed-off-by: Nicolas Ferre Tested-by: Ryan Wanner Tested-by: Durai Manickam KR Tested-by: Andrei Simion Signed-off-by: Ryan Wanner Link: https://lore.kernel.org/r/28b33f9bcd0ca60ceba032969fe054d38f2b9577.1740671156.git.Ryan.Wanner@microchip.com Signed-off-by: Claudiu Beznea Signed-off-by: Sasha Levin --- arch/arm/mach-at91/pm.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index 4d0d0d49a744..77aec670f635 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -537,11 +537,12 @@ extern u32 at91_pm_suspend_in_sram_sz; static int at91_suspend_finish(unsigned long val) { - unsigned char modified_gray_code[] = { - 0x00, 0x01, 0x02, 0x03, 0x06, 0x07, 0x04, 0x05, 0x0c, 0x0d, - 0x0e, 0x0f, 0x0a, 0x0b, 0x08, 0x09, 0x18, 0x19, 0x1a, 0x1b, - 0x1e, 0x1f, 0x1c, 0x1d, 0x14, 0x15, 0x16, 0x17, 0x12, 0x13, - 0x10, 0x11, + /* SYNOPSYS workaround to fix a bug in the calibration logic */ + unsigned char modified_fix_code[] = { + 0x00, 0x01, 0x01, 0x06, 0x07, 0x0c, 0x06, 0x07, 0x0b, 0x18, + 0x0a, 0x0b, 0x0c, 0x0d, 0x0d, 0x0a, 0x13, 0x13, 0x12, 0x13, + 0x14, 0x15, 0x15, 0x12, 0x18, 0x19, 0x19, 0x1e, 0x1f, 0x14, + 0x1e, 0x1f, }; unsigned int tmp, index; int i; @@ -552,25 +553,25 @@ static int at91_suspend_finish(unsigned long val) * restore the ZQ0SR0 with the value saved here. But the * calibration is buggy and restoring some values from ZQ0SR0 * is forbidden and risky thus we need to provide processed - * values for these (modified gray code values). + * values for these. */ tmp = readl(soc_pm.data.ramc_phy + DDR3PHY_ZQ0SR0); /* Store pull-down output impedance select. */ index = (tmp >> DDR3PHY_ZQ0SR0_PDO_OFF) & 0x1f; - soc_pm.bu->ddr_phy_calibration[0] = modified_gray_code[index]; + soc_pm.bu->ddr_phy_calibration[0] = modified_fix_code[index] << DDR3PHY_ZQ0SR0_PDO_OFF; /* Store pull-up output impedance select. */ index = (tmp >> DDR3PHY_ZQ0SR0_PUO_OFF) & 0x1f; - soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index]; + soc_pm.bu->ddr_phy_calibration[0] |= modified_fix_code[index] << DDR3PHY_ZQ0SR0_PUO_OFF; /* Store pull-down on-die termination impedance select. */ index = (tmp >> DDR3PHY_ZQ0SR0_PDODT_OFF) & 0x1f; - soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index]; + soc_pm.bu->ddr_phy_calibration[0] |= modified_fix_code[index] << DDR3PHY_ZQ0SR0_PDODT_OFF; /* Store pull-up on-die termination impedance select. */ index = (tmp >> DDR3PHY_ZQ0SRO_PUODT_OFF) & 0x1f; - soc_pm.bu->ddr_phy_calibration[0] |= modified_gray_code[index]; + soc_pm.bu->ddr_phy_calibration[0] |= modified_fix_code[index] << DDR3PHY_ZQ0SRO_PUODT_OFF; /* * The 1st 8 words of memory might get corrupted in the process From 31b96c1543c9f4eafdc66d06846c7aa325679ec3 Mon Sep 17 00:00:00 2001 From: AngeloGioacchino Del Regno Date: Mon, 17 Feb 2025 16:47:58 +0100 Subject: [PATCH 317/529] drm/mediatek: mtk_dpi: Add checks for reg_h_fre_con existence [ Upstream commit 8c9da7cd0bbcc90ab444454fecf535320456a312 ] In preparation for adding support for newer DPI instances which do support direct-pin but do not have any H_FRE_CON register, like the one found in MT8195 and MT8188, add a branch to check if the reg_h_fre_con variable was declared in the mtk_dpi_conf structure for the probed SoC DPI version. As a note, this is useful specifically only for cases in which the support_direct_pin variable is true, so mt8195-dpintf is not affected by any issue. Reviewed-by: CK Hu Signed-off-by: AngeloGioacchino Del Regno Link: https://patchwork.kernel.org/project/dri-devel/patch/20250217154836.108895-6-angelogioacchino.delregno@collabora.com/ Signed-off-by: Chun-Kuang Hu Signed-off-by: Sasha Levin --- drivers/gpu/drm/mediatek/mtk_dpi.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/mediatek/mtk_dpi.c b/drivers/gpu/drm/mediatek/mtk_dpi.c index 1fa958e8c40a..5ad9c384046c 100644 --- a/drivers/gpu/drm/mediatek/mtk_dpi.c +++ b/drivers/gpu/drm/mediatek/mtk_dpi.c @@ -406,12 +406,13 @@ static void mtk_dpi_config_swap_input(struct mtk_dpi *dpi, bool enable) static void mtk_dpi_config_2n_h_fre(struct mtk_dpi *dpi) { - mtk_dpi_mask(dpi, dpi->conf->reg_h_fre_con, H_FRE_2N, H_FRE_2N); + if (dpi->conf->reg_h_fre_con) + mtk_dpi_mask(dpi, dpi->conf->reg_h_fre_con, H_FRE_2N, H_FRE_2N); } static void mtk_dpi_config_disable_edge(struct mtk_dpi *dpi) { - if (dpi->conf->edge_sel_en) + if (dpi->conf->edge_sel_en && dpi->conf->reg_h_fre_con) mtk_dpi_mask(dpi, dpi->conf->reg_h_fre_con, 0, EDGE_SEL_EN); } From 0c14267f4d6f7d8173605952196f1ed2a56453b3 Mon Sep 17 00:00:00 2001 From: Kuhanh Murugasen Krishnan Date: Thu, 13 Feb 2025 06:12:49 +0800 Subject: [PATCH 318/529] fpga: altera-cvp: Increase credit timeout [ Upstream commit 0f05886a40fdc55016ba4d9ae0a9c41f8312f15b ] Increase the timeout for SDM (Secure device manager) data credits from 20ms to 40ms. Internal stress tests running at 500 loops failed with the current timeout of 20ms. At the start of a FPGA configuration, the CVP host driver reads the transmit credits from SDM. It then sends bitstream FPGA data to SDM based on the total credits. Each credit allows the CVP host driver to send 4kBytes of data. There are situations whereby, the SDM did not respond in time during testing. Signed-off-by: Ang Tien Sung Signed-off-by: Kuhanh Murugasen Krishnan Acked-by: Xu Yilun Link: https://lore.kernel.org/r/20250212221249.2715929-1-tien.sung.ang@intel.com Signed-off-by: Xu Yilun Signed-off-by: Sasha Levin --- drivers/fpga/altera-cvp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/fpga/altera-cvp.c b/drivers/fpga/altera-cvp.c index 4ffb9da537d8..5295ff90482b 100644 --- a/drivers/fpga/altera-cvp.c +++ b/drivers/fpga/altera-cvp.c @@ -52,7 +52,7 @@ /* V2 Defines */ #define VSE_CVP_TX_CREDITS 0x49 /* 8bit */ -#define V2_CREDIT_TIMEOUT_US 20000 +#define V2_CREDIT_TIMEOUT_US 40000 #define V2_CHECK_CREDIT_US 10 #define V2_POLL_TIMEOUT_US 1000000 #define V2_USER_TIMEOUT_US 500000 From 8d64b2aa3e6287ebdbd6231e07822af902ddf22a Mon Sep 17 00:00:00 2001 From: Janne Grunau Date: Wed, 26 Feb 2025 19:00:05 +0000 Subject: [PATCH 319/529] soc: apple: rtkit: Use high prio work queue [ Upstream commit 22af2fac88fa5dbc310bfe7d0b66d4de3ac47305 ] rtkit messages as communication with the DCP firmware for framebuffer swaps or input events are time critical so use WQ_HIGHPRI to prevent user space CPU load to increase latency. With kwin_wayland 6's explicit sync mode user space load was able to delay the IOMFB rtkit communication enough to miss vsync for surface swaps. Minimal test scenario is constantly resizing a glxgears Xwayland window. Signed-off-by: Janne Grunau Reviewed-by: Alyssa Rosenzweig Link: https://lore.kernel.org/r/20250226-apple-soc-misc-v2-3-c3ec37f9021b@svenpeter.dev Signed-off-by: Sven Peter Signed-off-by: Sasha Levin --- drivers/soc/apple/rtkit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/apple/rtkit.c b/drivers/soc/apple/rtkit.c index 8ec74d7539eb..1ec0c3ba0be2 100644 --- a/drivers/soc/apple/rtkit.c +++ b/drivers/soc/apple/rtkit.c @@ -731,7 +731,7 @@ static struct apple_rtkit *apple_rtkit_init(struct device *dev, void *cookie, rtk->mbox_cl.rx_callback = &apple_rtkit_rx; rtk->mbox_cl.tx_done = &apple_rtkit_tx_done; - rtk->wq = alloc_ordered_workqueue("rtkit-%s", WQ_MEM_RECLAIM, + rtk->wq = alloc_ordered_workqueue("rtkit-%s", WQ_HIGHPRI | WQ_MEM_RECLAIM, dev_name(rtk->dev)); if (!rtk->wq) { ret = -ENOMEM; From 2d533b62db11da44ef6a6d71c0cdaf51a37c8f6a Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Wed, 26 Feb 2025 19:00:04 +0000 Subject: [PATCH 320/529] soc: apple: rtkit: Implement OSLog buffers properly [ Upstream commit a06398687065e0c334dc5fc4d2778b5b87292e43 ] Apparently nobody can figure out where the old logic came from, but it seems like it has never been actually used on any supported firmware to this day. OSLog buffers were apparently never requested. But starting with 13.3, we actually need this implemented properly for MTP (and later AOP) to work, so let's actually do that. Signed-off-by: Hector Martin Reviewed-by: Alyssa Rosenzweig Link: https://lore.kernel.org/r/20250226-apple-soc-misc-v2-2-c3ec37f9021b@svenpeter.dev Signed-off-by: Sven Peter Signed-off-by: Sasha Levin --- drivers/soc/apple/rtkit-internal.h | 1 + drivers/soc/apple/rtkit.c | 56 ++++++++++++++++++------------ 2 files changed, 35 insertions(+), 22 deletions(-) diff --git a/drivers/soc/apple/rtkit-internal.h b/drivers/soc/apple/rtkit-internal.h index 24bd619ec5e4..1da1dfd9cb19 100644 --- a/drivers/soc/apple/rtkit-internal.h +++ b/drivers/soc/apple/rtkit-internal.h @@ -48,6 +48,7 @@ struct apple_rtkit { struct apple_rtkit_shmem ioreport_buffer; struct apple_rtkit_shmem crashlog_buffer; + struct apple_rtkit_shmem oslog_buffer; struct apple_rtkit_shmem syslog_buffer; char *syslog_msg_buffer; diff --git a/drivers/soc/apple/rtkit.c b/drivers/soc/apple/rtkit.c index 1ec0c3ba0be2..968f9f633393 100644 --- a/drivers/soc/apple/rtkit.c +++ b/drivers/soc/apple/rtkit.c @@ -65,8 +65,9 @@ enum { #define APPLE_RTKIT_SYSLOG_MSG_SIZE GENMASK_ULL(31, 24) #define APPLE_RTKIT_OSLOG_TYPE GENMASK_ULL(63, 56) -#define APPLE_RTKIT_OSLOG_INIT 1 -#define APPLE_RTKIT_OSLOG_ACK 3 +#define APPLE_RTKIT_OSLOG_BUFFER_REQUEST 1 +#define APPLE_RTKIT_OSLOG_SIZE GENMASK_ULL(55, 36) +#define APPLE_RTKIT_OSLOG_IOVA GENMASK_ULL(35, 0) #define APPLE_RTKIT_MIN_SUPPORTED_VERSION 11 #define APPLE_RTKIT_MAX_SUPPORTED_VERSION 12 @@ -255,15 +256,21 @@ static int apple_rtkit_common_rx_get_buffer(struct apple_rtkit *rtk, struct apple_rtkit_shmem *buffer, u8 ep, u64 msg) { - size_t n_4kpages = FIELD_GET(APPLE_RTKIT_BUFFER_REQUEST_SIZE, msg); u64 reply; int err; + /* The different size vs. IOVA shifts look odd but are indeed correct this way */ + if (ep == APPLE_RTKIT_EP_OSLOG) { + buffer->size = FIELD_GET(APPLE_RTKIT_OSLOG_SIZE, msg); + buffer->iova = FIELD_GET(APPLE_RTKIT_OSLOG_IOVA, msg) << 12; + } else { + buffer->size = FIELD_GET(APPLE_RTKIT_BUFFER_REQUEST_SIZE, msg) << 12; + buffer->iova = FIELD_GET(APPLE_RTKIT_BUFFER_REQUEST_IOVA, msg); + } + buffer->buffer = NULL; buffer->iomem = NULL; buffer->is_mapped = false; - buffer->iova = FIELD_GET(APPLE_RTKIT_BUFFER_REQUEST_IOVA, msg); - buffer->size = n_4kpages << 12; dev_dbg(rtk->dev, "RTKit: buffer request for 0x%zx bytes at %pad\n", buffer->size, &buffer->iova); @@ -288,11 +295,21 @@ static int apple_rtkit_common_rx_get_buffer(struct apple_rtkit *rtk, } if (!buffer->is_mapped) { - reply = FIELD_PREP(APPLE_RTKIT_SYSLOG_TYPE, - APPLE_RTKIT_BUFFER_REQUEST); - reply |= FIELD_PREP(APPLE_RTKIT_BUFFER_REQUEST_SIZE, n_4kpages); - reply |= FIELD_PREP(APPLE_RTKIT_BUFFER_REQUEST_IOVA, - buffer->iova); + /* oslog uses different fields and needs a shifted IOVA instead of size */ + if (ep == APPLE_RTKIT_EP_OSLOG) { + reply = FIELD_PREP(APPLE_RTKIT_OSLOG_TYPE, + APPLE_RTKIT_OSLOG_BUFFER_REQUEST); + reply |= FIELD_PREP(APPLE_RTKIT_OSLOG_SIZE, buffer->size); + reply |= FIELD_PREP(APPLE_RTKIT_OSLOG_IOVA, + buffer->iova >> 12); + } else { + reply = FIELD_PREP(APPLE_RTKIT_SYSLOG_TYPE, + APPLE_RTKIT_BUFFER_REQUEST); + reply |= FIELD_PREP(APPLE_RTKIT_BUFFER_REQUEST_SIZE, + buffer->size >> 12); + reply |= FIELD_PREP(APPLE_RTKIT_BUFFER_REQUEST_IOVA, + buffer->iova); + } apple_rtkit_send_message(rtk, ep, reply, NULL, false); } @@ -474,25 +491,18 @@ static void apple_rtkit_syslog_rx(struct apple_rtkit *rtk, u64 msg) } } -static void apple_rtkit_oslog_rx_init(struct apple_rtkit *rtk, u64 msg) -{ - u64 ack; - - dev_dbg(rtk->dev, "RTKit: oslog init: msg: 0x%llx\n", msg); - ack = FIELD_PREP(APPLE_RTKIT_OSLOG_TYPE, APPLE_RTKIT_OSLOG_ACK); - apple_rtkit_send_message(rtk, APPLE_RTKIT_EP_OSLOG, ack, NULL, false); -} - static void apple_rtkit_oslog_rx(struct apple_rtkit *rtk, u64 msg) { u8 type = FIELD_GET(APPLE_RTKIT_OSLOG_TYPE, msg); switch (type) { - case APPLE_RTKIT_OSLOG_INIT: - apple_rtkit_oslog_rx_init(rtk, msg); + case APPLE_RTKIT_OSLOG_BUFFER_REQUEST: + apple_rtkit_common_rx_get_buffer(rtk, &rtk->oslog_buffer, + APPLE_RTKIT_EP_OSLOG, msg); break; default: - dev_warn(rtk->dev, "RTKit: Unknown oslog message: %llx\n", msg); + dev_warn(rtk->dev, "RTKit: Unknown oslog message: %llx\n", + msg); } } @@ -773,6 +783,7 @@ int apple_rtkit_reinit(struct apple_rtkit *rtk) apple_rtkit_free_buffer(rtk, &rtk->ioreport_buffer); apple_rtkit_free_buffer(rtk, &rtk->crashlog_buffer); + apple_rtkit_free_buffer(rtk, &rtk->oslog_buffer); apple_rtkit_free_buffer(rtk, &rtk->syslog_buffer); kfree(rtk->syslog_msg_buffer); @@ -935,6 +946,7 @@ static void apple_rtkit_free(void *data) apple_rtkit_free_buffer(rtk, &rtk->ioreport_buffer); apple_rtkit_free_buffer(rtk, &rtk->crashlog_buffer); + apple_rtkit_free_buffer(rtk, &rtk->oslog_buffer); apple_rtkit_free_buffer(rtk, &rtk->syslog_buffer); kfree(rtk->syslog_msg_buffer); From 9fe2a6513afa52f89a5540ce6f528ee64e3a323d Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Mon, 24 Feb 2025 10:35:58 +0200 Subject: [PATCH 321/529] PCI: brcmstb: Expand inbound window size up to 64GB MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 25a98c727015638baffcfa236e3f37b70cedcf87 ] The BCM2712 memory map can support up to 64GB of system memory, thus expand the inbound window size in calculation helper function. The change is safe for the currently supported SoCs that have smaller inbound window sizes. Signed-off-by: Stanimir Varbanov Reviewed-by: Florian Fainelli Reviewed-by: Jim Quinlan Tested-by: Ivan T. Ivanov Link: https://lore.kernel.org/r/20250224083559.47645-7-svarbanov@suse.de [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński Signed-off-by: Sasha Levin --- drivers/pci/controller/pcie-brcmstb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index 425db793080d..fe37bd28761a 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -281,8 +281,8 @@ static int brcm_pcie_encode_ibar_size(u64 size) if (log2_in >= 12 && log2_in <= 15) /* Covers 4KB to 32KB (inclusive) */ return (log2_in - 12) + 0x1c; - else if (log2_in >= 16 && log2_in <= 35) - /* Covers 64KB to 32GB, (inclusive) */ + else if (log2_in >= 16 && log2_in <= 36) + /* Covers 64KB to 64GB, (inclusive) */ return log2_in - 15; /* Something is awry so disable */ return 0; From de963561eff83a75965e71f302b02f6c28c01289 Mon Sep 17 00:00:00 2001 From: Stanimir Varbanov Date: Mon, 24 Feb 2025 10:35:56 +0200 Subject: [PATCH 322/529] PCI: brcmstb: Add a softdep to MIP MSI-X driver MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 2294059118c550464dd8906286324d90c33b152b ] Then the brcmstb PCIe driver and MIP MSI-X interrupt controller drivers are built as modules there could be a race in probing. To avoid this, add a softdep to MIP driver to guarantee that MIP driver will be load first. Signed-off-by: Stanimir Varbanov Reviewed-by: Florian Fainelli Tested-by: Ivan T. Ivanov Link: https://lore.kernel.org/r/20250224083559.47645-5-svarbanov@suse.de [kwilczynski: commit log] Signed-off-by: Krzysztof Wilczyński Signed-off-by: Sasha Levin --- drivers/pci/controller/pcie-brcmstb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/pci/controller/pcie-brcmstb.c b/drivers/pci/controller/pcie-brcmstb.c index fe37bd28761a..c89ad1f92a07 100644 --- a/drivers/pci/controller/pcie-brcmstb.c +++ b/drivers/pci/controller/pcie-brcmstb.c @@ -1619,3 +1619,4 @@ module_platform_driver(brcm_pcie_driver); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("Broadcom STB PCIe RC driver"); MODULE_AUTHOR("Broadcom"); +MODULE_SOFTDEP("pre: irq_bcm2712_mip"); From c6aa1d6bd6ccff4ecdf064d288817657ec8532f0 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Fri, 17 Jan 2025 15:35:52 +0530 Subject: [PATCH 323/529] firmware: arm_ffa: Set dma_mask for ffa devices [ Upstream commit cc0aac7ca17e0ea3ca84b552fc79f3e86fd07f53 ] Set dma_mask for FFA devices, otherwise DMA allocation using the device pointer lead to following warning: WARNING: CPU: 1 PID: 1 at kernel/dma/mapping.c:597 dma_alloc_attrs+0xe0/0x124 Signed-off-by: Viresh Kumar Message-Id: Signed-off-by: Sudeep Holla Signed-off-by: Sasha Levin --- drivers/firmware/arm_ffa/bus.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/firmware/arm_ffa/bus.c b/drivers/firmware/arm_ffa/bus.c index 248594b59c64..5bda5d7ade42 100644 --- a/drivers/firmware/arm_ffa/bus.c +++ b/drivers/firmware/arm_ffa/bus.c @@ -191,6 +191,7 @@ struct ffa_device *ffa_device_register(const uuid_t *uuid, int vm_id, dev = &ffa_dev->dev; dev->bus = &ffa_bus_type; dev->release = ffa_release_device; + dev->dma_mask = &dev->coherent_dma_mask; dev_set_name(&ffa_dev->dev, "arm-ffa-%d", id); ffa_dev->id = id; From 866ae3322f74438525903bd16d525f71ed010feb Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Wed, 26 Feb 2025 14:25:40 +0200 Subject: [PATCH 324/529] net/mlx5: Avoid report two health errors on same syndrome [ Upstream commit b5d7b2f04ebcff740f44ef4d295b3401aeb029f4 ] In case health counter has not increased for few polling intervals, miss counter will reach max misses threshold and health report will be triggered for FW health reporter. In case syndrome found on same health poll another health report will be triggered. Avoid two health reports on same syndrome by marking this syndrome as already known. Signed-off-by: Moshe Shemesh Reviewed-by: Shahar Shitrit Signed-off-by: Tariq Toukan Reviewed-by: Kalesh AP Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/health.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/health.c b/drivers/net/ethernet/mellanox/mlx5/core/health.c index 65483dab9057..b4faac12789d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/health.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/health.c @@ -850,6 +850,7 @@ static void poll_health(struct timer_list *t) health->prev = count; if (health->miss_counter == MAX_MISSES) { mlx5_core_err(dev, "device's health compromised - reached miss count\n"); + health->synd = ioread8(&h->synd); print_health_info(dev); queue_work(health->wq, &health->report_work); } From 1964a698cb6bd5b6566ac56c50f7450fb36ee58c Mon Sep 17 00:00:00 2001 From: Kevin Krakauer Date: Wed, 26 Feb 2025 11:27:23 -0800 Subject: [PATCH 325/529] selftests/net: have `gro.sh -t` return a correct exit code [ Upstream commit 784e6abd99f24024a8998b5916795f0bec9d2fd9 ] Modify gro.sh to return a useful exit code when the -t flag is used. It formerly returned 0 no matter what. Tested: Ran `gro.sh -t large` and verified that test failures return 1. Signed-off-by: Kevin Krakauer Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250226192725.621969-2-krakauer@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- tools/testing/selftests/net/gro.sh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/gro.sh b/tools/testing/selftests/net/gro.sh index 342ad27f631b..e771f5f7faa2 100755 --- a/tools/testing/selftests/net/gro.sh +++ b/tools/testing/selftests/net/gro.sh @@ -95,5 +95,6 @@ trap cleanup EXIT if [[ "${test}" == "all" ]]; then run_all_tests else - run_test "${proto}" "${test}" + exit_code=$(run_test "${proto}" "${test}") + exit $exit_code fi; From c8cc14eeb231b6153d407f4f10dbb1bf802cb820 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Mon, 17 Feb 2025 20:08:29 -0500 Subject: [PATCH 326/529] drm/amdkfd: KFD release_work possible circular locking [ Upstream commit 1b9366c601039d60546794c63fbb83ce8e53b978 ] If waiting for gpu reset done in KFD release_work, thers is WARNING: possible circular locking dependency detected #2 kfd_create_process kfd_process_mutex flush kfd release work #1 kfd release work wait for amdgpu reset work #0 amdgpu_device_gpu_reset kgd2kfd_pre_reset kfd_process_mutex Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock((work_completion)(&p->release_work)); lock((wq_completion)kfd_process_wq); lock((work_completion)(&p->release_work)); lock((wq_completion)amdgpu-reset-dev); To fix this, KFD create process move flush release work outside kfd_process_mutex. Signed-off-by: Philip Yang Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdkfd/kfd_process.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c index bc01c5173ab9..fd7fecaa9254 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c @@ -813,6 +813,14 @@ struct kfd_process *kfd_create_process(struct file *filep) if (thread->group_leader->mm != thread->mm) return ERR_PTR(-EINVAL); + /* If the process just called exec(3), it is possible that the + * cleanup of the kfd_process (following the release of the mm + * of the old process image) is still in the cleanup work queue. + * Make sure to drain any job before trying to recreate any + * resource for this process. + */ + flush_workqueue(kfd_process_wq); + /* * take kfd processes mutex before starting of process creation * so there won't be a case where two threads of the same process @@ -825,14 +833,6 @@ struct kfd_process *kfd_create_process(struct file *filep) if (process) { pr_debug("Process already found\n"); } else { - /* If the process just called exec(3), it is possible that the - * cleanup of the kfd_process (following the release of the mm - * of the old process image) is still in the cleanup work queue. - * Make sure to drain any job before trying to recreate any - * resource for this process. - */ - flush_workqueue(kfd_process_wq); - process = create_process(thread); if (IS_ERR(process)) goto out; From 2272c9d14c86e67f1b7afe68971e3510d12a36ac Mon Sep 17 00:00:00 2001 From: Yuanjun Gong Date: Sun, 23 Feb 2025 20:14:59 +0800 Subject: [PATCH 327/529] leds: pwm-multicolor: Add check for fwnode_property_read_u32 [ Upstream commit 6d91124e7edc109f114b1afe6d00d85d0d0ac174 ] Add a check to the return value of fwnode_property_read_u32() in case it fails. Signed-off-by: Yuanjun Gong Link: https://lore.kernel.org/r/20250223121459.2889484-1-ruc_gongyuanjun@163.com Signed-off-by: Lee Jones Signed-off-by: Sasha Levin --- drivers/leds/rgb/leds-pwm-multicolor.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/leds/rgb/leds-pwm-multicolor.c b/drivers/leds/rgb/leds-pwm-multicolor.c index da9d2218ae18..97aa06e2ff60 100644 --- a/drivers/leds/rgb/leds-pwm-multicolor.c +++ b/drivers/leds/rgb/leds-pwm-multicolor.c @@ -135,8 +135,11 @@ static int led_pwm_mc_probe(struct platform_device *pdev) /* init the multicolor's LED class device */ cdev = &priv->mc_cdev.led_cdev; - fwnode_property_read_u32(mcnode, "max-brightness", + ret = fwnode_property_read_u32(mcnode, "max-brightness", &cdev->max_brightness); + if (ret) + goto release_mcnode; + cdev->flags = LED_CORE_SUSPENDRESUME; cdev->brightness_set_blocking = led_pwm_mc_set; From 62e7868f069ac604bea6da25895721a422f5fb80 Mon Sep 17 00:00:00 2001 From: Eric Woudstra Date: Tue, 25 Feb 2025 21:15:09 +0100 Subject: [PATCH 328/529] net: ethernet: mtk_ppe_offload: Allow QinQ, double ETH_P_8021Q only [ Upstream commit 7fe0353606d77a32c4c7f2814833dd1c043ebdd2 ] mtk_foe_entry_set_vlan() in mtk_ppe.c already supports double vlan tagging, but mtk_flow_offload_replace() in mtk_ppe_offload.c only allows for 1 vlan tag, optionally in combination with pppoe and dsa tags. However, mtk_foe_entry_set_vlan() only allows for setting the vlan id. The protocol cannot be set, it is always ETH_P_8021Q, for inner and outer tag. This patch adds QinQ support to mtk_flow_offload_replace(), only in the case that both inner and outer tags are ETH_P_8021Q. Only PPPoE-in-Q (as before) and Q-in-Q are allowed. A combination of PPPoE and Q-in-Q is not allowed. Signed-off-by: Eric Woudstra Link: https://patch.msgid.link/20250225201509.20843-1-ericwouds@gmail.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../net/ethernet/mediatek/mtk_ppe_offload.c | 22 +++++++++---------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c index 6a72687d5b83..8cb8d47227f5 100644 --- a/drivers/net/ethernet/mediatek/mtk_ppe_offload.c +++ b/drivers/net/ethernet/mediatek/mtk_ppe_offload.c @@ -34,8 +34,10 @@ struct mtk_flow_data { u16 vlan_in; struct { - u16 id; - __be16 proto; + struct { + u16 id; + __be16 proto; + } vlans[2]; u8 num; } vlan; struct { @@ -321,18 +323,19 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f) case FLOW_ACTION_CSUM: break; case FLOW_ACTION_VLAN_PUSH: - if (data.vlan.num == 1 || + if (data.vlan.num + data.pppoe.num == 2 || act->vlan.proto != htons(ETH_P_8021Q)) return -EOPNOTSUPP; - data.vlan.id = act->vlan.vid; - data.vlan.proto = act->vlan.proto; + data.vlan.vlans[data.vlan.num].id = act->vlan.vid; + data.vlan.vlans[data.vlan.num].proto = act->vlan.proto; data.vlan.num++; break; case FLOW_ACTION_VLAN_POP: break; case FLOW_ACTION_PPPOE_PUSH: - if (data.pppoe.num == 1) + if (data.pppoe.num == 1 || + data.vlan.num == 2) return -EOPNOTSUPP; data.pppoe.sid = act->pppoe.sid; @@ -422,12 +425,9 @@ mtk_flow_offload_replace(struct mtk_eth *eth, struct flow_cls_offload *f) if (offload_type == MTK_PPE_PKT_TYPE_BRIDGE) foe.bridge.vlan = data.vlan_in; - if (data.vlan.num == 1) { - if (data.vlan.proto != htons(ETH_P_8021Q)) - return -EOPNOTSUPP; + for (i = 0; i < data.vlan.num; i++) + mtk_foe_entry_set_vlan(eth, &foe, data.vlan.vlans[i].id); - mtk_foe_entry_set_vlan(eth, &foe, data.vlan.id); - } if (data.pppoe.num == 1) mtk_foe_entry_set_pppoe(eth, &foe, data.pppoe.sid); From 365e617487c35bb7bd831a15ab5d14bbab2264ca Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 25 Feb 2025 17:33:33 +0100 Subject: [PATCH 329/529] net: xgene-v2: remove incorrect ACPI_PTR annotation [ Upstream commit 01358e8fe922f716c05d7864ac2213b2440026e7 ] Building with W=1 shows a warning about xge_acpi_match being unused when CONFIG_ACPI is disabled: drivers/net/ethernet/apm/xgene-v2/main.c:723:36: error: unused variable 'xge_acpi_match' [-Werror,-Wunused-const-variable] Signed-off-by: Arnd Bergmann Link: https://patch.msgid.link/20250225163341.4168238-2-arnd@kernel.org Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/apm/xgene-v2/main.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/net/ethernet/apm/xgene-v2/main.c b/drivers/net/ethernet/apm/xgene-v2/main.c index 379d19d18dbe..5808e3c73a8f 100644 --- a/drivers/net/ethernet/apm/xgene-v2/main.c +++ b/drivers/net/ethernet/apm/xgene-v2/main.c @@ -9,8 +9,6 @@ #include "main.h" -static const struct acpi_device_id xge_acpi_match[]; - static int xge_get_resources(struct xge_pdata *pdata) { struct platform_device *pdev; @@ -733,7 +731,7 @@ MODULE_DEVICE_TABLE(acpi, xge_acpi_match); static struct platform_driver xge_driver = { .driver = { .name = "xgene-enet-v2", - .acpi_match_table = ACPI_PTR(xge_acpi_match), + .acpi_match_table = xge_acpi_match, }, .probe = xge_probe, .remove = xge_remove, From 3572663c555bea7c5c52de0e76c39c5748f59353 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Tue, 25 Feb 2025 03:39:14 +0000 Subject: [PATCH 330/529] bonding: report duplicate MAC address in all situations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 28d68d396a1cd21591e8c6d74afbde33a7ea107e ] Normally, a bond uses the MAC address of the first added slave as the bond’s MAC address. And the bond will set active slave’s MAC address to bond’s address if fail_over_mac is set to none (0) or follow (2). When the first slave is removed, the bond will still use the removed slave’s MAC address, which can lead to a duplicate MAC address and potentially cause issues with the switch. To avoid confusion, let's warn the user in all situations, including when fail_over_mac is set to 2 or not in active-backup mode. Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250225033914.18617-1-liuhangbin@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/bonding/bond_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index ded9e369e403..3cedadef9c8a 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2431,7 +2431,7 @@ static int __bond_release_one(struct net_device *bond_dev, RCU_INIT_POINTER(bond->current_arp_slave, NULL); - if (!all && (!bond->params.fail_over_mac || + if (!all && (bond->params.fail_over_mac != BOND_FOM_ACTIVE || BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)) { if (ether_addr_equal_64bits(bond_dev->dev_addr, slave->perm_hwaddr) && bond_has_slaves(bond)) From 5f496a9f1c8c7a58f35a8b6bcb826e23c2d58ec0 Mon Sep 17 00:00:00 2001 From: Andrew Davis Date: Thu, 23 Jan 2025 12:17:26 -0600 Subject: [PATCH 331/529] soc: ti: k3-socinfo: Do not use syscon helper to build regmap [ Upstream commit a5caf03188e44388e8c618dcbe5fffad1a249385 ] The syscon helper device_node_to_regmap() is used to fetch a regmap registered to a device node. It also currently creates this regmap if the node did not already have a regmap associated with it. This should only be used on "syscon" nodes. This driver is not such a device and instead uses device_node_to_regmap() on its own node as a hacky way to create a regmap for itself. This will not work going forward and so we should create our regmap the normal way by defining our regmap_config, fetching our memory resource, then using the normal regmap_init_mmio() function. Signed-off-by: Andrew Davis Link: https://lore.kernel.org/r/20250123181726.597144-1-afd@ti.com Signed-off-by: Nishanth Menon Signed-off-by: Sasha Levin --- drivers/soc/ti/k3-socinfo.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/soc/ti/k3-socinfo.c b/drivers/soc/ti/k3-socinfo.c index 91f441ee6175..5b0d8260918d 100644 --- a/drivers/soc/ti/k3-socinfo.c +++ b/drivers/soc/ti/k3-socinfo.c @@ -60,6 +60,12 @@ k3_chipinfo_partno_to_names(unsigned int partno, return -EINVAL; } +static const struct regmap_config k3_chipinfo_regmap_cfg = { + .reg_bits = 32, + .val_bits = 32, + .reg_stride = 4, +}; + static int k3_chipinfo_probe(struct platform_device *pdev) { struct device_node *node = pdev->dev.of_node; @@ -67,13 +73,18 @@ static int k3_chipinfo_probe(struct platform_device *pdev) struct device *dev = &pdev->dev; struct soc_device *soc_dev; struct regmap *regmap; + void __iomem *base; u32 partno_id; u32 variant; u32 jtag_id; u32 mfg; int ret; - regmap = device_node_to_regmap(node); + base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(base)) + return PTR_ERR(base); + + regmap = regmap_init_mmio(dev, base, &k3_chipinfo_regmap_cfg); if (IS_ERR(regmap)) return PTR_ERR(regmap); From a66f46751ca4a90b223535ea656308efd2ad2d53 Mon Sep 17 00:00:00 2001 From: Nir Lichtman Date: Fri, 10 Jan 2025 12:05:00 +0000 Subject: [PATCH 332/529] x86/build: Fix broken copy command in genimage.sh when making isoimage [ Upstream commit e451630226bd09dc730eedb4e32cab1cc7155ae8 ] Problem: Currently when running the "make isoimage" command there is an error related to wrong parameters passed to the cp command: "cp: missing destination file operand after 'arch/x86/boot/isoimage/'" This is caused because FDINITRDS is an empty array. Solution: Check if FDINITRDS is empty before executing the "cp" command, similar to how it is done in the case of hdimage. Signed-off-by: Nir Lichtman Signed-off-by: Ingo Molnar Cc: "H. Peter Anvin" Cc: Ard Biesheuvel Cc: Masahiro Yamada Cc: Michal Marek Link: https://lore.kernel.org/r/20250110120500.GA923218@lichtman.org Signed-off-by: Sasha Levin --- arch/x86/boot/genimage.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh index c9299aeb7333..3882ead513f7 100644 --- a/arch/x86/boot/genimage.sh +++ b/arch/x86/boot/genimage.sh @@ -22,6 +22,7 @@ # This script requires: # bash # syslinux +# genisoimage # mtools (for fdimage* and hdimage) # edk2/OVMF (for hdimage) # @@ -251,7 +252,9 @@ geniso() { cp "$isolinux" "$ldlinux" "$tmp_dir" cp "$FBZIMAGE" "$tmp_dir"/linux echo default linux "$KCMDLINE" > "$tmp_dir"/isolinux.cfg - cp "${FDINITRDS[@]}" "$tmp_dir"/ + if [ ${#FDINITRDS[@]} -gt 0 ]; then + cp "${FDINITRDS[@]}" "$tmp_dir"/ + fi genisoimage -J -r -appid 'LINUX_BOOT' -input-charset=utf-8 \ -quiet -o "$FIMAGE" -b isolinux.bin \ -c boot.cat -no-emul-boot -boot-load-size 4 \ From 42733703c462f12b8aed64e8efc4589718c09145 Mon Sep 17 00:00:00 2001 From: Yihan Zhu Date: Wed, 12 Feb 2025 15:17:56 -0500 Subject: [PATCH 333/529] drm/amd/display: handle max_downscale_src_width fail check [ Upstream commit 02a940da2ccc0cc0299811379580852b405a0ea2 ] [WHY] If max_downscale_src_width check fails, we exit early from TAP calculation and left a NULL value to the scaling data structure to cause the zero divide in the DML validation. [HOW] Call set default TAP calculation before early exit in get_optimal_number_of_taps due to max downscale limit exceed. Reviewed-by: Samson Tam Signed-off-by: Yihan Zhu Signed-off-by: Zaeem Mohamed Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c index 50dc83404644..4ce45f1bdac0 100644 --- a/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c +++ b/drivers/gpu/drm/amd/display/dc/dcn30/dcn30_dpp.c @@ -392,11 +392,6 @@ bool dpp3_get_optimal_number_of_taps( int min_taps_y, min_taps_c; enum lb_memory_config lb_config; - if (scl_data->viewport.width > scl_data->h_active && - dpp->ctx->dc->debug.max_downscale_src_width != 0 && - scl_data->viewport.width > dpp->ctx->dc->debug.max_downscale_src_width) - return false; - /* * Set default taps if none are provided * From programming guide: taps = min{ ceil(2*H_RATIO,1), 8} for downscaling @@ -434,6 +429,12 @@ bool dpp3_get_optimal_number_of_taps( else scl_data->taps.h_taps_c = in_taps->h_taps_c; + // Avoid null data in the scl data with this early return, proceed non-adaptive calcualtion first + if (scl_data->viewport.width > scl_data->h_active && + dpp->ctx->dc->debug.max_downscale_src_width != 0 && + scl_data->viewport.width > dpp->ctx->dc->debug.max_downscale_src_width) + return false; + /*Ensure we can support the requested number of vtaps*/ min_taps_y = dc_fixpt_ceil(scl_data->ratios.vert); min_taps_c = dc_fixpt_ceil(scl_data->ratios.vert_c); From 60750801ca1ca031ff83973cdfbe1f726fa8141e Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 6 Feb 2025 14:18:44 -0500 Subject: [PATCH 334/529] x86/nmi: Add an emergency handler in nmi_desc & use it in nmi_shootdown_cpus() [ Upstream commit fe37c699ae3eed6e02ee55fbf5cb9ceb7fcfd76c ] Depending on the type of panics, it was found that the __register_nmi_handler() function can be called in NMI context from nmi_shootdown_cpus() leading to a lockdep splat: WARNING: inconsistent lock state inconsistent {INITIAL USE} -> {IN-NMI} usage. lock(&nmi_desc[0].lock); lock(&nmi_desc[0].lock); Call Trace: _raw_spin_lock_irqsave __register_nmi_handler nmi_shootdown_cpus kdump_nmi_shootdown_cpus native_machine_crash_shutdown __crash_kexec In this particular case, the following panic message was printed before: Kernel panic - not syncing: Fatal hardware error! This message seemed to be given out from __ghes_panic() running in NMI context. The __register_nmi_handler() function which takes the nmi_desc lock with irq disabled shouldn't be called from NMI context as this can lead to deadlock. The nmi_shootdown_cpus() function can only be invoked once. After the first invocation, all other CPUs should be stuck in the newly added crash_nmi_callback() and cannot respond to a second NMI. Fix it by adding a new emergency NMI handler to the nmi_desc structure and provide a new set_emergency_nmi_handler() helper to set crash_nmi_callback() in any context. The new emergency handler will preempt other handlers in the linked list. That will eliminate the need to take any lock and serve the panic in NMI use case. Signed-off-by: Waiman Long Signed-off-by: Ingo Molnar Acked-by: Rik van Riel Cc: Thomas Gleixner Link: https://lore.kernel.org/r/20250206191844.131700-1-longman@redhat.com Signed-off-by: Sasha Levin --- arch/x86/include/asm/nmi.h | 2 ++ arch/x86/kernel/nmi.c | 42 ++++++++++++++++++++++++++++++++++++++ arch/x86/kernel/reboot.c | 10 +++------ 3 files changed, 47 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/nmi.h b/arch/x86/include/asm/nmi.h index 5c5f1e56c404..6f3d145670a9 100644 --- a/arch/x86/include/asm/nmi.h +++ b/arch/x86/include/asm/nmi.h @@ -59,6 +59,8 @@ int __register_nmi_handler(unsigned int, struct nmiaction *); void unregister_nmi_handler(unsigned int, const char *); +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler); + void stop_nmi(void); void restart_nmi(void); void local_touch_nmi(void); diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index ed6cce6c3950..b9a128546970 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -38,8 +38,12 @@ #define CREATE_TRACE_POINTS #include +/* + * An emergency handler can be set in any context including NMI + */ struct nmi_desc { raw_spinlock_t lock; + nmi_handler_t emerg_handler; struct list_head head; }; @@ -121,9 +125,22 @@ static void nmi_check_duration(struct nmiaction *action, u64 duration) static int nmi_handle(unsigned int type, struct pt_regs *regs) { struct nmi_desc *desc = nmi_to_desc(type); + nmi_handler_t ehandler; struct nmiaction *a; int handled=0; + /* + * Call the emergency handler, if set + * + * In the case of crash_nmi_callback() emergency handler, it will + * return in the case of the crashing CPU to enable it to complete + * other necessary crashing actions ASAP. Other handlers in the + * linked list won't need to be run. + */ + ehandler = desc->emerg_handler; + if (ehandler) + return ehandler(type, regs); + rcu_read_lock(); /* @@ -213,6 +230,31 @@ void unregister_nmi_handler(unsigned int type, const char *name) } EXPORT_SYMBOL_GPL(unregister_nmi_handler); +/** + * set_emergency_nmi_handler - Set emergency handler + * @type: NMI type + * @handler: the emergency handler to be stored + * + * Set an emergency NMI handler which, if set, will preempt all the other + * handlers in the linked list. If a NULL handler is passed in, it will clear + * it. It is expected that concurrent calls to this function will not happen + * or the system is screwed beyond repair. + */ +void set_emergency_nmi_handler(unsigned int type, nmi_handler_t handler) +{ + struct nmi_desc *desc = nmi_to_desc(type); + + if (WARN_ON_ONCE(desc->emerg_handler == handler)) + return; + desc->emerg_handler = handler; + + /* + * Ensure the emergency handler is visible to other CPUs before + * function return + */ + smp_wmb(); +} + static void pci_serr_error(unsigned char reason, struct pt_regs *regs) { diff --git a/arch/x86/kernel/reboot.c b/arch/x86/kernel/reboot.c index 299b970e5f82..d9dbcd1cf75f 100644 --- a/arch/x86/kernel/reboot.c +++ b/arch/x86/kernel/reboot.c @@ -896,15 +896,11 @@ void nmi_shootdown_cpus(nmi_shootdown_cb callback) shootdown_callback = callback; atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); - /* Would it be better to replace the trap vector here? */ - if (register_nmi_handler(NMI_LOCAL, crash_nmi_callback, - NMI_FLAG_FIRST, "crash")) - return; /* Return what? */ + /* - * Ensure the new callback function is set before sending - * out the NMI + * Set emergency handler to preempt other handlers. */ - wmb(); + set_emergency_nmi_handler(NMI_LOCAL, crash_nmi_callback); apic_send_IPI_allbutself(NMI_VECTOR); From a9edb700846ab1e38e3211cbb6d41d575291305e Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Thu, 6 Feb 2025 15:29:05 +0100 Subject: [PATCH 335/529] cpuidle: menu: Avoid discarding useful information [ Upstream commit 85975daeaa4d6ec560bfcd354fc9c08ad7f38888 ] When giving up on making a high-confidence prediction, get_typical_interval() always returns UINT_MAX which means that the next idle interval prediction will be based entirely on the time till the next timer. However, the information represented by the most recent intervals may not be completely useless in those cases. Namely, the largest recent idle interval is an upper bound on the recently observed idle duration, so it is reasonable to assume that the next idle duration is unlikely to exceed it. Moreover, this is still true after eliminating the suspected outliers if the sample set still under consideration is at least as large as 50% of the maximum sample set size. Accordingly, make get_typical_interval() return the current maximum recent interval value in that case instead of UINT_MAX. Signed-off-by: Rafael J. Wysocki Reported-by: Artem Bityutskiy Tested-by: Artem Bityutskiy Reviewed-by: Christian Loehle Tested-by: Christian Loehle Tested-by: Aboorva Devarajan Link: https://patch.msgid.link/7770672.EvYhyI6sBW@rjwysocki.net Signed-off-by: Sasha Levin --- drivers/cpuidle/governors/menu.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index c4922684f305..4edac724983a 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -249,8 +249,19 @@ again: * This can deal with workloads that have long pauses interspersed * with sporadic activity with a bunch of short pauses. */ - if ((divisor * 4) <= INTERVALS * 3) + if (divisor * 4 <= INTERVALS * 3) { + /* + * If there are sufficiently many data points still under + * consideration after the outliers have been eliminated, + * returning without a prediction would be a mistake because it + * is likely that the next interval will not exceed the current + * maximum, so return the latter in that case. + */ + if (divisor >= INTERVALS / 2) + return max; + return UINT_MAX; + } thresh = max - 1; goto again; From 01b4545bbc23391646354f94441ad2cc75e8e3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Sat, 22 Feb 2025 00:09:07 +0100 Subject: [PATCH 336/529] media: adv7180: Disable test-pattern control on adv7180 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a980bc5f56b0292336e408f657f79e574e8067c0 ] The register that enables selecting a test-pattern to be outputted in free-run mode (FREE_RUN_PAT_SEL[2:0]) is only available on adv7280 based devices, not the adv7180 based ones. Add a flag to mark devices that are capable of generating test-patterns, and those that are not. And only register the control on supported devices. Signed-off-by: Niklas Söderlund Signed-off-by: Hans Verkuil Signed-off-by: Sasha Levin --- drivers/media/i2c/adv7180.c | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/media/i2c/adv7180.c b/drivers/media/i2c/adv7180.c index 216fe396973f..46912a7b671a 100644 --- a/drivers/media/i2c/adv7180.c +++ b/drivers/media/i2c/adv7180.c @@ -194,6 +194,7 @@ struct adv7180_state; #define ADV7180_FLAG_V2 BIT(1) #define ADV7180_FLAG_MIPI_CSI2 BIT(2) #define ADV7180_FLAG_I2P BIT(3) +#define ADV7180_FLAG_TEST_PATTERN BIT(4) struct adv7180_chip_info { unsigned int flags; @@ -673,11 +674,15 @@ static int adv7180_init_controls(struct adv7180_state *state) ADV7180_HUE_MAX, 1, ADV7180_HUE_DEF); v4l2_ctrl_new_custom(&state->ctrl_hdl, &adv7180_ctrl_fast_switch, NULL); - v4l2_ctrl_new_std_menu_items(&state->ctrl_hdl, &adv7180_ctrl_ops, - V4L2_CID_TEST_PATTERN, - ARRAY_SIZE(test_pattern_menu) - 1, - 0, ARRAY_SIZE(test_pattern_menu) - 1, - test_pattern_menu); + if (state->chip_info->flags & ADV7180_FLAG_TEST_PATTERN) { + v4l2_ctrl_new_std_menu_items(&state->ctrl_hdl, + &adv7180_ctrl_ops, + V4L2_CID_TEST_PATTERN, + ARRAY_SIZE(test_pattern_menu) - 1, + 0, + ARRAY_SIZE(test_pattern_menu) - 1, + test_pattern_menu); + } state->sd.ctrl_handler = &state->ctrl_hdl; if (state->ctrl_hdl.error) { @@ -1209,7 +1214,7 @@ static const struct adv7180_chip_info adv7182_info = { }; static const struct adv7180_chip_info adv7280_info = { - .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_I2P, + .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_I2P | ADV7180_FLAG_TEST_PATTERN, .valid_input_mask = BIT(ADV7182_INPUT_CVBS_AIN1) | BIT(ADV7182_INPUT_CVBS_AIN2) | BIT(ADV7182_INPUT_CVBS_AIN3) | @@ -1223,7 +1228,8 @@ static const struct adv7180_chip_info adv7280_info = { }; static const struct adv7180_chip_info adv7280_m_info = { - .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2 | ADV7180_FLAG_I2P, + .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2 | ADV7180_FLAG_I2P | + ADV7180_FLAG_TEST_PATTERN, .valid_input_mask = BIT(ADV7182_INPUT_CVBS_AIN1) | BIT(ADV7182_INPUT_CVBS_AIN2) | BIT(ADV7182_INPUT_CVBS_AIN3) | @@ -1244,7 +1250,8 @@ static const struct adv7180_chip_info adv7280_m_info = { }; static const struct adv7180_chip_info adv7281_info = { - .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2, + .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2 | + ADV7180_FLAG_TEST_PATTERN, .valid_input_mask = BIT(ADV7182_INPUT_CVBS_AIN1) | BIT(ADV7182_INPUT_CVBS_AIN2) | BIT(ADV7182_INPUT_CVBS_AIN7) | @@ -1259,7 +1266,8 @@ static const struct adv7180_chip_info adv7281_info = { }; static const struct adv7180_chip_info adv7281_m_info = { - .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2, + .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2 | + ADV7180_FLAG_TEST_PATTERN, .valid_input_mask = BIT(ADV7182_INPUT_CVBS_AIN1) | BIT(ADV7182_INPUT_CVBS_AIN2) | BIT(ADV7182_INPUT_CVBS_AIN3) | @@ -1279,7 +1287,8 @@ static const struct adv7180_chip_info adv7281_m_info = { }; static const struct adv7180_chip_info adv7281_ma_info = { - .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2, + .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2 | + ADV7180_FLAG_TEST_PATTERN, .valid_input_mask = BIT(ADV7182_INPUT_CVBS_AIN1) | BIT(ADV7182_INPUT_CVBS_AIN2) | BIT(ADV7182_INPUT_CVBS_AIN3) | @@ -1304,7 +1313,7 @@ static const struct adv7180_chip_info adv7281_ma_info = { }; static const struct adv7180_chip_info adv7282_info = { - .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_I2P, + .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_I2P | ADV7180_FLAG_TEST_PATTERN, .valid_input_mask = BIT(ADV7182_INPUT_CVBS_AIN1) | BIT(ADV7182_INPUT_CVBS_AIN2) | BIT(ADV7182_INPUT_CVBS_AIN7) | @@ -1319,7 +1328,8 @@ static const struct adv7180_chip_info adv7282_info = { }; static const struct adv7180_chip_info adv7282_m_info = { - .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2 | ADV7180_FLAG_I2P, + .flags = ADV7180_FLAG_V2 | ADV7180_FLAG_MIPI_CSI2 | ADV7180_FLAG_I2P | + ADV7180_FLAG_TEST_PATTERN, .valid_input_mask = BIT(ADV7182_INPUT_CVBS_AIN1) | BIT(ADV7182_INPUT_CVBS_AIN2) | BIT(ADV7182_INPUT_CVBS_AIN3) | From 75b01247b43015fe168273540bba8ea4ac0608e8 Mon Sep 17 00:00:00 2001 From: Nandakumar Edamana Date: Sat, 22 Feb 2025 02:31:11 +0530 Subject: [PATCH 337/529] libbpf: Fix out-of-bound read [ Upstream commit 236d3910117e9f97ebf75e511d8bcc950f1a4e5f ] In `set_kcfg_value_str`, an untrusted string is accessed with the assumption that it will be at least two characters long due to the presence of checks for opening and closing quotes. But the check for the closing quote (value[len - 1] != '"') misses the fact that it could be checking the opening quote itself in case of an invalid input that consists of just the opening quote. This commit adds an explicit check to make sure the string is at least two characters long. Signed-off-by: Nandakumar Edamana Signed-off-by: Andrii Nakryiko Link: https://lore.kernel.org/bpf/20250221210110.3182084-1-nandakumar@nandakumar.co.in Signed-off-by: Sasha Levin --- tools/lib/bpf/libbpf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c index a0fb50718dae..98d5e566e058 100644 --- a/tools/lib/bpf/libbpf.c +++ b/tools/lib/bpf/libbpf.c @@ -1751,7 +1751,7 @@ static int set_kcfg_value_str(struct extern_desc *ext, char *ext_val, } len = strlen(value); - if (value[len - 1] != '"') { + if (len < 2 || value[len - 1] != '"') { pr_warn("extern (kcfg) '%s': invalid string config '%s'\n", ext->name, value); return -EINVAL; From 95d08924335f3b6f4ea0b92ebfe4fe0731c502d9 Mon Sep 17 00:00:00 2001 From: Jinliang Zheng Date: Thu, 20 Feb 2025 19:20:14 +0800 Subject: [PATCH 338/529] dm: fix unconditional IO throttle caused by REQ_PREFLUSH [ Upstream commit 88f7f56d16f568f19e1a695af34a7f4a6ce537a6 ] When a bio with REQ_PREFLUSH is submitted to dm, __send_empty_flush() generates a flush_bio with REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC, which causes the flush_bio to be throttled by wbt_wait(). An example from v5.4, similar problem also exists in upstream: crash> bt 2091206 PID: 2091206 TASK: ffff2050df92a300 CPU: 109 COMMAND: "kworker/u260:0" #0 [ffff800084a2f7f0] __switch_to at ffff80004008aeb8 #1 [ffff800084a2f820] __schedule at ffff800040bfa0c4 #2 [ffff800084a2f880] schedule at ffff800040bfa4b4 #3 [ffff800084a2f8a0] io_schedule at ffff800040bfa9c4 #4 [ffff800084a2f8c0] rq_qos_wait at ffff8000405925bc #5 [ffff800084a2f940] wbt_wait at ffff8000405bb3a0 #6 [ffff800084a2f9a0] __rq_qos_throttle at ffff800040592254 #7 [ffff800084a2f9c0] blk_mq_make_request at ffff80004057cf38 #8 [ffff800084a2fa60] generic_make_request at ffff800040570138 #9 [ffff800084a2fae0] submit_bio at ffff8000405703b4 #10 [ffff800084a2fb50] xlog_write_iclog at ffff800001280834 [xfs] #11 [ffff800084a2fbb0] xlog_sync at ffff800001280c3c [xfs] #12 [ffff800084a2fbf0] xlog_state_release_iclog at ffff800001280df4 [xfs] #13 [ffff800084a2fc10] xlog_write at ffff80000128203c [xfs] #14 [ffff800084a2fcd0] xlog_cil_push at ffff8000012846dc [xfs] #15 [ffff800084a2fda0] xlog_cil_push_work at ffff800001284a2c [xfs] #16 [ffff800084a2fdb0] process_one_work at ffff800040111d08 #17 [ffff800084a2fe00] worker_thread at ffff8000401121cc #18 [ffff800084a2fe70] kthread at ffff800040118de4 After commit 2def2845cc33 ("xfs: don't allow log IO to be throttled"), the metadata submitted by xlog_write_iclog() should not be throttled. But due to the existence of the dm layer, throttling flush_bio indirectly causes the metadata bio to be throttled. Fix this by conditionally adding REQ_IDLE to flush_bio.bi_opf, which makes wbt_should_throttle() return false to avoid wbt_wait(). Signed-off-by: Jinliang Zheng Reviewed-by: Tianxiang Peng Reviewed-by: Hao Peng Signed-off-by: Mikulas Patocka Signed-off-by: Sasha Levin --- drivers/md/dm.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/md/dm.c b/drivers/md/dm.c index f70129bc703b..4767265793de 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -1523,14 +1523,18 @@ static void __send_empty_flush(struct clone_info *ci) { struct dm_table *t = ci->map; struct bio flush_bio; + blk_opf_t opf = REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC; + + if ((ci->io->orig_bio->bi_opf & (REQ_IDLE | REQ_SYNC)) == + (REQ_IDLE | REQ_SYNC)) + opf |= REQ_IDLE; /* * Use an on-stack bio for this, it's safe since we don't * need to reference it after submit. It's just used as * the basis for the clone(s). */ - bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, - REQ_OP_WRITE | REQ_PREFLUSH | REQ_SYNC); + bio_init(&flush_bio, ci->io->md->disk->part0, NULL, 0, opf); ci->bio = &flush_bio; ci->sector_count = 0; From ea4497337f444a9fd1d7538cb031b10117b97806 Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Fri, 7 Feb 2025 10:42:34 +1100 Subject: [PATCH 339/529] x86/kaslr: Reduce KASLR entropy on most x86 systems [ Upstream commit 7ffb791423c7c518269a9aad35039ef824a40adb ] When CONFIG_PCI_P2PDMA=y (which is basically enabled on all large x86 distros), it maps the PFN's via a ZONE_DEVICE mapping using devm_memremap_pages(). The mapped virtual address range corresponds to the pci_resource_start() of the BAR address and size corresponding to the BAR length. When KASLR is enabled, the direct map range of the kernel is reduced to the size of physical memory plus additional padding. If the BAR address is beyond this limit, PCI peer to peer DMA mappings fail. Fix this by not shrinking the size of the direct map when CONFIG_PCI_P2PDMA=y. This reduces the total available entropy, but it's better than the current work around of having to disable KASLR completely. [ mingo: Clarified the changelog to point out the broad impact ... ] Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar Reviewed-by: Kees Cook Acked-by: Bjorn Helgaas # drivers/pci/Kconfig Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Andy Lutomirski Link: https://lore.kernel.org/lkml/20250206023201.1481957-1-balbirs@nvidia.com/ Link: https://lore.kernel.org/r/20250206234234.1912585-1-balbirs@nvidia.com -- arch/x86/mm/kaslr.c | 10 ++++++++-- drivers/pci/Kconfig | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) Signed-off-by: Sasha Levin --- arch/x86/mm/kaslr.c | 10 ++++++++-- drivers/pci/Kconfig | 6 ++++++ 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 230f1dee4f09..e0b0ec0f8245 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -109,8 +109,14 @@ void __init kernel_randomize_memory(void) memory_tb = DIV_ROUND_UP(max_pfn << PAGE_SHIFT, 1UL << TB_SHIFT) + CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; - /* Adapt physical memory region size based on available memory */ - if (memory_tb < kaslr_regions[0].size_tb) + /* + * Adapt physical memory region size based on available memory, + * except when CONFIG_PCI_P2PDMA is enabled. P2PDMA exposes the + * device BAR space assuming the direct map space is large enough + * for creating a ZONE_DEVICE mapping in the direct map corresponding + * to the physical BAR address. + */ + if (!IS_ENABLED(CONFIG_PCI_P2PDMA) && (memory_tb < kaslr_regions[0].size_tb)) kaslr_regions[0].size_tb = memory_tb; /* diff --git a/drivers/pci/Kconfig b/drivers/pci/Kconfig index 55c028af4bd9..c4f9ac5c00c1 100644 --- a/drivers/pci/Kconfig +++ b/drivers/pci/Kconfig @@ -184,6 +184,12 @@ config PCI_P2PDMA P2P DMA transactions must be between devices behind the same root port. + Enabling this option will reduce the entropy of x86 KASLR memory + regions. For example - on a 46 bit system, the entropy goes down + from 16 bits to 15 bits. The actual reduction in entropy depends + on the physical address bits, on processor features, kernel config + (5 level page table) and physical memory present on the system. + If unsure, say N. config PCI_LABEL From 3394aafdc850cf10d228bb0b2daccace4d37cbd4 Mon Sep 17 00:00:00 2001 From: Bibo Mao Date: Tue, 9 Jun 2020 10:54:35 +0800 Subject: [PATCH 340/529] MIPS: Use arch specific syscall name match function [ Upstream commit 756276ce78d5624dc814f9d99f7d16c8fd51076e ] On MIPS system, most of the syscall function name begin with prefix sys_. Some syscalls are special such as clone/fork, function name of these begin with __sys_. Since scratch registers need be saved in stack when these system calls happens. With ftrace system call method, system call functions are declared with SYSCALL_DEFINEx, metadata of the system call symbol name begins with sys_. Here mips specific function arch_syscall_match_sym_name is used to compare function name between sys_call_table[] and metadata of syscall symbol. Signed-off-by: Bibo Mao Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/include/asm/ftrace.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/arch/mips/include/asm/ftrace.h b/arch/mips/include/asm/ftrace.h index db497a8167da..e3212f44446f 100644 --- a/arch/mips/include/asm/ftrace.h +++ b/arch/mips/include/asm/ftrace.h @@ -87,4 +87,20 @@ struct dyn_arch_ftrace { #endif /* CONFIG_DYNAMIC_FTRACE */ #endif /* __ASSEMBLY__ */ #endif /* CONFIG_FUNCTION_TRACER */ + +#ifdef CONFIG_FTRACE_SYSCALLS +#ifndef __ASSEMBLY__ +/* + * Some syscall entry functions on mips start with "__sys_" (fork and clone, + * for instance). We should also match the sys_ variant with those. + */ +#define ARCH_HAS_SYSCALL_MATCH_SYM_NAME +static inline bool arch_syscall_match_sym_name(const char *sym, + const char *name) +{ + return !strcmp(sym, name) || + (!strncmp(sym, "__sys_", 6) && !strcmp(sym + 6, name + 4)); +} +#endif /* __ASSEMBLY__ */ +#endif /* CONFIG_FTRACE_SYSCALLS */ #endif /* _ASM_MIPS_FTRACE_H */ From e4d3763223c7b72ded53425207075e7453b4e3d5 Mon Sep 17 00:00:00 2001 From: Jason Gunthorpe Date: Wed, 19 Feb 2025 17:31:36 -0800 Subject: [PATCH 341/529] genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie [ Upstream commit 1f7df3a691740a7736bbc99dc4ed536120eb4746 ] The IOMMU translation for MSI message addresses has been a 2-step process, separated in time: 1) iommu_dma_prepare_msi(): A cookie pointer containing the IOVA address is stored in the MSI descriptor when an MSI interrupt is allocated. 2) iommu_dma_compose_msi_msg(): this cookie pointer is used to compute a translated message address. This has an inherent lifetime problem for the pointer stored in the cookie that must remain valid between the two steps. However, there is no locking at the irq layer that helps protect the lifetime. Today, this works under the assumption that the iommu domain is not changed while MSI interrupts being programmed. This is true for normal DMA API users within the kernel, as the iommu domain is attached before the driver is probed and cannot be changed while a driver is attached. Classic VFIO type1 also prevented changing the iommu domain while VFIO was running as it does not support changing the "container" after starting up. However, iommufd has improved this so that the iommu domain can be changed during VFIO operation. This potentially allows userspace to directly race VFIO_DEVICE_ATTACH_IOMMUFD_PT (which calls iommu_attach_group()) and VFIO_DEVICE_SET_IRQS (which calls into iommu_dma_compose_msi_msg()). This potentially causes both the cookie pointer and the unlocked call to iommu_get_domain_for_dev() on the MSI translation path to become UAFs. Fix the MSI cookie UAF by removing the cookie pointer. The translated IOVA address is already known during iommu_dma_prepare_msi() and cannot change. Thus, it can simply be stored as an integer in the MSI descriptor. The other UAF related to iommu_get_domain_for_dev() will be addressed in patch "iommu: Make iommu_dma_prepare_msi() into a generic operation" by using the IOMMU group mutex. Link: https://patch.msgid.link/r/a4f2cd76b9dc1833ee6c1cf325cba57def22231c.1740014950.git.nicolinc@nvidia.com Signed-off-by: Nicolin Chen Reviewed-by: Thomas Gleixner Signed-off-by: Jason Gunthorpe Signed-off-by: Sasha Levin --- drivers/iommu/dma-iommu.c | 28 +++++++++++++--------------- include/linux/msi.h | 33 ++++++++++++--------------------- 2 files changed, 25 insertions(+), 36 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 3fa66dba0a32..cbf9ec320691 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1661,7 +1661,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) static DEFINE_MUTEX(msi_prepare_lock); /* see below */ if (!domain || !domain->iova_cookie) { - desc->iommu_cookie = NULL; + msi_desc_set_iommu_msi_iova(desc, 0, 0); return 0; } @@ -1673,11 +1673,12 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) mutex_lock(&msi_prepare_lock); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); mutex_unlock(&msi_prepare_lock); - - msi_desc_set_iommu_cookie(desc, msi_page); - if (!msi_page) return -ENOMEM; + + msi_desc_set_iommu_msi_iova( + desc, msi_page->iova, + ilog2(cookie_msi_granule(domain->iova_cookie))); return 0; } @@ -1688,18 +1689,15 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) */ void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { - struct device *dev = msi_desc_to_dev(desc); - const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); - const struct iommu_dma_msi_page *msi_page; +#ifdef CONFIG_IRQ_MSI_IOMMU + if (desc->iommu_msi_shift) { + u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; - msi_page = msi_desc_get_iommu_cookie(desc); - - if (!domain || !domain->iova_cookie || WARN_ON(!msi_page)) - return; - - msg->address_hi = upper_32_bits(msi_page->iova); - msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1; - msg->address_lo += lower_32_bits(msi_page->iova); + msg->address_hi = upper_32_bits(msi_iova); + msg->address_lo = lower_32_bits(msi_iova) | + (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); + } +#endif } static int iommu_dma_init(void) diff --git a/include/linux/msi.h b/include/linux/msi.h index e5dfb9cf3aa1..1bf8d126f792 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -129,6 +129,10 @@ struct pci_msi_desc { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor + * @iommu_msi_iova: Optional shifted IOVA from the IOMMU to override the msi_addr. + * Only used if iommu_msi_shift != 0 + * @iommu_msi_shift: Indicates how many bits of the original address should be + * preserved when using iommu_msi_iova. * @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message @@ -146,7 +150,8 @@ struct msi_desc { struct msi_msg msg; struct irq_affinity_desc *affinity; #ifdef CONFIG_IRQ_MSI_IOMMU - const void *iommu_cookie; + u64 iommu_msi_iova : 58; + u64 iommu_msi_shift : 6; #endif #ifdef CONFIG_SYSFS struct device_attribute *sysfs_attrs; @@ -214,28 +219,14 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); #define msi_desc_to_dev(desc) ((desc)->dev) +static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_iova, + unsigned int msi_shift) +{ #ifdef CONFIG_IRQ_MSI_IOMMU -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -{ - return desc->iommu_cookie; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ - desc->iommu_cookie = iommu_cookie; -} -#else -static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) -{ - return NULL; -} - -static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, - const void *iommu_cookie) -{ -} + desc->iommu_msi_iova = msi_iova >> msi_shift; + desc->iommu_msi_shift = msi_shift; #endif +} #ifdef CONFIG_PCI_MSI struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); From 3972bd4ca50a3a42c23cd3e43436f1af64d903d4 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 29 Jan 2025 13:32:48 +0100 Subject: [PATCH 342/529] MIPS: pm-cps: Use per-CPU variables as per-CPU, not per-core [ Upstream commit 00a134fc2bb4a5f8fada58cf7ff4259149691d64 ] The pm-cps code has up until now used per-CPU variables indexed by core, rather than CPU number, in order to share data amongst sibling CPUs (ie. VPs/threads in a core). This works fine for single cluster systems, but with multi-cluster systems a core number is no longer unique in the system, leading to sharing between CPUs that are not actually siblings. Avoid this issue by using per-CPU variables as they are more generally used - ie. access them using CPU numbers rather than core numbers. Sharing between siblings is then accomplished by: - Assigning the same pointer to entries for each sibling CPU for the nc_asm_enter & ready_count variables, which allow this by virtue of being per-CPU pointers. - Indexing by the first CPU set in a CPUs cpu_sibling_map in the case of pm_barrier, for which we can't use the previous approach because the per-CPU variable is not a pointer. Signed-off-by: Paul Burton Signed-off-by: Dragan Mladjenovic Signed-off-by: Aleksandar Rikalo Tested-by: Serge Semin Tested-by: Gregory CLEMENT Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- arch/mips/kernel/pm-cps.c | 30 +++++++++++++++++------------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/arch/mips/kernel/pm-cps.c b/arch/mips/kernel/pm-cps.c index 9bf60d7d44d3..a7bcf2b814c8 100644 --- a/arch/mips/kernel/pm-cps.c +++ b/arch/mips/kernel/pm-cps.c @@ -56,10 +56,7 @@ static DEFINE_PER_CPU_ALIGNED(u32*, ready_count); /* Indicates online CPUs coupled with the current CPU */ static DEFINE_PER_CPU_ALIGNED(cpumask_t, online_coupled); -/* - * Used to synchronize entry to deep idle states. Actually per-core rather - * than per-CPU. - */ +/* Used to synchronize entry to deep idle states */ static DEFINE_PER_CPU_ALIGNED(atomic_t, pm_barrier); /* Saved CPU state across the CPS_PM_POWER_GATED state */ @@ -118,9 +115,10 @@ int cps_pm_enter_state(enum cps_pm_state state) cps_nc_entry_fn entry; struct core_boot_config *core_cfg; struct vpe_boot_config *vpe_cfg; + atomic_t *barrier; /* Check that there is an entry function for this state */ - entry = per_cpu(nc_asm_enter, core)[state]; + entry = per_cpu(nc_asm_enter, cpu)[state]; if (!entry) return -EINVAL; @@ -156,7 +154,7 @@ int cps_pm_enter_state(enum cps_pm_state state) smp_mb__after_atomic(); /* Create a non-coherent mapping of the core ready_count */ - core_ready_count = per_cpu(ready_count, core); + core_ready_count = per_cpu(ready_count, cpu); nc_addr = kmap_noncoherent(virt_to_page(core_ready_count), (unsigned long)core_ready_count); nc_addr += ((unsigned long)core_ready_count & ~PAGE_MASK); @@ -164,7 +162,8 @@ int cps_pm_enter_state(enum cps_pm_state state) /* Ensure ready_count is zero-initialised before the assembly runs */ WRITE_ONCE(*nc_core_ready_count, 0); - coupled_barrier(&per_cpu(pm_barrier, core), online); + barrier = &per_cpu(pm_barrier, cpumask_first(&cpu_sibling_map[cpu])); + coupled_barrier(barrier, online); /* Run the generated entry code */ left = entry(online, nc_core_ready_count); @@ -635,12 +634,14 @@ out_err: static int cps_pm_online_cpu(unsigned int cpu) { - enum cps_pm_state state; - unsigned core = cpu_core(&cpu_data[cpu]); + unsigned int sibling, core; void *entry_fn, *core_rc; + enum cps_pm_state state; + + core = cpu_core(&cpu_data[cpu]); for (state = CPS_PM_NC_WAIT; state < CPS_PM_STATE_COUNT; state++) { - if (per_cpu(nc_asm_enter, core)[state]) + if (per_cpu(nc_asm_enter, cpu)[state]) continue; if (!test_bit(state, state_support)) continue; @@ -652,16 +653,19 @@ static int cps_pm_online_cpu(unsigned int cpu) clear_bit(state, state_support); } - per_cpu(nc_asm_enter, core)[state] = entry_fn; + for_each_cpu(sibling, &cpu_sibling_map[cpu]) + per_cpu(nc_asm_enter, sibling)[state] = entry_fn; } - if (!per_cpu(ready_count, core)) { + if (!per_cpu(ready_count, cpu)) { core_rc = kmalloc(sizeof(u32), GFP_KERNEL); if (!core_rc) { pr_err("Failed allocate core %u ready_count\n", core); return -ENOMEM; } - per_cpu(ready_count, core) = core_rc; + + for_each_cpu(sibling, &cpu_sibling_map[cpu]) + per_cpu(ready_count, sibling) = core_rc; } return 0; From 790f74d6843971102b66f7da180e99307997b8e0 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Wed, 29 Jan 2025 13:32:47 +0100 Subject: [PATCH 343/529] clocksource: mips-gic-timer: Enable counter when CPUs start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 3128b0a2e0cf6e07aa78e5f8cf7dd9cd59dc8174 ] In multi-cluster MIPS I6500 systems there is a GIC in each cluster, each with its own counter. When a cluster powers up the counter will be stopped, with the COUNTSTOP bit set in the GIC_CONFIG register. In single cluster systems, it has been fine to clear COUNTSTOP once in gic_clocksource_of_init() to start the counter. In multi-cluster systems, this will only have started the counter in the boot cluster, and any CPUs in other clusters will find their counter stopped which will break the GIC clock_event_device. Resolve this by having CPUs clear the COUNTSTOP bit when they come online, using the existing gic_starting_cpu() CPU hotplug callback. This will allow CPUs in secondary clusters to ensure that the cluster's GIC counter is running as expected. Signed-off-by: Paul Burton Signed-off-by: Chao-ying Fu Signed-off-by: Dragan Mladjenovic Signed-off-by: Aleksandar Rikalo Reviewed-by: Philippe Mathieu-Daudé Tested-by: Serge Semin Tested-by: Gregory CLEMENT Acked-by: Daniel Lezcano Signed-off-by: Thomas Bogendoerfer Signed-off-by: Sasha Levin --- drivers/clocksource/mips-gic-timer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/clocksource/mips-gic-timer.c b/drivers/clocksource/mips-gic-timer.c index b3ae38f36720..39c70b5ac44c 100644 --- a/drivers/clocksource/mips-gic-timer.c +++ b/drivers/clocksource/mips-gic-timer.c @@ -114,6 +114,9 @@ static void gic_update_frequency(void *data) static int gic_starting_cpu(unsigned int cpu) { + /* Ensure the GIC counter is running */ + clear_gic_config(GIC_CONFIG_COUNTSTOP); + gic_clockevent_cpu_init(cpu, this_cpu_ptr(&gic_clockevent_device)); return 0; } @@ -248,9 +251,6 @@ static int __init gic_clocksource_of_init(struct device_node *node) pr_warn("Unable to register clock notifier\n"); } - /* And finally start the counter */ - clear_gic_config(GIC_CONFIG_COUNTSTOP); - /* * It's safe to use the MIPS GIC timer as a sched clock source only if * its ticks are stable, which is true on either the platforms with From 526ec913980d05187cbbf52e3372b73834255be4 Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Wed, 12 Feb 2025 17:26:55 -0800 Subject: [PATCH 344/529] scsi: mpt3sas: Send a diag reset if target reset fails [ Upstream commit 5612d6d51ed2634a033c95de2edec7449409cbb9 ] When an IOCTL times out and driver issues a target reset, if firmware fails the task management elevate the recovery by issuing a diag reset to controller. Signed-off-by: Shivasharan S Link: https://lore.kernel.org/r/1739410016-27503-5-git-send-email-shivasharan.srikanteshwara@broadcom.com Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/mpt3sas/mpt3sas_ctl.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/mpt3sas/mpt3sas_ctl.c b/drivers/scsi/mpt3sas/mpt3sas_ctl.c index fc5af6a5114e..863503e8a4d1 100644 --- a/drivers/scsi/mpt3sas/mpt3sas_ctl.c +++ b/drivers/scsi/mpt3sas/mpt3sas_ctl.c @@ -679,6 +679,7 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg, size_t data_in_sz = 0; long ret; u16 device_handle = MPT3SAS_INVALID_DEVICE_HANDLE; + int tm_ret; issue_reset = 0; @@ -1120,18 +1121,25 @@ _ctl_do_mpt_command(struct MPT3SAS_ADAPTER *ioc, struct mpt3_ioctl_command karg, if (pcie_device && (!ioc->tm_custom_handling) && (!(mpt3sas_scsih_is_pcie_scsi_device( pcie_device->device_info)))) - mpt3sas_scsih_issue_locked_tm(ioc, + tm_ret = mpt3sas_scsih_issue_locked_tm(ioc, le16_to_cpu(mpi_request->FunctionDependent1), 0, 0, 0, MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET, 0, 0, pcie_device->reset_timeout, MPI26_SCSITASKMGMT_MSGFLAGS_PROTOCOL_LVL_RST_PCIE); else - mpt3sas_scsih_issue_locked_tm(ioc, + tm_ret = mpt3sas_scsih_issue_locked_tm(ioc, le16_to_cpu(mpi_request->FunctionDependent1), 0, 0, 0, MPI2_SCSITASKMGMT_TASKTYPE_TARGET_RESET, 0, 0, 30, MPI2_SCSITASKMGMT_MSGFLAGS_LINK_RESET); + + if (tm_ret != SUCCESS) { + ioc_info(ioc, + "target reset failed, issue hard reset: handle (0x%04x)\n", + le16_to_cpu(mpi_request->FunctionDependent1)); + mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER); + } } else mpt3sas_base_hard_reset_handler(ioc, FORCE_BIG_HAMMER); } From 100c6e2d0cad2c865c8b33e836db6289a92ae3dd Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Tue, 18 Feb 2025 01:30:48 +0200 Subject: [PATCH 345/529] wifi: rtw88: Fix rtw_init_vht_cap() for RTL8814AU [ Upstream commit 6be7544d19fcfcb729495e793bc6181f85bb8949 ] Set the MCS maps and the highest rates according to the number of spatial streams the chip has. For RTL8814AU that is 3. Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/e86aa009-b5bf-4b3a-8112-ea5e3cd49465@gmail.com Signed-off-by: Sasha Levin --- drivers/net/wireless/realtek/rtw88/main.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index 7c390c2c608d..8a47cfb358b1 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -1552,8 +1552,9 @@ static void rtw_init_vht_cap(struct rtw_dev *rtwdev, struct ieee80211_sta_vht_cap *vht_cap) { struct rtw_efuse *efuse = &rtwdev->efuse; - u16 mcs_map; + u16 mcs_map = 0; __le16 highest; + int i; if (efuse->hw_cap.ptcl != EFUSE_HW_CAP_IGNORE && efuse->hw_cap.ptcl != EFUSE_HW_CAP_PTCL_VHT) @@ -1576,21 +1577,15 @@ static void rtw_init_vht_cap(struct rtw_dev *rtwdev, if (rtw_chip_has_rx_ldpc(rtwdev)) vht_cap->cap |= IEEE80211_VHT_CAP_RXLDPC; - mcs_map = IEEE80211_VHT_MCS_SUPPORT_0_9 << 0 | - IEEE80211_VHT_MCS_NOT_SUPPORTED << 4 | - IEEE80211_VHT_MCS_NOT_SUPPORTED << 6 | - IEEE80211_VHT_MCS_NOT_SUPPORTED << 8 | - IEEE80211_VHT_MCS_NOT_SUPPORTED << 10 | - IEEE80211_VHT_MCS_NOT_SUPPORTED << 12 | - IEEE80211_VHT_MCS_NOT_SUPPORTED << 14; - if (efuse->hw_cap.nss > 1) { - highest = cpu_to_le16(780); - mcs_map |= IEEE80211_VHT_MCS_SUPPORT_0_9 << 2; - } else { - highest = cpu_to_le16(390); - mcs_map |= IEEE80211_VHT_MCS_NOT_SUPPORTED << 2; + for (i = 0; i < 8; i++) { + if (i < efuse->hw_cap.nss) + mcs_map |= IEEE80211_VHT_MCS_SUPPORT_0_9 << (i * 2); + else + mcs_map |= IEEE80211_VHT_MCS_NOT_SUPPORTED << (i * 2); } + highest = cpu_to_le16(390 * efuse->hw_cap.nss); + vht_cap->vht_mcs.rx_mcs_map = cpu_to_le16(mcs_map); vht_cap->vht_mcs.tx_mcs_map = cpu_to_le16(mcs_map); vht_cap->vht_mcs.rx_highest = highest; From 1c17d07fc40c599fb3f64d1793a5a03b7bfd2ad0 Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Tue, 18 Feb 2025 01:30:22 +0200 Subject: [PATCH 346/529] wifi: rtw88: Fix rtw_init_ht_cap() for RTL8814AU [ Upstream commit c7eea1ba05ca5b0dbf77a27cf2e1e6e2fb3c0043 ] Set the RX mask and the highest RX rate according to the number of spatial streams the chip can receive. For RTL8814AU that is 3. Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/4e786f50-ed1c-4387-8b28-e6ff00e35e81@gmail.com Signed-off-by: Sasha Levin --- drivers/net/wireless/realtek/rtw88/main.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/main.c b/drivers/net/wireless/realtek/rtw88/main.c index 8a47cfb358b1..0a913cf6a615 100644 --- a/drivers/net/wireless/realtek/rtw88/main.c +++ b/drivers/net/wireless/realtek/rtw88/main.c @@ -1516,6 +1516,7 @@ static void rtw_init_ht_cap(struct rtw_dev *rtwdev, { const struct rtw_chip_info *chip = rtwdev->chip; struct rtw_efuse *efuse = &rtwdev->efuse; + int i; ht_cap->ht_supported = true; ht_cap->cap = 0; @@ -1535,17 +1536,11 @@ static void rtw_init_ht_cap(struct rtw_dev *rtwdev, ht_cap->ampdu_factor = IEEE80211_HT_MAX_AMPDU_64K; ht_cap->ampdu_density = chip->ampdu_density; ht_cap->mcs.tx_params = IEEE80211_HT_MCS_TX_DEFINED; - if (efuse->hw_cap.nss > 1) { - ht_cap->mcs.rx_mask[0] = 0xFF; - ht_cap->mcs.rx_mask[1] = 0xFF; - ht_cap->mcs.rx_mask[4] = 0x01; - ht_cap->mcs.rx_highest = cpu_to_le16(300); - } else { - ht_cap->mcs.rx_mask[0] = 0xFF; - ht_cap->mcs.rx_mask[1] = 0x00; - ht_cap->mcs.rx_mask[4] = 0x01; - ht_cap->mcs.rx_highest = cpu_to_le16(150); - } + + for (i = 0; i < efuse->hw_cap.nss; i++) + ht_cap->mcs.rx_mask[i] = 0xFF; + ht_cap->mcs.rx_mask[4] = 0x01; + ht_cap->mcs.rx_highest = cpu_to_le16(150 * efuse->hw_cap.nss); } static void rtw_init_vht_cap(struct rtw_dev *rtwdev, From 3635589fa5e656b8f2bf2981970781a239016c2f Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Tue, 18 Feb 2025 01:29:52 +0200 Subject: [PATCH 347/529] wifi: rtw88: Fix rtw_desc_to_mcsrate() to handle MCS16-31 [ Upstream commit 86d04f8f991a0509e318fe886d5a1cf795736c7d ] This function translates the rate number reported by the hardware into something mac80211 can understand. It was ignoring the 3SS and 4SS HT rates. Translate them too. Also set *nss to 0 for the HT rates, just to make sure it's initialised. Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/d0a5a86b-4869-47f6-a5a7-01c0f987cc7f@gmail.com Signed-off-by: Sasha Levin --- drivers/net/wireless/realtek/rtw88/util.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw88/util.c b/drivers/net/wireless/realtek/rtw88/util.c index cdfd66a85075..43cd06aa39b1 100644 --- a/drivers/net/wireless/realtek/rtw88/util.c +++ b/drivers/net/wireless/realtek/rtw88/util.c @@ -101,7 +101,8 @@ void rtw_desc_to_mcsrate(u16 rate, u8 *mcs, u8 *nss) *nss = 4; *mcs = rate - DESC_RATEVHT4SS_MCS0; } else if (rate >= DESC_RATEMCS0 && - rate <= DESC_RATEMCS15) { + rate <= DESC_RATEMCS31) { + *nss = 0; *mcs = rate - DESC_RATEMCS0; } } From 70a9888018196f475df04f06eeac20bac296a8c3 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Mon, 17 Feb 2025 14:43:06 +0800 Subject: [PATCH 348/529] wifi: rtw89: fw: propagate error code from rtw89_h2c_tx() [ Upstream commit 56e1acaa0f80620b8e2c3410db35b4b975782b0a ] The error code should be propagated to callers during downloading firmware header and body. Remove unnecessary assignment of -1. Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20250217064308.43559-4-pkshih@realtek.com Signed-off-by: Sasha Levin --- drivers/net/wireless/realtek/rtw89/fw.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw89/fw.c b/drivers/net/wireless/realtek/rtw89/fw.c index 1d57a8c5e97d..0f022a5192ac 100644 --- a/drivers/net/wireless/realtek/rtw89/fw.c +++ b/drivers/net/wireless/realtek/rtw89/fw.c @@ -373,7 +373,6 @@ static int __rtw89_fw_download_hdr(struct rtw89_dev *rtwdev, const u8 *fw, u32 l ret = rtw89_h2c_tx(rtwdev, skb, false); if (ret) { rtw89_err(rtwdev, "failed to send h2c\n"); - ret = -1; goto fail; } @@ -434,7 +433,6 @@ static int __rtw89_fw_download_main(struct rtw89_dev *rtwdev, ret = rtw89_h2c_tx(rtwdev, skb, true); if (ret) { rtw89_err(rtwdev, "failed to send h2c\n"); - ret = -1; goto fail; } From ef1158a6a650ecee72ab40851b1d52e04d3f9cb5 Mon Sep 17 00:00:00 2001 From: Peter Seiderer Date: Wed, 19 Feb 2025 09:45:27 +0100 Subject: [PATCH 349/529] net: pktgen: fix access outside of user given buffer in pktgen_thread_write() [ Upstream commit 425e64440ad0a2f03bdaf04be0ae53dededbaa77 ] Honour the user given buffer size for the strn_len() calls (otherwise strn_len() will access memory outside of the user given buffer). Signed-off-by: Peter Seiderer Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250219084527.20488-8-ps.report@gmx.net Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/core/pktgen.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 5917820f92c3..a2838c15aa9d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -1877,8 +1877,8 @@ static ssize_t pktgen_thread_write(struct file *file, i = len; /* Read variable name */ - - len = strn_len(&user_buffer[i], sizeof(name) - 1); + max = min(sizeof(name) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) return len; @@ -1908,7 +1908,8 @@ static ssize_t pktgen_thread_write(struct file *file, if (!strcmp(name, "add_device")) { char f[32]; memset(f, 0, 32); - len = strn_len(&user_buffer[i], sizeof(f) - 1); + max = min(sizeof(f) - 1, count - i); + len = strn_len(&user_buffer[i], max); if (len < 0) { ret = len; goto out; From 07bf524a784832356aa48fc2f472472bcdcf88c2 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 22 Jan 2025 07:50:26 +0100 Subject: [PATCH 350/529] EDAC/ie31200: work around false positive build warning [ Upstream commit c29dfd661fe2f8d1b48c7f00590929c04b25bf40 ] gcc-14 produces a bogus warning in some configurations: drivers/edac/ie31200_edac.c: In function 'ie31200_probe1.isra': drivers/edac/ie31200_edac.c:412:26: error: 'dimm_info' is used uninitialized [-Werror=uninitialized] 412 | struct dimm_data dimm_info[IE31200_CHANNELS][IE31200_DIMMS_PER_CHANNEL]; | ^~~~~~~~~ drivers/edac/ie31200_edac.c:412:26: note: 'dimm_info' declared here 412 | struct dimm_data dimm_info[IE31200_CHANNELS][IE31200_DIMMS_PER_CHANNEL]; | ^~~~~~~~~ I don't see any way the unintialized access could really happen here, but I can see why the compiler gets confused by the two loops. Instead, rework the two nested loops to only read the addr_decode registers and then keep only one instance of the dimm info structure. [Tony: Qiuxu pointed out that the "populate DIMM info" comment was left behind in the refactor and suggested moving it. I deleted the comment as unnecessry in front os a call to populate_dimm_info(). That seems pretty self-describing.] Signed-off-by: Arnd Bergmann Acked-by: Jason Baron Signed-off-by: Tony Luck Link: https://lore.kernel.org/all/20250122065031.1321015-1-arnd@kernel.org Signed-off-by: Sasha Levin --- drivers/edac/ie31200_edac.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/edac/ie31200_edac.c b/drivers/edac/ie31200_edac.c index 56be8ef40f37..e3635fba63b4 100644 --- a/drivers/edac/ie31200_edac.c +++ b/drivers/edac/ie31200_edac.c @@ -405,10 +405,9 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx) int i, j, ret; struct mem_ctl_info *mci = NULL; struct edac_mc_layer layers[2]; - struct dimm_data dimm_info[IE31200_CHANNELS][IE31200_DIMMS_PER_CHANNEL]; void __iomem *window; struct ie31200_priv *priv; - u32 addr_decode, mad_offset; + u32 addr_decode[IE31200_CHANNELS], mad_offset; /* * Kaby Lake, Coffee Lake seem to work like Skylake. Please re-visit @@ -466,19 +465,10 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx) mad_offset = IE31200_MAD_DIMM_0_OFFSET; } - /* populate DIMM info */ for (i = 0; i < IE31200_CHANNELS; i++) { - addr_decode = readl(window + mad_offset + + addr_decode[i] = readl(window + mad_offset + (i * 4)); - edac_dbg(0, "addr_decode: 0x%x\n", addr_decode); - for (j = 0; j < IE31200_DIMMS_PER_CHANNEL; j++) { - populate_dimm_info(&dimm_info[i][j], addr_decode, j, - skl); - edac_dbg(0, "size: 0x%x, rank: %d, width: %d\n", - dimm_info[i][j].size, - dimm_info[i][j].dual_rank, - dimm_info[i][j].x16_width); - } + edac_dbg(0, "addr_decode: 0x%x\n", addr_decode[i]); } /* @@ -489,14 +479,22 @@ static int ie31200_probe1(struct pci_dev *pdev, int dev_idx) */ for (i = 0; i < IE31200_DIMMS_PER_CHANNEL; i++) { for (j = 0; j < IE31200_CHANNELS; j++) { + struct dimm_data dimm_info; struct dimm_info *dimm; unsigned long nr_pages; - nr_pages = IE31200_PAGES(dimm_info[j][i].size, skl); + populate_dimm_info(&dimm_info, addr_decode[j], i, + skl); + edac_dbg(0, "size: 0x%x, rank: %d, width: %d\n", + dimm_info.size, + dimm_info.dual_rank, + dimm_info.x16_width); + + nr_pages = IE31200_PAGES(dimm_info.size, skl); if (nr_pages == 0) continue; - if (dimm_info[j][i].dual_rank) { + if (dimm_info.dual_rank) { nr_pages = nr_pages / 2; dimm = edac_get_dimm(mci, (i * 2) + 1, j, 0); dimm->nr_pages = nr_pages; From 74db6edb69a3e715e03aaa5dc1f63661fb0990f5 Mon Sep 17 00:00:00 2001 From: Frank Li Date: Wed, 29 Jan 2025 11:22:50 -0500 Subject: [PATCH 351/529] i3c: master: svc: Flush FIFO before sending Dynamic Address Assignment(DAA) [ Upstream commit a892ee4cf22a50e1d6988d0464a9a421f3e5db2f ] Ensure the FIFO is empty before issuing the DAA command to prevent incorrect command data from being sent. Align with other data transfers, such as svc_i3c_master_start_xfer_locked(), which flushes the FIFO before sending a command. Signed-off-by: Frank Li Reviewed-by: Miquel Raynal Link: https://lore.kernel.org/r/20250129162250.3629189-1-Frank.Li@nxp.com Signed-off-by: Alexandre Belloni Signed-off-by: Sasha Levin --- drivers/i3c/master/svc-i3c-master.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index cf0550c6e95f..8cf3aa800d89 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -833,6 +833,8 @@ static int svc_i3c_master_do_daa_locked(struct svc_i3c_master *master, u32 reg; int ret, i; + svc_i3c_master_flush_fifo(master); + while (true) { /* Enter/proceed with DAA */ writel(SVC_I3C_MCTRL_REQUEST_PROC_DAA | From 68435c1fa3db696db4f480385db9e50e26691d0d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alexis=20Lothor=C3=A9?= Date: Mon, 17 Feb 2025 07:21:53 +0100 Subject: [PATCH 352/529] serial: mctrl_gpio: split disable_ms into sync and no_sync APIs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 1bd2aad57da95f7f2d2bb52f7ad15c0f4993a685 ] The following splat has been observed on a SAMA5D27 platform using atmel_serial: BUG: sleeping function called from invalid context at kernel/irq/manage.c:738 in_atomic(): 1, irqs_disabled(): 128, non_block: 0, pid: 27, name: kworker/u5:0 preempt_count: 1, expected: 0 INFO: lockdep is turned off. irq event stamp: 0 hardirqs last enabled at (0): [<00000000>] 0x0 hardirqs last disabled at (0): [] copy_process+0x1c4c/0x7bec softirqs last enabled at (0): [] copy_process+0x1ca0/0x7bec softirqs last disabled at (0): [<00000000>] 0x0 CPU: 0 UID: 0 PID: 27 Comm: kworker/u5:0 Not tainted 6.13.0-rc7+ #74 Hardware name: Atmel SAMA5 Workqueue: hci0 hci_power_on [bluetooth] Call trace: unwind_backtrace from show_stack+0x18/0x1c show_stack from dump_stack_lvl+0x44/0x70 dump_stack_lvl from __might_resched+0x38c/0x598 __might_resched from disable_irq+0x1c/0x48 disable_irq from mctrl_gpio_disable_ms+0x74/0xc0 mctrl_gpio_disable_ms from atmel_disable_ms.part.0+0x80/0x1f4 atmel_disable_ms.part.0 from atmel_set_termios+0x764/0x11e8 atmel_set_termios from uart_change_line_settings+0x15c/0x994 uart_change_line_settings from uart_set_termios+0x2b0/0x668 uart_set_termios from tty_set_termios+0x600/0x8ec tty_set_termios from ttyport_set_flow_control+0x188/0x1e0 ttyport_set_flow_control from wilc_setup+0xd0/0x524 [hci_wilc] wilc_setup [hci_wilc] from hci_dev_open_sync+0x330/0x203c [bluetooth] hci_dev_open_sync [bluetooth] from hci_dev_do_open+0x40/0xb0 [bluetooth] hci_dev_do_open [bluetooth] from hci_power_on+0x12c/0x664 [bluetooth] hci_power_on [bluetooth] from process_one_work+0x998/0x1a38 process_one_work from worker_thread+0x6e0/0xfb4 worker_thread from kthread+0x3d4/0x484 kthread from ret_from_fork+0x14/0x28 This warning is emitted when trying to toggle, at the highest level, some flow control (with serdev_device_set_flow_control) in a device driver. At the lowest level, the atmel_serial driver is using serial_mctrl_gpio lib to enable/disable the corresponding IRQs accordingly. The warning emitted by CONFIG_DEBUG_ATOMIC_SLEEP is due to disable_irq (called in mctrl_gpio_disable_ms) being possibly called in some atomic context (some tty drivers perform modem lines configuration in regions protected by port lock). Split mctrl_gpio_disable_ms into two differents APIs, a non-blocking one and a blocking one. Replace mctrl_gpio_disable_ms calls with the relevant version depending on whether the call is protected by some port lock. Suggested-by: Jiri Slaby Signed-off-by: Alexis Lothoré Acked-by: Richard Genoud Link: https://lore.kernel.org/r/20250217-atomic_sleep_mctrl_serial_gpio-v3-1-59324b313eef@bootlin.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- Documentation/driver-api/serial/driver.rst | 2 +- drivers/tty/serial/8250/8250_port.c | 2 +- drivers/tty/serial/atmel_serial.c | 2 +- drivers/tty/serial/imx.c | 2 +- drivers/tty/serial/serial_mctrl_gpio.c | 34 +++++++++++++++++----- drivers/tty/serial/serial_mctrl_gpio.h | 17 +++++++++-- drivers/tty/serial/sh-sci.c | 2 +- drivers/tty/serial/stm32-usart.c | 2 +- 8 files changed, 47 insertions(+), 16 deletions(-) diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst index 23c6b956cd90..9436f7c11306 100644 --- a/Documentation/driver-api/serial/driver.rst +++ b/Documentation/driver-api/serial/driver.rst @@ -100,4 +100,4 @@ Some helpers are provided in order to set/get modem control lines via GPIO. .. kernel-doc:: drivers/tty/serial/serial_mctrl_gpio.c :identifiers: mctrl_gpio_init mctrl_gpio_free mctrl_gpio_to_gpiod mctrl_gpio_set mctrl_gpio_get mctrl_gpio_enable_ms - mctrl_gpio_disable_ms + mctrl_gpio_disable_ms_sync mctrl_gpio_disable_ms_no_sync diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 711de54eda98..c1917774e0bb 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1694,7 +1694,7 @@ static void serial8250_disable_ms(struct uart_port *port) if (up->bugs & UART_BUG_NOMSR) return; - mctrl_gpio_disable_ms(up->gpios); + mctrl_gpio_disable_ms_no_sync(up->gpios); up->ier &= ~UART_IER_MSI; serial_port_out(port, UART_IER, up->ier); diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index 6a9310379dc2..b3463cdd1d4b 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -692,7 +692,7 @@ static void atmel_disable_ms(struct uart_port *port) atmel_port->ms_irq_enabled = false; - mctrl_gpio_disable_ms(atmel_port->gpios); + mctrl_gpio_disable_ms_no_sync(atmel_port->gpios); if (!mctrl_gpio_to_gpiod(atmel_port->gpios, UART_GPIO_CTS)) idr |= ATMEL_US_CTSIC; diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 94e0781e00e8..fe22ca009fb3 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -1586,7 +1586,7 @@ static void imx_uart_shutdown(struct uart_port *port) imx_uart_dma_exit(sport); } - mctrl_gpio_disable_ms(sport->gpios); + mctrl_gpio_disable_ms_sync(sport->gpios); spin_lock_irqsave(&sport->port.lock, flags); ucr2 = imx_uart_readl(sport, UCR2); diff --git a/drivers/tty/serial/serial_mctrl_gpio.c b/drivers/tty/serial/serial_mctrl_gpio.c index 7d5aaa8d422b..d5fb293dd5a9 100644 --- a/drivers/tty/serial/serial_mctrl_gpio.c +++ b/drivers/tty/serial/serial_mctrl_gpio.c @@ -322,11 +322,7 @@ void mctrl_gpio_enable_ms(struct mctrl_gpios *gpios) } EXPORT_SYMBOL_GPL(mctrl_gpio_enable_ms); -/** - * mctrl_gpio_disable_ms - disable irqs and handling of changes to the ms lines - * @gpios: gpios to disable - */ -void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios) +static void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios, bool sync) { enum mctrl_gpio_idx i; @@ -342,10 +338,34 @@ void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios) if (!gpios->irq[i]) continue; - disable_irq(gpios->irq[i]); + if (sync) + disable_irq(gpios->irq[i]); + else + disable_irq_nosync(gpios->irq[i]); } } -EXPORT_SYMBOL_GPL(mctrl_gpio_disable_ms); + +/** + * mctrl_gpio_disable_ms_sync - disable irqs and handling of changes to the ms + * lines, and wait for any pending IRQ to be processed + * @gpios: gpios to disable + */ +void mctrl_gpio_disable_ms_sync(struct mctrl_gpios *gpios) +{ + mctrl_gpio_disable_ms(gpios, true); +} +EXPORT_SYMBOL_GPL(mctrl_gpio_disable_ms_sync); + +/** + * mctrl_gpio_disable_ms_no_sync - disable irqs and handling of changes to the + * ms lines, and return immediately + * @gpios: gpios to disable + */ +void mctrl_gpio_disable_ms_no_sync(struct mctrl_gpios *gpios) +{ + mctrl_gpio_disable_ms(gpios, false); +} +EXPORT_SYMBOL_GPL(mctrl_gpio_disable_ms_no_sync); void mctrl_gpio_enable_irq_wake(struct mctrl_gpios *gpios) { diff --git a/drivers/tty/serial/serial_mctrl_gpio.h b/drivers/tty/serial/serial_mctrl_gpio.h index fc76910fb105..79e97838ebe5 100644 --- a/drivers/tty/serial/serial_mctrl_gpio.h +++ b/drivers/tty/serial/serial_mctrl_gpio.h @@ -87,9 +87,16 @@ void mctrl_gpio_free(struct device *dev, struct mctrl_gpios *gpios); void mctrl_gpio_enable_ms(struct mctrl_gpios *gpios); /* - * Disable gpio interrupts to report status line changes. + * Disable gpio interrupts to report status line changes, and block until + * any corresponding IRQ is processed */ -void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios); +void mctrl_gpio_disable_ms_sync(struct mctrl_gpios *gpios); + +/* + * Disable gpio interrupts to report status line changes, and return + * immediately + */ +void mctrl_gpio_disable_ms_no_sync(struct mctrl_gpios *gpios); /* * Enable gpio wakeup interrupts to enable wake up source. @@ -148,7 +155,11 @@ static inline void mctrl_gpio_enable_ms(struct mctrl_gpios *gpios) { } -static inline void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios) +static inline void mctrl_gpio_disable_ms_sync(struct mctrl_gpios *gpios) +{ +} + +static inline void mctrl_gpio_disable_ms_no_sync(struct mctrl_gpios *gpios) { } diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 6182ae5f6fa1..e2dfca4c2eff 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -2182,7 +2182,7 @@ static void sci_shutdown(struct uart_port *port) dev_dbg(port->dev, "%s(%d)\n", __func__, port->line); s->autorts = false; - mctrl_gpio_disable_ms(to_sci_port(port)->gpios); + mctrl_gpio_disable_ms_sync(to_sci_port(port)->gpios); spin_lock_irqsave(&port->lock, flags); sci_stop_rx(port); diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index 7d11511c8c12..8670bb5042c4 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -850,7 +850,7 @@ static void stm32_usart_enable_ms(struct uart_port *port) static void stm32_usart_disable_ms(struct uart_port *port) { - mctrl_gpio_disable_ms(to_stm32_port(port)->gpios); + mctrl_gpio_disable_ms_sync(to_stm32_port(port)->gpios); } /* Transmit stop */ From 24eb0979ff94456b109133be5e672ba184931d9e Mon Sep 17 00:00:00 2001 From: Michael Margolin Date: Mon, 17 Feb 2025 14:16:23 +0000 Subject: [PATCH 353/529] RDMA/core: Fix best page size finding when it can cross SG entries [ Upstream commit 486055f5e09df959ad4e3aa4ee75b5c91ddeec2e ] A single scatter-gather entry is limited by a 32 bits "length" field that is practically 4GB - PAGE_SIZE. This means that even when the memory is physically contiguous, we might need more than one entry to represent it. Additionally when using dmabuf, the sg_table might be originated outside the subsystem and optimized for other needs. For instance an SGT of 16GB GPU continuous memory might look like this: (a real life example) dma_address 34401400000, length fffff000 dma_address 345013ff000, length fffff000 dma_address 346013fe000, length fffff000 dma_address 347013fd000, length fffff000 dma_address 348013fc000, length 4000 Since ib_umem_find_best_pgsz works within SG entries, in the above case we will result with the worst possible 4KB page size. Fix this by taking into consideration only the alignment of addresses of real discontinuity points rather than treating SG entries as such, and adjust the page iterator to correctly handle cross SG entry pages. There is currently an assumption that drivers do not ask for pages bigger than maximal DMA size supported by their devices. Reviewed-by: Firas Jahjah Reviewed-by: Yonatan Nachum Signed-off-by: Michael Margolin Link: https://patch.msgid.link/20250217141623.12428-1-mrgolin@amazon.com Signed-off-by: Leon Romanovsky Signed-off-by: Sasha Levin --- drivers/infiniband/core/umem.c | 36 ++++++++++++++++++++++++--------- drivers/infiniband/core/verbs.c | 11 +++++----- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/drivers/infiniband/core/umem.c b/drivers/infiniband/core/umem.c index 8ce569bf7525..1d154055a335 100644 --- a/drivers/infiniband/core/umem.c +++ b/drivers/infiniband/core/umem.c @@ -80,9 +80,12 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, unsigned long pgsz_bitmap, unsigned long virt) { - struct scatterlist *sg; + unsigned long curr_len = 0; + dma_addr_t curr_base = ~0; unsigned long va, pgoff; + struct scatterlist *sg; dma_addr_t mask; + dma_addr_t end; int i; umem->iova = va = virt; @@ -107,17 +110,30 @@ unsigned long ib_umem_find_best_pgsz(struct ib_umem *umem, pgoff = umem->address & ~PAGE_MASK; for_each_sgtable_dma_sg(&umem->sgt_append.sgt, sg, i) { - /* Walk SGL and reduce max page size if VA/PA bits differ - * for any address. + /* If the current entry is physically contiguous with the previous + * one, no need to take its start addresses into consideration. */ - mask |= (sg_dma_address(sg) + pgoff) ^ va; + if (check_add_overflow(curr_base, curr_len, &end) || + end != sg_dma_address(sg)) { + + curr_base = sg_dma_address(sg); + curr_len = 0; + + /* Reduce max page size if VA/PA bits differ */ + mask |= (curr_base + pgoff) ^ va; + + /* The alignment of any VA matching a discontinuity point + * in the physical memory sets the maximum possible page + * size as this must be a starting point of a new page that + * needs to be aligned. + */ + if (i != 0) + mask |= va; + } + + curr_len += sg_dma_len(sg); va += sg_dma_len(sg) - pgoff; - /* Except for the last entry, the ending iova alignment sets - * the maximum possible page size as the low bits of the iova - * must be zero when starting the next chunk. - */ - if (i != (umem->sgt_append.sgt.nents - 1)) - mask |= va; + pgoff = 0; } diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index b99b3cc283b6..97a116960f31 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -2959,22 +2959,23 @@ EXPORT_SYMBOL(__rdma_block_iter_start); bool __rdma_block_iter_next(struct ib_block_iter *biter) { unsigned int block_offset; - unsigned int sg_delta; + unsigned int delta; if (!biter->__sg_nents || !biter->__sg) return false; biter->__dma_addr = sg_dma_address(biter->__sg) + biter->__sg_advance; block_offset = biter->__dma_addr & (BIT_ULL(biter->__pg_bit) - 1); - sg_delta = BIT_ULL(biter->__pg_bit) - block_offset; + delta = BIT_ULL(biter->__pg_bit) - block_offset; - if (sg_dma_len(biter->__sg) - biter->__sg_advance > sg_delta) { - biter->__sg_advance += sg_delta; - } else { + while (biter->__sg_nents && biter->__sg && + sg_dma_len(biter->__sg) - biter->__sg_advance <= delta) { + delta -= sg_dma_len(biter->__sg) - biter->__sg_advance; biter->__sg_advance = 0; biter->__sg = sg_next(biter->__sg); biter->__sg_nents--; } + biter->__sg_advance += delta; return true; } From e3c93c79170f5267b19873c6fbcbd8694f04d597 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 18 Feb 2025 20:18:32 +0100 Subject: [PATCH 354/529] pmdomain: imx: gpcv2: use proper helper for property detection [ Upstream commit 6568cb40e73163fa25e2779f7234b169b2e1a32e ] Starting with commit c141ecc3cecd7 ("of: Warn when of_property_read_bool() is used on non-boolean properties"), probing the gpcv2 device on i.MX8M SoCs leads to warnings when LOCKDEP is enabled. Fix this by checking property presence with of_property_present as intended. Signed-off-by: Ahmad Fatoum Link: https://lore.kernel.org/r/20250218-gpcv2-of-property-present-v1-1-3bb1a9789654@pengutronix.de Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/soc/imx/gpcv2.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/soc/imx/gpcv2.c b/drivers/soc/imx/gpcv2.c index 88aee59730e3..6d5b6ed36169 100644 --- a/drivers/soc/imx/gpcv2.c +++ b/drivers/soc/imx/gpcv2.c @@ -1347,7 +1347,7 @@ static int imx_pgc_domain_probe(struct platform_device *pdev) } if (IS_ENABLED(CONFIG_LOCKDEP) && - of_property_read_bool(domain->dev->of_node, "power-domains")) + of_property_present(domain->dev->of_node, "power-domains")) lockdep_set_subclass(&domain->genpd.mlock, 1); ret = of_genpd_add_provider_simple(domain->dev->of_node, From 0d9d54113f4380c32752c6004c006b348aa54cec Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 12 Feb 2025 21:23:14 +0100 Subject: [PATCH 355/529] can: c_can: Use of_property_present() to test existence of DT property [ Upstream commit ab1bc2290fd8311d49b87c29f1eb123fcb581bee ] of_property_read_bool() should be used only on boolean properties. Cc: Rob Herring Signed-off-by: Krzysztof Kozlowski Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20250212-syscon-phandle-args-can-v2-3-ac9a1253396b@linaro.org Signed-off-by: Marc Kleine-Budde Signed-off-by: Sasha Levin --- drivers/net/can/c_can/c_can_platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/can/c_can/c_can_platform.c b/drivers/net/can/c_can/c_can_platform.c index c5d7093d5413..c29862b3bb1f 100644 --- a/drivers/net/can/c_can/c_can_platform.c +++ b/drivers/net/can/c_can/c_can_platform.c @@ -334,7 +334,7 @@ static int c_can_plat_probe(struct platform_device *pdev) /* Check if we need custom RAMINIT via syscon. Mostly for TI * platforms. Only supported with DT boot. */ - if (np && of_property_read_bool(np, "syscon-raminit")) { + if (np && of_property_present(np, "syscon-raminit")) { u32 id; struct c_can_raminit *raminit = &priv->raminit_sys; From 5f1ecc9aa59f0e9cdd9583258a94051a75579582 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 12 Feb 2025 17:06:33 -0800 Subject: [PATCH 356/529] eth: mlx4: don't try to complete XDP frames in netpoll [ Upstream commit 8fdeafd66edaf420ea0063a1f13442fe3470fe70 ] mlx4 doesn't support ndo_xdp_xmit / XDP_REDIRECT and wasn't using page pool until now, so it could run XDP completions in netpoll (NAPI budget == 0) just fine. Page pool has calling context requirements, make sure we don't try to call it from what is potentially HW IRQ context. Reviewed-by: Tariq Toukan Link: https://patch.msgid.link/20250213010635.1354034-3-kuba@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index 7fccf1a79f09..95eae224bbb4 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -446,6 +446,8 @@ int mlx4_en_process_tx_cq(struct net_device *dev, if (unlikely(!priv->port_up)) return 0; + if (unlikely(!napi_budget) && cq->type == TX_XDP) + return 0; netdev_txq_bql_complete_prefetchw(ring->tx_queue); From 58fe8fead90fd6b0ebba94254c8b5c25ccba2921 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Mon, 16 Dec 2024 19:56:12 +0200 Subject: [PATCH 357/529] PCI: Fix old_size lower bound in calculate_iosize() too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ff61f380de5652e723168341480cc7adf1dd6213 ] Commit 903534fa7d30 ("PCI: Fix resource double counting on remove & rescan") fixed double counting of mem resources because of old_size being applied too early. Fix a similar counting bug on the io resource side. Link: https://lore.kernel.org/r/20241216175632.4175-6-ilpo.jarvinen@linux.intel.com Signed-off-by: Ilpo Järvinen Signed-off-by: Bjorn Helgaas Tested-by: Xiaochun Lee Signed-off-by: Sasha Levin --- drivers/pci/setup-bus.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/pci/setup-bus.c b/drivers/pci/setup-bus.c index 8c1ad20a21ec..3ce68adda9b7 100644 --- a/drivers/pci/setup-bus.c +++ b/drivers/pci/setup-bus.c @@ -806,11 +806,9 @@ static resource_size_t calculate_iosize(resource_size_t size, size = (size & 0xff) + ((size & ~0xffUL) << 2); #endif size = size + size1; - if (size < old_size) - size = old_size; - size = ALIGN(max(size, add_size) + children_add_size, align); - return size; + size = max(size, add_size) + children_add_size; + return ALIGN(max(size, old_size), align); } static resource_size_t calculate_memsize(resource_size_t size, From 23ecfd538338aac8583d76265208cbad65f7db9a Mon Sep 17 00:00:00 2001 From: Xiaofei Tan Date: Wed, 12 Feb 2025 14:34:08 +0800 Subject: [PATCH 358/529] ACPI: HED: Always initialize before evged [ Upstream commit cccf6ee090c8c133072d5d5b52ae25f3bc907a16 ] When the HED driver is built-in, it initializes after evged because they both are at the same initcall level, so the initialization ordering depends on the Makefile order. However, this prevents RAS records coming in between the evged driver initialization and the HED driver initialization from being handled. If the number of such RAS records is above the APEI HEST error source number, the HEST resources may be exhausted, and that may affect subsequent RAS error reporting. To fix this issue, change the initcall level of HED to subsys_initcall and prevent the driver from being built as a module by changing ACPI_HED in Kconfig from "tristate" to "bool". Signed-off-by: Xiaofei Tan Link: https://patch.msgid.link/20250212063408.927666-1-tanxiaofei@huawei.com [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/acpi/Kconfig | 2 +- drivers/acpi/hed.c | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index 473241b5193f..596e96d3b3bd 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -438,7 +438,7 @@ config ACPI_SBS the modules will be called sbs and sbshc. config ACPI_HED - tristate "Hardware Error Device" + bool "Hardware Error Device" help This driver supports the Hardware Error Device (PNP0C33), which is used to report some hardware errors notified via diff --git a/drivers/acpi/hed.c b/drivers/acpi/hed.c index 60a2939cde6c..e8e9b1ac06b8 100644 --- a/drivers/acpi/hed.c +++ b/drivers/acpi/hed.c @@ -72,7 +72,12 @@ static struct acpi_driver acpi_hed_driver = { .notify = acpi_hed_notify, }, }; -module_acpi_driver(acpi_hed_driver); + +static int __init acpi_hed_driver_init(void) +{ + return acpi_bus_register_driver(&acpi_hed_driver); +} +subsys_initcall(acpi_hed_driver_init); MODULE_AUTHOR("Huang Ying"); MODULE_DESCRIPTION("ACPI Hardware Error Device Driver"); From 79dbd04fcca688333a7327f57084370c5f453e03 Mon Sep 17 00:00:00 2001 From: Petr Machata Date: Fri, 14 Feb 2025 17:18:21 +0100 Subject: [PATCH 359/529] vxlan: Join / leave MC group after remote changes [ Upstream commit d42d543368343c0449a4e433b5f02e063a86209c ] When a vxlan netdevice is brought up, if its default remote is a multicast address, the device joins the indicated group. Therefore when the multicast remote address changes, the device should leave the current group and subscribe to the new one. Similarly when the interface used for endpoint communication is changed in a situation when multicast remote is configured. This is currently not done. Both vxlan_igmp_join() and vxlan_igmp_leave() can however fail. So it is possible that with such fix, the netdevice will end up in an inconsistent situation where the old group is not joined anymore, but joining the new group fails. Should we join the new group first, and leave the old one second, we might end up in the opposite situation, where both groups are joined. Undoing any of this during rollback is going to be similarly problematic. One solution would be to just forbid the change when the netdevice is up. However in vnifilter mode, changing the group address is allowed, and these problems are simply ignored (see vxlan_vni_update_group()): # ip link add name br up type bridge vlan_filtering 1 # ip link add vx1 up master br type vxlan external vnifilter local 192.0.2.1 dev lo dstport 4789 # bridge vni add dev vx1 vni 200 group 224.0.0.1 # tcpdump -i lo & # bridge vni add dev vx1 vni 200 group 224.0.0.2 18:55:46.523438 IP 0.0.0.0 > 224.0.0.22: igmp v3 report, 1 group record(s) 18:55:46.943447 IP 0.0.0.0 > 224.0.0.22: igmp v3 report, 1 group record(s) # bridge vni dev vni group/remote vx1 200 224.0.0.2 Having two different modes of operation for conceptually the same interface is silly, so in this patch, just do what the vnifilter code does and deal with the errors by crossing fingers real hard. The vnifilter code leaves old before joining new, and in case of join / leave failures does not roll back the configuration changes that have already been applied, but bails out of joining if it could not leave. Do the same here: leave before join, apply changes unconditionally and do not attempt to join if we couldn't leave. Signed-off-by: Petr Machata Reviewed-by: Ido Schimmel Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/vxlan/vxlan_core.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 50be5a3c4779..0afd7eb976e6 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -4231,6 +4231,7 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], struct netlink_ext_ack *extack) { struct vxlan_dev *vxlan = netdev_priv(dev); + bool rem_ip_changed, change_igmp; struct net_device *lowerdev; struct vxlan_config conf; struct vxlan_rdst *dst; @@ -4254,8 +4255,13 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], if (err) return err; + rem_ip_changed = !vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip); + change_igmp = vxlan->dev->flags & IFF_UP && + (rem_ip_changed || + dst->remote_ifindex != conf.remote_ifindex); + /* handle default dst entry */ - if (!vxlan_addr_equal(&conf.remote_ip, &dst->remote_ip)) { + if (rem_ip_changed) { u32 hash_index = fdb_head_index(vxlan, all_zeros_mac, conf.vni); spin_lock_bh(&vxlan->hash_lock[hash_index]); @@ -4299,6 +4305,9 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], } } + if (change_igmp && vxlan_addr_multicast(&dst->remote_ip)) + err = vxlan_multicast_leave(vxlan); + if (conf.age_interval != vxlan->cfg.age_interval) mod_timer(&vxlan->age_timer, jiffies); @@ -4306,7 +4315,12 @@ static int vxlan_changelink(struct net_device *dev, struct nlattr *tb[], if (lowerdev && lowerdev != dst->remote_dev) dst->remote_dev = lowerdev; vxlan_config_apply(dev, &conf, lowerdev, vxlan->net, true); - return 0; + + if (!err && change_igmp && + vxlan_addr_multicast(&dst->remote_ip)) + err = vxlan_multicast_join(vxlan); + + return err; } static void vxlan_dellink(struct net_device *dev, struct list_head *head) From 4d142115a96016fa039514b3bedee03897130d7c Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Mon, 9 Dec 2024 16:00:16 +0100 Subject: [PATCH 360/529] media: test-drivers: vivid: don't call schedule in loop [ Upstream commit e4740118b752005cbed339aec9a1d1c43620e0b9 ] Artem reported that the CPU load was 100% when capturing from vivid at low resolution with ffmpeg. This was caused by: while (time_is_after_jiffies(cur_jiffies + wait_jiffies) && !kthread_should_stop()) schedule(); If there are no other processes running that can be scheduled, then this is basically a busy-loop. Change it to wait_event_interruptible_timeout() which doesn't have that problem. Signed-off-by: Hans Verkuil Reported-by: Artem S. Tashkinov Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219570 Reviewed-by: Nicolas Dufresne Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin --- drivers/media/test-drivers/vivid/vivid-kthread-cap.c | 11 ++++++++--- drivers/media/test-drivers/vivid/vivid-kthread-out.c | 11 ++++++++--- .../media/test-drivers/vivid/vivid-kthread-touch.c | 11 ++++++++--- drivers/media/test-drivers/vivid/vivid-sdr-cap.c | 11 ++++++++--- 4 files changed, 32 insertions(+), 12 deletions(-) diff --git a/drivers/media/test-drivers/vivid/vivid-kthread-cap.c b/drivers/media/test-drivers/vivid/vivid-kthread-cap.c index 690daada7db4..54e6a6772035 100644 --- a/drivers/media/test-drivers/vivid/vivid-kthread-cap.c +++ b/drivers/media/test-drivers/vivid/vivid-kthread-cap.c @@ -894,9 +894,14 @@ static int vivid_thread_vid_cap(void *data) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; - while (time_is_after_jiffies(cur_jiffies + wait_jiffies) && - !kthread_should_stop()) - schedule(); + if (!time_is_after_jiffies(cur_jiffies + wait_jiffies)) + continue; + + wait_queue_head_t wait; + + init_waitqueue_head(&wait); + wait_event_interruptible_timeout(wait, kthread_should_stop(), + cur_jiffies + wait_jiffies - jiffies); } dprintk(dev, 1, "Video Capture Thread End\n"); return 0; diff --git a/drivers/media/test-drivers/vivid/vivid-kthread-out.c b/drivers/media/test-drivers/vivid/vivid-kthread-out.c index 0833e021bb11..8a17a01e6e42 100644 --- a/drivers/media/test-drivers/vivid/vivid-kthread-out.c +++ b/drivers/media/test-drivers/vivid/vivid-kthread-out.c @@ -235,9 +235,14 @@ static int vivid_thread_vid_out(void *data) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; - while (time_is_after_jiffies(cur_jiffies + wait_jiffies) && - !kthread_should_stop()) - schedule(); + if (!time_is_after_jiffies(cur_jiffies + wait_jiffies)) + continue; + + wait_queue_head_t wait; + + init_waitqueue_head(&wait); + wait_event_interruptible_timeout(wait, kthread_should_stop(), + cur_jiffies + wait_jiffies - jiffies); } dprintk(dev, 1, "Video Output Thread End\n"); return 0; diff --git a/drivers/media/test-drivers/vivid/vivid-kthread-touch.c b/drivers/media/test-drivers/vivid/vivid-kthread-touch.c index fa711ee36a3f..c862689786b6 100644 --- a/drivers/media/test-drivers/vivid/vivid-kthread-touch.c +++ b/drivers/media/test-drivers/vivid/vivid-kthread-touch.c @@ -135,9 +135,14 @@ static int vivid_thread_touch_cap(void *data) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; - while (time_is_after_jiffies(cur_jiffies + wait_jiffies) && - !kthread_should_stop()) - schedule(); + if (!time_is_after_jiffies(cur_jiffies + wait_jiffies)) + continue; + + wait_queue_head_t wait; + + init_waitqueue_head(&wait); + wait_event_interruptible_timeout(wait, kthread_should_stop(), + cur_jiffies + wait_jiffies - jiffies); } dprintk(dev, 1, "Touch Capture Thread End\n"); return 0; diff --git a/drivers/media/test-drivers/vivid/vivid-sdr-cap.c b/drivers/media/test-drivers/vivid/vivid-sdr-cap.c index 0ae5628b86c9..abccd1d0109e 100644 --- a/drivers/media/test-drivers/vivid/vivid-sdr-cap.c +++ b/drivers/media/test-drivers/vivid/vivid-sdr-cap.c @@ -206,9 +206,14 @@ static int vivid_thread_sdr_cap(void *data) next_jiffies_since_start = jiffies_since_start; wait_jiffies = next_jiffies_since_start - jiffies_since_start; - while (time_is_after_jiffies(cur_jiffies + wait_jiffies) && - !kthread_should_stop()) - schedule(); + if (!time_is_after_jiffies(cur_jiffies + wait_jiffies)) + continue; + + wait_queue_head_t wait; + + init_waitqueue_head(&wait); + wait_event_interruptible_timeout(wait, kthread_should_stop(), + cur_jiffies + wait_jiffies - jiffies); } dprintk(dev, 1, "SDR Capture Thread End\n"); return 0; From 89acf46b9ec1a69b32407ded84388a3052e3549d Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Thu, 13 Feb 2025 11:46:40 +0200 Subject: [PATCH 361/529] net/mlx5: Modify LSB bitmask in temperature event to include only the first bit [ Upstream commit 633f16d7e07c129a36b882c05379e01ce5bdb542 ] In the sensor_count field of the MTEWE register, bits 1-62 are supported only for unmanaged switches, not for NICs, and bit 63 is reserved for internal use. To prevent confusing output that may include set bits that are not relevant to NIC sensors, we update the bitmask to retain only the first bit, which corresponds to the sensor ASIC. Signed-off-by: Shahar Shitrit Signed-off-by: Tariq Toukan Reviewed-by: Mateusz Polchlopek Link: https://patch.msgid.link/20250213094641.226501-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/events.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c index 9459e56ee90a..68b92927c74e 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/events.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c @@ -163,6 +163,10 @@ static int temp_warn(struct notifier_block *nb, unsigned long type, void *data) u64 value_msb; value_lsb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_lsb); + /* bit 1-63 are not supported for NICs, + * hence read only bit 0 (asic) from lsb. + */ + value_lsb &= 0x1; value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb); mlx5_core_warn(events->dev, From 963bac7f04f6bee673d1c0cadca3e95b03451d1c Mon Sep 17 00:00:00 2001 From: Shahar Shitrit Date: Thu, 13 Feb 2025 11:46:38 +0200 Subject: [PATCH 362/529] net/mlx5: Apply rate-limiting to high temperature warning [ Upstream commit 9dd3d5d258aceb37bdf09c8b91fa448f58ea81f0 ] Wrap the high temperature warning in a temperature event with a call to net_ratelimit() to prevent flooding the kernel log with repeated warning messages when temperature exceeds the threshold multiple times within a short duration. Signed-off-by: Shahar Shitrit Signed-off-by: Tariq Toukan Reviewed-by: Mateusz Polchlopek Link: https://patch.msgid.link/20250213094641.226501-2-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/events.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/events.c b/drivers/net/ethernet/mellanox/mlx5/core/events.c index 68b92927c74e..6aa96d33c210 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/events.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/events.c @@ -169,9 +169,10 @@ static int temp_warn(struct notifier_block *nb, unsigned long type, void *data) value_lsb &= 0x1; value_msb = be64_to_cpu(eqe->data.temp_warning.sensor_warning_msb); - mlx5_core_warn(events->dev, - "High temperature on sensors with bit set %llx %llx", - value_msb, value_lsb); + if (net_ratelimit()) + mlx5_core_warn(events->dev, + "High temperature on sensors with bit set %llx %llx", + value_msb, value_lsb); return NOTIFY_OK; } From d9f7ac25ff67bd3989a2adda12554baa2309e560 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Martin=20Povi=C5=A1er?= Date: Sat, 8 Feb 2025 00:57:22 +0000 Subject: [PATCH 363/529] ASoC: ops: Enforce platform maximum on initial value MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 783db6851c1821d8b983ffb12b99c279ff64f2ee ] Lower the volume if it is violating the platform maximum at its initial value (i.e. at the time of the 'snd_soc_limit_volume' call). Signed-off-by: Martin Povišer [Cherry picked from the Asahi kernel with fixups -- broonie] Signed-off-by: Mark Brown Link: https://patch.msgid.link/20250208-asoc-volume-limit-v1-1-b98fcf4cdbad@kernel.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/soc-ops.c | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/sound/soc/soc-ops.c b/sound/soc/soc-ops.c index b4cfc34d00ee..eff1355cc3df 100644 --- a/sound/soc/soc-ops.c +++ b/sound/soc/soc-ops.c @@ -638,6 +638,33 @@ int snd_soc_get_volsw_range(struct snd_kcontrol *kcontrol, } EXPORT_SYMBOL_GPL(snd_soc_get_volsw_range); +static int snd_soc_clip_to_platform_max(struct snd_kcontrol *kctl) +{ + struct soc_mixer_control *mc = (struct soc_mixer_control *)kctl->private_value; + struct snd_ctl_elem_value uctl; + int ret; + + if (!mc->platform_max) + return 0; + + ret = kctl->get(kctl, &uctl); + if (ret < 0) + return ret; + + if (uctl.value.integer.value[0] > mc->platform_max) + uctl.value.integer.value[0] = mc->platform_max; + + if (snd_soc_volsw_is_stereo(mc) && + uctl.value.integer.value[1] > mc->platform_max) + uctl.value.integer.value[1] = mc->platform_max; + + ret = kctl->put(kctl, &uctl); + if (ret < 0) + return ret; + + return 0; +} + /** * snd_soc_limit_volume - Set new limit to an existing volume control. * @@ -662,7 +689,7 @@ int snd_soc_limit_volume(struct snd_soc_card *card, struct soc_mixer_control *mc = (struct soc_mixer_control *)kctl->private_value; if (max <= mc->max - mc->min) { mc->platform_max = max; - ret = 0; + ret = snd_soc_clip_to_platform_max(kctl); } } return ret; From 051f0ee928f5ef63da7034b51cf5a32708d9439e Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Sat, 8 Feb 2025 01:03:27 +0000 Subject: [PATCH 364/529] ASoC: tas2764: Add reg defaults for TAS2764_INT_CLK_CFG [ Upstream commit d64c4c3d1c578f98d70db1c5e2535b47adce9d07 ] Signed-off-by: Hector Martin Signed-off-by: Mark Brown Link: https://patch.msgid.link/20250208-asoc-tas2764-v1-4-dbab892a69b5@kernel.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/tas2764.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/tas2764.c b/sound/soc/codecs/tas2764.c index fc8479d3d285..72db361ac361 100644 --- a/sound/soc/codecs/tas2764.c +++ b/sound/soc/codecs/tas2764.c @@ -636,6 +636,7 @@ static const struct reg_default tas2764_reg_defaults[] = { { TAS2764_TDM_CFG2, 0x0a }, { TAS2764_TDM_CFG3, 0x10 }, { TAS2764_TDM_CFG5, 0x42 }, + { TAS2764_INT_CLK_CFG, 0x19 }, }; static const struct regmap_range_cfg tas2764_regmap_ranges[] = { From ec3a91e22b182f4a02edf715eba5290826e832fe Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Sat, 8 Feb 2025 01:03:26 +0000 Subject: [PATCH 365/529] ASoC: tas2764: Mark SW_RESET as volatile [ Upstream commit f37f1748564ac51d32f7588bd7bfc99913ccab8e ] Since the bit is self-clearing. Signed-off-by: Hector Martin Signed-off-by: Mark Brown Link: https://patch.msgid.link/20250208-asoc-tas2764-v1-3-dbab892a69b5@kernel.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/tas2764.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/soc/codecs/tas2764.c b/sound/soc/codecs/tas2764.c index 72db361ac361..94428487a885 100644 --- a/sound/soc/codecs/tas2764.c +++ b/sound/soc/codecs/tas2764.c @@ -654,6 +654,7 @@ static const struct regmap_range_cfg tas2764_regmap_ranges[] = { static bool tas2764_volatile_register(struct device *dev, unsigned int reg) { switch (reg) { + case TAS2764_SW_RST: case TAS2764_INT_LTCH0 ... TAS2764_INT_LTCH4: case TAS2764_INT_CLK_CFG: return true; From 4ffaac5e218cc68873e6088e4813af183c55c5f2 Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Sat, 8 Feb 2025 01:03:24 +0000 Subject: [PATCH 366/529] ASoC: tas2764: Power up/down amp on mute ops [ Upstream commit 1c3b5f37409682184669457a5bdf761268eafbe5 ] The ASoC convention is that clocks are removed after codec mute, and power up/down is more about top level power management. For these chips, the "mute" state still expects a TDM clock, and yanking the clock in this state will trigger clock errors. So, do the full shutdown<->mute<->active transition on the mute operation, so the amp is in software shutdown by the time the clocks are removed. This fixes TDM clock errors when streams are stopped. Signed-off-by: Hector Martin Signed-off-by: Mark Brown Link: https://patch.msgid.link/20250208-asoc-tas2764-v1-1-dbab892a69b5@kernel.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/tas2764.c | 51 ++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 30 deletions(-) diff --git a/sound/soc/codecs/tas2764.c b/sound/soc/codecs/tas2764.c index 94428487a885..10f0f07b90ff 100644 --- a/sound/soc/codecs/tas2764.c +++ b/sound/soc/codecs/tas2764.c @@ -182,33 +182,6 @@ static SOC_ENUM_SINGLE_DECL( static const struct snd_kcontrol_new tas2764_asi1_mux = SOC_DAPM_ENUM("ASI1 Source", tas2764_ASI1_src_enum); -static int tas2764_dac_event(struct snd_soc_dapm_widget *w, - struct snd_kcontrol *kcontrol, int event) -{ - struct snd_soc_component *component = snd_soc_dapm_to_component(w->dapm); - struct tas2764_priv *tas2764 = snd_soc_component_get_drvdata(component); - int ret; - - switch (event) { - case SND_SOC_DAPM_POST_PMU: - tas2764->dac_powered = true; - ret = tas2764_update_pwr_ctrl(tas2764); - break; - case SND_SOC_DAPM_PRE_PMD: - tas2764->dac_powered = false; - ret = tas2764_update_pwr_ctrl(tas2764); - break; - default: - dev_err(tas2764->dev, "Unsupported event\n"); - return -EINVAL; - } - - if (ret < 0) - return ret; - - return 0; -} - static const struct snd_kcontrol_new isense_switch = SOC_DAPM_SINGLE("Switch", TAS2764_PWR_CTRL, TAS2764_ISENSE_POWER_EN, 1, 1); static const struct snd_kcontrol_new vsense_switch = @@ -221,8 +194,7 @@ static const struct snd_soc_dapm_widget tas2764_dapm_widgets[] = { 1, &isense_switch), SND_SOC_DAPM_SWITCH("VSENSE", TAS2764_PWR_CTRL, TAS2764_VSENSE_POWER_EN, 1, &vsense_switch), - SND_SOC_DAPM_DAC_E("DAC", NULL, SND_SOC_NOPM, 0, 0, tas2764_dac_event, - SND_SOC_DAPM_POST_PMU | SND_SOC_DAPM_PRE_PMD), + SND_SOC_DAPM_DAC("DAC", NULL, SND_SOC_NOPM, 0, 0), SND_SOC_DAPM_OUTPUT("OUT"), SND_SOC_DAPM_SIGGEN("VMON"), SND_SOC_DAPM_SIGGEN("IMON") @@ -243,9 +215,28 @@ static int tas2764_mute(struct snd_soc_dai *dai, int mute, int direction) { struct tas2764_priv *tas2764 = snd_soc_component_get_drvdata(dai->component); + int ret; + + if (!mute) { + tas2764->dac_powered = true; + ret = tas2764_update_pwr_ctrl(tas2764); + if (ret) + return ret; + } tas2764->unmuted = !mute; - return tas2764_update_pwr_ctrl(tas2764); + ret = tas2764_update_pwr_ctrl(tas2764); + if (ret) + return ret; + + if (mute) { + tas2764->dac_powered = false; + ret = tas2764_update_pwr_ctrl(tas2764); + if (ret) + return ret; + } + + return 0; } static int tas2764_set_bitwidth(struct tas2764_priv *tas2764, int bitwidth) From 709b1fb22328d847dc6c5ddf92d6f60add8185ba Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 12 Feb 2025 02:24:38 +0000 Subject: [PATCH 367/529] ASoC: soc-dai: check return value at snd_soc_dai_set_tdm_slot() [ Upstream commit 7f1186a8d738661b941b298fd6d1d5725ed71428 ] snd_soc_dai_set_tdm_slot() calls .xlate_tdm_slot_mask() or snd_soc_xlate_tdm_slot_mask(), but didn't check its return value. Let's check it. This patch might break existing driver. In such case, let's makes each func to void instead of int. Signed-off-by: Kuninori Morimoto Link: https://patch.msgid.link/87o6z7yk61.wl-kuninori.morimoto.gx@renesas.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/soc-dai.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/sound/soc/soc-dai.c b/sound/soc/soc-dai.c index 49752af0e205..ba38b6e6b264 100644 --- a/sound/soc/soc-dai.c +++ b/sound/soc/soc-dai.c @@ -270,10 +270,11 @@ int snd_soc_dai_set_tdm_slot(struct snd_soc_dai *dai, if (dai->driver->ops && dai->driver->ops->xlate_tdm_slot_mask) - dai->driver->ops->xlate_tdm_slot_mask(slots, - &tx_mask, &rx_mask); + ret = dai->driver->ops->xlate_tdm_slot_mask(slots, &tx_mask, &rx_mask); else - snd_soc_xlate_tdm_slot_mask(slots, &tx_mask, &rx_mask); + ret = snd_soc_xlate_tdm_slot_mask(slots, &tx_mask, &rx_mask); + if (ret) + goto err; dai->tx_mask = tx_mask; dai->rx_mask = rx_mask; @@ -282,6 +283,7 @@ int snd_soc_dai_set_tdm_slot(struct snd_soc_dai *dai, dai->driver->ops->set_tdm_slot) ret = dai->driver->ops->set_tdm_slot(dai, tx_mask, rx_mask, slots, slot_width); +err: return soc_dai_ret(dai, ret); } EXPORT_SYMBOL_GPL(snd_soc_dai_set_tdm_slot); From b23976c6f9a6efe488660fd39f47cb08813e4462 Mon Sep 17 00:00:00 2001 From: Valentin Caron Date: Thu, 16 Jan 2025 18:00:09 +0100 Subject: [PATCH 368/529] pinctrl: devicetree: do not goto err when probing hogs in pinctrl_dt_to_map [ Upstream commit c98868e816209e568c9d72023ba0bc1e4d96e611 ] Cross case in pinctrl framework make impossible to an hogged pin and another, not hogged, used within the same device-tree node. For example with this simplified device-tree : &pinctrl { pinctrl_pin_1: pinctrl-pin-1 { pins = "dummy-pinctrl-pin"; }; }; &rtc { pinctrl-names = "default" pinctrl-0 = <&pinctrl_pin_1 &rtc_pin_1> rtc_pin_1: rtc-pin-1 { pins = "dummy-rtc-pin"; }; }; "pinctrl_pin_1" configuration is never set. This produces this path in the code: really_probe() pinctrl_bind_pins() | devm_pinctrl_get() | pinctrl_get() | create_pinctrl() | pinctrl_dt_to_map() | // Hog pin create an abort for all pins of the node | ret = dt_to_map_one_config() | | /* Do not defer probing of hogs (circular loop) */ | | if (np_pctldev == p->dev->of_node) | | return -ENODEV; | if (ret) | goto err | call_driver_probe() stm32_rtc_probe() pinctrl_enable() pinctrl_claim_hogs() create_pinctrl() for_each_maps(maps_node, i, map) // Not hog pin is skipped if (pctldev && strcmp(dev_name(pctldev->dev), map->ctrl_dev_name)) continue; At the first call of create_pinctrl() the hogged pin produces an abort to avoid a defer of hogged pins. All other pin configurations are trashed. At the second call, create_pinctrl is now called with pctldev parameter to get hogs, but in this context only hogs are set. And other pins are skipped. To handle this, do not produce an abort in the first call of create_pinctrl(). Classic pin configuration will be set in pinctrl_bind_pins() context. And the hogged pin configuration will be set in pinctrl_claim_hogs() context. Signed-off-by: Valentin Caron Link: https://lore.kernel.org/20250116170009.2075544-1-valentin.caron@foss.st.com Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin --- drivers/pinctrl/devicetree.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/pinctrl/devicetree.c b/drivers/pinctrl/devicetree.c index 5ee746cb81f5..6520b88db110 100644 --- a/drivers/pinctrl/devicetree.c +++ b/drivers/pinctrl/devicetree.c @@ -143,10 +143,14 @@ static int dt_to_map_one_config(struct pinctrl *p, pctldev = get_pinctrl_dev_from_of_node(np_pctldev); if (pctldev) break; - /* Do not defer probing of hogs (circular loop) */ + /* + * Do not defer probing of hogs (circular loop) + * + * Return 1 to let the caller catch the case. + */ if (np_pctldev == p->dev->of_node) { of_node_put(np_pctldev); - return -ENODEV; + return 1; } } of_node_put(np_pctldev); @@ -265,6 +269,8 @@ int pinctrl_dt_to_map(struct pinctrl *p, struct pinctrl_dev *pctldev) ret = dt_to_map_one_config(p, pctldev, statename, np_config); of_node_put(np_config); + if (ret == 1) + continue; if (ret < 0) goto err; } From 5d08c89c2a0281061e2cdc94ec5a5b56805912a1 Mon Sep 17 00:00:00 2001 From: Konstantin Andreev Date: Fri, 17 Jan 2025 02:40:34 +0300 Subject: [PATCH 369/529] smack: recognize ipv4 CIPSO w/o categories [ Upstream commit a158a937d864d0034fea14913c1f09c6d5f574b8 ] If SMACK label has CIPSO representation w/o categories, e.g.: | # cat /smack/cipso2 | foo 10 | @ 250/2 | ... then SMACK does not recognize such CIPSO in input ipv4 packets and substitues '*' label instead. Audit records may look like | lsm=SMACK fn=smack_socket_sock_rcv_skb action=denied | subject="*" object="_" requested=w pid=0 comm="swapper/1" ... This happens in two steps: 1) security/smack/smackfs.c`smk_set_cipso does not clear NETLBL_SECATTR_MLS_CAT from (struct smack_known *)skp->smk_netlabel.flags on assigning CIPSO w/o categories: | rcu_assign_pointer(skp->smk_netlabel.attr.mls.cat, ncats.attr.mls.cat); | skp->smk_netlabel.attr.mls.lvl = ncats.attr.mls.lvl; 2) security/smack/smack_lsm.c`smack_from_secattr can not match skp->smk_netlabel with input packet's struct netlbl_lsm_secattr *sap because sap->flags have not NETLBL_SECATTR_MLS_CAT (what is correct) but skp->smk_netlabel.flags have (what is incorrect): | if ((sap->flags & NETLBL_SECATTR_MLS_CAT) == 0) { | if ((skp->smk_netlabel.flags & | NETLBL_SECATTR_MLS_CAT) == 0) | found = 1; | break; | } This commit sets/clears NETLBL_SECATTR_MLS_CAT in skp->smk_netlabel.flags according to the presense of CIPSO categories. The update of smk_netlabel is not atomic, so input packets processing still may be incorrect during short time while update proceeds. Signed-off-by: Konstantin Andreev Signed-off-by: Casey Schaufler Signed-off-by: Sasha Levin --- security/smack/smackfs.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index d955f3dcb3a5..9dca3672d82b 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -922,6 +922,10 @@ static ssize_t smk_set_cipso(struct file *file, const char __user *buf, if (rc >= 0) { old_cat = skp->smk_netlabel.attr.mls.cat; rcu_assign_pointer(skp->smk_netlabel.attr.mls.cat, ncats.attr.mls.cat); + if (ncats.attr.mls.cat) + skp->smk_netlabel.flags |= NETLBL_SECATTR_MLS_CAT; + else + skp->smk_netlabel.flags &= ~(u32)NETLBL_SECATTR_MLS_CAT; skp->smk_netlabel.attr.mls.lvl = ncats.attr.mls.lvl; synchronize_rcu(); netlbl_catmap_free(old_cat); From 0d52c6168dec044babb53eaad0325be430637d27 Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Fri, 24 Jan 2025 11:01:42 +0000 Subject: [PATCH 370/529] kunit: tool: Use qboot on QEMU x86_64 [ Upstream commit 08fafac4c9f289a9d9a22d838921e4b3eb22c664 ] As noted in [0], SeaBIOS (QEMU default) makes a mess of the terminal, qboot does not. It turns out this is actually useful with kunit.py, since the user is exposed to this issue if they set --raw_output=all. qboot is also faster than SeaBIOS, but it's is marginal for this usecase. [0] https://lore.kernel.org/all/CA+i-1C0wYb-gZ8Mwh3WSVpbk-LF-Uo+njVbASJPe1WXDURoV7A@mail.gmail.com/ Both SeaBIOS and qboot are x86-specific. Link: https://lore.kernel.org/r/20250124-kunit-qboot-v1-1-815e4d4c6f7c@google.com Signed-off-by: Brendan Jackman Reviewed-by: David Gow Signed-off-by: Shuah Khan Signed-off-by: Sasha Levin --- tools/testing/kunit/qemu_configs/x86_64.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/testing/kunit/qemu_configs/x86_64.py b/tools/testing/kunit/qemu_configs/x86_64.py index dc7949076863..4a6bf4e048f5 100644 --- a/tools/testing/kunit/qemu_configs/x86_64.py +++ b/tools/testing/kunit/qemu_configs/x86_64.py @@ -7,4 +7,6 @@ CONFIG_SERIAL_8250_CONSOLE=y''', qemu_arch='x86_64', kernel_path='arch/x86/boot/bzImage', kernel_command_line='console=ttyS0', - extra_qemu_params=[]) + # qboot is faster than SeaBIOS and doesn't mess up + # the terminal. + extra_qemu_params=['-bios', 'qboot.rom']) From a6e871c9125d8654105bb521c52b709f7ad5e77f Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 10 Feb 2025 09:45:05 -0800 Subject: [PATCH 371/529] net/mlx4_core: Avoid impossible mlx4_db_alloc() order value [ Upstream commit 4a6f18f28627e121bd1f74b5fcc9f945d6dbeb1e ] GCC can see that the value range for "order" is capped, but this leads it to consider that it might be negative, leading to a false positive warning (with GCC 15 with -Warray-bounds -fdiagnostics-details): ../drivers/net/ethernet/mellanox/mlx4/alloc.c:691:47: error: array subscript -1 is below array bounds of 'long unsigned int *[2]' [-Werror=array-bounds=] 691 | i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o); | ~~~~~~~~~~~^~~ 'mlx4_alloc_db_from_pgdir': events 1-2 691 | i = find_first_bit(pgdir->bits[o], MLX4_DB_PER_PAGE >> o); | ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | | | | | (2) out of array bounds here | (1) when the condition is evaluated to true In file included from ../drivers/net/ethernet/mellanox/mlx4/mlx4.h:53, from ../drivers/net/ethernet/mellanox/mlx4/alloc.c:42: ../include/linux/mlx4/device.h:664:33: note: while referencing 'bits' 664 | unsigned long *bits[2]; | ^~~~ Switch the argument to unsigned int, which removes the compiler needing to consider negative values. Signed-off-by: Kees Cook Link: https://patch.msgid.link/20250210174504.work.075-kees@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx4/alloc.c | 6 +++--- include/linux/mlx4/device.h | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/alloc.c b/drivers/net/ethernet/mellanox/mlx4/alloc.c index b330020dc0d6..f2bded847e61 100644 --- a/drivers/net/ethernet/mellanox/mlx4/alloc.c +++ b/drivers/net/ethernet/mellanox/mlx4/alloc.c @@ -682,9 +682,9 @@ static struct mlx4_db_pgdir *mlx4_alloc_db_pgdir(struct device *dma_device) } static int mlx4_alloc_db_from_pgdir(struct mlx4_db_pgdir *pgdir, - struct mlx4_db *db, int order) + struct mlx4_db *db, unsigned int order) { - int o; + unsigned int o; int i; for (o = order; o <= 1; ++o) { @@ -712,7 +712,7 @@ found: return 0; } -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order) +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, unsigned int order) { struct mlx4_priv *priv = mlx4_priv(dev); struct mlx4_db_pgdir *pgdir; diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index 6646634a0b9d..0cb296f0f8d1 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -1115,7 +1115,7 @@ int mlx4_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, int mlx4_buf_write_mtt(struct mlx4_dev *dev, struct mlx4_mtt *mtt, struct mlx4_buf *buf); -int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, int order); +int mlx4_db_alloc(struct mlx4_dev *dev, struct mlx4_db *db, unsigned int order); void mlx4_db_free(struct mlx4_dev *dev, struct mlx4_db *db); int mlx4_alloc_hwq_res(struct mlx4_dev *dev, struct mlx4_hwq_resources *wqres, From 5e3ee618d6f1065deb173df8902bbe49bdaaf5ba Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 12 Feb 2025 21:01:35 +0100 Subject: [PATCH 372/529] clk: qcom: clk-alpha-pll: Do not use random stack value for recalc rate [ Upstream commit 7a243e1b814a02ab40793026ef64223155d86395 ] If regmap_read() fails, random stack value was used in calculating new frequency in recalc_rate() callbacks. Such failure is really not expected as these are all MMIO reads, however code should be here correct and bail out. This also avoids possible warning on uninitialized value. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20250212-b4-clk-qcom-clean-v3-1-499f37444f5d@linaro.org Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- drivers/clk/qcom/clk-alpha-pll.c | 52 ++++++++++++++++++++++---------- 1 file changed, 36 insertions(+), 16 deletions(-) diff --git a/drivers/clk/qcom/clk-alpha-pll.c b/drivers/clk/qcom/clk-alpha-pll.c index e63a90db1505..c591fa1ad802 100644 --- a/drivers/clk/qcom/clk-alpha-pll.c +++ b/drivers/clk/qcom/clk-alpha-pll.c @@ -561,14 +561,19 @@ clk_alpha_pll_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) struct clk_alpha_pll *pll = to_clk_alpha_pll(hw); u32 alpha_width = pll_alpha_width(pll); - regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l); + if (regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l)) + return 0; + + if (regmap_read(pll->clkr.regmap, PLL_USER_CTL(pll), &ctl)) + return 0; - regmap_read(pll->clkr.regmap, PLL_USER_CTL(pll), &ctl); if (ctl & PLL_ALPHA_EN) { - regmap_read(pll->clkr.regmap, PLL_ALPHA_VAL(pll), &low); + if (regmap_read(pll->clkr.regmap, PLL_ALPHA_VAL(pll), &low)) + return 0; if (alpha_width > 32) { - regmap_read(pll->clkr.regmap, PLL_ALPHA_VAL_U(pll), - &high); + if (regmap_read(pll->clkr.regmap, PLL_ALPHA_VAL_U(pll), + &high)) + return 0; a = (u64)high << 32 | low; } else { a = low & GENMASK(alpha_width - 1, 0); @@ -760,8 +765,11 @@ alpha_pll_huayra_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) struct clk_alpha_pll *pll = to_clk_alpha_pll(hw); u32 l, alpha = 0, ctl, alpha_m, alpha_n; - regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l); - regmap_read(pll->clkr.regmap, PLL_USER_CTL(pll), &ctl); + if (regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l)) + return 0; + + if (regmap_read(pll->clkr.regmap, PLL_USER_CTL(pll), &ctl)) + return 0; if (ctl & PLL_ALPHA_EN) { regmap_read(pll->clkr.regmap, PLL_ALPHA_VAL(pll), &alpha); @@ -955,8 +963,11 @@ clk_trion_pll_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) struct clk_alpha_pll *pll = to_clk_alpha_pll(hw); u32 l, frac, alpha_width = pll_alpha_width(pll); - regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l); - regmap_read(pll->clkr.regmap, PLL_ALPHA_VAL(pll), &frac); + if (regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l)) + return 0; + + if (regmap_read(pll->clkr.regmap, PLL_ALPHA_VAL(pll), &frac)) + return 0; return alpha_pll_calc_rate(parent_rate, l, frac, alpha_width); } @@ -1014,7 +1025,8 @@ clk_alpha_pll_postdiv_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) struct clk_alpha_pll_postdiv *pll = to_clk_alpha_pll_postdiv(hw); u32 ctl; - regmap_read(pll->clkr.regmap, PLL_USER_CTL(pll), &ctl); + if (regmap_read(pll->clkr.regmap, PLL_USER_CTL(pll), &ctl)) + return 0; ctl >>= PLL_POST_DIV_SHIFT; ctl &= PLL_POST_DIV_MASK(pll); @@ -1230,8 +1242,11 @@ static unsigned long alpha_pll_fabia_recalc_rate(struct clk_hw *hw, struct clk_alpha_pll *pll = to_clk_alpha_pll(hw); u32 l, frac, alpha_width = pll_alpha_width(pll); - regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l); - regmap_read(pll->clkr.regmap, PLL_FRAC(pll), &frac); + if (regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l)) + return 0; + + if (regmap_read(pll->clkr.regmap, PLL_FRAC(pll), &frac)) + return 0; return alpha_pll_calc_rate(parent_rate, l, frac, alpha_width); } @@ -1381,7 +1396,8 @@ clk_trion_pll_postdiv_recalc_rate(struct clk_hw *hw, unsigned long parent_rate) struct regmap *regmap = pll->clkr.regmap; u32 i, div = 1, val; - regmap_read(regmap, PLL_USER_CTL(pll), &val); + if (regmap_read(regmap, PLL_USER_CTL(pll), &val)) + return 0; val >>= pll->post_div_shift; val &= PLL_POST_DIV_MASK(pll); @@ -2254,9 +2270,12 @@ static unsigned long alpha_pll_lucid_evo_recalc_rate(struct clk_hw *hw, struct regmap *regmap = pll->clkr.regmap; u32 l, frac; - regmap_read(regmap, PLL_L_VAL(pll), &l); + if (regmap_read(regmap, PLL_L_VAL(pll), &l)) + return 0; l &= LUCID_EVO_PLL_L_VAL_MASK; - regmap_read(regmap, PLL_ALPHA_VAL(pll), &frac); + + if (regmap_read(regmap, PLL_ALPHA_VAL(pll), &frac)) + return 0; return alpha_pll_calc_rate(parent_rate, l, frac, pll_alpha_width(pll)); } @@ -2331,7 +2350,8 @@ static unsigned long clk_rivian_evo_pll_recalc_rate(struct clk_hw *hw, struct clk_alpha_pll *pll = to_clk_alpha_pll(hw); u32 l; - regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l); + if (regmap_read(pll->clkr.regmap, PLL_L_VAL(pll), &l)) + return 0; return parent_rate * l; } From b6e568caafa0fc64111c238126b352c8a0b580eb Mon Sep 17 00:00:00 2001 From: Claudiu Beznea Date: Fri, 7 Feb 2025 13:33:13 +0200 Subject: [PATCH 373/529] serial: sh-sci: Update the suspend/resume support [ Upstream commit 22a6984c5b5df8eab864d7f3e8b94d5a554d31ab ] The Renesas RZ/G3S supports a power saving mode where power to most of the SoC components is turned off. When returning from this power saving mode, SoC components need to be re-configured. The SCIFs on the Renesas RZ/G3S need to be re-configured as well when returning from this power saving mode. The sh-sci code already configures the SCIF clocks, power domain and registers by calling uart_resume_port() in sci_resume(). On suspend path the SCIF UART ports are suspended accordingly (by calling uart_suspend_port() in sci_suspend()). The only missing setting is the reset signal. For this assert/de-assert the reset signal on driver suspend/resume. In case the no_console_suspend is specified by the user, the registers need to be saved on suspend path and restore on resume path. To do this the sci_console_save()/sci_console_restore() functions were added. There is no need to cache/restore the status or FIFO registers. Only the control registers. The registers that will be saved/restored on suspend/resume are specified by the struct sci_suspend_regs data structure. Signed-off-by: Claudiu Beznea Reviewed-by: Geert Uytterhoeven Link: https://lore.kernel.org/r/20250207113313.545432-1-claudiu.beznea.uj@bp.renesas.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/tty/serial/sh-sci.c | 71 +++++++++++++++++++++++++++++++++++-- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index e2dfca4c2eff..191136dcb94d 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -105,6 +105,15 @@ struct plat_sci_reg { u8 offset, size; }; +struct sci_suspend_regs { + u16 scsmr; + u16 scscr; + u16 scfcr; + u16 scsptr; + u8 scbrr; + u8 semr; +}; + struct sci_port_params { const struct plat_sci_reg regs[SCIx_NR_REGS]; unsigned int fifosize; @@ -135,6 +144,8 @@ struct sci_port { struct dma_chan *chan_tx; struct dma_chan *chan_rx; + struct reset_control *rstc; + #ifdef CONFIG_SERIAL_SH_SCI_DMA struct dma_chan *chan_tx_saved; struct dma_chan *chan_rx_saved; @@ -154,6 +165,7 @@ struct sci_port { int rx_trigger; struct timer_list rx_fifo_timer; int rx_fifo_timeout; + struct sci_suspend_regs suspend_regs; u16 hscif_tot; bool has_rtscts; @@ -3252,6 +3264,7 @@ static struct plat_sci_port *sci_parse_dt(struct platform_device *pdev, } sp = &sci_ports[id]; + sp->rstc = rstc; *dev_id = id; p->type = SCI_OF_TYPE(data); @@ -3400,13 +3413,57 @@ static int sci_probe(struct platform_device *dev) return 0; } +static void sci_console_save(struct sci_port *s) +{ + struct sci_suspend_regs *regs = &s->suspend_regs; + struct uart_port *port = &s->port; + + if (sci_getreg(port, SCSMR)->size) + regs->scsmr = sci_serial_in(port, SCSMR); + if (sci_getreg(port, SCSCR)->size) + regs->scscr = sci_serial_in(port, SCSCR); + if (sci_getreg(port, SCFCR)->size) + regs->scfcr = sci_serial_in(port, SCFCR); + if (sci_getreg(port, SCSPTR)->size) + regs->scsptr = sci_serial_in(port, SCSPTR); + if (sci_getreg(port, SCBRR)->size) + regs->scbrr = sci_serial_in(port, SCBRR); + if (sci_getreg(port, SEMR)->size) + regs->semr = sci_serial_in(port, SEMR); +} + +static void sci_console_restore(struct sci_port *s) +{ + struct sci_suspend_regs *regs = &s->suspend_regs; + struct uart_port *port = &s->port; + + if (sci_getreg(port, SCSMR)->size) + sci_serial_out(port, SCSMR, regs->scsmr); + if (sci_getreg(port, SCSCR)->size) + sci_serial_out(port, SCSCR, regs->scscr); + if (sci_getreg(port, SCFCR)->size) + sci_serial_out(port, SCFCR, regs->scfcr); + if (sci_getreg(port, SCSPTR)->size) + sci_serial_out(port, SCSPTR, regs->scsptr); + if (sci_getreg(port, SCBRR)->size) + sci_serial_out(port, SCBRR, regs->scbrr); + if (sci_getreg(port, SEMR)->size) + sci_serial_out(port, SEMR, regs->semr); +} + static __maybe_unused int sci_suspend(struct device *dev) { struct sci_port *sport = dev_get_drvdata(dev); - if (sport) + if (sport) { uart_suspend_port(&sci_uart_driver, &sport->port); + if (!console_suspend_enabled && uart_console(&sport->port)) + sci_console_save(sport); + else + return reset_control_assert(sport->rstc); + } + return 0; } @@ -3414,8 +3471,18 @@ static __maybe_unused int sci_resume(struct device *dev) { struct sci_port *sport = dev_get_drvdata(dev); - if (sport) + if (sport) { + if (!console_suspend_enabled && uart_console(&sport->port)) { + sci_console_restore(sport); + } else { + int ret = reset_control_deassert(sport->rstc); + + if (ret) + return ret; + } + uart_resume_port(&sci_uart_driver, &sport->port); + } return 0; } From cfdf164ef5a4ff57188f8c299c74db481a6231ba Mon Sep 17 00:00:00 2001 From: Dmitry Baryshkov Date: Sun, 9 Feb 2025 14:31:45 +0200 Subject: [PATCH 374/529] phy: core: don't require set_mode() callback for phy_get_mode() to work [ Upstream commit d58c04e305afbaa9dda7969151f06c4efe2c98b0 ] As reported by Damon Ding, the phy_get_mode() call doesn't work as expected unless the PHY driver has a .set_mode() call. This prompts PHY drivers to have empty stubs for .set_mode() for the sake of being able to get the mode. Make .set_mode() callback truly optional and update PHY's mode even if it there is none. Cc: Damon Ding Link: https://lore.kernel.org/r/96f8310f-93f1-4bcb-8637-137e1159ff83@rock-chips.com Tested-by: Damon Ding Signed-off-by: Dmitry Baryshkov Link: https://lore.kernel.org/r/20250209-phy-fix-set-moe-v2-1-76e248503856@linaro.org Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/phy/phy-core.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/phy/phy-core.c b/drivers/phy/phy-core.c index 0730fe80dc3c..069bcf49ee8f 100644 --- a/drivers/phy/phy-core.c +++ b/drivers/phy/phy-core.c @@ -398,13 +398,14 @@ EXPORT_SYMBOL_GPL(phy_power_off); int phy_set_mode_ext(struct phy *phy, enum phy_mode mode, int submode) { - int ret; + int ret = 0; - if (!phy || !phy->ops->set_mode) + if (!phy) return 0; mutex_lock(&phy->mutex); - ret = phy->ops->set_mode(phy, mode, submode); + if (phy->ops->set_mode) + ret = phy->ops->set_mode(phy, mode, submode); if (!ret) phy->attrs.mode = mode; mutex_unlock(&phy->mutex); From 81f4b82cf3b7064edce5d172394210a002fd05b8 Mon Sep 17 00:00:00 2001 From: Jiang Liu Date: Fri, 7 Feb 2025 14:28:49 +0800 Subject: [PATCH 375/529] drm/amdgpu: reset psp->cmd to NULL after releasing the buffer [ Upstream commit e92f3f94cad24154fd3baae30c6dfb918492278d ] Reset psp->cmd to NULL after releasing the buffer in function psp_sw_fini(). Reviewed-by: Lijo Lazar Signed-off-by: Jiang Liu Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index f8740ad08af4..a176b1da03bd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -484,7 +484,6 @@ static int psp_sw_fini(void *handle) { struct amdgpu_device *adev = (struct amdgpu_device *)handle; struct psp_context *psp = &adev->psp; - struct psp_gfx_cmd_resp *cmd = psp->cmd; psp_memory_training_fini(psp); if (psp->sos_fw) { @@ -511,8 +510,8 @@ static int psp_sw_fini(void *handle) adev->ip_versions[MP0_HWIP][0] == IP_VERSION(11, 0, 7)) psp_sysfs_fini(adev); - kfree(cmd); - cmd = NULL; + kfree(psp->cmd); + psp->cmd = NULL; psp_free_shared_bufs(psp); From bc40b6248a5b15576a667f531b204ff8e2091e8d Mon Sep 17 00:00:00 2001 From: Tom Chung Date: Mon, 13 Jan 2025 14:22:31 +0800 Subject: [PATCH 376/529] drm/amd/display: Initial psr_version with correct setting [ Upstream commit d8c782cac5007e68e7484d420168f12d3490def6 ] [Why & How] The initial setting for psr_version is not correct while create a virtual link. The default psr_version should be DC_PSR_VERSION_UNSUPPORTED. Reviewed-by: Roman Li Signed-off-by: Tom Chung Signed-off-by: Zaeem Mohamed Tested-by: Daniel Wheeler Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/display/dc/core/dc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c index 2721842af806..10672bb90a02 100644 --- a/drivers/gpu/drm/amd/display/dc/core/dc.c +++ b/drivers/gpu/drm/amd/display/dc/core/dc.c @@ -267,6 +267,7 @@ static bool create_links( link->link_id.type = OBJECT_TYPE_CONNECTOR; link->link_id.id = CONNECTOR_ID_VIRTUAL; link->link_id.enum_id = ENUM_ID_1; + link->psr_settings.psr_version = DC_PSR_VERSION_UNSUPPORTED; link->link_enc = kzalloc(sizeof(*link->link_enc), GFP_KERNEL); if (!link->link_enc) { From 9131a4be795ebc3e04977e2cf95edff093fd4884 Mon Sep 17 00:00:00 2001 From: Shiwu Zhang Date: Tue, 19 Nov 2024 15:58:39 +0800 Subject: [PATCH 377/529] drm/amdgpu: enlarge the VBIOS binary size limit [ Upstream commit 667b96134c9e206aebe40985650bf478935cbe04 ] Some chips have a larger VBIOS file so raise the size limit to support the flashing tool. Signed-off-by: Shiwu Zhang Reviewed-by: Hawking Zhang Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c index a176b1da03bd..ae6643c8ade6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_psp.c @@ -43,7 +43,7 @@ #include "amdgpu_securedisplay.h" #include "amdgpu_atomfirmware.h" -#define AMD_VBIOS_FILE_MAX_SIZE_B (1024*1024*3) +#define AMD_VBIOS_FILE_MAX_SIZE_B (1024*1024*16) static int psp_sysfs_init(struct amdgpu_device *adev); static void psp_sysfs_fini(struct amdgpu_device *adev); From e4f6a56f457e36a37c4988a9c494304cfa2f9352 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Tue, 17 Dec 2024 09:25:18 -0500 Subject: [PATCH 378/529] drm/amd/display/dm: drop hw_support check in amdgpu_dm_i2c_xfer() [ Upstream commit 33da70bd1e115d7d73f45fb1c09f5ecc448f3f13 ] DC supports SW i2c as well. Drop the check. Reviewed-by: Harry Wentland Signed-off-by: Alex Deucher Signed-off-by: Sasha Levin --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 0d8c020cd121..998dde73ecc6 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -7281,7 +7281,7 @@ static int amdgpu_dm_i2c_xfer(struct i2c_adapter *i2c_adap, int i; int result = -EIO; - if (!ddc_service->ddc_pin || !ddc_service->ddc_pin->hw_info.hw_supported) + if (!ddc_service->ddc_pin) return result; cmd.payloads = kcalloc(num, sizeof(struct i2c_payload), GFP_KERNEL); From d525e62f189dda671ce55ef2ed021f6f63e09065 Mon Sep 17 00:00:00 2001 From: Alexei Lazar Date: Sun, 9 Feb 2025 12:17:15 +0200 Subject: [PATCH 379/529] net/mlx5: Extend Ethtool loopback selftest to support non-linear SKB [ Upstream commit 95b9606b15bb3ce1198d28d2393dd0e1f0a5f3e9 ] Current loopback test validation ignores non-linear SKB case in the SKB access, which can lead to failures in scenarios such as when HW GRO is enabled. Linearize the SKB so both cases will be handled. Signed-off-by: Alexei Lazar Reviewed-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20250209101716.112774-15-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c index 08a75654f5f1..c170503b3aac 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_selftest.c @@ -165,6 +165,9 @@ mlx5e_test_loopback_validate(struct sk_buff *skb, struct udphdr *udph; struct iphdr *iph; + if (skb_linearize(skb)) + goto out; + /* We are only going to peek, no need to clone the SKB */ if (MLX5E_TEST_PKT_SIZE - ETH_HLEN > skb_headlen(skb)) goto out; From 468255c8edd8465a95d13bb4983f13290d0d0ccb Mon Sep 17 00:00:00 2001 From: William Tu Date: Sun, 9 Feb 2025 12:17:09 +0200 Subject: [PATCH 380/529] net/mlx5e: set the tx_queue_len for pfifo_fast [ Upstream commit a38cc5706fb9f7dc4ee3a443f61de13ce1e410ed ] By default, the mq netdev creates a pfifo_fast qdisc. On a system with 16 core, the pfifo_fast with 3 bands consumes 16 * 3 * 8 (size of pointer) * 1024 (default tx queue len) = 393KB. The patch sets the tx qlen to representor default value, 128 (1< Reviewed-by: Daniel Jurgens Signed-off-by: Tariq Toukan Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250209101716.112774-9-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 5aeca9534f15..837524d1d225 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -726,6 +726,8 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev, netdev->ethtool_ops = &mlx5e_rep_ethtool_ops; netdev->watchdog_timeo = 15 * HZ; + if (mlx5_core_is_ecpf(mdev)) + netdev->tx_queue_len = 1 << MLX5E_REP_PARAMS_DEF_LOG_SQ_SIZE; #if IS_ENABLED(CONFIG_MLX5_CLS_ACT) netdev->hw_features |= NETIF_F_HW_TC; From 5d2af9951808f6c6490330cc56632c35add61d7a Mon Sep 17 00:00:00 2001 From: William Tu Date: Sun, 9 Feb 2025 12:17:08 +0200 Subject: [PATCH 381/529] net/mlx5e: reduce rep rxq depth to 256 for ECPF [ Upstream commit b9cc8f9d700867aaa77aedddfea85e53d5e5d584 ] By experiments, a single queue representor netdev consumes kernel memory around 2.8MB, and 1.8MB out of the 2.8MB is due to page pool for the RXQ. Scaling to a thousand representors consumes 2.8GB, which becomes a memory pressure issue for embedded devices such as BlueField-2 16GB / BlueField-3 32GB memory. Since representor netdevs mostly handles miss traffic, and ideally, most of the traffic will be offloaded, reduce the default non-uplink rep netdev's RXQ default depth from 1024 to 256 if mdev is ecpf eswitch manager. This saves around 1MB of memory per regular RQ, (1024 - 256) * 2KB, allocated from page pool. With rxq depth of 256, the netlink page pool tool reports $./tools/net/ynl/cli.py --spec Documentation/netlink/specs/netdev.yaml \ --dump page-pool-get {'id': 277, 'ifindex': 9, 'inflight': 128, 'inflight-mem': 786432, 'napi-id': 775}] This is due to mtu 1500 + headroom consumes half pages, so 256 rxq entries consumes around 128 pages (thus create a page pool with size 128), shown above at inflight. Note that each netdev has multiple types of RQs, including Regular RQ, XSK, PTP, Drop, Trap RQ. Since non-uplink representor only supports regular rq, this patch only changes the regular RQ's default depth. Signed-off-by: William Tu Reviewed-by: Bodong Wang Reviewed-by: Saeed Mahameed Signed-off-by: Tariq Toukan Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250209101716.112774-8-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en_rep.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 837524d1d225..b4980245b50b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -61,6 +61,7 @@ #define MLX5E_REP_PARAMS_DEF_LOG_SQ_SIZE \ max(0x7, MLX5E_PARAMS_MINIMUM_LOG_SQ_SIZE) #define MLX5E_REP_PARAMS_DEF_NUM_CHANNELS 1 +#define MLX5E_REP_PARAMS_DEF_LOG_RQ_SIZE 0x8 static const char mlx5e_rep_driver_name[] = "mlx5e_rep"; @@ -705,6 +706,8 @@ static void mlx5e_build_rep_params(struct net_device *netdev) /* RQ */ mlx5e_build_rq_params(mdev, params); + if (!mlx5e_is_uplink_rep(priv) && mlx5_core_is_ecpf(mdev)) + params->log_rq_mtu_frames = MLX5E_REP_PARAMS_DEF_LOG_RQ_SIZE; /* CQ moderation params */ params->rx_dim_enabled = MLX5_CAP_GEN(mdev, cq_moderation); From 9fc9237ffaea3db39413dee7c5ce19bb37fb1948 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Feb 2025 11:39:22 +0200 Subject: [PATCH 382/529] wifi: mac80211: don't unconditionally call drv_mgd_complete_tx() [ Upstream commit 1798271b3604b902d45033ec569f2bf77e94ecc2 ] We might not have called drv_mgd_prepare_tx(), so only call drv_mgd_complete_tx() under the same conditions. Signed-off-by: Johannes Berg Reviewed-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250205110958.e091fc39a351.Ie6a3cdca070612a0aa4b3c6914ab9ed602d1f456@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/mlme.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 9a5530ca2f6b..8f0e6d7fe716 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2896,7 +2896,8 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, if (tx) ieee80211_flush_queues(local, sdata, false); - drv_mgd_complete_tx(sdata->local, sdata, &info); + if (tx || frame_buf) + drv_mgd_complete_tx(sdata->local, sdata, &info); /* clear AP addr only after building the needed mgmt frames */ eth_zero_addr(sdata->deflink.u.mgd.bssid); From 6dc0a704f54112293aba4520169492fc436f35ca Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 5 Feb 2025 11:39:21 +0200 Subject: [PATCH 383/529] wifi: mac80211: remove misplaced drv_mgd_complete_tx() call [ Upstream commit f4995cdc4d02d0abc8e9fcccad5c71ce676c1e3f ] In the original commit 15fae3410f1d ("mac80211: notify driver on mgd TX completion") I evidently made a mistake and placed the call in the "associated" if, rather than the "assoc_data". Later I noticed the missing call and placed it in commit c042600c17d8 ("wifi: mac80211: adding missing drv_mgd_complete_tx() call"), but didn't remove the wrong one. Remove it now. Signed-off-by: Johannes Berg Reviewed-by: Emmanuel Grumbach Signed-off-by: Miri Korenblit Link: https://patch.msgid.link/20250205110958.6ed954179bbf.Id8ef8835b7e6da3bf913c76f77d201017dc8a3c9@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- net/mac80211/mlme.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8f0e6d7fe716..b300972c3150 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -7312,7 +7312,6 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, ieee80211_report_disconnect(sdata, frame_buf, sizeof(frame_buf), true, req->reason_code, false); - drv_mgd_complete_tx(sdata->local, sdata, &info); return 0; } From 81d2a85c65bf7be6d48c7d7689a6567d19358baf Mon Sep 17 00:00:00 2001 From: Athira Rajeev Date: Tue, 21 Jan 2025 18:46:20 +0530 Subject: [PATCH 384/529] arch/powerpc/perf: Check the instruction type before creating sample with perf_mem_data_src [ Upstream commit 2ffb26afa64261139e608bf087a0c1fe24d76d4d ] perf mem report aborts as below sometimes (during some corner case) in powerpc: # ./perf mem report 1>out *** stack smashing detected ***: terminated Aborted (core dumped) The backtrace is as below: __pthread_kill_implementation () raise () abort () __libc_message __fortify_fail __stack_chk_fail hist_entry.lvl_snprintf __sort__hpp_entry __hist_entry__snprintf hists.fprintf cmd_report cmd_mem Snippet of code which triggers the issue from tools/perf/util/sort.c static int hist_entry__lvl_snprintf(struct hist_entry *he, char *bf, size_t size, unsigned int width) { char out[64]; perf_mem__lvl_scnprintf(out, sizeof(out), he->mem_info); return repsep_snprintf(bf, size, "%-*s", width, out); } The value of "out" is filled from perf_mem_data_src value. Debugging this further showed that for some corner cases, the value of "data_src" was pointing to wrong value. This resulted in bigger size of string and causing stack check fail. The perf mem data source values are captured in the sample via isa207_get_mem_data_src function. The initial check is to fetch the type of sampled instruction. If the type of instruction is not valid (not a load/store instruction), the function returns. Since 'commit e16fd7f2cb1a ("perf: Use sample_flags for data_src")', data_src field is not initialized by the perf_sample_data_init() function. If the PMU driver doesn't set the data_src value to zero if type is not valid, this will result in uninitailised value for data_src. The uninitailised value of data_src resulted in stack check fail followed by abort for "perf mem report". When requesting for data source information in the sample, the instruction type is expected to be load or store instruction. In ISA v3.0, due to hardware limitation, there are corner cases where the instruction type other than load or store is observed. In ISA v3.0 and before values "0" and "7" are considered reserved. In ISA v3.1, value "7" has been used to indicate "larx/stcx". Drop the sample if instruction type has reserved values for this field with a ISA version check. Initialize data_src to zero in isa207_get_mem_data_src if the instruction type is not load/store. Reported-by: Disha Goel Signed-off-by: Athira Rajeev Signed-off-by: Madhavan Srinivasan Link: https://patch.msgid.link/20250121131621.39054-1-atrajeev@linux.vnet.ibm.com Signed-off-by: Sasha Levin --- arch/powerpc/perf/core-book3s.c | 20 ++++++++++++++++++++ arch/powerpc/perf/isa207-common.c | 4 +++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c index e3c31c771ce9..470d7715ecf4 100644 --- a/arch/powerpc/perf/core-book3s.c +++ b/arch/powerpc/perf/core-book3s.c @@ -2229,6 +2229,10 @@ static struct pmu power_pmu = { #define PERF_SAMPLE_ADDR_TYPE (PERF_SAMPLE_ADDR | \ PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_DATA_PAGE_SIZE) + +#define SIER_TYPE_SHIFT 15 +#define SIER_TYPE_MASK (0x7ull << SIER_TYPE_SHIFT) + /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -2297,6 +2301,22 @@ static void record_and_restart(struct perf_event *event, unsigned long val, is_kernel_addr(mfspr(SPRN_SIAR))) record = 0; + /* + * SIER[46-48] presents instruction type of the sampled instruction. + * In ISA v3.0 and before values "0" and "7" are considered reserved. + * In ISA v3.1, value "7" has been used to indicate "larx/stcx". + * Drop the sample if "type" has reserved values for this field with a + * ISA version check. + */ + if (event->attr.sample_type & PERF_SAMPLE_DATA_SRC && + ppmu->get_mem_data_src) { + val = (regs->dar & SIER_TYPE_MASK) >> SIER_TYPE_SHIFT; + if (val == 0 || (val == 7 && !cpu_has_feature(CPU_FTR_ARCH_31))) { + record = 0; + atomic64_inc(&event->lost_samples); + } + } + /* * Finally record data if requested. */ diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c index 56301b2bc8ae..031a2b63c171 100644 --- a/arch/powerpc/perf/isa207-common.c +++ b/arch/powerpc/perf/isa207-common.c @@ -321,8 +321,10 @@ void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags, sier = mfspr(SPRN_SIER); val = (sier & ISA207_SIER_TYPE_MASK) >> ISA207_SIER_TYPE_SHIFT; - if (val != 1 && val != 2 && !(val == 7 && cpu_has_feature(CPU_FTR_ARCH_31))) + if (val != 1 && val != 2 && !(val == 7 && cpu_has_feature(CPU_FTR_ARCH_31))) { + dsrc->val = 0; return; + } idx = (sier & ISA207_SIER_LDST_MASK) >> ISA207_SIER_LDST_SHIFT; sub_idx = (sier & ISA207_SIER_DATA_SRC_MASK) >> ISA207_SIER_DATA_SRC_SHIFT; From ce534438a87e4b4e73cff7ca6bbcc85e39e44cc5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 7 Feb 2025 16:24:58 +0900 Subject: [PATCH 385/529] ip: fib_rules: Fetch net from fib_rule in fib[46]_rule_configure(). [ Upstream commit 5a1ccffd30a08f5a2428cd5fbb3ab03e8eb6c66d ] The following patch will not set skb->sk from VRF path. Let's fetch net from fib_rule->fr_net instead of sock_net(skb->sk) in fib[46]_rule_configure(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Reviewed-by: Ido Schimmel Tested-by: Ido Schimmel Link: https://patch.msgid.link/20250207072502.87775-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/fib_rules.c | 4 ++-- net/ipv6/fib6_rules.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 513f475c6a53..298a9944a3d1 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -222,9 +222,9 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlattr **tb, struct netlink_ext_ack *extack) { - struct net *net = sock_net(skb->sk); + struct fib4_rule *rule4 = (struct fib4_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct fib4_rule *rule4 = (struct fib4_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index 6eeab21512ba..e0f0c5f8cccd 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -350,9 +350,9 @@ static int fib6_rule_configure(struct fib_rule *rule, struct sk_buff *skb, struct nlattr **tb, struct netlink_ext_ack *extack) { + struct fib6_rule *rule6 = (struct fib6_rule *)rule; + struct net *net = rule->fr_net; int err = -EINVAL; - struct net *net = sock_net(skb->sk); - struct fib6_rule *rule6 = (struct fib6_rule *) rule; if (!inet_validate_dscp(frh->tos)) { NL_SET_ERR_MSG(extack, From 83b5df3df8c68661a5642a4e2f0279c521d83df4 Mon Sep 17 00:00:00 2001 From: Aleksander Jan Bajkowski Date: Thu, 6 Feb 2025 23:40:33 +0100 Subject: [PATCH 386/529] r8152: add vendor/device ID pair for Dell Alienware AW1022z [ Upstream commit 848b09d53d923b4caee5491f57a5c5b22d81febc ] The Dell AW1022z is an RTL8156B based 2.5G Ethernet controller. Add the vendor and product ID values to the driver. This makes Ethernet work with the adapter. Signed-off-by: Aleksander Jan Bajkowski Link: https://patch.msgid.link/20250206224033.980115-1-olek2@wp.pl Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/usb/r8152.c | 1 + include/linux/usb/r8152.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 061a7a9afad0..c2b715541989 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -9880,6 +9880,7 @@ static const struct usb_device_id rtl8152_table[] = { { USB_DEVICE(VENDOR_ID_NVIDIA, 0x09ff) }, { USB_DEVICE(VENDOR_ID_TPLINK, 0x0601) }, { USB_DEVICE(VENDOR_ID_DLINK, 0xb301) }, + { USB_DEVICE(VENDOR_ID_DELL, 0xb097) }, { USB_DEVICE(VENDOR_ID_ASUS, 0x1976) }, {} }; diff --git a/include/linux/usb/r8152.h b/include/linux/usb/r8152.h index 33a4c146dc19..2ca60828f28b 100644 --- a/include/linux/usb/r8152.h +++ b/include/linux/usb/r8152.h @@ -30,6 +30,7 @@ #define VENDOR_ID_NVIDIA 0x0955 #define VENDOR_ID_TPLINK 0x2357 #define VENDOR_ID_DLINK 0x2001 +#define VENDOR_ID_DELL 0x413c #define VENDOR_ID_ASUS 0x0b05 #if IS_REACHABLE(CONFIG_USB_RTL8152) From 53383430802a012bbe9a244432c96d72a72e0890 Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Tue, 4 Feb 2025 20:37:36 +0200 Subject: [PATCH 387/529] wifi: rtw88: Fix download_firmware_validate() for RTL8814AU [ Upstream commit 9e8243025cc06abc975c876dffda052073207ab3 ] After the firmware is uploaded, download_firmware_validate() checks some bits in REG_MCUFW_CTRL to see if everything went okay. The RTL8814AU power on sequence sets bits 13 and 12 to 2, which this function does not expect, so it thinks the firmware upload failed. Make download_firmware_validate() ignore bits 13 and 12. Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/049d2887-22fc-47b7-9e59-62627cb525f8@gmail.com Signed-off-by: Sasha Levin --- drivers/net/wireless/realtek/rtw88/reg.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/realtek/rtw88/reg.h b/drivers/net/wireless/realtek/rtw88/reg.h index 03bd8dc53f72..08628ba3419d 100644 --- a/drivers/net/wireless/realtek/rtw88/reg.h +++ b/drivers/net/wireless/realtek/rtw88/reg.h @@ -107,6 +107,7 @@ #define BIT_SHIFT_ROM_PGE 16 #define BIT_FW_INIT_RDY BIT(15) #define BIT_FW_DW_RDY BIT(14) +#define BIT_CPU_CLK_SEL (BIT(12) | BIT(13)) #define BIT_RPWM_TOGGLE BIT(7) #define BIT_RAM_DL_SEL BIT(7) /* legacy only */ #define BIT_DMEM_CHKSUM_OK BIT(6) @@ -124,7 +125,7 @@ BIT_CHECK_SUM_OK) #define FW_READY_LEGACY (BIT_MCUFWDL_RDY | BIT_FWDL_CHK_RPT | \ BIT_WINTINI_RDY | BIT_RAM_DL_SEL) -#define FW_READY_MASK 0xffff +#define FW_READY_MASK (0xffff & ~BIT_CPU_CLK_SEL) #define REG_MCU_TST_CFG 0x84 #define VAL_FW_TRIGGER 0x1 From e35875dd2baf1c686394551f2864663694a03909 Mon Sep 17 00:00:00 2001 From: Jordan Crouse Date: Wed, 22 Jan 2025 22:26:12 +0000 Subject: [PATCH 388/529] clk: qcom: camcc-sm8250: Use clk_rcg2_shared_ops for some RCGs [ Upstream commit 52b10b591f83dc6d9a1d6c2dc89433470a787ecd ] Update some RCGs on the sm8250 camera clock controller to use clk_rcg2_shared_ops. The shared_ops ensure the RCGs get parked to the XO during clock disable to prevent the clocks from locking up when the GDSC is enabled. These mirror similar fixes for other controllers such as commit e5c359f70e4b ("clk: qcom: camcc: Update the clock ops for the SC7180"). Signed-off-by: Jordan Crouse Reviewed-by: Dmitry Baryshkov Reviewed-by: Bryan O'Donoghue Link: https://lore.kernel.org/r/20250122222612.32351-1-jorcrous@amazon.com Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- drivers/clk/qcom/camcc-sm8250.c | 56 ++++++++++++++++----------------- 1 file changed, 28 insertions(+), 28 deletions(-) diff --git a/drivers/clk/qcom/camcc-sm8250.c b/drivers/clk/qcom/camcc-sm8250.c index 9b32c56a5bc5..e29706d78287 100644 --- a/drivers/clk/qcom/camcc-sm8250.c +++ b/drivers/clk/qcom/camcc-sm8250.c @@ -411,7 +411,7 @@ static struct clk_rcg2 cam_cc_bps_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -433,7 +433,7 @@ static struct clk_rcg2 cam_cc_camnoc_axi_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -454,7 +454,7 @@ static struct clk_rcg2 cam_cc_cci_0_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -469,7 +469,7 @@ static struct clk_rcg2 cam_cc_cci_1_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -490,7 +490,7 @@ static struct clk_rcg2 cam_cc_cphy_rx_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -511,7 +511,7 @@ static struct clk_rcg2 cam_cc_csi0phytimer_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -526,7 +526,7 @@ static struct clk_rcg2 cam_cc_csi1phytimer_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -556,7 +556,7 @@ static struct clk_rcg2 cam_cc_csi3phytimer_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -571,7 +571,7 @@ static struct clk_rcg2 cam_cc_csi4phytimer_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -586,7 +586,7 @@ static struct clk_rcg2 cam_cc_csi5phytimer_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -611,7 +611,7 @@ static struct clk_rcg2 cam_cc_fast_ahb_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -634,7 +634,7 @@ static struct clk_rcg2 cam_cc_fd_core_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -649,7 +649,7 @@ static struct clk_rcg2 cam_cc_icp_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -673,7 +673,7 @@ static struct clk_rcg2 cam_cc_ife_0_clk_src = { .parent_data = cam_cc_parent_data_2, .num_parents = ARRAY_SIZE(cam_cc_parent_data_2), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -710,7 +710,7 @@ static struct clk_rcg2 cam_cc_ife_0_csid_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -734,7 +734,7 @@ static struct clk_rcg2 cam_cc_ife_1_clk_src = { .parent_data = cam_cc_parent_data_3, .num_parents = ARRAY_SIZE(cam_cc_parent_data_3), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -749,7 +749,7 @@ static struct clk_rcg2 cam_cc_ife_1_csid_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -771,7 +771,7 @@ static struct clk_rcg2 cam_cc_ife_lite_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -786,7 +786,7 @@ static struct clk_rcg2 cam_cc_ife_lite_csid_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -810,7 +810,7 @@ static struct clk_rcg2 cam_cc_ipe_0_clk_src = { .parent_data = cam_cc_parent_data_4, .num_parents = ARRAY_SIZE(cam_cc_parent_data_4), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -825,7 +825,7 @@ static struct clk_rcg2 cam_cc_jpeg_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -847,7 +847,7 @@ static struct clk_rcg2 cam_cc_mclk0_clk_src = { .parent_data = cam_cc_parent_data_1, .num_parents = ARRAY_SIZE(cam_cc_parent_data_1), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -862,7 +862,7 @@ static struct clk_rcg2 cam_cc_mclk1_clk_src = { .parent_data = cam_cc_parent_data_1, .num_parents = ARRAY_SIZE(cam_cc_parent_data_1), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -877,7 +877,7 @@ static struct clk_rcg2 cam_cc_mclk2_clk_src = { .parent_data = cam_cc_parent_data_1, .num_parents = ARRAY_SIZE(cam_cc_parent_data_1), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -892,7 +892,7 @@ static struct clk_rcg2 cam_cc_mclk3_clk_src = { .parent_data = cam_cc_parent_data_1, .num_parents = ARRAY_SIZE(cam_cc_parent_data_1), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -907,7 +907,7 @@ static struct clk_rcg2 cam_cc_mclk4_clk_src = { .parent_data = cam_cc_parent_data_1, .num_parents = ARRAY_SIZE(cam_cc_parent_data_1), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -922,7 +922,7 @@ static struct clk_rcg2 cam_cc_mclk5_clk_src = { .parent_data = cam_cc_parent_data_1, .num_parents = ARRAY_SIZE(cam_cc_parent_data_1), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; @@ -993,7 +993,7 @@ static struct clk_rcg2 cam_cc_slow_ahb_clk_src = { .parent_data = cam_cc_parent_data_0, .num_parents = ARRAY_SIZE(cam_cc_parent_data_0), .flags = CLK_SET_RATE_PARENT, - .ops = &clk_rcg2_ops, + .ops = &clk_rcg2_shared_ops, }, }; From 50b7e3276bdec6455b2f096a7b987571168d828b Mon Sep 17 00:00:00 2001 From: Andrey Vatoropin Date: Tue, 4 Feb 2025 09:54:08 +0000 Subject: [PATCH 389/529] hwmon: (xgene-hwmon) use appropriate type for the latency value [ Upstream commit 8df0f002827e18632dcd986f7546c1abf1953a6f ] The expression PCC_NUM_RETRIES * pcc_chan->latency is currently being evaluated using 32-bit arithmetic. Since a value of type 'u64' is used to store the eventual result, and this result is later sent to the function usecs_to_jiffies with input parameter unsigned int, the current data type is too wide to store the value of ctx->usecs_lat. Change the data type of "usecs_lat" to a more suitable (narrower) type. Found by Linux Verification Center (linuxtesting.org) with SVACE. Signed-off-by: Andrey Vatoropin Link: https://lore.kernel.org/r/20250204095400.95013-1-a.vatoropin@crpt.ru Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin --- drivers/hwmon/xgene-hwmon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwmon/xgene-hwmon.c b/drivers/hwmon/xgene-hwmon.c index 207084d55044..6768dbf39039 100644 --- a/drivers/hwmon/xgene-hwmon.c +++ b/drivers/hwmon/xgene-hwmon.c @@ -111,7 +111,7 @@ struct xgene_hwmon_dev { phys_addr_t comm_base_addr; void *pcc_comm_addr; - u64 usecs_lat; + unsigned int usecs_lat; }; /* From 73b9240149d4dc079491ac846f3bee5299b15c6b Mon Sep 17 00:00:00 2001 From: Depeng Shao Date: Mon, 13 Jan 2025 10:01:28 +0530 Subject: [PATCH 390/529] media: qcom: camss: csid: Only add TPG v4l2 ctrl if TPG hardware is available [ Upstream commit 2f1361f862a68063f37362f1beb400e78e289581 ] There is no CSID TPG on some SoCs, so the v4l2 ctrl in CSID driver shouldn't be registered. Checking the supported TPG modes to indicate if the TPG hardware exists or not and only registering v4l2 ctrl for CSID only when the TPG hardware is present. Signed-off-by: Depeng Shao Signed-off-by: Hans Verkuil Signed-off-by: Sasha Levin --- .../media/platform/qcom/camss/camss-csid.c | 64 +++++++++++-------- 1 file changed, 37 insertions(+), 27 deletions(-) diff --git a/drivers/media/platform/qcom/camss/camss-csid.c b/drivers/media/platform/qcom/camss/camss-csid.c index 6360314f04a6..b90e2e690f3a 100644 --- a/drivers/media/platform/qcom/camss/camss-csid.c +++ b/drivers/media/platform/qcom/camss/camss-csid.c @@ -239,11 +239,13 @@ static int csid_set_stream(struct v4l2_subdev *sd, int enable) int ret; if (enable) { - ret = v4l2_ctrl_handler_setup(&csid->ctrls); - if (ret < 0) { - dev_err(csid->camss->dev, - "could not sync v4l2 controls: %d\n", ret); - return ret; + if (csid->testgen.nmodes != CSID_PAYLOAD_MODE_DISABLED) { + ret = v4l2_ctrl_handler_setup(&csid->ctrls); + if (ret < 0) { + dev_err(csid->camss->dev, + "could not sync v4l2 controls: %d\n", ret); + return ret; + } } if (!csid->testgen.enabled && @@ -318,7 +320,8 @@ static void csid_try_format(struct csid_device *csid, break; case MSM_CSID_PAD_SRC: - if (csid->testgen_mode->cur.val == 0) { + if (csid->testgen.nmodes == CSID_PAYLOAD_MODE_DISABLED || + csid->testgen_mode->cur.val == 0) { /* Test generator is disabled, */ /* keep pad formats in sync */ u32 code = fmt->code; @@ -368,7 +371,8 @@ static int csid_enum_mbus_code(struct v4l2_subdev *sd, code->code = csid->formats[code->index].code; } else { - if (csid->testgen_mode->cur.val == 0) { + if (csid->testgen.nmodes == CSID_PAYLOAD_MODE_DISABLED || + csid->testgen_mode->cur.val == 0) { struct v4l2_mbus_framefmt *sink_fmt; sink_fmt = __csid_get_format(csid, sd_state, @@ -750,7 +754,8 @@ static int csid_link_setup(struct media_entity *entity, /* If test generator is enabled */ /* do not allow a link from CSIPHY to CSID */ - if (csid->testgen_mode->cur.val != 0) + if (csid->testgen.nmodes != CSID_PAYLOAD_MODE_DISABLED && + csid->testgen_mode->cur.val != 0) return -EBUSY; sd = media_entity_to_v4l2_subdev(remote->entity); @@ -843,25 +848,28 @@ int msm_csid_register_entity(struct csid_device *csid, MSM_CSID_NAME, csid->id); v4l2_set_subdevdata(sd, csid); - ret = v4l2_ctrl_handler_init(&csid->ctrls, 1); - if (ret < 0) { - dev_err(dev, "Failed to init ctrl handler: %d\n", ret); - return ret; + if (csid->testgen.nmodes != CSID_PAYLOAD_MODE_DISABLED) { + ret = v4l2_ctrl_handler_init(&csid->ctrls, 1); + if (ret < 0) { + dev_err(dev, "Failed to init ctrl handler: %d\n", ret); + return ret; + } + + csid->testgen_mode = + v4l2_ctrl_new_std_menu_items(&csid->ctrls, + &csid_ctrl_ops, V4L2_CID_TEST_PATTERN, + csid->testgen.nmodes, 0, 0, + csid->testgen.modes); + + if (csid->ctrls.error) { + dev_err(dev, "Failed to init ctrl: %d\n", csid->ctrls.error); + ret = csid->ctrls.error; + goto free_ctrl; + } + + csid->subdev.ctrl_handler = &csid->ctrls; } - csid->testgen_mode = v4l2_ctrl_new_std_menu_items(&csid->ctrls, - &csid_ctrl_ops, V4L2_CID_TEST_PATTERN, - csid->testgen.nmodes, 0, 0, - csid->testgen.modes); - - if (csid->ctrls.error) { - dev_err(dev, "Failed to init ctrl: %d\n", csid->ctrls.error); - ret = csid->ctrls.error; - goto free_ctrl; - } - - csid->subdev.ctrl_handler = &csid->ctrls; - ret = csid_init_formats(sd, NULL); if (ret < 0) { dev_err(dev, "Failed to init format: %d\n", ret); @@ -891,7 +899,8 @@ int msm_csid_register_entity(struct csid_device *csid, media_cleanup: media_entity_cleanup(&sd->entity); free_ctrl: - v4l2_ctrl_handler_free(&csid->ctrls); + if (csid->testgen.nmodes != CSID_PAYLOAD_MODE_DISABLED) + v4l2_ctrl_handler_free(&csid->ctrls); return ret; } @@ -904,5 +913,6 @@ void msm_csid_unregister_entity(struct csid_device *csid) { v4l2_device_unregister_subdev(&csid->subdev); media_entity_cleanup(&csid->subdev.entity); - v4l2_ctrl_handler_free(&csid->ctrls); + if (csid->testgen.nmodes != CSID_PAYLOAD_MODE_DISABLED) + v4l2_ctrl_handler_free(&csid->ctrls); } From 784b78295a3a58bf052339dd669e6e03710220d3 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Tue, 4 Feb 2025 16:55:42 +0200 Subject: [PATCH 391/529] vxlan: Annotate FDB data races [ Upstream commit f6205f8215f12a96518ac9469ff76294ae7bd612 ] The 'used' and 'updated' fields in the FDB entry structure can be accessed concurrently by multiple threads, leading to reports such as [1]. Can be reproduced using [2]. Suppress these reports by annotating these accesses using READ_ONCE() / WRITE_ONCE(). [1] BUG: KCSAN: data-race in vxlan_xmit / vxlan_xmit write to 0xffff942604d263a8 of 8 bytes by task 286 on cpu 0: vxlan_xmit+0xb29/0x2380 dev_hard_start_xmit+0x84/0x2f0 __dev_queue_xmit+0x45a/0x1650 packet_xmit+0x100/0x150 packet_sendmsg+0x2114/0x2ac0 __sys_sendto+0x318/0x330 __x64_sys_sendto+0x76/0x90 x64_sys_call+0x14e8/0x1c00 do_syscall_64+0x9e/0x1a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f read to 0xffff942604d263a8 of 8 bytes by task 287 on cpu 2: vxlan_xmit+0xadf/0x2380 dev_hard_start_xmit+0x84/0x2f0 __dev_queue_xmit+0x45a/0x1650 packet_xmit+0x100/0x150 packet_sendmsg+0x2114/0x2ac0 __sys_sendto+0x318/0x330 __x64_sys_sendto+0x76/0x90 x64_sys_call+0x14e8/0x1c00 do_syscall_64+0x9e/0x1a0 entry_SYSCALL_64_after_hwframe+0x77/0x7f value changed: 0x00000000fffbac6e -> 0x00000000fffbac6f Reported by Kernel Concurrency Sanitizer on: CPU: 2 UID: 0 PID: 287 Comm: mausezahn Not tainted 6.13.0-rc7-01544-gb4b270f11a02 #5 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014 [2] #!/bin/bash set +H echo whitelist > /sys/kernel/debug/kcsan echo !vxlan_xmit > /sys/kernel/debug/kcsan ip link add name vx0 up type vxlan id 10010 dstport 4789 local 192.0.2.1 bridge fdb add 00:11:22:33:44:55 dev vx0 self static dst 198.51.100.1 taskset -c 0 mausezahn vx0 -a own -b 00:11:22:33:44:55 -c 0 -q & taskset -c 2 mausezahn vx0 -a own -b 00:11:22:33:44:55 -c 0 -q & Reviewed-by: Petr Machata Signed-off-by: Ido Schimmel Reviewed-by: Eric Dumazet Reviewed-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250204145549.1216254-2-idosch@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/vxlan/vxlan_core.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/vxlan/vxlan_core.c b/drivers/net/vxlan/vxlan_core.c index 0afd7eb976e6..ef61eab81707 100644 --- a/drivers/net/vxlan/vxlan_core.c +++ b/drivers/net/vxlan/vxlan_core.c @@ -274,9 +274,9 @@ static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan, be32_to_cpu(fdb->vni))) goto nla_put_failure; - ci.ndm_used = jiffies_to_clock_t(now - fdb->used); + ci.ndm_used = jiffies_to_clock_t(now - READ_ONCE(fdb->used)); ci.ndm_confirmed = 0; - ci.ndm_updated = jiffies_to_clock_t(now - fdb->updated); + ci.ndm_updated = jiffies_to_clock_t(now - READ_ONCE(fdb->updated)); ci.ndm_refcnt = 0; if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci)) @@ -482,8 +482,8 @@ static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan, struct vxlan_fdb *f; f = __vxlan_find_mac(vxlan, mac, vni); - if (f && f->used != jiffies) - f->used = jiffies; + if (f && READ_ONCE(f->used) != jiffies) + WRITE_ONCE(f->used, jiffies); return f; } @@ -1057,12 +1057,12 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, !(f->flags & NTF_VXLAN_ADDED_BY_USER)) { if (f->state != state) { f->state = state; - f->updated = jiffies; + WRITE_ONCE(f->updated, jiffies); notify = 1; } if (f->flags != fdb_flags) { f->flags = fdb_flags; - f->updated = jiffies; + WRITE_ONCE(f->updated, jiffies); notify = 1; } } @@ -1096,7 +1096,7 @@ static int vxlan_fdb_update_existing(struct vxlan_dev *vxlan, } if (ndm_flags & NTF_USE) - f->used = jiffies; + WRITE_ONCE(f->used, jiffies); if (notify) { if (rd == NULL) @@ -1525,7 +1525,7 @@ static bool vxlan_snoop(struct net_device *dev, src_mac, &rdst->remote_ip.sa, &src_ip->sa); rdst->remote_ip = *src_ip; - f->updated = jiffies; + WRITE_ONCE(f->updated, jiffies); vxlan_fdb_notify(vxlan, f, rdst, RTM_NEWNEIGH, true, NULL); } else { u32 hash_index = fdb_head_index(vxlan, src_mac, vni); @@ -2936,7 +2936,7 @@ static void vxlan_cleanup(struct timer_list *t) if (f->flags & NTF_EXT_LEARNED) continue; - timeout = f->used + vxlan->cfg.age_interval * HZ; + timeout = READ_ONCE(f->used) + vxlan->cfg.age_interval * HZ; if (time_before_eq(timeout, jiffies)) { netdev_dbg(vxlan->dev, "garbage collect %pM\n", From 77835a04b184a669a9f49a06af65e738570dd353 Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 4 Feb 2025 07:58:17 +0100 Subject: [PATCH 392/529] r8169: don't scan PHY addresses > 0 [ Upstream commit faac69a4ae5abb49e62c79c66b51bb905c9aa5ec ] The PHY address is a dummy, because r8169 PHY access registers don't support a PHY address. Therefore scan address 0 only. Signed-off-by: Heiner Kallweit Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/830637dd-4016-4a68-92b3-618fcac6589d@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/realtek/r8169_main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c index 4b461e93ffe9..6346821d480b 100644 --- a/drivers/net/ethernet/realtek/r8169_main.c +++ b/drivers/net/ethernet/realtek/r8169_main.c @@ -5156,6 +5156,7 @@ static int r8169_mdio_register(struct rtl8169_private *tp) new_bus->priv = tp; new_bus->parent = &pdev->dev; new_bus->irq[0] = PHY_MAC_INTERRUPT; + new_bus->phy_mask = GENMASK(31, 1); snprintf(new_bus->id, MII_BUS_ID_SIZE, "r8169-%x-%x", pci_domain_nr(pdev->bus), pci_dev_id(pdev)); From 6090e604285f214ba707cbbd4149567d6206e720 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:56 -0800 Subject: [PATCH 393/529] rcu: handle quiescent states for PREEMPT_RCU=n, PREEMPT_COUNT=y [ Upstream commit 83b28cfe796464ebbde1cf7916c126da6d572685 ] With PREEMPT_RCU=n, cond_resched() provides urgently needed quiescent states for read-side critical sections via rcu_all_qs(). One reason why this was needed: lacking preempt-count, the tick handler has no way of knowing whether it is executing in a read-side critical section or not. With (PREEMPT_LAZY=y, PREEMPT_DYNAMIC=n), we get (PREEMPT_COUNT=y, PREEMPT_RCU=n). In this configuration cond_resched() is a stub and does not provide quiescent states via rcu_all_qs(). (PREEMPT_RCU=y provides this information via rcu_read_unlock() and its nesting counter.) So, use the availability of preempt_count() to report quiescent states in rcu_flavor_sched_clock_irq(). Suggested-by: Paul E. McKenney Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng Signed-off-by: Sasha Levin --- kernel/rcu/tree_plugin.h | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 044026abfdd7..4f45562be7b5 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -963,13 +963,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) */ static void rcu_flavor_sched_clock_irq(int user) { - if (user || rcu_is_cpu_rrupt_from_idle()) { + if (user || rcu_is_cpu_rrupt_from_idle() || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) && + (preempt_count() == HARDIRQ_OFFSET))) { /* * Get here if this CPU took its interrupt from user - * mode or from the idle loop, and if this is not a - * nested interrupt. In this case, the CPU is in - * a quiescent state, so note it. + * mode, from the idle loop without this being a nested + * interrupt, or while not holding the task preempt count + * (with PREEMPT_COUNT=y). In this case, the CPU is in a + * quiescent state, so note it. * * No memory barrier is required here because rcu_qs() * references only CPU-local variables that other CPUs From e2df1936c126eb87033cbe4a60bc2786d9c06aa8 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:55 -0800 Subject: [PATCH 394/529] rcu: handle unstable rdp in rcu_read_unlock_strict() [ Upstream commit fcf0e25ad4c8d14d2faab4d9a17040f31efce205 ] rcu_read_unlock_strict() can be called with preemption enabled which can make for an unstable rdp and a racy norm value. Fix this by dropping the preempt-count in __rcu_read_unlock() after the call to rcu_read_unlock_strict(), adjusting the preempt-count check appropriately. Suggested-by: Frederic Weisbecker Signed-off-by: Ankur Arora Reviewed-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng Signed-off-by: Sasha Levin --- include/linux/rcupdate.h | 2 +- kernel/rcu/tree_plugin.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index aef8c7304d45..d1c35009831b 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -97,9 +97,9 @@ static inline void __rcu_read_lock(void) static inline void __rcu_read_unlock(void) { - preempt_enable(); if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) rcu_read_unlock_strict(); + preempt_enable(); } static inline int rcu_preempt_depth(void) diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 4f45562be7b5..3929ef8148c1 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -821,8 +821,17 @@ void rcu_read_unlock_strict(void) { struct rcu_data *rdp; - if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) + if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) return; + + /* + * rcu_report_qs_rdp() can only be invoked with a stable rdp and + * from the local CPU. + * + * The in_atomic_preempt_off() check ensures that we come here holding + * the last preempt_count (which will get dropped once we return to + * __rcu_read_unlock(). + */ rdp = this_cpu_ptr(&rcu_data); rdp->cpu_no_qs.b.norm = false; rcu_report_qs_rdp(rdp); From 84916c757cb40b6c940961903144017977d88524 Mon Sep 17 00:00:00 2001 From: Ankur Arora Date: Thu, 12 Dec 2024 20:06:52 -0800 Subject: [PATCH 395/529] rcu: fix header guard for rcu_all_qs() [ Upstream commit ad6b5b73ff565e88aca7a7d1286788d80c97ba71 ] rcu_all_qs() is defined for !CONFIG_PREEMPT_RCU but the declaration is conditioned on CONFIG_PREEMPTION. With CONFIG_PREEMPT_LAZY, CONFIG_PREEMPTION=y does not imply CONFIG_PREEMPT_RCU=y. Decouple the two. Cc: Paul E. McKenney Reviewed-by: Frederic Weisbecker Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: Ankur Arora Signed-off-by: Paul E. McKenney Signed-off-by: Boqun Feng Signed-off-by: Sasha Levin --- include/linux/rcutree.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 5efb51486e8a..54483d5e6f91 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -105,7 +105,7 @@ extern int rcu_scheduler_active; void rcu_end_inkernel_boot(void); bool rcu_inkernel_boot_has_ended(void); bool rcu_is_watching(void); -#ifndef CONFIG_PREEMPTION +#ifndef CONFIG_PREEMPT_RCU void rcu_all_qs(void); #endif From e1c3bfe365f10c3c5cfa53e16ad60201879f74f4 Mon Sep 17 00:00:00 2001 From: "Peter Zijlstra (Intel)" Date: Tue, 21 Jan 2025 07:23:02 -0800 Subject: [PATCH 396/529] perf: Avoid the read if the count is already updated [ Upstream commit 8ce939a0fa194939cc1f92dbd8bc1a7806e7d40a ] The event may have been updated in the PMU-specific implementation, e.g., Intel PEBS counters snapshotting. The common code should not read and overwrite the value. The PERF_SAMPLE_READ in the data->sample_type can be used to detect whether the PMU-specific value is available. If yes, avoid the pmu->read() in the common code. Add a new flag, skip_read, to track the case. Factor out a perf_pmu_read() to clean up the code. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Link: https://lkml.kernel.org/r/20250121152303.3128733-3-kan.liang@linux.intel.com Signed-off-by: Sasha Levin --- include/linux/perf_event.h | 8 +++++++- kernel/events/core.c | 33 ++++++++++++++++----------------- kernel/events/ring_buffer.c | 1 + 3 files changed, 24 insertions(+), 18 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 27b694552d58..41ff70f315a9 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -935,7 +935,13 @@ struct perf_output_handle { struct perf_buffer *rb; unsigned long wakeup; unsigned long size; - u64 aux_flags; + union { + u64 flags; /* perf_output*() */ + u64 aux_flags; /* perf_aux_output*() */ + struct { + u64 skip_read : 1; + }; + }; union { void *addr; unsigned long head; diff --git a/kernel/events/core.c b/kernel/events/core.c index 8fc2bc5646ee..552bb00bfceb 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1180,6 +1180,12 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx) list_del_init(&ctx->active_ctx_list); } +static inline void perf_pmu_read(struct perf_event *event) +{ + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); +} + static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); @@ -3389,8 +3395,7 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); + perf_pmu_read(event); perf_event_update_time(event); @@ -4444,15 +4449,8 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) { - if (sub->state == PERF_EVENT_STATE_ACTIVE) { - /* - * Use sibling's PMU rather than @event's since - * sibling could be on different (eg: software) PMU. - */ - sub->pmu->read(sub); - } - } + for_each_sibling_event(sub, event) + perf_pmu_read(sub); data->ret = pmu->commit_txn(pmu); @@ -7101,9 +7099,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && - (leader->state == PERF_EVENT_STATE_ACTIVE)) - leader->pmu->read(leader); + if ((leader != event) && !handle->skip_read) + perf_pmu_read(leader); values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) @@ -7116,9 +7113,8 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && - (sub->state == PERF_EVENT_STATE_ACTIVE)) - sub->pmu->read(sub); + if ((sub != event) && !handle->skip_read) + perf_pmu_read(sub); values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) @@ -7173,6 +7169,9 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; + if (data->sample_flags & PERF_SAMPLE_READ) + handle->skip_read = 1; + perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 3e1655374c2e..4c4894de6e5d 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -181,6 +181,7 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; + handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { From e0d0424a8a10db75b482dd2bca0d24538b6e97f6 Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 3 Dec 2024 07:58:09 +0100 Subject: [PATCH 397/529] ice: count combined queues using Rx/Tx count [ Upstream commit c3a392bdd31adc474f1009ee85c13fdd01fe800d ] Previous implementation assumes that there is 1:1 matching between vectors and queues. It isn't always true. Get minimum value from Rx/Tx queues to determine combined queues number. Reviewed-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy Signed-off-by: Michal Swiatkowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ice/ice_ethtool.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_ethtool.c b/drivers/net/ethernet/intel/ice/ice_ethtool.c index a163e7717a53..1f62d1183156 100644 --- a/drivers/net/ethernet/intel/ice/ice_ethtool.c +++ b/drivers/net/ethernet/intel/ice/ice_ethtool.c @@ -3373,8 +3373,7 @@ static u32 ice_get_combined_cnt(struct ice_vsi *vsi) ice_for_each_q_vector(vsi, q_idx) { struct ice_q_vector *q_vector = vsi->q_vectors[q_idx]; - if (q_vector->rx.rx_ring && q_vector->tx.tx_ring) - combined++; + combined += min(q_vector->num_ring_tx, q_vector->num_ring_rx); } return combined; From 31fbeed785db8ae984f672d2779fa8c0c5b0e7c5 Mon Sep 17 00:00:00 2001 From: Konstantin Taranov Date: Mon, 20 Jan 2025 09:27:14 -0800 Subject: [PATCH 398/529] net/mana: fix warning in the writer of client oob [ Upstream commit 5ec7e1c86c441c46a374577bccd9488abea30037 ] Do not warn on missing pad_data when oob is in sgl. Signed-off-by: Konstantin Taranov Link: https://patch.msgid.link/1737394039-28772-9-git-send-email-kotaranov@linux.microsoft.com Reviewed-by: Shiraz Saleem Reviewed-by: Long Li Signed-off-by: Leon Romanovsky Signed-off-by: Sasha Levin --- drivers/net/ethernet/microsoft/mana/gdma_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/microsoft/mana/gdma_main.c b/drivers/net/ethernet/microsoft/mana/gdma_main.c index d674ebda2053..9e55679796d9 100644 --- a/drivers/net/ethernet/microsoft/mana/gdma_main.c +++ b/drivers/net/ethernet/microsoft/mana/gdma_main.c @@ -995,7 +995,7 @@ static u32 mana_gd_write_client_oob(const struct gdma_wqe_request *wqe_req, header->inline_oob_size_div4 = client_oob_size / sizeof(u32); if (oob_in_sgl) { - WARN_ON_ONCE(!pad_data || wqe_req->num_sge < 2); + WARN_ON_ONCE(wqe_req->num_sge < 2); header->client_oob_in_sgl = 1; From 14f8b37e1ce769fda57c826ca04ec090d57a04d8 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 30 Jan 2025 16:05:22 -0800 Subject: [PATCH 399/529] scsi: lpfc: Handle duplicate D_IDs in ndlp search-by D_ID routine [ Upstream commit 56c3d809b7b450379162d0b8a70bbe71ab8db706 ] After a port swap between separate fabrics, there may be multiple nodes in the vport's fc_nodes list with the same fabric well known address. Duplication is temporary and eventually resolves itself after dev_loss_tmo expires, but nameserver queries may still occur before dev_loss_tmo. This possibly results in returning stale fabric ndlp objects. Fix by adding an nlp_state check to ensure the ndlp search routine returns the correct newer allocated ndlp fabric object. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20250131000524.163662-5-justintee8345@gmail.com Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/lpfc/lpfc_hbadisc.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/lpfc/lpfc_hbadisc.c b/drivers/scsi/lpfc/lpfc_hbadisc.c index 57be02f8d5c1..b04112c77fcd 100644 --- a/drivers/scsi/lpfc/lpfc_hbadisc.c +++ b/drivers/scsi/lpfc/lpfc_hbadisc.c @@ -5619,6 +5619,7 @@ static struct lpfc_nodelist * __lpfc_findnode_did(struct lpfc_vport *vport, uint32_t did) { struct lpfc_nodelist *ndlp; + struct lpfc_nodelist *np = NULL; uint32_t data1; list_for_each_entry(ndlp, &vport->fc_nodes, nlp_listp) { @@ -5633,14 +5634,20 @@ __lpfc_findnode_did(struct lpfc_vport *vport, uint32_t did) ndlp, ndlp->nlp_DID, ndlp->nlp_flag, data1, ndlp->nlp_rpi, ndlp->active_rrqs_xri_bitmap); - return ndlp; + + /* Check for new or potentially stale node */ + if (ndlp->nlp_state != NLP_STE_UNUSED_NODE) + return ndlp; + np = ndlp; } } - /* FIND node did NOT FOUND */ - lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE, - "0932 FIND node did x%x NOT FOUND.\n", did); - return NULL; + if (!np) + /* FIND node did NOT FOUND */ + lpfc_printf_vlog(vport, KERN_INFO, LOG_NODE, + "0932 FIND node did x%x NOT FOUND.\n", did); + + return np; } struct lpfc_nodelist * From 9f9a65de83e74e2a3313fcacb685b59848208b68 Mon Sep 17 00:00:00 2001 From: Justin Tee Date: Thu, 30 Jan 2025 16:05:20 -0800 Subject: [PATCH 400/529] scsi: lpfc: Free phba irq in lpfc_sli4_enable_msi() when pci_irq_vector() fails [ Upstream commit f0842902b383982d1f72c490996aa8fc29a7aa0d ] Fix smatch warning regarding missed calls to free_irq(). Free the phba IRQ in the failed pci_irq_vector cases. lpfc_init.c: lpfc_sli4_enable_msi() warn: 'phba->pcidev->irq' from request_irq() not released. Signed-off-by: Justin Tee Link: https://lore.kernel.org/r/20250131000524.163662-3-justintee8345@gmail.com Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/lpfc/lpfc_init.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/lpfc/lpfc_init.c b/drivers/scsi/lpfc/lpfc_init.c index 1a0bafde34d8..97f3c5240d57 100644 --- a/drivers/scsi/lpfc/lpfc_init.c +++ b/drivers/scsi/lpfc/lpfc_init.c @@ -13204,6 +13204,7 @@ lpfc_sli4_enable_msi(struct lpfc_hba *phba) eqhdl = lpfc_get_eq_hdl(0); rc = pci_irq_vector(phba->pcidev, 0); if (rc < 0) { + free_irq(phba->pcidev->irq, phba); pci_free_irq_vectors(phba->pcidev); lpfc_printf_log(phba, KERN_WARNING, LOG_INIT, "0496 MSI pci_irq_vec failed (%d)\n", rc); @@ -13284,6 +13285,7 @@ lpfc_sli4_enable_intr(struct lpfc_hba *phba, uint32_t cfg_mode) eqhdl = lpfc_get_eq_hdl(0); retval = pci_irq_vector(phba->pcidev, 0); if (retval < 0) { + free_irq(phba->pcidev->irq, phba); lpfc_printf_log(phba, KERN_WARNING, LOG_INIT, "0502 INTR pci_irq_vec failed (%d)\n", retval); From 3b72b124877040352165f389b66ac77f3057f36f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kai=20M=C3=A4kisara?= Date: Mon, 20 Jan 2025 21:49:22 +0200 Subject: [PATCH 401/529] scsi: st: Restore some drive settings after reset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7081dc75df79696d8322d01821c28e53416c932c ] Some of the allowed operations put the tape into a known position to continue operation assuming only the tape position has changed. But reset sets partition, density and block size to drive default values. These should be restored to the values before reset. Normally the current block size and density are stored by the drive. If the settings have been changed, the changed values have to be saved by the driver across reset. Signed-off-by: Kai Mäkisara Link: https://lore.kernel.org/r/20250120194925.44432-2-Kai.Makisara@kolumbus.fi Reviewed-by: John Meneghini Tested-by: John Meneghini Signed-off-by: Martin K. Petersen Signed-off-by: Sasha Levin --- drivers/scsi/st.c | 24 +++++++++++++++++++++--- drivers/scsi/st.h | 2 ++ 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 3ff4e6d44db8..9ba5ad106b65 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -950,7 +950,6 @@ static void reset_state(struct scsi_tape *STp) STp->partition = find_partition(STp); if (STp->partition < 0) STp->partition = 0; - STp->new_partition = STp->partition; } } @@ -2919,14 +2918,17 @@ static int st_int_ioctl(struct scsi_tape *STp, unsigned int cmd_in, unsigned lon if (cmd_in == MTSETDENSITY) { (STp->buffer)->b_data[4] = arg; STp->density_changed = 1; /* At least we tried ;-) */ + STp->changed_density = arg; } else if (cmd_in == SET_DENS_AND_BLK) (STp->buffer)->b_data[4] = arg >> 24; else (STp->buffer)->b_data[4] = STp->density; if (cmd_in == MTSETBLK || cmd_in == SET_DENS_AND_BLK) { ltmp = arg & MT_ST_BLKSIZE_MASK; - if (cmd_in == MTSETBLK) + if (cmd_in == MTSETBLK) { STp->blksize_changed = 1; /* At least we tried ;-) */ + STp->changed_blksize = arg; + } } else ltmp = STp->block_size; (STp->buffer)->b_data[9] = (ltmp >> 16); @@ -3627,9 +3629,25 @@ static long st_ioctl(struct file *file, unsigned int cmd_in, unsigned long arg) retval = (-EIO); goto out; } - reset_state(STp); + reset_state(STp); /* Clears pos_unknown */ /* remove this when the midlevel properly clears was_reset */ STp->device->was_reset = 0; + + /* Fix the device settings after reset, ignore errors */ + if (mtc.mt_op == MTREW || mtc.mt_op == MTSEEK || + mtc.mt_op == MTEOM) { + if (STp->can_partitions) { + /* STp->new_partition contains the + * latest partition set + */ + STp->partition = 0; + switch_partition(STp); + } + if (STp->density_changed) + st_int_ioctl(STp, MTSETDENSITY, STp->changed_density); + if (STp->blksize_changed) + st_int_ioctl(STp, MTSETBLK, STp->changed_blksize); + } } if (mtc.mt_op != MTNOP && mtc.mt_op != MTSETBLK && diff --git a/drivers/scsi/st.h b/drivers/scsi/st.h index 7a68eaba7e81..2105c6a5b458 100644 --- a/drivers/scsi/st.h +++ b/drivers/scsi/st.h @@ -165,12 +165,14 @@ struct scsi_tape { unsigned char compression_changed; unsigned char drv_buffer; unsigned char density; + unsigned char changed_density; unsigned char door_locked; unsigned char autorew_dev; /* auto-rewind device */ unsigned char rew_at_close; /* rewind necessary at close */ unsigned char inited; unsigned char cleaning_req; /* cleaning requested? */ int block_size; + int changed_blksize; int min_block; int max_block; int recover_count; /* From tape opening */ From 08aec29a16db769c8449745265d53022261443f6 Mon Sep 17 00:00:00 2001 From: junan Date: Thu, 28 Nov 2024 10:35:18 +0800 Subject: [PATCH 402/529] HID: usbkbd: Fix the bit shift number for LED_KANA [ Upstream commit d73a4bfa2881a6859b384b75a414c33d4898b055 ] Since "LED_KANA" was defined as "0x04", the shift number should be "4". Signed-off-by: junan Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/usbhid/usbkbd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hid/usbhid/usbkbd.c b/drivers/hid/usbhid/usbkbd.c index c439ed2f16db..af6bc76dbf64 100644 --- a/drivers/hid/usbhid/usbkbd.c +++ b/drivers/hid/usbhid/usbkbd.c @@ -160,7 +160,7 @@ static int usb_kbd_event(struct input_dev *dev, unsigned int type, return -1; spin_lock_irqsave(&kbd->leds_lock, flags); - kbd->newleds = (!!test_bit(LED_KANA, dev->led) << 3) | (!!test_bit(LED_COMPOSE, dev->led) << 3) | + kbd->newleds = (!!test_bit(LED_KANA, dev->led) << 4) | (!!test_bit(LED_COMPOSE, dev->led) << 3) | (!!test_bit(LED_SCROLLL, dev->led) << 2) | (!!test_bit(LED_CAPSL, dev->led) << 1) | (!!test_bit(LED_NUML, dev->led)); From 762535bc37be79344df1d722783faabfb0496bab Mon Sep 17 00:00:00 2001 From: Cezary Rojewski Date: Mon, 3 Feb 2025 15:10:43 +0100 Subject: [PATCH 403/529] ASoC: codecs: pcm3168a: Allow for 24-bit in provider mode [ Upstream commit 7d92a38d67e5d937b64b20aa4fd14451ee1772f3 ] As per codec device specification, 24-bit is allowed in provider mode. Update the code to reflect that. Signed-off-by: Cezary Rojewski Link: https://patch.msgid.link/20250203141051.2361323-4-cezary.rojewski@intel.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/codecs/pcm3168a.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/sound/soc/codecs/pcm3168a.c b/sound/soc/codecs/pcm3168a.c index 9d6431338fb7..329549936bd5 100644 --- a/sound/soc/codecs/pcm3168a.c +++ b/sound/soc/codecs/pcm3168a.c @@ -494,9 +494,9 @@ static int pcm3168a_hw_params(struct snd_pcm_substream *substream, } break; case 24: - if (provider_mode || (format == SND_SOC_DAIFMT_DSP_A) || - (format == SND_SOC_DAIFMT_DSP_B)) { - dev_err(component->dev, "24-bit slots not supported in provider mode, or consumer mode using DSP\n"); + if (!provider_mode && ((format == SND_SOC_DAIFMT_DSP_A) || + (format == SND_SOC_DAIFMT_DSP_B))) { + dev_err(component->dev, "24-bit slots not supported in consumer mode using DSP\n"); return -EINVAL; } break; From 658a93303891a583c0e0fb069dbba2b701a198be Mon Sep 17 00:00:00 2001 From: Thomas Zimmermann Date: Fri, 31 Jan 2025 10:21:08 +0100 Subject: [PATCH 404/529] drm/ast: Find VBIOS mode from regular display size [ Upstream commit c81202906b5cd56db403e95db3d29c9dfc8c74c1 ] The ast driver looks up supplied display modes from an internal list of display modes supported by the VBIOS. Do not use the crtc_-prefixed display values from struct drm_display_mode for looking up the VBIOS mode. The fields contain raw values that the driver programs to hardware. They are affected by display settings like double-scan or interlace. Instead use the regular vdisplay and hdisplay fields for lookup. As the programmed values can now differ from the values used for lookup, set struct drm_display_mode.crtc_vdisplay and .crtc_hdisplay from the VBIOS mode. Signed-off-by: Thomas Zimmermann Reviewed-by: Jocelyn Falempe Link: https://patchwork.freedesktop.org/patch/msgid/20250131092257.115596-9-tzimmermann@suse.de Signed-off-by: Sasha Levin --- drivers/gpu/drm/ast/ast_mode.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/ast/ast_mode.c b/drivers/gpu/drm/ast/ast_mode.c index 1bc0220e6783..9fe856fd8a84 100644 --- a/drivers/gpu/drm/ast/ast_mode.c +++ b/drivers/gpu/drm/ast/ast_mode.c @@ -103,7 +103,7 @@ static bool ast_get_vbios_mode_info(const struct drm_format_info *format, return false; } - switch (mode->crtc_hdisplay) { + switch (mode->hdisplay) { case 640: vbios_mode->enh_table = &res_640x480[refresh_rate_index]; break; @@ -117,7 +117,7 @@ static bool ast_get_vbios_mode_info(const struct drm_format_info *format, vbios_mode->enh_table = &res_1152x864[refresh_rate_index]; break; case 1280: - if (mode->crtc_vdisplay == 800) + if (mode->vdisplay == 800) vbios_mode->enh_table = &res_1280x800[refresh_rate_index]; else vbios_mode->enh_table = &res_1280x1024[refresh_rate_index]; @@ -129,7 +129,7 @@ static bool ast_get_vbios_mode_info(const struct drm_format_info *format, vbios_mode->enh_table = &res_1440x900[refresh_rate_index]; break; case 1600: - if (mode->crtc_vdisplay == 900) + if (mode->vdisplay == 900) vbios_mode->enh_table = &res_1600x900[refresh_rate_index]; else vbios_mode->enh_table = &res_1600x1200[refresh_rate_index]; @@ -138,7 +138,7 @@ static bool ast_get_vbios_mode_info(const struct drm_format_info *format, vbios_mode->enh_table = &res_1680x1050[refresh_rate_index]; break; case 1920: - if (mode->crtc_vdisplay == 1080) + if (mode->vdisplay == 1080) vbios_mode->enh_table = &res_1920x1080[refresh_rate_index]; else vbios_mode->enh_table = &res_1920x1200[refresh_rate_index]; @@ -182,6 +182,7 @@ static bool ast_get_vbios_mode_info(const struct drm_format_info *format, hborder = (vbios_mode->enh_table->flags & HBorder) ? 8 : 0; vborder = (vbios_mode->enh_table->flags & VBorder) ? 8 : 0; + adjusted_mode->crtc_hdisplay = vbios_mode->enh_table->hde; adjusted_mode->crtc_htotal = vbios_mode->enh_table->ht; adjusted_mode->crtc_hblank_start = vbios_mode->enh_table->hde + hborder; adjusted_mode->crtc_hblank_end = vbios_mode->enh_table->ht - hborder; @@ -191,6 +192,7 @@ static bool ast_get_vbios_mode_info(const struct drm_format_info *format, vbios_mode->enh_table->hfp + vbios_mode->enh_table->hsync); + adjusted_mode->crtc_vdisplay = vbios_mode->enh_table->vde; adjusted_mode->crtc_vtotal = vbios_mode->enh_table->vt; adjusted_mode->crtc_vblank_start = vbios_mode->enh_table->vde + vborder; adjusted_mode->crtc_vblank_end = vbios_mode->enh_table->vt - vborder; From 77a7df4b236682e71b1b464da9fd268d6421576e Mon Sep 17 00:00:00 2001 From: Viktor Malik Date: Wed, 29 Jan 2025 08:18:57 +0100 Subject: [PATCH 405/529] bpftool: Fix readlink usage in get_fd_type [ Upstream commit 0053f7d39d491b6138d7c526876d13885cbb65f1 ] The `readlink(path, buf, sizeof(buf))` call reads at most sizeof(buf) bytes and *does not* append null-terminator to buf. With respect to that, fix two pieces in get_fd_type: 1. Change the truncation check to contain sizeof(buf) rather than sizeof(path). 2. Append null-terminator to buf. Reported by Coverity. Signed-off-by: Viktor Malik Signed-off-by: Andrii Nakryiko Reviewed-by: Quentin Monnet Link: https://lore.kernel.org/bpf/20250129071857.75182-1-vmalik@redhat.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- tools/bpf/bpftool/common.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c index db02b000fbeb..eea00bc15b5c 100644 --- a/tools/bpf/bpftool/common.c +++ b/tools/bpf/bpftool/common.c @@ -384,10 +384,11 @@ int get_fd_type(int fd) p_err("can't read link type: %s", strerror(errno)); return -1; } - if (n == sizeof(path)) { + if (n == sizeof(buf)) { p_err("can't read link type: path too long!"); return -1; } + buf[n] = '\0'; if (strstr(buf, "bpf-map")) return BPF_OBJ_MAP; From e7e61e09ce5dbc12ca8693155ab2408931637239 Mon Sep 17 00:00:00 2001 From: Ravi Bangoria Date: Wed, 15 Jan 2025 05:44:33 +0000 Subject: [PATCH 406/529] perf/amd/ibs: Fix perf_ibs_op.cnt_mask for CurCnt [ Upstream commit 46dcf85566170d4528b842bf83ffc350d71771fa ] IBS Op uses two counters: MaxCnt and CurCnt. MaxCnt is programmed with the desired sample period. IBS hw generates sample when CurCnt reaches to MaxCnt. The size of these counter used to be 20 bits but later they were extended to 27 bits. The 7 bit extension is indicated by CPUID Fn8000_001B_EAX[6 / OpCntExt]. perf_ibs->cnt_mask variable contains bit masks for MaxCnt and CurCnt. But IBS driver does not set upper 7 bits of CurCnt in cnt_mask even when OpCntExt CPUID bit is set. Fix this. IBS driver uses cnt_mask[CurCnt] bits only while disabling an event. Fortunately, CurCnt bits are not read from MSR while re-enabling the event, instead MaxCnt is programmed with desired period and CurCnt is set to 0. Hence, we did not see any issues so far. Signed-off-by: Ravi Bangoria Signed-off-by: Peter Zijlstra (Intel) Acked-by: Namhyung Kim Link: https://lkml.kernel.org/r/20250115054438.1021-5-ravi.bangoria@amd.com Signed-off-by: Sasha Levin --- arch/x86/events/amd/ibs.c | 3 ++- arch/x86/include/asm/perf_event.h | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/events/amd/ibs.c b/arch/x86/events/amd/ibs.c index 37cbbc5c659a..8c385e231e07 100644 --- a/arch/x86/events/amd/ibs.c +++ b/arch/x86/events/amd/ibs.c @@ -1216,7 +1216,8 @@ static __init int perf_ibs_op_init(void) if (ibs_caps & IBS_CAPS_OPCNTEXT) { perf_ibs_op.max_period |= IBS_OP_MAX_CNT_EXT_MASK; perf_ibs_op.config_mask |= IBS_OP_MAX_CNT_EXT_MASK; - perf_ibs_op.cnt_mask |= IBS_OP_MAX_CNT_EXT_MASK; + perf_ibs_op.cnt_mask |= (IBS_OP_MAX_CNT_EXT_MASK | + IBS_OP_CUR_CNT_EXT_MASK); } if (ibs_caps & IBS_CAPS_ZEN4) diff --git a/arch/x86/include/asm/perf_event.h b/arch/x86/include/asm/perf_event.h index 4d810b9478a4..fa3576ef19da 100644 --- a/arch/x86/include/asm/perf_event.h +++ b/arch/x86/include/asm/perf_event.h @@ -456,6 +456,7 @@ struct pebs_xmm { */ #define IBS_OP_CUR_CNT (0xFFF80ULL<<32) #define IBS_OP_CUR_CNT_RAND (0x0007FULL<<32) +#define IBS_OP_CUR_CNT_EXT_MASK (0x7FULL<<52) #define IBS_OP_CNT_CTL (1ULL<<19) #define IBS_OP_VAL (1ULL<<18) #define IBS_OP_ENABLE (1ULL<<17) From 60f09a491f42d3f780b997383f1aa8d185cd29a1 Mon Sep 17 00:00:00 2001 From: Soeren Moch Date: Mon, 27 Jan 2025 20:48:28 +0100 Subject: [PATCH 407/529] wifi: rtl8xxxu: retry firmware download on error [ Upstream commit 3d3e28feca7ac8c6cf2a390dbbe1f97e3feb7f36 ] Occasionally there is an EPROTO error during firmware download. This error is converted to EAGAIN in the download function. But nobody tries again and so device probe fails. Implement download retry to fix this. This error was observed (and fix tested) on a tbs2910 board [1] with an embedded RTL8188EU (0bda:8179) device behind a USB hub. [1] arch/arm/boot/dts/nxp/imx/imx6q-tbs2910.dts Signed-off-by: Soeren Moch Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20250127194828.599379-1-smoch@web.de Signed-off-by: Sasha Levin --- .../wireless/realtek/rtl8xxxu/rtl8xxxu_core.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c index 9ccf8550a067..cd22c756acc6 100644 --- a/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c +++ b/drivers/net/wireless/realtek/rtl8xxxu/rtl8xxxu_core.c @@ -798,9 +798,10 @@ rtl8xxxu_writeN(struct rtl8xxxu_priv *priv, u16 addr, u8 *buf, u16 len) return len; write_error: - dev_info(&udev->dev, - "%s: Failed to write block at addr: %04x size: %04x\n", - __func__, addr, blocksize); + if (rtl8xxxu_debug & RTL8XXXU_DEBUG_REG_WRITE) + dev_info(&udev->dev, + "%s: Failed to write block at addr: %04x size: %04x\n", + __func__, addr, blocksize); return -EAGAIN; } @@ -3920,8 +3921,14 @@ static int rtl8xxxu_init_device(struct ieee80211_hw *hw) */ rtl8xxxu_write16(priv, REG_TRXFF_BNDY + 2, fops->trxff_boundary); - ret = rtl8xxxu_download_firmware(priv); - dev_dbg(dev, "%s: download_firmware %i\n", __func__, ret); + for (int retry = 5; retry >= 0 ; retry--) { + ret = rtl8xxxu_download_firmware(priv); + dev_dbg(dev, "%s: download_firmware %i\n", __func__, ret); + if (ret != -EAGAIN) + break; + if (retry) + dev_dbg(dev, "%s: retry firmware download\n", __func__); + } if (ret) goto exit; ret = rtl8xxxu_start_firmware(priv); From f4c4d18f72d59cebfacc1bc7f16826b7d6812531 Mon Sep 17 00:00:00 2001 From: Bitterblue Smith Date: Sun, 26 Jan 2025 16:03:11 +0200 Subject: [PATCH 408/529] wifi: rtw88: Don't use static local variable in rtw8822b_set_tx_power_index_by_rate [ Upstream commit 00451eb3bec763f708e7e58326468c1e575e5a66 ] Some users want to plug two identical USB devices at the same time. This static variable could theoretically cause them to use incorrect TX power values. Move the variable to the caller and pass a pointer to it to rtw8822b_set_tx_power_index_by_rate(). Signed-off-by: Bitterblue Smith Acked-by: Ping-Ke Shih Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/8a60f581-0ab5-4d98-a97d-dd83b605008f@gmail.com Signed-off-by: Sasha Levin --- drivers/net/wireless/realtek/rtw88/rtw8822b.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/realtek/rtw88/rtw8822b.c b/drivers/net/wireless/realtek/rtw88/rtw8822b.c index 690e35c98f6e..0b071a116c58 100644 --- a/drivers/net/wireless/realtek/rtw88/rtw8822b.c +++ b/drivers/net/wireless/realtek/rtw88/rtw8822b.c @@ -957,11 +957,11 @@ static void rtw8822b_query_rx_desc(struct rtw_dev *rtwdev, u8 *rx_desc, } static void -rtw8822b_set_tx_power_index_by_rate(struct rtw_dev *rtwdev, u8 path, u8 rs) +rtw8822b_set_tx_power_index_by_rate(struct rtw_dev *rtwdev, u8 path, + u8 rs, u32 *phy_pwr_idx) { struct rtw_hal *hal = &rtwdev->hal; static const u32 offset_txagc[2] = {0x1d00, 0x1d80}; - static u32 phy_pwr_idx; u8 rate, rate_idx, pwr_index, shift; int j; @@ -969,12 +969,12 @@ rtw8822b_set_tx_power_index_by_rate(struct rtw_dev *rtwdev, u8 path, u8 rs) rate = rtw_rate_section[rs][j]; pwr_index = hal->tx_pwr_tbl[path][rate]; shift = rate & 0x3; - phy_pwr_idx |= ((u32)pwr_index << (shift * 8)); + *phy_pwr_idx |= ((u32)pwr_index << (shift * 8)); if (shift == 0x3) { rate_idx = rate & 0xfc; rtw_write32(rtwdev, offset_txagc[path] + rate_idx, - phy_pwr_idx); - phy_pwr_idx = 0; + *phy_pwr_idx); + *phy_pwr_idx = 0; } } } @@ -982,11 +982,13 @@ rtw8822b_set_tx_power_index_by_rate(struct rtw_dev *rtwdev, u8 path, u8 rs) static void rtw8822b_set_tx_power_index(struct rtw_dev *rtwdev) { struct rtw_hal *hal = &rtwdev->hal; + u32 phy_pwr_idx = 0; int rs, path; for (path = 0; path < hal->rf_path_num; path++) { for (rs = 0; rs < RTW_RATE_SECTION_MAX; rs++) - rtw8822b_set_tx_power_index_by_rate(rtwdev, path, rs); + rtw8822b_set_tx_power_index_by_rate(rtwdev, path, rs, + &phy_pwr_idx); } } From 8d0c67acdf66e0627f9632049b6f7219eaee9aa1 Mon Sep 17 00:00:00 2001 From: Ping-Ke Shih Date: Wed, 22 Jan 2025 14:03:01 +0800 Subject: [PATCH 409/529] wifi: rtw89: add wiphy_lock() to work that isn't held wiphy_lock() yet [ Upstream commit ebfc9199df05d37b67f4d1b7ee997193f3d2e7c8 ] To ensure where are protected by driver mutex can also be protected by wiphy_lock(), so afterward we can remove driver mutex safely. Signed-off-by: Ping-Ke Shih Link: https://patch.msgid.link/20250122060310.31976-2-pkshih@realtek.com Signed-off-by: Sasha Levin --- drivers/net/wireless/realtek/rtw89/regd.c | 2 ++ drivers/net/wireless/realtek/rtw89/ser.c | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/drivers/net/wireless/realtek/rtw89/regd.c b/drivers/net/wireless/realtek/rtw89/regd.c index 6e5a740b128f..2d31193fcc87 100644 --- a/drivers/net/wireless/realtek/rtw89/regd.c +++ b/drivers/net/wireless/realtek/rtw89/regd.c @@ -334,6 +334,7 @@ void rtw89_regd_notifier(struct wiphy *wiphy, struct regulatory_request *request struct ieee80211_hw *hw = wiphy_to_ieee80211_hw(wiphy); struct rtw89_dev *rtwdev = hw->priv; + wiphy_lock(wiphy); mutex_lock(&rtwdev->mutex); rtw89_leave_ps_mode(rtwdev); @@ -350,4 +351,5 @@ void rtw89_regd_notifier(struct wiphy *wiphy, struct regulatory_request *request exit: mutex_unlock(&rtwdev->mutex); + wiphy_unlock(wiphy); } diff --git a/drivers/net/wireless/realtek/rtw89/ser.c b/drivers/net/wireless/realtek/rtw89/ser.c index afb1b41e1a9a..f5dacdc4d11a 100644 --- a/drivers/net/wireless/realtek/rtw89/ser.c +++ b/drivers/net/wireless/realtek/rtw89/ser.c @@ -153,9 +153,11 @@ static void ser_state_run(struct rtw89_ser *ser, u8 evt) rtw89_debug(rtwdev, RTW89_DBG_SER, "ser: %s receive %s\n", ser_st_name(ser), ser_ev_name(ser, evt)); + wiphy_lock(rtwdev->hw->wiphy); mutex_lock(&rtwdev->mutex); rtw89_leave_lps(rtwdev); mutex_unlock(&rtwdev->mutex); + wiphy_unlock(rtwdev->hw->wiphy); ser->st_tbl[ser->state].st_func(ser, evt); } @@ -624,9 +626,11 @@ static void ser_l2_reset_st_hdl(struct rtw89_ser *ser, u8 evt) switch (evt) { case SER_EV_STATE_IN: + wiphy_lock(rtwdev->hw->wiphy); mutex_lock(&rtwdev->mutex); ser_l2_reset_st_pre_hdl(ser); mutex_unlock(&rtwdev->mutex); + wiphy_unlock(rtwdev->hw->wiphy); ieee80211_restart_hw(rtwdev->hw); ser_set_alarm(ser, SER_RECFG_TIMEOUT, SER_EV_L2_RECFG_TIMEOUT); From d97d423ad162242b13f4c8a77773f53473698593 Mon Sep 17 00:00:00 2001 From: Sean Anderson Date: Thu, 16 Jan 2025 17:41:30 -0500 Subject: [PATCH 410/529] spi: zynqmp-gqspi: Always acknowledge interrupts [ Upstream commit 89785306453ce6d949e783f6936821a0b7649ee2 ] RXEMPTY can cause an IRQ, even though we may not do anything about it (such as if we are waiting for more received data). We must still handle these IRQs because we can tell they were caused by the device. Signed-off-by: Sean Anderson Link: https://patch.msgid.link/20250116224130.2684544-6-sean.anderson@linux.dev Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-zynqmp-gqspi.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/drivers/spi/spi-zynqmp-gqspi.c b/drivers/spi/spi-zynqmp-gqspi.c index c89544ae5ed9..fde7c3810359 100644 --- a/drivers/spi/spi-zynqmp-gqspi.c +++ b/drivers/spi/spi-zynqmp-gqspi.c @@ -698,7 +698,6 @@ static void zynqmp_process_dma_irq(struct zynqmp_qspi *xqspi) static irqreturn_t zynqmp_qspi_irq(int irq, void *dev_id) { struct zynqmp_qspi *xqspi = (struct zynqmp_qspi *)dev_id; - irqreturn_t ret = IRQ_NONE; u32 status, mask, dma_status = 0; status = zynqmp_gqspi_read(xqspi, GQSPI_ISR_OFST); @@ -713,27 +712,24 @@ static irqreturn_t zynqmp_qspi_irq(int irq, void *dev_id) dma_status); } - if (mask & GQSPI_ISR_TXNOT_FULL_MASK) { - zynqmp_qspi_filltxfifo(xqspi, GQSPI_TX_FIFO_FILL); - ret = IRQ_HANDLED; - } + if (!mask && !dma_status) + return IRQ_NONE; - if (dma_status & GQSPI_QSPIDMA_DST_I_STS_DONE_MASK) { + if (mask & GQSPI_ISR_TXNOT_FULL_MASK) + zynqmp_qspi_filltxfifo(xqspi, GQSPI_TX_FIFO_FILL); + + if (dma_status & GQSPI_QSPIDMA_DST_I_STS_DONE_MASK) zynqmp_process_dma_irq(xqspi); - ret = IRQ_HANDLED; - } else if (!(mask & GQSPI_IER_RXEMPTY_MASK) && - (mask & GQSPI_IER_GENFIFOEMPTY_MASK)) { + else if (!(mask & GQSPI_IER_RXEMPTY_MASK) && + (mask & GQSPI_IER_GENFIFOEMPTY_MASK)) zynqmp_qspi_readrxfifo(xqspi, GQSPI_RX_FIFO_FILL); - ret = IRQ_HANDLED; - } if (xqspi->bytes_to_receive == 0 && xqspi->bytes_to_transfer == 0 && ((status & GQSPI_IRQ_MASK) == GQSPI_IRQ_MASK)) { zynqmp_gqspi_write(xqspi, GQSPI_IDR_OFST, GQSPI_ISR_IDR_MASK); complete(&xqspi->data_completion); - ret = IRQ_HANDLED; } - return ret; + return IRQ_HANDLED; } /** From f60d34d4a426af4a1447485b94c99c132b58f7e2 Mon Sep 17 00:00:00 2001 From: Isaac Scott Date: Tue, 28 Jan 2025 17:31:43 +0000 Subject: [PATCH 411/529] regulator: ad5398: Add device tree support [ Upstream commit 5a6a461079decea452fdcae955bccecf92e07e97 ] Previously, the ad5398 driver used only platform_data, which is deprecated in favour of device tree. This caused the AD5398 to fail to probe as it could not load its init_data. If the AD5398 has a device tree node, pull the init_data from there using of_get_regulator_init_data. Signed-off-by: Isaac Scott Acked-by: Michael Hennerich Link: https://patch.msgid.link/20250128173143.959600-4-isaac.scott@ideasonboard.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/regulator/ad5398.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/regulator/ad5398.c b/drivers/regulator/ad5398.c index 75f432f61e91..f4d6e62bd963 100644 --- a/drivers/regulator/ad5398.c +++ b/drivers/regulator/ad5398.c @@ -14,6 +14,7 @@ #include #include #include +#include #define AD5398_CURRENT_EN_MASK 0x8000 @@ -221,15 +222,20 @@ static int ad5398_probe(struct i2c_client *client, const struct ad5398_current_data_format *df = (struct ad5398_current_data_format *)id->driver_data; - if (!init_data) - return -EINVAL; - chip = devm_kzalloc(&client->dev, sizeof(*chip), GFP_KERNEL); if (!chip) return -ENOMEM; config.dev = &client->dev; + if (client->dev.of_node) + init_data = of_get_regulator_init_data(&client->dev, + client->dev.of_node, + &ad5398_reg); + if (!init_data) + return -EINVAL; + config.init_data = init_data; + config.of_node = client->dev.of_node; config.driver_data = chip; chip->client = client; From a78b779206b0cd7647a676b3eefec56cb2a03c1e Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Tue, 5 Nov 2024 14:23:26 -0800 Subject: [PATCH 412/529] wifi: ath9k: return by of_get_mac_address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit dfffb317519f88534bb82797f055f0a2fd867e7b ] When using nvmem, ath9k could potentially be loaded before nvmem, which loads after mtd. This is an issue if DT contains an nvmem mac address. If nvmem is not ready in time for ath9k, -EPROBE_DEFER is returned. Pass it to _probe so that ath9k can properly grab a potentially present MAC address. Signed-off-by: Rosen Penev Acked-by: Toke Høiland-Jørgensen Link: https://patch.msgid.link/20241105222326.194417-1-rosenp@gmail.com Signed-off-by: Jeff Johnson Signed-off-by: Sasha Levin --- drivers/net/wireless/ath/ath9k/init.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/ath/ath9k/init.c b/drivers/net/wireless/ath/ath9k/init.c index 4f00400c7ffb..58386906598a 100644 --- a/drivers/net/wireless/ath/ath9k/init.c +++ b/drivers/net/wireless/ath/ath9k/init.c @@ -691,7 +691,9 @@ static int ath9k_of_init(struct ath_softc *sc) ah->ah_flags |= AH_NO_EEP_SWAP; } - of_get_mac_address(np, common->macaddr); + ret = of_get_mac_address(np, common->macaddr); + if (ret == -EPROBE_DEFER) + return ret; return 0; } From 6a934380187357b6bdd4e26251bdcfca16aaf069 Mon Sep 17 00:00:00 2001 From: Simona Vetter Date: Wed, 8 Jan 2025 18:24:16 +0100 Subject: [PATCH 413/529] drm/atomic: clarify the rules around drm_atomic_state->allow_modeset MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit c5e3306a424b52e38ad2c28c7f3399fcd03e383d ] msm is automagically upgrading normal commits to full modesets, and that's a big no-no: - for one this results in full on->off->on transitions on all these crtc, at least if you're using the usual helpers. Which seems to be the case, and is breaking uapi - further even if the ctm change itself would not result in flicker, this can hide modesets for other reasons. Which again breaks the uapi v2: I forgot the case of adding unrelated crtc state. Add that case and link to the existing kerneldoc explainers. This has come up in an irc discussion with Manasi and Ville about intel's bigjoiner mode. Also cc everyone involved in the msm irc discussion, more people joined after I sent out v1. v3: Wording polish from Pekka and Thomas Acked-by: Pekka Paalanen Acked-by: Dmitry Baryshkov Cc: Maarten Lankhorst Cc: Maxime Ripard Cc: Thomas Zimmermann Cc: David Airlie Cc: Daniel Vetter Cc: Pekka Paalanen Cc: Rob Clark Cc: Simon Ser Cc: Manasi Navare Cc: Ville Syrjälä Cc: Abhinav Kumar Cc: Dmitry Baryshkov Signed-off-by: Simona Vetter Signed-off-by: Simona Vetter Link: https://patchwork.freedesktop.org/patch/msgid/20250108172417.160831-1-simona.vetter@ffwll.ch Signed-off-by: Sasha Levin --- include/drm/drm_atomic.h | 23 +++++++++++++++++++++-- 1 file changed, 21 insertions(+), 2 deletions(-) diff --git a/include/drm/drm_atomic.h b/include/drm/drm_atomic.h index 10b1990bc1f6..36225aedf613 100644 --- a/include/drm/drm_atomic.h +++ b/include/drm/drm_atomic.h @@ -372,8 +372,27 @@ struct drm_atomic_state { * * Allow full modeset. This is used by the ATOMIC IOCTL handler to * implement the DRM_MODE_ATOMIC_ALLOW_MODESET flag. Drivers should - * never consult this flag, instead looking at the output of - * drm_atomic_crtc_needs_modeset(). + * generally not consult this flag, but instead look at the output of + * drm_atomic_crtc_needs_modeset(). The detailed rules are: + * + * - Drivers must not consult @allow_modeset in the atomic commit path. + * Use drm_atomic_crtc_needs_modeset() instead. + * + * - Drivers must consult @allow_modeset before adding unrelated struct + * drm_crtc_state to this commit by calling + * drm_atomic_get_crtc_state(). See also the warning in the + * documentation for that function. + * + * - Drivers must never change this flag, it is under the exclusive + * control of userspace. + * + * - Drivers may consult @allow_modeset in the atomic check path, if + * they have the choice between an optimal hardware configuration + * which requires a modeset, and a less optimal configuration which + * can be committed without a modeset. An example would be suboptimal + * scanout FIFO allocation resulting in increased idle power + * consumption. This allows userspace to avoid flickering and delays + * for the normal composition loop at reasonable cost. */ bool allow_modeset : 1; /** From d822a8e3fb8465678c386f8754f0e5d046a9ac3e Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 9 Jan 2025 14:28:53 -0800 Subject: [PATCH 414/529] drm/panel-edp: Add Starry 116KHD024006 [ Upstream commit 749b5b279e5636cdcef51e15d67b77162cca6caa ] We have a few reports of sc7180-trogdor-pompom devices that have a panel in them that IDs as STA 0x0004 and has the following raw EDID: 00 ff ff ff ff ff ff 00 4e 81 04 00 00 00 00 00 10 20 01 04 a5 1a 0e 78 0a dc dd 96 5b 5b 91 28 1f 52 54 00 00 00 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 01 8e 1c 56 a0 50 00 1e 30 28 20 55 00 00 90 10 00 00 18 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fe 00 31 31 36 4b 48 44 30 32 34 30 30 36 0a 00 e6 We've been unable to locate a datasheet for this panel and our partner has not been responsive, but all Starry eDP datasheets that we can find agree on the same timing (delay_100_500_e200) so it should be safe to use that here instead of the super conservative timings. We'll still go a little extra conservative and allow `hpd_absent` of 200 instead of 100 because that won't add any real-world delay in most cases. We'll associate the string from the EDID ("116KHD024006") with this panel. Given that the ID is the suspicious value of 0x0004 it seems likely that Starry doesn't always update their IDs but the string will still work to differentiate if we ever need to in the future. Reviewed-by: Neil Armstrong Signed-off-by: Douglas Anderson Link: https://patchwork.freedesktop.org/patch/msgid/20250109142853.1.Ibcc3009933fd19507cc9c713ad0c99c7a9e4fe17@changeid Signed-off-by: Sasha Levin --- drivers/gpu/drm/panel/panel-edp.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/panel/panel-edp.c b/drivers/gpu/drm/panel/panel-edp.c index 2c14779a39e8..1ef1b4c966d2 100644 --- a/drivers/gpu/drm/panel/panel-edp.c +++ b/drivers/gpu/drm/panel/panel-edp.c @@ -1944,6 +1944,7 @@ static const struct edp_panel_entry edp_panels[] = { EDP_PANEL_ENTRY('S', 'H', 'P', 0x1523, &sharp_lq140m1jw46.delay, "LQ140M1JW46"), EDP_PANEL_ENTRY('S', 'H', 'P', 0x154c, &delay_200_500_p2e100, "LQ116M1JW10"), + EDP_PANEL_ENTRY('S', 'T', 'A', 0x0004, &delay_200_500_e200, "116KHD024006"), EDP_PANEL_ENTRY('S', 'T', 'A', 0x0100, &delay_100_500_e200, "2081116HHD028001-51D"), { /* sentinal */ } From ffb55ddf26850a8af666d9c04086ad9b34b6a8a4 Mon Sep 17 00:00:00 2001 From: Jessica Zhang Date: Mon, 16 Dec 2024 16:43:14 -0800 Subject: [PATCH 415/529] drm: Add valid clones check [ Upstream commit 41b4b11da02157c7474caf41d56baae0e941d01a ] Check that all encoders attached to a given CRTC are valid possible_clones of each other. Signed-off-by: Jessica Zhang Reviewed-by: Maxime Ripard Link: https://patchwork.freedesktop.org/patch/msgid/20241216-concurrent-wb-v4-3-fe220297a7f0@quicinc.com Signed-off-by: Dmitry Baryshkov Signed-off-by: Sasha Levin --- drivers/gpu/drm/drm_atomic_helper.c | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/drivers/gpu/drm/drm_atomic_helper.c b/drivers/gpu/drm/drm_atomic_helper.c index 66d223c2d9ab..e737e45a3a70 100644 --- a/drivers/gpu/drm/drm_atomic_helper.c +++ b/drivers/gpu/drm/drm_atomic_helper.c @@ -573,6 +573,30 @@ mode_valid(struct drm_atomic_state *state) return 0; } +static int drm_atomic_check_valid_clones(struct drm_atomic_state *state, + struct drm_crtc *crtc) +{ + struct drm_encoder *drm_enc; + struct drm_crtc_state *crtc_state = drm_atomic_get_new_crtc_state(state, + crtc); + + drm_for_each_encoder_mask(drm_enc, crtc->dev, crtc_state->encoder_mask) { + if (!drm_enc->possible_clones) { + DRM_DEBUG("enc%d possible_clones is 0\n", drm_enc->base.id); + continue; + } + + if ((crtc_state->encoder_mask & drm_enc->possible_clones) != + crtc_state->encoder_mask) { + DRM_DEBUG("crtc%d failed valid clone check for mask 0x%x\n", + crtc->base.id, crtc_state->encoder_mask); + return -EINVAL; + } + } + + return 0; +} + /** * drm_atomic_helper_check_modeset - validate state object for modeset changes * @dev: DRM device @@ -744,6 +768,10 @@ drm_atomic_helper_check_modeset(struct drm_device *dev, ret = drm_atomic_add_affected_planes(state, crtc); if (ret != 0) return ret; + + ret = drm_atomic_check_valid_clones(state, crtc); + if (ret != 0) + return ret; } /* From f4ae54bffbfd787a7123ad002959c5fc7fbe759b Mon Sep 17 00:00:00 2001 From: Chenyuan Yang Date: Sun, 6 Apr 2025 16:08:54 -0500 Subject: [PATCH 416/529] ASoC: imx-card: Adjust over allocation of memory in imx_card_parse_of() [ Upstream commit a9a69c3b38c89d7992fb53db4abb19104b531d32 ] Incorrect types are used as sizeof() arguments in devm_kcalloc(). It should be sizeof(dai_link_data) for link_data instead of sizeof(snd_soc_dai_link). This is found by our static analysis tool. Signed-off-by: Chenyuan Yang Link: https://patch.msgid.link/20250406210854.149316-1-chenyuan0y@gmail.com Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/fsl/imx-card.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/fsl/imx-card.c b/sound/soc/fsl/imx-card.c index c6d55b21f949..11430f9f4996 100644 --- a/sound/soc/fsl/imx-card.c +++ b/sound/soc/fsl/imx-card.c @@ -517,7 +517,7 @@ static int imx_card_parse_of(struct imx_card_data *data) if (!card->dai_link) return -ENOMEM; - data->link_data = devm_kcalloc(dev, num_links, sizeof(*link), GFP_KERNEL); + data->link_data = devm_kcalloc(dev, num_links, sizeof(*link_data), GFP_KERNEL); if (!data->link_data) return -ENOMEM; From 181438633a56c820ba242e4dd89f2de0fa4f464e Mon Sep 17 00:00:00 2001 From: Martin Blumenstingl Date: Sat, 29 Mar 2025 20:01:32 +0100 Subject: [PATCH 417/529] pinctrl: meson: define the pull up/down resistor value as 60 kOhm [ Upstream commit e56088a13708757da68ad035269d69b93ac8c389 ] The public datasheets of the following Amlogic SoCs describe a typical resistor value for the built-in pull up/down resistor: - Meson8/8b/8m2: not documented - GXBB (S905): 60 kOhm - GXL (S905X): 60 kOhm - GXM (S912): 60 kOhm - G12B (S922X): 60 kOhm - SM1 (S905D3): 60 kOhm The public G12B and SM1 datasheets additionally state min and max values: - min value: 50 kOhm for both, pull-up and pull-down - max value for the pull-up: 70 kOhm - max value for the pull-down: 130 kOhm Use 60 kOhm in the pinctrl-meson driver as well so it's shown in the debugfs output. It may not be accurate for Meson8/8b/8m2 but in reality 60 kOhm is closer to the actual value than 1 Ohm. Signed-off-by: Martin Blumenstingl Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/20250329190132.855196-1-martin.blumenstingl@googlemail.com Signed-off-by: Linus Walleij Signed-off-by: Sasha Levin --- drivers/pinctrl/meson/pinctrl-meson.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/meson/pinctrl-meson.c b/drivers/pinctrl/meson/pinctrl-meson.c index 530f3f934e19..1f05f7f1a9ae 100644 --- a/drivers/pinctrl/meson/pinctrl-meson.c +++ b/drivers/pinctrl/meson/pinctrl-meson.c @@ -487,7 +487,7 @@ static int meson_pinconf_get(struct pinctrl_dev *pcdev, unsigned int pin, case PIN_CONFIG_BIAS_PULL_DOWN: case PIN_CONFIG_BIAS_PULL_UP: if (meson_pinconf_get_pull(pc, pin) == param) - arg = 1; + arg = 60000; else return -EINVAL; break; From 9fce40ac35f00546a41b9537c039185fe87d1f15 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 20 Apr 2025 10:56:59 +0200 Subject: [PATCH 418/529] ASoC: Intel: bytcr_rt5640: Add DMI quirk for Acer Aspire SW3-013 [ Upstream commit a549b927ea3f5e50b1394209b64e6e17e31d4db8 ] Acer Aspire SW3-013 requires the very same quirk as other Acer Aspire model for making it working. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220011 Signed-off-by: Takashi Iwai Link: https://patch.msgid.link/20250420085716.12095-1-tiwai@suse.de Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- sound/soc/intel/boards/bytcr_rt5640.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/sound/soc/intel/boards/bytcr_rt5640.c b/sound/soc/intel/boards/bytcr_rt5640.c index 67b343632a10..b00a9fdd7a9c 100644 --- a/sound/soc/intel/boards/bytcr_rt5640.c +++ b/sound/soc/intel/boards/bytcr_rt5640.c @@ -576,6 +576,19 @@ static const struct dmi_system_id byt_rt5640_quirk_table[] = { BYT_RT5640_SSP0_AIF2 | BYT_RT5640_MCLK_EN), }, + { /* Acer Aspire SW3-013 */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Acer"), + DMI_MATCH(DMI_PRODUCT_NAME, "Aspire SW3-013"), + }, + .driver_data = (void *)(BYT_RT5640_DMIC1_MAP | + BYT_RT5640_JD_SRC_JD2_IN4N | + BYT_RT5640_OVCD_TH_2000UA | + BYT_RT5640_OVCD_SF_0P75 | + BYT_RT5640_DIFF_MIC | + BYT_RT5640_SSP0_AIF1 | + BYT_RT5640_MCLK_EN), + }, { .matches = { DMI_MATCH(DMI_SYS_VENDOR, "Acer"), From e104460e8a267e962b6adf512736071576ed431f Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sun, 27 Apr 2025 10:10:34 +0200 Subject: [PATCH 419/529] ALSA: hda/realtek: Add quirk for HP Spectre x360 15-df1xxx [ Upstream commit be0c40da888840fe91b45474cb70779e6cbaf7ca ] HP Spectre x360 15-df1xxx with SSID 13c:863e requires similar workarounds that were applied to another HP Spectre x360 models; it has a mute LED only, no micmute LEDs, and needs the speaker GPIO seup. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220054 Link: https://patch.msgid.link/20250427081035.11567-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/pci/hda/patch_realtek.c | 42 +++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 2f67cd955d65..682ae18e211c 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6787,6 +6787,41 @@ static void alc285_fixup_hp_spectre_x360_eb1(struct hda_codec *codec, } } +/* GPIO1 = amplifier on/off */ +static void alc285_fixup_hp_spectre_x360_df1(struct hda_codec *codec, + const struct hda_fixup *fix, + int action) +{ + struct alc_spec *spec = codec->spec; + static const hda_nid_t conn[] = { 0x02 }; + static const struct hda_pintbl pincfgs[] = { + { 0x14, 0x90170110 }, /* front/high speakers */ + { 0x17, 0x90170130 }, /* back/bass speakers */ + { } + }; + + // enable mute led + alc285_fixup_hp_mute_led_coefbit(codec, fix, action); + + switch (action) { + case HDA_FIXUP_ACT_PRE_PROBE: + /* needed for amp of back speakers */ + spec->gpio_mask |= 0x01; + spec->gpio_dir |= 0x01; + snd_hda_apply_pincfgs(codec, pincfgs); + /* share DAC to have unified volume control */ + snd_hda_override_conn_list(codec, 0x14, ARRAY_SIZE(conn), conn); + snd_hda_override_conn_list(codec, 0x17, ARRAY_SIZE(conn), conn); + break; + case HDA_FIXUP_ACT_INIT: + /* need to toggle GPIO to enable the amp of back speakers */ + alc_update_gpio_data(codec, 0x01, true); + msleep(100); + alc_update_gpio_data(codec, 0x01, false); + break; + } +} + static void alc285_fixup_hp_spectre_x360(struct hda_codec *codec, const struct hda_fixup *fix, int action) { @@ -7326,6 +7361,7 @@ enum { ALC280_FIXUP_HP_9480M, ALC245_FIXUP_HP_X360_AMP, ALC285_FIXUP_HP_SPECTRE_X360_EB1, + ALC285_FIXUP_HP_SPECTRE_X360_DF1, ALC285_FIXUP_HP_ENVY_X360, ALC288_FIXUP_DELL_HEADSET_MODE, ALC288_FIXUP_DELL1_MIC_NO_PRESENCE, @@ -9322,6 +9358,10 @@ static const struct hda_fixup alc269_fixups[] = { .type = HDA_FIXUP_FUNC, .v.func = alc285_fixup_hp_spectre_x360_eb1 }, + [ALC285_FIXUP_HP_SPECTRE_X360_DF1] = { + .type = HDA_FIXUP_FUNC, + .v.func = alc285_fixup_hp_spectre_x360_df1 + }, [ALC285_FIXUP_HP_ENVY_X360] = { .type = HDA_FIXUP_FUNC, .v.func = alc285_fixup_hp_envy_x360, @@ -9882,6 +9922,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x86c1, "HP Laptop 15-da3001TU", ALC236_FIXUP_HP_MUTE_LED_COEFBIT2), SND_PCI_QUIRK(0x103c, 0x86c7, "HP Envy AiO 32", ALC274_FIXUP_HP_ENVY_GPIO), SND_PCI_QUIRK(0x103c, 0x86e7, "HP Spectre x360 15-eb0xxx", ALC285_FIXUP_HP_SPECTRE_X360_EB1), + SND_PCI_QUIRK(0x103c, 0x863e, "HP Spectre x360 15-df1xxx", ALC285_FIXUP_HP_SPECTRE_X360_DF1), SND_PCI_QUIRK(0x103c, 0x86e8, "HP Spectre x360 15-eb0xxx", ALC285_FIXUP_HP_SPECTRE_X360_EB1), SND_PCI_QUIRK(0x103c, 0x86f9, "HP Spectre x360 13-aw0xxx", ALC285_FIXUP_HP_SPECTRE_X360_MUTE_LED), SND_PCI_QUIRK(0x103c, 0x8716, "HP Elite Dragonfly G2 Notebook PC", ALC285_FIXUP_HP_GPIO_AMP_INIT), @@ -10591,6 +10632,7 @@ static const struct hda_model_fixup alc269_fixup_models[] = { {.id = ALC295_FIXUP_HP_OMEN, .name = "alc295-hp-omen"}, {.id = ALC285_FIXUP_HP_SPECTRE_X360, .name = "alc285-hp-spectre-x360"}, {.id = ALC285_FIXUP_HP_SPECTRE_X360_EB1, .name = "alc285-hp-spectre-x360-eb1"}, + {.id = ALC285_FIXUP_HP_SPECTRE_X360_DF1, .name = "alc285-hp-spectre-x360-df1"}, {.id = ALC285_FIXUP_HP_ENVY_X360, .name = "alc285-hp-envy-x360"}, {.id = ALC287_FIXUP_IDEAPAD_BASS_SPK_AMP, .name = "alc287-ideapad-bass-spk-amp"}, {.id = ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN, .name = "alc287-yoga9-bass-spk-pin"}, From c240375587ddcc80e1022f52ee32b946bbc3a639 Mon Sep 17 00:00:00 2001 From: Alistair Francis Date: Wed, 23 Apr 2025 16:06:21 +1000 Subject: [PATCH 420/529] nvmet-tcp: don't restore null sk_state_change [ Upstream commit 46d22b47df2741996af277a2838b95f130436c13 ] queue->state_change is set as part of nvmet_tcp_set_queue_sock(), but if the TCP connection isn't established when nvmet_tcp_set_queue_sock() is called then queue->state_change isn't set and sock->sk->sk_state_change isn't replaced. As such we don't need to restore sock->sk->sk_state_change if queue->state_change is NULL. This avoids NULL pointer dereferences such as this: [ 286.462026][ C0] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 286.462814][ C0] #PF: supervisor instruction fetch in kernel mode [ 286.463796][ C0] #PF: error_code(0x0010) - not-present page [ 286.464392][ C0] PGD 8000000140620067 P4D 8000000140620067 PUD 114201067 PMD 0 [ 286.465086][ C0] Oops: Oops: 0010 [#1] SMP KASAN PTI [ 286.465559][ C0] CPU: 0 UID: 0 PID: 1628 Comm: nvme Not tainted 6.15.0-rc2+ #11 PREEMPT(voluntary) [ 286.466393][ C0] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014 [ 286.467147][ C0] RIP: 0010:0x0 [ 286.467420][ C0] Code: Unable to access opcode bytes at 0xffffffffffffffd6. [ 286.467977][ C0] RSP: 0018:ffff8883ae008580 EFLAGS: 00010246 [ 286.468425][ C0] RAX: 0000000000000000 RBX: ffff88813fd34100 RCX: ffffffffa386cc43 [ 286.469019][ C0] RDX: 1ffff11027fa68b6 RSI: 0000000000000008 RDI: ffff88813fd34100 [ 286.469545][ C0] RBP: ffff88813fd34160 R08: 0000000000000000 R09: ffffed1027fa682c [ 286.470072][ C0] R10: ffff88813fd34167 R11: 0000000000000000 R12: ffff88813fd344c3 [ 286.470585][ C0] R13: ffff88813fd34112 R14: ffff88813fd34aec R15: ffff888132cdd268 [ 286.471070][ C0] FS: 00007fe3c04c7d80(0000) GS:ffff88840743f000(0000) knlGS:0000000000000000 [ 286.471644][ C0] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 286.472543][ C0] CR2: ffffffffffffffd6 CR3: 000000012daca000 CR4: 00000000000006f0 [ 286.473500][ C0] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 286.474467][ C0] DR3: 0000000000000000 DR6: 00000000ffff07f0 DR7: 0000000000000400 [ 286.475453][ C0] Call Trace: [ 286.476102][ C0] [ 286.476719][ C0] tcp_fin+0x2bb/0x440 [ 286.477429][ C0] tcp_data_queue+0x190f/0x4e60 [ 286.478174][ C0] ? __build_skb_around+0x234/0x330 [ 286.478940][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.479659][ C0] ? __pfx_tcp_data_queue+0x10/0x10 [ 286.480431][ C0] ? tcp_try_undo_loss+0x640/0x6c0 [ 286.481196][ C0] ? seqcount_lockdep_reader_access.constprop.0+0x82/0x90 [ 286.482046][ C0] ? kvm_clock_get_cycles+0x14/0x30 [ 286.482769][ C0] ? ktime_get+0x66/0x150 [ 286.483433][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.484146][ C0] tcp_rcv_established+0x6e4/0x2050 [ 286.484857][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.485523][ C0] ? ipv4_dst_check+0x160/0x2b0 [ 286.486203][ C0] ? __pfx_tcp_rcv_established+0x10/0x10 [ 286.486917][ C0] ? lock_release+0x217/0x2c0 [ 286.487595][ C0] tcp_v4_do_rcv+0x4d6/0x9b0 [ 286.488279][ C0] tcp_v4_rcv+0x2af8/0x3e30 [ 286.488904][ C0] ? raw_local_deliver+0x51b/0xad0 [ 286.489551][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.490198][ C0] ? __pfx_tcp_v4_rcv+0x10/0x10 [ 286.490813][ C0] ? __pfx_raw_local_deliver+0x10/0x10 [ 286.491487][ C0] ? __pfx_nf_confirm+0x10/0x10 [nf_conntrack] [ 286.492275][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.492900][ C0] ip_protocol_deliver_rcu+0x8f/0x370 [ 286.493579][ C0] ip_local_deliver_finish+0x297/0x420 [ 286.494268][ C0] ip_local_deliver+0x168/0x430 [ 286.494867][ C0] ? __pfx_ip_local_deliver+0x10/0x10 [ 286.495498][ C0] ? __pfx_ip_local_deliver_finish+0x10/0x10 [ 286.496204][ C0] ? ip_rcv_finish_core+0x19a/0x1f20 [ 286.496806][ C0] ? lock_release+0x217/0x2c0 [ 286.497414][ C0] ip_rcv+0x455/0x6e0 [ 286.497945][ C0] ? __pfx_ip_rcv+0x10/0x10 [ 286.498550][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.499137][ C0] ? __pfx_ip_rcv_finish+0x10/0x10 [ 286.499763][ C0] ? lock_release+0x217/0x2c0 [ 286.500327][ C0] ? dl_scaled_delta_exec+0xd1/0x2c0 [ 286.500922][ C0] ? __pfx_ip_rcv+0x10/0x10 [ 286.501480][ C0] __netif_receive_skb_one_core+0x166/0x1b0 [ 286.502173][ C0] ? __pfx___netif_receive_skb_one_core+0x10/0x10 [ 286.502903][ C0] ? lock_acquire+0x2b2/0x310 [ 286.503487][ C0] ? process_backlog+0x372/0x1350 [ 286.504087][ C0] ? lock_release+0x217/0x2c0 [ 286.504642][ C0] process_backlog+0x3b9/0x1350 [ 286.505214][ C0] ? process_backlog+0x372/0x1350 [ 286.505779][ C0] __napi_poll.constprop.0+0xa6/0x490 [ 286.506363][ C0] net_rx_action+0x92e/0xe10 [ 286.506889][ C0] ? __pfx_net_rx_action+0x10/0x10 [ 286.507437][ C0] ? timerqueue_add+0x1f0/0x320 [ 286.507977][ C0] ? sched_clock_cpu+0x68/0x540 [ 286.508492][ C0] ? lock_acquire+0x2b2/0x310 [ 286.509043][ C0] ? kvm_sched_clock_read+0xd/0x20 [ 286.509607][ C0] ? handle_softirqs+0x1aa/0x7d0 [ 286.510187][ C0] handle_softirqs+0x1f2/0x7d0 [ 286.510754][ C0] ? __pfx_handle_softirqs+0x10/0x10 [ 286.511348][ C0] ? irqtime_account_irq+0x181/0x290 [ 286.511937][ C0] ? __dev_queue_xmit+0x85d/0x3450 [ 286.512510][ C0] do_softirq.part.0+0x89/0xc0 [ 286.513100][ C0] [ 286.513548][ C0] [ 286.513953][ C0] __local_bh_enable_ip+0x112/0x140 [ 286.514522][ C0] ? __dev_queue_xmit+0x85d/0x3450 [ 286.515072][ C0] __dev_queue_xmit+0x872/0x3450 [ 286.515619][ C0] ? nft_do_chain+0xe16/0x15b0 [nf_tables] [ 286.516252][ C0] ? __pfx___dev_queue_xmit+0x10/0x10 [ 286.516817][ C0] ? selinux_ip_postroute+0x43c/0xc50 [ 286.517433][ C0] ? __pfx_selinux_ip_postroute+0x10/0x10 [ 286.518061][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.518606][ C0] ? ip_output+0x164/0x4a0 [ 286.519149][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.519671][ C0] ? ip_finish_output2+0x17d5/0x1fb0 [ 286.520258][ C0] ip_finish_output2+0xb4b/0x1fb0 [ 286.520787][ C0] ? __pfx_ip_finish_output2+0x10/0x10 [ 286.521355][ C0] ? __ip_finish_output+0x15d/0x750 [ 286.521890][ C0] ip_output+0x164/0x4a0 [ 286.522372][ C0] ? __pfx_ip_output+0x10/0x10 [ 286.522872][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.523402][ C0] ? _raw_spin_unlock_irqrestore+0x4c/0x60 [ 286.524031][ C0] ? __pfx_ip_finish_output+0x10/0x10 [ 286.524605][ C0] ? __ip_queue_xmit+0x999/0x2260 [ 286.525200][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.525744][ C0] ? ipv4_dst_check+0x16a/0x2b0 [ 286.526279][ C0] ? lock_release+0x217/0x2c0 [ 286.526793][ C0] __ip_queue_xmit+0x1883/0x2260 [ 286.527324][ C0] ? __skb_clone+0x54c/0x730 [ 286.527827][ C0] __tcp_transmit_skb+0x209b/0x37a0 [ 286.528374][ C0] ? __pfx___tcp_transmit_skb+0x10/0x10 [ 286.528952][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.529472][ C0] ? seqcount_lockdep_reader_access.constprop.0+0x82/0x90 [ 286.530152][ C0] ? trace_hardirqs_on+0x12/0x120 [ 286.530691][ C0] tcp_write_xmit+0xb81/0x88b0 [ 286.531224][ C0] ? mod_memcg_state+0x4d/0x60 [ 286.531736][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.532253][ C0] __tcp_push_pending_frames+0x90/0x320 [ 286.532826][ C0] tcp_send_fin+0x141/0xb50 [ 286.533352][ C0] ? __pfx_tcp_send_fin+0x10/0x10 [ 286.533908][ C0] ? __local_bh_enable_ip+0xab/0x140 [ 286.534495][ C0] inet_shutdown+0x243/0x320 [ 286.535077][ C0] nvme_tcp_alloc_queue+0xb3b/0x2590 [nvme_tcp] [ 286.535709][ C0] ? do_raw_spin_lock+0x129/0x260 [ 286.536314][ C0] ? __pfx_nvme_tcp_alloc_queue+0x10/0x10 [nvme_tcp] [ 286.536996][ C0] ? do_raw_spin_unlock+0x54/0x1e0 [ 286.537550][ C0] ? _raw_spin_unlock+0x29/0x50 [ 286.538127][ C0] ? do_raw_spin_lock+0x129/0x260 [ 286.538664][ C0] ? __pfx_do_raw_spin_lock+0x10/0x10 [ 286.539249][ C0] ? nvme_tcp_alloc_admin_queue+0xd5/0x340 [nvme_tcp] [ 286.539892][ C0] ? __wake_up+0x40/0x60 [ 286.540392][ C0] nvme_tcp_alloc_admin_queue+0xd5/0x340 [nvme_tcp] [ 286.541047][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.541589][ C0] nvme_tcp_setup_ctrl+0x8b/0x7a0 [nvme_tcp] [ 286.542254][ C0] ? _raw_spin_unlock_irqrestore+0x4c/0x60 [ 286.542887][ C0] ? __pfx_nvme_tcp_setup_ctrl+0x10/0x10 [nvme_tcp] [ 286.543568][ C0] ? trace_hardirqs_on+0x12/0x120 [ 286.544166][ C0] ? _raw_spin_unlock_irqrestore+0x35/0x60 [ 286.544792][ C0] ? nvme_change_ctrl_state+0x196/0x2e0 [nvme_core] [ 286.545477][ C0] nvme_tcp_create_ctrl+0x839/0xb90 [nvme_tcp] [ 286.546126][ C0] nvmf_dev_write+0x3db/0x7e0 [nvme_fabrics] [ 286.546775][ C0] ? rw_verify_area+0x69/0x520 [ 286.547334][ C0] vfs_write+0x218/0xe90 [ 286.547854][ C0] ? do_syscall_64+0x9f/0x190 [ 286.548408][ C0] ? trace_hardirqs_on_prepare+0xdb/0x120 [ 286.549037][ C0] ? syscall_exit_to_user_mode+0x93/0x280 [ 286.549659][ C0] ? __pfx_vfs_write+0x10/0x10 [ 286.550259][ C0] ? do_syscall_64+0x9f/0x190 [ 286.550840][ C0] ? syscall_exit_to_user_mode+0x8e/0x280 [ 286.551516][ C0] ? trace_hardirqs_on_prepare+0xdb/0x120 [ 286.552180][ C0] ? syscall_exit_to_user_mode+0x93/0x280 [ 286.552834][ C0] ? ksys_read+0xf5/0x1c0 [ 286.553386][ C0] ? __pfx_ksys_read+0x10/0x10 [ 286.553964][ C0] ksys_write+0xf5/0x1c0 [ 286.554499][ C0] ? __pfx_ksys_write+0x10/0x10 [ 286.555072][ C0] ? trace_hardirqs_on_prepare+0xdb/0x120 [ 286.555698][ C0] ? syscall_exit_to_user_mode+0x93/0x280 [ 286.556319][ C0] ? do_syscall_64+0x54/0x190 [ 286.556866][ C0] do_syscall_64+0x93/0x190 [ 286.557420][ C0] ? rcu_read_unlock+0x17/0x60 [ 286.557986][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.558526][ C0] ? lock_release+0x217/0x2c0 [ 286.559087][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.559659][ C0] ? count_memcg_events.constprop.0+0x4a/0x60 [ 286.560476][ C0] ? exc_page_fault+0x7a/0x110 [ 286.561064][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.561647][ C0] ? lock_release+0x217/0x2c0 [ 286.562257][ C0] ? do_user_addr_fault+0x171/0xa00 [ 286.562839][ C0] ? do_user_addr_fault+0x4a2/0xa00 [ 286.563453][ C0] ? irqentry_exit_to_user_mode+0x84/0x270 [ 286.564112][ C0] ? rcu_is_watching+0x11/0xb0 [ 286.564677][ C0] ? irqentry_exit_to_user_mode+0x84/0x270 [ 286.565317][ C0] ? trace_hardirqs_on_prepare+0xdb/0x120 [ 286.565922][ C0] entry_SYSCALL_64_after_hwframe+0x76/0x7e [ 286.566542][ C0] RIP: 0033:0x7fe3c05e6504 [ 286.567102][ C0] Code: c7 00 16 00 00 00 b8 ff ff ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 80 3d c5 8b 10 00 00 74 13 b8 01 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 54 c3 0f 1f 00 55 48 89 e5 48 83 ec 20 48 89 [ 286.568931][ C0] RSP: 002b:00007fff76444f58 EFLAGS: 00000202 ORIG_RAX: 0000000000000001 [ 286.569807][ C0] RAX: ffffffffffffffda RBX: 000000003b40d930 RCX: 00007fe3c05e6504 [ 286.570621][ C0] RDX: 00000000000000cf RSI: 000000003b40d930 RDI: 0000000000000003 [ 286.571443][ C0] RBP: 0000000000000003 R08: 00000000000000cf R09: 000000003b40d930 [ 286.572246][ C0] R10: 0000000000000000 R11: 0000000000000202 R12: 000000003b40cd60 [ 286.573069][ C0] R13: 00000000000000cf R14: 00007fe3c07417f8 R15: 00007fe3c073502e [ 286.573886][ C0] Closes: https://lore.kernel.org/linux-nvme/5hdonndzoqa265oq3bj6iarwtfk5dewxxjtbjvn5uqnwclpwt6@a2n6w3taxxex/ Signed-off-by: Alistair Francis Reviewed-by: Sagi Grimberg Tested-by: Shin'ichiro Kawasaki Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/target/tcp.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index 125e22bd34e2..eee052dbf80c 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -1417,6 +1417,9 @@ static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) { struct socket *sock = queue->sock; + if (!queue->state_change) + return; + write_lock_bh(&sock->sk->sk_callback_lock); sock->sk->sk_data_ready = queue->data_ready; sock->sk->sk_state_change = queue->state_change; From 7c0394dbbd58998872fae303efcfa943ff1fa96e Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 30 Apr 2025 07:17:17 -0600 Subject: [PATCH 421/529] io_uring/fdinfo: annotate racy sq/cq head/tail reads [ Upstream commit f024d3a8ded0d8d2129ae123d7a5305c29ca44ce ] syzbot complains about the cached sq head read, and it's totally right. But we don't need to care, it's just reading fdinfo, and reading the CQ or SQ tail/head entries are known racy in that they are just a view into that very instant and may of course be outdated by the time they are reported. Annotate both the SQ head and CQ tail read with data_race() to avoid this syzbot complaint. Link: https://lore.kernel.org/io-uring/6811f6dc.050a0220.39e3a1.0d0e.GAE@google.com/ Reported-by: syzbot+3e77fd302e99f5af9394@syzkaller.appspotmail.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- io_uring/fdinfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/io_uring/fdinfo.c b/io_uring/fdinfo.c index ea2c2ded4e41..4a5053169977 100644 --- a/io_uring/fdinfo.c +++ b/io_uring/fdinfo.c @@ -79,11 +79,11 @@ static __cold void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, seq_printf(m, "SqMask:\t0x%x\n", sq_mask); seq_printf(m, "SqHead:\t%u\n", sq_head); seq_printf(m, "SqTail:\t%u\n", sq_tail); - seq_printf(m, "CachedSqHead:\t%u\n", ctx->cached_sq_head); + seq_printf(m, "CachedSqHead:\t%u\n", data_race(ctx->cached_sq_head)); seq_printf(m, "CqMask:\t0x%x\n", cq_mask); seq_printf(m, "CqHead:\t%u\n", cq_head); seq_printf(m, "CqTail:\t%u\n", cq_tail); - seq_printf(m, "CachedCqTail:\t%u\n", ctx->cached_cq_tail); + seq_printf(m, "CachedCqTail:\t%u\n", data_race(ctx->cached_cq_tail)); seq_printf(m, "SQEs:\t%u\n", sq_tail - sq_head); sq_entries = min(sq_tail - sq_head, ctx->sq_entries); for (i = 0; i < sq_entries; i++) { From 0528bba48dce7820d2da72e1a114e1c4552367eb Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Fri, 25 Apr 2025 09:25:06 -0400 Subject: [PATCH 422/529] btrfs: correct the order of prelim_ref arguments in btrfs__prelim_ref [ Upstream commit bc7e0975093567f51be8e1bdf4aa5900a3cf0b1e ] btrfs_prelim_ref() calls the old and new reference variables in the incorrect order. This causes a NULL pointer dereference because oldref is passed as NULL to trace_btrfs_prelim_ref_insert(). Note, trace_btrfs_prelim_ref_insert() is being called with newref as oldref (and oldref as NULL) on purpose in order to print out the values of newref. To reproduce: echo 1 > /sys/kernel/debug/tracing/events/btrfs/btrfs_prelim_ref_insert/enable Perform some writeback operations. Backtrace: BUG: kernel NULL pointer dereference, address: 0000000000000018 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page PGD 115949067 P4D 115949067 PUD 11594a067 PMD 0 Oops: Oops: 0000 [#1] SMP NOPTI CPU: 1 UID: 0 PID: 1188 Comm: fsstress Not tainted 6.15.0-rc2-tester+ #47 PREEMPT(voluntary) 7ca2cef72d5e9c600f0c7718adb6462de8149622 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.3-2-gc13ff2cd-prebuilt.qemu.org 04/01/2014 RIP: 0010:trace_event_raw_event_btrfs__prelim_ref+0x72/0x130 Code: e8 43 81 9f ff 48 85 c0 74 78 4d 85 e4 0f 84 8f 00 00 00 49 8b 94 24 c0 06 00 00 48 8b 0a 48 89 48 08 48 8b 52 08 48 89 50 10 <49> 8b 55 18 48 89 50 18 49 8b 55 20 48 89 50 20 41 0f b6 55 28 88 RSP: 0018:ffffce44820077a0 EFLAGS: 00010286 RAX: ffff8c6b403f9014 RBX: ffff8c6b55825730 RCX: 304994edf9cf506b RDX: d8b11eb7f0fdb699 RSI: ffff8c6b403f9010 RDI: ffff8c6b403f9010 RBP: 0000000000000001 R08: 0000000000000001 R09: 0000000000000010 R10: 00000000ffffffff R11: 0000000000000000 R12: ffff8c6b4e8fb000 R13: 0000000000000000 R14: ffffce44820077a8 R15: ffff8c6b4abd1540 FS: 00007f4dc6813740(0000) GS:ffff8c6c1d378000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000018 CR3: 000000010eb42000 CR4: 0000000000750ef0 PKRU: 55555554 Call Trace: prelim_ref_insert+0x1c1/0x270 find_parent_nodes+0x12a6/0x1ee0 ? __entry_text_end+0x101f06/0x101f09 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 ? srso_alias_return_thunk+0x5/0xfbef5 btrfs_is_data_extent_shared+0x167/0x640 ? fiemap_process_hole+0xd0/0x2c0 extent_fiemap+0xa5c/0xbc0 ? __entry_text_end+0x101f05/0x101f09 btrfs_fiemap+0x7e/0xd0 do_vfs_ioctl+0x425/0x9d0 __x64_sys_ioctl+0x75/0xc0 Signed-off-by: Goldwyn Rodrigues Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- include/trace/events/btrfs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 7a6c5a870d33..31847ccae493 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1847,7 +1847,7 @@ DECLARE_EVENT_CLASS(btrfs__prelim_ref, TP_PROTO(const struct btrfs_fs_info *fs_info, const struct prelim_ref *oldref, const struct prelim_ref *newref, u64 tree_size), - TP_ARGS(fs_info, newref, oldref, tree_size), + TP_ARGS(fs_info, oldref, newref, tree_size), TP_STRUCT__entry_btrfs( __field( u64, root_id ) From 390940c7f3952547762c869945727c3f31fa6ccc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 6 May 2025 21:42:59 +0200 Subject: [PATCH 423/529] wifi: iwlwifi: add support for Killer on MTL [ Upstream commit ebedf8b7f05b9c886d68d63025db8d1b12343157 ] For now, we need another entry for these devices, this will be changed completely for 6.16. Closes: https://bugzilla.kernel.org/show_bug.cgi?id=219926 Link: https://patch.msgid.link/20250506214258.2efbdc9e9a82.I31915ec252bd1c74bd53b89a0e214e42a74b6f2e@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/pcie/drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 7f30e6add993..39ac9d81d10d 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -552,6 +552,8 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_DEV_INFO(0x7A70, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), IWL_DEV_INFO(0x7AF0, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name), IWL_DEV_INFO(0x7AF0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), + IWL_DEV_INFO(0x7F70, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name), + IWL_DEV_INFO(0x7F70, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name), IWL_DEV_INFO(0x271C, 0x0214, iwl9260_2ac_cfg, iwl9260_1_name), IWL_DEV_INFO(0x7E40, 0x1691, iwl_cfg_ma_a0_gf4_a0, iwl_ax411_killer_1690s_name), From a58442f75be49c2be28a6eba387f5dbb1b11395c Mon Sep 17 00:00:00 2001 From: Jason Andryuk Date: Tue, 6 May 2025 16:44:56 -0400 Subject: [PATCH 424/529] xenbus: Allow PVH dom0 a non-local xenstore [ Upstream commit 90989869baae47ee2aa3bcb6f6eb9fbbe4287958 ] Make xenbus_init() allow a non-local xenstore for a PVH dom0 - it is currently forced to XS_LOCAL. With Hyperlaunch booting dom0 and a xenstore stubdom, dom0 can be handled as a regular XS_HVM following the late init path. Ideally we'd drop the use of xen_initial_domain() and just check for the event channel instead. However, ARM has a xen,enhanced no-xenstore mode, where the event channel and PFN would both be 0. Retain the xen_initial_domain() check, and use that for an additional check when the event channel is 0. Check the full 64bit HVM_PARAM_STORE_EVTCHN value to catch the off chance that high bits are set for the 32bit event channel. Signed-off-by: Jason Andryuk Change-Id: I5506da42e4c6b8e85079fefb2f193c8de17c7437 Reviewed-by: Stefano Stabellini Signed-off-by: Juergen Gross Message-ID: <20250506204456.5220-1-jason.andryuk@amd.com> Signed-off-by: Sasha Levin --- drivers/xen/xenbus/xenbus_probe.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 25164d56c9d9..d3b6908110c6 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -966,9 +966,15 @@ static int __init xenbus_init(void) if (xen_pv_domain()) xen_store_domain_type = XS_PV; if (xen_hvm_domain()) + { xen_store_domain_type = XS_HVM; - if (xen_hvm_domain() && xen_initial_domain()) - xen_store_domain_type = XS_LOCAL; + err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); + if (err) + goto out_error; + xen_store_evtchn = (int)v; + if (!v && xen_initial_domain()) + xen_store_domain_type = XS_LOCAL; + } if (xen_pv_domain() && !xen_start_info->store_evtchn) xen_store_domain_type = XS_LOCAL; if (xen_pv_domain() && xen_start_info->store_evtchn) @@ -987,10 +993,6 @@ static int __init xenbus_init(void) xen_store_interface = gfn_to_virt(xen_store_gfn); break; case XS_HVM: - err = hvm_get_parameter(HVM_PARAM_STORE_EVTCHN, &v); - if (err) - goto out_error; - xen_store_evtchn = (int)v; err = hvm_get_parameter(HVM_PARAM_STORE_PFN, &v); if (err) goto out_error; From 9b0915e72b3cf52474dcee0b24a2f99d93e604a3 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 27 Apr 2025 15:41:51 -0400 Subject: [PATCH 425/529] __legitimize_mnt(): check for MNT_SYNC_UMOUNT should be under mount_lock [ Upstream commit 250cf3693060a5f803c5f1ddc082bb06b16112a9 ] ... or we risk stealing final mntput from sync umount - raising mnt_count after umount(2) has verified that victim is not busy, but before it has set MNT_SYNC_UMOUNT; in that case __legitimize_mnt() doesn't see that it's safe to quietly undo mnt_count increment and leaves dropping the reference to caller, where it'll be a full-blown mntput(). Check under mount_lock is needed; leaving the current one done before taking that makes no sense - it's nowhere near common enough to bother with. Reviewed-by: Christian Brauner Signed-off-by: Al Viro Signed-off-by: Sasha Levin --- fs/namespace.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 0dcd57a75ad4..211a81240680 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -632,12 +632,8 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; - if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { - mnt_add_count(mnt, -1); - return 1; - } lock_mount_hash(); - if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { + if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; From e4cde54b46a87231c77256a633be1bef62687d69 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Wed, 9 Apr 2025 15:59:57 +0200 Subject: [PATCH 426/529] espintcp: remove encap socket caching to avoid reference leak [ Upstream commit 028363685bd0b7a19b4a820f82dd905b1dc83999 ] The current scheme for caching the encap socket can lead to reference leaks when we try to delete the netns. The reference chain is: xfrm_state -> enacp_sk -> netns Since the encap socket is a userspace socket, it holds a reference on the netns. If we delete the espintcp state (through flush or individual delete) before removing the netns, the reference on the socket is dropped and the netns is correctly deleted. Otherwise, the netns may not be reachable anymore (if all processes within the ns have terminated), so we cannot delete the xfrm state to drop its reference on the socket. This patch results in a small (~2% in my tests) performance regression. A GC-type mechanism could be added for the socket cache, to clear references if the state hasn't been used "recently", but it's a lot more complex than just not caching the socket. Fixes: e27cca96cd68 ("xfrm: add espintcp (RFC 8229)") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- include/net/xfrm.h | 1 - net/ipv4/esp4.c | 49 ++++--------------------------------------- net/ipv6/esp6.c | 49 ++++--------------------------------------- net/xfrm/xfrm_state.c | 3 --- 4 files changed, 8 insertions(+), 94 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index bf670929622d..64911162ab5f 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -212,7 +212,6 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; - struct sock __rcu *encap_sk; /* Data for care-of address */ xfrm_address_t *coaddr; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 419969b26822..8f5417ff355d 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -118,47 +118,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) } #ifdef CONFIG_INET_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, @@ -171,20 +140,6 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -207,6 +162,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -391,6 +348,8 @@ static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index a021c88d3d9b..085a83b807af 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -135,47 +135,16 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) } #ifdef CONFIG_INET6_ESPINTCP -struct esp_tcp_sk { - struct sock *sk; - struct rcu_head rcu; -}; - -static void esp_free_tcp_sk(struct rcu_head *head) -{ - struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); - - sock_put(esk->sk); - kfree(esk); -} - static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); - struct esp_tcp_sk *esk; __be16 sport, dport; - struct sock *nsk; struct sock *sk; - sk = rcu_dereference(x->encap_sk); - if (sk && sk->sk_state == TCP_ESTABLISHED) - return sk; - spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (sk && sk == nsk) { - esk = kmalloc(sizeof(*esk), GFP_ATOMIC); - if (!esk) { - spin_unlock_bh(&x->lock); - return ERR_PTR(-ENOMEM); - } - RCU_INIT_POINTER(x->encap_sk, NULL); - esk->sk = sk; - call_rcu(&esk->rcu, esp_free_tcp_sk); - } spin_unlock_bh(&x->lock); sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, @@ -188,20 +157,6 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } - spin_lock_bh(&x->lock); - nsk = rcu_dereference_protected(x->encap_sk, - lockdep_is_held(&x->lock)); - if (encap->encap_sport != sport || - encap->encap_dport != dport) { - sock_put(sk); - sk = nsk ?: ERR_PTR(-EREMCHG); - } else if (sk == nsk) { - sock_put(sk); - } else { - rcu_assign_pointer(x->encap_sk, sk); - } - spin_unlock_bh(&x->lock); - return sk; } @@ -224,6 +179,8 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); + sock_put(sk); + out: rcu_read_unlock(); return err; @@ -427,6 +384,8 @@ static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); + sock_put(sk); + *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 2f4cf976b59a..b5047a94c7d0 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -694,9 +694,6 @@ int __xfrm_state_delete(struct xfrm_state *x) net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); - if (x->encap_sk) - sock_put(rcu_dereference_raw(x->encap_sk)); - xfrm_dev_state_delete(x); /* All xfrm_state objects are created by xfrm_state_alloc. From 22d907d7350a0790b5291880e66ee78cb9c28a67 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Fri, 7 Apr 2023 13:31:33 -0700 Subject: [PATCH 427/529] dmaengine: idxd: add per DSA wq workqueue for processing cr faults [ Upstream commit 2f30decd2f23a376d2ed73dfe4c601421edf501a ] Add a workqueue for user submitted completion record fault processing. The workqueue creation and destruction lifetime will be tied to the user sub-driver since it will only be used when the wq is a user type. Tested-by: Tony Zhu Signed-off-by: Dave Jiang Co-developed-by: Fenghua Yu Signed-off-by: Fenghua Yu Link: https://lore.kernel.org/r/20230407203143.2189681-7-fenghua.yu@intel.com Signed-off-by: Vinod Koul Stable-dep-of: 8dfa57aabff6 ("dmaengine: idxd: Fix allowing write() from different address spaces") Signed-off-by: Sasha Levin --- drivers/dma/idxd/cdev.c | 11 +++++++++++ drivers/dma/idxd/idxd.h | 1 + 2 files changed, 12 insertions(+) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 9f8adb7013eb..e2a89873c6e1 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -408,6 +408,13 @@ static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) } mutex_lock(&wq->wq_lock); + + wq->wq = create_workqueue(dev_name(wq_confdev(wq))); + if (!wq->wq) { + rc = -ENOMEM; + goto wq_err; + } + wq->type = IDXD_WQT_USER; rc = drv_enable_wq(wq); if (rc < 0) @@ -426,7 +433,9 @@ static int idxd_user_drv_probe(struct idxd_dev *idxd_dev) err_cdev: drv_disable_wq(wq); err: + destroy_workqueue(wq->wq); wq->type = IDXD_WQT_NONE; +wq_err: mutex_unlock(&wq->wq_lock); return rc; } @@ -439,6 +448,8 @@ static void idxd_user_drv_remove(struct idxd_dev *idxd_dev) idxd_wq_del_cdev(wq); drv_disable_wq(wq); wq->type = IDXD_WQT_NONE; + destroy_workqueue(wq->wq); + wq->wq = NULL; mutex_unlock(&wq->wq_lock); } diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 14c6ef987fed..5dbb67ff1c0c 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -185,6 +185,7 @@ struct idxd_wq { struct idxd_dev idxd_dev; struct idxd_cdev *idxd_cdev; struct wait_queue_head err_queue; + struct workqueue_struct *wq; struct idxd_device *idxd; int id; struct idxd_irq_entry ie; From 206d8a7b8c543ff74b50bc40bff7684c034392a9 Mon Sep 17 00:00:00 2001 From: Fenghua Yu Date: Fri, 7 Apr 2023 13:31:35 -0700 Subject: [PATCH 428/529] dmaengine: idxd: add idxd_copy_cr() to copy user completion record during page fault handling [ Upstream commit b022f59725f0ae846191abbd6d2e611d7f60f826 ] Define idxd_copy_cr() to copy completion record to fault address in user address that is found by work queue (wq) and PASID. It will be used to write the user's completion record that the hardware device is not able to write due to user completion record page fault. An xarray is added to associate the PASID and mm with the struct idxd_user_context so mm can be found by PASID and wq. It is called when handling the completion record fault in a kernel thread context. Switch to the mm using kthread_use_vm() and copy the completion record to the mm via copy_to_user(). Once the copy is completed, switch back to the current mm using kthread_unuse_mm(). Suggested-by: Christoph Hellwig Suggested-by: Jason Gunthorpe Suggested-by: Tony Luck Tested-by: Tony Zhu Signed-off-by: Fenghua Yu Reviewed-by: Dave Jiang Link: https://lore.kernel.org/r/20230407203143.2189681-9-fenghua.yu@intel.com Signed-off-by: Vinod Koul Stable-dep-of: 8dfa57aabff6 ("dmaengine: idxd: Fix allowing write() from different address spaces") Signed-off-by: Sasha Levin --- drivers/dma/idxd/cdev.c | 107 +++++++++++++++++++++++++++++++++++++-- drivers/dma/idxd/idxd.h | 6 +++ drivers/dma/idxd/init.c | 2 + drivers/dma/idxd/sysfs.c | 1 + 4 files changed, 111 insertions(+), 5 deletions(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index e2a89873c6e1..c7aa47f01df0 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include "registers.h" #include "idxd.h" @@ -35,6 +37,7 @@ struct idxd_user_context { struct idxd_wq *wq; struct task_struct *task; unsigned int pasid; + struct mm_struct *mm; unsigned int flags; struct iommu_sva *sva; }; @@ -69,6 +72,19 @@ static inline struct idxd_wq *inode_wq(struct inode *inode) return idxd_cdev->wq; } +static void idxd_xa_pasid_remove(struct idxd_user_context *ctx) +{ + struct idxd_wq *wq = ctx->wq; + void *ptr; + + mutex_lock(&wq->uc_lock); + ptr = xa_cmpxchg(&wq->upasid_xa, ctx->pasid, ctx, NULL, GFP_KERNEL); + if (ptr != (void *)ctx) + dev_warn(&wq->idxd->pdev->dev, "xarray cmpxchg failed for pasid %u\n", + ctx->pasid); + mutex_unlock(&wq->uc_lock); +} + static int idxd_cdev_open(struct inode *inode, struct file *filp) { struct idxd_user_context *ctx; @@ -109,20 +125,26 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) pasid = iommu_sva_get_pasid(sva); if (pasid == IOMMU_PASID_INVALID) { - iommu_sva_unbind_device(sva); rc = -EINVAL; - goto failed; + goto failed_get_pasid; } ctx->sva = sva; ctx->pasid = pasid; + ctx->mm = current->mm; + + mutex_lock(&wq->uc_lock); + rc = xa_insert(&wq->upasid_xa, pasid, ctx, GFP_KERNEL); + mutex_unlock(&wq->uc_lock); + if (rc < 0) + dev_warn(dev, "PASID entry already exist in xarray.\n"); if (wq_dedicated(wq)) { rc = idxd_wq_set_pasid(wq, pasid); if (rc < 0) { iommu_sva_unbind_device(sva); dev_err(dev, "wq set pasid failed: %d\n", rc); - goto failed; + goto failed_set_pasid; } } } @@ -131,7 +153,13 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) mutex_unlock(&wq->wq_lock); return 0; - failed: +failed_set_pasid: + if (device_user_pasid_enabled(idxd)) + idxd_xa_pasid_remove(ctx); +failed_get_pasid: + if (device_user_pasid_enabled(idxd)) + iommu_sva_unbind_device(sva); +failed: mutex_unlock(&wq->wq_lock); kfree(ctx); return rc; @@ -162,8 +190,10 @@ static int idxd_cdev_release(struct inode *node, struct file *filep) } } - if (ctx->sva) + if (ctx->sva) { iommu_sva_unbind_device(ctx->sva); + idxd_xa_pasid_remove(ctx); + } kfree(ctx); mutex_lock(&wq->wq_lock); idxd_wq_put(wq); @@ -496,3 +526,70 @@ void idxd_cdev_remove(void) ida_destroy(&ictx[i].minor_ida); } } + +/** + * idxd_copy_cr - copy completion record to user address space found by wq and + * PASID + * @wq: work queue + * @pasid: PASID + * @addr: user fault address to write + * @cr: completion record + * @len: number of bytes to copy + * + * This is called by a work that handles completion record fault. + * + * Return: number of bytes copied. + */ +int idxd_copy_cr(struct idxd_wq *wq, ioasid_t pasid, unsigned long addr, + void *cr, int len) +{ + struct device *dev = &wq->idxd->pdev->dev; + int left = len, status_size = 1; + struct idxd_user_context *ctx; + struct mm_struct *mm; + + mutex_lock(&wq->uc_lock); + + ctx = xa_load(&wq->upasid_xa, pasid); + if (!ctx) { + dev_warn(dev, "No user context\n"); + goto out; + } + + mm = ctx->mm; + /* + * The completion record fault handling work is running in kernel + * thread context. It temporarily switches to the mm to copy cr + * to addr in the mm. + */ + kthread_use_mm(mm); + left = copy_to_user((void __user *)addr + status_size, cr + status_size, + len - status_size); + /* + * Copy status only after the rest of completion record is copied + * successfully so that the user gets the complete completion record + * when a non-zero status is polled. + */ + if (!left) { + u8 status; + + /* + * Ensure that the completion record's status field is written + * after the rest of the completion record has been written. + * This ensures that the user receives the correct completion + * record information once polling for a non-zero status. + */ + wmb(); + status = *(u8 *)cr; + if (put_user(status, (u8 __user *)addr)) + left += status_size; + } else { + left += status_size; + } + kthread_unuse_mm(mm); + +out: + mutex_unlock(&wq->uc_lock); + + return len - left; +} diff --git a/drivers/dma/idxd/idxd.h b/drivers/dma/idxd/idxd.h index 5dbb67ff1c0c..c3ace4aed0fc 100644 --- a/drivers/dma/idxd/idxd.h +++ b/drivers/dma/idxd/idxd.h @@ -215,6 +215,10 @@ struct idxd_wq { char name[WQ_NAME_SIZE + 1]; u64 max_xfer_bytes; u32 max_batch_size; + + /* Lock to protect upasid_xa access. */ + struct mutex uc_lock; + struct xarray upasid_xa; }; struct idxd_engine { @@ -666,6 +670,8 @@ void idxd_cdev_remove(void); int idxd_cdev_get_major(struct idxd_device *idxd); int idxd_wq_add_cdev(struct idxd_wq *wq); void idxd_wq_del_cdev(struct idxd_wq *wq); +int idxd_copy_cr(struct idxd_wq *wq, ioasid_t pasid, unsigned long addr, + void *buf, int len); /* perfmon */ #if IS_ENABLED(CONFIG_INTEL_IDXD_PERFMON) diff --git a/drivers/dma/idxd/init.c b/drivers/dma/idxd/init.c index 7cb76db5ad60..ea651d5cf332 100644 --- a/drivers/dma/idxd/init.c +++ b/drivers/dma/idxd/init.c @@ -218,6 +218,8 @@ static int idxd_setup_wqs(struct idxd_device *idxd) } bitmap_copy(wq->opcap_bmap, idxd->opcap_bmap, IDXD_MAX_OPCAP_BITS); } + mutex_init(&wq->uc_lock); + xa_init(&wq->upasid_xa); idxd->wqs[i] = wq; } diff --git a/drivers/dma/idxd/sysfs.c b/drivers/dma/idxd/sysfs.c index c811757d0f97..0689464c4816 100644 --- a/drivers/dma/idxd/sysfs.c +++ b/drivers/dma/idxd/sysfs.c @@ -1315,6 +1315,7 @@ static void idxd_conf_wq_release(struct device *dev) bitmap_free(wq->opcap_bmap); kfree(wq->wqcfg); + xa_destroy(&wq->upasid_xa); kfree(wq); } From d19d0157edd69a18f5bf77aac4d614f2a831b6b5 Mon Sep 17 00:00:00 2001 From: Vinicius Costa Gomes Date: Mon, 21 Apr 2025 10:03:37 -0700 Subject: [PATCH 429/529] dmaengine: idxd: Fix allowing write() from different address spaces [ Upstream commit 8dfa57aabff625bf445548257f7711ef294cd30e ] Check if the process submitting the descriptor belongs to the same address space as the one that opened the file, reject otherwise. Fixes: 6827738dc684 ("dmaengine: idxd: add a write() method for applications to submit work") Signed-off-by: Vinicius Costa Gomes Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20250421170337.3008875-1-dave.jiang@intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/idxd/cdev.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index c7aa47f01df0..186f005bfa8f 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -240,6 +240,9 @@ static int idxd_cdev_mmap(struct file *filp, struct vm_area_struct *vma) if (!idxd->user_submission_safe && !capable(CAP_SYS_RAWIO)) return -EPERM; + if (current->mm != ctx->mm) + return -EPERM; + rc = check_vma(wq, vma, __func__); if (rc < 0) return rc; @@ -306,6 +309,9 @@ static ssize_t idxd_cdev_write(struct file *filp, const char __user *buf, size_t ssize_t written = 0; int i; + if (current->mm != ctx->mm) + return -EPERM; + for (i = 0; i < len/sizeof(struct dsa_hw_desc); i++) { int rc = idxd_submit_user_descriptor(ctx, udesc + i); @@ -326,6 +332,9 @@ static __poll_t idxd_cdev_poll(struct file *filp, struct idxd_device *idxd = wq->idxd; __poll_t out = 0; + if (current->mm != ctx->mm) + return -EPERM; + poll_wait(filp, &wq->err_queue, wait); spin_lock(&idxd->dev_lock); if (idxd->sw_err.valid) From 8509fb9dec13ef29a8e4cc5b8cbee9293c716c9e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Matti=20Lehtim=C3=A4ki?= Date: Mon, 12 May 2025 02:40:15 +0300 Subject: [PATCH 430/529] remoteproc: qcom_wcnss: Fix on platforms without fallback regulators MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 4ca45af0a56d00b86285d6fdd720dca3215059a7 ] Recent change to handle platforms with only single power domain broke pronto-v3 which requires power domains and doesn't have fallback voltage regulators in case power domains are missing. Add a check to verify the number of fallback voltage regulators before using the code which handles single power domain situation. Fixes: 65991ea8a6d1 ("remoteproc: qcom_wcnss: Handle platforms with only single power domain") Signed-off-by: Matti Lehtimäki Tested-by: Luca Weiss # sdm632-fairphone-fp3 Link: https://lore.kernel.org/r/20250511234026.94735-1-matti.lehtimaki@gmail.com Signed-off-by: Bjorn Andersson Signed-off-by: Sasha Levin --- drivers/remoteproc/qcom_wcnss.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c index ce61e0e7cbeb..af96541c9b69 100644 --- a/drivers/remoteproc/qcom_wcnss.c +++ b/drivers/remoteproc/qcom_wcnss.c @@ -445,7 +445,8 @@ static int wcnss_init_regulators(struct qcom_wcnss *wcnss, if (wcnss->num_pds) { info += wcnss->num_pds; /* Handle single power domain case */ - num_vregs += num_pd_vregs - wcnss->num_pds; + if (wcnss->num_pds < num_pd_vregs) + num_vregs += num_pd_vregs - wcnss->num_pds; } else { num_vregs += num_pd_vregs; } From c4bfea4b611d3c627f49d0ad3163d85901a38692 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 1 May 2025 13:06:31 +0100 Subject: [PATCH 431/529] clk: sunxi-ng: d1: Add missing divider for MMC mod clocks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 98e6da673cc6dd46ca9a599802bd2c8f83606710 ] The D1/R528/T113 SoCs have a hidden divider of 2 in the MMC mod clocks, just as other recent SoCs. So far we did not describe that, which led to the resulting MMC clock rate to be only half of its intended value. Use a macro that allows to describe a fixed post-divider, to compensate for that divisor. This brings the MMC performance on those SoCs to its expected level, so about 23 MB/s for SD cards, instead of the 11 MB/s measured so far. Fixes: 35b97bb94111 ("clk: sunxi-ng: Add support for the D1 SoC clocks") Reported-by: Kuba Szczodrzyński Signed-off-by: Andre Przywara Link: https://patch.msgid.link/20250501120631.837186-1-andre.przywara@arm.com Signed-off-by: Chen-Yu Tsai Signed-off-by: Sasha Levin --- drivers/clk/sunxi-ng/ccu-sun20i-d1.c | 42 ++++++++++++++++------------ drivers/clk/sunxi-ng/ccu_mp.h | 22 +++++++++++++++ 2 files changed, 46 insertions(+), 18 deletions(-) diff --git a/drivers/clk/sunxi-ng/ccu-sun20i-d1.c b/drivers/clk/sunxi-ng/ccu-sun20i-d1.c index cb4bf038e17f..89d8bf4a30a2 100644 --- a/drivers/clk/sunxi-ng/ccu-sun20i-d1.c +++ b/drivers/clk/sunxi-ng/ccu-sun20i-d1.c @@ -412,19 +412,23 @@ static const struct clk_parent_data mmc0_mmc1_parents[] = { { .hw = &pll_periph0_2x_clk.common.hw }, { .hw = &pll_audio1_div2_clk.common.hw }, }; -static SUNXI_CCU_MP_DATA_WITH_MUX_GATE(mmc0_clk, "mmc0", mmc0_mmc1_parents, 0x830, - 0, 4, /* M */ - 8, 2, /* P */ - 24, 3, /* mux */ - BIT(31), /* gate */ - 0); +static SUNXI_CCU_MP_DATA_WITH_MUX_GATE_POSTDIV(mmc0_clk, "mmc0", + mmc0_mmc1_parents, 0x830, + 0, 4, /* M */ + 8, 2, /* P */ + 24, 3, /* mux */ + BIT(31), /* gate */ + 2, /* post-div */ + 0); -static SUNXI_CCU_MP_DATA_WITH_MUX_GATE(mmc1_clk, "mmc1", mmc0_mmc1_parents, 0x834, - 0, 4, /* M */ - 8, 2, /* P */ - 24, 3, /* mux */ - BIT(31), /* gate */ - 0); +static SUNXI_CCU_MP_DATA_WITH_MUX_GATE_POSTDIV(mmc1_clk, "mmc1", + mmc0_mmc1_parents, 0x834, + 0, 4, /* M */ + 8, 2, /* P */ + 24, 3, /* mux */ + BIT(31), /* gate */ + 2, /* post-div */ + 0); static const struct clk_parent_data mmc2_parents[] = { { .fw_name = "hosc" }, @@ -433,12 +437,14 @@ static const struct clk_parent_data mmc2_parents[] = { { .hw = &pll_periph0_800M_clk.common.hw }, { .hw = &pll_audio1_div2_clk.common.hw }, }; -static SUNXI_CCU_MP_DATA_WITH_MUX_GATE(mmc2_clk, "mmc2", mmc2_parents, 0x838, - 0, 4, /* M */ - 8, 2, /* P */ - 24, 3, /* mux */ - BIT(31), /* gate */ - 0); +static SUNXI_CCU_MP_DATA_WITH_MUX_GATE_POSTDIV(mmc2_clk, "mmc2", mmc2_parents, + 0x838, + 0, 4, /* M */ + 8, 2, /* P */ + 24, 3, /* mux */ + BIT(31), /* gate */ + 2, /* post-div */ + 0); static SUNXI_CCU_GATE_HWS(bus_mmc0_clk, "bus-mmc0", psi_ahb_hws, 0x84c, BIT(0), 0); diff --git a/drivers/clk/sunxi-ng/ccu_mp.h b/drivers/clk/sunxi-ng/ccu_mp.h index 6e50f3728fb5..7d836a9fb3db 100644 --- a/drivers/clk/sunxi-ng/ccu_mp.h +++ b/drivers/clk/sunxi-ng/ccu_mp.h @@ -52,6 +52,28 @@ struct ccu_mp { } \ } +#define SUNXI_CCU_MP_DATA_WITH_MUX_GATE_POSTDIV(_struct, _name, _parents, \ + _reg, \ + _mshift, _mwidth, \ + _pshift, _pwidth, \ + _muxshift, _muxwidth, \ + _gate, _postdiv, _flags)\ + struct ccu_mp _struct = { \ + .enable = _gate, \ + .m = _SUNXI_CCU_DIV(_mshift, _mwidth), \ + .p = _SUNXI_CCU_DIV(_pshift, _pwidth), \ + .mux = _SUNXI_CCU_MUX(_muxshift, _muxwidth), \ + .fixed_post_div = _postdiv, \ + .common = { \ + .reg = _reg, \ + .features = CCU_FEATURE_FIXED_POSTDIV, \ + .hw.init = CLK_HW_INIT_PARENTS_DATA(_name, \ + _parents, \ + &ccu_mp_ops, \ + _flags), \ + } \ + } + #define SUNXI_CCU_MP_WITH_MUX_GATE(_struct, _name, _parents, _reg, \ _mshift, _mwidth, \ _pshift, _pwidth, \ From 35e6a84883a79a7cf70a2f44631403a65ba44231 Mon Sep 17 00:00:00 2001 From: Paul Chaignon Date: Wed, 7 May 2025 13:31:58 +0200 Subject: [PATCH 432/529] xfrm: Sanitize marks before insert [ Upstream commit 0b91fda3a1f044141e1e615456ff62508c32b202 ] Prior to this patch, the mark is sanitized (applying the state's mask to the state's value) only on inserts when checking if a conflicting XFRM state or policy exists. We discovered in Cilium that this same sanitization does not occur in the hot-path __xfrm_state_lookup. In the hot-path, the sk_buff's mark is simply compared to the state's value: if ((mark & x->mark.m) != x->mark.v) continue; Therefore, users can define unsanitized marks (ex. 0xf42/0xf00) which will never match any packet. This commit updates __xfrm_state_insert and xfrm_policy_insert to store the sanitized marks, thus removing this footgun. This has the side effect of changing the ip output, as the returned mark will have the mask applied to it when printed. Fixes: 3d6acfa7641f ("xfrm: SA lookups with mark") Signed-off-by: Paul Chaignon Signed-off-by: Louis DeLosSantos Co-developed-by: Louis DeLosSantos Signed-off-by: Steffen Klassert Signed-off-by: Sasha Levin --- net/xfrm/xfrm_policy.c | 3 +++ net/xfrm/xfrm_state.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index a022f4984687..e015ff225b27 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1597,6 +1597,9 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) struct xfrm_policy *delpol; struct hlist_head *chain; + /* Sanitize mark before store */ + policy->mark.v &= policy->mark.m; + spin_lock_bh(&net->xfrm.xfrm_policy_lock); chain = policy_hash_bysel(net, &policy->selector, policy->family, dir); if (chain) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index b5047a94c7d0..58c53bb1c583 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -1275,6 +1275,9 @@ static void __xfrm_state_insert(struct xfrm_state *x) list_add(&x->km.all, &net->xfrm.state_all); + /* Sanitize mark before store */ + x->mark.v &= x->mark.m; + h = xfrm_dst_hash(net, &x->id.daddr, &x->props.saddr, x->props.reqid, x->props.family); hlist_add_head_rcu(&x->bydst, net->xfrm.state_bydst + h); From 7549955105b72d458351ef07399fdc2405ee0194 Mon Sep 17 00:00:00 2001 From: Dave Jiang Date: Thu, 8 May 2025 10:05:48 -0700 Subject: [PATCH 433/529] dmaengine: idxd: Fix ->poll() return value [ Upstream commit ae74cd15ade833adc289279b5c6f12e78f64d4d7 ] The fix to block access from different address space did not return a correct value for ->poll() change. kernel test bot reported that a return value of type __poll_t is expected rather than int. Fix to return POLLNVAL to indicate invalid request. Fixes: 8dfa57aabff6 ("dmaengine: idxd: Fix allowing write() from different address spaces") Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-kbuild-all/202505081851.rwD7jVxg-lkp@intel.com/ Signed-off-by: Dave Jiang Link: https://lore.kernel.org/r/20250508170548.2747425-1-dave.jiang@intel.com Signed-off-by: Vinod Koul Signed-off-by: Sasha Levin --- drivers/dma/idxd/cdev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index 186f005bfa8f..d736ab15ade2 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -333,7 +333,7 @@ static __poll_t idxd_cdev_poll(struct file *filp, __poll_t out = 0; if (current->mm != ctx->mm) - return -EPERM; + return POLLNVAL; poll_wait(filp, &wq->err_queue, wait); spin_lock(&idxd->dev_lock); From 15efa7d00fb3049ce3532c129908fedb27797e46 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Wed, 7 May 2025 15:00:30 -0400 Subject: [PATCH 434/529] Bluetooth: L2CAP: Fix not checking l2cap_chan security level [ Upstream commit 7af8479d9eb4319b4ba7b47a8c4d2c55af1c31e1 ] l2cap_check_enc_key_size shall check the security level of the l2cap_chan rather than the hci_conn since for incoming connection request that may be different as hci_conn may already been encrypted using a different security level. Fixes: 522e9ed157e3 ("Bluetooth: l2cap: Check encryption key size on incoming connection") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/l2cap_core.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 222105e24d2d..cb9b1edfcea2 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1561,7 +1561,8 @@ static void l2cap_request_info(struct l2cap_conn *conn) sizeof(req), &req); } -static bool l2cap_check_enc_key_size(struct hci_conn *hcon) +static bool l2cap_check_enc_key_size(struct hci_conn *hcon, + struct l2cap_chan *chan) { /* The minimum encryption key size needs to be enforced by the * host stack before establishing any L2CAP connections. The @@ -1575,7 +1576,7 @@ static bool l2cap_check_enc_key_size(struct hci_conn *hcon) int min_key_size = hcon->hdev->min_enc_key_size; /* On FIPS security level, key size must be 16 bytes */ - if (hcon->sec_level == BT_SECURITY_FIPS) + if (chan->sec_level == BT_SECURITY_FIPS) min_key_size = 16; return (!test_bit(HCI_CONN_ENCRYPT, &hcon->flags) || @@ -1603,7 +1604,7 @@ static void l2cap_do_start(struct l2cap_chan *chan) !__l2cap_no_conn_pending(chan)) return; - if (l2cap_check_enc_key_size(conn->hcon)) + if (l2cap_check_enc_key_size(conn->hcon, chan)) l2cap_start_connection(chan); else __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); @@ -1685,7 +1686,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) continue; } - if (l2cap_check_enc_key_size(conn->hcon)) + if (l2cap_check_enc_key_size(conn->hcon, chan)) l2cap_start_connection(chan); else l2cap_chan_close(chan, ECONNREFUSED); @@ -4187,7 +4188,7 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, /* Check if the ACL is secure enough (if not SDP) */ if (psm != cpu_to_le16(L2CAP_PSM_SDP) && (!hci_conn_check_link_mode(conn->hcon) || - !l2cap_check_enc_key_size(conn->hcon))) { + !l2cap_check_enc_key_size(conn->hcon, pchan))) { conn->disc_reason = HCI_ERROR_AUTH_FAILURE; result = L2CAP_CR_SEC_BLOCK; goto response; @@ -8418,7 +8419,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) } if (chan->state == BT_CONNECT) { - if (!status && l2cap_check_enc_key_size(hcon)) + if (!status && l2cap_check_enc_key_size(hcon, chan)) l2cap_start_connection(chan); else __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); @@ -8428,7 +8429,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) struct l2cap_conn_rsp rsp; __u16 res, stat; - if (!status && l2cap_check_enc_key_size(hcon)) { + if (!status && l2cap_check_enc_key_size(hcon, chan)) { if (test_bit(FLAG_DEFER_SETUP, &chan->flags)) { res = L2CAP_CR_PEND; stat = L2CAP_CS_AUTHOR_PEND; From 2cee71a1a2738f39b09ddaec24d434572ce2c91a Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 15 May 2025 11:48:48 +0300 Subject: [PATCH 435/529] bridge: netfilter: Fix forwarding of fragmented packets [ Upstream commit 91b6dbced0ef1d680afdd69b14fc83d50ebafaf3 ] When netfilter defrag hooks are loaded (due to the presence of conntrack rules, for example), fragmented packets entering the bridge will be defragged by the bridge's pre-routing hook (br_nf_pre_routing() -> ipv4_conntrack_defrag()). Later on, in the bridge's post-routing hook, the defragged packet will be fragmented again. If the size of the largest fragment is larger than what the kernel has determined as the destination MTU (using ip_skb_dst_mtu()), the defragged packet will be dropped. Before commit ac6627a28dbf ("net: ipv4: Consolidate ipv4_mtu and ip_dst_mtu_maybe_forward"), ip_skb_dst_mtu() would return dst_mtu() as the destination MTU. Assuming the dst entry attached to the packet is the bridge's fake rtable one, this would simply be the bridge's MTU (see fake_mtu()). However, after above mentioned commit, ip_skb_dst_mtu() ends up returning the route's MTU stored in the dst entry's metrics. Ideally, in case the dst entry is the bridge's fake rtable one, this should be the bridge's MTU as the bridge takes care of updating this metric when its MTU changes (see br_change_mtu()). Unfortunately, the last operation is a no-op given the metrics attached to the fake rtable entry are marked as read-only. Therefore, ip_skb_dst_mtu() ends up returning 1500 (the initial MTU value) and defragged packets are dropped during fragmentation when dealing with large fragments and high MTU (e.g., 9k). Fix by moving the fake rtable entry's metrics to be per-bridge (in a similar fashion to the fake rtable entry itself) and marking them as writable, thereby allowing MTU changes to be reflected. Fixes: 62fa8a846d7d ("net: Implement read-only protection and COW'ing of metrics.") Fixes: 33eb9873a283 ("bridge: initialize fake_rtable metrics") Reported-by: Venkat Venkatsubra Closes: https://lore.kernel.org/netdev/PH0PR10MB4504888284FF4CBA648197D0ACB82@PH0PR10MB4504.namprd10.prod.outlook.com/ Tested-by: Venkat Venkatsubra Signed-off-by: Ido Schimmel Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20250515084848.727706-1-idosch@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/bridge/br_nf_core.c | 7 ++----- net/bridge/br_private.h | 1 + 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/net/bridge/br_nf_core.c b/net/bridge/br_nf_core.c index 8c69f0c95a8e..b8c8deb87407 100644 --- a/net/bridge/br_nf_core.c +++ b/net/bridge/br_nf_core.c @@ -65,17 +65,14 @@ static struct dst_ops fake_dst_ops = { * ipt_REJECT needs it. Future netfilter modules might * require us to fill additional fields. */ -static const u32 br_dst_default_metrics[RTAX_MAX] = { - [RTAX_MTU - 1] = 1500, -}; - void br_netfilter_rtable_init(struct net_bridge *br) { struct rtable *rt = &br->fake_rtable; atomic_set(&rt->dst.__refcnt, 1); rt->dst.dev = br->dev; - dst_init_metrics(&rt->dst, br_dst_default_metrics, true); + dst_init_metrics(&rt->dst, br->metrics, false); + dst_metric_set(&rt->dst, RTAX_MTU, br->dev->mtu); rt->dst.flags = DST_NOXFRM | DST_FAKE_RTABLE; rt->dst.ops = &fake_dst_ops; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 940de9516768..19fb50549252 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -478,6 +478,7 @@ struct net_bridge { struct rtable fake_rtable; struct rt6_info fake_rt6_info; }; + u32 metrics[RTAX_MAX]; #endif u16 group_fwd_mask; u16 group_fwd_mask_required; From b6556470bb2753886dde5f0b9b10ce20cd6a0cb5 Mon Sep 17 00:00:00 2001 From: Jacob Keller Date: Thu, 10 Apr 2025 11:13:52 -0700 Subject: [PATCH 436/529] ice: fix vf->num_mac count with port representors [ Upstream commit bbd95160a03dbfcd01a541f25c27ddb730dfbbd5 ] The ice_vc_repr_add_mac() function indicates that it does not store the MAC address filters in the firmware. However, it still increments vf->num_mac. This is incorrect, as vf->num_mac should represent the number of MAC filters currently programmed to firmware. Indeed, we only perform this increment if the requested filter is a unicast address that doesn't match the existing vf->hw_lan_addr. In addition, ice_vc_repr_del_mac() does not decrement the vf->num_mac counter. This results in the counter becoming out of sync with the actual count. As it turns out, vf->num_mac is currently only used in legacy made without port representors. The single place where the value is checked is for enforcing a filter limit on untrusted VFs. Upcoming patches to support VF Live Migration will use this value when determining the size of the TLV for MAC address filters. Fix the representor mode function to stop incrementing the counter incorrectly. Fixes: ac19e03ef780 ("ice: allow process VF opcodes in different ways") Signed-off-by: Jacob Keller Reviewed-by: Michal Swiatkowski Reviewed-by: Simon Horman Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ice/ice_virtchnl.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_virtchnl.c b/drivers/net/ethernet/intel/ice/ice_virtchnl.c index 42d8e5e771b7..fa9d928081d6 100644 --- a/drivers/net/ethernet/intel/ice/ice_virtchnl.c +++ b/drivers/net/ethernet/intel/ice/ice_virtchnl.c @@ -3551,7 +3551,6 @@ static int ice_vc_repr_add_mac(struct ice_vf *vf, u8 *msg) } ice_vfhw_mac_add(vf, &al->list[i]); - vf->num_mac++; break; } From e8138b81d38d906717c9252db43d78f552068eea Mon Sep 17 00:00:00 2001 From: Paul Kocialkowski Date: Mon, 19 May 2025 18:49:36 +0200 Subject: [PATCH 437/529] net: dwmac-sun8i: Use parsed internal PHY address instead of 1 [ Upstream commit 47653e4243f2b0a26372e481ca098936b51ec3a8 ] While the MDIO address of the internal PHY on Allwinner sun8i chips is generally 1, of_mdio_parse_addr is used to cleanly parse the address from the device-tree instead of hardcoding it. A commit reworking the code ditched the parsed value and hardcoded the value 1 instead, which didn't really break anything but is more fragile and not future-proof. Restore the initial behavior using the parsed address returned from the helper. Fixes: 634db83b8265 ("net: stmmac: dwmac-sun8i: Handle integrated/external MDIOs") Signed-off-by: Paul Kocialkowski Reviewed-by: Andrew Lunn Acked-by: Corentin LABBE Tested-by: Corentin LABBE Link: https://patch.msgid.link/20250519164936.4172658-1-paulk@sys-base.io Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c index f834472599f7..0921b78c6244 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c @@ -948,7 +948,7 @@ static int sun8i_dwmac_set_syscon(struct device *dev, /* of_mdio_parse_addr returns a valid (0 ~ 31) PHY * address. No need to mask it again. */ - reg |= 1 << H3_EPHY_ADDR_SHIFT; + reg |= ret << H3_EPHY_ADDR_SHIFT; } else { /* For SoCs without internal PHY the PHY selection bit should be * set to 0 (external PHY). From 8620be38cf577d4cd6f1ca1102bc4115a8dd4b2f Mon Sep 17 00:00:00 2001 From: Thangaraj Samynathan Date: Fri, 16 May 2025 09:27:19 +0530 Subject: [PATCH 438/529] net: lan743x: Restore SGMII CTRL register on resume [ Upstream commit 293e38ff4e4c2ba53f3fd47d8a4a9f0f0414a7a6 ] SGMII_CTRL register, which specifies the active interface, was not properly restored when resuming from suspend. This led to incorrect interface selection after resume particularly in scenarios involving the FPGA. To fix this: - Move the SGMII_CTRL setup out of the probe function. - Initialize the register in the hardware initialization helper function, which is called during both device initialization and resume. This ensures the interface configuration is consistently restored after suspend/resume cycles. Fixes: a46d9d37c4f4f ("net: lan743x: Add support for SGMII interface") Signed-off-by: Thangaraj Samynathan Link: https://patch.msgid.link/20250516035719.117960-1-thangaraj.s@microchip.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/microchip/lan743x_main.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index 2e69ba0143b1..fd3555419179 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -3253,6 +3253,7 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter, struct pci_dev *pdev) { struct lan743x_tx *tx; + u32 sgmii_ctl; int index; int ret; @@ -3265,6 +3266,15 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter, spin_lock_init(&adapter->eth_syslock_spinlock); mutex_init(&adapter->sgmii_rw_lock); pci11x1x_set_rfe_rd_fifo_threshold(adapter); + sgmii_ctl = lan743x_csr_read(adapter, SGMII_CTL); + if (adapter->is_sgmii_en) { + sgmii_ctl |= SGMII_CTL_SGMII_ENABLE_; + sgmii_ctl &= ~SGMII_CTL_SGMII_POWER_DN_; + } else { + sgmii_ctl &= ~SGMII_CTL_SGMII_ENABLE_; + sgmii_ctl |= SGMII_CTL_SGMII_POWER_DN_; + } + lan743x_csr_write(adapter, SGMII_CTL, sgmii_ctl); } else { adapter->max_tx_channels = LAN743X_MAX_TX_CHANNELS; adapter->used_tx_channels = LAN743X_USED_TX_CHANNELS; @@ -3313,7 +3323,6 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter, static int lan743x_mdiobus_init(struct lan743x_adapter *adapter) { - u32 sgmii_ctl; int ret; adapter->mdiobus = devm_mdiobus_alloc(&adapter->pdev->dev); @@ -3325,10 +3334,6 @@ static int lan743x_mdiobus_init(struct lan743x_adapter *adapter) adapter->mdiobus->priv = (void *)adapter; if (adapter->is_pci11x1x) { if (adapter->is_sgmii_en) { - sgmii_ctl = lan743x_csr_read(adapter, SGMII_CTL); - sgmii_ctl |= SGMII_CTL_SGMII_ENABLE_; - sgmii_ctl &= ~SGMII_CTL_SGMII_POWER_DN_; - lan743x_csr_write(adapter, SGMII_CTL, sgmii_ctl); netif_dbg(adapter, drv, adapter->netdev, "SGMII operation\n"); adapter->mdiobus->probe_capabilities = MDIOBUS_C22_C45; @@ -3338,10 +3343,6 @@ static int lan743x_mdiobus_init(struct lan743x_adapter *adapter) netif_dbg(adapter, drv, adapter->netdev, "lan743x-mdiobus-c45\n"); } else { - sgmii_ctl = lan743x_csr_read(adapter, SGMII_CTL); - sgmii_ctl &= ~SGMII_CTL_SGMII_ENABLE_; - sgmii_ctl |= SGMII_CTL_SGMII_POWER_DN_; - lan743x_csr_write(adapter, SGMII_CTL, sgmii_ctl); netif_dbg(adapter, drv, adapter->netdev, "RGMII operation\n"); // Only C22 support when RGMII I/F From f2138e462c575befaee26714974d03d4d3036062 Mon Sep 17 00:00:00 2001 From: Pavel Begunkov Date: Sat, 17 May 2025 13:27:37 +0100 Subject: [PATCH 439/529] io_uring: fix overflow resched cqe reordering [ Upstream commit a7d755ed9ce9738af3db602eb29d32774a180bc7 ] Leaving the CQ critical section in the middle of a overflow flushing can cause cqe reordering since the cache cq pointers are reset and any new cqe emitters that might get called in between are not going to be forced into io_cqe_cache_refill(). Fixes: eac2ca2d682f9 ("io_uring: check if we need to reschedule during overflow flush") Signed-off-by: Pavel Begunkov Link: https://lore.kernel.org/r/90ba817f1a458f091f355f407de1c911d2b93bbf.1747483784.git.asml.silence@gmail.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin --- io_uring/io_uring.c | 1 + 1 file changed, 1 insertion(+) diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c index f39d66589180..ad462724246a 100644 --- a/io_uring/io_uring.c +++ b/io_uring/io_uring.c @@ -627,6 +627,7 @@ static bool __io_cqring_overflow_flush(struct io_ring_ctx *ctx, bool force) * to care for a non-real case. */ if (need_resched()) { + ctx->cqe_sentinel = ctx->cqe_cached; io_cq_unlock_post(ctx); mutex_unlock(&ctx->uring_lock); cond_resched(); From f1dde3eb17dc1b8bd07aed00004b1e05fc87a3d4 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Sun, 18 May 2025 15:20:37 -0700 Subject: [PATCH 440/529] sch_hfsc: Fix qlen accounting bug when using peek in hfsc_enqueue() [ Upstream commit 3f981138109f63232a5fb7165938d4c945cc1b9d ] When enqueuing the first packet to an HFSC class, hfsc_enqueue() calls the child qdisc's peek() operation before incrementing sch->q.qlen and sch->qstats.backlog. If the child qdisc uses qdisc_peek_dequeued(), this may trigger an immediate dequeue and potential packet drop. In such cases, qdisc_tree_reduce_backlog() is called, but the HFSC qdisc's qlen and backlog have not yet been updated, leading to inconsistent queue accounting. This can leave an empty HFSC class in the active list, causing further consequences like use-after-free. This patch fixes the bug by moving the increment of sch->q.qlen and sch->qstats.backlog before the call to the child qdisc's peek() operation. This ensures that queue length and backlog are always accurate when packet drops or dequeues are triggered during the peek. Fixes: 12d0ad3be9c3 ("net/sched/sch_hfsc.c: handle corner cases where head may change invalidating calculated deadline") Reported-by: Mingi Cho Signed-off-by: Cong Wang Reviewed-by: Simon Horman Link: https://patch.msgid.link/20250518222038.58538-2-xiyou.wangcong@gmail.com Reviewed-by: Jamal Hadi Salim Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/sched/sch_hfsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index fc1370c29373..ec6ee4510013 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1568,6 +1568,9 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) return err; } + sch->qstats.backlog += len; + sch->q.qlen++; + if (first && !cl->cl_nactive) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); @@ -1583,9 +1586,6 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) } - sch->qstats.backlog += len; - sch->q.qlen++; - return NET_XMIT_SUCCESS; } From 2a2d7bf86dd32f835141ac8991f3d7bf2492f4d2 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Mon, 22 May 2023 07:34:04 +0530 Subject: [PATCH 441/529] octeontx2-pf: Add support for page pool [ Upstream commit b2e3406a38f0f48b1dfb81e5bb73d243ff6af179 ] Page pool for each rx queue enhance rx side performance by reclaiming buffers back to each queue specific pool. DMA mapping is done only for first allocation of buffers. As subsequent buffers allocation avoid DMA mapping, it results in performance improvement. Image | Performance ------------ | ------------ Vannila | 3Mpps | with this | 42Mpps change | --------------------------- Signed-off-by: Ratheesh Kannoth Link: https://lore.kernel.org/r/20230522020404.152020-1-rkannoth@marvell.com Signed-off-by: Paolo Abeni Stable-dep-of: b4164de5041b ("octeontx2-pf: Add AF_XDP non-zero copy support") Signed-off-by: Sasha Levin --- .../net/ethernet/marvell/octeontx2/Kconfig | 1 + .../marvell/octeontx2/nic/otx2_common.c | 78 ++++++++++++++++--- .../marvell/octeontx2/nic/otx2_common.h | 6 +- .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 11 ++- .../marvell/octeontx2/nic/otx2_txrx.c | 19 +++-- .../marvell/octeontx2/nic/otx2_txrx.h | 1 + .../ethernet/marvell/octeontx2/nic/qos_sq.c | 2 +- 7 files changed, 96 insertions(+), 22 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/Kconfig b/drivers/net/ethernet/marvell/octeontx2/Kconfig index 993ac180a5db..a32d85d6f599 100644 --- a/drivers/net/ethernet/marvell/octeontx2/Kconfig +++ b/drivers/net/ethernet/marvell/octeontx2/Kconfig @@ -32,6 +32,7 @@ config OCTEONTX2_PF tristate "Marvell OcteonTX2 NIC Physical Function driver" select OCTEONTX2_MBOX select NET_DEVLINK + select PAGE_POOL depends on (64BIT && COMPILE_TEST) || ARM64 select DIMLIB depends on PCI diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index d05f91f97a9a..5e11599d1322 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -513,11 +513,32 @@ void otx2_config_irq_coalescing(struct otx2_nic *pfvf, int qidx) (pfvf->hw.cq_ecount_wait - 1)); } +static int otx2_alloc_pool_buf(struct otx2_nic *pfvf, struct otx2_pool *pool, + dma_addr_t *dma) +{ + unsigned int offset = 0; + struct page *page; + size_t sz; + + sz = SKB_DATA_ALIGN(pool->rbsize); + sz = ALIGN(sz, OTX2_ALIGN); + + page = page_pool_alloc_frag(pool->page_pool, &offset, sz, GFP_ATOMIC); + if (unlikely(!page)) + return -ENOMEM; + + *dma = page_pool_get_dma_addr(page) + offset; + return 0; +} + static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, dma_addr_t *dma) { u8 *buf; + if (pool->page_pool) + return otx2_alloc_pool_buf(pfvf, pool, dma); + buf = napi_alloc_frag_align(pool->rbsize, OTX2_ALIGN); if (unlikely(!buf)) return -ENOMEM; @@ -1206,10 +1227,31 @@ void otx2_sq_free_sqbs(struct otx2_nic *pfvf) } } +void otx2_free_bufs(struct otx2_nic *pfvf, struct otx2_pool *pool, + u64 iova, int size) +{ + struct page *page; + u64 pa; + + pa = otx2_iova_to_phys(pfvf->iommu_domain, iova); + page = virt_to_head_page(phys_to_virt(pa)); + + if (pool->page_pool) { + page_pool_put_full_page(pool->page_pool, page, true); + } else { + dma_unmap_page_attrs(pfvf->dev, iova, size, + DMA_FROM_DEVICE, + DMA_ATTR_SKIP_CPU_SYNC); + + put_page(page); + } +} + void otx2_free_aura_ptr(struct otx2_nic *pfvf, int type) { int pool_id, pool_start = 0, pool_end = 0, size = 0; - u64 iova, pa; + struct otx2_pool *pool; + u64 iova; if (type == AURA_NIX_SQ) { pool_start = otx2_get_pool_idx(pfvf, type, 0); @@ -1225,15 +1267,13 @@ void otx2_free_aura_ptr(struct otx2_nic *pfvf, int type) /* Free SQB and RQB pointers from the aura pool */ for (pool_id = pool_start; pool_id < pool_end; pool_id++) { iova = otx2_aura_allocptr(pfvf, pool_id); + pool = &pfvf->qset.pool[pool_id]; while (iova) { if (type == AURA_NIX_RQ) iova -= OTX2_HEAD_ROOM; - pa = otx2_iova_to_phys(pfvf->iommu_domain, iova); - dma_unmap_page_attrs(pfvf->dev, iova, size, - DMA_FROM_DEVICE, - DMA_ATTR_SKIP_CPU_SYNC); - put_page(virt_to_page(phys_to_virt(pa))); + otx2_free_bufs(pfvf, pool, iova, size); + iova = otx2_aura_allocptr(pfvf, pool_id); } } @@ -1251,6 +1291,8 @@ void otx2_aura_pool_free(struct otx2_nic *pfvf) pool = &pfvf->qset.pool[pool_id]; qmem_free(pfvf->dev, pool->stack); qmem_free(pfvf->dev, pool->fc_addr); + page_pool_destroy(pool->page_pool); + pool->page_pool = NULL; } devm_kfree(pfvf->dev, pfvf->qset.pool); pfvf->qset.pool = NULL; @@ -1334,8 +1376,9 @@ int otx2_aura_init(struct otx2_nic *pfvf, int aura_id, } int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, - int stack_pages, int numptrs, int buf_size) + int stack_pages, int numptrs, int buf_size, int type) { + struct page_pool_params pp_params = { 0 }; struct npa_aq_enq_req *aq; struct otx2_pool *pool; int err; @@ -1379,6 +1422,22 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, aq->ctype = NPA_AQ_CTYPE_POOL; aq->op = NPA_AQ_INSTOP_INIT; + if (type != AURA_NIX_RQ) { + pool->page_pool = NULL; + return 0; + } + + pp_params.flags = PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP; + pp_params.pool_size = numptrs; + pp_params.nid = NUMA_NO_NODE; + pp_params.dev = pfvf->dev; + pp_params.dma_dir = DMA_FROM_DEVICE; + pool->page_pool = page_pool_create(&pp_params); + if (IS_ERR(pool->page_pool)) { + netdev_err(pfvf->netdev, "Creation of page pool failed\n"); + return PTR_ERR(pool->page_pool); + } + return 0; } @@ -1413,7 +1472,7 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf) /* Initialize pool context */ err = otx2_pool_init(pfvf, pool_id, stack_pages, - num_sqbs, hw->sqb_size); + num_sqbs, hw->sqb_size, AURA_NIX_SQ); if (err) goto fail; } @@ -1476,7 +1535,7 @@ int otx2_rq_aura_pool_init(struct otx2_nic *pfvf) } for (pool_id = 0; pool_id < hw->rqpool_cnt; pool_id++) { err = otx2_pool_init(pfvf, pool_id, stack_pages, - num_ptrs, pfvf->rbsize); + num_ptrs, pfvf->rbsize, AURA_NIX_RQ); if (err) goto fail; } @@ -1660,7 +1719,6 @@ int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable) req->bpid_per_chan = 0; #endif - return otx2_sync_mbox_msg(&pfvf->mbox); } EXPORT_SYMBOL(otx2_nix_config_bp); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index c15d1864a637..4f0ac8158ed1 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -934,7 +934,7 @@ int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable); void otx2_ctx_disable(struct mbox *mbox, int type, bool npa); int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable); -void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq); +void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, int qidx); void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq); int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura); int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); @@ -942,7 +942,7 @@ int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, dma_addr_t *dma); int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, - int stack_pages, int numptrs, int buf_size); + int stack_pages, int numptrs, int buf_size, int type); int otx2_aura_init(struct otx2_nic *pfvf, int aura_id, int pool_id, int numptrs); @@ -1012,6 +1012,8 @@ u16 otx2_get_max_mtu(struct otx2_nic *pfvf); int otx2_handle_ntuple_tc_features(struct net_device *netdev, netdev_features_t features); int otx2_smq_flush(struct otx2_nic *pfvf, int smq); +void otx2_free_bufs(struct otx2_nic *pfvf, struct otx2_pool *pool, + u64 iova, int size); /* tc support */ int otx2_init_tc(struct otx2_nic *nic); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 6b7fb324e756..8385b4673693 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1591,7 +1591,9 @@ static void otx2_free_hw_resources(struct otx2_nic *pf) struct nix_lf_free_req *free_req; struct mbox *mbox = &pf->mbox; struct otx2_cq_queue *cq; + struct otx2_pool *pool; struct msg_req *req; + int pool_id; int qidx; /* Ensure all SQE are processed */ @@ -1618,7 +1620,7 @@ static void otx2_free_hw_resources(struct otx2_nic *pf) for (qidx = 0; qidx < qset->cq_cnt; qidx++) { cq = &qset->cq[qidx]; if (cq->cq_type == CQ_RX) - otx2_cleanup_rx_cqes(pf, cq); + otx2_cleanup_rx_cqes(pf, cq, qidx); else otx2_cleanup_tx_cqes(pf, cq); } @@ -1629,6 +1631,13 @@ static void otx2_free_hw_resources(struct otx2_nic *pf) /* Free RQ buffer pointers*/ otx2_free_aura_ptr(pf, AURA_NIX_RQ); + for (qidx = 0; qidx < pf->hw.rx_queues; qidx++) { + pool_id = otx2_get_pool_idx(pf, AURA_NIX_RQ, qidx); + pool = &pf->qset.pool[pool_id]; + page_pool_destroy(pool->page_pool); + pool->page_pool = NULL; + } + otx2_free_cq_res(pf); /* Free all ingress bandwidth profiles allocated */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index e579183e5239..cc704cd3b5ae 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -218,9 +218,6 @@ static bool otx2_skb_add_frag(struct otx2_nic *pfvf, struct sk_buff *skb, skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page, va - page_address(page) + off, len - off, pfvf->rbsize); - - otx2_dma_unmap_page(pfvf, iova - OTX2_HEAD_ROOM, - pfvf->rbsize, DMA_FROM_DEVICE); return true; } @@ -383,6 +380,8 @@ static void otx2_rcv_pkt_handler(struct otx2_nic *pfvf, if (pfvf->netdev->features & NETIF_F_RXCSUM) skb->ip_summed = CHECKSUM_UNNECESSARY; + skb_mark_for_recycle(skb); + napi_gro_frags(napi); } @@ -1191,11 +1190,13 @@ bool otx2_sq_append_skb(struct net_device *netdev, struct otx2_snd_queue *sq, } EXPORT_SYMBOL(otx2_sq_append_skb); -void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) +void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, int qidx) { struct nix_cqe_rx_s *cqe; + struct otx2_pool *pool; int processed_cqe = 0; - u64 iova, pa; + u16 pool_id; + u64 iova; if (pfvf->xdp_prog) xdp_rxq_info_unreg(&cq->xdp_rxq); @@ -1203,6 +1204,9 @@ void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) if (otx2_nix_cq_op_status(pfvf, cq) || !cq->pend_cqe) return; + pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_RQ, qidx); + pool = &pfvf->qset.pool[pool_id]; + while (cq->pend_cqe) { cqe = (struct nix_cqe_rx_s *)otx2_get_next_cqe(cq); processed_cqe++; @@ -1215,9 +1219,8 @@ void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) continue; } iova = cqe->sg.seg_addr - OTX2_HEAD_ROOM; - pa = otx2_iova_to_phys(pfvf->iommu_domain, iova); - otx2_dma_unmap_page(pfvf, iova, pfvf->rbsize, DMA_FROM_DEVICE); - put_page(virt_to_page(phys_to_virt(pa))); + + otx2_free_bufs(pfvf, pool, iova, pfvf->rbsize); } /* Free CQEs to HW */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index 7ab6db9a986f..b5d689eeff80 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -118,6 +118,7 @@ struct otx2_cq_poll { struct otx2_pool { struct qmem *stack; struct qmem *fc_addr; + struct page_pool *page_pool; u16 rbsize; }; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c index e142d43f5a62..95a2c8e616bd 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c @@ -63,7 +63,7 @@ static int otx2_qos_sq_aura_pool_init(struct otx2_nic *pfvf, int qidx) /* Initialize pool context */ err = otx2_pool_init(pfvf, pool_id, stack_pages, - num_sqbs, hw->sqb_size); + num_sqbs, hw->sqb_size, AURA_NIX_SQ); if (err) goto aura_free; From a0059910132ab03653e8b751a7f0a9ba533c9156 Mon Sep 17 00:00:00 2001 From: Suman Ghosh Date: Thu, 13 Feb 2025 11:01:37 +0530 Subject: [PATCH 442/529] octeontx2-pf: Add AF_XDP non-zero copy support [ Upstream commit b4164de5041b51cda3438e75bce668e2556057c3 ] Set xdp rx ring memory type as MEM_TYPE_PAGE_POOL for af-xdp to work. This is needed since xdp_return_frame internally will use page pools. Fixes: 06059a1a9a4a ("octeontx2-pf: Add XDP support to netdev PF") Signed-off-by: Suman Ghosh Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 5e11599d1322..59a7e6f376f4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -988,6 +988,7 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx) int err, pool_id, non_xdp_queues; struct nix_aq_enq_req *aq; struct otx2_cq_queue *cq; + struct otx2_pool *pool; cq = &qset->cq[qidx]; cq->cq_idx = qidx; @@ -996,8 +997,13 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx) cq->cq_type = CQ_RX; cq->cint_idx = qidx; cq->cqe_cnt = qset->rqe_cnt; - if (pfvf->xdp_prog) + if (pfvf->xdp_prog) { + pool = &qset->pool[qidx]; xdp_rxq_info_reg(&cq->xdp_rxq, pfvf->netdev, qidx, 0); + xdp_rxq_info_reg_mem_model(&cq->xdp_rxq, + MEM_TYPE_PAGE_POOL, + pool->page_pool); + } } else if (qidx < non_xdp_queues) { cq->cq_type = CQ_TX; cq->cint_idx = qidx - pfvf->hw.rx_queues; From b8fcae6d2e93c54cacb8f579a77d827c1c643eb5 Mon Sep 17 00:00:00 2001 From: Wang Liang Date: Tue, 20 May 2025 18:14:04 +0800 Subject: [PATCH 443/529] net/tipc: fix slab-use-after-free Read in tipc_aead_encrypt_done [ Upstream commit e279024617134c94fd3e37470156534d5f2b3472 ] Syzbot reported a slab-use-after-free with the following call trace: ================================================================== BUG: KASAN: slab-use-after-free in tipc_aead_encrypt_done+0x4bd/0x510 net/tipc/crypto.c:840 Read of size 8 at addr ffff88807a733000 by task kworker/1:0/25 Call Trace: kasan_report+0xd9/0x110 mm/kasan/report.c:601 tipc_aead_encrypt_done+0x4bd/0x510 net/tipc/crypto.c:840 crypto_request_complete include/crypto/algapi.h:266 aead_request_complete include/crypto/internal/aead.h:85 cryptd_aead_crypt+0x3b8/0x750 crypto/cryptd.c:772 crypto_request_complete include/crypto/algapi.h:266 cryptd_queue_worker+0x131/0x200 crypto/cryptd.c:181 process_one_work+0x9fb/0x1b60 kernel/workqueue.c:3231 Allocated by task 8355: kzalloc_noprof include/linux/slab.h:778 tipc_crypto_start+0xcc/0x9e0 net/tipc/crypto.c:1466 tipc_init_net+0x2dd/0x430 net/tipc/core.c:72 ops_init+0xb9/0x650 net/core/net_namespace.c:139 setup_net+0x435/0xb40 net/core/net_namespace.c:343 copy_net_ns+0x2f0/0x670 net/core/net_namespace.c:508 create_new_namespaces+0x3ea/0xb10 kernel/nsproxy.c:110 unshare_nsproxy_namespaces+0xc0/0x1f0 kernel/nsproxy.c:228 ksys_unshare+0x419/0x970 kernel/fork.c:3323 __do_sys_unshare kernel/fork.c:3394 Freed by task 63: kfree+0x12a/0x3b0 mm/slub.c:4557 tipc_crypto_stop+0x23c/0x500 net/tipc/crypto.c:1539 tipc_exit_net+0x8c/0x110 net/tipc/core.c:119 ops_exit_list+0xb0/0x180 net/core/net_namespace.c:173 cleanup_net+0x5b7/0xbf0 net/core/net_namespace.c:640 process_one_work+0x9fb/0x1b60 kernel/workqueue.c:3231 After freed the tipc_crypto tx by delete namespace, tipc_aead_encrypt_done may still visit it in cryptd_queue_worker workqueue. I reproduce this issue by: ip netns add ns1 ip link add veth1 type veth peer name veth2 ip link set veth1 netns ns1 ip netns exec ns1 tipc bearer enable media eth dev veth1 ip netns exec ns1 tipc node set key this_is_a_master_key master ip netns exec ns1 tipc bearer disable media eth dev veth1 ip netns del ns1 The key of reproduction is that, simd_aead_encrypt is interrupted, leading to crypto_simd_usable() return false. Thus, the cryptd_queue_worker is triggered, and the tipc_crypto tx will be visited. tipc_disc_timeout tipc_bearer_xmit_skb tipc_crypto_xmit tipc_aead_encrypt crypto_aead_encrypt // encrypt() simd_aead_encrypt // crypto_simd_usable() is false child = &ctx->cryptd_tfm->base; simd_aead_encrypt crypto_aead_encrypt // encrypt() cryptd_aead_encrypt_enqueue cryptd_aead_enqueue cryptd_enqueue_request // trigger cryptd_queue_worker queue_work_on(smp_processor_id(), cryptd_wq, &cpu_queue->work) Fix this by holding net reference count before encrypt. Reported-by: syzbot+55c12726619ff85ce1f6@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=55c12726619ff85ce1f6 Fixes: fc1b6d6de220 ("tipc: introduce TIPC encryption & authentication") Signed-off-by: Wang Liang Link: https://patch.msgid.link/20250520101404.1341730-1-wangliang74@huawei.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/tipc/crypto.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/tipc/crypto.c b/net/tipc/crypto.c index 25c18f8783ce..a9c02fac039b 100644 --- a/net/tipc/crypto.c +++ b/net/tipc/crypto.c @@ -817,12 +817,16 @@ static int tipc_aead_encrypt(struct tipc_aead *aead, struct sk_buff *skb, goto exit; } + /* Get net to avoid freed tipc_crypto when delete namespace */ + get_net(aead->crypto->net); + /* Now, do encrypt */ rc = crypto_aead_encrypt(req); if (rc == -EINPROGRESS || rc == -EBUSY) return rc; tipc_bearer_put(b); + put_net(aead->crypto->net); exit: kfree(ctx); @@ -860,6 +864,7 @@ static void tipc_aead_encrypt_done(struct crypto_async_request *base, int err) kfree(tx_ctx); tipc_bearer_put(b); tipc_aead_put(aead); + put_net(net); } /** From 7b1357a108ecb3e107c6e8ea6af6e62fd58c07ad Mon Sep 17 00:00:00 2001 From: Subbaraya Sundeep Date: Wed, 21 May 2025 11:38:33 +0530 Subject: [PATCH 444/529] octeontx2-af: Set LMT_ENA bit for APR table entries [ Upstream commit 0eefa27b493306928d88af6368193b134c98fd64 ] This patch enables the LMT line for a PF/VF by setting the LMT_ENA bit in the APR_LMT_MAP_ENTRY_S structure. Additionally, it simplifies the logic for calculating the LMTST table index by consistently using the maximum number of hw supported VFs (i.e., 256). Fixes: 873a1e3d207a ("octeontx2-af: cn10k: Setting up lmtst map table"). Signed-off-by: Subbaraya Sundeep Signed-off-by: Geetha sowjanya Reviewed-by: Michal Swiatkowski Link: https://patch.msgid.link/20250521060834.19780-2-gakula@marvell.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../net/ethernet/marvell/octeontx2/af/rvu_cn10k.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c index f9faa5b23bb9..6ec0609074dc 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c @@ -15,13 +15,17 @@ #define LMT_TBL_OP_WRITE 1 #define LMT_MAP_TABLE_SIZE (128 * 1024) #define LMT_MAPTBL_ENTRY_SIZE 16 +#define LMT_MAX_VFS 256 + +#define LMT_MAP_ENTRY_ENA BIT_ULL(20) +#define LMT_MAP_ENTRY_LINES GENMASK_ULL(18, 16) /* Function to perform operations (read/write) on lmtst map table */ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val, int lmt_tbl_op) { void __iomem *lmt_map_base; - u64 tbl_base; + u64 tbl_base, cfg; tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE); @@ -35,6 +39,13 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val, *val = readq(lmt_map_base + index); } else { writeq((*val), (lmt_map_base + index)); + + cfg = FIELD_PREP(LMT_MAP_ENTRY_ENA, 0x1); + /* 2048 LMTLINES */ + cfg |= FIELD_PREP(LMT_MAP_ENTRY_LINES, 0x6); + + writeq(cfg, (lmt_map_base + (index + 8))); + /* Flushing the AP interceptor cache to make APR_LMT_MAP_ENTRY_S * changes effective. Write 1 for flush and read is being used as a * barrier and sets up a data dependency. Write to 0 after a write @@ -52,7 +63,7 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val, #define LMT_MAP_TBL_W1_OFF 8 static u32 rvu_get_lmtst_tbl_index(struct rvu *rvu, u16 pcifunc) { - return ((rvu_get_pf(pcifunc) * rvu->hw->total_vfs) + + return ((rvu_get_pf(pcifunc) * LMT_MAX_VFS) + (pcifunc & RVU_PFVF_FUNC_MASK)) * LMT_MAPTBL_ENTRY_SIZE; } From 3cb003b959d7e5a1abaddf5434fb8a5a69b916a2 Mon Sep 17 00:00:00 2001 From: Geetha sowjanya Date: Wed, 21 May 2025 11:38:34 +0530 Subject: [PATCH 445/529] octeontx2-af: Fix APR entry mapping based on APR_LMT_CFG [ Upstream commit a6ae7129819ad20788e610261246e71736543b8b ] The current implementation maps the APR table using a fixed size, which can lead to incorrect mapping when the number of PFs and VFs varies. This patch corrects the mapping by calculating the APR table size dynamically based on the values configured in the APR_LMT_CFG register, ensuring accurate representation of APR entries in debugfs. Fixes: 0daa55d033b0 ("octeontx2-af: cn10k: debugfs for dumping LMTST map table"). Signed-off-by: Geetha sowjanya Link: https://patch.msgid.link/20250521060834.19780-3-gakula@marvell.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c | 9 ++++++--- .../net/ethernet/marvell/octeontx2/af/rvu_debugfs.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c index 6ec0609074dc..5cd45846237e 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cn10k.c @@ -13,7 +13,6 @@ /* RVU LMTST */ #define LMT_TBL_OP_READ 0 #define LMT_TBL_OP_WRITE 1 -#define LMT_MAP_TABLE_SIZE (128 * 1024) #define LMT_MAPTBL_ENTRY_SIZE 16 #define LMT_MAX_VFS 256 @@ -26,10 +25,14 @@ static int lmtst_map_table_ops(struct rvu *rvu, u32 index, u64 *val, { void __iomem *lmt_map_base; u64 tbl_base, cfg; + int pfs, vfs; tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE); + cfg = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_CFG); + vfs = 1 << (cfg & 0xF); + pfs = 1 << ((cfg >> 4) & 0x7); - lmt_map_base = ioremap_wc(tbl_base, LMT_MAP_TABLE_SIZE); + lmt_map_base = ioremap_wc(tbl_base, pfs * vfs * LMT_MAPTBL_ENTRY_SIZE); if (!lmt_map_base) { dev_err(rvu->dev, "Failed to setup lmt map table mapping!!\n"); return -ENOMEM; @@ -80,7 +83,7 @@ static int rvu_get_lmtaddr(struct rvu *rvu, u16 pcifunc, mutex_lock(&rvu->rsrc_lock); rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_ADDR_REQ, iova); - pf = rvu_get_pf(pcifunc) & 0x1F; + pf = rvu_get_pf(pcifunc) & RVU_PFVF_PF_MASK; val = BIT_ULL(63) | BIT_ULL(14) | BIT_ULL(13) | pf << 8 | ((pcifunc & RVU_PFVF_FUNC_MASK) & 0xFF); rvu_write64(rvu, BLKADDR_RVUM, RVU_AF_SMMU_TXN_REQ, val); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c index a3c1d82032f5..aa2ab987eb75 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c @@ -580,6 +580,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, u64 lmt_addr, val, tbl_base; int pf, vf, num_vfs, hw_vfs; void __iomem *lmt_map_base; + int apr_pfs, apr_vfs; int buf_size = 10240; size_t off = 0; int index = 0; @@ -595,8 +596,12 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, return -ENOMEM; tbl_base = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_MAP_BASE); + val = rvu_read64(rvu, BLKADDR_APR, APR_AF_LMT_CFG); + apr_vfs = 1 << (val & 0xF); + apr_pfs = 1 << ((val >> 4) & 0x7); - lmt_map_base = ioremap_wc(tbl_base, 128 * 1024); + lmt_map_base = ioremap_wc(tbl_base, apr_pfs * apr_vfs * + LMT_MAPTBL_ENTRY_SIZE); if (!lmt_map_base) { dev_err(rvu->dev, "Failed to setup lmt map table mapping!!\n"); kfree(buf); @@ -618,7 +623,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, off += scnprintf(&buf[off], buf_size - 1 - off, "PF%d \t\t\t", pf); - index = pf * rvu->hw->total_vfs * LMT_MAPTBL_ENTRY_SIZE; + index = pf * apr_vfs * LMT_MAPTBL_ENTRY_SIZE; off += scnprintf(&buf[off], buf_size - 1 - off, " 0x%llx\t\t", (tbl_base + index)); lmt_addr = readq(lmt_map_base + index); @@ -631,7 +636,7 @@ static ssize_t rvu_dbg_lmtst_map_table_display(struct file *filp, /* Reading num of VFs per PF */ rvu_get_pf_numvfs(rvu, pf, &num_vfs, &hw_vfs); for (vf = 0; vf < num_vfs; vf++) { - index = (pf * rvu->hw->total_vfs * 16) + + index = (pf * apr_vfs * LMT_MAPTBL_ENTRY_SIZE) + ((vf + 1) * LMT_MAPTBL_ENTRY_SIZE); off += scnprintf(&buf[off], buf_size - 1 - off, "PF%d:VF%d \t\t", pf, vf); From f0f3d09f53534ea385d55ced408f2b67059b16e4 Mon Sep 17 00:00:00 2001 From: Ivan Pravdin Date: Sun, 18 May 2025 18:41:02 -0400 Subject: [PATCH 446/529] crypto: algif_hash - fix double free in hash_accept commit b2df03ed4052e97126267e8c13ad4204ea6ba9b6 upstream. If accept(2) is called on socket type algif_hash with MSG_MORE flag set and crypto_ahash_import fails, sk2 is freed. However, it is also freed in af_alg_release, leading to slab-use-after-free error. Fixes: fe869cdb89c9 ("crypto: algif_hash - User-space interface for hash operations") Cc: Signed-off-by: Ivan Pravdin Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/algif_hash.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index 1d017ec5c63c..84f4e9c2b5d9 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -263,10 +263,6 @@ static int hash_accept(struct socket *sock, struct socket *newsock, int flags, return err; err = crypto_ahash_import(&ctx2->req, state); - if (err) { - sock_orphan(sk2); - sock_put(sk2); - } return err; } From cceb15864e1612ebfbc10ec4e4dcd19a10c0056c Mon Sep 17 00:00:00 2001 From: Dominik Grzegorzek Date: Sun, 18 May 2025 19:45:31 +0200 Subject: [PATCH 447/529] padata: do not leak refcount in reorder_work commit d6ebcde6d4ecf34f8495fb30516645db3aea8993 upstream. A recent patch that addressed a UAF introduced a reference count leak: the parallel_data refcount is incremented unconditionally, regardless of the return value of queue_work(). If the work item is already queued, the incremented refcount is never decremented. Fix this by checking the return value of queue_work() and decrementing the refcount when necessary. Resolves: Unreferenced object 0xffff9d9f421e3d80 (size 192): comm "cryptomgr_probe", pid 157, jiffies 4294694003 hex dump (first 32 bytes): 80 8b cf 41 9f 9d ff ff b8 97 e0 89 ff ff ff ff ...A............ d0 97 e0 89 ff ff ff ff 19 00 00 00 1f 88 23 00 ..............#. backtrace (crc 838fb36): __kmalloc_cache_noprof+0x284/0x320 padata_alloc_pd+0x20/0x1e0 padata_alloc_shell+0x3b/0xa0 0xffffffffc040a54d cryptomgr_probe+0x43/0xc0 kthread+0xf6/0x1f0 ret_from_fork+0x2f/0x50 ret_from_fork_asm+0x1a/0x30 Fixes: dd7d37ccf6b1 ("padata: avoid UAF for reorder_work") Cc: Signed-off-by: Dominik Grzegorzek Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- kernel/padata.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/padata.c b/kernel/padata.c index e2bef90e6fd0..b3f79f23060c 100644 --- a/kernel/padata.c +++ b/kernel/padata.c @@ -350,7 +350,8 @@ static void padata_reorder(struct parallel_data *pd) * To avoid UAF issue, add pd ref here, and put pd ref after reorder_work finish. */ padata_get_pd(pd); - queue_work(pinst->serial_wq, &pd->reorder_work); + if (!queue_work(pinst->serial_wq, &pd->reorder_work)) + padata_put_pd(pd); } } From 24ee050bb96d755a4de295f460a9028ba351209a Mon Sep 17 00:00:00 2001 From: Carlos Sanchez Date: Tue, 20 May 2025 12:23:05 +0200 Subject: [PATCH 448/529] can: slcan: allow reception of short error messages commit ef0841e4cb08754be6cb42bf97739fce5d086e5f upstream. Allows slcan to receive short messages (typically errors) from the serial interface. When error support was added to slcan protocol in b32ff4668544e1333b694fcc7812b2d7397b4d6a ("can: slcan: extend the protocol with error info") the minimum valid message size changed from 5 (minimum standard can frame tIII0) to 3 ("e1a" is a valid protocol message, it is one of the examples given in the comments for slcan_bump_err() ), but the check for minimum message length prodicating all decoding was not adjusted. This makes short error messages discarded and error frames not being generated. This patch changes the minimum length to the new minimum (3 characters, excluding terminator, is now a valid message). Signed-off-by: Carlos Sanchez Fixes: b32ff4668544 ("can: slcan: extend the protocol with error info") Reviewed-by: Vincent Mailhol Link: https://patch.msgid.link/20250520102305.1097494-1-carlossanchez@geotab.com Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- drivers/net/can/slcan/slcan-core.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/drivers/net/can/slcan/slcan-core.c b/drivers/net/can/slcan/slcan-core.c index f4db77007c13..982637f8ea2f 100644 --- a/drivers/net/can/slcan/slcan-core.c +++ b/drivers/net/can/slcan/slcan-core.c @@ -71,12 +71,21 @@ MODULE_AUTHOR("Dario Binacchi "); #define SLCAN_CMD_LEN 1 #define SLCAN_SFF_ID_LEN 3 #define SLCAN_EFF_ID_LEN 8 +#define SLCAN_DATA_LENGTH_LEN 1 +#define SLCAN_ERROR_LEN 1 #define SLCAN_STATE_LEN 1 #define SLCAN_STATE_BE_RXCNT_LEN 3 #define SLCAN_STATE_BE_TXCNT_LEN 3 -#define SLCAN_STATE_FRAME_LEN (1 + SLCAN_CMD_LEN + \ - SLCAN_STATE_BE_RXCNT_LEN + \ - SLCAN_STATE_BE_TXCNT_LEN) +#define SLCAN_STATE_MSG_LEN (SLCAN_CMD_LEN + \ + SLCAN_STATE_LEN + \ + SLCAN_STATE_BE_RXCNT_LEN + \ + SLCAN_STATE_BE_TXCNT_LEN) +#define SLCAN_ERROR_MSG_LEN_MIN (SLCAN_CMD_LEN + \ + SLCAN_ERROR_LEN + \ + SLCAN_DATA_LENGTH_LEN) +#define SLCAN_FRAME_MSG_LEN_MIN (SLCAN_CMD_LEN + \ + SLCAN_SFF_ID_LEN + \ + SLCAN_DATA_LENGTH_LEN) struct slcan { struct can_priv can; @@ -176,6 +185,9 @@ static void slcan_bump_frame(struct slcan *sl) u32 tmpid; char *cmd = sl->rbuff; + if (sl->rcount < SLCAN_FRAME_MSG_LEN_MIN) + return; + skb = alloc_can_skb(sl->dev, &cf); if (unlikely(!skb)) { sl->dev->stats.rx_dropped++; @@ -281,7 +293,7 @@ static void slcan_bump_state(struct slcan *sl) return; } - if (state == sl->can.state || sl->rcount < SLCAN_STATE_FRAME_LEN) + if (state == sl->can.state || sl->rcount != SLCAN_STATE_MSG_LEN) return; cmd += SLCAN_STATE_BE_RXCNT_LEN + SLCAN_CMD_LEN + 1; @@ -328,6 +340,9 @@ static void slcan_bump_err(struct slcan *sl) bool rx_errors = false, tx_errors = false, rx_over_errors = false; int i, len; + if (sl->rcount < SLCAN_ERROR_MSG_LEN_MIN) + return; + /* get len from sanitized ASCII value */ len = cmd[1]; if (len >= '0' && len < '9') @@ -456,8 +471,7 @@ static void slcan_bump(struct slcan *sl) static void slcan_unesc(struct slcan *sl, unsigned char s) { if ((s == '\r') || (s == '\a')) { /* CR or BEL ends the pdu */ - if (!test_and_clear_bit(SLF_ERROR, &sl->flags) && - sl->rcount > 4) + if (!test_and_clear_bit(SLF_ERROR, &sl->flags)) slcan_bump(sl); sl->rcount = 0; From 2a437b86ac5a9893c902f30ef66815bf13587bf6 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 19 May 2025 14:50:26 +0200 Subject: [PATCH 449/529] can: bcm: add locking for bcm_op runtime updates commit c2aba69d0c36a496ab4f2e81e9c2b271f2693fd7 upstream. The CAN broadcast manager (CAN BCM) can send a sequence of CAN frames via hrtimer. The content and also the length of the sequence can be changed resp reduced at runtime where the 'currframe' counter is then set to zero. Although this appeared to be a safe operation the updates of 'currframe' can be triggered from user space and hrtimer context in bcm_can_tx(). Anderson Nascimento created a proof of concept that triggered a KASAN slab-out-of-bounds read access which can be prevented with a spin_lock_bh. At the rework of bcm_can_tx() the 'count' variable has been moved into the protected section as this variable can be modified from both contexts too. Fixes: ffd980f976e7 ("[CAN]: Add broadcast manager (bcm) protocol") Reported-by: Anderson Nascimento Tested-by: Anderson Nascimento Reviewed-by: Marc Kleine-Budde Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20250519125027.11900-1-socketcan@hartkopp.net Cc: stable@vger.kernel.org Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/bcm.c | 66 +++++++++++++++++++++++++++++++++++---------------- 1 file changed, 45 insertions(+), 21 deletions(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index 3817692e83af..d1fad212a7cc 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -58,6 +58,7 @@ #include #include #include +#include #include #include @@ -120,6 +121,7 @@ struct bcm_op { struct canfd_frame last_sframe; struct sock *sk; struct net_device *rx_reg_dev; + spinlock_t bcm_tx_lock; /* protect currframe/count in runtime updates */ }; struct bcm_sock { @@ -273,13 +275,18 @@ static void bcm_can_tx(struct bcm_op *op) { struct sk_buff *skb; struct net_device *dev; - struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; + struct canfd_frame *cf; int err; /* no target device? => exit */ if (!op->ifindex) return; + /* read currframe under lock protection */ + spin_lock_bh(&op->bcm_tx_lock); + cf = op->frames + op->cfsiz * op->currframe; + spin_unlock_bh(&op->bcm_tx_lock); + dev = dev_get_by_index(sock_net(op->sk), op->ifindex); if (!dev) { /* RFC: should this bcm_op remove itself here? */ @@ -300,6 +307,10 @@ static void bcm_can_tx(struct bcm_op *op) skb->dev = dev; can_skb_set_owner(skb, op->sk); err = can_send(skb, 1); + + /* update currframe and count under lock protection */ + spin_lock_bh(&op->bcm_tx_lock); + if (!err) op->frames_abs++; @@ -308,6 +319,11 @@ static void bcm_can_tx(struct bcm_op *op) /* reached last frame? */ if (op->currframe >= op->nframes) op->currframe = 0; + + if (op->count > 0) + op->count--; + + spin_unlock_bh(&op->bcm_tx_lock); out: dev_put(dev); } @@ -404,7 +420,7 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) struct bcm_msg_head msg_head; if (op->kt_ival1 && (op->count > 0)) { - op->count--; + bcm_can_tx(op); if (!op->count && (op->flags & TX_COUNTEVT)) { /* create notification to user */ @@ -419,7 +435,6 @@ static enum hrtimer_restart bcm_tx_timeout_handler(struct hrtimer *hrtimer) bcm_send_to_user(op, &msg_head, NULL, 0); } - bcm_can_tx(op); } else if (op->kt_ival2) { bcm_can_tx(op); @@ -914,6 +929,27 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } op->flags = msg_head->flags; + /* only lock for unlikely count/nframes/currframe changes */ + if (op->nframes != msg_head->nframes || + op->flags & TX_RESET_MULTI_IDX || + op->flags & SETTIMER) { + + spin_lock_bh(&op->bcm_tx_lock); + + if (op->nframes != msg_head->nframes || + op->flags & TX_RESET_MULTI_IDX) { + /* potentially update changed nframes */ + op->nframes = msg_head->nframes; + /* restart multiple frame transmission */ + op->currframe = 0; + } + + if (op->flags & SETTIMER) + op->count = msg_head->count; + + spin_unlock_bh(&op->bcm_tx_lock); + } + } else { /* insert new BCM operation for the given can_id */ @@ -921,9 +957,14 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, if (!op) return -ENOMEM; + spin_lock_init(&op->bcm_tx_lock); op->can_id = msg_head->can_id; op->cfsiz = CFSIZ(msg_head->flags); op->flags = msg_head->flags; + op->nframes = msg_head->nframes; + + if (op->flags & SETTIMER) + op->count = msg_head->count; /* create array for CAN frames and copy the data */ if (msg_head->nframes > 1) { @@ -982,22 +1023,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, } /* if ((op = bcm_find_op(&bo->tx_ops, msg_head->can_id, ifindex))) */ - if (op->nframes != msg_head->nframes) { - op->nframes = msg_head->nframes; - /* start multiple frame transmission with index 0 */ - op->currframe = 0; - } - - /* check flags */ - - if (op->flags & TX_RESET_MULTI_IDX) { - /* start multiple frame transmission with index 0 */ - op->currframe = 0; - } - if (op->flags & SETTIMER) { /* set timer values */ - op->count = msg_head->count; op->ival1 = msg_head->ival1; op->ival2 = msg_head->ival2; op->kt_ival1 = bcm_timeval_to_ktime(msg_head->ival1); @@ -1014,11 +1041,8 @@ static int bcm_tx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, op->flags |= TX_ANNOUNCE; } - if (op->flags & TX_ANNOUNCE) { + if (op->flags & TX_ANNOUNCE) bcm_can_tx(op); - if (op->count) - op->count--; - } if (op->flags & STARTTIMER) bcm_tx_start_timer(op); From 6d7d458c41b98a5c1670cbd36f2923c37de51cf5 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 19 May 2025 14:50:27 +0200 Subject: [PATCH 450/529] can: bcm: add missing rcu read protection for procfs content commit dac5e6249159ac255dad9781793dbe5908ac9ddb upstream. When the procfs content is generated for a bcm_op which is in the process to be removed the procfs output might show unreliable data (UAF). As the removal of bcm_op's is already implemented with rcu handling this patch adds the missing rcu_read_lock() and makes sure the list entries are properly removed under rcu protection. Fixes: f1b4e32aca08 ("can: bcm: use call_rcu() instead of costly synchronize_rcu()") Reported-by: Anderson Nascimento Suggested-by: Anderson Nascimento Tested-by: Anderson Nascimento Signed-off-by: Oliver Hartkopp Link: https://patch.msgid.link/20250519125027.11900-2-socketcan@hartkopp.net Cc: stable@vger.kernel.org # >= 5.4 Signed-off-by: Marc Kleine-Budde Signed-off-by: Greg Kroah-Hartman --- net/can/bcm.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/can/bcm.c b/net/can/bcm.c index d1fad212a7cc..4fb5cfaf74f3 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -207,7 +207,9 @@ static int bcm_proc_show(struct seq_file *m, void *v) seq_printf(m, " / bound %s", bcm_proc_getifname(net, ifname, bo->ifindex)); seq_printf(m, " <<<\n"); - list_for_each_entry(op, &bo->rx_ops, list) { + rcu_read_lock(); + + list_for_each_entry_rcu(op, &bo->rx_ops, list) { unsigned long reduction; @@ -263,6 +265,9 @@ static int bcm_proc_show(struct seq_file *m, void *v) seq_printf(m, "# sent %ld\n", op->frames_abs); } seq_putc(m, '\n'); + + rcu_read_unlock(); + return 0; } #endif /* CONFIG_PROC_FS */ @@ -816,7 +821,7 @@ static int bcm_delete_rx_op(struct list_head *ops, struct bcm_msg_head *mh, REGMASK(op->can_id), bcm_rx_handler, op); - list_del(&op->list); + list_del_rcu(&op->list); bcm_remove_op(op); return 1; /* done */ } @@ -836,7 +841,7 @@ static int bcm_delete_tx_op(struct list_head *ops, struct bcm_msg_head *mh, list_for_each_entry_safe(op, n, ops, list) { if ((op->can_id == mh->can_id) && (op->ifindex == ifindex) && (op->flags & CAN_FD_FRAME) == (mh->flags & CAN_FD_FRAME)) { - list_del(&op->list); + list_del_rcu(&op->list); bcm_remove_op(op); return 1; /* done */ } @@ -1258,7 +1263,7 @@ static int bcm_rx_setup(struct bcm_msg_head *msg_head, struct msghdr *msg, bcm_rx_handler, op, "bcm", sk); if (err) { /* this bcm rx op is broken -> remove it */ - list_del(&op->list); + list_del_rcu(&op->list); bcm_remove_op(op); return err; } From f3e14d706ec18faf19f5a6e75060e140fea05d4a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 16 May 2025 10:08:16 +0200 Subject: [PATCH 451/529] ALSA: pcm: Fix race of buffer access at PCM OSS layer commit 93a81ca0657758b607c3f4ba889ae806be9beb73 upstream. The PCM OSS layer tries to clear the buffer with the silence data at initialization (or reconfiguration) of a stream with the explicit call of snd_pcm_format_set_silence() with runtime->dma_area. But this may lead to a UAF because the accessed runtime->dma_area might be freed concurrently, as it's performed outside the PCM ops. For avoiding it, move the code into the PCM core and perform it inside the buffer access lock, so that it won't be changed during the operation. Reported-by: syzbot+32d4647f551007595173@syzkaller.appspotmail.com Closes: https://lore.kernel.org/68164d8e.050a0220.11da1b.0019.GAE@google.com Cc: Link: https://patch.msgid.link/20250516080817.20068-1-tiwai@suse.de Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/sound/pcm.h | 2 ++ sound/core/oss/pcm_oss.c | 3 +-- sound/core/pcm_native.c | 11 +++++++++++ 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/include/sound/pcm.h b/include/sound/pcm.h index 27040b472a4f..51881ffbd155 100644 --- a/include/sound/pcm.h +++ b/include/sound/pcm.h @@ -1429,6 +1429,8 @@ int snd_pcm_lib_mmap_iomem(struct snd_pcm_substream *substream, struct vm_area_s #define snd_pcm_lib_mmap_iomem NULL #endif +void snd_pcm_runtime_buffer_set_silence(struct snd_pcm_runtime *runtime); + /** * snd_pcm_limit_isa_dma_size - Get the max size fitting with ISA DMA transfer * @dma: DMA number diff --git a/sound/core/oss/pcm_oss.c b/sound/core/oss/pcm_oss.c index ac2efeb63a39..2b9fada97532 100644 --- a/sound/core/oss/pcm_oss.c +++ b/sound/core/oss/pcm_oss.c @@ -1085,8 +1085,7 @@ static int snd_pcm_oss_change_params_locked(struct snd_pcm_substream *substream) runtime->oss.params = 0; runtime->oss.prepare = 1; runtime->oss.buffer_used = 0; - if (runtime->dma_area) - snd_pcm_format_set_silence(runtime->format, runtime->dma_area, bytes_to_samples(runtime, runtime->dma_bytes)); + snd_pcm_runtime_buffer_set_silence(runtime); runtime->oss.period_frames = snd_pcm_alsa_frames(substream, oss_period_size); diff --git a/sound/core/pcm_native.c b/sound/core/pcm_native.c index f46b87ca76d0..bf752b188b05 100644 --- a/sound/core/pcm_native.c +++ b/sound/core/pcm_native.c @@ -703,6 +703,17 @@ static void snd_pcm_buffer_access_unlock(struct snd_pcm_runtime *runtime) atomic_inc(&runtime->buffer_accessing); } +/* fill the PCM buffer with the current silence format; called from pcm_oss.c */ +void snd_pcm_runtime_buffer_set_silence(struct snd_pcm_runtime *runtime) +{ + snd_pcm_buffer_access_lock(runtime); + if (runtime->dma_area) + snd_pcm_format_set_silence(runtime->format, runtime->dma_area, + bytes_to_samples(runtime, runtime->dma_bytes)); + snd_pcm_buffer_access_unlock(runtime); +} +EXPORT_SYMBOL_GPL(snd_pcm_runtime_buffer_set_silence); + #if IS_ENABLED(CONFIG_SND_PCM_OSS) #define is_oss_stream(substream) ((substream)->oss.oss) #else From eefcc081ac99346a8a29e9e8607301355936cd4b Mon Sep 17 00:00:00 2001 From: Ed Burcher Date: Mon, 19 May 2025 23:49:07 +0100 Subject: [PATCH 452/529] ALSA: hda/realtek: Add quirk for Lenovo Yoga Pro 7 14ASP10 commit 8d70503068510e6080c2c649cccb154f16de26c9 upstream. Lenovo Yoga Pro 7 (gen 10) with Realtek ALC3306 and combined CS35L56 amplifiers need quirk ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN to enable bass Signed-off-by: Ed Burcher Cc: Link: https://patch.msgid.link/20250519224907.31265-2-git@edburcher.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 682ae18e211c..115c675819e6 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10379,6 +10379,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x17aa, 0x3866, "Lenovo 13X", ALC287_FIXUP_CS35L41_I2C_2), SND_PCI_QUIRK(0x17aa, 0x3869, "Lenovo Yoga7 14IAL7", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x3902, "Lenovo E50-80", ALC269_FIXUP_DMIC_THINKPAD_ACPI), + SND_PCI_QUIRK(0x17aa, 0x390d, "Lenovo Yoga Pro 7 14ASP10", ALC287_FIXUP_YOGA9_14IAP7_BASS_SPK_PIN), SND_PCI_QUIRK(0x17aa, 0x3913, "Lenovo 145", ALC236_FIXUP_LENOVO_INV_DMIC), SND_PCI_QUIRK(0x17aa, 0x3977, "IdeaPad S210", ALC283_FIXUP_INT_MIC), SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo B50-70", ALC269_FIXUP_DMIC_THINKPAD_ACPI), From 836024a6c89ab92da28aaa5aa5849dcf763eb906 Mon Sep 17 00:00:00 2001 From: Ilia Gavrilov Date: Thu, 15 May 2025 12:20:15 +0000 Subject: [PATCH 453/529] llc: fix data loss when reading from a socket in llc_ui_recvmsg() commit 239af1970bcb039a1551d2c438d113df0010c149 upstream. For SOCK_STREAM sockets, if user buffer size (len) is less than skb size (skb->len), the remaining data from skb will be lost after calling kfree_skb(). To fix this, move the statement for partial reading above skb deletion. Found by InfoTeCS on behalf of Linux Verification Center (linuxtesting.org) Fixes: 30a584d944fb ("[LLX]: SOCK_DGRAM interface fixes") Cc: stable@vger.kernel.org Signed-off-by: Ilia Gavrilov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/llc/af_llc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 447031c5eac4..39c7d62c933f 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -888,15 +888,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, if (sk->sk_type != SOCK_STREAM) goto copy_uaddr; + /* Partial read */ + if (used + offset < skb_len) + continue; + if (!(flags & MSG_PEEK)) { skb_unlink(skb, &sk->sk_receive_queue); kfree_skb(skb); *seq = 0; } - - /* Partial read */ - if (used + offset < skb_len) - continue; } while (len > 0); out: From 60bd13f8c4b3de2c910ae1cdbef85b9bbc9685f5 Mon Sep 17 00:00:00 2001 From: Vladimir Moskovkin Date: Wed, 14 May 2025 12:12:55 +0000 Subject: [PATCH 454/529] platform/x86: dell-wmi-sysman: Avoid buffer overflow in current_password_store() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4e89a4077490f52cde652d17e32519b666abf3a6 upstream. If the 'buf' array received from the user contains an empty string, the 'length' variable will be zero. Accessing the 'buf' array element with index 'length - 1' will result in a buffer overflow. Add a check for an empty string. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: e8a60aa7404b ("platform/x86: Introduce support for Systems Management Driver over WMI for Dell Systems") Cc: stable@vger.kernel.org Signed-off-by: Vladimir Moskovkin Link: https://lore.kernel.org/r/39973642a4f24295b4a8fad9109c5b08@kaspersky.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Signed-off-by: Greg Kroah-Hartman --- drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c b/drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c index 230e6ee96636..d8f1bf5e58a0 100644 --- a/drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c +++ b/drivers/platform/x86/dell/dell-wmi-sysman/passobj-attributes.c @@ -45,7 +45,7 @@ static ssize_t current_password_store(struct kobject *kobj, int length; length = strlen(buf); - if (buf[length-1] == '\n') + if (length && buf[length - 1] == '\n') length--; /* firmware does verifiation of min/max password length, From 34e2f19e0eb9f5ba8538ec3b8ce49e25b6245640 Mon Sep 17 00:00:00 2001 From: "feijuan.li" Date: Wed, 14 May 2025 14:35:11 +0800 Subject: [PATCH 455/529] drm/edid: fixed the bug that hdr metadata was not reset commit 6692dbc15e5ed40a3aa037aced65d7b8826c58cd upstream. When DP connected to a device with HDR capability, the hdr structure was filled.Then connected to another sink device without hdr capability, but the hdr info still exist. Fixes: e85959d6cbe0 ("drm: Parse HDR metadata info from EDID") Cc: # v5.3+ Signed-off-by: "feijuan.li" Reviewed-by: Jani Nikula Link: https://lore.kernel.org/r/20250514063511.4151780-1-feijuan.li@samsung.com Signed-off-by: Jani Nikula Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_edid.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c index 5ed77e3361fd..fc6028aae9bd 100644 --- a/drivers/gpu/drm/drm_edid.c +++ b/drivers/gpu/drm/drm_edid.c @@ -6164,6 +6164,7 @@ static void drm_reset_display_info(struct drm_connector *connector) info->has_hdmi_infoframe = false; info->rgb_quant_range_selectable = false; memset(&info->hdmi, 0, sizeof(info->hdmi)); + memset(&connector->hdr_sink_metadata, 0, sizeof(connector->hdr_sink_metadata)); info->edid_hdmi_rgb444_dc_modes = 0; info->edid_hdmi_ycbcr444_dc_modes = 0; From c8623231e0edfcccb7cc6add0288fa0f0594282f Mon Sep 17 00:00:00 2001 From: Wang Zhaolong Date: Fri, 16 May 2025 17:12:55 +0800 Subject: [PATCH 456/529] smb: client: Fix use-after-free in cifs_fill_dirent commit a7a8fe56e932a36f43e031b398aef92341bf5ea0 upstream. There is a race condition in the readdir concurrency process, which may access the rsp buffer after it has been released, triggering the following KASAN warning. ================================================================== BUG: KASAN: slab-use-after-free in cifs_fill_dirent+0xb03/0xb60 [cifs] Read of size 4 at addr ffff8880099b819c by task a.out/342975 CPU: 2 UID: 0 PID: 342975 Comm: a.out Not tainted 6.15.0-rc6+ #240 PREEMPT(full) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.1-2.fc37 04/01/2014 Call Trace: dump_stack_lvl+0x53/0x70 print_report+0xce/0x640 kasan_report+0xb8/0xf0 cifs_fill_dirent+0xb03/0xb60 [cifs] cifs_readdir+0x12cb/0x3190 [cifs] iterate_dir+0x1a1/0x520 __x64_sys_getdents+0x134/0x220 do_syscall_64+0x4b/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e RIP: 0033:0x7f996f64b9f9 Code: ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 0d f7 c3 0c 00 f7 d8 64 89 8 RSP: 002b:00007f996f53de78 EFLAGS: 00000207 ORIG_RAX: 000000000000004e RAX: ffffffffffffffda RBX: 00007f996f53ecdc RCX: 00007f996f64b9f9 RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00007f996f53dea0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000207 R12: ffffffffffffff88 R13: 0000000000000000 R14: 00007ffc8cd9a500 R15: 00007f996f51e000 Allocated by task 408: kasan_save_stack+0x20/0x40 kasan_save_track+0x14/0x30 __kasan_slab_alloc+0x6e/0x70 kmem_cache_alloc_noprof+0x117/0x3d0 mempool_alloc_noprof+0xf2/0x2c0 cifs_buf_get+0x36/0x80 [cifs] allocate_buffers+0x1d2/0x330 [cifs] cifs_demultiplex_thread+0x22b/0x2690 [cifs] kthread+0x394/0x720 ret_from_fork+0x34/0x70 ret_from_fork_asm+0x1a/0x30 Freed by task 342979: kasan_save_stack+0x20/0x40 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x60 __kasan_slab_free+0x37/0x50 kmem_cache_free+0x2b8/0x500 cifs_buf_release+0x3c/0x70 [cifs] cifs_readdir+0x1c97/0x3190 [cifs] iterate_dir+0x1a1/0x520 __x64_sys_getdents64+0x134/0x220 do_syscall_64+0x4b/0x110 entry_SYSCALL_64_after_hwframe+0x76/0x7e The buggy address belongs to the object at ffff8880099b8000 which belongs to the cache cifs_request of size 16588 The buggy address is located 412 bytes inside of freed 16588-byte region [ffff8880099b8000, ffff8880099bc0cc) The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x99b8 head: order:3 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 anon flags: 0x80000000000040(head|node=0|zone=1) page_type: f5(slab) raw: 0080000000000040 ffff888001e03400 0000000000000000 dead000000000001 raw: 0000000000000000 0000000000010001 00000000f5000000 0000000000000000 head: 0080000000000040 ffff888001e03400 0000000000000000 dead000000000001 head: 0000000000000000 0000000000010001 00000000f5000000 0000000000000000 head: 0080000000000003 ffffea0000266e01 00000000ffffffff 00000000ffffffff head: ffffffffffffffff 0000000000000000 00000000ffffffff 0000000000000008 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8880099b8080: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880099b8100: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8880099b8180: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8880099b8200: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8880099b8280: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ================================================================== POC is available in the link [1]. The problem triggering process is as follows: Process 1 Process 2 ----------------------------------------------------------------- cifs_readdir /* file->private_data == NULL */ initiate_cifs_search cifsFile = kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); smb2_query_dir_first ->query_dir_first() SMB2_query_directory SMB2_query_directory_init cifs_send_recv smb2_parse_query_directory srch_inf->ntwrk_buf_start = (char *)rsp; srch_inf->srch_entries_start = (char *)rsp + ... srch_inf->last_entry = (char *)rsp + ... srch_inf->smallBuf = true; find_cifs_entry /* if (cfile->srch_inf.ntwrk_buf_start) */ cifs_small_buf_release(cfile->srch_inf // free cifs_readdir ->iterate_shared() /* file->private_data != NULL */ find_cifs_entry /* in while (...) loop */ smb2_query_dir_next ->query_dir_next() SMB2_query_directory SMB2_query_directory_init cifs_send_recv compound_send_recv smb_send_rqst __smb_send_rqst rc = -ERESTARTSYS; /* if (fatal_signal_pending()) */ goto out; return rc /* if (cfile->srch_inf.last_entry) */ cifs_save_resume_key() cifs_fill_dirent // UAF /* if (rc) */ return -ENOENT; Fix this by ensuring the return code is checked before using pointers from the srch_inf. Link: https://bugzilla.kernel.org/show_bug.cgi?id=220131 [1] Fixes: a364bc0b37f1 ("[CIFS] fix saving of resume key before CIFSFindNext") Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Wang Zhaolong Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/client/readdir.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 9a1f1913fb59..338e4cd81fbd 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -788,11 +788,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, rc = server->ops->query_dir_next(xid, tcon, &cfile->fid, search_flags, &cfile->srch_inf); + if (rc) + return -ENOENT; /* FindFirst/Next set last_entry to NULL on malformed reply */ if (cfile->srch_inf.last_entry) cifs_save_resume_key(cfile->srch_inf.last_entry, cfile); - if (rc) - return -ENOENT; } if (index_to_find < cfile->srch_inf.index_of_last_entry) { /* we found the buffer that contains the entry */ From 080da7b2fc28abdad5cdacf569a1ce83c6c27752 Mon Sep 17 00:00:00 2001 From: Wang Zhaolong Date: Fri, 16 May 2025 17:12:56 +0800 Subject: [PATCH 457/529] smb: client: Reset all search buffer pointers when releasing buffer commit e48f9d849bfdec276eebf782a84fd4dfbe1c14c0 upstream. Multiple pointers in struct cifs_search_info (ntwrk_buf_start, srch_entries_start, and last_entry) point to the same allocated buffer. However, when freeing this buffer, only ntwrk_buf_start was set to NULL, while the other pointers remained pointing to freed memory. This is defensive programming to prevent potential issues with stale pointers. While the active UAF vulnerability is fixed by the previous patch, this change ensures consistent pointer state and more robust error handling. Signed-off-by: Wang Zhaolong Cc: stable@vger.kernel.org Reviewed-by: Paulo Alcantara (Red Hat) Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/smb/client/readdir.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 338e4cd81fbd..402ac06ae9b6 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -765,7 +765,10 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, else cifs_buf_release(cfile->srch_inf. ntwrk_buf_start); + /* Reset all pointers to the network buffer to prevent stale references */ cfile->srch_inf.ntwrk_buf_start = NULL; + cfile->srch_inf.srch_entries_start = NULL; + cfile->srch_inf.last_entry = NULL; } rc = initiate_cifs_search(xid, file, full_path); if (rc) { From 9c40d1f7b75fc93d7ef02acc3a2a712cb057e576 Mon Sep 17 00:00:00 2001 From: Mario Limonciello Date: Thu, 22 May 2025 09:13:28 -0500 Subject: [PATCH 458/529] Revert "drm/amd: Keep display off while going into S4" commit 7e7cb7a13c81073d38a10fa7b450d23712281ec4 upstream. commit 68bfdc8dc0a1a ("drm/amd: Keep display off while going into S4") attempted to keep displays off during the S4 sequence by not resuming display IP. This however leads to hangs because DRM clients such as the console can try to access registers and cause a hang. Closes: https://gitlab.freedesktop.org/drm/amd/-/issues/4155 Fixes: 68bfdc8dc0a1a ("drm/amd: Keep display off while going into S4") Reviewed-by: Alex Deucher Link: https://lore.kernel.org/r/20250522141328.115095-1-mario.limonciello@amd.com Signed-off-by: Mario Limonciello Signed-off-by: Alex Deucher (cherry picked from commit e485502c37b097b0bd773baa7e2741bf7bd2909a) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c index 998dde73ecc6..64f626cc7913 100644 --- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c @@ -2897,11 +2897,6 @@ static int dm_resume(void *handle) return 0; } - - /* leave display off for S4 sequence */ - if (adev->in_s4) - return 0; - /* Recreate dc_state - DC invalidates it when setting power state to S3. */ dc_release_state(dm_state->context); dm_state->context = dc_create_state(dm->dc); From 460664bf8baab45876316d3bb308403b114c34cf Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Fri, 23 May 2025 10:21:06 -0700 Subject: [PATCH 459/529] memcg: always call cond_resched() after fn() commit 06717a7b6c86514dbd6ab322e8083ffaa4db5712 upstream. I am seeing soft lockup on certain machine types when a cgroup OOMs. This is happening because killing the process in certain machine might be very slow, which causes the soft lockup and RCU stalls. This happens usually when the cgroup has MANY processes and memory.oom.group is set. Example I am seeing in real production: [462012.244552] Memory cgroup out of memory: Killed process 3370438 (crosvm) .... .... [462037.318059] Memory cgroup out of memory: Killed process 4171372 (adb) .... [462037.348314] watchdog: BUG: soft lockup - CPU#64 stuck for 26s! [stat_manager-ag:1618982] .... Quick look at why this is so slow, it seems to be related to serial flush for certain machine types. For all the crashes I saw, the target CPU was at console_flush_all(). In the case above, there are thousands of processes in the cgroup, and it is soft locking up before it reaches the 1024 limit in the code (which would call the cond_resched()). So, cond_resched() in 1024 blocks is not sufficient. Remove the counter-based conditional rescheduling logic and call cond_resched() unconditionally after each task iteration, after fn() is called. This avoids the lockup independently of how slow fn() is. Link: https://lkml.kernel.org/r/20250523-memcg_fix-v1-1-ad3eafb60477@debian.org Fixes: ade81479c7dd ("memcg: fix soft lockup in the OOM process") Signed-off-by: Breno Leitao Suggested-by: Rik van Riel Acked-by: Shakeel Butt Cc: Michael van der Westhuizen Cc: Usama Arif Cc: Pavel Begunkov Cc: Chen Ridong Cc: Greg Kroah-Hartman Cc: Johannes Weiner Cc: Michal Hocko Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/memcontrol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3f7cab196eb6..420efa1d2b20 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1242,7 +1242,6 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { struct mem_cgroup *iter; int ret = 0; - int i = 0; BUG_ON(memcg == root_mem_cgroup); @@ -1252,10 +1251,9 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); while (!ret && (task = css_task_iter_next(&it))) { - /* Avoid potential softlockup warning */ - if ((++i & 1023) == 0) - cond_resched(); ret = fn(task, arg); + /* Avoid potential softlockup warning */ + cond_resched(); } css_task_iter_end(&it); if (ret) { From b9f7969173ba8ffe0fbe0eb4ccd09a485151cc2f Mon Sep 17 00:00:00 2001 From: Tianyang Zhang Date: Wed, 16 Apr 2025 16:24:05 +0800 Subject: [PATCH 460/529] mm/page_alloc.c: avoid infinite retries caused by cpuset race commit e05741fb10c38d70bbd7ec12b23c197b6355d519 upstream. __alloc_pages_slowpath has no change detection for ac->nodemask in the part of retry path, while cpuset can modify it in parallel. For some processes that set mempolicy as MPOL_BIND, this results ac->nodemask changes, and then the should_reclaim_retry will judge based on the latest nodemask and jump to retry, while the get_page_from_freelist only traverses the zonelist from ac->preferred_zoneref, which selected by a expired nodemask and may cause infinite retries in some cases cpu 64: __alloc_pages_slowpath { /* ..... */ retry: /* ac->nodemask = 0x1, ac->preferred->zone->nid = 1 */ if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); /* cpu 1: cpuset_write_resmask update_nodemask update_nodemasks_hier update_tasks_nodemask mpol_rebind_task mpol_rebind_policy mpol_rebind_nodemask // mempolicy->nodes has been modified, // which ac->nodemask point to */ /* ac->nodemask = 0x3, ac->preferred->zone->nid = 1 */ if (should_reclaim_retry(gfp_mask, order, ac, alloc_flags, did_some_progress > 0, &no_progress_loops)) goto retry; } Simultaneously starting multiple cpuset01 from LTP can quickly reproduce this issue on a multi node server when the maximum memory pressure is reached and the swap is enabled Link: https://lkml.kernel.org/r/20250416082405.20988-1-zhangtianyang@loongson.cn Fixes: c33d6c06f60f ("mm, page_alloc: avoid looking up the first zone in a zonelist twice") Signed-off-by: Tianyang Zhang Reviewed-by: Suren Baghdasaryan Reviewed-by: Vlastimil Babka Cc: Michal Hocko Cc: Brendan Jackman Cc: Johannes Weiner Cc: Zi Yan Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 65ad214e21f3..5a2bae590ed7 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5195,6 +5195,14 @@ restart: } retry: + /* + * Deal with possible cpuset update races or zonelist updates to avoid + * infinite retries. + */ + if (check_retry_cpuset(cpuset_mems_cookie, ac) || + check_retry_zonelist(zonelist_iter_cookie)) + goto restart; + /* Ensure kswapd doesn't accidentally go to sleep as long as we loop */ if (alloc_flags & ALLOC_KSWAPD) wake_all_kswapds(order, gfp_mask, ac); From 4c8d92233719f09efa419e8dd02e0c95dab02c18 Mon Sep 17 00:00:00 2001 From: Jernej Skrabec Date: Sun, 13 Apr 2025 15:58:48 +0200 Subject: [PATCH 461/529] Revert "arm64: dts: allwinner: h6: Use RSB for AXP805 PMIC connection" [ Upstream commit 573f99c7585f597630f14596550c79e73ffaeef4 ] This reverts commit 531fdbeedeb89bd32018a35c6e137765c9cc9e97. Hardware that uses I2C wasn't designed with high speeds in mind, so communication with PMIC via RSB can intermittently fail. Go back to I2C as higher speed and efficiency isn't worth the trouble. Fixes: 531fdbeedeb8 ("arm64: dts: allwinner: h6: Use RSB for AXP805 PMIC connection") Link: https://github.com/LibreELEC/LibreELEC.tv/issues/7731 Signed-off-by: Jernej Skrabec Link: https://patch.msgid.link/20250413135848.67283-1-jernej.skrabec@gmail.com Signed-off-by: Chen-Yu Tsai Signed-off-by: Sasha Levin --- .../dts/allwinner/sun50i-h6-beelink-gs1.dts | 38 +++++++++---------- .../dts/allwinner/sun50i-h6-orangepi-3.dts | 14 +++---- .../dts/allwinner/sun50i-h6-orangepi.dtsi | 22 +++++------ 3 files changed, 37 insertions(+), 37 deletions(-) diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts index 381d58cea092..c854c7e31051 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-beelink-gs1.dts @@ -151,28 +151,12 @@ vcc-pg-supply = <®_aldo1>; }; -&r_ir { - linux,rc-map-name = "rc-beelink-gs1"; - status = "okay"; -}; - -&r_pio { - /* - * FIXME: We can't add that supply for now since it would - * create a circular dependency between pinctrl, the regulator - * and the RSB Bus. - * - * vcc-pl-supply = <®_aldo1>; - */ - vcc-pm-supply = <®_aldo1>; -}; - -&r_rsb { +&r_i2c { status = "okay"; - axp805: pmic@745 { + axp805: pmic@36 { compatible = "x-powers,axp805", "x-powers,axp806"; - reg = <0x745>; + reg = <0x36>; interrupt-parent = <&r_intc>; interrupts = ; interrupt-controller; @@ -290,6 +274,22 @@ }; }; +&r_ir { + linux,rc-map-name = "rc-beelink-gs1"; + status = "okay"; +}; + +&r_pio { + /* + * PL0 and PL1 are used for PMIC I2C + * don't enable the pl-supply else + * it will fail at boot + * + * vcc-pl-supply = <®_aldo1>; + */ + vcc-pm-supply = <®_aldo1>; +}; + &spdif { pinctrl-names = "default"; pinctrl-0 = <&spdif_tx_pin>; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts index 6fc65e8db220..8c476e089185 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi-3.dts @@ -175,16 +175,12 @@ vcc-pg-supply = <®_vcc_wifi_io>; }; -&r_ir { - status = "okay"; -}; - -&r_rsb { +&r_i2c { status = "okay"; - axp805: pmic@745 { + axp805: pmic@36 { compatible = "x-powers,axp805", "x-powers,axp806"; - reg = <0x745>; + reg = <0x36>; interrupt-parent = <&r_intc>; interrupts = ; interrupt-controller; @@ -295,6 +291,10 @@ }; }; +&r_ir { + status = "okay"; +}; + &rtc { clocks = <&ext_osc32k>; }; diff --git a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi index 92745128fcfe..4ec4996592be 100644 --- a/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi +++ b/arch/arm64/boot/dts/allwinner/sun50i-h6-orangepi.dtsi @@ -112,20 +112,12 @@ vcc-pg-supply = <®_aldo1>; }; -&r_ir { - status = "okay"; -}; - -&r_pio { - vcc-pm-supply = <®_bldo3>; -}; - -&r_rsb { +&r_i2c { status = "okay"; - axp805: pmic@745 { + axp805: pmic@36 { compatible = "x-powers,axp805", "x-powers,axp806"; - reg = <0x745>; + reg = <0x36>; interrupt-parent = <&r_intc>; interrupts = ; interrupt-controller; @@ -240,6 +232,14 @@ }; }; +&r_ir { + status = "okay"; +}; + +&r_pio { + vcc-pm-supply = <®_bldo3>; +}; + &rtc { clocks = <&ext_osc32k>; }; From 795cea4731af29cf2f7f51c9272ac48a4396d145 Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Thu, 8 May 2025 16:46:11 +0900 Subject: [PATCH 462/529] ksmbd: fix stream write failure [ Upstream commit 1f4bbedd4e5a69b01cde2cc21d01151ab2d0884f ] If there is no stream data in file, v_len is zero. So, If position(*pos) is zero, stream write will fail due to stream write position validation check. This patch reorganize stream write position validation. Fixes: 0ca6df4f40cf ("ksmbd: prevent out-of-bounds stream writes by validating *pos") Signed-off-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/server/vfs.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index cf1b241a1578..fa647b75fba8 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -423,10 +423,15 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n", *pos, count); + if (*pos >= XATTR_SIZE_MAX) { + pr_err("stream write position %lld is out of bounds\n", *pos); + return -EINVAL; + } + size = *pos + count; if (size > XATTR_SIZE_MAX) { size = XATTR_SIZE_MAX; - count = (*pos + count) - XATTR_SIZE_MAX; + count = XATTR_SIZE_MAX - *pos; } v_len = ksmbd_vfs_getcasexattr(user_ns, @@ -440,13 +445,6 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, goto out; } - if (v_len <= *pos) { - pr_err("stream write position %lld is out of bounds (stream length: %zd)\n", - *pos, v_len); - err = -EINVAL; - goto out; - } - if (v_len < size) { wbuf = kvzalloc(size, GFP_KERNEL); if (!wbuf) { From f48dc6849cdd78f4822491391077740d0c770fee Mon Sep 17 00:00:00 2001 From: Larisa Grigore Date: Thu, 22 May 2025 15:51:30 +0100 Subject: [PATCH 463/529] spi: spi-fsl-dspi: restrict register range for regmap access [ Upstream commit 283ae0c65e9c592f4a1ba4f31917f5e766da7f31 ] DSPI registers are NOT continuous, some registers are reserved and accessing them from userspace will trigger external abort, add regmap register access table to avoid below abort. For example on S32G: # cat /sys/kernel/debug/regmap/401d8000.spi/registers Internal error: synchronous external abort: 96000210 1 PREEMPT SMP ... Call trace: regmap_mmio_read32le+0x24/0x48 regmap_mmio_read+0x48/0x70 _regmap_bus_reg_read+0x38/0x48 _regmap_read+0x68/0x1b0 regmap_read+0x50/0x78 regmap_read_debugfs+0x120/0x338 Fixes: 1acbdeb92c87 ("spi/fsl-dspi: Convert to use regmap and add big-endian support") Co-developed-by: Xulin Sun Signed-off-by: Xulin Sun Signed-off-by: Larisa Grigore Signed-off-by: James Clark Link: https://patch.msgid.link/20250522-james-nxp-spi-v2-1-bea884630cfb@linaro.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-fsl-dspi.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 01930b52c4fb..3f9609edac94 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0+ // // Copyright 2013 Freescale Semiconductor, Inc. -// Copyright 2020 NXP +// Copyright 2020-2025 NXP // // Freescale DSPI driver // This file contains a driver for the Freescale DSPI @@ -1128,6 +1128,20 @@ static int dspi_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(dspi_pm, dspi_suspend, dspi_resume); +static const struct regmap_range dspi_yes_ranges[] = { + regmap_reg_range(SPI_MCR, SPI_MCR), + regmap_reg_range(SPI_TCR, SPI_CTAR(3)), + regmap_reg_range(SPI_SR, SPI_TXFR3), + regmap_reg_range(SPI_RXFR0, SPI_RXFR3), + regmap_reg_range(SPI_CTARE(0), SPI_CTARE(3)), + regmap_reg_range(SPI_SREX, SPI_SREX), +}; + +static const struct regmap_access_table dspi_access_table = { + .yes_ranges = dspi_yes_ranges, + .n_yes_ranges = ARRAY_SIZE(dspi_yes_ranges), +}; + static const struct regmap_range dspi_volatile_ranges[] = { regmap_reg_range(SPI_MCR, SPI_TCR), regmap_reg_range(SPI_SR, SPI_SR), @@ -1145,6 +1159,8 @@ static const struct regmap_config dspi_regmap_config = { .reg_stride = 4, .max_register = 0x88, .volatile_table = &dspi_volatile_table, + .rd_table = &dspi_access_table, + .wr_table = &dspi_access_table, }; static const struct regmap_range dspi_xspi_volatile_ranges[] = { @@ -1166,6 +1182,8 @@ static const struct regmap_config dspi_xspi_regmap_config[] = { .reg_stride = 4, .max_register = 0x13c, .volatile_table = &dspi_xspi_volatile_table, + .rd_table = &dspi_access_table, + .wr_table = &dspi_access_table, }, { .name = "pushr", From 8bdaffcdd7c6edc46071ace7a392ad2ca2ae4024 Mon Sep 17 00:00:00 2001 From: Bogdan-Gabriel Roman Date: Thu, 22 May 2025 15:51:31 +0100 Subject: [PATCH 464/529] spi: spi-fsl-dspi: Halt the module after a new message transfer [ Upstream commit 8a30a6d35a11ff5ccdede7d6740765685385a917 ] The XSPI mode implementation in this driver still uses the EOQ flag to signal the last word in a transmission and deassert the PCS signal. However, at speeds lower than ~200kHZ, the PCS signal seems to remain asserted even when SR[EOQF] = 1 indicates the end of a transmission. This is a problem for target devices which require the deassertation of the PCS signal between transfers. Hence, this commit 'forces' the deassertation of the PCS by stopping the module through MCR[HALT] after completing a new transfer. According to the reference manual, the module stops or transitions from the Running state to the Stopped state after the current frame, when any one of the following conditions exist: - The value of SR[EOQF] = 1. - The chip is in Debug mode and the value of MCR[FRZ] = 1. - The value of MCR[HALT] = 1. This shouldn't be done if the last transfer in the message has cs_change set. Fixes: ea93ed4c181b ("spi: spi-fsl-dspi: Use EOQ for last word in buffer even for XSPI mode") Signed-off-by: Bogdan-Gabriel Roman Signed-off-by: Larisa Grigore Signed-off-by: James Clark Link: https://patch.msgid.link/20250522-james-nxp-spi-v2-2-bea884630cfb@linaro.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-fsl-dspi.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 3f9609edac94..93cdc52f0fb0 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -61,6 +61,7 @@ #define SPI_SR_TFIWF BIT(18) #define SPI_SR_RFDF BIT(17) #define SPI_SR_CMDFFF BIT(16) +#define SPI_SR_TXRXS BIT(30) #define SPI_SR_CLEAR (SPI_SR_TCFQF | \ SPI_SR_TFUF | SPI_SR_TFFF | \ SPI_SR_CMDTCF | SPI_SR_SPEF | \ @@ -907,9 +908,20 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, struct spi_device *spi = message->spi; struct spi_transfer *transfer; int status = 0; + u32 val = 0; + bool cs_change = false; message->actual_length = 0; + /* Put DSPI in running mode if halted. */ + regmap_read(dspi->regmap, SPI_MCR, &val); + if (val & SPI_MCR_HALT) { + regmap_update_bits(dspi->regmap, SPI_MCR, SPI_MCR_HALT, 0); + while (regmap_read(dspi->regmap, SPI_SR, &val) >= 0 && + !(val & SPI_SR_TXRXS)) + ; + } + list_for_each_entry(transfer, &message->transfers, transfer_list) { dspi->cur_transfer = transfer; dspi->cur_msg = message; @@ -934,6 +946,7 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, dspi->tx_cmd |= SPI_PUSHR_CMD_CONT; } + cs_change = transfer->cs_change; dspi->tx = transfer->tx_buf; dspi->rx = transfer->rx_buf; dspi->len = transfer->len; @@ -966,6 +979,15 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, spi_transfer_delay_exec(transfer); } + if (status || !cs_change) { + /* Put DSPI in stop mode */ + regmap_update_bits(dspi->regmap, SPI_MCR, + SPI_MCR_HALT, SPI_MCR_HALT); + while (regmap_read(dspi->regmap, SPI_SR, &val) >= 0 && + val & SPI_SR_TXRXS) + ; + } + message->status = status; spi_finalize_current_message(ctlr); @@ -1206,6 +1228,8 @@ static int dspi_init(struct fsl_dspi *dspi) if (!spi_controller_is_slave(dspi->ctlr)) mcr |= SPI_MCR_MASTER; + mcr |= SPI_MCR_HALT; + regmap_write(dspi->regmap, SPI_MCR, mcr); regmap_write(dspi->regmap, SPI_SR, SPI_SR_CLEAR); From 2c29113154f510e25485e8a41ef109f7c223cacf Mon Sep 17 00:00:00 2001 From: Larisa Grigore Date: Thu, 22 May 2025 15:51:32 +0100 Subject: [PATCH 465/529] spi: spi-fsl-dspi: Reset SR flags before sending a new message [ Upstream commit 7aba292eb15389073c7f3bd7847e3862dfdf604d ] If, in a previous transfer, the controller sends more data than expected by the DSPI target, SR.RFDF (RX FIFO is not empty) will remain asserted. When flushing the FIFOs at the beginning of a new transfer (writing 1 into MCR.CLR_TXF and MCR.CLR_RXF), SR.RFDF should also be cleared. Otherwise, when running in target mode with DMA, if SR.RFDF remains asserted, the DMA callback will be fired before the controller sends any data. Take this opportunity to reset all Status Register fields. Fixes: 5ce3cc567471 ("spi: spi-fsl-dspi: Provide support for DSPI slave mode operation (Vybryd vf610)") Signed-off-by: Larisa Grigore Signed-off-by: James Clark Link: https://patch.msgid.link/20250522-james-nxp-spi-v2-3-bea884630cfb@linaro.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-fsl-dspi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/spi/spi-fsl-dspi.c b/drivers/spi/spi-fsl-dspi.c index 93cdc52f0fb0..5374c6d44519 100644 --- a/drivers/spi/spi-fsl-dspi.c +++ b/drivers/spi/spi-fsl-dspi.c @@ -956,6 +956,8 @@ static int dspi_transfer_one_message(struct spi_controller *ctlr, SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF, SPI_MCR_CLR_TXF | SPI_MCR_CLR_RXF); + regmap_write(dspi->regmap, SPI_SR, SPI_SR_CLEAR); + spi_take_timestamp_pre(dspi->ctlr, dspi->cur_transfer, dspi->progress, !dspi->irq); From 9502ebf2d2143f9086df9a1375141cef0f5e1899 Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Tue, 6 May 2025 14:02:01 -0700 Subject: [PATCH 466/529] kbuild: Disable -Wdefault-const-init-unsafe commit d0afcfeb9e3810ec89d1ffde1a0e36621bb75dca upstream. A new on by default warning in clang [1] aims to flags instances where const variables without static or thread local storage or const members in aggregate types are not initialized because it can lead to an indeterminate value. This is quite noisy for the kernel due to instances originating from header files such as: drivers/gpu/drm/i915/gt/intel_ring.h:62:2: error: default initialization of an object of type 'typeof (ring->size)' (aka 'const unsigned int') leaves the object uninitialized [-Werror,-Wdefault-const-init-var-unsafe] 62 | typecheck(typeof(ring->size), next); | ^ include/linux/typecheck.h:10:9: note: expanded from macro 'typecheck' 10 | ({ type __dummy; \ | ^ include/net/ip.h:478:14: error: default initialization of an object of type 'typeof (rt->dst.expires)' (aka 'const unsigned long') leaves the object uninitialized [-Werror,-Wdefault-const-init-var-unsafe] 478 | if (mtu && time_before(jiffies, rt->dst.expires)) | ^ include/linux/jiffies.h:138:26: note: expanded from macro 'time_before' 138 | #define time_before(a,b) time_after(b,a) | ^ include/linux/jiffies.h:128:3: note: expanded from macro 'time_after' 128 | (typecheck(unsigned long, a) && \ | ^ include/linux/typecheck.h:11:12: note: expanded from macro 'typecheck' 11 | typeof(x) __dummy2; \ | ^ include/linux/list.h:409:27: warning: default initialization of an object of type 'union (unnamed union at include/linux/list.h:409:27)' with const member leaves the object uninitialized [-Wdefault-const-init-field-unsafe] 409 | struct list_head *next = smp_load_acquire(&head->next); | ^ include/asm-generic/barrier.h:176:29: note: expanded from macro 'smp_load_acquire' 176 | #define smp_load_acquire(p) __smp_load_acquire(p) | ^ arch/arm64/include/asm/barrier.h:164:59: note: expanded from macro '__smp_load_acquire' 164 | union { __unqual_scalar_typeof(*p) __val; char __c[1]; } __u; \ | ^ include/linux/list.h:409:27: note: member '__val' declared 'const' here crypto/scatterwalk.c:66:22: error: default initialization of an object of type 'struct scatter_walk' with const member leaves the object uninitialized [-Werror,-Wdefault-const-init-field-unsafe] 66 | struct scatter_walk walk; | ^ include/crypto/algapi.h:112:15: note: member 'addr' declared 'const' here 112 | void *const addr; | ^ fs/hugetlbfs/inode.c:733:24: error: default initialization of an object of type 'struct vm_area_struct' with const member leaves the object uninitialized [-Werror,-Wdefault-const-init-field-unsafe] 733 | struct vm_area_struct pseudo_vma; | ^ include/linux/mm_types.h:803:20: note: member 'vm_flags' declared 'const' here 803 | const vm_flags_t vm_flags; | ^ Silencing the instances from typecheck.h is difficult because '= {}' is not available in older but supported compilers and '= {0}' would cause warnings about a literal 0 being treated as NULL. While it might be possible to come up with a local hack to silence the warning for clang-21+, it may not be worth it since -Wuninitialized will still trigger if an uninitialized const variable is actually used. In all audited cases of the "field" variant of the warning, the members are either not used in the particular call path, modified through other means such as memset() / memcpy() because the containing object is not const, or are within a union with other non-const members. Since this warning does not appear to have a high signal to noise ratio, just disable it. Cc: stable@vger.kernel.org Link: https://github.com/llvm/llvm-project/commit/576161cb6069e2c7656a8ef530727a0f4aefff30 [1] Reported-by: Linux Kernel Functional Testing Closes: https://lore.kernel.org/CA+G9fYuNjKcxFKS_MKPRuga32XbndkLGcY-PVuoSwzv6VWbY=w@mail.gmail.com/ Reported-by: Marcus Seyfarth Closes: https://github.com/ClangBuiltLinux/linux/issues/2088 Signed-off-by: Nathan Chancellor Signed-off-by: Masahiro Yamada [nathan: Apply change to Makefile instead of scripts/Makefile.extrawarn due to lack of e88ca24319e4 in older stable branches] Signed-off-by: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- Makefile | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/Makefile b/Makefile index f86e26fa0b31..9a7bc4372e35 100644 --- a/Makefile +++ b/Makefile @@ -875,6 +875,18 @@ ifdef CONFIG_CC_IS_CLANG KBUILD_CPPFLAGS += -Qunused-arguments # The kernel builds with '-std=gnu11' so use of GNU extensions is acceptable. KBUILD_CFLAGS += -Wno-gnu + +# Clang may emit a warning when a const variable, such as the dummy variables +# in typecheck(), or const member of an aggregate type are not initialized, +# which can result in unexpected behavior. However, in many audited cases of +# the "field" variant of the warning, this is intentional because the field is +# never used within a particular call path, the field is within a union with +# other non-const members, or the containing object is not const so the field +# can be modified via memcpy() / memset(). While the variable warning also gets +# disabled with this same switch, there should not be too much coverage lost +# because -Wuninitialized will still flag when an uninitialized const variable +# is used. +KBUILD_CFLAGS += $(call cc-disable-warning, default-const-init-unsafe) else # gcc inanely warns about local variables called 'main' From c0c0b03ea1ab93a617c3bccde2e8935b4fe81fe4 Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Tue, 4 Mar 2025 20:06:11 +0100 Subject: [PATCH 467/529] serial: sh-sci: Save and restore more registers commit 81100b9a7b0515132996d62a7a676a77676cb6e3 upstream. On (H)SCIF with a Baud Rate Generator for External Clock (BRG), there are multiple ways to configure the requested serial speed. If firmware uses a different method than Linux, and if any debug info is printed after the Bit Rate Register (SCBRR) is restored, but before termios is reconfigured (which configures the alternative method), the system may lock-up during resume. Fix this by saving and restoring the contents of the BRG Frequency Division (SCDL) and Clock Select (SCCKS) registers as well. Also save and restore the HSCIF's Sampling Rate Register (HSSRR), which configures the sampling point, and the SCIFA/SCIFB's Serial Port Control and Data Registers (SCPCR/SCPDR), which configure the optional control flow signals. After this, all registers that are not saved/restored are either: - read-only, - write-only, - status registers containing flags with clear-after-set semantics, - FIFO Data Count Trigger registers, which do not matter much for the serial console. Fixes: 22a6984c5b5df8ea ("serial: sh-sci: Update the suspend/resume support") Signed-off-by: Geert Uytterhoeven Tested-by: Claudiu Beznea Reviewed-by: Claudiu Beznea Link: https://lore.kernel.org/r/11c2eab45d48211e75d8b8202cce60400880fe55.1741114989.git.geert+renesas@glider.be Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/tty/serial/sh-sci.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index 191136dcb94d..ed468b676f0b 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -106,10 +106,15 @@ struct plat_sci_reg { }; struct sci_suspend_regs { + u16 scdl; + u16 sccks; u16 scsmr; u16 scscr; u16 scfcr; u16 scsptr; + u16 hssrr; + u16 scpcr; + u16 scpdr; u8 scbrr; u8 semr; }; @@ -3418,6 +3423,10 @@ static void sci_console_save(struct sci_port *s) struct sci_suspend_regs *regs = &s->suspend_regs; struct uart_port *port = &s->port; + if (sci_getreg(port, SCDL)->size) + regs->scdl = sci_serial_in(port, SCDL); + if (sci_getreg(port, SCCKS)->size) + regs->sccks = sci_serial_in(port, SCCKS); if (sci_getreg(port, SCSMR)->size) regs->scsmr = sci_serial_in(port, SCSMR); if (sci_getreg(port, SCSCR)->size) @@ -3428,6 +3437,12 @@ static void sci_console_save(struct sci_port *s) regs->scsptr = sci_serial_in(port, SCSPTR); if (sci_getreg(port, SCBRR)->size) regs->scbrr = sci_serial_in(port, SCBRR); + if (sci_getreg(port, HSSRR)->size) + regs->hssrr = sci_serial_in(port, HSSRR); + if (sci_getreg(port, SCPCR)->size) + regs->scpcr = sci_serial_in(port, SCPCR); + if (sci_getreg(port, SCPDR)->size) + regs->scpdr = sci_serial_in(port, SCPDR); if (sci_getreg(port, SEMR)->size) regs->semr = sci_serial_in(port, SEMR); } @@ -3437,6 +3452,10 @@ static void sci_console_restore(struct sci_port *s) struct sci_suspend_regs *regs = &s->suspend_regs; struct uart_port *port = &s->port; + if (sci_getreg(port, SCDL)->size) + sci_serial_out(port, SCDL, regs->scdl); + if (sci_getreg(port, SCCKS)->size) + sci_serial_out(port, SCCKS, regs->sccks); if (sci_getreg(port, SCSMR)->size) sci_serial_out(port, SCSMR, regs->scsmr); if (sci_getreg(port, SCSCR)->size) @@ -3447,6 +3466,12 @@ static void sci_console_restore(struct sci_port *s) sci_serial_out(port, SCSPTR, regs->scsptr); if (sci_getreg(port, SCBRR)->size) sci_serial_out(port, SCBRR, regs->scbrr); + if (sci_getreg(port, HSSRR)->size) + sci_serial_out(port, HSSRR, regs->hssrr); + if (sci_getreg(port, SCPCR)->size) + sci_serial_out(port, SCPCR, regs->scpcr); + if (sci_getreg(port, SCPDR)->size) + sci_serial_out(port, SCPDR, regs->scpdr); if (sci_getreg(port, SEMR)->size) sci_serial_out(port, SEMR, regs->semr); } From 79c9df2d7a2b126a7e020e41fe8638c699471aeb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 19 Mar 2025 10:05:47 +0300 Subject: [PATCH 468/529] pinctrl: tegra: Fix off by one in tegra_pinctrl_get_group() commit 5a062c3c3b82004766bc3ece82b594d337076152 upstream. This should be >= pmx->soc->ngroups instead of > to avoid an out of bounds access. The pmx->soc->groups[] array is allocated in tegra_pinctrl_probe(). Fixes: c12bfa0fee65 ("pinctrl-tegra: Restore SFSEL bit when freeing pins") Signed-off-by: Dan Carpenter Reviewed-by: Kunwu Chan Link: https://lore.kernel.org/82b40d9d-b437-42a9-9eb3-2328aa6877ac@stanley.mountain Signed-off-by: Linus Walleij Signed-off-by: Greg Kroah-Hartman --- drivers/pinctrl/tegra/pinctrl-tegra.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pinctrl/tegra/pinctrl-tegra.c b/drivers/pinctrl/tegra/pinctrl-tegra.c index ba7bcc876e30..78de32666d56 100644 --- a/drivers/pinctrl/tegra/pinctrl-tegra.c +++ b/drivers/pinctrl/tegra/pinctrl-tegra.c @@ -305,7 +305,7 @@ static const struct tegra_pingroup *tegra_pinctrl_get_group(struct pinctrl_dev * { struct tegra_pmx *pmx = pinctrl_dev_get_drvdata(pctldev); - if (group_index < 0 || group_index > pmx->soc->ngroups) + if (group_index < 0 || group_index >= pmx->soc->ngroups) return NULL; return &pmx->soc->groups[group_index]; From bb6e80472ea9ba617e2ed403c679c72617fbf6de Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 19 Mar 2025 09:08:01 -0700 Subject: [PATCH 469/529] i3c: master: svc: Fix implicit fallthrough in svc_i3c_master_ibi_work() commit e8d2d287e26d9bd9114cf258a123a6b70812442e upstream. Clang warns (or errors with CONFIG_WERROR=y): drivers/i3c/master/svc-i3c-master.c:596:2: error: unannotated fall-through between switch labels [-Werror,-Wimplicit-fallthrough] 596 | default: | ^ drivers/i3c/master/svc-i3c-master.c:596:2: note: insert 'break;' to avoid fall-through 596 | default: | ^ | break; 1 error generated. Clang is a little more pedantic than GCC, which does not warn when falling through to a case that is just break or return. Clang's version is more in line with the kernel's own stance in deprecated.rst, which states that all switch/case blocks must end in either break, fallthrough, continue, goto, or return. Add the missing break to silence the warning. Fixes: 0430bf9bc1ac ("i3c: master: svc: Fix missing STOP for master request") Signed-off-by: Nathan Chancellor Link: https://lore.kernel.org/r/20250319-i3c-fix-clang-fallthrough-v1-1-d8e02be1ef5c@kernel.org Signed-off-by: Alexandre Belloni Signed-off-by: Greg Kroah-Hartman --- drivers/i3c/master/svc-i3c-master.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/i3c/master/svc-i3c-master.c b/drivers/i3c/master/svc-i3c-master.c index 8cf3aa800d89..9b287f92d078 100644 --- a/drivers/i3c/master/svc-i3c-master.c +++ b/drivers/i3c/master/svc-i3c-master.c @@ -496,6 +496,7 @@ static void svc_i3c_master_ibi_work(struct work_struct *work) break; case SVC_I3C_MSTATUS_IBITYPE_MASTER_REQUEST: svc_i3c_master_emit_stop(master); + break; default: break; } From 50026aa530a693a848827c6371a3cc9476fd299b Mon Sep 17 00:00:00 2001 From: Balbir Singh Date: Tue, 1 Apr 2025 11:07:52 +1100 Subject: [PATCH 470/529] x86/mm/init: Handle the special case of device private pages in add_pages(), to not increase max_pfn and trigger dma_addressing_limited() bounce buffers bounce buffers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7170130e4c72ce0caa0cb42a1627c635cc262821 upstream. As Bert Karwatzki reported, the following recent commit causes a performance regression on AMD iGPU and dGPU systems: 7ffb791423c7 ("x86/kaslr: Reduce KASLR entropy on most x86 systems") It exposed a bug with nokaslr and zone device interaction. The root cause of the bug is that, the GPU driver registers a zone device private memory region. When KASLR is disabled or the above commit is applied, the direct_map_physmem_end is set to much higher than 10 TiB typically to the 64TiB address. When zone device private memory is added to the system via add_pages(), it bumps up the max_pfn to the same value. This causes dma_addressing_limited() to return true, since the device cannot address memory all the way up to max_pfn. This caused a regression for games played on the iGPU, as it resulted in the DMA32 zone being used for GPU allocations. Fix this by not bumping up max_pfn on x86 systems, when pgmap is passed into add_pages(). The presence of pgmap is used to determine if device private memory is being added via add_pages(). More details: devm_request_mem_region() and request_free_mem_region() request for device private memory. iomem_resource is passed as the base resource with start and end parameters. iomem_resource's end depends on several factors, including the platform and virtualization. On x86 for example on bare metal, this value is set to boot_cpu_data.x86_phys_bits. boot_cpu_data.x86_phys_bits can change depending on support for MKTME. By default it is set to the same as log2(direct_map_physmem_end) which is 46 to 52 bits depending on the number of levels in the page table. The allocation routines used iomem_resource's end and direct_map_physmem_end to figure out where to allocate the region. [ arch/powerpc is also impacted by this problem, but this patch does not fix the issue for PowerPC. ] Testing: 1. Tested on a virtual machine with test_hmm for zone device inseration 2. A previous version of this patch was tested by Bert, please see: https://lore.kernel.org/lkml/d87680bab997fdc9fb4e638983132af235d9a03a.camel@web.de/ [ mingo: Clarified the comments and the changelog. ] Reported-by: Bert Karwatzki Tested-by: Bert Karwatzki Fixes: 7ffb791423c7 ("x86/kaslr: Reduce KASLR entropy on most x86 systems") Signed-off-by: Balbir Singh Signed-off-by: Ingo Molnar Cc: Brian Gerst Cc: Juergen Gross Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Andrew Morton Cc: Christoph Hellwig Cc: Pierre-Eric Pelloux-Prayer Cc: Alex Deucher Cc: Christian König Cc: David Airlie Cc: Simona Vetter Link: https://lore.kernel.org/r/20250401000752.249348-1-balbirs@nvidia.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/init_64.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 851711509d38..9d75b89fa257 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -959,9 +959,18 @@ int add_pages(int nid, unsigned long start_pfn, unsigned long nr_pages, ret = __add_pages(nid, start_pfn, nr_pages, params); WARN_ON_ONCE(ret); - /* update max_pfn, max_low_pfn and high_memory */ - update_end_of_memory_vars(start_pfn << PAGE_SHIFT, - nr_pages << PAGE_SHIFT); + /* + * Special case: add_pages() is called by memremap_pages() for adding device + * private pages. Do not bump up max_pfn in the device private path, + * because max_pfn changes affect dma_addressing_limited(). + * + * dma_addressing_limited() returning true when max_pfn is the device's + * addressable memory can force device drivers to use bounce buffers + * and impact their performance negatively: + */ + if (!params->pgmap) + /* update max_pfn, max_low_pfn and high_memory */ + update_end_of_memory_vars(start_pfn << PAGE_SHIFT, nr_pages << PAGE_SHIFT); return ret; } From bc952bf4e07d0f15e77c6484b26e1d0058d6912f Mon Sep 17 00:00:00 2001 From: Harshit Mogalapalli Date: Mon, 8 May 2023 23:07:16 -0700 Subject: [PATCH 471/529] dmaengine: idxd: Fix passing freed memory in idxd_cdev_open() commit 0642287e3ecdd0d1f88e6a2e63768e16153a990c upstream. Smatch warns: drivers/dma/idxd/cdev.c:327: idxd_cdev_open() warn: 'sva' was already freed. When idxd_wq_set_pasid() fails, the current code unbinds sva and then goes to 'failed_set_pasid' where iommu_sva_unbind_device is called again causing the above warning. [ device_user_pasid_enabled(idxd) is still true when calling failed_set_pasid ] Fix this by removing additional unbind when idxd_wq_set_pasid() fails Fixes: b022f59725f0 ("dmaengine: idxd: add idxd_copy_cr() to copy user completion record during page fault handling") Signed-off-by: Harshit Mogalapalli Acked-by: Fenghua Yu Acked-by: Dave Jiang Link: https://lore.kernel.org/r/20230509060716.2830630-1-harshit.m.mogalapalli@oracle.com Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/idxd/cdev.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/dma/idxd/cdev.c b/drivers/dma/idxd/cdev.c index d736ab15ade2..9b07474f450b 100644 --- a/drivers/dma/idxd/cdev.c +++ b/drivers/dma/idxd/cdev.c @@ -142,7 +142,6 @@ static int idxd_cdev_open(struct inode *inode, struct file *filp) if (wq_dedicated(wq)) { rc = idxd_wq_set_pasid(wq, pasid); if (rc < 0) { - iommu_sva_unbind_device(sva); dev_err(dev, "wq set pasid failed: %d\n", rc); goto failed_set_pasid; } From a84f95fecbba808c001f16140e00fe8f58bdc8fe Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Thu, 24 Aug 2023 08:33:01 +0530 Subject: [PATCH 472/529] octeontx2-pf: fix page_pool creation fail for rings > 32k commit 49fa4b0d06705a24a81bb8be6eb175059b77f0a7 upstream. octeontx2 driver calls page_pool_create() during driver probe() and fails if queue size > 32k. Page pool infra uses these buffers as shock absorbers for burst traffic. These pages are pinned down over time as working sets varies, due to the recycling nature of page pool, given page pool (currently) don't have a shrinker mechanism, the pages remain pinned down in ptr_ring. Instead of clamping page_pool size to 32k at most, limit it even more to 2k to avoid wasting memory. This have been tested on octeontx2 CN10KA hardware. TCP and UDP tests using iperf shows no performance regressions. Fixes: b2e3406a38f0 ("octeontx2-pf: Add support for page pool") Suggested-by: Alexander Lobakin Reviewed-by: Sunil Goutham Signed-off-by: Ratheesh Kannoth Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 2 +- drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 59a7e6f376f4..3fca896f8f90 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -1434,7 +1434,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, } pp_params.flags = PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP; - pp_params.pool_size = numptrs; + pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs); pp_params.nid = NUMA_NO_NODE; pp_params.dev = pfvf->dev; pp_params.dma_dir = DMA_FROM_DEVICE; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index b5d689eeff80..9e3bfbe5c480 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -23,6 +23,8 @@ #define OTX2_ETH_HLEN (VLAN_ETH_HLEN + VLAN_HLEN) #define OTX2_MIN_MTU 60 +#define OTX2_PAGE_POOL_SZ 2048 + #define OTX2_MAX_GSO_SEGS 255 #define OTX2_MAX_FRAGS_IN_SQE 9 From 173d9d060c9202975c2e1fccafeaf89b52407f31 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Fri, 8 Sep 2023 08:23:09 +0530 Subject: [PATCH 473/529] octeontx2-pf: Fix page pool cache index corruption. commit 88e69af061f2e061a68751ef9cad47a674527a1b upstream. The access to page pool `cache' array and the `count' variable is not locked. Page pool cache access is fine as long as there is only one consumer per pool. octeontx2 driver fills in rx buffers from page pool in NAPI context. If system is stressed and could not allocate buffers, refiiling work will be delegated to a delayed workqueue. This means that there are two cosumers to the page pool cache. Either workqueue or IRQ/NAPI can be run on other CPU. This will lead to lock less access, hence corruption of cache pool indexes. To fix this issue, NAPI is rescheduled from workqueue context to refill rx buffers. Fixes: b2e3406a38f0 ("octeontx2-pf: Add support for page pool") Signed-off-by: Ratheesh Kannoth Reported-by: Sebastian Andrzej Siewior Reviewed-by: Sebastian Andrzej Siewior Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- .../ethernet/marvell/octeontx2/nic/cn10k.c | 6 ++- .../ethernet/marvell/octeontx2/nic/cn10k.h | 2 +- .../marvell/octeontx2/nic/otx2_common.c | 43 +++---------------- .../marvell/octeontx2/nic/otx2_common.h | 3 +- .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 7 +-- .../marvell/octeontx2/nic/otx2_txrx.c | 30 ++++++++++--- .../marvell/octeontx2/nic/otx2_txrx.h | 4 +- 7 files changed, 44 insertions(+), 51 deletions(-) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c index 8663bdf014d8..7417087b6db5 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c @@ -107,12 +107,13 @@ int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura) } #define NPA_MAX_BURST 16 -void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) +int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) { struct otx2_nic *pfvf = dev; + int cnt = cq->pool_ptrs; u64 ptrs[NPA_MAX_BURST]; - int num_ptrs = 1; dma_addr_t bufptr; + int num_ptrs = 1; /* Refill pool with new buffers */ while (cq->pool_ptrs) { @@ -131,6 +132,7 @@ void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) num_ptrs = 1; } } + return cnt - cq->pool_ptrs; } void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h index 8ae96815865e..c1861f7de254 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.h @@ -24,7 +24,7 @@ static inline int mtu_to_dwrr_weight(struct otx2_nic *pfvf, int mtu) return weight; } -void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); +int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx); int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); int cn10k_lmtst_init(struct otx2_nic *pfvf); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 3fca896f8f90..036f971699aa 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -567,20 +567,8 @@ int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, dma_addr_t *dma) { - if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma))) { - struct refill_work *work; - struct delayed_work *dwork; - - work = &pfvf->refill_wrk[cq->cq_idx]; - dwork = &work->pool_refill_work; - /* Schedule a task if no other task is running */ - if (!cq->refill_task_sched) { - cq->refill_task_sched = true; - schedule_delayed_work(dwork, - msecs_to_jiffies(100)); - } + if (unlikely(__otx2_alloc_rbuf(pfvf, cq->rbpool, dma))) return -ENOMEM; - } return 0; } @@ -1081,39 +1069,20 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx) static void otx2_pool_refill_task(struct work_struct *work) { struct otx2_cq_queue *cq; - struct otx2_pool *rbpool; struct refill_work *wrk; - int qidx, free_ptrs = 0; struct otx2_nic *pfvf; - dma_addr_t bufptr; + int qidx; wrk = container_of(work, struct refill_work, pool_refill_work.work); pfvf = wrk->pf; qidx = wrk - pfvf->refill_wrk; cq = &pfvf->qset.cq[qidx]; - rbpool = cq->rbpool; - free_ptrs = cq->pool_ptrs; - while (cq->pool_ptrs) { - if (otx2_alloc_rbuf(pfvf, rbpool, &bufptr)) { - /* Schedule a WQ if we fails to free atleast half of the - * pointers else enable napi for this RQ. - */ - if (!((free_ptrs - cq->pool_ptrs) > free_ptrs / 2)) { - struct delayed_work *dwork; - - dwork = &wrk->pool_refill_work; - schedule_delayed_work(dwork, - msecs_to_jiffies(100)); - } else { - cq->refill_task_sched = false; - } - return; - } - pfvf->hw_ops->aura_freeptr(pfvf, qidx, bufptr + OTX2_HEAD_ROOM); - cq->pool_ptrs--; - } cq->refill_task_sched = false; + + local_bh_disable(); + napi_schedule(wrk->napi); + local_bh_enable(); } int otx2_config_nix_queues(struct otx2_nic *pfvf) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 4f0ac8158ed1..1d3c4180d341 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -280,6 +280,7 @@ struct flr_work { struct refill_work { struct delayed_work pool_refill_work; struct otx2_nic *pf; + struct napi_struct *napi; }; /* PTPv2 originTimestamp structure */ @@ -347,7 +348,7 @@ struct dev_hw_ops { int (*sq_aq_init)(void *dev, u16 qidx, u16 sqb_aura); void (*sqe_flush)(void *dev, struct otx2_snd_queue *sq, int size, int qidx); - void (*refill_pool_ptrs)(void *dev, struct otx2_cq_queue *cq); + int (*refill_pool_ptrs)(void *dev, struct otx2_cq_queue *cq); void (*aura_freeptr)(void *dev, int aura, u64 buf); }; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 8385b4673693..c29cb56caf08 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -2005,6 +2005,10 @@ int otx2_stop(struct net_device *netdev) netif_tx_disable(netdev); + for (wrk = 0; wrk < pf->qset.cq_cnt; wrk++) + cancel_delayed_work_sync(&pf->refill_wrk[wrk].pool_refill_work); + devm_kfree(pf->dev, pf->refill_wrk); + otx2_free_hw_resources(pf); otx2_free_cints(pf, pf->hw.cint_cnt); otx2_disable_napi(pf); @@ -2012,9 +2016,6 @@ int otx2_stop(struct net_device *netdev) for (qidx = 0; qidx < netdev->num_tx_queues; qidx++) netdev_tx_reset_queue(netdev_get_tx_queue(netdev, qidx)); - for (wrk = 0; wrk < pf->qset.cq_cnt; wrk++) - cancel_delayed_work_sync(&pf->refill_wrk[wrk].pool_refill_work); - devm_kfree(pf->dev, pf->refill_wrk); kfree(qset->sq); kfree(qset->cq); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c index cc704cd3b5ae..31caf7a7a3ad 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c @@ -428,9 +428,10 @@ process_cqe: return processed_cqe; } -void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) +int otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) { struct otx2_nic *pfvf = dev; + int cnt = cq->pool_ptrs; dma_addr_t bufptr; while (cq->pool_ptrs) { @@ -439,6 +440,8 @@ void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) otx2_aura_freeptr(pfvf, cq->cq_idx, bufptr + OTX2_HEAD_ROOM); cq->pool_ptrs--; } + + return cnt - cq->pool_ptrs; } static int otx2_tx_napi_handler(struct otx2_nic *pfvf, @@ -532,6 +535,7 @@ int otx2_napi_handler(struct napi_struct *napi, int budget) struct otx2_cq_queue *cq; struct otx2_qset *qset; struct otx2_nic *pfvf; + int filled_cnt = -1; cq_poll = container_of(napi, struct otx2_cq_poll, napi); pfvf = (struct otx2_nic *)cq_poll->dev; @@ -552,7 +556,7 @@ int otx2_napi_handler(struct napi_struct *napi, int budget) } if (rx_cq && rx_cq->pool_ptrs) - pfvf->hw_ops->refill_pool_ptrs(pfvf, rx_cq); + filled_cnt = pfvf->hw_ops->refill_pool_ptrs(pfvf, rx_cq); /* Clear the IRQ */ otx2_write64(pfvf, NIX_LF_CINTX_INT(cq_poll->cint_idx), BIT_ULL(0)); @@ -565,9 +569,25 @@ int otx2_napi_handler(struct napi_struct *napi, int budget) if (pfvf->flags & OTX2_FLAG_ADPTV_INT_COAL_ENABLED) otx2_adjust_adaptive_coalese(pfvf, cq_poll); - /* Re-enable interrupts */ - otx2_write64(pfvf, NIX_LF_CINTX_ENA_W1S(cq_poll->cint_idx), - BIT_ULL(0)); + if (unlikely(!filled_cnt)) { + struct refill_work *work; + struct delayed_work *dwork; + + work = &pfvf->refill_wrk[cq->cq_idx]; + dwork = &work->pool_refill_work; + /* Schedule a task if no other task is running */ + if (!cq->refill_task_sched) { + work->napi = napi; + cq->refill_task_sched = true; + schedule_delayed_work(dwork, + msecs_to_jiffies(100)); + } + } else { + /* Re-enable interrupts */ + otx2_write64(pfvf, + NIX_LF_CINTX_ENA_W1S(cq_poll->cint_idx), + BIT_ULL(0)); + } } return workdone; } diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index 9e3bfbe5c480..a82ffca8ce1b 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -170,6 +170,6 @@ void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx); void otx2_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx); -void otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); -void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); +int otx2_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); +int cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq); #endif /* OTX2_TXRX_H */ From 22e3ccf1d56de9d6e6e0830cc8f1ca78e5894268 Mon Sep 17 00:00:00 2001 From: Ratheesh Kannoth Date: Tue, 10 Oct 2023 09:18:42 +0530 Subject: [PATCH 474/529] octeontx2-pf: Fix page pool frag allocation warning commit 50e492143374c17ad89c865a1a44837b3f5c8226 upstream. Since page pool param's "order" is set to 0, will result in below warn message if interface is configured with higher rx buffer size. Steps to reproduce the issue. 1. devlink dev param set pci/0002:04:00.0 name receive_buffer_size \ value 8196 cmode runtime 2. ifconfig eth0 up [ 19.901356] ------------[ cut here ]------------ [ 19.901361] WARNING: CPU: 11 PID: 12331 at net/core/page_pool.c:567 page_pool_alloc_frag+0x3c/0x230 [ 19.901449] pstate: 82401009 (Nzcv daif +PAN -UAO +TCO -DIT +SSBS BTYPE=--) [ 19.901451] pc : page_pool_alloc_frag+0x3c/0x230 [ 19.901453] lr : __otx2_alloc_rbuf+0x60/0xbc [rvu_nicpf] [ 19.901460] sp : ffff80000f66b970 [ 19.901461] x29: ffff80000f66b970 x28: 0000000000000000 x27: 0000000000000000 [ 19.901464] x26: ffff800000d15b68 x25: ffff000195b5c080 x24: ffff0002a5a32dc0 [ 19.901467] x23: ffff0001063c0878 x22: 0000000000000100 x21: 0000000000000000 [ 19.901469] x20: 0000000000000000 x19: ffff00016f781000 x18: 0000000000000000 [ 19.901472] x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 [ 19.901474] x14: 0000000000000000 x13: ffff0005ffdc9c80 x12: 0000000000000000 [ 19.901477] x11: ffff800009119a38 x10: 4c6ef2e3ba300519 x9 : ffff800000d13844 [ 19.901479] x8 : ffff0002a5a33cc8 x7 : 0000000000000030 x6 : 0000000000000030 [ 19.901482] x5 : 0000000000000005 x4 : 0000000000000000 x3 : 0000000000000a20 [ 19.901484] x2 : 0000000000001080 x1 : ffff80000f66b9d4 x0 : 0000000000001000 [ 19.901487] Call trace: [ 19.901488] page_pool_alloc_frag+0x3c/0x230 [ 19.901490] __otx2_alloc_rbuf+0x60/0xbc [rvu_nicpf] [ 19.901494] otx2_rq_aura_pool_init+0x1c4/0x240 [rvu_nicpf] [ 19.901498] otx2_open+0x228/0xa70 [rvu_nicpf] [ 19.901501] otx2vf_open+0x20/0xd0 [rvu_nicvf] [ 19.901504] __dev_open+0x114/0x1d0 [ 19.901507] __dev_change_flags+0x194/0x210 [ 19.901510] dev_change_flags+0x2c/0x70 [ 19.901512] devinet_ioctl+0x3a4/0x6c4 [ 19.901515] inet_ioctl+0x228/0x240 [ 19.901518] sock_ioctl+0x2ac/0x480 [ 19.901522] __arm64_sys_ioctl+0x564/0xe50 [ 19.901525] invoke_syscall.constprop.0+0x58/0xf0 [ 19.901529] do_el0_svc+0x58/0x150 [ 19.901531] el0_svc+0x30/0x140 [ 19.901533] el0t_64_sync_handler+0xe8/0x114 [ 19.901535] el0t_64_sync+0x1a0/0x1a4 [ 19.901537] ---[ end trace 678c0bf660ad8116 ]--- Fixes: b2e3406a38f0 ("octeontx2-pf: Add support for page pool") Signed-off-by: Ratheesh Kannoth Reviewed-by: Yunsheng Lin Link: https://lore.kernel.org/r/20231010034842.3807816-1-rkannoth@marvell.com Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index 036f971699aa..4fc73ac721b0 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -1402,6 +1402,7 @@ int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, return 0; } + pp_params.order = get_order(buf_size); pp_params.flags = PP_FLAG_PAGE_FRAG | PP_FLAG_DMA_MAP; pp_params.pool_size = min(OTX2_PAGE_POOL_SZ, numptrs); pp_params.nid = NUMA_NO_NODE; From 82ac6adbbb2aad14548a71d5e2e37f4964a15e38 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 18 Jan 2025 00:24:33 +0100 Subject: [PATCH 475/529] hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING commit 53dac345395c0d2493cbc2f4c85fe38aef5b63f5 upstream. hrtimers are migrated away from the dying CPU to any online target at the CPUHP_AP_HRTIMERS_DYING stage in order not to delay bandwidth timers handling tasks involved in the CPU hotplug forward progress. However wakeups can still be performed by the outgoing CPU after CPUHP_AP_HRTIMERS_DYING. Those can result again in bandwidth timers being armed. Depending on several considerations (crystal ball power management based election, earliest timer already enqueued, timer migration enabled or not), the target may eventually be the current CPU even if offline. If that happens, the timer is eventually ignored. The most notable example is RCU which had to deal with each and every of those wake-ups by deferring them to an online CPU, along with related workarounds: _ e787644caf76 (rcu: Defer RCU kthreads wakeup when CPU is dying) _ 9139f93209d1 (rcu/nocb: Fix RT throttling hrtimer armed from offline CPU) _ f7345ccc62a4 (rcu/nocb: Fix rcuog wake-up from offline softirq) The problem isn't confined to RCU though as the stop machine kthread (which runs CPUHP_AP_HRTIMERS_DYING) reports its completion at the end of its work through cpu_stop_signal_done() and performs a wake up that eventually arms the deadline server timer: WARNING: CPU: 94 PID: 588 at kernel/time/hrtimer.c:1086 hrtimer_start_range_ns+0x289/0x2d0 CPU: 94 UID: 0 PID: 588 Comm: migration/94 Not tainted Stopper: multi_cpu_stop+0x0/0x120 <- stop_machine_cpuslocked+0x66/0xc0 RIP: 0010:hrtimer_start_range_ns+0x289/0x2d0 Call Trace: start_dl_timer enqueue_dl_entity dl_server_start enqueue_task_fair enqueue_task ttwu_do_activate try_to_wake_up complete cpu_stopper_thread Instead of providing yet another bandaid to work around the situation, fix it in the hrtimers infrastructure instead: always migrate away a timer to an online target whenever it is enqueued from an offline CPU. This will also allow to revert all the above RCU disgraceful hacks. Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") Reported-by: Vlad Poenaru Reported-by: Usama Arif Signed-off-by: Frederic Weisbecker Signed-off-by: Paul E. McKenney Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Tested-by: Paul E. McKenney Link: https://lore.kernel.org/all/20250117232433.24027-1-frederic@kernel.org Closes: 20241213203739.1519801-1-usamaarif642@gmail.com Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- include/linux/hrtimer.h | 1 + kernel/time/hrtimer.c | 103 ++++++++++++++++++++++++++++++++-------- 2 files changed, 83 insertions(+), 21 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 8f77bb0f4ae0..05f8b7d7d1e9 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -237,6 +237,7 @@ struct hrtimer_cpu_base { ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; + call_single_data_t csd; } ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index e60863ab74b5..b3860ec12450 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,6 +58,8 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) +static void retrigger_next_event(void *arg); + /* * The timer bases: * @@ -111,7 +113,8 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - } + }, + .csd = CSD_INIT(retrigger_next_event, NULL) }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { @@ -124,6 +127,14 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_TAI] = HRTIMER_BASE_TAI, }; +static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) +{ + if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) + return true; + else + return likely(base->online); +} + /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -177,27 +188,54 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * We do not migrate the timer when it is expiring before the next - * event on the target cpu. When high resolution is enabled, we cannot - * reprogram the target cpu hardware and we would cause it to fire - * late. To keep it simple, we handle the high resolution enabled and - * disabled case similar. + * Check if the elected target is suitable considering its next + * event and the hotplug state of the current CPU. + * + * If the elected target is remote and its next event is after the timer + * to queue, then a remote reprogram is necessary. However there is no + * guarantee the IPI handling the operation would arrive in time to meet + * the high resolution deadline. In this case the local CPU becomes a + * preferred target, unless it is offline. + * + * High and low resolution modes are handled the same way for simplicity. * * Called with cpu_base->lock of target cpu held. */ -static int -hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) +static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, + struct hrtimer_cpu_base *new_cpu_base, + struct hrtimer_cpu_base *this_cpu_base) { ktime_t expires; + /* + * The local CPU clockevent can be reprogrammed. Also get_target_base() + * guarantees it is online. + */ + if (new_cpu_base == this_cpu_base) + return true; + + /* + * The offline local CPU can't be the default target if the + * next remote target event is after this timer. Keep the + * elected new base. An IPI will we issued to reprogram + * it as a last resort. + */ + if (!hrtimer_base_is_online(this_cpu_base)) + return true; + expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - return expires < new_base->cpu_base->expires_next; + + return expires >= new_base->cpu_base->expires_next; } -static inline -struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, - int pinned) +static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) { + if (!hrtimer_base_is_online(base)) { + int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); + + return &per_cpu(hrtimer_bases, cpu); + } + #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -248,8 +286,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, + this_cpu_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -258,8 +296,7 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (new_cpu_base != this_cpu_base && - hrtimer_check_target(timer, new_base)) { + if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -718,8 +755,6 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } -static void retrigger_next_event(void *arg); - /* * Switch to high resolution mode */ @@ -1205,6 +1240,7 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { + struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; @@ -1216,9 +1252,15 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); + force_local = base->cpu_base == this_cpu_base; force_local &= base->cpu_base->next_timer == timer; + /* + * Don't force local queuing if this enqueue happens on a unplugged + * CPU after hrtimer_cpu_dying() has been invoked. + */ + force_local &= this_cpu_base->online; + /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the @@ -1248,8 +1290,27 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) - return first; + if (!force_local) { + /* + * If the current CPU base is online, then the timer is + * never queued on a remote CPU if it would be the first + * expiring timer there. + */ + if (hrtimer_base_is_online(this_cpu_base)) + return first; + + /* + * Timer was enqueued remote because the current base is + * already offline. If the timer is the first to expire, + * kick the remote CPU to reprogram the clock event. + */ + if (first) { + struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; + + smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); + } + return 0; + } /* * Timer was forced to stay on the current CPU to avoid From 36679fab54fa7bcffafd469e2c474c1fc4beaee0 Mon Sep 17 00:00:00 2001 From: Boris Burkov Date: Fri, 13 Dec 2024 12:22:32 -0800 Subject: [PATCH 476/529] btrfs: check folio mapping after unlock in relocate_one_folio() commit 3e74859ee35edc33a022c3f3971df066ea0ca6b9 upstream. When we call btrfs_read_folio() to bring a folio uptodate, we unlock the folio. The result of that is that a different thread can modify the mapping (like remove it with invalidate) before we call folio_lock(). This results in an invalid page and we need to try again. In particular, if we are relocating concurrently with aborting a transaction, this can result in a crash like the following: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] SMP CPU: 76 PID: 1411631 Comm: kworker/u322:5 Workqueue: events_unbound btrfs_reclaim_bgs_work RIP: 0010:set_page_extent_mapped+0x20/0xb0 RSP: 0018:ffffc900516a7be8 EFLAGS: 00010246 RAX: ffffea009e851d08 RBX: ffffea009e0b1880 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffc900516a7b90 RDI: ffffea009e0b1880 RBP: 0000000003573000 R08: 0000000000000001 R09: ffff88c07fd2f3f0 R10: 0000000000000000 R11: 0000194754b575be R12: 0000000003572000 R13: 0000000003572fff R14: 0000000000100cca R15: 0000000005582fff FS: 0000000000000000(0000) GS:ffff88c07fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000407d00f002 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: ? __die+0x78/0xc0 ? page_fault_oops+0x2a8/0x3a0 ? __switch_to+0x133/0x530 ? wq_worker_running+0xa/0x40 ? exc_page_fault+0x63/0x130 ? asm_exc_page_fault+0x22/0x30 ? set_page_extent_mapped+0x20/0xb0 relocate_file_extent_cluster+0x1a7/0x940 relocate_data_extent+0xaf/0x120 relocate_block_group+0x20f/0x480 btrfs_relocate_block_group+0x152/0x320 btrfs_relocate_chunk+0x3d/0x120 btrfs_reclaim_bgs_work+0x2ae/0x4e0 process_scheduled_works+0x184/0x370 worker_thread+0xc6/0x3e0 ? blk_add_timer+0xb0/0xb0 kthread+0xae/0xe0 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork+0x2f/0x40 ? flush_tlb_kernel_range+0x90/0x90 ret_from_fork_asm+0x11/0x20 This occurs because cleanup_one_transaction() calls destroy_delalloc_inodes() which calls invalidate_inode_pages2() which takes the folio_lock before setting mapping to NULL. We fail to check this, and subsequently call set_extent_mapping(), which assumes that mapping != NULL (in fact it asserts that in debug mode) Note that the "fixes" patch here is not the one that introduced the race (the very first iteration of this code from 2009) but a more recent change that made this particular crash happen in practice. Fixes: e7f1326cc24e ("btrfs: set page extent mapped after read_folio in relocate_one_page") CC: stable@vger.kernel.org # 6.1+ Reviewed-by: Qu Wenruo Signed-off-by: Boris Burkov Signed-off-by: David Sterba Signed-off-by: Zhaoyang Li Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/relocation.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index d6cda0b2e925..fd6ea3fcab33 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2977,6 +2977,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, int ret; ASSERT(page_index <= last_index); +again: page = find_lock_page(inode->i_mapping, page_index); if (!page) { page_cache_sync_readahead(inode->i_mapping, ra, NULL, @@ -2998,6 +2999,11 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra, ret = -EIO; goto release_page; } + if (page->mapping != inode->i_mapping) { + unlock_page(page); + put_page(page); + goto again; + } } /* From 63d00b25e02b94ec2107df79798d0c97fc16701f Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Wed, 21 May 2025 16:27:00 +0100 Subject: [PATCH 477/529] af_unix: Kconfig: make CONFIG_UNIX bool commit 97154bcf4d1b7cabefec8a72cff5fbb91d5afb7b upstream. Let's make CONFIG_UNIX a bool instead of a tristate. We've decided to do that during discussion about SCM_PIDFD patchset [1]. [1] https://lore.kernel.org/lkml/20230524081933.44dc8bea@kernel.org/ Cc: "David S. Miller" Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Leon Romanovsky Cc: David Ahern Cc: Arnd Bergmann Cc: Kees Cook Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: linux-arch@vger.kernel.org Suggested-by: Jakub Kicinski Signed-off-by: Alexander Mikhalitsyn Acked-by: Christian Brauner Reviewed-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/Kconfig | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/net/unix/Kconfig b/net/unix/Kconfig index b7f811216820..28b232f281ab 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -4,7 +4,7 @@ # config UNIX - tristate "Unix domain sockets" + bool "Unix domain sockets" help If you say Y here, you will include support for Unix domain sockets; sockets are the standard Unix mechanism for establishing and @@ -14,10 +14,6 @@ config UNIX an embedded system or something similar, you therefore definitely want to say Y here. - To compile this driver as a module, choose M here: the module will be - called unix. Note that several important services won't work - correctly if you say M here and then neglect to load the module. - Say Y unless you know what you are doing. config UNIX_SCM From 7637a75aedf7db759eb049bb6138ee8ea04aaa64 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:01 +0100 Subject: [PATCH 478/529] af_unix: Return struct unix_sock from unix_get_socket(). commit 5b17307bd0789edea0675d524a2b277b93bbde62 upstream. Currently, unix_get_socket() returns struct sock, but after calling it, we always cast it to unix_sk(). Let's return struct unix_sock from unix_get_socket(). Signed-off-by: Kuniyuki Iwashima Acked-by: Pavel Begunkov Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240123170856.41348-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 2 +- net/unix/garbage.c | 19 +++++++------------ net/unix/scm.c | 19 +++++++------------ 3 files changed, 15 insertions(+), 25 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index e7d71a516bd4..52ae023c3e93 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -13,7 +13,7 @@ void unix_notinflight(struct user_struct *user, struct file *fp); void unix_destruct_scm(struct sk_buff *skb); void unix_gc(void); void wait_for_unix_gc(void); -struct sock *unix_get_socket(struct file *filp); +struct unix_sock *unix_get_socket(struct file *filp); struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d2fc795394a5..438f5d9b9173 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), while (nfd--) { /* Get the socket the fd matches if it indeed does so */ - struct sock *sk = unix_get_socket(*fp++); + struct unix_sock *u = unix_get_socket(*fp++); - if (sk) { - struct unix_sock *u = unix_sk(sk); + /* Ignore non-candidates, they could have been added + * to the queues after starting the garbage collection + */ + if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { + hit = true; - /* Ignore non-candidates, they could - * have been added to the queues after - * starting the garbage collection - */ - if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } + func(u); } } if (hit && hitlist != NULL) { diff --git a/net/unix/scm.c b/net/unix/scm.c index 4eff7da9f6f9..693817a31ad8 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list); DEFINE_SPINLOCK(unix_gc_lock); EXPORT_SYMBOL(unix_gc_lock); -struct sock *unix_get_socket(struct file *filp) +struct unix_sock *unix_get_socket(struct file *filp) { - struct sock *u_sock = NULL; struct inode *inode = file_inode(filp); /* Socket ? */ @@ -33,10 +32,10 @@ struct sock *unix_get_socket(struct file *filp) /* PF_UNIX ? */ if (s && sock->ops && sock->ops->family == PF_UNIX) - u_sock = s; + return unix_sk(s); } - return u_sock; + return NULL; } EXPORT_SYMBOL(unix_get_socket); @@ -45,13 +44,11 @@ EXPORT_SYMBOL(unix_get_socket); */ void unix_inflight(struct user_struct *user, struct file *fp) { - struct sock *s = unix_get_socket(fp); + struct unix_sock *u = unix_get_socket(fp); spin_lock(&unix_gc_lock); - if (s) { - struct unix_sock *u = unix_sk(s); - + if (u) { if (!u->inflight) { BUG_ON(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); @@ -68,13 +65,11 @@ void unix_inflight(struct user_struct *user, struct file *fp) void unix_notinflight(struct user_struct *user, struct file *fp) { - struct sock *s = unix_get_socket(fp); + struct unix_sock *u = unix_get_socket(fp); spin_lock(&unix_gc_lock); - if (s) { - struct unix_sock *u = unix_sk(s); - + if (u) { BUG_ON(!u->inflight); BUG_ON(list_empty(&u->link)); From ceb8bd6c69c1680fd9b45e7f16d7170c9c7513a5 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:02 +0100 Subject: [PATCH 479/529] af_unix: Run GC on only one CPU. commit 8b90a9f819dc2a06baae4ec1a64d875e53b824ec upstream. If more than 16000 inflight AF_UNIX sockets exist and the garbage collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc(). Also, they wait for unix_gc() to complete. In unix_gc(), all inflight AF_UNIX sockets are traversed at least once, and more if they are the GC candidate. Thus, sendmsg() significantly slows down with too many inflight AF_UNIX sockets. There is a small window to invoke multiple unix_gc() instances, which will then be blocked by the same spinlock except for one. Let's convert unix_gc() to use struct work so that it will not consume CPUs unnecessarily. Note WRITE_ONCE(gc_in_progress, true) is moved before running GC. If we leave the WRITE_ONCE() as is and use the following test to call flush_work(), a process might not call it. CPU 0 CPU 1 --- --- start work and call __unix_gc() if (work_pending(&unix_gc_work) || <-- false READ_ONCE(gc_in_progress)) <-- false flush_work(); <-- missed! WRITE_ONCE(gc_in_progress, true) Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240123170856.41348-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 54 +++++++++++++++++++++++----------------------- 1 file changed, 27 insertions(+), 27 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 438f5d9b9173..bf628bfb6d35 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -86,7 +86,6 @@ /* Internal data structures and random procedures: */ static LIST_HEAD(gc_candidates); -static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) @@ -182,23 +181,8 @@ static void inc_inflight_move_tail(struct unix_sock *u) } static bool gc_in_progress; -#define UNIX_INFLIGHT_TRIGGER_GC 16000 -void wait_for_unix_gc(void) -{ - /* If number of inflight sockets is insane, - * force a garbage collect right now. - * Paired with the WRITE_ONCE() in unix_inflight(), - * unix_notinflight() and gc_in_progress(). - */ - if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && - !READ_ONCE(gc_in_progress)) - unix_gc(); - wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); -} - -/* The external entry point: unix_gc() */ -void unix_gc(void) +static void __unix_gc(struct work_struct *work) { struct sk_buff *next_skb, *skb; struct unix_sock *u; @@ -209,13 +193,6 @@ void unix_gc(void) spin_lock(&unix_gc_lock); - /* Avoid a recursive GC. */ - if (gc_in_progress) - goto out; - - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ - WRITE_ONCE(gc_in_progress, true); - /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. @@ -346,8 +323,31 @@ void unix_gc(void) /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); - wake_up(&unix_gc_wait); - - out: spin_unlock(&unix_gc_lock); } + +static DECLARE_WORK(unix_gc_work, __unix_gc); + +void unix_gc(void) +{ + WRITE_ONCE(gc_in_progress, true); + queue_work(system_unbound_wq, &unix_gc_work); +} + +#define UNIX_INFLIGHT_TRIGGER_GC 16000 + +void wait_for_unix_gc(void) +{ + /* If number of inflight sockets is insane, + * force a garbage collect right now. + * + * Paired with the WRITE_ONCE() in unix_inflight(), + * unix_notinflight(), and __unix_gc(). + */ + if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && + !READ_ONCE(gc_in_progress)) + unix_gc(); + + if (READ_ONCE(gc_in_progress)) + flush_work(&unix_gc_work); +} From e9bd632f985f715f2f6253af82aa191b1a7166a8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:03 +0100 Subject: [PATCH 480/529] af_unix: Try to run GC async. commit d9f21b3613337b55cc9d4a6ead484dca68475143 upstream. If more than 16000 inflight AF_UNIX sockets exist and the garbage collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc(). Also, they wait for unix_gc() to complete. In unix_gc(), all inflight AF_UNIX sockets are traversed at least once, and more if they are the GC candidate. Thus, sendmsg() significantly slows down with too many inflight AF_UNIX sockets. However, if a process sends data with no AF_UNIX FD, the sendmsg() call does not need to wait for GC. After this change, only the process that meets the condition below will be blocked under such a situation. 1) cmsg contains AF_UNIX socket 2) more than 32 AF_UNIX sent by the same user are still inflight Note that even a sendmsg() call that does not meet the condition but has AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock, but we allow that as a bonus for sane users. The results below are the time spent in unix_dgram_sendmsg() sending 1 byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX sockets exist. Without series: the sane sendmsg() needs to wait gc unreasonably. $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end. ^C nsecs : count distribution [...] 524288 -> 1048575 : 0 | | 1048576 -> 2097151 : 3881 |****************************************| 2097152 -> 4194303 : 214 |** | 4194304 -> 8388607 : 1 | | avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096 With series: the sane sendmsg() can finish much faster. $ sudo /usr/share/bcc/tools/funclatency -p 8702 unix_dgram_sendmsg Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end. ^C nsecs : count distribution [...] 128 -> 255 : 0 | | 256 -> 511 : 4092 |****************************************| 512 -> 1023 : 2 | | 1024 -> 2047 : 0 | | 2048 -> 4095 : 0 | | 4096 -> 8191 : 1 | | 8192 -> 16383 : 1 | | avg = 410 nsecs, total: 1680510 nsecs, count: 4096 Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240123170856.41348-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 12 ++++++++++-- include/net/scm.h | 1 + net/core/scm.c | 5 +++++ net/unix/af_unix.c | 6 ++++-- net/unix/garbage.c | 10 +++++++++- 5 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 52ae023c3e93..be488d627531 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -8,12 +8,20 @@ #include #include +#if IS_ENABLED(CONFIG_UNIX) +struct unix_sock *unix_get_socket(struct file *filp); +#else +static inline struct unix_sock *unix_get_socket(struct file *filp) +{ + return NULL; +} +#endif + void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); void unix_destruct_scm(struct sk_buff *skb); void unix_gc(void); -void wait_for_unix_gc(void); -struct unix_sock *unix_get_socket(struct file *filp); +void wait_for_unix_gc(struct scm_fp_list *fpl); struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) diff --git a/include/net/scm.h b/include/net/scm.h index 585adc1346bd..a5c26008fcec 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -23,6 +23,7 @@ struct scm_creds { struct scm_fp_list { short count; + short count_unix; short max; struct user_struct *user; struct file *fp[SCM_MAX_FD]; diff --git a/net/core/scm.c b/net/core/scm.c index a877c4ef4c25..bb25052624ee 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -36,6 +36,7 @@ #include #include #include +#include /* @@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) return -ENOMEM; *fplp = fpl; fpl->count = 0; + fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; } @@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fput(file); return -EINVAL; } + if (unix_get_socket(file)) + fpl->count_unix++; + *fpp++ = file; fpl->count++; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5ce60087086c..f74f7878b3fe 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1875,11 +1875,12 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, long timeo; int err; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags&MSG_OOB) goto out; @@ -2145,11 +2146,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, bool fds_sent = false; int data_len; - wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); if (err < 0) return err; + wait_for_unix_gc(scm.fp); + err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index bf628bfb6d35..2934d7b68036 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -335,8 +335,9 @@ void unix_gc(void) } #define UNIX_INFLIGHT_TRIGGER_GC 16000 +#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) -void wait_for_unix_gc(void) +void wait_for_unix_gc(struct scm_fp_list *fpl) { /* If number of inflight sockets is insane, * force a garbage collect right now. @@ -348,6 +349,13 @@ void wait_for_unix_gc(void) !READ_ONCE(gc_in_progress)) unix_gc(); + /* Penalise users who want to send AF_UNIX sockets + * but whose sockets have not been received yet. + */ + if (!fpl || !fpl->count_unix || + READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) + return; + if (READ_ONCE(gc_in_progress)) flush_work(&unix_gc_work); } From a1ee63f7a0adf21360d7e21c7ef7fcb9bb1f938b Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:04 +0100 Subject: [PATCH 481/529] af_unix: Replace BUG_ON() with WARN_ON_ONCE(). commit d0f6dc26346863e1f4a23117f5468614e54df064 upstream. This is a prep patch for the last patch in this series so that checkpatch will not warn about BUG_ON(). Signed-off-by: Kuniyuki Iwashima Acked-by: Jens Axboe Link: https://lore.kernel.org/r/20240129190435.57228-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 8 ++++---- net/unix/scm.c | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2934d7b68036..7eeaac165e85 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -145,7 +145,7 @@ static void scan_children(struct sock *x, void (*func)(struct unix_sock *), /* An embryo cannot be in-flight, so it's safe * to use the list link. */ - BUG_ON(!list_empty(&u->link)); + WARN_ON_ONCE(!list_empty(&u->link)); list_add_tail(&u->link, &embryos); } spin_unlock(&x->sk_receive_queue.lock); @@ -224,8 +224,8 @@ static void __unix_gc(struct work_struct *work) total_refs = file_count(sk->sk_socket->file); - BUG_ON(!u->inflight); - BUG_ON(total_refs < u->inflight); + WARN_ON_ONCE(!u->inflight); + WARN_ON_ONCE(total_refs < u->inflight); if (total_refs == u->inflight) { list_move_tail(&u->link, &gc_candidates); __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); @@ -318,7 +318,7 @@ static void __unix_gc(struct work_struct *work) list_move_tail(&u->link, &gc_inflight_list); /* All candidates should have been detached by now. */ - BUG_ON(!list_empty(&gc_candidates)); + WARN_ON_ONCE(!list_empty(&gc_candidates)); /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); diff --git a/net/unix/scm.c b/net/unix/scm.c index 693817a31ad8..6f446dd2deed 100644 --- a/net/unix/scm.c +++ b/net/unix/scm.c @@ -50,10 +50,10 @@ void unix_inflight(struct user_struct *user, struct file *fp) if (u) { if (!u->inflight) { - BUG_ON(!list_empty(&u->link)); + WARN_ON_ONCE(!list_empty(&u->link)); list_add_tail(&u->link, &gc_inflight_list); } else { - BUG_ON(list_empty(&u->link)); + WARN_ON_ONCE(list_empty(&u->link)); } u->inflight++; /* Paired with READ_ONCE() in wait_for_unix_gc() */ @@ -70,8 +70,8 @@ void unix_notinflight(struct user_struct *user, struct file *fp) spin_lock(&unix_gc_lock); if (u) { - BUG_ON(!u->inflight); - BUG_ON(list_empty(&u->link)); + WARN_ON_ONCE(!u->inflight); + WARN_ON_ONCE(list_empty(&u->link)); u->inflight--; if (!u->inflight) From 6afc1286651842195bcde02909539c658af24d8c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:05 +0100 Subject: [PATCH 482/529] af_unix: Remove io_uring code for GC. commit 11498715f266a3fb4caabba9dd575636cbcaa8f1 upstream. Since commit 705318a99a13 ("io_uring/af_unix: disable sending io_uring over sockets"), io_uring's unix socket cannot be passed via SCM_RIGHTS, so it does not contribute to cyclic reference and no longer be candidate for garbage collection. Also, commit 6e5e6d274956 ("io_uring: drop any code related to SCM_RIGHTS") cleaned up SCM_RIGHTS code in io_uring. Let's do it in AF_UNIX as well by reverting commit 0091bfc81741 ("io_uring/af_unix: defer registered files gc to io_uring release") and commit 10369080454d ("net: reclaim skb->scm_io_uring bit"). Signed-off-by: Kuniyuki Iwashima Acked-by: Jens Axboe Link: https://lore.kernel.org/r/20240129190435.57228-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 25 ++----------------------- 1 file changed, 2 insertions(+), 23 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 7eeaac165e85..c04f82489abb 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -184,12 +184,10 @@ static bool gc_in_progress; static void __unix_gc(struct work_struct *work) { - struct sk_buff *next_skb, *skb; - struct unix_sock *u; - struct unix_sock *next; struct sk_buff_head hitlist; - struct list_head cursor; + struct unix_sock *u, *next; LIST_HEAD(not_cycle_list); + struct list_head cursor; spin_lock(&unix_gc_lock); @@ -293,30 +291,11 @@ static void __unix_gc(struct work_struct *work) spin_unlock(&unix_gc_lock); - /* We need io_uring to clean its registered files, ignore all io_uring - * originated skbs. It's fine as io_uring doesn't keep references to - * other io_uring instances and so killing all other files in the cycle - * will put all io_uring references forcing it to go through normal - * release.path eventually putting registered files. - */ - skb_queue_walk_safe(&hitlist, skb, next_skb) { - if (skb->scm_io_uring) { - __skb_unlink(skb, &hitlist); - skb_queue_tail(&skb->sk->sk_receive_queue, skb); - } - } - /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); - /* There could be io_uring registered files, just push them back to - * the inflight list - */ - list_for_each_entry_safe(u, next, &gc_candidates, link) - list_move_tail(&u->link, &gc_inflight_list); - /* All candidates should have been detached by now. */ WARN_ON_ONCE(!list_empty(&gc_candidates)); From 44aebf50fad29bc901fcf415b69c3a1f0b92729c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:06 +0100 Subject: [PATCH 483/529] af_unix: Remove CONFIG_UNIX_SCM. commit 99a7a5b9943ea2d05fb0dee38e4ae2290477ed83 upstream. Originally, the code related to garbage collection was all in garbage.c. Commit f4e65870e5ce ("net: split out functions related to registering inflight socket files") moved some functions to scm.c for io_uring and added CONFIG_UNIX_SCM just in case AF_UNIX was built as module. However, since commit 97154bcf4d1b ("af_unix: Kconfig: make CONFIG_UNIX bool"), AF_UNIX is no longer built separately. Also, io_uring does not support SCM_RIGHTS now. Let's move the functions back to garbage.c Signed-off-by: Kuniyuki Iwashima Acked-by: Jens Axboe Link: https://lore.kernel.org/r/20240129190435.57228-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 7 +- net/Makefile | 2 +- net/unix/Kconfig | 5 -- net/unix/Makefile | 2 - net/unix/af_unix.c | 63 +++++++++++++++++- net/unix/garbage.c | 73 ++++++++++++++++++++- net/unix/scm.c | 149 ------------------------------------------ net/unix/scm.h | 10 --- 8 files changed, 137 insertions(+), 174 deletions(-) delete mode 100644 net/unix/scm.c delete mode 100644 net/unix/scm.h diff --git a/include/net/af_unix.h b/include/net/af_unix.h index be488d627531..91d2036fc182 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -17,19 +17,20 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) } #endif +extern spinlock_t unix_gc_lock; +extern unsigned int unix_tot_inflight; + void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); -void unix_destruct_scm(struct sk_buff *skb); void unix_gc(void); void wait_for_unix_gc(struct scm_fp_list *fpl); + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) #define UNIX_HASH_SIZE (256 * 2) #define UNIX_HASH_BITS 8 -extern unsigned int unix_tot_inflight; - struct unix_address { refcount_t refcnt; int len; diff --git a/net/Makefile b/net/Makefile index 0914bea9c335..103cd8d61f68 100644 --- a/net/Makefile +++ b/net/Makefile @@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ obj-$(CONFIG_INET) += ipv4/ obj-$(CONFIG_TLS) += tls/ obj-$(CONFIG_XFRM) += xfrm/ -obj-$(CONFIG_UNIX_SCM) += unix/ +obj-$(CONFIG_UNIX) += unix/ obj-y += ipv6/ obj-$(CONFIG_BPFILTER) += bpfilter/ obj-$(CONFIG_PACKET) += packet/ diff --git a/net/unix/Kconfig b/net/unix/Kconfig index 28b232f281ab..8b5d04210d7c 100644 --- a/net/unix/Kconfig +++ b/net/unix/Kconfig @@ -16,11 +16,6 @@ config UNIX Say Y unless you know what you are doing. -config UNIX_SCM - bool - depends on UNIX - default y - config AF_UNIX_OOB bool depends on UNIX diff --git a/net/unix/Makefile b/net/unix/Makefile index 20491825b4d0..4ddd125c4642 100644 --- a/net/unix/Makefile +++ b/net/unix/Makefile @@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o obj-$(CONFIG_UNIX_DIAG) += unix_diag.o unix_diag-y := diag.o - -obj-$(CONFIG_UNIX_SCM) += scm.o diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f74f7878b3fe..7bcc4c526274 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -116,8 +116,6 @@ #include #include -#include "scm.h" - static atomic_long_t unix_nr_socks; static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; @@ -1726,6 +1724,52 @@ out: return err; } +/* The "user->unix_inflight" variable is protected by the garbage + * collection lock, and we just read it locklessly here. If you go + * over the limit, there might be a tiny race in actually noticing + * it across threads. Tough. + */ +static inline bool too_many_unix_fds(struct task_struct *p) +{ + struct user_struct *user = current_user(); + + if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) + return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); + return false; +} + +static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + + /* Need to duplicate file references for the sake of garbage + * collection. Otherwise a socket in the fps might become a + * candidate for GC while the skb is not yet queued. + */ + UNIXCB(skb).fp = scm_fp_dup(scm->fp); + if (!UNIXCB(skb).fp) + return -ENOMEM; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->user, scm->fp->fp[i]); + + return 0; +} + +static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +{ + int i; + + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + for (i = scm->fp->count - 1; i >= 0; i--) + unix_notinflight(scm->fp->user, scm->fp->fp[i]); +} + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); @@ -1773,6 +1817,21 @@ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) spin_unlock(&unix_gc_lock); } +static void unix_destruct_scm(struct sk_buff *skb) +{ + struct scm_cookie scm; + + memset(&scm, 0, sizeof(scm)); + scm.pid = UNIXCB(skb).pid; + if (UNIXCB(skb).fp) + unix_detach_fds(&scm, skb); + + /* Alas, it calls VFS */ + /* So fscking what? fput() had been SMP-safe since the last Summer */ + scm_destroy(&scm); + sock_wfree(skb); +} + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) { int err = 0; diff --git a/net/unix/garbage.c b/net/unix/garbage.c index c04f82489abb..0104be9d4704 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -81,11 +81,80 @@ #include #include -#include "scm.h" +struct unix_sock *unix_get_socket(struct file *filp) +{ + struct inode *inode = file_inode(filp); -/* Internal data structures and random procedures: */ + /* Socket ? */ + if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { + struct socket *sock = SOCKET_I(inode); + const struct proto_ops *ops; + struct sock *sk = sock->sk; + ops = READ_ONCE(sock->ops); + + /* PF_UNIX ? */ + if (sk && ops && ops->family == PF_UNIX) + return unix_sk(sk); + } + + return NULL; +} + +DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); +static LIST_HEAD(gc_inflight_list); + +/* Keep the number of times in flight count for the file + * descriptor if it is for an AF_UNIX socket. + */ +void unix_inflight(struct user_struct *user, struct file *filp) +{ + struct unix_sock *u = unix_get_socket(filp); + + spin_lock(&unix_gc_lock); + + if (u) { + if (!u->inflight) { + WARN_ON_ONCE(!list_empty(&u->link)); + list_add_tail(&u->link, &gc_inflight_list); + } else { + WARN_ON_ONCE(list_empty(&u->link)); + } + u->inflight++; + + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); + } + + WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); + + spin_unlock(&unix_gc_lock); +} + +void unix_notinflight(struct user_struct *user, struct file *filp) +{ + struct unix_sock *u = unix_get_socket(filp); + + spin_lock(&unix_gc_lock); + + if (u) { + WARN_ON_ONCE(!u->inflight); + WARN_ON_ONCE(list_empty(&u->link)); + + u->inflight--; + if (!u->inflight) + list_del_init(&u->link); + + /* Paired with READ_ONCE() in wait_for_unix_gc() */ + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); + } + + WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); + + spin_unlock(&unix_gc_lock); +} static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), struct sk_buff_head *hitlist) diff --git a/net/unix/scm.c b/net/unix/scm.c deleted file mode 100644 index 6f446dd2deed..000000000000 --- a/net/unix/scm.c +++ /dev/null @@ -1,149 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "scm.h" - -unsigned int unix_tot_inflight; -EXPORT_SYMBOL(unix_tot_inflight); - -LIST_HEAD(gc_inflight_list); -EXPORT_SYMBOL(gc_inflight_list); - -DEFINE_SPINLOCK(unix_gc_lock); -EXPORT_SYMBOL(unix_gc_lock); - -struct unix_sock *unix_get_socket(struct file *filp) -{ - struct inode *inode = file_inode(filp); - - /* Socket ? */ - if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { - struct socket *sock = SOCKET_I(inode); - struct sock *s = sock->sk; - - /* PF_UNIX ? */ - if (s && sock->ops && sock->ops->family == PF_UNIX) - return unix_sk(s); - } - - return NULL; -} -EXPORT_SYMBOL(unix_get_socket); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *fp) -{ - struct unix_sock *u = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (u) { - if (!u->inflight) { - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - WARN_ON_ONCE(list_empty(&u->link)); - } - u->inflight++; - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *fp) -{ - struct unix_sock *u = unix_get_socket(fp); - - spin_lock(&unix_gc_lock); - - if (u) { - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); - } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); -} - -/* - * The "user->unix_inflight" variable is protected by the garbage - * collection lock, and we just read it locklessly here. If you go - * over the limit, there might be a tiny race in actually noticing - * it across threads. Tough. - */ -static inline bool too_many_unix_fds(struct task_struct *p) -{ - struct user_struct *user = current_user(); - - if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) - return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); - return false; -} - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - if (too_many_unix_fds(current)) - return -ETOOMANYREFS; - - /* - * Need to duplicate file references for the sake of garbage - * collection. Otherwise a socket in the fps might become a - * candidate for GC while the skb is not yet queued. - */ - UNIXCB(skb).fp = scm_fp_dup(scm->fp); - if (!UNIXCB(skb).fp) - return -ENOMEM; - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - return 0; -} -EXPORT_SYMBOL(unix_attach_fds); - -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) -{ - int i; - - scm->fp = UNIXCB(skb).fp; - UNIXCB(skb).fp = NULL; - - for (i = scm->fp->count-1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); -} -EXPORT_SYMBOL(unix_detach_fds); - -void unix_destruct_scm(struct sk_buff *skb) -{ - struct scm_cookie scm; - - memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; - if (UNIXCB(skb).fp) - unix_detach_fds(&scm, skb); - - /* Alas, it calls VFS */ - /* So fscking what? fput() had been SMP-safe since the last Summer */ - scm_destroy(&scm); - sock_wfree(skb); -} -EXPORT_SYMBOL(unix_destruct_scm); diff --git a/net/unix/scm.h b/net/unix/scm.h deleted file mode 100644 index 5a255a477f16..000000000000 --- a/net/unix/scm.h +++ /dev/null @@ -1,10 +0,0 @@ -#ifndef NET_UNIX_SCM_H -#define NET_UNIX_SCM_H - -extern struct list_head gc_inflight_list; -extern spinlock_t unix_gc_lock; - -int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); -void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); - -#endif From 1002e86c46968e5567174e5f14292d351fec2e90 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:07 +0100 Subject: [PATCH 484/529] af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd. commit 1fbfdfaa590248c1d86407f578e40e5c65136330 upstream. We will replace the garbage collection algorithm for AF_UNIX, where we will consider each inflight AF_UNIX socket as a vertex and its file descriptor as an edge in a directed graph. This patch introduces a new struct unix_vertex representing a vertex in the graph and adds its pointer to struct unix_sock. When we send a fd using the SCM_RIGHTS message, we allocate struct scm_fp_list to struct scm_cookie in scm_fp_copy(). Then, we bump each refcount of the inflight fds' struct file and save them in scm_fp_list.fp. After that, unix_attach_fds() inexplicably clones scm_fp_list of scm_cookie and sets it to skb. (We will remove this part after replacing GC.) Here, we add a new function call in unix_attach_fds() to preallocate struct unix_vertex per inflight AF_UNIX fd and link each vertex to skb's scm_fp_list.vertices. When sendmsg() succeeds later, if the socket of the inflight fd is still not inflight yet, we will set the preallocated vertex to struct unix_sock.vertex and link it to a global list unix_unvisited_vertices under spin_lock(&unix_gc_lock). If the socket is already inflight, we free the preallocated vertex. This is to avoid taking the lock unnecessarily when sendmsg() could fail later. In the following patch, we will similarly allocate another struct per edge, which will finally be linked to the inflight socket's unix_vertex.edges. And then, we will count the number of edges as unix_vertex.out_degree. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-2-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 9 +++++++++ include/net/scm.h | 3 +++ net/core/scm.c | 7 +++++++ net/unix/af_unix.c | 6 ++++++ net/unix/garbage.c | 38 ++++++++++++++++++++++++++++++++++++++ 5 files changed, 63 insertions(+) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 91d2036fc182..b41aff1ac688 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -22,9 +22,17 @@ extern unsigned int unix_tot_inflight; void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); +int unix_prepare_fpl(struct scm_fp_list *fpl); +void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); void wait_for_unix_gc(struct scm_fp_list *fpl); +struct unix_vertex { + struct list_head edges; + struct list_head entry; + unsigned long out_degree; +}; + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) @@ -62,6 +70,7 @@ struct unix_sock { struct path path; struct mutex iolock, bindlock; struct sock *peer; + struct unix_vertex *vertex; struct list_head link; unsigned long inflight; spinlock_t lock; diff --git a/include/net/scm.h b/include/net/scm.h index a5c26008fcec..4183495d1981 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -25,6 +25,9 @@ struct scm_fp_list { short count; short count_unix; short max; +#ifdef CONFIG_UNIX + struct list_head vertices; +#endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; }; diff --git a/net/core/scm.c b/net/core/scm.c index bb25052624ee..09bacb3d36f2 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -89,6 +89,9 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->count_unix = 0; fpl->max = SCM_MAX_FD; fpl->user = NULL; +#if IS_ENABLED(CONFIG_UNIX) + INIT_LIST_HEAD(&fpl->vertices); +#endif } fpp = &fpl->fp[fpl->count]; @@ -372,8 +375,12 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) if (new_fpl) { for (i = 0; i < fpl->count; i++) get_file(fpl->fp[i]); + new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); +#if IS_ENABLED(CONFIG_UNIX) + INIT_LIST_HEAD(&new_fpl->vertices); +#endif } return new_fpl; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7bcc4c526274..0d3ba0d210c0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -955,6 +955,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->inflight = 0; + u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); @@ -1756,6 +1757,9 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) for (i = scm->fp->count - 1; i >= 0; i--) unix_inflight(scm->fp->user, scm->fp->fp[i]); + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + return 0; } @@ -1766,6 +1770,8 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; + unix_destroy_fpl(scm->fp); + for (i = scm->fp->count - 1; i >= 0; i--) unix_notinflight(scm->fp->user, scm->fp->fp[i]); } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 0104be9d4704..8ea7640e032e 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,6 +101,44 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } +static void unix_free_vertices(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex, *next_vertex; + + list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { + list_del(&vertex->entry); + kfree(vertex); + } +} + +int unix_prepare_fpl(struct scm_fp_list *fpl) +{ + struct unix_vertex *vertex; + int i; + + if (!fpl->count_unix) + return 0; + + for (i = 0; i < fpl->count_unix; i++) { + vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); + if (!vertex) + goto err; + + list_add(&vertex->entry, &fpl->vertices); + } + + return 0; + +err: + unix_free_vertices(fpl); + return -ENOMEM; +} + +void unix_destroy_fpl(struct scm_fp_list *fpl) +{ + unix_free_vertices(fpl); +} + DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); From 6b7a036eaa3e0a79f2442f4c5affbc4fa1145845 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:08 +0100 Subject: [PATCH 485/529] af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd. commit 29b64e354029cfcf1eea4d91b146c7b769305930 upstream. As with the previous patch, we preallocate to skb's scm_fp_list an array of struct unix_edge in the number of inflight AF_UNIX fds. There we just preallocate memory and do not use immediately because sendmsg() could fail after this point. The actual use will be in the next patch. When we queue skb with inflight edges, we will set the inflight socket's unix_sock as unix_edge->predecessor and the receiver's unix_sock as successor, and then we will link the edge to the inflight socket's unix_vertex.edges. Note that we set NULL to cloned scm_fp_list.edges in scm_fp_dup() so that MSG_PEEK does not change the shape of the directed graph. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 6 ++++++ include/net/scm.h | 5 +++++ net/core/scm.c | 2 ++ net/unix/garbage.c | 6 ++++++ 4 files changed, 19 insertions(+) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b41aff1ac688..279087595966 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -33,6 +33,12 @@ struct unix_vertex { unsigned long out_degree; }; +struct unix_edge { + struct unix_sock *predecessor; + struct unix_sock *successor; + struct list_head vertex_entry; +}; + struct sock *unix_peer_get(struct sock *sk); #define UNIX_HASH_MOD (256 - 1) diff --git a/include/net/scm.h b/include/net/scm.h index 4183495d1981..19d7d802ed6c 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -21,12 +21,17 @@ struct scm_creds { kgid_t gid; }; +#ifdef CONFIG_UNIX +struct unix_edge; +#endif + struct scm_fp_list { short count; short count_unix; short max; #ifdef CONFIG_UNIX struct list_head vertices; + struct unix_edge *edges; #endif struct user_struct *user; struct file *fp[SCM_MAX_FD]; diff --git a/net/core/scm.c b/net/core/scm.c index 09bacb3d36f2..4c343729f960 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->max = SCM_MAX_FD; fpl->user = NULL; #if IS_ENABLED(CONFIG_UNIX) + fpl->edges = NULL; INIT_LIST_HEAD(&fpl->vertices); #endif } @@ -379,6 +380,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); #if IS_ENABLED(CONFIG_UNIX) + new_fpl->edges = NULL; INIT_LIST_HEAD(&new_fpl->vertices); #endif } diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8ea7640e032e..912b7945692c 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -127,6 +127,11 @@ int unix_prepare_fpl(struct scm_fp_list *fpl) list_add(&vertex->entry, &fpl->vertices); } + fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), + GFP_KERNEL_ACCOUNT); + if (!fpl->edges) + goto err; + return 0; err: @@ -136,6 +141,7 @@ err: void unix_destroy_fpl(struct scm_fp_list *fpl) { + kvfree(fpl->edges); unix_free_vertices(fpl); } From f8194e511c04d0449137b29d03ad2ec156ca3d92 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:09 +0100 Subject: [PATCH 486/529] af_unix: Link struct unix_edge when queuing skb. commit 42f298c06b30bfe0a8cbee5d38644e618699e26e upstream. Just before queuing skb with inflight fds, we call scm_stat_add(), which is a good place to set up the preallocated struct unix_vertex and struct unix_edge in UNIXCB(skb).fp. Then, we call unix_add_edges() and construct the directed graph as follows: 1. Set the inflight socket's unix_sock to unix_edge.predecessor. 2. Set the receiver's unix_sock to unix_edge.successor. 3. Set the preallocated vertex to inflight socket's unix_sock.vertex. 4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices. 5. Link unix_edge.vertex_entry to the inflight socket's unix_vertex.edges. Let's say we pass the fd of AF_UNIX socket A to B and the fd of B to C. The graph looks like this: +-------------------------+ | unix_unvisited_vertices | <-------------------------. +-------------------------+ | + | | +--------------+ +--------------+ | +--------------+ | | unix_sock A | <---. .---> | unix_sock B | <-|-. .---> | unix_sock C | | +--------------+ | | +--------------+ | | | +--------------+ | .-+ | vertex | | | .-+ | vertex | | | | | vertex | | | +--------------+ | | | +--------------+ | | | +--------------+ | | | | | | | | | | +--------------+ | | | +--------------+ | | | | '-> | unix_vertex | | | '-> | unix_vertex | | | | | +--------------+ | | +--------------+ | | | `---> | entry | +---------> | entry | +-' | | |--------------| | | |--------------| | | | edges | <-. | | | edges | <-. | | +--------------+ | | | +--------------+ | | | | | | | | | .----------------------' | | .----------------------' | | | | | | | | | +--------------+ | | | +--------------+ | | | | unix_edge | | | | | unix_edge | | | | +--------------+ | | | +--------------+ | | `-> | vertex_entry | | | `-> | vertex_entry | | | |--------------| | | |--------------| | | | predecessor | +---' | | predecessor | +---' | |--------------| | |--------------| | | successor | +-----' | successor | +-----' +--------------+ +--------------+ Henceforth, we denote such a graph as A -> B (-> C). Now, we can express all inflight fd graphs that do not contain embryo sockets. We will support the particular case later. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-4-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 2 + include/net/scm.h | 1 + net/core/scm.c | 2 + net/unix/af_unix.c | 8 +++- net/unix/garbage.c | 90 ++++++++++++++++++++++++++++++++++++++++++- 5 files changed, 100 insertions(+), 3 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 279087595966..08cc90348043 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -22,6 +22,8 @@ extern unsigned int unix_tot_inflight; void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); +void unix_del_edges(struct scm_fp_list *fpl); int unix_prepare_fpl(struct scm_fp_list *fpl); void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); diff --git a/include/net/scm.h b/include/net/scm.h index 19d7d802ed6c..19789096424d 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -30,6 +30,7 @@ struct scm_fp_list { short count_unix; short max; #ifdef CONFIG_UNIX + bool inflight; struct list_head vertices; struct unix_edge *edges; #endif diff --git a/net/core/scm.c b/net/core/scm.c index 4c343729f960..1ff78bd4ee83 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->max = SCM_MAX_FD; fpl->user = NULL; #if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; fpl->edges = NULL; INIT_LIST_HEAD(&fpl->vertices); #endif @@ -380,6 +381,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl) new_fpl->max = new_fpl->count; new_fpl->user = get_uid(fpl->user); #if IS_ENABLED(CONFIG_UNIX) + new_fpl->inflight = false; new_fpl->edges = NULL; INIT_LIST_HEAD(&new_fpl->vertices); #endif diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0d3ba0d210c0..658a1680a92e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1910,8 +1910,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_add(fp->count, &u->scm_stat.nr_fds); + unix_add_edges(fp, u); + } } static void scm_stat_del(struct sock *sk, struct sk_buff *skb) @@ -1919,8 +1921,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) struct scm_fp_list *fp = UNIXCB(skb).fp; struct unix_sock *u = unix_sk(sk); - if (unlikely(fp && fp->count)) + if (unlikely(fp && fp->count)) { atomic_sub(fp->count, &u->scm_stat.nr_fds); + unix_del_edges(fp); + } } /* diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 912b7945692c..b5b4a200dbf3 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,6 +101,38 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } +static LIST_HEAD(unix_unvisited_vertices); + +static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; + } + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); +} + +static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) +{ + struct unix_vertex *vertex = edge->predecessor->vertex; + + list_del(&edge->vertex_entry); + vertex->out_degree--; + + if (!vertex->out_degree) { + edge->predecessor->vertex = NULL; + list_move_tail(&vertex->entry, &fpl->vertices); + } +} + static void unix_free_vertices(struct scm_fp_list *fpl) { struct unix_vertex *vertex, *next_vertex; @@ -111,6 +143,60 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } } +DEFINE_SPINLOCK(unix_gc_lock); + +void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) +{ + int i = 0, j = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); + struct unix_edge *edge; + + if (!inflight) + continue; + + edge = fpl->edges + i++; + edge->predecessor = inflight; + edge->successor = receiver; + + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + +out: + spin_unlock(&unix_gc_lock); + + fpl->inflight = true; + + unix_free_vertices(fpl); +} + +void unix_del_edges(struct scm_fp_list *fpl) +{ + int i = 0; + + spin_lock(&unix_gc_lock); + + if (!fpl->count_unix) + goto out; + + do { + struct unix_edge *edge = fpl->edges + i++; + + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + +out: + spin_unlock(&unix_gc_lock); + + fpl->inflight = false; +} + int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; @@ -141,11 +227,13 @@ err: void unix_destroy_fpl(struct scm_fp_list *fpl) { + if (fpl->inflight) + unix_del_edges(fpl); + kvfree(fpl->edges); unix_free_vertices(fpl); } -DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); static LIST_HEAD(gc_inflight_list); From 5593cb9b1c3512d3eafaacac1a5ffdf58aca191c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:10 +0100 Subject: [PATCH 487/529] af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb. commit 22c3c0c52d32f41cc38cd936ea0c93f22ced3315 upstream. Currently, we track the number of inflight sockets in two variables. unix_tot_inflight is the total number of inflight AF_UNIX sockets on the host, and user->unix_inflight is the number of inflight fds per user. We update them one by one in unix_inflight(), which can be done once in batch. Also, sendmsg() could fail even after unix_inflight(), then we need to acquire unix_gc_lock only to decrement the counters. Let's bulk update the counters in unix_add_edges() and unix_del_edges(), which is called only for successfully passed fds. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-5-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index b5b4a200dbf3..f7041fc23000 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -144,6 +144,7 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } DEFINE_SPINLOCK(unix_gc_lock); +unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) { @@ -168,7 +169,10 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) unix_add_edge(fpl, edge); } while (i < fpl->count_unix); + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); + spin_unlock(&unix_gc_lock); fpl->inflight = true; @@ -191,7 +195,10 @@ void unix_del_edges(struct scm_fp_list *fpl) unix_del_edge(fpl, edge); } while (i < fpl->count_unix); + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); + spin_unlock(&unix_gc_lock); fpl->inflight = false; @@ -234,7 +241,6 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } -unsigned int unix_tot_inflight; static LIST_HEAD(gc_candidates); static LIST_HEAD(gc_inflight_list); @@ -255,13 +261,8 @@ void unix_inflight(struct user_struct *user, struct file *filp) WARN_ON_ONCE(list_empty(&u->link)); } u->inflight++; - - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); } - WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); - spin_unlock(&unix_gc_lock); } @@ -278,13 +279,8 @@ void unix_notinflight(struct user_struct *user, struct file *filp) u->inflight--; if (!u->inflight) list_del_init(&u->link); - - /* Paired with READ_ONCE() in wait_for_unix_gc() */ - WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); } - WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); - spin_unlock(&unix_gc_lock); } From 878df6a0690a50591b13335b421940bd946c998c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:11 +0100 Subject: [PATCH 488/529] af_unix: Iterate all vertices by DFS. commit 6ba76fd2848e107594ea4f03b737230f74bc23ea upstream. The new GC will use a depth first search graph algorithm to find cyclic references. The algorithm visits every vertex exactly once. Here, we implement the DFS part without recursion so that no one can abuse it. unix_walk_scc() marks every vertex unvisited by initialising index as UNIX_VERTEX_INDEX_UNVISITED and iterates inflight vertices in unix_unvisited_vertices and call __unix_walk_scc() to start DFS from an arbitrary vertex. __unix_walk_scc() iterates all edges starting from the vertex and explores the neighbour vertices with DFS using edge_stack. After visiting all neighbours, __unix_walk_scc() moves the visited vertex to unix_visited_vertices so that unix_walk_scc() will not restart DFS from the visited vertex. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-6-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 2 ++ net/unix/garbage.c | 74 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 76 insertions(+) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 08cc90348043..9d51d675cc9f 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -33,12 +33,14 @@ struct unix_vertex { struct list_head edges; struct list_head entry; unsigned long out_degree; + unsigned long index; }; struct unix_edge { struct unix_sock *predecessor; struct unix_sock *successor; struct list_head vertex_entry; + struct list_head stack_entry; }; struct sock *unix_peer_get(struct sock *sk); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index f7041fc23000..295dd1a7b8e0 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -103,6 +103,11 @@ struct unix_sock *unix_get_socket(struct file *filp) static LIST_HEAD(unix_unvisited_vertices); +enum unix_vertex_index { + UNIX_VERTEX_INDEX_UNVISITED, + UNIX_VERTEX_INDEX_START, +}; + static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; @@ -241,6 +246,73 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } +static LIST_HEAD(unix_visited_vertices); + +static void __unix_walk_scc(struct unix_vertex *vertex) +{ + unsigned long index = UNIX_VERTEX_INDEX_START; + struct unix_edge *edge; + LIST_HEAD(edge_stack); + +next_vertex: + vertex->index = index; + index++; + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = edge->successor->vertex; + + if (!next_vertex) + continue; + + if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) { + /* Iterative deepening depth first search + * + * 1. Push a forward edge to edge_stack and set + * the successor to vertex for the next iteration. + */ + list_add(&edge->stack_entry, &edge_stack); + + vertex = next_vertex; + goto next_vertex; + + /* 2. Pop the edge directed to the current vertex + * and restore the ancestor for backtracking. + */ +prev_vertex: + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); + list_del_init(&edge->stack_entry); + + vertex = edge->predecessor->vertex; + } + } + + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&vertex->entry, &unix_visited_vertices); + + /* Need backtracking ? */ + if (!list_empty(&edge_stack)) + goto prev_vertex; +} + +static void unix_walk_scc(void) +{ + struct unix_vertex *vertex; + + list_for_each_entry(vertex, &unix_unvisited_vertices, entry) + vertex->index = UNIX_VERTEX_INDEX_UNVISITED; + + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. + */ + while (!list_empty(&unix_unvisited_vertices)) { + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + __unix_walk_scc(vertex); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); +} + static LIST_HEAD(gc_candidates); static LIST_HEAD(gc_inflight_list); @@ -388,6 +460,8 @@ static void __unix_gc(struct work_struct *work) spin_lock(&unix_gc_lock); + unix_walk_scc(); + /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones * which don't have any external reference. From a6c7ce40eac1a895e60aa5a4875458b88cecf297 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:12 +0100 Subject: [PATCH 489/529] af_unix: Detect Strongly Connected Components. commit 3484f063172dd88776b062046d721d7c2ae1af7c upstream. In the new GC, we use a simple graph algorithm, Tarjan's Strongly Connected Components (SCC) algorithm, to find cyclic references. The algorithm visits every vertex exactly once using depth-first search (DFS). DFS starts by pushing an input vertex to a stack and assigning it a unique number. Two fields, index and lowlink, are initialised with the number, but lowlink could be updated later during DFS. If a vertex has an edge to an unvisited inflight vertex, we visit it and do the same processing. So, we will have vertices in the stack in the order they appear and number them consecutively in the same order. If a vertex has a back-edge to a visited vertex in the stack, we update the predecessor's lowlink with the successor's index. After iterating edges from the vertex, we check if its index equals its lowlink. If the lowlink is different from the index, it shows there was a back-edge. Then, we go backtracking and propagate the lowlink to its predecessor and resume the previous edge iteration from the next edge. If the lowlink is the same as the index, we pop vertices before and including the vertex from the stack. Then, the set of vertices is SCC, possibly forming a cycle. At the same time, we move the vertices to unix_visited_vertices. When we finish the algorithm, all vertices in each SCC will be linked via unix_vertex.scc_entry. Let's take an example. We have a graph including five inflight vertices (F is not inflight): A -> B -> C -> D -> E (-> F) ^ | `---------' Suppose that we start DFS from C. We will visit C, D, and B first and initialise their index and lowlink. Then, the stack looks like this: > B = (3, 3) (index, lowlink) D = (2, 2) C = (1, 1) When checking B's edge to C, we update B's lowlink with C's index and propagate it to D. B = (3, 1) (index, lowlink) > D = (2, 1) C = (1, 1) Next, we visit E, which has no edge to an inflight vertex. > E = (4, 4) (index, lowlink) B = (3, 1) D = (2, 1) C = (1, 1) When we leave from E, its index and lowlink are the same, so we pop E from the stack as single-vertex SCC. Next, we leave from B and D but do nothing because their lowlink are different from their index. B = (3, 1) (index, lowlink) D = (2, 1) > C = (1, 1) Then, we leave from C, whose index and lowlink are the same, so we pop B, D and C as SCC. Last, we do DFS for the rest of vertices, A, which is also a single-vertex SCC. Finally, each unix_vertex.scc_entry is linked as follows: A -. B -> C -> D E -. ^ | ^ | ^ | `--' `---------' `--' We use SCC later to decide whether we can garbage-collect the sockets. Note that we still cannot detect SCC properly if an edge points to an embryo socket. The following two patches will sort it out. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-7-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 3 +++ net/unix/garbage.c | 46 +++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 9d51d675cc9f..e6f7bba19152 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -32,8 +32,11 @@ void wait_for_unix_gc(struct scm_fp_list *fpl); struct unix_vertex { struct list_head edges; struct list_head entry; + struct list_head scc_entry; unsigned long out_degree; unsigned long index; + unsigned long lowlink; + bool on_stack; }; struct unix_edge { diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 295dd1a7b8e0..cdeff548e130 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -251,11 +251,19 @@ static LIST_HEAD(unix_visited_vertices); static void __unix_walk_scc(struct unix_vertex *vertex) { unsigned long index = UNIX_VERTEX_INDEX_START; + LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); next_vertex: + /* Push vertex to vertex_stack. + * The vertex will be popped when finalising SCC later. + */ + vertex->on_stack = true; + list_add(&vertex->scc_entry, &vertex_stack); + vertex->index = index; + vertex->lowlink = index; index++; /* Explore neighbour vertices (receivers of the current vertex's fd). */ @@ -283,12 +291,46 @@ prev_vertex: edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); list_del_init(&edge->stack_entry); + next_vertex = vertex; vertex = edge->predecessor->vertex; + + /* If the successor has a smaller lowlink, two vertices + * are in the same SCC, so propagate the smaller lowlink + * to skip SCC finalisation. + */ + vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); + } else if (next_vertex->on_stack) { + /* Loop detected by a back/cross edge. + * + * The successor is on vertex_stack, so two vertices are + * in the same SCC. If the successor has a smaller index, + * propagate it to skip SCC finalisation. + */ + vertex->lowlink = min(vertex->lowlink, next_vertex->index); + } else { + /* The successor was already grouped as another SCC */ } } - /* Don't restart DFS from this vertex in unix_walk_scc(). */ - list_move_tail(&vertex->entry, &unix_visited_vertices); + if (vertex->index == vertex->lowlink) { + struct list_head scc; + + /* SCC finalised. + * + * If the lowlink was not updated, all the vertices above on + * vertex_stack are in the same SCC. Group them using scc_entry. + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) { + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&vertex->entry, &unix_visited_vertices); + + vertex->on_stack = false; + } + + list_del(&scc); + } /* Need backtracking ? */ if (!list_empty(&edge_stack)) From 12365b707cd54264a847f8d3a8304a15912deb28 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:13 +0100 Subject: [PATCH 490/529] af_unix: Save listener for embryo socket. commit aed6ecef55d70de3762ce41c561b7f547dbaf107 upstream. This is a prep patch for the following change, where we need to fetch the listening socket from the successor embryo socket during GC. We add a new field to struct unix_sock to save a pointer to a listening socket. We set it when connect() creates a new socket, and clear it when accept() is called. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-8-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 5 ++++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index e6f7bba19152..624fea657518 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -83,6 +83,7 @@ struct unix_sock { struct path path; struct mutex iolock, bindlock; struct sock *peer; + struct sock *listener; struct unix_vertex *vertex; struct list_head link; unsigned long inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 658a1680a92e..6075ecbe40b2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -954,6 +954,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); + u->listener = NULL; u->inflight = 0; u->vertex = NULL; u->path.dentry = NULL; @@ -1558,6 +1559,7 @@ restart: newsk->sk_type = sk->sk_type; init_peercred(newsk); newu = unix_sk(newsk); + newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); otheru = unix_sk(other); @@ -1651,8 +1653,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { struct sock *sk = sock->sk; - struct sock *tsk; struct sk_buff *skb; + struct sock *tsk; int err; err = -EOPNOTSUPP; @@ -1677,6 +1679,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; + unix_sk(tsk)->listener = NULL; skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); From edfa4872d0ec0c16f133281bf83a5fb5e9c80dcd Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:14 +0100 Subject: [PATCH 491/529] af_unix: Fix up unix_edge.successor for embryo socket. commit dcf70df2048d27c5d186f013f101a4aefd63aa41 upstream. To garbage collect inflight AF_UNIX sockets, we must define the cyclic reference appropriately. This is a bit tricky if the loop consists of embryo sockets. Suppose that the fd of AF_UNIX socket A is passed to D and the fd B to C and that C and D are embryo sockets of A and B, respectively. It may appear that there are two separate graphs, A (-> D) and B (-> C), but this is not correct. A --. .-- B X C <-' `-> D Now, D holds A's refcount, and C has B's refcount, so unix_release() will never be called for A and B when we close() them. However, no one can call close() for D and C to free skbs holding refcounts of A and B because C/D is in A/B's receive queue, which should have been purged by unix_release() for A and B. So, here's another type of cyclic reference. When a fd of an AF_UNIX socket is passed to an embryo socket, the reference is indirectly held by its parent listening socket. .-> A .-> B | `- sk_receive_queue | `- sk_receive_queue | `- skb | `- skb | `- sk == C | `- sk == D | `- sk_receive_queue | `- sk_receive_queue | `- skb +---------' `- skb +-. | | `---------------------------------------------------------' Technically, the graph must be denoted as A <-> B instead of A (-> D) and B (-> C) to find such a cyclic reference without touching each socket's receive queue. .-> A --. .-- B <-. | X | == A <-> B `-- C <-' `-> D --' We apply this fixup during GC by fetching the real successor by unix_edge_successor(). When we call accept(), we clear unix_sock.listener under unix_gc_lock not to confuse GC. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-9-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 2 +- net/unix/garbage.c | 20 +++++++++++++++++++- 3 files changed, 21 insertions(+), 2 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 624fea657518..d7f589e14467 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -24,6 +24,7 @@ void unix_inflight(struct user_struct *user, struct file *fp); void unix_notinflight(struct user_struct *user, struct file *fp); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); +void unix_update_edges(struct unix_sock *receiver); int unix_prepare_fpl(struct scm_fp_list *fpl); void unix_destroy_fpl(struct scm_fp_list *fpl); void unix_gc(void); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 6075ecbe40b2..4d8b2b2b9a70 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1679,7 +1679,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; - unix_sk(tsk)->listener = NULL; + unix_update_edges(unix_sk(tsk)); skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index cdeff548e130..6ff7e0b5c544 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -101,6 +101,17 @@ struct unix_sock *unix_get_socket(struct file *filp) return NULL; } +static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) +{ + /* If an embryo socket has a fd, + * the listener indirectly holds the fd's refcnt. + */ + if (edge->successor->listener) + return unix_sk(edge->successor->listener)->vertex; + + return edge->successor->vertex; +} + static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { @@ -209,6 +220,13 @@ out: fpl->inflight = false; } +void unix_update_edges(struct unix_sock *receiver) +{ + spin_lock(&unix_gc_lock); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); +} + int unix_prepare_fpl(struct scm_fp_list *fpl) { struct unix_vertex *vertex; @@ -268,7 +286,7 @@ next_vertex: /* Explore neighbour vertices (receivers of the current vertex's fd). */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { - struct unix_vertex *next_vertex = edge->successor->vertex; + struct unix_vertex *next_vertex = unix_edge_successor(edge); if (!next_vertex) continue; From ccbe3d2aca11548b0d52b10f561b989d5691f5a3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:15 +0100 Subject: [PATCH 492/529] af_unix: Save O(n) setup of Tarjan's algo. commit ba31b4a4e1018f5844c6eb31734976e2184f2f9a upstream. Before starting Tarjan's algorithm, we need to mark all vertices as unvisited. We can save this O(n) setup by reserving two special indices (0, 1) and using two variables. The first time we link a vertex to unix_unvisited_vertices, we set unix_vertex_unvisited_index to index. During DFS, we can see that the index of unvisited vertices is the same as unix_vertex_unvisited_index. When we finalise SCC later, we set unix_vertex_grouped_index to each vertex's index. Then, we can know (i) that the vertex is on the stack if the index of a visited vertex is >= 2 and (ii) that it is not on the stack and belongs to a different SCC if the index is unix_vertex_grouped_index. After the whole algorithm, all indices of vertices are set as unix_vertex_grouped_index. Next time we start DFS, we know that all unvisited vertices have unix_vertex_grouped_index, and we can use unix_vertex_unvisited_index as the not-on-stack marker. To use the same variable in __unix_walk_scc(), we can swap unix_vertex_(grouped|unvisited)_index at the end of Tarjan's algorithm. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-10-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 1 - net/unix/garbage.c | 26 +++++++++++++++----------- 2 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index d7f589e14467..ffbc7322e41b 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -37,7 +37,6 @@ struct unix_vertex { unsigned long out_degree; unsigned long index; unsigned long lowlink; - bool on_stack; }; struct unix_edge { diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6ff7e0b5c544..feae6c17b291 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -115,16 +115,20 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { - UNIX_VERTEX_INDEX_UNVISITED, + UNIX_VERTEX_INDEX_MARK1, + UNIX_VERTEX_INDEX_MARK2, UNIX_VERTEX_INDEX_START, }; +static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; + static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; if (!vertex) { vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); + vertex->index = unix_vertex_unvisited_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); @@ -265,6 +269,7 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) } static LIST_HEAD(unix_visited_vertices); +static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; static void __unix_walk_scc(struct unix_vertex *vertex) { @@ -274,10 +279,10 @@ static void __unix_walk_scc(struct unix_vertex *vertex) LIST_HEAD(edge_stack); next_vertex: - /* Push vertex to vertex_stack. + /* Push vertex to vertex_stack and mark it as on-stack + * (index >= UNIX_VERTEX_INDEX_START). * The vertex will be popped when finalising SCC later. */ - vertex->on_stack = true; list_add(&vertex->scc_entry, &vertex_stack); vertex->index = index; @@ -291,7 +296,7 @@ next_vertex: if (!next_vertex) continue; - if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) { + if (next_vertex->index == unix_vertex_unvisited_index) { /* Iterative deepening depth first search * * 1. Push a forward edge to edge_stack and set @@ -317,7 +322,7 @@ prev_vertex: * to skip SCC finalisation. */ vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); - } else if (next_vertex->on_stack) { + } else if (next_vertex->index != unix_vertex_grouped_index) { /* Loop detected by a back/cross edge. * * The successor is on vertex_stack, so two vertices are @@ -344,7 +349,8 @@ prev_vertex: /* Don't restart DFS from this vertex in unix_walk_scc(). */ list_move_tail(&vertex->entry, &unix_visited_vertices); - vertex->on_stack = false; + /* Mark vertex as off-stack. */ + vertex->index = unix_vertex_grouped_index; } list_del(&scc); @@ -357,20 +363,18 @@ prev_vertex: static void unix_walk_scc(void) { - struct unix_vertex *vertex; - - list_for_each_entry(vertex, &unix_unvisited_vertices, entry) - vertex->index = UNIX_VERTEX_INDEX_UNVISITED; - /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); __unix_walk_scc(vertex); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); } static LIST_HEAD(gc_candidates); From b5b54a38755fc7135e66c4c051f976180e4540b3 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:16 +0100 Subject: [PATCH 493/529] af_unix: Skip GC if no cycle exists. commit 77e5593aebba823bcbcf2c4b58b07efcd63933b8 upstream. We do not need to run GC if there is no possible cyclic reference. We use unix_graph_maybe_cyclic to decide if we should run GC. If a fd of an AF_UNIX socket is passed to an already inflight AF_UNIX socket, they could form a cyclic reference. Then, we set true to unix_graph_maybe_cyclic and later run Tarjan's algorithm to group them into SCC. Once we run Tarjan's algorithm, we are 100% sure whether cyclic references exist or not. If there is no cycle, we set false to unix_graph_maybe_cyclic and can skip the entire garbage collection next time. When finalising SCC, we set true to unix_graph_maybe_cyclic if SCC consists of multiple vertices. Even if SCC is a single vertex, a cycle might exist as self-fd passing. Given the corner case is rare, we detect it by checking all edges of the vertex and set true to unix_graph_maybe_cyclic. With this change, __unix_gc() is just a spin_lock() dance in the normal usage. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-11-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 48 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index feae6c17b291..8f0dc39bb72f 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -112,6 +112,19 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) return edge->successor->vertex; } +static bool unix_graph_maybe_cyclic; + +static void unix_update_graph(struct unix_vertex *vertex) +{ + /* If the receiver socket is not inflight, no cyclic + * reference could be formed. + */ + if (!vertex) + return; + + unix_graph_maybe_cyclic = true; +} + static LIST_HEAD(unix_unvisited_vertices); enum unix_vertex_index { @@ -138,12 +151,16 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) vertex->out_degree++; list_add_tail(&edge->vertex_entry, &vertex->edges); + + unix_update_graph(unix_edge_successor(edge)); } static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; + unix_update_graph(unix_edge_successor(edge)); + list_del(&edge->vertex_entry); vertex->out_degree--; @@ -227,6 +244,7 @@ out: void unix_update_edges(struct unix_sock *receiver) { spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); receiver->listener = NULL; spin_unlock(&unix_gc_lock); } @@ -268,6 +286,26 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } +static bool unix_scc_cyclic(struct list_head *scc) +{ + struct unix_vertex *vertex; + struct unix_edge *edge; + + /* SCC containing multiple vertices ? */ + if (!list_is_singular(scc)) + return true; + + vertex = list_first_entry(scc, typeof(*vertex), scc_entry); + + /* Self-reference or a embryo-listener circle ? */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + if (unix_edge_successor(edge) == vertex) + return true; + } + + return false; +} + static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; @@ -353,6 +391,9 @@ prev_vertex: vertex->index = unix_vertex_grouped_index; } + if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + list_del(&scc); } @@ -363,6 +404,8 @@ prev_vertex: static void unix_walk_scc(void) { + unix_graph_maybe_cyclic = false; + /* Visit every vertex exactly once. * __unix_walk_scc() moves visited vertices to unix_visited_vertices. */ @@ -524,6 +567,9 @@ static void __unix_gc(struct work_struct *work) spin_lock(&unix_gc_lock); + if (!unix_graph_maybe_cyclic) + goto skip_gc; + unix_walk_scc(); /* First, select candidates for garbage collection. Only @@ -633,7 +679,7 @@ static void __unix_gc(struct work_struct *work) /* All candidates should have been detached by now. */ WARN_ON_ONCE(!list_empty(&gc_candidates)); - +skip_gc: /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); From adfb68b39b39767d6bfb53e48c4f19c183765686 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:17 +0100 Subject: [PATCH 494/529] af_unix: Avoid Tarjan's algorithm if unnecessary. commit ad081928a8b0f57f269df999a28087fce6f2b6ce upstream. Once a cyclic reference is formed, we need to run GC to check if there is dead SCC. However, we do not need to run Tarjan's algorithm if we know that the shape of the inflight graph has not been changed. If an edge is added/updated/deleted and the edge's successor is inflight, we set false to unix_graph_grouped, which means we need to re-classify SCC. Once we finalise SCC, we set true to unix_graph_grouped. While unix_graph_grouped is true, we can iterate the grouped SCC using vertex->scc_entry in unix_walk_scc_fast(). list_add() and list_for_each_entry_reverse() uses seem weird, but they are to keep the vertex order consistent and make writing test easier. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-12-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 8f0dc39bb72f..d25841ab2de4 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -113,6 +113,7 @@ static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) } static bool unix_graph_maybe_cyclic; +static bool unix_graph_grouped; static void unix_update_graph(struct unix_vertex *vertex) { @@ -123,6 +124,7 @@ static void unix_update_graph(struct unix_vertex *vertex) return; unix_graph_maybe_cyclic = true; + unix_graph_grouped = false; } static LIST_HEAD(unix_unvisited_vertices); @@ -144,6 +146,7 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) vertex->index = unix_vertex_unvisited_index; vertex->out_degree = 0; INIT_LIST_HEAD(&vertex->edges); + INIT_LIST_HEAD(&vertex->scc_entry); list_move_tail(&vertex->entry, &unix_unvisited_vertices); edge->predecessor->vertex = vertex; @@ -418,6 +421,26 @@ static void unix_walk_scc(void) list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); + + unix_graph_grouped = true; +} + +static void unix_walk_scc_fast(void) +{ + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + list_add(&scc, &vertex->scc_entry); + + list_for_each_entry_reverse(vertex, &scc, scc_entry) + list_move_tail(&vertex->entry, &unix_visited_vertices); + + list_del(&scc); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } static LIST_HEAD(gc_candidates); @@ -570,7 +593,10 @@ static void __unix_gc(struct work_struct *work) if (!unix_graph_maybe_cyclic) goto skip_gc; - unix_walk_scc(); + if (unix_graph_grouped) + unix_walk_scc_fast(); + else + unix_walk_scc(); /* First, select candidates for garbage collection. Only * in-flight sockets are considered, and from those only ones From 61f3d2706c3161f83c35d85ac8ee87789a83fbf2 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:18 +0100 Subject: [PATCH 495/529] af_unix: Assign a unique index to SCC. commit bfdb01283ee8f2f3089656c3ff8f62bb072dabb2 upstream. The definition of the lowlink in Tarjan's algorithm is the smallest index of a vertex that is reachable with at most one back-edge in SCC. This is not useful for a cross-edge. If we start traversing from A in the following graph, the final lowlink of D is 3. The cross-edge here is one between D and C. A -> B -> D D = (4, 3) (index, lowlink) ^ | | C = (3, 1) | V | B = (2, 1) `--- C <--' A = (1, 1) This is because the lowlink of D is updated with the index of C. In the following patch, we detect a dead SCC by checking two conditions for each vertex. 1) vertex has no edge directed to another SCC (no bridge) 2) vertex's out_degree is the same as the refcount of its file If 1) is false, there is a receiver of all fds of the SCC and its ancestor SCC. To evaluate 1), we need to assign a unique index to each SCC and assign it to all vertices in the SCC. This patch changes the lowlink update logic for cross-edge so that in the example above, the lowlink of D is updated with the lowlink of C. A -> B -> D D = (4, 1) (index, lowlink) ^ | | C = (3, 1) | V | B = (2, 1) `--- C <--' A = (1, 1) Then, all vertices in the same SCC have the same lowlink, and we can quickly find the bridge connecting to different SCC if exists. However, it is no longer called lowlink, so we rename it to scc_index. (It's sometimes called lowpoint.) Also, we add a global variable to hold the last index used in DFS so that we do not reset the initial index in each DFS. This patch can be squashed to the SCC detection patch but is split deliberately for anyone wondering why lowlink is not used as used in the original Tarjan's algorithm and many reference implementations. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-13-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 2 +- net/unix/garbage.c | 29 +++++++++++++++-------------- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index ffbc7322e41b..14d56b07a54d 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -36,7 +36,7 @@ struct unix_vertex { struct list_head scc_entry; unsigned long out_degree; unsigned long index; - unsigned long lowlink; + unsigned long scc_index; }; struct unix_edge { diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d25841ab2de4..2e66b57f3f0f 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -312,9 +312,8 @@ static bool unix_scc_cyclic(struct list_head *scc) static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; -static void __unix_walk_scc(struct unix_vertex *vertex) +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index) { - unsigned long index = UNIX_VERTEX_INDEX_START; LIST_HEAD(vertex_stack); struct unix_edge *edge; LIST_HEAD(edge_stack); @@ -326,9 +325,9 @@ next_vertex: */ list_add(&vertex->scc_entry, &vertex_stack); - vertex->index = index; - vertex->lowlink = index; - index++; + vertex->index = *last_index; + vertex->scc_index = *last_index; + (*last_index)++; /* Explore neighbour vertices (receivers of the current vertex's fd). */ list_for_each_entry(edge, &vertex->edges, vertex_entry) { @@ -358,30 +357,30 @@ prev_vertex: next_vertex = vertex; vertex = edge->predecessor->vertex; - /* If the successor has a smaller lowlink, two vertices - * are in the same SCC, so propagate the smaller lowlink + /* If the successor has a smaller scc_index, two vertices + * are in the same SCC, so propagate the smaller scc_index * to skip SCC finalisation. */ - vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else if (next_vertex->index != unix_vertex_grouped_index) { /* Loop detected by a back/cross edge. * - * The successor is on vertex_stack, so two vertices are - * in the same SCC. If the successor has a smaller index, + * The successor is on vertex_stack, so two vertices are in + * the same SCC. If the successor has a smaller *scc_index*, * propagate it to skip SCC finalisation. */ - vertex->lowlink = min(vertex->lowlink, next_vertex->index); + vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); } else { /* The successor was already grouped as another SCC */ } } - if (vertex->index == vertex->lowlink) { + if (vertex->index == vertex->scc_index) { struct list_head scc; /* SCC finalised. * - * If the lowlink was not updated, all the vertices above on + * If the scc_index was not updated, all the vertices above on * vertex_stack are in the same SCC. Group them using scc_entry. */ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); @@ -407,6 +406,8 @@ prev_vertex: static void unix_walk_scc(void) { + unsigned long last_index = UNIX_VERTEX_INDEX_START; + unix_graph_maybe_cyclic = false; /* Visit every vertex exactly once. @@ -416,7 +417,7 @@ static void unix_walk_scc(void) struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); - __unix_walk_scc(vertex); + __unix_walk_scc(vertex, &last_index); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); From 9734d332ef5431f9641d8bd97dbcf457b5f8bd8f Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:19 +0100 Subject: [PATCH 496/529] af_unix: Detect dead SCC. commit a15702d8b3aad8ce5268c565bd29f0e02fd2db83 upstream. When iterating SCC, we call unix_vertex_dead() for each vertex to check if the vertex is close()d and has no bridge to another SCC. If both conditions are true for every vertex in SCC, we can execute garbage collection for all skb in the SCC. The actual garbage collection is done in the following patch, replacing the old implementation. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-14-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 44 +++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 2e66b57f3f0f..1f53c25fc71b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -289,6 +289,39 @@ void unix_destroy_fpl(struct scm_fp_list *fpl) unix_free_vertices(fpl); } +static bool unix_vertex_dead(struct unix_vertex *vertex) +{ + struct unix_edge *edge; + struct unix_sock *u; + long total_ref; + + list_for_each_entry(edge, &vertex->edges, vertex_entry) { + struct unix_vertex *next_vertex = unix_edge_successor(edge); + + /* The vertex's fd can be received by a non-inflight socket. */ + if (!next_vertex) + return false; + + /* The vertex's fd can be received by an inflight socket in + * another SCC. + */ + if (next_vertex->scc_index != vertex->scc_index) + return false; + } + + /* No receiver exists out of the same SCC. */ + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + total_ref = file_count(u->sk.sk_socket->file); + + /* If not close()d, total_ref > out_degree. */ + if (total_ref != vertex->out_degree) + return false; + + return true; +} + static bool unix_scc_cyclic(struct list_head *scc) { struct unix_vertex *vertex; @@ -377,6 +410,7 @@ prev_vertex: if (vertex->index == vertex->scc_index) { struct list_head scc; + bool scc_dead = true; /* SCC finalised. * @@ -391,6 +425,9 @@ prev_vertex: /* Mark vertex as off-stack. */ vertex->index = unix_vertex_grouped_index; + + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); } if (!unix_graph_maybe_cyclic) @@ -431,13 +468,18 @@ static void unix_walk_scc_fast(void) while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; struct list_head scc; + bool scc_dead = true; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); list_add(&scc, &vertex->scc_entry); - list_for_each_entry_reverse(vertex, &scc, scc_entry) + list_for_each_entry_reverse(vertex, &scc, scc_entry) { list_move_tail(&vertex->entry, &unix_visited_vertices); + if (scc_dead) + scc_dead = unix_vertex_dead(vertex); + } + list_del(&scc); } From 5dfd283f4651d04dbb70ceb9ae5c4a30eda3c52a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:20 +0100 Subject: [PATCH 497/529] af_unix: Replace garbage collection algorithm. commit 4090fa373f0e763c43610853d2774b5979915959 upstream. If we find a dead SCC during iteration, we call unix_collect_skb() to splice all skb in the SCC to the global sk_buff_head, hitlist. After iterating all SCC, we unlock unix_gc_lock and purge the queue. Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 8 -- net/unix/af_unix.c | 12 -- net/unix/garbage.c | 318 +++++++++--------------------------------- 3 files changed, 64 insertions(+), 274 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 14d56b07a54d..47808e366731 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -19,9 +19,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) extern spinlock_t unix_gc_lock; extern unsigned int unix_tot_inflight; - -void unix_inflight(struct user_struct *user, struct file *fp); -void unix_notinflight(struct user_struct *user, struct file *fp); void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); void unix_update_edges(struct unix_sock *receiver); @@ -85,12 +82,7 @@ struct unix_sock { struct sock *peer; struct sock *listener; struct unix_vertex *vertex; - struct list_head link; - unsigned long inflight; spinlock_t lock; - unsigned long gc_flags; -#define UNIX_GC_CANDIDATE 0 -#define UNIX_GC_MAYBE_CYCLE 1 struct socket_wq peer_wq; wait_queue_entry_t peer_wake; struct scm_stat scm_stat; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4d8b2b2b9a70..25f66adf47d1 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -955,12 +955,10 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sk->sk_destruct = unix_sock_destructor; u = unix_sk(sk); u->listener = NULL; - u->inflight = 0; u->vertex = NULL; u->path.dentry = NULL; u->path.mnt = NULL; spin_lock_init(&u->lock); - INIT_LIST_HEAD(&u->link); mutex_init(&u->iolock); /* single task reading lock */ mutex_init(&u->bindlock); /* single task binding lock */ init_waitqueue_head(&u->peer_wait); @@ -1744,8 +1742,6 @@ static inline bool too_many_unix_fds(struct task_struct *p) static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - if (too_many_unix_fds(current)) return -ETOOMANYREFS; @@ -1757,9 +1753,6 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) if (!UNIXCB(skb).fp) return -ENOMEM; - for (i = scm->fp->count - 1; i >= 0; i--) - unix_inflight(scm->fp->user, scm->fp->fp[i]); - if (unix_prepare_fpl(UNIXCB(skb).fp)) return -ENOMEM; @@ -1768,15 +1761,10 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) { - int i; - scm->fp = UNIXCB(skb).fp; UNIXCB(skb).fp = NULL; unix_destroy_fpl(scm->fp); - - for (i = scm->fp->count - 1; i >= 0; i--) - unix_notinflight(scm->fp->user, scm->fp->fp[i]); } static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 1f53c25fc71b..89ea71d9297b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -322,6 +322,52 @@ static bool unix_vertex_dead(struct unix_vertex *vertex) return true; } +enum unix_recv_queue_lock_class { + U_RECVQ_LOCK_NORMAL, + U_RECVQ_LOCK_EMBRYO, +}; + +static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) +{ + struct unix_vertex *vertex; + + list_for_each_entry_reverse(vertex, scc, scc_entry) { + struct sk_buff_head *queue; + struct unix_edge *edge; + struct unix_sock *u; + + edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); + u = edge->predecessor; + queue = &u->sk.sk_receive_queue; + + spin_lock(&queue->lock); + + if (u->sk.sk_state == TCP_LISTEN) { + struct sk_buff *skb; + + skb_queue_walk(queue, skb) { + struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; + + /* listener -> embryo order, the inversion never happens. */ + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); + skb_queue_splice_init(embryo_queue, hitlist); + spin_unlock(&embryo_queue->lock); + } + } else { + skb_queue_splice_init(queue, hitlist); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif + } + + spin_unlock(&queue->lock); + } +} + static bool unix_scc_cyclic(struct list_head *scc) { struct unix_vertex *vertex; @@ -345,7 +391,8 @@ static bool unix_scc_cyclic(struct list_head *scc) static LIST_HEAD(unix_visited_vertices); static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; -static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index) +static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, + struct sk_buff_head *hitlist) { LIST_HEAD(vertex_stack); struct unix_edge *edge; @@ -430,7 +477,9 @@ prev_vertex: scc_dead = unix_vertex_dead(vertex); } - if (!unix_graph_maybe_cyclic) + if (scc_dead) + unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); list_del(&scc); @@ -441,7 +490,7 @@ prev_vertex: goto prev_vertex; } -static void unix_walk_scc(void) +static void unix_walk_scc(struct sk_buff_head *hitlist) { unsigned long last_index = UNIX_VERTEX_INDEX_START; @@ -454,7 +503,7 @@ static void unix_walk_scc(void) struct unix_vertex *vertex; vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); - __unix_walk_scc(vertex, &last_index); + __unix_walk_scc(vertex, &last_index, hitlist); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); @@ -463,7 +512,7 @@ static void unix_walk_scc(void) unix_graph_grouped = true; } -static void unix_walk_scc_fast(void) +static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; @@ -480,279 +529,40 @@ static void unix_walk_scc_fast(void) scc_dead = unix_vertex_dead(vertex); } + if (scc_dead) + unix_collect_skb(&scc, hitlist); + list_del(&scc); } list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } -static LIST_HEAD(gc_candidates); -static LIST_HEAD(gc_inflight_list); - -/* Keep the number of times in flight count for the file - * descriptor if it is for an AF_UNIX socket. - */ -void unix_inflight(struct user_struct *user, struct file *filp) -{ - struct unix_sock *u = unix_get_socket(filp); - - spin_lock(&unix_gc_lock); - - if (u) { - if (!u->inflight) { - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &gc_inflight_list); - } else { - WARN_ON_ONCE(list_empty(&u->link)); - } - u->inflight++; - } - - spin_unlock(&unix_gc_lock); -} - -void unix_notinflight(struct user_struct *user, struct file *filp) -{ - struct unix_sock *u = unix_get_socket(filp); - - spin_lock(&unix_gc_lock); - - if (u) { - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(list_empty(&u->link)); - - u->inflight--; - if (!u->inflight) - list_del_init(&u->link); - } - - spin_unlock(&unix_gc_lock); -} - -static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) -{ - struct sk_buff *skb; - struct sk_buff *next; - - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - /* Do we have file descriptors ? */ - if (UNIXCB(skb).fp) { - bool hit = false; - /* Process the descriptors of this socket */ - int nfd = UNIXCB(skb).fp->count; - struct file **fp = UNIXCB(skb).fp->fp; - - while (nfd--) { - /* Get the socket the fd matches if it indeed does so */ - struct unix_sock *u = unix_get_socket(*fp++); - - /* Ignore non-candidates, they could have been added - * to the queues after starting the garbage collection - */ - if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { - hit = true; - - func(u); - } - } - if (hit && hitlist != NULL) { - __skb_unlink(skb, &x->sk_receive_queue); - __skb_queue_tail(hitlist, skb); - } - } - } - spin_unlock(&x->sk_receive_queue.lock); -} - -static void scan_children(struct sock *x, void (*func)(struct unix_sock *), - struct sk_buff_head *hitlist) -{ - if (x->sk_state != TCP_LISTEN) { - scan_inflight(x, func, hitlist); - } else { - struct sk_buff *skb; - struct sk_buff *next; - struct unix_sock *u; - LIST_HEAD(embryos); - - /* For a listening socket collect the queued embryos - * and perform a scan on them as well. - */ - spin_lock(&x->sk_receive_queue.lock); - skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { - u = unix_sk(skb->sk); - - /* An embryo cannot be in-flight, so it's safe - * to use the list link. - */ - WARN_ON_ONCE(!list_empty(&u->link)); - list_add_tail(&u->link, &embryos); - } - spin_unlock(&x->sk_receive_queue.lock); - - while (!list_empty(&embryos)) { - u = list_entry(embryos.next, struct unix_sock, link); - scan_inflight(&u->sk, func, hitlist); - list_del_init(&u->link); - } - } -} - -static void dec_inflight(struct unix_sock *usk) -{ - usk->inflight--; -} - -static void inc_inflight(struct unix_sock *usk) -{ - usk->inflight++; -} - -static void inc_inflight_move_tail(struct unix_sock *u) -{ - u->inflight++; - - /* If this still might be part of a cycle, move it to the end - * of the list, so that it's checked even if it was already - * passed over - */ - if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) - list_move_tail(&u->link, &gc_candidates); -} - static bool gc_in_progress; static void __unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; - struct unix_sock *u, *next; - LIST_HEAD(not_cycle_list); - struct list_head cursor; spin_lock(&unix_gc_lock); - if (!unix_graph_maybe_cyclic) + if (!unix_graph_maybe_cyclic) { + spin_unlock(&unix_gc_lock); goto skip_gc; + } + + __skb_queue_head_init(&hitlist); if (unix_graph_grouped) - unix_walk_scc_fast(); + unix_walk_scc_fast(&hitlist); else - unix_walk_scc(); - - /* First, select candidates for garbage collection. Only - * in-flight sockets are considered, and from those only ones - * which don't have any external reference. - * - * Holding unix_gc_lock will protect these candidates from - * being detached, and hence from gaining an external - * reference. Since there are no possible receivers, all - * buffers currently on the candidates' queues stay there - * during the garbage collection. - * - * We also know that no new candidate can be added onto the - * receive queues. Other, non candidate sockets _can_ be - * added to queue, so we must make sure only to touch - * candidates. - * - * Embryos, though never candidates themselves, affect which - * candidates are reachable by the garbage collector. Before - * being added to a listener's queue, an embryo may already - * receive data carrying SCM_RIGHTS, potentially making the - * passed socket a candidate that is not yet reachable by the - * collector. It becomes reachable once the embryo is - * enqueued. Therefore, we must ensure that no SCM-laden - * embryo appears in a (candidate) listener's queue between - * consecutive scan_children() calls. - */ - list_for_each_entry_safe(u, next, &gc_inflight_list, link) { - struct sock *sk = &u->sk; - long total_refs; - - total_refs = file_count(sk->sk_socket->file); - - WARN_ON_ONCE(!u->inflight); - WARN_ON_ONCE(total_refs < u->inflight); - if (total_refs == u->inflight) { - list_move_tail(&u->link, &gc_candidates); - __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - - if (sk->sk_state == TCP_LISTEN) { - unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); - unix_state_unlock(sk); - } - } - } - - /* Now remove all internal in-flight reference to children of - * the candidates. - */ - list_for_each_entry(u, &gc_candidates, link) - scan_children(&u->sk, dec_inflight, NULL); - - /* Restore the references for children of all candidates, - * which have remaining references. Do this recursively, so - * only those remain, which form cyclic references. - * - * Use a "cursor" link, to make the list traversal safe, even - * though elements might be moved about. - */ - list_add(&cursor, &gc_candidates); - while (cursor.next != &gc_candidates) { - u = list_entry(cursor.next, struct unix_sock, link); - - /* Move cursor to after the current position. */ - list_move(&cursor, &u->link); - - if (u->inflight) { - list_move_tail(&u->link, ¬_cycle_list); - __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); - scan_children(&u->sk, inc_inflight_move_tail, NULL); - } - } - list_del(&cursor); - - /* Now gc_candidates contains only garbage. Restore original - * inflight counters for these as well, and remove the skbuffs - * which are creating the cycle(s). - */ - skb_queue_head_init(&hitlist); - list_for_each_entry(u, &gc_candidates, link) { - scan_children(&u->sk, inc_inflight, &hitlist); - -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; - } -#endif - } - - /* not_cycle_list contains those sockets which do not make up a - * cycle. Restore these to the inflight list. - */ - while (!list_empty(¬_cycle_list)) { - u = list_entry(not_cycle_list.next, struct unix_sock, link); - __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); - list_move_tail(&u->link, &gc_inflight_list); - } + unix_walk_scc(&hitlist); spin_unlock(&unix_gc_lock); - /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); - - spin_lock(&unix_gc_lock); - - /* All candidates should have been detached by now. */ - WARN_ON_ONCE(!list_empty(&gc_candidates)); skip_gc: - /* Paired with READ_ONCE() in wait_for_unix_gc(). */ WRITE_ONCE(gc_in_progress, false); - - spin_unlock(&unix_gc_lock); } static DECLARE_WORK(unix_gc_work, __unix_gc); From 61a75360dca93c945ef6bd757f8b8a96f39b77cb Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:21 +0100 Subject: [PATCH 498/529] af_unix: Remove lock dance in unix_peek_fds(). commit 118f457da9ed58a79e24b73c2ef0aa1987241f0e upstream. In the previous GC implementation, the shape of the inflight socket graph was not expected to change while GC was in progress. MSG_PEEK was tricky because it could install inflight fd silently and transform the graph. Let's say we peeked a fd, which was a listening socket, and accept()ed some embryo sockets from it. The garbage collection algorithm would have been confused because the set of sockets visited in scan_inflight() would change within the same GC invocation. That's why we placed spin_lock(&unix_gc_lock) and spin_unlock() in unix_peek_fds() with a fat comment. In the new GC implementation, we no longer garbage-collect the socket if it exists in another queue, that is, if it has a bridge to another SCC. Also, accept() will require the lock if it has edges. Thus, we need not do the complicated lock dance. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240401173125.92184-3-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 1 - net/unix/af_unix.c | 42 ------------------------------------------ net/unix/garbage.c | 2 +- 3 files changed, 1 insertion(+), 44 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 47808e366731..4c726df56c0b 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -17,7 +17,6 @@ static inline struct unix_sock *unix_get_socket(struct file *filp) } #endif -extern spinlock_t unix_gc_lock; extern unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); void unix_del_edges(struct scm_fp_list *fpl); diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 25f66adf47d1..ce5b74dfd8ae 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1770,48 +1770,6 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) { scm->fp = scm_fp_dup(UNIXCB(skb).fp); - - /* - * Garbage collection of unix sockets starts by selecting a set of - * candidate sockets which have reference only from being in flight - * (total_refs == inflight_refs). This condition is checked once during - * the candidate collection phase, and candidates are marked as such, so - * that non-candidates can later be ignored. While inflight_refs is - * protected by unix_gc_lock, total_refs (file count) is not, hence this - * is an instantaneous decision. - * - * Once a candidate, however, the socket must not be reinstalled into a - * file descriptor while the garbage collection is in progress. - * - * If the above conditions are met, then the directed graph of - * candidates (*) does not change while unix_gc_lock is held. - * - * Any operations that changes the file count through file descriptors - * (dup, close, sendmsg) does not change the graph since candidates are - * not installed in fds. - * - * Dequeing a candidate via recvmsg would install it into an fd, but - * that takes unix_gc_lock to decrement the inflight count, so it's - * serialized with garbage collection. - * - * MSG_PEEK is special in that it does not change the inflight count, - * yet does install the socket into an fd. The following lock/unlock - * pair is to ensure serialization with garbage collection. It must be - * done between incrementing the file count and installing the file into - * an fd. - * - * If garbage collection starts after the barrier provided by the - * lock/unlock, then it will see the elevated refcount and not mark this - * as a candidate. If a garbage collection is already in progress - * before the file count was incremented, then the lock/unlock pair will - * ensure that garbage collection is finished before progressing to - * installing the fd. - * - * (*) A -> B where B is on the queue of A or B is on the queue of C - * which is on the queue of listening socket A. - */ - spin_lock(&unix_gc_lock); - spin_unlock(&unix_gc_lock); } static void unix_destruct_scm(struct sk_buff *skb) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 89ea71d9297b..12a4ec27e0d4 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -183,7 +183,7 @@ static void unix_free_vertices(struct scm_fp_list *fpl) } } -DEFINE_SPINLOCK(unix_gc_lock); +static DEFINE_SPINLOCK(unix_gc_lock); unsigned int unix_tot_inflight; void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) From 951e454715c8024a7ffe4ce44ccbe4fc58502dab Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:22 +0100 Subject: [PATCH 499/529] af_unix: Try not to hold unix_gc_lock during accept(). commit fd86344823b521149bb31d91eba900ba3525efa6 upstream. Commit dcf70df2048d ("af_unix: Fix up unix_edge.successor for embryo socket.") added spin_lock(&unix_gc_lock) in accept() path, and it caused regression in a stress test as reported by kernel test robot. If the embryo socket is not part of the inflight graph, we need not hold the lock. To decide that in O(1) time and avoid the regression in the normal use case, 1. add a new stat unix_sk(sk)->scm_stat.nr_unix_fds 2. count the number of inflight AF_UNIX sockets in the receive queue under unix_state_lock() 3. move unix_update_edges() call under unix_state_lock() 4. avoid locking if nr_unix_fds is 0 in unix_update_edges() Reported-by: kernel test robot Closes: https://lore.kernel.org/oe-lkp/202404101427.92a08551-oliver.sang@intel.com Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240413021928.20946-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/af_unix.h | 1 + net/unix/af_unix.c | 2 +- net/unix/garbage.c | 20 ++++++++++++++++---- 3 files changed, 18 insertions(+), 5 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 4c726df56c0b..b1f82d74339e 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -67,6 +67,7 @@ struct unix_skb_parms { struct scm_stat { atomic_t nr_fds; + unsigned long nr_unix_fds; }; #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index ce5b74dfd8ae..79b783a70c87 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1677,12 +1677,12 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, } tsk = skb->sk; - unix_update_edges(unix_sk(tsk)); skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); /* attach accepted sock to socket */ unix_state_lock(tsk); + unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; unix_sock_inherit_flags(sock, newsock); sock_graft(tsk, newsock); diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 12a4ec27e0d4..95240a59808f 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -209,6 +209,7 @@ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) unix_add_edge(fpl, edge); } while (i < fpl->count_unix); + receiver->scm_stat.nr_unix_fds += fpl->count_unix; WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); @@ -222,6 +223,7 @@ out: void unix_del_edges(struct scm_fp_list *fpl) { + struct unix_sock *receiver; int i = 0; spin_lock(&unix_gc_lock); @@ -235,6 +237,8 @@ void unix_del_edges(struct scm_fp_list *fpl) unix_del_edge(fpl, edge); } while (i < fpl->count_unix); + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); @@ -246,10 +250,18 @@ out: void unix_update_edges(struct unix_sock *receiver) { - spin_lock(&unix_gc_lock); - unix_update_graph(unix_sk(receiver->listener)->vertex); - receiver->listener = NULL; - spin_unlock(&unix_gc_lock); + /* nr_unix_fds is only updated under unix_state_lock(). + * If it's 0 here, the embryo socket is not part of the + * inflight graph, and GC will not see it, so no lock needed. + */ + if (!receiver->scm_stat.nr_unix_fds) { + receiver->listener = NULL; + } else { + spin_lock(&unix_gc_lock); + unix_update_graph(unix_sk(receiver->listener)->vertex); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); + } } int unix_prepare_fpl(struct scm_fp_list *fpl) From f9977b176f512a83ccedcc2b8229b6590b374fe7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:23 +0100 Subject: [PATCH 500/529] af_unix: Don't access successor in unix_del_edges() during GC. commit 1af2dface5d286dd1f2f3405a0d6fa9f2c8fb998 upstream. syzbot reported use-after-free in unix_del_edges(). [0] What the repro does is basically repeat the following quickly. 1. pass a fd of an AF_UNIX socket to itself socketpair(AF_UNIX, SOCK_DGRAM, 0, [3, 4]) = 0 sendmsg(3, {..., msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, cmsg_data=[4]}], ...}, 0) = 0 2. pass other fds of AF_UNIX sockets to the socket above socketpair(AF_UNIX, SOCK_SEQPACKET, 0, [5, 6]) = 0 sendmsg(3, {..., msg_control=[{cmsg_len=48, cmsg_level=SOL_SOCKET, cmsg_type=SCM_RIGHTS, cmsg_data=[5, 6]}], ...}, 0) = 0 3. close all sockets Here, two skb are created, and every unix_edge->successor is the first socket. Then, __unix_gc() will garbage-collect the two skb: (a) free skb with self-referencing fd (b) free skb holding other sockets After (a), the self-referencing socket will be scheduled to be freed later by the delayed_fput() task. syzbot repeated the sequences above (1. ~ 3.) quickly and triggered the task concurrently while GC was running. So, at (b), the socket was already freed, and accessing it was illegal. unix_del_edges() accesses the receiver socket as edge->successor to optimise GC. However, we should not do it during GC. Garbage-collecting sockets does not change the shape of the rest of the graph, so we need not call unix_update_graph() to update unix_graph_grouped when we purge skb. However, if we clean up all loops in the unix_walk_scc_fast() path, unix_graph_maybe_cyclic remains unchanged (true), and __unix_gc() will call unix_walk_scc_fast() continuously even though there is no socket to garbage-collect. To keep that optimisation while fixing UAF, let's add the same updating logic of unix_graph_maybe_cyclic in unix_walk_scc_fast() as done in unix_walk_scc() and __unix_walk_scc(). Note that when unix_del_edges() is called from other places, the receiver socket is always alive: - sendmsg: the successor's sk_refcnt is bumped by sock_hold() unix_find_other() for SOCK_DGRAM, connect() for SOCK_STREAM - recvmsg: the successor is the receiver, and its fd is alive [0]: BUG: KASAN: slab-use-after-free in unix_edge_successor net/unix/garbage.c:109 [inline] BUG: KASAN: slab-use-after-free in unix_del_edge net/unix/garbage.c:165 [inline] BUG: KASAN: slab-use-after-free in unix_del_edges+0x148/0x630 net/unix/garbage.c:237 Read of size 8 at addr ffff888079c6e640 by task kworker/u8:6/1099 CPU: 0 PID: 1099 Comm: kworker/u8:6 Not tainted 6.9.0-rc4-next-20240418-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Workqueue: events_unbound __unix_gc Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 unix_edge_successor net/unix/garbage.c:109 [inline] unix_del_edge net/unix/garbage.c:165 [inline] unix_del_edges+0x148/0x630 net/unix/garbage.c:237 unix_destroy_fpl+0x59/0x210 net/unix/garbage.c:298 unix_detach_fds net/unix/af_unix.c:1811 [inline] unix_destruct_scm+0x13e/0x210 net/unix/af_unix.c:1826 skb_release_head_state+0x100/0x250 net/core/skbuff.c:1127 skb_release_all net/core/skbuff.c:1138 [inline] __kfree_skb net/core/skbuff.c:1154 [inline] kfree_skb_reason+0x16d/0x3b0 net/core/skbuff.c:1190 __skb_queue_purge_reason include/linux/skbuff.h:3251 [inline] __skb_queue_purge include/linux/skbuff.h:3256 [inline] __unix_gc+0x1732/0x1830 net/unix/garbage.c:575 process_one_work kernel/workqueue.c:3218 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299 worker_thread+0x86d/0xd70 kernel/workqueue.c:3380 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Allocated by task 14427: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 unpoison_slab_object mm/kasan/common.c:312 [inline] __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338 kasan_slab_alloc include/linux/kasan.h:201 [inline] slab_post_alloc_hook mm/slub.c:3897 [inline] slab_alloc_node mm/slub.c:3957 [inline] kmem_cache_alloc_noprof+0x135/0x290 mm/slub.c:3964 sk_prot_alloc+0x58/0x210 net/core/sock.c:2074 sk_alloc+0x38/0x370 net/core/sock.c:2133 unix_create1+0xb4/0x770 unix_create+0x14e/0x200 net/unix/af_unix.c:1034 __sock_create+0x490/0x920 net/socket.c:1571 sock_create net/socket.c:1622 [inline] __sys_socketpair+0x33e/0x720 net/socket.c:1773 __do_sys_socketpair net/socket.c:1822 [inline] __se_sys_socketpair net/socket.c:1819 [inline] __x64_sys_socketpair+0x9b/0xb0 net/socket.c:1819 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f Freed by task 1805: kasan_save_stack mm/kasan/common.c:47 [inline] kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579 poison_slab_object+0xe0/0x150 mm/kasan/common.c:240 __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256 kasan_slab_free include/linux/kasan.h:184 [inline] slab_free_hook mm/slub.c:2190 [inline] slab_free mm/slub.c:4393 [inline] kmem_cache_free+0x145/0x340 mm/slub.c:4468 sk_prot_free net/core/sock.c:2114 [inline] __sk_destruct+0x467/0x5f0 net/core/sock.c:2208 sock_put include/net/sock.h:1948 [inline] unix_release_sock+0xa8b/0xd20 net/unix/af_unix.c:665 unix_release+0x91/0xc0 net/unix/af_unix.c:1049 __sock_release net/socket.c:659 [inline] sock_close+0xbc/0x240 net/socket.c:1421 __fput+0x406/0x8b0 fs/file_table.c:422 delayed_fput+0x59/0x80 fs/file_table.c:445 process_one_work kernel/workqueue.c:3218 [inline] process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299 worker_thread+0x86d/0xd70 kernel/workqueue.c:3380 kthread+0x2f0/0x390 kernel/kthread.c:389 ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 The buggy address belongs to the object at ffff888079c6e000 which belongs to the cache UNIX of size 1920 The buggy address is located 1600 bytes inside of freed 1920-byte region [ffff888079c6e000, ffff888079c6e780) Reported-by: syzbot+f3f3eef1d2100200e593@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=f3f3eef1d2100200e593 Fixes: 77e5593aebba ("af_unix: Skip GC if no cycle exists.") Fixes: fd86344823b5 ("af_unix: Try not to hold unix_gc_lock during accept().") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20240419235102.31707-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 95240a59808f..d76450133e4f 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -158,11 +158,14 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) unix_update_graph(unix_edge_successor(edge)); } +static bool gc_in_progress; + static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; - unix_update_graph(unix_edge_successor(edge)); + if (!gc_in_progress) + unix_update_graph(unix_edge_successor(edge)); list_del(&edge->vertex_entry); vertex->out_degree--; @@ -237,8 +240,10 @@ void unix_del_edges(struct scm_fp_list *fpl) unix_del_edge(fpl, edge); } while (i < fpl->count_unix); - receiver = fpl->edges[0].successor; - receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + if (!gc_in_progress) { + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + } WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); out: WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); @@ -526,6 +531,8 @@ static void unix_walk_scc(struct sk_buff_head *hitlist) static void unix_walk_scc_fast(struct sk_buff_head *hitlist) { + unix_graph_maybe_cyclic = false; + while (!list_empty(&unix_unvisited_vertices)) { struct unix_vertex *vertex; struct list_head scc; @@ -543,6 +550,8 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist) if (scc_dead) unix_collect_skb(&scc, hitlist); + else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); list_del(&scc); } @@ -550,8 +559,6 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist) list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } -static bool gc_in_progress; - static void __unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; From 28201f38dc5f65cf7f5f54eceea5e7b12122535e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 21 May 2025 16:27:24 +0100 Subject: [PATCH 501/529] af_unix: Add dead flag to struct scm_fp_list. commit 7172dc93d621d5dc302d007e95ddd1311ec64283 upstream. Commit 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges() during GC.") fixed use-after-free by avoid accessing edge->successor while GC is in progress. However, there could be a small race window where another process could call unix_del_edges() while gc_in_progress is true and __skb_queue_purge() is on the way. So, we need another marker for struct scm_fp_list which indicates if the skb is garbage-collected. This patch adds dead flag in struct scm_fp_list and set it true before calling __skb_queue_purge(). Fixes: 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges() during GC.") Signed-off-by: Kuniyuki Iwashima Acked-by: Paolo Abeni Link: https://lore.kernel.org/r/20240508171150.50601-1-kuniyu@amazon.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- include/net/scm.h | 1 + net/core/scm.c | 1 + net/unix/garbage.c | 14 ++++++++++---- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/net/scm.h b/include/net/scm.h index 19789096424d..0be0dc3eb1dc 100644 --- a/include/net/scm.h +++ b/include/net/scm.h @@ -31,6 +31,7 @@ struct scm_fp_list { short max; #ifdef CONFIG_UNIX bool inflight; + bool dead; struct list_head vertices; struct unix_edge *edges; #endif diff --git a/net/core/scm.c b/net/core/scm.c index 1ff78bd4ee83..cdd4e5befb14 100644 --- a/net/core/scm.c +++ b/net/core/scm.c @@ -91,6 +91,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp) fpl->user = NULL; #if IS_ENABLED(CONFIG_UNIX) fpl->inflight = false; + fpl->dead = false; fpl->edges = NULL; INIT_LIST_HEAD(&fpl->vertices); #endif diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d76450133e4f..1f8b8cdfcdc8 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -158,13 +158,11 @@ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) unix_update_graph(unix_edge_successor(edge)); } -static bool gc_in_progress; - static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) { struct unix_vertex *vertex = edge->predecessor->vertex; - if (!gc_in_progress) + if (!fpl->dead) unix_update_graph(unix_edge_successor(edge)); list_del(&edge->vertex_entry); @@ -240,7 +238,7 @@ void unix_del_edges(struct scm_fp_list *fpl) unix_del_edge(fpl, edge); } while (i < fpl->count_unix); - if (!gc_in_progress) { + if (!fpl->dead) { receiver = fpl->edges[0].successor; receiver->scm_stat.nr_unix_fds -= fpl->count_unix; } @@ -559,9 +557,12 @@ static void unix_walk_scc_fast(struct sk_buff_head *hitlist) list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); } +static bool gc_in_progress; + static void __unix_gc(struct work_struct *work) { struct sk_buff_head hitlist; + struct sk_buff *skb; spin_lock(&unix_gc_lock); @@ -579,6 +580,11 @@ static void __unix_gc(struct work_struct *work) spin_unlock(&unix_gc_lock); + skb_queue_walk(&hitlist, skb) { + if (UNIXCB(skb).fp) + UNIXCB(skb).fp->dead = true; + } + __skb_queue_purge(&hitlist); skip_gc: WRITE_ONCE(gc_in_progress, false); From c7edc6e616fbff4102fed73ce4f9e285e57edf3a Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 21 May 2025 16:27:25 +0100 Subject: [PATCH 502/529] af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS commit 041933a1ec7b4173a8e638cae4f8e394331d7e54 upstream. GC attempts to explicitly drop oob_skb's reference before purging the hit list. The problem is with embryos: kfree_skb(u->oob_skb) is never called on an embryo socket. The python script below [0] sends a listener's fd to its embryo as OOB data. While GC does collect the embryo's queue, it fails to drop the OOB skb's refcount. The skb which was in embryo's receive queue stays as unix_sk(sk)->oob_skb and keeps the listener's refcount [1]. Tell GC to dispose embryo's oob_skb. [0]: from array import array from socket import * addr = '\x00unix-oob' lis = socket(AF_UNIX, SOCK_STREAM) lis.bind(addr) lis.listen(1) s = socket(AF_UNIX, SOCK_STREAM) s.connect(addr) scm = (SOL_SOCKET, SCM_RIGHTS, array('i', [lis.fileno()])) s.sendmsg([b'x'], [scm], MSG_OOB) lis.close() [1] $ grep unix-oob /proc/net/unix $ ./unix-oob.py $ grep unix-oob /proc/net/unix 0000000000000000: 00000002 00000000 00000000 0001 02 0 @unix-oob 0000000000000000: 00000002 00000000 00010000 0001 01 6072 @unix-oob Fixes: 4090fa373f0e ("af_unix: Replace garbage collection algorithm.") Signed-off-by: Michal Luczaj Reviewed-by: Kuniyuki Iwashima Signed-off-by: Paolo Abeni Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 1f8b8cdfcdc8..dfe94a90ece4 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -342,6 +342,18 @@ enum unix_recv_queue_lock_class { U_RECVQ_LOCK_EMBRYO, }; +static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist) +{ + skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + WARN_ON_ONCE(skb_unref(u->oob_skb)); + u->oob_skb = NULL; + } +#endif +} + static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) { struct unix_vertex *vertex; @@ -365,18 +377,11 @@ static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist /* listener -> embryo order, the inversion never happens. */ spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); - skb_queue_splice_init(embryo_queue, hitlist); + unix_collect_queue(unix_sk(skb->sk), hitlist); spin_unlock(&embryo_queue->lock); } } else { - skb_queue_splice_init(queue, hitlist); - -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; - } -#endif + unix_collect_queue(u, hitlist); } spin_unlock(&queue->lock); From 0297e7ddf55ba5892e9236ebef532f44e593ca3e Mon Sep 17 00:00:00 2001 From: Shigeru Yoshida Date: Wed, 21 May 2025 16:27:26 +0100 Subject: [PATCH 503/529] af_unix: Fix uninit-value in __unix_walk_scc() commit 927fa5b3e4f52e0967bfc859afc98ad1c523d2d5 upstream. KMSAN reported uninit-value access in __unix_walk_scc() [1]. In the list_for_each_entry_reverse() loop, when the vertex's index equals it's scc_index, the loop uses the variable vertex as a temporary variable that points to a vertex in scc. And when the loop is finished, the variable vertex points to the list head, in this case scc, which is a local variable on the stack (more precisely, it's not even scc and might underflow the call stack of __unix_walk_scc(): container_of(&scc, struct unix_vertex, scc_entry)). However, the variable vertex is used under the label prev_vertex. So if the edge_stack is not empty and the function jumps to the prev_vertex label, the function will access invalid data on the stack. This causes the uninit-value access issue. Fix this by introducing a new temporary variable for the loop. [1] BUG: KMSAN: uninit-value in __unix_walk_scc net/unix/garbage.c:478 [inline] BUG: KMSAN: uninit-value in unix_walk_scc net/unix/garbage.c:526 [inline] BUG: KMSAN: uninit-value in __unix_gc+0x2589/0x3c20 net/unix/garbage.c:584 __unix_walk_scc net/unix/garbage.c:478 [inline] unix_walk_scc net/unix/garbage.c:526 [inline] __unix_gc+0x2589/0x3c20 net/unix/garbage.c:584 process_one_work kernel/workqueue.c:3231 [inline] process_scheduled_works+0xade/0x1bf0 kernel/workqueue.c:3312 worker_thread+0xeb6/0x15b0 kernel/workqueue.c:3393 kthread+0x3c4/0x530 kernel/kthread.c:389 ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Uninit was stored to memory at: unix_walk_scc net/unix/garbage.c:526 [inline] __unix_gc+0x2adf/0x3c20 net/unix/garbage.c:584 process_one_work kernel/workqueue.c:3231 [inline] process_scheduled_works+0xade/0x1bf0 kernel/workqueue.c:3312 worker_thread+0xeb6/0x15b0 kernel/workqueue.c:3393 kthread+0x3c4/0x530 kernel/kthread.c:389 ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147 ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 Local variable entries created at: ref_tracker_free+0x48/0xf30 lib/ref_tracker.c:222 netdev_tracker_free include/linux/netdevice.h:4058 [inline] netdev_put include/linux/netdevice.h:4075 [inline] dev_put include/linux/netdevice.h:4101 [inline] update_gid_event_work_handler+0xaa/0x1b0 drivers/infiniband/core/roce_gid_mgmt.c:813 CPU: 1 PID: 12763 Comm: kworker/u8:31 Not tainted 6.10.0-rc4-00217-g35bb670d65fc #32 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 Workqueue: events_unbound __unix_gc Fixes: 3484f063172d ("af_unix: Detect Strongly Connected Components.") Reported-by: syzkaller Signed-off-by: Shigeru Yoshida Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20240702160428.10153-1-syoshida@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Lee Jones Signed-off-by: Greg Kroah-Hartman --- net/unix/garbage.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index dfe94a90ece4..23efb78fe9ef 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -476,6 +476,7 @@ prev_vertex: } if (vertex->index == vertex->scc_index) { + struct unix_vertex *v; struct list_head scc; bool scc_dead = true; @@ -486,15 +487,15 @@ prev_vertex: */ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); - list_for_each_entry_reverse(vertex, &scc, scc_entry) { + list_for_each_entry_reverse(v, &scc, scc_entry) { /* Don't restart DFS from this vertex in unix_walk_scc(). */ - list_move_tail(&vertex->entry, &unix_visited_vertices); + list_move_tail(&v->entry, &unix_visited_vertices); /* Mark vertex as off-stack. */ - vertex->index = unix_vertex_grouped_index; + v->index = unix_vertex_grouped_index; if (scc_dead) - scc_dead = unix_vertex_dead(vertex); + scc_dead = unix_vertex_dead(v); } if (scc_dead) From a5a9c42cd7d1013e245c2e3f6f6591d1c7ae063f Mon Sep 17 00:00:00 2001 From: Alok Tiwari Date: Wed, 14 May 2025 04:46:51 -0700 Subject: [PATCH 504/529] arm64: dts: qcom: sm8350: Fix typo in pil_camera_mem node commit 295217420a44403a33c30f99d8337fe7b07eb02b upstream. There is a typo in sm8350.dts where the node label mmeory@85200000 should be memory@85200000. This patch corrects the typo for clarity and consistency. Fixes: b7e8f433a673 ("arm64: dts: qcom: Add basic devicetree support for SM8350 SoC") Cc: stable@vger.kernel.org Signed-off-by: Alok Tiwari Link: https://lore.kernel.org/r/20250514114656.2307828-1-alok.a.tiwari@oracle.com Signed-off-by: Bjorn Andersson Signed-off-by: Greg Kroah-Hartman --- arch/arm64/boot/dts/qcom/sm8350.dtsi | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/boot/dts/qcom/sm8350.dtsi b/arch/arm64/boot/dts/qcom/sm8350.dtsi index 5a4972afc977..75292ec3ee77 100644 --- a/arch/arm64/boot/dts/qcom/sm8350.dtsi +++ b/arch/arm64/boot/dts/qcom/sm8350.dtsi @@ -421,7 +421,7 @@ no-map; }; - pil_camera_mem: mmeory@85200000 { + pil_camera_mem: memory@85200000 { reg = <0x0 0x85200000 0x0 0x500000>; no-map; }; From a0ec22fa20b252edbe070a9de8501eef63c17ef5 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Thu, 22 May 2025 15:14:47 -0300 Subject: [PATCH 505/529] net_sched: hfsc: Address reentrant enqueue adding class to eltree twice commit ac9fe7dd8e730a103ae4481147395cc73492d786 upstream. Savino says: "We are writing to report that this recent patch (141d34391abbb315d68556b7c67ad97885407547) [1] can be bypassed, and a UAF can still occur when HFSC is utilized with NETEM. The patch only checks the cl->cl_nactive field to determine whether it is the first insertion or not [2], but this field is only incremented by init_vf [3]. By using HFSC_RSC (which uses init_ed) [4], it is possible to bypass the check and insert the class twice in the eltree. Under normal conditions, this would lead to an infinite loop in hfsc_dequeue for the reasons we already explained in this report [5]. However, if TBF is added as root qdisc and it is configured with a very low rate, it can be utilized to prevent packets from being dequeued. This behavior can be exploited to perform subsequent insertions in the HFSC eltree and cause a UAF." To fix both the UAF and the infinite loop, with netem as an hfsc child, check explicitly in hfsc_enqueue whether the class is already in the eltree whenever the HFSC_RSC flag is set. [1] https://web.git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/?id=141d34391abbb315d68556b7c67ad97885407547 [2] https://elixir.bootlin.com/linux/v6.15-rc5/source/net/sched/sch_hfsc.c#L1572 [3] https://elixir.bootlin.com/linux/v6.15-rc5/source/net/sched/sch_hfsc.c#L677 [4] https://elixir.bootlin.com/linux/v6.15-rc5/source/net/sched/sch_hfsc.c#L1574 [5] https://lore.kernel.org/netdev/8DuRWwfqjoRDLDmBMlIfbrsZg9Gx50DHJc1ilxsEBNe2D6NMoigR_eIRIG0LOjMc3r10nUUZtArXx4oZBIdUfZQrwjcQhdinnMis_0G7VEk=@willsroot.io/T/#u Fixes: 37d9cf1a3ce3 ("sched: Fix detection of empty queues in child qdiscs") Reported-by: Savino Dicanosa Reported-by: William Liu Acked-by: Jamal Hadi Salim Tested-by: Victor Nogueira Signed-off-by: Pedro Tammela Link: https://patch.msgid.link/20250522181448.1439717-2-pctammela@mojatatu.com Signed-off-by: Paolo Abeni Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_hfsc.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index ec6ee4510013..61b91de8065f 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -176,6 +176,11 @@ struct hfsc_sched { #define HT_INFINITY 0xffffffffffffffffULL /* infinite time value */ +static bool cl_in_el_or_vttree(struct hfsc_class *cl) +{ + return ((cl->cl_flags & HFSC_FSC) && cl->cl_nactive) || + ((cl->cl_flags & HFSC_RSC) && !RB_EMPTY_NODE(&cl->el_node)); +} /* * eligible tree holds backlogged classes being sorted by their eligible times. @@ -1041,6 +1046,8 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, if (cl == NULL) return -ENOBUFS; + RB_CLEAR_NODE(&cl->el_node); + err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack); if (err) { kfree(cl); @@ -1571,7 +1578,7 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) sch->qstats.backlog += len; sch->q.qlen++; - if (first && !cl->cl_nactive) { + if (first && !cl_in_el_or_vttree(cl)) { if (cl->cl_flags & HFSC_RSC) init_ed(cl, len); if (cl->cl_flags & HFSC_FSC) From 05b21a536a7149d3678eda8148e608e14b0ea4e2 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 8 May 2025 16:16:40 +0100 Subject: [PATCH 506/529] perf/arm-cmn: Fix REQ2/SNP2 mixup commit 11b0f576e0cbde6a12258f2af6753b17b8df342b upstream. Somehow the encodings for REQ2/SNP2 channels in XP events got mixed up... Unmix them. CC: stable@vger.kernel.org Fixes: 23760a014417 ("perf/arm-cmn: Add CMN-700 support") Signed-off-by: Robin Murphy Link: https://lore.kernel.org/r/087023e9737ac93d7ec7a841da904758c254cb01.1746717400.git.robin.murphy@arm.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- drivers/perf/arm-cmn.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index 4445cae427b2..a884835f6830 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -675,8 +675,8 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, if ((chan == 5 && cmn->rsp_vc_num < 2) || (chan == 6 && cmn->dat_vc_num < 2) || - (chan == 7 && cmn->snp_vc_num < 2) || - (chan == 8 && cmn->req_vc_num < 2)) + (chan == 7 && cmn->req_vc_num < 2) || + (chan == 8 && cmn->snp_vc_num < 2)) return 0; } @@ -794,8 +794,8 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj, _CMN_EVENT_XP(pub_##_name, (_event) | (4 << 5)), \ _CMN_EVENT_XP(rsp2_##_name, (_event) | (5 << 5)), \ _CMN_EVENT_XP(dat2_##_name, (_event) | (6 << 5)), \ - _CMN_EVENT_XP(snp2_##_name, (_event) | (7 << 5)), \ - _CMN_EVENT_XP(req2_##_name, (_event) | (8 << 5)) + _CMN_EVENT_XP(req2_##_name, (_event) | (7 << 5)), \ + _CMN_EVENT_XP(snp2_##_name, (_event) | (8 << 5)) static struct attribute *arm_cmn_event_attrs[] = { From 3939280be98ed943eb2a7b7fc74a3e77e061a213 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Mon, 12 May 2025 18:11:54 +0100 Subject: [PATCH 507/529] perf/arm-cmn: Initialise cmn->cpu earlier commit 597704e201068db3d104de3c7a4d447ff8209127 upstream. For all the complexity of handling affinity for CPU hotplug, what we've apparently managed to overlook is that arm_cmn_init_irqs() has in fact always been setting the *initial* affinity of all IRQs to CPU 0, not the CPU we subsequently choose for event scheduling. Oh dear. Cc: stable@vger.kernel.org Fixes: 0ba64770a2f2 ("perf: Add Arm CMN-600 PMU driver") Signed-off-by: Robin Murphy Reviewed-by: Ilkka Koskinen Link: https://lore.kernel.org/r/b12fccba6b5b4d2674944f59e4daad91cd63420b.1747069914.git.robin.murphy@arm.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- drivers/perf/arm-cmn.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c index a884835f6830..b2e0a79254f1 100644 --- a/drivers/perf/arm-cmn.c +++ b/drivers/perf/arm-cmn.c @@ -2313,6 +2313,7 @@ static int arm_cmn_probe(struct platform_device *pdev) cmn->dev = &pdev->dev; cmn->part = (unsigned long)device_get_match_data(cmn->dev); + cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev)); platform_set_drvdata(pdev, cmn); if (cmn->part == PART_CMN600 && has_acpi_companion(cmn->dev)) { @@ -2340,7 +2341,6 @@ static int arm_cmn_probe(struct platform_device *pdev) if (err) return err; - cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev)); cmn->pmu = (struct pmu) { .module = THIS_MODULE, .attr_groups = arm_cmn_attr_groups, From fc7846a7d5320d37cbc027f13043e0b58b345766 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Apr 2025 15:55:06 +0200 Subject: [PATCH 508/529] coredump: fix error handling for replace_fd() commit 95c5f43181fe9c1b5e5a4bd3281c857a5259991f upstream. The replace_fd() helper returns the file descriptor number on success and a negative error code on failure. The current error handling in umh_pipe_setup() only works because the file descriptor that is replaced is zero but that's pretty volatile. Explicitly check for a negative error code. Link: https://lore.kernel.org/20250414-work-coredump-v2-2-685bf231f828@kernel.org Tested-by: Luca Boccassi Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner Signed-off-by: Greg Kroah-Hartman --- fs/coredump.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 4d332f147137..2e2b66aed59d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -493,7 +493,9 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; - int err = create_pipe_files(files, 0); + int err; + + err = create_pipe_files(files, 0); if (err) return err; @@ -501,10 +503,13 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err < 0) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + return 0; } void do_coredump(const kernel_siginfo_t *siginfo) From 1ced79b25fa29a1b38ef12d5acd85b54e1c8593c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Mar 2023 20:22:51 +0200 Subject: [PATCH 509/529] pid: add pidfd_prepare() commit 6ae930d9dbf2d093157be33428538c91966d8a9f upstream. Add a new helper that allows to reserve a pidfd and allocates a new pidfd file that stashes the provided struct pid. This will allow us to remove places that either open code this function or that call pidfd_create() but then have to call close_fd() because there are still failure points after pidfd_create() has been called. Reviewed-by: Jan Kara Message-Id: <20230327-pidfd-file-api-v1-1-5c0e9a3158e4@kernel.org> Signed-off-by: Christian Brauner Signed-off-by: Greg Kroah-Hartman --- include/linux/pid.h | 1 + kernel/fork.c | 85 +++++++++++++++++++++++++++++++++++++++++++++ kernel/pid.c | 19 ++++------ 3 files changed, 93 insertions(+), 12 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index bf3af54de616..653a527574c4 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -80,6 +80,7 @@ extern struct pid *pidfd_pid(const struct file *file); struct pid *pidfd_get_pid(unsigned int fd, unsigned int *flags); struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags); int pidfd_create(struct pid *pid, unsigned int flags); +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret); static inline struct pid *get_pid(struct pid *pid) { diff --git a/kernel/fork.c b/kernel/fork.c index 09a935724bd9..a9b9a24c7932 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1943,6 +1943,91 @@ const struct file_operations pidfd_fops = { #endif }; +/** + * __pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + + * The helper doesn't perform checks on @pid which makes it useful for pidfds + * created via CLONE_PIDFD where @pid has no task attached when the pidfd and + * pidfd file are prepared. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +static int __pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + int pidfd; + struct file *pidfd_file; + + if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) + return -EINVAL; + + pidfd = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + if (pidfd < 0) + return pidfd; + + pidfd_file = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, + flags | O_RDWR | O_CLOEXEC); + if (IS_ERR(pidfd_file)) { + put_unused_fd(pidfd); + return PTR_ERR(pidfd_file); + } + get_pid(pid); /* held by pidfd_file now */ + *ret = pidfd_file; + return pidfd; +} + +/** + * pidfd_prepare - allocate a new pidfd_file and reserve a pidfd + * @pid: the struct pid for which to create a pidfd + * @flags: flags of the new @pidfd + * @pidfd: the pidfd to return + * + * Allocate a new file that stashes @pid and reserve a new pidfd number in the + * caller's file descriptor table. The pidfd is reserved but not installed yet. + * + * The helper verifies that @pid is used as a thread group leader. + * + * If this function returns successfully the caller is responsible to either + * call fd_install() passing the returned pidfd and pidfd file as arguments in + * order to install the pidfd into its file descriptor table or they must use + * put_unused_fd() and fput() on the returned pidfd and pidfd file + * respectively. + * + * This function is useful when a pidfd must already be reserved but there + * might still be points of failure afterwards and the caller wants to ensure + * that no pidfd is leaked into its file descriptor table. + * + * Return: On success, a reserved pidfd is returned from the function and a new + * pidfd file is returned in the last argument to the function. On + * error, a negative error code is returned from the function and the + * last argument remains unchanged. + */ +int pidfd_prepare(struct pid *pid, unsigned int flags, struct file **ret) +{ + if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) + return -EINVAL; + + return __pidfd_prepare(pid, flags, ret); +} + static void __delayed_free_task(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); diff --git a/kernel/pid.c b/kernel/pid.c index 74834c04a081..8bce3aebc949 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -594,20 +594,15 @@ struct task_struct *pidfd_get_task(int pidfd, unsigned int *flags) */ int pidfd_create(struct pid *pid, unsigned int flags) { - int fd; + int pidfd; + struct file *pidfd_file; - if (!pid || !pid_has_task(pid, PIDTYPE_TGID)) - return -EINVAL; + pidfd = pidfd_prepare(pid, flags, &pidfd_file); + if (pidfd < 0) + return pidfd; - if (flags & ~(O_NONBLOCK | O_RDWR | O_CLOEXEC)) - return -EINVAL; - - fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid), - flags | O_RDWR | O_CLOEXEC); - if (fd < 0) - put_pid(pid); - - return fd; + fd_install(pidfd, pidfd_file); + return pidfd; } /** From b2a5bf1cf413d188757de8bc3568d7081d202eb5 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 27 Mar 2023 20:22:52 +0200 Subject: [PATCH 510/529] fork: use pidfd_prepare() commit ca7707f5430ad6b1c9cb7cee0a7f67d69328bb2d upstream. Stop open-coding get_unused_fd_flags() and anon_inode_getfile(). That's brittle just for keeping the flags between both calls in sync. Use the dedicated helper. Message-Id: <20230327-pidfd-file-api-v1-2-5c0e9a3158e4@kernel.org> Signed-off-by: Christian Brauner Signed-off-by: Greg Kroah-Hartman --- kernel/fork.c | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index a9b9a24c7932..8cc313d27188 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2378,21 +2378,12 @@ static __latent_entropy struct task_struct *copy_process( * if the fd table isn't shared). */ if (clone_flags & CLONE_PIDFD) { - retval = get_unused_fd_flags(O_RDWR | O_CLOEXEC); + /* Note that no task has been attached to @pid yet. */ + retval = __pidfd_prepare(pid, O_RDWR | O_CLOEXEC, &pidfile); if (retval < 0) goto bad_fork_free_pid; - pidfd = retval; - pidfile = anon_inode_getfile("[pidfd]", &pidfd_fops, pid, - O_RDWR | O_CLOEXEC); - if (IS_ERR(pidfile)) { - put_unused_fd(pidfd); - retval = PTR_ERR(pidfile); - goto bad_fork_free_pid; - } - get_pid(pid); /* held by pidfile now */ - retval = put_user(pidfd, args->pidfd); if (retval) goto bad_fork_put_pidfd; From ac190912887386e77d5c95a8e001af7f6838f255 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Mon, 14 Apr 2025 15:55:07 +0200 Subject: [PATCH 511/529] coredump: hand a pidfd to the usermode coredump helper commit b5325b2a270fcaf7b2a9a0f23d422ca8a5a8bdea upstream. Give userspace a way to instruct the kernel to install a pidfd into the usermode helper process. This makes coredump handling a lot more reliable for userspace. In parallel with this commit we already have systemd adding support for this in [1]. We create a pidfs file for the coredumping process when we process the corename pattern. When the usermode helper process is forked we then install the pidfs file as file descriptor three into the usermode helpers file descriptor table so it's available to the exec'd program. Since usermode helpers are either children of the system_unbound_wq workqueue or kthreadd we know that the file descriptor table is empty and can thus always use three as the file descriptor number. Note, that we'll install a pidfd for the thread-group leader even if a subthread is calling do_coredump(). We know that task linkage hasn't been removed due to delay_group_leader() and even if this @current isn't the actual thread-group leader we know that the thread-group leader cannot be reaped until @current has exited. [brauner: This is a backport for the v6.1 series. Upstream has significantly changed and backporting all that infra is a non-starter. So simply backport the pidfd_prepare() helper and waste the file descriptor we allocated. Then we minimally massage the umh coredump setup code.] Link: https://github.com/systemd/systemd/pull/37125 [1] Link: https://lore.kernel.org/20250414-work-coredump-v2-3-685bf231f828@kernel.org Tested-by: Luca Boccassi Reviewed-by: Oleg Nesterov Signed-off-by: Christian Brauner Signed-off-by: Greg Kroah-Hartman --- fs/coredump.c | 78 ++++++++++++++++++++++++++++++++++++---- include/linux/coredump.h | 1 + 2 files changed, 72 insertions(+), 7 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 2e2b66aed59d..b0d782367fcc 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -56,6 +57,13 @@ static bool dump_vma_snapshot(struct coredump_params *cprm); static void free_vma_snapshot(struct coredump_params *cprm); +/* + * File descriptor number for the pidfd for the thread-group leader of + * the coredumping task installed into the usermode helper's file + * descriptor table. + */ +#define COREDUMP_PIDFD_NUMBER 3 + static int core_uses_pid; static unsigned int core_pipe_limit; static char core_pattern[CORENAME_MAX_SIZE] = "core"; @@ -325,6 +333,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; + /* pidfd number */ + case 'F': { + /* + * Installing a pidfd only makes sense if + * we actually spawn a usermode helper. + */ + if (!ispipe) + break; + + /* + * Note that we'll install a pidfd for the + * thread-group leader. We know that task + * linkage hasn't been removed yet and even if + * this @current isn't the actual thread-group + * leader we know that the thread-group leader + * cannot be reaped until @current has exited. + */ + cprm->pid = task_tgid(current); + err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER); + break; + } default: break; } @@ -479,7 +508,7 @@ static void wait_for_dump_helpers(struct file *file) } /* - * umh_pipe_setup + * umh_coredump_setup * helper function to customize the process used * to collect the core in userspace. Specifically * it sets up a pipe and installs it as fd 0 (stdin) @@ -489,27 +518,62 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; + struct file *pidfs_file = NULL; struct coredump_params *cp = (struct coredump_params *)info->data; int err; + if (cp->pid) { + int fd; + + fd = pidfd_prepare(cp->pid, 0, &pidfs_file); + if (fd < 0) + return fd; + + /* + * We don't care about the fd. We also cannot simply + * replace it below because dup2() will refuse to close + * this file descriptor if its in a larval state. So + * close it! + */ + put_unused_fd(fd); + + /* + * Usermode helpers are childen of either + * system_unbound_wq or of kthreadd. So we know that + * we're starting off with a clean file descriptor + * table. So we should always be able to use + * COREDUMP_PIDFD_NUMBER as our file descriptor value. + */ + err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0); + if (err < 0) + goto out_fail; + + pidfs_file = NULL; + } + err = create_pipe_files(files, 0); if (err) - return err; + goto out_fail; cp->file = files[1]; err = replace_fd(0, files[0], 0); fput(files[0]); if (err < 0) - return err; + goto out_fail; /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return 0; + err = 0; + +out_fail: + if (pidfs_file) + fput(pidfs_file); + return err; } void do_coredump(const kernel_siginfo_t *siginfo) @@ -585,7 +649,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } if (cprm.limit == 1) { - /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. + /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. * * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use @@ -630,7 +694,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); + umh_coredump_setup, NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 08a1d3e7e46d..2b7ec58299ca 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -28,6 +28,7 @@ struct coredump_params { int vma_count; size_t vma_data_size; struct core_vma_metadata *vma_meta; + struct pid *pid; }; /* From 84b06a42930861bd56d0809e9a366a4d9875b1af Mon Sep 17 00:00:00 2001 From: Milton Barrera Date: Wed, 9 Apr 2025 00:04:28 -0600 Subject: [PATCH 512/529] HID: quirks: Add ADATA XPG alpha wireless mouse support [ Upstream commit fa9fdeea1b7d6440c22efa6d59a769eae8bc89f1 ] This patch adds HID_QUIRK_ALWAYS_POLL for the ADATA XPG wireless gaming mouse (USB ID 125f:7505) and its USB dongle (USB ID 125f:7506). Without this quirk, the device does not generate input events properly. Signed-off-by: Milton Barrera Signed-off-by: Jiri Kosina Signed-off-by: Sasha Levin --- drivers/hid/hid-ids.h | 4 ++++ drivers/hid/hid-quirks.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h index 4187d890bcc1..e078d2ac92c8 100644 --- a/drivers/hid/hid-ids.h +++ b/drivers/hid/hid-ids.h @@ -41,6 +41,10 @@ #define USB_VENDOR_ID_ACTIONSTAR 0x2101 #define USB_DEVICE_ID_ACTIONSTAR_1011 0x1011 +#define USB_VENDOR_ID_ADATA_XPG 0x125f +#define USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE 0x7505 +#define USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE_DONGLE 0x7506 + #define USB_VENDOR_ID_ADS_TECH 0x06e1 #define USB_DEVICE_ID_ADS_TECH_RADIO_SI470X 0xa155 diff --git a/drivers/hid/hid-quirks.c b/drivers/hid/hid-quirks.c index 875c44e5cf6c..d8c5c7d451ef 100644 --- a/drivers/hid/hid-quirks.c +++ b/drivers/hid/hid-quirks.c @@ -27,6 +27,8 @@ static const struct hid_device_id hid_quirks[] = { { HID_USB_DEVICE(USB_VENDOR_ID_AASHIMA, USB_DEVICE_ID_AASHIMA_GAMEPAD), HID_QUIRK_BADPAD }, { HID_USB_DEVICE(USB_VENDOR_ID_AASHIMA, USB_DEVICE_ID_AASHIMA_PREDATOR), HID_QUIRK_BADPAD }, + { HID_USB_DEVICE(USB_VENDOR_ID_ADATA_XPG, USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE), HID_QUIRK_ALWAYS_POLL }, + { HID_USB_DEVICE(USB_VENDOR_ID_ADATA_XPG, USB_VENDOR_ID_ADATA_XPG_WL_GAMING_MOUSE_DONGLE), HID_QUIRK_ALWAYS_POLL }, { HID_USB_DEVICE(USB_VENDOR_ID_AFATECH, USB_DEVICE_ID_AFATECH_AF9016), HID_QUIRK_FULLSPEED_INTERVAL }, { HID_USB_DEVICE(USB_VENDOR_ID_AIREN, USB_DEVICE_ID_AIREN_SLIMPLUS), HID_QUIRK_NOGET }, { HID_USB_DEVICE(USB_VENDOR_ID_AKAI_09E8, USB_DEVICE_ID_AKAI_09E8_MIDIMIX), HID_QUIRK_NO_INIT_REPORTS }, From 38e8c73f02f70e6336ceb5a4e6eaaa6ca01c03dd Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Thu, 10 Apr 2025 16:42:03 -0400 Subject: [PATCH 513/529] nfs: don't share pNFS DS connections between net namespaces [ Upstream commit 6b9785dc8b13d9fb75ceec8cf4ea7ec3f3b1edbc ] Currently, different NFS clients can share the same DS connections, even when they are in different net namespaces. If a containerized client creates a DS connection, another container can find and use it. When the first client exits, the connection will close which can lead to stalls in other clients. Add a net namespace pointer to struct nfs4_pnfs_ds, and compare those value to the caller's netns in _data_server_lookup_locked() when searching for a nfs4_pnfs_ds to match. Reported-by: Omar Sandoval Reported-by: Sargun Dillon Closes: https://lore.kernel.org/linux-nfs/Z_ArpQC_vREh_hEA@telecaster/ Tested-by: Sargun Dillon Signed-off-by: Jeff Layton Reviewed-by: Benjamin Coddington Link: https://lore.kernel.org/r/20250410-nfs-ds-netns-v2-1-f80b7979ba80@kernel.org Signed-off-by: Trond Myklebust Signed-off-by: Sasha Levin --- fs/nfs/filelayout/filelayoutdev.c | 6 +++--- fs/nfs/flexfilelayout/flexfilelayoutdev.c | 6 +++--- fs/nfs/pnfs.h | 4 +++- fs/nfs/pnfs_nfs.c | 9 +++++---- 4 files changed, 14 insertions(+), 11 deletions(-) diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index acf4b88889dc..d5f1fbfd9a0c 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -75,6 +75,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct page *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; + struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ scratch = alloc_page(gfp_flags); @@ -158,8 +159,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -169,7 +169,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, goto out_err_free_deviceid; } - dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!dsaddr->ds_list[i]) goto out_err_drain_dsaddrs; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index e028f5a0ef5f..d21c5ecfbf1c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; struct nfs4_ff_ds_version *ds_versions = NULL; + struct net *net = server->nfs_client->cl_net; u32 mp_count; u32 version_count; __be32 *p; @@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, for (i = 0; i < mp_count; i++) { /* multipath ds */ - da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net, - &stream, gfp_flags); + da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); } @@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, new_ds->ds_versions = ds_versions; new_ds->ds_versions_cnt = version_count; - new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags); + new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags); if (!new_ds->ds) goto out_err_drain_dsaddrs; diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index e3e6a41f19de..f5173c188184 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -59,6 +59,7 @@ struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; + const struct net *ds_net; struct nfs_client *ds_clp; refcount_t ds_count; unsigned long ds_state; @@ -405,7 +406,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode, int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, + struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 47a8da3f5c9f..31afa88742f6 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -651,12 +651,12 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct net *net, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) - if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) + if (ds->ds_net == net && _same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; } @@ -763,7 +763,7 @@ out_err: * uncached and return cached struct nfs4_pnfs_ds. */ struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -781,13 +781,14 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); + tmp_ds = _data_server_lookup_locked(net, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); + ds->ds_net = net; ds->ds_clp = NULL; list_add(&ds->ds_node, &nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, From fe4fb4134f5a089cf3390c8f6de65f5686861858 Mon Sep 17 00:00:00 2001 From: John Chau Date: Mon, 5 May 2025 01:55:13 +0900 Subject: [PATCH 514/529] platform/x86: thinkpad_acpi: Support also NEC Lavie X1475JAS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a032f29a15412fab9f4352e0032836d51420a338 ] Change get_thinkpad_model_data() to check for additional vendor name "NEC" in order to support NEC Lavie X1475JAS notebook (and perhaps more). The reason of this works with minimal changes is because NEC Lavie X1475JAS is a Thinkpad inside. ACPI dumps reveals its OEM ID to be "LENOVO", BIOS version "R2PET30W" matches typical Lenovo BIOS version, the existence of HKEY of LEN0268, with DMI fw string is "R2PHT24W". I compiled and tested with my own machine, attached the dmesg below as proof of work: [ 6.288932] thinkpad_acpi: ThinkPad ACPI Extras v0.26 [ 6.288937] thinkpad_acpi: http://ibm-acpi.sf.net/ [ 6.288938] thinkpad_acpi: ThinkPad BIOS R2PET30W (1.11 ), EC R2PHT24W [ 6.307000] thinkpad_acpi: radio switch found; radios are enabled [ 6.307030] thinkpad_acpi: This ThinkPad has standard ACPI backlight brightness control, supported by the ACPI video driver [ 6.307033] thinkpad_acpi: Disabling thinkpad-acpi brightness events by default... [ 6.320322] thinkpad_acpi: rfkill switch tpacpi_bluetooth_sw: radio is unblocked [ 6.371963] thinkpad_acpi: secondary fan control detected & enabled [ 6.391922] thinkpad_acpi: battery 1 registered (start 0, stop 85, behaviours: 0x7) [ 6.398375] input: ThinkPad Extra Buttons as /devices/platform/thinkpad_acpi/input/input13 Signed-off-by: John Chau Link: https://lore.kernel.org/r/20250504165513.295135-1-johnchau@0atlas.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Signed-off-by: Sasha Levin --- drivers/platform/x86/thinkpad_acpi.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 26ca9c453a59..295939e9ac69 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -11517,6 +11517,8 @@ static int __must_check __init get_thinkpad_model_data( tp->vendor = PCI_VENDOR_ID_IBM; else if (dmi_name_in_vendors("LENOVO")) tp->vendor = PCI_VENDOR_ID_LENOVO; + else if (dmi_name_in_vendors("NEC")) + tp->vendor = PCI_VENDOR_ID_LENOVO; else return 0; From 8f55d71b57e9d3513a5b4d6f342425812cbb3e81 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Wed, 7 May 2025 16:49:33 +0900 Subject: [PATCH 515/529] um: let 'make clean' properly clean underlying SUBARCH as well [ Upstream commit ab09da75700e9d25c7dfbc7f7934920beb5e39b9 ] Building the kernel with O= is affected by stale in-tree build artifacts. So, if the source tree is not clean, Kbuild displays the following: $ make ARCH=um O=build defconfig make[1]: Entering directory '/.../linux/build' *** *** The source tree is not clean, please run 'make ARCH=um mrproper' *** in /.../linux *** make[2]: *** [/.../linux/Makefile:673: outputmakefile] Error 1 make[1]: *** [/.../linux/Makefile:248: __sub-make] Error 2 make[1]: Leaving directory '/.../linux/build' make: *** [Makefile:248: __sub-make] Error 2 Usually, running 'make mrproper' is sufficient for cleaning the source tree for out-of-tree builds. However, building UML generates build artifacts not only in arch/um/, but also in the SUBARCH directory (i.e., arch/x86/). If in-tree stale files remain under arch/x86/, Kbuild will reuse them instead of creating new ones under the specified build directory. This commit makes 'make ARCH=um clean' recurse into the SUBARCH directory. Reported-by: Shuah Khan Closes: https://lore.kernel.org/lkml/20250502172459.14175-1-skhan@linuxfoundation.org/ Signed-off-by: Masahiro Yamada Acked-by: Johannes Berg Reviewed-by: David Gow Reviewed-by: Shuah Khan Signed-off-by: Sasha Levin --- arch/um/Makefile | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/um/Makefile b/arch/um/Makefile index 778c50f27399..25d0501549f5 100644 --- a/arch/um/Makefile +++ b/arch/um/Makefile @@ -155,5 +155,6 @@ MRPROPER_FILES += $(HOST_DIR)/include/generated archclean: @find . \( -name '*.bb' -o -name '*.bbg' -o -name '*.da' \ -o -name '*.gcov' \) -type f -print | xargs rm -f + $(Q)$(MAKE) -f $(srctree)/Makefile ARCH=$(HEADER_ARCH) clean export HEADER_ARCH SUBARCH USER_CFLAGS CFLAGS_NO_HARDENING OS DEV_NULL_PATH From d8f13a8aeff61fb5de7998f17aa8d0c805c8e49f Mon Sep 17 00:00:00 2001 From: Alessandro Grassi Date: Fri, 2 May 2025 11:55:20 +0200 Subject: [PATCH 516/529] spi: spi-sun4i: fix early activation [ Upstream commit fb98bd0a13de2c9d96cb5c00c81b5ca118ac9d71 ] The SPI interface is activated before the CPOL setting is applied. In that moment, the clock idles high and CS goes low. After a short delay, CPOL and other settings are applied, which may cause the clock to change state and idle low. This transition is not part of a clock cycle, and it can confuse the receiving device. To prevent this unexpected transition, activate the interface while CPOL and the other settings are being applied. Signed-off-by: Alessandro Grassi Link: https://patch.msgid.link/20250502095520.13825-1-alessandro.grassi@mailbox.org Signed-off-by: Mark Brown Signed-off-by: Sasha Levin --- drivers/spi/spi-sun4i.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/spi/spi-sun4i.c b/drivers/spi/spi-sun4i.c index 6000d0761206..6937f5c4d868 100644 --- a/drivers/spi/spi-sun4i.c +++ b/drivers/spi/spi-sun4i.c @@ -263,6 +263,9 @@ static int sun4i_spi_transfer_one(struct spi_master *master, else reg |= SUN4I_CTL_DHB; + /* Now that the settings are correct, enable the interface */ + reg |= SUN4I_CTL_ENABLE; + sun4i_spi_write(sspi, SUN4I_CTL_REG, reg); /* Ensure that we have a parent clock fast enough */ @@ -403,7 +406,7 @@ static int sun4i_spi_runtime_resume(struct device *dev) } sun4i_spi_write(sspi, SUN4I_CTL_REG, - SUN4I_CTL_ENABLE | SUN4I_CTL_MASTER | SUN4I_CTL_TP); + SUN4I_CTL_MASTER | SUN4I_CTL_TP); return 0; From e1143267e928ff012cf0a88bb7622946278a96e1 Mon Sep 17 00:00:00 2001 From: Ilya Guterman Date: Sat, 10 May 2025 19:21:30 +0900 Subject: [PATCH 517/529] nvme-pci: add NVME_QUIRK_NO_DEEPEST_PS quirk for SOLIDIGM P44 Pro [ Upstream commit e765bf89f42b5c82132a556b630affeb82b2a21f ] This commit adds the NVME_QUIRK_NO_DEEPEST_PS quirk for device [126f:2262], which belongs to device SOLIDIGM P44 Pro SSDPFKKW020X7 The device frequently have trouble exiting the deepest power state (5), resulting in the entire disk being unresponsive. Verified by setting nvme_core.default_ps_max_latency_us=10000 and observing the expected behavior. Signed-off-by: Ilya Guterman Signed-off-by: Christoph Hellwig Signed-off-by: Sasha Levin --- drivers/nvme/host/pci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 49a3cb8f1f10..218c1d69090e 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -3592,6 +3592,8 @@ static const struct pci_device_id nvme_id_table[] = { .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0x1e49, 0x0041), /* ZHITAI TiPro7000 NVMe SSD */ .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, + { PCI_DEVICE(0x025e, 0xf1ac), /* SOLIDIGM P44 pro SSDPFKKW020X7 */ + .driver_data = NVME_QUIRK_NO_DEEPEST_PS, }, { PCI_DEVICE(0xc0a9, 0x540a), /* Crucial P2 */ .driver_data = NVME_QUIRK_BOGUS_NID, }, { PCI_DEVICE(0x1d97, 0x2263), /* Lexar NM610 */ From dcabad63e07e9494faf932184b1eef22032c974e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sun, 27 Apr 2025 18:21:06 -0400 Subject: [PATCH 518/529] NFS: Avoid flushing data while holding directory locks in nfs_rename() [ Upstream commit dcd21b609d4abc7303f8683bce4f35d78d7d6830 ] The Linux client assumes that all filehandles are non-volatile for renames within the same directory (otherwise sillyrename cannot work). However, the existence of the Linux 'subtree_check' export option has meant that nfs_rename() has always assumed it needs to flush writes before attempting to rename. Since NFSv4 does allow the client to query whether or not the server exhibits this behaviour, and since knfsd does actually set the appropriate flag when 'subtree_check' is enabled on an export, it should be OK to optimise away the write flushing behaviour in the cases where it is clearly not needed. Signed-off-by: Trond Myklebust Reviewed-by: Jeff Layton Signed-off-by: Sasha Levin --- fs/nfs/client.c | 2 ++ fs/nfs/dir.c | 15 ++++++++++++++- include/linux/nfs_fs_sb.h | 12 +++++++++--- 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index a8930e6c417f..de4ad41b14e2 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1052,6 +1052,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc) if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN) server->namelen = NFS2_MAXNAMLEN; } + /* Linux 'subtree_check' borkenness mandates this setting */ + server->fh_expire_type = NFS_FH_VOL_RENAME; if (!(fattr->valid & NFS_ATTR_FATTR)) { error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh, diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 70660ff248b7..1876978107ca 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2632,6 +2632,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data) unblock_revalidate(new_dentry); } +static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry, + struct dentry *new_dentry) +{ + struct nfs_server *server = NFS_SB(old_dentry->d_sb); + + if (old_dentry->d_parent != new_dentry->d_parent) + return false; + if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE) + return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN); + return true; +} + /* * RENAME * FIXME: Some nfsds, like the Linux user space nfsd, may generate a @@ -2719,7 +2731,8 @@ int nfs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, } - if (S_ISREG(old_inode->i_mode)) + if (S_ISREG(old_inode->i_mode) && + nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry)) nfs_sync_inode(old_inode); task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry, must_unblock ? nfs_unblock_rename : NULL); diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 9ea9f9087a71..a9671f930084 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -196,6 +196,15 @@ struct nfs_server { char *fscache_uniq; /* Uniquifier (or NULL) */ #endif + /* The following #defines numerically match the NFSv4 equivalents */ +#define NFS_FH_NOEXPIRE_WITH_OPEN (0x1) +#define NFS_FH_VOLATILE_ANY (0x2) +#define NFS_FH_VOL_MIGRATION (0x4) +#define NFS_FH_VOL_RENAME (0x8) +#define NFS_FH_RENAME_UNSAFE (NFS_FH_VOLATILE_ANY | NFS_FH_VOL_RENAME) + u32 fh_expire_type; /* V4 bitmask representing file + handle volatility type for + this filesystem */ u32 pnfs_blksize; /* layout_blksize attr */ #if IS_ENABLED(CONFIG_NFS_V4) u32 attr_bitmask[3];/* V4 bitmask representing the set @@ -219,9 +228,6 @@ struct nfs_server { u32 acl_bitmask; /* V4 bitmask representing the ACEs that are supported on this filesystem */ - u32 fh_expire_type; /* V4 bitmask representing file - handle volatility type for - this filesystem */ struct pnfs_layoutdriver_type *pnfs_curr_ld; /* Active layout driver */ struct rpc_wait_queue roc_rpcwaitq; void *pnfs_ld_data; /* per mount point data */ From 5d286f3cadd3513a50e2bf3038e9ed604143d063 Mon Sep 17 00:00:00 2001 From: Valtteri Koskivuori Date: Fri, 9 May 2025 21:42:49 +0300 Subject: [PATCH 519/529] platform/x86: fujitsu-laptop: Support Lifebook S2110 hotkeys MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit a7e255ff9fe4d9b8b902023aaf5b7a673786bb50 ] The S2110 has an additional set of media playback control keys enabled by a hardware toggle button that switches the keys between "Application" and "Player" modes. Toggling "Player" mode just shifts the scancode of each hotkey up by 4. Add defines for new scancodes, and a keymap and dmi id for the S2110. Tested on a Fujitsu Lifebook S2110. Signed-off-by: Valtteri Koskivuori Acked-by: Jonathan Woithe Link: https://lore.kernel.org/r/20250509184251.713003-1-vkoskiv@gmail.com Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Signed-off-by: Sasha Levin --- drivers/platform/x86/fujitsu-laptop.c | 33 +++++++++++++++++++++++---- 1 file changed, 29 insertions(+), 4 deletions(-) diff --git a/drivers/platform/x86/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index b543d117b12c..259eb2a2858f 100644 --- a/drivers/platform/x86/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -17,13 +17,13 @@ /* * fujitsu-laptop.c - Fujitsu laptop support, providing access to additional * features made available on a range of Fujitsu laptops including the - * P2xxx/P5xxx/S6xxx/S7xxx series. + * P2xxx/P5xxx/S2xxx/S6xxx/S7xxx series. * * This driver implements a vendor-specific backlight control interface for * Fujitsu laptops and provides support for hotkeys present on certain Fujitsu * laptops. * - * This driver has been tested on a Fujitsu Lifebook S6410, S7020 and + * This driver has been tested on a Fujitsu Lifebook S2110, S6410, S7020 and * P8010. It should work on most P-series and S-series Lifebooks, but * YMMV. * @@ -102,7 +102,11 @@ #define KEY2_CODE 0x411 #define KEY3_CODE 0x412 #define KEY4_CODE 0x413 -#define KEY5_CODE 0x420 +#define KEY5_CODE 0x414 +#define KEY6_CODE 0x415 +#define KEY7_CODE 0x416 +#define KEY8_CODE 0x417 +#define KEY9_CODE 0x420 /* Hotkey ringbuffer limits */ #define MAX_HOTKEY_RINGBUFFER_SIZE 100 @@ -450,7 +454,7 @@ static const struct key_entry keymap_default[] = { { KE_KEY, KEY2_CODE, { KEY_PROG2 } }, { KE_KEY, KEY3_CODE, { KEY_PROG3 } }, { KE_KEY, KEY4_CODE, { KEY_PROG4 } }, - { KE_KEY, KEY5_CODE, { KEY_RFKILL } }, + { KE_KEY, KEY9_CODE, { KEY_RFKILL } }, /* Soft keys read from status flags */ { KE_KEY, FLAG_RFKILL, { KEY_RFKILL } }, { KE_KEY, FLAG_TOUCHPAD_TOGGLE, { KEY_TOUCHPAD_TOGGLE } }, @@ -474,6 +478,18 @@ static const struct key_entry keymap_p8010[] = { { KE_END, 0 } }; +static const struct key_entry keymap_s2110[] = { + { KE_KEY, KEY1_CODE, { KEY_PROG1 } }, /* "A" */ + { KE_KEY, KEY2_CODE, { KEY_PROG2 } }, /* "B" */ + { KE_KEY, KEY3_CODE, { KEY_WWW } }, /* "Internet" */ + { KE_KEY, KEY4_CODE, { KEY_EMAIL } }, /* "E-mail" */ + { KE_KEY, KEY5_CODE, { KEY_STOPCD } }, + { KE_KEY, KEY6_CODE, { KEY_PLAYPAUSE } }, + { KE_KEY, KEY7_CODE, { KEY_PREVIOUSSONG } }, + { KE_KEY, KEY8_CODE, { KEY_NEXTSONG } }, + { KE_END, 0 } +}; + static const struct key_entry *keymap = keymap_default; static int fujitsu_laptop_dmi_keymap_override(const struct dmi_system_id *id) @@ -511,6 +527,15 @@ static const struct dmi_system_id fujitsu_laptop_dmi_table[] = { }, .driver_data = (void *)keymap_p8010 }, + { + .callback = fujitsu_laptop_dmi_keymap_override, + .ident = "Fujitsu LifeBook S2110", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "FUJITSU SIEMENS"), + DMI_MATCH(DMI_PRODUCT_NAME, "LIFEBOOK S2110"), + }, + .driver_data = (void *)keymap_s2110 + }, {} }; From 12fd7e6a2c3a11de2b0225eea9695f9ff20ef0d9 Mon Sep 17 00:00:00 2001 From: Mark Pearson Date: Fri, 16 May 2025 22:33:37 -0400 Subject: [PATCH 520/529] platform/x86: thinkpad_acpi: Ignore battery threshold change event notification MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 29e4e6b4235fefa5930affb531fe449cac330a72 ] If user modifies the battery charge threshold an ACPI event is generated. Confirmed with Lenovo FW team this is only generated on user event. As no action is needed, ignore the event and prevent spurious kernel logs. Reported-by: Derek Barbosa Closes: https://lore.kernel.org/platform-driver-x86/7e9a1c47-5d9c-4978-af20-3949d53fb5dc@app.fastmail.com/T/#m5f5b9ae31d3fbf30d7d9a9d76c15fb3502dfd903 Signed-off-by: Mark Pearson Reviewed-by: Hans de Goede Reviewed-by: Armin Wolf Link: https://lore.kernel.org/r/20250517023348.2962591-1-mpearson-lenovo@squebb.ca Reviewed-by: Ilpo Järvinen Signed-off-by: Ilpo Järvinen Signed-off-by: Sasha Levin --- drivers/platform/x86/thinkpad_acpi.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 295939e9ac69..17d74434e604 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -211,6 +211,7 @@ enum tpacpi_hkey_event_t { /* Thermal events */ TP_HKEY_EV_ALARM_BAT_HOT = 0x6011, /* battery too hot */ TP_HKEY_EV_ALARM_BAT_XHOT = 0x6012, /* battery critically hot */ + TP_HKEY_EV_ALARM_BAT_LIM_CHANGE = 0x6013, /* battery charge limit changed*/ TP_HKEY_EV_ALARM_SENSOR_HOT = 0x6021, /* sensor too hot */ TP_HKEY_EV_ALARM_SENSOR_XHOT = 0x6022, /* sensor critically hot */ TP_HKEY_EV_THM_TABLE_CHANGED = 0x6030, /* windows; thermal table changed */ @@ -3948,6 +3949,10 @@ static bool hotkey_notify_6xxx(const u32 hkey, pr_alert("THERMAL EMERGENCY: battery is extremely hot!\n"); /* recommended action: immediate sleep/hibernate */ break; + case TP_HKEY_EV_ALARM_BAT_LIM_CHANGE: + pr_debug("Battery Info: battery charge threshold changed\n"); + /* User changed charging threshold. No action needed */ + return true; case TP_HKEY_EV_ALARM_SENSOR_HOT: pr_crit("THERMAL ALARM: a sensor reports something is too hot!\n"); /* recommended action: warn user through gui, that */ From 0fadcc204396e731bb0b57492ba1f356bc4162a2 Mon Sep 17 00:00:00 2001 From: Nishanth Menon Date: Fri, 16 May 2025 07:26:55 -0500 Subject: [PATCH 521/529] net: ethernet: ti: am65-cpsw: Lower random mac address error print to info [ Upstream commit 50980d8da71a0c2e045e85bba93c0099ab73a209 ] Using random mac address is not an error since the driver continues to function, it should be informative that the system has not assigned a MAC address. This is inline with other drivers such as ax88796c, dm9051 etc. Drop the error level to info level. Signed-off-by: Nishanth Menon Reviewed-by: Simon Horman Reviewed-by: Roger Quadros Link: https://patch.msgid.link/20250516122655.442808-1-nm@ti.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/ti/am65-cpsw-nuss.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ti/am65-cpsw-nuss.c b/drivers/net/ethernet/ti/am65-cpsw-nuss.c index 32828d4ac64c..a0a9e4e13e77 100644 --- a/drivers/net/ethernet/ti/am65-cpsw-nuss.c +++ b/drivers/net/ethernet/ti/am65-cpsw-nuss.c @@ -1918,7 +1918,7 @@ static int am65_cpsw_nuss_init_slave_ports(struct am65_cpsw_common *common) port->slave.mac_addr); if (!is_valid_ether_addr(port->slave.mac_addr)) { eth_random_addr(port->slave.mac_addr); - dev_err(dev, "Use random MAC address\n"); + dev_info(dev, "Use random MAC address\n"); } } } From 58485ff1a74f6c5be9e7c6aafb7293e4337348e7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 4 Jun 2025 14:40:26 +0200 Subject: [PATCH 522/529] Linux 6.1.141 Link: https://lore.kernel.org/r/20250602134319.723650984@linuxfoundation.org Tested-by: Peter Schneider Tested-by: Florian Fainelli Tested-by: Ron Economos Tested-by: Linux Kernel Functional Testing Tested-by: Shuah Khan Tested-by: Mark Brown Tested-by: Miguel Ojeda Tested-by: Hardik Garg Tested-by: Salvatore Bonaccorso Tested-by: Jon Hunter Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9a7bc4372e35..70969a997a7f 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 140 +SUBLEVEL = 141 EXTRAVERSION = NAME = Curry Ramen From 7f0c46ade6567c34746a68bf89da0e123df54ac7 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Jun 2025 11:11:51 +0000 Subject: [PATCH 523/529] Revert "hrtimers: Force migrate away hrtimers queued after CPUHP_AP_HRTIMERS_DYING" This reverts commit 82ac6adbbb2aad14548a71d5e2e37f4964a15e38 which is commit 53dac345395c0d2493cbc2f4c85fe38aef5b63f5 upstream. It breaks the Android kernel build and can be brought back in the future in an safe way if it is really needed. Bug: 161946584 Change-Id: Ic3951674e27076bd9867102f525af9adc5c2a43c Signed-off-by: Greg Kroah-Hartman --- include/linux/hrtimer.h | 1 - kernel/time/hrtimer.c | 103 ++++++++-------------------------------- 2 files changed, 21 insertions(+), 83 deletions(-) diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 5ed16181a32c..f5d43920ecd6 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -238,7 +238,6 @@ struct hrtimer_cpu_base { ktime_t softirq_expires_next; struct hrtimer *softirq_next_timer; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; - call_single_data_t csd; } ____cacheline_aligned; static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time) diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 0de175d45de7..7a836fcf17d0 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -58,8 +58,6 @@ #define HRTIMER_ACTIVE_SOFT (HRTIMER_ACTIVE_HARD << MASK_SHIFT) #define HRTIMER_ACTIVE_ALL (HRTIMER_ACTIVE_SOFT | HRTIMER_ACTIVE_HARD) -static void retrigger_next_event(void *arg); - /* * The timer bases: * @@ -113,8 +111,7 @@ DEFINE_PER_CPU(struct hrtimer_cpu_base, hrtimer_bases) = .clockid = CLOCK_TAI, .get_time = &ktime_get_clocktai, }, - }, - .csd = CSD_INIT(retrigger_next_event, NULL) + } }; static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { @@ -127,14 +124,6 @@ static const int hrtimer_clock_to_base_table[MAX_CLOCKS] = { [CLOCK_TAI] = HRTIMER_BASE_TAI, }; -static inline bool hrtimer_base_is_online(struct hrtimer_cpu_base *base) -{ - if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) - return true; - else - return likely(base->online); -} - /* * Functions and macros which are different for UP/SMP systems are kept in a * single place @@ -188,54 +177,27 @@ struct hrtimer_clock_base *lock_hrtimer_base(const struct hrtimer *timer, } /* - * Check if the elected target is suitable considering its next - * event and the hotplug state of the current CPU. - * - * If the elected target is remote and its next event is after the timer - * to queue, then a remote reprogram is necessary. However there is no - * guarantee the IPI handling the operation would arrive in time to meet - * the high resolution deadline. In this case the local CPU becomes a - * preferred target, unless it is offline. - * - * High and low resolution modes are handled the same way for simplicity. + * We do not migrate the timer when it is expiring before the next + * event on the target cpu. When high resolution is enabled, we cannot + * reprogram the target cpu hardware and we would cause it to fire + * late. To keep it simple, we handle the high resolution enabled and + * disabled case similar. * * Called with cpu_base->lock of target cpu held. */ -static bool hrtimer_suitable_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base, - struct hrtimer_cpu_base *new_cpu_base, - struct hrtimer_cpu_base *this_cpu_base) +static int +hrtimer_check_target(struct hrtimer *timer, struct hrtimer_clock_base *new_base) { ktime_t expires; - /* - * The local CPU clockevent can be reprogrammed. Also get_target_base() - * guarantees it is online. - */ - if (new_cpu_base == this_cpu_base) - return true; - - /* - * The offline local CPU can't be the default target if the - * next remote target event is after this timer. Keep the - * elected new base. An IPI will we issued to reprogram - * it as a last resort. - */ - if (!hrtimer_base_is_online(this_cpu_base)) - return true; - expires = ktime_sub(hrtimer_get_expires(timer), new_base->offset); - - return expires >= new_base->cpu_base->expires_next; + return expires < new_base->cpu_base->expires_next; } -static inline struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, int pinned) +static inline +struct hrtimer_cpu_base *get_target_base(struct hrtimer_cpu_base *base, + int pinned) { - if (!hrtimer_base_is_online(base)) { - int cpu = cpumask_any_and(cpu_online_mask, housekeeping_cpumask(HK_TYPE_TIMER)); - - return &per_cpu(hrtimer_bases, cpu); - } - #if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) if (static_branch_likely(&timers_migration_enabled) && !pinned) return &per_cpu(hrtimer_bases, get_nohz_timer_target()); @@ -286,8 +248,8 @@ again: raw_spin_unlock(&base->cpu_base->lock); raw_spin_lock(&new_base->cpu_base->lock); - if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, - this_cpu_base)) { + if (new_cpu_base != this_cpu_base && + hrtimer_check_target(timer, new_base)) { raw_spin_unlock(&new_base->cpu_base->lock); raw_spin_lock(&base->cpu_base->lock); new_cpu_base = this_cpu_base; @@ -296,7 +258,8 @@ again: } WRITE_ONCE(timer->base, new_base); } else { - if (!hrtimer_suitable_target(timer, new_base, new_cpu_base, this_cpu_base)) { + if (new_cpu_base != this_cpu_base && + hrtimer_check_target(timer, new_base)) { new_cpu_base = this_cpu_base; goto again; } @@ -755,6 +718,8 @@ static inline int hrtimer_is_hres_enabled(void) return hrtimer_hres_enabled; } +static void retrigger_next_event(void *arg); + /* * Switch to high resolution mode */ @@ -1239,7 +1204,6 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, u64 delta_ns, const enum hrtimer_mode mode, struct hrtimer_clock_base *base) { - struct hrtimer_cpu_base *this_cpu_base = this_cpu_ptr(&hrtimer_bases); struct hrtimer_clock_base *new_base; bool force_local, first; @@ -1251,15 +1215,9 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, * and enforce reprogramming after it is queued no matter whether * it is the new first expiring timer again or not. */ - force_local = base->cpu_base == this_cpu_base; + force_local = base->cpu_base == this_cpu_ptr(&hrtimer_bases); force_local &= base->cpu_base->next_timer == timer; - /* - * Don't force local queuing if this enqueue happens on a unplugged - * CPU after hrtimer_cpu_dying() has been invoked. - */ - force_local &= this_cpu_base->online; - /* * Remove an active timer from the queue. In case it is not queued * on the current CPU, make sure that remove_hrtimer() updates the @@ -1289,27 +1247,8 @@ static int __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim, } first = enqueue_hrtimer(timer, new_base, mode); - if (!force_local) { - /* - * If the current CPU base is online, then the timer is - * never queued on a remote CPU if it would be the first - * expiring timer there. - */ - if (hrtimer_base_is_online(this_cpu_base)) - return first; - - /* - * Timer was enqueued remote because the current base is - * already offline. If the timer is the first to expire, - * kick the remote CPU to reprogram the clock event. - */ - if (first) { - struct hrtimer_cpu_base *new_cpu_base = new_base->cpu_base; - - smp_call_function_single_async(new_cpu_base->cpu, &new_cpu_base->csd); - } - return 0; - } + if (!force_local) + return first; /* * Timer was forced to stay on the current CPU to avoid From b572d7f4f2c4a6786f45edfe29fa98105e051471 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Jun 2025 13:19:40 +0000 Subject: [PATCH 524/529] Revert "serial: mctrl_gpio: split disable_ms into sync and no_sync APIs" This reverts commit 68435c1fa3db696db4f480385db9e50e26691d0d which is commit 1bd2aad57da95f7f2d2bb52f7ad15c0f4993a685 upstream. It breaks the Android kernel abi and can be brought back in the future in an abi-safe way if it is really needed. Bug: 161946584 Change-Id: I5a1f6ecb6b0139b48d9cbfeff2efde4362d6f3b0 Signed-off-by: Greg Kroah-Hartman --- Documentation/driver-api/serial/driver.rst | 2 +- drivers/tty/serial/8250/8250_port.c | 2 +- drivers/tty/serial/atmel_serial.c | 2 +- drivers/tty/serial/imx.c | 2 +- drivers/tty/serial/serial_mctrl_gpio.c | 34 +++++----------------- drivers/tty/serial/serial_mctrl_gpio.h | 17 ++--------- drivers/tty/serial/sh-sci.c | 2 +- drivers/tty/serial/stm32-usart.c | 2 +- 8 files changed, 16 insertions(+), 47 deletions(-) diff --git a/Documentation/driver-api/serial/driver.rst b/Documentation/driver-api/serial/driver.rst index 9436f7c11306..23c6b956cd90 100644 --- a/Documentation/driver-api/serial/driver.rst +++ b/Documentation/driver-api/serial/driver.rst @@ -100,4 +100,4 @@ Some helpers are provided in order to set/get modem control lines via GPIO. .. kernel-doc:: drivers/tty/serial/serial_mctrl_gpio.c :identifiers: mctrl_gpio_init mctrl_gpio_free mctrl_gpio_to_gpiod mctrl_gpio_set mctrl_gpio_get mctrl_gpio_enable_ms - mctrl_gpio_disable_ms_sync mctrl_gpio_disable_ms_no_sync + mctrl_gpio_disable_ms diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 30c48683915c..89a141e60f1d 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -1702,7 +1702,7 @@ static void serial8250_disable_ms(struct uart_port *port) if (up->bugs & UART_BUG_NOMSR) return; - mctrl_gpio_disable_ms_no_sync(up->gpios); + mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; serial_port_out(port, UART_IER, up->ier); diff --git a/drivers/tty/serial/atmel_serial.c b/drivers/tty/serial/atmel_serial.c index b3463cdd1d4b..6a9310379dc2 100644 --- a/drivers/tty/serial/atmel_serial.c +++ b/drivers/tty/serial/atmel_serial.c @@ -692,7 +692,7 @@ static void atmel_disable_ms(struct uart_port *port) atmel_port->ms_irq_enabled = false; - mctrl_gpio_disable_ms_no_sync(atmel_port->gpios); + mctrl_gpio_disable_ms(atmel_port->gpios); if (!mctrl_gpio_to_gpiod(atmel_port->gpios, UART_GPIO_CTS)) idr |= ATMEL_US_CTSIC; diff --git a/drivers/tty/serial/imx.c b/drivers/tty/serial/imx.c index 4acddf41f218..5acbab0512b8 100644 --- a/drivers/tty/serial/imx.c +++ b/drivers/tty/serial/imx.c @@ -1571,7 +1571,7 @@ static void imx_uart_shutdown(struct uart_port *port) imx_uart_dma_exit(sport); } - mctrl_gpio_disable_ms_sync(sport->gpios); + mctrl_gpio_disable_ms(sport->gpios); spin_lock_irqsave(&sport->port.lock, flags); ucr2 = imx_uart_readl(sport, UCR2); diff --git a/drivers/tty/serial/serial_mctrl_gpio.c b/drivers/tty/serial/serial_mctrl_gpio.c index d5fb293dd5a9..7d5aaa8d422b 100644 --- a/drivers/tty/serial/serial_mctrl_gpio.c +++ b/drivers/tty/serial/serial_mctrl_gpio.c @@ -322,7 +322,11 @@ void mctrl_gpio_enable_ms(struct mctrl_gpios *gpios) } EXPORT_SYMBOL_GPL(mctrl_gpio_enable_ms); -static void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios, bool sync) +/** + * mctrl_gpio_disable_ms - disable irqs and handling of changes to the ms lines + * @gpios: gpios to disable + */ +void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios) { enum mctrl_gpio_idx i; @@ -338,34 +342,10 @@ static void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios, bool sync) if (!gpios->irq[i]) continue; - if (sync) - disable_irq(gpios->irq[i]); - else - disable_irq_nosync(gpios->irq[i]); + disable_irq(gpios->irq[i]); } } - -/** - * mctrl_gpio_disable_ms_sync - disable irqs and handling of changes to the ms - * lines, and wait for any pending IRQ to be processed - * @gpios: gpios to disable - */ -void mctrl_gpio_disable_ms_sync(struct mctrl_gpios *gpios) -{ - mctrl_gpio_disable_ms(gpios, true); -} -EXPORT_SYMBOL_GPL(mctrl_gpio_disable_ms_sync); - -/** - * mctrl_gpio_disable_ms_no_sync - disable irqs and handling of changes to the - * ms lines, and return immediately - * @gpios: gpios to disable - */ -void mctrl_gpio_disable_ms_no_sync(struct mctrl_gpios *gpios) -{ - mctrl_gpio_disable_ms(gpios, false); -} -EXPORT_SYMBOL_GPL(mctrl_gpio_disable_ms_no_sync); +EXPORT_SYMBOL_GPL(mctrl_gpio_disable_ms); void mctrl_gpio_enable_irq_wake(struct mctrl_gpios *gpios) { diff --git a/drivers/tty/serial/serial_mctrl_gpio.h b/drivers/tty/serial/serial_mctrl_gpio.h index 79e97838ebe5..fc76910fb105 100644 --- a/drivers/tty/serial/serial_mctrl_gpio.h +++ b/drivers/tty/serial/serial_mctrl_gpio.h @@ -87,16 +87,9 @@ void mctrl_gpio_free(struct device *dev, struct mctrl_gpios *gpios); void mctrl_gpio_enable_ms(struct mctrl_gpios *gpios); /* - * Disable gpio interrupts to report status line changes, and block until - * any corresponding IRQ is processed + * Disable gpio interrupts to report status line changes. */ -void mctrl_gpio_disable_ms_sync(struct mctrl_gpios *gpios); - -/* - * Disable gpio interrupts to report status line changes, and return - * immediately - */ -void mctrl_gpio_disable_ms_no_sync(struct mctrl_gpios *gpios); +void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios); /* * Enable gpio wakeup interrupts to enable wake up source. @@ -155,11 +148,7 @@ static inline void mctrl_gpio_enable_ms(struct mctrl_gpios *gpios) { } -static inline void mctrl_gpio_disable_ms_sync(struct mctrl_gpios *gpios) -{ -} - -static inline void mctrl_gpio_disable_ms_no_sync(struct mctrl_gpios *gpios) +static inline void mctrl_gpio_disable_ms(struct mctrl_gpios *gpios) { } diff --git a/drivers/tty/serial/sh-sci.c b/drivers/tty/serial/sh-sci.c index ed468b676f0b..d33bd792d89a 100644 --- a/drivers/tty/serial/sh-sci.c +++ b/drivers/tty/serial/sh-sci.c @@ -2199,7 +2199,7 @@ static void sci_shutdown(struct uart_port *port) dev_dbg(port->dev, "%s(%d)\n", __func__, port->line); s->autorts = false; - mctrl_gpio_disable_ms_sync(to_sci_port(port)->gpios); + mctrl_gpio_disable_ms(to_sci_port(port)->gpios); spin_lock_irqsave(&port->lock, flags); sci_stop_rx(port); diff --git a/drivers/tty/serial/stm32-usart.c b/drivers/tty/serial/stm32-usart.c index 8670bb5042c4..7d11511c8c12 100644 --- a/drivers/tty/serial/stm32-usart.c +++ b/drivers/tty/serial/stm32-usart.c @@ -850,7 +850,7 @@ static void stm32_usart_enable_ms(struct uart_port *port) static void stm32_usart_disable_ms(struct uart_port *port) { - mctrl_gpio_disable_ms_sync(to_stm32_port(port)->gpios); + mctrl_gpio_disable_ms(to_stm32_port(port)->gpios); } /* Transmit stop */ From f23dc8c566aedb6668128bdd0207bdb9bfe037cb Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Jun 2025 14:25:55 +0000 Subject: [PATCH 525/529] Revert "ipv6: save dontfrag in cork" This reverts commit 4f809be95d9f3db13d31c574b8764c8d429f0c3b which is commit a18dfa9925b9ef6107ea3aa5814ca3c704d34a8a upstream. It breaks the Android kernel abi and can be brought back in the future in an abi-safe way if it is really needed. Bug: 161946584 Change-Id: I92d1731431ba8fdf5be8df5457bc73cd5ae4ffaf Signed-off-by: Greg Kroah-Hartman --- include/linux/ipv6.h | 1 - net/ipv6/ip6_output.c | 9 ++++----- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 44046b921373..125014d3ce55 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -206,7 +206,6 @@ struct inet6_cork { struct ipv6_txoptions *opt; u8 hop_limit; u8 tclass; - u8 dontfrag:1; }; /** diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index cfc276e5a249..f7a225da8525 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1450,7 +1450,6 @@ static int ip6_setup_cork(struct sock *sk, struct inet_cork_full *cork, } v6_cork->hop_limit = ipc6->hlimit; v6_cork->tclass = ipc6->tclass; - v6_cork->dontfrag = ipc6->dontfrag; if (rt->dst.flags & DST_XFRM_TUNNEL) mtu = np->pmtudisc >= IPV6_PMTUDISC_PROBE ? READ_ONCE(rt->dst.dev->mtu) : dst_mtu(&rt->dst); @@ -1484,7 +1483,7 @@ static int __ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), void *from, size_t length, int transhdrlen, - unsigned int flags) + unsigned int flags, struct ipcm6_cookie *ipc6) { struct sk_buff *skb, *skb_prev = NULL; struct inet_cork *cork = &cork_full->base; @@ -1540,7 +1539,7 @@ static int __ip6_append_data(struct sock *sk, if (headersize + transhdrlen > mtu) goto emsgsize; - if (cork->length + length > mtu - headersize && v6_cork->dontfrag && + if (cork->length + length > mtu - headersize && ipc6->dontfrag && (sk->sk_protocol == IPPROTO_UDP || sk->sk_protocol == IPPROTO_ICMPV6 || sk->sk_protocol == IPPROTO_RAW)) { @@ -1885,7 +1884,7 @@ int ip6_append_data(struct sock *sk, return __ip6_append_data(sk, &sk->sk_write_queue, &inet->cork, &np->cork, sk_page_frag(sk), getfrag, - from, length, transhdrlen, flags); + from, length, transhdrlen, flags, ipc6); } EXPORT_SYMBOL_GPL(ip6_append_data); @@ -2090,7 +2089,7 @@ struct sk_buff *ip6_make_skb(struct sock *sk, err = __ip6_append_data(sk, &queue, cork, &v6_cork, ¤t->task_frag, getfrag, from, length + exthdrlen, transhdrlen + exthdrlen, - flags); + flags, ipc6); if (err) { __ip6_flush_pending_frames(sk, &queue, cork, &v6_cork); return ERR_PTR(err); From f4b979262fe08e94875dc5c85a9bd7cef784bf9c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Jun 2025 14:26:26 +0000 Subject: [PATCH 526/529] Revert "coredump: hand a pidfd to the usermode coredump helper" This reverts commit ac190912887386e77d5c95a8e001af7f6838f255 which is commit b5325b2a270fcaf7b2a9a0f23d422ca8a5a8bdea upstream. It breaks the Android kernel abi and can be brought back in the future in an abi-safe way if it is really needed. Bug: 161946584 Change-Id: I79cda13392f475443cb9b0aa701b9224df1fa39e Signed-off-by: Greg Kroah-Hartman --- fs/coredump.c | 78 ++++------------------------------------ include/linux/coredump.h | 1 - 2 files changed, 7 insertions(+), 72 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index b0d782367fcc..2e2b66aed59d 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -42,7 +42,6 @@ #include #include #include -#include #include #include @@ -57,13 +56,6 @@ static bool dump_vma_snapshot(struct coredump_params *cprm); static void free_vma_snapshot(struct coredump_params *cprm); -/* - * File descriptor number for the pidfd for the thread-group leader of - * the coredumping task installed into the usermode helper's file - * descriptor table. - */ -#define COREDUMP_PIDFD_NUMBER 3 - static int core_uses_pid; static unsigned int core_pipe_limit; static char core_pattern[CORENAME_MAX_SIZE] = "core"; @@ -333,27 +325,6 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, err = cn_printf(cn, "%lu", rlimit(RLIMIT_CORE)); break; - /* pidfd number */ - case 'F': { - /* - * Installing a pidfd only makes sense if - * we actually spawn a usermode helper. - */ - if (!ispipe) - break; - - /* - * Note that we'll install a pidfd for the - * thread-group leader. We know that task - * linkage hasn't been removed yet and even if - * this @current isn't the actual thread-group - * leader we know that the thread-group leader - * cannot be reaped until @current has exited. - */ - cprm->pid = task_tgid(current); - err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER); - break; - } default: break; } @@ -508,7 +479,7 @@ static void wait_for_dump_helpers(struct file *file) } /* - * umh_coredump_setup + * umh_pipe_setup * helper function to customize the process used * to collect the core in userspace. Specifically * it sets up a pipe and installs it as fd 0 (stdin) @@ -518,62 +489,27 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) +static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; - struct file *pidfs_file = NULL; struct coredump_params *cp = (struct coredump_params *)info->data; int err; - if (cp->pid) { - int fd; - - fd = pidfd_prepare(cp->pid, 0, &pidfs_file); - if (fd < 0) - return fd; - - /* - * We don't care about the fd. We also cannot simply - * replace it below because dup2() will refuse to close - * this file descriptor if its in a larval state. So - * close it! - */ - put_unused_fd(fd); - - /* - * Usermode helpers are childen of either - * system_unbound_wq or of kthreadd. So we know that - * we're starting off with a clean file descriptor - * table. So we should always be able to use - * COREDUMP_PIDFD_NUMBER as our file descriptor value. - */ - err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0); - if (err < 0) - goto out_fail; - - pidfs_file = NULL; - } - err = create_pipe_files(files, 0); if (err) - goto out_fail; + return err; cp->file = files[1]; err = replace_fd(0, files[0], 0); fput(files[0]); if (err < 0) - goto out_fail; + return err; /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - err = 0; - -out_fail: - if (pidfs_file) - fput(pidfs_file); - return err; + return 0; } void do_coredump(const kernel_siginfo_t *siginfo) @@ -649,7 +585,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) } if (cprm.limit == 1) { - /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. + /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. * * Normally core limits are irrelevant to pipes, since * we're not writing to the file system, but we use @@ -694,7 +630,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) retval = -ENOMEM; sub_info = call_usermodehelper_setup(helper_argv[0], helper_argv, NULL, GFP_KERNEL, - umh_coredump_setup, NULL, &cprm); + umh_pipe_setup, NULL, &cprm); if (sub_info) retval = call_usermodehelper_exec(sub_info, UMH_WAIT_EXEC); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 2b7ec58299ca..08a1d3e7e46d 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -28,7 +28,6 @@ struct coredump_params { int vma_count; size_t vma_data_size; struct core_vma_metadata *vma_meta; - struct pid *pid; }; /* From 22d163151df1c871ae10da4810e0598de9d8c050 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Jun 2025 14:27:18 +0000 Subject: [PATCH 527/529] Revert "perf: Avoid the read if the count is already updated" This reverts commit e1c3bfe365f10c3c5cfa53e16ad60201879f74f4 which is commit 8ce939a0fa194939cc1f92dbd8bc1a7806e7d40a upstream. It breaks the Android kernel abi and can be brought back in the future in an abi-safe way if it is really needed. Bug: 161946584 Change-Id: I9c672a97df39e7381e10c7cf113a5a36f76c90e4 Signed-off-by: Greg Kroah-Hartman --- include/linux/perf_event.h | 8 +------- kernel/events/core.c | 33 +++++++++++++++++---------------- kernel/events/ring_buffer.c | 1 - 3 files changed, 18 insertions(+), 24 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index f9804f51f38a..6ef9152c8348 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -937,13 +937,7 @@ struct perf_output_handle { struct perf_buffer *rb; unsigned long wakeup; unsigned long size; - union { - u64 flags; /* perf_output*() */ - u64 aux_flags; /* perf_aux_output*() */ - struct { - u64 skip_read : 1; - }; - }; + u64 aux_flags; union { void *addr; unsigned long head; diff --git a/kernel/events/core.c b/kernel/events/core.c index 43792736d14d..1ab3eef5cc7d 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -1180,12 +1180,6 @@ static void perf_event_ctx_deactivate(struct perf_event_context *ctx) list_del_init(&ctx->active_ctx_list); } -static inline void perf_pmu_read(struct perf_event *event) -{ - if (event->state == PERF_EVENT_STATE_ACTIVE) - event->pmu->read(event); -} - static void get_ctx(struct perf_event_context *ctx) { refcount_inc(&ctx->refcount); @@ -3395,7 +3389,8 @@ static void __perf_event_sync_stat(struct perf_event *event, * we know the event must be on the current CPU, therefore we * don't need to use it. */ - perf_pmu_read(event); + if (event->state == PERF_EVENT_STATE_ACTIVE) + event->pmu->read(event); perf_event_update_time(event); @@ -4449,8 +4444,15 @@ static void __perf_event_read(void *info) pmu->read(event); - for_each_sibling_event(sub, event) - perf_pmu_read(sub); + for_each_sibling_event(sub, event) { + if (sub->state == PERF_EVENT_STATE_ACTIVE) { + /* + * Use sibling's PMU rather than @event's since + * sibling could be on different (eg: software) PMU. + */ + sub->pmu->read(sub); + } + } data->ret = pmu->commit_txn(pmu); @@ -7050,8 +7052,9 @@ static void perf_output_read_group(struct perf_output_handle *handle, if (read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = running; - if ((leader != event) && !handle->skip_read) - perf_pmu_read(leader); + if ((leader != event) && + (leader->state == PERF_EVENT_STATE_ACTIVE)) + leader->pmu->read(leader); values[n++] = perf_event_count(leader); if (read_format & PERF_FORMAT_ID) @@ -7064,8 +7067,9 @@ static void perf_output_read_group(struct perf_output_handle *handle, for_each_sibling_event(sub, leader) { n = 0; - if ((sub != event) && !handle->skip_read) - perf_pmu_read(sub); + if ((sub != event) && + (sub->state == PERF_EVENT_STATE_ACTIVE)) + sub->pmu->read(sub); values[n++] = perf_event_count(sub); if (read_format & PERF_FORMAT_ID) @@ -7120,9 +7124,6 @@ void perf_output_sample(struct perf_output_handle *handle, { u64 sample_type = data->type; - if (data->sample_flags & PERF_SAMPLE_READ) - handle->skip_read = 1; - perf_output_put(handle, *header); if (sample_type & PERF_SAMPLE_IDENTIFIER) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 37f7451010b7..b9c010a0e0fe 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -181,7 +181,6 @@ __perf_output_begin(struct perf_output_handle *handle, handle->rb = rb; handle->event = event; - handle->flags = 0; have_lost = local_read(&rb->lost); if (unlikely(have_lost)) { From eb47017bf9e1aaef27af2e797745c79d2e9467b4 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Jun 2025 14:27:50 +0000 Subject: [PATCH 528/529] Revert "genirq/msi: Store the IOMMU IOVA directly in msi_desc instead of iommu_cookie" This reverts commit e4d3763223c7b72ded53425207075e7453b4e3d5 which is commit 1f7df3a691740a7736bbc99dc4ed536120eb4746 upstream. It breaks the Android kernel abi and can be brought back in the future in an abi-safe way if it is really needed. Bug: 161946584 Change-Id: Ib0d3d22614baadb6bd1a924fb46a905e05f4b23b Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/dma-iommu.c | 28 +++++++++++++++------------- include/linux/msi.h | 33 +++++++++++++++++++++------------ 2 files changed, 36 insertions(+), 25 deletions(-) diff --git a/drivers/iommu/dma-iommu.c b/drivers/iommu/dma-iommu.c index 5ed5117b6087..3425d9b41a62 100644 --- a/drivers/iommu/dma-iommu.c +++ b/drivers/iommu/dma-iommu.c @@ -1679,7 +1679,7 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) static DEFINE_MUTEX(msi_prepare_lock); /* see below */ if (!domain || !domain->iova_cookie) { - msi_desc_set_iommu_msi_iova(desc, 0, 0); + desc->iommu_cookie = NULL; return 0; } @@ -1691,12 +1691,11 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) mutex_lock(&msi_prepare_lock); msi_page = iommu_dma_get_msi_page(dev, msi_addr, domain); mutex_unlock(&msi_prepare_lock); + + msi_desc_set_iommu_cookie(desc, msi_page); + if (!msi_page) return -ENOMEM; - - msi_desc_set_iommu_msi_iova( - desc, msi_page->iova, - ilog2(cookie_msi_granule(domain->iova_cookie))); return 0; } @@ -1707,15 +1706,18 @@ int iommu_dma_prepare_msi(struct msi_desc *desc, phys_addr_t msi_addr) */ void iommu_dma_compose_msi_msg(struct msi_desc *desc, struct msi_msg *msg) { -#ifdef CONFIG_IRQ_MSI_IOMMU - if (desc->iommu_msi_shift) { - u64 msi_iova = desc->iommu_msi_iova << desc->iommu_msi_shift; + struct device *dev = msi_desc_to_dev(desc); + const struct iommu_domain *domain = iommu_get_domain_for_dev(dev); + const struct iommu_dma_msi_page *msi_page; - msg->address_hi = upper_32_bits(msi_iova); - msg->address_lo = lower_32_bits(msi_iova) | - (msg->address_lo & ((1 << desc->iommu_msi_shift) - 1)); - } -#endif + msi_page = msi_desc_get_iommu_cookie(desc); + + if (!domain || !domain->iova_cookie || WARN_ON(!msi_page)) + return; + + msg->address_hi = upper_32_bits(msi_page->iova); + msg->address_lo &= cookie_msi_granule(domain->iova_cookie) - 1; + msg->address_lo += lower_32_bits(msi_page->iova); } static int iommu_dma_init(void) diff --git a/include/linux/msi.h b/include/linux/msi.h index 1bf8d126f792..e5dfb9cf3aa1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -129,10 +129,6 @@ struct pci_msi_desc { * @dev: Pointer to the device which uses this descriptor * @msg: The last set MSI message cached for reuse * @affinity: Optional pointer to a cpu affinity mask for this descriptor - * @iommu_msi_iova: Optional shifted IOVA from the IOMMU to override the msi_addr. - * Only used if iommu_msi_shift != 0 - * @iommu_msi_shift: Indicates how many bits of the original address should be - * preserved when using iommu_msi_iova. * @sysfs_attr: Pointer to sysfs device attribute * * @write_msi_msg: Callback that may be called when the MSI message @@ -150,8 +146,7 @@ struct msi_desc { struct msi_msg msg; struct irq_affinity_desc *affinity; #ifdef CONFIG_IRQ_MSI_IOMMU - u64 iommu_msi_iova : 58; - u64 iommu_msi_shift : 6; + const void *iommu_cookie; #endif #ifdef CONFIG_SYSFS struct device_attribute *sysfs_attrs; @@ -219,15 +214,29 @@ struct msi_desc *msi_next_desc(struct device *dev, enum msi_desc_filter filter); #define msi_desc_to_dev(desc) ((desc)->dev) -static inline void msi_desc_set_iommu_msi_iova(struct msi_desc *desc, u64 msi_iova, - unsigned int msi_shift) -{ #ifdef CONFIG_IRQ_MSI_IOMMU - desc->iommu_msi_iova = msi_iova >> msi_shift; - desc->iommu_msi_shift = msi_shift; -#endif +static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) +{ + return desc->iommu_cookie; } +static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, + const void *iommu_cookie) +{ + desc->iommu_cookie = iommu_cookie; +} +#else +static inline const void *msi_desc_get_iommu_cookie(struct msi_desc *desc) +{ + return NULL; +} + +static inline void msi_desc_set_iommu_cookie(struct msi_desc *desc, + const void *iommu_cookie) +{ +} +#endif + #ifdef CONFIG_PCI_MSI struct pci_dev *msi_desc_to_pci_dev(struct msi_desc *desc); void pci_write_msi_msg(unsigned int irq, struct msi_msg *msg); From 18282059e07b5950d22bbe38fe69034156871d43 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Jun 2025 14:30:14 +0000 Subject: [PATCH 529/529] Revert "espintcp: remove encap socket caching to avoid reference leak" This reverts commit e4cde54b46a87231c77256a633be1bef62687d69 which is commit 028363685bd0b7a19b4a820f82dd905b1dc83999 upstream. It breaks the Android kernel abi and can be brought back in the future in an abi-safe way if it is really needed. Bug: 161946584 Change-Id: I05ba7cd52d70dad7ccd570cfad6daec79c5ca791 Signed-off-by: Greg Kroah-Hartman --- include/net/xfrm.h | 1 + net/ipv4/esp4.c | 49 +++++++++++++++++++++++++++++++++++++++---- net/ipv6/esp6.c | 49 +++++++++++++++++++++++++++++++++++++++---- net/xfrm/xfrm_state.c | 3 +++ 4 files changed, 94 insertions(+), 8 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 505542501162..0740f1025630 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -212,6 +212,7 @@ struct xfrm_state { /* Data for encapsulator */ struct xfrm_encap_tmpl *encap; + struct sock __rcu *encap_sk; /* Data for care-of address */ xfrm_address_t *coaddr; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 8f5417ff355d..419969b26822 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -118,16 +118,47 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) } #ifdef CONFIG_INET_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + static struct sock *esp_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); + struct esp_tcp_sk *esk; __be16 sport, dport; + struct sock *nsk; struct sock *sk; + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } spin_unlock_bh(&x->lock); sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, @@ -140,6 +171,20 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + return sk; } @@ -162,8 +207,6 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); - sock_put(sk); - out: rcu_read_unlock(); return err; @@ -348,8 +391,6 @@ static struct ip_esp_hdr *esp_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); - sock_put(sk); - *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 085a83b807af..a021c88d3d9b 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -135,16 +135,47 @@ static void esp_ssg_unref(struct xfrm_state *x, void *tmp) } #ifdef CONFIG_INET6_ESPINTCP +struct esp_tcp_sk { + struct sock *sk; + struct rcu_head rcu; +}; + +static void esp_free_tcp_sk(struct rcu_head *head) +{ + struct esp_tcp_sk *esk = container_of(head, struct esp_tcp_sk, rcu); + + sock_put(esk->sk); + kfree(esk); +} + static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; struct net *net = xs_net(x); + struct esp_tcp_sk *esk; __be16 sport, dport; + struct sock *nsk; struct sock *sk; + sk = rcu_dereference(x->encap_sk); + if (sk && sk->sk_state == TCP_ESTABLISHED) + return sk; + spin_lock_bh(&x->lock); sport = encap->encap_sport; dport = encap->encap_dport; + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (sk && sk == nsk) { + esk = kmalloc(sizeof(*esk), GFP_ATOMIC); + if (!esk) { + spin_unlock_bh(&x->lock); + return ERR_PTR(-ENOMEM); + } + RCU_INIT_POINTER(x->encap_sk, NULL); + esk->sk = sk; + call_rcu(&esk->rcu, esp_free_tcp_sk); + } spin_unlock_bh(&x->lock); sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, @@ -157,6 +188,20 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) return ERR_PTR(-EINVAL); } + spin_lock_bh(&x->lock); + nsk = rcu_dereference_protected(x->encap_sk, + lockdep_is_held(&x->lock)); + if (encap->encap_sport != sport || + encap->encap_dport != dport) { + sock_put(sk); + sk = nsk ?: ERR_PTR(-EREMCHG); + } else if (sk == nsk) { + sock_put(sk); + } else { + rcu_assign_pointer(x->encap_sk, sk); + } + spin_unlock_bh(&x->lock); + return sk; } @@ -179,8 +224,6 @@ static int esp_output_tcp_finish(struct xfrm_state *x, struct sk_buff *skb) err = espintcp_push_skb(sk, skb); bh_unlock_sock(sk); - sock_put(sk); - out: rcu_read_unlock(); return err; @@ -384,8 +427,6 @@ static struct ip_esp_hdr *esp6_output_tcp_encap(struct xfrm_state *x, if (IS_ERR(sk)) return ERR_CAST(sk); - sock_put(sk); - *lenp = htons(len); esph = (struct ip_esp_hdr *)(lenp + 1); diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index f43c09b85f96..a38ea5ae9d5c 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -694,6 +694,9 @@ int __xfrm_state_delete(struct xfrm_state *x) net->xfrm.state_num--; spin_unlock(&net->xfrm.xfrm_state_lock); + if (x->encap_sk) + sock_put(rcu_dereference_raw(x->encap_sk)); + xfrm_dev_state_delete(x); /* All xfrm_state objects are created by xfrm_state_alloc.