From 7cfc972acb0d8236d7c4ffad81b6397650d35e51 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 21 Sep 2024 14:25:37 -0400 Subject: [PATCH 01/29] NFSD: Mark filecache "down" if init fails [ Upstream commit dc0d0f885aa422f621bc1c2124133eff566b0bc8 ] NeilBrown says: > The handling of NFSD_FILE_CACHE_UP is strange. nfsd_file_cache_init() > sets it, but doesn't clear it on failure. So if nfsd_file_cache_init() > fails for some reason, nfsd_file_cache_shutdown() would still try to > clean up if it was called. Reported-by: NeilBrown Fixes: c7b824c3d06c ("NFSD: Replace the "init once" mechanism") Signed-off-by: Chuck Lever Signed-off-by: Sasha Levin --- fs/nfsd/filecache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 9e81f3a9097e..c9393c732889 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -718,7 +718,7 @@ nfsd_file_cache_init(void) ret = rhltable_init(&nfsd_file_rhltable, &nfsd_file_rhash_params); if (ret) - return ret; + goto out; ret = -ENOMEM; nfsd_filecache_wq = alloc_workqueue("nfsd_filecache", 0, 0); @@ -770,6 +770,8 @@ nfsd_file_cache_init(void) INIT_DELAYED_WORK(&nfsd_filecache_laundrette, nfsd_file_gc_worker); out: + if (ret) + clear_bit(NFSD_FILE_CACHE_UP, &nfsd_file_flags); return ret; out_notifier: lease_unregister_notifier(&nfsd_file_lease_notifier); From 8ef4af06713bc7153cca99d8dbaaaa6c2ef25a48 Mon Sep 17 00:00:00 2001 From: Dave Ertman Date: Wed, 18 Sep 2024 14:02:56 -0400 Subject: [PATCH 02/29] ice: fix VLAN replay after reset [ Upstream commit 0eae2c136cb624e4050092feb59f18159b4f2512 ] There is a bug currently when there are more than one VLAN defined and any reset that affects the PF is initiated, after the reset rebuild no traffic will pass on any VLAN but the last one created. This is caused by the iteration though the VLANs during replay each clearing the vsi_map bitmap of the VSI that is being replayed. The problem is that during rhe replay, the pointer to the vsi_map bitmap is used by each successive vlan to determine if it should be replayed on this VSI. The logic was that the replay of the VLAN would replace the bit in the map before the next VLAN would iterate through. But, since the replay copies the old bitmap pointer to filt_replay_rules and creates a new one for the recreated VLANS, it does not do this, and leaves the old bitmap broken to be used to replay the remaining VLANs. Since the old bitmap will be cleaned up in post replay cleanup, there is no need to alter it and break following VLAN replay, so don't clear the bit. Fixes: 334cb0626de1 ("ice: Implement VSI replay framework") Reviewed-by: Przemek Kitszel Signed-off-by: Dave Ertman Reviewed-by: Jacob Keller Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ice/ice_switch.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index dc4ce3bd412d..3a29ae46fb39 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -6317,8 +6317,6 @@ ice_replay_vsi_fltr(struct ice_hw *hw, u16 vsi_handle, u8 recp_id, if (!itr->vsi_list_info || !test_bit(vsi_handle, itr->vsi_list_info->vsi_map)) continue; - /* Clearing it so that the logic can add it back */ - clear_bit(vsi_handle, itr->vsi_list_info->vsi_map); f_entry.fltr_info.vsi_handle = vsi_handle; f_entry.fltr_info.fltr_act = ICE_FWD_TO_VSI; /* update the src in case it is VSI num */ From ae315bcc30ccabdd93a491b02e8ad04cd5790a37 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 19 Sep 2024 11:50:33 +0300 Subject: [PATCH 03/29] SUNRPC: Fix integer overflow in decode_rc_list() [ Upstream commit 6dbf1f341b6b35bcc20ff95b6b315e509f6c5369 ] The math in "rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)" could have an integer overflow. Add bounds checking on rc_list->rcl_nrefcalls to fix that. Fixes: 4aece6a19cf7 ("nfs41: cb_sequence xdr implementation") Signed-off-by: Dan Carpenter Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/callback_xdr.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c index d0cccddb7d08..fa519ce5c841 100644 --- a/fs/nfs/callback_xdr.c +++ b/fs/nfs/callback_xdr.c @@ -372,6 +372,8 @@ static __be32 decode_rc_list(struct xdr_stream *xdr, rc_list->rcl_nrefcalls = ntohl(*p++); if (rc_list->rcl_nrefcalls) { + if (unlikely(rc_list->rcl_nrefcalls > xdr->buf->len)) + goto out; p = xdr_inline_decode(xdr, rc_list->rcl_nrefcalls * 2 * sizeof(uint32_t)); if (unlikely(p == NULL)) From 632344b9efa064ca737bfcdaaaced59fd5f18ae9 Mon Sep 17 00:00:00 2001 From: Yanjun Zhang Date: Tue, 1 Oct 2024 16:39:30 +0800 Subject: [PATCH 04/29] NFSv4: Prevent NULL-pointer dereference in nfs42_complete_copies() [ Upstream commit a848c29e3486189aaabd5663bc11aea50c5bd144 ] On the node of an NFS client, some files saved in the mountpoint of the NFS server were copied to another location of the same NFS server. Accidentally, the nfs42_complete_copies() got a NULL-pointer dereference crash with the following syslog: [232064.838881] NFSv4: state recovery failed for open file nfs/pvc-12b5200d-cd0f-46a3-b9f0-af8f4fe0ef64.qcow2, error = -116 [232064.839360] NFSv4: state recovery failed for open file nfs/pvc-12b5200d-cd0f-46a3-b9f0-af8f4fe0ef64.qcow2, error = -116 [232066.588183] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 [232066.588586] Mem abort info: [232066.588701] ESR = 0x0000000096000007 [232066.588862] EC = 0x25: DABT (current EL), IL = 32 bits [232066.589084] SET = 0, FnV = 0 [232066.589216] EA = 0, S1PTW = 0 [232066.589340] FSC = 0x07: level 3 translation fault [232066.589559] Data abort info: [232066.589683] ISV = 0, ISS = 0x00000007 [232066.589842] CM = 0, WnR = 0 [232066.589967] user pgtable: 64k pages, 48-bit VAs, pgdp=00002000956ff400 [232066.590231] [0000000000000058] pgd=08001100ae100003, p4d=08001100ae100003, pud=08001100ae100003, pmd=08001100b3c00003, pte=0000000000000000 [232066.590757] Internal error: Oops: 96000007 [#1] SMP [232066.590958] Modules linked in: rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache netfs ocfs2_dlmfs ocfs2_stack_o2cb ocfs2_dlm vhost_net vhost vhost_iotlb tap tun ipt_rpfilter xt_multiport ip_set_hash_ip ip_set_hash_net xfrm_interface xfrm6_tunnel tunnel4 tunnel6 esp4 ah4 wireguard libcurve25519_generic veth xt_addrtype xt_set nf_conntrack_netlink ip_set_hash_ipportnet ip_set_hash_ipportip ip_set_bitmap_port ip_set_hash_ipport dummy ip_set ip_vs_sh ip_vs_wrr ip_vs_rr ip_vs iptable_filter sch_ingress nfnetlink_cttimeout vport_gre ip_gre ip_tunnel gre vport_geneve geneve vport_vxlan vxlan ip6_udp_tunnel udp_tunnel openvswitch nf_conncount dm_round_robin dm_service_time dm_multipath xt_nat xt_MASQUERADE nft_chain_nat nf_nat xt_mark xt_conntrack xt_comment nft_compat nft_counter nf_tables nfnetlink ocfs2 ocfs2_nodemanager ocfs2_stackglue iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi ipmi_ssif nbd overlay 8021q garp mrp bonding tls rfkill sunrpc ext4 mbcache jbd2 [232066.591052] vfat fat cas_cache cas_disk ses enclosure scsi_transport_sas sg acpi_ipmi ipmi_si ipmi_devintf ipmi_msghandler ip_tables vfio_pci vfio_pci_core vfio_virqfd vfio_iommu_type1 vfio dm_mirror dm_region_hash dm_log dm_mod nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 br_netfilter bridge stp llc fuse xfs libcrc32c ast drm_vram_helper qla2xxx drm_kms_helper syscopyarea crct10dif_ce sysfillrect ghash_ce sysimgblt sha2_ce fb_sys_fops cec sha256_arm64 sha1_ce drm_ttm_helper ttm nvme_fc igb sbsa_gwdt nvme_fabrics drm nvme_core i2c_algo_bit i40e scsi_transport_fc megaraid_sas aes_neon_bs [232066.596953] CPU: 6 PID: 4124696 Comm: 10.253.166.125- Kdump: loaded Not tainted 5.15.131-9.cl9_ocfs2.aarch64 #1 [232066.597356] Hardware name: Great Wall .\x93\x8e...RF6260 V5/GWMSSE2GL1T, BIOS T656FBE_V3.0.18 2024-01-06 [232066.597721] pstate: 20400009 (nzCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) [232066.598034] pc : nfs4_reclaim_open_state+0x220/0x800 [nfsv4] [232066.598327] lr : nfs4_reclaim_open_state+0x12c/0x800 [nfsv4] [232066.598595] sp : ffff8000f568fc70 [232066.598731] x29: ffff8000f568fc70 x28: 0000000000001000 x27: ffff21003db33000 [232066.599030] x26: ffff800005521ae0 x25: ffff0100f98fa3f0 x24: 0000000000000001 [232066.599319] x23: ffff800009920008 x22: ffff21003db33040 x21: ffff21003db33050 [232066.599628] x20: ffff410172fe9e40 x19: ffff410172fe9e00 x18: 0000000000000000 [232066.599914] x17: 0000000000000000 x16: 0000000000000004 x15: 0000000000000000 [232066.600195] x14: 0000000000000000 x13: ffff800008e685a8 x12: 00000000eac0c6e6 [232066.600498] x11: 0000000000000000 x10: 0000000000000008 x9 : ffff8000054e5828 [232066.600784] x8 : 00000000ffffffbf x7 : 0000000000000001 x6 : 000000000a9eb14a [232066.601062] x5 : 0000000000000000 x4 : ffff70ff8a14a800 x3 : 0000000000000058 [232066.601348] x2 : 0000000000000001 x1 : 54dce46366daa6c6 x0 : 0000000000000000 [232066.601636] Call trace: [232066.601749] nfs4_reclaim_open_state+0x220/0x800 [nfsv4] [232066.601998] nfs4_do_reclaim+0x1b8/0x28c [nfsv4] [232066.602218] nfs4_state_manager+0x928/0x10f0 [nfsv4] [232066.602455] nfs4_run_state_manager+0x78/0x1b0 [nfsv4] [232066.602690] kthread+0x110/0x114 [232066.602830] ret_from_fork+0x10/0x20 [232066.602985] Code: 1400000d f9403f20 f9402e61 91016003 (f9402c00) [232066.603284] SMP: stopping secondary CPUs [232066.606936] Starting crashdump kernel... [232066.607146] Bye! Analysing the vmcore, we know that nfs4_copy_state listed by destination nfs_server->ss_copies was added by the field copies in handle_async_copy(), and we found a waiting copy process with the stack as: PID: 3511963 TASK: ffff710028b47e00 CPU: 0 COMMAND: "cp" #0 [ffff8001116ef740] __switch_to at ffff8000081b92f4 #1 [ffff8001116ef760] __schedule at ffff800008dd0650 #2 [ffff8001116ef7c0] schedule at ffff800008dd0a00 #3 [ffff8001116ef7e0] schedule_timeout at ffff800008dd6aa0 #4 [ffff8001116ef860] __wait_for_common at ffff800008dd166c #5 [ffff8001116ef8e0] wait_for_completion_interruptible at ffff800008dd1898 #6 [ffff8001116ef8f0] handle_async_copy at ffff8000055142f4 [nfsv4] #7 [ffff8001116ef970] _nfs42_proc_copy at ffff8000055147c8 [nfsv4] #8 [ffff8001116efa80] nfs42_proc_copy at ffff800005514cf0 [nfsv4] #9 [ffff8001116efc50] __nfs4_copy_file_range.constprop.0 at ffff8000054ed694 [nfsv4] The NULL-pointer dereference was due to nfs42_complete_copies() listed the nfs_server->ss_copies by the field ss_copies of nfs4_copy_state. So the nfs4_copy_state address ffff0100f98fa3f0 was offset by 0x10 and the data accessed through this pointer was also incorrect. Generally, the ordered list nfs4_state_owner->so_states indicate open(O_RDWR) or open(O_WRITE) states are reclaimed firstly by nfs4_reclaim_open_state(). When destination state reclaim is failed with NFS_STATE_RECOVERY_FAILED and copies are not deleted in nfs_server->ss_copies, the source state may be passed to the nfs42_complete_copies() process earlier, resulting in this crash scene finally. To solve this issue, we add a list_head nfs_server->ss_src_copies for a server-to-server copy specially. Fixes: 0e65a32c8a56 ("NFS: handle source server reboot") Signed-off-by: Yanjun Zhang Reviewed-by: Trond Myklebust Signed-off-by: Anna Schumaker Signed-off-by: Sasha Levin --- fs/nfs/client.c | 1 + fs/nfs/nfs42proc.c | 2 +- fs/nfs/nfs4state.c | 2 +- include/linux/nfs_fs_sb.h | 1 + 4 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 755256875052..a8930e6c417f 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -965,6 +965,7 @@ struct nfs_server *nfs_alloc_server(void) INIT_LIST_HEAD(&server->layouts); INIT_LIST_HEAD(&server->state_owners_lru); INIT_LIST_HEAD(&server->ss_copies); + INIT_LIST_HEAD(&server->ss_src_copies); atomic_set(&server->active, 0); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 5a8fe0e57a3d..89c32a963dd1 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -218,7 +218,7 @@ static int handle_async_copy(struct nfs42_copy_res *res, if (dst_server != src_server) { spin_lock(&src_server->nfs_client->cl_lock); - list_add_tail(©->src_copies, &src_server->ss_copies); + list_add_tail(©->src_copies, &src_server->ss_src_copies); spin_unlock(&src_server->nfs_client->cl_lock); } diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 4a5c47d791f2..48ea40660422 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1594,7 +1594,7 @@ static void nfs42_complete_copies(struct nfs4_state_owner *sp, struct nfs4_state complete(©->completion); } } - list_for_each_entry(copy, &sp->so_server->ss_copies, src_copies) { + list_for_each_entry(copy, &sp->so_server->ss_src_copies, src_copies) { if ((test_bit(NFS_CLNT_SRC_SSC_COPY_STATE, &state->flags) && !nfs4_stateid_match_other(&state->stateid, ©->parent_src_state->stateid))) diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index ef8ba5fbc650..9ea9f9087a71 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -235,6 +235,7 @@ struct nfs_server { struct list_head layouts; struct list_head delegations; struct list_head ss_copies; + struct list_head ss_src_copies; unsigned long mig_gen; unsigned long mig_status; From c1944b4253649fc6f2fb53e7d6302eb414d2182c Mon Sep 17 00:00:00 2001 From: Ingo van Lil Date: Wed, 2 Oct 2024 18:18:07 +0200 Subject: [PATCH 05/29] net: phy: dp83869: fix memory corruption when enabling fiber [ Upstream commit a842e443ca8184f2dc82ab307b43a8b38defd6a5 ] When configuring the fiber port, the DP83869 PHY driver incorrectly calls linkmode_set_bit() with a bit mask (1 << 10) rather than a bit number (10). This corrupts some other memory location -- in case of arm64 the priv pointer in the same structure. Since the advertising flags are updated from supported at the end of the function the incorrect line isn't needed at all and can be removed. Fixes: a29de52ba2a1 ("net: dp83869: Add ability to advertise Fiber connection") Signed-off-by: Ingo van Lil Reviewed-by: Alexander Sverdlin Reviewed-by: Andrew Lunn Link: https://patch.msgid.link/20241002161807.440378-1-inguin@gmx.de Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/phy/dp83869.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/phy/dp83869.c b/drivers/net/phy/dp83869.c index 9ab5eff502b7..b924f98b2397 100644 --- a/drivers/net/phy/dp83869.c +++ b/drivers/net/phy/dp83869.c @@ -645,7 +645,6 @@ static int dp83869_configure_fiber(struct phy_device *phydev, phydev->supported); linkmode_set_bit(ETHTOOL_LINK_MODE_FIBRE_BIT, phydev->supported); - linkmode_set_bit(ADVERTISED_FIBRE, phydev->advertising); if (dp83869->mode == DP83869_RGMII_1000_BASE) { linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseX_Full_BIT, From dc566bf54b12b611b48378e56ea620938e155d7d Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 1 Oct 2024 20:05:15 +0000 Subject: [PATCH 06/29] tcp: fix to allow timestamp undo if no retransmits were sent [ Upstream commit e37ab7373696e650d3b6262a5b882aadad69bb9e ] Fix the TCP loss recovery undo logic in tcp_packet_delayed() so that it can trigger undo even if TSQ prevents a fast recovery episode from reaching tcp_retransmit_skb(). Geumhwan Yu recently reported that after this commit from 2019: commit bc9f38c8328e ("tcp: avoid unconditional congestion window undo on SYN retransmit") ...and before this fix we could have buggy scenarios like the following: + Due to reordering, a TCP connection receives some SACKs and enters a spurious fast recovery. + TSQ prevents all invocations of tcp_retransmit_skb(), because many skbs are queued in lower layers of the sending machine's network stack; thus tp->retrans_stamp remains 0. + The connection receives a TCP timestamp ECR value echoing a timestamp before the fast recovery, indicating that the fast recovery was spurious. + The connection fails to undo the spurious fast recovery because tp->retrans_stamp is 0, and thus tcp_packet_delayed() returns false, due to the new logic in the 2019 commit: commit bc9f38c8328e ("tcp: avoid unconditional congestion window undo on SYN retransmit") This fix tweaks the logic to be more similar to the tcp_packet_delayed() logic before bc9f38c8328e, except that we take care not to be fooled by the FLAG_SYN_ACKED code path zeroing out tp->retrans_stamp (the bug noted and fixed by Yuchung in bc9f38c8328e). Note that this returns the high-level behavior of tcp_packet_delayed() to again match the comment for the function, which says: "Nothing was retransmitted or returned timestamp is less than timestamp of the first retransmission." Note that this comment is in the original 2005-04-16 Linux git commit, so this is evidently long-standing behavior. Fixes: bc9f38c8328e ("tcp: avoid unconditional congestion window undo on SYN retransmit") Reported-by: Geumhwan Yu Diagnosed-by: Geumhwan Yu Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241001200517.2756803-2-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/tcp_input.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4c7a2702d904..5b7345a3a96e 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2440,8 +2440,22 @@ static bool tcp_skb_spurious_retrans(const struct tcp_sock *tp, */ static inline bool tcp_packet_delayed(const struct tcp_sock *tp) { - return tp->retrans_stamp && - tcp_tsopt_ecr_before(tp, tp->retrans_stamp); + const struct sock *sk = (const struct sock *)tp; + + if (tp->retrans_stamp && + tcp_tsopt_ecr_before(tp, tp->retrans_stamp)) + return true; /* got echoed TS before first retransmission */ + + /* Check if nothing was retransmitted (retrans_stamp==0), which may + * happen in fast recovery due to TSQ. But we ignore zero retrans_stamp + * in TCP_SYN_SENT, since when we set FLAG_SYN_ACKED we also clear + * retrans_stamp even if we had retransmitted the SYN. + */ + if (!tp->retrans_stamp && /* no record of a retransmit/SYN? */ + sk->sk_state != TCP_SYN_SENT) /* not the FLAG_SYN_ACKED case? */ + return true; /* nothing was retransmitted */ + + return false; } /* Undo procedures. */ From ed36591af26fa025de7606823f06758868228e7b Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Tue, 1 Oct 2024 20:05:16 +0000 Subject: [PATCH 07/29] tcp: fix tcp_enter_recovery() to zero retrans_stamp when it's safe [ Upstream commit b41b4cbd9655bcebcce941bef3601db8110335be ] Fix tcp_enter_recovery() so that if there are no retransmits out then we zero retrans_stamp when entering fast recovery. This is necessary to fix two buggy behaviors. Currently a non-zero retrans_stamp value can persist across multiple back-to-back loss recovery episodes. This is because we generally only clears retrans_stamp if we are completely done with loss recoveries, and get to tcp_try_to_open() and find !tcp_any_retrans_done(sk). This behavior causes two bugs: (1) When a loss recovery episode (CA_Loss or CA_Recovery) is followed immediately by a new CA_Recovery, the retrans_stamp value can persist and can be a time before this new CA_Recovery episode starts. That means that timestamp-based undo will be using the wrong retrans_stamp (a value that is too old) when comparing incoming TS ecr values to retrans_stamp to see if the current fast recovery episode can be undone. (2) If there is a roughly minutes-long sequence of back-to-back fast recovery episodes, one after another (e.g. in a shallow-buffered or policed bottleneck), where each fast recovery successfully makes forward progress and recovers one window of sequence space (but leaves at least one retransmit in flight at the end of the recovery), followed by several RTOs, then the ETIMEDOUT check may be using the wrong retrans_stamp (a value set at the start of the first fast recovery in the sequence). This can cause a very premature ETIMEDOUT, killing the connection prematurely. This commit changes the code to zero retrans_stamp when entering fast recovery, when this is known to be safe (no retransmits are out in the network). That ensures that when starting a fast recovery episode, and it is safe to do so, retrans_stamp is set when we send the fast retransmit packet. That addresses both bug (1) and bug (2) by ensuring that (if no retransmits are out when we start a fast recovery) we use the initial fast retransmit of this fast recovery as the time value for undo and ETIMEDOUT calculations. This makes intuitive sense, since the start of a new fast recovery episode (in a scenario where no lost packets are out in the network) means that the connection has made forward progress since the last RTO or fast recovery, and we should thus "restart the clock" used for both undo and ETIMEDOUT logic. Note that if when we start fast recovery there *are* retransmits out in the network, there can still be undesirable (1)/(2) issues. For example, after this patch we can still have the (1) and (2) problems in cases like this: + round 1: sender sends flight 1 + round 2: sender receives SACKs and enters fast recovery 1, retransmits some packets in flight 1 and then sends some new data as flight 2 + round 3: sender receives some SACKs for flight 2, notes losses, and retransmits some packets to fill the holes in flight 2 + fast recovery has some lost retransmits in flight 1 and continues for one or more rounds sending retransmits for flight 1 and flight 2 + fast recovery 1 completes when snd_una reaches high_seq at end of flight 1 + there are still holes in the SACK scoreboard in flight 2, so we enter fast recovery 2, but some retransmits in the flight 2 sequence range are still in flight (retrans_out > 0), so we can't execute the new retrans_stamp=0 added here to clear retrans_stamp It's not yet clear how to fix these remaining (1)/(2) issues in an efficient way without breaking undo behavior, given that retrans_stamp is currently used for undo and ETIMEDOUT. Perhaps the optimal (but expensive) strategy would be to set retrans_stamp to the timestamp of the earliest outstanding retransmit when entering fast recovery. But at least this commit makes things better. Note that this does not change the semantics of retrans_stamp; it simply makes retrans_stamp accurate in some cases where it was not before: (1) Some loss recovery, followed by an immediate entry into a fast recovery, where there are no retransmits out when entering the fast recovery. (2) When a TFO server has a SYNACK retransmit that sets retrans_stamp, and then the ACK that completes the 3-way handshake has SACK blocks that trigger a fast recovery. In this case when entering fast recovery we want to zero out the retrans_stamp from the TFO SYNACK retransmit, and set the retrans_stamp based on the timestamp of the fast recovery. We introduce a tcp_retrans_stamp_cleanup() helper, because this two-line sequence already appears in 3 places and is about to appear in 2 more as a result of this bug fix patch series. Once this bug fix patches series in the net branch makes it into the net-next branch we'll update the 3 other call sites to use the new helper. This is a long-standing issue. The Fixes tag below is chosen to be the oldest commit at which the patch will apply cleanly, which is from Linux v3.5 in 2012. Fixes: 1fbc340514fc ("tcp: early retransmit: tcp_enter_recovery()") Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Link: https://patch.msgid.link/20241001200517.2756803-3-ncardwell.sw@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/ipv4/tcp_input.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 5b7345a3a96e..9ac47ccfe120 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2489,6 +2489,16 @@ static bool tcp_any_retrans_done(const struct sock *sk) return false; } +/* If loss recovery is finished and there are no retransmits out in the + * network, then we clear retrans_stamp so that upon the next loss recovery + * retransmits_timed_out() and timestamp-undo are using the correct value. + */ +static void tcp_retrans_stamp_cleanup(struct sock *sk) +{ + if (!tcp_any_retrans_done(sk)) + tcp_sk(sk)->retrans_stamp = 0; +} + static void DBGUNDO(struct sock *sk, const char *msg) { #if FASTRETRANS_DEBUG > 1 @@ -2856,6 +2866,9 @@ void tcp_enter_recovery(struct sock *sk, bool ece_ack) struct tcp_sock *tp = tcp_sk(sk); int mib_idx; + /* Start the clock with our fast retransmit, for undo and ETIMEDOUT. */ + tcp_retrans_stamp_cleanup(sk); + if (tcp_is_reno(tp)) mib_idx = LINUX_MIB_TCPRENORECOVERY; else From 78ed917133b118661e1fe62d4a85d5d428ee9568 Mon Sep 17 00:00:00 2001 From: Andy Roulin Date: Tue, 1 Oct 2024 08:43:59 -0700 Subject: [PATCH 08/29] netfilter: br_netfilter: fix panic with metadata_dst skb [ Upstream commit f9ff7665cd128012868098bbd07e28993e314fdb ] Fix a kernel panic in the br_netfilter module when sending untagged traffic via a VxLAN device. This happens during the check for fragmentation in br_nf_dev_queue_xmit. It is dependent on: 1) the br_netfilter module being loaded; 2) net.bridge.bridge-nf-call-iptables set to 1; 3) a bridge with a VxLAN (single-vxlan-device) netdevice as a bridge port; 4) untagged frames with size higher than the VxLAN MTU forwarded/flooded When forwarding the untagged packet to the VxLAN bridge port, before the netfilter hooks are called, br_handle_egress_vlan_tunnel is called and changes the skb_dst to the tunnel dst. The tunnel_dst is a metadata type of dst, i.e., skb_valid_dst(skb) is false, and metadata->dst.dev is NULL. Then in the br_netfilter hooks, in br_nf_dev_queue_xmit, there's a check for frames that needs to be fragmented: frames with higher MTU than the VxLAN device end up calling br_nf_ip_fragment, which in turns call ip_skb_dst_mtu. The ip_dst_mtu tries to use the skb_dst(skb) as if it was a valid dst with valid dst->dev, thus the crash. This case was never supported in the first place, so drop the packet instead. PING 10.0.0.2 (10.0.0.2) from 0.0.0.0 h1-eth0: 2000(2028) bytes of data. [ 176.291791] Unable to handle kernel NULL pointer dereference at virtual address 0000000000000110 [ 176.292101] Mem abort info: [ 176.292184] ESR = 0x0000000096000004 [ 176.292322] EC = 0x25: DABT (current EL), IL = 32 bits [ 176.292530] SET = 0, FnV = 0 [ 176.292709] EA = 0, S1PTW = 0 [ 176.292862] FSC = 0x04: level 0 translation fault [ 176.293013] Data abort info: [ 176.293104] ISV = 0, ISS = 0x00000004, ISS2 = 0x00000000 [ 176.293488] CM = 0, WnR = 0, TnD = 0, TagAccess = 0 [ 176.293787] GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 [ 176.293995] user pgtable: 4k pages, 48-bit VAs, pgdp=0000000043ef5000 [ 176.294166] [0000000000000110] pgd=0000000000000000, p4d=0000000000000000 [ 176.294827] Internal error: Oops: 0000000096000004 [#1] PREEMPT SMP [ 176.295252] Modules linked in: vxlan ip6_udp_tunnel udp_tunnel veth br_netfilter bridge stp llc ipv6 crct10dif_ce [ 176.295923] CPU: 0 PID: 188 Comm: ping Not tainted 6.8.0-rc3-g5b3fbd61b9d1 #2 [ 176.296314] Hardware name: linux,dummy-virt (DT) [ 176.296535] pstate: 80000005 (Nzcv daif -PAN -UAO -TCO -DIT -SSBS BTYPE=--) [ 176.296808] pc : br_nf_dev_queue_xmit+0x390/0x4ec [br_netfilter] [ 176.297382] lr : br_nf_dev_queue_xmit+0x2ac/0x4ec [br_netfilter] [ 176.297636] sp : ffff800080003630 [ 176.297743] x29: ffff800080003630 x28: 0000000000000008 x27: ffff6828c49ad9f8 [ 176.298093] x26: ffff6828c49ad000 x25: 0000000000000000 x24: 00000000000003e8 [ 176.298430] x23: 0000000000000000 x22: ffff6828c4960b40 x21: ffff6828c3b16d28 [ 176.298652] x20: ffff6828c3167048 x19: ffff6828c3b16d00 x18: 0000000000000014 [ 176.298926] x17: ffffb0476322f000 x16: ffffb7e164023730 x15: 0000000095744632 [ 176.299296] x14: ffff6828c3f1c880 x13: 0000000000000002 x12: ffffb7e137926a70 [ 176.299574] x11: 0000000000000001 x10: ffff6828c3f1c898 x9 : 0000000000000000 [ 176.300049] x8 : ffff6828c49bf070 x7 : 0008460f18d5f20e x6 : f20e0100bebafeca [ 176.300302] x5 : ffff6828c7f918fe x4 : ffff6828c49bf070 x3 : 0000000000000000 [ 176.300586] x2 : 0000000000000000 x1 : ffff6828c3c7ad00 x0 : ffff6828c7f918f0 [ 176.300889] Call trace: [ 176.301123] br_nf_dev_queue_xmit+0x390/0x4ec [br_netfilter] [ 176.301411] br_nf_post_routing+0x2a8/0x3e4 [br_netfilter] [ 176.301703] nf_hook_slow+0x48/0x124 [ 176.302060] br_forward_finish+0xc8/0xe8 [bridge] [ 176.302371] br_nf_hook_thresh+0x124/0x134 [br_netfilter] [ 176.302605] br_nf_forward_finish+0x118/0x22c [br_netfilter] [ 176.302824] br_nf_forward_ip.part.0+0x264/0x290 [br_netfilter] [ 176.303136] br_nf_forward+0x2b8/0x4e0 [br_netfilter] [ 176.303359] nf_hook_slow+0x48/0x124 [ 176.303803] __br_forward+0xc4/0x194 [bridge] [ 176.304013] br_flood+0xd4/0x168 [bridge] [ 176.304300] br_handle_frame_finish+0x1d4/0x5c4 [bridge] [ 176.304536] br_nf_hook_thresh+0x124/0x134 [br_netfilter] [ 176.304978] br_nf_pre_routing_finish+0x29c/0x494 [br_netfilter] [ 176.305188] br_nf_pre_routing+0x250/0x524 [br_netfilter] [ 176.305428] br_handle_frame+0x244/0x3cc [bridge] [ 176.305695] __netif_receive_skb_core.constprop.0+0x33c/0xecc [ 176.306080] __netif_receive_skb_one_core+0x40/0x8c [ 176.306197] __netif_receive_skb+0x18/0x64 [ 176.306369] process_backlog+0x80/0x124 [ 176.306540] __napi_poll+0x38/0x17c [ 176.306636] net_rx_action+0x124/0x26c [ 176.306758] __do_softirq+0x100/0x26c [ 176.307051] ____do_softirq+0x10/0x1c [ 176.307162] call_on_irq_stack+0x24/0x4c [ 176.307289] do_softirq_own_stack+0x1c/0x2c [ 176.307396] do_softirq+0x54/0x6c [ 176.307485] __local_bh_enable_ip+0x8c/0x98 [ 176.307637] __dev_queue_xmit+0x22c/0xd28 [ 176.307775] neigh_resolve_output+0xf4/0x1a0 [ 176.308018] ip_finish_output2+0x1c8/0x628 [ 176.308137] ip_do_fragment+0x5b4/0x658 [ 176.308279] ip_fragment.constprop.0+0x48/0xec [ 176.308420] __ip_finish_output+0xa4/0x254 [ 176.308593] ip_finish_output+0x34/0x130 [ 176.308814] ip_output+0x6c/0x108 [ 176.308929] ip_send_skb+0x50/0xf0 [ 176.309095] ip_push_pending_frames+0x30/0x54 [ 176.309254] raw_sendmsg+0x758/0xaec [ 176.309568] inet_sendmsg+0x44/0x70 [ 176.309667] __sys_sendto+0x110/0x178 [ 176.309758] __arm64_sys_sendto+0x28/0x38 [ 176.309918] invoke_syscall+0x48/0x110 [ 176.310211] el0_svc_common.constprop.0+0x40/0xe0 [ 176.310353] do_el0_svc+0x1c/0x28 [ 176.310434] el0_svc+0x34/0xb4 [ 176.310551] el0t_64_sync_handler+0x120/0x12c [ 176.310690] el0t_64_sync+0x190/0x194 [ 176.311066] Code: f9402e61 79402aa2 927ff821 f9400023 (f9408860) [ 176.315743] ---[ end trace 0000000000000000 ]--- [ 176.316060] Kernel panic - not syncing: Oops: Fatal exception in interrupt [ 176.316371] Kernel Offset: 0x37e0e3000000 from 0xffff800080000000 [ 176.316564] PHYS_OFFSET: 0xffff97d780000000 [ 176.316782] CPU features: 0x0,88000203,3c020000,0100421b [ 176.317210] Memory Limit: none [ 176.317527] ---[ end Kernel panic - not syncing: Oops: Fatal Exception in interrupt ]---\ Fixes: 11538d039ac6 ("bridge: vlan dst_metadata hooks in ingress and egress paths") Reviewed-by: Ido Schimmel Signed-off-by: Andy Roulin Acked-by: Nikolay Aleksandrov Link: https://patch.msgid.link/20241001154400.22787-2-aroulin@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/bridge/br_netfilter_hooks.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 9229300881b5..5c6ed1d49b92 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -871,6 +872,10 @@ static int br_nf_dev_queue_xmit(struct net *net, struct sock *sk, struct sk_buff return br_dev_queue_push_xmit(net, sk, skb); } + /* Fragmentation on metadata/template dst is not supported */ + if (unlikely(!skb_valid_dst(skb))) + goto drop; + /* This is wrong! We should preserve the original fragment * boundaries by preserving frag_list rather than refragmenting. */ From 6e39646d7a130576131252a8b04dd89cf50b2bc8 Mon Sep 17 00:00:00 2001 From: Kacper Ludwinski Date: Wed, 2 Oct 2024 14:10:16 +0900 Subject: [PATCH 09/29] selftests: net: no_forwarding: fix VID for $swp2 in one_bridge_two_pvids() test [ Upstream commit 9f49d14ec41ce7be647028d7d34dea727af55272 ] Currently, the second bridge command overwrites the first one. Fix this by adding this VID to the interface behind $swp2. The one_bridge_two_pvids() test intends to check that there is no leakage of traffic between bridge ports which have a single VLAN - the PVID VLAN. Because of a typo, port $swp1 is configured with a PVID twice (second command overwrites first), and $swp2 isn't configured at all (and since the bridge vlan_default_pvid property is set to 0, this port will not have a PVID at all, so it will drop all untagged and priority-tagged traffic). So, instead of testing the configuration that was intended, we are testing a different one, where one port has PVID 2 and the other has no PVID. This incorrect version of the test should also pass, but is ineffective for its purpose, so fix the typo. This typo has an impact on results of the test, potentially leading to wrong conclusions regarding the functionality of a network device. The tests results: TEST: Switch ports in VLAN-aware bridge with different PVIDs: Unicast non-IP untagged [ OK ] Multicast non-IP untagged [ OK ] Broadcast non-IP untagged [ OK ] Unicast IPv4 untagged [ OK ] Multicast IPv4 untagged [ OK ] Unicast IPv6 untagged [ OK ] Multicast IPv6 untagged [ OK ] Unicast non-IP VID 1 [ OK ] Multicast non-IP VID 1 [ OK ] Broadcast non-IP VID 1 [ OK ] Unicast IPv4 VID 1 [ OK ] Multicast IPv4 VID 1 [ OK ] Unicast IPv6 VID 1 [ OK ] Multicast IPv6 VID 1 [ OK ] Unicast non-IP VID 4094 [ OK ] Multicast non-IP VID 4094 [ OK ] Broadcast non-IP VID 4094 [ OK ] Unicast IPv4 VID 4094 [ OK ] Multicast IPv4 VID 4094 [ OK ] Unicast IPv6 VID 4094 [ OK ] Multicast IPv6 VID 4094 [ OK ] Fixes: 476a4f05d9b8 ("selftests: forwarding: add a no_forwarding.sh test") Reviewed-by: Hangbin Liu Reviewed-by: Shuah Khan Signed-off-by: Kacper Ludwinski Link: https://patch.msgid.link/20241002051016.849-1-kac.ludwinski@icloud.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- tools/testing/selftests/net/forwarding/no_forwarding.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/net/forwarding/no_forwarding.sh b/tools/testing/selftests/net/forwarding/no_forwarding.sh index 9e677aa64a06..694ece9ba3a7 100755 --- a/tools/testing/selftests/net/forwarding/no_forwarding.sh +++ b/tools/testing/selftests/net/forwarding/no_forwarding.sh @@ -202,7 +202,7 @@ one_bridge_two_pvids() ip link set $swp2 master br0 bridge vlan add dev $swp1 vid 1 pvid untagged - bridge vlan add dev $swp1 vid 2 pvid untagged + bridge vlan add dev $swp2 vid 2 pvid untagged run_test "Switch ports in VLAN-aware bridge with different PVIDs" From ced98072d3511b232ae1d3347945f35f30c0e303 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Mon, 30 Sep 2024 13:26:21 -0400 Subject: [PATCH 10/29] Bluetooth: RFCOMM: FIX possible deadlock in rfcomm_sk_state_change [ Upstream commit 08d1914293dae38350b8088980e59fbc699a72fe ] rfcomm_sk_state_change attempts to use sock_lock so it must never be called with it locked but rfcomm_sock_ioctl always attempt to lock it causing the following trace: ====================================================== WARNING: possible circular locking dependency detected 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Not tainted ------------------------------------------------------ syz-executor386/5093 is trying to acquire lock: ffff88807c396258 (sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM){+.+.}-{0:0}, at: lock_sock include/net/sock.h:1671 [inline] ffff88807c396258 (sk_lock-AF_BLUETOOTH-BTPROTO_RFCOMM){+.+.}-{0:0}, at: rfcomm_sk_state_change+0x5b/0x310 net/bluetooth/rfcomm/sock.c:73 but task is already holding lock: ffff88807badfd28 (&d->lock){+.+.}-{3:3}, at: __rfcomm_dlc_close+0x226/0x6a0 net/bluetooth/rfcomm/core.c:491 Reported-by: syzbot+d7ce59b06b3eb14fd218@syzkaller.appspotmail.com Tested-by: syzbot+d7ce59b06b3eb14fd218@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=d7ce59b06b3eb14fd218 Fixes: 3241ad820dbb ("[Bluetooth] Add timestamp support to L2CAP, RFCOMM and SCO") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/rfcomm/sock.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 29aa07e9db9d..cbff37b32734 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -865,9 +865,7 @@ static int rfcomm_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned lon if (err == -ENOIOCTLCMD) { #ifdef CONFIG_BT_RFCOMM_TTY - lock_sock(sk); err = rfcomm_dev_ioctl(sk, cmd, (void __user *) arg); - release_sock(sk); #else err = -EOPNOTSUPP; #endif From 30c3d3d9b7ae03b023809428397ab5dcca401991 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 3 Oct 2024 21:03:21 +0200 Subject: [PATCH 11/29] net: phy: bcm84881: Fix some error handling paths [ Upstream commit 9234a2549cb6ac038bec36cc7c084218e9575513 ] If phy_read_mmd() fails, the error code stored in 'bmsr' should be returned instead of 'val' which is likely to be 0. Fixes: 75f4d8d10e01 ("net: phy: add Broadcom BCM84881 PHY driver") Signed-off-by: Christophe JAILLET Link: https://patch.msgid.link/3e1755b0c40340d00e089d6adae5bca2f8c79e53.1727982168.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/phy/bcm84881.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/phy/bcm84881.c b/drivers/net/phy/bcm84881.c index 9717a1626f3f..37a64a37b2ae 100644 --- a/drivers/net/phy/bcm84881.c +++ b/drivers/net/phy/bcm84881.c @@ -120,7 +120,7 @@ static int bcm84881_aneg_done(struct phy_device *phydev) bmsr = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_C22 + MII_BMSR); if (bmsr < 0) - return val; + return bmsr; return !!(val & MDIO_AN_STAT1_COMPLETE) && !!(bmsr & BMSR_ANEGCOMPLETE); @@ -146,7 +146,7 @@ static int bcm84881_read_status(struct phy_device *phydev) bmsr = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_AN_C22 + MII_BMSR); if (bmsr < 0) - return val; + return bmsr; phydev->autoneg_complete = !!(val & MDIO_AN_STAT1_COMPLETE) && !!(bmsr & BMSR_ANEGCOMPLETE); From 49c0fbd5e7d5730123aefe0f85a39e6f0834d385 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Mon, 9 Oct 2023 12:05:34 -0700 Subject: [PATCH 12/29] thermal: int340x: processor_thermal: Set feature mask before proc_thermal_add [ Upstream commit 6ebc25d8b053a208786295bab58abbb66b39c318 ] The function proc_thermal_add() adds sysfs entries for power limits. The feature mask of available features is not present at that time, so it cannot be used by proc_thermal_add() to selectively create sysfs attributes. The feature mask is set by proc_thermal_mmio_add(), so modify the code to call it before proc_thermal_add() so as to allow the latter to use the feature mask. There is no functional impact with this change. Signed-off-by: Srinivas Pandruvada [ rjw: Changelog edits ] Signed-off-by: Rafael J. Wysocki Stable-dep-of: 99ca0b57e49f ("thermal: intel: int340x: processor: Fix warning during module unload") Signed-off-by: Sasha Levin --- .../processor_thermal_device_pci.c | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index bf1b1cdfade4..a04e565fdcf1 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -237,26 +237,26 @@ static int proc_thermal_pci_probe(struct pci_dev *pdev, const struct pci_device_ INIT_DELAYED_WORK(&pci_info->work, proc_thermal_threshold_work_fn); - ret = proc_thermal_add(&pdev->dev, proc_priv); - if (ret) { - dev_err(&pdev->dev, "error: proc_thermal_add, will continue\n"); - pci_info->no_legacy = 1; - } - proc_priv->priv_data = pci_info; pci_info->proc_priv = proc_priv; pci_set_drvdata(pdev, proc_priv); ret = proc_thermal_mmio_add(pdev, proc_priv, id->driver_data); if (ret) - goto err_ret_thermal; + return ret; + + ret = proc_thermal_add(&pdev->dev, proc_priv); + if (ret) { + dev_err(&pdev->dev, "error: proc_thermal_add, will continue\n"); + pci_info->no_legacy = 1; + } pci_info->tzone = thermal_zone_device_register("TCPU_PCI", 1, 1, pci_info, &tzone_ops, &tzone_params, 0, 0); if (IS_ERR(pci_info->tzone)) { ret = PTR_ERR(pci_info->tzone); - goto err_ret_mmio; + goto err_del_legacy; } /* request and enable interrupt */ @@ -283,11 +283,10 @@ err_free_vectors: pci_free_irq_vectors(pdev); err_ret_tzone: thermal_zone_device_unregister(pci_info->tzone); -err_ret_mmio: - proc_thermal_mmio_remove(pdev, proc_priv); -err_ret_thermal: +err_del_legacy: if (!pci_info->no_legacy) proc_thermal_remove(proc_priv); + proc_thermal_mmio_remove(pdev, proc_priv); pci_disable_device(pdev); return ret; From b4ab78f4adeaf6c98be5d375518dd4fb666eac5e Mon Sep 17 00:00:00 2001 From: Zhang Rui Date: Mon, 30 Sep 2024 16:17:57 +0800 Subject: [PATCH 13/29] thermal: intel: int340x: processor: Fix warning during module unload [ Upstream commit 99ca0b57e49fb73624eede1c4396d9e3d10ccf14 ] The processor_thermal driver uses pcim_device_enable() to enable a PCI device, which means the device will be automatically disabled on driver detach. Thus there is no need to call pci_disable_device() again on it. With recent PCI device resource management improvements, e.g. commit f748a07a0b64 ("PCI: Remove legacy pcim_release()"), this problem is exposed and triggers the warining below. [ 224.010735] proc_thermal_pci 0000:00:04.0: disabling already-disabled device [ 224.010747] WARNING: CPU: 8 PID: 4442 at drivers/pci/pci.c:2250 pci_disable_device+0xe5/0x100 ... [ 224.010844] Call Trace: [ 224.010845] [ 224.010847] ? show_regs+0x6d/0x80 [ 224.010851] ? __warn+0x8c/0x140 [ 224.010854] ? pci_disable_device+0xe5/0x100 [ 224.010856] ? report_bug+0x1c9/0x1e0 [ 224.010859] ? handle_bug+0x46/0x80 [ 224.010862] ? exc_invalid_op+0x1d/0x80 [ 224.010863] ? asm_exc_invalid_op+0x1f/0x30 [ 224.010867] ? pci_disable_device+0xe5/0x100 [ 224.010869] ? pci_disable_device+0xe5/0x100 [ 224.010871] ? kfree+0x21a/0x2b0 [ 224.010873] pcim_disable_device+0x20/0x30 [ 224.010875] devm_action_release+0x16/0x20 [ 224.010878] release_nodes+0x47/0xc0 [ 224.010880] devres_release_all+0x9f/0xe0 [ 224.010883] device_unbind_cleanup+0x12/0x80 [ 224.010885] device_release_driver_internal+0x1ca/0x210 [ 224.010887] driver_detach+0x4e/0xa0 [ 224.010889] bus_remove_driver+0x6f/0xf0 [ 224.010890] driver_unregister+0x35/0x60 [ 224.010892] pci_unregister_driver+0x44/0x90 [ 224.010894] proc_thermal_pci_driver_exit+0x14/0x5f0 [processor_thermal_device_pci] ... [ 224.010921] ---[ end trace 0000000000000000 ]--- Remove the excess pci_disable_device() calls. Fixes: acd65d5d1cf4 ("thermal/drivers/int340x/processor_thermal: Add PCI MMIO based thermal driver") Signed-off-by: Zhang Rui Reviewed-by: Srinivas Pandruvada Link: https://patch.msgid.link/20240930081801.28502-3-rui.zhang@intel.com [ rjw: Subject and changelog edits ] Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- .../intel/int340x_thermal/processor_thermal_device_pci.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c index a04e565fdcf1..6b76e267cb9f 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device_pci.c @@ -287,7 +287,6 @@ err_del_legacy: if (!pci_info->no_legacy) proc_thermal_remove(proc_priv); proc_thermal_mmio_remove(pdev, proc_priv); - pci_disable_device(pdev); return ret; } @@ -309,7 +308,6 @@ static void proc_thermal_pci_remove(struct pci_dev *pdev) proc_thermal_mmio_remove(pdev, pci_info->proc_priv); if (!pci_info->no_legacy) proc_thermal_remove(proc_priv); - pci_disable_device(pdev); } #ifdef CONFIG_PM_SLEEP From 2a2d6b24f01c322d0e01239974c2901b7badce21 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Fri, 4 Oct 2024 07:21:15 -0700 Subject: [PATCH 14/29] Revert "net: stmmac: set PP_FLAG_DMA_SYNC_DEV only if XDP is enabled" [ Upstream commit 5546da79e6cc5bb3324bf25688ed05498fd3f86d ] This reverts commit b514c47ebf41a6536551ed28a05758036e6eca7c. The commit describes that we don't have to sync the page when recycling, and it tries to optimize that case. But we do need to sync after allocation. Recycling side should be changed to pass the right sync size instead. Fixes: b514c47ebf41 ("net: stmmac: set PP_FLAG_DMA_SYNC_DEV only if XDP is enabled") Reported-by: Jon Hunter Link: https://lore.kernel.org/20241004070846.2502e9ea@kernel.org Reviewed-by: Jacob Keller Reviewed-by: Furong Xu <0x1207@gmail.com> Link: https://patch.msgid.link/20241004142115.910876-1-kuba@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index b5b7ff5b3261..93630840309e 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -2015,7 +2015,7 @@ static int __alloc_dma_rx_desc_resources(struct stmmac_priv *priv, rx_q->queue_index = queue; rx_q->priv_data = priv; - pp_params.flags = PP_FLAG_DMA_MAP | (xdp_prog ? PP_FLAG_DMA_SYNC_DEV : 0); + pp_params.flags = PP_FLAG_DMA_MAP | PP_FLAG_DMA_SYNC_DEV; pp_params.pool_size = dma_conf->dma_rx_size; num_pages = DIV_ROUND_UP(dma_conf->dma_buf_sz, PAGE_SIZE); pp_params.order = ilog2(num_pages); From e4340fef88482acee78c5819787effc77234b2bb Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Thu, 3 Oct 2024 20:53:15 +0200 Subject: [PATCH 15/29] net: ethernet: adi: adin1110: Fix some error handling path in adin1110_read_fifo() [ Upstream commit 83211ae1640516accae645de82f5a0a142676897 ] If 'frame_size' is too small or if 'round_len' is an error code, it is likely that an error code should be returned to the caller. Actually, 'ret' is likely to be 0, so if one of these sanity checks fails, 'success' is returned. Return -EINVAL instead. Fixes: bc93e19d088b ("net: ethernet: adi: Add ADIN1110 support") Signed-off-by: Christophe JAILLET Link: https://patch.msgid.link/8ff73b40f50d8fa994a454911b66adebce8da266.1727981562.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/adi/adin1110.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/adi/adin1110.c b/drivers/net/ethernet/adi/adin1110.c index 7474afc0e8e7..5e17d107f362 100644 --- a/drivers/net/ethernet/adi/adin1110.c +++ b/drivers/net/ethernet/adi/adin1110.c @@ -318,11 +318,11 @@ static int adin1110_read_fifo(struct adin1110_port_priv *port_priv) * from the ADIN1110 frame header. */ if (frame_size < ADIN1110_FRAME_HEADER_LEN + ADIN1110_FEC_LEN) - return ret; + return -EINVAL; round_len = adin1110_round_len(frame_size); if (round_len < 0) - return ret; + return -EINVAL; frame_size_no_fcs = frame_size - ADIN1110_FRAME_HEADER_LEN - ADIN1110_FEC_LEN; memset(priv->data, 0, ADIN1110_RD_HEADER_LEN); From 22deeb6a70b27d37e5587067ef0dce22948e4c22 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 4 Oct 2024 10:47:17 +0200 Subject: [PATCH 16/29] net: dsa: b53: fix jumbo frame mtu check [ Upstream commit 42fb3acf6826c6764ba79feb6e15229b43fd2f9f ] JMS_MIN_SIZE is the full ethernet frame length, while mtu is just the data payload size. Comparing these two meant that mtus between 1500 and 1518 did not trigger enabling jumbo frames. So instead compare the set mtu ETH_DATA_LEN, which is equal to JMS_MIN_SIZE - ETH_HLEN - ETH_FCS_LEN; Also do a check that the requested mtu is actually greater than the minimum length, else we do not need to enable jumbo frames. In practice this only introduced a very small range of mtus that did not work properly. Newer chips allow 2000 byte large frames by default, and older chips allow 1536 bytes long, which is equivalent to an mtu of 1514. So effectivly only mtus of 1515~1517 were broken. Fixes: 6ae5834b983a ("net: dsa: b53: add MTU configuration support") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 922e5934de73..862150c54e88 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2224,7 +2224,7 @@ static int b53_change_mtu(struct dsa_switch *ds, int port, int mtu) if (!dsa_is_cpu_port(ds, port)) return 0; - enable_jumbo = (mtu >= JMS_MIN_SIZE); + enable_jumbo = (mtu > ETH_DATA_LEN); allow_10_100 = (dev->chip_id == BCM583XX_DEVICE_ID); return b53_set_jumbo(dev, enable_jumbo, allow_10_100); From c43068b2555d80290c5c68f1374575a8f0e33d23 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 4 Oct 2024 10:47:18 +0200 Subject: [PATCH 17/29] net: dsa: b53: fix max MTU for 1g switches [ Upstream commit 680a8217dc00dc7e7da57888b3c053289b60eb2b ] JMS_MAX_SIZE is the ethernet frame length, not the MTU, which is payload without ethernet headers. According to the datasheets maximum supported frame length for most gigabyte swithes is 9720 bytes, so convert that to the expected MTU when using VLAN tagged frames. Fixes: 6ae5834b983a ("net: dsa: b53: add MTU configuration support") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 862150c54e88..82583edbb3f9 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "b53_regs.h" @@ -224,6 +225,8 @@ static const struct b53_mib_desc b53_mibs_58xx[] = { #define B53_MIBS_58XX_SIZE ARRAY_SIZE(b53_mibs_58xx) +#define B53_MAX_MTU (9720 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) + static int b53_do_vlan_op(struct b53_device *dev, u8 op) { unsigned int i; @@ -2232,7 +2235,7 @@ static int b53_change_mtu(struct dsa_switch *ds, int port, int mtu) static int b53_get_max_mtu(struct dsa_switch *ds, int port) { - return JMS_MAX_SIZE; + return B53_MAX_MTU; } static const struct dsa_switch_ops b53_switch_ops = { From deec6368c05298094add675e03fd8ed58ed73bb8 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 4 Oct 2024 10:47:19 +0200 Subject: [PATCH 18/29] net: dsa: b53: fix max MTU for BCM5325/BCM5365 [ Upstream commit ca8c1f71c10193c270f772d70d34b15ad765d6a8 ] BCM5325/BCM5365 do not support jumbo frames, so we should not report a jumbo frame mtu for them. But they do support so called "oversized" frames up to 1536 bytes long by default, so report an appropriate MTU. Fixes: 6ae5834b983a ("net: dsa: b53: add MTU configuration support") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 82583edbb3f9..28b58dee6b8c 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -225,6 +225,7 @@ static const struct b53_mib_desc b53_mibs_58xx[] = { #define B53_MIBS_58XX_SIZE ARRAY_SIZE(b53_mibs_58xx) +#define B53_MAX_MTU_25 (1536 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) #define B53_MAX_MTU (9720 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) static int b53_do_vlan_op(struct b53_device *dev, u8 op) @@ -2235,6 +2236,11 @@ static int b53_change_mtu(struct dsa_switch *ds, int port, int mtu) static int b53_get_max_mtu(struct dsa_switch *ds, int port) { + struct b53_device *dev = ds->priv; + + if (is5325(dev) || is5365(dev)) + return B53_MAX_MTU_25; + return B53_MAX_MTU; } From 40081f2107666899c45c5ce8aad5a3ce4c9417c4 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 4 Oct 2024 10:47:20 +0200 Subject: [PATCH 19/29] net: dsa: b53: allow lower MTUs on BCM5325/5365 [ Upstream commit e4b294f88a32438baf31762441f3dd1c996778be ] While BCM5325/5365 do not support jumbo frames, they do support slightly oversized frames, so do not error out if requesting a supported MTU for them. Fixes: 6ae5834b983a ("net: dsa: b53: add MTU configuration support") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 28b58dee6b8c..83f71c181a15 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2223,7 +2223,7 @@ static int b53_change_mtu(struct dsa_switch *ds, int port, int mtu) bool allow_10_100; if (is5325(dev) || is5365(dev)) - return -EOPNOTSUPP; + return 0; if (!dsa_is_cpu_port(ds, port)) return 0; From bc16154fde8c3993c0f80d1c32cc3a3b398fd295 Mon Sep 17 00:00:00 2001 From: Jonas Gorski Date: Fri, 4 Oct 2024 10:47:21 +0200 Subject: [PATCH 20/29] net: dsa: b53: fix jumbo frames on 10/100 ports [ Upstream commit 2f3dcd0d39affe5b9ba1c351ce0e270c8bdd5109 ] All modern chips support and need the 10_100 bit set for supporting jumbo frames on 10/100 ports, so instead of enabling it only for 583XX enable it for everything except bcm63xx, where the bit is writeable, but does nothing. Tested on BCM53115, where jumbo frames were dropped at 10/100 speeds without the bit set. Fixes: 6ae5834b983a ("net: dsa: b53: add MTU configuration support") Signed-off-by: Jonas Gorski Reviewed-by: Florian Fainelli Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/dsa/b53/b53_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index 83f71c181a15..463c6d84ae1b 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -2229,7 +2229,7 @@ static int b53_change_mtu(struct dsa_switch *ds, int port, int mtu) return 0; enable_jumbo = (mtu > ETH_DATA_LEN); - allow_10_100 = (dev->chip_id == BCM583XX_DEVICE_ID); + allow_10_100 = !is63xx(dev); return b53_set_jumbo(dev, enable_jumbo, allow_10_100); } From 563550b619e23c3734eba6da1c3ad40e4f537c94 Mon Sep 17 00:00:00 2001 From: Billy Tsai Date: Tue, 8 Oct 2024 16:14:44 +0800 Subject: [PATCH 21/29] gpio: aspeed: Add the flush write to ensure the write complete. [ Upstream commit 1bb5a99e1f3fd27accb804aa0443a789161f843c ] Performing a dummy read ensures that the register write operation is fully completed, mitigating any potential bus delays that could otherwise impact the frequency of bitbang usage. E.g., if the JTAG application uses GPIO to control the JTAG pins (TCK, TMS, TDI, TDO, and TRST), and the application sets the TCK clock to 1 MHz, the GPIO's high/low transitions will rely on a delay function to ensure the clock frequency does not exceed 1 MHz. However, this can lead to rapid toggling of the GPIO because the write operation is POSTed and does not wait for a bus acknowledgment. Fixes: 361b79119a4b ("gpio: Add Aspeed driver") Reviewed-by: Andrew Jeffery Signed-off-by: Billy Tsai Link: https://lore.kernel.org/r/20241008081450.1490955-2-billy_tsai@aspeedtech.com Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpio-aspeed.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c index 42d3e1cf7352..3cfb2c6103c6 100644 --- a/drivers/gpio/gpio-aspeed.c +++ b/drivers/gpio/gpio-aspeed.c @@ -404,6 +404,8 @@ static void __aspeed_gpio_set(struct gpio_chip *gc, unsigned int offset, gpio->dcache[GPIO_BANK(offset)] = reg; iowrite32(reg, addr); + /* Flush write */ + ioread32(addr); } static void aspeed_gpio_set(struct gpio_chip *gc, unsigned int offset, From 75aa45ece3bd591eb4e7f666f4b288b39beeecf3 Mon Sep 17 00:00:00 2001 From: Billy Tsai Date: Tue, 8 Oct 2024 16:14:45 +0800 Subject: [PATCH 22/29] gpio: aspeed: Use devm_clk api to manage clock source [ Upstream commit a6191a3d18119184237f4ee600039081ad992320 ] Replace of_clk_get with devm_clk_get_enabled to manage the clock source. Fixes: 5ae4cb94b313 ("gpio: aspeed: Add debounce support") Reviewed-by: Andrew Jeffery Signed-off-by: Billy Tsai Link: https://lore.kernel.org/r/20241008081450.1490955-3-billy_tsai@aspeedtech.com Signed-off-by: Bartosz Golaszewski Signed-off-by: Sasha Levin --- drivers/gpio/gpio-aspeed.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpio/gpio-aspeed.c b/drivers/gpio/gpio-aspeed.c index 3cfb2c6103c6..21deb228c7d7 100644 --- a/drivers/gpio/gpio-aspeed.c +++ b/drivers/gpio/gpio-aspeed.c @@ -1156,7 +1156,7 @@ static int __init aspeed_gpio_probe(struct platform_device *pdev) if (!gpio_id) return -EINVAL; - gpio->clk = of_clk_get(pdev->dev.of_node, 0); + gpio->clk = devm_clk_get_enabled(&pdev->dev, NULL); if (IS_ERR(gpio->clk)) { dev_warn(&pdev->dev, "Failed to get clock from devicetree, debouncing disabled\n"); From 94438143fed51ad38fe7e1aa5679de76f5559ff5 Mon Sep 17 00:00:00 2001 From: Marcin Szycik Date: Tue, 24 Sep 2024 12:04:24 +0200 Subject: [PATCH 23/29] ice: Fix netif_is_ice() in Safe Mode [ Upstream commit 8e60dbcbaaa177dacef55a61501790e201bf8c88 ] netif_is_ice() works by checking the pointer to netdev ops. However, it only checks for the default ice_netdev_ops, not ice_netdev_safe_mode_ops, so in Safe Mode it always returns false, which is unintuitive. While it doesn't look like netif_is_ice() is currently being called anywhere in Safe Mode, this could change and potentially lead to unexpected behaviour. Fixes: df006dd4b1dc ("ice: Add initial support framework for LAG") Reviewed-by: Przemek Kitszel Signed-off-by: Marcin Szycik Reviewed-by: Brett Creeley Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ice/ice_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c index 3f01942e4982..9be88820b77c 100644 --- a/drivers/net/ethernet/intel/ice/ice_main.c +++ b/drivers/net/ethernet/intel/ice/ice_main.c @@ -82,7 +82,8 @@ ice_indr_setup_tc_cb(struct net_device *netdev, struct Qdisc *sch, bool netif_is_ice(struct net_device *dev) { - return dev && (dev->netdev_ops == &ice_netdev_ops); + return dev && (dev->netdev_ops == &ice_netdev_ops || + dev->netdev_ops == &ice_netdev_safe_mode_ops); } /** From 9a9747288ba0a9ad4f5c9877f18dd245770ad64e Mon Sep 17 00:00:00 2001 From: Aleksandr Loktionov Date: Mon, 23 Sep 2024 11:12:19 +0200 Subject: [PATCH 24/29] i40e: Fix macvlan leak by synchronizing access to mac_filter_hash [ Upstream commit dac6c7b3d33756d6ce09f00a96ea2ecd79fae9fb ] This patch addresses a macvlan leak issue in the i40e driver caused by concurrent access to vsi->mac_filter_hash. The leak occurs when multiple threads attempt to modify the mac_filter_hash simultaneously, leading to inconsistent state and potential memory leaks. To fix this, we now wrap the calls to i40e_del_mac_filter() and zeroing vf->default_lan_addr.addr with spin_lock/unlock_bh(&vsi->mac_filter_hash_lock), ensuring atomic operations and preventing concurrent access. Additionally, we add lockdep_assert_held(&vsi->mac_filter_hash_lock) in i40e_add_mac_filter() to help catch similar issues in the future. Reproduction steps: 1. Spawn VFs and configure port vlan on them. 2. Trigger concurrent macvlan operations (e.g., adding and deleting portvlan and/or mac filters). 3. Observe the potential memory leak and inconsistent state in the mac_filter_hash. This synchronization ensures the integrity of the mac_filter_hash and prevents the described leak. Fixes: fed0d9f13266 ("i40e: Fix VF's MAC Address change on VM") Reviewed-by: Arkadiusz Kubalewski Signed-off-by: Aleksandr Loktionov Reviewed-by: Simon Horman Tested-by: Rafal Romanowski Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/i40e/i40e_main.c | 1 + drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c index 1194dcacbd29..5f46a0677d19 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_main.c +++ b/drivers/net/ethernet/intel/i40e/i40e_main.c @@ -1730,6 +1730,7 @@ struct i40e_mac_filter *i40e_add_mac_filter(struct i40e_vsi *vsi, struct hlist_node *h; int bkt; + lockdep_assert_held(&vsi->mac_filter_hash_lock); if (vsi->info.pvid) return i40e_add_filter(vsi, macaddr, le16_to_cpu(vsi->info.pvid)); diff --git a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c index a5f0c95cba8b..11e9858cc86a 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c +++ b/drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c @@ -2215,8 +2215,10 @@ static int i40e_vc_get_vf_resources_msg(struct i40e_vf *vf, u8 *msg) vfres->vsi_res[0].qset_handle = le16_to_cpu(vsi->info.qs_handle[0]); if (!(vf->driver_caps & VIRTCHNL_VF_OFFLOAD_USO) && !vf->pf_set_mac) { + spin_lock_bh(&vsi->mac_filter_hash_lock); i40e_del_mac_filter(vsi, vf->default_lan_addr.addr); eth_zero_addr(vf->default_lan_addr.addr); + spin_unlock_bh(&vsi->mac_filter_hash_lock); } ether_addr_copy(vfres->vsi_res[0].default_mac_addr, vf->default_lan_addr.addr); From 6a39c8f5c8aae74c5ab2ba466791f59ffaab0178 Mon Sep 17 00:00:00 2001 From: Mohamed Khalfella Date: Tue, 24 Sep 2024 15:06:01 -0600 Subject: [PATCH 25/29] igb: Do not bring the device up after non-fatal error [ Upstream commit 330a699ecbfc9c26ec92c6310686da1230b4e7eb ] Commit 004d25060c78 ("igb: Fix igb_down hung on surprise removal") changed igb_io_error_detected() to ignore non-fatal pcie errors in order to avoid hung task that can happen when igb_down() is called multiple times. This caused an issue when processing transient non-fatal errors. igb_io_resume(), which is called after igb_io_error_detected(), assumes that device is brought down by igb_io_error_detected() if the interface is up. This resulted in panic with stacktrace below. [ T3256] igb 0000:09:00.0 haeth0: igb: haeth0 NIC Link is Down [ T292] pcieport 0000:00:1c.5: AER: Uncorrected (Non-Fatal) error received: 0000:09:00.0 [ T292] igb 0000:09:00.0: PCIe Bus Error: severity=Uncorrected (Non-Fatal), type=Transaction Layer, (Requester ID) [ T292] igb 0000:09:00.0: device [8086:1537] error status/mask=00004000/00000000 [ T292] igb 0000:09:00.0: [14] CmpltTO [ 200.105524,009][ T292] igb 0000:09:00.0: AER: TLP Header: 00000000 00000000 00000000 00000000 [ T292] pcieport 0000:00:1c.5: AER: broadcast error_detected message [ T292] igb 0000:09:00.0: Non-correctable non-fatal error reported. [ T292] pcieport 0000:00:1c.5: AER: broadcast mmio_enabled message [ T292] pcieport 0000:00:1c.5: AER: broadcast resume message [ T292] ------------[ cut here ]------------ [ T292] kernel BUG at net/core/dev.c:6539! [ T292] invalid opcode: 0000 [#1] PREEMPT SMP [ T292] RIP: 0010:napi_enable+0x37/0x40 [ T292] Call Trace: [ T292] [ T292] ? die+0x33/0x90 [ T292] ? do_trap+0xdc/0x110 [ T292] ? napi_enable+0x37/0x40 [ T292] ? do_error_trap+0x70/0xb0 [ T292] ? napi_enable+0x37/0x40 [ T292] ? napi_enable+0x37/0x40 [ T292] ? exc_invalid_op+0x4e/0x70 [ T292] ? napi_enable+0x37/0x40 [ T292] ? asm_exc_invalid_op+0x16/0x20 [ T292] ? napi_enable+0x37/0x40 [ T292] igb_up+0x41/0x150 [ T292] igb_io_resume+0x25/0x70 [ T292] report_resume+0x54/0x70 [ T292] ? report_frozen_detected+0x20/0x20 [ T292] pci_walk_bus+0x6c/0x90 [ T292] ? aer_print_port_info+0xa0/0xa0 [ T292] pcie_do_recovery+0x22f/0x380 [ T292] aer_process_err_devices+0x110/0x160 [ T292] aer_isr+0x1c1/0x1e0 [ T292] ? disable_irq_nosync+0x10/0x10 [ T292] irq_thread_fn+0x1a/0x60 [ T292] irq_thread+0xe3/0x1a0 [ T292] ? irq_set_affinity_notifier+0x120/0x120 [ T292] ? irq_affinity_notify+0x100/0x100 [ T292] kthread+0xe2/0x110 [ T292] ? kthread_complete_and_exit+0x20/0x20 [ T292] ret_from_fork+0x2d/0x50 [ T292] ? kthread_complete_and_exit+0x20/0x20 [ T292] ret_from_fork_asm+0x11/0x20 [ T292] To fix this issue igb_io_resume() checks if the interface is running and the device is not down this means igb_io_error_detected() did not bring the device down and there is no need to bring it up. Signed-off-by: Mohamed Khalfella Reviewed-by: Yuanyuan Zhong Fixes: 004d25060c78 ("igb: Fix igb_down hung on surprise removal") Reviewed-by: Simon Horman Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igb/igb_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index f2f719a952f8..2e2caf559d00 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -9666,6 +9666,10 @@ static void igb_io_resume(struct pci_dev *pdev) struct igb_adapter *adapter = netdev_priv(netdev); if (netif_running(netdev)) { + if (!test_bit(__IGB_DOWN, &adapter->state)) { + dev_dbg(&pdev->dev, "Resuming from non-fatal error, do nothing.\n"); + return; + } if (igb_up(adapter)) { dev_err(&pdev->dev, "igb_up failed after reset\n"); return; From 76feedc74b90270390fbfdf74a2e944e96872363 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 7 Oct 2024 18:41:30 +0000 Subject: [PATCH 26/29] net/sched: accept TCA_STAB only for root qdisc [ Upstream commit 3cb7cf1540ddff5473d6baeb530228d19bc97b8a ] Most qdiscs maintain their backlog using qdisc_pkt_len(skb) on the assumption it is invariant between the enqueue() and dequeue() handlers. Unfortunately syzbot can crash a host rather easily using a TBF + SFQ combination, with an STAB on SFQ [1] We can't support TCA_STAB on arbitrary level, this would require to maintain per-qdisc storage. [1] [ 88.796496] BUG: kernel NULL pointer dereference, address: 0000000000000000 [ 88.798611] #PF: supervisor read access in kernel mode [ 88.799014] #PF: error_code(0x0000) - not-present page [ 88.799506] PGD 0 P4D 0 [ 88.799829] Oops: Oops: 0000 [#1] SMP NOPTI [ 88.800569] CPU: 14 UID: 0 PID: 2053 Comm: b371744477 Not tainted 6.12.0-rc1-virtme #1117 [ 88.801107] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-debian-1.16.3-2 04/01/2014 [ 88.801779] RIP: 0010:sfq_dequeue (net/sched/sch_sfq.c:272 net/sched/sch_sfq.c:499) sch_sfq [ 88.802544] Code: 0f b7 50 12 48 8d 04 d5 00 00 00 00 48 89 d6 48 29 d0 48 8b 91 c0 01 00 00 48 c1 e0 03 48 01 c2 66 83 7a 1a 00 7e c0 48 8b 3a <4c> 8b 07 4c 89 02 49 89 50 08 48 c7 47 08 00 00 00 00 48 c7 07 00 All code ======== 0: 0f b7 50 12 movzwl 0x12(%rax),%edx 4: 48 8d 04 d5 00 00 00 lea 0x0(,%rdx,8),%rax b: 00 c: 48 89 d6 mov %rdx,%rsi f: 48 29 d0 sub %rdx,%rax 12: 48 8b 91 c0 01 00 00 mov 0x1c0(%rcx),%rdx 19: 48 c1 e0 03 shl $0x3,%rax 1d: 48 01 c2 add %rax,%rdx 20: 66 83 7a 1a 00 cmpw $0x0,0x1a(%rdx) 25: 7e c0 jle 0xffffffffffffffe7 27: 48 8b 3a mov (%rdx),%rdi 2a:* 4c 8b 07 mov (%rdi),%r8 <-- trapping instruction 2d: 4c 89 02 mov %r8,(%rdx) 30: 49 89 50 08 mov %rdx,0x8(%r8) 34: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi) 3b: 00 3c: 48 rex.W 3d: c7 .byte 0xc7 3e: 07 (bad) ... Code starting with the faulting instruction =========================================== 0: 4c 8b 07 mov (%rdi),%r8 3: 4c 89 02 mov %r8,(%rdx) 6: 49 89 50 08 mov %rdx,0x8(%r8) a: 48 c7 47 08 00 00 00 movq $0x0,0x8(%rdi) 11: 00 12: 48 rex.W 13: c7 .byte 0xc7 14: 07 (bad) ... [ 88.803721] RSP: 0018:ffff9a1f892b7d58 EFLAGS: 00000206 [ 88.804032] RAX: 0000000000000000 RBX: ffff9a1f8420c800 RCX: ffff9a1f8420c800 [ 88.804560] RDX: ffff9a1f81bc1440 RSI: 0000000000000000 RDI: 0000000000000000 [ 88.805056] RBP: ffffffffc04bb0e0 R08: 0000000000000001 R09: 00000000ff7f9a1f [ 88.805473] R10: 000000000001001b R11: 0000000000009a1f R12: 0000000000000140 [ 88.806194] R13: 0000000000000001 R14: ffff9a1f886df400 R15: ffff9a1f886df4ac [ 88.806734] FS: 00007f445601a740(0000) GS:ffff9a2e7fd80000(0000) knlGS:0000000000000000 [ 88.807225] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 88.807672] CR2: 0000000000000000 CR3: 000000050cc46000 CR4: 00000000000006f0 [ 88.808165] Call Trace: [ 88.808459] [ 88.808710] ? __die (arch/x86/kernel/dumpstack.c:421 arch/x86/kernel/dumpstack.c:434) [ 88.809261] ? page_fault_oops (arch/x86/mm/fault.c:715) [ 88.809561] ? exc_page_fault (./arch/x86/include/asm/irqflags.h:26 ./arch/x86/include/asm/irqflags.h:87 ./arch/x86/include/asm/irqflags.h:147 arch/x86/mm/fault.c:1489 arch/x86/mm/fault.c:1539) [ 88.809806] ? asm_exc_page_fault (./arch/x86/include/asm/idtentry.h:623) [ 88.810074] ? sfq_dequeue (net/sched/sch_sfq.c:272 net/sched/sch_sfq.c:499) sch_sfq [ 88.810411] sfq_reset (net/sched/sch_sfq.c:525) sch_sfq [ 88.810671] qdisc_reset (./include/linux/skbuff.h:2135 ./include/linux/skbuff.h:2441 ./include/linux/skbuff.h:3304 ./include/linux/skbuff.h:3310 net/sched/sch_generic.c:1036) [ 88.810950] tbf_reset (./include/linux/timekeeping.h:169 net/sched/sch_tbf.c:334) sch_tbf [ 88.811208] qdisc_reset (./include/linux/skbuff.h:2135 ./include/linux/skbuff.h:2441 ./include/linux/skbuff.h:3304 ./include/linux/skbuff.h:3310 net/sched/sch_generic.c:1036) [ 88.811484] netif_set_real_num_tx_queues (./include/linux/spinlock.h:396 ./include/net/sch_generic.h:768 net/core/dev.c:2958) [ 88.811870] __tun_detach (drivers/net/tun.c:590 drivers/net/tun.c:673) [ 88.812271] tun_chr_close (drivers/net/tun.c:702 drivers/net/tun.c:3517) [ 88.812505] __fput (fs/file_table.c:432 (discriminator 1)) [ 88.812735] task_work_run (kernel/task_work.c:230) [ 88.813016] do_exit (kernel/exit.c:940) [ 88.813372] ? trace_hardirqs_on (kernel/trace/trace_preemptirq.c:58 (discriminator 4)) [ 88.813639] ? handle_mm_fault (./arch/x86/include/asm/irqflags.h:42 ./arch/x86/include/asm/irqflags.h:97 ./arch/x86/include/asm/irqflags.h:155 ./include/linux/memcontrol.h:1022 ./include/linux/memcontrol.h:1045 ./include/linux/memcontrol.h:1052 mm/memory.c:5928 mm/memory.c:6088) [ 88.813867] do_group_exit (kernel/exit.c:1070) [ 88.814138] __x64_sys_exit_group (kernel/exit.c:1099) [ 88.814490] x64_sys_call (??:?) [ 88.814791] do_syscall_64 (arch/x86/entry/common.c:52 (discriminator 1) arch/x86/entry/common.c:83 (discriminator 1)) [ 88.815012] entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) [ 88.815495] RIP: 0033:0x7f44560f1975 Fixes: 175f9c1bba9b ("net_sched: Add size table for qdiscs") Reported-by: syzbot Signed-off-by: Eric Dumazet Cc: Daniel Borkmann Link: https://patch.msgid.link/20241007184130.3960565-1-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/net/sch_generic.h | 1 - net/sched/sch_api.c | 7 ++++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index aefdb080ad3d..743acbc43c85 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -813,7 +813,6 @@ static inline void qdisc_calculate_pkt_len(struct sk_buff *skb, static inline int qdisc_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { - qdisc_calculate_pkt_len(skb, sch); return sch->enqueue(skb, sch, to_free); } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index bf8e45ffc298..87ba5aaef206 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -592,7 +592,6 @@ out: pkt_len = 1; qdisc_skb_cb(skb)->pkt_len = pkt_len; } -EXPORT_SYMBOL(__qdisc_calculate_pkt_len); void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc) { @@ -1169,6 +1168,12 @@ skip: return -EINVAL; } + if (new && + !(parent->flags & TCQ_F_MQROOT) && + rcu_access_pointer(new->stab)) { + NL_SET_ERR_MSG(extack, "STAB not supported on a non root"); + return -EINVAL; + } err = cops->graft(parent, cl, new, &old, extack); if (err) return err; From b541831a5d1cde6da778872cc7d2745f8619fc8f Mon Sep 17 00:00:00 2001 From: Rosen Penev Date: Mon, 7 Oct 2024 16:57:11 -0700 Subject: [PATCH 27/29] net: ibm: emac: mal: fix wrong goto [ Upstream commit 08c8acc9d8f3f70d62dd928571368d5018206490 ] dcr_map is called in the previous if and therefore needs to be unmapped. Fixes: 1ff0fcfcb1a6 ("ibm_newemac: Fix new MAL feature handling") Signed-off-by: Rosen Penev Link: https://patch.msgid.link/20241007235711.5714-1-rosenp@gmail.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/ibm/emac/mal.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/ibm/emac/mal.c b/drivers/net/ethernet/ibm/emac/mal.c index ff5487bbebe3..a93c9230e982 100644 --- a/drivers/net/ethernet/ibm/emac/mal.c +++ b/drivers/net/ethernet/ibm/emac/mal.c @@ -576,7 +576,7 @@ static int mal_probe(struct platform_device *ofdev) printk(KERN_ERR "%pOF: Support for 405EZ not enabled!\n", ofdev->dev.of_node); err = -ENODEV; - goto fail; + goto fail_unmap; #endif } From 8409539dbb93708bca21fbdeee9abfc057cd0883 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 2 Oct 2024 15:02:56 +0100 Subject: [PATCH 28/29] btrfs: zoned: fix missing RCU locking in error message when loading zone info [ Upstream commit fe4cd7ed128fe82ab9fe4f9fc8a73d4467699787 ] At btrfs_load_zone_info() we have an error path that is dereferencing the name of a device which is a RCU string but we are not holding a RCU read lock, which is incorrect. Fix this by using btrfs_err_in_rcu() instead of btrfs_err(). The problem is there since commit 08e11a3db098 ("btrfs: zoned: load zone's allocation offset"), back then at btrfs_load_block_group_zone_info() but then later on that code was factored out into the helper btrfs_load_zone_info() by commit 09a46725cc84 ("btrfs: zoned: factor out per-zone logic from btrfs_load_block_group_zone_info"). Fixes: 08e11a3db098 ("btrfs: zoned: load zone's allocation offset") Reviewed-by: Johannes Thumshirn Reviewed-by: Qu Wenruo Reviewed-by: Naohiro Aota Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/zoned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 2c42e85a3e26..794526ab90d2 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1345,7 +1345,7 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: - btrfs_err(fs_info, + btrfs_err_in_rcu(fs_info, "zoned: offline/readonly zone %llu on device %s (devid %llu)", (info->physical >> device->zone_info->zone_size_shift), rcu_str_deref(device->name), device->devid); From 5de0b8ca7c2b35e443fe79f2380c5ff9b6c339a3 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Mon, 7 Oct 2024 12:25:11 -0400 Subject: [PATCH 29/29] sctp: ensure sk_state is set to CLOSED if hashing fails in sctp_listen_start [ Upstream commit 4d5c70e6155d5eae198bade4afeab3c1b15073b6 ] If hashing fails in sctp_listen_start(), the socket remains in the LISTENING state, even though it was not added to the hash table. This can lead to a scenario where a socket appears to be listening without actually being accessible. This patch ensures that if the hashing operation fails, the sk_state is set back to CLOSED before returning an error. Note that there is no need to undo the autobind operation if hashing fails, as the bind port can still be used for next listen() call on the same socket. Fixes: 76c6d988aeb3 ("sctp: add sock_reuseport for the sock in __sctp_hash_endpoint") Reported-by: Marcelo Ricardo Leitner Signed-off-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/sctp/socket.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 98b8eb9a21bd..0b201cd811a1 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -8520,6 +8520,7 @@ static int sctp_listen_start(struct sock *sk, int backlog) struct sctp_endpoint *ep = sp->ep; struct crypto_shash *tfm = NULL; char alg[32]; + int err; /* Allocate HMAC for generating cookie. */ if (!sp->hmac && sp->sctp_hmac_alg) { @@ -8547,18 +8548,25 @@ static int sctp_listen_start(struct sock *sk, int backlog) inet_sk_set_state(sk, SCTP_SS_LISTENING); if (!ep->base.bind_addr.port) { if (sctp_autobind(sk)) { - inet_sk_set_state(sk, SCTP_SS_CLOSED); - return -EAGAIN; + err = -EAGAIN; + goto err; } } else { if (sctp_get_port(sk, inet_sk(sk)->inet_num)) { - inet_sk_set_state(sk, SCTP_SS_CLOSED); - return -EADDRINUSE; + err = -EADDRINUSE; + goto err; } } WRITE_ONCE(sk->sk_max_ack_backlog, backlog); - return sctp_hash_endpoint(ep); + err = sctp_hash_endpoint(ep); + if (err) + goto err; + + return 0; +err: + inet_sk_set_state(sk, SCTP_SS_CLOSED); + return err; } /*