From 4e87a52133284afbd40fb522dbf96e258af52a98 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Tue, 5 Nov 2024 17:52:34 -0800 Subject: [PATCH 01/76] netlink: terminate outstanding dump on socket close [ Upstream commit 1904fb9ebf911441f90a68e96b22aa73e4410505 ] Netlink supports iterative dumping of data. It provides the families the following ops: - start - (optional) kicks off the dumping process - dump - actual dump helper, keeps getting called until it returns 0 - done - (optional) pairs with .start, can be used for cleanup The whole process is asynchronous and the repeated calls to .dump don't actually happen in a tight loop, but rather are triggered in response to recvmsg() on the socket. This gives the user full control over the dump, but also means that the user can close the socket without getting to the end of the dump. To make sure .start is always paired with .done we check if there is an ongoing dump before freeing the socket, and if so call .done. The complication is that sockets can get freed from BH and .done is allowed to sleep. So we use a workqueue to defer the call, when needed. Unfortunately this does not work correctly. What we defer is not the cleanup but rather releasing a reference on the socket. We have no guarantee that we own the last reference, if someone else holds the socket they may release it in BH and we're back to square one. The whole dance, however, appears to be unnecessary. Only the user can interact with dumps, so we can clean up when socket is closed. And close always happens in process context. Some async code may still access the socket after close, queue notification skbs to it etc. but no dumps can start, end or otherwise make progress. Delete the workqueue and flush the dump state directly from the release handler. Note that further cleanup is possible in -next, for instance we now always call .done before releasing the main module reference, so dump doesn't have to take a reference of its own. Reported-by: syzkaller Fixes: ed5d7788a934 ("netlink: Do not schedule work from sk_destruct") Reviewed-by: Kuniyuki Iwashima Reviewed-by: Eric Dumazet Link: https://patch.msgid.link/20241106015235.2458807-1-kuba@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/netlink/af_netlink.c | 31 ++++++++----------------------- net/netlink/af_netlink.h | 2 -- 2 files changed, 8 insertions(+), 25 deletions(-) diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 9eb87f35bc65..8a74847dacaf 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -393,15 +393,6 @@ static void netlink_skb_set_owner_r(struct sk_buff *skb, struct sock *sk) static void netlink_sock_destruct(struct sock *sk) { - struct netlink_sock *nlk = nlk_sk(sk); - - if (nlk->cb_running) { - if (nlk->cb.done) - nlk->cb.done(&nlk->cb); - module_put(nlk->cb.module); - kfree_skb(nlk->cb.skb); - } - skb_queue_purge(&sk->sk_receive_queue); if (!sock_flag(sk, SOCK_DEAD)) { @@ -414,14 +405,6 @@ static void netlink_sock_destruct(struct sock *sk) WARN_ON(nlk_sk(sk)->groups); } -static void netlink_sock_destruct_work(struct work_struct *work) -{ - struct netlink_sock *nlk = container_of(work, struct netlink_sock, - work); - - sk_free(&nlk->sk); -} - /* This lock without WQ_FLAG_EXCLUSIVE is good on UP and it is _very_ bad on * SMP. Look, when several writers sleep and reader wakes them up, all but one * immediately hit write lock and grab all the cpus. Exclusive sleep solves @@ -736,12 +719,6 @@ static void deferred_put_nlk_sk(struct rcu_head *head) if (!refcount_dec_and_test(&sk->sk_refcnt)) return; - if (nlk->cb_running && nlk->cb.done) { - INIT_WORK(&nlk->work, netlink_sock_destruct_work); - schedule_work(&nlk->work); - return; - } - sk_free(sk); } @@ -791,6 +768,14 @@ static int netlink_release(struct socket *sock) NETLINK_URELEASE, &n); } + /* Terminate any outstanding dump */ + if (nlk->cb_running) { + if (nlk->cb.done) + nlk->cb.done(&nlk->cb); + module_put(nlk->cb.module); + kfree_skb(nlk->cb.skb); + } + module_put(nlk->module); if (netlink_is_kernel(sk)) { diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h index b30b8fc760f7..aa430e4d58d8 100644 --- a/net/netlink/af_netlink.h +++ b/net/netlink/af_netlink.h @@ -4,7 +4,6 @@ #include #include -#include #include /* flags */ @@ -48,7 +47,6 @@ struct netlink_sock { struct rhash_head node; struct rcu_head rcu; - struct work_struct work; }; static inline struct netlink_sock *nlk_sk(struct sock *sk) From cc673c7151ac5879f489b41a2d299b0fa173f336 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 8 Nov 2024 12:43:43 +0100 Subject: [PATCH 02/76] net: vertexcom: mse102x: Fix tx_bytes calculation [ Upstream commit e68da664d379f352d41d7955712c44e0a738e4ab ] The tx_bytes should consider the actual size of the Ethernet frames without the SPI encapsulation. But we still need to take care of Ethernet padding. Fixes: 2f207cbf0dd4 ("net: vertexcom: Add MSE102x SPI support") Signed-off-by: Stefan Wahren Link: https://patch.msgid.link/20241108114343.6174-3-wahrenst@gmx.net Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/vertexcom/mse102x.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/vertexcom/mse102x.c b/drivers/net/ethernet/vertexcom/mse102x.c index dd766e175f7d..8f67c39f479e 100644 --- a/drivers/net/ethernet/vertexcom/mse102x.c +++ b/drivers/net/ethernet/vertexcom/mse102x.c @@ -437,13 +437,15 @@ static void mse102x_tx_work(struct work_struct *work) mse = &mses->mse102x; while ((txb = skb_dequeue(&mse->txq))) { + unsigned int len = max_t(unsigned int, txb->len, ETH_ZLEN); + mutex_lock(&mses->lock); ret = mse102x_tx_pkt_spi(mse, txb, work_timeout); mutex_unlock(&mses->lock); if (ret) { mse->ndev->stats.tx_dropped++; } else { - mse->ndev->stats.tx_bytes += txb->len; + mse->ndev->stats.tx_bytes += len; mse->ndev->stats.tx_packets++; } From 656dbd1c21c2c088c70059cdd43ec83e7d54ec4d Mon Sep 17 00:00:00 2001 From: Andy Yan Date: Mon, 21 Oct 2024 15:28:06 +0800 Subject: [PATCH 03/76] drm/rockchip: vop: Fix a dereferenced before check warning [ Upstream commit ab1c793f457f740ab7108cc0b1340a402dbf484d ] The 'state' can't be NULL, we should check crtc_state. Fix warning: drivers/gpu/drm/rockchip/rockchip_drm_vop.c:1096 vop_plane_atomic_async_check() warn: variable dereferenced before check 'state' (see line 1077) Fixes: 5ddb0bd4ddc3 ("drm/atomic: Pass the full state to planes async atomic check and update") Signed-off-by: Andy Yan Signed-off-by: Heiko Stuebner Link: https://patchwork.freedesktop.org/patch/msgid/20241021072818.61621-1-andyshrk@163.com Signed-off-by: Sasha Levin --- drivers/gpu/drm/rockchip/rockchip_drm_vop.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c index b2289a523c40..e5b2112af138 100644 --- a/drivers/gpu/drm/rockchip/rockchip_drm_vop.c +++ b/drivers/gpu/drm/rockchip/rockchip_drm_vop.c @@ -1080,10 +1080,10 @@ static int vop_plane_atomic_async_check(struct drm_plane *plane, if (!plane->state->fb) return -EINVAL; - if (state) - crtc_state = drm_atomic_get_existing_crtc_state(state, - new_plane_state->crtc); - else /* Special case for asynchronous cursor updates. */ + crtc_state = drm_atomic_get_existing_crtc_state(state, new_plane_state->crtc); + + /* Special case for asynchronous cursor updates. */ + if (!crtc_state) crtc_state = plane->crtc->state; return drm_atomic_helper_check_plane_state(plane->state, crtc_state, From a749b23059b43a9b1787eb36c5d9d44150a34238 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Fri, 8 Nov 2024 11:58:16 +0100 Subject: [PATCH 04/76] mptcp: error out earlier on disconnect [ Upstream commit 581302298524e9d77c4c44ff5156a6cd112227ae ] Eric reported a division by zero splat in the MPTCP protocol: Oops: divide error: 0000 [#1] PREEMPT SMP KASAN PTI CPU: 1 UID: 0 PID: 6094 Comm: syz-executor317 Not tainted 6.12.0-rc5-syzkaller-00291-g05b92660cdfe #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 09/13/2024 RIP: 0010:__tcp_select_window+0x5b4/0x1310 net/ipv4/tcp_output.c:3163 Code: f6 44 01 e3 89 df e8 9b 75 09 f8 44 39 f3 0f 8d 11 ff ff ff e8 0d 74 09 f8 45 89 f4 e9 04 ff ff ff e8 00 74 09 f8 44 89 f0 99 7c 24 14 41 29 d6 45 89 f4 e9 ec fe ff ff e8 e8 73 09 f8 48 89 RSP: 0018:ffffc900041f7930 EFLAGS: 00010293 RAX: 0000000000017e67 RBX: 0000000000017e67 RCX: ffffffff8983314b RDX: 0000000000000000 RSI: ffffffff898331b0 RDI: 0000000000000004 RBP: 00000000005d6000 R08: 0000000000000004 R09: 0000000000017e67 R10: 0000000000003e80 R11: 0000000000000000 R12: 0000000000003e80 R13: ffff888031d9b440 R14: 0000000000017e67 R15: 00000000002eb000 FS: 00007feb5d7f16c0(0000) GS:ffff8880b8700000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007feb5d8adbb8 CR3: 0000000074e4c000 CR4: 00000000003526f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: __tcp_cleanup_rbuf+0x3e7/0x4b0 net/ipv4/tcp.c:1493 mptcp_rcv_space_adjust net/mptcp/protocol.c:2085 [inline] mptcp_recvmsg+0x2156/0x2600 net/mptcp/protocol.c:2289 inet_recvmsg+0x469/0x6a0 net/ipv4/af_inet.c:885 sock_recvmsg_nosec net/socket.c:1051 [inline] sock_recvmsg+0x1b2/0x250 net/socket.c:1073 __sys_recvfrom+0x1a5/0x2e0 net/socket.c:2265 __do_sys_recvfrom net/socket.c:2283 [inline] __se_sys_recvfrom net/socket.c:2279 [inline] __x64_sys_recvfrom+0xe0/0x1c0 net/socket.c:2279 do_syscall_x64 arch/x86/entry/common.c:52 [inline] do_syscall_64+0xcd/0x250 arch/x86/entry/common.c:83 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7feb5d857559 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 51 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b0 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007feb5d7f1208 EFLAGS: 00000246 ORIG_RAX: 000000000000002d RAX: ffffffffffffffda RBX: 00007feb5d8e1318 RCX: 00007feb5d857559 RDX: 000000800000000e RSI: 0000000000000000 RDI: 0000000000000003 RBP: 00007feb5d8e1310 R08: 0000000000000000 R09: ffffffff81000000 R10: 0000000000000100 R11: 0000000000000246 R12: 00007feb5d8e131c R13: 00007feb5d8ae074 R14: 000000800000000e R15: 00000000fffffdef and provided a nice reproducer. The root cause is the current bad handling of racing disconnect. After the blamed commit below, sk_wait_data() can return (with error) with the underlying socket disconnected and a zero rcv_mss. Catch the error and return without performing any additional operations on the current socket. Reported-by: Eric Dumazet Fixes: 419ce133ab92 ("tcp: allow again tcp_disconnect() when threads are waiting") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/8c82ecf71662ecbc47bf390f9905de70884c9f2d.1731060874.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/mptcp/protocol.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d68e93dab88c..78ac5c538e13 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2180,7 +2180,7 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, cmsg_flags = MPTCP_CMSG_INQ; while (copied < len) { - int bytes_read; + int err, bytes_read; bytes_read = __mptcp_recvmsg_mskq(msk, msg, len - copied, flags, &tss, &cmsg_flags); if (unlikely(bytes_read < 0)) { @@ -2245,9 +2245,16 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, } pr_debug("block timeout %ld\n", timeo); - sk_wait_data(sk, &timeo, NULL); + mptcp_rcv_space_adjust(msk, copied); + err = sk_wait_data(sk, &timeo, NULL); + if (err < 0) { + err = copied ? : err; + goto out_err; + } } + mptcp_rcv_space_adjust(msk, copied); + out_err: if (cmsg_flags && copied >= 0) { if (cmsg_flags & MPTCP_CMSG_TS) @@ -2263,8 +2270,6 @@ out_err: pr_debug("msk=%p rx queue empty=%d:%d copied=%d\n", msk, skb_queue_empty_lockless(&sk->sk_receive_queue), skb_queue_empty(&msk->receive_queue), copied); - if (!(flags & MSG_PEEK)) - mptcp_rcv_space_adjust(msk, copied); release_sock(sk); return copied; From bfba288f53192db08c68d4c568db9783fb9cb838 Mon Sep 17 00:00:00 2001 From: Mark Bloch Date: Thu, 7 Nov 2024 20:35:23 +0200 Subject: [PATCH 05/76] net/mlx5: fs, lock FTE when checking if active [ Upstream commit 9ca314419930f9135727e39d77e66262d5f7bef6 ] The referenced commits introduced a two-step process for deleting FTEs: - Lock the FTE, delete it from hardware, set the hardware deletion function to NULL and unlock the FTE. - Lock the parent flow group, delete the software copy of the FTE, and remove it from the xarray. However, this approach encounters a race condition if a rule with the same match value is added simultaneously. In this scenario, fs_core may set the hardware deletion function to NULL prematurely, causing a panic during subsequent rule deletions. To prevent this, ensure the active flag of the FTE is checked under a lock, which will prevent the fs_core layer from attaching a new steering rule to an FTE that is in the process of deletion. [ 438.967589] MOSHE: 2496 mlx5_del_flow_rules del_hw_func [ 438.968205] ------------[ cut here ]------------ [ 438.968654] refcount_t: decrement hit 0; leaking memory. [ 438.969249] WARNING: CPU: 0 PID: 8957 at lib/refcount.c:31 refcount_warn_saturate+0xfb/0x110 [ 438.970054] Modules linked in: act_mirred cls_flower act_gact sch_ingress openvswitch nsh mlx5_vdpa vringh vhost_iotlb vdpa mlx5_ib mlx5_core xt_conntrack xt_MASQUERADE nf_conntrack_netlink nfnetlink xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm ib_uverbs ib_core zram zsmalloc fuse [last unloaded: cls_flower] [ 438.973288] CPU: 0 UID: 0 PID: 8957 Comm: tc Not tainted 6.12.0-rc1+ #8 [ 438.973888] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 438.974874] RIP: 0010:refcount_warn_saturate+0xfb/0x110 [ 438.975363] Code: 40 66 3b 82 c6 05 16 e9 4d 01 01 e8 1f 7c a0 ff 0f 0b c3 cc cc cc cc 48 c7 c7 10 66 3b 82 c6 05 fd e8 4d 01 01 e8 05 7c a0 ff <0f> 0b c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 00 90 [ 438.976947] RSP: 0018:ffff888124a53610 EFLAGS: 00010286 [ 438.977446] RAX: 0000000000000000 RBX: ffff888119d56de0 RCX: 0000000000000000 [ 438.978090] RDX: ffff88852c828700 RSI: ffff88852c81b3c0 RDI: ffff88852c81b3c0 [ 438.978721] RBP: ffff888120fa0e88 R08: 0000000000000000 R09: ffff888124a534b0 [ 438.979353] R10: 0000000000000001 R11: 0000000000000001 R12: ffff888119d56de0 [ 438.979979] R13: ffff888120fa0ec0 R14: ffff888120fa0ee8 R15: ffff888119d56de0 [ 438.980607] FS: 00007fe6dcc0f800(0000) GS:ffff88852c800000(0000) knlGS:0000000000000000 [ 438.983984] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 438.984544] CR2: 00000000004275e0 CR3: 0000000186982001 CR4: 0000000000372eb0 [ 438.985205] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 438.985842] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 438.986507] Call Trace: [ 438.986799] [ 438.987070] ? __warn+0x7d/0x110 [ 438.987426] ? refcount_warn_saturate+0xfb/0x110 [ 438.987877] ? report_bug+0x17d/0x190 [ 438.988261] ? prb_read_valid+0x17/0x20 [ 438.988659] ? handle_bug+0x53/0x90 [ 438.989054] ? exc_invalid_op+0x14/0x70 [ 438.989458] ? asm_exc_invalid_op+0x16/0x20 [ 438.989883] ? refcount_warn_saturate+0xfb/0x110 [ 438.990348] mlx5_del_flow_rules+0x2f7/0x340 [mlx5_core] [ 438.990932] __mlx5_eswitch_del_rule+0x49/0x170 [mlx5_core] [ 438.991519] ? mlx5_lag_is_sriov+0x3c/0x50 [mlx5_core] [ 438.992054] ? xas_load+0x9/0xb0 [ 438.992407] mlx5e_tc_rule_unoffload+0x45/0xe0 [mlx5_core] [ 438.993037] mlx5e_tc_del_fdb_flow+0x2a6/0x2e0 [mlx5_core] [ 438.993623] mlx5e_flow_put+0x29/0x60 [mlx5_core] [ 438.994161] mlx5e_delete_flower+0x261/0x390 [mlx5_core] [ 438.994728] tc_setup_cb_destroy+0xb9/0x190 [ 438.995150] fl_hw_destroy_filter+0x94/0xc0 [cls_flower] [ 438.995650] fl_change+0x11a4/0x13c0 [cls_flower] [ 438.996105] tc_new_tfilter+0x347/0xbc0 [ 438.996503] ? ___slab_alloc+0x70/0x8c0 [ 438.996929] rtnetlink_rcv_msg+0xf9/0x3e0 [ 438.997339] ? __netlink_sendskb+0x4c/0x70 [ 438.997751] ? netlink_unicast+0x286/0x2d0 [ 438.998171] ? __pfx_rtnetlink_rcv_msg+0x10/0x10 [ 438.998625] netlink_rcv_skb+0x54/0x100 [ 438.999020] netlink_unicast+0x203/0x2d0 [ 438.999421] netlink_sendmsg+0x1e4/0x420 [ 438.999820] __sock_sendmsg+0xa1/0xb0 [ 439.000203] ____sys_sendmsg+0x207/0x2a0 [ 439.000600] ? copy_msghdr_from_user+0x6d/0xa0 [ 439.001072] ___sys_sendmsg+0x80/0xc0 [ 439.001459] ? ___sys_recvmsg+0x8b/0xc0 [ 439.001848] ? generic_update_time+0x4d/0x60 [ 439.002282] __sys_sendmsg+0x51/0x90 [ 439.002658] do_syscall_64+0x50/0x110 [ 439.003040] entry_SYSCALL_64_after_hwframe+0x76/0x7e Fixes: 718ce4d601db ("net/mlx5: Consolidate update FTE for all removal changes") Fixes: cefc23554fc2 ("net/mlx5: Fix FTE cleanup") Signed-off-by: Mark Bloch Reviewed-by: Maor Gottlieb Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-4-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- .../net/ethernet/mellanox/mlx5/core/fs_core.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 164e10b5f9b7..50fdc3cbb778 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1880,13 +1880,22 @@ lookup_fte_locked(struct mlx5_flow_group *g, fte_tmp = NULL; goto out; } - if (!fte_tmp->node.active) { - tree_put_node(&fte_tmp->node, false); - fte_tmp = NULL; - goto out; - } nested_down_write_ref_node(&fte_tmp->node, FS_LOCK_CHILD); + + if (!fte_tmp->node.active) { + up_write_ref_node(&fte_tmp->node, false); + + if (take_write) + up_write_ref_node(&g->node, false); + else + up_read_ref_node(&g->node); + + tree_put_node(&fte_tmp->node, false); + + return NULL; + } + out: if (take_write) up_write_ref_node(&g->node, false); From 69fbd07f17b0fdaf8970bc705f5bf115c297839d Mon Sep 17 00:00:00 2001 From: Dragos Tatulea Date: Thu, 7 Nov 2024 20:35:24 +0200 Subject: [PATCH 06/76] net/mlx5e: kTLS, Fix incorrect page refcounting [ Upstream commit dd6e972cc5890d91d6749bb48e3912721c4e4b25 ] The kTLS tx handling code is using a mix of get_page() and page_ref_inc() APIs to increment the page reference. But on the release path (mlx5e_ktls_tx_handle_resync_dump_comp()), only put_page() is used. This is an issue when using pages from large folios: the get_page() references are stored on the folio page while the page_ref_inc() references are stored directly in the given page. On release the folio page will be dereferenced too many times. This was found while doing kTLS testing with sendfile() + ZC when the served file was read from NFS on a kernel with NFS large folios support (commit 49b29a573da8 ("nfs: add support for large folios")). Fixes: 84d1bb2b139e ("net/mlx5e: kTLS, Limit DUMP wqe size") Signed-off-by: Dragos Tatulea Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-5-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- .../net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c index 2e0335246967..6d56d4a9977b 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ktls_tx.c @@ -665,7 +665,7 @@ tx_sync_info_get(struct mlx5e_ktls_offload_context_tx *priv_tx, while (remaining > 0) { skb_frag_t *frag = &record->frags[i]; - get_page(skb_frag_page(frag)); + page_ref_inc(skb_frag_page(frag)); remaining -= skb_frag_size(frag); info->frags[i++] = *frag; } @@ -768,7 +768,7 @@ void mlx5e_ktls_tx_handle_resync_dump_comp(struct mlx5e_txqsq *sq, stats = sq->stats; mlx5e_tx_dma_unmap(sq->pdev, dma); - put_page(wi->resync_dump_frag_page); + page_ref_dec(wi->resync_dump_frag_page); stats->tls_dump_packets++; stats->tls_dump_bytes += wi->num_bytes; } @@ -821,12 +821,12 @@ mlx5e_ktls_tx_handle_ooo(struct mlx5e_ktls_offload_context_tx *priv_tx, err_out: for (; i < info.nr_frags; i++) - /* The put_page() here undoes the page ref obtained in tx_sync_info_get(). + /* The page_ref_dec() here undoes the page ref obtained in tx_sync_info_get(). * Page refs obtained for the DUMP WQEs above (by page_ref_add) will be * released only upon their completions (or in mlx5e_free_txqsq_descs, * if channel closes). */ - put_page(skb_frag_page(&info.frags[i])); + page_ref_dec(skb_frag_page(&info.frags[i])); return MLX5E_KTLS_SYNC_FAIL; } From 0c7c70ff8b696cfedba350411dca736361ef9a0f Mon Sep 17 00:00:00 2001 From: Moshe Shemesh Date: Thu, 7 Nov 2024 20:35:26 +0200 Subject: [PATCH 07/76] net/mlx5e: CT: Fix null-ptr-deref in add rule err flow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit e99c6873229fe0482e7ceb7d5600e32d623ed9d9 ] In error flow of mlx5_tc_ct_entry_add_rule(), in case ct_rule_add() callback returns error, zone_rule->attr is used uninitiated. Fix it to use attr which has the needed pointer value. Kernel log: BUG: kernel NULL pointer dereference, address: 0000000000000110 RIP: 0010:mlx5_tc_ct_entry_add_rule+0x2b1/0x2f0 [mlx5_core] … Call Trace: ? __die+0x20/0x70 ? page_fault_oops+0x150/0x3e0 ? exc_page_fault+0x74/0x140 ? asm_exc_page_fault+0x22/0x30 ? mlx5_tc_ct_entry_add_rule+0x2b1/0x2f0 [mlx5_core] ? mlx5_tc_ct_entry_add_rule+0x1d5/0x2f0 [mlx5_core] mlx5_tc_ct_block_flow_offload+0xc6a/0xf90 [mlx5_core] ? nf_flow_offload_tuple+0xd8/0x190 [nf_flow_table] nf_flow_offload_tuple+0xd8/0x190 [nf_flow_table] flow_offload_work_handler+0x142/0x320 [nf_flow_table] ? finish_task_switch.isra.0+0x15b/0x2b0 process_one_work+0x16c/0x320 worker_thread+0x28c/0x3a0 ? __pfx_worker_thread+0x10/0x10 kthread+0xb8/0xf0 ? __pfx_kthread+0x10/0x10 ret_from_fork+0x2d/0x50 ? __pfx_kthread+0x10/0x10 ret_from_fork_asm+0x1a/0x30 Fixes: 7fac5c2eced3 ("net/mlx5: CT: Avoid reusing modify header context for natted entries") Signed-off-by: Moshe Shemesh Reviewed-by: Cosmin Ratiu Reviewed-by: Yevgeny Kliteynik Signed-off-by: Tariq Toukan Link: https://patch.msgid.link/20241107183527.676877-7-tariqt@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c index f01f7dfdbcf8..b011e0d2b620 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en/tc_ct.c @@ -862,7 +862,7 @@ mlx5_tc_ct_entry_add_rule(struct mlx5_tc_ct_priv *ct_priv, return 0; err_rule: - mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, zone_rule->attr, zone_rule->mh); + mlx5_tc_ct_entry_destroy_mod_hdr(ct_priv, attr, zone_rule->mh); mlx5_put_label_mapping(ct_priv, attr->ct_attr.ct_labels_id); err_mod_hdr: kfree(attr); From 946c7600fa2207cc8d3fbc86a518ec56f98a5813 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Thu, 7 Nov 2024 21:46:12 +0100 Subject: [PATCH 08/76] virtio/vsock: Fix accept_queue memory leak [ Upstream commit d7b0ff5a866724c3ad21f2628c22a63336deec3f ] As the final stages of socket destruction may be delayed, it is possible that virtio_transport_recv_listen() will be called after the accept_queue has been flushed, but before the SOCK_DONE flag has been set. As a result, sockets enqueued after the flush would remain unremoved, leading to a memory leak. vsock_release __vsock_release lock virtio_transport_release virtio_transport_close schedule_delayed_work(close_work) sk_shutdown = SHUTDOWN_MASK (!) flush accept_queue release virtio_transport_recv_pkt vsock_find_bound_socket lock if flag(SOCK_DONE) return virtio_transport_recv_listen child = vsock_create_connected (!) vsock_enqueue_accept(child) release close_work lock virtio_transport_do_close set_flag(SOCK_DONE) virtio_transport_remove_sock vsock_remove_sock vsock_remove_bound release Introduce a sk_shutdown check to disallow vsock_enqueue_accept() during socket destruction. unreferenced object 0xffff888109e3f800 (size 2040): comm "kworker/5:2", pid 371, jiffies 4294940105 hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 28 00 0b 40 00 00 00 00 00 00 00 00 00 00 00 00 (..@............ backtrace (crc 9e5f4e84): [] kmem_cache_alloc_noprof+0x2c1/0x360 [] sk_prot_alloc+0x30/0x120 [] sk_alloc+0x2c/0x4b0 [] __vsock_create.constprop.0+0x2a/0x310 [] virtio_transport_recv_pkt+0x4dc/0x9a0 [] vsock_loopback_work+0xfd/0x140 [] process_one_work+0x20c/0x570 [] worker_thread+0x1bf/0x3a0 [] kthread+0xdd/0x110 [] ret_from_fork+0x2d/0x50 [] ret_from_fork_asm+0x1a/0x30 Fixes: 3fe356d58efa ("vsock/virtio: discard packets only when socket is really closed") Reviewed-by: Stefano Garzarella Signed-off-by: Michal Luczaj Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- net/vmw_vsock/virtio_transport_common.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index b22dc7bed218..3bc573cbf8a6 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1220,6 +1220,14 @@ virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, return -ENOMEM; } + /* __vsock_release() might have already flushed accept_queue. + * Subsequent enqueues would lead to a memory leak. + */ + if (sk->sk_shutdown == SHUTDOWN_MASK) { + virtio_transport_reset_no_sock(t, skb); + return -ESHUTDOWN; + } + child = vsock_create_connected(sk); if (!child) { virtio_transport_reset_no_sock(t, skb); From 334e297d0a90566c1229f89238849b7af33df147 Mon Sep 17 00:00:00 2001 From: Lukas Bulwahn Date: Wed, 7 Feb 2024 14:42:11 +0100 Subject: [PATCH 09/76] Bluetooth: hci_event: Remove code to removed CONFIG_BT_HS [ Upstream commit f4b0c2b4cd78b75acde56c2ee5aa732b6fb2a6a9 ] Commit cec9f3c5561d ("Bluetooth: Remove BT_HS") removes config BT_HS, but misses two "ifdef BT_HS" blocks in hci_event.c. Remove this dead code from this removed config option. Signed-off-by: Lukas Bulwahn Signed-off-by: Luiz Augusto von Dentz Stable-dep-of: 7967dc8f797f ("Bluetooth: hci_core: Fix calling mgmt_device_connected") Signed-off-by: Sasha Levin --- net/bluetooth/hci_event.c | 163 -------------------------------------- 1 file changed, 163 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 7c1df481ebe9..b6fe5e15981f 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -5648,150 +5648,6 @@ unlock: hci_dev_unlock(hdev); } -#if IS_ENABLED(CONFIG_BT_HS) -static void hci_chan_selected_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_channel_selected *ev = data; - struct hci_conn *hcon; - - bt_dev_dbg(hdev, "handle 0x%2.2x", ev->phy_handle); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - return; - - amp_read_loc_assoc_final_data(hdev, hcon); -} - -static void hci_phy_link_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_phy_link_complete *ev = data; - struct hci_conn *hcon, *bredr_hcon; - - bt_dev_dbg(hdev, "handle 0x%2.2x status 0x%2.2x", ev->phy_handle, - ev->status); - - hci_dev_lock(hdev); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - goto unlock; - - if (!hcon->amp_mgr) - goto unlock; - - if (ev->status) { - hci_conn_del(hcon); - goto unlock; - } - - bredr_hcon = hcon->amp_mgr->l2cap_conn->hcon; - - hcon->state = BT_CONNECTED; - bacpy(&hcon->dst, &bredr_hcon->dst); - - hci_conn_hold(hcon); - hcon->disc_timeout = HCI_DISCONN_TIMEOUT; - hci_conn_drop(hcon); - - hci_debugfs_create_conn(hcon); - hci_conn_add_sysfs(hcon); - - amp_physical_cfm(bredr_hcon, hcon); - -unlock: - hci_dev_unlock(hdev); -} - -static void hci_loglink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_logical_link_complete *ev = data; - struct hci_conn *hcon; - struct hci_chan *hchan; - struct amp_mgr *mgr; - - bt_dev_dbg(hdev, "log_handle 0x%4.4x phy_handle 0x%2.2x status 0x%2.2x", - le16_to_cpu(ev->handle), ev->phy_handle, ev->status); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (!hcon) - return; - - /* Create AMP hchan */ - hchan = hci_chan_create(hcon); - if (!hchan) - return; - - hchan->handle = le16_to_cpu(ev->handle); - hchan->amp = true; - - BT_DBG("hcon %p mgr %p hchan %p", hcon, hcon->amp_mgr, hchan); - - mgr = hcon->amp_mgr; - if (mgr && mgr->bredr_chan) { - struct l2cap_chan *bredr_chan = mgr->bredr_chan; - - l2cap_chan_lock(bredr_chan); - - bredr_chan->conn->mtu = hdev->block_mtu; - l2cap_logical_cfm(bredr_chan, hchan, 0); - hci_conn_hold(hcon); - - l2cap_chan_unlock(bredr_chan); - } -} - -static void hci_disconn_loglink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_disconn_logical_link_complete *ev = data; - struct hci_chan *hchan; - - bt_dev_dbg(hdev, "handle 0x%4.4x status 0x%2.2x", - le16_to_cpu(ev->handle), ev->status); - - if (ev->status) - return; - - hci_dev_lock(hdev); - - hchan = hci_chan_lookup_handle(hdev, le16_to_cpu(ev->handle)); - if (!hchan || !hchan->amp) - goto unlock; - - amp_destroy_logical_link(hchan, ev->reason); - -unlock: - hci_dev_unlock(hdev); -} - -static void hci_disconn_phylink_complete_evt(struct hci_dev *hdev, void *data, - struct sk_buff *skb) -{ - struct hci_ev_disconn_phy_link_complete *ev = data; - struct hci_conn *hcon; - - bt_dev_dbg(hdev, "status 0x%2.2x", ev->status); - - if (ev->status) - return; - - hci_dev_lock(hdev); - - hcon = hci_conn_hash_lookup_handle(hdev, ev->phy_handle); - if (hcon && hcon->type == AMP_LINK) { - hcon->state = BT_CLOSED; - hci_disconn_cfm(hcon, ev->reason); - hci_conn_del(hcon); - } - - hci_dev_unlock(hdev); -} -#endif - static void le_conn_update_addr(struct hci_conn *conn, bdaddr_t *bdaddr, u8 bdaddr_type, bdaddr_t *local_rpa) { @@ -7473,25 +7329,6 @@ static const struct hci_ev { /* [0x3e = HCI_EV_LE_META] */ HCI_EV_REQ_VL(HCI_EV_LE_META, hci_le_meta_evt, sizeof(struct hci_ev_le_meta), HCI_MAX_EVENT_SIZE), -#if IS_ENABLED(CONFIG_BT_HS) - /* [0x40 = HCI_EV_PHY_LINK_COMPLETE] */ - HCI_EV(HCI_EV_PHY_LINK_COMPLETE, hci_phy_link_complete_evt, - sizeof(struct hci_ev_phy_link_complete)), - /* [0x41 = HCI_EV_CHANNEL_SELECTED] */ - HCI_EV(HCI_EV_CHANNEL_SELECTED, hci_chan_selected_evt, - sizeof(struct hci_ev_channel_selected)), - /* [0x42 = HCI_EV_DISCONN_PHY_LINK_COMPLETE] */ - HCI_EV(HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE, - hci_disconn_loglink_complete_evt, - sizeof(struct hci_ev_disconn_logical_link_complete)), - /* [0x45 = HCI_EV_LOGICAL_LINK_COMPLETE] */ - HCI_EV(HCI_EV_LOGICAL_LINK_COMPLETE, hci_loglink_complete_evt, - sizeof(struct hci_ev_logical_link_complete)), - /* [0x46 = HCI_EV_DISCONN_LOGICAL_LINK_COMPLETE] */ - HCI_EV(HCI_EV_DISCONN_PHY_LINK_COMPLETE, - hci_disconn_phylink_complete_evt, - sizeof(struct hci_ev_disconn_phy_link_complete)), -#endif /* [0x48 = HCI_EV_NUM_COMP_BLOCKS] */ HCI_EV(HCI_EV_NUM_COMP_BLOCKS, hci_num_comp_blocks_evt, sizeof(struct hci_ev_num_comp_blocks)), From dd60de788153cf9eaaac6e68a07a5b51d23fbc2b Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 8 Nov 2024 11:19:54 -0500 Subject: [PATCH 10/76] Bluetooth: hci_core: Fix calling mgmt_device_connected [ Upstream commit 7967dc8f797f454d4f4acec15c7df0cdf4801617 ] Since 61a939c68ee0 ("Bluetooth: Queue incoming ACL data until BT_CONNECTED state is reached") there is no long the need to call mgmt_device_connected as ACL data will be queued until BT_CONNECTED state. Link: https://bugzilla.kernel.org/show_bug.cgi?id=219458 Link: https://github.com/bluez/bluez/issues/1014 Fixes: 333b4fd11e89 ("Bluetooth: L2CAP: Fix uaf in l2cap_connect") Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Sasha Levin --- net/bluetooth/hci_core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index f93f3e7a3d90..789f7f4a0908 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -3846,8 +3846,6 @@ static void hci_acldata_packet(struct hci_dev *hdev, struct sk_buff *skb) hci_dev_lock(hdev); conn = hci_conn_hash_lookup_handle(hdev, handle); - if (conn && hci_dev_test_flag(hdev, HCI_MGMT)) - mgmt_device_connected(hdev, conn, NULL, 0); hci_dev_unlock(hdev); if (conn) { From d848eb0b32682e5f42b2bd38a8e33b6eb82bbb70 Mon Sep 17 00:00:00 2001 From: Pedro Tammela Date: Tue, 14 Nov 2023 11:18:55 -0300 Subject: [PATCH 11/76] net/sched: cls_u32: replace int refcounts with proper refcounts [ Upstream commit 6b78debe1c07e6aa3c91ca0b1384bf3cb8217c50 ] Proper refcounts will always warn splat when something goes wrong, be it underflow, saturation or object resurrection. As these are always a source of bugs, use it in cls_u32 as a safeguard to prevent/catch issues. Another benefit is that the refcount API self documents the code, making clear when transitions to dead are expected. For such an update we had to make minor adaptations on u32 to fit the refcount API. First we set explicitly to '1' when objects are created, then the objects are alive until a 1 -> 0 happens, which is then released appropriately. The above made clear some redundant operations in the u32 code around the root_ht handling that were removed. The root_ht is created with a refcnt set to 1. Then when it's associated with tcf_proto it increments the refcnt to 2. Throughout the entire code the root_ht is an exceptional case and can never be referenced, therefore the refcnt never incremented/decremented. Its lifetime is always bound to tcf_proto, meaning if you delete tcf_proto the root_ht is deleted as well. The code made up for the fact that root_ht refcnt is 2 and did a double decrement to free it, which is not a fit for the refcount API. Even though refcount_t is implemented using atomics, we should observe a negligible control plane impact. Signed-off-by: Pedro Tammela Acked-by: Jamal Hadi Salim Link: https://lore.kernel.org/r/20231114141856.974326-2-pctammela@mojatatu.com Signed-off-by: Jakub Kicinski Stable-dep-of: 73af53d82076 ("net: sched: cls_u32: Fix u32's systematic failure to free IDR entries for hnodes.") Signed-off-by: Sasha Levin --- net/sched/cls_u32.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 04448bfb4d3d..adcc8de1d01b 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -70,7 +70,7 @@ struct tc_u_hnode { struct tc_u_hnode __rcu *next; u32 handle; u32 prio; - int refcnt; + refcount_t refcnt; unsigned int divisor; struct idr handle_idr; bool is_root; @@ -85,7 +85,7 @@ struct tc_u_hnode { struct tc_u_common { struct tc_u_hnode __rcu *hlist; void *ptr; - int refcnt; + refcount_t refcnt; struct idr handle_idr; struct hlist_node hnode; long knodes; @@ -357,7 +357,7 @@ static int u32_init(struct tcf_proto *tp) if (root_ht == NULL) return -ENOBUFS; - root_ht->refcnt++; + refcount_set(&root_ht->refcnt, 1); root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; root_ht->prio = tp->prio; root_ht->is_root = true; @@ -369,18 +369,20 @@ static int u32_init(struct tcf_proto *tp) kfree(root_ht); return -ENOBUFS; } + refcount_set(&tp_c->refcnt, 1); tp_c->ptr = key; INIT_HLIST_NODE(&tp_c->hnode); idr_init(&tp_c->handle_idr); hlist_add_head(&tp_c->hnode, tc_u_hash(key)); + } else { + refcount_inc(&tp_c->refcnt); } - tp_c->refcnt++; RCU_INIT_POINTER(root_ht->next, tp_c->hlist); rcu_assign_pointer(tp_c->hlist, root_ht); - root_ht->refcnt++; + /* root_ht must be destroyed when tcf_proto is destroyed */ rcu_assign_pointer(tp->root, root_ht); tp->data = tp_c; return 0; @@ -391,7 +393,7 @@ static void __u32_destroy_key(struct tc_u_knode *n) struct tc_u_hnode *ht = rtnl_dereference(n->ht_down); tcf_exts_destroy(&n->exts); - if (ht && --ht->refcnt == 0) + if (ht && refcount_dec_and_test(&ht->refcnt)) kfree(ht); kfree(n); } @@ -599,8 +601,6 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, struct tc_u_hnode __rcu **hn; struct tc_u_hnode *phn; - WARN_ON(--ht->refcnt); - u32_clear_hnode(tp, ht, extack); hn = &tp_c->hlist; @@ -628,10 +628,10 @@ static void u32_destroy(struct tcf_proto *tp, bool rtnl_held, WARN_ON(root_ht == NULL); - if (root_ht && --root_ht->refcnt == 1) + if (root_ht && refcount_dec_and_test(&root_ht->refcnt)) u32_destroy_hnode(tp, root_ht, extack); - if (--tp_c->refcnt == 0) { + if (refcount_dec_and_test(&tp_c->refcnt)) { struct tc_u_hnode *ht; hlist_del(&tp_c->hnode); @@ -643,7 +643,7 @@ static void u32_destroy(struct tcf_proto *tp, bool rtnl_held, /* u32_destroy_key() will later free ht for us, if it's * still referenced by some knode */ - if (--ht->refcnt == 0) + if (refcount_dec_and_test(&ht->refcnt)) kfree_rcu(ht, rcu); } @@ -672,7 +672,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, return -EINVAL; } - if (ht->refcnt == 1) { + if (refcount_dec_if_one(&ht->refcnt)) { u32_destroy_hnode(tp, ht, extack); } else { NL_SET_ERR_MSG_MOD(extack, "Can not delete in-use filter"); @@ -680,7 +680,7 @@ static int u32_delete(struct tcf_proto *tp, void *arg, bool *last, } out: - *last = tp_c->refcnt == 1 && tp_c->knodes == 0; + *last = refcount_read(&tp_c->refcnt) == 1 && tp_c->knodes == 0; return ret; } @@ -764,14 +764,14 @@ static int u32_set_parms(struct net *net, struct tcf_proto *tp, NL_SET_ERR_MSG_MOD(extack, "Not linking to root node"); return -EINVAL; } - ht_down->refcnt++; + refcount_inc(&ht_down->refcnt); } ht_old = rtnl_dereference(n->ht_down); rcu_assign_pointer(n->ht_down, ht_down); if (ht_old) - ht_old->refcnt--; + refcount_dec(&ht_old->refcnt); } if (ifindex >= 0) @@ -850,7 +850,7 @@ static struct tc_u_knode *u32_init_knode(struct net *net, struct tcf_proto *tp, /* bump reference count as long as we hold pointer to structure */ if (ht) - ht->refcnt++; + refcount_inc(&ht->refcnt); return new; } @@ -930,7 +930,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, ht_old = rtnl_dereference(n->ht_down); if (ht_old) - ht_old->refcnt++; + refcount_inc(&ht_old->refcnt); } __u32_destroy_key(new); return err; @@ -978,7 +978,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, return err; } } - ht->refcnt = 1; + refcount_set(&ht->refcnt, 1); ht->divisor = divisor; ht->handle = handle; ht->prio = tp->prio; From 9b2c3184e126b941dfa716b8128ed6db92a64e2f Mon Sep 17 00:00:00 2001 From: Alexandre Ferrieux Date: Sun, 10 Nov 2024 18:28:36 +0100 Subject: [PATCH 12/76] net: sched: cls_u32: Fix u32's systematic failure to free IDR entries for hnodes. [ Upstream commit 73af53d82076bbe184d9ece9e14b0dc8599e6055 ] To generate hnode handles (in gen_new_htid()), u32 uses IDR and encodes the returned small integer into a structured 32-bit word. Unfortunately, at disposal time, the needed decoding is not done. As a result, idr_remove() fails, and the IDR fills up. Since its size is 2048, the following script ends up with "Filter already exists": tc filter add dev myve $FILTER1 tc filter add dev myve $FILTER2 for i in {1..2048} do echo $i tc filter del dev myve $FILTER2 tc filter add dev myve $FILTER2 done This patch adds the missing decoding logic for handles that deserve it. Fixes: e7614370d6f0 ("net_sched: use idr to allocate u32 filter handles") Reviewed-by: Eric Dumazet Acked-by: Jamal Hadi Salim Signed-off-by: Alexandre Ferrieux Tested-by: Victor Nogueira Link: https://patch.msgid.link/20241110172836.331319-1-alexandre.ferrieux@orange.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/sched/cls_u32.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index adcc8de1d01b..e87d79d043d5 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -91,6 +91,16 @@ struct tc_u_common { long knodes; }; +static u32 handle2id(u32 h) +{ + return ((h & 0x80000000) ? ((h >> 20) & 0x7FF) : h); +} + +static u32 id2handle(u32 id) +{ + return (id | 0x800U) << 20; +} + static inline unsigned int u32_hash_fold(__be32 key, const struct tc_u32_sel *sel, u8 fshift) @@ -308,7 +318,7 @@ static u32 gen_new_htid(struct tc_u_common *tp_c, struct tc_u_hnode *ptr) int id = idr_alloc_cyclic(&tp_c->handle_idr, ptr, 1, 0x7FF, GFP_KERNEL); if (id < 0) return 0; - return (id | 0x800U) << 20; + return id2handle(id); } static struct hlist_head *tc_u_common_hash; @@ -358,7 +368,7 @@ static int u32_init(struct tcf_proto *tp) return -ENOBUFS; refcount_set(&root_ht->refcnt, 1); - root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : 0x80000000; + root_ht->handle = tp_c ? gen_new_htid(tp_c, root_ht) : id2handle(0); root_ht->prio = tp->prio; root_ht->is_root = true; idr_init(&root_ht->handle_idr); @@ -610,7 +620,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht, if (phn == ht) { u32_clear_hw_hnode(tp, ht, extack); idr_destroy(&ht->handle_idr); - idr_remove(&tp_c->handle_idr, ht->handle); + idr_remove(&tp_c->handle_idr, handle2id(ht->handle)); RCU_INIT_POINTER(*hn, ht->next); kfree_rcu(ht, rcu); return 0; @@ -987,7 +997,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, err = u32_replace_hw_hnode(tp, ht, userflags, extack); if (err) { - idr_remove(&tp_c->handle_idr, handle); + idr_remove(&tp_c->handle_idr, handle2id(handle)); kfree(ht); return err; } From 1b13c1ca46c0889406b55379da7cf1fd32c0e48c Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 12 Nov 2024 11:03:47 +0800 Subject: [PATCH 13/76] samples: pktgen: correct dev to DEV [ Upstream commit 3342dc8b4623d835e7dd76a15cec2e5a94fe2f93 ] In the pktgen_sample01_simple.sh script, the device variable is uppercase 'DEV' instead of lowercase 'dev'. Because of this typo, the script cannot enable UDP tx checksum. Fixes: 460a9aa23de6 ("samples: pktgen: add UDP tx checksum support") Signed-off-by: Wei Fang Reviewed-by: Simon Horman Acked-by: Jesper Dangaard Brouer Link: https://patch.msgid.link/20241112030347.1849335-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- samples/pktgen/pktgen_sample01_simple.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/samples/pktgen/pktgen_sample01_simple.sh b/samples/pktgen/pktgen_sample01_simple.sh index 09a92ea963f9..c8e75888a9c2 100755 --- a/samples/pktgen/pktgen_sample01_simple.sh +++ b/samples/pktgen/pktgen_sample01_simple.sh @@ -72,7 +72,7 @@ if [ -n "$DST_PORT" ]; then pg_set $DEV "udp_dst_max $UDP_DST_MAX" fi -[ ! -z "$UDP_CSUM" ] && pg_set $dev "flag UDPCSUM" +[ ! -z "$UDP_CSUM" ] && pg_set $DEV "flag UDPCSUM" # Setup random UDP port src range pg_set $DEV "flag UDPSRC_RND" From ae35fcddd3ddca9959fc8c487fae59f9f8714c15 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 11 Nov 2024 10:16:49 +0000 Subject: [PATCH 14/76] bonding: add ns target multicast address to slave device MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 8eb36164d1a6769a20ed43033510067ff3dab9ee ] Commit 4598380f9c54 ("bonding: fix ns validation on backup slaves") tried to resolve the issue where backup slaves couldn't be brought up when receiving IPv6 Neighbor Solicitation (NS) messages. However, this fix only worked for drivers that receive all multicast messages, such as the veth interface. For standard drivers, the NS multicast message is silently dropped because the slave device is not a member of the NS target multicast group. To address this, we need to make the slave device join the NS target multicast group, ensuring it can receive these IPv6 NS messages to validate the slave’s status properly. There are three policies before joining the multicast group: 1. All settings must be under active-backup mode (alb and tlb do not support arp_validate), with backup slaves and slaves supporting multicast. 2. We can add or remove multicast groups when arp_validate changes. 3. Other operations, such as enslaving, releasing, or setting NS targets, need to be guarded by arp_validate. Fixes: 4e24be018eb9 ("bonding: add new parameter ns_targets") Signed-off-by: Hangbin Liu Reviewed-by: Nikolay Aleksandrov Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/bonding/bond_main.c | 16 +++++- drivers/net/bonding/bond_options.c | 82 +++++++++++++++++++++++++++++- include/net/bond_options.h | 2 + 3 files changed, 98 insertions(+), 2 deletions(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 51d6cf0a3fb4..26a9f99882e6 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -919,6 +919,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, if (bond->dev->flags & IFF_UP) bond_hw_addr_flush(bond->dev, old_active->dev); + + bond_slave_ns_maddrs_add(bond, old_active); } if (new_active) { @@ -935,6 +937,8 @@ static void bond_hw_addr_swap(struct bonding *bond, struct slave *new_active, dev_mc_sync(new_active->dev, bond->dev); netif_addr_unlock_bh(bond->dev); } + + bond_slave_ns_maddrs_del(bond, new_active); } } @@ -2231,6 +2235,11 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, bond_compute_features(bond); bond_set_carrier(bond); + /* Needs to be called before bond_select_active_slave(), which will + * remove the maddrs if the slave is selected as active slave. + */ + bond_slave_ns_maddrs_add(bond, new_slave); + if (bond_uses_primary(bond)) { block_netpoll_tx(); bond_select_active_slave(bond); @@ -2240,7 +2249,6 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, if (bond_mode_can_use_xmit_hash(bond)) bond_update_slave_arr(bond, NULL); - if (!slave_dev->netdev_ops->ndo_bpf || !slave_dev->netdev_ops->ndo_xdp_xmit) { if (bond->xdp_prog) { @@ -2436,6 +2444,12 @@ static int __bond_release_one(struct net_device *bond_dev, if (oldcurrent == slave) bond_change_active_slave(bond, NULL); + /* Must be called after bond_change_active_slave () as the slave + * might change from an active slave to a backup slave. Then it is + * necessary to clear the maddrs on the backup slave. + */ + bond_slave_ns_maddrs_del(bond, slave); + if (bond_is_lb(bond)) { /* Must be called only after the slave has been * detached from the list and the curr_active_slave diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c index 06c4cd0f0002..c8536dc7d860 100644 --- a/drivers/net/bonding/bond_options.c +++ b/drivers/net/bonding/bond_options.c @@ -15,6 +15,7 @@ #include #include +#include static int bond_option_active_slave_set(struct bonding *bond, const struct bond_opt_value *newval); @@ -1230,6 +1231,68 @@ static int bond_option_arp_ip_targets_set(struct bonding *bond, } #if IS_ENABLED(CONFIG_IPV6) +static bool slave_can_set_ns_maddr(const struct bonding *bond, struct slave *slave) +{ + return BOND_MODE(bond) == BOND_MODE_ACTIVEBACKUP && + !bond_is_active_slave(slave) && + slave->dev->flags & IFF_MULTICAST; +} + +static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) +{ + struct in6_addr *targets = bond->params.ns_targets; + char slot_maddr[MAX_ADDR_LEN]; + int i; + + if (!slave_can_set_ns_maddr(bond, slave)) + return; + + for (i = 0; i < BOND_MAX_NS_TARGETS; i++) { + if (ipv6_addr_any(&targets[i])) + break; + + if (!ndisc_mc_map(&targets[i], slot_maddr, slave->dev, 0)) { + if (add) + dev_mc_add(slave->dev, slot_maddr); + else + dev_mc_del(slave->dev, slot_maddr); + } + } +} + +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) +{ + if (!bond->params.arp_validate) + return; + slave_set_ns_maddrs(bond, slave, true); +} + +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) +{ + if (!bond->params.arp_validate) + return; + slave_set_ns_maddrs(bond, slave, false); +} + +static void slave_set_ns_maddr(struct bonding *bond, struct slave *slave, + struct in6_addr *target, struct in6_addr *slot) +{ + char target_maddr[MAX_ADDR_LEN], slot_maddr[MAX_ADDR_LEN]; + + if (!bond->params.arp_validate || !slave_can_set_ns_maddr(bond, slave)) + return; + + /* remove the previous maddr from slave */ + if (!ipv6_addr_any(slot) && + !ndisc_mc_map(slot, slot_maddr, slave->dev, 0)) + dev_mc_del(slave->dev, slot_maddr); + + /* add new maddr on slave if target is set */ + if (!ipv6_addr_any(target) && + !ndisc_mc_map(target, target_maddr, slave->dev, 0)) + dev_mc_add(slave->dev, target_maddr); +} + static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, struct in6_addr *target, unsigned long last_rx) @@ -1239,8 +1302,10 @@ static void _bond_options_ns_ip6_target_set(struct bonding *bond, int slot, struct slave *slave; if (slot >= 0 && slot < BOND_MAX_NS_TARGETS) { - bond_for_each_slave(bond, slave, iter) + bond_for_each_slave(bond, slave, iter) { slave->target_last_arp_rx[slot] = last_rx; + slave_set_ns_maddr(bond, slave, target, &targets[slot]); + } targets[slot] = *target; } } @@ -1292,15 +1357,30 @@ static int bond_option_ns_ip6_targets_set(struct bonding *bond, { return -EPERM; } + +static void slave_set_ns_maddrs(struct bonding *bond, struct slave *slave, bool add) {} + +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave) {} + +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave) {} #endif static int bond_option_arp_validate_set(struct bonding *bond, const struct bond_opt_value *newval) { + bool changed = !!bond->params.arp_validate != !!newval->value; + struct list_head *iter; + struct slave *slave; + netdev_dbg(bond->dev, "Setting arp_validate to %s (%llu)\n", newval->string, newval->value); bond->params.arp_validate = newval->value; + if (changed) { + bond_for_each_slave(bond, slave, iter) + slave_set_ns_maddrs(bond, slave, !!bond->params.arp_validate); + } + return 0; } diff --git a/include/net/bond_options.h b/include/net/bond_options.h index 69292ecc0325..f631d9f09941 100644 --- a/include/net/bond_options.h +++ b/include/net/bond_options.h @@ -160,5 +160,7 @@ void bond_option_arp_ip_targets_clear(struct bonding *bond); #if IS_ENABLED(CONFIG_IPV6) void bond_option_ns_ip6_targets_clear(struct bonding *bond); #endif +void bond_slave_ns_maddrs_add(struct bonding *bond, struct slave *slave); +void bond_slave_ns_maddrs_del(struct bonding *bond, struct slave *slave); #endif /* _NET_BOND_OPTIONS_H */ From 609641e959ee6ed86369f6ac08c56f66547587b7 Mon Sep 17 00:00:00 2001 From: Harith G Date: Wed, 18 Sep 2024 06:57:11 +0100 Subject: [PATCH 15/76] ARM: 9419/1: mm: Fix kernel memory mapping for xip kernels [ Upstream commit ed6cbe6e5563452f305e89c15846820f2874e431 ] The patchset introducing kernel_sec_start/end variables to separate the kernel/lowmem memory mappings, broke the mapping of the kernel memory for xipkernels. kernel_sec_start/end variables are in RO area before the MMU is switched on for xipkernels. So these cannot be set early in boot in head.S. Fix this by setting these after MMU is switched on. xipkernels need two different mappings for kernel text (starting at CONFIG_XIP_PHYS_ADDR) and data (starting at CONFIG_PHYS_OFFSET). Also, move the kernel code mapping from devicemaps_init() to map_kernel(). Fixes: a91da5457085 ("ARM: 9089/1: Define kernel physical section start and end") Signed-off-by: Harith George Reviewed-by: Linus Walleij Signed-off-by: Russell King (Oracle) Signed-off-by: Sasha Levin --- arch/arm/kernel/head.S | 8 ++++++-- arch/arm/mm/mmu.c | 34 +++++++++++++++++++++------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/arm/kernel/head.S b/arch/arm/kernel/head.S index 29e2900178a1..b97da9e069a0 100644 --- a/arch/arm/kernel/head.S +++ b/arch/arm/kernel/head.S @@ -252,11 +252,15 @@ __create_page_tables: */ add r0, r4, #KERNEL_OFFSET >> (SECTION_SHIFT - PMD_ENTRY_ORDER) ldr r6, =(_end - 1) + + /* For XIP, kernel_sec_start/kernel_sec_end are currently in RO memory */ +#ifndef CONFIG_XIP_KERNEL adr_l r5, kernel_sec_start @ _pa(kernel_sec_start) #if defined CONFIG_CPU_ENDIAN_BE8 || defined CONFIG_CPU_ENDIAN_BE32 str r8, [r5, #4] @ Save physical start of kernel (BE) #else str r8, [r5] @ Save physical start of kernel (LE) +#endif #endif orr r3, r8, r7 @ Add the MMU flags add r6, r4, r6, lsr #(SECTION_SHIFT - PMD_ENTRY_ORDER) @@ -264,6 +268,7 @@ __create_page_tables: add r3, r3, #1 << SECTION_SHIFT cmp r0, r6 bls 1b +#ifndef CONFIG_XIP_KERNEL eor r3, r3, r7 @ Remove the MMU flags adr_l r5, kernel_sec_end @ _pa(kernel_sec_end) #if defined CONFIG_CPU_ENDIAN_BE8 || defined CONFIG_CPU_ENDIAN_BE32 @@ -271,8 +276,7 @@ __create_page_tables: #else str r3, [r5] @ Save physical end of kernel (LE) #endif - -#ifdef CONFIG_XIP_KERNEL +#else /* * Map the kernel image separately as it is not located in RAM. */ diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 463fc2a8448f..a39a7043f189 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -1401,18 +1401,6 @@ static void __init devicemaps_init(const struct machine_desc *mdesc) create_mapping(&map); } - /* - * Map the kernel if it is XIP. - * It is always first in the modulearea. - */ -#ifdef CONFIG_XIP_KERNEL - map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK); - map.virtual = MODULES_VADDR; - map.length = ((unsigned long)_exiprom - map.virtual + ~SECTION_MASK) & SECTION_MASK; - map.type = MT_ROM; - create_mapping(&map); -#endif - /* * Map the cache flushing regions. */ @@ -1602,12 +1590,27 @@ static void __init map_kernel(void) * This will only persist until we turn on proper memory management later on * and we remap the whole kernel with page granularity. */ +#ifdef CONFIG_XIP_KERNEL + phys_addr_t kernel_nx_start = kernel_sec_start; +#else phys_addr_t kernel_x_start = kernel_sec_start; phys_addr_t kernel_x_end = round_up(__pa(__init_end), SECTION_SIZE); phys_addr_t kernel_nx_start = kernel_x_end; +#endif phys_addr_t kernel_nx_end = kernel_sec_end; struct map_desc map; + /* + * Map the kernel if it is XIP. + * It is always first in the modulearea. + */ +#ifdef CONFIG_XIP_KERNEL + map.pfn = __phys_to_pfn(CONFIG_XIP_PHYS_ADDR & SECTION_MASK); + map.virtual = MODULES_VADDR; + map.length = ((unsigned long)_exiprom - map.virtual + ~SECTION_MASK) & SECTION_MASK; + map.type = MT_ROM; + create_mapping(&map); +#else map.pfn = __phys_to_pfn(kernel_x_start); map.virtual = __phys_to_virt(kernel_x_start); map.length = kernel_x_end - kernel_x_start; @@ -1617,7 +1620,7 @@ static void __init map_kernel(void) /* If the nx part is small it may end up covered by the tail of the RWX section */ if (kernel_x_end == kernel_nx_end) return; - +#endif map.pfn = __phys_to_pfn(kernel_nx_start); map.virtual = __phys_to_virt(kernel_nx_start); map.length = kernel_nx_end - kernel_nx_start; @@ -1762,6 +1765,11 @@ void __init paging_init(const struct machine_desc *mdesc) { void *zero_page; +#ifdef CONFIG_XIP_KERNEL + /* Store the kernel RW RAM region start/end in these variables */ + kernel_sec_start = CONFIG_PHYS_OFFSET & SECTION_MASK; + kernel_sec_end = round_up(__pa(_end), SECTION_SIZE); +#endif pr_debug("physical kernel sections: 0x%08llx-0x%08llx\n", kernel_sec_start, kernel_sec_end); From 69556613d99ba5d848a5fba5d04355a061de5219 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Wed, 11 Sep 2024 16:16:15 +0800 Subject: [PATCH 16/76] x86/mm: Fix a kdump kernel failure on SME system when CONFIG_IMA_KEXEC=y MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 8d9ffb2fe65a6c4ef114e8d4f947958a12751bbe upstream. The kdump kernel is broken on SME systems with CONFIG_IMA_KEXEC=y enabled. Debugging traced the issue back to b69a2afd5afc ("x86/kexec: Carry forward IMA measurement log on kexec"). Testing was previously not conducted on SME systems with CONFIG_IMA_KEXEC enabled, which led to the oversight, with the following incarnation: ... ima: No TPM chip found, activating TPM-bypass! Loading compiled-in module X.509 certificates Loaded X.509 cert 'Build time autogenerated kernel key: 18ae0bc7e79b64700122bb1d6a904b070fef2656' ima: Allocated hash algorithm: sha256 Oops: general protection fault, probably for non-canonical address 0xcfacfdfe6660003e: 0000 [#1] PREEMPT SMP NOPTI CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.11.0-rc2+ #14 Hardware name: Dell Inc. PowerEdge R7425/02MJ3T, BIOS 1.20.0 05/03/2023 RIP: 0010:ima_restore_measurement_list Call Trace: ? show_trace_log_lvl ? show_trace_log_lvl ? ima_load_kexec_buffer ? __die_body.cold ? die_addr ? exc_general_protection ? asm_exc_general_protection ? ima_restore_measurement_list ? vprintk_emit ? ima_load_kexec_buffer ima_load_kexec_buffer ima_init ? __pfx_init_ima init_ima ? __pfx_init_ima do_one_initcall do_initcalls ? __pfx_kernel_init kernel_init_freeable kernel_init ret_from_fork ? __pfx_kernel_init ret_from_fork_asm Modules linked in: ---[ end trace 0000000000000000 ]--- ... Kernel panic - not syncing: Fatal exception Kernel Offset: disabled Rebooting in 10 seconds.. Adding debug printks showed that the stored addr and size of ima_kexec buffer are not decrypted correctly like: ima: ima_load_kexec_buffer, buffer:0xcfacfdfe6660003e, size:0xe48066052d5df359 Three types of setup_data info — SETUP_EFI, - SETUP_IMA, and - SETUP_RNG_SEED are passed to the kexec/kdump kernel. Only the ima_kexec buffer experienced incorrect decryption. Debugging identified a bug in early_memremap_is_setup_data(), where an incorrect range calculation occurred due to the len variable in struct setup_data ended up only representing the length of the data field, excluding the struct's size, and thus leading to miscalculation. Address a similar issue in memremap_is_setup_data() while at it. [ bp: Heavily massage. ] Fixes: b3c72fc9a78e ("x86/boot: Introduce setup_indirect") Signed-off-by: Baoquan He Signed-off-by: Borislav Petkov (AMD) Acked-by: Tom Lendacky Cc: Link: https://lore.kernel.org/r/20240911081615.262202-3-bhe@redhat.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/mm/ioremap.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 6453fbaedb08..aa5234034c50 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -650,7 +650,8 @@ static bool memremap_is_setup_data(resource_size_t phys_addr, paddr_next = data->next; len = data->len; - if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + if ((phys_addr > paddr) && + (phys_addr < (paddr + sizeof(struct setup_data) + len))) { memunmap(data); return true; } @@ -712,7 +713,8 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr, paddr_next = data->next; len = data->len; - if ((phys_addr > paddr) && (phys_addr < (paddr + len))) { + if ((phys_addr > paddr) && + (phys_addr < (paddr + sizeof(struct setup_data) + len))) { early_memunmap(data, sizeof(*data)); return true; } From 6addb2d9501ec866d7b3a3b4e665307c437e9be2 Mon Sep 17 00:00:00 2001 From: Jinjiang Tu Date: Wed, 13 Nov 2024 16:32:35 +0800 Subject: [PATCH 17/76] mm: fix NULL pointer dereference in alloc_pages_bulk_noprof commit 8ce41b0f9d77cca074df25afd39b86e2ee3aa68e upstream. We triggered a NULL pointer dereference for ac.preferred_zoneref->zone in alloc_pages_bulk_noprof() when the task is migrated between cpusets. When cpuset is enabled, in prepare_alloc_pages(), ac->nodemask may be ¤t->mems_allowed. when first_zones_zonelist() is called to find preferred_zoneref, the ac->nodemask may be modified concurrently if the task is migrated between different cpusets. Assuming we have 2 NUMA Node, when traversing Node1 in ac->zonelist, the nodemask is 2, and when traversing Node2 in ac->zonelist, the nodemask is 1. As a result, the ac->preferred_zoneref points to NULL zone. In alloc_pages_bulk_noprof(), for_each_zone_zonelist_nodemask() finds a allowable zone and calls zonelist_node_idx(ac.preferred_zoneref), leading to NULL pointer dereference. __alloc_pages_noprof() fixes this issue by checking NULL pointer in commit ea57485af8f4 ("mm, page_alloc: fix check for NULL preferred_zone") and commit df76cee6bbeb ("mm, page_alloc: remove redundant checks from alloc fastpath"). To fix it, check NULL pointer for preferred_zoneref->zone. Link: https://lkml.kernel.org/r/20241113083235.166798-1-tujinjiang@huawei.com Fixes: 387ba26fb1cb ("mm/page_alloc: add a bulk page allocator") Signed-off-by: Jinjiang Tu Reviewed-by: Vlastimil Babka Cc: Alexander Lobakin Cc: David Hildenbrand Cc: Kefeng Wang Cc: Mel Gorman Cc: Nanyong Sun Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b87b350b2f40..58a3f70eb39b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5457,7 +5457,8 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, gfp = alloc_gfp; /* Find an allowed local zone that meets the low watermark. */ - for_each_zone_zonelist_nodemask(zone, z, ac.zonelist, ac.highest_zoneidx, ac.nodemask) { + z = ac.preferred_zoneref; + for_next_zone_zonelist_nodemask(zone, z, ac.highest_zoneidx, ac.nodemask) { unsigned long mark; if (cpusets_enabled() && (alloc_flags & ALLOC_CPUSET) && From 620d22598110b0d0cb97a3fcca65fc473ea86e73 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Thu, 14 Nov 2024 07:38:44 +0300 Subject: [PATCH 18/76] ocfs2: uncache inode which has failed entering the group commit 737f34137844d6572ab7d473c998c7f977ff30eb upstream. Syzbot has reported the following BUG: kernel BUG at fs/ocfs2/uptodate.c:509! ... Call Trace: ? __die_body+0x5f/0xb0 ? die+0x9e/0xc0 ? do_trap+0x15a/0x3a0 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ? do_error_trap+0x1dc/0x2c0 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ? __pfx_do_error_trap+0x10/0x10 ? handle_invalid_op+0x34/0x40 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ? exc_invalid_op+0x38/0x50 ? asm_exc_invalid_op+0x1a/0x20 ? ocfs2_set_new_buffer_uptodate+0x2e/0x160 ? ocfs2_set_new_buffer_uptodate+0x144/0x160 ? ocfs2_set_new_buffer_uptodate+0x145/0x160 ocfs2_group_add+0x39f/0x15a0 ? __pfx_ocfs2_group_add+0x10/0x10 ? __pfx_lock_acquire+0x10/0x10 ? mnt_get_write_access+0x68/0x2b0 ? __pfx_lock_release+0x10/0x10 ? rcu_read_lock_any_held+0xb7/0x160 ? __pfx_rcu_read_lock_any_held+0x10/0x10 ? smack_log+0x123/0x540 ? mnt_get_write_access+0x68/0x2b0 ? mnt_get_write_access+0x68/0x2b0 ? mnt_get_write_access+0x226/0x2b0 ocfs2_ioctl+0x65e/0x7d0 ? __pfx_ocfs2_ioctl+0x10/0x10 ? smack_file_ioctl+0x29e/0x3a0 ? __pfx_smack_file_ioctl+0x10/0x10 ? lockdep_hardirqs_on_prepare+0x43d/0x780 ? __pfx_lockdep_hardirqs_on_prepare+0x10/0x10 ? __pfx_ocfs2_ioctl+0x10/0x10 __se_sys_ioctl+0xfb/0x170 do_syscall_64+0xf3/0x230 entry_SYSCALL_64_after_hwframe+0x77/0x7f ... When 'ioctl(OCFS2_IOC_GROUP_ADD, ...)' has failed for the particular inode in 'ocfs2_verify_group_and_input()', corresponding buffer head remains cached and subsequent call to the same 'ioctl()' for the same inode issues the BUG() in 'ocfs2_set_new_buffer_uptodate()' (trying to cache the same buffer head of that inode). Fix this by uncaching the buffer head with 'ocfs2_remove_from_cache()' on error path in 'ocfs2_group_add()'. Link: https://lkml.kernel.org/r/20241114043844.111847-1-dmantipov@yandex.ru Fixes: 7909f2bf8353 ("[PATCH 2/2] ocfs2: Implement group add for online resize") Signed-off-by: Dmitry Antipov Reported-by: syzbot+453873f1588c2d75b447@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=453873f1588c2d75b447 Reviewed-by: Joseph Qi Cc: Dmitry Antipov Cc: Joel Becker Cc: Mark Fasheh Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/resize.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index d65d43c61857..b2b47bb79529 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -566,6 +566,8 @@ out_commit: ocfs2_commit_trans(osb, handle); out_free_group_bh: + if (ret < 0) + ocfs2_remove_from_cache(INODE_CACHE(inode), group_bh); brelse(group_bh); out_unlock: From 9012d29347689a03de32be8336ccad2e4749edd0 Mon Sep 17 00:00:00 2001 From: Si-Wei Liu Date: Mon, 21 Oct 2024 16:40:39 +0300 Subject: [PATCH 19/76] vdpa/mlx5: Fix PA offset with unaligned starting iotlb map commit 29ce8b8a4fa74e841342c8b8f8941848a3c6f29f upstream. When calculating the physical address range based on the iotlb and mr [start,end) ranges, the offset of mr->start relative to map->start is not taken into account. This leads to some incorrect and duplicate mappings. For the case when mr->start < map->start the code is already correct: the range in [mr->start, map->start) was handled by a different iteration. Fixes: 94abbccdf291 ("vdpa/mlx5: Add shared memory registration code") Cc: stable@vger.kernel.org Signed-off-by: Si-Wei Liu Signed-off-by: Dragos Tatulea Message-Id: <20241021134040.975221-2-dtatulea@nvidia.com> Signed-off-by: Michael S. Tsirkin Acked-by: Jason Wang Signed-off-by: Greg Kroah-Hartman --- drivers/vdpa/mlx5/core/mr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index 113aac0446de..4f0a2edc2333 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -232,7 +232,7 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr struct page *pg; unsigned int nsg; int sglen; - u64 pa; + u64 pa, offset; u64 paend; struct scatterlist *sg; struct device *dma = mvdev->vdev.dma_dev; @@ -255,8 +255,10 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr sg = mr->sg_head.sgl; for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); map; map = vhost_iotlb_itree_next(map, mr->start, mr->end - 1)) { - paend = map->addr + maplen(map, mr); - for (pa = map->addr; pa < paend; pa += sglen) { + offset = mr->start > map->start ? mr->start - map->start : 0; + pa = map->addr + offset; + paend = map->addr + offset + maplen(map, mr); + for (; pa < paend; pa += sglen) { pg = pfn_to_page(__phys_to_pfn(pa)); if (!sg) { mlx5_vdpa_warn(mvdev, "sg null. start 0x%llx, end 0x%llx\n", From 870d68fe17b5d9032049dcad98b5781a344a8657 Mon Sep 17 00:00:00 2001 From: Xiaoguang Wang Date: Tue, 5 Nov 2024 21:35:18 +0800 Subject: [PATCH 20/76] vp_vdpa: fix id_table array not null terminated error commit 4e39ecadf1d2a08187139619f1f314b64ba7d947 upstream. Allocate one extra virtio_device_id as null terminator, otherwise vdpa_mgmtdev_get_classes() may iterate multiple times and visit undefined memory. Fixes: ffbda8e9df10 ("vdpa/vp_vdpa : add vdpa tool support in vp_vdpa") Cc: stable@vger.kernel.org Suggested-by: Parav Pandit Signed-off-by: Angus Chen Signed-off-by: Xiaoguang Wang Message-Id: <20241105133518.1494-1-lege.wang@jaguarmicro.com> Signed-off-by: Michael S. Tsirkin Reviewed-by: Parav Pandit Acked-by: Jason Wang Signed-off-by: Greg Kroah-Hartman --- drivers/vdpa/virtio_pci/vp_vdpa.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c index 281287fae89f..1d6d89c08e6e 100644 --- a/drivers/vdpa/virtio_pci/vp_vdpa.c +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -591,7 +591,11 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto mdev_err; } - mdev_id = kzalloc(sizeof(struct virtio_device_id), GFP_KERNEL); + /* + * id_table should be a null terminated array, so allocate one additional + * entry here, see vdpa_mgmtdev_get_classes(). + */ + mdev_id = kcalloc(2, sizeof(struct virtio_device_id), GFP_KERNEL); if (!mdev_id) { err = -ENOMEM; goto mdev_id_err; @@ -611,8 +615,8 @@ static int vp_vdpa_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto probe_err; } - mdev_id->device = mdev->id.device; - mdev_id->vendor = mdev->id.vendor; + mdev_id[0].device = mdev->id.device; + mdev_id[0].vendor = mdev->id.vendor; mgtdev->id_table = mdev_id; mgtdev->max_supported_vqs = vp_modern_get_num_queues(mdev); mgtdev->supported_features = vp_modern_get_features(mdev); From e01aae58e818503f2ffcd34c6f7dc6f90af1057e Mon Sep 17 00:00:00 2001 From: Samasth Norway Ananda Date: Wed, 7 Aug 2024 10:27:13 -0700 Subject: [PATCH 21/76] ima: fix buffer overrun in ima_eventdigest_init_common commit 923168a0631bc42fffd55087b337b1b6c54dcff5 upstream. Function ima_eventdigest_init() calls ima_eventdigest_init_common() with HASH_ALGO__LAST which is then used to access the array hash_digest_size[] leading to buffer overrun. Have a conditional statement to handle this. Fixes: 9fab303a2cb3 ("ima: fix violation measurement list record") Signed-off-by: Samasth Norway Ananda Tested-by: Enrico Bravi (PhD at polito.it) Cc: stable@vger.kernel.org # 5.19+ Signed-off-by: Mimi Zohar Signed-off-by: Greg Kroah-Hartman --- security/integrity/ima/ima_template_lib.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/security/integrity/ima/ima_template_lib.c b/security/integrity/ima/ima_template_lib.c index 41ec31debe87..bf5267bcd7fe 100644 --- a/security/integrity/ima/ima_template_lib.c +++ b/security/integrity/ima/ima_template_lib.c @@ -318,15 +318,21 @@ static int ima_eventdigest_init_common(const u8 *digest, u32 digestsize, hash_algo_name[hash_algo]); } - if (digest) + if (digest) { memcpy(buffer + offset, digest, digestsize); - else + } else { /* * If digest is NULL, the event being recorded is a violation. * Make room for the digest by increasing the offset by the - * hash algorithm digest size. + * hash algorithm digest size. If the hash algorithm is not + * specified increase the offset by IMA_DIGEST_SIZE which + * fits SHA1 or MD5 */ - offset += hash_digest_size[hash_algo]; + if (hash_algo < HASH_ALGO__LAST) + offset += hash_digest_size[hash_algo]; + else + offset += IMA_DIGEST_SIZE; + } return ima_write_template_field_data(buffer, offset + digestsize, fmt, field_data); From a22120826327c2bac9811d9714998a9f039b9979 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Thu, 31 Oct 2024 13:20:11 -0700 Subject: [PATCH 22/76] KVM: nVMX: Treat vpid01 as current if L2 is active, but with VPID disabled commit 2657b82a78f18528bef56dc1b017158490970873 upstream. When getting the current VPID, e.g. to emulate a guest TLB flush, return vpid01 if L2 is running but with VPID disabled, i.e. if VPID is disabled in vmcs12. Architecturally, if VPID is disabled, then the guest and host effectively share VPID=0. KVM emulates this behavior by using vpid01 when running an L2 with VPID disabled (see prepare_vmcs02_early_rare()), and so KVM must also treat vpid01 as the current VPID while L2 is active. Unconditionally treating vpid02 as the current VPID when L2 is active causes KVM to flush TLB entries for vpid02 instead of vpid01, which results in TLB entries from L1 being incorrectly preserved across nested VM-Enter to L2 (L2=>L1 isn't problematic, because the TLB flush after nested VM-Exit flushes vpid01). The bug manifests as failures in the vmx_apicv_test KVM-Unit-Test, as KVM incorrectly retains TLB entries for the APIC-access page across a nested VM-Enter. Opportunisticaly add comments at various touchpoints to explain the architectural requirements, and also why KVM uses vpid01 instead of vpid02. All credit goes to Chao, who root caused the issue and identified the fix. Link: https://lore.kernel.org/all/ZwzczkIlYGX+QXJz@intel.com Fixes: 2b4a5a5d5688 ("KVM: nVMX: Flush current VPID (L1 vs. L2) for KVM_REQ_TLB_FLUSH_GUEST") Cc: stable@vger.kernel.org Cc: Like Xu Debugged-by: Chao Gao Reviewed-by: Chao Gao Tested-by: Chao Gao Link: https://lore.kernel.org/r/20241031202011.1580522-1-seanjc@google.com Signed-off-by: Sean Christopherson Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx/nested.c | 30 +++++++++++++++++++++++++----- arch/x86/kvm/vmx/vmx.c | 2 +- 2 files changed, 26 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 2283f485a81f..8052f8b7d8e1 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -1126,11 +1126,14 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu, struct vcpu_vmx *vmx = to_vmx(vcpu); /* - * If vmcs12 doesn't use VPID, L1 expects linear and combined mappings - * for *all* contexts to be flushed on VM-Enter/VM-Exit, i.e. it's a - * full TLB flush from the guest's perspective. This is required even - * if VPID is disabled in the host as KVM may need to synchronize the - * MMU in response to the guest TLB flush. + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host, and so architecturally, linear and combined + * mappings for VPID=0 must be flushed at VM-Enter and VM-Exit. KVM + * emulates L2 sharing L1's VPID=0 by using vpid01 while running L2, + * and so KVM must also emulate TLB flush of VPID=0, i.e. vpid01. This + * is required if VPID is disabled in KVM, as a TLB flush (there are no + * VPIDs) still occurs from L1's perspective, and KVM may need to + * synchronize the MMU in response to the guest TLB flush. * * Note, using TLB_FLUSH_GUEST is correct even if nested EPT is in use. * EPT is a special snowflake, as guest-physical mappings aren't @@ -2196,6 +2199,17 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, vmcs_write64(VMCS_LINK_POINTER, INVALID_GPA); + /* + * If VPID is disabled, then guest TLB accesses use VPID=0, i.e. the + * same VPID as the host. Emulate this behavior by using vpid01 for L2 + * if VPID is disabled in vmcs12. Note, if VPID is disabled, VM-Enter + * and VM-Exit are architecturally required to flush VPID=0, but *only* + * VPID=0. I.e. using vpid02 would be ok (so long as KVM emulates the + * required flushes), but doing so would cause KVM to over-flush. E.g. + * if L1 runs L2 X with VPID12=1, then runs L2 Y with VPID12 disabled, + * and then runs L2 X again, then KVM can and should retain TLB entries + * for VPID12=1. + */ if (enable_vpid) { if (nested_cpu_has_vpid(vmcs12) && vmx->nested.vpid02) vmcs_write16(VIRTUAL_PROCESSOR_ID, vmx->nested.vpid02); @@ -5758,6 +5772,12 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return nested_vmx_fail(vcpu, VMXERR_INVALID_OPERAND_TO_INVEPT_INVVPID); + /* + * Always flush the effective vpid02, i.e. never flush the current VPID + * and never explicitly flush vpid01. INVVPID targets a VPID, not a + * VMCS, and so whether or not the current vmcs12 has VPID enabled is + * irrelevant (and there may not be a loaded vmcs12). + */ vpid02 = nested_get_vpid02(vcpu); switch (type) { case VMX_VPID_EXTENT_INDIVIDUAL_ADDR: diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 87abf4eebf8a..6baa4aa20fc7 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -3098,7 +3098,7 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu) static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu) { - if (is_guest_mode(vcpu)) + if (is_guest_mode(vcpu) && nested_cpu_has_vpid(get_vmcs12(vcpu))) return nested_get_vpid02(vcpu); return to_vmx(vcpu)->vpid; } From 14db62eb01fa7a224c0c416253a540ba6f41f96d Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 5 Nov 2024 17:51:35 -0800 Subject: [PATCH 23/76] KVM: x86: Unconditionally set irr_pending when updating APICv state commit d3ddef46f22e8c3124e0df1f325bc6a18dadff39 upstream. Always set irr_pending (to true) when updating APICv status to fix a bug where KVM fails to set irr_pending when userspace sets APIC state and APICv is disabled, which ultimate results in KVM failing to inject the pending interrupt(s) that userspace stuffed into the vIRR, until another interrupt happens to be emulated by KVM. Only the APICv-disabled case is flawed, as KVM forces apic->irr_pending to be true if APICv is enabled, because not all vIRR updates will be visible to KVM. Hit the bug with a big hammer, even though strictly speaking KVM can scan the vIRR and set/clear irr_pending as appropriate for this specific case. The bug was introduced by commit 755c2bf87860 ("KVM: x86: lapic: don't touch irr_pending in kvm_apic_update_apicv when inhibiting it"), which as the shortlog suggests, deleted code that updated irr_pending. Before that commit, kvm_apic_update_apicv() did indeed scan the vIRR, with with the crucial difference that kvm_apic_update_apicv() did the scan even when APICv was being *disabled*, e.g. due to an AVIC inhibition. struct kvm_lapic *apic = vcpu->arch.apic; if (vcpu->arch.apicv_active) { /* irr_pending is always true when apicv is activated. */ apic->irr_pending = true; apic->isr_count = 1; } else { apic->irr_pending = (apic_search_irr(apic) != -1); apic->isr_count = count_vectors(apic->regs + APIC_ISR); } And _that_ bug (clearing irr_pending) was introduced by commit b26a695a1d78 ("kvm: lapic: Introduce APICv update helper function"), prior to which KVM unconditionally set irr_pending to true in kvm_apic_set_state(), i.e. assumed that the new virtual APIC state could have a pending IRQ. Furthermore, in addition to introducing this issue, commit 755c2bf87860 also papered over the underlying bug: KVM doesn't ensure CPUs and devices see APICv as disabled prior to searching the IRR. Waiting until KVM emulates an EOI to update irr_pending "works", but only because KVM won't emulate EOI until after refresh_apicv_exec_ctrl(), and there are plenty of memory barriers in between. I.e. leaving irr_pending set is basically hacking around bad ordering. So, effectively revert to the pre-b26a695a1d78 behavior for state restore, even though it's sub-optimal if no IRQs are pending, in order to provide a minimal fix, but leave behind a FIXME to document the ugliness. With luck, the ordering issue will be fixed and the mess will be cleaned up in the not-too-distant future. Fixes: 755c2bf87860 ("KVM: x86: lapic: don't touch irr_pending in kvm_apic_update_apicv when inhibiting it") Cc: stable@vger.kernel.org Cc: Maxim Levitsky Reported-by: Yong He Closes: https://lkml.kernel.org/r/20241023124527.1092810-1-alexyonghe%40tencent.com Signed-off-by: Sean Christopherson Message-ID: <20241106015135.2462147-1-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/lapic.c | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index 246374cb3ed3..7f57dce5c828 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2453,19 +2453,26 @@ void kvm_apic_update_apicv(struct kvm_vcpu *vcpu) { struct kvm_lapic *apic = vcpu->arch.apic; - if (apic->apicv_active) { - /* irr_pending is always true when apicv is activated. */ - apic->irr_pending = true; + /* + * When APICv is enabled, KVM must always search the IRR for a pending + * IRQ, as other vCPUs and devices can set IRR bits even if the vCPU + * isn't running. If APICv is disabled, KVM _should_ search the IRR + * for a pending IRQ. But KVM currently doesn't ensure *all* hardware, + * e.g. CPUs and IOMMUs, has seen the change in state, i.e. searching + * the IRR at this time could race with IRQ delivery from hardware that + * still sees APICv as being enabled. + * + * FIXME: Ensure other vCPUs and devices observe the change in APICv + * state prior to updating KVM's metadata caches, so that KVM + * can safely search the IRR and set irr_pending accordingly. + */ + apic->irr_pending = true; + + if (apic->apicv_active) apic->isr_count = 1; - } else { - /* - * Don't clear irr_pending, searching the IRR can race with - * updates from the CPU as APICv is still active from hardware's - * perspective. The flag will be cleared as appropriate when - * KVM injects the interrupt. - */ + else apic->isr_count = count_vectors(apic->regs + APIC_ISR); - } + apic->highest_isr_cache = -1; } EXPORT_SYMBOL_GPL(kvm_apic_update_apicv); From e6716f4230a8784957273ddd27326264b27b9313 Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Fri, 1 Nov 2024 11:50:30 -0700 Subject: [PATCH 24/76] KVM: VMX: Bury Intel PT virtualization (guest/host mode) behind CONFIG_BROKEN commit aa0d42cacf093a6fcca872edc954f6f812926a17 upstream. Hide KVM's pt_mode module param behind CONFIG_BROKEN, i.e. disable support for virtualizing Intel PT via guest/host mode unless BROKEN=y. There are myriad bugs in the implementation, some of which are fatal to the guest, and others which put the stability and health of the host at risk. For guest fatalities, the most glaring issue is that KVM fails to ensure tracing is disabled, and *stays* disabled prior to VM-Enter, which is necessary as hardware disallows loading (the guest's) RTIT_CTL if tracing is enabled (enforced via a VMX consistency check). Per the SDM: If the logical processor is operating with Intel PT enabled (if IA32_RTIT_CTL.TraceEn = 1) at the time of VM entry, the "load IA32_RTIT_CTL" VM-entry control must be 0. On the host side, KVM doesn't validate the guest CPUID configuration provided by userspace, and even worse, uses the guest configuration to decide what MSRs to save/load at VM-Enter and VM-Exit. E.g. configuring guest CPUID to enumerate more address ranges than are supported in hardware will result in KVM trying to passthrough, save, and load non-existent MSRs, which generates a variety of WARNs, ToPA ERRORs in the host, a potential deadlock, etc. Fixes: f99e3daf94ff ("KVM: x86: Add Intel PT virtualization work mode") Cc: stable@vger.kernel.org Cc: Adrian Hunter Signed-off-by: Sean Christopherson Reviewed-by: Xiaoyao Li Tested-by: Adrian Hunter Message-ID: <20241101185031.1799556-2-seanjc@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx/vmx.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 6baa4aa20fc7..460ba48eb66c 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -209,9 +209,11 @@ module_param(ple_window_shrink, uint, 0444); static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX; module_param(ple_window_max, uint, 0444); -/* Default is SYSTEM mode, 1 for host-guest mode */ +/* Default is SYSTEM mode, 1 for host-guest mode (which is BROKEN) */ int __read_mostly pt_mode = PT_MODE_SYSTEM; +#ifdef CONFIG_BROKEN module_param(pt_mode, int, S_IRUGO); +#endif static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush); static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond); From 3b2a4fd9bbee77afdd3ed5a05a0c02b6cde8d3b9 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 7 Nov 2024 01:07:32 +0900 Subject: [PATCH 25/76] nilfs2: fix null-ptr-deref in block_touch_buffer tracepoint commit cd45e963e44b0f10d90b9e6c0e8b4f47f3c92471 upstream. Patch series "nilfs2: fix null-ptr-deref bugs on block tracepoints". This series fixes null pointer dereference bugs that occur when using nilfs2 and two block-related tracepoints. This patch (of 2): It has been reported that when using "block:block_touch_buffer" tracepoint, touch_buffer() called from __nilfs_get_folio_block() causes a NULL pointer dereference, or a general protection fault when KASAN is enabled. This happens because since the tracepoint was added in touch_buffer(), it references the dev_t member bh->b_bdev->bd_dev regardless of whether the buffer head has a pointer to a block_device structure. In the current implementation, the block_device structure is set after the function returns to the caller. Here, touch_buffer() is used to mark the folio/page that owns the buffer head as accessed, but the common search helper for folio/page used by the caller function was optimized to mark the folio/page as accessed when it was reimplemented a long time ago, eliminating the need to call touch_buffer() here in the first place. So this solves the issue by eliminating the touch_buffer() call itself. Link: https://lkml.kernel.org/r/20241106160811.3316-1-konishi.ryusuke@gmail.com Link: https://lkml.kernel.org/r/20241106160811.3316-2-konishi.ryusuke@gmail.com Fixes: 5305cb830834 ("block: add block_{touch|dirty}_buffer tracepoint") Signed-off-by: Ryusuke Konishi Reported-by: Ubisectech Sirius Closes: https://lkml.kernel.org/r/86bd3013-887e-4e38-960f-ca45c657f032.bugreport@valiantsec.com Reported-by: syzbot+9982fb8d18eba905abe2@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=9982fb8d18eba905abe2 Tested-by: syzbot+9982fb8d18eba905abe2@syzkaller.appspotmail.com Cc: Tejun Heo Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/page.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 6bc4cda804e1..0854b786a513 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -39,7 +39,6 @@ __nilfs_get_page_block(struct page *page, unsigned long block, pgoff_t index, first_block = (unsigned long)index << (PAGE_SHIFT - blkbits); bh = nilfs_page_get_nth_block(page, block - first_block); - touch_buffer(bh); wait_on_buffer(bh); return bh; } From 7d4dea25ccd84a0036a95d29a9099eec3f6aee69 Mon Sep 17 00:00:00 2001 From: Kailang Yang Date: Fri, 25 Oct 2024 16:37:57 +0800 Subject: [PATCH 26/76] ALSA: hda/realtek - Fixed Clevo platform headset Mic issue commit 42ee87df8530150d637aa48363b72b22a9bbd78f upstream. Clevo platform with ALC255 Headset Mic was disable by default. Assigned verb table for Mic pin will enable it. Signed-off-by: Kailang Yang Cc: Link: https://lore.kernel.org/b2dcac3e09ef4f82b36d6712194e1ea4@realtek.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 1b5527cb59a9..4bdfad4e995b 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -10872,6 +10872,8 @@ static const struct snd_hda_pin_quirk alc269_fallback_pin_fixup_tbl[] = { {0x1a, 0x40000000}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1043, "ASUS", ALC2XX_FIXUP_HEADSET_MIC, {0x19, 0x40000000}), + SND_HDA_PIN_QUIRK(0x10ec0255, 0x1558, "Clevo", ALC2XX_FIXUP_HEADSET_MIC, + {0x19, 0x40000000}), {} }; From 89f886a0928dd939d6476a48d3a3ccfeb668a08b Mon Sep 17 00:00:00 2001 From: Maksym Glubokiy Date: Tue, 12 Nov 2024 17:48:15 +0200 Subject: [PATCH 27/76] ALSA: hda/realtek: fix mute/micmute LEDs for a HP EliteBook 645 G10 commit 96409eeab8cdd394e03ec494ea9547edc27f7ab4 upstream. HP EliteBook 645 G10 uses ALC236 codec and need the ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF quirk to make mute LED and micmute LED work. Signed-off-by: Maksym Glubokiy Cc: Link: https://patch.msgid.link/20241112154815.10888-1-maxgl.kernel@gmail.com Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 4bdfad4e995b..a56ec9bd90fa 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -9840,6 +9840,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { SND_PCI_QUIRK(0x103c, 0x8b59, "HP Elite mt645 G7 Mobile Thin Client U89", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b5d, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b5e, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8b5f, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b63, "HP Elite Dragonfly 13.5 inch G4", ALC245_FIXUP_CS35L41_SPI_4_HP_GPIO_LED), SND_PCI_QUIRK(0x103c, 0x8b65, "HP ProBook 455 15.6 inch G10 Notebook PC", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), SND_PCI_QUIRK(0x103c, 0x8b66, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), From c5e0a6af5f2444f8928e9ed1b71f521ea84d5d23 Mon Sep 17 00:00:00 2001 From: Dmitry Antipov Date: Wed, 6 Nov 2024 12:21:00 +0300 Subject: [PATCH 28/76] ocfs2: fix UBSAN warning in ocfs2_verify_volume() commit 23aab037106d46e6168ce1214a958ce9bf317f2e upstream. Syzbot has reported the following splat triggered by UBSAN: UBSAN: shift-out-of-bounds in fs/ocfs2/super.c:2336:10 shift exponent 32768 is too large for 32-bit type 'int' CPU: 2 UID: 0 PID: 5255 Comm: repro Not tainted 6.12.0-rc4-syzkaller-00047-gc2ee9f594da8 #0 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-3.fc41 04/01/2014 Call Trace: dump_stack_lvl+0x241/0x360 ? __pfx_dump_stack_lvl+0x10/0x10 ? __pfx__printk+0x10/0x10 ? __asan_memset+0x23/0x50 ? lockdep_init_map_type+0xa1/0x910 __ubsan_handle_shift_out_of_bounds+0x3c8/0x420 ocfs2_fill_super+0xf9c/0x5750 ? __pfx_ocfs2_fill_super+0x10/0x10 ? __pfx_validate_chain+0x10/0x10 ? __pfx_validate_chain+0x10/0x10 ? validate_chain+0x11e/0x5920 ? __lock_acquire+0x1384/0x2050 ? __pfx_validate_chain+0x10/0x10 ? string+0x26a/0x2b0 ? widen_string+0x3a/0x310 ? string+0x26a/0x2b0 ? bdev_name+0x2b1/0x3c0 ? pointer+0x703/0x1210 ? __pfx_pointer+0x10/0x10 ? __pfx_format_decode+0x10/0x10 ? __lock_acquire+0x1384/0x2050 ? vsnprintf+0x1ccd/0x1da0 ? snprintf+0xda/0x120 ? __pfx_lock_release+0x10/0x10 ? do_raw_spin_lock+0x14f/0x370 ? __pfx_snprintf+0x10/0x10 ? set_blocksize+0x1f9/0x360 ? sb_set_blocksize+0x98/0xf0 ? setup_bdev_super+0x4e6/0x5d0 mount_bdev+0x20c/0x2d0 ? __pfx_ocfs2_fill_super+0x10/0x10 ? __pfx_mount_bdev+0x10/0x10 ? vfs_parse_fs_string+0x190/0x230 ? __pfx_vfs_parse_fs_string+0x10/0x10 legacy_get_tree+0xf0/0x190 ? __pfx_ocfs2_mount+0x10/0x10 vfs_get_tree+0x92/0x2b0 do_new_mount+0x2be/0xb40 ? __pfx_do_new_mount+0x10/0x10 __se_sys_mount+0x2d6/0x3c0 ? __pfx___se_sys_mount+0x10/0x10 ? do_syscall_64+0x100/0x230 ? __x64_sys_mount+0x20/0xc0 do_syscall_64+0xf3/0x230 entry_SYSCALL_64_after_hwframe+0x77/0x7f RIP: 0033:0x7f37cae96fda Code: 48 8b 0d 51 ce 0c 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 49 89 ca b8 a5 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 1e ce 0c 00 f7 d8 64 89 01 48 RSP: 002b:00007fff6c1aa228 EFLAGS: 00000206 ORIG_RAX: 00000000000000a5 RAX: ffffffffffffffda RBX: 00007fff6c1aa240 RCX: 00007f37cae96fda RDX: 00000000200002c0 RSI: 0000000020000040 RDI: 00007fff6c1aa240 RBP: 0000000000000004 R08: 00007fff6c1aa280 R09: 0000000000000000 R10: 00000000000008c0 R11: 0000000000000206 R12: 00000000000008c0 R13: 00007fff6c1aa280 R14: 0000000000000003 R15: 0000000001000000 For a really damaged superblock, the value of 'i_super.s_blocksize_bits' may exceed the maximum possible shift for an underlying 'int'. So add an extra check whether the aforementioned field represents the valid block size, which is 512 bytes, 1K, 2K, or 4K. Link: https://lkml.kernel.org/r/20241106092100.2661330-1-dmantipov@yandex.ru Fixes: ccd979bdbce9 ("[PATCH] OCFS2: The Second Oracle Cluster Filesystem") Signed-off-by: Dmitry Antipov Reported-by: syzbot+56f7cd1abe4b8e475180@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=56f7cd1abe4b8e475180 Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Cc: Jun Piao Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/ocfs2/super.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 988d1c076861..b39eff78ca48 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -2321,6 +2321,7 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, struct ocfs2_blockcheck_stats *stats) { int status = -EAGAIN; + u32 blksz_bits; if (memcmp(di->i_signature, OCFS2_SUPER_BLOCK_SIGNATURE, strlen(OCFS2_SUPER_BLOCK_SIGNATURE)) == 0) { @@ -2335,11 +2336,15 @@ static int ocfs2_verify_volume(struct ocfs2_dinode *di, goto out; } status = -EINVAL; - if ((1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits)) != blksz) { + /* Acceptable block sizes are 512 bytes, 1K, 2K and 4K. */ + blksz_bits = le32_to_cpu(di->id2.i_super.s_blocksize_bits); + if (blksz_bits < 9 || blksz_bits > 12) { mlog(ML_ERROR, "found superblock with incorrect block " - "size: found %u, should be %u\n", - 1 << le32_to_cpu(di->id2.i_super.s_blocksize_bits), - blksz); + "size bits: found %u, should be 9, 10, 11, or 12\n", + blksz_bits); + } else if ((1 << le32_to_cpu(blksz_bits)) != blksz) { + mlog(ML_ERROR, "found superblock with incorrect block " + "size: found %u, should be %u\n", 1 << blksz_bits, blksz); } else if (le16_to_cpu(di->id2.i_super.s_major_rev_level) != OCFS2_MAJOR_REV_LEVEL || le16_to_cpu(di->id2.i_super.s_minor_rev_level) != From 86b19031dbc79abc378dfae357f6ea33ebeb0c95 Mon Sep 17 00:00:00 2001 From: Ryusuke Konishi Date: Thu, 7 Nov 2024 01:07:33 +0900 Subject: [PATCH 29/76] nilfs2: fix null-ptr-deref in block_dirty_buffer tracepoint commit 2026559a6c4ce34db117d2db8f710fe2a9420d5a upstream. When using the "block:block_dirty_buffer" tracepoint, mark_buffer_dirty() may cause a NULL pointer dereference, or a general protection fault when KASAN is enabled. This happens because, since the tracepoint was added in mark_buffer_dirty(), it references the dev_t member bh->b_bdev->bd_dev regardless of whether the buffer head has a pointer to a block_device structure. In the current implementation, nilfs_grab_buffer(), which grabs a buffer to read (or create) a block of metadata, including b-tree node blocks, does not set the block device, but instead does so only if the buffer is not in the "uptodate" state for each of its caller block reading functions. However, if the uptodate flag is set on a folio/page, and the buffer heads are detached from it by try_to_free_buffers(), and new buffer heads are then attached by create_empty_buffers(), the uptodate flag may be restored to each buffer without the block device being set to bh->b_bdev, and mark_buffer_dirty() may be called later in that state, resulting in the bug mentioned above. Fix this issue by making nilfs_grab_buffer() always set the block device of the super block structure to the buffer head, regardless of the state of the buffer's uptodate flag. Link: https://lkml.kernel.org/r/20241106160811.3316-3-konishi.ryusuke@gmail.com Fixes: 5305cb830834 ("block: add block_{touch|dirty}_buffer tracepoint") Signed-off-by: Ryusuke Konishi Cc: Tejun Heo Cc: Ubisectech Sirius Cc: Signed-off-by: Andrew Morton Signed-off-by: Greg Kroah-Hartman --- fs/nilfs2/btnode.c | 2 -- fs/nilfs2/gcinode.c | 4 +--- fs/nilfs2/mdt.c | 1 - fs/nilfs2/page.c | 1 + 4 files changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c index 19ed9015bd66..13d943df871d 100644 --- a/fs/nilfs2/btnode.c +++ b/fs/nilfs2/btnode.c @@ -68,7 +68,6 @@ nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr) goto failed; } memset(bh->b_data, 0, i_blocksize(inode)); - bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = blocknr; set_buffer_mapped(bh); set_buffer_uptodate(bh); @@ -133,7 +132,6 @@ int nilfs_btnode_submit_block(struct address_space *btnc, __u64 blocknr, goto found; } set_buffer_mapped(bh); - bh->b_bdev = inode->i_sb->s_bdev; bh->b_blocknr = pblocknr; /* set block address for read */ bh->b_end_io = end_buffer_read_sync; get_bh(bh); diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c index fcd13da5d012..2f612288ea45 100644 --- a/fs/nilfs2/gcinode.c +++ b/fs/nilfs2/gcinode.c @@ -83,10 +83,8 @@ int nilfs_gccache_submit_read_data(struct inode *inode, sector_t blkoff, goto out; } - if (!buffer_mapped(bh)) { - bh->b_bdev = inode->i_sb->s_bdev; + if (!buffer_mapped(bh)) set_buffer_mapped(bh); - } bh->b_blocknr = pbn; bh->b_end_io = end_buffer_read_sync; get_bh(bh); diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index cbf4fa60eea2..d0808953296a 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -89,7 +89,6 @@ static int nilfs_mdt_create_block(struct inode *inode, unsigned long block, if (buffer_uptodate(bh)) goto failed_bh; - bh->b_bdev = sb->s_bdev; err = nilfs_mdt_insert_new_block(inode, block, bh, init_block); if (likely(!err)) { get_bh(bh); diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c index 0854b786a513..b4e2192e8796 100644 --- a/fs/nilfs2/page.c +++ b/fs/nilfs2/page.c @@ -63,6 +63,7 @@ struct buffer_head *nilfs_grab_buffer(struct inode *inode, put_page(page); return NULL; } + bh->b_bdev = inode->i_sb->s_bdev; return bh; } From 8f9416147d7ed414109d3501f1cb3d7a1735b25a Mon Sep 17 00:00:00 2001 From: Aurelien Jarno Date: Sun, 10 Nov 2024 12:46:36 +0100 Subject: [PATCH 30/76] Revert "mmc: dw_mmc: Fix IDMAC operation with pages bigger than 4K" commit 1635e407a4a64d08a8517ac59ca14ad4fc785e75 upstream. The commit 8396c793ffdf ("mmc: dw_mmc: Fix IDMAC operation with pages bigger than 4K") increased the max_req_size, even for 4K pages, causing various issues: - Panic booting the kernel/rootfs from an SD card on Rockchip RK3566 - Panic booting the kernel/rootfs from an SD card on StarFive JH7100 - "swiotlb buffer is full" and data corruption on StarFive JH7110 At this stage no fix have been found, so it's probably better to just revert the change. This reverts commit 8396c793ffdf28bb8aee7cfe0891080f8cab7890. Cc: stable@vger.kernel.org Cc: Sam Protsenko Fixes: 8396c793ffdf ("mmc: dw_mmc: Fix IDMAC operation with pages bigger than 4K") Closes: https://lore.kernel.org/linux-mmc/614692b4-1dbe-31b8-a34d-cb6db1909bb7@w6rz.net/ Closes: https://lore.kernel.org/linux-mmc/CAC8uq=Ppnmv98mpa1CrWLawWoPnu5abtU69v-=G-P7ysATQ2Pw@mail.gmail.com/ Signed-off-by: Aurelien Jarno Message-ID: <20241110114700.622372-1-aurelien@aurel32.net> Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/dw_mmc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 121e833efe28..d0da4573b38c 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -2952,8 +2952,8 @@ static int dw_mci_init_slot(struct dw_mci *host) if (host->use_dma == TRANS_MODE_IDMAC) { mmc->max_segs = host->ring_size; mmc->max_blk_size = 65535; - mmc->max_req_size = DW_MCI_DESC_DATA_LENGTH * host->ring_size; - mmc->max_seg_size = mmc->max_req_size; + mmc->max_seg_size = 0x1000; + mmc->max_req_size = mmc->max_seg_size * host->ring_size; mmc->max_blk_count = mmc->max_req_size / 512; } else if (host->use_dma == TRANS_MODE_EDMAC) { mmc->max_segs = 64; From db6dfee39cfb48600ef6aaebabc42044fdd3def2 Mon Sep 17 00:00:00 2001 From: Andre Przywara Date: Thu, 7 Nov 2024 01:42:40 +0000 Subject: [PATCH 31/76] mmc: sunxi-mmc: Fix A100 compatible description commit 85b580afc2c215394e08974bf033de9face94955 upstream. It turns out that the Allwinner A100/A133 SoC only supports 8K DMA blocks (13 bits wide), for both the SD/SDIO and eMMC instances. And while this alone would make a trivial fix, the H616 falls back to the A100 compatible string, so we have to now match the H616 compatible string explicitly against the description advertising 64K DMA blocks. As the A100 is now compatible with the D1 description, let the A100 compatible string point to that block instead, and introduce an explicit match against the H616 string, pointing to the old description. Also remove the redundant setting of clk_delays to NULL on the way. Fixes: 3536b82e5853 ("mmc: sunxi: add support for A100 mmc controller") Cc: stable@vger.kernel.org Signed-off-by: Andre Przywara Tested-by: Parthiban Nallathambi Reviewed-by: Chen-Yu Tsai Message-ID: <20241107014240.24669-1-andre.przywara@arm.com> Signed-off-by: Ulf Hansson Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/host/sunxi-mmc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index 69dcb8805e05..e7f3240a53d8 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1191,10 +1191,9 @@ static const struct sunxi_mmc_cfg sun50i_a64_emmc_cfg = { .needs_new_timings = true, }; -static const struct sunxi_mmc_cfg sun50i_a100_cfg = { +static const struct sunxi_mmc_cfg sun50i_h616_cfg = { .idma_des_size_bits = 16, .idma_des_shift = 2, - .clk_delays = NULL, .can_calibrate = true, .mask_data0 = true, .needs_new_timings = true, @@ -1217,8 +1216,9 @@ static const struct of_device_id sunxi_mmc_of_match[] = { { .compatible = "allwinner,sun20i-d1-mmc", .data = &sun20i_d1_cfg }, { .compatible = "allwinner,sun50i-a64-mmc", .data = &sun50i_a64_cfg }, { .compatible = "allwinner,sun50i-a64-emmc", .data = &sun50i_a64_emmc_cfg }, - { .compatible = "allwinner,sun50i-a100-mmc", .data = &sun50i_a100_cfg }, + { .compatible = "allwinner,sun50i-a100-mmc", .data = &sun20i_d1_cfg }, { .compatible = "allwinner,sun50i-a100-emmc", .data = &sun50i_a100_emmc_cfg }, + { .compatible = "allwinner,sun50i-h616-mmc", .data = &sun50i_h616_cfg }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, sunxi_mmc_of_match); From 74d8e0efa065756be311a836a714593339f1114b Mon Sep 17 00:00:00 2001 From: Francesco Dolcini Date: Thu, 26 Sep 2024 16:12:46 +0200 Subject: [PATCH 32/76] drm/bridge: tc358768: Fix DSI command tx commit 32c4514455b2b8fde506f8c0962f15c7e4c26f1d upstream. Wait for the command transmission to be completed in the DSI transfer function polling for the dc_start bit to go back to idle state after the transmission is started. This is documented in the datasheet and failures to do so lead to commands corruption. Fixes: ff1ca6397b1d ("drm/bridge: Add tc358768 driver") Cc: stable@vger.kernel.org Signed-off-by: Francesco Dolcini Reviewed-by: Neil Armstrong Link: https://lore.kernel.org/r/20240926141246.48282-1-francesco@dolcini.it Signed-off-by: Neil Armstrong Link: https://patchwork.freedesktop.org/patch/msgid/20240926141246.48282-1-francesco@dolcini.it Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/bridge/tc358768.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/bridge/tc358768.c b/drivers/gpu/drm/bridge/tc358768.c index aabdb5c74d93..ed8094b2a7ff 100644 --- a/drivers/gpu/drm/bridge/tc358768.c +++ b/drivers/gpu/drm/bridge/tc358768.c @@ -126,6 +126,9 @@ #define TC358768_DSI_CONFW_MODE_CLR (6 << 29) #define TC358768_DSI_CONFW_ADDR_DSI_CONTROL (0x3 << 24) +/* TC358768_DSICMD_TX (0x0600) register */ +#define TC358768_DSI_CMDTX_DC_START BIT(0) + static const char * const tc358768_supplies[] = { "vddc", "vddmipi", "vddio" }; @@ -230,6 +233,21 @@ static void tc358768_update_bits(struct tc358768_priv *priv, u32 reg, u32 mask, tc358768_write(priv, reg, tmp); } +static void tc358768_dsicmd_tx(struct tc358768_priv *priv) +{ + u32 val; + + /* start transfer */ + tc358768_write(priv, TC358768_DSICMD_TX, TC358768_DSI_CMDTX_DC_START); + if (priv->error) + return; + + /* wait transfer completion */ + priv->error = regmap_read_poll_timeout(priv->regmap, TC358768_DSICMD_TX, val, + (val & TC358768_DSI_CMDTX_DC_START) == 0, + 100, 100000); +} + static int tc358768_sw_reset(struct tc358768_priv *priv) { /* Assert Reset */ @@ -517,8 +535,7 @@ static ssize_t tc358768_dsi_host_transfer(struct mipi_dsi_host *host, } } - /* start transfer */ - tc358768_write(priv, TC358768_DSICMD_TX, 1); + tc358768_dsicmd_tx(priv); ret = tc358768_clear_error(priv); if (ret) From d61a1ccd79b97fb827e5823bfe13e4a248bb8ff3 Mon Sep 17 00:00:00 2001 From: Vijendar Mukunda Date: Tue, 12 Nov 2024 10:11:42 -0600 Subject: [PATCH 33/76] drm/amd: Fix initialization mistake for NBIO 7.7.0 commit 7013a8268d311fded6c7a6528fc1de82668e75f6 upstream. There is a strapping issue on NBIO 7.7.0 that can lead to spurious PME events while in the D0 state. Co-developed-by: Mario Limonciello Signed-off-by: Vijendar Mukunda Signed-off-by: Mario Limonciello Acked-by: Alex Deucher Link: https://lore.kernel.org/r/20241112161142.28974-1-mario.limonciello@amd.com Signed-off-by: Alex Deucher (cherry picked from commit 447a54a0f79c9a409ceaa17804bdd2e0206397b9) Cc: stable@vger.kernel.org Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c index def89379b51a..d23e7391c6f2 100644 --- a/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c +++ b/drivers/gpu/drm/amd/amdgpu/nbio_v7_7.c @@ -247,6 +247,12 @@ static void nbio_v7_7_init_registers(struct amdgpu_device *adev) if (def != data) WREG32_SOC15(NBIO, 0, regBIF0_PCIE_MST_CTRL_3, data); + switch (adev->ip_versions[NBIO_HWIP][0]) { + case IP_VERSION(7, 7, 0): + data = RREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4) & ~BIT(23); + WREG32_SOC15(NBIO, 0, regRCC_DEV0_EPF5_STRAP4, data); + break; + } } static void nbio_v7_7_update_medium_grain_clock_gating(struct amdgpu_device *adev, From c26cff492697cde3c6687a4b62f078170b51e59a Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 21 Jun 2024 15:19:53 +0200 Subject: [PATCH 34/76] staging: vchiq_arm: Get the rid off struct vchiq_2835_state [ Upstream commit 4e2766102da632f26341d5539519b0abf73df887 ] The whole benefit of this encapsulating struct is questionable. It just stores a flag to signalize the init state of vchiq_arm_state. Beside the fact this flag is set too soon, the access to uninitialized members should be avoided. So initialize vchiq_arm_state properly before assign it directly to vchiq_state. Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/20240621131958.98208-6-wahrenst@gmx.net Signed-off-by: Greg Kroah-Hartman Stable-dep-of: 404b739e8955 ("staging: vchiq_arm: Use devm_kzalloc() for vchiq_arm_state allocation") Signed-off-by: Sasha Levin --- .../interface/vchiq_arm/vchiq_arm.c | 25 +++++-------------- 1 file changed, 6 insertions(+), 19 deletions(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index 705c5e283c27..bb1342223ad0 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -115,11 +115,6 @@ struct vchiq_arm_state { int first_connect; }; -struct vchiq_2835_state { - int inited; - struct vchiq_arm_state arm_state; -}; - struct vchiq_pagelist_info { struct pagelist *pagelist; size_t pagelist_buffer_size; @@ -574,29 +569,21 @@ vchiq_arm_init_state(struct vchiq_state *state, int vchiq_platform_init_state(struct vchiq_state *state) { - struct vchiq_2835_state *platform_state; + struct vchiq_arm_state *platform_state; - state->platform_state = kzalloc(sizeof(*platform_state), GFP_KERNEL); - if (!state->platform_state) + platform_state = kzalloc(sizeof(*platform_state), GFP_KERNEL); + if (!platform_state) return -ENOMEM; - platform_state = (struct vchiq_2835_state *)state->platform_state; - - platform_state->inited = 1; - vchiq_arm_init_state(state, &platform_state->arm_state); + vchiq_arm_init_state(state, platform_state); + state->platform_state = (struct opaque_platform_state *)platform_state; return 0; } static struct vchiq_arm_state *vchiq_platform_get_arm_state(struct vchiq_state *state) { - struct vchiq_2835_state *platform_state; - - platform_state = (struct vchiq_2835_state *)state->platform_state; - - WARN_ON_ONCE(!platform_state->inited); - - return &platform_state->arm_state; + return (struct vchiq_arm_state *)state->platform_state; } void From 0708d3197f693164e5d4ecd9073cfca91a8c6856 Mon Sep 17 00:00:00 2001 From: Umang Jain Date: Wed, 16 Oct 2024 18:32:24 +0530 Subject: [PATCH 35/76] staging: vchiq_arm: Use devm_kzalloc() for vchiq_arm_state allocation [ Upstream commit 404b739e895522838f1abdc340c554654d671dde ] The struct vchiq_arm_state 'platform_state' is currently allocated dynamically using kzalloc(). Unfortunately, it is never freed and is subjected to memory leaks in the error handling paths of the probe() function. To address the issue, use device resource management helper devm_kzalloc(), to ensure cleanup after its allocation. Fixes: 71bad7f08641 ("staging: add bcm2708 vchiq driver") Cc: stable@vger.kernel.org Signed-off-by: Umang Jain Reviewed-by: Dan Carpenter Link: https://lore.kernel.org/r/20241016130225.61024-2-umang.jain@ideasonboard.com Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin --- drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index bb1342223ad0..456a9508fb91 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -571,7 +571,7 @@ vchiq_platform_init_state(struct vchiq_state *state) { struct vchiq_arm_state *platform_state; - platform_state = kzalloc(sizeof(*platform_state), GFP_KERNEL); + platform_state = devm_kzalloc(state->dev, sizeof(*platform_state), GFP_KERNEL); if (!platform_state) return -ENOMEM; From d1ac7e2620302e3e49573df39bd4e868e8b4962a Mon Sep 17 00:00:00 2001 From: Konstantin Komarov Date: Mon, 18 Nov 2024 10:26:50 +0800 Subject: [PATCH 36/76] fs/ntfs3: Additional check in ntfs_file_release [ Upstream commit 031d6f608290c847ba6378322d0986d08d1a645a ] Reported-by: syzbot+8c652f14a0fde76ff11d@syzkaller.appspotmail.com Signed-off-by: Konstantin Komarov Signed-off-by: Bin Lan Signed-off-by: Sasha Levin --- fs/ntfs3/file.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index aedd4f5f459e..70b38465aee3 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -1214,8 +1214,16 @@ static int ntfs_file_release(struct inode *inode, struct file *file) int err = 0; /* If we are last writer on the inode, drop the block reservation. */ - if (sbi->options->prealloc && ((file->f_mode & FMODE_WRITE) && - atomic_read(&inode->i_writecount) == 1)) { + if (sbi->options->prealloc && + ((file->f_mode & FMODE_WRITE) && + atomic_read(&inode->i_writecount) == 1) + /* + * The only file when inode->i_fop = &ntfs_file_operations and + * init_rwsem(&ni->file.run_lock) is not called explicitly is MFT. + * + * Add additional check here. + */ + && inode->i_ino != MFT_REC_MFT) { ni_lock(ni); down_write(&ni->file.run_lock); From cec736e60dc18d91b88af28d96664bff284b02d1 Mon Sep 17 00:00:00 2001 From: Luiz Augusto von Dentz Date: Fri, 5 Apr 2024 15:56:50 -0400 Subject: [PATCH 37/76] Bluetooth: ISO: Fix not validating setsockopt user input commit 9e8742cdfc4b0e65266bb4a901a19462bda9285e upstream. Check user input length before copying data. Fixes: ccf74f2390d6 ("Bluetooth: Add BTPROTO_ISO socket type") Fixes: 0731c5ab4d51 ("Bluetooth: ISO: Add support for BT_PKT_STATUS") Fixes: f764a6c2c1e4 ("Bluetooth: ISO: Add broadcast support") Signed-off-by: Eric Dumazet Signed-off-by: Luiz Augusto von Dentz [Xiangyu: Bp to fix CVE: CVE-2024-35964 resolved minor conflicts] Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/iso.c | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/net/bluetooth/iso.c b/net/bluetooth/iso.c index 27efca5dc7bb..ff15d5192768 100644 --- a/net/bluetooth/iso.c +++ b/net/bluetooth/iso.c @@ -1189,7 +1189,7 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, sockptr_t optval, unsigned int optlen) { struct sock *sk = sock->sk; - int len, err = 0; + int err = 0; struct bt_iso_qos qos; u32 opt; @@ -1204,10 +1204,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = bt_copy_from_sockptr(&opt, sizeof(opt), optval, optlen); + if (err) break; - } if (opt) set_bit(BT_SK_DEFER_SETUP, &bt_sk(sk)->flags); @@ -1222,18 +1221,9 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, break; } - len = min_t(unsigned int, sizeof(qos), optlen); - if (len != sizeof(qos)) { - err = -EINVAL; + err = bt_copy_from_sockptr(&qos, sizeof(qos), optval, optlen); + if (err) break; - } - - memset(&qos, 0, sizeof(qos)); - - if (copy_from_sockptr(&qos, optval, len)) { - err = -EFAULT; - break; - } if (!check_qos(&qos)) { err = -EINVAL; @@ -1252,18 +1242,16 @@ static int iso_sock_setsockopt(struct socket *sock, int level, int optname, } if (optlen > sizeof(iso_pi(sk)->base)) { - err = -EOVERFLOW; + err = -EINVAL; break; } - len = min_t(unsigned int, sizeof(iso_pi(sk)->base), optlen); - - if (copy_from_sockptr(iso_pi(sk)->base, optval, len)) { - err = -EFAULT; + err = bt_copy_from_sockptr(iso_pi(sk)->base, optlen, optval, + optlen); + if (err) break; - } - iso_pi(sk)->base_len = len; + iso_pi(sk)->base_len = optlen; break; From c7fa16e41acd789da6cc434b4d5b88eef57b99c1 Mon Sep 17 00:00:00 2001 From: Jiri Olsa Date: Mon, 4 Nov 2024 18:52:54 +0100 Subject: [PATCH 38/76] lib/buildid: Fix build ID parsing logic The parse_build_id_buf does not account Elf32_Nhdr header size when getting the build id data pointer and returns wrong build id data as result. This is problem only for stable trees that merged 84887f4c1c3a fix, the upstream build id code was refactored and returns proper build id. Acked-by: Andrii Nakryiko Fixes: 84887f4c1c3a ("lib/buildid: harden build ID parsing logic") Signed-off-by: Jiri Olsa Signed-off-by: Greg Kroah-Hartman --- lib/buildid.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/buildid.c b/lib/buildid.c index e41fb0ee405f..cc5da016b235 100644 --- a/lib/buildid.c +++ b/lib/buildid.c @@ -40,7 +40,7 @@ static int parse_build_id_buf(unsigned char *build_id, name_sz == note_name_sz && memcmp(nhdr + 1, note_name, note_name_sz) == 0 && desc_sz > 0 && desc_sz <= BUILD_ID_SIZE_MAX) { - data = note_start + note_off + ALIGN(note_name_sz, 4); + data = note_start + note_off + sizeof(Elf32_Nhdr) + ALIGN(note_name_sz, 4); memcpy(build_id, data, desc_sz); memset(build_id + desc_sz, 0, BUILD_ID_SIZE_MAX - desc_sz); if (size) From 769698a45f4fb2905f6920d93e8e71eaa4743ef2 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 15 Nov 2024 17:11:38 +0300 Subject: [PATCH 39/76] cxl/pci: fix error code in __cxl_hdm_decode_init() When commit 0cab68720598 ("cxl/pci: Fix disabling memory if DVSEC CXL Range does not match a CFMWS window") was backported, this chunk moved from the cxl_hdm_decode_init() function which returns negative error codes to the __cxl_hdm_decode_init() function which returns false on error. So the error code needs to be modified from -ENXIO to false. This issue only exits in the 6.1.y kernels. In later kernels negative error codes are correct and the driver didn't exist in earlier kernels. Fixes: 031217128990 ("cxl/pci: Fix disabling memory if DVSEC CXL Range does not match a CFMWS window") Signed-off-by: Dan Carpenter Reviewed-by: Ira Weiny Reviewed-by: Dave Jiang Signed-off-by: Greg Kroah-Hartman --- drivers/cxl/core/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cxl/core/pci.c b/drivers/cxl/core/pci.c index 8d92a24fd73d..97adf9a7ea89 100644 --- a/drivers/cxl/core/pci.c +++ b/drivers/cxl/core/pci.c @@ -377,7 +377,7 @@ static bool __cxl_hdm_decode_init(struct cxl_dev_state *cxlds, if (!allowed && info->mem_enabled) { dev_err(dev, "Range register decodes outside platform defined CXL ranges.\n"); - return -ENXIO; + return false; } /* From 94180edc356ea91fb22cbc9a17729356e45ce7d8 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Wed, 6 Nov 2024 21:50:55 +0100 Subject: [PATCH 40/76] media: dvbdev: fix the logic when DVB_DYNAMIC_MINORS is not set commit a4aebaf6e6efff548b01a3dc49b4b9074751c15b upstream. When CONFIG_DVB_DYNAMIC_MINORS, ret is not initialized, and a semaphore is left at the wrong state, in case of errors. Make the code simpler and avoid mistakes by having just one error check logic used weather DVB_DYNAMIC_MINORS is used or not. Reported-by: kernel test robot Reported-by: Dan Carpenter Closes: https://lore.kernel.org/r/202410201717.ULWWdJv8-lkp@intel.com/ Signed-off-by: Mauro Carvalho Chehab Link: https://lore.kernel.org/r/9e067488d8935b8cf00959764a1fa5de85d65725.1730926254.git.mchehab+huawei@kernel.org Cc: Nathan Chancellor Signed-off-by: Greg Kroah-Hartman --- drivers/media/dvb-core/dvbdev.c | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/drivers/media/dvb-core/dvbdev.c b/drivers/media/dvb-core/dvbdev.c index d1212acb7093..fc9a9ef93cae 100644 --- a/drivers/media/dvb-core/dvbdev.c +++ b/drivers/media/dvb-core/dvbdev.c @@ -534,6 +534,9 @@ int dvb_register_device(struct dvb_adapter *adap, struct dvb_device **pdvbdev, for (minor = 0; minor < MAX_DVB_MINORS; minor++) if (dvb_minors[minor] == NULL) break; +#else + minor = nums2minor(adap->num, type, id); +#endif if (minor >= MAX_DVB_MINORS) { if (new_node) { list_del (&new_node->list_head); @@ -547,17 +550,7 @@ int dvb_register_device(struct dvb_adapter *adap, struct dvb_device **pdvbdev, mutex_unlock(&dvbdev_register_lock); return -EINVAL; } -#else - minor = nums2minor(adap->num, type, id); - if (minor >= MAX_DVB_MINORS) { - dvb_media_device_free(dvbdev); - list_del(&dvbdev->list_head); - kfree(dvbdev); - *pdvbdev = NULL; - mutex_unlock(&dvbdev_register_lock); - return ret; - } -#endif + dvbdev->minor = minor; dvb_minors[minor] = dvb_device_get(dvbdev); up_write(&minor_rwsem); From a86db75c53144b199bfcc5c806452f68c60edb97 Mon Sep 17 00:00:00 2001 From: Dai Ngo Date: Mon, 18 Nov 2024 16:18:56 -0500 Subject: [PATCH 41/76] NFSD: initialize copy->cp_clp early in nfsd4_copy for use by trace point [ Upstream commit 15d1975b7279693d6f09398e0e2e31aca2310275 ] Prepare for adding server copy trace points. Signed-off-by: Dai Ngo Tested-by: Chen Hanxiao Stable-dep-of: 9ed666eba4e0 ("NFSD: Async COPY result needs to return a write verifier") Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index df9dbd93663e..50f17cee8bcf 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1768,6 +1768,7 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, __be32 status; struct nfsd4_copy *async_copy = NULL; + copy->cp_clp = cstate->clp; if (nfsd4_ssc_is_inter(copy)) { if (!inter_copy_offload_enable || nfsd4_copy_is_sync(copy)) { status = nfserr_notsupp; @@ -1782,7 +1783,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } - copy->cp_clp = cstate->clp; memcpy(©->fh, &cstate->current_fh.fh_handle, sizeof(struct knfsd_fh)); if (nfsd4_copy_is_async(copy)) { From 2fd3f2f2538df18fc7b803cd0092d2bd65bf6084 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Nov 2024 16:18:57 -0500 Subject: [PATCH 42/76] NFSD: Async COPY result needs to return a write verifier [ Upstream commit 9ed666eba4e0a2bb8ffaa3739d830b64d4f2aaad ] Currently, when NFSD handles an asynchronous COPY, it returns a zero write verifier, relying on the subsequent CB_OFFLOAD callback to pass the write verifier and a stable_how4 value to the client. However, if the CB_OFFLOAD never arrives at the client (for example, if a network partition occurs just as the server sends the CB_OFFLOAD operation), the client will never receive this verifier. Thus, if the client sends a follow-up COMMIT, there is no way for the client to assess the COMMIT result. The usual recovery for a missing CB_OFFLOAD is for the client to send an OFFLOAD_STATUS operation, but that operation does not carry a write verifier in its result. Neither does it carry a stable_how4 value, so the client /must/ send a COMMIT in this case -- which will always fail because currently there's still no write verifier in the COPY result. Thus the server needs to return a normal write verifier in its COPY result even if the COPY operation is to be performed asynchronously. If the server recognizes the callback stateid in subsequent OFFLOAD_STATUS operations, then obviously it has not restarted, and the write verifier the client received in the COPY result is still valid and can be used to assess a COMMIT of the copied data, if one is needed. Reviewed-by: Jeff Layton [ cel: adjusted to apply to origin/linux-6.1.y ] Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4proc.c | 25 +++++++++---------------- 1 file changed, 9 insertions(+), 16 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 50f17cee8bcf..b8fb37cc59c4 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -717,15 +717,6 @@ nfsd4_access(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, &access->ac_supported); } -static void gen_boot_verifier(nfs4_verifier *verifier, struct net *net) -{ - __be32 *verf = (__be32 *)verifier->data; - - BUILD_BUG_ON(2*sizeof(*verf) != sizeof(verifier->data)); - - nfsd_copy_write_verifier(verf, net_generic(net, nfsd_net_id)); -} - static __be32 nfsd4_commit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -1593,7 +1584,6 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync) test_bit(NFSD4_COPY_F_COMMITTED, ©->cp_flags) ? NFS_FILE_SYNC : NFS_UNSTABLE; nfsd4_copy_set_sync(copy, sync); - gen_boot_verifier(©->cp_res.wr_verifier, copy->cp_clp->net); } static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy, @@ -1764,9 +1754,14 @@ static __be32 nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { - struct nfsd4_copy *copy = &u->copy; - __be32 status; + struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct nfsd4_copy *async_copy = NULL; + struct nfsd4_copy *copy = &u->copy; + struct nfsd42_write_res *result; + __be32 status; + + result = ©->cp_res; + nfsd_copy_write_verifier((__be32 *)&result->wr_verifier.data, nn); copy->cp_clp = cstate->clp; if (nfsd4_ssc_is_inter(copy)) { @@ -1786,8 +1781,6 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(©->fh, &cstate->current_fh.fh_handle, sizeof(struct knfsd_fh)); if (nfsd4_copy_is_async(copy)) { - struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); - status = nfserrno(-ENOMEM); async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); if (!async_copy) @@ -1799,8 +1792,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, goto out_err; if (!nfs4_init_copy_state(nn, copy)) goto out_err; - memcpy(©->cp_res.cb_stateid, ©->cp_stateid.cs_stid, - sizeof(copy->cp_res.cb_stateid)); + memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, + sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); From 7ea9260874b779637aff6d24c344b8ef4ac862a0 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Nov 2024 16:18:58 -0500 Subject: [PATCH 43/76] NFSD: Limit the number of concurrent async COPY operations [ Upstream commit aadc3bbea163b6caaaebfdd2b6c4667fbc726752 ] Nothing appears to limit the number of concurrent async COPY operations that clients can start. In addition, AFAICT each async COPY can copy an unlimited number of 4MB chunks, so can run for a long time. Thus IMO async COPY can become a DoS vector. Add a restriction mechanism that bounds the number of concurrent background COPY operations. Start simple and try to be fair -- this patch implements a per-namespace limit. An async COPY request that occurs while this limit is exceeded gets NFS4ERR_DELAY. The requesting client can choose to send the request again after a delay or fall back to a traditional read/write style copy. If there is need to make the mechanism more sophisticated, we can visit that in future patches. Cc: stable@vger.kernel.org Reviewed-by: Jeff Layton Link: https://nvd.nist.gov/vuln/detail/CVE-2024-49974 Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/netns.h | 1 + fs/nfsd/nfs4proc.c | 11 +++++++++-- fs/nfsd/nfs4state.c | 1 + fs/nfsd/xdr4.h | 1 + 4 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h index 548422b24a7d..41c750f34473 100644 --- a/fs/nfsd/netns.h +++ b/fs/nfsd/netns.h @@ -152,6 +152,7 @@ struct nfsd_net { u32 s2s_cp_cl_id; struct idr s2s_cp_stateids; spinlock_t s2s_cp_lock; + atomic_t pending_async_copies; /* * Version information diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index b8fb37cc59c4..6637a7ef974b 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1243,6 +1243,7 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) { if (!refcount_dec_and_test(©->refcount)) return; + atomic_dec(©->cp_nn->pending_async_copies); kfree(copy->cp_src); kfree(copy); } @@ -1781,10 +1782,16 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(©->fh, &cstate->current_fh.fh_handle, sizeof(struct knfsd_fh)); if (nfsd4_copy_is_async(copy)) { - status = nfserrno(-ENOMEM); async_copy = kzalloc(sizeof(struct nfsd4_copy), GFP_KERNEL); if (!async_copy) goto out_err; + async_copy->cp_nn = nn; + /* Arbitrary cap on number of pending async copy operations */ + if (atomic_inc_return(&nn->pending_async_copies) > + (int)rqstp->rq_pool->sp_nrthreads) { + atomic_dec(&nn->pending_async_copies); + goto out_err; + } INIT_LIST_HEAD(&async_copy->copies); refcount_set(&async_copy->refcount, 1); async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); @@ -1823,7 +1830,7 @@ out_err: } if (async_copy) cleanup_async_copy(async_copy); - status = nfserrno(-ENOMEM); + status = nfserr_jukebox; goto out; } diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 893d099e0099..e195012db48e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -8076,6 +8076,7 @@ static int nfs4_state_create_net(struct net *net) spin_lock_init(&nn->client_lock); spin_lock_init(&nn->s2s_cp_lock); idr_init(&nn->s2s_cp_stateids); + atomic_set(&nn->pending_async_copies, 0); spin_lock_init(&nn->blocked_locks_lock); INIT_LIST_HEAD(&nn->blocked_locks_lru); diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 510978e602da..9bd1ade6ba54 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -574,6 +574,7 @@ struct nfsd4_copy { struct nfsd4_ssc_umount_item *ss_nsui; struct nfs_fh c_fh; nfs4_stateid stateid; + struct nfsd_net *cp_nn; }; static inline void nfsd4_copy_set_sync(struct nfsd4_copy *copy, bool sync) From c3074003fa6837c2b89a34d8d12d9463b59d22d6 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Nov 2024 16:18:59 -0500 Subject: [PATCH 44/76] NFSD: Initialize struct nfsd4_copy earlier [ Upstream commit 63fab04cbd0f96191b6e5beedc3b643b01c15889 ] Ensure the refcount and async_copies fields are initialized early. cleanup_async_copy() will reference these fields if an error occurs in nfsd4_copy(). If they are not correctly initialized, at the very least, a refcount underflow occurs. Reported-by: Olga Kornievskaia Fixes: aadc3bbea163 ("NFSD: Limit the number of concurrent async COPY operations") Reviewed-by: Jeff Layton Tested-by: Olga Kornievskaia Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4proc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6637a7ef974b..c0f13989bd24 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1786,14 +1786,14 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if (!async_copy) goto out_err; async_copy->cp_nn = nn; + INIT_LIST_HEAD(&async_copy->copies); + refcount_set(&async_copy->refcount, 1); /* Arbitrary cap on number of pending async copy operations */ if (atomic_inc_return(&nn->pending_async_copies) > (int)rqstp->rq_pool->sp_nrthreads) { atomic_dec(&nn->pending_async_copies); goto out_err; } - INIT_LIST_HEAD(&async_copy->copies); - refcount_set(&async_copy->refcount, 1); async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) goto out_err; From 949ee5d44d1fd95119b29b3382a933cdc617bf9e Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 18 Nov 2024 16:19:00 -0500 Subject: [PATCH 45/76] NFSD: Never decrement pending_async_copies on error [ Upstream commit 8286f8b622990194207df9ab852e0f87c60d35e9 ] The error flow in nfsd4_copy() calls cleanup_async_copy(), which already decrements nn->pending_async_copies. Reported-by: Olga Kornievskaia Fixes: aadc3bbea163 ("NFSD: Limit the number of concurrent async COPY operations") Signed-off-by: Chuck Lever Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4proc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index c0f13989bd24..0aebb2dc5776 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1790,10 +1790,8 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, refcount_set(&async_copy->refcount, 1); /* Arbitrary cap on number of pending async copy operations */ if (atomic_inc_return(&nn->pending_async_copies) > - (int)rqstp->rq_pool->sp_nrthreads) { - atomic_dec(&nn->pending_async_copies); + (int)rqstp->rq_pool->sp_nrthreads) goto out_err; - } async_copy->cp_src = kmalloc(sizeof(*async_copy->cp_src), GFP_KERNEL); if (!async_copy->cp_src) goto out_err; From 24995851d58c4a205ad0ffa7b2f21e479a9c8527 Mon Sep 17 00:00:00 2001 From: Paolo Abeni Date: Tue, 19 Nov 2024 09:35:49 +0100 Subject: [PATCH 46/76] mptcp: cope racing subflow creation in mptcp_rcv_space_adjust commit ce7356ae35943cc6494cc692e62d51a734062b7d upstream. Additional active subflows - i.e. created by the in kernel path manager - are included into the subflow list before starting the 3whs. A racing recvmsg() spooling data received on an already established subflow would unconditionally call tcp_cleanup_rbuf() on all the current subflows, potentially hitting a divide by zero error on the newly created ones. Explicitly check that the subflow is in a suitable state before invoking tcp_cleanup_rbuf(). Fixes: c76c6956566f ("mptcp: call tcp_cleanup_rbuf on subflows") Signed-off-by: Paolo Abeni Reviewed-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/02374660836e1b52afc91966b7535c8c5f7bafb0.1731060874.git.pabeni@redhat.com Signed-off-by: Jakub Kicinski [ Conflicts in protocol.c, because commit f410cbea9f3d ("tcp: annotate data-races around tp->window_clamp") has not been backported to this version. The conflict is easy to resolve, because only the context is different, but not the line to modify. ] Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/protocol.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 78ac5c538e13..1acd4e37a0ea 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -2057,7 +2057,8 @@ static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied) slow = lock_sock_fast(ssk); WRITE_ONCE(ssk->sk_rcvbuf, rcvbuf); tcp_sk(ssk)->window_clamp = window_clamp; - tcp_cleanup_rbuf(ssk, 1); + if (tcp_can_send_ack(ssk)) + tcp_cleanup_rbuf(ssk, 1); unlock_sock_fast(ssk, slow); } } From 71603aa07ab76de6e15a90fcb798329b7e976ceb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 19 Nov 2024 09:35:50 +0100 Subject: [PATCH 47/76] mptcp: define more local variables sk commit 14cb0e0bf39bd10429ba14e9e2f905f1144226fc upstream. '(struct sock *)msk' is used several times in mptcp_nl_cmd_announce(), mptcp_nl_cmd_remove() or mptcp_userspace_pm_set_flags() in pm_userspace.c, it's worth adding a local variable sk to point it. Reviewed-by: Matthieu Baerts Signed-off-by: Geliang Tang Signed-off-by: Mat Martineau Link: https://lore.kernel.org/r/20231025-send-net-next-20231025-v1-8-db8f25f798eb@kernel.org Signed-off-by: Jakub Kicinski Stable-dep-of: 06afe09091ee ("mptcp: add userspace_pm_lookup_addr_by_id helper") Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_userspace.c | 31 ++++++++++++++++++++----------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 748e3876ec6d..530f414e57d6 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -185,6 +185,7 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) struct mptcp_pm_addr_entry addr_val; struct mptcp_sock *msk; int err = -EINVAL; + struct sock *sk; u32 token_val; if (!addr || !token) { @@ -200,6 +201,8 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) return err; } + sk = (struct sock *)msk; + if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto announce_err; @@ -223,7 +226,7 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) goto announce_err; } - lock_sock((struct sock *)msk); + lock_sock(sk); spin_lock_bh(&msk->pm.lock); if (mptcp_pm_alloc_anno_list(msk, &addr_val.addr)) { @@ -233,11 +236,11 @@ int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info) } spin_unlock_bh(&msk->pm.lock); - release_sock((struct sock *)msk); + release_sock(sk); err = 0; announce_err: - sock_put((struct sock *)msk); + sock_put(sk); return err; } @@ -284,6 +287,7 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) struct mptcp_sock *msk; LIST_HEAD(free_list); int err = -EINVAL; + struct sock *sk; u32 token_val; u8 id_val; @@ -301,6 +305,8 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) return err; } + sk = (struct sock *)msk; + if (!mptcp_pm_is_userspace(msk)) { GENL_SET_ERR_MSG(info, "invalid request; userspace PM not selected"); goto remove_err; @@ -311,7 +317,7 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) goto remove_err; } - lock_sock((struct sock *)msk); + lock_sock(sk); list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { if (entry->addr.id == id_val) { @@ -322,7 +328,7 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); - release_sock((struct sock *)msk); + release_sock(sk); goto remove_err; } @@ -330,15 +336,15 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) mptcp_pm_remove_addrs(msk, &free_list); - release_sock((struct sock *)msk); + release_sock(sk); list_for_each_entry_safe(match, entry, &free_list, list) { - sock_kfree_s((struct sock *)msk, match, sizeof(*match)); + sock_kfree_s(sk, match, sizeof(*match)); } err = 0; remove_err: - sock_put((struct sock *)msk); + sock_put(sk); return err; } @@ -560,6 +566,7 @@ int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, { struct mptcp_sock *msk; int ret = -EINVAL; + struct sock *sk; u32 token_val; token_val = nla_get_u32(token); @@ -568,6 +575,8 @@ int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, if (!msk) return ret; + sk = (struct sock *)msk; + if (!mptcp_pm_is_userspace(msk)) goto set_flags_err; @@ -575,11 +584,11 @@ int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, rem->addr.family == AF_UNSPEC) goto set_flags_err; - lock_sock((struct sock *)msk); + lock_sock(sk); ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup); - release_sock((struct sock *)msk); + release_sock(sk); set_flags_err: - sock_put((struct sock *)msk); + sock_put(sk); return ret; } From 199af46b93f6a4728f7f60247ae8b112a38fc42f Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 19 Nov 2024 09:35:51 +0100 Subject: [PATCH 48/76] mptcp: add userspace_pm_lookup_addr_by_id helper commit 06afe09091ee69dc7ab058b4be9917ae59cc81e5 upstream. Corresponding __lookup_addr_by_id() helper in the in-kernel netlink PM, this patch adds a new helper mptcp_userspace_pm_lookup_addr_by_id() to lookup the address entry with the given id on the userspace pm local address list. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Reviewed-by: Mat Martineau Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: David S. Miller Stable-dep-of: f642c5c4d528 ("mptcp: hold pm lock when deleting entry") Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_userspace.c | 31 ++++++++++++++++--------------- 1 file changed, 16 insertions(+), 15 deletions(-) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 530f414e57d6..ca3e452d4edb 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -106,22 +106,29 @@ static int mptcp_userspace_pm_delete_local_addr(struct mptcp_sock *msk, return -EINVAL; } +static struct mptcp_pm_addr_entry * +mptcp_userspace_pm_lookup_addr_by_id(struct mptcp_sock *msk, unsigned int id) +{ + struct mptcp_pm_addr_entry *entry; + + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (entry->addr.id == id) + return entry; + } + return NULL; +} + int mptcp_userspace_pm_get_flags_and_ifindex_by_id(struct mptcp_sock *msk, unsigned int id, u8 *flags, int *ifindex) { - struct mptcp_pm_addr_entry *entry, *match = NULL; + struct mptcp_pm_addr_entry *match; *flags = 0; *ifindex = 0; spin_lock_bh(&msk->pm.lock); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (id == entry->addr.id) { - match = entry; - break; - } - } + match = mptcp_userspace_pm_lookup_addr_by_id(msk, id); spin_unlock_bh(&msk->pm.lock); if (match) { *flags = match->flags; @@ -282,7 +289,7 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) { struct nlattr *token = info->attrs[MPTCP_PM_ATTR_TOKEN]; struct nlattr *id = info->attrs[MPTCP_PM_ATTR_LOC_ID]; - struct mptcp_pm_addr_entry *match = NULL; + struct mptcp_pm_addr_entry *match; struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; LIST_HEAD(free_list); @@ -319,13 +326,7 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) lock_sock(sk); - list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { - if (entry->addr.id == id_val) { - match = entry; - break; - } - } - + match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); release_sock(sk); From 3095f4e8cc5fe5ff6656cf47dd3ae497a2b10c98 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 19 Nov 2024 09:35:52 +0100 Subject: [PATCH 49/76] mptcp: update local address flags when setting it commit e0266319413d5d687ba7b6df7ca99e4b9724a4f2 upstream. Just like in-kernel pm, when userspace pm does set_flags, it needs to send out MP_PRIO signal, and also modify the flags of the corresponding address entry in the local address list. This patch implements the missing logic. Traverse all address entries on userspace_pm_local_addr_list to find the local address entry, if bkup is true, set the flags of this entry with FLAG_BACKUP, otherwise, clear FLAG_BACKUP. Fixes: 892f396c8e68 ("mptcp: netlink: issue MP_PRIO signals from userspace PMs") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-1-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski [ Conflicts in pm_userspace.c, because commit 6a42477fe449 ("mptcp: update set_flags interfaces"), is not in this version, and causes too many conflicts when backporting it. The same code can still be added at the same place, before sending the ACK. ] Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_userspace.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index ca3e452d4edb..195f84f16b97 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -565,6 +565,7 @@ int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, struct mptcp_pm_addr_entry *loc, struct mptcp_pm_addr_entry *rem, u8 bkup) { + struct mptcp_pm_addr_entry *entry; struct mptcp_sock *msk; int ret = -EINVAL; struct sock *sk; @@ -585,6 +586,17 @@ int mptcp_userspace_pm_set_flags(struct net *net, struct nlattr *token, rem->addr.family == AF_UNSPEC) goto set_flags_err; + spin_lock_bh(&msk->pm.lock); + list_for_each_entry(entry, &msk->pm.userspace_pm_local_addr_list, list) { + if (mptcp_addresses_equal(&entry->addr, &loc->addr, false)) { + if (bkup) + entry->flags |= MPTCP_PM_ADDR_FLAG_BACKUP; + else + entry->flags &= ~MPTCP_PM_ADDR_FLAG_BACKUP; + } + } + spin_unlock_bh(&msk->pm.lock); + lock_sock(sk); ret = mptcp_pm_nl_mp_prio_send_ack(msk, &loc->addr, &rem->addr, bkup); release_sock(sk); From 33e1fc8d1b58d1731914b31e9aa45e8c4a735cb7 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 19 Nov 2024 09:35:53 +0100 Subject: [PATCH 50/76] mptcp: hold pm lock when deleting entry commit f642c5c4d528d11bd78b6c6f84f541cd3c0bea86 upstream. When traversing userspace_pm_local_addr_list and deleting an entry from it in mptcp_pm_nl_remove_doit(), msk->pm.lock should be held. This patch holds this lock before mptcp_userspace_pm_lookup_addr_by_id() and releases it after list_move() in mptcp_pm_nl_remove_doit(). Fixes: d9a4594edabf ("mptcp: netlink: Add MPTCP_PM_CMD_REMOVE") Cc: stable@vger.kernel.org Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-2-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_userspace.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index 195f84f16b97..9016f8900c19 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -326,14 +326,17 @@ int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info) lock_sock(sk); + spin_lock_bh(&msk->pm.lock); match = mptcp_userspace_pm_lookup_addr_by_id(msk, id_val); if (!match) { GENL_SET_ERR_MSG(info, "address with specified id not found"); + spin_unlock_bh(&msk->pm.lock); release_sock(sk); goto remove_err; } list_move(&match->list, &free_list); + spin_unlock_bh(&msk->pm.lock); mptcp_pm_remove_addrs(msk, &free_list); From 8e1c565eeb4165d9fdf07e002d851002b2b01f7a Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 19 Nov 2024 09:35:54 +0100 Subject: [PATCH 51/76] mptcp: drop lookup_by_id in lookup_addr commit af250c27ea1c404e210fc3a308b20f772df584d6 upstream. When the lookup_by_id parameter of __lookup_addr() is true, it's the same as __lookup_addr_by_id(), it can be replaced by __lookup_addr_by_id() directly. So drop this parameter, let __lookup_addr() only looks up address on the local address list by comparing addresses in it, not address ids. Signed-off-by: Geliang Tang Reviewed-by: Matthieu Baerts (NGI0) Signed-off-by: Matthieu Baerts (NGI0) Link: https://lore.kernel.org/r/20240305-upstream-net-next-20240304-mptcp-misc-cleanup-v1-4-c436ba5e569b@kernel.org Signed-off-by: Jakub Kicinski Stable-dep-of: db3eab8110bc ("mptcp: pm: use _rcu variant under rcu_read_lock") [ Conflicts in pm_netlink.c, because commit 6a42477fe449 ("mptcp: update set_flags interfaces") is not in this version, and causes too many conflicts when backporting it. The conflict is easy to resolve: addr is a pointer here here in mptcp_pm_nl_set_flags(), the rest of the code is the same. ] Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_netlink.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 49e8156f5388..9b65d9360976 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -525,15 +525,12 @@ __lookup_addr_by_id(struct pm_nl_pernet *pernet, unsigned int id) } static struct mptcp_pm_addr_entry * -__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info, - bool lookup_by_id) +__lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; list_for_each_entry(entry, &pernet->local_addr_list, list) { - if ((!lookup_by_id && - mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) || - (lookup_by_id && entry->addr.id == info->id)) + if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; } return NULL; @@ -564,7 +561,7 @@ static void mptcp_pm_create_subflow_or_signal_addr(struct mptcp_sock *msk) mptcp_local_address((struct sock_common *)msk->first, &mpc_addr); rcu_read_lock(); - entry = __lookup_addr(pernet, &mpc_addr, false); + entry = __lookup_addr(pernet, &mpc_addr); if (entry) { __clear_bit(entry->addr.id, msk->pm.id_avail_bitmap); msk->mpc_endpoint_id = entry->addr.id; @@ -2081,7 +2078,8 @@ static int mptcp_nl_cmd_set_flags(struct sk_buff *skb, struct genl_info *info) token, &addr, &remote, bkup); spin_lock_bh(&pernet->lock); - entry = __lookup_addr(pernet, &addr.addr, lookup_by_id); + entry = lookup_by_id ? __lookup_addr_by_id(pernet, addr.addr.id) : + __lookup_addr(pernet, &addr.addr); if (!entry) { spin_unlock_bh(&pernet->lock); return -EINVAL; From 5ecf539bc5b2fd0dc9700f459704922714ad5cff Mon Sep 17 00:00:00 2001 From: "Matthieu Baerts (NGI0)" Date: Tue, 19 Nov 2024 09:35:55 +0100 Subject: [PATCH 52/76] mptcp: pm: use _rcu variant under rcu_read_lock commit db3eab8110bc0520416101b6a5b52f44a43fb4cf upstream. In mptcp_pm_create_subflow_or_signal_addr(), rcu_read_(un)lock() are used as expected to iterate over the list of local addresses, but list_for_each_entry() was used instead of list_for_each_entry_rcu() in __lookup_addr(). It is important to use this variant which adds the required READ_ONCE() (and diagnostic checks if enabled). Because __lookup_addr() is also used in mptcp_pm_nl_set_flags() where it is called under the pernet->lock and not rcu_read_lock(), an extra condition is then passed to help the diagnostic checks making sure either the associated spin lock or the RCU lock is held. Fixes: 86e39e04482b ("mptcp: keep track of local endpoint still available for each msk") Cc: stable@vger.kernel.org Reviewed-by: Geliang Tang Signed-off-by: Matthieu Baerts (NGI0) Link: https://patch.msgid.link/20241112-net-mptcp-misc-6-12-pm-v1-3-b835580cefa8@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Matthieu Baerts (NGI0) Signed-off-by: Greg Kroah-Hartman --- net/mptcp/pm_netlink.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 9b65d9360976..3fd7de56a30f 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -529,7 +529,8 @@ __lookup_addr(struct pm_nl_pernet *pernet, const struct mptcp_addr_info *info) { struct mptcp_pm_addr_entry *entry; - list_for_each_entry(entry, &pernet->local_addr_list, list) { + list_for_each_entry_rcu(entry, &pernet->local_addr_list, list, + lockdep_is_held(&pernet->lock)) { if (mptcp_addresses_equal(&entry->addr, info, entry->addr.port)) return entry; } From d70c2e0904ab3715c5673fd45788a464a246d1db Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Sat, 16 Mar 2024 23:36:36 +0900 Subject: [PATCH 53/76] ksmbd: fix slab-out-of-bounds in smb_strndup_from_utf16() commit a80a486d72e20bd12c335bcd38b6e6f19356b0aa upstream. If ->NameOffset of smb2_create_req is smaller than Buffer offset of smb2_create_req, slab-out-of-bounds read can happen from smb2_open. This patch set the minimum value of the name offset to the buffer offset to validate name length of smb2_create_req(). Cc: stable@vger.kernel.org Reported-by: Xuanzhe Yu Signed-off-by: Namjae Jeon Signed-off-by: Steve French Stable-dep-of: c6cd2e8d2d9a ("ksmbd: fix potencial out-of-bounds when buffer offset is invalid") Signed-off-by: Vamsi Krishna Brahmajosyula Signed-off-by: Greg Kroah-Hartman --- fs/smb/server/smb2misc.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 03dded29a980..7c872ffb4b0a 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -107,7 +107,10 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, case SMB2_CREATE: { unsigned short int name_off = - le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset); + max_t(unsigned short int, + le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset), + offsetof(struct smb2_create_req, Buffer)); + unsigned short int name_len = le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength); From ad6480c9a5d884e2704adc51d69895d93339176c Mon Sep 17 00:00:00 2001 From: Namjae Jeon Date: Tue, 19 Mar 2024 08:40:48 +0900 Subject: [PATCH 54/76] ksmbd: fix potencial out-of-bounds when buffer offset is invalid commit c6cd2e8d2d9aa7ee35b1fa6a668e32a22a9753da upstream. I found potencial out-of-bounds when buffer offset fields of a few requests is invalid. This patch set the minimum value of buffer offset field to ->Buffer offset to validate buffer length. Cc: stable@vger.kernel.org Signed-off-by: Namjae Jeon Signed-off-by: Steve French Signed-off-by: Sasha Levin Signed-off-by: Vamsi Krishna Brahmajosyula Signed-off-by: Greg Kroah-Hartman --- fs/smb/server/smb2misc.c | 23 +++++++++++++------ fs/smb/server/smb2pdu.c | 48 ++++++++++++++++++++++------------------ 2 files changed, 42 insertions(+), 29 deletions(-) diff --git a/fs/smb/server/smb2misc.c b/fs/smb/server/smb2misc.c index 7c872ffb4b0a..727cb49926ee 100644 --- a/fs/smb/server/smb2misc.c +++ b/fs/smb/server/smb2misc.c @@ -101,7 +101,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, *len = le16_to_cpu(((struct smb2_sess_setup_req *)hdr)->SecurityBufferLength); break; case SMB2_TREE_CONNECT: - *off = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset); + *off = max_t(unsigned short int, + le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathOffset), + offsetof(struct smb2_tree_connect_req, Buffer)); *len = le16_to_cpu(((struct smb2_tree_connect_req *)hdr)->PathLength); break; case SMB2_CREATE: @@ -110,7 +112,6 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, max_t(unsigned short int, le16_to_cpu(((struct smb2_create_req *)hdr)->NameOffset), offsetof(struct smb2_create_req, Buffer)); - unsigned short int name_len = le16_to_cpu(((struct smb2_create_req *)hdr)->NameLength); @@ -131,11 +132,15 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, break; } case SMB2_QUERY_INFO: - *off = le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset); + *off = max_t(unsigned int, + le16_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferOffset), + offsetof(struct smb2_query_info_req, Buffer)); *len = le32_to_cpu(((struct smb2_query_info_req *)hdr)->InputBufferLength); break; case SMB2_SET_INFO: - *off = le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset); + *off = max_t(unsigned int, + le16_to_cpu(((struct smb2_set_info_req *)hdr)->BufferOffset), + offsetof(struct smb2_set_info_req, Buffer)); *len = le32_to_cpu(((struct smb2_set_info_req *)hdr)->BufferLength); break; case SMB2_READ: @@ -145,7 +150,7 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, case SMB2_WRITE: if (((struct smb2_write_req *)hdr)->DataOffset || ((struct smb2_write_req *)hdr)->Length) { - *off = max_t(unsigned int, + *off = max_t(unsigned short int, le16_to_cpu(((struct smb2_write_req *)hdr)->DataOffset), offsetof(struct smb2_write_req, Buffer)); *len = le32_to_cpu(((struct smb2_write_req *)hdr)->Length); @@ -156,7 +161,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, *len = le16_to_cpu(((struct smb2_write_req *)hdr)->WriteChannelInfoLength); break; case SMB2_QUERY_DIRECTORY: - *off = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset); + *off = max_t(unsigned short int, + le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameOffset), + offsetof(struct smb2_query_directory_req, Buffer)); *len = le16_to_cpu(((struct smb2_query_directory_req *)hdr)->FileNameLength); break; case SMB2_LOCK: @@ -171,7 +178,9 @@ static int smb2_get_data_area_len(unsigned int *off, unsigned int *len, break; } case SMB2_IOCTL: - *off = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset); + *off = max_t(unsigned int, + le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputOffset), + offsetof(struct smb2_ioctl_req, Buffer)); *len = le32_to_cpu(((struct smb2_ioctl_req *)hdr)->InputCount); break; default: diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 9b5847bf9b2a..7e068c4187a8 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1961,7 +1961,7 @@ int smb2_tree_connect(struct ksmbd_work *work) WORK_BUFFERS(work, req, rsp); - treename = smb_strndup_from_utf16(req->Buffer, + treename = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->PathOffset), le16_to_cpu(req->PathLength), true, conn->local_nls); if (IS_ERR(treename)) { @@ -2723,7 +2723,7 @@ int smb2_open(struct ksmbd_work *work) goto err_out2; } - name = smb2_get_name(req->Buffer, + name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset), le16_to_cpu(req->NameLength), work->conn->local_nls); if (IS_ERR(name)) { @@ -4096,7 +4096,7 @@ int smb2_query_dir(struct ksmbd_work *work) } srch_flag = req->Flags; - srch_ptr = smb_strndup_from_utf16(req->Buffer, + srch_ptr = smb_strndup_from_utf16((char *)req + le16_to_cpu(req->FileNameOffset), le16_to_cpu(req->FileNameLength), 1, conn->local_nls); if (IS_ERR(srch_ptr)) { @@ -4357,7 +4357,8 @@ static int smb2_get_ea(struct ksmbd_work *work, struct ksmbd_file *fp, sizeof(struct smb2_ea_info_req)) return -EINVAL; - ea_req = (struct smb2_ea_info_req *)req->Buffer; + ea_req = (struct smb2_ea_info_req *)((char *)req + + le16_to_cpu(req->InputBufferOffset)); } else { /* need to send all EAs, if no specific EA is requested*/ if (le32_to_cpu(req->Flags) & SL_RETURN_SINGLE_ENTRY) @@ -5971,6 +5972,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, struct ksmbd_share_config *share) { unsigned int buf_len = le32_to_cpu(req->BufferLength); + char *buffer = (char *)req + le16_to_cpu(req->BufferOffset); switch (req->FileInfoClass) { case FILE_BASIC_INFORMATION: @@ -5978,7 +5980,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, if (buf_len < sizeof(struct smb2_file_basic_info)) return -EINVAL; - return set_file_basic_info(fp, (struct smb2_file_basic_info *)req->Buffer, share); + return set_file_basic_info(fp, (struct smb2_file_basic_info *)buffer, share); } case FILE_ALLOCATION_INFORMATION: { @@ -5986,7 +5988,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EINVAL; return set_file_allocation_info(work, fp, - (struct smb2_file_alloc_info *)req->Buffer); + (struct smb2_file_alloc_info *)buffer); } case FILE_END_OF_FILE_INFORMATION: { @@ -5994,7 +5996,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EINVAL; return set_end_of_file_info(work, fp, - (struct smb2_file_eof_info *)req->Buffer); + (struct smb2_file_eof_info *)buffer); } case FILE_RENAME_INFORMATION: { @@ -6002,7 +6004,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EINVAL; return set_rename_info(work, fp, - (struct smb2_file_rename_info *)req->Buffer, + (struct smb2_file_rename_info *)buffer, buf_len); } case FILE_LINK_INFORMATION: @@ -6011,7 +6013,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EINVAL; return smb2_create_link(work, work->tcon->share_conf, - (struct smb2_file_link_info *)req->Buffer, + (struct smb2_file_link_info *)buffer, buf_len, fp->filp, work->conn->local_nls); } @@ -6021,7 +6023,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, return -EINVAL; return set_file_disposition_info(fp, - (struct smb2_file_disposition_info *)req->Buffer); + (struct smb2_file_disposition_info *)buffer); } case FILE_FULL_EA_INFORMATION: { @@ -6034,7 +6036,7 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, if (buf_len < sizeof(struct smb2_ea_info)) return -EINVAL; - return smb2_set_ea((struct smb2_ea_info *)req->Buffer, + return smb2_set_ea((struct smb2_ea_info *)buffer, buf_len, &fp->filp->f_path, true); } case FILE_POSITION_INFORMATION: @@ -6042,14 +6044,14 @@ static int smb2_set_info_file(struct ksmbd_work *work, struct ksmbd_file *fp, if (buf_len < sizeof(struct smb2_file_pos_info)) return -EINVAL; - return set_file_position_info(fp, (struct smb2_file_pos_info *)req->Buffer); + return set_file_position_info(fp, (struct smb2_file_pos_info *)buffer); } case FILE_MODE_INFORMATION: { if (buf_len < sizeof(struct smb2_file_mode_info)) return -EINVAL; - return set_file_mode_info(fp, (struct smb2_file_mode_info *)req->Buffer); + return set_file_mode_info(fp, (struct smb2_file_mode_info *)buffer); } } @@ -6130,7 +6132,7 @@ int smb2_set_info(struct ksmbd_work *work) } rc = smb2_set_info_sec(fp, le32_to_cpu(req->AdditionalInformation), - req->Buffer, + (char *)req + le16_to_cpu(req->BufferOffset), le32_to_cpu(req->BufferLength)); ksmbd_revert_fsids(work); break; @@ -7576,7 +7578,7 @@ static int fsctl_pipe_transceive(struct ksmbd_work *work, u64 id, struct smb2_ioctl_rsp *rsp) { struct ksmbd_rpc_command *rpc_resp; - char *data_buf = (char *)&req->Buffer[0]; + char *data_buf = (char *)req + le32_to_cpu(req->InputOffset); int nbytes = 0; rpc_resp = ksmbd_rpc_ioctl(work->sess, id, data_buf, @@ -7689,6 +7691,7 @@ int smb2_ioctl(struct ksmbd_work *work) u64 id = KSMBD_NO_FID; struct ksmbd_conn *conn = work->conn; int ret = 0; + char *buffer; if (work->next_smb2_rcv_hdr_off) { req = ksmbd_req_buf_next(work); @@ -7711,6 +7714,8 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } + buffer = (char *)req + le32_to_cpu(req->InputOffset); + cnt_code = le32_to_cpu(req->CtlCode); ret = smb2_calc_max_out_buf_len(work, 48, le32_to_cpu(req->MaxOutputResponse)); @@ -7768,7 +7773,7 @@ int smb2_ioctl(struct ksmbd_work *work) } ret = fsctl_validate_negotiate_info(conn, - (struct validate_negotiate_info_req *)&req->Buffer[0], + (struct validate_negotiate_info_req *)buffer, (struct validate_negotiate_info_rsp *)&rsp->Buffer[0], in_buf_len); if (ret < 0) @@ -7821,7 +7826,7 @@ int smb2_ioctl(struct ksmbd_work *work) rsp->VolatileFileId = req->VolatileFileId; rsp->PersistentFileId = req->PersistentFileId; fsctl_copychunk(work, - (struct copychunk_ioctl_req *)&req->Buffer[0], + (struct copychunk_ioctl_req *)buffer, le32_to_cpu(req->CtlCode), le32_to_cpu(req->InputCount), req->VolatileFileId, @@ -7834,8 +7839,7 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } - ret = fsctl_set_sparse(work, id, - (struct file_sparse *)&req->Buffer[0]); + ret = fsctl_set_sparse(work, id, (struct file_sparse *)buffer); if (ret < 0) goto out; break; @@ -7858,7 +7862,7 @@ int smb2_ioctl(struct ksmbd_work *work) } zero_data = - (struct file_zero_data_information *)&req->Buffer[0]; + (struct file_zero_data_information *)buffer; off = le64_to_cpu(zero_data->FileOffset); bfz = le64_to_cpu(zero_data->BeyondFinalZero); @@ -7889,7 +7893,7 @@ int smb2_ioctl(struct ksmbd_work *work) } ret = fsctl_query_allocated_ranges(work, id, - (struct file_allocated_range_buffer *)&req->Buffer[0], + (struct file_allocated_range_buffer *)buffer, (struct file_allocated_range_buffer *)&rsp->Buffer[0], out_buf_len / sizeof(struct file_allocated_range_buffer), &nbytes); @@ -7933,7 +7937,7 @@ int smb2_ioctl(struct ksmbd_work *work) goto out; } - dup_ext = (struct duplicate_extents_to_file *)&req->Buffer[0]; + dup_ext = (struct duplicate_extents_to_file *)buffer; fp_in = ksmbd_lookup_fd_slow(work, dup_ext->VolatileFileHandle, dup_ext->PersistentFileHandle); From 8a015f9cabd54dc6c4468c1cce99ad2abddfabfd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Nov 2024 10:05:36 +0800 Subject: [PATCH 55/76] net: add copy_safe_from_sockptr() helper [ Upstream commit 6309863b31dd80317cd7d6824820b44e254e2a9c ] copy_from_sockptr() helper is unsafe, unless callers did the prior check against user provided optlen. Too many callers get this wrong, lets add a helper to fix them and avoid future copy/paste bugs. Instead of : if (optlen < sizeof(opt)) { err = -EINVAL; break; } if (copy_from_sockptr(&opt, optval, sizeof(opt)) { err = -EFAULT; break; } Use : err = copy_safe_from_sockptr(&opt, sizeof(opt), optval, optlen); if (err) break; Signed-off-by: Eric Dumazet Link: https://lore.kernel.org/r/20240408082845.3957374-2-edumazet@google.com Signed-off-by: Jakub Kicinski Stable-dep-of: 7a87441c9651 ("nfc: llcp: fix nfc_llcp_setsockopt() unsafe copies") Signed-off-by: Sasha Levin Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- include/linux/sockptr.h | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index bae5e2369b4f..1c1a5d926b17 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -50,11 +50,36 @@ static inline int copy_from_sockptr_offset(void *dst, sockptr_t src, return 0; } +/* Deprecated. + * This is unsafe, unless caller checked user provided optlen. + * Prefer copy_safe_from_sockptr() instead. + */ static inline int copy_from_sockptr(void *dst, sockptr_t src, size_t size) { return copy_from_sockptr_offset(dst, src, 0, size); } +/** + * copy_safe_from_sockptr: copy a struct from sockptr + * @dst: Destination address, in kernel space. This buffer must be @ksize + * bytes long. + * @ksize: Size of @dst struct. + * @optval: Source address. (in user or kernel space) + * @optlen: Size of @optval data. + * + * Returns: + * * -EINVAL: @optlen < @ksize + * * -EFAULT: access to userspace failed. + * * 0 : @ksize bytes were copied + */ +static inline int copy_safe_from_sockptr(void *dst, size_t ksize, + sockptr_t optval, unsigned int optlen) +{ + if (optlen < ksize) + return -EINVAL; + return copy_from_sockptr(dst, optval, ksize); +} + static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, const void *src, size_t size) { From 298609e7069ce74542a2253a39ccc9717f1d877a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Nov 2024 10:05:37 +0800 Subject: [PATCH 56/76] nfc: llcp: fix nfc_llcp_setsockopt() unsafe copies [ Upstream commit 7a87441c9651ba37842f4809224aca13a554a26f ] syzbot reported unsafe calls to copy_from_sockptr() [1] Use copy_safe_from_sockptr() instead. [1] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] BUG: KASAN: slab-out-of-bounds in copy_from_sockptr include/linux/sockptr.h:55 [inline] BUG: KASAN: slab-out-of-bounds in nfc_llcp_setsockopt+0x6c2/0x850 net/nfc/llcp_sock.c:255 Read of size 4 at addr ffff88801caa1ec3 by task syz-executor459/5078 CPU: 0 PID: 5078 Comm: syz-executor459 Not tainted 6.8.0-syzkaller-08951-gfe46a7dd189e #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 Call Trace: __dump_stack lib/dump_stack.c:88 [inline] dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 print_address_description mm/kasan/report.c:377 [inline] print_report+0x169/0x550 mm/kasan/report.c:488 kasan_report+0x143/0x180 mm/kasan/report.c:601 copy_from_sockptr_offset include/linux/sockptr.h:49 [inline] copy_from_sockptr include/linux/sockptr.h:55 [inline] nfc_llcp_setsockopt+0x6c2/0x850 net/nfc/llcp_sock.c:255 do_sock_setsockopt+0x3b1/0x720 net/socket.c:2311 __sys_setsockopt+0x1ae/0x250 net/socket.c:2334 __do_sys_setsockopt net/socket.c:2343 [inline] __se_sys_setsockopt net/socket.c:2340 [inline] __x64_sys_setsockopt+0xb5/0xd0 net/socket.c:2340 do_syscall_64+0xfd/0x240 entry_SYSCALL_64_after_hwframe+0x6d/0x75 RIP: 0033:0x7f7fac07fd89 Code: 28 00 00 00 75 05 48 83 c4 28 c3 e8 91 18 00 00 90 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 b8 ff ff ff f7 d8 64 89 01 48 RSP: 002b:00007fff660eb788 EFLAGS: 00000246 ORIG_RAX: 0000000000000036 RAX: ffffffffffffffda RBX: 0000000000000003 RCX: 00007f7fac07fd89 RDX: 0000000000000000 RSI: 0000000000000118 RDI: 0000000000000004 RBP: 0000000000000000 R08: 0000000000000002 R09: 0000000000000000 R10: 0000000020000a80 R11: 0000000000000246 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 Signed-off-by: Eric Dumazet Reported-by: syzbot Reviewed-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20240408082845.3957374-4-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- net/nfc/llcp_sock.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c index 645677f84dba..cd0fd26196b8 100644 --- a/net/nfc/llcp_sock.c +++ b/net/nfc/llcp_sock.c @@ -252,10 +252,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_RW) { err = -EINVAL; @@ -274,10 +274,10 @@ static int nfc_llcp_setsockopt(struct socket *sock, int level, int optname, break; } - if (copy_from_sockptr(&opt, optval, sizeof(u32))) { - err = -EFAULT; + err = copy_safe_from_sockptr(&opt, sizeof(opt), + optval, optlen); + if (err) break; - } if (opt > LLCP_MAX_MIUX) { err = -EINVAL; From 18cf7026355187b8d2b4cdfed61dbf873e9d29ff Mon Sep 17 00:00:00 2001 From: Eric Van Hensbergen Date: Tue, 19 Nov 2024 11:43:16 +0800 Subject: [PATCH 57/76] fs/9p: fix uninitialized values during inode evict [ Upstream commit 6630036b7c228f57c7893ee0403e92c2db2cd21d ] If an iget fails due to not being able to retrieve information from the server then the inode structure is only partially initialized. When the inode gets evicted, references to uninitialized structures (like fscache cookies) were being made. This patch checks for a bad_inode before doing anything other than clearing the inode from the cache. Since the inode is bad, it shouldn't have any state associated with it that needs to be written back (and there really isn't a way to complete those anyways). Reported-by: syzbot+eb83fe1cce5833cd66a0@syzkaller.appspotmail.com Signed-off-by: Eric Van Hensbergen (cherry picked from commit 1b4cb6e91f19b81217ad98142ee53a1ab25893fd) [Xiangyu: CVE-2024-36923 Minor conflict resolution due to missing 4eb31178 ] Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- fs/9p/vfs_inode.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8f287009545c..495631eba3a6 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -392,17 +392,20 @@ void v9fs_evict_inode(struct inode *inode) struct v9fs_inode *v9inode = V9FS_I(inode); __le32 version; - truncate_inode_pages_final(&inode->i_data); - version = cpu_to_le32(v9inode->qid.version); - fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, + if (!is_bad_inode(inode)) { + truncate_inode_pages_final(&inode->i_data); + version = cpu_to_le32(v9inode->qid.version); + fscache_clear_inode_writeback(v9fs_inode_cookie(v9inode), inode, &version); - clear_inode(inode); - filemap_fdatawrite(&inode->i_data); - - fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); - /* clunk the fid stashed in writeback_fid */ - p9_fid_put(v9inode->writeback_fid); - v9inode->writeback_fid = NULL; + clear_inode(inode); + filemap_fdatawrite(&inode->i_data); + if (v9fs_inode_cookie(v9inode)) + fscache_relinquish_cookie(v9fs_inode_cookie(v9inode), false); + /* clunk the fid stashed in writeback_fid */ + p9_fid_put(v9inode->writeback_fid); + v9inode->writeback_fid = NULL; + } else + clear_inode(inode); } static int v9fs_test_inode(struct inode *inode, void *data) From b2c664df3bb46aabac6a5fd78aaa5bd614cfad97 Mon Sep 17 00:00:00 2001 From: Chen Hanxiao Date: Tue, 19 Nov 2024 18:20:10 +0800 Subject: [PATCH 58/76] ipvs: properly dereference pe in ip_vs_add_service [ Upstream commit cbd070a4ae62f119058973f6d2c984e325bce6e7 ] Use pe directly to resolve sparse warning: net/netfilter/ipvs/ip_vs_ctl.c:1471:27: warning: dereference of noderef expression Fixes: 39b972231536 ("ipvs: handle connections started by real-servers") Signed-off-by: Chen Hanxiao Acked-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Pablo Neira Ayuso [ Resolve minor conflicts to fix CVE-2024-42322 ] Signed-off-by: Bin Lan Signed-off-by: Greg Kroah-Hartman --- net/netfilter/ipvs/ip_vs_ctl.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 17a1b731a76b..18e37b32a5d6 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1382,18 +1382,18 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u, sched = NULL; } - /* Bind the ct retriever */ - RCU_INIT_POINTER(svc->pe, pe); - pe = NULL; - /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ipvs->ftpsvc_counter); else if (svc->port == 0) atomic_inc(&ipvs->nullsvc_counter); - if (svc->pe && svc->pe->conn_out) + if (pe && pe->conn_out) atomic_inc(&ipvs->conn_out_counter); + /* Bind the ct retriever */ + RCU_INIT_POINTER(svc->pe, pe); + pe = NULL; + ip_vs_start_estimator(ipvs, &svc->stats); /* Count only IPv4 services for old get/setsockopt interface */ From 34d83c3e6e97867ae061d14eb52123404aab1cbc Mon Sep 17 00:00:00 2001 From: Vladimir Oltean Date: Tue, 19 Nov 2024 16:06:18 +0800 Subject: [PATCH 59/76] net/sched: taprio: extend minimum interval restriction to entire cycle too [ Upstream commit fb66df20a7201e60f2b13d7f95d031b31a8831d3 ] It is possible for syzbot to side-step the restriction imposed by the blamed commit in the Fixes: tag, because the taprio UAPI permits a cycle-time different from (and potentially shorter than) the sum of entry intervals. We need one more restriction, which is that the cycle time itself must be larger than N * ETH_ZLEN bit times, where N is the number of schedule entries. This restriction needs to apply regardless of whether the cycle time came from the user or was the implicit, auto-calculated value, so we move the existing "cycle == 0" check outside the "if "(!new->cycle_time)" branch. This way covers both conditions and scenarios. Add a selftest which illustrates the issue triggered by syzbot. Fixes: b5b73b26b3ca ("taprio: Fix allowing too small intervals") Reported-by: syzbot+a7d2b1d5d1af83035567@syzkaller.appspotmail.com Closes: https://lore.kernel.org/netdev/0000000000007d66bc06196e7c66@google.com/ Signed-off-by: Vladimir Oltean Link: https://lore.kernel.org/r/20240527153955.553333-2-vladimir.oltean@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_taprio.c | 10 ++++----- .../tc-testing/tc-tests/qdiscs/taprio.json | 22 +++++++++++++++++++ 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index 1d5cdc987abd..62219f23f76a 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -915,11 +915,6 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, list_for_each_entry(entry, &new->entries, list) cycle = ktime_add_ns(cycle, entry->interval); - if (!cycle) { - NL_SET_ERR_MSG(extack, "'cycle_time' can never be 0"); - return -EINVAL; - } - if (cycle < 0 || cycle > INT_MAX) { NL_SET_ERR_MSG(extack, "'cycle_time' is too big"); return -EINVAL; @@ -928,6 +923,11 @@ static int parse_taprio_schedule(struct taprio_sched *q, struct nlattr **tb, new->cycle_time = cycle; } + if (new->cycle_time < new->num_entries * length_to_duration(q, ETH_ZLEN)) { + NL_SET_ERR_MSG(extack, "'cycle_time' is too small"); + return -EINVAL; + } + return 0; } diff --git a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json index 08d4861c2e78..d04fed83332c 100644 --- a/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json +++ b/tools/testing/selftests/tc-testing/tc-tests/qdiscs/taprio.json @@ -132,6 +132,28 @@ "echo \"1\" > /sys/bus/netdevsim/del_device" ] }, + { + "id": "831f", + "name": "Add taprio Qdisc with too short cycle-time", + "category": [ + "qdisc", + "taprio" + ], + "plugins": { + "requires": "nsPlugin" + }, + "setup": [ + "echo \"1 1 8\" > /sys/bus/netdevsim/new_device" + ], + "cmdUnderTest": "$TC qdisc add dev $ETH root handle 1: taprio num_tc 2 queues 1@0 1@1 sched-entry S 01 200000 sched-entry S 02 200000 cycle-time 100 clockid CLOCK_TAI", + "expExitCode": "2", + "verifyCmd": "$TC qdisc show dev $ETH", + "matchPattern": "qdisc taprio 1: root refcnt", + "matchCount": "0", + "teardown": [ + "echo \"1\" > /sys/bus/netdevsim/del_device" + ] + }, { "id": "3e1e", "name": "Add taprio Qdisc with an invalid cycle-time", From e2348d8c61d03feece1de4c05f72e6e99f74c650 Mon Sep 17 00:00:00 2001 From: Wei Fang Date: Tue, 19 Nov 2024 15:41:35 +0800 Subject: [PATCH 60/76] net: fec: remove .ndo_poll_controller to avoid deadlocks [ Upstream commit c2e0c58b25a0a0c37ec643255558c5af4450c9f5 ] There is a deadlock issue found in sungem driver, please refer to the commit ac0a230f719b ("eth: sungem: remove .ndo_poll_controller to avoid deadlocks"). The root cause of the issue is that netpoll is in atomic context and disable_irq() is called by .ndo_poll_controller interface of sungem driver, however, disable_irq() might sleep. After analyzing the implementation of fec_poll_controller(), the fec driver should have the same issue. Due to the fec driver uses NAPI for TX completions, the .ndo_poll_controller is unnecessary to be implemented in the fec driver, so fec_poll_controller() can be safely removed. Fixes: 7f5c6addcdc0 ("net/fec: add poll controller function for fec nic") Signed-off-by: Wei Fang Link: https://lore.kernel.org/r/20240511062009.652918-1-wei.fang@nxp.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/fec_main.c | 26 ----------------------- 1 file changed, 26 deletions(-) diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c index 0a5c3d27ed3b..aeab6c28892f 100644 --- a/drivers/net/ethernet/freescale/fec_main.c +++ b/drivers/net/ethernet/freescale/fec_main.c @@ -3508,29 +3508,6 @@ fec_set_mac_address(struct net_device *ndev, void *p) return 0; } -#ifdef CONFIG_NET_POLL_CONTROLLER -/** - * fec_poll_controller - FEC Poll controller function - * @dev: The FEC network adapter - * - * Polled functionality used by netconsole and others in non interrupt mode - * - */ -static void fec_poll_controller(struct net_device *dev) -{ - int i; - struct fec_enet_private *fep = netdev_priv(dev); - - for (i = 0; i < FEC_IRQ_NUM; i++) { - if (fep->irq[i] > 0) { - disable_irq(fep->irq[i]); - fec_enet_interrupt(fep->irq[i], dev); - enable_irq(fep->irq[i]); - } - } -} -#endif - static inline void fec_enet_set_netdev_features(struct net_device *netdev, netdev_features_t features) { @@ -3604,9 +3581,6 @@ static const struct net_device_ops fec_netdev_ops = { .ndo_tx_timeout = fec_timeout, .ndo_set_mac_address = fec_set_mac_address, .ndo_eth_ioctl = fec_enet_ioctl, -#ifdef CONFIG_NET_POLL_CONTROLLER - .ndo_poll_controller = fec_poll_controller, -#endif .ndo_set_features = fec_set_features, }; From 5874c1150e77296565ad6e495ef41fbf87570d14 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Fri, 15 Nov 2024 16:57:24 -0800 Subject: [PATCH 61/76] mm: revert "mm: shmem: fix data-race in shmem_getattr()" commit d1aa0c04294e29883d65eac6c2f72fe95cc7c049 upstream. Revert d949d1d14fa2 ("mm: shmem: fix data-race in shmem_getattr()") as suggested by Chuck [1]. It is causing deadlocks when accessing tmpfs over NFS. As Hugh commented, "added just to silence a syzbot sanitizer splat: added where there has never been any practical problem". Link: https://lkml.kernel.org/r/ZzdxKF39VEmXSSyN@tissot.1015granger.net [1] Fixes: d949d1d14fa2 ("mm: shmem: fix data-race in shmem_getattr()") Acked-by: Hugh Dickins Cc: Chuck Lever Cc: Jeongjun Park Cc: Yu Zhao Cc: Signed-off-by: Andrew Morton Signed-off-by: Hugh Dickins Signed-off-by: Greg Kroah-Hartman --- mm/shmem.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0e1fbc53717d..f7c08e169e42 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1086,9 +1086,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, stat->attributes_mask |= (STATX_ATTR_APPEND | STATX_ATTR_IMMUTABLE | STATX_ATTR_NODUMP); - inode_lock_shared(inode); generic_fillattr(&init_user_ns, inode, stat); - inode_unlock_shared(inode); if (shmem_is_huge(NULL, inode, 0, false)) stat->blksize = HPAGE_PMD_SIZE; From 7443f3e11254f94a16b48d38b0a9e9b659e006bf Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 18 Nov 2024 16:17:25 +0000 Subject: [PATCH 62/76] mm: avoid unsafe VMA hook invocation when error arises on mmap hook [ Upstream commit 3dd6ed34ce1f2356a77fb88edafb5ec96784e3cf ] Patch series "fix error handling in mmap_region() and refactor (hotfixes)", v4. mmap_region() is somewhat terrifying, with spaghetti-like control flow and numerous means by which issues can arise and incomplete state, memory leaks and other unpleasantness can occur. A large amount of the complexity arises from trying to handle errors late in the process of mapping a VMA, which forms the basis of recently observed issues with resource leaks and observable inconsistent state. This series goes to great lengths to simplify how mmap_region() works and to avoid unwinding errors late on in the process of setting up the VMA for the new mapping, and equally avoids such operations occurring while the VMA is in an inconsistent state. The patches in this series comprise the minimal changes required to resolve existing issues in mmap_region() error handling, in order that they can be hotfixed and backported. There is additionally a follow up series which goes further, separated out from the v1 series and sent and updated separately. This patch (of 5): After an attempted mmap() fails, we are no longer in a situation where we can safely interact with VMA hooks. This is currently not enforced, meaning that we need complicated handling to ensure we do not incorrectly call these hooks. We can avoid the whole issue by treating the VMA as suspect the moment that the file->f_ops->mmap() function reports an error by replacing whatever VMA operations were installed with a dummy empty set of VMA operations. We do so through a new helper function internal to mm - mmap_file() - which is both more logically named than the existing call_mmap() function and correctly isolates handling of the vm_op reassignment to mm. All the existing invocations of call_mmap() outside of mm are ultimately nested within the call_mmap() from mm, which we now replace. It is therefore safe to leave call_mmap() in place as a convenience function (and to avoid churn). The invokers are: ovl_file_operations -> mmap -> ovl_mmap() -> backing_file_mmap() coda_file_operations -> mmap -> coda_file_mmap() shm_file_operations -> shm_mmap() shm_file_operations_huge -> shm_mmap() dma_buf_fops -> dma_buf_mmap_internal -> i915_dmabuf_ops -> i915_gem_dmabuf_mmap() None of these callers interact with vm_ops or mappings in a problematic way on error, quickly exiting out. Link: https://lkml.kernel.org/r/cover.1730224667.git.lorenzo.stoakes@oracle.com Link: https://lkml.kernel.org/r/d41fd763496fd0048a962f3fd9407dc72dd4fd86.1730224667.git.lorenzo.stoakes@oracle.com Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails") Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Jann Horn Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Helge Deller Cc: James E.J. Bottomley Cc: Linus Torvalds Cc: Mark Brown Cc: Peter Xu Cc: Will Deacon Cc: Signed-off-by: Andrew Morton Signed-off-by: Lorenzo Stoakes Signed-off-by: Greg Kroah-Hartman --- mm/internal.h | 12 ++++++++++++ mm/mmap.c | 4 ++-- mm/nommu.c | 4 ++-- mm/util.c | 18 ++++++++++++++++++ 4 files changed, 34 insertions(+), 4 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index a50bc08337d2..85ac9c6a1393 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -52,6 +52,18 @@ struct folio_batch; void page_writeback_init(void); +/* + * This is a file-backed mapping, and is about to be memory mapped - invoke its + * mmap hook and safely handle error conditions. On error, VMA hooks will be + * mutated. + * + * @file: File which backs the mapping. + * @vma: VMA which we are mapping. + * + * Returns: 0 if success, error otherwise. + */ +int mmap_file(struct file *file, struct vm_area_struct *vma); + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; diff --git a/mm/mmap.c b/mm/mmap.c index c0f9575493de..bf2f1ca87bef 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2760,7 +2760,7 @@ cannot_expand: } vma->vm_file = get_file(file); - error = call_mmap(file, vma); + error = mmap_file(file, vma); if (error) goto unmap_and_free_vma; @@ -2775,7 +2775,7 @@ cannot_expand: mas_reset(&mas); /* - * If vm_flags changed after call_mmap(), we should try merge + * If vm_flags changed after mmap_file(), we should try merge * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { diff --git a/mm/nommu.c b/mm/nommu.c index 8e8fe491d914..f09e798a4416 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -939,7 +939,7 @@ static int do_mmap_shared_file(struct vm_area_struct *vma) { int ret; - ret = call_mmap(vma->vm_file, vma); + ret = mmap_file(vma->vm_file, vma); if (ret == 0) { vma->vm_region->vm_top = vma->vm_region->vm_end; return 0; @@ -970,7 +970,7 @@ static int do_mmap_private(struct vm_area_struct *vma, * - VM_MAYSHARE will be set if it may attempt to share */ if (capabilities & NOMMU_MAP_DIRECT) { - ret = call_mmap(vma->vm_file, vma); + ret = mmap_file(vma->vm_file, vma); if (ret == 0) { /* shouldn't return success if we're not sharing */ BUG_ON(!(vma->vm_flags & VM_MAYSHARE)); diff --git a/mm/util.c b/mm/util.c index 94fff247831b..15f1970da665 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1103,6 +1103,24 @@ int __weak memcmp_pages(struct page *page1, struct page *page2) return ret; } +int mmap_file(struct file *file, struct vm_area_struct *vma) +{ + static const struct vm_operations_struct dummy_vm_ops = {}; + int err = call_mmap(file, vma); + + if (likely(!err)) + return 0; + + /* + * OK, we tried to call the file hook for mmap(), but an error + * arose. The mapping is in an inconsistent state and we most not invoke + * any further hooks on it. + */ + vma->vm_ops = &dummy_vm_ops; + + return err; +} + #ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information From 77b50f617a4b4c6f2ddee41df92a23d9a35629b9 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 18 Nov 2024 16:17:26 +0000 Subject: [PATCH 63/76] mm: unconditionally close VMAs on error [ Upstream commit 4080ef1579b2413435413988d14ac8c68e4d42c8 ] Incorrect invocation of VMA callbacks when the VMA is no longer in a consistent state is bug prone and risky to perform. With regards to the important vm_ops->close() callback We have gone to great lengths to try to track whether or not we ought to close VMAs. Rather than doing so and risking making a mistake somewhere, instead unconditionally close and reset vma->vm_ops to an empty dummy operations set with a NULL .close operator. We introduce a new function to do so - vma_close() - and simplify existing vms logic which tracked whether we needed to close or not. This simplifies the logic, avoids incorrect double-calling of the .close() callback and allows us to update error paths to simply call vma_close() unconditionally - making VMA closure idempotent. Link: https://lkml.kernel.org/r/28e89dda96f68c505cb6f8e9fc9b57c3e9f74b42.1730224667.git.lorenzo.stoakes@oracle.com Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails") Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Reviewed-by: Vlastimil Babka Reviewed-by: Liam R. Howlett Reviewed-by: Jann Horn Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Helge Deller Cc: James E.J. Bottomley Cc: Linus Torvalds Cc: Mark Brown Cc: Peter Xu Cc: Will Deacon Cc: Signed-off-by: Andrew Morton Signed-off-by: Lorenzo Stoakes Signed-off-by: Greg Kroah-Hartman --- mm/internal.h | 7 +++++++ mm/mmap.c | 12 ++++-------- mm/nommu.c | 3 +-- mm/util.c | 15 +++++++++++++++ 4 files changed, 27 insertions(+), 10 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index 85ac9c6a1393..16a4a9aece30 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -64,6 +64,13 @@ void page_writeback_init(void); */ int mmap_file(struct file *file, struct vm_area_struct *vma); +/* + * If the VMA has a close hook then close it, and since closing it might leave + * it in an inconsistent state which makes the use of any hooks suspect, clear + * them down by installing dummy empty hooks. + */ +void vma_close(struct vm_area_struct *vma); + static inline void *folio_raw_mapping(struct folio *folio) { unsigned long mapping = (unsigned long)folio->mapping; diff --git a/mm/mmap.c b/mm/mmap.c index bf2f1ca87bef..4bfec4df51c2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -136,8 +136,7 @@ void unlink_file_vma(struct vm_area_struct *vma) static void remove_vma(struct vm_area_struct *vma) { might_sleep(); - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + vma_close(vma); if (vma->vm_file) fput(vma->vm_file); mpol_put(vma_policy(vma)); @@ -2388,8 +2387,7 @@ int __split_vma(struct mm_struct *mm, struct vm_area_struct *vma, new->vm_start = new->vm_end; new->vm_pgoff = 0; /* Clean everything up if vma_adjust failed. */ - if (new->vm_ops && new->vm_ops->close) - new->vm_ops->close(new); + vma_close(new); if (new->vm_file) fput(new->vm_file); unlink_anon_vmas(new); @@ -2885,8 +2883,7 @@ expanded: return addr; close_and_free_vma: - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + vma_close(vma); unmap_and_free_vma: fput(vma->vm_file); vma->vm_file = NULL; @@ -3376,8 +3373,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return new_vma; out_vma_link: - if (new_vma->vm_ops && new_vma->vm_ops->close) - new_vma->vm_ops->close(new_vma); + vma_close(new_vma); if (new_vma->vm_file) fput(new_vma->vm_file); diff --git a/mm/nommu.c b/mm/nommu.c index f09e798a4416..e0428fa57526 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -650,8 +650,7 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) */ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) { - if (vma->vm_ops && vma->vm_ops->close) - vma->vm_ops->close(vma); + vma_close(vma); if (vma->vm_file) fput(vma->vm_file); put_nommu_region(vma->vm_region); diff --git a/mm/util.c b/mm/util.c index 15f1970da665..d3a2877c176f 100644 --- a/mm/util.c +++ b/mm/util.c @@ -1121,6 +1121,21 @@ int mmap_file(struct file *file, struct vm_area_struct *vma) return err; } +void vma_close(struct vm_area_struct *vma) +{ + static const struct vm_operations_struct dummy_vm_ops = {}; + + if (vma->vm_ops && vma->vm_ops->close) { + vma->vm_ops->close(vma); + + /* + * The mapping is in an inconsistent state, and no further hooks + * may be invoked upon it. + */ + vma->vm_ops = &dummy_vm_ops; + } +} + #ifdef CONFIG_PRINTK /** * mem_dump_obj - Print available provenance information From 8fad7b004be8dc46eb801172ea62b0a2000fcf82 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 18 Nov 2024 16:17:27 +0000 Subject: [PATCH 64/76] mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling [ Upstream commit 5baf8b037debf4ec60108ccfeccb8636d1dbad81 ] Currently MTE is permitted in two circumstances (desiring to use MTE having been specified by the VM_MTE flag) - where MAP_ANONYMOUS is specified, as checked by arch_calc_vm_flag_bits() and actualised by setting the VM_MTE_ALLOWED flag, or if the file backing the mapping is shmem, in which case we set VM_MTE_ALLOWED in shmem_mmap() when the mmap hook is activated in mmap_region(). The function that checks that, if VM_MTE is set, VM_MTE_ALLOWED is also set is the arm64 implementation of arch_validate_flags(). Unfortunately, we intend to refactor mmap_region() to perform this check earlier, meaning that in the case of a shmem backing we will not have invoked shmem_mmap() yet, causing the mapping to fail spuriously. It is inappropriate to set this architecture-specific flag in general mm code anyway, so a sensible resolution of this issue is to instead move the check somewhere else. We resolve this by setting VM_MTE_ALLOWED much earlier in do_mmap(), via the arch_calc_vm_flag_bits() call. This is an appropriate place to do this as we already check for the MAP_ANONYMOUS case here, and the shmem file case is simply a variant of the same idea - we permit RAM-backed memory. This requires a modification to the arch_calc_vm_flag_bits() signature to pass in a pointer to the struct file associated with the mapping, however this is not too egregious as this is only used by two architectures anyway - arm64 and parisc. So this patch performs this adjustment and removes the unnecessary assignment of VM_MTE_ALLOWED in shmem_mmap(). [akpm@linux-foundation.org: fix whitespace, per Catalin] Link: https://lkml.kernel.org/r/ec251b20ba1964fb64cf1607d2ad80c47f3873df.1730224667.git.lorenzo.stoakes@oracle.com Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails") Signed-off-by: Lorenzo Stoakes Suggested-by: Catalin Marinas Reported-by: Jann Horn Reviewed-by: Catalin Marinas Reviewed-by: Vlastimil Babka Cc: Andreas Larsson Cc: David S. Miller Cc: Helge Deller Cc: James E.J. Bottomley Cc: Liam R. Howlett Cc: Linus Torvalds Cc: Mark Brown Cc: Peter Xu Cc: Will Deacon Cc: Signed-off-by: Andrew Morton Signed-off-by: Lorenzo Stoakes Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/mman.h | 10 +++++++--- include/linux/mman.h | 7 ++++--- mm/mmap.c | 2 +- mm/nommu.c | 2 +- mm/shmem.c | 3 --- 5 files changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index 5966ee4a6154..ef35c52aabd6 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -3,6 +3,8 @@ #define __ASM_MMAN_H__ #include +#include +#include #include #include @@ -21,19 +23,21 @@ static inline unsigned long arch_calc_vm_prot_bits(unsigned long prot, } #define arch_calc_vm_prot_bits(prot, pkey) arch_calc_vm_prot_bits(prot, pkey) -static inline unsigned long arch_calc_vm_flag_bits(unsigned long flags) +static inline unsigned long arch_calc_vm_flag_bits(struct file *file, + unsigned long flags) { /* * Only allow MTE on anonymous mappings as these are guaranteed to be * backed by tags-capable memory. The vm_flags may be overridden by a * filesystem supporting MTE (RAM-based). */ - if (system_supports_mte() && (flags & MAP_ANONYMOUS)) + if (system_supports_mte() && + ((flags & MAP_ANONYMOUS) || shmem_file(file))) return VM_MTE_ALLOWED; return 0; } -#define arch_calc_vm_flag_bits(flags) arch_calc_vm_flag_bits(flags) +#define arch_calc_vm_flag_bits(file, flags) arch_calc_vm_flag_bits(file, flags) static inline bool arch_validate_prot(unsigned long prot, unsigned long addr __always_unused) diff --git a/include/linux/mman.h b/include/linux/mman.h index 58b3abd457a3..21ea08b919d9 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MMAN_H #define _LINUX_MMAN_H +#include #include #include @@ -90,7 +91,7 @@ static inline void vm_unacct_memory(long pages) #endif #ifndef arch_calc_vm_flag_bits -#define arch_calc_vm_flag_bits(flags) 0 +#define arch_calc_vm_flag_bits(file, flags) 0 #endif #ifndef arch_validate_prot @@ -147,12 +148,12 @@ calc_vm_prot_bits(unsigned long prot, unsigned long pkey) * Combine the mmap "flags" argument into "vm_flags" used internally. */ static inline unsigned long -calc_vm_flag_bits(unsigned long flags) +calc_vm_flag_bits(struct file *file, unsigned long flags) { return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) | _calc_vm_trans(flags, MAP_SYNC, VM_SYNC ) | - arch_calc_vm_flag_bits(flags); + arch_calc_vm_flag_bits(file, flags); } unsigned long vm_commit_limit(void); diff --git a/mm/mmap.c b/mm/mmap.c index 4bfec4df51c2..322677f61d30 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1316,7 +1316,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, * to. we assume access permissions have been handled by the open * of the memory object, so we don't do any here. */ - vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + vm_flags = calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(file, flags) | mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; if (flags & MAP_LOCKED) diff --git a/mm/nommu.c b/mm/nommu.c index e0428fa57526..859ba6bdeb9c 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -903,7 +903,7 @@ static unsigned long determine_vm_flags(struct file *file, { unsigned long vm_flags; - vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(flags); + vm_flags = calc_vm_prot_bits(prot, 0) | calc_vm_flag_bits(file, flags); /* vm_flags |= mm->def_flags; */ if (!(capabilities & NOMMU_MAP_DIRECT)) { diff --git a/mm/shmem.c b/mm/shmem.c index f7c08e169e42..2e6b7db7f14b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2306,9 +2306,6 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) if (ret) return ret; - /* arm64 - allow memory tagging on RAM-based files */ - vma->vm_flags |= VM_MTE_ALLOWED; - file_accessed(file); vma->vm_ops = &shmem_vm_ops; return 0; From 52c81fd0f5a8bf8032687b94ccf00d13b44cc5c8 Mon Sep 17 00:00:00 2001 From: Lorenzo Stoakes Date: Mon, 18 Nov 2024 16:17:28 +0000 Subject: [PATCH 65/76] mm: resolve faulty mmap_region() error path behaviour [ Upstream commit 5de195060b2e251a835f622759550e6202167641 ] The mmap_region() function is somewhat terrifying, with spaghetti-like control flow and numerous means by which issues can arise and incomplete state, memory leaks and other unpleasantness can occur. A large amount of the complexity arises from trying to handle errors late in the process of mapping a VMA, which forms the basis of recently observed issues with resource leaks and observable inconsistent state. Taking advantage of previous patches in this series we move a number of checks earlier in the code, simplifying things by moving the core of the logic into a static internal function __mmap_region(). Doing this allows us to perform a number of checks up front before we do any real work, and allows us to unwind the writable unmap check unconditionally as required and to perform a CONFIG_DEBUG_VM_MAPLE_TREE validation unconditionally also. We move a number of things here: 1. We preallocate memory for the iterator before we call the file-backed memory hook, allowing us to exit early and avoid having to perform complicated and error-prone close/free logic. We carefully free iterator state on both success and error paths. 2. The enclosing mmap_region() function handles the mapping_map_writable() logic early. Previously the logic had the mapping_map_writable() at the point of mapping a newly allocated file-backed VMA, and a matching mapping_unmap_writable() on success and error paths. We now do this unconditionally if this is a file-backed, shared writable mapping. If a driver changes the flags to eliminate VM_MAYWRITE, however doing so does not invalidate the seal check we just performed, and we in any case always decrement the counter in the wrapper. We perform a debug assert to ensure a driver does not attempt to do the opposite. 3. We also move arch_validate_flags() up into the mmap_region() function. This is only relevant on arm64 and sparc64, and the check is only meaningful for SPARC with ADI enabled. We explicitly add a warning for this arch if a driver invalidates this check, though the code ought eventually to be fixed to eliminate the need for this. With all of these measures in place, we no longer need to explicitly close the VMA on error paths, as we place all checks which might fail prior to a call to any driver mmap hook. This eliminates an entire class of errors, makes the code easier to reason about and more robust. Link: https://lkml.kernel.org/r/6e0becb36d2f5472053ac5d544c0edfe9b899e25.1730224667.git.lorenzo.stoakes@oracle.com Fixes: deb0f6562884 ("mm/mmap: undo ->mmap() when arch_validate_flags() fails") Signed-off-by: Lorenzo Stoakes Reported-by: Jann Horn Reviewed-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Tested-by: Mark Brown Cc: Andreas Larsson Cc: Catalin Marinas Cc: David S. Miller Cc: Helge Deller Cc: James E.J. Bottomley Cc: Linus Torvalds Cc: Peter Xu Cc: Will Deacon Cc: Signed-off-by: Andrew Morton Signed-off-by: Lorenzo Stoakes Signed-off-by: Greg Kroah-Hartman --- mm/mmap.c | 104 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 57 insertions(+), 47 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 322677f61d30..9a9933ede542 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2652,7 +2652,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return do_mas_munmap(&mas, mm, start, len, uf, false); } -unsigned long mmap_region(struct file *file, unsigned long addr, +static unsigned long __mmap_region(struct file *file, unsigned long addr, unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, struct list_head *uf) { @@ -2750,26 +2750,28 @@ cannot_expand: vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; - if (file) { - if (vm_flags & VM_SHARED) { - error = mapping_map_writable(file->f_mapping); - if (error) - goto free_vma; - } + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + error = -ENOMEM; + goto free_vma; + } + if (file) { vma->vm_file = get_file(file); error = mmap_file(file, vma); if (error) - goto unmap_and_free_vma; + goto unmap_and_free_file_vma; + + /* Drivers cannot alter the address of the VMA. */ + WARN_ON_ONCE(addr != vma->vm_start); /* - * Expansion is handled above, merging is handled below. - * Drivers should not alter the address of the VMA. + * Drivers should not permit writability when previously it was + * disallowed. */ - if (WARN_ON((addr != vma->vm_start))) { - error = -EINVAL; - goto close_and_free_vma; - } + VM_WARN_ON_ONCE(vm_flags != vma->vm_flags && + !(vm_flags & VM_MAYWRITE) && + (vma->vm_flags & VM_MAYWRITE)); + mas_reset(&mas); /* @@ -2792,7 +2794,8 @@ cannot_expand: vma = merge; /* Update vm_flags to pick up the change. */ vm_flags = vma->vm_flags; - goto unmap_writable; + mas_destroy(&mas); + goto file_expanded; } } @@ -2800,31 +2803,15 @@ cannot_expand: } else if (vm_flags & VM_SHARED) { error = shmem_zero_setup(vma); if (error) - goto free_vma; + goto free_iter_vma; } else { vma_set_anonymous(vma); } - /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { - error = -ENOMEM; - if (file) - goto close_and_free_vma; - else if (vma->vm_file) - goto unmap_and_free_vma; - else - goto free_vma; - } +#ifdef CONFIG_SPARC64 + /* TODO: Fix SPARC ADI! */ + WARN_ON_ONCE(!arch_validate_flags(vm_flags)); +#endif if (vma->vm_file) i_mmap_lock_write(vma->vm_file->f_mapping); @@ -2847,10 +2834,7 @@ cannot_expand: */ khugepaged_enter_vma(vma, vma->vm_flags); - /* Once vma denies write, undo our temporary denial count */ -unmap_writable: - if (file && vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); +file_expanded: file = vma->vm_file; expanded: perf_event_mmap(vma); @@ -2879,28 +2863,54 @@ expanded: vma_set_page_prot(vma); - validate_mm(mm); return addr; -close_and_free_vma: - vma_close(vma); -unmap_and_free_vma: +unmap_and_free_file_vma: fput(vma->vm_file); vma->vm_file = NULL; /* Undo any partial mapping done by a device driver. */ unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); - if (file && (vm_flags & VM_SHARED)) - mapping_unmap_writable(file->f_mapping); +free_iter_vma: + mas_destroy(&mas); free_vma: vm_area_free(vma); unacct_error: if (charged) vm_unacct_memory(charged); - validate_mm(mm); return error; } +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + unsigned long ret; + bool writable_file_mapping = false; + + /* Allow architectures to sanity-check the vm_flags. */ + if (!arch_validate_flags(vm_flags)) + return -EINVAL; + + /* Map writable and ensure this isn't a sealed memfd. */ + if (file && (vm_flags & VM_SHARED)) { + int error = mapping_map_writable(file->f_mapping); + + if (error) + return error; + writable_file_mapping = true; + } + + ret = __mmap_region(file, addr, len, vm_flags, pgoff, uf); + + /* Clear our write mapping regardless of error. */ + if (writable_file_mapping) + mapping_unmap_writable(file->f_mapping); + + validate_mm(current->mm); + return ret; +} + static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; From 2f2d48b6247ae3001f83c98730b3cce475cb2927 Mon Sep 17 00:00:00 2001 From: "Lin.Cao" Date: Wed, 25 Oct 2023 11:32:41 +0800 Subject: [PATCH 66/76] drm/amd: check num of link levels when update pcie param commit 406e8845356d18bdf3d3a23b347faf67706472ec upstream. In SR-IOV environment, the value of pcie_table->num_of_link_levels will be 0, and num_of_levels - 1 will cause array index out of bounds Signed-off-by: Lin.Cao Acked-by: Jingwen Chen Signed-off-by: Alex Deucher [ Resolve minor conflicts to fix CVE-2023-52812 ] Signed-off-by: Bin Lan Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c index 3aab1caed2ac..e159f715c1c2 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c +++ b/drivers/gpu/drm/amd/pm/swsmu/smu13/smu_v13_0.c @@ -2498,6 +2498,9 @@ int smu_v13_0_update_pcie_parameters(struct smu_context *smu, uint32_t smu_pcie_arg; int ret, i; + if (!num_of_levels) + return 0; + if (!amdgpu_device_pcie_dynamic_switching_supported()) { if (pcie_table->pcie_gen[num_of_levels - 1] < pcie_gen_cap) pcie_gen_cap = pcie_table->pcie_gen[num_of_levels - 1]; From c2dcdaa27d0a00bd8ac3a415d7f4c811631d4f5c Mon Sep 17 00:00:00 2001 From: Eli Billauer Date: Sun, 30 Oct 2022 11:42:09 +0200 Subject: [PATCH 67/76] char: xillybus: Prevent use-after-free due to race condition commit 282a4b71816b6076029017a7bab3a9dcee12a920 upstream. The driver for XillyUSB devices maintains a kref reference count on each xillyusb_dev structure, which represents a physical device. This reference count reaches zero when the device has been disconnected and there are no open file descriptors that are related to the device. When this occurs, kref_put() calls cleanup_dev(), which clears up the device's data, including the structure itself. However, when xillyusb_open() is called, this reference count becomes tricky: This function needs to obtain the xillyusb_dev structure that relates to the inode's major and minor (as there can be several such). xillybus_find_inode() (which is defined in xillybus_class.c) is called for this purpose. xillybus_find_inode() holds a mutex that is global in xillybus_class.c to protect the list of devices, and releases this mutex before returning. As a result, nothing protects the xillyusb_dev's reference counter from being decremented to zero before xillyusb_open() increments it on its own behalf. Hence the structure can be freed due to a rare race condition. To solve this, a mutex is added. It is locked by xillyusb_open() before the call to xillybus_find_inode() and is released only after the kref counter has been incremented on behalf of the newly opened inode. This protects the kref reference counters of all xillyusb_dev structs from being decremented by xillyusb_disconnect() during this time segment, as the call to kref_put() in this function is done with the same lock held. There is no need to hold the lock on other calls to kref_put(), because if xillybus_find_inode() finds a struct, xillyusb_disconnect() has not made the call to remove it, and hence not made its call to kref_put(), which takes place afterwards. Hence preventing xillyusb_disconnect's call to kref_put() is enough to ensure that the reference doesn't reach zero before it's incremented by xillyusb_open(). It would have been more natural to increment the reference count in xillybus_find_inode() of course, however this function is also called by Xillybus' driver for PCIe / OF, which registers a completely different structure. Therefore, xillybus_find_inode() treats these structures as void pointers, and accordingly can't make any changes. Reported-by: Hyunwoo Kim Suggested-by: Alan Stern Signed-off-by: Eli Billauer Link: https://lore.kernel.org/r/20221030094209.65916-1-eli.billauer@gmail.com Signed-off-by: Bin Lan Signed-off-by: Greg Kroah-Hartman --- drivers/char/xillybus/xillyusb.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/drivers/char/xillybus/xillyusb.c b/drivers/char/xillybus/xillyusb.c index 3a2a0fb3d928..45771b1a3716 100644 --- a/drivers/char/xillybus/xillyusb.c +++ b/drivers/char/xillybus/xillyusb.c @@ -185,6 +185,14 @@ struct xillyusb_dev { struct mutex process_in_mutex; /* synchronize wakeup_all() */ }; +/* + * kref_mutex is used in xillyusb_open() to prevent the xillyusb_dev + * struct from being freed during the gap between being found by + * xillybus_find_inode() and having its reference count incremented. + */ + +static DEFINE_MUTEX(kref_mutex); + /* FPGA to host opcodes */ enum { OPCODE_DATA = 0, @@ -1234,9 +1242,16 @@ static int xillyusb_open(struct inode *inode, struct file *filp) int rc; int index; + mutex_lock(&kref_mutex); + rc = xillybus_find_inode(inode, (void **)&xdev, &index); - if (rc) + if (rc) { + mutex_unlock(&kref_mutex); return rc; + } + + kref_get(&xdev->kref); + mutex_unlock(&kref_mutex); chan = &xdev->channels[index]; filp->private_data = chan; @@ -1272,8 +1287,6 @@ static int xillyusb_open(struct inode *inode, struct file *filp) ((filp->f_mode & FMODE_WRITE) && chan->open_for_write)) goto unmutex_fail; - kref_get(&xdev->kref); - if (filp->f_mode & FMODE_READ) chan->open_for_read = 1; @@ -1410,6 +1423,7 @@ unopen: return rc; unmutex_fail: + kref_put(&xdev->kref, cleanup_dev); mutex_unlock(&chan->lock); return rc; } @@ -2244,7 +2258,9 @@ static void xillyusb_disconnect(struct usb_interface *interface) xdev->dev = NULL; + mutex_lock(&kref_mutex); kref_put(&xdev->kref, cleanup_dev); + mutex_unlock(&kref_mutex); } static struct usb_driver xillyusb_driver = { From af4040457d3465b8aa297cbeca341c506bd7cd01 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sun, 14 Jan 2024 10:00:59 +0100 Subject: [PATCH 68/76] null_blk: Remove usage of the deprecated ida_simple_xx() API commit 95931a245b44ee04f3359ec432e73614d44d8b38 upstream. ida_alloc() and ida_free() should be preferred to the deprecated ida_simple_get() and ida_simple_remove(). This is less verbose. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/bf257b1078475a415cdc3344c6a750842946e367.1705222845.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- drivers/block/null_blk/main.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index 4d78b5583dc6..f58778b57375 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -1764,7 +1764,7 @@ static void null_del_dev(struct nullb *nullb) dev = nullb->dev; - ida_simple_remove(&nullb_indexes, nullb->index); + ida_free(&nullb_indexes, nullb->index); list_del_init(&nullb->list); @@ -2103,7 +2103,7 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); mutex_lock(&lock); - rv = ida_simple_get(&nullb_indexes, 0, 0, GFP_KERNEL); + rv = ida_alloc(&nullb_indexes, GFP_KERNEL); if (rv < 0) { mutex_unlock(&lock); goto out_cleanup_zone; From 1d4c8baef435c98e8d5aa7027dc5a9f70834ba16 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Thu, 23 May 2024 23:39:34 +0800 Subject: [PATCH 69/76] null_blk: fix null-ptr-dereference while configuring 'power' and 'submit_queues' commit a2db328b0839312c169eb42746ec46fc1ab53ed2 upstream. Writing 'power' and 'submit_queues' concurrently will trigger kernel panic: Test script: modprobe null_blk nr_devices=0 mkdir -p /sys/kernel/config/nullb/nullb0 while true; do echo 1 > submit_queues; echo 4 > submit_queues; done & while true; do echo 1 > power; echo 0 > power; done Test result: BUG: kernel NULL pointer dereference, address: 0000000000000148 Oops: 0000 [#1] PREEMPT SMP RIP: 0010:__lock_acquire+0x41d/0x28f0 Call Trace: lock_acquire+0x121/0x450 down_write+0x5f/0x1d0 simple_recursive_removal+0x12f/0x5c0 blk_mq_debugfs_unregister_hctxs+0x7c/0x100 blk_mq_update_nr_hw_queues+0x4a3/0x720 nullb_update_nr_hw_queues+0x71/0xf0 [null_blk] nullb_device_submit_queues_store+0x79/0xf0 [null_blk] configfs_write_iter+0x119/0x1e0 vfs_write+0x326/0x730 ksys_write+0x74/0x150 This is because del_gendisk() can concurrent with blk_mq_update_nr_hw_queues(): nullb_device_power_store nullb_apply_submit_queues null_del_dev del_gendisk nullb_update_nr_hw_queues if (!dev->nullb) // still set while gendisk is deleted return 0 blk_mq_update_nr_hw_queues dev->nullb = NULL Fix this problem by resuing the global mutex to protect nullb_device_power_store() and nullb_update_nr_hw_queues() from configfs. Fixes: 45919fbfe1c4 ("null_blk: Enable modifying 'submit_queues' after an instance has been configured") Reported-and-tested-by: Yi Zhang Closes: https://lore.kernel.org/all/CAHj4cs9LgsHLnjg8z06LQ3Pr5cax-+Ps+xT7AP7TPnEjStuwZA@mail.gmail.com/ Signed-off-by: Yu Kuai Reviewed-by: Zhu Yanjun Link: https://lore.kernel.org/r/20240523153934.1937851-1-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- drivers/block/null_blk/main.c | 40 +++++++++++++++++++++++------------ 1 file changed, 26 insertions(+), 14 deletions(-) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index f58778b57375..e838eed4aacf 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -392,13 +392,25 @@ static int nullb_update_nr_hw_queues(struct nullb_device *dev, static int nullb_apply_submit_queues(struct nullb_device *dev, unsigned int submit_queues) { - return nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); + int ret; + + mutex_lock(&lock); + ret = nullb_update_nr_hw_queues(dev, submit_queues, dev->poll_queues); + mutex_unlock(&lock); + + return ret; } static int nullb_apply_poll_queues(struct nullb_device *dev, unsigned int poll_queues) { - return nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); + int ret; + + mutex_lock(&lock); + ret = nullb_update_nr_hw_queues(dev, dev->submit_queues, poll_queues); + mutex_unlock(&lock); + + return ret; } NULLB_DEVICE_ATTR(size, ulong, NULL); @@ -444,28 +456,31 @@ static ssize_t nullb_device_power_store(struct config_item *item, if (ret < 0) return ret; + ret = count; + mutex_lock(&lock); if (!dev->power && newp) { if (test_and_set_bit(NULLB_DEV_FL_UP, &dev->flags)) - return count; + goto out; + ret = null_add_dev(dev); if (ret) { clear_bit(NULLB_DEV_FL_UP, &dev->flags); - return ret; + goto out; } set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { - mutex_lock(&lock); dev->power = newp; null_del_dev(dev->nullb); - mutex_unlock(&lock); } clear_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); } - return count; +out: + mutex_unlock(&lock); + return ret; } CONFIGFS_ATTR(nullb_device_, power); @@ -2102,15 +2117,12 @@ static int null_add_dev(struct nullb_device *dev) blk_queue_flag_set(QUEUE_FLAG_NONROT, nullb->q); blk_queue_flag_clear(QUEUE_FLAG_ADD_RANDOM, nullb->q); - mutex_lock(&lock); rv = ida_alloc(&nullb_indexes, GFP_KERNEL); - if (rv < 0) { - mutex_unlock(&lock); + if (rv < 0) goto out_cleanup_zone; - } + nullb->index = rv; dev->index = rv; - mutex_unlock(&lock); blk_queue_logical_block_size(nullb->q, dev->blocksize); blk_queue_physical_block_size(nullb->q, dev->blocksize); @@ -2134,9 +2146,7 @@ static int null_add_dev(struct nullb_device *dev) if (rv) goto out_ida_free; - mutex_lock(&lock); list_add_tail(&nullb->list, &nullb_list); - mutex_unlock(&lock); pr_info("disk %s created\n", nullb->disk_name); @@ -2185,7 +2195,9 @@ static int null_create_dev(void) if (!dev) return -ENOMEM; + mutex_lock(&lock); ret = null_add_dev(dev); + mutex_unlock(&lock); if (ret) { null_free_dev(dev); return ret; From 777595da53aa11e7e2fa0332a2c4b0511086be39 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Mon, 27 May 2024 13:34:45 +0900 Subject: [PATCH 70/76] null_blk: Fix return value of nullb_device_power_store() commit d9ff882b54f99f96787fa3df7cd938966843c418 upstream. When powering on a null_blk device that is not already on, the return value ret that is initialized to be count is reused to check the return value of null_add_dev(), leading to nullb_device_power_store() to return null_add_dev() return value (0 on success) instead of "count". So make sure to set ret to be equal to count when there are no errors. Fixes: a2db328b0839 ("null_blk: fix null-ptr-dereference while configuring 'power' and 'submit_queues'") Signed-off-by: Damien Le Moal Reviewed-by: Yu Kuai Reviewed-by: Kanchan Joshi Link: https://lore.kernel.org/r/20240527043445.235267-1-dlemoal@kernel.org Signed-off-by: Jens Axboe Signed-off-by: Xiangyu Chen Signed-off-by: Greg Kroah-Hartman --- drivers/block/null_blk/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/block/null_blk/main.c b/drivers/block/null_blk/main.c index e838eed4aacf..e66cace433cb 100644 --- a/drivers/block/null_blk/main.c +++ b/drivers/block/null_blk/main.c @@ -470,6 +470,7 @@ static ssize_t nullb_device_power_store(struct config_item *item, set_bit(NULLB_DEV_FL_CONFIGURED, &dev->flags); dev->power = newp; + ret = count; } else if (dev->power && !newp) { if (test_and_clear_bit(NULLB_DEV_FL_UP, &dev->flags)) { dev->power = newp; From 00baca74fb5879e5f9034b6156671301f500f8ee Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Sat, 27 Jul 2024 20:22:52 +0200 Subject: [PATCH 71/76] parisc: fix a possible DMA corruption commit 7ae04ba36b381bffe2471eff3a93edced843240f upstream. ARCH_DMA_MINALIGN was defined as 16 - this is too small - it may be possible that two unrelated 16-byte allocations share a cache line. If one of these allocations is written using DMA and the other is written using cached write, the value that was written with DMA may be corrupted. This commit changes ARCH_DMA_MINALIGN to be 128 on PA20 and 32 on PA1.1 - that's the largest possible cache line size. As different parisc microarchitectures have different cache line size, we define arch_slab_minalign(), cache_line_size() and dma_get_cache_alignment() so that the kernel may tune slab cache parameters dynamically, based on the detected cache line size. Signed-off-by: Mikulas Patocka Cc: stable@vger.kernel.org Signed-off-by: Helge Deller Signed-off-by: Bin Lan Signed-off-by: Greg Kroah-Hartman --- arch/parisc/Kconfig | 1 + arch/parisc/include/asm/cache.h | 11 ++++++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 3341d4a42199..3a32b49d7ad0 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -18,6 +18,7 @@ config PARISC select ARCH_SUPPORTS_HUGETLBFS if PA20 select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_STACKWALK + select ARCH_HAS_CACHE_LINE_SIZE select ARCH_HAS_DEBUG_VM_PGTABLE select HAVE_RELIABLE_STACKTRACE select DMA_OPS diff --git a/arch/parisc/include/asm/cache.h b/arch/parisc/include/asm/cache.h index e23d06b51a20..91e753f08eaa 100644 --- a/arch/parisc/include/asm/cache.h +++ b/arch/parisc/include/asm/cache.h @@ -20,7 +20,16 @@ #define SMP_CACHE_BYTES L1_CACHE_BYTES -#define ARCH_DMA_MINALIGN L1_CACHE_BYTES +#ifdef CONFIG_PA20 +#define ARCH_DMA_MINALIGN 128 +#else +#define ARCH_DMA_MINALIGN 32 +#endif +#define ARCH_KMALLOC_MINALIGN 16 /* ldcw requires 16-byte alignment */ + +#define arch_slab_minalign() ((unsigned)dcache_stride) +#define cache_line_size() dcache_stride +#define dma_get_cache_alignment cache_line_size #define __read_mostly __section(".data..read_mostly") From 9b242c4232af20897e1eddea4836e66af36e51ce Mon Sep 17 00:00:00 2001 From: Eli Billauer Date: Thu, 17 Nov 2022 09:18:25 +0200 Subject: [PATCH 72/76] char: xillybus: Fix trivial bug with mutex commit c002f04c0bc79ec00d4beb75fb631d5bf37419bd upstream. @unit_mutex protects @unit from being freed, so obviously it should be released after @unit is used, and not before. This is a follow-up to commit 282a4b71816b ("char: xillybus: Prevent use-after-free due to race condition") which ensures, among others, the protection of @private_data after @unit_mutex has been released. Reported-by: Hyunwoo Kim Signed-off-by: Eli Billauer Link: https://lore.kernel.org/r/20221117071825.3942-1-eli.billauer@gmail.com Signed-off-by: Greg Kroah-Hartman --- drivers/char/xillybus/xillybus_class.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/char/xillybus/xillybus_class.c b/drivers/char/xillybus/xillybus_class.c index 0f238648dcfe..e9a288e61c15 100644 --- a/drivers/char/xillybus/xillybus_class.c +++ b/drivers/char/xillybus/xillybus_class.c @@ -227,14 +227,15 @@ int xillybus_find_inode(struct inode *inode, break; } - mutex_unlock(&unit_mutex); - - if (!unit) + if (!unit) { + mutex_unlock(&unit_mutex); return -ENODEV; + } *private_data = unit->private_data; *index = minor - unit->lowest_minor; + mutex_unlock(&unit_mutex); return 0; } EXPORT_SYMBOL(xillybus_find_inode); From a0b8fd37fd300d3d73254c4af6ab87f1a4593c9b Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Mon, 11 Nov 2024 00:17:34 +0100 Subject: [PATCH 73/76] net: Make copy_safe_from_sockptr() match documentation commit eb94b7bb10109a14a5431a67e5d8e31cfa06b395 upstream. copy_safe_from_sockptr() return copy_from_sockptr() return copy_from_sockptr_offset() return copy_from_user() copy_from_user() does not return an error on fault. Instead, it returns a number of bytes that were not copied. Have it handled. Patch has a side effect: it un-breaks garbage input handling of nfc_llcp_setsockopt() and mISDN's data_sock_setsockopt(). Fixes: 6309863b31dd ("net: add copy_safe_from_sockptr() helper") Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20241111-sockptr-copy-ret-fix-v1-1-a520083a93fb@rbox.co Signed-off-by: Jakub Kicinski Signed-off-by: Greg Kroah-Hartman --- include/linux/sockptr.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/sockptr.h b/include/linux/sockptr.h index 1c1a5d926b17..0eb3a2b1f81f 100644 --- a/include/linux/sockptr.h +++ b/include/linux/sockptr.h @@ -77,7 +77,9 @@ static inline int copy_safe_from_sockptr(void *dst, size_t ksize, { if (optlen < ksize) return -EINVAL; - return copy_from_sockptr(dst, optval, ksize); + if (copy_from_sockptr(dst, optval, ksize)) + return -EFAULT; + return 0; } static inline int copy_to_sockptr_offset(sockptr_t dst, size_t offset, From e4d90d63d385228b1e0bcf31cc15539bbbc28f7f Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Fri, 22 Nov 2024 15:37:35 +0100 Subject: [PATCH 74/76] Linux 6.1.119 Link: https://lore.kernel.org/r/20241120125809.623237564@linuxfoundation.org Tested-by: Mark Brown Tested-by: SeongJae Park Tested-by: Florian Fainelli Tested-by: Shuah Khan Tested-by: Ron Economos Tested-by: Pavel Machek (CIP) Tested-by: Salvatore Bonaccorso Tested-by: Hardik Garg hargar@linux.microsoft.com=0A= Tested-by: Jon Hunter Tested-by: kernelci.org bot Tested-by: Yann Sionneau Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ca304cece572..cc153c731405 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 6 PATCHLEVEL = 1 -SUBLEVEL = 118 +SUBLEVEL = 119 EXTRAVERSION = NAME = Curry Ramen From b7dd20a0306cc457bf7e97cb33f014f9e1c466c6 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 11 Dec 2024 15:57:47 +0000 Subject: [PATCH 75/76] ANDROID: fix build breakage in virtio_transport_common.c In commit 946c7600fa22 ("virtio/vsock: Fix accept_queue memory leak"), a change is made that breaks the build due to previous changes being reverted in the Android tree to preserve the abi. Fix this up by resolving the build error so that the real bugfix still works properly. Fixes: 946c7600fa22 ("virtio/vsock: Fix accept_queue memory leak") Change-Id: Idd5754459814e3e7db7be1581de2af920b402f5a Signed-off-by: Greg Kroah-Hartman --- net/vmw_vsock/virtio_transport_common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index ff2defb5d830..e2cb532497ea 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1201,7 +1201,7 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, * Subsequent enqueues would lead to a memory leak. */ if (sk->sk_shutdown == SHUTDOWN_MASK) { - virtio_transport_reset_no_sock(t, skb); + virtio_transport_reset_no_sock(t, pkt); return -ESHUTDOWN; } From 73b6ba5d327e0f3b383262910f52d4555a91d7a2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 12 Dec 2024 07:07:36 +0000 Subject: [PATCH 76/76] ANDROID: fix up crc problems 6.1.119 In commit 8fad7b004be8 ("mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling"), new .h files are included which break the CRC generation of some symbols. Fix this up by only including the .h files for when a real build happens to preserve the ABI. Fixes: 8fad7b004be8 ("mm: refactor arch_calc_vm_flag_bits() and arm64 MTE handling") Change-Id: I1f0990d75a4813c4f2dcb01ead3396cfbc7c452c Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/mman.h | 2 ++ include/linux/mman.h | 2 ++ 2 files changed, 4 insertions(+) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index ef35c52aabd6..7ad015ed5c21 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -3,8 +3,10 @@ #define __ASM_MMAN_H__ #include +#ifndef __GENKSYMS__ #include #include +#endif #include #include diff --git a/include/linux/mman.h b/include/linux/mman.h index 21ea08b919d9..4c96a118e507 100644 --- a/include/linux/mman.h +++ b/include/linux/mman.h @@ -2,7 +2,9 @@ #ifndef _LINUX_MMAN_H #define _LINUX_MMAN_H +#ifndef __GENKSYMS__ #include +#endif #include #include