From df9692b8a31902d8f0ae9e3f81cb4465431e22cc Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sat, 23 Jul 2022 17:22:47 +0200 Subject: [PATCH 01/31] x86/speculation: Make all RETbleed mitigations 64-bit only commit b648ab487f31bc4c38941bc770ea97fe394304bb upstream. The mitigations for RETBleed are currently ineffective on x86_32 since entry_32.S does not use the required macros. However, for an x86_32 target, the kconfig symbols for them are still enabled by default and /sys/devices/system/cpu/vulnerabilities/retbleed will wrongly report that mitigations are in place. Make all of these symbols depend on X86_64, and only enable RETHUNK by default on X86_64. Fixes: f43b9876e857 ("x86/retbleed: Add fine grained Kconfig knobs") Signed-off-by: Ben Hutchings Signed-off-by: Borislav Petkov Cc: Link: https://lore.kernel.org/r/YtwSR3NNsWp1ohfV@decadent.org.uk [bwh: Backported to 5.10/5.15/5.18: adjust context] Signed-off-by: Ben Hutchings Signed-off-by: Greg Kroah-Hartman --- arch/x86/Kconfig | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index a170cfdae2a7..fe6981a38795 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -2427,7 +2427,7 @@ config RETPOLINE config RETHUNK bool "Enable return-thunks" depends on RETPOLINE && CC_HAS_RETURN_THUNK - default y + default y if X86_64 help Compile the kernel with the return-thunks compiler option to guard against kernel-to-user data leaks by avoiding return speculation. @@ -2436,21 +2436,21 @@ config RETHUNK config CPU_UNRET_ENTRY bool "Enable UNRET on kernel entry" - depends on CPU_SUP_AMD && RETHUNK + depends on CPU_SUP_AMD && RETHUNK && X86_64 default y help Compile the kernel with support for the retbleed=unret mitigation. config CPU_IBPB_ENTRY bool "Enable IBPB on kernel entry" - depends on CPU_SUP_AMD + depends on CPU_SUP_AMD && X86_64 default y help Compile the kernel with support for the retbleed=ibpb mitigation. config CPU_IBRS_ENTRY bool "Enable IBRS on kernel entry" - depends on CPU_SUP_INTEL + depends on CPU_SUP_INTEL && X86_64 default y help Compile the kernel with support for the spectre_v2=ibrs mitigation. From 119debdb9f25b1b24f6c52aa9b8a4aa4b51e2465 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 1 Aug 2022 17:48:50 +0300 Subject: [PATCH 02/31] selftests/bpf: Extend verifier and bpf_sock tests for dst_port loads commit 8f50f16ff39dd4e2d43d1548ca66925652f8aff7 upstream. Add coverage to the verifier tests and tests for reading bpf_sock fields to ensure that 32-bit, 16-bit, and 8-bit loads from dst_port field are allowed only at intended offsets and produce expected values. While 16-bit and 8-bit access to dst_port field is straight-forward, 32-bit wide loads need be allowed and produce a zero-padded 16-bit value for backward compatibility. Signed-off-by: Jakub Sitnicki Link: https://lore.kernel.org/r/20220130115518.213259-3-jakub@cloudflare.com Signed-off-by: Alexei Starovoitov Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- tools/include/uapi/linux/bpf.h | 3 +- .../selftests/bpf/prog_tests/sock_fields.c | 58 +++++++++---- .../selftests/bpf/progs/test_sock_fields.c | 41 ++++++++++ tools/testing/selftests/bpf/verifier/sock.c | 81 ++++++++++++++++++- 4 files changed, 162 insertions(+), 21 deletions(-) diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h index e2c8f946c541..8330e3ca8fbf 100644 --- a/tools/include/uapi/linux/bpf.h +++ b/tools/include/uapi/linux/bpf.h @@ -5347,7 +5347,8 @@ struct bpf_sock { __u32 src_ip4; __u32 src_ip6[4]; __u32 src_port; /* host byte order */ - __u32 dst_port; /* network byte order */ + __be16 dst_port; /* network byte order */ + __u16 :16; /* zero padding */ __u32 dst_ip4; __u32 dst_ip6[4]; __u32 state; diff --git a/tools/testing/selftests/bpf/prog_tests/sock_fields.c b/tools/testing/selftests/bpf/prog_tests/sock_fields.c index 577d619fb07e..197ec1d1b702 100644 --- a/tools/testing/selftests/bpf/prog_tests/sock_fields.c +++ b/tools/testing/selftests/bpf/prog_tests/sock_fields.c @@ -1,9 +1,11 @@ // SPDX-License-Identifier: GPL-2.0 /* Copyright (c) 2019 Facebook */ +#define _GNU_SOURCE #include #include #include +#include #include #include #include @@ -21,6 +23,7 @@ enum bpf_linum_array_idx { EGRESS_LINUM_IDX, INGRESS_LINUM_IDX, + READ_SK_DST_PORT_LINUM_IDX, __NR_BPF_LINUM_ARRAY_IDX, }; @@ -43,8 +46,16 @@ static __u64 child_cg_id; static int linum_map_fd; static __u32 duration; -static __u32 egress_linum_idx = EGRESS_LINUM_IDX; -static __u32 ingress_linum_idx = INGRESS_LINUM_IDX; +static bool create_netns(void) +{ + if (!ASSERT_OK(unshare(CLONE_NEWNET), "create netns")) + return false; + + if (!ASSERT_OK(system("ip link set dev lo up"), "bring up lo")) + return false; + + return true; +} static void print_sk(const struct bpf_sock *sk, const char *prefix) { @@ -92,19 +103,24 @@ static void check_result(void) { struct bpf_tcp_sock srv_tp, cli_tp, listen_tp; struct bpf_sock srv_sk, cli_sk, listen_sk; - __u32 ingress_linum, egress_linum; + __u32 idx, ingress_linum, egress_linum, linum; int err; - err = bpf_map_lookup_elem(linum_map_fd, &egress_linum_idx, - &egress_linum); + idx = EGRESS_LINUM_IDX; + err = bpf_map_lookup_elem(linum_map_fd, &idx, &egress_linum); CHECK(err < 0, "bpf_map_lookup_elem(linum_map_fd)", "err:%d errno:%d\n", err, errno); - err = bpf_map_lookup_elem(linum_map_fd, &ingress_linum_idx, - &ingress_linum); + idx = INGRESS_LINUM_IDX; + err = bpf_map_lookup_elem(linum_map_fd, &idx, &ingress_linum); CHECK(err < 0, "bpf_map_lookup_elem(linum_map_fd)", "err:%d errno:%d\n", err, errno); + idx = READ_SK_DST_PORT_LINUM_IDX; + err = bpf_map_lookup_elem(linum_map_fd, &idx, &linum); + ASSERT_OK(err, "bpf_map_lookup_elem(linum_map_fd, READ_SK_DST_PORT_IDX)"); + ASSERT_EQ(linum, 0, "failure in read_sk_dst_port on line"); + memcpy(&srv_sk, &skel->bss->srv_sk, sizeof(srv_sk)); memcpy(&srv_tp, &skel->bss->srv_tp, sizeof(srv_tp)); memcpy(&cli_sk, &skel->bss->cli_sk, sizeof(cli_sk)); @@ -263,7 +279,7 @@ static void test(void) char buf[DATA_LEN]; /* Prepare listen_fd */ - listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); + listen_fd = start_server(AF_INET6, SOCK_STREAM, "::1", 0xcafe, 0); /* start_server() has logged the error details */ if (CHECK_FAIL(listen_fd == -1)) goto done; @@ -331,8 +347,12 @@ done: void test_sock_fields(void) { - struct bpf_link *egress_link = NULL, *ingress_link = NULL; int parent_cg_fd = -1, child_cg_fd = -1; + struct bpf_link *link; + + /* Use a dedicated netns to have a fixed listen port */ + if (!create_netns()) + return; /* Create a cgroup, get fd, and join it */ parent_cg_fd = test__join_cgroup(PARENT_CGROUP); @@ -353,15 +373,20 @@ void test_sock_fields(void) if (CHECK(!skel, "test_sock_fields__open_and_load", "failed\n")) goto done; - egress_link = bpf_program__attach_cgroup(skel->progs.egress_read_sock_fields, - child_cg_fd); - if (!ASSERT_OK_PTR(egress_link, "attach_cgroup(egress)")) + link = bpf_program__attach_cgroup(skel->progs.egress_read_sock_fields, child_cg_fd); + if (!ASSERT_OK_PTR(link, "attach_cgroup(egress_read_sock_fields)")) goto done; + skel->links.egress_read_sock_fields = link; - ingress_link = bpf_program__attach_cgroup(skel->progs.ingress_read_sock_fields, - child_cg_fd); - if (!ASSERT_OK_PTR(ingress_link, "attach_cgroup(ingress)")) + link = bpf_program__attach_cgroup(skel->progs.ingress_read_sock_fields, child_cg_fd); + if (!ASSERT_OK_PTR(link, "attach_cgroup(ingress_read_sock_fields)")) goto done; + skel->links.ingress_read_sock_fields = link; + + link = bpf_program__attach_cgroup(skel->progs.read_sk_dst_port, child_cg_fd); + if (!ASSERT_OK_PTR(link, "attach_cgroup(read_sk_dst_port")) + goto done; + skel->links.read_sk_dst_port = link; linum_map_fd = bpf_map__fd(skel->maps.linum_map); sk_pkt_out_cnt_fd = bpf_map__fd(skel->maps.sk_pkt_out_cnt); @@ -370,8 +395,7 @@ void test_sock_fields(void) test(); done: - bpf_link__destroy(egress_link); - bpf_link__destroy(ingress_link); + test_sock_fields__detach(skel); test_sock_fields__destroy(skel); if (child_cg_fd >= 0) close(child_cg_fd); diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields.c b/tools/testing/selftests/bpf/progs/test_sock_fields.c index 7967348b11af..3e2e3ee51cc9 100644 --- a/tools/testing/selftests/bpf/progs/test_sock_fields.c +++ b/tools/testing/selftests/bpf/progs/test_sock_fields.c @@ -12,6 +12,7 @@ enum bpf_linum_array_idx { EGRESS_LINUM_IDX, INGRESS_LINUM_IDX, + READ_SK_DST_PORT_LINUM_IDX, __NR_BPF_LINUM_ARRAY_IDX, }; @@ -250,4 +251,44 @@ int ingress_read_sock_fields(struct __sk_buff *skb) return CG_OK; } +static __noinline bool sk_dst_port__load_word(struct bpf_sock *sk) +{ + __u32 *word = (__u32 *)&sk->dst_port; + return word[0] == bpf_htonl(0xcafe0000); +} + +static __noinline bool sk_dst_port__load_half(struct bpf_sock *sk) +{ + __u16 *half = (__u16 *)&sk->dst_port; + return half[0] == bpf_htons(0xcafe); +} + +static __noinline bool sk_dst_port__load_byte(struct bpf_sock *sk) +{ + __u8 *byte = (__u8 *)&sk->dst_port; + return byte[0] == 0xca && byte[1] == 0xfe; +} + +SEC("cgroup_skb/egress") +int read_sk_dst_port(struct __sk_buff *skb) +{ + __u32 linum, linum_idx; + struct bpf_sock *sk; + + linum_idx = READ_SK_DST_PORT_LINUM_IDX; + + sk = skb->sk; + if (!sk) + RET_LOG(); + + if (!sk_dst_port__load_word(sk)) + RET_LOG(); + if (!sk_dst_port__load_half(sk)) + RET_LOG(); + if (!sk_dst_port__load_byte(sk)) + RET_LOG(); + + return CG_OK; +} + char _license[] SEC("license") = "GPL"; diff --git a/tools/testing/selftests/bpf/verifier/sock.c b/tools/testing/selftests/bpf/verifier/sock.c index ce13ece08d51..8c224eac93df 100644 --- a/tools/testing/selftests/bpf/verifier/sock.c +++ b/tools/testing/selftests/bpf/verifier/sock.c @@ -121,7 +121,25 @@ .result = ACCEPT, }, { - "sk_fullsock(skb->sk): sk->dst_port [narrow load]", + "sk_fullsock(skb->sk): sk->dst_port [word load] (backward compatibility)", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_W, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port)), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): sk->dst_port [half load]", .insns = { BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), @@ -139,7 +157,7 @@ .result = ACCEPT, }, { - "sk_fullsock(skb->sk): sk->dst_port [load 2nd byte]", + "sk_fullsock(skb->sk): sk->dst_port [half load] (invalid)", .insns = { BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), @@ -149,7 +167,64 @@ BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), - BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 1), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid sock access", +}, +{ + "sk_fullsock(skb->sk): sk->dst_port [byte load]", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_0, offsetof(struct bpf_sock, dst_port)), + BPF_LDX_MEM(BPF_B, BPF_REG_2, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 1), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = ACCEPT, +}, +{ + "sk_fullsock(skb->sk): sk->dst_port [byte load] (invalid)", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_B, BPF_REG_0, BPF_REG_0, offsetof(struct bpf_sock, dst_port) + 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + }, + .prog_type = BPF_PROG_TYPE_CGROUP_SKB, + .result = REJECT, + .errstr = "invalid sock access", +}, +{ + "sk_fullsock(skb->sk): past sk->dst_port [half load] (invalid)", + .insns = { + BPF_LDX_MEM(BPF_DW, BPF_REG_1, BPF_REG_1, offsetof(struct __sk_buff, sk)), + BPF_JMP_IMM(BPF_JNE, BPF_REG_1, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_EMIT_CALL(BPF_FUNC_sk_fullsock), + BPF_JMP_IMM(BPF_JNE, BPF_REG_0, 0, 2), + BPF_MOV64_IMM(BPF_REG_0, 0), + BPF_EXIT_INSN(), + BPF_LDX_MEM(BPF_H, BPF_REG_0, BPF_REG_0, offsetofend(struct bpf_sock, dst_port)), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), }, From e889a4c440eb64296857b50a1fc2c9e57cc9c0d8 Mon Sep 17 00:00:00 2001 From: Jakub Sitnicki Date: Mon, 1 Aug 2022 17:48:51 +0300 Subject: [PATCH 03/31] selftests/bpf: Check dst_port only on the client socket commit 2d2202ba858c112b03f84d546e260c61425831a1 upstream. cgroup_skb/egress programs which sock_fields test installs process packets flying in both directions, from the client to the server, and in reverse direction. Recently added dst_port check relies on the fact that destination port (remote peer port) of the socket which sends the packet is known ahead of time. This holds true only for the client socket, which connects to the known server port. Filter out any traffic that is not egressing from the client socket in the BPF program that tests reading the dst_port. Fixes: 8f50f16ff39d ("selftests/bpf: Extend verifier and bpf_sock tests for dst_port loads") Signed-off-by: Jakub Sitnicki Signed-off-by: Daniel Borkmann Acked-by: Martin KaFai Lau Link: https://lore.kernel.org/bpf/20220317113920.1068535-3-jakub@cloudflare.com Signed-off-by: Ovidiu Panait Signed-off-by: Greg Kroah-Hartman --- tools/testing/selftests/bpf/progs/test_sock_fields.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/testing/selftests/bpf/progs/test_sock_fields.c b/tools/testing/selftests/bpf/progs/test_sock_fields.c index 3e2e3ee51cc9..43b31aa1fcf7 100644 --- a/tools/testing/selftests/bpf/progs/test_sock_fields.c +++ b/tools/testing/selftests/bpf/progs/test_sock_fields.c @@ -281,6 +281,10 @@ int read_sk_dst_port(struct __sk_buff *skb) if (!sk) RET_LOG(); + /* Ignore everything but the SYN from the client socket */ + if (sk->state != BPF_TCP_SYN_SENT) + return CG_OK; + if (!sk_dst_port__load_word(sk)) RET_LOG(); if (!sk_dst_port__load_half(sk)) From 7ad47f414b4095fa104daaf08c8c942308f5a5c4 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Thu, 23 Jun 2022 09:48:26 +0200 Subject: [PATCH 04/31] block: fix default IO priority handling again commit e589f46445960c274cc813a1cc8e2fc73b2a1849 upstream. Commit e70344c05995 ("block: fix default IO priority handling") introduced an inconsistency in get_current_ioprio() that tasks without IO context return IOPRIO_DEFAULT priority while tasks with freshly allocated IO context will return 0 (IOPRIO_CLASS_NONE/0) IO priority. Tasks without IO context used to be rare before 5a9d041ba2f6 ("block: move io_context creation into where it's needed") but after this commit they became common because now only BFQ IO scheduler setups task's IO context. Similar inconsistency is there for get_task_ioprio() so this inconsistency is now exposed to userspace and userspace will see different IO priority for tasks operating on devices with BFQ compared to devices without BFQ. Furthemore the changes done by commit e70344c05995 change the behavior when no IO priority is set for BFQ IO scheduler which is also documented in ioprio_set(2) manpage: "If no I/O scheduler has been set for a thread, then by default the I/O priority will follow the CPU nice value (setpriority(2)). In Linux kernels before version 2.6.24, once an I/O priority had been set using ioprio_set(), there was no way to reset the I/O scheduling behavior to the default. Since Linux 2.6.24, specifying ioprio as 0 can be used to reset to the default I/O scheduling behavior." So make sure we default to IOPRIO_CLASS_NONE as used to be the case before commit e70344c05995. Also cleanup alloc_io_context() to explicitely set this IO priority for the allocated IO context to avoid future surprises. Note that we tweak ioprio_best() to maintain ioprio_get(2) behavior and make this commit easily backportable. CC: stable@vger.kernel.org Fixes: e70344c05995 ("block: fix default IO priority handling") Reviewed-by: Damien Le Moal Tested-by: Damien Le Moal Signed-off-by: Jan Kara Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20220623074840.5960-1-jack@suse.cz Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-ioc.c | 1 + block/ioprio.c | 4 ++-- include/linux/ioprio.h | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 57299f860d41..90c05971f71e 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -265,6 +265,7 @@ int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); + ioc->ioprio = IOPRIO_DEFAULT; /* * Try to install. ioc shouldn't be installed if someone else diff --git a/block/ioprio.c b/block/ioprio.c index 6f01d35a5145..6c830154856f 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -189,9 +189,9 @@ out: int ioprio_best(unsigned short aprio, unsigned short bprio) { if (!ioprio_valid(aprio)) - aprio = IOPRIO_DEFAULT; + aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM); if (!ioprio_valid(bprio)) - bprio = IOPRIO_DEFAULT; + bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM); return min(aprio, bprio); } diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index 3f53bc27a19b..3d088a88f832 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -11,7 +11,7 @@ /* * Default IO priority. */ -#define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_BE_NORM) +#define IOPRIO_DEFAULT IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0) /* * Check that a priority value has a valid class. From 9894717519cc144827bf8a46b543363533e8c0fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?St=C3=A9phane=20Graber?= Date: Thu, 23 Jun 2022 16:45:52 -0500 Subject: [PATCH 05/31] tools/vm/slabinfo: Handle files in debugfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0c7e0d699ef1430d7f4cf12b4b1d097af58b5515 upstream. Commit 64dd68497be76 relocated and renamed the alloc_calls and free_calls files from /sys/kernel/slab/NAME/*_calls over to /sys/kernel/debug/slab/NAME/*_calls but didn't update the slabinfo tool with the new location. This change will now have slabinfo look at the new location (and filenames) with a fallback to the prior files. Fixes: 64dd68497be76 ("mm: slub: move sysfs slab alloc/free interfaces to debugfs") Cc: stable@vger.kernel.org Signed-off-by: Stéphane Graber Tested-by: Stéphane Graber Signed-off-by: Vlastimil Babka Signed-off-by: Greg Kroah-Hartman --- tools/vm/slabinfo.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/tools/vm/slabinfo.c b/tools/vm/slabinfo.c index 9b68658b6bb8..5b98f3ee58a5 100644 --- a/tools/vm/slabinfo.c +++ b/tools/vm/slabinfo.c @@ -233,6 +233,24 @@ static unsigned long read_slab_obj(struct slabinfo *s, const char *name) return l; } +static unsigned long read_debug_slab_obj(struct slabinfo *s, const char *name) +{ + char x[128]; + FILE *f; + size_t l; + + snprintf(x, 128, "/sys/kernel/debug/slab/%s/%s", s->name, name); + f = fopen(x, "r"); + if (!f) { + buffer[0] = 0; + l = 0; + } else { + l = fread(buffer, 1, sizeof(buffer), f); + buffer[l] = 0; + fclose(f); + } + return l; +} /* * Put a size string together @@ -409,14 +427,18 @@ static void show_tracking(struct slabinfo *s) { printf("\n%s: Kernel object allocation\n", s->name); printf("-----------------------------------------------------------------------\n"); - if (read_slab_obj(s, "alloc_calls")) + if (read_debug_slab_obj(s, "alloc_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "alloc_calls")) printf("%s", buffer); else printf("No Data\n"); printf("\n%s: Kernel object freeing\n", s->name); printf("------------------------------------------------------------------------\n"); - if (read_slab_obj(s, "free_calls")) + if (read_debug_slab_obj(s, "free_traces")) + printf("%s", buffer); + else if (read_slab_obj(s, "free_calls")) printf("%s", buffer); else printf("No Data\n"); From 3a5fab5c45056a8d8c33880b113ae40cf23500e8 Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Thu, 7 Jul 2022 20:09:52 +0200 Subject: [PATCH 06/31] ACPI: video: Force backlight native for some TongFang devices commit c752089f7cf5b5800c6ace4cdd1a8351ee78a598 upstream. The TongFang PF5PU1G, PF4NU1F, PF5NU1G, and PF5LUXG/TUXEDO BA15 Gen10, Pulse 14/15 Gen1, and Pulse 15 Gen2 have the same problem as the Clevo NL5xRU and NL5xNU/TUXEDO Aura 15 Gen1 and Gen2: They have a working native and video interface. However the default detection mechanism first registers the video interface before unregistering it again and switching to the native interface during boot. This results in a dangling SBIOS request for backlight change for some reason, causing the backlight to switch to ~2% once per boot on the first power cord connect or disconnect event. Setting the native interface explicitly circumvents this buggy behaviour by avoiding the unregistering process. Signed-off-by: Werner Sembach Cc: All applicable Reviewed-by: Hans de Goede Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/video_detect.c | 51 ++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 7b9793cb55c5..3021254597f9 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -484,7 +484,56 @@ static const struct dmi_system_id video_detect_dmi_table[] = { DMI_MATCH(DMI_BOARD_NAME, "NL5xNU"), }, }, - + /* + * The TongFang PF5PU1G, PF4NU1F, PF5NU1G, and PF5LUXG/TUXEDO BA15 Gen10, + * Pulse 14/15 Gen1, and Pulse 15 Gen2 have the same problem as the Clevo + * NL5xRU and NL5xNU/TUXEDO Aura 15 Gen1 and Gen2. See the description + * above. + */ + { + .callback = video_detect_force_native, + .ident = "TongFang PF5PU1G", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "PF5PU1G"), + }, + }, + { + .callback = video_detect_force_native, + .ident = "TongFang PF4NU1F", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "PF4NU1F"), + }, + }, + { + .callback = video_detect_force_native, + .ident = "TongFang PF4NU1F", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "TUXEDO"), + DMI_MATCH(DMI_BOARD_NAME, "PULSE1401"), + }, + }, + { + .callback = video_detect_force_native, + .ident = "TongFang PF5NU1G", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "PF5NU1G"), + }, + }, + { + .callback = video_detect_force_native, + .ident = "TongFang PF5NU1G", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "TUXEDO"), + DMI_MATCH(DMI_BOARD_NAME, "PULSE1501"), + }, + }, + { + .callback = video_detect_force_native, + .ident = "TongFang PF5LUXG", + .matches = { + DMI_MATCH(DMI_BOARD_NAME, "PF5LUXG"), + }, + }, /* * Desktops which falsely report a backlight and which our heuristics * for this do not catch. From e7170bcda6137ed682c59da713c0af44d8f706e5 Mon Sep 17 00:00:00 2001 From: Werner Sembach Date: Thu, 7 Jul 2022 20:09:53 +0200 Subject: [PATCH 07/31] ACPI: video: Shortening quirk list by identifying Clevo by board_name only commit f0341e67b3782603737f7788e71bd3530012a4f4 upstream. Taking a recent change in the i8042 quirklist to this one: Clevo board_names are somewhat unique, and if not: The generic Board_-/Sys_Vendor string "Notebook" doesn't help much anyway. So identifying the devices just by the board_name helps keeping the list significantly shorter and might even hit more devices requiring the fix. Signed-off-by: Werner Sembach Fixes: c844d22fe0c0 ("ACPI: video: Force backlight native for Clevo NL5xRU and NL5xNU") Cc: All applicable Reviewed-by: Hans de Goede Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/video_detect.c | 34 ---------------------------------- 1 file changed, 34 deletions(-) diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index 3021254597f9..e39d59ad6496 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -424,23 +424,6 @@ static const struct dmi_system_id video_detect_dmi_table[] = { .callback = video_detect_force_native, .ident = "Clevo NL5xRU", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "TUXEDO"), - DMI_MATCH(DMI_BOARD_NAME, "NL5xRU"), - }, - }, - { - .callback = video_detect_force_native, - .ident = "Clevo NL5xRU", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "SchenkerTechnologiesGmbH"), - DMI_MATCH(DMI_BOARD_NAME, "NL5xRU"), - }, - }, - { - .callback = video_detect_force_native, - .ident = "Clevo NL5xRU", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Notebook"), DMI_MATCH(DMI_BOARD_NAME, "NL5xRU"), }, }, @@ -464,23 +447,6 @@ static const struct dmi_system_id video_detect_dmi_table[] = { .callback = video_detect_force_native, .ident = "Clevo NL5xNU", .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "TUXEDO"), - DMI_MATCH(DMI_BOARD_NAME, "NL5xNU"), - }, - }, - { - .callback = video_detect_force_native, - .ident = "Clevo NL5xNU", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "SchenkerTechnologiesGmbH"), - DMI_MATCH(DMI_BOARD_NAME, "NL5xNU"), - }, - }, - { - .callback = video_detect_force_native, - .ident = "Clevo NL5xNU", - .matches = { - DMI_MATCH(DMI_SYS_VENDOR, "Notebook"), DMI_MATCH(DMI_BOARD_NAME, "NL5xNU"), }, }, From 397c2116cbe26a9f09e825ad394e802804bc182a Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Wed, 22 Jun 2022 10:09:06 -0700 Subject: [PATCH 08/31] ACPI: APEI: Better fix to avoid spamming the console with old error logs commit c3481b6b75b4797657838f44028fd28226ab48e0 upstream. The fix in commit 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT table data") does not work as intended on systems where the BIOS has a fixed size block of memory for the BERT table, relying on s/w to quit when it finds a record with estatus->block_status == 0. On these systems all errors are suppressed because the check: if (region_len < ACPI_BERT_PRINT_MAX_LEN) always fails. New scheme skips individual CPER records that are too large, and also limits the total number of records that will be printed to 5. Fixes: 3f8dec116210 ("ACPI/APEI: Limit printable size of BERT table data") Cc: All applicable Signed-off-by: Tony Luck Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/apei/bert.c | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/drivers/acpi/apei/bert.c b/drivers/acpi/apei/bert.c index 598fd19b65fa..45973aa6e06d 100644 --- a/drivers/acpi/apei/bert.c +++ b/drivers/acpi/apei/bert.c @@ -29,16 +29,26 @@ #undef pr_fmt #define pr_fmt(fmt) "BERT: " fmt + +#define ACPI_BERT_PRINT_MAX_RECORDS 5 #define ACPI_BERT_PRINT_MAX_LEN 1024 static int bert_disable; +/* + * Print "all" the error records in the BERT table, but avoid huge spam to + * the console if the BIOS included oversize records, or too many records. + * Skipping some records here does not lose anything because the full + * data is available to user tools in: + * /sys/firmware/acpi/tables/data/BERT + */ static void __init bert_print_all(struct acpi_bert_region *region, unsigned int region_len) { struct acpi_hest_generic_status *estatus = (struct acpi_hest_generic_status *)region; int remain = region_len; + int printed = 0, skipped = 0; u32 estatus_len; while (remain >= sizeof(struct acpi_bert_region)) { @@ -46,24 +56,26 @@ static void __init bert_print_all(struct acpi_bert_region *region, if (remain < estatus_len) { pr_err(FW_BUG "Truncated status block (length: %u).\n", estatus_len); - return; + break; } /* No more error records. */ if (!estatus->block_status) - return; + break; if (cper_estatus_check(estatus)) { pr_err(FW_BUG "Invalid error record.\n"); - return; + break; } - pr_info_once("Error records from previous boot:\n"); - if (region_len < ACPI_BERT_PRINT_MAX_LEN) + if (estatus_len < ACPI_BERT_PRINT_MAX_LEN && + printed < ACPI_BERT_PRINT_MAX_RECORDS) { + pr_info_once("Error records from previous boot:\n"); cper_estatus_print(KERN_INFO HW_ERR, estatus); - else - pr_info_once("Max print length exceeded, table data is available at:\n" - "/sys/firmware/acpi/tables/data/BERT"); + printed++; + } else { + skipped++; + } /* * Because the boot error source is "one-time polled" type, @@ -75,6 +87,9 @@ static void __init bert_print_all(struct acpi_bert_region *region, estatus = (void *)estatus + estatus_len; remain -= estatus_len; } + + if (skipped) + pr_info(HW_ERR "Skipped %d error records\n", skipped); } static int __init setup_bert_disable(char *str) From 3d4c28475ee352c440b83484b72b1320ff76364a Mon Sep 17 00:00:00 2001 From: GUO Zihua Date: Fri, 22 Jul 2022 14:31:57 +0800 Subject: [PATCH 09/31] crypto: arm64/poly1305 - fix a read out-of-bound commit 7ae19d422c7da84b5f13bc08b98bd737a08d3a53 upstream. A kasan error was reported during fuzzing: BUG: KASAN: slab-out-of-bounds in neon_poly1305_blocks.constprop.0+0x1b4/0x250 [poly1305_neon] Read of size 4 at addr ffff0010e293f010 by task syz-executor.5/1646715 CPU: 4 PID: 1646715 Comm: syz-executor.5 Kdump: loaded Not tainted 5.10.0.aarch64 #1 Hardware name: Huawei TaiShan 2280 /BC11SPCD, BIOS 1.59 01/31/2019 Call trace: dump_backtrace+0x0/0x394 show_stack+0x34/0x4c arch/arm64/kernel/stacktrace.c:196 __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x158/0x1e4 lib/dump_stack.c:118 print_address_description.constprop.0+0x68/0x204 mm/kasan/report.c:387 __kasan_report+0xe0/0x140 mm/kasan/report.c:547 kasan_report+0x44/0xe0 mm/kasan/report.c:564 check_memory_region_inline mm/kasan/generic.c:187 [inline] __asan_load4+0x94/0xd0 mm/kasan/generic.c:252 neon_poly1305_blocks.constprop.0+0x1b4/0x250 [poly1305_neon] neon_poly1305_do_update+0x6c/0x15c [poly1305_neon] neon_poly1305_update+0x9c/0x1c4 [poly1305_neon] crypto_shash_update crypto/shash.c:131 [inline] shash_finup_unaligned+0x84/0x15c crypto/shash.c:179 crypto_shash_finup+0x8c/0x140 crypto/shash.c:193 shash_digest_unaligned+0xb8/0xe4 crypto/shash.c:201 crypto_shash_digest+0xa4/0xfc crypto/shash.c:217 crypto_shash_tfm_digest+0xb4/0x150 crypto/shash.c:229 essiv_skcipher_setkey+0x164/0x200 [essiv] crypto_skcipher_setkey+0xb0/0x160 crypto/skcipher.c:612 skcipher_setkey+0x3c/0x50 crypto/algif_skcipher.c:305 alg_setkey+0x114/0x2a0 crypto/af_alg.c:220 alg_setsockopt+0x19c/0x210 crypto/af_alg.c:253 __sys_setsockopt+0x190/0x2e0 net/socket.c:2123 __do_sys_setsockopt net/socket.c:2134 [inline] __se_sys_setsockopt net/socket.c:2131 [inline] __arm64_sys_setsockopt+0x78/0x94 net/socket.c:2131 __invoke_syscall arch/arm64/kernel/syscall.c:36 [inline] invoke_syscall+0x64/0x100 arch/arm64/kernel/syscall.c:48 el0_svc_common.constprop.0+0x220/0x230 arch/arm64/kernel/syscall.c:155 do_el0_svc+0xb4/0xd4 arch/arm64/kernel/syscall.c:217 el0_svc+0x24/0x3c arch/arm64/kernel/entry-common.c:353 el0_sync_handler+0x160/0x164 arch/arm64/kernel/entry-common.c:369 el0_sync+0x160/0x180 arch/arm64/kernel/entry.S:683 This error can be reproduced by the following code compiled as ko on a system with kasan enabled: #include #include #include #include char test_data[] = "\x00\x01\x02\x03\x04\x05\x06\x07" "\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" "\x10\x11\x12\x13\x14\x15\x16\x17" "\x18\x19\x1a\x1b\x1c\x1d\x1e"; int init(void) { struct crypto_shash *tfm = NULL; char *data = NULL, *out = NULL; tfm = crypto_alloc_shash("poly1305", 0, 0); data = kmalloc(POLY1305_KEY_SIZE - 1, GFP_KERNEL); out = kmalloc(POLY1305_DIGEST_SIZE, GFP_KERNEL); memcpy(data, test_data, POLY1305_KEY_SIZE - 1); crypto_shash_tfm_digest(tfm, data, POLY1305_KEY_SIZE - 1, out); kfree(data); kfree(out); return 0; } void deinit(void) { } module_init(init) module_exit(deinit) MODULE_LICENSE("GPL"); The root cause of the bug sits in neon_poly1305_blocks. The logic neon_poly1305_blocks() performed is that if it was called with both s[] and r[] uninitialized, it will first try to initialize them with the data from the first "block" that it believed to be 32 bytes in length. First 16 bytes are used as the key and the next 16 bytes for s[]. This would lead to the aforementioned read out-of-bound. However, after calling poly1305_init_arch(), only 16 bytes were deducted from the input and s[] is initialized yet again with the following 16 bytes. The second initialization of s[] is certainly redundent which indicates that the first initialization should be for r[] only. This patch fixes the issue by calling poly1305_init_arm64() instead of poly1305_init_arch(). This is also the implementation for the same algorithm on arm platform. Fixes: f569ca164751 ("crypto: arm64/poly1305 - incorporate OpenSSL/CRYPTOGAMS NEON implementation") Cc: stable@vger.kernel.org Signed-off-by: GUO Zihua Reviewed-by: Eric Biggers Acked-by: Will Deacon Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/arm64/crypto/poly1305-glue.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/crypto/poly1305-glue.c b/arch/arm64/crypto/poly1305-glue.c index 9c3d86e397bf..1fae18ba11ed 100644 --- a/arch/arm64/crypto/poly1305-glue.c +++ b/arch/arm64/crypto/poly1305-glue.c @@ -52,7 +52,7 @@ static void neon_poly1305_blocks(struct poly1305_desc_ctx *dctx, const u8 *src, { if (unlikely(!dctx->sset)) { if (!dctx->rset) { - poly1305_init_arch(dctx, src); + poly1305_init_arm64(&dctx->h, src); src += POLY1305_BLOCK_SIZE; len -= POLY1305_BLOCK_SIZE; dctx->rset = 1; From 92343314d34e04da0923cefd3be67521d706fa35 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jun 2022 10:09:03 -0400 Subject: [PATCH 10/31] KVM: x86: do not report a vCPU as preempted outside instruction boundaries [ Upstream commit 6cd88243c7e03845a450795e134b488fc2afb736 ] If a vCPU is outside guest mode and is scheduled out, it might be in the process of making a memory access. A problem occurs if another vCPU uses the PV TLB flush feature during the period when the vCPU is scheduled out, and a virtual address has already been translated but has not yet been accessed, because this is equivalent to using a stale TLB entry. To avoid this, only report a vCPU as preempted if sure that the guest is at an instruction boundary. A rescheduling request will be delivered to the host physical CPU as an external interrupt, so for simplicity consider any vmexit *not* instruction boundary except for external interrupts. It would in principle be okay to report the vCPU as preempted also if it is sleeping in kvm_vcpu_block(): a TLB flush IPI will incur the vmentry/vmexit overhead unnecessarily, and optimistic spinning is also unlikely to succeed. However, leave it for later because right now kvm_vcpu_check_block() is doing memory accesses. Even though the TLB flush issue only applies to virtual memory address, it's very much preferrable to be conservative. Reported-by: Jann Horn Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- arch/x86/include/asm/kvm_host.h | 3 +++ arch/x86/kvm/svm/svm.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 1 + arch/x86/kvm/x86.c | 22 ++++++++++++++++++++++ 4 files changed, 28 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 49d814b2a341..a35f5e23fc2a 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -642,6 +642,7 @@ struct kvm_vcpu_arch { u64 ia32_misc_enable_msr; u64 smbase; u64 smi_count; + bool at_instruction_boundary; bool tpr_access_reporting; bool xsaves_enabled; u64 ia32_xss; @@ -1271,6 +1272,8 @@ struct kvm_vcpu_stat { u64 nested_run; u64 directed_yield_attempted; u64 directed_yield_successful; + u64 preemption_reported; + u64 preemption_other; u64 guest_mode; }; diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 26f2da1590ed..5b51156712f7 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -4263,6 +4263,8 @@ out: static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) { + if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) + vcpu->arch.at_instruction_boundary = true; } static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index a236104fc743..359292767e17 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -6471,6 +6471,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) return; handle_interrupt_nmi_irqoff(vcpu, gate_offset(desc)); + vcpu->arch.at_instruction_boundary = true; } static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index bd410926fda5..b2436796e03c 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -277,6 +277,8 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, nested_run), STATS_DESC_COUNTER(VCPU, directed_yield_attempted), STATS_DESC_COUNTER(VCPU, directed_yield_successful), + STATS_DESC_COUNTER(VCPU, preemption_reported), + STATS_DESC_COUNTER(VCPU, preemption_other), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; @@ -4371,6 +4373,19 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) struct kvm_memslots *slots; static const u8 preempted = KVM_VCPU_PREEMPTED; + /* + * The vCPU can be marked preempted if and only if the VM-Exit was on + * an instruction boundary and will not trigger guest emulation of any + * kind (see vcpu_run). Vendor specific code controls (conservatively) + * when this is true, for example allowing the vCPU to be marked + * preempted if and only if the VM-Exit was due to a host interrupt. + */ + if (!vcpu->arch.at_instruction_boundary) { + vcpu->stat.preemption_other++; + return; + } + + vcpu->stat.preemption_reported++; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -9934,6 +9949,13 @@ static int vcpu_run(struct kvm_vcpu *vcpu) vcpu->arch.l1tf_flush_l1d = true; for (;;) { + /* + * If another guest vCPU requests a PV TLB flush in the middle + * of instruction emulation, the rest of the emulation could + * use a stale page translation. Assume that any code after + * this point can start executing an instruction. + */ + vcpu->arch.at_instruction_boundary = false; if (kvm_vcpu_running(vcpu)) { r = vcpu_enter_guest(vcpu); } else { From ad6fd99d5febfd63465ef327240c78b6d803d48d Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Tue, 7 Jun 2022 10:07:11 -0400 Subject: [PATCH 11/31] KVM: x86: do not set st->preempted when going back to user space [ Upstream commit 54aa83c90198e68eee8b0850c749bc70efb548da ] Similar to the Xen path, only change the vCPU's reported state if the vCPU was actually preempted. The reason for KVM's behavior is that for example optimistic spinning might not be a good idea if the guest is doing repeated exits to userspace; however, it is confusing and unlikely to make a difference, because well-tuned guests will hardly ever exit KVM_RUN in the first place. Suggested-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- arch/x86/kvm/x86.c | 26 ++++++++++++++------------ arch/x86/kvm/xen.h | 6 ++++-- 2 files changed, 18 insertions(+), 14 deletions(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index b2436796e03c..8a6ee5d8adc7 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -4415,19 +4415,21 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) { int idx; - if (vcpu->preempted && !vcpu->arch.guest_state_protected) - vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); + if (vcpu->preempted) { + if (!vcpu->arch.guest_state_protected) + vcpu->arch.preempted_in_kernel = !static_call(kvm_x86_get_cpl)(vcpu); - /* - * Take the srcu lock as memslots will be accessed to check the gfn - * cache generation against the memslots generation. - */ - idx = srcu_read_lock(&vcpu->kvm->srcu); - if (kvm_xen_msr_enabled(vcpu->kvm)) - kvm_xen_runstate_set_preempted(vcpu); - else - kvm_steal_time_set_preempted(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + /* + * Take the srcu lock as memslots will be accessed to check the gfn + * cache generation against the memslots generation. + */ + idx = srcu_read_lock(&vcpu->kvm->srcu); + if (kvm_xen_msr_enabled(vcpu->kvm)) + kvm_xen_runstate_set_preempted(vcpu); + else + kvm_steal_time_set_preempted(vcpu); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index cc0cf5f37450..a7693a286e40 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -97,8 +97,10 @@ static inline void kvm_xen_runstate_set_preempted(struct kvm_vcpu *vcpu) * behalf of the vCPU. Only if the VMM does actually block * does it need to enter RUNSTATE_blocked. */ - if (vcpu->preempted) - kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable); + if (WARN_ON_ONCE(!vcpu->preempted)) + return; + + kvm_xen_update_runstate_guest(vcpu, RUNSTATE_runnable); } /* 32-bit compatibility definitions, also used natively in 32-bit build */ From 9acd899d2febb4dd88565514d263fcdf0514ec26 Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 1 Jun 2022 16:43:22 +0200 Subject: [PATCH 12/31] KVM: selftests: Make hyperv_clock selftest more stable [ Upstream commit eae260be3a0111a28fe95923e117a55dddec0384 ] hyperv_clock doesn't always give a stable test result, especially with AMD CPUs. The test compares Hyper-V MSR clocksource (acquired either with rdmsr() from within the guest or KVM_GET_MSRS from the host) against rdtsc(). To increase the accuracy, increase the measured delay (done with nop loop) by two orders of magnitude and take the mean rdtsc() value before and after rdmsr()/KVM_GET_MSRS. Reported-by: Maxim Levitsky Signed-off-by: Vitaly Kuznetsov Reviewed-by: Maxim Levitsky Tested-by: Maxim Levitsky Message-Id: <20220601144322.1968742-1-vkuznets@redhat.com> Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- tools/testing/selftests/kvm/x86_64/hyperv_clock.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c index e0b2bb1339b1..3330fb183c68 100644 --- a/tools/testing/selftests/kvm/x86_64/hyperv_clock.c +++ b/tools/testing/selftests/kvm/x86_64/hyperv_clock.c @@ -44,7 +44,7 @@ static inline void nop_loop(void) { int i; - for (i = 0; i < 1000000; i++) + for (i = 0; i < 100000000; i++) asm volatile("nop"); } @@ -56,12 +56,14 @@ static inline void check_tsc_msr_rdtsc(void) tsc_freq = rdmsr(HV_X64_MSR_TSC_FREQUENCY); GUEST_ASSERT(tsc_freq > 0); - /* First, check MSR-based clocksource */ + /* For increased accuracy, take mean rdtsc() before and afrer rdmsr() */ r1 = rdtsc(); t1 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + r1 = (r1 + rdtsc()) / 2; nop_loop(); r2 = rdtsc(); t2 = rdmsr(HV_X64_MSR_TIME_REF_COUNT); + r2 = (r2 + rdtsc()) / 2; GUEST_ASSERT(r2 > r1 && t2 > t1); @@ -181,12 +183,14 @@ static void host_check_tsc_msr_rdtsc(struct kvm_vm *vm) tsc_freq = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TSC_FREQUENCY); TEST_ASSERT(tsc_freq > 0, "TSC frequency must be nonzero"); - /* First, check MSR-based clocksource */ + /* For increased accuracy, take mean rdtsc() before and afrer ioctl */ r1 = rdtsc(); t1 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + r1 = (r1 + rdtsc()) / 2; nop_loop(); r2 = rdtsc(); t2 = vcpu_get_msr(vm, VCPU_ID, HV_X64_MSR_TIME_REF_COUNT); + r2 = (r2 + rdtsc()) / 2; TEST_ASSERT(t2 > t1, "Time reference MSR is not monotonic (%ld <= %ld)", t1, t2); From bc2cee443c7401a15227bb7f9e7e0a967563cfd2 Mon Sep 17 00:00:00 2001 From: Dmitry Klochkov Date: Tue, 14 Jun 2022 15:11:41 +0300 Subject: [PATCH 13/31] tools/kvm_stat: fix display of error when multiple processes are found [ Upstream commit 933b5f9f98da29af646b51b36a0753692908ef64 ] Instead of printing an error message, kvm_stat script fails when we restrict statistics to a guest by its name and there are multiple guests with such name: # kvm_stat -g my_vm Traceback (most recent call last): File "/usr/bin/kvm_stat", line 1819, in main() File "/usr/bin/kvm_stat", line 1779, in main options = get_options() File "/usr/bin/kvm_stat", line 1718, in get_options options = argparser.parse_args() File "/usr/lib64/python3.10/argparse.py", line 1825, in parse_args args, argv = self.parse_known_args(args, namespace) File "/usr/lib64/python3.10/argparse.py", line 1858, in parse_known_args namespace, args = self._parse_known_args(args, namespace) File "/usr/lib64/python3.10/argparse.py", line 2067, in _parse_known_args start_index = consume_optional(start_index) File "/usr/lib64/python3.10/argparse.py", line 2007, in consume_optional take_action(action, args, option_string) File "/usr/lib64/python3.10/argparse.py", line 1935, in take_action action(self, namespace, argument_values, option_string) File "/usr/bin/kvm_stat", line 1649, in __call__ ' to specify the desired pid'.format(" ".join(pids))) TypeError: sequence item 0: expected str instance, int found To avoid this, it's needed to convert pids int values to strings before pass them to join(). Signed-off-by: Dmitry Klochkov Message-Id: <20220614121141.160689-1-kdmitry556@gmail.com> Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- tools/kvm/kvm_stat/kvm_stat | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/kvm/kvm_stat/kvm_stat b/tools/kvm/kvm_stat/kvm_stat index 5a5bd74f55bd..9c366b3a676d 100755 --- a/tools/kvm/kvm_stat/kvm_stat +++ b/tools/kvm/kvm_stat/kvm_stat @@ -1646,7 +1646,8 @@ Press any other key to refresh statistics immediately. .format(values)) if len(pids) > 1: sys.exit('Error: Multiple processes found (pids: {}). Use "-p"' - ' to specify the desired pid'.format(" ".join(pids))) + ' to specify the desired pid' + .format(" ".join(map(str, pids)))) namespace.pid = pids[0] argparser = argparse.ArgumentParser(description=description_text, From e423893fe3209604236874e412dd3a9386578cff Mon Sep 17 00:00:00 2001 From: Raghavendra Rao Ananta Date: Wed, 15 Jun 2022 18:57:06 +0000 Subject: [PATCH 14/31] selftests: KVM: Handle compiler optimizations in ucall [ Upstream commit 9e2f6498efbbc880d7caa7935839e682b64fe5a6 ] The selftests, when built with newer versions of clang, is found to have over optimized guests' ucall() function, and eliminating the stores for uc.cmd (perhaps due to no immediate readers). This resulted in the userspace side always reading a value of '0', and causing multiple test failures. As a result, prevent the compiler from optimizing the stores in ucall() with WRITE_ONCE(). Suggested-by: Ricardo Koller Suggested-by: Reiji Watanabe Signed-off-by: Raghavendra Rao Ananta Message-Id: <20220615185706.1099208-1-rananta@google.com> Reviewed-by: Andrew Jones Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- tools/testing/selftests/kvm/lib/aarch64/ucall.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tools/testing/selftests/kvm/lib/aarch64/ucall.c b/tools/testing/selftests/kvm/lib/aarch64/ucall.c index e0b0164e9af8..be1d9728c4ce 100644 --- a/tools/testing/selftests/kvm/lib/aarch64/ucall.c +++ b/tools/testing/selftests/kvm/lib/aarch64/ucall.c @@ -73,20 +73,19 @@ void ucall_uninit(struct kvm_vm *vm) void ucall(uint64_t cmd, int nargs, ...) { - struct ucall uc = { - .cmd = cmd, - }; + struct ucall uc = {}; va_list va; int i; + WRITE_ONCE(uc.cmd, cmd); nargs = nargs <= UCALL_MAX_ARGS ? nargs : UCALL_MAX_ARGS; va_start(va, nargs); for (i = 0; i < nargs; ++i) - uc.args[i] = va_arg(va, uint64_t); + WRITE_ONCE(uc.args[i], va_arg(va, uint64_t)); va_end(va); - *ucall_exit_mmio_addr = (vm_vaddr_t)&uc; + WRITE_ONCE(*ucall_exit_mmio_addr, (vm_vaddr_t)&uc); } uint64_t get_ucall(struct kvm_vm *vm, uint32_t vcpu_id, struct ucall *uc) From a619a03120991b679cd6a41355f6c9199f63fd8b Mon Sep 17 00:00:00 2001 From: Mingwei Zhang Date: Thu, 23 Jun 2022 17:18:58 +0000 Subject: [PATCH 15/31] KVM: x86/svm: add __GFP_ACCOUNT to __sev_dbg_{en,de}crypt_user() [ Upstream commit ebdec859faa8cfbfef9f6c1f83d79dd6c8f4ab8c ] Adding the accounting flag when allocating pages within the SEV function, since these memory pages should belong to individual VM. No functional change intended. Signed-off-by: Mingwei Zhang Message-Id: <20220623171858.2083637-1-mizhang@google.com> Signed-off-by: Paolo Bonzini Signed-off-by: Sasha Levin --- arch/x86/kvm/svm/sev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 4a4dc105552e..86f3096f042f 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -832,7 +832,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, /* If source buffer is not aligned then use an intermediate buffer */ if (!IS_ALIGNED((unsigned long)vaddr, 16)) { - src_tpage = alloc_page(GFP_KERNEL); + src_tpage = alloc_page(GFP_KERNEL_ACCOUNT); if (!src_tpage) return -ENOMEM; @@ -853,7 +853,7 @@ static int __sev_dbg_encrypt_user(struct kvm *kvm, unsigned long paddr, if (!IS_ALIGNED((unsigned long)dst_vaddr, 16) || !IS_ALIGNED(size, 16)) { int dst_offset; - dst_tpage = alloc_page(GFP_KERNEL); + dst_tpage = alloc_page(GFP_KERNEL_ACCOUNT); if (!dst_tpage) { ret = -ENOMEM; goto e_free; From 775871d4be0d75e219cca937af843a4a1b60489a Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Mon, 8 Aug 2022 13:53:21 +0100 Subject: [PATCH 16/31] arm64: set UXN on swapper page tables [ This issue was fixed upstream by accident in c3cee924bd85 ("arm64: head: cover entire kernel image in initial ID map") as part of a large refactoring of the arm64 boot flow. This simple fix is therefore preferred for -stable backporting ] On a system that implements FEAT_EPAN, read/write access to the idmap is denied because UXN is not set on the swapper PTEs. As a result, idmap_kpti_install_ng_mappings panics the kernel when accessing __idmap_kpti_flag. Fix it by setting UXN on these PTEs. Fixes: 18107f8a2df6 ("arm64: Support execute-only permissions with Enhanced PAN") Cc: # 5.15 Link: https://linux-review.googlesource.com/id/Ic452fa4b4f74753e54f71e61027e7222a0fae1b1 Signed-off-by: Peter Collingbourne Acked-by: Will Deacon Cc: Ard Biesheuvel Cc: Catalin Marinas Link: https://lore.kernel.org/r/20220719234909.1398992-1-pcc@google.com Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/kernel-pgtable.h | 4 ++-- arch/arm64/kernel/head.S | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 96dc0f7da258..a971d462f531 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -103,8 +103,8 @@ /* * Initial memory map attributes. */ -#define SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) +#define SWAPPER_PTE_FLAGS (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED | PTE_UXN) +#define SWAPPER_PMD_FLAGS (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S | PMD_SECT_UXN) #if ARM64_KERNEL_USES_PMD_MAPS #define SWAPPER_MM_MMUFLAGS (PMD_ATTRINDX(MT_NORMAL) | SWAPPER_PMD_FLAGS) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 17962452e31d..ab6566bf1c33 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -285,7 +285,7 @@ SYM_FUNC_START_LOCAL(__create_page_tables) subs x1, x1, #64 b.ne 1b - mov x7, SWAPPER_MM_MMUFLAGS + mov_q x7, SWAPPER_MM_MMUFLAGS /* * Create the identity mapping. From 5e04c8bf42d810986b609b989955347c2913b1be Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 7 Jun 2022 16:08:29 +0900 Subject: [PATCH 17/31] btrfs: zoned: prevent allocation from previous data relocation BG commit 343d8a30851c48a4ef0f5ef61d5e9fbd847a6883 upstream. After commit 5f0addf7b890 ("btrfs: zoned: use dedicated lock for data relocation"), we observe IO errors on e.g, btrfs/232 like below. [09.0][T4038707] WARNING: CPU: 3 PID: 4038707 at fs/btrfs/extent-tree.c:2381 btrfs_cross_ref_exist+0xfc/0x120 [btrfs] [09.9][T4038707] Call Trace: [09.5][T4038707] [09.3][T4038707] run_delalloc_nocow+0x7f1/0x11a0 [btrfs] [09.6][T4038707] ? test_range_bit+0x174/0x320 [btrfs] [09.2][T4038707] ? fallback_to_cow+0x980/0x980 [btrfs] [09.3][T4038707] ? find_lock_delalloc_range+0x33e/0x3e0 [btrfs] [09.5][T4038707] btrfs_run_delalloc_range+0x445/0x1320 [btrfs] [09.2][T4038707] ? test_range_bit+0x320/0x320 [btrfs] [09.4][T4038707] ? lock_downgrade+0x6a0/0x6a0 [09.2][T4038707] ? orc_find.part.0+0x1ed/0x300 [09.5][T4038707] ? __module_address.part.0+0x25/0x300 [09.0][T4038707] writepage_delalloc+0x159/0x310 [btrfs] [09.4][ C3] sd 10:0:1:0: [sde] tag#2620 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s [09.5][ C3] sd 10:0:1:0: [sde] tag#2620 Sense Key : Illegal Request [current] [09.9][ C3] sd 10:0:1:0: [sde] tag#2620 Add. Sense: Unaligned write command [09.5][ C3] sd 10:0:1:0: [sde] tag#2620 CDB: Write(16) 8a 00 00 00 00 00 02 f3 63 87 00 00 00 2c 00 00 [09.4][ C3] critical target error, dev sde, sector 396041272 op 0x1:(WRITE) flags 0x800 phys_seg 3 prio class 0 [09.9][ C3] BTRFS error (device dm-1): bdev /dev/mapper/dml_102_2 errs: wr 1, rd 0, flush 0, corrupt 0, gen 0 The IO errors occur when we allocate a regular extent in previous data relocation block group. On zoned btrfs, we use a dedicated block group to relocate a data extent. Thus, we allocate relocating data extents (pre-alloc) only from the dedicated block group and vice versa. Once the free space in the dedicated block group gets tight, a relocating extent may not fit into the block group. In that case, we need to switch the dedicated block group to the next one. Then, the previous one is now freed up for allocating a regular extent. The BG is already not enough to allocate the relocating extent, but there is still room to allocate a smaller extent. Now the problem happens. By allocating a regular extent while nocow IOs for the relocation is still on-going, we will issue WRITE IOs (for relocation) and ZONE APPEND IOs (for the regular writes) at the same time. That mixed IOs confuses the write pointer and arises the unaligned write errors. This commit introduces a new bit 'zoned_data_reloc_ongoing' to the btrfs_block_group. We set this bit before releasing the dedicated block group, and no extent are allocated from a block group having this bit set. This bit is similar to setting block_group->ro, but is different from it by allowing nocow writes to start. Once all the nocow IO for relocation is done (hooked from btrfs_finish_ordered_io), we reset the bit to release the block group for further allocation. Fixes: c2707a255623 ("btrfs: zoned: add a dedicated data relocation block group") CC: stable@vger.kernel.org # 5.16+ Signed-off-by: Naohiro Aota Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/block-group.h | 1 + fs/btrfs/extent-tree.c | 20 ++++++++++++++++++-- fs/btrfs/inode.c | 2 ++ fs/btrfs/zoned.c | 27 +++++++++++++++++++++++++++ fs/btrfs/zoned.h | 5 +++++ 5 files changed, 53 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 37e55ebde735..d73db0dfacb2 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -98,6 +98,7 @@ struct btrfs_block_group { unsigned int to_copy:1; unsigned int relocating_repair:1; unsigned int chunk_item_inserted:1; + unsigned int zoned_data_reloc_ongoing:1; int disk_cache_state; diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e3514f9a4e8d..248ea15c9734 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3804,7 +3804,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, block_group->start == fs_info->data_reloc_bg || fs_info->data_reloc_bg == 0); - if (block_group->ro) { + if (block_group->ro || block_group->zoned_data_reloc_ongoing) { ret = 1; goto out; } @@ -3865,8 +3865,24 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group, out: if (ret && ffe_ctl->for_treelog) fs_info->treelog_bg = 0; - if (ret && ffe_ctl->for_data_reloc) + if (ret && ffe_ctl->for_data_reloc && + fs_info->data_reloc_bg == block_group->start) { + /* + * Do not allow further allocations from this block group. + * Compared to increasing the ->ro, setting the + * ->zoned_data_reloc_ongoing flag still allows nocow + * writers to come in. See btrfs_inc_nocow_writers(). + * + * We need to disable an allocation to avoid an allocation of + * regular (non-relocation data) extent. With mix of relocation + * extents and regular extents, we can dispatch WRITE commands + * (for relocation extents) and ZONE APPEND commands (for + * regular extents) at the same time to the same zone, which + * easily break the write pointer. + */ + block_group->zoned_data_reloc_ongoing = 1; fs_info->data_reloc_bg = 0; + } spin_unlock(&fs_info->relocation_bg_lock); spin_unlock(&fs_info->treelog_bg_lock); spin_unlock(&block_group->lock); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ea7262050790..1b4fee8a2f28 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3069,6 +3069,8 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) ordered_extent->file_offset, ordered_extent->file_offset + logical_len); + btrfs_zoned_release_data_reloc_bg(fs_info, ordered_extent->disk_bytenr, + ordered_extent->disk_num_bytes); } else { BUG_ON(root == fs_info->tree_root); ret = insert_ordered_extent_file_extent(trans, ordered_extent); diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 574769f921a2..fc791f7c7142 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1623,3 +1623,30 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) } mutex_unlock(&fs_devices->device_list_mutex); } + +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length) +{ + struct btrfs_block_group *block_group; + + if (!btrfs_is_zoned(fs_info)) + return; + + block_group = btrfs_lookup_block_group(fs_info, logical); + /* It should be called on a previous data relocation block group. */ + ASSERT(block_group && (block_group->flags & BTRFS_BLOCK_GROUP_DATA)); + + spin_lock(&block_group->lock); + if (!block_group->zoned_data_reloc_ongoing) + goto out; + + /* All relocation extents are written. */ + if (block_group->start + block_group->alloc_offset == logical + length) { + /* Now, release this block group for further allocations. */ + block_group->zoned_data_reloc_ongoing = 0; + } + +out: + spin_unlock(&block_group->lock); + btrfs_put_block_group(block_group); +} diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 3a826f7c2040..574490ea2cc8 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -70,6 +70,8 @@ struct btrfs_device *btrfs_zoned_get_device(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg); void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info); +void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical, + u64 length); #else /* CONFIG_BLK_DEV_ZONED */ static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos, struct blk_zone *zone) @@ -207,6 +209,9 @@ static inline struct btrfs_device *btrfs_zoned_get_device( static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { } static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { } + +static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, + u64 logical, u64 length) { } #endif static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos) From 37b385c78cd57edcc9c2350a37763012c28982cd Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Tue, 7 Jun 2022 16:08:30 +0900 Subject: [PATCH 18/31] btrfs: zoned: fix critical section of relocation inode writeback commit 19ab78ca86981e0e1e73036fb73a508731a7c078 upstream. We use btrfs_zoned_data_reloc_{lock,unlock} to allow only one process to write out to the relocation inode. That critical section must include all the IO submission for the inode. However, flush_write_bio() in extent_writepages() is out of the critical section, causing an IO submission outside of the lock. This leads to an out of the order IO submission and fail the relocation process. Fix it by extending the critical section. Fixes: 35156d852762 ("btrfs: zoned: only allow one process to add pages to a relocation inode") CC: stable@vger.kernel.org # 5.16+ Reviewed-by: Johannes Thumshirn Signed-off-by: Naohiro Aota Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/extent_io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b791e280af0c..a90546b3107c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5152,13 +5152,14 @@ int extent_writepages(struct address_space *mapping, */ btrfs_zoned_data_reloc_lock(BTRFS_I(inode)); ret = extent_write_cache_pages(mapping, wbc, &epd); - btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); ASSERT(ret <= 0); if (ret < 0) { + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); end_write_bio(&epd, ret); return ret; } ret = flush_write_bio(&epd); + btrfs_zoned_data_reloc_unlock(BTRFS_I(inode)); return ret; } From 2aa38f0af306230ae4fb15b73b4c18f9209758cb Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 24 May 2022 07:56:41 +0200 Subject: [PATCH 19/31] Bluetooth: hci_bcm: Add BCM4349B1 variant commit 4f17c2b6694d0c4098f33b07ee3a696976940aa5 upstream. The BCM4349B1, aka CYW/BCM89359, is a WiFi+BT chip and its Bluetooth portion can be controlled over serial. Two subversions are added for the chip, because ROM firmware reports 002.002.013 (at least for the chips I have here), while depending on patchram firmware revision, either 002.002.013 or 002.002.014 is reported. Signed-off-by: Ahmad Fatoum Reviewed-by: Linus Walleij Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btbcm.c | 2 ++ drivers/bluetooth/hci_bcm.c | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/bluetooth/btbcm.c b/drivers/bluetooth/btbcm.c index d9ceca7a7935..a18f289d7346 100644 --- a/drivers/bluetooth/btbcm.c +++ b/drivers/bluetooth/btbcm.c @@ -453,6 +453,8 @@ static const struct bcm_subver_table bcm_uart_subver_table[] = { { 0x6606, "BCM4345C5" }, /* 003.006.006 */ { 0x230f, "BCM4356A2" }, /* 001.003.015 */ { 0x220e, "BCM20702A1" }, /* 001.002.014 */ + { 0x420d, "BCM4349B1" }, /* 002.002.013 */ + { 0x420e, "BCM4349B1" }, /* 002.002.014 */ { 0x4217, "BCM4329B1" }, /* 002.002.023 */ { 0x6106, "BCM4359C0" }, /* 003.001.006 */ { 0x4106, "BCM4335A0" }, /* 002.001.006 */ diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index 7abf99f0ee39..b47a31bd2e9b 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -1515,6 +1515,7 @@ static const struct of_device_id bcm_bluetooth_of_match[] = { { .compatible = "brcm,bcm4345c5" }, { .compatible = "brcm,bcm4330-bt" }, { .compatible = "brcm,bcm43438-bt", .data = &bcm43438_device_data }, + { .compatible = "brcm,bcm4349-bt", .data = &bcm43438_device_data }, { .compatible = "brcm,bcm43540-bt", .data = &bcm4354_device_data }, { .compatible = "brcm,bcm4335a0" }, { }, From 88e088e29487913c302b7480b866266f8971427f Mon Sep 17 00:00:00 2001 From: Hakan Jansson Date: Thu, 30 Jun 2022 14:45:22 +0200 Subject: [PATCH 20/31] Bluetooth: hci_bcm: Add DT compatible for CYW55572 commit f8cad62002a7699fd05a23b558b980b5a77defe0 upstream. CYW55572 is a Wi-Fi + Bluetooth combo device from Infineon. Signed-off-by: Hakan Jansson Reviewed-by: Linus Walleij Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/hci_bcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/hci_bcm.c b/drivers/bluetooth/hci_bcm.c index b47a31bd2e9b..cf622e459605 100644 --- a/drivers/bluetooth/hci_bcm.c +++ b/drivers/bluetooth/hci_bcm.c @@ -1518,6 +1518,7 @@ static const struct of_device_id bcm_bluetooth_of_match[] = { { .compatible = "brcm,bcm4349-bt", .data = &bcm43438_device_data }, { .compatible = "brcm,bcm43540-bt", .data = &bcm4354_device_data }, { .compatible = "brcm,bcm4335a0" }, + { .compatible = "infineon,cyw55572-bt" }, { }, }; MODULE_DEVICE_TABLE(of, bcm_bluetooth_of_match); From 6a5ec48fb7528aea20805534a867d0cc631994f8 Mon Sep 17 00:00:00 2001 From: Ahmad Fatoum Date: Tue, 24 May 2022 07:56:40 +0200 Subject: [PATCH 21/31] dt-bindings: bluetooth: broadcom: Add BCM4349B1 DT binding commit 88b65887aa1b76cd8649a97824fb9904c1d79254 upstream. The BCM4349B1, aka CYW/BCM89359, is a WiFi+BT chip and its Bluetooth portion can be controlled over serial. Extend the binding with its DT compatible. Acked-by: Krzysztof Kozlowski Reviewed-by: Linus Walleij Signed-off-by: Ahmad Fatoum Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml index fbdc2083bec4..20ee96584aba 100644 --- a/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml +++ b/Documentation/devicetree/bindings/net/broadcom-bluetooth.yaml @@ -23,6 +23,7 @@ properties: - brcm,bcm4345c5 - brcm,bcm43540-bt - brcm,bcm4335a0 + - brcm,bcm4349-bt shutdown-gpios: maxItems: 1 From 04e3388eeb4709a8f079f92a48e8e034cc7e5e50 Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Thu, 2 Jun 2022 17:28:22 +0800 Subject: [PATCH 22/31] Bluetooth: btusb: Add support of IMC Networks PID 0x3568 commit c69ecb0ea4c96b8b191cbaa0b420222a37867655 upstream. It is 13d3:3568 for MediaTek MT7922 USB Bluetooth chip. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=480 MxCh= 0 D: Ver= 2.10 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3568 Rev=01.00 S: Manufacturer=MediaTek Inc. S: Product=Wireless_Device S: SerialNumber=... C: #Ifs= 3 Cfg#= 1 Atr=e0 MxPwr=100mA I: If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=02(O) Atr=02(Bulk) MxPS= 512 Ivl=0ms E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=125us E: Ad=82(I) Atr=02(Bulk) MxPS= 512 Ivl=0ms I: If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 2 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=(none) E: Ad=0a(O) Atr=03(Int.) MxPS= 64 Ivl=125us E: Ad=8a(I) Atr=03(Int.) MxPS= 64 Ivl=125us Signed-off-by: Aaron Ma Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index a68edbc7be0f..26ff826b5f16 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -459,6 +459,9 @@ static const struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x0489, 0xe0d9), .driver_info = BTUSB_MEDIATEK | BTUSB_WIDEBAND_SPEECH | BTUSB_VALID_LE_STATES }, + { USB_DEVICE(0x13d3, 0x3568), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH | + BTUSB_VALID_LE_STATES }, /* Additional Realtek 8723AE Bluetooth devices */ { USB_DEVICE(0x0930, 0x021d), .driver_info = BTUSB_REALTEK }, From d4f921efb4bf6b75375ed459be785b0c6415daec Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:19 +0800 Subject: [PATCH 23/31] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x04CA:0x4007 commit c379c96cc221767af9688a5d4758a78eea30883a upstream. Add the support ID(0x04CA, 0x4007) to usb_device_id table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=04ca ProdID=4007 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 26ff826b5f16..1196d1c37d37 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -420,6 +420,10 @@ static const struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x04ca, 0x4006), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + /* Realtek 8852CE Bluetooth devices */ + { USB_DEVICE(0x04ca, 0x4007), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, + /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), .driver_info = BTUSB_REALTEK }, From b653eeaa8cf8d909a52410069a9ccdcb545bc9bb Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:20 +0800 Subject: [PATCH 24/31] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x04C5:0x1675 commit 893fa8bc9952a36fb682ee12f0a994b5817a36d2 upstream. Add the support ID(0x04c5, 0x1675) to usb_device_id table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=04c5 ProdID=1675 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 1196d1c37d37..250612c24dd1 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -423,6 +423,8 @@ static const struct usb_device_id blacklist_table[] = { /* Realtek 8852CE Bluetooth devices */ { USB_DEVICE(0x04ca, 0x4007), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x04c5, 0x1675), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), From 59689a843bc9bbc42046084998da7efe1fbc331f Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:21 +0800 Subject: [PATCH 25/31] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x0CB8:0xC558 commit 5b75ee37ebb73f58468d4cca172434324af203f1 upstream. Add the support ID(0x0CB8, 0xC558) to usb_device_id table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=0cb8 ProdID=c558 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 250612c24dd1..ac48f74a65a4 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -425,6 +425,8 @@ static const struct usb_device_id blacklist_table[] = { BTUSB_WIDEBAND_SPEECH }, { USB_DEVICE(0x04c5, 0x1675), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x0cb8, 0xc558), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), From ee421ad8973b5abb8f3948b4bfab1810b420717f Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:22 +0800 Subject: [PATCH 26/31] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x13D3:0x3587 commit 8f0054dd29373cd877db87751c143610561d549d upstream. Add the support ID(0x13D3, 0x3587) to usb_device_id table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3587 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index ac48f74a65a4..85233179b960 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -427,6 +427,8 @@ static const struct usb_device_id blacklist_table[] = { BTUSB_WIDEBAND_SPEECH }, { USB_DEVICE(0x0cb8, 0xc558), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x13d3, 0x3587), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), From d98cf2b40c2059eefa757fb1ab2607e5483caa05 Mon Sep 17 00:00:00 2001 From: Hilda Wu Date: Thu, 14 Jul 2022 19:25:23 +0800 Subject: [PATCH 27/31] Bluetooth: btusb: Add Realtek RTL8852C support ID 0x13D3:0x3586 commit 6ad353dfc8ee3230a5e123c21da50f1b64cc4b39 upstream. Add the support ID(0x13D3, 0x3586) to usb_device_id table for Realtek RTL8852C. The device info from /sys/kernel/debug/usb/devices as below. T: Bus=03 Lev=01 Prnt=01 Port=02 Cnt=01 Dev#= 2 Spd=12 MxCh= 0 D: Ver= 1.00 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3586 Rev= 0.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Hilda Wu Signed-off-by: Luiz Augusto von Dentz Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index 85233179b960..627436329b50 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -429,6 +429,8 @@ static const struct usb_device_id blacklist_table[] = { BTUSB_WIDEBAND_SPEECH }, { USB_DEVICE(0x13d3, 0x3587), .driver_info = BTUSB_REALTEK | BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x13d3, 0x3586), .driver_info = BTUSB_REALTEK | + BTUSB_WIDEBAND_SPEECH }, /* Realtek Bluetooth devices */ { USB_VENDOR_AND_INTERFACE_INFO(0x0bda, 0xe0, 0x01, 0x01), From c81d1bb58c88ab831e02539a26ad47860e8d6f0c Mon Sep 17 00:00:00 2001 From: Ning Qiang Date: Wed, 13 Jul 2022 23:37:34 +0800 Subject: [PATCH 28/31] macintosh/adb: fix oob read in do_adb_query() function commit fd97e4ad6d3b0c9fce3bca8ea8e6969d9ce7423b upstream. In do_adb_query() function of drivers/macintosh/adb.c, req->data is copied form userland. The parameter "req->data[2]" is missing check, the array size of adb_handler[] is 16, so adb_handler[req->data[2]].original_address and adb_handler[req->data[2]].handler_id will lead to oob read. Cc: stable Signed-off-by: Ning Qiang Reviewed-by: Kees Cook Reviewed-by: Greg Kroah-Hartman Acked-by: Benjamin Herrenschmidt Signed-off-by: Michael Ellerman Link: https://lore.kernel.org/r/20220713153734.2248-1-sohu0106@126.com Signed-off-by: Greg Kroah-Hartman --- drivers/macintosh/adb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/macintosh/adb.c b/drivers/macintosh/adb.c index 73b396189039..afb0942ccc29 100644 --- a/drivers/macintosh/adb.c +++ b/drivers/macintosh/adb.c @@ -647,7 +647,7 @@ do_adb_query(struct adb_request *req) switch(req->data[1]) { case ADB_QUERY_GETDEVINFO: - if (req->nbytes < 3) + if (req->nbytes < 3 || req->data[2] >= 16) break; mutex_lock(&adb_handler_mutex); req->reply[0] = adb_handler[req->data[2]].original_address; From 7fcd99e889c0634f8275ae7a6b06aec4a22c8715 Mon Sep 17 00:00:00 2001 From: Daniel Sneddon Date: Tue, 2 Aug 2022 15:47:01 -0700 Subject: [PATCH 29/31] x86/speculation: Add RSB VM Exit protections commit 2b1299322016731d56807aa49254a5ea3080b6b3 upstream. tl;dr: The Enhanced IBRS mitigation for Spectre v2 does not work as documented for RET instructions after VM exits. Mitigate it with a new one-entry RSB stuffing mechanism and a new LFENCE. == Background == Indirect Branch Restricted Speculation (IBRS) was designed to help mitigate Branch Target Injection and Speculative Store Bypass, i.e. Spectre, attacks. IBRS prevents software run in less privileged modes from affecting branch prediction in more privileged modes. IBRS requires the MSR to be written on every privilege level change. To overcome some of the performance issues of IBRS, Enhanced IBRS was introduced. eIBRS is an "always on" IBRS, in other words, just turn it on once instead of writing the MSR on every privilege level change. When eIBRS is enabled, more privileged modes should be protected from less privileged modes, including protecting VMMs from guests. == Problem == Here's a simplification of how guests are run on Linux' KVM: void run_kvm_guest(void) { // Prepare to run guest VMRESUME(); // Clean up after guest runs } The execution flow for that would look something like this to the processor: 1. Host-side: call run_kvm_guest() 2. Host-side: VMRESUME 3. Guest runs, does "CALL guest_function" 4. VM exit, host runs again 5. Host might make some "cleanup" function calls 6. Host-side: RET from run_kvm_guest() Now, when back on the host, there are a couple of possible scenarios of post-guest activity the host needs to do before executing host code: * on pre-eIBRS hardware (legacy IBRS, or nothing at all), the RSB is not touched and Linux has to do a 32-entry stuffing. * on eIBRS hardware, VM exit with IBRS enabled, or restoring the host IBRS=1 shortly after VM exit, has a documented side effect of flushing the RSB except in this PBRSB situation where the software needs to stuff the last RSB entry "by hand". IOW, with eIBRS supported, host RET instructions should no longer be influenced by guest behavior after the host retires a single CALL instruction. However, if the RET instructions are "unbalanced" with CALLs after a VM exit as is the RET in #6, it might speculatively use the address for the instruction after the CALL in #3 as an RSB prediction. This is a problem since the (untrusted) guest controls this address. Balanced CALL/RET instruction pairs such as in step #5 are not affected. == Solution == The PBRSB issue affects a wide variety of Intel processors which support eIBRS. But not all of them need mitigation. Today, X86_FEATURE_RSB_VMEXIT triggers an RSB filling sequence that mitigates PBRSB. Systems setting RSB_VMEXIT need no further mitigation - i.e., eIBRS systems which enable legacy IBRS explicitly. However, such systems (X86_FEATURE_IBRS_ENHANCED) do not set RSB_VMEXIT and most of them need a new mitigation. Therefore, introduce a new feature flag X86_FEATURE_RSB_VMEXIT_LITE which triggers a lighter-weight PBRSB mitigation versus RSB_VMEXIT. The lighter-weight mitigation performs a CALL instruction which is immediately followed by a speculative execution barrier (INT3). This steers speculative execution to the barrier -- just like a retpoline -- which ensures that speculation can never reach an unbalanced RET. Then, ensure this CALL is retired before continuing execution with an LFENCE. In other words, the window of exposure is opened at VM exit where RET behavior is troublesome. While the window is open, force RSB predictions sampling for RET targets to a dead end at the INT3. Close the window with the LFENCE. There is a subset of eIBRS systems which are not vulnerable to PBRSB. Add these systems to the cpu_vuln_whitelist[] as NO_EIBRS_PBRSB. Future systems that aren't vulnerable will set ARCH_CAP_PBRSB_NO. [ bp: Massage, incorporate review comments from Andy Cooper. ] Signed-off-by: Daniel Sneddon Co-developed-by: Pawan Gupta Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Greg Kroah-Hartman --- Documentation/admin-guide/hw-vuln/spectre.rst | 8 ++ arch/x86/include/asm/cpufeatures.h | 2 + arch/x86/include/asm/msr-index.h | 4 + arch/x86/include/asm/nospec-branch.h | 17 +++- arch/x86/kernel/cpu/bugs.c | 86 ++++++++++++++----- arch/x86/kernel/cpu/common.c | 12 ++- arch/x86/kvm/vmx/vmenter.S | 8 +- tools/arch/x86/include/asm/cpufeatures.h | 1 + tools/arch/x86/include/asm/msr-index.h | 4 + 9 files changed, 113 insertions(+), 29 deletions(-) diff --git a/Documentation/admin-guide/hw-vuln/spectre.rst b/Documentation/admin-guide/hw-vuln/spectre.rst index 6bd97cd50d62..7e061ed449aa 100644 --- a/Documentation/admin-guide/hw-vuln/spectre.rst +++ b/Documentation/admin-guide/hw-vuln/spectre.rst @@ -422,6 +422,14 @@ The possible values in this file are: 'RSB filling' Protection of RSB on context switch enabled ============= =========================================== + - EIBRS Post-barrier Return Stack Buffer (PBRSB) protection status: + + =========================== ======================================================= + 'PBRSB-eIBRS: SW sequence' CPU is affected and protection of RSB on VMEXIT enabled + 'PBRSB-eIBRS: Vulnerable' CPU is vulnerable + 'PBRSB-eIBRS: Not affected' CPU is not affected by PBRSB + =========================== ======================================================= + Full mitigation might require a microcode update from the CPU vendor. When the necessary microcode is not available, the kernel will report vulnerability. diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index d370718e222b..be744fa10004 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -301,6 +301,7 @@ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ #define X86_FEATURE_USE_IBPB_FW (11*32+16) /* "" Use IBPB during runtime firmware calls */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM exit when EIBRS is enabled */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ @@ -446,5 +447,6 @@ #define X86_BUG_SRBDS X86_BUG(24) /* CPU may leak RNG bits if not mitigated */ #define X86_BUG_MMIO_STALE_DATA X86_BUG(25) /* CPU is affected by Processor MMIO Stale Data vulnerabilities */ #define X86_BUG_RETBLEED X86_BUG(26) /* CPU is affected by RETBleed */ +#define X86_BUG_EIBRS_PBRSB X86_BUG(27) /* EIBRS is vulnerable to Post Barrier RSB Predictions */ #endif /* _ASM_X86_CPUFEATURES_H */ diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ec2967e7249f..8f38265bc81d 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -148,6 +148,10 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_PBRSB_NO BIT(24) /* + * Not susceptible to Post-Barrier + * Return Stack Buffer Predictions. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 9a79b96e5521..96597246f793 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -118,13 +118,28 @@ #endif .endm +.macro ISSUE_UNBALANCED_RET_GUARD + ANNOTATE_INTRA_FUNCTION_CALL + call .Lunbalanced_ret_guard_\@ + int3 +.Lunbalanced_ret_guard_\@: + add $(BITS_PER_LONG/8), %_ASM_SP + lfence +.endm + /* * A simpler FILL_RETURN_BUFFER macro. Don't make people use the CPP * monstrosity above, manually. */ -.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req +.macro FILL_RETURN_BUFFER reg:req nr:req ftr:req ftr2 +.ifb \ftr2 ALTERNATIVE "jmp .Lskip_rsb_\@", "", \ftr +.else + ALTERNATIVE_2 "jmp .Lskip_rsb_\@", "", \ftr, "jmp .Lunbalanced_\@", \ftr2 +.endif __FILL_RETURN_BUFFER(\reg,\nr,%_ASM_SP) +.Lunbalanced_\@: + ISSUE_UNBALANCED_RET_GUARD .Lskip_rsb_\@: .endm diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index a37814c8547e..837e617f3b76 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -1328,6 +1328,53 @@ static void __init spec_ctrl_disable_kernel_rrsba(void) } } +static void __init spectre_v2_determine_rsb_fill_type_at_vmexit(enum spectre_v2_mitigation mode) +{ + /* + * Similar to context switches, there are two types of RSB attacks + * after VM exit: + * + * 1) RSB underflow + * + * 2) Poisoned RSB entry + * + * When retpoline is enabled, both are mitigated by filling/clearing + * the RSB. + * + * When IBRS is enabled, while #1 would be mitigated by the IBRS branch + * prediction isolation protections, RSB still needs to be cleared + * because of #2. Note that SMEP provides no protection here, unlike + * user-space-poisoned RSB entries. + * + * eIBRS should protect against RSB poisoning, but if the EIBRS_PBRSB + * bug is present then a LITE version of RSB protection is required, + * just a single call needs to retire before a RET is executed. + */ + switch (mode) { + case SPECTRE_V2_NONE: + return; + + case SPECTRE_V2_EIBRS_LFENCE: + case SPECTRE_V2_EIBRS: + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT_LITE); + pr_info("Spectre v2 / PBRSB-eIBRS: Retire a single CALL on VMEXIT\n"); + } + return; + + case SPECTRE_V2_EIBRS_RETPOLINE: + case SPECTRE_V2_RETPOLINE: + case SPECTRE_V2_LFENCE: + case SPECTRE_V2_IBRS: + setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + pr_info("Spectre v2 / SpectreRSB : Filling RSB on VMEXIT\n"); + return; + } + + pr_warn_once("Unknown Spectre v2 mode, disabling RSB mitigation at VM exit"); + dump_stack(); +} + static void __init spectre_v2_select_mitigation(void) { enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); @@ -1478,28 +1525,7 @@ static void __init spectre_v2_select_mitigation(void) setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); pr_info("Spectre v2 / SpectreRSB mitigation: Filling RSB on context switch\n"); - /* - * Similar to context switches, there are two types of RSB attacks - * after vmexit: - * - * 1) RSB underflow - * - * 2) Poisoned RSB entry - * - * When retpoline is enabled, both are mitigated by filling/clearing - * the RSB. - * - * When IBRS is enabled, while #1 would be mitigated by the IBRS branch - * prediction isolation protections, RSB still needs to be cleared - * because of #2. Note that SMEP provides no protection here, unlike - * user-space-poisoned RSB entries. - * - * eIBRS, on the other hand, has RSB-poisoning protections, so it - * doesn't need RSB clearing after vmexit. - */ - if (boot_cpu_has(X86_FEATURE_RETPOLINE) || - boot_cpu_has(X86_FEATURE_KERNEL_IBRS)) - setup_force_cpu_cap(X86_FEATURE_RSB_VMEXIT); + spectre_v2_determine_rsb_fill_type_at_vmexit(mode); /* * Retpoline protects the kernel, but doesn't protect firmware. IBRS @@ -2285,6 +2311,19 @@ static char *ibpb_state(void) return ""; } +static char *pbrsb_eibrs_state(void) +{ + if (boot_cpu_has_bug(X86_BUG_EIBRS_PBRSB)) { + if (boot_cpu_has(X86_FEATURE_RSB_VMEXIT_LITE) || + boot_cpu_has(X86_FEATURE_RSB_VMEXIT)) + return ", PBRSB-eIBRS: SW sequence"; + else + return ", PBRSB-eIBRS: Vulnerable"; + } else { + return ", PBRSB-eIBRS: Not affected"; + } +} + static ssize_t spectre_v2_show_state(char *buf) { if (spectre_v2_enabled == SPECTRE_V2_LFENCE) @@ -2297,12 +2336,13 @@ static ssize_t spectre_v2_show_state(char *buf) spectre_v2_enabled == SPECTRE_V2_EIBRS_LFENCE) return sprintf(buf, "Vulnerable: eIBRS+LFENCE with unprivileged eBPF and SMT\n"); - return sprintf(buf, "%s%s%s%s%s%s\n", + return sprintf(buf, "%s%s%s%s%s%s%s\n", spectre_v2_strings[spectre_v2_enabled], ibpb_state(), boot_cpu_has(X86_FEATURE_USE_IBRS_FW) ? ", IBRS_FW" : "", stibp_state(), boot_cpu_has(X86_FEATURE_RSB_CTXSW) ? ", RSB filling" : "", + pbrsb_eibrs_state(), spectre_v2_module_string()); } diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80cc41f79783..4a538ec413b8 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1027,6 +1027,7 @@ static void identify_cpu_without_cpuid(struct cpuinfo_x86 *c) #define NO_SWAPGS BIT(6) #define NO_ITLB_MULTIHIT BIT(7) #define NO_SPECTRE_V2 BIT(8) +#define NO_EIBRS_PBRSB BIT(9) #define VULNWL(vendor, family, model, whitelist) \ X86_MATCH_VENDOR_FAM_MODEL(vendor, family, model, whitelist) @@ -1067,7 +1068,7 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { VULNWL_INTEL(ATOM_GOLDMONT, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), VULNWL_INTEL(ATOM_GOLDMONT_D, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), - VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_GOLDMONT_PLUS, NO_MDS | NO_L1TF | NO_SWAPGS | NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* * Technically, swapgs isn't serializing on AMD (despite it previously @@ -1077,7 +1078,9 @@ static const __initconst struct x86_cpu_id cpu_vuln_whitelist[] = { * good enough for our purposes. */ - VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT), + VULNWL_INTEL(ATOM_TREMONT, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_L, NO_EIBRS_PBRSB), + VULNWL_INTEL(ATOM_TREMONT_D, NO_ITLB_MULTIHIT | NO_EIBRS_PBRSB), /* AMD Family 0xf - 0x12 */ VULNWL_AMD(0x0f, NO_MELTDOWN | NO_SSB | NO_L1TF | NO_MDS | NO_SWAPGS | NO_ITLB_MULTIHIT), @@ -1255,6 +1258,11 @@ static void __init cpu_set_bug_bits(struct cpuinfo_x86 *c) setup_force_cpu_bug(X86_BUG_RETBLEED); } + if (cpu_has(c, X86_FEATURE_IBRS_ENHANCED) && + !cpu_matches(cpu_vuln_whitelist, NO_EIBRS_PBRSB) && + !(ia32_cap & ARCH_CAP_PBRSB_NO)) + setup_force_cpu_bug(X86_BUG_EIBRS_PBRSB); + if (cpu_matches(cpu_vuln_whitelist, NO_MELTDOWN)) return; diff --git a/arch/x86/kvm/vmx/vmenter.S b/arch/x86/kvm/vmx/vmenter.S index 857fa0fc49fa..982138bebb70 100644 --- a/arch/x86/kvm/vmx/vmenter.S +++ b/arch/x86/kvm/vmx/vmenter.S @@ -197,11 +197,13 @@ SYM_INNER_LABEL(vmx_vmexit, SYM_L_GLOBAL) * entries and (in some cases) RSB underflow. * * eIBRS has its own protection against poisoned RSB, so it doesn't - * need the RSB filling sequence. But it does need to be enabled - * before the first unbalanced RET. + * need the RSB filling sequence. But it does need to be enabled, and a + * single call to retire, before the first unbalanced RET. */ - FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT + FILL_RETURN_BUFFER %_ASM_CX, RSB_CLEAR_LOOPS, X86_FEATURE_RSB_VMEXIT,\ + X86_FEATURE_RSB_VMEXIT_LITE + pop %_ASM_ARG2 /* @flags */ pop %_ASM_ARG1 /* @vmx */ diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 3781a7f489ef..bcaedfe60572 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -300,6 +300,7 @@ #define X86_FEATURE_RETPOLINE_LFENCE (11*32+13) /* "" Use LFENCE for Spectre variant 2 */ #define X86_FEATURE_RETHUNK (11*32+14) /* "" Use REturn THUNK */ #define X86_FEATURE_UNRET (11*32+15) /* "" AMD BTB untrain return */ +#define X86_FEATURE_RSB_VMEXIT_LITE (11*32+17) /* "" Fill RSB on VM-Exit when EIBRS is enabled */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index ec2967e7249f..8f38265bc81d 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -148,6 +148,10 @@ * are restricted to targets in * kernel. */ +#define ARCH_CAP_PBRSB_NO BIT(24) /* + * Not susceptible to Post-Barrier + * Return Stack Buffer Predictions. + */ #define MSR_IA32_FLUSH_CMD 0x0000010b #define L1D_FLUSH BIT(0) /* From 5c5c77746ce1108833d1fda005598a749eaef2cb Mon Sep 17 00:00:00 2001 From: Pawan Gupta Date: Tue, 2 Aug 2022 15:47:02 -0700 Subject: [PATCH 30/31] x86/speculation: Add LFENCE to RSB fill sequence commit ba6e31af2be96c4d0536f2152ed6f7b6c11bca47 upstream. RSB fill sequence does not have any protection for miss-prediction of conditional branch at the end of the sequence. CPU can speculatively execute code immediately after the sequence, while RSB filling hasn't completed yet. #define __FILL_RETURN_BUFFER(reg, nr, sp) \ mov $(nr/2), reg; \ 771: \ ANNOTATE_INTRA_FUNCTION_CALL; \ call 772f; \ 773: /* speculation trap */ \ UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 773b; \ 772: \ ANNOTATE_INTRA_FUNCTION_CALL; \ call 774f; \ 775: /* speculation trap */ \ UNWIND_HINT_EMPTY; \ pause; \ lfence; \ jmp 775b; \ 774: \ add $(BITS_PER_LONG/8) * 2, sp; \ dec reg; \ jnz 771b; <----- CPU can miss-predict here. Before RSB is filled, RETs that come in program order after this macro can be executed speculatively, making them vulnerable to RSB-based attacks. Mitigate it by adding an LFENCE after the conditional branch to prevent speculation while RSB is being filled. Suggested-by: Andrew Cooper Signed-off-by: Pawan Gupta Signed-off-by: Borislav Petkov Signed-off-by: Daniel Sneddon Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/nospec-branch.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h index 96597246f793..6a59b2d58a3a 100644 --- a/arch/x86/include/asm/nospec-branch.h +++ b/arch/x86/include/asm/nospec-branch.h @@ -60,7 +60,9 @@ 774: \ add $(BITS_PER_LONG/8) * 2, sp; \ dec reg; \ - jnz 771b; + jnz 771b; \ + /* barrier for jnz misprediction */ \ + lfence; #ifdef __ASSEMBLY__ From 7217df81279835a7aee62a07aabb7b8fb8c766f2 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 11 Aug 2022 13:07:54 +0200 Subject: [PATCH 31/31] Linux 5.15.60 Link: https://lore.kernel.org/r/20220809175514.276643253@linuxfoundation.org Tested-by: Florian Fainelli Tested-by: Bagas Sanjaya Tested-by: Linux Kernel Functional Testing Tested-by: Sudip Mukherjee Tested-by: Guenter Roeck Tested-by: Jon Hunter Tested-by: Shuah Khan Tested-by: Ron Economos Signed-off-by: Greg Kroah-Hartman --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 22bca3948306..4ea646f496c9 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 5 PATCHLEVEL = 15 -SUBLEVEL = 59 +SUBLEVEL = 60 EXTRAVERSION = NAME = Trick or Treat