From 2c07e2437a3e98027c049ca560e4b6e39a975089 Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Wed, 25 Oct 2023 12:11:31 +0200 Subject: [PATCH 001/256] drm/i915/display: Use i915_gem_object_get_dma_address to get dma address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7054b551de18e9875fbdf8d4f3baade428353545 ] Works better for xe like that. obj is no longer const. Signed-off-by: Maarten Lankhorst Link: https://patchwork.freedesktop.org/patch/msgid/20231204134946.16219-1-maarten.lankhorst@linux.intel.com Reviewed-by: Jouni Högander Stable-dep-of: 582dc04b0658 ("drm/i915: Pre-populate the cursor physical dma address") Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/display/intel_cursor.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cursor.c b/drivers/gpu/drm/i915/display/intel_cursor.c index b342fad180ca..0d21c34f7499 100644 --- a/drivers/gpu/drm/i915/display/intel_cursor.c +++ b/drivers/gpu/drm/i915/display/intel_cursor.c @@ -23,6 +23,8 @@ #include "intel_psr.h" #include "skl_watermark.h" +#include "gem/i915_gem_object.h" + /* Cursor formats */ static const u32 intel_cursor_formats[] = { DRM_FORMAT_ARGB8888, @@ -33,11 +35,11 @@ static u32 intel_cursor_base(const struct intel_plane_state *plane_state) struct drm_i915_private *dev_priv = to_i915(plane_state->uapi.plane->dev); const struct drm_framebuffer *fb = plane_state->hw.fb; - const struct drm_i915_gem_object *obj = intel_fb_obj(fb); + struct drm_i915_gem_object *obj = intel_fb_obj(fb); u32 base; if (DISPLAY_INFO(dev_priv)->cursor_needs_physical) - base = sg_dma_address(obj->mm.pages->sgl); + base = i915_gem_object_get_dma_address(obj, 0); else base = intel_plane_ggtt_offset(plane_state); From cc696ce93089e3e1bc28d749aee321a37cabe4bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= Date: Mon, 25 Mar 2024 19:57:38 +0200 Subject: [PATCH 002/256] drm/i915: Pre-populate the cursor physical dma address MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 582dc04b0658ef3b90aeb49cbdd9747c2f1eccc3 ] Calling i915_gem_object_get_dma_address() from the vblank evade critical section triggers might_sleep(). While we know that we've already pinned the framebuffer and thus i915_gem_object_get_dma_address() will in fact not sleep in this case, it seems reasonable to keep the unconditional might_sleep() for maximum coverage. So let's instead pre-populate the dma address during fb pinning, which all happens before we enter the vblank evade critical section. We can use u32 for the dma address as this class of hardware doesn't support >32bit addresses. Cc: stable@vger.kernel.org Fixes: 0225a90981c8 ("drm/i915: Make cursor plane registers unlocked") Reported-by: Borislav Petkov Closes: https://lore.kernel.org/intel-gfx/20240227100342.GAZd2zfmYcPS_SndtO@fat_crate.local/ Signed-off-by: Ville Syrjälä Link: https://patchwork.freedesktop.org/patch/msgid/20240325175738.3440-1-ville.syrjala@linux.intel.com Tested-by: Borislav Petkov (AMD) Reviewed-by: Chaitanya Kumar Borah (cherry picked from commit c1289a5c3594cf04caa94ebf0edeb50c62009f1f) Signed-off-by: Rodrigo Vivi Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/display/intel_cursor.c | 4 +--- drivers/gpu/drm/i915/display/intel_display_types.h | 1 + drivers/gpu/drm/i915/display/intel_fb_pin.c | 10 ++++++++++ 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/i915/display/intel_cursor.c b/drivers/gpu/drm/i915/display/intel_cursor.c index 0d21c34f7499..61df6cd3f377 100644 --- a/drivers/gpu/drm/i915/display/intel_cursor.c +++ b/drivers/gpu/drm/i915/display/intel_cursor.c @@ -34,12 +34,10 @@ static u32 intel_cursor_base(const struct intel_plane_state *plane_state) { struct drm_i915_private *dev_priv = to_i915(plane_state->uapi.plane->dev); - const struct drm_framebuffer *fb = plane_state->hw.fb; - struct drm_i915_gem_object *obj = intel_fb_obj(fb); u32 base; if (DISPLAY_INFO(dev_priv)->cursor_needs_physical) - base = i915_gem_object_get_dma_address(obj, 0); + base = plane_state->phys_dma_addr; else base = intel_plane_ggtt_offset(plane_state); diff --git a/drivers/gpu/drm/i915/display/intel_display_types.h b/drivers/gpu/drm/i915/display/intel_display_types.h index 7fc92b1474cc..8b0dc2b75da4 100644 --- a/drivers/gpu/drm/i915/display/intel_display_types.h +++ b/drivers/gpu/drm/i915/display/intel_display_types.h @@ -701,6 +701,7 @@ struct intel_plane_state { #define PLANE_HAS_FENCE BIT(0) struct intel_fb_view view; + u32 phys_dma_addr; /* for cursor_needs_physical */ /* Plane pxp decryption state */ bool decrypt; diff --git a/drivers/gpu/drm/i915/display/intel_fb_pin.c b/drivers/gpu/drm/i915/display/intel_fb_pin.c index fffd568070d4..a131656757f2 100644 --- a/drivers/gpu/drm/i915/display/intel_fb_pin.c +++ b/drivers/gpu/drm/i915/display/intel_fb_pin.c @@ -254,6 +254,16 @@ int intel_plane_pin_fb(struct intel_plane_state *plane_state) return PTR_ERR(vma); plane_state->ggtt_vma = vma; + + /* + * Pre-populate the dma address before we enter the vblank + * evade critical section as i915_gem_object_get_dma_address() + * will trigger might_sleep() even if it won't actually sleep, + * which is the case when the fb has already been pinned. + */ + if (phys_cursor) + plane_state->phys_dma_addr = + i915_gem_object_get_dma_address(intel_fb_obj(fb), 0); } else { struct intel_framebuffer *intel_fb = to_intel_framebuffer(fb); From 54d38a5ca0f7d1c70af5c0ccfba3b1eb3f8926b1 Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Fri, 15 Mar 2024 10:34:43 +0800 Subject: [PATCH 003/256] scripts/bpf_doc: Use silent mode when exec make cmd [ Upstream commit 5384cc0d1a88c27448a6a4e65b8abe6486de8012 ] When getting kernel version via make, the result may be polluted by other output, like directory change info. e.g. $ export MAKEFLAGS="-w" $ make kernelversion make: Entering directory '/home/net' 6.8.0 make: Leaving directory '/home/net' This will distort the reStructuredText output and make latter rst2man failed like: [...] bpf-helpers.rst:20: (WARNING/2) Field list ends without a blank line; unexpected unindent. [...] Using silent mode would help. e.g. $ make -s --no-print-directory kernelversion 6.8.0 Fixes: fd0a38f9c37d ("scripts/bpf: Set version attribute for bpf-helpers(7) man page") Signed-off-by: Michael Hofmann Signed-off-by: Hangbin Liu Signed-off-by: Daniel Borkmann Reviewed-by: Quentin Monnet Acked-by: Alejandro Colomar Link: https://lore.kernel.org/bpf/20240315023443.2364442-1-liuhangbin@gmail.com Signed-off-by: Sasha Levin --- scripts/bpf_doc.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/bpf_doc.py b/scripts/bpf_doc.py index 0669bac5e900..3f899cc7e99a 100755 --- a/scripts/bpf_doc.py +++ b/scripts/bpf_doc.py @@ -414,8 +414,8 @@ class PrinterRST(Printer): version = version.stdout.decode().rstrip() except: try: - version = subprocess.run(['make', 'kernelversion'], cwd=linuxRoot, - capture_output=True, check=True) + version = subprocess.run(['make', '-s', '--no-print-directory', 'kernelversion'], + cwd=linuxRoot, capture_output=True, check=True) version = version.stdout.decode().rstrip() except: return 'Linux' From c3062bdb859b6e2567e7f5c8cde20c0250bb130f Mon Sep 17 00:00:00 2001 From: Ilya Leoshkevich Date: Wed, 20 Mar 2024 02:54:12 +0100 Subject: [PATCH 004/256] s390/bpf: Fix bpf_plt pointer arithmetic [ Upstream commit 7ded842b356d151ece8ac4985940438e6d3998bb ] Kui-Feng Lee reported a crash on s390x triggered by the dummy_st_ops/dummy_init_ptr_arg test [1]: [<0000000000000002>] 0x2 [<00000000009d5cde>] bpf_struct_ops_test_run+0x156/0x250 [<000000000033145a>] __sys_bpf+0xa1a/0xd00 [<00000000003319dc>] __s390x_sys_bpf+0x44/0x50 [<0000000000c4382c>] __do_syscall+0x244/0x300 [<0000000000c59a40>] system_call+0x70/0x98 This is caused by GCC moving memcpy() after assignments in bpf_jit_plt(), resulting in NULL pointers being written instead of the return and the target addresses. Looking at the GCC internals, the reordering is allowed because the alias analysis thinks that the memcpy() destination and the assignments' left-hand-sides are based on different objects: new_plt and bpf_plt_ret/bpf_plt_target respectively, and therefore they cannot alias. This is in turn due to a violation of the C standard: When two pointers are subtracted, both shall point to elements of the same array object, or one past the last element of the array object ... From the C's perspective, bpf_plt_ret and bpf_plt are distinct objects and cannot be subtracted. In the practical terms, doing so confuses the GCC's alias analysis. The code was written this way in order to let the C side know a few offsets defined in the assembly. While nice, this is by no means necessary. Fix the noncompliance by hardcoding these offsets. [1] https://lore.kernel.org/bpf/c9923c1d-971d-4022-8dc8-1364e929d34c@gmail.com/ Fixes: f1d5df84cd8c ("s390/bpf: Implement bpf_arch_text_poke()") Signed-off-by: Ilya Leoshkevich Message-ID: <20240320015515.11883-1-iii@linux.ibm.com> Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- arch/s390/net/bpf_jit_comp.c | 46 ++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 26 deletions(-) diff --git a/arch/s390/net/bpf_jit_comp.c b/arch/s390/net/bpf_jit_comp.c index e507692e51e7..8af02176f68b 100644 --- a/arch/s390/net/bpf_jit_comp.c +++ b/arch/s390/net/bpf_jit_comp.c @@ -516,11 +516,12 @@ static void bpf_skip(struct bpf_jit *jit, int size) * PLT for hotpatchable calls. The calling convention is the same as for the * ftrace hotpatch trampolines: %r0 is return address, %r1 is clobbered. */ -extern const char bpf_plt[]; -extern const char bpf_plt_ret[]; -extern const char bpf_plt_target[]; -extern const char bpf_plt_end[]; -#define BPF_PLT_SIZE 32 +struct bpf_plt { + char code[16]; + void *ret; + void *target; +} __packed; +extern const struct bpf_plt bpf_plt; asm( ".pushsection .rodata\n" " .balign 8\n" @@ -531,15 +532,14 @@ asm( " .balign 8\n" "bpf_plt_ret: .quad 0\n" "bpf_plt_target: .quad 0\n" - "bpf_plt_end:\n" " .popsection\n" ); -static void bpf_jit_plt(void *plt, void *ret, void *target) +static void bpf_jit_plt(struct bpf_plt *plt, void *ret, void *target) { - memcpy(plt, bpf_plt, BPF_PLT_SIZE); - *(void **)((char *)plt + (bpf_plt_ret - bpf_plt)) = ret; - *(void **)((char *)plt + (bpf_plt_target - bpf_plt)) = target ?: ret; + memcpy(plt, &bpf_plt, sizeof(*plt)); + plt->ret = ret; + plt->target = target; } /* @@ -662,9 +662,9 @@ static void bpf_jit_epilogue(struct bpf_jit *jit, u32 stack_depth) jit->prg = ALIGN(jit->prg, 8); jit->prologue_plt = jit->prg; if (jit->prg_buf) - bpf_jit_plt(jit->prg_buf + jit->prg, + bpf_jit_plt((struct bpf_plt *)(jit->prg_buf + jit->prg), jit->prg_buf + jit->prologue_plt_ret, NULL); - jit->prg += BPF_PLT_SIZE; + jit->prg += sizeof(struct bpf_plt); } static int get_probe_mem_regno(const u8 *insn) @@ -1901,9 +1901,6 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp) struct bpf_jit jit; int pass; - if (WARN_ON_ONCE(bpf_plt_end - bpf_plt != BPF_PLT_SIZE)) - return orig_fp; - if (!fp->jit_requested) return orig_fp; @@ -2009,14 +2006,11 @@ bool bpf_jit_supports_far_kfunc_call(void) int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, void *old_addr, void *new_addr) { + struct bpf_plt expected_plt, current_plt, new_plt, *plt; struct { u16 opc; s32 disp; } __packed insn; - char expected_plt[BPF_PLT_SIZE]; - char current_plt[BPF_PLT_SIZE]; - char new_plt[BPF_PLT_SIZE]; - char *plt; char *ret; int err; @@ -2035,18 +2029,18 @@ int bpf_arch_text_poke(void *ip, enum bpf_text_poke_type t, */ } else { /* Verify the PLT. */ - plt = (char *)ip + (insn.disp << 1); - err = copy_from_kernel_nofault(current_plt, plt, BPF_PLT_SIZE); + plt = ip + (insn.disp << 1); + err = copy_from_kernel_nofault(¤t_plt, plt, + sizeof(current_plt)); if (err < 0) return err; ret = (char *)ip + 6; - bpf_jit_plt(expected_plt, ret, old_addr); - if (memcmp(current_plt, expected_plt, BPF_PLT_SIZE)) + bpf_jit_plt(&expected_plt, ret, old_addr); + if (memcmp(¤t_plt, &expected_plt, sizeof(current_plt))) return -EINVAL; /* Adjust the call address. */ - bpf_jit_plt(new_plt, ret, new_addr); - s390_kernel_write(plt + (bpf_plt_target - bpf_plt), - new_plt + (bpf_plt_target - bpf_plt), + bpf_jit_plt(&new_plt, ret, new_addr); + s390_kernel_write(&plt->target, &new_plt.target, sizeof(void *)); } From aeecb678ec369d888b90d4a4dae9871fd8e56ae8 Mon Sep 17 00:00:00 2001 From: Puranjay Mohan Date: Tue, 12 Mar 2024 23:59:17 +0000 Subject: [PATCH 005/256] bpf, arm64: fix bug in BPF_LDX_MEMSX [ Upstream commit 114b5b3b4bde7358624437be2f12cde1b265224e ] A64_LDRSW() takes three registers: Xt, Xn, Xm as arguments and it loads and sign extends the value at address Xn + Xm into register Xt. Currently, the offset is being directly used in place of the tmp register which has the offset already loaded by the last emitted instruction. This will cause JIT failures. The easiest way to reproduce this is to test the following code through test_bpf module: { "BPF_LDX_MEMSX | BPF_W", .u.insns_int = { BPF_LD_IMM64(R1, 0x00000000deadbeefULL), BPF_LD_IMM64(R2, 0xffffffffdeadbeefULL), BPF_STX_MEM(BPF_DW, R10, R1, -7), BPF_LDX_MEMSX(BPF_W, R0, R10, -7), BPF_JMP_REG(BPF_JNE, R0, R2, 1), BPF_ALU64_IMM(BPF_MOV, R0, 0), BPF_EXIT_INSN(), }, INTERNAL, { }, { { 0, 0 } }, .stack_depth = 7, }, We need to use the offset as -7 to trigger this code path, there could be other valid ways to trigger this from proper BPF programs as well. This code is rejected by the JIT because -7 is passed to A64_LDRSW() but it expects a valid register (0 - 31). roott@pjy:~# modprobe test_bpf test_name="BPF_LDX_MEMSX | BPF_W" [11300.490371] test_bpf: test_bpf: set 'test_bpf' as the default test_suite. [11300.491750] test_bpf: #345 BPF_LDX_MEMSX | BPF_W [11300.493179] aarch64_insn_encode_register: unknown register encoding -7 [11300.494133] aarch64_insn_encode_register: unknown register encoding -7 [11300.495292] FAIL to select_runtime err=-524 [11300.496804] test_bpf: Summary: 0 PASSED, 1 FAILED, [0/0 JIT'ed] modprobe: ERROR: could not insert 'test_bpf': Invalid argument Applying this patch fixes the issue. root@pjy:~# modprobe test_bpf test_name="BPF_LDX_MEMSX | BPF_W" [ 292.837436] test_bpf: test_bpf: set 'test_bpf' as the default test_suite. [ 292.839416] test_bpf: #345 BPF_LDX_MEMSX | BPF_W jited:1 156 PASS [ 292.844794] test_bpf: Summary: 1 PASSED, 0 FAILED, [1/1 JIT'ed] Fixes: cc88f540da52 ("bpf, arm64: Support sign-extension load instructions") Signed-off-by: Puranjay Mohan Message-ID: <20240312235917.103626-1-puranjay12@gmail.com> Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 150d1c6543f7..5fe4d8b3fdc8 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -1189,7 +1189,7 @@ emit_cond_jmp: } else { emit_a64_mov_i(1, tmp, off, ctx); if (sign_extend) - emit(A64_LDRSW(dst, src_adj, off_adj), ctx); + emit(A64_LDRSW(dst, src, tmp), ctx); else emit(A64_LDR32(dst, src, tmp), ctx); } From 156c226cbbdcf5f3bce7b2408a33b59fab7fae2c Mon Sep 17 00:00:00 2001 From: Pavel Sakharov Date: Wed, 20 Mar 2024 04:15:23 +0500 Subject: [PATCH 006/256] dma-buf: Fix NULL pointer dereference in sanitycheck() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 2295bd846765c766701e666ed2e4b35396be25e6 ] If due to a memory allocation failure mock_chain() returns NULL, it is passed to dma_fence_enable_sw_signaling() resulting in NULL pointer dereference there. Call dma_fence_enable_sw_signaling() only if mock_chain() succeeds. Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: d62c43a953ce ("dma-buf: Enable signaling on fence for selftests") Signed-off-by: Pavel Sakharov Reviewed-by: Christian König Signed-off-by: Christian König Link: https://patchwork.freedesktop.org/patch/msgid/20240319231527.1821372-1-p.sakharov@ispras.ru Signed-off-by: Sasha Levin --- drivers/dma-buf/st-dma-fence-chain.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/dma-buf/st-dma-fence-chain.c b/drivers/dma-buf/st-dma-fence-chain.c index c0979c8049b5..661de4add4c7 100644 --- a/drivers/dma-buf/st-dma-fence-chain.c +++ b/drivers/dma-buf/st-dma-fence-chain.c @@ -84,11 +84,11 @@ static int sanitycheck(void *arg) return -ENOMEM; chain = mock_chain(NULL, f, 1); - if (!chain) + if (chain) + dma_fence_enable_sw_signaling(chain); + else err = -ENOMEM; - dma_fence_enable_sw_signaling(chain); - dma_fence_signal(f); dma_fence_put(f); From 087dc50d8bafbf72dd52d3c1c4e140e95f89ab2a Mon Sep 17 00:00:00 2001 From: Artem Savkov Date: Thu, 21 Mar 2024 09:18:09 +0100 Subject: [PATCH 007/256] arm64: bpf: fix 32bit unconditional bswap [ Upstream commit a51cd6bf8e10793103c5870ff9e4db295a843604 ] In case when is64 == 1 in emit(A64_REV32(is64, dst, dst), ctx) the generated insn reverses byte order for both high and low 32-bit words, resuling in an incorrect swap as indicated by the jit test: [ 9757.262607] test_bpf: #312 BSWAP 16: 0x0123456789abcdef -> 0xefcd jited:1 8 PASS [ 9757.264435] test_bpf: #313 BSWAP 32: 0x0123456789abcdef -> 0xefcdab89 jited:1 ret 1460850314 != -271733879 (0x5712ce8a != 0xefcdab89)FAIL (1 times) [ 9757.266260] test_bpf: #314 BSWAP 64: 0x0123456789abcdef -> 0x67452301 jited:1 8 PASS [ 9757.268000] test_bpf: #315 BSWAP 64: 0x0123456789abcdef >> 32 -> 0xefcdab89 jited:1 8 PASS [ 9757.269686] test_bpf: #316 BSWAP 16: 0xfedcba9876543210 -> 0x1032 jited:1 8 PASS [ 9757.271380] test_bpf: #317 BSWAP 32: 0xfedcba9876543210 -> 0x10325476 jited:1 ret -1460850316 != 271733878 (0xa8ed3174 != 0x10325476)FAIL (1 times) [ 9757.273022] test_bpf: #318 BSWAP 64: 0xfedcba9876543210 -> 0x98badcfe jited:1 7 PASS [ 9757.274721] test_bpf: #319 BSWAP 64: 0xfedcba9876543210 >> 32 -> 0x10325476 jited:1 9 PASS Fix this by forcing 32bit variant of rev32. Fixes: 1104247f3f979 ("bpf, arm64: Support unconditional bswap") Signed-off-by: Artem Savkov Tested-by: Puranjay Mohan Acked-by: Puranjay Mohan Acked-by: Xu Kuohai Message-ID: <20240321081809.158803-1-asavkov@redhat.com> Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- arch/arm64/net/bpf_jit_comp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/net/bpf_jit_comp.c b/arch/arm64/net/bpf_jit_comp.c index 5fe4d8b3fdc8..29196dce9b91 100644 --- a/arch/arm64/net/bpf_jit_comp.c +++ b/arch/arm64/net/bpf_jit_comp.c @@ -876,7 +876,7 @@ static int build_insn(const struct bpf_insn *insn, struct jit_ctx *ctx, emit(A64_UXTH(is64, dst, dst), ctx); break; case 32: - emit(A64_REV32(is64, dst, dst), ctx); + emit(A64_REV32(0, dst, dst), ctx); /* upper 32 bits already cleared */ break; case 64: From a946ebee45b09294c8b0b0e77410b763c4d2817a Mon Sep 17 00:00:00 2001 From: Ryosuke Yasuoka Date: Wed, 20 Mar 2024 09:54:10 +0900 Subject: [PATCH 008/256] nfc: nci: Fix uninit-value in nci_dev_up and nci_ntf_packet [ Upstream commit d24b03535e5eb82e025219c2f632b485409c898f ] syzbot reported the following uninit-value access issue [1][2]: nci_rx_work() parses and processes received packet. When the payload length is zero, each message type handler reads uninitialized payload and KMSAN detects this issue. The receipt of a packet with a zero-size payload is considered unexpected, and therefore, such packets should be silently discarded. This patch resolved this issue by checking payload size before calling each message type handler codes. Fixes: 6a2968aaf50c ("NFC: basic NCI protocol implementation") Reported-and-tested-by: syzbot+7ea9413ea6749baf5574@syzkaller.appspotmail.com Reported-and-tested-by: syzbot+29b5ca705d2e0f4a44d2@syzkaller.appspotmail.com Closes: https://syzkaller.appspot.com/bug?extid=7ea9413ea6749baf5574 [1] Closes: https://syzkaller.appspot.com/bug?extid=29b5ca705d2e0f4a44d2 [2] Signed-off-by: Ryosuke Yasuoka Reviewed-by: Jeremy Cline Reviewed-by: Krzysztof Kozlowski Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/nfc/nci/core.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c index 12684d835cb5..772ddb5824d9 100644 --- a/net/nfc/nci/core.c +++ b/net/nfc/nci/core.c @@ -1516,6 +1516,11 @@ static void nci_rx_work(struct work_struct *work) nfc_send_to_raw_sock(ndev->nfc_dev, skb, RAW_PAYLOAD_NCI, NFC_DIRECTION_RX); + if (!nci_plen(skb->data)) { + kfree_skb(skb); + break; + } + /* Process frame */ switch (nci_mt(skb->data)) { case NCI_MT_RSP_PKT: From 5c05bdd95f0e07ca3acf4f186771cb312c764703 Mon Sep 17 00:00:00 2001 From: Jakub Kicinski Date: Wed, 20 Mar 2024 19:02:14 -0700 Subject: [PATCH 009/256] tools: ynl: fix setting presence bits in simple nests [ Upstream commit f6c8f5e8694c7a78c94e408b628afa6255cc428a ] When we set members of simple nested structures in requests we need to set "presence" bits for all the nesting layers below. This has nothing to do with the presence type of the last layer. Fixes: be5bea1cc0bf ("net: add basic C code generators for Netlink") Reviewed-by: Breno Leitao Link: https://lore.kernel.org/r/20240321020214.1250202-1-kuba@kernel.org Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- tools/net/ynl/ynl-gen-c.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/tools/net/ynl/ynl-gen-c.py b/tools/net/ynl/ynl-gen-c.py index 897af958cee8..575b7e248e52 100755 --- a/tools/net/ynl/ynl-gen-c.py +++ b/tools/net/ynl/ynl-gen-c.py @@ -198,8 +198,11 @@ class Type(SpecAttr): presence = '' for i in range(0, len(ref)): presence = f"{var}->{'.'.join(ref[:i] + [''])}_present.{ref[i]}" - if self.presence_type() == 'bit': - code.append(presence + ' = 1;') + # Every layer below last is a nest, so we know it uses bit presence + # last layer is "self" and may be a complex type + if i == len(ref) - 1 and self.presence_type() != 'bit': + continue + code.append(presence + ' = 1;') code += self._setter_lines(ri, member, presence) func_name = f"{op_prefix(ri, direction, deref=deref)}_set_{'_'.join(ref)}" From 84d30c56786a5c0f6247f94f9d5925076f7b768d Mon Sep 17 00:00:00 2001 From: David Thompson Date: Wed, 20 Mar 2024 15:31:17 -0400 Subject: [PATCH 010/256] mlxbf_gige: stop PHY during open() error paths [ Upstream commit d6c30c5a168f8586b8bcc0d8e42e2456eb05209b ] The mlxbf_gige_open() routine starts the PHY as part of normal initialization. The mlxbf_gige_open() routine must stop the PHY during its error paths. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Signed-off-by: David Thompson Reviewed-by: Asmaa Mnebhi Reviewed-by: Andrew Lunn Reviewed-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index aaf1faed4133..044ff5f87b5e 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -157,7 +157,7 @@ static int mlxbf_gige_open(struct net_device *netdev) err = mlxbf_gige_tx_init(priv); if (err) - goto free_irqs; + goto phy_deinit; err = mlxbf_gige_rx_init(priv); if (err) goto tx_deinit; @@ -185,6 +185,9 @@ static int mlxbf_gige_open(struct net_device *netdev) tx_deinit: mlxbf_gige_tx_deinit(priv); +phy_deinit: + phy_stop(phydev); + free_irqs: mlxbf_gige_free_irqs(priv); return err; From c0a40f2f8eba07416f695ffe2011bf3f8b0b6dc8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 19 Mar 2024 10:10:17 +0200 Subject: [PATCH 011/256] wifi: iwlwifi: mvm: rfi: fix potential response leaks [ Upstream commit 06a093807eb7b5c5b29b6cff49f8174a4e702341 ] If the rx payload length check fails, or if kmemdup() fails, we still need to free the command response. Fix that. Fixes: 21254908cbe9 ("iwlwifi: mvm: add RFI-M support") Co-authored-by: Anjaneyulu Signed-off-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240319100755.db2fa0196aa7.I116293b132502ac68a65527330fa37799694b79c@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/mvm/rfi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rfi.c b/drivers/net/wireless/intel/iwlwifi/mvm/rfi.c index 2ecd32bed752..045c862a8fc4 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rfi.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rfi.c @@ -132,14 +132,18 @@ struct iwl_rfi_freq_table_resp_cmd *iwl_rfi_get_freq_table(struct iwl_mvm *mvm) if (ret) return ERR_PTR(ret); - if (WARN_ON_ONCE(iwl_rx_packet_payload_len(cmd.resp_pkt) != resp_size)) + if (WARN_ON_ONCE(iwl_rx_packet_payload_len(cmd.resp_pkt) != + resp_size)) { + iwl_free_resp(&cmd); return ERR_PTR(-EIO); + } resp = kmemdup(cmd.resp_pkt->data, resp_size, GFP_KERNEL); + iwl_free_resp(&cmd); + if (!resp) return ERR_PTR(-ENOMEM); - iwl_free_resp(&cmd); return resp; } From 13fd96c975969b28669004a3ed5f2044f7ac3053 Mon Sep 17 00:00:00 2001 From: Emmanuel Grumbach Date: Tue, 17 Oct 2023 12:16:46 +0300 Subject: [PATCH 012/256] wifi: iwlwifi: disable multi rx queue for 9000 [ Upstream commit 29fa9a984b6d1075020f12071a89897fd62ed27f ] Multi rx queue allows to spread the load of the Rx streams on different CPUs. 9000 series required complex synchronization mechanisms from the driver side since the hardware / firmware is not able to provide information about duplicate packets and timeouts inside the reordering buffer. Users have complained that for newer devices, all those synchronization mechanisms have caused spurious packet drops. Those packet drops disappeared if we simplify the code, but unfortunately, we can't have RSS enabled on 9000 series without this complex code. Remove support for RSS on 9000 so that we can make the code much simpler for newer devices and fix the bugs for them. The down side of this patch is a that all the Rx path will be routed to a single CPU, but this has never been an issue, the modern CPUs are just fast enough to cope with all the traffic. Signed-off-by: Emmanuel Grumbach Signed-off-by: Gregory Greenman Link: https://lore.kernel.org/r/20231017115047.2917eb8b7af9.Iddd7dcf335387ba46fcbbb6067ef4ff9cd3755a7@changeid Signed-off-by: Johannes Berg Stable-dep-of: e78d78773089 ("wifi: iwlwifi: mvm: include link ID when releasing frames") Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/iwl-trans.h | 2 +- drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c | 4 +++- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 11 ++++++++++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h index 168eda2132fb..9dcc1506bd0b 100644 --- a/drivers/net/wireless/intel/iwlwifi/iwl-trans.h +++ b/drivers/net/wireless/intel/iwlwifi/iwl-trans.h @@ -278,7 +278,7 @@ static inline void iwl_free_rxb(struct iwl_rx_cmd_buffer *r) #define IWL_MGMT_TID 15 #define IWL_FRAME_LIMIT 64 #define IWL_MAX_RX_HW_QUEUES 16 -#define IWL_9000_MAX_RX_HW_QUEUES 6 +#define IWL_9000_MAX_RX_HW_QUEUES 1 /** * enum iwl_wowlan_status - WoWLAN image/device status diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index aaa9840d0d4c..ee9d14250a26 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -352,7 +352,9 @@ int iwl_mvm_mac_setup_register(struct iwl_mvm *mvm) ieee80211_hw_set(hw, HAS_RATE_CONTROL); } - if (iwl_mvm_has_new_rx_api(mvm)) + /* We want to use the mac80211's reorder buffer for 9000 */ + if (iwl_mvm_has_new_rx_api(mvm) && + mvm->trans->trans_cfg->device_family > IWL_DEVICE_FAMILY_9000) ieee80211_hw_set(hw, SUPPORTS_REORDERING_BUFFER); if (fw_has_capa(&mvm->fw->ucode_capa, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index bac0228b8c86..92b3e18dbe87 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -963,6 +963,9 @@ static bool iwl_mvm_reorder(struct iwl_mvm *mvm, baid = (reorder & IWL_RX_MPDU_REORDER_BAID_MASK) >> IWL_RX_MPDU_REORDER_BAID_SHIFT; + if (mvm->trans->trans_cfg->device_family == IWL_DEVICE_FAMILY_9000) + return false; + /* * This also covers the case of receiving a Block Ack Request * outside a BA session; we'll pass it to mac80211 and that @@ -2621,9 +2624,15 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi, if (!iwl_mvm_reorder(mvm, napi, queue, sta, skb, desc) && likely(!iwl_mvm_time_sync_frame(mvm, skb, hdr->addr2)) && - likely(!iwl_mvm_mei_filter_scan(mvm, skb))) + likely(!iwl_mvm_mei_filter_scan(mvm, skb))) { + if (mvm->trans->trans_cfg->device_family == IWL_DEVICE_FAMILY_9000 && + (desc->mac_flags2 & IWL_RX_MPDU_MFLG2_AMSDU) && + !(desc->amsdu_info & IWL_RX_MPDU_AMSDU_LAST_SUBFRAME)) + rx_status->flag |= RX_FLAG_AMSDU_MORE; + iwl_mvm_pass_packet_to_mac80211(mvm, napi, skb, queue, sta, link_sta); + } out: rcu_read_unlock(); } From dc1ec9c5efec2df1f249c53cde937d2b938b61fa Mon Sep 17 00:00:00 2001 From: Benjamin Berg Date: Wed, 20 Mar 2024 23:26:22 +0200 Subject: [PATCH 013/256] wifi: iwlwifi: mvm: include link ID when releasing frames [ Upstream commit e78d7877308989ef91b64a3c746ae31324c07caa ] When releasing frames from the reorder buffer, the link ID was not included in the RX status information. This subsequently led mac80211 to drop the frame. Change it so that the link information is set immediately when possible so that it doesn't not need to be filled in anymore when submitting the frame to mac80211. Fixes: b8a85a1d42d7 ("wifi: iwlwifi: mvm: rxmq: report link ID to mac80211") Signed-off-by: Benjamin Berg Tested-by: Emmanuel Grumbach Reviewed-by: Johannes Berg Signed-off-by: Miri Korenblit Link: https://msgid.link/20240320232419.bbbd5e9bfe80.Iec1bf5c884e371f7bc5ea2534ed9ea8d3f2c0bf6@changeid Signed-off-by: Johannes Berg Signed-off-by: Sasha Levin --- drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c | 20 ++++++++----------- 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index 92b3e18dbe87..e9360b555ac9 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -236,21 +236,13 @@ static void iwl_mvm_add_rtap_sniffer_config(struct iwl_mvm *mvm, static void iwl_mvm_pass_packet_to_mac80211(struct iwl_mvm *mvm, struct napi_struct *napi, struct sk_buff *skb, int queue, - struct ieee80211_sta *sta, - struct ieee80211_link_sta *link_sta) + struct ieee80211_sta *sta) { if (unlikely(iwl_mvm_check_pn(mvm, skb, queue, sta))) { kfree_skb(skb); return; } - if (sta && sta->valid_links && link_sta) { - struct ieee80211_rx_status *rx_status = IEEE80211_SKB_RXCB(skb); - - rx_status->link_valid = 1; - rx_status->link_id = link_sta->link_id; - } - ieee80211_rx_napi(mvm->hw, sta, skb, napi); } @@ -636,7 +628,7 @@ static void iwl_mvm_release_frames(struct iwl_mvm *mvm, while ((skb = __skb_dequeue(skb_list))) { iwl_mvm_pass_packet_to_mac80211(mvm, napi, skb, reorder_buf->queue, - sta, NULL /* FIXME */); + sta); reorder_buf->num_stored--; } } @@ -2489,6 +2481,11 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi, if (IS_ERR(sta)) sta = NULL; link_sta = rcu_dereference(mvm->fw_id_to_link_sta[id]); + + if (sta && sta->valid_links && link_sta) { + rx_status->link_valid = 1; + rx_status->link_id = link_sta->link_id; + } } } else if (!is_multicast_ether_addr(hdr->addr2)) { /* @@ -2630,8 +2627,7 @@ void iwl_mvm_rx_mpdu_mq(struct iwl_mvm *mvm, struct napi_struct *napi, !(desc->amsdu_info & IWL_RX_MPDU_AMSDU_LAST_SUBFRAME)) rx_status->flag |= RX_FLAG_AMSDU_MORE; - iwl_mvm_pass_packet_to_mac80211(mvm, napi, skb, queue, sta, - link_sta); + iwl_mvm_pass_packet_to_mac80211(mvm, napi, skb, queue, sta); } out: rcu_read_unlock(); From 0172edc572b052192eb89ebfb93191f8bb5bf1bf Mon Sep 17 00:00:00 2001 From: Simon Trimmer Date: Mon, 25 Mar 2024 14:55:10 +0000 Subject: [PATCH 014/256] ALSA: hda: cs35l56: Set the init_done flag before component_add() [ Upstream commit cafe9c6a72cf1ffe96d2561d988a141cb5c093db ] Initialization is completed before adding the component as that can start the process of the device binding and trigger actions that check init_done. Signed-off-by: Simon Trimmer Signed-off-by: Richard Fitzgerald Fixes: 73cfbfa9caea ("ALSA: hda/cs35l56: Add driver for Cirrus Logic CS35L56 amplifier") Message-ID: <20240325145510.328378-1-rf@opensource.cirrus.com> Signed-off-by: Takashi Iwai Signed-off-by: Sasha Levin --- sound/pci/hda/cs35l56_hda.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/sound/pci/hda/cs35l56_hda.c b/sound/pci/hda/cs35l56_hda.c index 7adc1d373d65..27848d646963 100644 --- a/sound/pci/hda/cs35l56_hda.c +++ b/sound/pci/hda/cs35l56_hda.c @@ -978,14 +978,14 @@ int cs35l56_hda_common_probe(struct cs35l56_hda *cs35l56, int id) pm_runtime_mark_last_busy(cs35l56->base.dev); pm_runtime_enable(cs35l56->base.dev); + cs35l56->base.init_done = true; + ret = component_add(cs35l56->base.dev, &cs35l56_hda_comp_ops); if (ret) { dev_err(cs35l56->base.dev, "Register component failed: %d\n", ret); goto pm_err; } - cs35l56->base.init_done = true; - return 0; pm_err: From 493b29930f66b865d969badfb3d0431dae76a1ff Mon Sep 17 00:00:00 2001 From: Steven Zou Date: Wed, 7 Feb 2024 09:49:59 +0800 Subject: [PATCH 015/256] ice: Refactor FW data type and fix bitmap casting issue [ Upstream commit 817b18965b58a6e5fb6ce97abf01b03a205a6aea ] According to the datasheet, the recipe association data is an 8-byte little-endian value. It is described as 'Bitmap of the recipe indexes associated with this profile', it is from 24 to 31 byte area in FW. Therefore, it is defined to '__le64 recipe_assoc' in struct ice_aqc_recipe_to_profile. And then fix the bitmap casting issue, as we must never ever use castings for bitmap type. Fixes: 1e0f9881ef79 ("ice: Flesh out implementation of support for SRIOV on bonded interface") Reviewed-by: Przemek Kitszel Reviewed-by: Andrii Staikov Reviewed-by: Jan Sokolowski Reviewed-by: Simon Horman Signed-off-by: Steven Zou Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- .../net/ethernet/intel/ice/ice_adminq_cmd.h | 3 ++- drivers/net/ethernet/intel/ice/ice_lag.c | 4 ++-- drivers/net/ethernet/intel/ice/ice_switch.c | 24 +++++++++++-------- drivers/net/ethernet/intel/ice/ice_switch.h | 4 ++-- 4 files changed, 20 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h index 45f3e351653d..72ca2199c957 100644 --- a/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h +++ b/drivers/net/ethernet/intel/ice/ice_adminq_cmd.h @@ -592,8 +592,9 @@ struct ice_aqc_recipe_data_elem { struct ice_aqc_recipe_to_profile { __le16 profile_id; u8 rsvd[6]; - DECLARE_BITMAP(recipe_assoc, ICE_MAX_NUM_RECIPES); + __le64 recipe_assoc; }; +static_assert(sizeof(struct ice_aqc_recipe_to_profile) == 16); /* Add/Update/Remove/Get switch rules (indirect 0x02A0, 0x02A1, 0x02A2, 0x02A3) */ diff --git a/drivers/net/ethernet/intel/ice/ice_lag.c b/drivers/net/ethernet/intel/ice/ice_lag.c index 23e197c3d02a..4e675c7c199f 100644 --- a/drivers/net/ethernet/intel/ice/ice_lag.c +++ b/drivers/net/ethernet/intel/ice/ice_lag.c @@ -2000,14 +2000,14 @@ int ice_init_lag(struct ice_pf *pf) /* associate recipes to profiles */ for (n = 0; n < ICE_PROFID_IPV6_GTPU_IPV6_TCP_INNER; n++) { err = ice_aq_get_recipe_to_profile(&pf->hw, n, - (u8 *)&recipe_bits, NULL); + &recipe_bits, NULL); if (err) continue; if (recipe_bits & BIT(ICE_SW_LKUP_DFLT)) { recipe_bits |= BIT(lag->pf_recipe); ice_aq_map_recipe_to_profile(&pf->hw, n, - (u8 *)&recipe_bits, NULL); + recipe_bits, NULL); } } diff --git a/drivers/net/ethernet/intel/ice/ice_switch.c b/drivers/net/ethernet/intel/ice/ice_switch.c index 2f77b684ff76..4c6d58bb2690 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.c +++ b/drivers/net/ethernet/intel/ice/ice_switch.c @@ -2032,12 +2032,12 @@ error_out: * ice_aq_map_recipe_to_profile - Map recipe to packet profile * @hw: pointer to the HW struct * @profile_id: package profile ID to associate the recipe with - * @r_bitmap: Recipe bitmap filled in and need to be returned as response + * @r_assoc: Recipe bitmap filled in and need to be returned as response * @cd: pointer to command details structure or NULL * Recipe to profile association (0x0291) */ int -ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, +ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 r_assoc, struct ice_sq_cd *cd) { struct ice_aqc_recipe_to_profile *cmd; @@ -2049,7 +2049,7 @@ ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, /* Set the recipe ID bit in the bitmask to let the device know which * profile we are associating the recipe to */ - memcpy(cmd->recipe_assoc, r_bitmap, sizeof(cmd->recipe_assoc)); + cmd->recipe_assoc = cpu_to_le64(r_assoc); return ice_aq_send_cmd(hw, &desc, NULL, 0, cd); } @@ -2058,12 +2058,12 @@ ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, * ice_aq_get_recipe_to_profile - Map recipe to packet profile * @hw: pointer to the HW struct * @profile_id: package profile ID to associate the recipe with - * @r_bitmap: Recipe bitmap filled in and need to be returned as response + * @r_assoc: Recipe bitmap filled in and need to be returned as response * @cd: pointer to command details structure or NULL * Associate profile ID with given recipe (0x0293) */ int -ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, +ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 *r_assoc, struct ice_sq_cd *cd) { struct ice_aqc_recipe_to_profile *cmd; @@ -2076,7 +2076,7 @@ ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, status = ice_aq_send_cmd(hw, &desc, NULL, 0, cd); if (!status) - memcpy(r_bitmap, cmd->recipe_assoc, sizeof(cmd->recipe_assoc)); + *r_assoc = le64_to_cpu(cmd->recipe_assoc); return status; } @@ -2121,6 +2121,7 @@ int ice_alloc_recipe(struct ice_hw *hw, u16 *rid) static void ice_get_recp_to_prof_map(struct ice_hw *hw) { DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + u64 recp_assoc; u16 i; for (i = 0; i < hw->switch_info->max_used_prof_index + 1; i++) { @@ -2128,8 +2129,9 @@ static void ice_get_recp_to_prof_map(struct ice_hw *hw) bitmap_zero(profile_to_recipe[i], ICE_MAX_NUM_RECIPES); bitmap_zero(r_bitmap, ICE_MAX_NUM_RECIPES); - if (ice_aq_get_recipe_to_profile(hw, i, (u8 *)r_bitmap, NULL)) + if (ice_aq_get_recipe_to_profile(hw, i, &recp_assoc, NULL)) continue; + bitmap_from_arr64(r_bitmap, &recp_assoc, ICE_MAX_NUM_RECIPES); bitmap_copy(profile_to_recipe[i], r_bitmap, ICE_MAX_NUM_RECIPES); for_each_set_bit(j, r_bitmap, ICE_MAX_NUM_RECIPES) @@ -5431,22 +5433,24 @@ ice_add_adv_recipe(struct ice_hw *hw, struct ice_adv_lkup_elem *lkups, */ list_for_each_entry(fvit, &rm->fv_list, list_entry) { DECLARE_BITMAP(r_bitmap, ICE_MAX_NUM_RECIPES); + u64 recp_assoc; u16 j; status = ice_aq_get_recipe_to_profile(hw, fvit->profile_id, - (u8 *)r_bitmap, NULL); + &recp_assoc, NULL); if (status) goto err_unroll; + bitmap_from_arr64(r_bitmap, &recp_assoc, ICE_MAX_NUM_RECIPES); bitmap_or(r_bitmap, r_bitmap, rm->r_bitmap, ICE_MAX_NUM_RECIPES); status = ice_acquire_change_lock(hw, ICE_RES_WRITE); if (status) goto err_unroll; + bitmap_to_arr64(&recp_assoc, r_bitmap, ICE_MAX_NUM_RECIPES); status = ice_aq_map_recipe_to_profile(hw, fvit->profile_id, - (u8 *)r_bitmap, - NULL); + recp_assoc, NULL); ice_release_change_lock(hw); if (status) diff --git a/drivers/net/ethernet/intel/ice/ice_switch.h b/drivers/net/ethernet/intel/ice/ice_switch.h index db7e501b7e0a..89ffa1b51b5a 100644 --- a/drivers/net/ethernet/intel/ice/ice_switch.h +++ b/drivers/net/ethernet/intel/ice/ice_switch.h @@ -424,10 +424,10 @@ int ice_aq_add_recipe(struct ice_hw *hw, struct ice_aqc_recipe_data_elem *s_recipe_list, u16 num_recipes, struct ice_sq_cd *cd); int -ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, +ice_aq_get_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 *r_assoc, struct ice_sq_cd *cd); int -ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u8 *r_bitmap, +ice_aq_map_recipe_to_profile(struct ice_hw *hw, u32 profile_id, u64 r_assoc, struct ice_sq_cd *cd); #endif /* _ICE_SWITCH_H_ */ From feddf6c09c44339a75a54322573ae14ef226404b Mon Sep 17 00:00:00 2001 From: Michal Swiatkowski Date: Tue, 24 Oct 2023 13:09:26 +0200 Subject: [PATCH 016/256] ice: realloc VSI stats arrays [ Upstream commit 5995ef88e3a8c2b014f51256a88be8e336532ce7 ] Previously only case when queues amount is lower was covered. Implement realloc for case when queues amount is higher than previous one. Use krealloc() function and zero new allocated elements. It has to be done before ice_vsi_def_cfg(), because stats element for ring is set there. Reviewed-by: Wojciech Drewek Signed-off-by: Michal Swiatkowski Tested-by: Sujai Buvaneswaran Signed-off-by: Tony Nguyen Stable-dep-of: 1cb7fdb1dfde ("ice: fix memory corruption bug with suspend and rebuild") Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ice/ice_lib.c | 58 ++++++++++++++++-------- 1 file changed, 39 insertions(+), 19 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 7f4bc110ead4..47298ab675a5 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -3084,27 +3084,26 @@ ice_vsi_rebuild_set_coalesce(struct ice_vsi *vsi, } /** - * ice_vsi_realloc_stat_arrays - Frees unused stat structures + * ice_vsi_realloc_stat_arrays - Frees unused stat structures or alloc new ones * @vsi: VSI pointer - * @prev_txq: Number of Tx rings before ring reallocation - * @prev_rxq: Number of Rx rings before ring reallocation */ -static void -ice_vsi_realloc_stat_arrays(struct ice_vsi *vsi, int prev_txq, int prev_rxq) +static int +ice_vsi_realloc_stat_arrays(struct ice_vsi *vsi) { + u16 req_txq = vsi->req_txq ? vsi->req_txq : vsi->alloc_txq; + u16 req_rxq = vsi->req_rxq ? vsi->req_rxq : vsi->alloc_rxq; + struct ice_ring_stats **tx_ring_stats; + struct ice_ring_stats **rx_ring_stats; struct ice_vsi_stats *vsi_stat; struct ice_pf *pf = vsi->back; + u16 prev_txq = vsi->alloc_txq; + u16 prev_rxq = vsi->alloc_rxq; int i; - if (!prev_txq || !prev_rxq) - return; - if (vsi->type == ICE_VSI_CHNL) - return; - vsi_stat = pf->vsi_stats[vsi->idx]; - if (vsi->num_txq < prev_txq) { - for (i = vsi->num_txq; i < prev_txq; i++) { + if (req_txq < prev_txq) { + for (i = req_txq; i < prev_txq; i++) { if (vsi_stat->tx_ring_stats[i]) { kfree_rcu(vsi_stat->tx_ring_stats[i], rcu); WRITE_ONCE(vsi_stat->tx_ring_stats[i], NULL); @@ -3112,14 +3111,36 @@ ice_vsi_realloc_stat_arrays(struct ice_vsi *vsi, int prev_txq, int prev_rxq) } } - if (vsi->num_rxq < prev_rxq) { - for (i = vsi->num_rxq; i < prev_rxq; i++) { + tx_ring_stats = vsi_stat->rx_ring_stats; + vsi_stat->tx_ring_stats = + krealloc_array(vsi_stat->tx_ring_stats, req_txq, + sizeof(*vsi_stat->tx_ring_stats), + GFP_KERNEL | __GFP_ZERO); + if (!vsi_stat->tx_ring_stats) { + vsi_stat->tx_ring_stats = tx_ring_stats; + return -ENOMEM; + } + + if (req_rxq < prev_rxq) { + for (i = req_rxq; i < prev_rxq; i++) { if (vsi_stat->rx_ring_stats[i]) { kfree_rcu(vsi_stat->rx_ring_stats[i], rcu); WRITE_ONCE(vsi_stat->rx_ring_stats[i], NULL); } } } + + rx_ring_stats = vsi_stat->rx_ring_stats; + vsi_stat->rx_ring_stats = + krealloc_array(vsi_stat->rx_ring_stats, req_rxq, + sizeof(*vsi_stat->rx_ring_stats), + GFP_KERNEL | __GFP_ZERO); + if (!vsi_stat->rx_ring_stats) { + vsi_stat->rx_ring_stats = rx_ring_stats; + return -ENOMEM; + } + + return 0; } /** @@ -3136,9 +3157,9 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) { struct ice_vsi_cfg_params params = {}; struct ice_coalesce_stored *coalesce; - int ret, prev_txq, prev_rxq; int prev_num_q_vectors = 0; struct ice_pf *pf; + int ret; if (!vsi) return -EINVAL; @@ -3157,8 +3178,9 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce); - prev_txq = vsi->num_txq; - prev_rxq = vsi->num_rxq; + ret = ice_vsi_realloc_stat_arrays(vsi); + if (ret) + goto err_vsi_cfg; ice_vsi_decfg(vsi); ret = ice_vsi_cfg_def(vsi, ¶ms); @@ -3176,8 +3198,6 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) return ice_schedule_reset(pf, ICE_RESET_PFR); } - ice_vsi_realloc_stat_arrays(vsi, prev_txq, prev_rxq); - ice_vsi_rebuild_set_coalesce(vsi, coalesce, prev_num_q_vectors); kfree(coalesce); From e40a02f06ceb0e0b0183e0b973ac5dbf8f75edec Mon Sep 17 00:00:00 2001 From: Jesse Brandeburg Date: Tue, 5 Mar 2024 15:02:03 -0800 Subject: [PATCH 017/256] ice: fix memory corruption bug with suspend and rebuild [ Upstream commit 1cb7fdb1dfde1aab66780b4ba44dba6402172111 ] The ice driver would previously panic after suspend. This is caused from the driver *only* calling the ice_vsi_free_q_vectors() function by itself, when it is suspending. Since commit b3e7b3a6ee92 ("ice: prevent NULL pointer deref during reload") the driver has zeroed out num_q_vectors, and only restored it in ice_vsi_cfg_def(). This further causes the ice_rebuild() function to allocate a zero length buffer, after which num_q_vectors is updated, and then the new value of num_q_vectors is used to index into the zero length buffer, which corrupts memory. The fix entails making sure all the code referencing num_q_vectors only does so after it has been reset via ice_vsi_cfg_def(). I didn't perform a full bisect, but I was able to test against 6.1.77 kernel and that ice driver works fine for suspend/resume with no panic, so sometime since then, this problem was introduced. Also clean up an un-needed init of a local variable in the function being modified. PANIC from 6.8.0-rc1: [1026674.915596] PM: suspend exit [1026675.664697] ice 0000:17:00.1: PTP reset successful [1026675.664707] ice 0000:17:00.1: 2755 msecs passed between update to cached PHC time [1026675.667660] ice 0000:b1:00.0: PTP reset successful [1026675.675944] ice 0000:b1:00.0: 2832 msecs passed between update to cached PHC time [1026677.137733] ixgbe 0000:31:00.0 ens787: NIC Link is Up 1 Gbps, Flow Control: None [1026677.190201] BUG: kernel NULL pointer dereference, address: 0000000000000010 [1026677.192753] ice 0000:17:00.0: PTP reset successful [1026677.192764] ice 0000:17:00.0: 4548 msecs passed between update to cached PHC time [1026677.197928] #PF: supervisor read access in kernel mode [1026677.197933] #PF: error_code(0x0000) - not-present page [1026677.197937] PGD 1557a7067 P4D 0 [1026677.212133] ice 0000:b1:00.1: PTP reset successful [1026677.212143] ice 0000:b1:00.1: 4344 msecs passed between update to cached PHC time [1026677.212575] [1026677.243142] Oops: 0000 [#1] PREEMPT SMP NOPTI [1026677.247918] CPU: 23 PID: 42790 Comm: kworker/23:0 Kdump: loaded Tainted: G W 6.8.0-rc1+ #1 [1026677.257989] Hardware name: Intel Corporation M50CYP2SBSTD/M50CYP2SBSTD, BIOS SE5C620.86B.01.01.0005.2202160810 02/16/2022 [1026677.269367] Workqueue: ice ice_service_task [ice] [1026677.274592] RIP: 0010:ice_vsi_rebuild_set_coalesce+0x130/0x1e0 [ice] [1026677.281421] Code: 0f 84 3a ff ff ff 41 0f b7 74 ec 02 66 89 b0 22 02 00 00 81 e6 ff 1f 00 00 e8 ec fd ff ff e9 35 ff ff ff 48 8b 43 30 49 63 ed <41> 0f b7 34 24 41 83 c5 01 48 8b 3c e8 66 89 b7 aa 02 00 00 81 e6 [1026677.300877] RSP: 0018:ff3be62a6399bcc0 EFLAGS: 00010202 [1026677.306556] RAX: ff28691e28980828 RBX: ff28691e41099828 RCX: 0000000000188000 [1026677.314148] RDX: 0000000000000000 RSI: 0000000000000010 RDI: ff28691e41099828 [1026677.321730] RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000 [1026677.329311] R10: 0000000000000007 R11: ffffffffffffffc0 R12: 0000000000000010 [1026677.336896] R13: 0000000000000000 R14: 0000000000000000 R15: ff28691e0eaa81a0 [1026677.344472] FS: 0000000000000000(0000) GS:ff28693cbffc0000(0000) knlGS:0000000000000000 [1026677.353000] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [1026677.359195] CR2: 0000000000000010 CR3: 0000000128df4001 CR4: 0000000000771ef0 [1026677.366779] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [1026677.374369] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [1026677.381952] PKRU: 55555554 [1026677.385116] Call Trace: [1026677.388023] [1026677.390589] ? __die+0x20/0x70 [1026677.394105] ? page_fault_oops+0x82/0x160 [1026677.398576] ? do_user_addr_fault+0x65/0x6a0 [1026677.403307] ? exc_page_fault+0x6a/0x150 [1026677.407694] ? asm_exc_page_fault+0x22/0x30 [1026677.412349] ? ice_vsi_rebuild_set_coalesce+0x130/0x1e0 [ice] [1026677.418614] ice_vsi_rebuild+0x34b/0x3c0 [ice] [1026677.423583] ice_vsi_rebuild_by_type+0x76/0x180 [ice] [1026677.429147] ice_rebuild+0x18b/0x520 [ice] [1026677.433746] ? delay_tsc+0x8f/0xc0 [1026677.437630] ice_do_reset+0xa3/0x190 [ice] [1026677.442231] ice_service_task+0x26/0x440 [ice] [1026677.447180] process_one_work+0x174/0x340 [1026677.451669] worker_thread+0x27e/0x390 [1026677.455890] ? __pfx_worker_thread+0x10/0x10 [1026677.460627] kthread+0xee/0x120 [1026677.464235] ? __pfx_kthread+0x10/0x10 [1026677.468445] ret_from_fork+0x2d/0x50 [1026677.472476] ? __pfx_kthread+0x10/0x10 [1026677.476671] ret_from_fork_asm+0x1b/0x30 [1026677.481050] Fixes: b3e7b3a6ee92 ("ice: prevent NULL pointer deref during reload") Reported-by: Robert Elliott Signed-off-by: Jesse Brandeburg Reviewed-by: Simon Horman Reviewed-by: Aleksandr Loktionov Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ice/ice_lib.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c index 47298ab675a5..0b7132a42e35 100644 --- a/drivers/net/ethernet/intel/ice/ice_lib.c +++ b/drivers/net/ethernet/intel/ice/ice_lib.c @@ -3157,7 +3157,7 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) { struct ice_vsi_cfg_params params = {}; struct ice_coalesce_stored *coalesce; - int prev_num_q_vectors = 0; + int prev_num_q_vectors; struct ice_pf *pf; int ret; @@ -3171,13 +3171,6 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) if (WARN_ON(vsi->type == ICE_VSI_VF && !vsi->vf)) return -EINVAL; - coalesce = kcalloc(vsi->num_q_vectors, - sizeof(struct ice_coalesce_stored), GFP_KERNEL); - if (!coalesce) - return -ENOMEM; - - prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce); - ret = ice_vsi_realloc_stat_arrays(vsi); if (ret) goto err_vsi_cfg; @@ -3187,6 +3180,13 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) if (ret) goto err_vsi_cfg; + coalesce = kcalloc(vsi->num_q_vectors, + sizeof(struct ice_coalesce_stored), GFP_KERNEL); + if (!coalesce) + return -ENOMEM; + + prev_num_q_vectors = ice_vsi_rebuild_get_coalesce(vsi, coalesce); + ret = ice_vsi_cfg_tc_lan(pf, vsi); if (ret) { if (vsi_flags & ICE_VSI_FLAG_INIT) { @@ -3205,8 +3205,8 @@ int ice_vsi_rebuild(struct ice_vsi *vsi, u32 vsi_flags) err_vsi_cfg_tc_lan: ice_vsi_decfg(vsi); -err_vsi_cfg: kfree(coalesce); +err_vsi_cfg: return ret; } From 4465b15ae5c522c6d0b1510b2ecc997da4b8e4c3 Mon Sep 17 00:00:00 2001 From: Przemek Kitszel Date: Tue, 5 Mar 2024 17:02:02 +0100 Subject: [PATCH 018/256] ixgbe: avoid sleeping allocation in ixgbe_ipsec_vf_add_sa() [ Upstream commit aec806fb4afba5fe80b09e29351379a4292baa43 ] Change kzalloc() flags used in ixgbe_ipsec_vf_add_sa() to GFP_ATOMIC, to avoid sleeping in IRQ context. Dan Carpenter, with the help of Smatch, has found following issue: The patch eda0333ac293: "ixgbe: add VF IPsec management" from Aug 13, 2018 (linux-next), leads to the following Smatch static checker warning: drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c:917 ixgbe_ipsec_vf_add_sa() warn: sleeping in IRQ context The call tree that Smatch is worried about is: ixgbe_msix_other() <- IRQ handler -> ixgbe_msg_task() -> ixgbe_rcv_msg_from_vf() -> ixgbe_ipsec_vf_add_sa() Fixes: eda0333ac293 ("ixgbe: add VF IPsec management") Reported-by: Dan Carpenter Link: https://lore.kernel.org/intel-wired-lan/db31a0b0-4d9f-4e6b-aed8-88266eb5665c@moroto.mountain Reviewed-by: Michal Kubiak Signed-off-by: Przemek Kitszel Reviewed-by: Shannon Nelson Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c index 13a6fca31004..866024f2b9ee 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_ipsec.c @@ -914,7 +914,13 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf) goto err_out; } - xs = kzalloc(sizeof(*xs), GFP_KERNEL); + algo = xfrm_aead_get_byname(aes_gcm_name, IXGBE_IPSEC_AUTH_BITS, 1); + if (unlikely(!algo)) { + err = -ENOENT; + goto err_out; + } + + xs = kzalloc(sizeof(*xs), GFP_ATOMIC); if (unlikely(!xs)) { err = -ENOMEM; goto err_out; @@ -930,14 +936,8 @@ int ixgbe_ipsec_vf_add_sa(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf) memcpy(&xs->id.daddr.a4, sam->addr, sizeof(xs->id.daddr.a4)); xs->xso.dev = adapter->netdev; - algo = xfrm_aead_get_byname(aes_gcm_name, IXGBE_IPSEC_AUTH_BITS, 1); - if (unlikely(!algo)) { - err = -ENOENT; - goto err_xs; - } - aead_len = sizeof(*xs->aead) + IXGBE_IPSEC_KEY_BITS / 8; - xs->aead = kzalloc(aead_len, GFP_KERNEL); + xs->aead = kzalloc(aead_len, GFP_ATOMIC); if (unlikely(!xs->aead)) { err = -ENOMEM; goto err_xs; From 1b1c0f6ce790e5213dd6765a72c789a8aec76047 Mon Sep 17 00:00:00 2001 From: Kurt Kanzenbach Date: Wed, 13 Mar 2024 14:03:10 +0100 Subject: [PATCH 019/256] igc: Remove stale comment about Tx timestamping [ Upstream commit 47ce2956c7a61ff354723e28235205fa2012265b ] The initial igc Tx timestamping implementation used only one register for retrieving Tx timestamps. Commit 3ed247e78911 ("igc: Add support for multiple in-flight TX timestamps") added support for utilizing all four of them e.g., for multiple domain support. Remove the stale comment/FIXME. Fixes: 3ed247e78911 ("igc: Add support for multiple in-flight TX timestamps") Signed-off-by: Kurt Kanzenbach Acked-by: Vinicius Costa Gomes Reviewed-by: Przemek Kitszel Tested-by: Naama Meir Signed-off-by: Tony Nguyen Signed-off-by: Sasha Levin --- drivers/net/ethernet/intel/igc/igc_main.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igc/igc_main.c b/drivers/net/ethernet/intel/igc/igc_main.c index fc1de116d554..e83700ad7e62 100644 --- a/drivers/net/ethernet/intel/igc/igc_main.c +++ b/drivers/net/ethernet/intel/igc/igc_main.c @@ -1640,10 +1640,6 @@ done: if (unlikely(test_bit(IGC_RING_FLAG_TX_HWTSTAMP, &tx_ring->flags) && skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { - /* FIXME: add support for retrieving timestamps from - * the other timer registers before skipping the - * timestamping request. - */ unsigned long flags; u32 tstamp_flags; From 21dea1475fd453ac4feaa3c395f51c61b5779260 Mon Sep 17 00:00:00 2001 From: Alexandra Winter Date: Thu, 21 Mar 2024 12:53:37 +0100 Subject: [PATCH 020/256] s390/qeth: handle deferred cc1 [ Upstream commit afb373ff3f54c9d909efc7f810dc80a9742807b2 ] The IO subsystem expects a driver to retry a ccw_device_start, when the subsequent interrupt response block (irb) contains a deferred condition code 1. Symptoms before this commit: On the read channel we always trigger the next read anyhow, so no different behaviour here. On the write channel we may experience timeout errors, because the expected reply will never be received without the retry. Other callers of qeth_send_control_data() may wrongly assume that the ccw was successful, which may cause problems later. Note that since commit 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") and commit 5ef1dc40ffa6 ("s390/cio: fix invalid -EBUSY on ccw_device_start") deferred CC1s are much more likely to occur. See the commit message of the latter for more background information. Fixes: 2297791c92d0 ("s390/cio: dont unregister subchannel from child-drivers") Signed-off-by: Alexandra Winter Co-developed-by: Thorsten Winkler Signed-off-by: Thorsten Winkler Reviewed-by: Peter Oberparleiter Link: https://lore.kernel.org/r/20240321115337.3564694-1-wintera@linux.ibm.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/s390/net/qeth_core_main.c | 38 +++++++++++++++++++++++++++++-- 1 file changed, 36 insertions(+), 2 deletions(-) diff --git a/drivers/s390/net/qeth_core_main.c b/drivers/s390/net/qeth_core_main.c index cd783290bde5..1148b4ecabdd 100644 --- a/drivers/s390/net/qeth_core_main.c +++ b/drivers/s390/net/qeth_core_main.c @@ -1179,6 +1179,20 @@ static int qeth_check_irb_error(struct qeth_card *card, struct ccw_device *cdev, } } +/** + * qeth_irq() - qeth interrupt handler + * @cdev: ccw device + * @intparm: expect pointer to iob + * @irb: Interruption Response Block + * + * In the good path: + * corresponding qeth channel is locked with last used iob as active_cmd. + * But this function is also called for error interrupts. + * + * Caller ensures that: + * Interrupts are disabled; ccw device lock is held; + * + */ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, struct irb *irb) { @@ -1220,11 +1234,10 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, iob = (struct qeth_cmd_buffer *) (addr_t)intparm; } - qeth_unlock_channel(card, channel); - rc = qeth_check_irb_error(card, cdev, irb); if (rc) { /* IO was terminated, free its resources. */ + qeth_unlock_channel(card, channel); if (iob) qeth_cancel_cmd(iob, rc); return; @@ -1268,6 +1281,7 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, rc = qeth_get_problem(card, cdev, irb); if (rc) { card->read_or_write_problem = 1; + qeth_unlock_channel(card, channel); if (iob) qeth_cancel_cmd(iob, rc); qeth_clear_ipacmd_list(card); @@ -1276,6 +1290,26 @@ static void qeth_irq(struct ccw_device *cdev, unsigned long intparm, } } + if (scsw_cmd_is_valid_cc(&irb->scsw) && irb->scsw.cmd.cc == 1 && iob) { + /* channel command hasn't started: retry. + * active_cmd is still set to last iob + */ + QETH_CARD_TEXT(card, 2, "irqcc1"); + rc = ccw_device_start_timeout(cdev, __ccw_from_cmd(iob), + (addr_t)iob, 0, 0, iob->timeout); + if (rc) { + QETH_DBF_MESSAGE(2, + "ccw retry on %x failed, rc = %i\n", + CARD_DEVID(card), rc); + QETH_CARD_TEXT_(card, 2, " err%d", rc); + qeth_unlock_channel(card, channel); + qeth_cancel_cmd(iob, rc); + } + return; + } + + qeth_unlock_channel(card, channel); + if (iob) { /* sanity check: */ if (irb->scsw.cmd.count > iob->length) { From 984c3d962c9ef711016e46dc8986cbd9241fdd85 Mon Sep 17 00:00:00 2001 From: Ravi Gunasekaran Date: Fri, 22 Mar 2024 15:34:47 +0530 Subject: [PATCH 021/256] net: hsr: hsr_slave: Fix the promiscuous mode in offload mode [ Upstream commit b11c81731c810efe592e510bb0110e0db6877419 ] commit e748d0fd66ab ("net: hsr: Disable promiscuous mode in offload mode") disables promiscuous mode of slave devices while creating an HSR interface. But while deleting the HSR interface, it does not take care of it. It decreases the promiscuous mode count, which eventually enables promiscuous mode on the slave devices when creating HSR interface again. Fix this by not decrementing the promiscuous mode count while deleting the HSR interface when offload is enabled. Fixes: e748d0fd66ab ("net: hsr: Disable promiscuous mode in offload mode") Signed-off-by: Ravi Gunasekaran Reviewed-by: Jiri Pirko Link: https://lore.kernel.org/r/20240322100447.27615-1-r-gunasekaran@ti.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/hsr/hsr_slave.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/hsr/hsr_slave.c b/net/hsr/hsr_slave.c index e5742f2a2d52..1b6457f357bd 100644 --- a/net/hsr/hsr_slave.c +++ b/net/hsr/hsr_slave.c @@ -220,7 +220,8 @@ void hsr_del_port(struct hsr_port *port) netdev_update_features(master->dev); dev_set_mtu(master->dev, hsr_get_max_mtu(hsr)); netdev_rx_handler_unregister(port->dev); - dev_set_promiscuity(port->dev, -1); + if (!port->hsr->fwd_offloaded) + dev_set_promiscuity(port->dev, -1); netdev_upper_dev_unlink(port->dev, master->dev); } From c1ae4d1e76eacddaacb958b67cd942082f800c87 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 22 Mar 2024 13:57:32 +0000 Subject: [PATCH 022/256] tcp: properly terminate timers for kernel sockets [ Upstream commit 151c9c724d05d5b0dd8acd3e11cb69ef1f2dbada ] We had various syzbot reports about tcp timers firing after the corresponding netns has been dismantled. Fortunately Josef Bacik could trigger the issue more often, and could test a patch I wrote two years ago. When TCP sockets are closed, we call inet_csk_clear_xmit_timers() to 'stop' the timers. inet_csk_clear_xmit_timers() can be called from any context, including when socket lock is held. This is the reason it uses sk_stop_timer(), aka del_timer(). This means that ongoing timers might finish much later. For user sockets, this is fine because each running timer holds a reference on the socket, and the user socket holds a reference on the netns. For kernel sockets, we risk that the netns is freed before timer can complete, because kernel sockets do not hold reference on the netns. This patch adds inet_csk_clear_xmit_timers_sync() function that using sk_stop_timer_sync() to make sure all timers are terminated before the kernel socket is released. Modules using kernel sockets close them in their netns exit() handler. Also add sock_not_owned_by_me() helper to get LOCKDEP support : inet_csk_clear_xmit_timers_sync() must not be called while socket lock is held. It is very possible we can revert in the future commit 3a58f13a881e ("net: rds: acquire refcount on TCP sockets") which attempted to solve the issue in rds only. (net/smc/af_smc.c and net/mptcp/subflow.c have similar code) We probably can remove the check_net() tests from tcp_out_of_resources() and __tcp_close() in the future. Reported-by: Josef Bacik Closes: https://lore.kernel.org/netdev/20240314210740.GA2823176@perftesting/ Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the netns of kernel sockets.") Fixes: 8a68173691f0 ("net: sk_clone_lock() should only do get_net() if the parent is not a kernel socket") Link: https://lore.kernel.org/bpf/CANn89i+484ffqb93aQm1N-tjxxvb3WDKX0EbD7318RwRgsatjw@mail.gmail.com/ Signed-off-by: Eric Dumazet Tested-by: Josef Bacik Cc: Tetsuo Handa Link: https://lore.kernel.org/r/20240322135732.1535772-1-edumazet@google.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- include/net/inet_connection_sock.h | 1 + include/net/sock.h | 7 +++++++ net/ipv4/inet_connection_sock.c | 14 ++++++++++++++ net/ipv4/tcp.c | 2 ++ 4 files changed, 24 insertions(+) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 01a73bf74fa1..6ecac01115d9 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -173,6 +173,7 @@ void inet_csk_init_xmit_timers(struct sock *sk, void (*delack_handler)(struct timer_list *), void (*keepalive_handler)(struct timer_list *)); void inet_csk_clear_xmit_timers(struct sock *sk); +void inet_csk_clear_xmit_timers_sync(struct sock *sk); static inline void inet_csk_schedule_ack(struct sock *sk) { diff --git a/include/net/sock.h b/include/net/sock.h index e70c903b04f3..25780942ec8b 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1808,6 +1808,13 @@ static inline void sock_owned_by_me(const struct sock *sk) #endif } +static inline void sock_not_owned_by_me(const struct sock *sk) +{ +#ifdef CONFIG_LOCKDEP + WARN_ON_ONCE(lockdep_sock_is_held(sk) && debug_locks); +#endif +} + static inline bool sock_owned_by_user(const struct sock *sk) { sock_owned_by_me(sk); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 762817d6c8d7..a587cb6be807 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -774,6 +774,20 @@ void inet_csk_clear_xmit_timers(struct sock *sk) } EXPORT_SYMBOL(inet_csk_clear_xmit_timers); +void inet_csk_clear_xmit_timers_sync(struct sock *sk) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + + /* ongoing timer handlers need to acquire socket lock. */ + sock_not_owned_by_me(sk); + + icsk->icsk_pending = icsk->icsk_ack.pending = 0; + + sk_stop_timer_sync(sk, &icsk->icsk_retransmit_timer); + sk_stop_timer_sync(sk, &icsk->icsk_delack_timer); + sk_stop_timer_sync(sk, &sk->sk_timer); +} + void inet_csk_delete_keepalive_timer(struct sock *sk) { sk_stop_timer(sk, &sk->sk_timer); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 68bb8d6bcc11..f8df35f7352a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2931,6 +2931,8 @@ void tcp_close(struct sock *sk, long timeout) lock_sock(sk); __tcp_close(sk, timeout); release_sock(sk); + if (!sk->sk_net_refcnt) + inet_csk_clear_xmit_timers_sync(sk); sock_put(sk); } EXPORT_SYMBOL(tcp_close); From 2e22c9cb618716b8e557fe17c3d4958171288082 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B8rn=20Mork?= Date: Fri, 22 Mar 2024 15:40:00 +0100 Subject: [PATCH 023/256] net: wwan: t7xx: Split 64bit accesses to fix alignment issues MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 7d5a7dd5a35876f0ecc286f3602a88887a788217 ] Some of the registers are aligned on a 32bit boundary, causing alignment faults on 64bit platforms. Unable to handle kernel paging request at virtual address ffffffc084a1d004 Mem abort info: ESR = 0x0000000096000061 EC = 0x25: DABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x21: alignment fault Data abort info: ISV = 0, ISS = 0x00000061, ISS2 = 0x00000000 CM = 0, WnR = 1, TnD = 0, TagAccess = 0 GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 swapper pgtable: 4k pages, 39-bit VAs, pgdp=0000000046ad6000 [ffffffc084a1d004] pgd=100000013ffff003, p4d=100000013ffff003, pud=100000013ffff003, pmd=0068000020a00711 Internal error: Oops: 0000000096000061 [#1] SMP Modules linked in: mtk_t7xx(+) qcserial pppoe ppp_async option nft_fib_inet nf_flow_table_inet mt7921u(O) mt7921s(O) mt7921e(O) mt7921_common(O) iwlmvm(O) iwldvm(O) usb_wwan rndis_host qmi_wwan pppox ppp_generic nft_reject_ipv6 nft_reject_ipv4 nft_reject_inet nft_reject nft_redir nft_quota nft_numgen nft_nat nft_masq nft_log nft_limit nft_hash nft_flow_offload nft_fib_ipv6 nft_fib_ipv4 nft_fib nft_ct nft_chain_nat nf_tables nf_nat nf_flow_table nf_conntrack mt7996e(O) mt792x_usb(O) mt792x_lib(O) mt7915e(O) mt76_usb(O) mt76_sdio(O) mt76_connac_lib(O) mt76(O) mac80211(O) iwlwifi(O) huawei_cdc_ncm cfg80211(O) cdc_ncm cdc_ether wwan usbserial usbnet slhc sfp rtc_pcf8563 nfnetlink nf_reject_ipv6 nf_reject_ipv4 nf_log_syslog nf_defrag_ipv6 nf_defrag_ipv4 mt6577_auxadc mdio_i2c libcrc32c compat(O) cdc_wdm cdc_acm at24 crypto_safexcel pwm_fan i2c_gpio i2c_smbus industrialio i2c_algo_bit i2c_mux_reg i2c_mux_pca954x i2c_mux_pca9541 i2c_mux_gpio i2c_mux dummy oid_registry tun sha512_arm64 sha1_ce sha1_generic seqiv md5 geniv des_generic libdes cbc authencesn authenc leds_gpio xhci_plat_hcd xhci_pci xhci_mtk_hcd xhci_hcd nvme nvme_core gpio_button_hotplug(O) dm_mirror dm_region_hash dm_log dm_crypt dm_mod dax usbcore usb_common ptp aquantia pps_core mii tpm encrypted_keys trusted CPU: 3 PID: 5266 Comm: kworker/u9:1 Tainted: G O 6.6.22 #0 Hardware name: Bananapi BPI-R4 (DT) Workqueue: md_hk_wq t7xx_fsm_uninit [mtk_t7xx] pstate: 804000c5 (Nzcv daIF +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : t7xx_cldma_hw_set_start_addr+0x1c/0x3c [mtk_t7xx] lr : t7xx_cldma_start+0xac/0x13c [mtk_t7xx] sp : ffffffc085d63d30 x29: ffffffc085d63d30 x28: 0000000000000000 x27: 0000000000000000 x26: 0000000000000000 x25: ffffff80c804f2c0 x24: ffffff80ca196c05 x23: 0000000000000000 x22: ffffff80c814b9b8 x21: ffffff80c814b128 x20: 0000000000000001 x19: ffffff80c814b080 x18: 0000000000000014 x17: 0000000055c9806b x16: 000000007c5296d0 x15: 000000000f6bca68 x14: 00000000dbdbdce4 x13: 000000001aeaf72a x12: 0000000000000001 x11: 0000000000000000 x10: 0000000000000000 x9 : 0000000000000000 x8 : ffffff80ca1ef6b4 x7 : ffffff80c814b818 x6 : 0000000000000018 x5 : 0000000000000870 x4 : 0000000000000000 x3 : 0000000000000000 x2 : 000000010a947000 x1 : ffffffc084a1d004 x0 : ffffffc084a1d004 Call trace: t7xx_cldma_hw_set_start_addr+0x1c/0x3c [mtk_t7xx] t7xx_fsm_uninit+0x578/0x5ec [mtk_t7xx] process_one_work+0x154/0x2a0 worker_thread+0x2ac/0x488 kthread+0xe0/0xec ret_from_fork+0x10/0x20 Code: f9400800 91001000 8b214001 d50332bf (f9000022) ---[ end trace 0000000000000000 ]--- The inclusion of io-64-nonatomic-lo-hi.h indicates that all 64bit accesses can be replaced by pairs of nonatomic 32bit access. Fix alignment by forcing all accesses to be 32bit on 64bit platforms. Link: https://forum.openwrt.org/t/fibocom-fm350-gl-support/142682/72 Fixes: 39d439047f1d ("net: wwan: t7xx: Add control DMA interface") Signed-off-by: Bjørn Mork Reviewed-by: Sergey Ryazanov Tested-by: Liviu Dudau Link: https://lore.kernel.org/r/20240322144000.1683822-1-bjorn@mork.no Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- drivers/net/wwan/t7xx/t7xx_cldma.c | 4 ++-- drivers/net/wwan/t7xx/t7xx_hif_cldma.c | 9 +++++---- drivers/net/wwan/t7xx/t7xx_pcie_mac.c | 8 ++++---- 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/drivers/net/wwan/t7xx/t7xx_cldma.c b/drivers/net/wwan/t7xx/t7xx_cldma.c index 9f43f256db1d..f0a4783baf1f 100644 --- a/drivers/net/wwan/t7xx/t7xx_cldma.c +++ b/drivers/net/wwan/t7xx/t7xx_cldma.c @@ -106,7 +106,7 @@ bool t7xx_cldma_tx_addr_is_set(struct t7xx_cldma_hw *hw_info, unsigned int qno) { u32 offset = REG_CLDMA_UL_START_ADDRL_0 + qno * ADDR_SIZE; - return ioread64(hw_info->ap_pdn_base + offset); + return ioread64_lo_hi(hw_info->ap_pdn_base + offset); } void t7xx_cldma_hw_set_start_addr(struct t7xx_cldma_hw *hw_info, unsigned int qno, u64 address, @@ -117,7 +117,7 @@ void t7xx_cldma_hw_set_start_addr(struct t7xx_cldma_hw *hw_info, unsigned int qn reg = tx_rx == MTK_RX ? hw_info->ap_ao_base + REG_CLDMA_DL_START_ADDRL_0 : hw_info->ap_pdn_base + REG_CLDMA_UL_START_ADDRL_0; - iowrite64(address, reg + offset); + iowrite64_lo_hi(address, reg + offset); } void t7xx_cldma_hw_resume_queue(struct t7xx_cldma_hw *hw_info, unsigned int qno, diff --git a/drivers/net/wwan/t7xx/t7xx_hif_cldma.c b/drivers/net/wwan/t7xx/t7xx_hif_cldma.c index cc70360364b7..554ba4669cc8 100644 --- a/drivers/net/wwan/t7xx/t7xx_hif_cldma.c +++ b/drivers/net/wwan/t7xx/t7xx_hif_cldma.c @@ -139,8 +139,9 @@ static int t7xx_cldma_gpd_rx_from_q(struct cldma_queue *queue, int budget, bool return -ENODEV; } - gpd_addr = ioread64(hw_info->ap_pdn_base + REG_CLDMA_DL_CURRENT_ADDRL_0 + - queue->index * sizeof(u64)); + gpd_addr = ioread64_lo_hi(hw_info->ap_pdn_base + + REG_CLDMA_DL_CURRENT_ADDRL_0 + + queue->index * sizeof(u64)); if (req->gpd_addr == gpd_addr || hwo_polling_count++ >= 100) return 0; @@ -318,8 +319,8 @@ static void t7xx_cldma_txq_empty_hndl(struct cldma_queue *queue) struct t7xx_cldma_hw *hw_info = &md_ctrl->hw_info; /* Check current processing TGPD, 64-bit address is in a table by Q index */ - ul_curr_addr = ioread64(hw_info->ap_pdn_base + REG_CLDMA_UL_CURRENT_ADDRL_0 + - queue->index * sizeof(u64)); + ul_curr_addr = ioread64_lo_hi(hw_info->ap_pdn_base + REG_CLDMA_UL_CURRENT_ADDRL_0 + + queue->index * sizeof(u64)); if (req->gpd_addr != ul_curr_addr) { spin_unlock_irqrestore(&md_ctrl->cldma_lock, flags); dev_err(md_ctrl->dev, "CLDMA%d queue %d is not empty\n", diff --git a/drivers/net/wwan/t7xx/t7xx_pcie_mac.c b/drivers/net/wwan/t7xx/t7xx_pcie_mac.c index 76da4c15e3de..f071ec7ff23d 100644 --- a/drivers/net/wwan/t7xx/t7xx_pcie_mac.c +++ b/drivers/net/wwan/t7xx/t7xx_pcie_mac.c @@ -75,7 +75,7 @@ static void t7xx_pcie_mac_atr_tables_dis(void __iomem *pbase, enum t7xx_atr_src_ for (i = 0; i < ATR_TABLE_NUM_PER_ATR; i++) { offset = ATR_PORT_OFFSET * port + ATR_TABLE_OFFSET * i; reg = pbase + ATR_PCIE_WIN0_T0_ATR_PARAM_SRC_ADDR + offset; - iowrite64(0, reg); + iowrite64_lo_hi(0, reg); } } @@ -112,17 +112,17 @@ static int t7xx_pcie_mac_atr_cfg(struct t7xx_pci_dev *t7xx_dev, struct t7xx_atr_ reg = pbase + ATR_PCIE_WIN0_T0_TRSL_ADDR + offset; value = cfg->trsl_addr & ATR_PCIE_WIN0_ADDR_ALGMT; - iowrite64(value, reg); + iowrite64_lo_hi(value, reg); reg = pbase + ATR_PCIE_WIN0_T0_TRSL_PARAM + offset; iowrite32(cfg->trsl_id, reg); reg = pbase + ATR_PCIE_WIN0_T0_ATR_PARAM_SRC_ADDR + offset; value = (cfg->src_addr & ATR_PCIE_WIN0_ADDR_ALGMT) | (atr_size << 1) | BIT(0); - iowrite64(value, reg); + iowrite64_lo_hi(value, reg); /* Ensure ATR is set */ - ioread64(reg); + ioread64_lo_hi(reg); return 0; } From 7fb8b3de7f22adfee1a40562c249974b31e0697e Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 25 Mar 2024 09:50:30 +0200 Subject: [PATCH 024/256] selftests: vxlan_mdb: Fix failures with old libnet [ Upstream commit f1425529c33def8b46faae4400dd9e2bbaf16a05 ] Locally generated IP multicast packets (such as the ones used in the test) do not perform routing and simply egress the bound device. However, as explained in commit 8bcfb4ae4d97 ("selftests: forwarding: Fix failing tests with old libnet"), old versions of libnet (used by mausezahn) do not use the "SO_BINDTODEVICE" socket option. Specifically, the library started using the option for IPv6 sockets in version 1.1.6 and for IPv4 sockets in version 1.2. This explains why on Ubuntu - which uses version 1.1.6 - the IPv4 overlay tests are failing whereas the IPv6 ones are passing. Fix by specifying the source and destination MAC of the packets which will cause mausezahn to use a packet socket instead of an IP socket. Fixes: 62199e3f1658 ("selftests: net: Add VXLAN MDB test") Reported-by: Mirsad Todorovac Closes: https://lore.kernel.org/netdev/5bb50349-196d-4892-8ed2-f37543aa863f@alu.unizg.hr/ Tested-by: Mirsad Todorovac Signed-off-by: Ido Schimmel Link: https://lore.kernel.org/r/20240325075030.2379513-1-idosch@nvidia.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- tools/testing/selftests/net/test_vxlan_mdb.sh | 205 +++++++++++------- 1 file changed, 128 insertions(+), 77 deletions(-) diff --git a/tools/testing/selftests/net/test_vxlan_mdb.sh b/tools/testing/selftests/net/test_vxlan_mdb.sh index 31e5f0f8859d..be8e66abc74e 100755 --- a/tools/testing/selftests/net/test_vxlan_mdb.sh +++ b/tools/testing/selftests/net/test_vxlan_mdb.sh @@ -984,6 +984,7 @@ encap_params_common() local plen=$1; shift local enc_ethtype=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local src=$1; shift local mz=$1; shift @@ -1002,11 +1003,11 @@ encap_params_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep2_ip src_vni 10020" run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_dst_ip $vtep1_ip action pass" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Destination IP - match" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Destination IP - no match" @@ -1019,20 +1020,20 @@ encap_params_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip dst_port 1111 src_vni 10020" run_cmd "tc -n $ns2 filter replace dev veth0 ingress pref 1 handle 101 proto $enc_ethtype flower ip_proto udp dst_port 4789 action pass" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev veth0 ingress" 101 1 log_test $? 0 "Default destination port - match" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev veth0 ingress" 101 1 log_test $? 0 "Default destination port - no match" run_cmd "tc -n $ns2 filter replace dev veth0 ingress pref 1 handle 101 proto $enc_ethtype flower ip_proto udp dst_port 1111 action pass" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev veth0 ingress" 101 1 log_test $? 0 "Non-default destination port - match" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev veth0 ingress" 101 1 log_test $? 0 "Non-default destination port - no match" @@ -1045,11 +1046,11 @@ encap_params_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip src_vni 10020" run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_key_id 10010 action pass" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Default destination VNI - match" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Default destination VNI - no match" @@ -1057,11 +1058,11 @@ encap_params_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent dst $vtep1_ip vni 10010 src_vni 10020" run_cmd "tc -n $ns2 filter replace dev vx0 ingress pref 1 handle 101 proto all flower enc_key_id 10020 action pass" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Non-default destination VNI - match" - run_cmd "ip netns exec $ns1 $mz br0.20 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.20 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Non-default destination VNI - no match" @@ -1079,6 +1080,7 @@ encap_params_ipv4_ipv4() local plen=32 local enc_ethtype="ip" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo @@ -1086,7 +1088,7 @@ encap_params_ipv4_ipv4() echo "------------------------------------------------------------------" encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \ - $grp $src "mausezahn" + $grp $grp_dmac $src "mausezahn" } encap_params_ipv6_ipv4() @@ -1098,6 +1100,7 @@ encap_params_ipv6_ipv4() local plen=32 local enc_ethtype="ip" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo @@ -1105,7 +1108,7 @@ encap_params_ipv6_ipv4() echo "------------------------------------------------------------------" encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \ - $grp $src "mausezahn -6" + $grp $grp_dmac $src "mausezahn -6" } encap_params_ipv4_ipv6() @@ -1117,6 +1120,7 @@ encap_params_ipv4_ipv6() local plen=128 local enc_ethtype="ipv6" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo @@ -1124,7 +1128,7 @@ encap_params_ipv4_ipv6() echo "------------------------------------------------------------------" encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \ - $grp $src "mausezahn" + $grp $grp_dmac $src "mausezahn" } encap_params_ipv6_ipv6() @@ -1136,6 +1140,7 @@ encap_params_ipv6_ipv6() local plen=128 local enc_ethtype="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo @@ -1143,7 +1148,7 @@ encap_params_ipv6_ipv6() echo "------------------------------------------------------------------" encap_params_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $enc_ethtype \ - $grp $src "mausezahn -6" + $grp $grp_dmac $src "mausezahn -6" } starg_exclude_ir_common() @@ -1154,6 +1159,7 @@ starg_exclude_ir_common() local vtep2_ip=$1; shift local plen=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mz=$1; shift @@ -1175,14 +1181,14 @@ starg_exclude_ir_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode exclude source_list $invalid_src dst $vtep2_ip src_vni 10010" # Check that invalid source is not forwarded to any VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 0 log_test $? 0 "Block excluded source - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 0 log_test $? 0 "Block excluded source - second VTEP" # Check that valid source is forwarded to both VTEPs. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Forward valid source - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1192,14 +1198,14 @@ starg_exclude_ir_common() run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep2_ip src_vni 10010" # Check that invalid source is not forwarded to any VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Block excluded source after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 log_test $? 0 "Block excluded source after removal - second VTEP" # Check that valid source is forwarded to the remaining VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 2 log_test $? 0 "Forward valid source after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1214,6 +1220,7 @@ starg_exclude_ir_ipv4_ipv4() local vtep2_ip=198.51.100.200 local plen=32 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1222,7 +1229,7 @@ starg_exclude_ir_ipv4_ipv4() echo "-------------------------------------------------------------" starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn" + $grp_dmac $valid_src $invalid_src "mausezahn" } starg_exclude_ir_ipv6_ipv4() @@ -1233,6 +1240,7 @@ starg_exclude_ir_ipv6_ipv4() local vtep2_ip=198.51.100.200 local plen=32 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1241,7 +1249,7 @@ starg_exclude_ir_ipv6_ipv4() echo "-------------------------------------------------------------" starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn -6" + $grp_dmac $valid_src $invalid_src "mausezahn -6" } starg_exclude_ir_ipv4_ipv6() @@ -1252,6 +1260,7 @@ starg_exclude_ir_ipv4_ipv6() local vtep2_ip=2001:db8:2000::1 local plen=128 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1260,7 +1269,7 @@ starg_exclude_ir_ipv4_ipv6() echo "-------------------------------------------------------------" starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn" + $grp_dmac $valid_src $invalid_src "mausezahn" } starg_exclude_ir_ipv6_ipv6() @@ -1271,6 +1280,7 @@ starg_exclude_ir_ipv6_ipv6() local vtep2_ip=2001:db8:2000::1 local plen=128 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1279,7 +1289,7 @@ starg_exclude_ir_ipv6_ipv6() echo "-------------------------------------------------------------" starg_exclude_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn -6" + $grp_dmac $valid_src $invalid_src "mausezahn -6" } starg_include_ir_common() @@ -1290,6 +1300,7 @@ starg_include_ir_common() local vtep2_ip=$1; shift local plen=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mz=$1; shift @@ -1311,14 +1322,14 @@ starg_include_ir_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode include source_list $valid_src dst $vtep2_ip src_vni 10010" # Check that invalid source is not forwarded to any VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 0 log_test $? 0 "Block excluded source - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 0 log_test $? 0 "Block excluded source - second VTEP" # Check that valid source is forwarded to both VTEPs. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Forward valid source - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1328,14 +1339,14 @@ starg_include_ir_common() run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep2_ip src_vni 10010" # Check that invalid source is not forwarded to any VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Block excluded source after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 log_test $? 0 "Block excluded source after removal - second VTEP" # Check that valid source is forwarded to the remaining VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 2 log_test $? 0 "Forward valid source after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1350,6 +1361,7 @@ starg_include_ir_ipv4_ipv4() local vtep2_ip=198.51.100.200 local plen=32 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1358,7 +1370,7 @@ starg_include_ir_ipv4_ipv4() echo "-------------------------------------------------------------" starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn" + $grp_dmac $valid_src $invalid_src "mausezahn" } starg_include_ir_ipv6_ipv4() @@ -1369,6 +1381,7 @@ starg_include_ir_ipv6_ipv4() local vtep2_ip=198.51.100.200 local plen=32 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1377,7 +1390,7 @@ starg_include_ir_ipv6_ipv4() echo "-------------------------------------------------------------" starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn -6" + $grp_dmac $valid_src $invalid_src "mausezahn -6" } starg_include_ir_ipv4_ipv6() @@ -1388,6 +1401,7 @@ starg_include_ir_ipv4_ipv6() local vtep2_ip=2001:db8:2000::1 local plen=128 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1396,7 +1410,7 @@ starg_include_ir_ipv4_ipv6() echo "-------------------------------------------------------------" starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn" + $grp_dmac $valid_src $invalid_src "mausezahn" } starg_include_ir_ipv6_ipv6() @@ -1407,6 +1421,7 @@ starg_include_ir_ipv6_ipv6() local vtep2_ip=2001:db8:2000::1 local plen=128 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1415,7 +1430,7 @@ starg_include_ir_ipv6_ipv6() echo "-------------------------------------------------------------" starg_include_ir_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $grp \ - $valid_src $invalid_src "mausezahn -6" + $grp_dmac $valid_src $invalid_src "mausezahn -6" } starg_exclude_p2mp_common() @@ -1425,6 +1440,7 @@ starg_exclude_p2mp_common() local mcast_grp=$1; shift local plen=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mz=$1; shift @@ -1442,12 +1458,12 @@ starg_exclude_p2mp_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode exclude source_list $invalid_src dst $mcast_grp src_vni 10010 via veth0" # Check that invalid source is not forwarded. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 0 log_test $? 0 "Block excluded source" # Check that valid source is forwarded. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Forward valid source" @@ -1455,7 +1471,7 @@ starg_exclude_p2mp_common() run_cmd "ip -n $ns2 address del $mcast_grp/$plen dev veth0" # Check that valid source is not received anymore. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Receive of valid source after removal from group" } @@ -1467,6 +1483,7 @@ starg_exclude_p2mp_ipv4_ipv4() local mcast_grp=238.1.1.1 local plen=32 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1474,7 +1491,7 @@ starg_exclude_p2mp_ipv4_ipv4() echo "Data path: (*, G) EXCLUDE - P2MP - IPv4 overlay / IPv4 underlay" echo "---------------------------------------------------------------" - starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn" } @@ -1485,6 +1502,7 @@ starg_exclude_p2mp_ipv6_ipv4() local mcast_grp=238.1.1.1 local plen=32 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1492,7 +1510,7 @@ starg_exclude_p2mp_ipv6_ipv4() echo "Data path: (*, G) EXCLUDE - P2MP - IPv6 overlay / IPv4 underlay" echo "---------------------------------------------------------------" - starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn -6" } @@ -1503,6 +1521,7 @@ starg_exclude_p2mp_ipv4_ipv6() local mcast_grp=ff0e::2 local plen=128 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1510,7 +1529,7 @@ starg_exclude_p2mp_ipv4_ipv6() echo "Data path: (*, G) EXCLUDE - P2MP - IPv4 overlay / IPv6 underlay" echo "---------------------------------------------------------------" - starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn" } @@ -1521,6 +1540,7 @@ starg_exclude_p2mp_ipv6_ipv6() local mcast_grp=ff0e::2 local plen=128 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1528,7 +1548,7 @@ starg_exclude_p2mp_ipv6_ipv6() echo "Data path: (*, G) EXCLUDE - P2MP - IPv6 overlay / IPv6 underlay" echo "---------------------------------------------------------------" - starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_exclude_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn -6" } @@ -1539,6 +1559,7 @@ starg_include_p2mp_common() local mcast_grp=$1; shift local plen=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local valid_src=$1; shift local invalid_src=$1; shift local mz=$1; shift @@ -1556,12 +1577,12 @@ starg_include_p2mp_common() run_cmd "bridge -n $ns1 mdb replace dev vx0 port vx0 grp $grp permanent filter_mode include source_list $valid_src dst $mcast_grp src_vni 10010 via veth0" # Check that invalid source is not forwarded. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $invalid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 0 log_test $? 0 "Block excluded source" # Check that valid source is forwarded. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Forward valid source" @@ -1569,7 +1590,7 @@ starg_include_p2mp_common() run_cmd "ip -n $ns2 address del $mcast_grp/$plen dev veth0" # Check that valid source is not received anymore. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $valid_src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Receive of valid source after removal from group" } @@ -1581,6 +1602,7 @@ starg_include_p2mp_ipv4_ipv4() local mcast_grp=238.1.1.1 local plen=32 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1588,7 +1610,7 @@ starg_include_p2mp_ipv4_ipv4() echo "Data path: (*, G) INCLUDE - P2MP - IPv4 overlay / IPv4 underlay" echo "---------------------------------------------------------------" - starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn" } @@ -1599,6 +1621,7 @@ starg_include_p2mp_ipv6_ipv4() local mcast_grp=238.1.1.1 local plen=32 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1606,7 +1629,7 @@ starg_include_p2mp_ipv6_ipv4() echo "Data path: (*, G) INCLUDE - P2MP - IPv6 overlay / IPv4 underlay" echo "---------------------------------------------------------------" - starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn -6" } @@ -1617,6 +1640,7 @@ starg_include_p2mp_ipv4_ipv6() local mcast_grp=ff0e::2 local plen=128 local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local valid_src=192.0.2.129 local invalid_src=192.0.2.145 @@ -1624,7 +1648,7 @@ starg_include_p2mp_ipv4_ipv6() echo "Data path: (*, G) INCLUDE - P2MP - IPv4 overlay / IPv6 underlay" echo "---------------------------------------------------------------" - starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn" } @@ -1635,6 +1659,7 @@ starg_include_p2mp_ipv6_ipv6() local mcast_grp=ff0e::2 local plen=128 local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local valid_src=2001:db8:100::1 local invalid_src=2001:db8:200::1 @@ -1642,7 +1667,7 @@ starg_include_p2mp_ipv6_ipv6() echo "Data path: (*, G) INCLUDE - P2MP - IPv6 overlay / IPv6 underlay" echo "---------------------------------------------------------------" - starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp \ + starg_include_p2mp_common $ns1 $ns2 $mcast_grp $plen $grp $grp_dmac \ $valid_src $invalid_src "mausezahn -6" } @@ -1654,6 +1679,7 @@ egress_vni_translation_common() local plen=$1; shift local proto=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local src=$1; shift local mz=$1; shift @@ -1689,20 +1715,20 @@ egress_vni_translation_common() # Make sure that packets sent from the first VTEP over VLAN 10 are # received by the SVI corresponding to the L3VNI (14000 / VLAN 4000) on # the second VTEP, since it is configured as PVID. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev br0.4000 ingress" 101 1 log_test $? 0 "Egress VNI translation - PVID configured" # Remove PVID flag from VLAN 4000 on the second VTEP and make sure # packets are no longer received by the SVI interface. run_cmd "bridge -n $ns2 vlan add vid 4000 dev vx0" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev br0.4000 ingress" 101 1 log_test $? 0 "Egress VNI translation - no PVID configured" # Reconfigure the PVID and make sure packets are received again. run_cmd "bridge -n $ns2 vlan add vid 4000 dev vx0 pvid" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev br0.4000 ingress" 101 2 log_test $? 0 "Egress VNI translation - PVID reconfigured" } @@ -1715,6 +1741,7 @@ egress_vni_translation_ipv4_ipv4() local plen=32 local proto="ipv4" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo @@ -1722,7 +1749,7 @@ egress_vni_translation_ipv4_ipv4() echo "----------------------------------------------------------------" egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \ - $src "mausezahn" + $grp_dmac $src "mausezahn" } egress_vni_translation_ipv6_ipv4() @@ -1733,6 +1760,7 @@ egress_vni_translation_ipv6_ipv4() local plen=32 local proto="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo @@ -1740,7 +1768,7 @@ egress_vni_translation_ipv6_ipv4() echo "----------------------------------------------------------------" egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \ - $src "mausezahn -6" + $grp_dmac $src "mausezahn -6" } egress_vni_translation_ipv4_ipv6() @@ -1751,6 +1779,7 @@ egress_vni_translation_ipv4_ipv6() local plen=128 local proto="ipv4" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo @@ -1758,7 +1787,7 @@ egress_vni_translation_ipv4_ipv6() echo "----------------------------------------------------------------" egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \ - $src "mausezahn" + $grp_dmac $src "mausezahn" } egress_vni_translation_ipv6_ipv6() @@ -1769,6 +1798,7 @@ egress_vni_translation_ipv6_ipv6() local plen=128 local proto="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo @@ -1776,7 +1806,7 @@ egress_vni_translation_ipv6_ipv6() echo "----------------------------------------------------------------" egress_vni_translation_common $ns1 $ns2 $mcast_grp $plen $proto $grp \ - $src "mausezahn -6" + $grp_dmac $src "mausezahn -6" } all_zeros_mdb_common() @@ -1789,12 +1819,18 @@ all_zeros_mdb_common() local vtep4_ip=$1; shift local plen=$1; shift local ipv4_grp=239.1.1.1 + local ipv4_grp_dmac=01:00:5e:01:01:01 local ipv4_unreg_grp=239.2.2.2 + local ipv4_unreg_grp_dmac=01:00:5e:02:02:02 local ipv4_ll_grp=224.0.0.100 + local ipv4_ll_grp_dmac=01:00:5e:00:00:64 local ipv4_src=192.0.2.129 local ipv6_grp=ff0e::1 + local ipv6_grp_dmac=33:33:00:00:00:01 local ipv6_unreg_grp=ff0e::2 + local ipv6_unreg_grp_dmac=33:33:00:00:00:02 local ipv6_ll_grp=ff02::1 + local ipv6_ll_grp_dmac=33:33:00:00:00:01 local ipv6_src=2001:db8:100::1 # Install all-zeros (catchall) MDB entries for IPv4 and IPv6 traffic @@ -1830,7 +1866,7 @@ all_zeros_mdb_common() # Send registered IPv4 multicast and make sure it only arrives to the # first VTEP. - run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_grp_dmac -A $ipv4_src -B $ipv4_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "Registered IPv4 multicast - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 0 @@ -1838,7 +1874,7 @@ all_zeros_mdb_common() # Send unregistered IPv4 multicast that is not link-local and make sure # it arrives to the first and second VTEPs. - run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_unreg_grp_dmac -A $ipv4_src -B $ipv4_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 2 log_test $? 0 "Unregistered IPv4 multicast - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1846,7 +1882,7 @@ all_zeros_mdb_common() # Send IPv4 link-local multicast traffic and make sure it does not # arrive to any VTEP. - run_cmd "ip netns exec $ns1 mausezahn br0.10 -A $ipv4_src -B $ipv4_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn br0.10 -a own -b $ipv4_ll_grp_dmac -A $ipv4_src -B $ipv4_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 2 log_test $? 0 "Link-local IPv4 multicast - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 1 @@ -1881,7 +1917,7 @@ all_zeros_mdb_common() # Send registered IPv6 multicast and make sure it only arrives to the # third VTEP. - run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_grp_dmac -A $ipv6_src -B $ipv6_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 103 1 log_test $? 0 "Registered IPv6 multicast - third VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 104 0 @@ -1889,7 +1925,7 @@ all_zeros_mdb_common() # Send unregistered IPv6 multicast that is not link-local and make sure # it arrives to the third and fourth VTEPs. - run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_unreg_grp_dmac -A $ipv6_src -B $ipv6_unreg_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 103 2 log_test $? 0 "Unregistered IPv6 multicast - third VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 104 1 @@ -1897,7 +1933,7 @@ all_zeros_mdb_common() # Send IPv6 link-local multicast traffic and make sure it does not # arrive to any VTEP. - run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -A $ipv6_src -B $ipv6_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 mausezahn -6 br0.10 -a own -b $ipv6_ll_grp_dmac -A $ipv6_src -B $ipv6_ll_grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 103 2 log_test $? 0 "Link-local IPv6 multicast - third VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 104 1 @@ -1972,6 +2008,7 @@ mdb_fdb_common() local plen=$1; shift local proto=$1; shift local grp=$1; shift + local grp_dmac=$1; shift local src=$1; shift local mz=$1; shift @@ -1995,7 +2032,7 @@ mdb_fdb_common() # Send IP multicast traffic and make sure it is forwarded by the MDB # and only arrives to the first VTEP. - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "IP multicast - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 0 @@ -2012,7 +2049,7 @@ mdb_fdb_common() # Remove the MDB entry and make sure that IP multicast is now forwarded # by the FDB to the second VTEP. run_cmd "bridge -n $ns1 mdb del dev vx0 port vx0 grp $grp dst $vtep1_ip src_vni 10010" - run_cmd "ip netns exec $ns1 $mz br0.10 -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" + run_cmd "ip netns exec $ns1 $mz br0.10 -a own -b $grp_dmac -A $src -B $grp -t udp sp=12345,dp=54321 -p 100 -c 1 -q" tc_check_packets "$ns2" "dev vx0 ingress" 101 1 log_test $? 0 "IP multicast after removal - first VTEP" tc_check_packets "$ns2" "dev vx0 ingress" 102 2 @@ -2028,14 +2065,15 @@ mdb_fdb_ipv4_ipv4() local plen=32 local proto="ipv4" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo echo "Data path: MDB with FDB - IPv4 overlay / IPv4 underlay" echo "------------------------------------------------------" - mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \ - "mausezahn" + mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \ + $grp_dmac $src "mausezahn" } mdb_fdb_ipv6_ipv4() @@ -2047,14 +2085,15 @@ mdb_fdb_ipv6_ipv4() local plen=32 local proto="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo echo "Data path: MDB with FDB - IPv6 overlay / IPv4 underlay" echo "------------------------------------------------------" - mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \ - "mausezahn -6" + mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \ + $grp_dmac $src "mausezahn -6" } mdb_fdb_ipv4_ipv6() @@ -2066,14 +2105,15 @@ mdb_fdb_ipv4_ipv6() local plen=128 local proto="ipv4" local grp=239.1.1.1 + local grp_dmac=01:00:5e:01:01:01 local src=192.0.2.129 echo echo "Data path: MDB with FDB - IPv4 overlay / IPv6 underlay" echo "------------------------------------------------------" - mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \ - "mausezahn" + mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \ + $grp_dmac $src "mausezahn" } mdb_fdb_ipv6_ipv6() @@ -2085,14 +2125,15 @@ mdb_fdb_ipv6_ipv6() local plen=128 local proto="ipv6" local grp=ff0e::1 + local grp_dmac=33:33:00:00:00:01 local src=2001:db8:100::1 echo echo "Data path: MDB with FDB - IPv6 overlay / IPv6 underlay" echo "------------------------------------------------------" - mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp $src \ - "mausezahn -6" + mdb_fdb_common $ns1 $ns2 $vtep1_ip $vtep2_ip $plen $proto $grp \ + $grp_dmac $src "mausezahn -6" } mdb_grp1_loop() @@ -2127,7 +2168,9 @@ mdb_torture_common() local vtep1_ip=$1; shift local vtep2_ip=$1; shift local grp1=$1; shift + local grp1_dmac=$1; shift local grp2=$1; shift + local grp2_dmac=$1; shift local src=$1; shift local mz=$1; shift local pid1 @@ -2152,9 +2195,9 @@ mdb_torture_common() pid1=$! mdb_grp2_loop $ns1 $vtep1_ip $vtep2_ip $grp2 & pid2=$! - ip netns exec $ns1 $mz br0.10 -A $src -B $grp1 -t udp sp=12345,dp=54321 -p 100 -c 0 -q & + ip netns exec $ns1 $mz br0.10 -a own -b $grp1_dmac -A $src -B $grp1 -t udp sp=12345,dp=54321 -p 100 -c 0 -q & pid3=$! - ip netns exec $ns1 $mz br0.10 -A $src -B $grp2 -t udp sp=12345,dp=54321 -p 100 -c 0 -q & + ip netns exec $ns1 $mz br0.10 -a own -b $grp2_dmac -A $src -B $grp2 -t udp sp=12345,dp=54321 -p 100 -c 0 -q & pid4=$! sleep 30 @@ -2170,15 +2213,17 @@ mdb_torture_ipv4_ipv4() local vtep1_ip=198.51.100.100 local vtep2_ip=198.51.100.200 local grp1=239.1.1.1 + local grp1_dmac=01:00:5e:01:01:01 local grp2=239.2.2.2 + local grp2_dmac=01:00:5e:02:02:02 local src=192.0.2.129 echo echo "Data path: MDB torture test - IPv4 overlay / IPv4 underlay" echo "----------------------------------------------------------" - mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \ - "mausezahn" + mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \ + $grp2_dmac $src "mausezahn" } mdb_torture_ipv6_ipv4() @@ -2187,15 +2232,17 @@ mdb_torture_ipv6_ipv4() local vtep1_ip=198.51.100.100 local vtep2_ip=198.51.100.200 local grp1=ff0e::1 + local grp1_dmac=33:33:00:00:00:01 local grp2=ff0e::2 + local grp2_dmac=33:33:00:00:00:02 local src=2001:db8:100::1 echo echo "Data path: MDB torture test - IPv6 overlay / IPv4 underlay" echo "----------------------------------------------------------" - mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \ - "mausezahn -6" + mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \ + $grp2_dmac $src "mausezahn -6" } mdb_torture_ipv4_ipv6() @@ -2204,15 +2251,17 @@ mdb_torture_ipv4_ipv6() local vtep1_ip=2001:db8:1000::1 local vtep2_ip=2001:db8:2000::1 local grp1=239.1.1.1 + local grp1_dmac=01:00:5e:01:01:01 local grp2=239.2.2.2 + local grp2_dmac=01:00:5e:02:02:02 local src=192.0.2.129 echo echo "Data path: MDB torture test - IPv4 overlay / IPv6 underlay" echo "----------------------------------------------------------" - mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \ - "mausezahn" + mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \ + $grp2_dmac $src "mausezahn" } mdb_torture_ipv6_ipv6() @@ -2221,15 +2270,17 @@ mdb_torture_ipv6_ipv6() local vtep1_ip=2001:db8:1000::1 local vtep2_ip=2001:db8:2000::1 local grp1=ff0e::1 + local grp1_dmac=33:33:00:00:00:01 local grp2=ff0e::2 + local grp2_dmac=33:33:00:00:00:02 local src=2001:db8:100::1 echo echo "Data path: MDB torture test - IPv6 overlay / IPv6 underlay" echo "----------------------------------------------------------" - mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp2 $src \ - "mausezahn -6" + mdb_torture_common $ns1 $vtep1_ip $vtep2_ip $grp1 $grp1_dmac $grp2 \ + $grp2_dmac $src "mausezahn -6" } ################################################################################ From 77ffc72b497e43e6a8f832729eb2358c3492b9c1 Mon Sep 17 00:00:00 2001 From: Nikita Kiryushin Date: Fri, 22 Mar 2024 21:07:53 +0300 Subject: [PATCH 025/256] ACPICA: debugger: check status of acpi_evaluate_object() in acpi_db_walk_for_fields() [ Upstream commit 40e2710860e57411ab57a1529c5a2748abbe8a19 ] ACPICA commit 9061cd9aa131205657c811a52a9f8325a040c6c9 Errors in acpi_evaluate_object() can lead to incorrect state of buffer. This can lead to access to data in previously ACPI_FREEd buffer and secondary ACPI_FREE to the same buffer later. Handle errors in acpi_evaluate_object the same way it is done earlier with acpi_ns_handle_to_pathname. Found by Linux Verification Center (linuxtesting.org) with SVACE. Link: https://github.com/acpica/acpica/commit/9061cd9a Fixes: 5fd033288a86 ("ACPICA: debugger: add command to dump all fields of particular subtype") Signed-off-by: Nikita Kiryushin Signed-off-by: Rafael J. Wysocki Signed-off-by: Sasha Levin --- drivers/acpi/acpica/dbnames.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/acpi/acpica/dbnames.c b/drivers/acpi/acpica/dbnames.c index b91155ea9c34..c9131259f717 100644 --- a/drivers/acpi/acpica/dbnames.c +++ b/drivers/acpi/acpica/dbnames.c @@ -550,8 +550,12 @@ acpi_db_walk_for_fields(acpi_handle obj_handle, ACPI_FREE(buffer.pointer); buffer.length = ACPI_ALLOCATE_LOCAL_BUFFER; - acpi_evaluate_object(obj_handle, NULL, NULL, &buffer); - + status = acpi_evaluate_object(obj_handle, NULL, NULL, &buffer); + if (ACPI_FAILURE(status)) { + acpi_os_printf("Could Not evaluate object %p\n", + obj_handle); + return (AE_OK); + } /* * Since this is a field unit, surround the output in braces */ From b033da1461c1dd5a3a1ef868b2680cfe06331b0d Mon Sep 17 00:00:00 2001 From: Jie Wang Date: Mon, 25 Mar 2024 20:43:09 +0800 Subject: [PATCH 026/256] net: hns3: fix index limit to support all queue stats [ Upstream commit 47e39d213e09c6cae0d6b4d95e454ea404013312 ] Currently, hns hardware supports more than 512 queues and the index limit in hclge_comm_tqps_update_stats is wrong. So this patch removes it. Fixes: 287db5c40d15 ("net: hns3: create new set of common tqp stats APIs for PF and VF reuse") Signed-off-by: Jie Wang Signed-off-by: Jijie Shao Reviewed-by: Michal Kubiak Reviewed-by: Kalesh AP Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c index f3c9395d8351..618f66d9586b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_common/hclge_comm_tqp_stats.c @@ -85,7 +85,7 @@ int hclge_comm_tqps_update_stats(struct hnae3_handle *handle, hclge_comm_cmd_setup_basic_desc(&desc, HCLGE_OPC_QUERY_TX_STATS, true); - desc.data[0] = cpu_to_le32(tqp->index & 0x1ff); + desc.data[0] = cpu_to_le32(tqp->index); ret = hclge_comm_cmd_send(hw, &desc, 1); if (ret) { dev_err(&hw->cmq.csq.pdev->dev, From 1b550dae55901c2cc9075d6a7155a71b4f516e86 Mon Sep 17 00:00:00 2001 From: Yonglong Liu Date: Mon, 25 Mar 2024 20:43:10 +0800 Subject: [PATCH 027/256] net: hns3: fix kernel crash when devlink reload during pf initialization [ Upstream commit 93305b77ffcb042f1538ecc383505e87d95aa05a ] The devlink reload process will access the hardware resources, but the register operation is done before the hardware is initialized. So, processing the devlink reload during initialization may lead to kernel crash. This patch fixes this by taking devl_lock during initialization. Fixes: b741269b2759 ("net: hns3: add support for registering devlink for PF") Signed-off-by: Yonglong Liu Signed-off-by: Jijie Shao Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index f1ca2cda2961..dfd0c5f4cb9f 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -11614,6 +11614,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) if (ret) goto err_pci_uninit; + devl_lock(hdev->devlink); + /* Firmware command queue initialize */ ret = hclge_comm_cmd_queue_init(hdev->pdev, &hdev->hw.hw); if (ret) @@ -11793,6 +11795,7 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_task_schedule(hdev, round_jiffies_relative(HZ)); + devl_unlock(hdev->devlink); return 0; err_mdiobus_unreg: @@ -11805,6 +11808,7 @@ err_msi_uninit: err_cmd_uninit: hclge_comm_cmd_uninit(hdev->ae_dev, &hdev->hw.hw); err_devlink_uninit: + devl_unlock(hdev->devlink); hclge_devlink_uninit(hdev); err_pci_uninit: pcim_iounmap(pdev, hdev->hw.hw.io_base); From 872f574f88606121c7c56c0cd26ad305d2960f78 Mon Sep 17 00:00:00 2001 From: Jian Shen Date: Mon, 25 Mar 2024 20:43:11 +0800 Subject: [PATCH 028/256] net: hns3: mark unexcuted loopback test result as UNEXECUTED [ Upstream commit 5bd088d6c21a45ee70e6116879310e54174d75eb ] Currently, loopback test may be skipped when resetting, but the test result will still show as 'PASS', because the driver doesn't set ETH_TEST_FL_FAILED flag. Fix it by setting the flag and initializating the value to UNEXECUTED. Fixes: 4c8dab1c709c ("net: hns3: reconstruct function hns3_self_test") Signed-off-by: Jian Shen Signed-off-by: Jijie Shao Reviewed-by: Michal Kubiak Reviewed-by: Simon Horman Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../ethernet/hisilicon/hns3/hns3_ethtool.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 682239f33082..78181eea93c1 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -78,6 +78,9 @@ static const struct hns3_stats hns3_rxq_stats[] = { #define HNS3_NIC_LB_TEST_NO_MEM_ERR 1 #define HNS3_NIC_LB_TEST_TX_CNT_ERR 2 #define HNS3_NIC_LB_TEST_RX_CNT_ERR 3 +#define HNS3_NIC_LB_TEST_UNEXECUTED 4 + +static int hns3_get_sset_count(struct net_device *netdev, int stringset); static int hns3_lp_setup(struct net_device *ndev, enum hnae3_loop loop, bool en) { @@ -418,18 +421,26 @@ static void hns3_do_external_lb(struct net_device *ndev, static void hns3_self_test(struct net_device *ndev, struct ethtool_test *eth_test, u64 *data) { + int cnt = hns3_get_sset_count(ndev, ETH_SS_TEST); struct hns3_nic_priv *priv = netdev_priv(ndev); struct hnae3_handle *h = priv->ae_handle; int st_param[HNAE3_LOOP_NONE][2]; bool if_running = netif_running(ndev); + int i; + + /* initialize the loopback test result, avoid marking an unexcuted + * loopback test as PASS. + */ + for (i = 0; i < cnt; i++) + data[i] = HNS3_NIC_LB_TEST_UNEXECUTED; if (hns3_nic_resetting(ndev)) { netdev_err(ndev, "dev resetting!"); - return; + goto failure; } if (!(eth_test->flags & ETH_TEST_FL_OFFLINE)) - return; + goto failure; if (netif_msg_ifdown(h)) netdev_info(ndev, "self test start\n"); @@ -451,6 +462,10 @@ static void hns3_self_test(struct net_device *ndev, if (netif_msg_ifdown(h)) netdev_info(ndev, "self test end\n"); + return; + +failure: + eth_test->flags |= ETH_TEST_FL_FAILED; } static void hns3_update_limit_promisc_mode(struct net_device *netdev, From dc4bce20fa9e6af3a8b4aaa1eb26198c04fcef8a Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Mar 2024 16:56:45 +0100 Subject: [PATCH 029/256] tls: recv: process_rx_list shouldn't use an offset with kvec [ Upstream commit 7608a971fdeb4c3eefa522d1bfe8d4bc6b2481cc ] Only MSG_PEEK needs to copy from an offset during the final process_rx_list call, because the bytes we copied at the beginning of tls_sw_recvmsg were left on the rx_list. In the KVEC case, we removed data from the rx_list as we were copying it, so there's no need to use an offset, just like in the normal case. Fixes: 692d7b5d1f91 ("tls: Fix recvmsg() to be able to peek across multiple records") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/e5487514f828e0347d2b92ca40002c62b58af73d.1711120964.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index acf5bb74fd38..8e753d10e694 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2152,7 +2152,7 @@ recv_end: } /* Drain records from the rx_list & copy if required */ - if (is_peek || is_kvec) + if (is_peek) err = process_rx_list(ctx, msg, &control, copied + peeked, decrypted - peeked, is_peek, NULL); else From f19e995b4813a81b63df940ff2f8ba00882016c2 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Mar 2024 16:56:46 +0100 Subject: [PATCH 030/256] tls: adjust recv return with async crypto and failed copy to userspace [ Upstream commit 85eef9a41d019b59be7bc91793f26251909c0710 ] process_rx_list may not copy as many bytes as we want to the userspace buffer, for example in case we hit an EFAULT during the copy. If this happens, we should only count the bytes that were actually copied, which may be 0. Subtracting async_copy_bytes is correct in both peek and !peek cases, because decrypted == async_copy_bytes + peeked for the peek case: peek is always !ZC, and we can go through either the sync or async path. In the async case, we add chunk to both decrypted and async_copy_bytes. In the sync case, we add chunk to both decrypted and peeked. I missed that in commit 6caaf104423d ("tls: fix peeking with sync+async decryption"). Fixes: 4d42cd6bc2ac ("tls: rx: fix return value for async crypto") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/1b5a1eaab3c088a9dd5d9f1059ceecd7afe888d1.1711120964.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/tls/tls_sw.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 8e753d10e694..925de4caa894 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2158,6 +2158,9 @@ recv_end: else err = process_rx_list(ctx, msg, &control, 0, async_copy_bytes, is_peek, NULL); + + /* we could have copied less than we wanted, and possibly nothing */ + decrypted += max(err, 0) - async_copy_bytes; } copied += decrypted; From f1b7f14130d782433bc98c1e1e41ce6b4d4c3096 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Mon, 25 Mar 2024 16:56:48 +0100 Subject: [PATCH 031/256] tls: get psock ref after taking rxlock to avoid leak [ Upstream commit 417e91e856099e9b8a42a2520e2255e6afe024be ] At the start of tls_sw_recvmsg, we take a reference on the psock, and then call tls_rx_reader_lock. If that fails, we return directly without releasing the reference. Instead of adding a new label, just take the reference after locking has succeeded, since we don't need it before. Fixes: 4cbc325ed6b4 ("tls: rx: allow only one reader at a time") Signed-off-by: Sabrina Dubroca Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/fe2ade22d030051ce4c3638704ed58b67d0df643.1711120964.git.sd@queasysnail.net Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- net/tls/tls_sw.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index 925de4caa894..df166f6afad8 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -1976,10 +1976,10 @@ int tls_sw_recvmsg(struct sock *sk, if (unlikely(flags & MSG_ERRQUEUE)) return sock_recv_errqueue(sk, msg, len, SOL_IP, IP_RECVERR); - psock = sk_psock_get(sk); err = tls_rx_reader_lock(sk, ctx, flags & MSG_DONTWAIT); if (err < 0) return err; + psock = sk_psock_get(sk); bpf_strp_enabled = sk_psock_strp_enabled(psock); /* If crypto failed the connection is broken */ From 867a2f598af6a645c865d1101b58c5e070c6dd9e Mon Sep 17 00:00:00 2001 From: David Thompson Date: Mon, 25 Mar 2024 14:36:27 -0400 Subject: [PATCH 032/256] mlxbf_gige: call request_irq() after NAPI initialized [ Upstream commit f7442a634ac06b953fc1f7418f307b25acd4cfbc ] The mlxbf_gige driver encounters a NULL pointer exception in mlxbf_gige_open() when kdump is enabled. The sequence to reproduce the exception is as follows: a) enable kdump b) trigger kdump via "echo c > /proc/sysrq-trigger" c) kdump kernel executes d) kdump kernel loads mlxbf_gige module e) the mlxbf_gige module runs its open() as the the "oob_net0" interface is brought up f) mlxbf_gige module will experience an exception during its open(), something like: Unable to handle kernel NULL pointer dereference at virtual address 0000000000000000 Mem abort info: ESR = 0x0000000086000004 EC = 0x21: IABT (current EL), IL = 32 bits SET = 0, FnV = 0 EA = 0, S1PTW = 0 FSC = 0x04: level 0 translation fault user pgtable: 4k pages, 48-bit VAs, pgdp=00000000e29a4000 [0000000000000000] pgd=0000000000000000, p4d=0000000000000000 Internal error: Oops: 0000000086000004 [#1] SMP CPU: 0 PID: 812 Comm: NetworkManager Tainted: G OE 5.15.0-1035-bluefield #37-Ubuntu Hardware name: https://www.mellanox.com BlueField-3 SmartNIC Main Card/BlueField-3 SmartNIC Main Card, BIOS 4.6.0.13024 Jan 19 2024 pstate: 80400009 (Nzcv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) pc : 0x0 lr : __napi_poll+0x40/0x230 sp : ffff800008003e00 x29: ffff800008003e00 x28: 0000000000000000 x27: 00000000ffffffff x26: ffff000066027238 x25: ffff00007cedec00 x24: ffff800008003ec8 x23: 000000000000012c x22: ffff800008003eb7 x21: 0000000000000000 x20: 0000000000000001 x19: ffff000066027238 x18: 0000000000000000 x17: ffff578fcb450000 x16: ffffa870b083c7c0 x15: 0000aaab010441d0 x14: 0000000000000001 x13: 00726f7272655f65 x12: 6769675f6662786c x11: 0000000000000000 x10: 0000000000000000 x9 : ffffa870b0842398 x8 : 0000000000000004 x7 : fe5a48b9069706ea x6 : 17fdb11fc84ae0d2 x5 : d94a82549d594f35 x4 : 0000000000000000 x3 : 0000000000400100 x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff000066027238 Call trace: 0x0 net_rx_action+0x178/0x360 __do_softirq+0x15c/0x428 __irq_exit_rcu+0xac/0xec irq_exit+0x18/0x2c handle_domain_irq+0x6c/0xa0 gic_handle_irq+0xec/0x1b0 call_on_irq_stack+0x20/0x2c do_interrupt_handler+0x5c/0x70 el1_interrupt+0x30/0x50 el1h_64_irq_handler+0x18/0x2c el1h_64_irq+0x7c/0x80 __setup_irq+0x4c0/0x950 request_threaded_irq+0xf4/0x1bc mlxbf_gige_request_irqs+0x68/0x110 [mlxbf_gige] mlxbf_gige_open+0x5c/0x170 [mlxbf_gige] __dev_open+0x100/0x220 __dev_change_flags+0x16c/0x1f0 dev_change_flags+0x2c/0x70 do_setlink+0x220/0xa40 __rtnl_newlink+0x56c/0x8a0 rtnl_newlink+0x58/0x84 rtnetlink_rcv_msg+0x138/0x3c4 netlink_rcv_skb+0x64/0x130 rtnetlink_rcv+0x20/0x30 netlink_unicast+0x2ec/0x360 netlink_sendmsg+0x278/0x490 __sock_sendmsg+0x5c/0x6c ____sys_sendmsg+0x290/0x2d4 ___sys_sendmsg+0x84/0xd0 __sys_sendmsg+0x70/0xd0 __arm64_sys_sendmsg+0x2c/0x40 invoke_syscall+0x78/0x100 el0_svc_common.constprop.0+0x54/0x184 do_el0_svc+0x30/0xac el0_svc+0x48/0x160 el0t_64_sync_handler+0xa4/0x12c el0t_64_sync+0x1a4/0x1a8 Code: bad PC value ---[ end trace 7d1c3f3bf9d81885 ]--- Kernel panic - not syncing: Oops: Fatal exception in interrupt Kernel Offset: 0x2870a7a00000 from 0xffff800008000000 PHYS_OFFSET: 0x80000000 CPU features: 0x0,000005c1,a3332a5a Memory Limit: none ---[ end Kernel panic - not syncing: Oops: Fatal exception in interrupt ]--- The exception happens because there is a pending RX interrupt before the call to request_irq(RX IRQ) executes. Then, the RX IRQ handler fires immediately after this request_irq() completes. The RX IRQ handler runs "napi_schedule()" before NAPI is fully initialized via "netif_napi_add()" and "napi_enable()", both which happen later in the open() logic. The logic in mlxbf_gige_open() must fully initialize NAPI before any calls to request_irq() execute. Fixes: f92e1869d74e ("Add Mellanox BlueField Gigabit Ethernet driver") Signed-off-by: David Thompson Reviewed-by: Asmaa Mnebhi Link: https://lore.kernel.org/r/20240325183627.7641-1-davthompson@nvidia.com Signed-off-by: Jakub Kicinski Signed-off-by: Sasha Levin --- .../mellanox/mlxbf_gige/mlxbf_gige_main.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c index 044ff5f87b5e..f1fa5f10051f 100644 --- a/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c +++ b/drivers/net/ethernet/mellanox/mlxbf_gige/mlxbf_gige_main.c @@ -139,13 +139,10 @@ static int mlxbf_gige_open(struct net_device *netdev) control |= MLXBF_GIGE_CONTROL_PORT_EN; writeq(control, priv->base + MLXBF_GIGE_CONTROL); - err = mlxbf_gige_request_irqs(priv); - if (err) - return err; mlxbf_gige_cache_stats(priv); err = mlxbf_gige_clean_port(priv); if (err) - goto free_irqs; + return err; /* Clear driver's valid_polarity to match hardware, * since the above call to clean_port() resets the @@ -166,6 +163,10 @@ static int mlxbf_gige_open(struct net_device *netdev) napi_enable(&priv->napi); netif_start_queue(netdev); + err = mlxbf_gige_request_irqs(priv); + if (err) + goto napi_deinit; + /* Set bits in INT_EN that we care about */ int_en = MLXBF_GIGE_INT_EN_HW_ACCESS_ERROR | MLXBF_GIGE_INT_EN_TX_CHECKSUM_INPUTS | @@ -182,14 +183,17 @@ static int mlxbf_gige_open(struct net_device *netdev) return 0; +napi_deinit: + netif_stop_queue(netdev); + napi_disable(&priv->napi); + netif_napi_del(&priv->napi); + mlxbf_gige_rx_deinit(priv); + tx_deinit: mlxbf_gige_tx_deinit(priv); phy_deinit: phy_stop(phydev); - -free_irqs: - mlxbf_gige_free_irqs(priv); return err; } From 3f0784b2f1eb9147973d8c43ba085c5fdf44ff69 Mon Sep 17 00:00:00 2001 From: Andrei Matei Date: Tue, 26 Mar 2024 22:42:45 -0400 Subject: [PATCH 033/256] bpf: Protect against int overflow for stack access size [ Upstream commit ecc6a2101840177e57c925c102d2d29f260d37c8 ] This patch re-introduces protection against the size of access to stack memory being negative; the access size can appear negative as a result of overflowing its signed int representation. This should not actually happen, as there are other protections along the way, but we should protect against it anyway. One code path was missing such protections (fixed in the previous patch in the series), causing out-of-bounds array accesses in check_stack_range_initialized(). This patch causes the verification of a program with such a non-sensical access size to fail. This check used to exist in a more indirect way, but was inadvertendly removed in a833a17aeac7. Fixes: a833a17aeac7 ("bpf: Fix verification of indirect var-off stack access") Reported-by: syzbot+33f4297b5f927648741a@syzkaller.appspotmail.com Reported-by: syzbot+aafd0513053a1cbf52ef@syzkaller.appspotmail.com Closes: https://lore.kernel.org/bpf/CAADnVQLORV5PT0iTAhRER+iLBTkByCYNBYyvBSgjN1T31K+gOw@mail.gmail.com/ Acked-by: Andrii Nakryiko Signed-off-by: Andrei Matei Link: https://lore.kernel.org/r/20240327024245.318299-3-andreimatei1@gmail.com Signed-off-by: Alexei Starovoitov Signed-off-by: Sasha Levin --- kernel/bpf/verifier.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 396c4c66932f..c9fc734989c6 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -6637,6 +6637,11 @@ static int check_stack_access_within_bounds( err = check_stack_slot_within_bounds(env, min_off, state, type); if (!err && max_off > 0) err = -EINVAL; /* out of stack access into non-negative offsets */ + if (!err && access_size < 0) + /* access_size should not be negative (or overflow an int); others checks + * along the way should have prevented such an access. + */ + err = -EFAULT; /* invalid negative access size; integer overflow? */ if (err) { if (tnum_is_const(reg->var_off)) { From 614bc8c71ed5ae9ffdc89886003d20afbf20e86c Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 27 Mar 2024 14:13:24 +0000 Subject: [PATCH 034/256] cifs: Fix duplicate fscache cookie warnings [ Upstream commit 8876a37277cb832e1861c35f8c661825179f73f5 ] fscache emits a lot of duplicate cookie warnings with cifs because the index key for the fscache cookies does not include everything that the cifs_find_inode() function does. The latter is used with iget5_locked() to distinguish between inodes in the local inode cache. Fix this by adding the creation time and file type to the fscache cookie key. Additionally, add a couple of comments to note that if one is changed the other must be also. Signed-off-by: David Howells Fixes: 70431bfd825d ("cifs: Support fscache indexing rewrite") cc: Shyam Prasad N cc: Rohith Surabattula cc: Jeff Layton cc: linux-cifs@vger.kernel.org cc: netfs@lists.linux.dev cc: linux-fsdevel@vger.kernel.org Signed-off-by: Steve French Signed-off-by: Sasha Levin --- fs/smb/client/fscache.c | 16 +++++++++++++++- fs/smb/client/inode.c | 2 ++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/fs/smb/client/fscache.c b/fs/smb/client/fscache.c index e5cad149f5a2..a4ee801b2939 100644 --- a/fs/smb/client/fscache.c +++ b/fs/smb/client/fscache.c @@ -12,6 +12,16 @@ #include "cifs_fs_sb.h" #include "cifsproto.h" +/* + * Key for fscache inode. [!] Contents must match comparisons in cifs_find_inode(). + */ +struct cifs_fscache_inode_key { + + __le64 uniqueid; /* server inode number */ + __le64 createtime; /* creation time on server */ + u8 type; /* S_IFMT file type */ +} __packed; + static void cifs_fscache_fill_volume_coherency( struct cifs_tcon *tcon, struct cifs_fscache_volume_coherency_data *cd) @@ -97,15 +107,19 @@ void cifs_fscache_release_super_cookie(struct cifs_tcon *tcon) void cifs_fscache_get_inode_cookie(struct inode *inode) { struct cifs_fscache_inode_coherency_data cd; + struct cifs_fscache_inode_key key; struct cifsInodeInfo *cifsi = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + key.uniqueid = cpu_to_le64(cifsi->uniqueid); + key.createtime = cpu_to_le64(cifsi->createtime); + key.type = (inode->i_mode & S_IFMT) >> 12; cifs_fscache_fill_coherency(&cifsi->netfs.inode, &cd); cifsi->netfs.cache = fscache_acquire_cookie(tcon->fscache, 0, - &cifsi->uniqueid, sizeof(cifsi->uniqueid), + &key, sizeof(key), &cd, sizeof(cd), i_size_read(&cifsi->netfs.inode)); if (cifsi->netfs.cache) diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index cb9e719e67ae..fa6330d586e8 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1390,6 +1390,8 @@ cifs_find_inode(struct inode *inode, void *opaque) { struct cifs_fattr *fattr = opaque; + /* [!] The compared values must be the same in struct cifs_fscache_inode_key. */ + /* don't match inode with different uniqueid */ if (CIFS_I(inode)->uniqueid != fattr->cf_uniqueid) return 0; From cddd0480a682426d44fdadb55354367a905cedb7 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Mar 2024 01:27:50 +0100 Subject: [PATCH 035/256] netfilter: nf_tables: reject destroy command to remove basechain hooks [ Upstream commit b32ca27fa238ff83427d23bef2a5b741e2a88a1e ] Report EOPNOTSUPP if NFT_MSG_DESTROYCHAIN is used to delete hooks in an existing netdev basechain, thus, only NFT_MSG_DELCHAIN is allowed. Fixes: 7d937b107108f ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index f10419ba6e0b..0653f1e5e892 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2934,7 +2934,8 @@ static int nf_tables_delchain(struct sk_buff *skb, const struct nfnl_info *info, nft_ctx_init(&ctx, net, skb, info->nlh, family, table, chain, nla); if (nla[NFTA_CHAIN_HOOK]) { - if (chain->flags & NFT_CHAIN_HW_OFFLOAD) + if (NFNL_MSG_TYPE(info->nlh->nlmsg_type) == NFT_MSG_DESTROYCHAIN || + chain->flags & NFT_CHAIN_HW_OFFLOAD) return -EOPNOTSUPP; if (nft_is_base_chain(chain)) { From 2d0d1abe119af2595a760ccc404cdb4e8c846de9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Mar 2024 01:27:59 +0100 Subject: [PATCH 036/256] netfilter: nf_tables: reject table flag and netdev basechain updates [ Upstream commit 1e1fb6f00f52812277963365d9bd835b9b0ea4e0 ] netdev basechain updates are stored in the transaction object hook list. When setting on the table dormant flag, it iterates over the existing hooks in the basechain. Thus, skipping the hooks that are being added/deleted in this transaction, which leaves hook registration in inconsistent state. Reject table flag updates in combination with netdev basechain updates in the same batch: - Update table flags and add/delete basechain: Check from basechain update path if there are pending flag updates for this table. - add/delete basechain and update table flags: Iterate over the transaction list to search for basechain updates from the table update path. In both cases, the batch is rejected. Based on suggestion from Florian Westphal. Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") Fixes: 7d937b107108f ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 31 ++++++++++++++++++++++++++++++- 1 file changed, 30 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 0653f1e5e892..6e4e22a10a82 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1200,6 +1200,25 @@ static void nf_tables_table_disable(struct net *net, struct nft_table *table) #define __NFT_TABLE_F_UPDATE (__NFT_TABLE_F_WAS_DORMANT | \ __NFT_TABLE_F_WAS_AWAKEN) +static bool nft_table_pending_update(const struct nft_ctx *ctx) +{ + struct nftables_pernet *nft_net = nft_pernet(ctx->net); + struct nft_trans *trans; + + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return true; + + list_for_each_entry(trans, &nft_net->commit_list, list) { + if ((trans->msg_type == NFT_MSG_NEWCHAIN || + trans->msg_type == NFT_MSG_DELCHAIN) && + trans->ctx.table == ctx->table && + nft_trans_chain_update(trans)) + return true; + } + + return false; +} + static int nf_tables_updtable(struct nft_ctx *ctx) { struct nft_trans *trans; @@ -1223,7 +1242,7 @@ static int nf_tables_updtable(struct nft_ctx *ctx) return -EOPNOTSUPP; /* No dormant off/on/off/on games in single transaction */ - if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + if (nft_table_pending_update(ctx)) return -EINVAL; trans = nft_trans_alloc(ctx, NFT_MSG_NEWTABLE, @@ -2621,6 +2640,13 @@ static int nf_tables_updchain(struct nft_ctx *ctx, u8 genmask, u8 policy, } } + if (table->flags & __NFT_TABLE_F_UPDATE && + !list_empty(&hook.list)) { + NL_SET_BAD_ATTR(extack, attr); + err = -EOPNOTSUPP; + goto err_hooks; + } + if (!(table->flags & NFT_TABLE_F_DORMANT) && nft_is_base_chain(chain) && !list_empty(&hook.list)) { @@ -2850,6 +2876,9 @@ static int nft_delchain_hook(struct nft_ctx *ctx, struct nft_trans *trans; int err; + if (ctx->table->flags & __NFT_TABLE_F_UPDATE) + return -EOPNOTSUPP; + err = nft_chain_parse_hook(ctx->net, basechain, nla, &chain_hook, ctx->family, chain->flags, extack); if (err < 0) From cf893953633db14a749aa6677bed20529cfb13ac Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Thu, 21 Mar 2024 01:28:07 +0100 Subject: [PATCH 037/256] netfilter: nf_tables: skip netdev hook unregistration if table is dormant [ Upstream commit 216e7bf7402caf73f4939a8e0248392e96d7c0da ] Skip hook unregistration when adding or deleting devices from an existing netdev basechain. Otherwise, commit/abort path try to unregister hooks which not enabled. Fixes: b9703ed44ffb ("netfilter: nf_tables: support for adding new devices to an existing netdev chain") Fixes: 7d937b107108 ("netfilter: nf_tables: support for deleting devices in an existing netdev chain") Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- net/netfilter/nf_tables_api.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 6e4e22a10a82..b2ef7e37f11c 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -10083,9 +10083,11 @@ static int nf_tables_commit(struct net *net, struct sk_buff *skb) if (nft_trans_chain_update(trans)) { nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN, &nft_trans_chain_hooks(trans)); - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } } else { nft_chain_del(trans->ctx.chain); nf_tables_chain_notify(&trans->ctx, NFT_MSG_DELCHAIN, @@ -10357,9 +10359,11 @@ static int __nf_tables_abort(struct net *net, enum nfnl_abort_action action) break; case NFT_MSG_NEWCHAIN: if (nft_trans_chain_update(trans)) { - nft_netdev_unregister_hooks(net, - &nft_trans_chain_hooks(trans), - true); + if (!(trans->ctx.table->flags & NFT_TABLE_F_DORMANT)) { + nft_netdev_unregister_hooks(net, + &nft_trans_chain_hooks(trans), + true); + } free_percpu(nft_trans_chain_stats(trans)); kfree(nft_trans_chain_name(trans)); nft_trans_destroy(trans); From 4bb7ad116be79b652e1075727dbc16b541d449c3 Mon Sep 17 00:00:00 2001 From: Justin Chen Date: Mon, 25 Mar 2024 12:30:24 -0700 Subject: [PATCH 038/256] net: bcmasp: Bring up unimac after PHY link up [ Upstream commit dfd222e2aef68818320a57b13a1c52a44c22bc80 ] The unimac requires the PHY RX clk during reset or it may be put into a bad state. Bring up the unimac after link up to ensure the PHY RX clk exists. Fixes: 490cb412007d ("net: bcmasp: Add support for ASP2.0 Ethernet controller") Signed-off-by: Justin Chen Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- .../net/ethernet/broadcom/asp2/bcmasp_intf.c | 28 +++++++++++++------ 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c index 9cae5a309000..b3d04f49f77e 100644 --- a/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c +++ b/drivers/net/ethernet/broadcom/asp2/bcmasp_intf.c @@ -391,7 +391,9 @@ static void umac_reset(struct bcmasp_intf *intf) umac_wl(intf, 0x0, UMC_CMD); umac_wl(intf, UMC_CMD_SW_RESET, UMC_CMD); usleep_range(10, 100); - umac_wl(intf, 0x0, UMC_CMD); + /* We hold the umac in reset and bring it out of + * reset when phy link is up. + */ } static void umac_set_hw_addr(struct bcmasp_intf *intf, @@ -411,6 +413,8 @@ static void umac_enable_set(struct bcmasp_intf *intf, u32 mask, u32 reg; reg = umac_rl(intf, UMC_CMD); + if (reg & UMC_CMD_SW_RESET) + return; if (enable) reg |= mask; else @@ -429,7 +433,6 @@ static void umac_init(struct bcmasp_intf *intf) umac_wl(intf, 0x800, UMC_FRM_LEN); umac_wl(intf, 0xffff, UMC_PAUSE_CNTRL); umac_wl(intf, 0x800, UMC_RX_MAX_PKT_SZ); - umac_enable_set(intf, UMC_CMD_PROMISC, 1); } static int bcmasp_tx_poll(struct napi_struct *napi, int budget) @@ -656,6 +659,12 @@ static void bcmasp_adj_link(struct net_device *dev) UMC_CMD_HD_EN | UMC_CMD_RX_PAUSE_IGNORE | UMC_CMD_TX_PAUSE_IGNORE); reg |= cmd_bits; + if (reg & UMC_CMD_SW_RESET) { + reg &= ~UMC_CMD_SW_RESET; + umac_wl(intf, reg, UMC_CMD); + udelay(2); + reg |= UMC_CMD_TX_EN | UMC_CMD_RX_EN | UMC_CMD_PROMISC; + } umac_wl(intf, reg, UMC_CMD); intf->eee.eee_active = phy_init_eee(phydev, 0) >= 0; @@ -1061,9 +1070,6 @@ static int bcmasp_netif_init(struct net_device *dev, bool phy_connect) umac_init(intf); - /* Disable the UniMAC RX/TX */ - umac_enable_set(intf, (UMC_CMD_RX_EN | UMC_CMD_TX_EN), 0); - umac_set_hw_addr(intf, dev->dev_addr); intf->old_duplex = -1; @@ -1083,9 +1089,6 @@ static int bcmasp_netif_init(struct net_device *dev, bool phy_connect) bcmasp_enable_rx(intf, 1); - /* Turn on UniMAC TX/RX */ - umac_enable_set(intf, (UMC_CMD_RX_EN | UMC_CMD_TX_EN), 1); - intf->crc_fwd = !!(umac_rl(intf, UMC_CMD) & UMC_CMD_CRC_FWD); bcmasp_netif_start(dev); @@ -1321,7 +1324,14 @@ static void bcmasp_suspend_to_wol(struct bcmasp_intf *intf) if (intf->wolopts & WAKE_FILTER) bcmasp_netfilt_suspend(intf); - /* UniMAC receive needs to be turned on */ + /* Bring UniMAC out of reset if needed and enable RX */ + reg = umac_rl(intf, UMC_CMD); + if (reg & UMC_CMD_SW_RESET) + reg &= ~UMC_CMD_SW_RESET; + + reg |= UMC_CMD_RX_EN | UMC_CMD_PROMISC; + umac_wl(intf, reg, UMC_CMD); + umac_enable_set(intf, UMC_CMD_RX_EN, 1); if (intf->parent->wol_irq > 0) { From 74a78a00db8fe0b1a777e875ffd923e6aaa7a1d5 Mon Sep 17 00:00:00 2001 From: Raju Lakkaraju Date: Tue, 26 Mar 2024 12:28:05 +0530 Subject: [PATCH 039/256] net: lan743x: Add set RFE read fifo threshold for PCI1x1x chips [ Upstream commit e4a58989f5c839316ac63675e8800b9eed7dbe96 ] PCI11x1x Rev B0 devices might drop packets when receiving back to back frames at 2.5G link speed. Change the B0 Rev device's Receive filtering Engine FIFO threshold parameter from its hardware default of 4 to 3 dwords to prevent the problem. Rev C0 and later hardware already defaults to 3 dwords. Fixes: bb4f6bffe33c ("net: lan743x: Add PCI11010 / PCI11414 device IDs") Signed-off-by: Raju Lakkaraju Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240326065805.686128-1-Raju.Lakkaraju@microchip.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/microchip/lan743x_main.c | 18 ++++++++++++++++++ drivers/net/ethernet/microchip/lan743x_main.h | 4 ++++ 2 files changed, 22 insertions(+) diff --git a/drivers/net/ethernet/microchip/lan743x_main.c b/drivers/net/ethernet/microchip/lan743x_main.c index c81cdeb4d4e7..0b6174748d2b 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.c +++ b/drivers/net/ethernet/microchip/lan743x_main.c @@ -25,6 +25,8 @@ #define PCS_POWER_STATE_DOWN 0x6 #define PCS_POWER_STATE_UP 0x4 +#define RFE_RD_FIFO_TH_3_DWORDS 0x3 + static void pci11x1x_strap_get_status(struct lan743x_adapter *adapter) { u32 chip_rev; @@ -3223,6 +3225,21 @@ static void lan743x_full_cleanup(struct lan743x_adapter *adapter) lan743x_pci_cleanup(adapter); } +static void pci11x1x_set_rfe_rd_fifo_threshold(struct lan743x_adapter *adapter) +{ + u16 rev = adapter->csr.id_rev & ID_REV_CHIP_REV_MASK_; + + if (rev == ID_REV_CHIP_REV_PCI11X1X_B0_) { + u32 misc_ctl; + + misc_ctl = lan743x_csr_read(adapter, MISC_CTL_0); + misc_ctl &= ~MISC_CTL_0_RFE_READ_FIFO_MASK_; + misc_ctl |= FIELD_PREP(MISC_CTL_0_RFE_READ_FIFO_MASK_, + RFE_RD_FIFO_TH_3_DWORDS); + lan743x_csr_write(adapter, MISC_CTL_0, misc_ctl); + } +} + static int lan743x_hardware_init(struct lan743x_adapter *adapter, struct pci_dev *pdev) { @@ -3238,6 +3255,7 @@ static int lan743x_hardware_init(struct lan743x_adapter *adapter, pci11x1x_strap_get_status(adapter); spin_lock_init(&adapter->eth_syslock_spinlock); mutex_init(&adapter->sgmii_rw_lock); + pci11x1x_set_rfe_rd_fifo_threshold(adapter); } else { adapter->max_tx_channels = LAN743X_MAX_TX_CHANNELS; adapter->used_tx_channels = LAN743X_USED_TX_CHANNELS; diff --git a/drivers/net/ethernet/microchip/lan743x_main.h b/drivers/net/ethernet/microchip/lan743x_main.h index 52609fc13ad9..f0b486f85450 100644 --- a/drivers/net/ethernet/microchip/lan743x_main.h +++ b/drivers/net/ethernet/microchip/lan743x_main.h @@ -26,6 +26,7 @@ #define ID_REV_CHIP_REV_MASK_ (0x0000FFFF) #define ID_REV_CHIP_REV_A0_ (0x00000000) #define ID_REV_CHIP_REV_B0_ (0x00000010) +#define ID_REV_CHIP_REV_PCI11X1X_B0_ (0x000000B0) #define FPGA_REV (0x04) #define FPGA_REV_GET_MINOR_(fpga_rev) (((fpga_rev) >> 8) & 0x000000FF) @@ -311,6 +312,9 @@ #define SGMII_CTL_LINK_STATUS_SOURCE_ BIT(8) #define SGMII_CTL_SGMII_POWER_DN_ BIT(1) +#define MISC_CTL_0 (0x920) +#define MISC_CTL_0_RFE_READ_FIFO_MASK_ GENMASK(6, 4) + /* Vendor Specific SGMII MMD details */ #define SR_VSMMD_PCS_ID1 0x0004 #define SR_VSMMD_PCS_ID2 0x0005 From 54720f68c4ad0ffb5a776e7a14b25f02242068cb Mon Sep 17 00:00:00 2001 From: Hariprasad Kelam Date: Tue, 26 Mar 2024 10:57:20 +0530 Subject: [PATCH 040/256] Octeontx2-af: fix pause frame configuration in GMP mode [ Upstream commit 40d4b4807cadd83fb3f46cc8cd67a945b5b25461 ] The Octeontx2 MAC block (CGX) has separate data paths (SMU and GMP) for different speeds, allowing for efficient data transfer. The previous patch which added pause frame configuration has a bug due to which pause frame feature is not working in GMP mode. This patch fixes the issue by configurating appropriate registers. Fixes: f7e086e754fe ("octeontx2-af: Pause frame configuration at cgx") Signed-off-by: Hariprasad Kelam Reviewed-by: Simon Horman Link: https://lore.kernel.org/r/20240326052720.4441-1-hkelam@marvell.com Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- drivers/net/ethernet/marvell/octeontx2/af/cgx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 6c18d3d2442e..2539c985f695 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -808,6 +808,11 @@ static int cgx_lmac_enadis_pause_frm(void *cgxd, int lmac_id, if (!is_lmac_valid(cgx, lmac_id)) return -ENODEV; + cfg = cgx_read(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL); + cfg &= ~CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK; + cfg |= rx_pause ? CGX_GMP_GMI_RXX_FRM_CTL_CTL_BCK : 0x0; + cgx_write(cgx, lmac_id, CGXX_GMP_GMI_RXX_FRM_CTL, cfg); + cfg = cgx_read(cgx, lmac_id, CGXX_SMUX_RX_FRM_CTL); cfg &= ~CGX_SMUX_RX_FRM_CTL_CTL_BCK; cfg |= rx_pause ? CGX_SMUX_RX_FRM_CTL_CTL_BCK : 0x0; From f4877225313d474659ee53150ccc3d553a978727 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 26 Mar 2024 11:18:41 +0100 Subject: [PATCH 041/256] inet: inet_defrag: prevent sk release while still in use [ Upstream commit 18685451fc4e546fc0e718580d32df3c0e5c8272 ] ip_local_out() and other functions can pass skb->sk as function argument. If the skb is a fragment and reassembly happens before such function call returns, the sk must not be released. This affects skb fragments reassembled via netfilter or similar modules, e.g. openvswitch or ct_act.c, when run as part of tx pipeline. Eric Dumazet made an initial analysis of this bug. Quoting Eric: Calling ip_defrag() in output path is also implying skb_orphan(), which is buggy because output path relies on sk not disappearing. A relevant old patch about the issue was : 8282f27449bf ("inet: frag: Always orphan skbs inside ip_defrag()") [..] net/ipv4/ip_output.c depends on skb->sk being set, and probably to an inet socket, not an arbitrary one. If we orphan the packet in ipvlan, then downstream things like FQ packet scheduler will not work properly. We need to change ip_defrag() to only use skb_orphan() when really needed, ie whenever frag_list is going to be used. Eric suggested to stash sk in fragment queue and made an initial patch. However there is a problem with this: If skb is refragmented again right after, ip_do_fragment() will copy head->sk to the new fragments, and sets up destructor to sock_wfree. IOW, we have no choice but to fix up sk_wmem accouting to reflect the fully reassembled skb, else wmem will underflow. This change moves the orphan down into the core, to last possible moment. As ip_defrag_offset is aliased with sk_buff->sk member, we must move the offset into the FRAG_CB, else skb->sk gets clobbered. This allows to delay the orphaning long enough to learn if the skb has to be queued or if the skb is completing the reasm queue. In the former case, things work as before, skb is orphaned. This is safe because skb gets queued/stolen and won't continue past reasm engine. In the latter case, we will steal the skb->sk reference, reattach it to the head skb, and fix up wmem accouting when inet_frag inflates truesize. Fixes: 7026b1ddb6b8 ("netfilter: Pass socket pointer down through okfn().") Diagnosed-by: Eric Dumazet Reported-by: xingwei lee Reported-by: yue sun Reported-by: syzbot+e5167d7144a62715044c@syzkaller.appspotmail.com Signed-off-by: Florian Westphal Reviewed-by: Eric Dumazet Link: https://lore.kernel.org/r/20240326101845.30836-1-fw@strlen.de Signed-off-by: Paolo Abeni Signed-off-by: Sasha Levin --- include/linux/skbuff.h | 7 +-- net/ipv4/inet_fragment.c | 70 ++++++++++++++++++++----- net/ipv4/ip_fragment.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- 4 files changed, 60 insertions(+), 21 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 2922059908cc..9e61f6df6bc5 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -736,8 +736,6 @@ typedef unsigned char *sk_buff_data_t; * @list: queue head * @ll_node: anchor in an llist (eg socket defer_list) * @sk: Socket we are owned by - * @ip_defrag_offset: (aka @sk) alternate use of @sk, used in - * fragmentation management * @dev: Device we arrived on/are leaving by * @dev_scratch: (aka @dev) alternate use of @dev when @dev would be %NULL * @cb: Control buffer. Free for use by every layer. Put private vars here @@ -860,10 +858,7 @@ struct sk_buff { struct llist_node ll_node; }; - union { - struct sock *sk; - int ip_defrag_offset; - }; + struct sock *sk; union { ktime_t tstamp; diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 7072fc0783ef..c88c9034d630 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -24,6 +24,8 @@ #include #include +#include "../core/sock_destructor.h" + /* Use skb->cb to track consecutive/adjacent fragments coming at * the end of the queue. Nodes in the rb-tree queue will * contain "runs" of one or more adjacent fragments. @@ -39,6 +41,7 @@ struct ipfrag_skb_cb { }; struct sk_buff *next_frag; int frag_run_len; + int ip_defrag_offset; }; #define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) @@ -396,12 +399,12 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, */ if (!last) fragrun_create(q, skb); /* First fragment. */ - else if (last->ip_defrag_offset + last->len < end) { + else if (FRAG_CB(last)->ip_defrag_offset + last->len < end) { /* This is the common case: skb goes to the end. */ /* Detect and discard overlaps. */ - if (offset < last->ip_defrag_offset + last->len) + if (offset < FRAG_CB(last)->ip_defrag_offset + last->len) return IPFRAG_OVERLAP; - if (offset == last->ip_defrag_offset + last->len) + if (offset == FRAG_CB(last)->ip_defrag_offset + last->len) fragrun_append_to_last(q, skb); else fragrun_create(q, skb); @@ -418,13 +421,13 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, parent = *rbn; curr = rb_to_skb(parent); - curr_run_end = curr->ip_defrag_offset + + curr_run_end = FRAG_CB(curr)->ip_defrag_offset + FRAG_CB(curr)->frag_run_len; - if (end <= curr->ip_defrag_offset) + if (end <= FRAG_CB(curr)->ip_defrag_offset) rbn = &parent->rb_left; else if (offset >= curr_run_end) rbn = &parent->rb_right; - else if (offset >= curr->ip_defrag_offset && + else if (offset >= FRAG_CB(curr)->ip_defrag_offset && end <= curr_run_end) return IPFRAG_DUP; else @@ -438,7 +441,7 @@ int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, rb_insert_color(&skb->rbnode, &q->rb_fragments); } - skb->ip_defrag_offset = offset; + FRAG_CB(skb)->ip_defrag_offset = offset; return IPFRAG_OK; } @@ -448,13 +451,28 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, struct sk_buff *parent) { struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); - struct sk_buff **nextp; + void (*destructor)(struct sk_buff *); + unsigned int orig_truesize = 0; + struct sk_buff **nextp = NULL; + struct sock *sk = skb->sk; int delta; + if (sk && is_skb_wmem(skb)) { + /* TX: skb->sk might have been passed as argument to + * dst->output and must remain valid until tx completes. + * + * Move sk to reassembled skb and fix up wmem accounting. + */ + orig_truesize = skb->truesize; + destructor = skb->destructor; + } + if (head != skb) { fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - return NULL; + if (!fp) { + head = skb; + goto out_restore_sk; + } FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; if (RB_EMPTY_NODE(&skb->rbnode)) FRAG_CB(parent)->next_frag = fp; @@ -463,6 +481,12 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, &q->rb_fragments); if (q->fragments_tail == skb) q->fragments_tail = fp; + + if (orig_truesize) { + /* prevent skb_morph from releasing sk */ + skb->sk = NULL; + skb->destructor = NULL; + } skb_morph(skb, head); FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; rb_replace_node(&head->rbnode, &skb->rbnode, @@ -470,13 +494,13 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, consume_skb(head); head = skb; } - WARN_ON(head->ip_defrag_offset != 0); + WARN_ON(FRAG_CB(head)->ip_defrag_offset != 0); delta = -head->truesize; /* Head of list must not be cloned. */ if (skb_unclone(head, GFP_ATOMIC)) - return NULL; + goto out_restore_sk; delta += head->truesize; if (delta) @@ -492,7 +516,7 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, clone = alloc_skb(0, GFP_ATOMIC); if (!clone) - return NULL; + goto out_restore_sk; skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; skb_frag_list_init(head); for (i = 0; i < skb_shinfo(head)->nr_frags; i++) @@ -509,6 +533,21 @@ void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, nextp = &skb_shinfo(head)->frag_list; } +out_restore_sk: + if (orig_truesize) { + int ts_delta = head->truesize - orig_truesize; + + /* if this reassembled skb is fragmented later, + * fraglist skbs will get skb->sk assigned from head->sk, + * and each frag skb will be released via sock_wfree. + * + * Update sk_wmem_alloc. + */ + head->sk = sk; + head->destructor = destructor; + refcount_add(ts_delta, &sk->sk_wmem_alloc); + } + return nextp; } EXPORT_SYMBOL(inet_frag_reasm_prepare); @@ -516,6 +555,8 @@ EXPORT_SYMBOL(inet_frag_reasm_prepare); void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, void *reasm_data, bool try_coalesce) { + struct sock *sk = is_skb_wmem(head) ? head->sk : NULL; + const unsigned int head_truesize = head->truesize; struct sk_buff **nextp = reasm_data; struct rb_node *rbn; struct sk_buff *fp; @@ -579,6 +620,9 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, head->prev = NULL; head->tstamp = q->stamp; head->mono_delivery_time = q->mono_delivery_time; + + if (sk) + refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); } EXPORT_SYMBOL(inet_frag_reasm_finish); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index a4941f53b523..fb947d1613fe 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -384,6 +384,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: @@ -487,7 +488,6 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user) struct ipq *qp; __IP_INC_STATS(net, IPSTATS_MIB_REASMREQDS); - skb_orphan(skb); /* Lookup (or create) queue header */ qp = ip_find(net, ip_hdr(skb), user, vif); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index b2dd48911c8d..efbec7ee27d0 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -294,6 +294,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, } skb_dst_drop(skb); + skb_orphan(skb); return -EINPROGRESS; insert_error: @@ -469,7 +470,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); - skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); if (fq == NULL) { From 0a9901fdb7bb785ec4975aeeebc1428e3abae172 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Wed, 16 Aug 2023 14:42:05 -0700 Subject: [PATCH 042/256] drm/i915/dg2: Drop pre-production GT workarounds [ Upstream commit eaeb4b3614529bfa8a7edfdd7ecf6977b27f18b2 ] DG2 first production steppings were C0 (for DG2-G10), B1 (for DG2-G11), and A1 (for DG2-G12). Several workarounds that apply onto to pre-production hardware can be dropped. Furthermore, several workarounds that apply to all production steppings can have their conditions simplified to no longer check the GT stepping. v2: - Keep Wa_16011777198 in place for now; it will be removed separately in a follow-up patch to keep review easier. Bspec: 44477 Signed-off-by: Matt Roper Acked-by: Jani Nikula Reviewed-by: Matt Atwood Link: https://patchwork.freedesktop.org/patch/msgid/20230816214201.534095-10-matthew.d.roper@intel.com Stable-dep-of: 186bce682772 ("drm/i915/mtl: Update workaround 14018575942") Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/intel_lrc.c | 34 +--- drivers/gpu/drm/i915/gt/intel_mocs.c | 21 +- drivers/gpu/drm/i915/gt/intel_rc6.c | 6 +- drivers/gpu/drm/i915/gt/intel_workarounds.c | 211 +------------------- drivers/gpu/drm/i915/gt/uc/intel_guc.c | 20 +- drivers/gpu/drm/i915/intel_clock_gating.c | 8 - 6 files changed, 21 insertions(+), 279 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index c378cc7c953c..f297c5808e7c 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1316,29 +1316,6 @@ gen12_emit_cmd_buf_wa(const struct intel_context *ce, u32 *cs) return cs; } -/* - * On DG2 during context restore of a preempted context in GPGPU mode, - * RCS restore hang is detected. This is extremely timing dependent. - * To address this below sw wabb is implemented for DG2 A steppings. - */ -static u32 * -dg2_emit_rcs_hang_wabb(const struct intel_context *ce, u32 *cs) -{ - *cs++ = MI_LOAD_REGISTER_IMM(1); - *cs++ = i915_mmio_reg_offset(GEN12_STATE_ACK_DEBUG(ce->engine->mmio_base)); - *cs++ = 0x21; - - *cs++ = MI_LOAD_REGISTER_REG; - *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base)); - *cs++ = i915_mmio_reg_offset(XEHP_CULLBIT1); - - *cs++ = MI_LOAD_REGISTER_REG; - *cs++ = i915_mmio_reg_offset(RING_NOPID(ce->engine->mmio_base)); - *cs++ = i915_mmio_reg_offset(XEHP_CULLBIT2); - - return cs; -} - /* * The bspec's tuning guide asks us to program a vertical watermark value of * 0x3FF. However this register is not saved/restored properly by the @@ -1363,14 +1340,8 @@ gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) cs = gen12_emit_cmd_buf_wa(ce, cs); cs = gen12_emit_restore_scratch(ce, cs); - /* Wa_22011450934:dg2 */ - if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_A0, STEP_B0) || - IS_DG2_GRAPHICS_STEP(ce->engine->i915, G11, STEP_A0, STEP_B0)) - cs = dg2_emit_rcs_hang_wabb(ce, cs); - /* Wa_16013000631:dg2 */ - if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) || - IS_DG2_G11(ce->engine->i915)) + if (IS_DG2_G11(ce->engine->i915)) cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, 0); cs = gen12_emit_aux_table_inv(ce->engine, cs); @@ -1391,8 +1362,7 @@ gen12_emit_indirect_ctx_xcs(const struct intel_context *ce, u32 *cs) cs = gen12_emit_restore_scratch(ce, cs); /* Wa_16013000631:dg2 */ - if (IS_DG2_GRAPHICS_STEP(ce->engine->i915, G10, STEP_B0, STEP_C0) || - IS_DG2_G11(ce->engine->i915)) + if (IS_DG2_G11(ce->engine->i915)) if (ce->engine->class == COMPUTE_CLASS) cs = gen8_emit_pipe_control(cs, PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE, diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index 2c014407225c..bf8b42d2d327 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -404,18 +404,6 @@ static const struct drm_i915_mocs_entry dg2_mocs_table[] = { MOCS_ENTRY(3, 0, L3_3_WB | L3_LKUP(1)), }; -static const struct drm_i915_mocs_entry dg2_mocs_table_g10_ax[] = { - /* Wa_14011441408: Set Go to Memory for MOCS#0 */ - MOCS_ENTRY(0, 0, L3_1_UC | L3_GLBGO(1) | L3_LKUP(1)), - /* UC - Coherent; GO:Memory */ - MOCS_ENTRY(1, 0, L3_1_UC | L3_GLBGO(1) | L3_LKUP(1)), - /* UC - Non-Coherent; GO:Memory */ - MOCS_ENTRY(2, 0, L3_1_UC | L3_GLBGO(1)), - - /* WB - LC */ - MOCS_ENTRY(3, 0, L3_3_WB | L3_LKUP(1)), -}; - static const struct drm_i915_mocs_entry pvc_mocs_table[] = { /* Error */ MOCS_ENTRY(0, 0, L3_3_WB), @@ -521,13 +509,8 @@ static unsigned int get_mocs_settings(const struct drm_i915_private *i915, table->wb_index = 2; table->unused_entries_index = 2; } else if (IS_DG2(i915)) { - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) { - table->size = ARRAY_SIZE(dg2_mocs_table_g10_ax); - table->table = dg2_mocs_table_g10_ax; - } else { - table->size = ARRAY_SIZE(dg2_mocs_table); - table->table = dg2_mocs_table; - } + table->size = ARRAY_SIZE(dg2_mocs_table); + table->table = dg2_mocs_table; table->uc_index = 1; table->n_entries = GEN9_NUM_MOCS_ENTRIES; table->unused_entries_index = 3; diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index ccdc1afbf11b..b8c9338176bd 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -118,14 +118,12 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6) GEN6_RC_CTL_EI_MODE(1); /* - * Wa_16011777198 and BSpec 52698 - Render powergating must be off. + * BSpec 52698 - Render powergating must be off. * FIXME BSpec is outdated, disabling powergating for MTL is just * temporary wa and should be removed after fixing real cause * of forcewake timeouts. */ - if (IS_METEORLAKE(gt->i915) || - IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_C0) || - IS_DG2_GRAPHICS_STEP(gt->i915, G11, STEP_A0, STEP_B0)) + if (IS_METEORLAKE(gt->i915)) pg_enable = GEN9_MEDIA_PG_ENABLE | GEN11_MEDIA_SAMPLER_PG_ENABLE; diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 3ae0dbd39eaa..7b426f3015b3 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -764,39 +764,15 @@ static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine, { dg2_ctx_gt_tuning_init(engine, wal); - /* Wa_16011186671:dg2_g11 */ - if (IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0)) { - wa_mcr_masked_dis(wal, VFLSKPD, DIS_MULT_MISS_RD_SQUASH); - wa_mcr_masked_en(wal, VFLSKPD, DIS_OVER_FETCH_CACHE); - } - - if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) { - /* Wa_14010469329:dg2_g10 */ - wa_mcr_masked_en(wal, XEHP_COMMON_SLICE_CHICKEN3, - XEHP_DUAL_SIMD8_SEQ_MERGE_DISABLE); - - /* - * Wa_22010465075:dg2_g10 - * Wa_22010613112:dg2_g10 - * Wa_14010698770:dg2_g10 - */ - wa_mcr_masked_en(wal, XEHP_COMMON_SLICE_CHICKEN3, - GEN12_DISABLE_CPS_AWARE_COLOR_PIPE); - } - /* Wa_16013271637:dg2 */ wa_mcr_masked_en(wal, XEHP_SLICE_COMMON_ECO_CHICKEN1, MSC_MSAA_REODER_BUF_BYPASS_DISABLE); /* Wa_14014947963:dg2 */ - if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_B0, STEP_FOREVER) || - IS_DG2_G11(engine->i915) || IS_DG2_G12(engine->i915)) - wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000); + wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000); /* Wa_18018764978:dg2 */ - if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_C0, STEP_FOREVER) || - IS_DG2_G11(engine->i915) || IS_DG2_G12(engine->i915)) - wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL); + wa_mcr_masked_en(wal, XEHP_PSS_MODE2, SCOREBOARD_STALL_FLUSH_CONTROL); /* Wa_15010599737:dg2 */ wa_mcr_masked_en(wal, CHICKEN_RASTER_1, DIS_SF_ROUND_NEAREST_EVEN); @@ -1606,31 +1582,11 @@ xehpsdv_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) static void dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) { - struct intel_engine_cs *engine; - int id; - xehp_init_mcr(gt, wal); /* Wa_14011060649:dg2 */ wa_14011060649(gt, wal); - /* - * Although there are per-engine instances of these registers, - * they technically exist outside the engine itself and are not - * impacted by engine resets. Furthermore, they're part of the - * GuC blacklist so trying to treat them as engine workarounds - * will result in GuC initialization failure and a wedged GPU. - */ - for_each_engine(engine, gt, id) { - if (engine->class != VIDEO_DECODE_CLASS) - continue; - - /* Wa_16010515920:dg2_g10 */ - if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0)) - wa_write_or(wal, VDBOX_CGCTL3F18(engine->mmio_base), - ALNUNIT_CLKGATE_DIS); - } - if (IS_DG2_G10(gt->i915)) { /* Wa_22010523718:dg2 */ wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, @@ -1641,65 +1597,6 @@ dg2_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) DSS_ROUTER_CLKGATE_DIS); } - if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0) || - IS_DG2_GRAPHICS_STEP(gt->i915, G11, STEP_A0, STEP_B0)) { - /* Wa_14012362059:dg2 */ - wa_mcr_write_or(wal, XEHP_MERT_MOD_CTRL, FORCE_MISS_FTLB); - } - - if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0)) { - /* Wa_14010948348:dg2_g10 */ - wa_write_or(wal, UNSLCGCTL9430, MSQDUNIT_CLKGATE_DIS); - - /* Wa_14011037102:dg2_g10 */ - wa_write_or(wal, UNSLCGCTL9444, LTCDD_CLKGATE_DIS); - - /* Wa_14011371254:dg2_g10 */ - wa_mcr_write_or(wal, XEHP_SLICE_UNIT_LEVEL_CLKGATE, NODEDSS_CLKGATE_DIS); - - /* Wa_14011431319:dg2_g10 */ - wa_write_or(wal, UNSLCGCTL9440, GAMTLBOACS_CLKGATE_DIS | - GAMTLBVDBOX7_CLKGATE_DIS | - GAMTLBVDBOX6_CLKGATE_DIS | - GAMTLBVDBOX5_CLKGATE_DIS | - GAMTLBVDBOX4_CLKGATE_DIS | - GAMTLBVDBOX3_CLKGATE_DIS | - GAMTLBVDBOX2_CLKGATE_DIS | - GAMTLBVDBOX1_CLKGATE_DIS | - GAMTLBVDBOX0_CLKGATE_DIS | - GAMTLBKCR_CLKGATE_DIS | - GAMTLBGUC_CLKGATE_DIS | - GAMTLBBLT_CLKGATE_DIS); - wa_write_or(wal, UNSLCGCTL9444, GAMTLBGFXA0_CLKGATE_DIS | - GAMTLBGFXA1_CLKGATE_DIS | - GAMTLBCOMPA0_CLKGATE_DIS | - GAMTLBCOMPA1_CLKGATE_DIS | - GAMTLBCOMPB0_CLKGATE_DIS | - GAMTLBCOMPB1_CLKGATE_DIS | - GAMTLBCOMPC0_CLKGATE_DIS | - GAMTLBCOMPC1_CLKGATE_DIS | - GAMTLBCOMPD0_CLKGATE_DIS | - GAMTLBCOMPD1_CLKGATE_DIS | - GAMTLBMERT_CLKGATE_DIS | - GAMTLBVEBOX3_CLKGATE_DIS | - GAMTLBVEBOX2_CLKGATE_DIS | - GAMTLBVEBOX1_CLKGATE_DIS | - GAMTLBVEBOX0_CLKGATE_DIS); - - /* Wa_14010569222:dg2_g10 */ - wa_write_or(wal, UNSLICE_UNIT_LEVEL_CLKGATE, - GAMEDIA_CLKGATE_DIS); - - /* Wa_14011028019:dg2_g10 */ - wa_mcr_write_or(wal, SSMCGCTL9530, RTFUNIT_CLKGATE_DIS); - - /* Wa_14010680813:dg2_g10 */ - wa_mcr_write_or(wal, XEHP_GAMSTLB_CTRL, - CONTROL_BLOCK_CLKGATE_DIS | - EGRESS_BLOCK_CLKGATE_DIS | - TAG_BLOCK_CLKGATE_DIS); - } - /* Wa_14014830051:dg2 */ wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN); @@ -2242,29 +2139,10 @@ static void dg2_whitelist_build(struct intel_engine_cs *engine) switch (engine->class) { case RENDER_CLASS: - /* - * Wa_1507100340:dg2_g10 - * - * This covers 4 registers which are next to one another : - * - PS_INVOCATION_COUNT - * - PS_INVOCATION_COUNT_UDW - * - PS_DEPTH_COUNT - * - PS_DEPTH_COUNT_UDW - */ - if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) - whitelist_reg_ext(w, PS_INVOCATION_COUNT, - RING_FORCE_TO_NONPRIV_ACCESS_RD | - RING_FORCE_TO_NONPRIV_RANGE_4); - /* Required by recommended tuning setting (not a workaround) */ whitelist_mcr_reg(w, XEHP_COMMON_SLICE_CHICKEN3); break; - case COMPUTE_CLASS: - /* Wa_16011157294:dg2_g10 */ - if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_B0)) - whitelist_reg(w, GEN9_CTX_PREEMPT_REG); - break; default: break; } @@ -2415,12 +2293,6 @@ engine_fake_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) } } -static bool needs_wa_1308578152(struct intel_engine_cs *engine) -{ - return intel_sseu_find_first_xehp_dss(&engine->gt->info.sseu, 0, 0) >= - GEN_DSS_PER_GSLICE; -} - static void rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { @@ -2435,42 +2307,20 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0) || - IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) || - IS_DG2_G11(i915) || IS_DG2_G12(i915)) { + IS_DG2(i915)) { /* Wa_1509727124 */ wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE, SC_DISABLE_POWER_OPTIMIZATION_EBB); } - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) || - IS_DG2_G11(i915) || IS_DG2_G12(i915) || - IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0)) { + if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || + IS_DG2(i915)) { /* Wa_22012856258 */ wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_DISABLE_READ_SUPPRESSION); } - if (IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) { - /* Wa_14013392000:dg2_g11 */ - wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, GEN12_ENABLE_LARGE_GRF_MODE); - } - - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0) || - IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) { - /* Wa_14012419201:dg2 */ - wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, - GEN12_DISABLE_HDR_PAST_PAYLOAD_HOLD_FIX); - } - - /* Wa_1308578152:dg2_g10 when first gslice is fused off */ - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) && - needs_wa_1308578152(engine)) { - wa_masked_dis(wal, GEN12_CS_DEBUG_MODE1_CCCSUNIT_BE_COMMON, - GEN12_REPLAY_MODE_GRANULARITY); - } - - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) || - IS_DG2_G11(i915) || IS_DG2_G12(i915)) { + if (IS_DG2(i915)) { /* * Wa_22010960976:dg2 * Wa_14013347512:dg2 @@ -2479,34 +2329,7 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) LSC_L1_FLUSH_CTL_3D_DATAPORT_FLUSH_EVENTS_MASK); } - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) { - /* - * Wa_1608949956:dg2_g10 - * Wa_14010198302:dg2_g10 - */ - wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN, - MDQ_ARBITRATION_MODE | UGM_BACKUP_MODE); - } - - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) - /* Wa_22010430635:dg2 */ - wa_mcr_masked_en(wal, - GEN9_ROW_CHICKEN4, - GEN12_DISABLE_GRF_CLEAR); - - /* Wa_14013202645:dg2 */ - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) || - IS_DG2_GRAPHICS_STEP(i915, G11, STEP_A0, STEP_B0)) - wa_mcr_write_or(wal, RT_CTRL, DIS_NULL_QUERY); - - /* Wa_22012532006:dg2 */ - if (IS_DG2_GRAPHICS_STEP(engine->i915, G10, STEP_A0, STEP_C0) || - IS_DG2_GRAPHICS_STEP(engine->i915, G11, STEP_A0, STEP_B0)) - wa_mcr_masked_en(wal, GEN9_HALF_SLICE_CHICKEN7, - DG2_DISABLE_ROUND_ENABLE_ALLOW_FOR_SSLA); - - if (IS_DG2_GRAPHICS_STEP(i915, G11, STEP_B0, STEP_FOREVER) || - IS_DG2_G10(i915)) { + if (IS_DG2_G11(i915) || IS_DG2_G10(i915)) { /* Wa_22014600077:dg2 */ wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0, _MASKED_BIT_ENABLE(ENABLE_EU_COUNT_FOR_TDL_FLUSH), @@ -3050,8 +2873,7 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0) || - IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_FOREVER) || - IS_DG2_G11(i915) || IS_DG2_G12(i915)) { + IS_DG2(i915)) { /* Wa_22013037850 */ wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DISABLE_128B_EVICTION_COMMAND_UDW); @@ -3072,8 +2894,7 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE); } - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_B0, STEP_C0) || - IS_DG2_G11(i915)) { + if (IS_DG2_G11(i915)) { /* * Wa_22012826095:dg2 * Wa_22013059131:dg2 @@ -3087,18 +2908,6 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li FORCE_1_SUB_MESSAGE_PER_FRAGMENT); } - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) { - /* - * Wa_14010918519:dg2_g10 - * - * LSC_CHICKEN_BIT_0 always reads back as 0 is this stepping, - * so ignoring verification. - */ - wa_mcr_add(wal, LSC_CHICKEN_BIT_0_UDW, 0, - FORCE_SLM_FENCE_SCOPE_TO_TILE | FORCE_UGM_FENCE_SCOPE_TO_TILE, - 0, false); - } - if (IS_XEHPSDV(i915)) { /* Wa_1409954639 */ wa_mcr_masked_en(wal, @@ -3131,7 +2940,7 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8); } - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_C0) || IS_DG2_G11(i915)) + if (IS_DG2_G11(i915)) /* * Wa_22012654132 * diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c index 569b5fe94c41..82a2ecc12b21 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -272,18 +272,14 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc) GRAPHICS_VER_FULL(gt->i915) < IP_VER(12, 50)) flags |= GUC_WA_POLLCS; - /* Wa_16011759253:dg2_g10:a0 */ - if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_B0)) - flags |= GUC_WA_GAM_CREDITS; - /* Wa_14014475959 */ if (IS_MTL_GRAPHICS_STEP(gt->i915, M, STEP_A0, STEP_B0) || IS_DG2(gt->i915)) flags |= GUC_WA_HOLD_CCS_SWITCHOUT; /* - * Wa_14012197797:dg2_g10:a0,dg2_g11:a0 - * Wa_22011391025:dg2_g10,dg2_g11,dg2_g12 + * Wa_14012197797 + * Wa_22011391025 * * The same WA bit is used for both and 22011391025 is applicable to * all DG2. @@ -297,17 +293,11 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc) GRAPHICS_VER_FULL(gt->i915) < IP_VER(12, 70))) flags |= GUC_WA_PRE_PARSER; - /* Wa_16011777198:dg2 */ - if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_C0) || - IS_DG2_GRAPHICS_STEP(gt->i915, G11, STEP_A0, STEP_B0)) - flags |= GUC_WA_RCS_RESET_BEFORE_RC6; - /* - * Wa_22012727170:dg2_g10[a0-c0), dg2_g11[a0..) - * Wa_22012727685:dg2_g11[a0..) + * Wa_22012727170 + * Wa_22012727685 */ - if (IS_DG2_GRAPHICS_STEP(gt->i915, G10, STEP_A0, STEP_C0) || - IS_DG2_GRAPHICS_STEP(gt->i915, G11, STEP_A0, STEP_FOREVER)) + if (IS_DG2_G11(gt->i915)) flags |= GUC_WA_CONTEXT_ISOLATION; /* Wa_16015675438 */ diff --git a/drivers/gpu/drm/i915/intel_clock_gating.c b/drivers/gpu/drm/i915/intel_clock_gating.c index 81a4d32734e9..c66eb6abd4a2 100644 --- a/drivers/gpu/drm/i915/intel_clock_gating.c +++ b/drivers/gpu/drm/i915/intel_clock_gating.c @@ -396,14 +396,6 @@ static void dg2_init_clock_gating(struct drm_i915_private *i915) /* Wa_22010954014:dg2 */ intel_uncore_rmw(&i915->uncore, XEHP_CLOCK_GATE_DIS, 0, SGSI_SIDECLK_DIS); - - /* - * Wa_14010733611:dg2_g10 - * Wa_22010146351:dg2_g10 - */ - if (IS_DG2_GRAPHICS_STEP(i915, G10, STEP_A0, STEP_B0)) - intel_uncore_rmw(&i915->uncore, XEHP_CLOCK_GATE_DIS, 0, - SGR_DIS | SGGI_DIS); } static void pvc_init_clock_gating(struct drm_i915_private *i915) From 6b25099eea4b65ba3b750ce49fa1a9a13d158046 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Wed, 16 Aug 2023 14:42:06 -0700 Subject: [PATCH 043/256] drm/i915: Tidy workaround definitions [ Upstream commit f1c805716516f9e648e13f0108cea8096e0c7023 ] Removal of the DG2 pre-production workarounds has left duplicate condition blocks in a couple places, as well as some inconsistent platform ordering. Reshuffle and consolidate some of the workarounds to reduce the number of condition blocks and to more consistently follow the "newest platform first" convention. Code movement only; no functional change. Signed-off-by: Matt Roper Acked-by: Jani Nikula Reviewed-by: Matt Atwood Link: https://patchwork.freedesktop.org/patch/msgid/20230816214201.534095-11-matthew.d.roper@intel.com Stable-dep-of: 186bce682772 ("drm/i915/mtl: Update workaround 14018575942") Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/intel_workarounds.c | 100 +++++++++----------- 1 file changed, 46 insertions(+), 54 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 7b426f3015b3..69973dc51828 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -2337,6 +2337,19 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) true); } + if (IS_DG2(i915) || IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || + IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { + /* + * Wa_1606700617:tgl,dg1,adl-p + * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p + * Wa_14010826681:tgl,dg1,rkl,adl-p + * Wa_18019627453:dg2 + */ + wa_masked_en(wal, + GEN9_CS_DEBUG_MODE1, + FF_DOP_CLOCK_GATE_DISABLE); + } + if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { /* Wa_1606931601:tgl,rkl,dg1,adl-s,adl-p */ @@ -2350,19 +2363,11 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) */ wa_write_or(wal, GEN7_FF_THREAD_MODE, GEN12_FF_TESSELATION_DOP_GATE_DISABLE); - } - if (IS_ALDERLAKE_P(i915) || IS_DG2(i915) || IS_ALDERLAKE_S(i915) || - IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915)) { - /* - * Wa_1606700617:tgl,dg1,adl-p - * Wa_22010271021:tgl,rkl,dg1,adl-s,adl-p - * Wa_14010826681:tgl,dg1,rkl,adl-p - * Wa_18019627453:dg2 - */ - wa_masked_en(wal, - GEN9_CS_DEBUG_MODE1, - FF_DOP_CLOCK_GATE_DISABLE); + /* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */ + wa_mcr_masked_en(wal, + GEN10_SAMPLER_MODE, + ENABLE_SMALLPL); } if (IS_ALDERLAKE_P(i915) || IS_ALDERLAKE_S(i915) || @@ -2389,14 +2394,6 @@ rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) GEN8_RC_SEMA_IDLE_MSG_DISABLE); } - if (IS_DG1(i915) || IS_ROCKETLAKE(i915) || IS_TIGERLAKE(i915) || - IS_ALDERLAKE_S(i915) || IS_ALDERLAKE_P(i915)) { - /* Wa_1406941453:tgl,rkl,dg1,adl-s,adl-p */ - wa_mcr_masked_en(wal, - GEN10_SAMPLER_MODE, - ENABLE_SMALLPL); - } - if (GRAPHICS_VER(i915) == 11) { /* This is not an Wa. Enable for better image quality */ wa_masked_en(wal, @@ -2877,6 +2874,9 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li /* Wa_22013037850 */ wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DISABLE_128B_EVICTION_COMMAND_UDW); + + /* Wa_18017747507 */ + wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE); } if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || @@ -2887,11 +2887,20 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, DISABLE_D8_D16_COASLESCE); } - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0) || - IS_DG2(i915)) { - /* Wa_18017747507 */ - wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE); + if (IS_PONTEVECCHIO(i915) || IS_DG2(i915)) { + /* Wa_14015227452:dg2,pvc */ + wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE); + + /* Wa_16015675438:dg2,pvc */ + wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE); + } + + if (IS_DG2(i915)) { + /* + * Wa_16011620976:dg2_g11 + * Wa_22015475538:dg2 + */ + wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8); } if (IS_DG2_G11(i915)) { @@ -2906,6 +2915,18 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li /* Wa_22013059131:dg2 */ wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0, FORCE_1_SUB_MESSAGE_PER_FRAGMENT); + + /* + * Wa_22012654132 + * + * Note that register 0xE420 is write-only and cannot be read + * back for verification on DG2 (due to Wa_14012342262), so + * we need to explicitly skip the readback. + */ + wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0, + _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC), + 0 /* write-only, so skip validation */, + true); } if (IS_XEHPSDV(i915)) { @@ -2923,35 +2944,6 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li wa_mcr_masked_en(wal, GEN8_HALF_SLICE_CHICKEN1, GEN7_PSD_SINGLE_PORT_DISPATCH_ENABLE); } - - if (IS_DG2(i915) || IS_PONTEVECCHIO(i915)) { - /* Wa_14015227452:dg2,pvc */ - wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN4, XEHP_DIS_BBL_SYSPIPE); - - /* Wa_16015675438:dg2,pvc */ - wa_masked_en(wal, FF_SLICE_CS_CHICKEN2, GEN12_PERF_FIX_BALANCING_CFE_DISABLE); - } - - if (IS_DG2(i915)) { - /* - * Wa_16011620976:dg2_g11 - * Wa_22015475538:dg2 - */ - wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, DIS_CHAIN_2XSIMD8); - } - - if (IS_DG2_G11(i915)) - /* - * Wa_22012654132 - * - * Note that register 0xE420 is write-only and cannot be read - * back for verification on DG2 (due to Wa_14012342262), so - * we need to explicitly skip the readback. - */ - wa_mcr_add(wal, GEN10_CACHE_MODE_SS, 0, - _MASKED_BIT_ENABLE(ENABLE_PREFETCH_INTO_IC), - 0 /* write-only, so skip validation */, - true); } static void From 67f7fba8a08608cfd42ab354b79df56e9fee8856 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 21 Aug 2023 11:06:21 -0700 Subject: [PATCH 044/256] drm/i915: Consolidate condition for Wa_22011802037 [ Upstream commit 28c46feec7f8760683ef08f12746630a3598173e ] The workaround bounds for Wa_22011802037 are somewhat complex and are replicated in several places throughout the code. Pull the condition out to a helper function to prevent mistakes if this condition needs to change again in the future. Signed-off-by: Matt Roper Reviewed-by: Gustavo Sousa Link: https://patchwork.freedesktop.org/patch/msgid/20230821180619.650007-12-matthew.d.roper@intel.com Stable-dep-of: 186bce682772 ("drm/i915/mtl: Update workaround 14018575942") Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/intel_engine_cs.c | 4 +--- .../drm/i915/gt/intel_execlists_submission.c | 4 +--- drivers/gpu/drm/i915/gt/intel_reset.c | 18 ++++++++++++++++++ drivers/gpu/drm/i915/gt/intel_reset.h | 2 ++ drivers/gpu/drm/i915/gt/uc/intel_guc.c | 4 +--- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 4 +--- 6 files changed, 24 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_cs.c b/drivers/gpu/drm/i915/gt/intel_engine_cs.c index e85d70a62123..84a75c95f3f7 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_cs.c @@ -1616,9 +1616,7 @@ static int __intel_engine_stop_cs(struct intel_engine_cs *engine, * Wa_22011802037: Prior to doing a reset, ensure CS is * stopped, set ring stop bit and prefetch disable bit to halt CS */ - if (IS_MTL_GRAPHICS_STEP(engine->i915, M, STEP_A0, STEP_B0) || - (GRAPHICS_VER(engine->i915) >= 11 && - GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70))) + if (intel_engine_reset_needs_wa_22011802037(engine->gt)) intel_uncore_write_fw(uncore, RING_MODE_GEN7(engine->mmio_base), _MASKED_BIT_ENABLE(GEN12_GFX_PREFETCH_DISABLE)); diff --git a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c index 5a720e252312..42e09f158920 100644 --- a/drivers/gpu/drm/i915/gt/intel_execlists_submission.c +++ b/drivers/gpu/drm/i915/gt/intel_execlists_submission.c @@ -3001,9 +3001,7 @@ static void execlists_reset_prepare(struct intel_engine_cs *engine) * Wa_22011802037: In addition to stopping the cs, we need * to wait for any pending mi force wakeups */ - if (IS_MTL_GRAPHICS_STEP(engine->i915, M, STEP_A0, STEP_B0) || - (GRAPHICS_VER(engine->i915) >= 11 && - GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70))) + if (intel_engine_reset_needs_wa_22011802037(engine->gt)) intel_engine_wait_for_pending_mi_fw(engine); engine->execlists.reset_ccid = active_ccid(engine); diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 5fa57a34cf4b..3a3f71ce3cb7 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1632,6 +1632,24 @@ void __intel_fini_wedge(struct intel_wedge_me *w) w->gt = NULL; } +/* + * Wa_22011802037 requires that we (or the GuC) ensure that no command + * streamers are executing MI_FORCE_WAKE while an engine reset is initiated. + */ +bool intel_engine_reset_needs_wa_22011802037(struct intel_gt *gt) +{ + if (GRAPHICS_VER(gt->i915) < 11) + return false; + + if (IS_MTL_GRAPHICS_STEP(gt->i915, M, STEP_A0, STEP_B0)) + return true; + + if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) + return false; + + return true; +} + #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #include "selftest_reset.c" #include "selftest_hangcheck.c" diff --git a/drivers/gpu/drm/i915/gt/intel_reset.h b/drivers/gpu/drm/i915/gt/intel_reset.h index 25c975b6e8fc..f615b30b81c5 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.h +++ b/drivers/gpu/drm/i915/gt/intel_reset.h @@ -78,4 +78,6 @@ void __intel_fini_wedge(struct intel_wedge_me *w); bool intel_has_gpu_reset(const struct intel_gt *gt); bool intel_has_reset_engine(const struct intel_gt *gt); +bool intel_engine_reset_needs_wa_22011802037(struct intel_gt *gt); + #endif /* I915_RESET_H */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c index 82a2ecc12b21..da967938fea5 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -288,9 +288,7 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc) flags |= GUC_WA_DUAL_QUEUE; /* Wa_22011802037: graphics version 11/12 */ - if (IS_MTL_GRAPHICS_STEP(gt->i915, M, STEP_A0, STEP_B0) || - (GRAPHICS_VER(gt->i915) >= 11 && - GRAPHICS_VER_FULL(gt->i915) < IP_VER(12, 70))) + if (intel_engine_reset_needs_wa_22011802037(gt)) flags |= GUC_WA_PRE_PARSER; /* diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 836e4d9d65ef..7a3e02ea5663 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -1690,9 +1690,7 @@ static void guc_engine_reset_prepare(struct intel_engine_cs *engine) * Wa_22011802037: In addition to stopping the cs, we need * to wait for any pending mi force wakeups */ - if (IS_MTL_GRAPHICS_STEP(engine->i915, M, STEP_A0, STEP_B0) || - (GRAPHICS_VER(engine->i915) >= 11 && - GRAPHICS_VER_FULL(engine->i915) < IP_VER(12, 70))) { + if (intel_engine_reset_needs_wa_22011802037(engine->gt)) { intel_engine_stop_cs(engine); intel_engine_wait_for_pending_mi_fw(engine); } From 18e77951e14a73f75d269e54b90c648b1e18b66e Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 21 Aug 2023 11:06:23 -0700 Subject: [PATCH 045/256] drm/i915/xelpg: Call Xe_LPG workaround functions based on IP version [ Upstream commit f7696ded7c9e358670dae1801660f442f059c7db ] Although some of our Xe_LPG workarounds were already being applied based on IP version correctly, others were matching on MTL as a base platform, which is incorrect. Although MTL is the only platform right now that uses Xe_LPG IP, this may not always be the case. If a future platform re-uses this graphics IP, the same workarounds should be applied, even if it isn't a "MTL" platform. We were also incorrectly applying Xe_LPG workarounds/tuning to the Xe_LPM+ media IP in one or two places; we should make sure that we don't try to apply graphics workarounds to the media GT and vice versa where they don't belong. A new helper macro IS_GT_IP_RANGE() is added to help ensure this is handled properly -- it checks that the GT matches the IP type being tested as well as the IP version falling in the proper range. Note that many of the stepping-based workarounds are still incorrectly checking for a MTL base platform; that will be remedied in a later patch. v2: - Rework macro into a slightly more generic IS_GT_IP_RANGE() that can be used for either GFX or MEDIA checks. v3: - Switch back to separate macros for gfx and media. (Jani) - Move macro to intel_gt.h. (Andi) Cc: Gustavo Sousa Cc: Tvrtko Ursulin Cc: Jani Nikula Cc: Andi Shyti Signed-off-by: Matt Roper Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20230821180619.650007-14-matthew.d.roper@intel.com Stable-dep-of: 186bce682772 ("drm/i915/mtl: Update workaround 14018575942") Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/intel_gt.h | 11 ++++++ drivers/gpu/drm/i915/gt/intel_workarounds.c | 38 +++++++++++---------- 2 files changed, 31 insertions(+), 18 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index 6c34547b58b5..15c25980411d 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -14,6 +14,17 @@ struct drm_i915_private; struct drm_printer; +/* + * Check that the GT is a graphics GT and has an IP version within the + * specified range (inclusive). + */ +#define IS_GFX_GT_IP_RANGE(gt, from, until) ( \ + BUILD_BUG_ON_ZERO((from) < IP_VER(2, 0)) + \ + BUILD_BUG_ON_ZERO((until) < (from)) + \ + ((gt)->type != GT_MEDIA && \ + GRAPHICS_VER_FULL((gt)->i915) >= (from) && \ + GRAPHICS_VER_FULL((gt)->i915) <= (until))) + #define GT_TRACE(gt, fmt, ...) do { \ const struct intel_gt *gt__ __maybe_unused = (gt); \ GEM_TRACE("%s " fmt, dev_name(gt__->i915->drm.dev), \ diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 69973dc51828..4c24f3897aee 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -781,8 +781,8 @@ static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine, wa_masked_en(wal, CACHE_MODE_1, MSAA_OPTIMIZATION_REDUC_DISABLE); } -static void mtl_ctx_gt_tuning_init(struct intel_engine_cs *engine, - struct i915_wa_list *wal) +static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) { struct drm_i915_private *i915 = engine->i915; @@ -793,12 +793,12 @@ static void mtl_ctx_gt_tuning_init(struct intel_engine_cs *engine, wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false); } -static void mtl_ctx_workarounds_init(struct intel_engine_cs *engine, - struct i915_wa_list *wal) +static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine, + struct i915_wa_list *wal) { struct drm_i915_private *i915 = engine->i915; - mtl_ctx_gt_tuning_init(engine, wal); + xelpg_ctx_gt_tuning_init(engine, wal); if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0)) { @@ -907,8 +907,8 @@ __intel_engine_init_ctx_wa(struct intel_engine_cs *engine, if (engine->class != RENDER_CLASS) goto done; - if (IS_METEORLAKE(i915)) - mtl_ctx_workarounds_init(engine, wal); + if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 71))) + xelpg_ctx_workarounds_init(engine, wal); else if (IS_PONTEVECCHIO(i915)) ; /* noop; none at this time */ else if (IS_DG2(i915)) @@ -1688,10 +1688,8 @@ xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) */ static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal) { - if (IS_METEORLAKE(gt->i915)) { - if (gt->type != GT_MEDIA) - wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS); - + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) { + wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS); wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS); } @@ -1723,7 +1721,7 @@ gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal) return; } - if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70)) + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) xelpg_gt_workarounds_init(gt, wal); else if (IS_PONTEVECCHIO(i915)) pvc_gt_workarounds_init(gt, wal); @@ -2172,7 +2170,7 @@ static void pvc_whitelist_build(struct intel_engine_cs *engine) blacklist_trtt(engine); } -static void mtl_whitelist_build(struct intel_engine_cs *engine) +static void xelpg_whitelist_build(struct intel_engine_cs *engine) { struct i915_wa_list *w = &engine->whitelist; @@ -2194,8 +2192,10 @@ void intel_engine_init_whitelist(struct intel_engine_cs *engine) wa_init_start(w, engine->gt, "whitelist", engine->name); - if (IS_METEORLAKE(i915)) - mtl_whitelist_build(engine); + if (engine->gt->type == GT_MEDIA) + ; /* none yet */ + else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 71))) + xelpg_whitelist_build(engine); else if (IS_PONTEVECCHIO(i915)) pvc_whitelist_build(engine); else if (IS_DG2(i915)) @@ -2795,10 +2795,12 @@ ccs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) * function invoked by __intel_engine_init_ctx_wa(). */ static void -add_render_compute_tuning_settings(struct drm_i915_private *i915, +add_render_compute_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal) { - if (IS_METEORLAKE(i915) || IS_DG2(i915)) + struct drm_i915_private *i915 = gt->i915; + + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || IS_DG2(i915)) wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512); /* @@ -2828,7 +2830,7 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li { struct drm_i915_private *i915 = engine->i915; - add_render_compute_tuning_settings(i915, wal); + add_render_compute_tuning_settings(engine->gt, wal); if (GRAPHICS_VER(i915) >= 11) { /* This is not a Wa (although referred to as From b3749611a5e51188d17b4898eed8ecea571bc539 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 21 Aug 2023 11:06:24 -0700 Subject: [PATCH 046/256] drm/i915: Eliminate IS_MTL_GRAPHICS_STEP [ Upstream commit 5a213086a025349361b5cf75c8fd4591d96a7a99 ] Several workarounds are guarded by IS_MTL_GRAPHICS_STEP. However none of these workarounds are actually tied to MTL as a platform; they only relate to the Xe_LPG graphics IP, regardless of what platform it appears in. At the moment MTL is the only platform that uses Xe_LPG with IP versions 12.70 and 12.71, but we can't count on this being true in the future. Switch these to use a new IS_GFX_GT_IP_STEP() macro instead that is purely based on IP version. IS_GFX_GT_IP_STEP() is also GT-based rather than device-based, which will help prevent mistakes where we accidentally try to apply Xe_LPG graphics workarounds to the Xe_LPM+ media GT and vice-versa. v2: - Switch to a more generic and shorter IS_GT_IP_STEP macro that can be used for both graphics and media IP (and any other kind of GTs that show up in the future). v3: - Switch back to long-form IS_GFX_GT_IP_STEP macro. (Jani) - Move macro to intel_gt.h. (Andi) v4: - Build IS_GFX_GT_IP_STEP on top of IS_GFX_GT_IP_RANGE and IS_GRAPHICS_STEP building blocks and name the parameters from/until rather than begin/fixed. (Jani) - Fix usage examples in comment. v5: - Tweak comment on macro. (Gustavo) Cc: Gustavo Sousa Cc: Tvrtko Ursulin Cc: Andi Shyti Cc: Jani Nikula Signed-off-by: Matt Roper Reviewed-by: Gustavo Sousa Reviewed-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20230821180619.650007-15-matthew.d.roper@intel.com Stable-dep-of: 186bce682772 ("drm/i915/mtl: Update workaround 14018575942") Signed-off-by: Sasha Levin --- .../drm/i915/display/skl_universal_plane.c | 5 +- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 11 +++-- drivers/gpu/drm/i915/gt/intel_gt.h | 20 ++++++++ drivers/gpu/drm/i915/gt/intel_gt_mcr.c | 7 ++- drivers/gpu/drm/i915/gt/intel_lrc.c | 4 +- drivers/gpu/drm/i915/gt/intel_reset.c | 2 +- drivers/gpu/drm/i915/gt/intel_workarounds.c | 48 ++++++++++--------- drivers/gpu/drm/i915/gt/uc/intel_guc.c | 2 +- .../gpu/drm/i915/gt/uc/intel_guc_submission.c | 2 +- drivers/gpu/drm/i915/i915_drv.h | 4 -- 10 files changed, 62 insertions(+), 43 deletions(-) diff --git a/drivers/gpu/drm/i915/display/skl_universal_plane.c b/drivers/gpu/drm/i915/display/skl_universal_plane.c index ffc15d278a39..d557ecd4e1eb 100644 --- a/drivers/gpu/drm/i915/display/skl_universal_plane.c +++ b/drivers/gpu/drm/i915/display/skl_universal_plane.c @@ -20,6 +20,7 @@ #include "skl_scaler.h" #include "skl_universal_plane.h" #include "skl_watermark.h" +#include "gt/intel_gt.h" #include "pxp/intel_pxp.h" static const u32 skl_plane_formats[] = { @@ -2169,8 +2170,8 @@ static bool skl_plane_has_rc_ccs(struct drm_i915_private *i915, enum pipe pipe, enum plane_id plane_id) { /* Wa_14017240301 */ - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0)) + if (IS_GFX_GT_IP_STEP(to_gt(i915), IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(to_gt(i915), IP_VER(12, 71), STEP_A0, STEP_B0)) return false; /* Wa_22011186057 */ diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index 7ad36198aab2..3ac3e12d9c52 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -4,9 +4,9 @@ */ #include "gen8_engine_cs.h" -#include "i915_drv.h" #include "intel_engine_regs.h" #include "intel_gpu_commands.h" +#include "intel_gt.h" #include "intel_lrc.h" #include "intel_ring.h" @@ -226,8 +226,8 @@ u32 *gen12_emit_aux_table_inv(struct intel_engine_cs *engine, u32 *cs) static int mtl_dummy_pipe_control(struct i915_request *rq) { /* Wa_14016712196 */ - if (IS_MTL_GRAPHICS_STEP(rq->i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(rq->i915, P, STEP_A0, STEP_B0)) { + if (IS_GFX_GT_IP_STEP(rq->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(rq->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0)) { u32 *cs; /* dummy PIPE_CONTROL + depth flush */ @@ -808,6 +808,7 @@ u32 *gen12_emit_fini_breadcrumb_xcs(struct i915_request *rq, u32 *cs) u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs) { struct drm_i915_private *i915 = rq->i915; + struct intel_gt *gt = rq->engine->gt; u32 flags = (PIPE_CONTROL_CS_STALL | PIPE_CONTROL_TLB_INVALIDATE | PIPE_CONTROL_TILE_CACHE_FLUSH | @@ -818,8 +819,8 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs) PIPE_CONTROL_FLUSH_ENABLE); /* Wa_14016712196 */ - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0)) + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) /* dummy PIPE_CONTROL + depth flush */ cs = gen12_emit_pipe_control(cs, 0, PIPE_CONTROL_DEPTH_CACHE_FLUSH, 0); diff --git a/drivers/gpu/drm/i915/gt/intel_gt.h b/drivers/gpu/drm/i915/gt/intel_gt.h index 15c25980411d..6e63b46682f7 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt.h +++ b/drivers/gpu/drm/i915/gt/intel_gt.h @@ -25,6 +25,26 @@ struct drm_printer; GRAPHICS_VER_FULL((gt)->i915) >= (from) && \ GRAPHICS_VER_FULL((gt)->i915) <= (until))) +/* + * Check that the GT is a graphics GT with a specific IP version and has + * a stepping in the range [from, until). The lower stepping bound is + * inclusive, the upper bound is exclusive. The most common use-case of this + * macro is for checking bounds for workarounds, which usually have a stepping + * ("from") at which the hardware issue is first present and another stepping + * ("until") at which a hardware fix is present and the software workaround is + * no longer necessary. E.g., + * + * IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) + * IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B1, STEP_FOREVER) + * + * "STEP_FOREVER" can be passed as "until" for workarounds that have no upper + * stepping bound for the specified IP version. + */ +#define IS_GFX_GT_IP_STEP(gt, ipver, from, until) ( \ + BUILD_BUG_ON_ZERO((until) <= (from)) + \ + (IS_GFX_GT_IP_RANGE((gt), (ipver), (ipver)) && \ + IS_GRAPHICS_STEP((gt)->i915, (from), (until)))) + #define GT_TRACE(gt, fmt, ...) do { \ const struct intel_gt *gt__ __maybe_unused = (gt); \ GEM_TRACE("%s " fmt, dev_name(gt__->i915->drm.dev), \ diff --git a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c index 2c0f1f3e28ff..c6dec485aefb 100644 --- a/drivers/gpu/drm/i915/gt/intel_gt_mcr.c +++ b/drivers/gpu/drm/i915/gt/intel_gt_mcr.c @@ -3,8 +3,7 @@ * Copyright © 2022 Intel Corporation */ -#include "i915_drv.h" - +#include "intel_gt.h" #include "intel_gt_mcr.h" #include "intel_gt_print.h" #include "intel_gt_regs.h" @@ -166,8 +165,8 @@ void intel_gt_mcr_init(struct intel_gt *gt) gt->steering_table[OADDRM] = xelpmp_oaddrm_steering_table; } else if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70)) { /* Wa_14016747170 */ - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0)) + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) fuse = REG_FIELD_GET(MTL_GT_L3_EXC_MASK, intel_uncore_read(gt->uncore, MTL_GT_ACTIVITY_FACTOR)); diff --git a/drivers/gpu/drm/i915/gt/intel_lrc.c b/drivers/gpu/drm/i915/gt/intel_lrc.c index f297c5808e7c..b99efa348ad1 100644 --- a/drivers/gpu/drm/i915/gt/intel_lrc.c +++ b/drivers/gpu/drm/i915/gt/intel_lrc.c @@ -1347,8 +1347,8 @@ gen12_emit_indirect_ctx_rcs(const struct intel_context *ce, u32 *cs) cs = gen12_emit_aux_table_inv(ce->engine, cs); /* Wa_16014892111 */ - if (IS_MTL_GRAPHICS_STEP(ce->engine->i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(ce->engine->i915, P, STEP_A0, STEP_B0) || + if (IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(ce->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0) || IS_DG2(ce->engine->i915)) cs = dg2_emit_draw_watermark_setting(cs); diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 3a3f71ce3cb7..63d0892d3c45 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -1641,7 +1641,7 @@ bool intel_engine_reset_needs_wa_22011802037(struct intel_gt *gt) if (GRAPHICS_VER(gt->i915) < 11) return false; - if (IS_MTL_GRAPHICS_STEP(gt->i915, M, STEP_A0, STEP_B0)) + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0)) return true; if (GRAPHICS_VER_FULL(gt->i915) >= IP_VER(12, 70)) diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 4c24f3897aee..b6237e999be9 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -784,24 +784,24 @@ static void dg2_ctx_workarounds_init(struct intel_engine_cs *engine, static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { - struct drm_i915_private *i915 = engine->i915; + struct intel_gt *gt = engine->gt; dg2_ctx_gt_tuning_init(engine, wal); - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_B0, STEP_FOREVER) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_B0, STEP_FOREVER)) + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER)) wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false); } static void xelpg_ctx_workarounds_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { - struct drm_i915_private *i915 = engine->i915; + struct intel_gt *gt = engine->gt; xelpg_ctx_gt_tuning_init(engine, wal); - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0)) { + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) { /* Wa_14014947963 */ wa_masked_field_set(wal, VF_PREEMPTION, PREEMPTION_VERTEX_COUNT, 0x4000); @@ -1644,8 +1644,8 @@ xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) /* Wa_22016670082 */ wa_write_or(wal, GEN12_SQCNT1, GEN12_STRICT_RAR_ENABLE); - if (IS_MTL_GRAPHICS_STEP(gt->i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(gt->i915, P, STEP_A0, STEP_B0)) { + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) { /* Wa_14014830051 */ wa_mcr_write_clr(wal, SARB_CHICKEN1, COMP_CKN_IN); @@ -2297,23 +2297,24 @@ static void rcs_engine_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { struct drm_i915_private *i915 = engine->i915; + struct intel_gt *gt = engine->gt; - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0)) { + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) { /* Wa_22014600077 */ wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, ENABLE_EU_COUNT_FOR_TDL_FLUSH); } - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0) || + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) || IS_DG2(i915)) { /* Wa_1509727124 */ wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE, SC_DISABLE_POWER_OPTIMIZATION_EBB); } - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || IS_DG2(i915)) { /* Wa_22012856258 */ wa_mcr_masked_en(wal, GEN8_ROW_CHICKEN2, @@ -2829,8 +2830,9 @@ static void general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_list *wal) { struct drm_i915_private *i915 = engine->i915; + struct intel_gt *gt = engine->gt; - add_render_compute_tuning_settings(engine->gt, wal); + add_render_compute_tuning_settings(gt, wal); if (GRAPHICS_VER(i915) >= 11) { /* This is not a Wa (although referred to as @@ -2851,13 +2853,13 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li GEN11_INDIRECT_STATE_BASE_ADDR_OVERRIDE); } - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_B0, STEP_FOREVER) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_B0, STEP_FOREVER)) + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER)) /* Wa_14017856879 */ wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH); - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0)) + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) /* * Wa_14017066071 * Wa_14017654203 @@ -2865,13 +2867,13 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li wa_mcr_masked_en(wal, GEN10_SAMPLER_MODE, MTL_DISABLE_SAMPLER_SC_OOO); - if (IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0)) + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) /* Wa_22015279794 */ wa_mcr_masked_en(wal, GEN10_CACHE_MODE_SS, DISABLE_PREFETCH_INTO_IC); - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0) || + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) || IS_DG2(i915)) { /* Wa_22013037850 */ wa_mcr_write_or(wal, LSC_CHICKEN_BIT_0_UDW, @@ -2881,8 +2883,8 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li wa_masked_en(wal, VFG_PREEMPTION_CHICKEN, POLYGON_TRIFAN_LINELOOP_DISABLE); } - if (IS_MTL_GRAPHICS_STEP(i915, M, STEP_A0, STEP_B0) || - IS_MTL_GRAPHICS_STEP(i915, P, STEP_A0, STEP_B0) || + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0) || IS_PONTEVECCHIO(i915) || IS_DG2(i915)) { /* Wa_22014226127 */ diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc.c b/drivers/gpu/drm/i915/gt/uc/intel_guc.c index da967938fea5..861d0c58388c 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc.c @@ -273,7 +273,7 @@ static u32 guc_ctl_wa_flags(struct intel_guc *guc) flags |= GUC_WA_POLLCS; /* Wa_14014475959 */ - if (IS_MTL_GRAPHICS_STEP(gt->i915, M, STEP_A0, STEP_B0) || + if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || IS_DG2(gt->i915)) flags |= GUC_WA_HOLD_CCS_SWITCHOUT; diff --git a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c index 7a3e02ea5663..b5de5a9f5967 100644 --- a/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c +++ b/drivers/gpu/drm/i915/gt/uc/intel_guc_submission.c @@ -4297,7 +4297,7 @@ static void guc_default_vfuncs(struct intel_engine_cs *engine) /* Wa_14014475959:dg2 */ if (engine->class == COMPUTE_CLASS) - if (IS_MTL_GRAPHICS_STEP(engine->i915, M, STEP_A0, STEP_B0) || + if (IS_GFX_GT_IP_STEP(engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) || IS_DG2(engine->i915)) engine->flags |= I915_ENGINE_USES_WA_HOLD_CCS_SWITCHOUT; diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h index 7a8ce7239bc9..e0e0493d6c1f 100644 --- a/drivers/gpu/drm/i915/i915_drv.h +++ b/drivers/gpu/drm/i915/i915_drv.h @@ -658,10 +658,6 @@ IS_SUBPLATFORM(const struct drm_i915_private *i915, #define IS_XEHPSDV_GRAPHICS_STEP(__i915, since, until) \ (IS_XEHPSDV(__i915) && IS_GRAPHICS_STEP(__i915, since, until)) -#define IS_MTL_GRAPHICS_STEP(__i915, variant, since, until) \ - (IS_SUBPLATFORM(__i915, INTEL_METEORLAKE, INTEL_SUBPLATFORM_##variant) && \ - IS_GRAPHICS_STEP(__i915, since, until)) - #define IS_MTL_DISPLAY_STEP(__i915, since, until) \ (IS_METEORLAKE(__i915) && \ IS_DISPLAY_STEP(__i915, since, until)) From ec84b2a44b057b2c51ed9f670b92690904e1106c Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 21 Aug 2023 11:06:29 -0700 Subject: [PATCH 047/256] drm/i915: Replace several IS_METEORLAKE with proper IP version checks [ Upstream commit 14128d64090fa88445376cb8ccf91c50c08bd410 ] Many of the IS_METEORLAKE conditions throughout the driver are supposed to be checks for Xe_LPG and/or Xe_LPM+ IP, not for the MTL platform specifically. Update those checks to ensure that the code will still operate properly if/when these IP versions show up on future platforms. v2: - Update two more conditions (one for pg_enable, one for MTL HuC compatibility). v3: - Don't change GuC/HuC compatibility check, which sounds like it truly is specific to the MTL platform. (Gustavo) - Drop a non-lineage workaround number for the OA timestamp frequency workaround. (Gustavo) Cc: Gustavo Sousa Signed-off-by: Matt Roper Reviewed-by: Gustavo Sousa Link: https://patchwork.freedesktop.org/patch/msgid/20230821180619.650007-20-matthew.d.roper@intel.com Stable-dep-of: 186bce682772 ("drm/i915/mtl: Update workaround 14018575942") Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gem/i915_gem_create.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_engine_pm.c | 2 +- drivers/gpu/drm/i915/gt/intel_mocs.c | 2 +- drivers/gpu/drm/i915/gt/intel_rc6.c | 2 +- drivers/gpu/drm/i915/gt/intel_reset.c | 2 +- drivers/gpu/drm/i915/gt/intel_rps.c | 2 +- drivers/gpu/drm/i915/i915_debugfs.c | 2 +- drivers/gpu/drm/i915/i915_perf.c | 11 +++++------ 8 files changed, 13 insertions(+), 14 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_create.c b/drivers/gpu/drm/i915/gem/i915_gem_create.c index d24c0ce8805c..19156ba4b9ef 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_create.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_create.c @@ -405,8 +405,8 @@ static int ext_set_pat(struct i915_user_extension __user *base, void *data) BUILD_BUG_ON(sizeof(struct drm_i915_gem_create_ext_set_pat) != offsetofend(struct drm_i915_gem_create_ext_set_pat, rsvd)); - /* Limiting the extension only to Meteor Lake */ - if (!IS_METEORLAKE(i915)) + /* Limiting the extension only to Xe_LPG and beyond */ + if (GRAPHICS_VER_FULL(i915) < IP_VER(12, 70)) return -ENODEV; if (copy_from_user(&ext, base, sizeof(ext))) diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index a95615b345cd..5a3a5b29d150 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -21,7 +21,7 @@ static void intel_gsc_idle_msg_enable(struct intel_engine_cs *engine) { struct drm_i915_private *i915 = engine->i915; - if (IS_METEORLAKE(i915) && engine->id == GSC0) { + if (MEDIA_VER(i915) >= 13 && engine->id == GSC0) { intel_uncore_write(engine->gt->uncore, RC_PSMI_CTRL_GSCCS, _MASKED_BIT_DISABLE(IDLE_MSG_DISABLE)); diff --git a/drivers/gpu/drm/i915/gt/intel_mocs.c b/drivers/gpu/drm/i915/gt/intel_mocs.c index bf8b42d2d327..07269ff3be13 100644 --- a/drivers/gpu/drm/i915/gt/intel_mocs.c +++ b/drivers/gpu/drm/i915/gt/intel_mocs.c @@ -495,7 +495,7 @@ static unsigned int get_mocs_settings(const struct drm_i915_private *i915, memset(table, 0, sizeof(struct drm_i915_mocs_table)); table->unused_entries_index = I915_MOCS_PTE; - if (IS_METEORLAKE(i915)) { + if (IS_GFX_GT_IP_RANGE(&i915->gt0, IP_VER(12, 70), IP_VER(12, 71))) { table->size = ARRAY_SIZE(mtl_mocs_table); table->table = mtl_mocs_table; table->n_entries = MTL_NUM_MOCS_ENTRIES; diff --git a/drivers/gpu/drm/i915/gt/intel_rc6.c b/drivers/gpu/drm/i915/gt/intel_rc6.c index b8c9338176bd..9e113e947326 100644 --- a/drivers/gpu/drm/i915/gt/intel_rc6.c +++ b/drivers/gpu/drm/i915/gt/intel_rc6.c @@ -123,7 +123,7 @@ static void gen11_rc6_enable(struct intel_rc6 *rc6) * temporary wa and should be removed after fixing real cause * of forcewake timeouts. */ - if (IS_METEORLAKE(gt->i915)) + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) pg_enable = GEN9_MEDIA_PG_ENABLE | GEN11_MEDIA_SAMPLER_PG_ENABLE; diff --git a/drivers/gpu/drm/i915/gt/intel_reset.c b/drivers/gpu/drm/i915/gt/intel_reset.c index 63d0892d3c45..13fb8e5042c5 100644 --- a/drivers/gpu/drm/i915/gt/intel_reset.c +++ b/drivers/gpu/drm/i915/gt/intel_reset.c @@ -705,7 +705,7 @@ static int __reset_guc(struct intel_gt *gt) static bool needs_wa_14015076503(struct intel_gt *gt, intel_engine_mask_t engine_mask) { - if (!IS_METEORLAKE(gt->i915) || !HAS_ENGINE(gt, GSC0)) + if (MEDIA_VER_FULL(gt->i915) != IP_VER(13, 0) || !HAS_ENGINE(gt, GSC0)) return false; if (!__HAS_ENGINE(engine_mask, GSC0)) diff --git a/drivers/gpu/drm/i915/gt/intel_rps.c b/drivers/gpu/drm/i915/gt/intel_rps.c index 092542f53aad..4feef874e6d6 100644 --- a/drivers/gpu/drm/i915/gt/intel_rps.c +++ b/drivers/gpu/drm/i915/gt/intel_rps.c @@ -1161,7 +1161,7 @@ void gen6_rps_get_freq_caps(struct intel_rps *rps, struct intel_rps_freq_caps *c { struct drm_i915_private *i915 = rps_to_i915(rps); - if (IS_METEORLAKE(i915)) + if (GRAPHICS_VER_FULL(i915) >= IP_VER(12, 70)) return mtl_get_freq_caps(rps, caps); else return __gen6_rps_get_freq_caps(rps, caps); diff --git a/drivers/gpu/drm/i915/i915_debugfs.c b/drivers/gpu/drm/i915/i915_debugfs.c index 4de44cf1026d..7a90a2e32c9f 100644 --- a/drivers/gpu/drm/i915/i915_debugfs.c +++ b/drivers/gpu/drm/i915/i915_debugfs.c @@ -144,7 +144,7 @@ static const char *i915_cache_level_str(struct drm_i915_gem_object *obj) { struct drm_i915_private *i915 = obj_to_i915(obj); - if (IS_METEORLAKE(i915)) { + if (IS_GFX_GT_IP_RANGE(to_gt(i915), IP_VER(12, 70), IP_VER(12, 71))) { switch (obj->pat_index) { case 0: return " WB"; case 1: return " WT"; diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 8f4a25d2cfc2..48ea17b49b3a 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -3255,11 +3255,10 @@ get_sseu_config(struct intel_sseu *out_sseu, */ u32 i915_perf_oa_timestamp_frequency(struct drm_i915_private *i915) { - /* - * Wa_18013179988:dg2 - * Wa_14015846243:mtl - */ - if (IS_DG2(i915) || IS_METEORLAKE(i915)) { + struct intel_gt *gt = to_gt(i915); + + /* Wa_18013179988 */ + if (IS_DG2(i915) || IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) { intel_wakeref_t wakeref; u32 reg, shift; @@ -4564,7 +4563,7 @@ static bool xehp_is_valid_b_counter_addr(struct i915_perf *perf, u32 addr) static bool gen12_is_valid_mux_addr(struct i915_perf *perf, u32 addr) { - if (IS_METEORLAKE(perf->i915)) + if (GRAPHICS_VER_FULL(perf->i915) >= IP_VER(12, 70)) return reg_in_range_table(addr, mtl_oa_mux_regs); else return reg_in_range_table(addr, gen12_oa_mux_regs); From 338db8193cb2dd93544ac445a7b4b4a7f77094ad Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Mon, 28 Aug 2023 12:04:50 +0530 Subject: [PATCH 048/256] drm/i915/mtl: Update workaround 14016712196 [ Upstream commit 7467e1da906468bcbd311023b30708193103ecf9 ] Now this workaround is permanent workaround on MTL and DG2, earlier we used to apply on MTL A0 step only. VLK-45480 Fixes: d922b80b1010 ("drm/i915/gt: Add workaround 14016712196") Signed-off-by: Tejas Upadhyay Acked-by: Nirmoy Das Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20230828063450.2642748-1-tejas.upadhyay@intel.com Stable-dep-of: 186bce682772 ("drm/i915/mtl: Update workaround 14018575942") Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index 3ac3e12d9c52..ba4c2422b340 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -226,8 +226,8 @@ u32 *gen12_emit_aux_table_inv(struct intel_engine_cs *engine, u32 *cs) static int mtl_dummy_pipe_control(struct i915_request *rq) { /* Wa_14016712196 */ - if (IS_GFX_GT_IP_STEP(rq->engine->gt, IP_VER(12, 70), STEP_A0, STEP_B0) || - IS_GFX_GT_IP_STEP(rq->engine->gt, IP_VER(12, 71), STEP_A0, STEP_B0)) { + if (IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 71)) || + IS_DG2(rq->i915)) { u32 *cs; /* dummy PIPE_CONTROL + depth flush */ @@ -819,8 +819,7 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs) PIPE_CONTROL_FLUSH_ENABLE); /* Wa_14016712196 */ - if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || - IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0)) + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || IS_DG2(i915)) /* dummy PIPE_CONTROL + depth flush */ cs = gen12_emit_pipe_control(cs, 0, PIPE_CONTROL_DEPTH_CACHE_FLUSH, 0); From 798781b43194c6d2bdea0c4ded660f3135c484d3 Mon Sep 17 00:00:00 2001 From: Matt Roper Date: Mon, 8 Jan 2024 17:57:38 +0530 Subject: [PATCH 049/256] drm/i915/xelpg: Extend some workarounds/tuning to gfx version 12.74 [ Upstream commit c44d4ef47fdad0a33966de89f9064e19736bb52f ] Some of our existing Xe_LPG workarounds and tuning are also applicable to the version 12.74 variant. Extend the condition bounds accordingly. Also fix the comment on Wa_14018575942 while we're at it. v2: Extend some more workarounds (Harish) Signed-off-by: Matt Roper Signed-off-by: Harish Chegondi Signed-off-by: Haridhar Kalvala Reviewed-by: Matt Atwood Link: https://patchwork.freedesktop.org/patch/msgid/20240108122738.14399-4-haridhar.kalvala@intel.com Stable-dep-of: 186bce682772 ("drm/i915/mtl: Update workaround 14018575942") Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/gen8_engine_cs.c | 4 ++-- drivers/gpu/drm/i915/gt/intel_workarounds.c | 24 +++++++++++++-------- drivers/gpu/drm/i915/i915_perf.c | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c index ba4c2422b340..cddf8c16e9a7 100644 --- a/drivers/gpu/drm/i915/gt/gen8_engine_cs.c +++ b/drivers/gpu/drm/i915/gt/gen8_engine_cs.c @@ -226,7 +226,7 @@ u32 *gen12_emit_aux_table_inv(struct intel_engine_cs *engine, u32 *cs) static int mtl_dummy_pipe_control(struct i915_request *rq) { /* Wa_14016712196 */ - if (IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 71)) || + if (IS_GFX_GT_IP_RANGE(rq->engine->gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(rq->i915)) { u32 *cs; @@ -819,7 +819,7 @@ u32 *gen12_emit_fini_breadcrumb_rcs(struct i915_request *rq, u32 *cs) PIPE_CONTROL_FLUSH_ENABLE); /* Wa_14016712196 */ - if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || IS_DG2(i915)) + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915)) /* dummy PIPE_CONTROL + depth flush */ cs = gen12_emit_pipe_control(cs, 0, PIPE_CONTROL_DEPTH_CACHE_FLUSH, 0); diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index b6237e999be9..37b2b0440923 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -788,8 +788,13 @@ static void xelpg_ctx_gt_tuning_init(struct intel_engine_cs *engine, dg2_ctx_gt_tuning_init(engine, wal); - if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) || - IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER)) + /* + * Due to Wa_16014892111, the DRAW_WATERMARK tuning must be done in + * gen12_emit_indirect_ctx_rcs() rather than here on some early + * steppings. + */ + if (!(IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_A0, STEP_B0) || + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_A0, STEP_B0))) wa_add(wal, DRAW_WATERMARK, VERT_WM_VAL, 0x3FF, 0, false); } @@ -907,7 +912,7 @@ __intel_engine_init_ctx_wa(struct intel_engine_cs *engine, if (engine->class != RENDER_CLASS) goto done; - if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 71))) + if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74))) xelpg_ctx_workarounds_init(engine, wal); else if (IS_PONTEVECCHIO(i915)) ; /* noop; none at this time */ @@ -1638,7 +1643,7 @@ pvc_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) static void xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) { - /* Wa_14018778641 / Wa_18018781329 */ + /* Wa_14018575942 / Wa_18018781329 */ wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB); /* Wa_22016670082 */ @@ -1688,7 +1693,7 @@ xelpmp_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) */ static void gt_tuning_settings(struct intel_gt *gt, struct i915_wa_list *wal) { - if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) { + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) { wa_mcr_write_or(wal, XEHP_L3SCQREG7, BLEND_FILL_CACHING_OPT_DIS); wa_mcr_write_or(wal, XEHP_SQCM, EN_32B_ACCESS); } @@ -1721,7 +1726,7 @@ gt_init_workarounds(struct intel_gt *gt, struct i915_wa_list *wal) return; } - if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) xelpg_gt_workarounds_init(gt, wal); else if (IS_PONTEVECCHIO(i915)) pvc_gt_workarounds_init(gt, wal); @@ -2194,7 +2199,7 @@ void intel_engine_init_whitelist(struct intel_engine_cs *engine) if (engine->gt->type == GT_MEDIA) ; /* none yet */ - else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 71))) + else if (IS_GFX_GT_IP_RANGE(engine->gt, IP_VER(12, 70), IP_VER(12, 74))) xelpg_whitelist_build(engine); else if (IS_PONTEVECCHIO(i915)) pvc_whitelist_build(engine); @@ -2801,7 +2806,7 @@ add_render_compute_tuning_settings(struct intel_gt *gt, { struct drm_i915_private *i915 = gt->i915; - if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71)) || IS_DG2(i915)) + if (IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74)) || IS_DG2(i915)) wa_mcr_write_clr_set(wal, RT_CTRL, STACKID_CTRL, STACKID_CTRL_512); /* @@ -2854,7 +2859,8 @@ general_render_compute_wa_init(struct intel_engine_cs *engine, struct i915_wa_li } if (IS_GFX_GT_IP_STEP(gt, IP_VER(12, 70), STEP_B0, STEP_FOREVER) || - IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER)) + IS_GFX_GT_IP_STEP(gt, IP_VER(12, 71), STEP_B0, STEP_FOREVER) || + IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 74), IP_VER(12, 74))) /* Wa_14017856879 */ wa_mcr_masked_en(wal, GEN9_ROW_CHICKEN3, MTL_DISABLE_FIX_FOR_EOT_FLUSH); diff --git a/drivers/gpu/drm/i915/i915_perf.c b/drivers/gpu/drm/i915/i915_perf.c index 48ea17b49b3a..3f90403d86cb 100644 --- a/drivers/gpu/drm/i915/i915_perf.c +++ b/drivers/gpu/drm/i915/i915_perf.c @@ -3258,7 +3258,7 @@ u32 i915_perf_oa_timestamp_frequency(struct drm_i915_private *i915) struct intel_gt *gt = to_gt(i915); /* Wa_18013179988 */ - if (IS_DG2(i915) || IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 71))) { + if (IS_DG2(i915) || IS_GFX_GT_IP_RANGE(gt, IP_VER(12, 70), IP_VER(12, 74))) { intel_wakeref_t wakeref; u32 reg, shift; From 2564623ee0da92ed7f8a87aa3758cbf2c46257bb Mon Sep 17 00:00:00 2001 From: Tejas Upadhyay Date: Wed, 28 Feb 2024 16:07:38 +0530 Subject: [PATCH 050/256] drm/i915/mtl: Update workaround 14018575942 [ Upstream commit 186bce682772e7346bf7ced5325b5f4ff050ccfb ] Applying WA 14018575942 only on Compute engine has impact on some apps like chrome. Updating this WA to apply on Render engine as well as it is helping with performance on Chrome. Note: There is no concern from media team thus not applying WA on media engines. We will revisit if any issues reported from media team. V2(Matt): - Use correct WA number Fixes: 668f37e1ee11 ("drm/i915/mtl: Update workaround 14018778641") Signed-off-by: Tejas Upadhyay Reviewed-by: Matt Roper Reviewed-by: Andi Shyti Signed-off-by: Andi Shyti Link: https://patchwork.freedesktop.org/patch/msgid/20240228103738.2018458-1-tejas.upadhyay@intel.com (cherry picked from commit 71271280175aa0ed6673e40cce7c01296bcd05f6) Signed-off-by: Rodrigo Vivi Signed-off-by: Sasha Levin --- drivers/gpu/drm/i915/gt/intel_workarounds.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/gpu/drm/i915/gt/intel_workarounds.c b/drivers/gpu/drm/i915/gt/intel_workarounds.c index 37b2b0440923..0ea52f77b4c7 100644 --- a/drivers/gpu/drm/i915/gt/intel_workarounds.c +++ b/drivers/gpu/drm/i915/gt/intel_workarounds.c @@ -1644,6 +1644,7 @@ static void xelpg_gt_workarounds_init(struct intel_gt *gt, struct i915_wa_list *wal) { /* Wa_14018575942 / Wa_18018781329 */ + wa_mcr_write_or(wal, RENDER_MOD_CTRL, FORCE_MISS_FTLB); wa_mcr_write_or(wal, COMP_MOD_CTRL, FORCE_MISS_FTLB); /* Wa_22016670082 */ From 7cd73d90856d7edec080b356a33959b57a241598 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 28 Mar 2024 15:30:39 +0100 Subject: [PATCH 051/256] dm integrity: fix out-of-range warning [ Upstream commit 8e91c2342351e0f5ef6c0a704384a7f6fc70c3b2 ] Depending on the value of CONFIG_HZ, clang complains about a pointless comparison: drivers/md/dm-integrity.c:4085:12: error: result of comparison of constant 42949672950 with expression of type 'unsigned int' is always false [-Werror,-Wtautological-constant-out-of-range-compare] if (val >= (uint64_t)UINT_MAX * 1000 / HZ) { As the check remains useful for other configurations, shut up the warning by adding a second type cast to uint64_t. Fixes: 468dfca38b1a ("dm integrity: add a bitmap mode") Signed-off-by: Arnd Bergmann Reviewed-by: Mikulas Patocka Reviewed-by: Justin Stitt Signed-off-by: Mike Snitzer Signed-off-by: Sasha Levin --- drivers/md/dm-integrity.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/md/dm-integrity.c b/drivers/md/dm-integrity.c index e7cd27e387df..470add73f7bd 100644 --- a/drivers/md/dm-integrity.c +++ b/drivers/md/dm-integrity.c @@ -4231,7 +4231,7 @@ static int dm_integrity_ctr(struct dm_target *ti, unsigned int argc, char **argv } else if (sscanf(opt_string, "sectors_per_bit:%llu%c", &llval, &dummy) == 1) { log2_sectors_per_bitmap_bit = !llval ? 0 : __ilog2_u64(llval); } else if (sscanf(opt_string, "bitmap_flush_interval:%u%c", &val, &dummy) == 1) { - if (val >= (uint64_t)UINT_MAX * 1000 / HZ) { + if ((uint64_t)val >= (uint64_t)UINT_MAX * 1000 / HZ) { r = -EINVAL; ti->error = "Invalid bitmap_flush_interval argument"; goto bad; From 907835e6dee6f77ac30ae50bb3f88bd92055c86e Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Tue, 5 Mar 2024 12:37:48 +0800 Subject: [PATCH 052/256] mm/treewide: replace pud_large() with pud_leaf() [ Upstream commit 0a845e0f6348ccfa2dcc8c450ffd1c9ffe8c4add ] pud_large() is always defined as pud_leaf(). Merge their usages. Chose pud_leaf() because pud_leaf() is a global API, while pud_large() is not. Link: https://lkml.kernel.org/r/20240305043750.93762-9-peterx@redhat.com Signed-off-by: Peter Xu Reviewed-by: Jason Gunthorpe Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: "Aneesh Kumar K.V" Cc: Borislav Petkov Cc: Christophe Leroy Cc: Dave Hansen Cc: Dmitry Vyukov Cc: Ingo Molnar Cc: Kirill A. Shutemov Cc: Michael Ellerman Cc: Muchun Song Cc: "Naveen N. Rao" Cc: Nicholas Piggin Cc: Thomas Gleixner Cc: Vincenzo Frascino Cc: Yang Shi Signed-off-by: Andrew Morton Stable-dep-of: c567f2948f57 ("Revert "x86/mm/ident_map: Use gbpages only where full GB page should be mapped."") Signed-off-by: Sasha Levin --- arch/powerpc/mm/book3s64/pgtable.c | 2 +- arch/s390/boot/vmem.c | 2 +- arch/s390/include/asm/pgtable.h | 4 ++-- arch/s390/mm/gmap.c | 2 +- arch/s390/mm/hugetlbpage.c | 4 ++-- arch/s390/mm/pageattr.c | 2 +- arch/s390/mm/pgtable.c | 2 +- arch/s390/mm/vmem.c | 6 +++--- arch/sparc/mm/init_64.c | 2 +- arch/x86/kvm/mmu/mmu.c | 2 +- arch/x86/mm/fault.c | 4 ++-- arch/x86/mm/ident_map.c | 2 +- arch/x86/mm/init_64.c | 4 ++-- arch/x86/mm/kasan_init_64.c | 2 +- arch/x86/mm/mem_encrypt_identity.c | 2 +- arch/x86/mm/pat/set_memory.c | 6 +++--- arch/x86/mm/pgtable.c | 2 +- arch/x86/mm/pti.c | 2 +- arch/x86/power/hibernate.c | 2 +- arch/x86/xen/mmu_pv.c | 4 ++-- 20 files changed, 29 insertions(+), 29 deletions(-) diff --git a/arch/powerpc/mm/book3s64/pgtable.c b/arch/powerpc/mm/book3s64/pgtable.c index 926bec775f41..9822366dc186 100644 --- a/arch/powerpc/mm/book3s64/pgtable.c +++ b/arch/powerpc/mm/book3s64/pgtable.c @@ -130,7 +130,7 @@ void set_pud_at(struct mm_struct *mm, unsigned long addr, WARN_ON(pte_hw_valid(pud_pte(*pudp))); assert_spin_locked(pud_lockptr(mm, pudp)); - WARN_ON(!(pud_large(pud))); + WARN_ON(!(pud_leaf(pud))); #endif trace_hugepage_set_pud(addr, pud_val(pud)); return set_pte_at(mm, addr, pudp_ptep(pudp), pud_pte(pud)); diff --git a/arch/s390/boot/vmem.c b/arch/s390/boot/vmem.c index 442a74f113cb..14e1a73ffcfe 100644 --- a/arch/s390/boot/vmem.c +++ b/arch/s390/boot/vmem.c @@ -360,7 +360,7 @@ static void pgtable_pud_populate(p4d_t *p4d, unsigned long addr, unsigned long e } pmd = boot_crst_alloc(_SEGMENT_ENTRY_EMPTY); pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) { + } else if (pud_leaf(*pud)) { continue; } pgtable_pmd_populate(pud, addr, next, mode); diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index fb3ee7758b76..38290b0078c5 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -729,7 +729,7 @@ static inline int pud_bad(pud_t pud) { unsigned long type = pud_val(pud) & _REGION_ENTRY_TYPE_MASK; - if (type > _REGION_ENTRY_TYPE_R3 || pud_large(pud)) + if (type > _REGION_ENTRY_TYPE_R3 || pud_leaf(pud)) return 1; if (type < _REGION_ENTRY_TYPE_R3) return 0; @@ -1396,7 +1396,7 @@ static inline unsigned long pud_deref(pud_t pud) unsigned long origin_mask; origin_mask = _REGION_ENTRY_ORIGIN; - if (pud_large(pud)) + if (pud_leaf(pud)) origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; return (unsigned long)__va(pud_val(pud) & origin_mask); } diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 157e0a8d5157..d17bb1ef63f4 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -596,7 +596,7 @@ int __gmap_link(struct gmap *gmap, unsigned long gaddr, unsigned long vmaddr) pud = pud_offset(p4d, vmaddr); VM_BUG_ON(pud_none(*pud)); /* large puds cannot yet be handled */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; pmd = pmd_offset(pud, vmaddr); VM_BUG_ON(pmd_none(*pmd)); diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c index 297a6d897d5a..5f64f3d0fafb 100644 --- a/arch/s390/mm/hugetlbpage.c +++ b/arch/s390/mm/hugetlbpage.c @@ -224,7 +224,7 @@ pte_t *huge_pte_offset(struct mm_struct *mm, if (p4d_present(*p4dp)) { pudp = pud_offset(p4dp, addr); if (pud_present(*pudp)) { - if (pud_large(*pudp)) + if (pud_leaf(*pudp)) return (pte_t *) pudp; pmdp = pmd_offset(pudp, addr); } @@ -240,7 +240,7 @@ int pmd_huge(pmd_t pmd) int pud_huge(pud_t pud) { - return pud_large(pud); + return pud_leaf(pud); } bool __init arch_hugetlb_valid_size(unsigned long size) diff --git a/arch/s390/mm/pageattr.c b/arch/s390/mm/pageattr.c index b87e96c64b61..441f654d048d 100644 --- a/arch/s390/mm/pageattr.c +++ b/arch/s390/mm/pageattr.c @@ -274,7 +274,7 @@ static int walk_pud_level(p4d_t *p4d, unsigned long addr, unsigned long end, if (pud_none(*pudp)) return -EINVAL; next = pud_addr_end(addr, end); - if (pud_large(*pudp)) { + if (pud_leaf(*pudp)) { need_split = !!(flags & SET_MEMORY_4K); need_split |= !!(addr & ~PUD_MASK); need_split |= !!(addr + PUD_SIZE > next); diff --git a/arch/s390/mm/pgtable.c b/arch/s390/mm/pgtable.c index 5cb92941540b..5e349869590a 100644 --- a/arch/s390/mm/pgtable.c +++ b/arch/s390/mm/pgtable.c @@ -479,7 +479,7 @@ static int pmd_lookup(struct mm_struct *mm, unsigned long addr, pmd_t **pmdp) return -ENOENT; /* Large PUDs are not supported yet. */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) return -EFAULT; *pmdp = pmd_offset(pud, addr); diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 6d276103c6d5..2d3f65da56ee 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -322,7 +322,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!add) { if (pud_none(*pud)) continue; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { if (IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { pud_clear(pud); @@ -343,7 +343,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (pud_large(*pud)) { + } else if (pud_leaf(*pud)) { continue; } ret = modify_pmd_table(pud, addr, next, add, direct); @@ -586,7 +586,7 @@ pte_t *vmem_get_alloc_pte(unsigned long addr, bool alloc) if (!pmd) goto out; pud_populate(&init_mm, pud, pmd); - } else if (WARN_ON_ONCE(pud_large(*pud))) { + } else if (WARN_ON_ONCE(pud_leaf(*pud))) { goto out; } pmd = pmd_offset(pud, addr); diff --git a/arch/sparc/mm/init_64.c b/arch/sparc/mm/init_64.c index f83017992eaa..d7db4e737218 100644 --- a/arch/sparc/mm/init_64.c +++ b/arch/sparc/mm/init_64.c @@ -1665,7 +1665,7 @@ bool kern_addr_valid(unsigned long addr) if (pud_none(*pud)) return false; - if (pud_large(*pud)) + if (pud_leaf(*pud)) return pfn_valid(pud_pfn(*pud)); pmd = pmd_offset(pud, addr); diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index f7901cb4d2fa..11c484d72eab 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3120,7 +3120,7 @@ static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn, if (pud_none(pud) || !pud_present(pud)) goto out; - if (pud_large(pud)) { + if (pud_leaf(pud)) { level = PG_LEVEL_1G; goto out; } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a9d69ec994b7..e23851796883 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -376,7 +376,7 @@ static void dump_pagetable(unsigned long address) goto bad; pr_cont("PUD %lx ", pud_val(*pud)); - if (!pud_present(*pud) || pud_large(*pud)) + if (!pud_present(*pud) || pud_leaf(*pud)) goto out; pmd = pmd_offset(pud, address); @@ -1037,7 +1037,7 @@ spurious_kernel_fault(unsigned long error_code, unsigned long address) if (!pud_present(*pud)) return 0; - if (pud_large(*pud)) + if (pud_leaf(*pud)) return spurious_kernel_fault_check(error_code, (pte_t *) pud); pmd = pmd_offset(pud, address); diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index f50cc210a981..a204a332c71f 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -33,7 +33,7 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, next = end; /* if this is already a gbpage, this portion is already mapped */ - if (pud_large(*pud)) + if (pud_leaf(*pud)) continue; /* Is using a gbpage allowed? */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a190aae8ceaf..19d209b412d7 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -617,7 +617,7 @@ phys_pud_init(pud_t *pud_page, unsigned long paddr, unsigned long paddr_end, } if (!pud_none(*pud)) { - if (!pud_large(*pud)) { + if (!pud_leaf(*pud)) { pmd = pmd_offset(pud, 0); paddr_last = phys_pmd_init(pmd, paddr, paddr_end, @@ -1163,7 +1163,7 @@ remove_pud_table(pud_t *pud_start, unsigned long addr, unsigned long end, if (!pud_present(*pud)) continue; - if (pud_large(*pud) && + if (pud_leaf(*pud) && IS_ALIGNED(addr, PUD_SIZE) && IS_ALIGNED(next, PUD_SIZE)) { spin_lock(&init_mm.page_table_lock); diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c index 0302491d799d..fcf508c52bdc 100644 --- a/arch/x86/mm/kasan_init_64.c +++ b/arch/x86/mm/kasan_init_64.c @@ -115,7 +115,7 @@ static void __init kasan_populate_p4d(p4d_t *p4d, unsigned long addr, pud = pud_offset(p4d, addr); do { next = pud_addr_end(addr, end); - if (!pud_large(*pud)) + if (!pud_leaf(*pud)) kasan_populate_pud(pud, addr, next, nid); } while (pud++, addr = next, addr != end); } diff --git a/arch/x86/mm/mem_encrypt_identity.c b/arch/x86/mm/mem_encrypt_identity.c index 0166ab1780cc..ead356135924 100644 --- a/arch/x86/mm/mem_encrypt_identity.c +++ b/arch/x86/mm/mem_encrypt_identity.c @@ -144,7 +144,7 @@ static pud_t __init *sme_prepare_pgd(struct sme_populate_pgd_data *ppd) set_pud(pud, __pud(PUD_FLAGS | __pa(pmd))); } - if (pud_large(*pud)) + if (pud_leaf(*pud)) return NULL; return pud; diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index bda9f129835e..f3c4c756fe1e 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -684,7 +684,7 @@ pte_t *lookup_address_in_pgd(pgd_t *pgd, unsigned long address, return NULL; *level = PG_LEVEL_1G; - if (pud_large(*pud) || !pud_present(*pud)) + if (pud_leaf(*pud) || !pud_present(*pud)) return (pte_t *)pud; pmd = pmd_offset(pud, address); @@ -743,7 +743,7 @@ pmd_t *lookup_pmd_address(unsigned long address) return NULL; pud = pud_offset(p4d, address); - if (pud_none(*pud) || pud_large(*pud) || !pud_present(*pud)) + if (pud_none(*pud) || pud_leaf(*pud) || !pud_present(*pud)) return NULL; return pmd_offset(pud, address); @@ -1274,7 +1274,7 @@ static void unmap_pud_range(p4d_t *p4d, unsigned long start, unsigned long end) */ while (end - start >= PUD_SIZE) { - if (pud_large(*pud)) + if (pud_leaf(*pud)) pud_clear(pud); else unmap_pmd_range(pud, start, start + PUD_SIZE); diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index 9deadf517f14..8e1ef5345b7a 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -774,7 +774,7 @@ int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot) */ int pud_clear_huge(pud_t *pud) { - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { pud_clear(pud); return 1; } diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c index 78414c6d1b5e..51b6b78e6b17 100644 --- a/arch/x86/mm/pti.c +++ b/arch/x86/mm/pti.c @@ -217,7 +217,7 @@ static pmd_t *pti_user_pagetable_walk_pmd(unsigned long address) pud = pud_offset(p4d, address); /* The user page tables do not use large mappings: */ - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { WARN_ON(1); return NULL; } diff --git a/arch/x86/power/hibernate.c b/arch/x86/power/hibernate.c index 6f955eb1e163..d8af46e67750 100644 --- a/arch/x86/power/hibernate.c +++ b/arch/x86/power/hibernate.c @@ -170,7 +170,7 @@ int relocate_restore_code(void) goto out; } pud = pud_offset(p4d, relocated_restore_code); - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { set_pud(pud, __pud(pud_val(*pud) & ~_PAGE_NX)); goto out; } diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c index b6830554ff69..9d4a9311e819 100644 --- a/arch/x86/xen/mmu_pv.c +++ b/arch/x86/xen/mmu_pv.c @@ -1082,7 +1082,7 @@ static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) pmd_t *pmd_tbl; int i; - if (pud_large(*pud)) { + if (pud_leaf(*pud)) { pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; xen_free_ro_pages(pa, PUD_SIZE); return; @@ -1863,7 +1863,7 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr) if (!pud_present(pud)) return 0; pa = pud_val(pud) & PTE_PFN_MASK; - if (pud_large(pud)) + if (pud_leaf(pud)) return pa + (vaddr & ~PUD_MASK); pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) * From fbc0a833c055a453d4bc1961c980a85ea1b0750d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 Mar 2024 11:47:51 +0100 Subject: [PATCH 053/256] Revert "x86/mm/ident_map: Use gbpages only where full GB page should be mapped." [ Upstream commit c567f2948f57bdc03ed03403ae0234085f376b7d ] This reverts commit d794734c9bbfe22f86686dc2909c25f5ffe1a572. While the original change tries to fix a bug, it also unintentionally broke existing systems, see the regressions reported at: https://lore.kernel.org/all/3a1b9909-45ac-4f97-ad68-d16ef1ce99db@pavinjoseph.com/ Since d794734c9bbf was also marked for -stable, let's back it out before causing more damage. Note that due to another upstream change the revert was not 100% automatic: 0a845e0f6348 mm/treewide: replace pud_large() with pud_leaf() Signed-off-by: Ingo Molnar Cc: Cc: Russ Anderson Cc: Steve Wahl Cc: Dave Hansen Link: https://lore.kernel.org/all/3a1b9909-45ac-4f97-ad68-d16ef1ce99db@pavinjoseph.com/ Fixes: d794734c9bbf ("x86/mm/ident_map: Use gbpages only where full GB page should be mapped.") Signed-off-by: Sasha Levin --- arch/x86/mm/ident_map.c | 23 +++++------------------ 1 file changed, 5 insertions(+), 18 deletions(-) diff --git a/arch/x86/mm/ident_map.c b/arch/x86/mm/ident_map.c index a204a332c71f..968d7005f4a7 100644 --- a/arch/x86/mm/ident_map.c +++ b/arch/x86/mm/ident_map.c @@ -26,31 +26,18 @@ static int ident_pud_init(struct x86_mapping_info *info, pud_t *pud_page, for (; addr < end; addr = next) { pud_t *pud = pud_page + pud_index(addr); pmd_t *pmd; - bool use_gbpage; next = (addr & PUD_MASK) + PUD_SIZE; if (next > end) next = end; - /* if this is already a gbpage, this portion is already mapped */ - if (pud_leaf(*pud)) - continue; - - /* Is using a gbpage allowed? */ - use_gbpage = info->direct_gbpages; - - /* Don't use gbpage if it maps more than the requested region. */ - /* at the begining: */ - use_gbpage &= ((addr & ~PUD_MASK) == 0); - /* ... or at the end: */ - use_gbpage &= ((next & ~PUD_MASK) == 0); - - /* Never overwrite existing mappings */ - use_gbpage &= !pud_present(*pud); - - if (use_gbpage) { + if (info->direct_gbpages) { pud_t pudval; + if (pud_present(*pud)) + continue; + + addr &= PUD_MASK; pudval = __pud((addr - info->offset) | info->page_flag); set_pud(pud, pudval); continue; From 8cc484e85e0c6bf72ec7c3c8c697865355873cfb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Thu, 22 Feb 2024 12:29:34 +0000 Subject: [PATCH 054/256] btrfs: ensure fiemap doesn't race with writes when FIEMAP_FLAG_SYNC is given [ Upstream commit 418b09027743d9a9fb39116bed46a192f868a3c3 ] When FIEMAP_FLAG_SYNC is given to fiemap the expectation is that that are no concurrent writes and we get a stable view of the inode's extent layout. When the flag is given we flush all IO (and wait for ordered extents to complete) and then lock the inode in shared mode, however that leaves open the possibility that a write might happen right after the flushing and before locking the inode. So fix this by flushing again after locking the inode - we leave the initial flushing before locking the inode to avoid holding the lock and blocking other RO operations while waiting for IO and ordered extents to complete. The second flushing while holding the inode's lock will most of the time do nothing or very little since the time window for new writes to have happened is small. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Stable-dep-of: 978b63f7464a ("btrfs: fix race when detecting delalloc ranges during fiemap") Signed-off-by: Sasha Levin --- fs/btrfs/extent_io.c | 21 ++++++++------------- fs/btrfs/inode.c | 22 +++++++++++++++++++++- 2 files changed, 29 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index fc8eb8d86ca2..45d427c3033d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2953,17 +2953,15 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; - btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) - goto out_unlock; + goto out; btrfs_release_path(path); path->reada = READA_FORWARD; ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* * No file extent item found, but we may have delalloc between @@ -3010,7 +3008,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, backref_ctx, 0, 0, 0, prev_extent_end, hole_end); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. */ stopped = true; @@ -3066,7 +3064,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_gen, backref_ctx); if (ret < 0) - goto out_unlock; + goto out; else if (ret > 0) flags |= FIEMAP_EXTENT_SHARED; } @@ -3077,7 +3075,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. */ stopped = true; @@ -3088,12 +3086,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, next_item: if (fatal_signal_pending(current)) { ret = -EINTR; - goto out_unlock; + goto out; } ret = fiemap_next_leaf_item(inode, path); if (ret < 0) { - goto out_unlock; + goto out; } else if (ret > 0) { /* No more file extent items for this inode. */ break; @@ -3117,7 +3115,7 @@ check_eof_delalloc: &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) - goto out_unlock; + goto out; prev_extent_end = range_end; } @@ -3155,9 +3153,6 @@ check_eof_delalloc: } ret = emit_last_fiemap_cache(fieinfo, &cache); - -out_unlock: - btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); out: free_extent_state(delalloc_cached_state); btrfs_free_backref_share_ctx(backref_ctx); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ca79c2b8adc4..1ac14223ffb5 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7813,6 +7813,7 @@ struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter, static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { + struct btrfs_inode *btrfs_inode = BTRFS_I(inode); int ret; ret = fiemap_prep(inode, fieinfo, start, &len, 0); @@ -7838,7 +7839,26 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); + btrfs_inode_lock(btrfs_inode, BTRFS_ILOCK_SHARED); + + /* + * We did an initial flush to avoid holding the inode's lock while + * triggering writeback and waiting for the completion of IO and ordered + * extents. Now after we locked the inode we do it again, because it's + * possible a new write may have happened in between those two steps. + */ + if (fieinfo->fi_flags & FIEMAP_FLAG_SYNC) { + ret = btrfs_wait_ordered_range(inode, 0, LLONG_MAX); + if (ret) { + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + return ret; + } + } + + ret = extent_fiemap(btrfs_inode, fieinfo, start, len); + btrfs_inode_unlock(btrfs_inode, BTRFS_ILOCK_SHARED); + + return ret; } static int btrfs_writepages(struct address_space *mapping, From 49d640d2946c35a17b051d54171a032dd95b0f50 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Wed, 28 Feb 2024 11:37:56 +0000 Subject: [PATCH 055/256] btrfs: fix race when detecting delalloc ranges during fiemap [ Upstream commit 978b63f7464abcfd364a6c95f734282c50f3decf ] For fiemap we recently stopped locking the target extent range for the whole duration of the fiemap call, in order to avoid a deadlock in a scenario where the fiemap buffer happens to be a memory mapped range of the same file. This use case is very unlikely to be useful in practice but it may be triggered by fuzz testing (syzbot, etc). This however introduced a race that makes us miss delalloc ranges for file regions that are currently holes, so the caller of fiemap will not be aware that there's data for some file regions. This can be quite serious for some use cases - for example in coreutils versions before 9.0, the cp program used fiemap to detect holes and data in the source file, copying only regions with data (extents or delalloc) from the source file to the destination file in order to preserve holes (see the documentation for its --sparse command line option). This means that if cp was used with a source file that had delalloc in a hole, the destination file could end up without that data, which is effectively a data loss issue, if it happened to hit the race described below. The race happens like this: 1) Fiemap is called, without the FIEMAP_FLAG_SYNC flag, for a file that has delalloc in the file range [64M, 65M[, which is currently a hole; 2) Fiemap locks the inode in shared mode, then starts iterating the inode's subvolume tree searching for file extent items, without having the whole fiemap target range locked in the inode's io tree - the change introduced recently by commit b0ad381fa769 ("btrfs: fix deadlock with fiemap and extent locking"). It only locks ranges in the io tree when it finds a hole or prealloc extent since that commit; 3) Note that fiemap clones each leaf before using it, and this is to avoid deadlocks when locking a file range in the inode's io tree and the fiemap buffer is memory mapped to some file, because writing to the page with btrfs_page_mkwrite() will wait on any ordered extent for the page's range and the ordered extent needs to lock the range and may need to modify the same leaf, therefore leading to a deadlock on the leaf; 4) While iterating the file extent items in the cloned leaf before finding the hole in the range [64M, 65M[, the delalloc in that range is flushed and its ordered extent completes - meaning the corresponding file extent item is in the inode's subvolume tree, but not present in the cloned leaf that fiemap is iterating over; 5) When fiemap finds the hole in the [64M, 65M[ range by seeing the gap in the cloned leaf (or a file extent item with disk_bytenr == 0 in case the NO_HOLES feature is not enabled), it will lock that file range in the inode's io tree and then search for delalloc by checking for the EXTENT_DELALLOC bit in the io tree for that range and ordered extents (with btrfs_find_delalloc_in_range()). But it finds nothing since the delalloc in that range was already flushed and the ordered extent completed and is gone - as a result fiemap will not report that there's delalloc or an extent for the range [64M, 65M[, so user space will be mislead into thinking that there's a hole in that range. This could actually be sporadically triggered with test case generic/094 from fstests, which reports a missing extent/delalloc range like this: # generic/094 2s ... - output mismatch (see /home/fdmanana/git/hub/xfstests/results//generic/094.out.bad) # --- tests/generic/094.out 2020-06-10 19:29:03.830519425 +0100 # +++ /home/fdmanana/git/hub/xfstests/results//generic/094.out.bad 2024-02-28 11:00:00.381071525 +0000 # @@ -1,3 +1,9 @@ # QA output created by 094 # fiemap run with sync # fiemap run without sync # +ERROR: couldn't find extent at 7 # +map is 'HHDDHPPDPHPH' # +logical: [ 5.. 6] phys: 301517.. 301518 flags: 0x800 tot: 2 # +logical: [ 8.. 8] phys: 301520.. 301520 flags: 0x800 tot: 1 # ... # (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/generic/094.out /home/fdmanana/git/hub/xfstests/results//generic/094.out.bad' to see the entire diff) So in order to fix this, while still avoiding deadlocks in the case where the fiemap buffer is memory mapped to the same file, change fiemap to work like the following: 1) Always lock the whole range in the inode's io tree before starting to iterate the inode's subvolume tree searching for file extent items, just like we did before commit b0ad381fa769 ("btrfs: fix deadlock with fiemap and extent locking"); 2) Now instead of writing to the fiemap buffer every time we have an extent to report, write instead to a temporary buffer (1 page), and when that buffer becomes full, stop iterating the file extent items, unlock the range in the io tree, release the search path, submit all the entries kept in that buffer to the fiemap buffer, and then resume the search for file extent items after locking again the remainder of the range in the io tree. The buffer having a size of a page, allows for 146 entries in a system with 4K pages. This is a large enough value to have a good performance by avoiding too many restarts of the search for file extent items. In other words this preserves the huge performance gains made in the last two years to fiemap, while avoiding the deadlocks in case the fiemap buffer is memory mapped to the same file (useless in practice, but possible and exercised by fuzz testing and syzbot). Fixes: b0ad381fa769 ("btrfs: fix deadlock with fiemap and extent locking") Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba Signed-off-by: Sasha Levin --- fs/btrfs/extent_io.c | 221 +++++++++++++++++++++++++++++++------------ 1 file changed, 160 insertions(+), 61 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 45d427c3033d..5acb2cb79d4b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2410,12 +2410,65 @@ next: return try_release_extent_state(tree, page, mask); } +struct btrfs_fiemap_entry { + u64 offset; + u64 phys; + u64 len; + u32 flags; +}; + /* - * To cache previous fiemap extent + * Indicate the caller of emit_fiemap_extent() that it needs to unlock the file + * range from the inode's io tree, unlock the subvolume tree search path, flush + * the fiemap cache and relock the file range and research the subvolume tree. + * The value here is something negative that can't be confused with a valid + * errno value and different from 1 because that's also a return value from + * fiemap_fill_next_extent() and also it's often used to mean some btree search + * did not find a key, so make it some distinct negative value. + */ +#define BTRFS_FIEMAP_FLUSH_CACHE (-(MAX_ERRNO + 1)) + +/* + * Used to: * - * Will be used for merging fiemap extent + * - Cache the next entry to be emitted to the fiemap buffer, so that we can + * merge extents that are contiguous and can be grouped as a single one; + * + * - Store extents ready to be written to the fiemap buffer in an intermediary + * buffer. This intermediary buffer is to ensure that in case the fiemap + * buffer is memory mapped to the fiemap target file, we don't deadlock + * during btrfs_page_mkwrite(). This is because during fiemap we are locking + * an extent range in order to prevent races with delalloc flushing and + * ordered extent completion, which is needed in order to reliably detect + * delalloc in holes and prealloc extents. And this can lead to a deadlock + * if the fiemap buffer is memory mapped to the file we are running fiemap + * against (a silly, useless in practice scenario, but possible) because + * btrfs_page_mkwrite() will try to lock the same extent range. */ struct fiemap_cache { + /* An array of ready fiemap entries. */ + struct btrfs_fiemap_entry *entries; + /* Number of entries in the entries array. */ + int entries_size; + /* Index of the next entry in the entries array to write to. */ + int entries_pos; + /* + * Once the entries array is full, this indicates what's the offset for + * the next file extent item we must search for in the inode's subvolume + * tree after unlocking the extent range in the inode's io tree and + * releasing the search path. + */ + u64 next_search_offset; + /* + * This matches struct fiemap_extent_info::fi_mapped_extents, we use it + * to count ourselves emitted extents and stop instead of relying on + * fiemap_fill_next_extent() because we buffer ready fiemap entries at + * the @entries array, and we want to stop as soon as we hit the max + * amount of extents to map, not just to save time but also to make the + * logic at extent_fiemap() simpler. + */ + unsigned int extents_mapped; + /* Fields for the cached extent (unsubmitted, not ready, extent). */ u64 offset; u64 phys; u64 len; @@ -2423,6 +2476,28 @@ struct fiemap_cache { bool cached; }; +static int flush_fiemap_cache(struct fiemap_extent_info *fieinfo, + struct fiemap_cache *cache) +{ + for (int i = 0; i < cache->entries_pos; i++) { + struct btrfs_fiemap_entry *entry = &cache->entries[i]; + int ret; + + ret = fiemap_fill_next_extent(fieinfo, entry->offset, + entry->phys, entry->len, + entry->flags); + /* + * Ignore 1 (reached max entries) because we keep track of that + * ourselves in emit_fiemap_extent(). + */ + if (ret < 0) + return ret; + } + cache->entries_pos = 0; + + return 0; +} + /* * Helper to submit fiemap extent. * @@ -2437,8 +2512,8 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, struct fiemap_cache *cache, u64 offset, u64 phys, u64 len, u32 flags) { + struct btrfs_fiemap_entry *entry; u64 cache_end; - int ret = 0; /* Set at the end of extent_fiemap(). */ ASSERT((flags & FIEMAP_EXTENT_LAST) == 0); @@ -2451,7 +2526,9 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, * find an extent that starts at an offset behind the end offset of the * previous extent we processed. This happens if fiemap is called * without FIEMAP_FLAG_SYNC and there are ordered extents completing - * while we call btrfs_next_leaf() (through fiemap_next_leaf_item()). + * after we had to unlock the file range, release the search path, emit + * the fiemap extents stored in the buffer (cache->entries array) and + * the lock the remainder of the range and re-search the btree. * * For example we are in leaf X processing its last item, which is the * file extent item for file range [512K, 1M[, and after @@ -2564,11 +2641,35 @@ static int emit_fiemap_extent(struct fiemap_extent_info *fieinfo, emit: /* Not mergeable, need to submit cached one */ - ret = fiemap_fill_next_extent(fieinfo, cache->offset, cache->phys, - cache->len, cache->flags); - cache->cached = false; - if (ret) - return ret; + + if (cache->entries_pos == cache->entries_size) { + /* + * We will need to research for the end offset of the last + * stored extent and not from the current offset, because after + * unlocking the range and releasing the path, if there's a hole + * between that end offset and this current offset, a new extent + * may have been inserted due to a new write, so we don't want + * to miss it. + */ + entry = &cache->entries[cache->entries_size - 1]; + cache->next_search_offset = entry->offset + entry->len; + cache->cached = false; + + return BTRFS_FIEMAP_FLUSH_CACHE; + } + + entry = &cache->entries[cache->entries_pos]; + entry->offset = cache->offset; + entry->phys = cache->phys; + entry->len = cache->len; + entry->flags = cache->flags; + cache->entries_pos++; + cache->extents_mapped++; + + if (cache->extents_mapped == fieinfo->fi_extents_max) { + cache->cached = false; + return 1; + } assign: cache->cached = true; cache->offset = offset; @@ -2694,8 +2795,8 @@ static int fiemap_search_slot(struct btrfs_inode *inode, struct btrfs_path *path * neighbour leaf). * We also need the private clone because holding a read lock on an * extent buffer of the subvolume's b+tree will make lockdep unhappy - * when we call fiemap_fill_next_extent(), because that may cause a page - * fault when filling the user space buffer with fiemap data. + * when we check if extents are shared, as backref walking may need to + * lock the same leaf we are processing. */ clone = btrfs_clone_extent_buffer(path->nodes[0]); if (!clone) @@ -2735,34 +2836,16 @@ static int fiemap_process_hole(struct btrfs_inode *inode, * it beyond i_size. */ while (cur_offset < end && cur_offset < i_size) { - struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; u64 prealloc_start; - u64 lockstart; - u64 lockend; u64 prealloc_len = 0; bool delalloc; - lockstart = round_down(cur_offset, inode->root->fs_info->sectorsize); - lockend = round_up(end, inode->root->fs_info->sectorsize); - - /* - * We are only locking for the delalloc range because that's the - * only thing that can change here. With fiemap we have a lock - * on the inode, so no buffered or direct writes can happen. - * - * However mmaps and normal page writeback will cause this to - * change arbitrarily. We have to lock the extent lock here to - * make sure that nobody messes with the tree while we're doing - * btrfs_find_delalloc_in_range. - */ - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, cur_offset, end, delalloc_cached_state, &delalloc_start, &delalloc_end); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) break; @@ -2930,6 +3013,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { const u64 ino = btrfs_ino(inode); + struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; struct btrfs_path *path; struct fiemap_cache cache = { 0 }; @@ -2942,26 +3026,33 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, bool stopped = false; int ret; + cache.entries_size = PAGE_SIZE / sizeof(struct btrfs_fiemap_entry); + cache.entries = kmalloc_array(cache.entries_size, + sizeof(struct btrfs_fiemap_entry), + GFP_KERNEL); backref_ctx = btrfs_alloc_backref_share_check_ctx(); path = btrfs_alloc_path(); - if (!backref_ctx || !path) { + if (!cache.entries || !backref_ctx || !path) { ret = -ENOMEM; goto out; } +restart: range_start = round_down(start, sectorsize); range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; + lock_extent(&inode->io_tree, range_start, range_end, &cached_state); + ret = fiemap_find_last_extent_offset(inode, path, &last_extent_end); if (ret < 0) - goto out; + goto out_unlock; btrfs_release_path(path); path->reada = READA_FORWARD; ret = fiemap_search_slot(inode, path, range_start); if (ret < 0) { - goto out; + goto out_unlock; } else if (ret > 0) { /* * No file extent item found, but we may have delalloc between @@ -3008,7 +3099,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, backref_ctx, 0, 0, 0, prev_extent_end, hole_end); if (ret < 0) { - goto out; + goto out_unlock; } else if (ret > 0) { /* fiemap_fill_next_extent() told us to stop. */ stopped = true; @@ -3064,7 +3155,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, extent_gen, backref_ctx); if (ret < 0) - goto out; + goto out_unlock; else if (ret > 0) flags |= FIEMAP_EXTENT_SHARED; } @@ -3075,9 +3166,9 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, } if (ret < 0) { - goto out; + goto out_unlock; } else if (ret > 0) { - /* fiemap_fill_next_extent() told us to stop. */ + /* emit_fiemap_extent() told us to stop. */ stopped = true; break; } @@ -3086,12 +3177,12 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, next_item: if (fatal_signal_pending(current)) { ret = -EINTR; - goto out; + goto out_unlock; } ret = fiemap_next_leaf_item(inode, path); if (ret < 0) { - goto out; + goto out_unlock; } else if (ret > 0) { /* No more file extent items for this inode. */ break; @@ -3100,22 +3191,12 @@ next_item: } check_eof_delalloc: - /* - * Release (and free) the path before emitting any final entries to - * fiemap_fill_next_extent() to keep lockdep happy. This is because - * once we find no more file extent items exist, we may have a - * non-cloned leaf, and fiemap_fill_next_extent() can trigger page - * faults when copying data to the user space buffer. - */ - btrfs_free_path(path); - path = NULL; - if (!stopped && prev_extent_end < range_end) { ret = fiemap_process_hole(inode, fieinfo, &cache, &delalloc_cached_state, backref_ctx, 0, 0, 0, prev_extent_end, range_end - 1); if (ret < 0) - goto out; + goto out_unlock; prev_extent_end = range_end; } @@ -3123,28 +3204,16 @@ check_eof_delalloc: const u64 i_size = i_size_read(&inode->vfs_inode); if (prev_extent_end < i_size) { - struct extent_state *cached_state = NULL; u64 delalloc_start; u64 delalloc_end; - u64 lockstart; - u64 lockend; bool delalloc; - lockstart = round_down(prev_extent_end, sectorsize); - lockend = round_up(i_size, sectorsize); - - /* - * See the comment in fiemap_process_hole as to why - * we're doing the locking here. - */ - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); delalloc = btrfs_find_delalloc_in_range(inode, prev_extent_end, i_size - 1, &delalloc_cached_state, &delalloc_start, &delalloc_end); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); if (!delalloc) cache.flags |= FIEMAP_EXTENT_LAST; } else { @@ -3152,9 +3221,39 @@ check_eof_delalloc: } } +out_unlock: + unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + + if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { + btrfs_release_path(path); + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + len -= cache.next_search_offset - start; + start = cache.next_search_offset; + goto restart; + } else if (ret < 0) { + goto out; + } + + /* + * Must free the path before emitting to the fiemap buffer because we + * may have a non-cloned leaf and if the fiemap buffer is memory mapped + * to a file, a write into it (through btrfs_page_mkwrite()) may trigger + * waiting for an ordered extent that in order to complete needs to + * modify that leaf, therefore leading to a deadlock. + */ + btrfs_free_path(path); + path = NULL; + + ret = flush_fiemap_cache(fieinfo, &cache); + if (ret) + goto out; + ret = emit_last_fiemap_cache(fieinfo, &cache); out: free_extent_state(delalloc_cached_state); + kfree(cache.entries); btrfs_free_backref_share_ctx(backref_ctx); btrfs_free_path(path); return ret; From 936e59cb56ea1e7d2dc5635b48f72db300379d75 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Tue, 31 Oct 2023 23:30:59 +0100 Subject: [PATCH 056/256] x86/CPU/AMD: Add ZenX generations flags [ Upstream commit 30fa92832f405d5ac9f263e99f62445fa3084008 ] Add X86_FEATURE flags for each Zen generation. They should be used from now on instead of checking f/m/s. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Acked-by: Thomas Gleixner Link: http://lore.kernel.org/r/20231120104152.13740-2-bp@alien8.de Stable-dep-of: c7b2edd8377b ("perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later") Signed-off-by: Sasha Levin --- arch/x86/include/asm/cpufeatures.h | 5 ++- arch/x86/kernel/cpu/amd.c | 70 +++++++++++++++++++++++++++++- 2 files changed, 72 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index bd33f6366c80..1f9db287165a 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -218,7 +218,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU based on Zen microarchitecture */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -312,6 +312,9 @@ #define X86_FEATURE_SRSO_ALIAS (11*32+25) /* "" AMD BTB untrain RETs through aliasing */ #define X86_FEATURE_IBPB_ON_VMEXIT (11*32+26) /* "" Issue an IBPB only on VMEXIT */ #define X86_FEATURE_APIC_MSRS_FENCE (11*32+27) /* "" IA32_TSC_DEADLINE and X2APIC MSRs need fencing */ +#define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ +#define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ +#define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 031bca974fbf..5391385707b3 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -620,6 +620,49 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) } resctrl_cpu_detect(c); + + /* Figure out Zen generations: */ + switch (c->x86) { + case 0x17: { + switch (c->x86_model) { + case 0x00 ... 0x2f: + case 0x50 ... 0x5f: + setup_force_cpu_cap(X86_FEATURE_ZEN); + break; + case 0x30 ... 0x4f: + case 0x60 ... 0x7f: + case 0x90 ... 0x91: + case 0xa0 ... 0xaf: + setup_force_cpu_cap(X86_FEATURE_ZEN2); + break; + default: + goto warn; + } + break; + } + case 0x19: { + switch (c->x86_model) { + case 0x00 ... 0x0f: + case 0x20 ... 0x5f: + setup_force_cpu_cap(X86_FEATURE_ZEN3); + break; + case 0x10 ... 0x1f: + case 0x60 ... 0xaf: + setup_force_cpu_cap(X86_FEATURE_ZEN4); + break; + default: + goto warn; + } + break; + } + default: + break; + } + + return; + +warn: + WARN_ONCE(1, "Family 0x%x, model: 0x%x??\n", c->x86, c->x86_model); } static void early_detect_mem_encrypt(struct cpuinfo_x86 *c) @@ -978,8 +1021,6 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) static void init_amd_zn(struct cpuinfo_x86 *c) { - set_cpu_cap(c, X86_FEATURE_ZEN); - #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif @@ -1042,6 +1083,22 @@ static void zenbleed_check(struct cpuinfo_x86 *c) } } +static void init_amd_zen(struct cpuinfo_x86 *c) +{ +} + +static void init_amd_zen2(struct cpuinfo_x86 *c) +{ +} + +static void init_amd_zen3(struct cpuinfo_x86 *c) +{ +} + +static void init_amd_zen4(struct cpuinfo_x86 *c) +{ +} + static void init_amd(struct cpuinfo_x86 *c) { early_init_amd(c); @@ -1080,6 +1137,15 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x19: init_amd_zn(c); break; } + if (boot_cpu_has(X86_FEATURE_ZEN)) + init_amd_zen(c); + else if (boot_cpu_has(X86_FEATURE_ZEN2)) + init_amd_zen2(c); + else if (boot_cpu_has(X86_FEATURE_ZEN3)) + init_amd_zen3(c); + else if (boot_cpu_has(X86_FEATURE_ZEN4)) + init_amd_zen4(c); + /* * Enable workaround for FXSAVE leak on CPUs * without a XSaveErPtr feature From eae590201d4a7f9aa64f31db4c3074fb1d6368a6 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 11:14:59 +0100 Subject: [PATCH 057/256] x86/CPU/AMD: Carve out the erratum 1386 fix [ Upstream commit a7c32a1ae9ee43abfe884f5af376877c4301d166 ] Call it on the affected CPU generations. No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-3-bp@alien8.de Stable-dep-of: c7b2edd8377b ("perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later") Signed-off-by: Sasha Levin --- arch/x86/kernel/cpu/amd.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 5391385707b3..28c3a1045b06 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -988,6 +988,19 @@ static void init_amd_bd(struct cpuinfo_x86 *c) clear_rdrand_cpuid_bit(c); } +static void fix_erratum_1386(struct cpuinfo_x86 *c) +{ + /* + * Work around Erratum 1386. The XSAVES instruction malfunctions in + * certain circumstances on Zen1/2 uarch, and not all parts have had + * updated microcode at the time of writing (March 2023). + * + * Affected parts all have no supervisor XSAVE states, meaning that + * the XSAVEC instruction (which works fine) is equivalent. + */ + clear_cpu_cap(c, X86_FEATURE_XSAVES); +} + void init_spectral_chicken(struct cpuinfo_x86 *c) { #ifdef CONFIG_CPU_UNRET_ENTRY @@ -1008,15 +1021,6 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) } } #endif - /* - * Work around Erratum 1386. The XSAVES instruction malfunctions in - * certain circumstances on Zen1/2 uarch, and not all parts have had - * updated microcode at the time of writing (March 2023). - * - * Affected parts all have no supervisor XSAVE states, meaning that - * the XSAVEC instruction (which works fine) is equivalent. - */ - clear_cpu_cap(c, X86_FEATURE_XSAVES); } static void init_amd_zn(struct cpuinfo_x86 *c) @@ -1085,10 +1089,12 @@ static void zenbleed_check(struct cpuinfo_x86 *c) static void init_amd_zen(struct cpuinfo_x86 *c) { + fix_erratum_1386(c); } static void init_amd_zen2(struct cpuinfo_x86 *c) { + fix_erratum_1386(c); } static void init_amd_zen3(struct cpuinfo_x86 *c) From 2577e2a7cae00c786d49ac5ae08d668174d3f38f Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 12:31:44 +0100 Subject: [PATCH 058/256] x86/CPU/AMD: Move erratum 1076 fix into the Zen1 init function [ Upstream commit 0da91912fc150d6d321b15e648bead202ced1a27 ] No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-5-bp@alien8.de Stable-dep-of: c7b2edd8377b ("perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later") Signed-off-by: Sasha Levin --- arch/x86/kernel/cpu/amd.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 28c3a1045b06..71503181bffd 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -1028,6 +1028,11 @@ static void init_amd_zn(struct cpuinfo_x86 *c) #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif +} + +static void init_amd_zen(struct cpuinfo_x86 *c) +{ + fix_erratum_1386(c); /* Fix up CPUID bits, but only if not virtualised. */ if (!cpu_has(c, X86_FEATURE_HYPERVISOR)) { @@ -1087,11 +1092,6 @@ static void zenbleed_check(struct cpuinfo_x86 *c) } } -static void init_amd_zen(struct cpuinfo_x86 *c) -{ - fix_erratum_1386(c); -} - static void init_amd_zen2(struct cpuinfo_x86 *c) { fix_erratum_1386(c); From 702a65272da64980ca2dc9bd8e26e260ad276ac7 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 12:38:35 +0100 Subject: [PATCH 059/256] x86/CPU/AMD: Move Zenbleed check to the Zen2 init function [ Upstream commit f69759be251dce722942594fbc62e53a40822a82 ] Prefix it properly so that it is clear which generation it is dealing with. No functional changes. Signed-off-by: Borislav Petkov (AMD) Link: http://lore.kernel.org/r/20231120104152.13740-8-bp@alien8.de Stable-dep-of: c7b2edd8377b ("perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later") Signed-off-by: Sasha Levin --- arch/x86/kernel/cpu/amd.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 71503181bffd..d8a0dc01a7db 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -70,12 +70,6 @@ static const int amd_erratum_383[] = static const int amd_erratum_1054[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); -static const int amd_zenbleed[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x30, 0x0, 0x4f, 0xf), - AMD_MODEL_RANGE(0x17, 0x60, 0x0, 0x7f, 0xf), - AMD_MODEL_RANGE(0x17, 0x90, 0x0, 0x91, 0xf), - AMD_MODEL_RANGE(0x17, 0xa0, 0x0, 0xaf, 0xf)); - static const int amd_div0[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); @@ -1073,11 +1067,8 @@ static bool cpu_has_zenbleed_microcode(void) return true; } -static void zenbleed_check(struct cpuinfo_x86 *c) +static void zen2_zenbleed_check(struct cpuinfo_x86 *c) { - if (!cpu_has_amd_erratum(c, amd_zenbleed)) - return; - if (cpu_has(c, X86_FEATURE_HYPERVISOR)) return; @@ -1095,6 +1086,7 @@ static void zenbleed_check(struct cpuinfo_x86 *c) static void init_amd_zen2(struct cpuinfo_x86 *c) { fix_erratum_1386(c); + zen2_zenbleed_check(c); } static void init_amd_zen3(struct cpuinfo_x86 *c) @@ -1219,8 +1211,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_AUTOIBRS)) WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); - zenbleed_check(c); - if (cpu_has_amd_erratum(c, amd_div0)) { pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); setup_force_cpu_bug(X86_BUG_DIV0); @@ -1385,7 +1375,7 @@ static void zenbleed_check_cpu(void *unused) { struct cpuinfo_x86 *c = &cpu_data(smp_processor_id()); - zenbleed_check(c); + zen2_zenbleed_check(c); } void amd_check_microcode(void) From 82454981660945fc7022c799c21e9811b0ebfb96 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Wed, 1 Nov 2023 12:52:01 +0100 Subject: [PATCH 060/256] x86/CPU/AMD: Move the DIV0 bug detection to the Zen1 init function [ Upstream commit bfff3c6692ce64fa9d86eb829d18229c307a0855 ] No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-9-bp@alien8.de Stable-dep-of: c7b2edd8377b ("perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later") Signed-off-by: Sasha Levin --- arch/x86/kernel/cpu/amd.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index d8a0dc01a7db..f4373530c7de 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -70,10 +70,6 @@ static const int amd_erratum_383[] = static const int amd_erratum_1054[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); -static const int amd_div0[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0x00, 0x0, 0x2f, 0xf), - AMD_MODEL_RANGE(0x17, 0x50, 0x0, 0x5f, 0xf)); - static const int amd_erratum_1485[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); @@ -1043,6 +1039,9 @@ static void init_amd_zen(struct cpuinfo_x86 *c) if (c->x86 == 0x19 && !cpu_has(c, X86_FEATURE_BTC_NO)) set_cpu_cap(c, X86_FEATURE_BTC_NO); } + + pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); + setup_force_cpu_bug(X86_BUG_DIV0); } static bool cpu_has_zenbleed_microcode(void) @@ -1211,11 +1210,6 @@ static void init_amd(struct cpuinfo_x86 *c) cpu_has(c, X86_FEATURE_AUTOIBRS)) WARN_ON_ONCE(msr_set_bit(MSR_EFER, _EFER_AUTOIBRS)); - if (cpu_has_amd_erratum(c, amd_div0)) { - pr_notice_once("AMD Zen1 DIV0 bug detected. Disable SMT for full protection.\n"); - setup_force_cpu_bug(X86_BUG_DIV0); - } - if (!cpu_has(c, X86_FEATURE_HYPERVISOR) && cpu_has_amd_erratum(c, amd_erratum_1485)) msr_set_bit(MSR_ZEN4_BP_CFG, MSR_ZEN4_BP_CFG_SHARED_BTB_FIX_BIT); From 54273025be0cffcef3964d5ebdc64aa678bdb684 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Fri, 3 Nov 2023 19:53:49 +0100 Subject: [PATCH 061/256] x86/CPU/AMD: Get rid of amd_erratum_1054[] [ Upstream commit 54c33e23f75d5c9925495231c57d3319336722ef ] No functional changes. Signed-off-by: Borislav Petkov (AMD) Reviewed-by: Nikolay Borisov Link: http://lore.kernel.org/r/20231120104152.13740-10-bp@alien8.de Stable-dep-of: c7b2edd8377b ("perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later") Signed-off-by: Sasha Levin --- arch/x86/kernel/cpu/amd.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index f4373530c7de..98fa23ef97df 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -66,10 +66,6 @@ static const int amd_erratum_400[] = static const int amd_erratum_383[] = AMD_OSVW_ERRATUM(3, AMD_MODEL_RANGE(0x10, 0, 0, 0xff, 0xf)); -/* #1054: Instructions Retired Performance Counter May Be Inaccurate */ -static const int amd_erratum_1054[] = - AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x17, 0, 0, 0x2f, 0xf)); - static const int amd_erratum_1485[] = AMD_LEGACY_ERRATUM(AMD_MODEL_RANGE(0x19, 0x10, 0x0, 0x1f, 0xf), AMD_MODEL_RANGE(0x19, 0x60, 0x0, 0xaf, 0xf)); @@ -1194,7 +1190,7 @@ static void init_amd(struct cpuinfo_x86 *c) * Counter May Be Inaccurate". */ if (cpu_has(c, X86_FEATURE_IRPERF) && - !cpu_has_amd_erratum(c, amd_erratum_1054)) + (boot_cpu_has(X86_FEATURE_ZEN) && c->x86_model > 0x2f)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); From d2be2f872fe7ba865a37bb50d5def771e9cc1901 Mon Sep 17 00:00:00 2001 From: "Borislav Petkov (AMD)" Date: Sat, 2 Dec 2023 12:50:23 +0100 Subject: [PATCH 062/256] x86/CPU/AMD: Add X86_FEATURE_ZEN1 [ Upstream commit 232afb557835d6f6859c73bf610bad308c96b131 ] Add a synthetic feature flag specifically for first generation Zen machines. There's need to have a generic flag for all Zen generations so make X86_FEATURE_ZEN be that flag. Fixes: 30fa92832f40 ("x86/CPU/AMD: Add ZenX generations flags") Suggested-by: Brian Gerst Suggested-by: Tom Lendacky Signed-off-by: Borislav Petkov (AMD) Link: https://lore.kernel.org/r/dc3835e3-0731-4230-bbb9-336bbe3d042b@amd.com Stable-dep-of: c7b2edd8377b ("perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later") Signed-off-by: Sasha Levin --- arch/x86/include/asm/cpufeatures.h | 3 ++- arch/x86/kernel/cpu/amd.c | 11 ++++++----- tools/arch/x86/include/asm/cpufeatures.h | 2 +- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index 1f9db287165a..bc66aec9139e 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -218,7 +218,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN ( 7*32+28) /* "" CPU based on Zen microarchitecture */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ @@ -315,6 +315,7 @@ #define X86_FEATURE_ZEN2 (11*32+28) /* "" CPU based on Zen2 microarchitecture */ #define X86_FEATURE_ZEN3 (11*32+29) /* "" CPU based on Zen3 microarchitecture */ #define X86_FEATURE_ZEN4 (11*32+30) /* "" CPU based on Zen4 microarchitecture */ +#define X86_FEATURE_ZEN1 (11*32+31) /* "" CPU based on Zen1 microarchitecture */ /* Intel-defined CPU features, CPUID level 0x00000007:1 (EAX), word 12 */ #define X86_FEATURE_AVX_VNNI (12*32+ 4) /* AVX VNNI instructions */ diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 98fa23ef97df..9fd91022d92d 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -613,7 +613,7 @@ static void bsp_init_amd(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x00 ... 0x2f: case 0x50 ... 0x5f: - setup_force_cpu_cap(X86_FEATURE_ZEN); + setup_force_cpu_cap(X86_FEATURE_ZEN1); break; case 0x30 ... 0x4f: case 0x60 ... 0x7f: @@ -1011,12 +1011,13 @@ void init_spectral_chicken(struct cpuinfo_x86 *c) static void init_amd_zn(struct cpuinfo_x86 *c) { + setup_force_cpu_cap(X86_FEATURE_ZEN); #ifdef CONFIG_NUMA node_reclaim_distance = 32; #endif } -static void init_amd_zen(struct cpuinfo_x86 *c) +static void init_amd_zen1(struct cpuinfo_x86 *c) { fix_erratum_1386(c); @@ -1130,8 +1131,8 @@ static void init_amd(struct cpuinfo_x86 *c) case 0x19: init_amd_zn(c); break; } - if (boot_cpu_has(X86_FEATURE_ZEN)) - init_amd_zen(c); + if (boot_cpu_has(X86_FEATURE_ZEN1)) + init_amd_zen1(c); else if (boot_cpu_has(X86_FEATURE_ZEN2)) init_amd_zen2(c); else if (boot_cpu_has(X86_FEATURE_ZEN3)) @@ -1190,7 +1191,7 @@ static void init_amd(struct cpuinfo_x86 *c) * Counter May Be Inaccurate". */ if (cpu_has(c, X86_FEATURE_IRPERF) && - (boot_cpu_has(X86_FEATURE_ZEN) && c->x86_model > 0x2f)) + (boot_cpu_has(X86_FEATURE_ZEN1) && c->x86_model > 0x2f)) msr_set_bit(MSR_K7_HWCR, MSR_K7_HWCR_IRPERF_EN_BIT); check_null_seg_clears_base(c); diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h index 798e60b5454b..845a4023ba44 100644 --- a/tools/arch/x86/include/asm/cpufeatures.h +++ b/tools/arch/x86/include/asm/cpufeatures.h @@ -219,7 +219,7 @@ #define X86_FEATURE_IBRS ( 7*32+25) /* Indirect Branch Restricted Speculation */ #define X86_FEATURE_IBPB ( 7*32+26) /* Indirect Branch Prediction Barrier */ #define X86_FEATURE_STIBP ( 7*32+27) /* Single Thread Indirect Branch Predictors */ -#define X86_FEATURE_ZEN (7*32+28) /* "" CPU based on Zen microarchitecture */ +#define X86_FEATURE_ZEN ( 7*32+28) /* "" Generic flag for all Zen and newer */ #define X86_FEATURE_L1TF_PTEINV ( 7*32+29) /* "" L1TF workaround PTE inversion */ #define X86_FEATURE_IBRS_ENHANCED ( 7*32+30) /* Enhanced IBRS */ #define X86_FEATURE_MSR_IA32_FEAT_CTL ( 7*32+31) /* "" MSR IA32_FEAT_CTL configured */ From 69fe5f177ad39c0ed2186e2391cbee1908096616 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 25 Mar 2024 13:17:53 +0530 Subject: [PATCH 063/256] perf/x86/amd/core: Update and fix stalled-cycles-* events for Zen 2 and later [ Upstream commit c7b2edd8377be983442c1344cb940cd2ac21b601 ] AMD processors based on Zen 2 and later microarchitectures do not support PMCx087 (instruction pipe stalls) which is used as the backing event for "stalled-cycles-frontend" and "stalled-cycles-backend". Use PMCx0A9 (cycles where micro-op queue is empty) instead to count frontend stalls and remove the entry for backend stalls since there is no direct replacement. Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Reviewed-by: Ian Rogers Fixes: 3fe3331bb285 ("perf/x86/amd: Add event map for AMD Family 17h") Link: https://lore.kernel.org/r/03d7fc8fa2a28f9be732116009025bdec1b3ec97.1711352180.git.sandipan.das@amd.com Signed-off-by: Sasha Levin --- arch/x86/events/amd/core.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 5365d6acbf09..b30349eeb767 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -250,7 +250,7 @@ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = /* * AMD Performance Monitor Family 17h and later: */ -static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +static const u64 amd_zen1_perfmon_event_map[PERF_COUNT_HW_MAX] = { [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, @@ -262,10 +262,24 @@ static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; +static const u64 amd_zen2_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0964, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00a9, +}; + static u64 amd_pmu_event_map(int hw_event) { - if (boot_cpu_data.x86 >= 0x17) - return amd_f17h_perfmon_event_map[hw_event]; + if (cpu_feature_enabled(X86_FEATURE_ZEN2) || boot_cpu_data.x86 >= 0x19) + return amd_zen2_perfmon_event_map[hw_event]; + + if (cpu_feature_enabled(X86_FEATURE_ZEN1)) + return amd_zen1_perfmon_event_map[hw_event]; return amd_perfmon_event_map[hw_event]; } From 56e7373f9a67843caa6cf14fd8a6805aa41dfa11 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 25 Mar 2024 13:01:44 +0530 Subject: [PATCH 064/256] x86/cpufeatures: Add new word for scattered features [ Upstream commit 7f274e609f3d5f45c22b1dd59053f6764458b492 ] Add a new word for scattered features because all free bits among the existing Linux-defined auxiliary flags have been exhausted. Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/8380d2a0da469a1f0ad75b8954a79fb689599ff6.1711091584.git.sandipan.das@amd.com Stable-dep-of: 598c2fafc06f ("perf/x86/amd/lbr: Use freeze based on availability") Signed-off-by: Sasha Levin --- arch/x86/include/asm/cpufeature.h | 6 ++++-- arch/x86/include/asm/cpufeatures.h | 2 +- arch/x86/include/asm/disabled-features.h | 3 ++- arch/x86/include/asm/required-features.h | 3 ++- 4 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index a1273698fc43..42157ddcc09d 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -91,8 +91,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 20, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(REQUIRED_MASK, 21, feature_bit) || \ REQUIRED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 21)) + BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define DISABLED_MASK_BIT_SET(feature_bit) \ ( CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 0, feature_bit) || \ @@ -116,8 +117,9 @@ extern const char * const x86_bug_flags[NBUGINTS*32]; CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 18, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 19, feature_bit) || \ CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 20, feature_bit) || \ + CHECK_BIT_IN_MASK_WORD(DISABLED_MASK, 21, feature_bit) || \ DISABLED_MASK_CHECK || \ - BUILD_BUG_ON_ZERO(NCAPINTS != 21)) + BUILD_BUG_ON_ZERO(NCAPINTS != 22)) #define cpu_has(c, bit) \ (__builtin_constant_p(bit) && REQUIRED_MASK_BIT_SET(bit) ? 1 : \ diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index bc66aec9139e..a42db7bbe593 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -13,7 +13,7 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 21 /* N 32-bit words worth of info */ +#define NCAPINTS 22 /* N 32-bit words worth of info */ #define NBUGINTS 2 /* N 32-bit bug flags */ /* diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h index 702d93fdd10e..88fcf08458d9 100644 --- a/arch/x86/include/asm/disabled-features.h +++ b/arch/x86/include/asm/disabled-features.h @@ -143,6 +143,7 @@ #define DISABLED_MASK18 (DISABLE_IBT) #define DISABLED_MASK19 0 #define DISABLED_MASK20 0 -#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define DISABLED_MASK21 0 +#define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_DISABLED_FEATURES_H */ diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index 7ba1726b71c7..e9187ddd3d1f 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -99,6 +99,7 @@ #define REQUIRED_MASK18 0 #define REQUIRED_MASK19 0 #define REQUIRED_MASK20 0 -#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 21) +#define REQUIRED_MASK21 0 +#define REQUIRED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 22) #endif /* _ASM_X86_REQUIRED_FEATURES_H */ From 55ed6c477872bac7be7e688a6c3af57654ee0049 Mon Sep 17 00:00:00 2001 From: Sandipan Das Date: Mon, 25 Mar 2024 13:01:45 +0530 Subject: [PATCH 065/256] perf/x86/amd/lbr: Use freeze based on availability [ Upstream commit 598c2fafc06fe5c56a1a415fb7b544b31453d637 ] Currently, the LBR code assumes that LBR Freeze is supported on all processors when X86_FEATURE_AMD_LBR_V2 is available i.e. CPUID leaf 0x80000022[EAX] bit 1 is set. This is incorrect as the availability of the feature is additionally dependent on CPUID leaf 0x80000022[EAX] bit 2 being set, which may not be set for all Zen 4 processors. Define a new feature bit for LBR and PMC freeze and set the freeze enable bit (FLBRI) in DebugCtl (MSR 0x1d9) conditionally. It should still be possible to use LBR without freeze for profile-guided optimization of user programs by using an user-only branch filter during profiling. When the user-only filter is enabled, branches are no longer recorded after the transition to CPL 0 upon PMI arrival. When branch entries are read in the PMI handler, the branch stack does not change. E.g. $ perf record -j any,u -e ex_ret_brn_tkn ./workload Since the feature bit is visible under flags in /proc/cpuinfo, it can be used to determine the feasibility of use-cases which require LBR Freeze to be supported by the hardware such as profile-guided optimization of kernels. Fixes: ca5b7c0d9621 ("perf/x86/amd/lbr: Add LbrExtV2 branch record support") Signed-off-by: Sandipan Das Signed-off-by: Ingo Molnar Link: https://lore.kernel.org/r/69a453c97cfd11c6f2584b19f937fe6df741510f.1711091584.git.sandipan.das@amd.com Signed-off-by: Sasha Levin --- arch/x86/events/amd/core.c | 4 ++-- arch/x86/events/amd/lbr.c | 16 ++++++++++------ arch/x86/include/asm/cpufeatures.h | 8 ++++++++ arch/x86/kernel/cpu/scattered.c | 1 + 4 files changed, 21 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index b30349eeb767..8ed10366c4a2 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -918,8 +918,8 @@ static int amd_pmu_v2_handle_irq(struct pt_regs *regs) if (!status) goto done; - /* Read branch records before unfreezing */ - if (status & GLOBAL_STATUS_LBRS_FROZEN) { + /* Read branch records */ + if (x86_pmu.lbr_nr) { amd_pmu_lbr_read(); status &= ~GLOBAL_STATUS_LBRS_FROZEN; } diff --git a/arch/x86/events/amd/lbr.c b/arch/x86/events/amd/lbr.c index eb31f850841a..110e34c59643 100644 --- a/arch/x86/events/amd/lbr.c +++ b/arch/x86/events/amd/lbr.c @@ -400,10 +400,12 @@ void amd_pmu_lbr_enable_all(void) wrmsrl(MSR_AMD64_LBR_SELECT, lbr_select); } - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); - rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); + if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + } - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl | DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg | DBG_EXTN_CFG_LBRV2EN); } @@ -416,10 +418,12 @@ void amd_pmu_lbr_disable_all(void) return; rdmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg); - rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); - wrmsrl(MSR_AMD_DBG_EXTN_CFG, dbg_extn_cfg & ~DBG_EXTN_CFG_LBRV2EN); - wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + + if (cpu_feature_enabled(X86_FEATURE_AMD_LBR_PMC_FREEZE)) { + rdmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl); + wrmsrl(MSR_IA32_DEBUGCTLMSR, dbg_ctl & ~DEBUGCTLMSR_FREEZE_LBRS_ON_PMI); + } } __init int amd_pmu_lbr_init(void) diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h index a42db7bbe593..9b2b99670d36 100644 --- a/arch/x86/include/asm/cpufeatures.h +++ b/arch/x86/include/asm/cpufeatures.h @@ -456,6 +456,14 @@ #define X86_FEATURE_IBPB_BRTYPE (20*32+28) /* "" MSR_PRED_CMD[IBPB] flushes all branch type predictions */ #define X86_FEATURE_SRSO_NO (20*32+29) /* "" CPU is not affected by SRSO */ +/* + * Extended auxiliary flags: Linux defined - for features scattered in various + * CPUID levels like 0x80000022, etc. + * + * Reuse free bits when adding new feature flags! + */ +#define X86_FEATURE_AMD_LBR_PMC_FREEZE (21*32+ 0) /* AMD LBR and PMC Freeze */ + /* * BUG word(s) */ diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c index 0dad49a09b7a..a515328d9d7d 100644 --- a/arch/x86/kernel/cpu/scattered.c +++ b/arch/x86/kernel/cpu/scattered.c @@ -49,6 +49,7 @@ static const struct cpuid_bit cpuid_bits[] = { { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, { 0, 0, 0, 0, 0 } }; From 2bc92c61c5417982e7b92ba628281f411d31013c Mon Sep 17 00:00:00 2001 From: Jack Brennen Date: Tue, 26 Sep 2023 08:40:44 -0400 Subject: [PATCH 066/256] modpost: Optimize symbol search from linear to binary search [ Upstream commit 4074532758c5c367d3fcb8d124150824a254659d ] Modify modpost to use binary search for converting addresses back into symbol references. Previously it used linear search. This change saves a few seconds of wall time for defconfig builds, but can save several minutes on allyesconfigs. Before: $ make LLVM=1 -j128 allyesconfig vmlinux -s KCFLAGS="-Wno-error" $ time scripts/mod/modpost -M -m -a -N -o vmlinux.symvers vmlinux.o 198.38user 1.27system 3:19.71elapsed After: $ make LLVM=1 -j128 allyesconfig vmlinux -s KCFLAGS="-Wno-error" $ time scripts/mod/modpost -M -m -a -N -o vmlinux.symvers vmlinux.o 11.91user 0.85system 0:12.78elapsed Signed-off-by: Jack Brennen Tested-by: Nick Desaulniers Signed-off-by: Masahiro Yamada Stable-dep-of: 1102f9f85bf6 ("modpost: do not make find_tosym() return NULL") Signed-off-by: Sasha Levin --- scripts/mod/Makefile | 4 +- scripts/mod/modpost.c | 70 ++------------ scripts/mod/modpost.h | 25 +++++ scripts/mod/symsearch.c | 199 ++++++++++++++++++++++++++++++++++++++++ 4 files changed, 232 insertions(+), 66 deletions(-) create mode 100644 scripts/mod/symsearch.c diff --git a/scripts/mod/Makefile b/scripts/mod/Makefile index c9e38ad937fd..3c54125eb373 100644 --- a/scripts/mod/Makefile +++ b/scripts/mod/Makefile @@ -5,7 +5,7 @@ CFLAGS_REMOVE_empty.o += $(CC_FLAGS_LTO) hostprogs-always-y += modpost mk_elfconfig always-y += empty.o -modpost-objs := modpost.o file2alias.o sumversion.o +modpost-objs := modpost.o file2alias.o sumversion.o symsearch.o devicetable-offsets-file := devicetable-offsets.h @@ -16,7 +16,7 @@ targets += $(devicetable-offsets-file) devicetable-offsets.s # dependencies on generated files need to be listed explicitly -$(obj)/modpost.o $(obj)/file2alias.o $(obj)/sumversion.o: $(obj)/elfconfig.h +$(obj)/modpost.o $(obj)/file2alias.o $(obj)/sumversion.o $(obj)/symsearch.o: $(obj)/elfconfig.h $(obj)/file2alias.o: $(obj)/$(devicetable-offsets-file) quiet_cmd_elfconfig = MKELF $@ diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 5191fdbd3fa2..66589fb4e9ae 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -22,7 +22,6 @@ #include #include "modpost.h" #include "../../include/linux/license.h" -#include "../../include/linux/module_symbol.h" static bool module_enabled; /* Are we using CONFIG_MODVERSIONS? */ @@ -577,11 +576,14 @@ static int parse_elf(struct elf_info *info, const char *filename) *p = TO_NATIVE(*p); } + symsearch_init(info); + return 1; } static void parse_elf_finish(struct elf_info *info) { + symsearch_finish(info); release_file(info->hdr, info->size); } @@ -1042,71 +1044,10 @@ static int secref_whitelist(const char *fromsec, const char *fromsym, return 1; } -/* - * If there's no name there, ignore it; likewise, ignore it if it's - * one of the magic symbols emitted used by current tools. - * - * Otherwise if find_symbols_between() returns those symbols, they'll - * fail the whitelist tests and cause lots of false alarms ... fixable - * only by merging __exit and __init sections into __text, bloating - * the kernel (which is especially evil on embedded platforms). - */ -static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym) -{ - const char *name = elf->strtab + sym->st_name; - - if (!name || !strlen(name)) - return 0; - return !is_mapping_symbol(name); -} - -/* Look up the nearest symbol based on the section and the address */ -static Elf_Sym *find_nearest_sym(struct elf_info *elf, Elf_Addr addr, - unsigned int secndx, bool allow_negative, - Elf_Addr min_distance) -{ - Elf_Sym *sym; - Elf_Sym *near = NULL; - Elf_Addr sym_addr, distance; - bool is_arm = (elf->hdr->e_machine == EM_ARM); - - for (sym = elf->symtab_start; sym < elf->symtab_stop; sym++) { - if (get_secindex(elf, sym) != secndx) - continue; - if (!is_valid_name(elf, sym)) - continue; - - sym_addr = sym->st_value; - - /* - * For ARM Thumb instruction, the bit 0 of st_value is set - * if the symbol is STT_FUNC type. Mask it to get the address. - */ - if (is_arm && ELF_ST_TYPE(sym->st_info) == STT_FUNC) - sym_addr &= ~1; - - if (addr >= sym_addr) - distance = addr - sym_addr; - else if (allow_negative) - distance = sym_addr - addr; - else - continue; - - if (distance <= min_distance) { - min_distance = distance; - near = sym; - } - - if (min_distance == 0) - break; - } - return near; -} - static Elf_Sym *find_fromsym(struct elf_info *elf, Elf_Addr addr, unsigned int secndx) { - return find_nearest_sym(elf, addr, secndx, false, ~0); + return symsearch_find_nearest(elf, addr, secndx, false, ~0); } static Elf_Sym *find_tosym(struct elf_info *elf, Elf_Addr addr, Elf_Sym *sym) @@ -1119,7 +1060,8 @@ static Elf_Sym *find_tosym(struct elf_info *elf, Elf_Addr addr, Elf_Sym *sym) * Strive to find a better symbol name, but the resulting name may not * match the symbol referenced in the original code. */ - return find_nearest_sym(elf, addr, get_secindex(elf, sym), true, 20); + return symsearch_find_nearest(elf, addr, get_secindex(elf, sym), + true, 20); } static bool is_executable_section(struct elf_info *elf, unsigned int secndx) diff --git a/scripts/mod/modpost.h b/scripts/mod/modpost.h index 5f94c2c9f2d9..6413f26fcb6b 100644 --- a/scripts/mod/modpost.h +++ b/scripts/mod/modpost.h @@ -10,6 +10,7 @@ #include #include #include +#include "../../include/linux/module_symbol.h" #include "list.h" #include "elfconfig.h" @@ -128,6 +129,8 @@ struct elf_info { * take shndx from symtab_shndx_start[N] instead */ Elf32_Word *symtab_shndx_start; Elf32_Word *symtab_shndx_stop; + + struct symsearch *symsearch; }; /* Accessor for sym->st_shndx, hides ugliness of "64k sections" */ @@ -154,6 +157,28 @@ static inline unsigned int get_secindex(const struct elf_info *info, return index; } +/* + * If there's no name there, ignore it; likewise, ignore it if it's + * one of the magic symbols emitted used by current tools. + * + * Internal symbols created by tools should be ignored by modpost. + */ +static inline int is_valid_name(struct elf_info *elf, Elf_Sym *sym) +{ + const char *name = elf->strtab + sym->st_name; + + if (!name || !strlen(name)) + return 0; + return !is_mapping_symbol(name); +} + +/* symsearch.c */ +void symsearch_init(struct elf_info *elf); +void symsearch_finish(struct elf_info *elf); +Elf_Sym *symsearch_find_nearest(struct elf_info *elf, Elf_Addr addr, + unsigned int secndx, bool allow_negative, + Elf_Addr min_distance); + /* file2alias.c */ void handle_moddevtable(struct module *mod, struct elf_info *info, Elf_Sym *sym, const char *symname); diff --git a/scripts/mod/symsearch.c b/scripts/mod/symsearch.c new file mode 100644 index 000000000000..aa4ed51f9960 --- /dev/null +++ b/scripts/mod/symsearch.c @@ -0,0 +1,199 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Helper functions for finding the symbol in an ELF which is "nearest" + * to a given address. + */ + +#include "modpost.h" + +struct syminfo { + unsigned int symbol_index; + unsigned int section_index; + Elf_Addr addr; +}; + +/* + * Container used to hold an entire binary search table. + * Entries in table are ascending, sorted first by section_index, + * then by addr, and last by symbol_index. The sorting by + * symbol_index is used to ensure predictable behavior when + * multiple symbols are present with the same address; all + * symbols past the first are effectively ignored, by eliding + * them in symsearch_fixup(). + */ +struct symsearch { + unsigned int table_size; + struct syminfo table[]; +}; + +static int syminfo_compare(const void *s1, const void *s2) +{ + const struct syminfo *sym1 = s1; + const struct syminfo *sym2 = s2; + + if (sym1->section_index > sym2->section_index) + return 1; + if (sym1->section_index < sym2->section_index) + return -1; + if (sym1->addr > sym2->addr) + return 1; + if (sym1->addr < sym2->addr) + return -1; + if (sym1->symbol_index > sym2->symbol_index) + return 1; + if (sym1->symbol_index < sym2->symbol_index) + return -1; + return 0; +} + +static unsigned int symbol_count(struct elf_info *elf) +{ + unsigned int result = 0; + + for (Elf_Sym *sym = elf->symtab_start; sym < elf->symtab_stop; sym++) { + if (is_valid_name(elf, sym)) + result++; + } + return result; +} + +/* + * Populate the search array that we just allocated. + * Be slightly paranoid here. The ELF file is mmap'd and could + * conceivably change between symbol_count() and symsearch_populate(). + * If we notice any difference, bail out rather than potentially + * propagating errors or crashing. + */ +static void symsearch_populate(struct elf_info *elf, + struct syminfo *table, + unsigned int table_size) +{ + bool is_arm = (elf->hdr->e_machine == EM_ARM); + + for (Elf_Sym *sym = elf->symtab_start; sym < elf->symtab_stop; sym++) { + if (is_valid_name(elf, sym)) { + if (table_size-- == 0) + fatal("%s: size mismatch\n", __func__); + table->symbol_index = sym - elf->symtab_start; + table->section_index = get_secindex(elf, sym); + table->addr = sym->st_value; + + /* + * For ARM Thumb instruction, the bit 0 of st_value is + * set if the symbol is STT_FUNC type. Mask it to get + * the address. + */ + if (is_arm && ELF_ST_TYPE(sym->st_info) == STT_FUNC) + table->addr &= ~1; + + table++; + } + } + + if (table_size != 0) + fatal("%s: size mismatch\n", __func__); +} + +/* + * Do any fixups on the table after sorting. + * For now, this just finds adjacent entries which have + * the same section_index and addr, and it propagates + * the first symbol_index over the subsequent entries, + * so that only one symbol_index is seen for any given + * section_index and addr. This ensures that whether + * we're looking at an address from "above" or "below" + * that we see the same symbol_index. + * This does leave some duplicate entries in the table; + * in practice, these are a small fraction of the + * total number of entries, and they are harmless to + * the binary search algorithm other than a few occasional + * unnecessary comparisons. + */ +static void symsearch_fixup(struct syminfo *table, unsigned int table_size) +{ + /* Don't look at index 0, it will never change. */ + for (unsigned int i = 1; i < table_size; i++) { + if (table[i].addr == table[i - 1].addr && + table[i].section_index == table[i - 1].section_index) { + table[i].symbol_index = table[i - 1].symbol_index; + } + } +} + +void symsearch_init(struct elf_info *elf) +{ + unsigned int table_size = symbol_count(elf); + + elf->symsearch = NOFAIL(malloc(sizeof(struct symsearch) + + sizeof(struct syminfo) * table_size)); + elf->symsearch->table_size = table_size; + + symsearch_populate(elf, elf->symsearch->table, table_size); + qsort(elf->symsearch->table, table_size, + sizeof(struct syminfo), syminfo_compare); + + symsearch_fixup(elf->symsearch->table, table_size); +} + +void symsearch_finish(struct elf_info *elf) +{ + free(elf->symsearch); + elf->symsearch = NULL; +} + +/* + * Find the syminfo which is in secndx and "nearest" to addr. + * allow_negative: allow returning a symbol whose address is > addr. + * min_distance: ignore symbols which are further away than this. + * + * Returns a pointer into the symbol table for success. + * Returns NULL if no legal symbol is found within the requested range. + */ +Elf_Sym *symsearch_find_nearest(struct elf_info *elf, Elf_Addr addr, + unsigned int secndx, bool allow_negative, + Elf_Addr min_distance) +{ + unsigned int hi = elf->symsearch->table_size; + unsigned int lo = 0; + struct syminfo *table = elf->symsearch->table; + struct syminfo target; + + target.addr = addr; + target.section_index = secndx; + target.symbol_index = ~0; /* compares greater than any actual index */ + while (hi > lo) { + unsigned int mid = lo + (hi - lo) / 2; /* Avoids overflow */ + + if (syminfo_compare(&table[mid], &target) > 0) + hi = mid; + else + lo = mid + 1; + } + + /* + * table[hi], if it exists, is the first entry in the array which + * lies beyond target. table[hi - 1], if it exists, is the last + * entry in the array which comes before target, including the + * case where it perfectly matches the section and the address. + * + * Note -- if the address we're looking up falls perfectly + * in the middle of two symbols, this is written to always + * prefer the symbol with the lower address. + */ + Elf_Sym *result = NULL; + + if (allow_negative && + hi < elf->symsearch->table_size && + table[hi].section_index == secndx && + table[hi].addr - addr <= min_distance) { + min_distance = table[hi].addr - addr; + result = &elf->symtab_start[table[hi].symbol_index]; + } + if (hi > 0 && + table[hi - 1].section_index == secndx && + addr - table[hi - 1].addr <= min_distance) { + result = &elf->symtab_start[table[hi - 1].symbol_index]; + } + return result; +} From a2671601fa020a124fe9c7d181bf9c9faf17011a Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Sat, 23 Mar 2024 20:45:11 +0900 Subject: [PATCH 067/256] modpost: do not make find_tosym() return NULL [ Upstream commit 1102f9f85bf66b1a7bd6a40afb40efbbe05dfc05 ] As mentioned in commit 397586506c3d ("modpost: Add '.ltext' and '.ltext.*' to TEXT_SECTIONS"), modpost can result in a segmentation fault due to a NULL pointer dereference in default_mismatch_handler(). find_tosym() can return the original symbol pointer instead of NULL if a better one is not found. This fixes the reported segmentation fault. Fixes: a23e7584ecf3 ("modpost: unify 'sym' and 'to' in default_mismatch_handler()") Reported-by: Nathan Chancellor Signed-off-by: Masahiro Yamada Signed-off-by: Sasha Levin --- scripts/mod/modpost.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c index 66589fb4e9ae..7d53942445d7 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -1052,6 +1052,8 @@ static Elf_Sym *find_fromsym(struct elf_info *elf, Elf_Addr addr, static Elf_Sym *find_tosym(struct elf_info *elf, Elf_Addr addr, Elf_Sym *sym) { + Elf_Sym *new_sym; + /* If the supplied symbol has a valid name, return it */ if (is_valid_name(elf, sym)) return sym; @@ -1060,8 +1062,9 @@ static Elf_Sym *find_tosym(struct elf_info *elf, Elf_Addr addr, Elf_Sym *sym) * Strive to find a better symbol name, but the resulting name may not * match the symbol referenced in the original code. */ - return symsearch_find_nearest(elf, addr, get_secindex(elf, sym), - true, 20); + new_sym = symsearch_find_nearest(elf, addr, get_secindex(elf, sym), + true, 20); + return new_sym ? new_sym : sym; } static bool is_executable_section(struct elf_info *elf, unsigned int secndx) From 21bc9b158983992b0709222c60f4bc749919a93d Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Mon, 25 Mar 2024 10:02:42 +0100 Subject: [PATCH 068/256] gpio: cdev: sanitize the label before requesting the interrupt commit b34490879baa847d16fc529c8ea6e6d34f004b38 upstream. When an interrupt is requested, a procfs directory is created under "/proc/irq//