From d658220a1c45cb721a80a9af7d1d70b35c7b74ea Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Mon, 25 Oct 2021 17:32:32 +0100
Subject: [PATCH 01/81] arm64/kvm: Fix bitrotted comment for SVE handling in
 handle_exit.c

The comment on the SVE trap handler in handle_exit.c says that it is a
placeholder until we support SVE in guests which we now do for both VHE
and nVHE cases so we really shouldn't get here in any sort of standard
case. Update the comment to be less immediately incorrect, the handling
of such a situation is correct.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211025163232.3502052-1-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kvm/handle_exit.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c
index 275a27368a04..5abe0617f2af 100644
--- a/arch/arm64/kvm/handle_exit.c
+++ b/arch/arm64/kvm/handle_exit.c
@@ -140,9 +140,12 @@ static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu)
 	return 1;
 }
 
+/*
+ * Guest access to SVE registers should be routed to this handler only
+ * when the system doesn't support SVE.
+ */
 static int handle_sve(struct kvm_vcpu *vcpu)
 {
-	/* Until SVE is supported for guests: */
 	kvm_inject_undefined(vcpu);
 	return 1;
 }

From b6363fe7b5135bfb5aea03f414148b3c2417702e Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Fri, 29 Oct 2021 09:40:55 -0500
Subject: [PATCH 02/81] arm64: Simplify checking for populated DT

Use of the of_scan_flat_dt() function predates libfdt and is discouraged
as libfdt provides a nicer set of APIs. Rework dt_scan_depth1_nodes to
use libfdt calls directly, and rename it to dt_is_stub() to reflect
exactly what it checking.

Cc: Will Deacon <will@kernel.org>
Signed-off-by: Rob Herring <robh@kernel.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20211029144055.2365814-1-robh@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/acpi.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/kernel/acpi.c b/arch/arm64/kernel/acpi.c
index f3851724fe35..e4dea8db6924 100644
--- a/arch/arm64/kernel/acpi.c
+++ b/arch/arm64/kernel/acpi.c
@@ -22,6 +22,7 @@
 #include <linux/irq_work.h>
 #include <linux/memblock.h>
 #include <linux/of_fdt.h>
+#include <linux/libfdt.h>
 #include <linux/smp.h>
 #include <linux/serial_core.h>
 #include <linux/pgtable.h>
@@ -62,29 +63,22 @@ static int __init parse_acpi(char *arg)
 }
 early_param("acpi", parse_acpi);
 
-static int __init dt_scan_depth1_nodes(unsigned long node,
-				       const char *uname, int depth,
-				       void *data)
+static bool __init dt_is_stub(void)
 {
-	/*
-	 * Ignore anything not directly under the root node; we'll
-	 * catch its parent instead.
-	 */
-	if (depth != 1)
-		return 0;
+	int node;
 
-	if (strcmp(uname, "chosen") == 0)
-		return 0;
+	fdt_for_each_subnode(node, initial_boot_params, 0) {
+		const char *name = fdt_get_name(initial_boot_params, node, NULL);
+		if (strcmp(name, "chosen") == 0)
+			continue;
+		if (strcmp(name, "hypervisor") == 0 &&
+		    of_flat_dt_is_compatible(node, "xen,xen"))
+			continue;
 
-	if (strcmp(uname, "hypervisor") == 0 &&
-	    of_flat_dt_is_compatible(node, "xen,xen"))
-		return 0;
+		return false;
+	}
 
-	/*
-	 * This node at depth 1 is neither a chosen node nor a xen node,
-	 * which we do not expect.
-	 */
-	return 1;
+	return true;
 }
 
 /*
@@ -205,8 +199,7 @@ void __init acpi_boot_table_init(void)
 	 *   and ACPI has not been [force] enabled (acpi=on|force)
 	 */
 	if (param_acpi_off ||
-	    (!param_acpi_on && !param_acpi_force &&
-	     of_scan_flat_dt(dt_scan_depth1_nodes, NULL)))
+	    (!param_acpi_on && !param_acpi_force && !dt_is_stub()))
 		goto done;
 
 	/*

From c9f5ea08a0f029fc5e0edb5f1380b9a828285439 Mon Sep 17 00:00:00 2001
From: Florian Fainelli <f.fainelli@gmail.com>
Date: Thu, 18 Nov 2021 12:18:10 -0800
Subject: [PATCH 03/81] arm64: entry: Use SDEI event constants

Use SDEI_EV_FAILED instead of open coding the 1 to make it clearer how
SDEI_EVENT_COMPLETE vs. SDEI_EVENT_COMPLETE_AND_RESUME is selected.

Signed-off-by: Florian Fainelli <f.fainelli@gmail.com>
Link: https://lore.kernel.org/r/20211118201811.2974922-1-f.fainelli@gmail.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/entry.S | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S
index 2f69ae43941d..772ec2ecf488 100644
--- a/arch/arm64/kernel/entry.S
+++ b/arch/arm64/kernel/entry.S
@@ -966,8 +966,10 @@ SYM_CODE_START(__sdei_asm_handler)
 	mov	sp, x1
 
 	mov	x1, x0			// address to complete_and_resume
-	/* x0 = (x0 <= 1) ? EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME */
-	cmp	x0, #1
+	/* x0 = (x0 <= SDEI_EV_FAILED) ?
+	 * EVENT_COMPLETE:EVENT_COMPLETE_AND_RESUME
+	 */
+	cmp	x0, #SDEI_EV_FAILED
 	mov_q	x2, SDEI_1_0_FN_SDEI_EVENT_COMPLETE
 	mov_q	x3, SDEI_1_0_FN_SDEI_EVENT_COMPLETE_AND_RESUME
 	csel	x0, x2, x3, ls

From fde046e07d3343a0417eafc0533b0c9675b393e5 Mon Sep 17 00:00:00 2001
From: Jisheng Zhang <jszhang@kernel.org>
Date: Fri, 19 Nov 2021 12:46:08 +0800
Subject: [PATCH 04/81] arm64: extable: remove unused ex_handler_t definition

The ex_handler_t type was introduced in commit d6e2cc564775 ("arm64:
extable: add `type` and `data` fields"), but has never been used, and
is unnecessary. Remove it.

Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20211119124608.3f03380b@xhacker
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/extable.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/arch/arm64/mm/extable.c b/arch/arm64/mm/extable.c
index c3d53811a15e..c0181e60cc98 100644
--- a/arch/arm64/mm/extable.c
+++ b/arch/arm64/mm/extable.c
@@ -10,9 +10,6 @@
 #include <asm/asm-extable.h>
 #include <asm/ptrace.h>
 
-typedef bool (*ex_handler_t)(const struct exception_table_entry *,
-			     struct pt_regs *);
-
 static inline unsigned long
 get_ex_fixup(const struct exception_table_entry *ex)
 {

From f0616abd4e67143b45b04b565839148458857347 Mon Sep 17 00:00:00 2001
From: Reiji Watanabe <reijiw@google.com>
Date: Sun, 5 Dec 2021 16:47:35 -0800
Subject: [PATCH 05/81] arm64: clear_page() shouldn't use DC ZVA when
 DCZID_EL0.DZP == 1

Currently, clear_page() uses DC ZVA instruction unconditionally.  But it
should make sure that DCZID_EL0.DZP, which indicates whether or not use
of DC ZVA instruction is prohibited, is zero when using the instruction.
Use STNP instead when DCZID_EL0.DZP == 1.

Fixes: f27bb139c387 ("arm64: Miscellaneous library functions")
Signed-off-by: Reiji Watanabe <reijiw@google.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20211206004736.1520989-2-reijiw@google.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/lib/clear_page.S | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/arch/arm64/lib/clear_page.S b/arch/arm64/lib/clear_page.S
index b84b179edba3..1fd5d790ab80 100644
--- a/arch/arm64/lib/clear_page.S
+++ b/arch/arm64/lib/clear_page.S
@@ -16,6 +16,7 @@
  */
 SYM_FUNC_START_PI(clear_page)
 	mrs	x1, dczid_el0
+	tbnz	x1, #4, 2f	/* Branch if DC ZVA is prohibited */
 	and	w1, w1, #0xf
 	mov	x2, #4
 	lsl	x1, x2, x1
@@ -25,5 +26,14 @@ SYM_FUNC_START_PI(clear_page)
 	tst	x0, #(PAGE_SIZE - 1)
 	b.ne	1b
 	ret
+
+2:	stnp	xzr, xzr, [x0]
+	stnp	xzr, xzr, [x0, #16]
+	stnp	xzr, xzr, [x0, #32]
+	stnp	xzr, xzr, [x0, #48]
+	add	x0, x0, #64
+	tst	x0, #(PAGE_SIZE - 1)
+	b.ne	2b
+	ret
 SYM_FUNC_END_PI(clear_page)
 EXPORT_SYMBOL(clear_page)

From 685e2564daa1493053fcd7f1dbed38b35ee2f3cb Mon Sep 17 00:00:00 2001
From: Reiji Watanabe <reijiw@google.com>
Date: Sun, 5 Dec 2021 16:47:36 -0800
Subject: [PATCH 06/81] arm64: mte: DC {GVA,GZVA} shouldn't be used when
 DCZID_EL0.DZP == 1

Currently, mte_set_mem_tag_range() and mte_zero_clear_page_tags() use
DC {GVA,GZVA} unconditionally.  But, they should make sure that
DCZID_EL0.DZP, which indicates whether or not use of those instructions
is prohibited, is zero when using those instructions.
Use ST{G,ZG,Z2G} instead when DCZID_EL0.DZP == 1.

Fixes: 013bb59dbb7c ("arm64: mte: handle tags zeroing at page allocation time")
Fixes: 3d0cca0b02ac ("kasan: speed up mte_set_mem_tag_range")
Signed-off-by: Reiji Watanabe <reijiw@google.com>
Link: https://lore.kernel.org/r/20211206004736.1520989-3-reijiw@google.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/mte-kasan.h | 8 +++++---
 arch/arm64/lib/mte.S               | 8 +++++++-
 2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/arm64/include/asm/mte-kasan.h b/arch/arm64/include/asm/mte-kasan.h
index 478b9bcf69ad..e4704a403237 100644
--- a/arch/arm64/include/asm/mte-kasan.h
+++ b/arch/arm64/include/asm/mte-kasan.h
@@ -84,10 +84,12 @@ static inline void __dc_gzva(u64 p)
 static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag,
 					 bool init)
 {
-	u64 curr, mask, dczid_bs, end1, end2, end3;
+	u64 curr, mask, dczid, dczid_bs, dczid_dzp, end1, end2, end3;
 
 	/* Read DC G(Z)VA block size from the system register. */
-	dczid_bs = 4ul << (read_cpuid(DCZID_EL0) & 0xf);
+	dczid = read_cpuid(DCZID_EL0);
+	dczid_bs = 4ul << (dczid & 0xf);
+	dczid_dzp = (dczid >> 4) & 1;
 
 	curr = (u64)__tag_set(addr, tag);
 	mask = dczid_bs - 1;
@@ -106,7 +108,7 @@ static inline void mte_set_mem_tag_range(void *addr, size_t size, u8 tag,
 	 */
 #define SET_MEMTAG_RANGE(stg_post, dc_gva)		\
 	do {						\
-		if (size >= 2 * dczid_bs) {		\
+		if (!dczid_dzp && size >= 2 * dczid_bs) {\
 			do {				\
 				curr = stg_post(curr);	\
 			} while (curr < end1);		\
diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S
index e83643b3995f..f531dcb95174 100644
--- a/arch/arm64/lib/mte.S
+++ b/arch/arm64/lib/mte.S
@@ -43,17 +43,23 @@ SYM_FUNC_END(mte_clear_page_tags)
  *	x0 - address to the beginning of the page
  */
 SYM_FUNC_START(mte_zero_clear_page_tags)
+	and	x0, x0, #(1 << MTE_TAG_SHIFT) - 1	// clear the tag
 	mrs	x1, dczid_el0
+	tbnz	x1, #4, 2f	// Branch if DC GZVA is prohibited
 	and	w1, w1, #0xf
 	mov	x2, #4
 	lsl	x1, x2, x1
-	and	x0, x0, #(1 << MTE_TAG_SHIFT) - 1	// clear the tag
 
 1:	dc	gzva, x0
 	add	x0, x0, x1
 	tst	x0, #(PAGE_SIZE - 1)
 	b.ne	1b
 	ret
+
+2:	stz2g	x0, [x0], #(MTE_GRANULE_SIZE * 2)
+	tst	x0, #(PAGE_SIZE - 1)
+	b.ne	2b
+	ret
 SYM_FUNC_END(mte_zero_clear_page_tags)
 
 /*

From 7afccde389dcfaca793a0d909f8cb7412e1d1dbe Mon Sep 17 00:00:00 2001
From: Rongwei Wang <rongwei.wang@linux.alibaba.com>
Date: Fri, 26 Nov 2021 01:06:00 +0800
Subject: [PATCH 07/81] arm64: kexec: reduce calls to page_address()

In kexec_page_alloc(), page_address() is called twice.
This patch add a new variable to help to reduce calls
to page_address().

Signed-off-by: Rongwei Wang <rongwei.wang@linux.alibaba.com>
Link: https://lore.kernel.org/r/20211125170600.1608-3-rongwei.wang@linux.alibaba.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/machine_kexec.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/machine_kexec.c b/arch/arm64/kernel/machine_kexec.c
index 1038494135c8..7f2530bcd42e 100644
--- a/arch/arm64/kernel/machine_kexec.c
+++ b/arch/arm64/kernel/machine_kexec.c
@@ -104,13 +104,15 @@ static void *kexec_page_alloc(void *arg)
 {
 	struct kimage *kimage = (struct kimage *)arg;
 	struct page *page = kimage_alloc_control_pages(kimage, 0);
+	void *vaddr = NULL;
 
 	if (!page)
 		return NULL;
 
-	memset(page_address(page), 0, PAGE_SIZE);
+	vaddr = page_address(page);
+	memset(vaddr, 0, PAGE_SIZE);
 
-	return page_address(page);
+	return vaddr;
 }
 
 int machine_kexec_post_load(struct kimage *kimage)

From 1614b2b11fab29dd4ff31ebba9d266961f5af69e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <peterz@infradead.org>
Date: Mon, 29 Nov 2021 14:28:41 +0000
Subject: [PATCH 08/81] arch: Make ARCH_STACKWALK independent of STACKTRACE

Make arch_stack_walk() available for ARCH_STACKWALK architectures
without it being entangled in STACKTRACE.

Link: https://lore.kernel.org/lkml/20211022152104.356586621@infradead.org/
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[Mark: rebase, drop unnecessary arm change]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Albert Ou <aou@eecs.berkeley.edu>
Cc: Borislav Petkov <bp@alien8.de>
Cc: Christian Borntraeger <borntraeger@de.ibm.com>
Cc: Dave Hansen <dave.hansen@linux.intel.com>
Cc: Heiko Carstens <hca@linux.ibm.com>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Michael Ellerman <mpe@ellerman.id.au>
Cc: Palmer Dabbelt <palmer@dabbelt.com>
Cc: Paul Walmsley <paul.walmsley@sifive.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vasily Gorbik <gor@linux.ibm.com>
Link: https://lore.kernel.org/r/20211129142849.3056714-2-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/stacktrace.c |  4 ----
 arch/powerpc/kernel/Makefile   |  3 +--
 arch/riscv/kernel/stacktrace.c |  4 ----
 arch/s390/kernel/Makefile      |  3 +--
 arch/x86/kernel/Makefile       |  2 +-
 include/linux/stacktrace.h     | 35 +++++++++++++++++-----------------
 6 files changed, 21 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 94f83cd44e50..e6ba6b000564 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -221,8 +221,6 @@ void show_stack(struct task_struct *tsk, unsigned long *sp, const char *loglvl)
 	barrier();
 }
 
-#ifdef CONFIG_STACKTRACE
-
 noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry,
 			      void *cookie, struct task_struct *task,
 			      struct pt_regs *regs)
@@ -241,5 +239,3 @@ noinline notrace void arch_stack_walk(stack_trace_consume_fn consume_entry,
 
 	walk_stackframe(task, &frame, consume_entry, cookie);
 }
-
-#endif
diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
index 5fa68c2ef1f8..b039877c743d 100644
--- a/arch/powerpc/kernel/Makefile
+++ b/arch/powerpc/kernel/Makefile
@@ -47,7 +47,7 @@ obj-y				:= cputable.o syscalls.o \
 				   udbg.o misc.o io.o misc_$(BITS).o \
 				   of_platform.o prom_parse.o firmware.o \
 				   hw_breakpoint_constraints.o interrupt.o \
-				   kdebugfs.o
+				   kdebugfs.o stacktrace.o
 obj-y				+= ptrace/
 obj-$(CONFIG_PPC64)		+= setup_64.o \
 				   paca.o nvram_64.o note.o
@@ -116,7 +116,6 @@ obj-$(CONFIG_OPTPROBES)		+= optprobes.o optprobes_head.o
 obj-$(CONFIG_KPROBES_ON_FTRACE)	+= kprobes-ftrace.o
 obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_PPC_UDBG_16550)	+= legacy_serial.o udbg_16550.o
-obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_SWIOTLB)		+= dma-swiotlb.o
 obj-$(CONFIG_ARCH_HAS_DMA_SET_MASK) += dma-mask.o
 
diff --git a/arch/riscv/kernel/stacktrace.c b/arch/riscv/kernel/stacktrace.c
index 0fcdc0233fac..201ee206fb57 100644
--- a/arch/riscv/kernel/stacktrace.c
+++ b/arch/riscv/kernel/stacktrace.c
@@ -139,12 +139,8 @@ unsigned long __get_wchan(struct task_struct *task)
 	return pc;
 }
 
-#ifdef CONFIG_STACKTRACE
-
 noinline void arch_stack_walk(stack_trace_consume_fn consume_entry, void *cookie,
 		     struct task_struct *task, struct pt_regs *regs)
 {
 	walk_stackframe(task, regs, consume_entry, cookie);
 }
-
-#endif /* CONFIG_STACKTRACE */
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index 80f500ffb55c..be8007f367aa 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -40,7 +40,7 @@ obj-y	+= sysinfo.o lgr.o os_info.o machine_kexec.o
 obj-y	+= runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o sthyi.o
 obj-y	+= entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o
 obj-y	+= nospec-branch.o ipl_vmparm.o machine_kexec_reloc.o unwind_bc.o
-obj-y	+= smp.o text_amode31.o
+obj-y	+= smp.o text_amode31.o stacktrace.o
 
 extra-y				+= head64.o vmlinux.lds
 
@@ -55,7 +55,6 @@ compat-obj-$(CONFIG_AUDIT)	+= compat_audit.o
 obj-$(CONFIG_COMPAT)		+= compat_linux.o compat_signal.o
 obj-$(CONFIG_COMPAT)		+= $(compat-obj-y)
 obj-$(CONFIG_EARLY_PRINTK)	+= early_printk.o
-obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
 obj-$(CONFIG_KPROBES)		+= kprobes.o
 obj-$(CONFIG_KPROBES)		+= kprobes_insn_page.o
 obj-$(CONFIG_FUNCTION_TRACER)	+= mcount.o ftrace.o
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 2ff3e600f426..6aef9ee28a39 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -84,7 +84,7 @@ obj-$(CONFIG_IA32_EMULATION)	+= tls.o
 obj-y				+= step.o
 obj-$(CONFIG_INTEL_TXT)		+= tboot.o
 obj-$(CONFIG_ISA_DMA_API)	+= i8237.o
-obj-$(CONFIG_STACKTRACE)	+= stacktrace.o
+obj-y				+= stacktrace.o
 obj-y				+= cpu/
 obj-y				+= acpi/
 obj-y				+= reboot.o
diff --git a/include/linux/stacktrace.h b/include/linux/stacktrace.h
index bef158815e83..97455880ac41 100644
--- a/include/linux/stacktrace.h
+++ b/include/linux/stacktrace.h
@@ -8,22 +8,6 @@
 struct task_struct;
 struct pt_regs;
 
-#ifdef CONFIG_STACKTRACE
-void stack_trace_print(const unsigned long *trace, unsigned int nr_entries,
-		       int spaces);
-int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
-			unsigned int nr_entries, int spaces);
-unsigned int stack_trace_save(unsigned long *store, unsigned int size,
-			      unsigned int skipnr);
-unsigned int stack_trace_save_tsk(struct task_struct *task,
-				  unsigned long *store, unsigned int size,
-				  unsigned int skipnr);
-unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
-				   unsigned int size, unsigned int skipnr);
-unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
-unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
-
-/* Internal interfaces. Do not use in generic code */
 #ifdef CONFIG_ARCH_STACKWALK
 
 /**
@@ -76,8 +60,25 @@ int arch_stack_walk_reliable(stack_trace_consume_fn consume_entry, void *cookie,
 
 void arch_stack_walk_user(stack_trace_consume_fn consume_entry, void *cookie,
 			  const struct pt_regs *regs);
+#endif /* CONFIG_ARCH_STACKWALK */
 
-#else /* CONFIG_ARCH_STACKWALK */
+#ifdef CONFIG_STACKTRACE
+void stack_trace_print(const unsigned long *trace, unsigned int nr_entries,
+		       int spaces);
+int stack_trace_snprint(char *buf, size_t size, const unsigned long *entries,
+			unsigned int nr_entries, int spaces);
+unsigned int stack_trace_save(unsigned long *store, unsigned int size,
+			      unsigned int skipnr);
+unsigned int stack_trace_save_tsk(struct task_struct *task,
+				  unsigned long *store, unsigned int size,
+				  unsigned int skipnr);
+unsigned int stack_trace_save_regs(struct pt_regs *regs, unsigned long *store,
+				   unsigned int size, unsigned int skipnr);
+unsigned int stack_trace_save_user(unsigned long *store, unsigned int size);
+unsigned int filter_irq_stacks(unsigned long *entries, unsigned int nr_entries);
+
+#ifndef CONFIG_ARCH_STACKWALK
+/* Internal interfaces. Do not use in generic code */
 struct stack_trace {
 	unsigned int nr_entries, max_entries;
 	unsigned long *entries;

From 1e5428b2b7e8aef6a1d10a33fa15df427f087450 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 29 Nov 2021 14:28:42 +0000
Subject: [PATCH 09/81] arm64: Add comment for stack_info::kr_cur

We added stack_info::kr_cur in commit:

  cd9bc2c9258816dc ("arm64: Recover kretprobe modified return address in stacktrace")

... but didn't add anything in the corresponding comment block.

For consistency, add a corresponding comment.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviwed-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
Cc: Steven Rostedt (VMware) <rostedt@goodmis.org>
Cc: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20211129142849.3056714-3-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/stacktrace.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 6564a01cc085..1367012e0520 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -47,6 +47,10 @@ struct stack_info {
  * @prev_type:   The type of stack this frame record was on, or a synthetic
  *               value of STACK_TYPE_UNKNOWN. This is used to detect a
  *               transition from one stack to another.
+ *
+ * @kr_cur:      When KRETPROBES is selected, holds the kretprobe instance
+ *               associated with the most recently encountered replacement lr
+ *               value.
  */
 struct stackframe {
 	unsigned long fp;

From 86bcbafcb726b7b11898d2d6269bd665cb27c1b9 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 29 Nov 2021 14:28:43 +0000
Subject: [PATCH 10/81] arm64: Mark __switch_to() as __sched

Unlike most architectures (and only in keeping with powerpc), arm64 has
a non __sched() function on the path to our cpu_switch_to() assembly
function.

It is expected that for a blocked task, in_sched_functions() can be used
to skip all functions between the raw context switch assembly and the
scheduler functions that call into __switch_to(). This is the behaviour
expected by stack_trace_consume_entry_nosched(), and the behaviour we'd
like to have such that we an simplify arm64's __get_wchan()
implementation to use arch_stack_walk().

This patch mark's arm64's __switch_to as __sched. This *will not* change
the behaviour of arm64's current __get_wchan() implementation, which
always performs an initial unwind step which skips __switch_to(). This
*will* change the behaviour of stack_trace_consume_entry_nosched() and
stack_trace_save_tsk() to match their expected behaviour on blocked
tasks, skipping all scheduler-internal functions including
__switch_to().

Other than the above, there should be no functional change as a result
of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211129142849.3056714-4-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/process.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index aacf2f5559a8..980cad7292af 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -490,7 +490,8 @@ void update_sctlr_el1(u64 sctlr)
 /*
  * Thread switching.
  */
-__notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
+__notrace_funcgraph __sched
+struct task_struct *__switch_to(struct task_struct *prev,
 				struct task_struct *next)
 {
 	struct task_struct *last;

From ed876d35a1dc7f9efcfc51dd06843b5b8af08ad1 Mon Sep 17 00:00:00 2001
From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
Date: Mon, 29 Nov 2021 14:28:44 +0000
Subject: [PATCH 11/81] arm64: Make perf_callchain_kernel() use
 arch_stack_walk()

To enable RELIABLE_STACKTRACE and LIVEPATCH on arm64, we need to
substantially rework arm64's unwinding code. As part of this, we want to
minimize the set of unwind interfaces we expose, and avoid open-coding
of unwind logic outside of stacktrace.c.

Currently perf_callchain_kernel() walks the stack of an interrupted
context by calling start_backtrace() with the context's PC and FP, and
iterating unwind steps using walk_stackframe(). This is functionally
equivalent to calling arch_stack_walk() with the interrupted context's
pt_regs, which will start with the PC and FP from the regs.

Make perf_callchain_kernel() use arch_stack_walk(). This simplifies
perf_callchain_kernel(), and in future will alow us to make
walk_stackframe() private to stacktrace.c.

At the same time, we update the callchain_trace() callback to check the
return value of perf_callchain_store(), which indicates whether there is
space for any further entries. When a non-zero value is returned,
further calls will be ignored, and are redundant, so we can stop the
unwind at this point.

We also remove the stale and confusing comment for callchain_trace.

There should be no functional change as a result of this patch.

Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
[Mark: elaborate commit message, remove comment, fix includes]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20211129142849.3056714-5-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/perf_callchain.c | 15 +++------------
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/kernel/perf_callchain.c b/arch/arm64/kernel/perf_callchain.c
index 4a72c2727309..e9b7d99f4e3a 100644
--- a/arch/arm64/kernel/perf_callchain.c
+++ b/arch/arm64/kernel/perf_callchain.c
@@ -5,10 +5,10 @@
  * Copyright (C) 2015 ARM Limited
  */
 #include <linux/perf_event.h>
+#include <linux/stacktrace.h>
 #include <linux/uaccess.h>
 
 #include <asm/pointer_auth.h>
-#include <asm/stacktrace.h>
 
 struct frame_tail {
 	struct frame_tail	__user *fp;
@@ -132,30 +132,21 @@ void perf_callchain_user(struct perf_callchain_entry_ctx *entry,
 	}
 }
 
-/*
- * Gets called by walk_stackframe() for every stackframe. This will be called
- * whist unwinding the stackframe and is like a subroutine return so we use
- * the PC.
- */
 static bool callchain_trace(void *data, unsigned long pc)
 {
 	struct perf_callchain_entry_ctx *entry = data;
-	perf_callchain_store(entry, pc);
-	return true;
+	return perf_callchain_store(entry, pc) == 0;
 }
 
 void perf_callchain_kernel(struct perf_callchain_entry_ctx *entry,
 			   struct pt_regs *regs)
 {
-	struct stackframe frame;
-
 	if (perf_guest_cbs && perf_guest_cbs->is_in_guest()) {
 		/* We don't support guest os callchain now */
 		return;
 	}
 
-	start_backtrace(&frame, regs->regs[29], regs->pc);
-	walk_stackframe(current, &frame, callchain_trace, entry);
+	arch_stack_walk(callchain_trace, entry, current, regs);
 }
 
 unsigned long perf_instruction_pointer(struct pt_regs *regs)

From 4f62bb7cb165f3e7b0a91279fe9dd5c56daf3457 Mon Sep 17 00:00:00 2001
From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
Date: Mon, 29 Nov 2021 14:28:45 +0000
Subject: [PATCH 12/81] arm64: Make __get_wchan() use arch_stack_walk()

To enable RELIABLE_STACKTRACE and LIVEPATCH on arm64, we need to
substantially rework arm64's unwinding code. As part of this, we want to
minimize the set of unwind interfaces we expose, and avoid open-coding
of unwind logic outside of stacktrace.c.

Currently, __get_wchan() walks the stack of a blocked task by calling
start_backtrace() with the task's saved PC and FP values, and iterating
unwind steps using unwind_frame(). The initialization is functionally
equivalent to calling arch_stack_walk() with the blocked task, which
will start with the task's saved PC and FP values.

Currently __get_wchan() always performs an initial unwind step, which
will stkip __switch_to(), but as this is now marked as a __sched
function, this no longer needs special handling and will be skipped in
the same way as other sched functions.

Make __get_wchan() use arch_stack_walk(). This simplifies __get_wchan(),
and in future will alow us to make unwind_frame() private to
stacktrace.c. At the same time, we can simplify the try_get_task_stack()
check and avoid the unnecessary `stack_page` variable.

The change to the skipping logic means we may terminate one frame
earlier than previously where there are an excessive number of sched
functions in the trace, but this isn't seen in practice, and wchan is
best-effort anyway, so this should not be a problem.

Other than the above, there should be no functional change as a result
of this patch.

Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
[Mark: rebase atop wchan changes, elaborate commit message, fix includes]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211129142849.3056714-6-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/process.c | 42 ++++++++++++++++++++++---------------
 1 file changed, 25 insertions(+), 17 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index 980cad7292af..836a933156cd 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -40,6 +40,7 @@
 #include <linux/percpu.h>
 #include <linux/thread_info.h>
 #include <linux/prctl.h>
+#include <linux/stacktrace.h>
 
 #include <asm/alternative.h>
 #include <asm/compat.h>
@@ -529,30 +530,37 @@ struct task_struct *__switch_to(struct task_struct *prev,
 	return last;
 }
 
+struct wchan_info {
+	unsigned long	pc;
+	int		count;
+};
+
+static bool get_wchan_cb(void *arg, unsigned long pc)
+{
+	struct wchan_info *wchan_info = arg;
+
+	if (!in_sched_functions(pc)) {
+		wchan_info->pc = pc;
+		return false;
+	}
+	return wchan_info->count++ < 16;
+}
+
 unsigned long __get_wchan(struct task_struct *p)
 {
-	struct stackframe frame;
-	unsigned long stack_page, ret = 0;
-	int count = 0;
+	struct wchan_info wchan_info = {
+		.pc = 0,
+		.count = 0,
+	};
 
-	stack_page = (unsigned long)try_get_task_stack(p);
-	if (!stack_page)
+	if (!try_get_task_stack(p))
 		return 0;
 
-	start_backtrace(&frame, thread_saved_fp(p), thread_saved_pc(p));
+	arch_stack_walk(get_wchan_cb, &wchan_info, p, NULL);
 
-	do {
-		if (unwind_frame(p, &frame))
-			goto out;
-		if (!in_sched_functions(frame.pc)) {
-			ret = frame.pc;
-			goto out;
-		}
-	} while (count++ < 16);
-
-out:
 	put_task_stack(p);
-	return ret;
+
+	return wchan_info.pc;
 }
 
 unsigned long arch_align_stack(unsigned long sp)

From 39ef362d2d45171e941457f8cd00518bfcedfe2b Mon Sep 17 00:00:00 2001
From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
Date: Mon, 29 Nov 2021 14:28:46 +0000
Subject: [PATCH 13/81] arm64: Make return_address() use arch_stack_walk()

To enable RELIABLE_STACKTRACE and LIVEPATCH on arm64, we need to
substantially rework arm64's unwinding code. As part of this, we want to
minimize the set of unwind interfaces we expose, and avoid open-coding
of unwind logic outside of stacktrace.c.

Currently return_address() walks the stack of the current task by
calling start_backtrace() with return_address as the PC and the frame
pointer of return_address() as the next frame, iterating unwind steps
using walk_stackframe(). This is functionally equivalent to calling
arch_stack_walk() for the current stack, which will start from its
caller (i.e. return_address()) as the PC and it's caller's frame record
as the next frame.

Make return_address() use arch_stackwalk(). This simplifies
return_address(), and in future will alow us to make walk_stackframe()
private to stacktrace.c.

There should be no functional change as a result of this patch.

Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
Tested-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
[Mark: elaborate commit message, fix includes]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20211129142849.3056714-7-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/return_address.c | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kernel/return_address.c b/arch/arm64/kernel/return_address.c
index a6d18755652f..68330017d04f 100644
--- a/arch/arm64/kernel/return_address.c
+++ b/arch/arm64/kernel/return_address.c
@@ -9,9 +9,9 @@
 #include <linux/export.h>
 #include <linux/ftrace.h>
 #include <linux/kprobes.h>
+#include <linux/stacktrace.h>
 
 #include <asm/stack_pointer.h>
-#include <asm/stacktrace.h>
 
 struct return_address_data {
 	unsigned int level;
@@ -35,15 +35,11 @@ NOKPROBE_SYMBOL(save_return_addr);
 void *return_address(unsigned int level)
 {
 	struct return_address_data data;
-	struct stackframe frame;
 
 	data.level = level + 2;
 	data.addr = NULL;
 
-	start_backtrace(&frame,
-			(unsigned long)__builtin_frame_address(0),
-			(unsigned long)return_address);
-	walk_stackframe(current, &frame, save_return_addr, &data);
+	arch_stack_walk(save_return_addr, &data, current, NULL);
 
 	if (!data.level)
 		return data.addr;

From 22ecd975b61d3645ce8f19a95369357f04a25f0b Mon Sep 17 00:00:00 2001
From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
Date: Mon, 29 Nov 2021 14:28:47 +0000
Subject: [PATCH 14/81] arm64: Make profile_pc() use arch_stack_walk()

To enable RELIABLE_STACKTRACE and LIVEPATCH on arm64, we need to
substantially rework arm64's unwinding code. As part of this, we want to
minimize the set of unwind interfaces we expose, and avoid open-coding
of unwind logic outside of stacktrace.c.

Currently profile_pc() walks the stack of an interrupted context by
calling start_backtrace() with the context's PC and FP, and iterating
unwind steps using walk_stackframe(). This is functionally equivalent to
calling arch_stack_walk() with the interrupted context's pt_regs, which
will start with the PC and FP from the regs.

Make profile_pc() use arch_stack_walk(). This simplifies profile_pc(),
and in future will alow us to make walk_stackframe() private to
stacktrace.c.

At the same time, we remove the early return for when regs->pc is not in
lock functions, as this will be handled by the first call to the
profile_pc_cb() callback.

There should be no functional change as a result of this patch.

Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
[Mark: remove early return, elaborate commit message, fix includes]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211129142849.3056714-8-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/time.c | 27 ++++++++++++++-------------
 1 file changed, 14 insertions(+), 13 deletions(-)

diff --git a/arch/arm64/kernel/time.c b/arch/arm64/kernel/time.c
index eebbc8d7123e..b5855eb7435d 100644
--- a/arch/arm64/kernel/time.c
+++ b/arch/arm64/kernel/time.c
@@ -18,6 +18,7 @@
 #include <linux/timex.h>
 #include <linux/errno.h>
 #include <linux/profile.h>
+#include <linux/stacktrace.h>
 #include <linux/syscore_ops.h>
 #include <linux/timer.h>
 #include <linux/irq.h>
@@ -29,25 +30,25 @@
 #include <clocksource/arm_arch_timer.h>
 
 #include <asm/thread_info.h>
-#include <asm/stacktrace.h>
 #include <asm/paravirt.h>
 
+static bool profile_pc_cb(void *arg, unsigned long pc)
+{
+	unsigned long *prof_pc = arg;
+
+	if (in_lock_functions(pc))
+		return true;
+	*prof_pc = pc;
+	return false;
+}
+
 unsigned long profile_pc(struct pt_regs *regs)
 {
-	struct stackframe frame;
+	unsigned long prof_pc = 0;
 
-	if (!in_lock_functions(regs->pc))
-		return regs->pc;
+	arch_stack_walk(profile_pc_cb, &prof_pc, current, regs);
 
-	start_backtrace(&frame, regs->regs[29], regs->pc);
-
-	do {
-		int ret = unwind_frame(NULL, &frame);
-		if (ret < 0)
-			return 0;
-	} while (in_lock_functions(frame.pc));
-
-	return frame.pc;
+	return prof_pc;
 }
 EXPORT_SYMBOL(profile_pc);
 

From 2dad6dc17bd0cefd03b42147c6fc9bbd81f7928a Mon Sep 17 00:00:00 2001
From: "Madhavan T. Venkataraman" <madvenka@linux.microsoft.com>
Date: Mon, 29 Nov 2021 14:28:48 +0000
Subject: [PATCH 15/81] arm64: Make dump_backtrace() use arch_stack_walk()

To enable RELIABLE_STACKTRACE and LIVEPATCH on arm64, we need to
substantially rework arm64's unwinding code. As part of this, we want to
minimize the set of unwind interfaces we expose, and avoid open-coding
of unwind logic.

Currently, dump_backtrace() walks the stack of the current task or a
blocked task by calling stact_backtrace() and iterating unwind steps
using unwind_frame(). This can be written more simply in terms of
arch_stack_walk(), considering three distinct cases:

1) When unwinding a blocked task, start_backtrace() is called with the
   blocked task's saved PC and FP, and the unwind proceeds immediately
   from this point without skipping any entries. This is functionally
   equivalent to calling arch_stack_walk() with the blocked task, which
   will start with the task's saved PC and FP.

   There is no functional change to this case.

2) When unwinding the current task without regs, start_backtrace() is
   called with dump_backtrace() as the PC and __builtin_frame_address(0)
   as the next frame, and the unwind proceeds immediately without
   skipping. This is *almost* functionally equivalent to calling
   arch_stack_walk() for the current task, which will start with its
   caller (i.e. an offset into dump_backtrace()) as the PC, and the
   callers frame record as the next frame.

   The only difference being that dump_backtrace() will be reported with
   an offset (which is strictly more correct than currently). Otherwise
   there is no functional cahnge to this case.

3) When unwinding the current task with regs, start_backtrace() is
   called with dump_backtrace() as the PC and __builtin_frame_address(0)
   as the next frame, and the unwind is performed silently until the
   next frame is the frame pointed to by regs->fp. Reporting starts
   from regs->pc and continues from the frame in regs->fp.

   Historically, this pre-unwind was necessary to correctly record
   return addresses rewritten by the ftrace graph calller, but this is
   no longer necessary as these are now recovered using the FP since
   commit:

   c6d3cd32fd0064af ("arm64: ftrace: use HAVE_FUNCTION_GRAPH_RET_ADDR_PTR")

   This pre-unwind is not necessary to recover return addresses
   rewritten by kretprobes, which historically were not recovered, and
   are now recovered using the FP since commit:

   cd9bc2c9258816dc ("arm64: Recover kretprobe modified return address in stacktrace")

   Thus, this is functionally equivalent to calling arch_stack_walk()
   with the current task and regs, which will start with regs->pc as the
   PC and regs->fp as the next frame, without a pre-unwind.

This patch makes dump_backtrace() use arch_stack_walk(). This simplifies
dump_backtrace() and will permit subsequent changes to the unwind code.

Aside from the improved reporting when unwinding current without regs,
there should be no functional change as a result of this patch.

Signed-off-by: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
[Mark: elaborate commit message]
Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211129142849.3056714-9-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/stacktrace.c | 44 +++++-----------------------------
 1 file changed, 6 insertions(+), 38 deletions(-)

diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index e6ba6b000564..9fc771a05306 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -156,24 +156,20 @@ void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
 }
 NOKPROBE_SYMBOL(walk_stackframe);
 
-static void dump_backtrace_entry(unsigned long where, const char *loglvl)
+static bool dump_backtrace_entry(void *arg, unsigned long where)
 {
+	char *loglvl = arg;
 	printk("%s %pSb\n", loglvl, (void *)where);
+	return true;
 }
 
 void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 		    const char *loglvl)
 {
-	struct stackframe frame;
-	int skip = 0;
-
 	pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk);
 
-	if (regs) {
-		if (user_mode(regs))
-			return;
-		skip = 1;
-	}
+	if (regs && user_mode(regs))
+		return;
 
 	if (!tsk)
 		tsk = current;
@@ -181,36 +177,8 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 	if (!try_get_task_stack(tsk))
 		return;
 
-	if (tsk == current) {
-		start_backtrace(&frame,
-				(unsigned long)__builtin_frame_address(0),
-				(unsigned long)dump_backtrace);
-	} else {
-		/*
-		 * task blocked in __switch_to
-		 */
-		start_backtrace(&frame,
-				thread_saved_fp(tsk),
-				thread_saved_pc(tsk));
-	}
-
 	printk("%sCall trace:\n", loglvl);
-	do {
-		/* skip until specified stack frame */
-		if (!skip) {
-			dump_backtrace_entry(frame.pc, loglvl);
-		} else if (frame.fp == regs->regs[29]) {
-			skip = 0;
-			/*
-			 * Mostly, this is the case where this function is
-			 * called in panic/abort. As exception handler's
-			 * stack frame does not contain the corresponding pc
-			 * at which an exception has taken place, use regs->pc
-			 * instead.
-			 */
-			dump_backtrace_entry(regs->pc, loglvl);
-		}
-	} while (!unwind_frame(tsk, &frame));
+	arch_stack_walk(dump_backtrace_entry, (void *)loglvl, tsk, regs);
 
 	put_task_stack(tsk);
 }

From d2d1d2645cfd7230fd958fddbc5e7525c34f1374 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 29 Nov 2021 14:28:49 +0000
Subject: [PATCH 16/81] arm64: Make some stacktrace functions private

Now that open-coded stack unwinds have been converted to
arch_stack_walk(), we no longer need to expose any of unwind_frame(),
walk_stackframe(), or start_backtrace() outside of stacktrace.c.

Make those functions private to stacktrace.c, removing their prototypes
from <asm/stacktrace.h> and marking them static.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Mark Brown <broonie@kernel.org>
Cc: Madhavan T. Venkataraman <madvenka@linux.microsoft.com>
Link: https://lore.kernel.org/r/20211129142849.3056714-10-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/stacktrace.h |  6 ------
 arch/arm64/kernel/stacktrace.c      | 12 +++++++-----
 2 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/include/asm/stacktrace.h b/arch/arm64/include/asm/stacktrace.h
index 1367012e0520..e77cdef9ca29 100644
--- a/arch/arm64/include/asm/stacktrace.h
+++ b/arch/arm64/include/asm/stacktrace.h
@@ -63,9 +63,6 @@ struct stackframe {
 #endif
 };
 
-extern int unwind_frame(struct task_struct *tsk, struct stackframe *frame);
-extern void walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
-			    bool (*fn)(void *, unsigned long), void *data);
 extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk,
 			   const char *loglvl);
 
@@ -150,7 +147,4 @@ static inline bool on_accessible_stack(const struct task_struct *tsk,
 	return false;
 }
 
-void start_backtrace(struct stackframe *frame, unsigned long fp,
-		     unsigned long pc);
-
 #endif	/* __ASM_STACKTRACE_H */
diff --git a/arch/arm64/kernel/stacktrace.c b/arch/arm64/kernel/stacktrace.c
index 9fc771a05306..0fb58fed54cb 100644
--- a/arch/arm64/kernel/stacktrace.c
+++ b/arch/arm64/kernel/stacktrace.c
@@ -33,8 +33,8 @@
  */
 
 
-void start_backtrace(struct stackframe *frame, unsigned long fp,
-		     unsigned long pc)
+static void start_backtrace(struct stackframe *frame, unsigned long fp,
+			    unsigned long pc)
 {
 	frame->fp = fp;
 	frame->pc = pc;
@@ -63,7 +63,8 @@ void start_backtrace(struct stackframe *frame, unsigned long fp,
  * records (e.g. a cycle), determined based on the location and fp value of A
  * and the location (but not the fp value) of B.
  */
-int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
+static int notrace unwind_frame(struct task_struct *tsk,
+				struct stackframe *frame)
 {
 	unsigned long fp = frame->fp;
 	struct stack_info info;
@@ -141,8 +142,9 @@ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame)
 }
 NOKPROBE_SYMBOL(unwind_frame);
 
-void notrace walk_stackframe(struct task_struct *tsk, struct stackframe *frame,
-			     bool (*fn)(void *, unsigned long), void *data)
+static void notrace walk_stackframe(struct task_struct *tsk,
+				    struct stackframe *frame,
+				    bool (*fn)(void *, unsigned long), void *data)
 {
 	while (1) {
 		int ret;

From a3a5b763410c7bceacf41a52071134d9dc26202a Mon Sep 17 00:00:00 2001
From: Yunfeng Ye <yeyunfeng@huawei.com>
Date: Thu, 9 Dec 2021 09:42:25 +0800
Subject: [PATCH 17/81] arm64: mm: Rename asid2idx() to ctxid2asid()

The commit 0c8ea531b774 ("arm64: mm: Allocate ASIDs in pairs") introduce
the asid2idx and idx2asid macro, but these macros are not really useful
after the commit f88f42f853a8 ("arm64: context: Free up kernel ASIDs if
KPTI is not in use").

The code "(asid & ~ASID_MASK)" can be instead by a macro, which is the
same code with asid2idx(). So rename it to ctxid2asid() for a better
understanding.

Also we add asid2ctxid() macro, the contextid can be generated based on
the asid and generation through this macro.

Signed-off-by: Yunfeng Ye <yeyunfeng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Link: https://lore.kernel.org/r/c31516eb-6d15-94e0-421c-305fc010ea79@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/context.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index cd72576ae2b7..bbc2708fe928 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -35,8 +35,8 @@ static unsigned long *pinned_asid_map;
 #define ASID_FIRST_VERSION	(1UL << asid_bits)
 
 #define NUM_USER_ASIDS		ASID_FIRST_VERSION
-#define asid2idx(asid)		((asid) & ~ASID_MASK)
-#define idx2asid(idx)		asid2idx(idx)
+#define ctxid2asid(asid)	((asid) & ~ASID_MASK)
+#define asid2ctxid(asid, genid)	((asid) | (genid))
 
 /* Get the ASIDBits supported by the current CPU */
 static u32 get_cpu_asid_bits(void)
@@ -120,7 +120,7 @@ static void flush_context(void)
 		 */
 		if (asid == 0)
 			asid = per_cpu(reserved_asids, i);
-		__set_bit(asid2idx(asid), asid_map);
+		__set_bit(ctxid2asid(asid), asid_map);
 		per_cpu(reserved_asids, i) = asid;
 	}
 
@@ -162,7 +162,7 @@ static u64 new_context(struct mm_struct *mm)
 	u64 generation = atomic64_read(&asid_generation);
 
 	if (asid != 0) {
-		u64 newasid = generation | (asid & ~ASID_MASK);
+		u64 newasid = asid2ctxid(ctxid2asid(asid), generation);
 
 		/*
 		 * If our current ASID was active during a rollover, we
@@ -183,7 +183,7 @@ static u64 new_context(struct mm_struct *mm)
 		 * We had a valid ASID in a previous life, so try to re-use
 		 * it if possible.
 		 */
-		if (!__test_and_set_bit(asid2idx(asid), asid_map))
+		if (!__test_and_set_bit(ctxid2asid(asid), asid_map))
 			return newasid;
 	}
 
@@ -209,7 +209,7 @@ static u64 new_context(struct mm_struct *mm)
 set_asid:
 	__set_bit(asid, asid_map);
 	cur_idx = asid;
-	return idx2asid(asid) | generation;
+	return asid2ctxid(asid, generation);
 }
 
 void check_and_switch_context(struct mm_struct *mm)
@@ -300,13 +300,13 @@ unsigned long arm64_mm_context_get(struct mm_struct *mm)
 	}
 
 	nr_pinned_asids++;
-	__set_bit(asid2idx(asid), pinned_asid_map);
+	__set_bit(ctxid2asid(asid), pinned_asid_map);
 	refcount_set(&mm->context.pinned, 1);
 
 out_unlock:
 	raw_spin_unlock_irqrestore(&cpu_asid_lock, flags);
 
-	asid &= ~ASID_MASK;
+	asid = ctxid2asid(asid);
 
 	/* Set the equivalent of USER_ASID_BIT */
 	if (asid && arm64_kernel_unmapped_at_el0())
@@ -327,7 +327,7 @@ void arm64_mm_context_put(struct mm_struct *mm)
 	raw_spin_lock_irqsave(&cpu_asid_lock, flags);
 
 	if (refcount_dec_and_test(&mm->context.pinned)) {
-		__clear_bit(asid2idx(asid), pinned_asid_map);
+		__clear_bit(ctxid2asid(asid), pinned_asid_map);
 		nr_pinned_asids--;
 	}
 

From 386a74677be13175b5626f094ef37808c45f48b8 Mon Sep 17 00:00:00 2001
From: Yunfeng Ye <yeyunfeng@huawei.com>
Date: Thu, 9 Dec 2021 09:46:03 +0800
Subject: [PATCH 18/81] arm64: mm: Use asid feature macro for cheanup

The commit 95b54c3e4c92 ("KVM: arm64: Add feature register flag
definitions") introduce the ID_AA64MMFR0_ASID_8 and ID_AA64MMFR0_ASID_16
macros.

We can use these macros for cheanup in get_cpu_asid_bits().

No functional change.

Signed-off-by: Yunfeng Ye <yeyunfeng@huawei.com>
Reviewed-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Link: https://lore.kernel.org/r/f71c75d3-735e-b32a-8414-b3e513c77240@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/context.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/mm/context.c b/arch/arm64/mm/context.c
index bbc2708fe928..b8b4cf0bcf39 100644
--- a/arch/arm64/mm/context.c
+++ b/arch/arm64/mm/context.c
@@ -50,10 +50,10 @@ static u32 get_cpu_asid_bits(void)
 		pr_warn("CPU%d: Unknown ASID size (%d); assuming 8-bit\n",
 					smp_processor_id(),  fld);
 		fallthrough;
-	case 0:
+	case ID_AA64MMFR0_ASID_8:
 		asid = 8;
 		break;
-	case 2:
+	case ID_AA64MMFR0_ASID_16:
 		asid = 16;
 	}
 

From 6f6cfa5867995c03959ce8c715e54b51cd5a1528 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 7 Dec 2021 18:32:25 +0000
Subject: [PATCH 19/81] arm64: mm: use die_kernel_fault() in do_mem_abort()

If we take an unhandled fault from EL1, either:

a) The xFSC handler calls die_kernel_fault() directly. In this case,
   die_kernel_fault() calls:

   pr_alert(..., msg, addr);
   mem_abort_decode(esr);
   show_pte(addr);
   die();
   bust_spinlocks(0);
   do_exit(SIGKILL);

b) The xFSC handler returns to do_mem_abort(), indicating failure. In
   this case, do_mem_abort() calls:

   pr_alert(..., addr);
   mem_abort_decode(esr);
   show_pte(addr);
   arm64_notify_die() {
     die();
   }

This inconstency is unfortunatem, and in theory in case (b) registered
notifiers can prevent us from terminating the faulting thread by
returning NOTIFY_STOP, whereupon we'll end up returning from the fault,
replaying, and almost certainly get stuck in a livelock spewing errors
into dmesg. We don't expect notifers to fix things up, since we dump
state to dmesg before invoking them, so it would be more sensible to
consistently terminate the thread in this case.

This patch has do_mem_abort() call die_kernel_fault() for unhandled
faults taken from EL1. Where we would previously have logged a messafe
of the form:

| Unhandled fault at ${ADDR}

... we will now log a message of the form:

| Unable to handle kernel ${FAULT_NAME} at virtual address ${ADDR}

... and we will consistently terminate the thread from which the fault
was taken.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Will Deacon <will@kernel.org>
Tested-by: Andrey Konovalov <andreyknvl@gmail.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20211207183226.834557-2-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/fault.c | 7 ++-----
 1 file changed, 2 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index 9ae24e3b72be..b7b9caa41bc7 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -813,11 +813,8 @@ void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs)
 	if (!inf->fn(far, esr, regs))
 		return;
 
-	if (!user_mode(regs)) {
-		pr_alert("Unhandled fault at 0x%016lx\n", addr);
-		mem_abort_decode(esr);
-		show_pte(addr);
-	}
+	if (!user_mode(regs))
+		die_kernel_fault(inf->name, addr, esr, regs);
 
 	/*
 	 * At this point we have an unrecognized fault type whose tag bits may

From 07b742a4d91260bdb61cd4cbe5ec3bba2ae7f6f9 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Tue, 7 Dec 2021 18:32:26 +0000
Subject: [PATCH 20/81] arm64: mm: log potential KASAN shadow alias

When the kernel is built with KASAN_GENERIC or KASAN_SW_TAGS, shadow
memory is allocated and mapped for all legitimate kernel addresses, and
prior to a regular memory access instrumentation will read from the
corresponding shadow address.

Due to the way memory addresses are converted to shadow addresses, bogus
pointers (e.g. NULL) can generate shadow addresses out of the bounds of
allocated shadow memory. For example, with KASAN_GENERIC and 48-bit VAs,
NULL would have a shadow address of dfff800000000000, which falls
between the TTBR ranges.

To make such cases easier to debug, this patch makes die_kernel_fault()
dump the real memory address range for any potential KASAN shadow access
using kasan_non_canonical_hook(), which results in fault information as
below when KASAN is enabled:

| Unable to handle kernel paging request at virtual address dfff800000000017
| KASAN: null-ptr-deref in range [0x00000000000000b8-0x00000000000000bf]
| Mem abort info:
|   ESR = 0x96000004
|   EC = 0x25: DABT (current EL), IL = 32 bits
|   SET = 0, FnV = 0
|   EA = 0, S1PTW = 0
|   FSC = 0x04: level 0 translation fault
| Data abort info:
|   ISV = 0, ISS = 0x00000004
|   CM = 0, WnR = 0
| [dfff800000000017] address between user and kernel address ranges

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Andrey Konovalov <andreyknvl@gmail.com>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Will Deacon <will@kernel.org>
Tested-by: Andrey Konovalov <andreyknvl@gmail.com>
Acked-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/r/20211207183226.834557-3-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/fault.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c
index b7b9caa41bc7..9a9e7675b187 100644
--- a/arch/arm64/mm/fault.c
+++ b/arch/arm64/mm/fault.c
@@ -297,6 +297,8 @@ static void die_kernel_fault(const char *msg, unsigned long addr,
 	pr_alert("Unable to handle kernel %s at virtual address %016lx\n", msg,
 		 addr);
 
+	kasan_non_canonical_hook(addr);
+
 	mem_abort_decode(esr);
 
 	show_pte(addr);

From 5c13f042e73200b50573ace63e1a6b94e2917616 Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Fri, 10 Dec 2021 16:54:30 +0000
Subject: [PATCH 21/81] arm64: cpufeature: add HWCAP for FEAT_AFP

Add a new HWCAP to detect the Alternate Floating-point Behaviour
feature (FEAT_AFP), introduced in Armv8.7.

Also expose this to userspace in the ID_AA64MMFR1_EL1 feature register.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Will Deacon <will@kernel.org>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211210165432.8106-2-joey.gouly@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/cpu-feature-registers.rst | 9 +++++++++
 Documentation/arm64/elf_hwcaps.rst            | 4 ++++
 arch/arm64/include/asm/hwcap.h                | 1 +
 arch/arm64/include/asm/sysreg.h               | 1 +
 arch/arm64/include/uapi/asm/hwcap.h           | 1 +
 arch/arm64/kernel/cpufeature.c                | 2 ++
 arch/arm64/kernel/cpuinfo.c                   | 1 +
 7 files changed, 19 insertions(+)

diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst
index 9f9b8fd06089..1b19d20c2dbd 100644
--- a/Documentation/arm64/cpu-feature-registers.rst
+++ b/Documentation/arm64/cpu-feature-registers.rst
@@ -275,6 +275,15 @@ infrastructure:
      | SVEVer                       | [3-0]   |    y    |
      +------------------------------+---------+---------+
 
+  8) ID_AA64MMFR1_EL1 - Memory model feature register 1
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | AFP                          | [47-44] |    y    |
+     +------------------------------+---------+---------+
+
+
 Appendix I: Example
 -------------------
 
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index af106af8e1c0..247728d37911 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -251,6 +251,10 @@ HWCAP2_ECV
 
     Functionality implied by ID_AA64MMFR0_EL1.ECV == 0b0001.
 
+HWCAP2_AFP
+
+    Functionality implied by ID_AA64MFR1_EL1.AFP == 0b0001.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index b100e0055eab..2809df2fdd63 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -106,6 +106,7 @@
 #define KERNEL_HWCAP_BTI		__khwcap2_feature(BTI)
 #define KERNEL_HWCAP_MTE		__khwcap2_feature(MTE)
 #define KERNEL_HWCAP_ECV		__khwcap2_feature(ECV)
+#define KERNEL_HWCAP_AFP		__khwcap2_feature(AFP)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index 16b3f1a1d468..adcab9009f9d 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -889,6 +889,7 @@
 #endif
 
 /* id_aa64mmfr1 */
+#define ID_AA64MMFR1_AFP_SHIFT		44
 #define ID_AA64MMFR1_ETS_SHIFT		36
 #define ID_AA64MMFR1_TWED_SHIFT		32
 #define ID_AA64MMFR1_XNX_SHIFT		28
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 7b23b16f21ce..180da7396549 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -76,5 +76,6 @@
 #define HWCAP2_BTI		(1 << 17)
 #define HWCAP2_MTE		(1 << 18)
 #define HWCAP2_ECV		(1 << 19)
+#define HWCAP2_AFP		(1 << 20)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 6f3e677d88f1..71ff5a4afb0f 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -325,6 +325,7 @@ static const struct arm64_ftr_bits ftr_id_aa64mmfr0[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64mmfr1[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_AFP_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_ETS_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_TWED_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64MMFR1_XNX_SHIFT, 4, 0),
@@ -2476,6 +2477,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 	HWCAP_CAP(SYS_ID_AA64PFR1_EL1, ID_AA64PFR1_MTE_SHIFT, FTR_UNSIGNED, ID_AA64PFR1_MTE, CAP_HWCAP, KERNEL_HWCAP_MTE),
 #endif /* CONFIG_ARM64_MTE */
 	HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
+	HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP),
 	{},
 };
 
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 6e27b759056a..0e52014019f6 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -95,6 +95,7 @@ static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_BTI]		= "bti",
 	[KERNEL_HWCAP_MTE]		= "mte",
 	[KERNEL_HWCAP_ECV]		= "ecv",
+	[KERNEL_HWCAP_AFP]		= "afp",
 };
 
 #ifdef CONFIG_COMPAT

From 9e45365f1469ef2b934f9d035975dbc9ad352116 Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Fri, 10 Dec 2021 16:54:31 +0000
Subject: [PATCH 22/81] arm64: add ID_AA64ISAR2_EL1 sys register

This is a new ID register, introduced in 8.7.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Marc Zyngier <maz@kernel.org>
Cc: James Morse <james.morse@arm.com>
Cc: Alexandru Elisei <alexandru.elisei@arm.com>
Cc: Suzuki K Poulose <suzuki.poulose@arm.com>
Cc: Reiji Watanabe <reijiw@google.com>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211210165432.8106-3-joey.gouly@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/cpu.h    |  1 +
 arch/arm64/include/asm/sysreg.h | 15 +++++++++++++++
 arch/arm64/kernel/cpufeature.c  |  9 +++++++++
 arch/arm64/kernel/cpuinfo.c     |  1 +
 arch/arm64/kvm/sys_regs.c       |  2 +-
 5 files changed, 27 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/include/asm/cpu.h b/arch/arm64/include/asm/cpu.h
index 0f6d16faa540..a58e366f0b07 100644
--- a/arch/arm64/include/asm/cpu.h
+++ b/arch/arm64/include/asm/cpu.h
@@ -51,6 +51,7 @@ struct cpuinfo_arm64 {
 	u64		reg_id_aa64dfr1;
 	u64		reg_id_aa64isar0;
 	u64		reg_id_aa64isar1;
+	u64		reg_id_aa64isar2;
 	u64		reg_id_aa64mmfr0;
 	u64		reg_id_aa64mmfr1;
 	u64		reg_id_aa64mmfr2;
diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h
index adcab9009f9d..4704f5893458 100644
--- a/arch/arm64/include/asm/sysreg.h
+++ b/arch/arm64/include/asm/sysreg.h
@@ -182,6 +182,7 @@
 
 #define SYS_ID_AA64ISAR0_EL1		sys_reg(3, 0, 0, 6, 0)
 #define SYS_ID_AA64ISAR1_EL1		sys_reg(3, 0, 0, 6, 1)
+#define SYS_ID_AA64ISAR2_EL1		sys_reg(3, 0, 0, 6, 2)
 
 #define SYS_ID_AA64MMFR0_EL1		sys_reg(3, 0, 0, 7, 0)
 #define SYS_ID_AA64MMFR1_EL1		sys_reg(3, 0, 0, 7, 1)
@@ -771,6 +772,20 @@
 #define ID_AA64ISAR1_GPI_NI			0x0
 #define ID_AA64ISAR1_GPI_IMP_DEF		0x1
 
+/* id_aa64isar2 */
+#define ID_AA64ISAR2_RPRES_SHIFT	4
+#define ID_AA64ISAR2_WFXT_SHIFT		0
+
+#define ID_AA64ISAR2_RPRES_8BIT		0x0
+#define ID_AA64ISAR2_RPRES_12BIT	0x1
+/*
+ * Value 0x1 has been removed from the architecture, and is
+ * reserved, but has not yet been removed from the ARM ARM
+ * as of ARM DDI 0487G.b.
+ */
+#define ID_AA64ISAR2_WFXT_NI		0x0
+#define ID_AA64ISAR2_WFXT_SUPPORTED	0x2
+
 /* id_aa64pfr0 */
 #define ID_AA64PFR0_CSV3_SHIFT		60
 #define ID_AA64PFR0_CSV2_SHIFT		56
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index 71ff5a4afb0f..c36018310da5 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -225,6 +225,10 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
 	ARM64_FTR_END,
 };
 
+static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
+	ARM64_FTR_END,
+};
+
 static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = {
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0),
 	ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0),
@@ -638,6 +642,7 @@ static const struct __ftr_reg_entry {
 	ARM64_FTR_REG(SYS_ID_AA64ISAR0_EL1, ftr_id_aa64isar0),
 	ARM64_FTR_REG_OVERRIDE(SYS_ID_AA64ISAR1_EL1, ftr_id_aa64isar1,
 			       &id_aa64isar1_override),
+	ARM64_FTR_REG(SYS_ID_AA64ISAR2_EL1, ftr_id_aa64isar2),
 
 	/* Op1 = 0, CRn = 0, CRm = 7 */
 	ARM64_FTR_REG(SYS_ID_AA64MMFR0_EL1, ftr_id_aa64mmfr0),
@@ -934,6 +939,7 @@ void __init init_cpu_features(struct cpuinfo_arm64 *info)
 	init_cpu_ftr_reg(SYS_ID_AA64DFR1_EL1, info->reg_id_aa64dfr1);
 	init_cpu_ftr_reg(SYS_ID_AA64ISAR0_EL1, info->reg_id_aa64isar0);
 	init_cpu_ftr_reg(SYS_ID_AA64ISAR1_EL1, info->reg_id_aa64isar1);
+	init_cpu_ftr_reg(SYS_ID_AA64ISAR2_EL1, info->reg_id_aa64isar2);
 	init_cpu_ftr_reg(SYS_ID_AA64MMFR0_EL1, info->reg_id_aa64mmfr0);
 	init_cpu_ftr_reg(SYS_ID_AA64MMFR1_EL1, info->reg_id_aa64mmfr1);
 	init_cpu_ftr_reg(SYS_ID_AA64MMFR2_EL1, info->reg_id_aa64mmfr2);
@@ -1152,6 +1158,8 @@ void update_cpu_features(int cpu,
 				      info->reg_id_aa64isar0, boot->reg_id_aa64isar0);
 	taint |= check_update_ftr_reg(SYS_ID_AA64ISAR1_EL1, cpu,
 				      info->reg_id_aa64isar1, boot->reg_id_aa64isar1);
+	taint |= check_update_ftr_reg(SYS_ID_AA64ISAR2_EL1, cpu,
+				      info->reg_id_aa64isar2, boot->reg_id_aa64isar2);
 
 	/*
 	 * Differing PARange support is fine as long as all peripherals and
@@ -1273,6 +1281,7 @@ u64 __read_sysreg_by_encoding(u32 sys_id)
 	read_sysreg_case(SYS_ID_AA64MMFR2_EL1);
 	read_sysreg_case(SYS_ID_AA64ISAR0_EL1);
 	read_sysreg_case(SYS_ID_AA64ISAR1_EL1);
+	read_sysreg_case(SYS_ID_AA64ISAR2_EL1);
 
 	read_sysreg_case(SYS_CNTFRQ_EL0);
 	read_sysreg_case(SYS_CTR_EL0);
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index 0e52014019f6..f2f8fe02f39c 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -392,6 +392,7 @@ static void __cpuinfo_store_cpu(struct cpuinfo_arm64 *info)
 	info->reg_id_aa64dfr1 = read_cpuid(ID_AA64DFR1_EL1);
 	info->reg_id_aa64isar0 = read_cpuid(ID_AA64ISAR0_EL1);
 	info->reg_id_aa64isar1 = read_cpuid(ID_AA64ISAR1_EL1);
+	info->reg_id_aa64isar2 = read_cpuid(ID_AA64ISAR2_EL1);
 	info->reg_id_aa64mmfr0 = read_cpuid(ID_AA64MMFR0_EL1);
 	info->reg_id_aa64mmfr1 = read_cpuid(ID_AA64MMFR1_EL1);
 	info->reg_id_aa64mmfr2 = read_cpuid(ID_AA64MMFR2_EL1);
diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c
index e3ec1a44f94d..4dc2fba316ff 100644
--- a/arch/arm64/kvm/sys_regs.c
+++ b/arch/arm64/kvm/sys_regs.c
@@ -1525,7 +1525,7 @@ static const struct sys_reg_desc sys_reg_descs[] = {
 	/* CRm=6 */
 	ID_SANITISED(ID_AA64ISAR0_EL1),
 	ID_SANITISED(ID_AA64ISAR1_EL1),
-	ID_UNALLOCATED(6,2),
+	ID_SANITISED(ID_AA64ISAR2_EL1),
 	ID_UNALLOCATED(6,3),
 	ID_UNALLOCATED(6,4),
 	ID_UNALLOCATED(6,5),

From 1175011a7d0030d49dc9c10bde36f08f26d0a8ee Mon Sep 17 00:00:00 2001
From: Joey Gouly <joey.gouly@arm.com>
Date: Fri, 10 Dec 2021 16:54:32 +0000
Subject: [PATCH 23/81] arm64: cpufeature: add HWCAP for FEAT_RPRES

Add a new HWCAP to detect the Increased precision of Reciprocal Estimate
and Reciprocal Square Root Estimate feature (FEAT_RPRES), introduced in Armv8.7.

Also expose this to userspace in the ID_AA64ISAR2_EL1 feature register.

Signed-off-by: Joey Gouly <joey.gouly@arm.com>
Cc: Will Deacon <will@kernel.org>
Cc: Jonathan Corbet <corbet@lwn.net>
Acked-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211210165432.8106-4-joey.gouly@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/cpu-feature-registers.rst | 8 ++++++++
 Documentation/arm64/elf_hwcaps.rst            | 4 ++++
 arch/arm64/include/asm/hwcap.h                | 1 +
 arch/arm64/include/uapi/asm/hwcap.h           | 1 +
 arch/arm64/kernel/cpufeature.c                | 2 ++
 arch/arm64/kernel/cpuinfo.c                   | 1 +
 6 files changed, 17 insertions(+)

diff --git a/Documentation/arm64/cpu-feature-registers.rst b/Documentation/arm64/cpu-feature-registers.rst
index 1b19d20c2dbd..749ae970c319 100644
--- a/Documentation/arm64/cpu-feature-registers.rst
+++ b/Documentation/arm64/cpu-feature-registers.rst
@@ -283,6 +283,14 @@ infrastructure:
      | AFP                          | [47-44] |    y    |
      +------------------------------+---------+---------+
 
+  9) ID_AA64ISAR2_EL1 - Instruction set attribute register 2
+
+     +------------------------------+---------+---------+
+     | Name                         |  bits   | visible |
+     +------------------------------+---------+---------+
+     | RPRES                        | [7-4]   |    y    |
+     +------------------------------+---------+---------+
+
 
 Appendix I: Example
 -------------------
diff --git a/Documentation/arm64/elf_hwcaps.rst b/Documentation/arm64/elf_hwcaps.rst
index 247728d37911..b72ff17d600a 100644
--- a/Documentation/arm64/elf_hwcaps.rst
+++ b/Documentation/arm64/elf_hwcaps.rst
@@ -255,6 +255,10 @@ HWCAP2_AFP
 
     Functionality implied by ID_AA64MFR1_EL1.AFP == 0b0001.
 
+HWCAP2_RPRES
+
+    Functionality implied by ID_AA64ISAR2_EL1.RPRES == 0b0001.
+
 4. Unused AT_HWCAP bits
 -----------------------
 
diff --git a/arch/arm64/include/asm/hwcap.h b/arch/arm64/include/asm/hwcap.h
index 2809df2fdd63..f68fbb207473 100644
--- a/arch/arm64/include/asm/hwcap.h
+++ b/arch/arm64/include/asm/hwcap.h
@@ -107,6 +107,7 @@
 #define KERNEL_HWCAP_MTE		__khwcap2_feature(MTE)
 #define KERNEL_HWCAP_ECV		__khwcap2_feature(ECV)
 #define KERNEL_HWCAP_AFP		__khwcap2_feature(AFP)
+#define KERNEL_HWCAP_RPRES		__khwcap2_feature(RPRES)
 
 /*
  * This yields a mask that user programs can use to figure out what
diff --git a/arch/arm64/include/uapi/asm/hwcap.h b/arch/arm64/include/uapi/asm/hwcap.h
index 180da7396549..f03731847d9d 100644
--- a/arch/arm64/include/uapi/asm/hwcap.h
+++ b/arch/arm64/include/uapi/asm/hwcap.h
@@ -77,5 +77,6 @@
 #define HWCAP2_MTE		(1 << 18)
 #define HWCAP2_ECV		(1 << 19)
 #define HWCAP2_AFP		(1 << 20)
+#define HWCAP2_RPRES		(1 << 21)
 
 #endif /* _UAPI__ASM_HWCAP_H */
diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c
index c36018310da5..a46ab3b1c4d5 100644
--- a/arch/arm64/kernel/cpufeature.c
+++ b/arch/arm64/kernel/cpufeature.c
@@ -226,6 +226,7 @@ static const struct arm64_ftr_bits ftr_id_aa64isar1[] = {
 };
 
 static const struct arm64_ftr_bits ftr_id_aa64isar2[] = {
+	ARM64_FTR_BITS(FTR_VISIBLE, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64ISAR2_RPRES_SHIFT, 4, 0),
 	ARM64_FTR_END,
 };
 
@@ -2487,6 +2488,7 @@ static const struct arm64_cpu_capabilities arm64_elf_hwcaps[] = {
 #endif /* CONFIG_ARM64_MTE */
 	HWCAP_CAP(SYS_ID_AA64MMFR0_EL1, ID_AA64MMFR0_ECV_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_ECV),
 	HWCAP_CAP(SYS_ID_AA64MMFR1_EL1, ID_AA64MMFR1_AFP_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_AFP),
+	HWCAP_CAP(SYS_ID_AA64ISAR2_EL1, ID_AA64ISAR2_RPRES_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, KERNEL_HWCAP_RPRES),
 	{},
 };
 
diff --git a/arch/arm64/kernel/cpuinfo.c b/arch/arm64/kernel/cpuinfo.c
index f2f8fe02f39c..591c18a889a5 100644
--- a/arch/arm64/kernel/cpuinfo.c
+++ b/arch/arm64/kernel/cpuinfo.c
@@ -96,6 +96,7 @@ static const char *const hwcap_str[] = {
 	[KERNEL_HWCAP_MTE]		= "mte",
 	[KERNEL_HWCAP_ECV]		= "ecv",
 	[KERNEL_HWCAP_AFP]		= "afp",
+	[KERNEL_HWCAP_RPRES]		= "rpres",
 };
 
 #ifdef CONFIG_COMPAT

From 369461ce8fb6c8156206c7110d7da48e9fbc41bb Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 8 Dec 2021 14:11:20 -0600
Subject: [PATCH 24/81] x86: perf: Move RDPMC event flag to a common definition

In preparation to enable user counter access on arm64 and to move some
of the user access handling to perf core, create a common event flag for
user counter access and convert x86 to use it.

Since the architecture specific flags start at the LSB, starting at the
MSB for common flags.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Kan Liang <kan.liang@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Borislav Petkov <bp@alien8.de>
Cc: x86@kernel.org
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: linux-perf-users@vger.kernel.org
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211208201124.310740-2-robh@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/x86/events/core.c       | 10 +++++-----
 arch/x86/events/perf_event.h |  2 +-
 include/linux/perf_event.h   |  9 +++++++++
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/arch/x86/events/core.c b/arch/x86/events/core.c
index 38b2c779146f..68dea7ce6a22 100644
--- a/arch/x86/events/core.c
+++ b/arch/x86/events/core.c
@@ -2476,7 +2476,7 @@ static int x86_pmu_event_init(struct perf_event *event)
 
 	if (READ_ONCE(x86_pmu.attr_rdpmc) &&
 	    !(event->hw.flags & PERF_X86_EVENT_LARGE_PEBS))
-		event->hw.flags |= PERF_X86_EVENT_RDPMC_ALLOWED;
+		event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
 
 	return err;
 }
@@ -2510,7 +2510,7 @@ void perf_clear_dirty_counters(void)
 
 static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 {
-	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+	if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
 		return;
 
 	/*
@@ -2531,7 +2531,7 @@ static void x86_pmu_event_mapped(struct perf_event *event, struct mm_struct *mm)
 
 static void x86_pmu_event_unmapped(struct perf_event *event, struct mm_struct *mm)
 {
-	if (!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+	if (!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT))
 		return;
 
 	if (atomic_dec_and_test(&mm->context.perf_rdpmc_allowed))
@@ -2542,7 +2542,7 @@ static int x86_pmu_event_idx(struct perf_event *event)
 {
 	struct hw_perf_event *hwc = &event->hw;
 
-	if (!(hwc->flags & PERF_X86_EVENT_RDPMC_ALLOWED))
+	if (!(hwc->flags & PERF_EVENT_FLAG_USER_READ_CNT))
 		return 0;
 
 	if (is_metric_idx(hwc->idx))
@@ -2725,7 +2725,7 @@ void arch_perf_update_userpage(struct perf_event *event,
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
 	userpg->cap_user_rdpmc =
-		!!(event->hw.flags & PERF_X86_EVENT_RDPMC_ALLOWED);
+		!!(event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT);
 	userpg->pmc_width = x86_pmu.cntval_bits;
 
 	if (!using_native_sched_clock() || !sched_clock_stable())
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index 5480db242083..9d376e528dfc 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -74,7 +74,7 @@ static inline bool constraint_match(struct event_constraint *c, u64 ecode)
 #define PERF_X86_EVENT_PEBS_NA_HSW	0x0010 /* haswell style datala, unknown */
 #define PERF_X86_EVENT_EXCL		0x0020 /* HT exclusivity on counter */
 #define PERF_X86_EVENT_DYNAMIC		0x0040 /* dynamic alloc'd constraint */
-#define PERF_X86_EVENT_RDPMC_ALLOWED	0x0080 /* grant rdpmc permission */
+
 #define PERF_X86_EVENT_EXCL_ACCT	0x0100 /* accounted EXCL event */
 #define PERF_X86_EVENT_AUTO_RELOAD	0x0200 /* use PEBS auto-reload */
 #define PERF_X86_EVENT_LARGE_PEBS	0x0400 /* use large PEBS */
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 0dcfd265beed..ba9467972c09 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -129,6 +129,15 @@ struct hw_perf_event_extra {
 	int		idx;	/* index in shared_regs->regs[] */
 };
 
+/**
+ * hw_perf_event::flag values
+ *
+ * PERF_EVENT_FLAG_ARCH bits are reserved for architecture-specific
+ * usage.
+ */
+#define PERF_EVENT_FLAG_ARCH			0x0000ffff
+#define PERF_EVENT_FLAG_USER_READ_CNT		0x80000000
+
 /**
  * struct hw_perf_event - performance event hardware details:
  */

From 82ff0c022d19c2ad69a472692bb7ee01ac07a40b Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 8 Dec 2021 14:11:21 -0600
Subject: [PATCH 25/81] perf: Add a counter for number of user access events in
 context

On arm64, user space counter access will be controlled differently
compared to x86. On x86, access in the strictest mode is enabled for all
tasks in an MM when any event is mmap'ed. For arm64, access is
explicitly requested for an event and only enabled when the event's
context is active. This avoids hooks into the arch context switch code
and gives better control of when access is enabled.

In order to configure user space access when the PMU is enabled, it is
necessary to know if any event (currently active or not) in the current
context has user space accessed enabled. Add a counter similar to other
counters in the context to avoid walking the event list every time.

Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Reviewed-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211208201124.310740-3-robh@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 include/linux/perf_event.h | 1 +
 kernel/events/core.c       | 4 ++++
 2 files changed, 5 insertions(+)

diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index ba9467972c09..411e34210fbf 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -831,6 +831,7 @@ struct perf_event_context {
 
 	int				nr_events;
 	int				nr_active;
+	int				nr_user;
 	int				is_active;
 	int				nr_stat;
 	int				nr_freq;
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 30d94f68c5bd..1362b9b770d8 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -1808,6 +1808,8 @@ list_add_event(struct perf_event *event, struct perf_event_context *ctx)
 
 	list_add_rcu(&event->event_entry, &ctx->event_list);
 	ctx->nr_events++;
+	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+		ctx->nr_user++;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat++;
 
@@ -1999,6 +2001,8 @@ list_del_event(struct perf_event *event, struct perf_event_context *ctx)
 	event->attach_state &= ~PERF_ATTACH_CONTEXT;
 
 	ctx->nr_events--;
+	if (event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT)
+		ctx->nr_user--;
 	if (event->attr.inherit_stat)
 		ctx->nr_stat--;
 

From e2012600810c9ded81f6f63a8d04781be3c300ad Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 8 Dec 2021 14:11:22 -0600
Subject: [PATCH 26/81] arm64: perf: Add userspace counter access disable
 switch

Like x86, some users may want to disable userspace PMU counter
altogether. Add a sysctl 'perf_user_access' file to control userspace
counter access. The default is '0' which is disabled. Writing '1'
enables access.

Note that x86 supports globally enabling user access by writing '2' to
/sys/bus/event_source/devices/cpu/rdpmc. As there's not existing
userspace support to worry about, this shouldn't be necessary for Arm.
It could be added later if the need arises.

Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: linux-perf-users@vger.kernel.org
Acked-by: Will Deacon <will@kernel.org>
Reviewed-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211208201124.310740-4-robh@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/admin-guide/sysctl/kernel.rst | 11 +++++++++++
 arch/arm64/kernel/perf_event.c              | 17 +++++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/Documentation/admin-guide/sysctl/kernel.rst b/Documentation/admin-guide/sysctl/kernel.rst
index 0e486f41185e..d359bcfadd39 100644
--- a/Documentation/admin-guide/sysctl/kernel.rst
+++ b/Documentation/admin-guide/sysctl/kernel.rst
@@ -905,6 +905,17 @@ enabled, otherwise writing to this file will return ``-EBUSY``.
 The default value is 8.
 
 
+perf_user_access (arm64 only)
+=================================
+
+Controls user space access for reading perf event counters. When set to 1,
+user space can read performance monitor counter registers directly.
+
+The default value is 0 (access disabled).
+
+See Documentation/arm64/perf.rst for more information.
+
+
 pid_max
 =======
 
diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index b4044469527e..6ae20c4217af 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -286,6 +286,8 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = {
 PMU_FORMAT_ATTR(event, "config:0-15");
 PMU_FORMAT_ATTR(long, "config1:0");
 
+static int sysctl_perf_user_access __read_mostly;
+
 static inline bool armv8pmu_event_is_64bit(struct perf_event *event)
 {
 	return event->attr.config1 & 0x1;
@@ -1104,6 +1106,19 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
 	return probe.present ? 0 : -ENODEV;
 }
 
+static struct ctl_table armv8_pmu_sysctl_table[] = {
+	{
+		.procname       = "perf_user_access",
+		.data		= &sysctl_perf_user_access,
+		.maxlen		= sizeof(unsigned int),
+		.mode           = 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= SYSCTL_ZERO,
+		.extra2		= SYSCTL_ONE,
+	},
+	{ }
+};
+
 static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 			  int (*map_event)(struct perf_event *event),
 			  const struct attribute_group *events,
@@ -1136,6 +1151,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ?
 			caps : &armv8_pmuv3_caps_attr_group;
 
+	register_sysctl("kernel", armv8_pmu_sysctl_table);
+
 	return 0;
 }
 

From 83a7a4d643d33a8b74a42229346b7ed7139fcef9 Mon Sep 17 00:00:00 2001
From: Rob Herring <robh@kernel.org>
Date: Wed, 8 Dec 2021 14:11:23 -0600
Subject: [PATCH 27/81] arm64: perf: Enable PMU counter userspace access for
 perf event

Arm PMUs can support direct userspace access of counters which allows for
low overhead (i.e. no syscall) self-monitoring of tasks. The same feature
exists on x86 called 'rdpmc'. Unlike x86, userspace access will only be
enabled for thread bound events. This could be extended if needed, but
simplifies the implementation and reduces the chances for any
information leaks (which the x86 implementation suffers from).

PMU EL0 access will be enabled when an event with userspace access is
part of the thread's context. This includes when the event is not
scheduled on the PMU. There's some additional overhead clearing
dirty counters when access is enabled in order to prevent leaking
disabled counter data from other tasks.

Unlike x86, enabling of userspace access must be requested with a new
attr bit: config1:1. If the user requests userspace access with 64-bit
counters, then the event open will fail if the h/w doesn't support
64-bit counters. Chaining is not supported with userspace access. The
modes for config1 are as follows:

config1 = 0 : user access disabled and always 32-bit
config1 = 1 : user access disabled and always 64-bit (using chaining if needed)
config1 = 2 : user access enabled and always 32-bit
config1 = 3 : user access enabled and always 64-bit

Based on work by Raphael Gault <raphael.gault@arm.com>, but has been
completely re-written.

Cc: Will Deacon <will@kernel.org>
Cc: Mark Rutland <mark.rutland@arm.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: linux-arm-kernel@lists.infradead.org
Cc: linux-perf-users@vger.kernel.org
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211208201124.310740-5-robh@kernel.org
[will: Made armv8pmu_proc_user_access_handler() static]
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 119 +++++++++++++++++++++++++++++++--
 1 file changed, 112 insertions(+), 7 deletions(-)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 6ae20c4217af..81cc9f0e718a 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -285,6 +285,7 @@ static const struct attribute_group armv8_pmuv3_events_attr_group = {
 
 PMU_FORMAT_ATTR(event, "config:0-15");
 PMU_FORMAT_ATTR(long, "config1:0");
+PMU_FORMAT_ATTR(rdpmc, "config1:1");
 
 static int sysctl_perf_user_access __read_mostly;
 
@@ -293,9 +294,15 @@ static inline bool armv8pmu_event_is_64bit(struct perf_event *event)
 	return event->attr.config1 & 0x1;
 }
 
+static inline bool armv8pmu_event_want_user_access(struct perf_event *event)
+{
+	return event->attr.config1 & 0x2;
+}
+
 static struct attribute *armv8_pmuv3_format_attrs[] = {
 	&format_attr_event.attr,
 	&format_attr_long.attr,
+	&format_attr_rdpmc.attr,
 	NULL,
 };
 
@@ -364,7 +371,7 @@ static const struct attribute_group armv8_pmuv3_caps_attr_group = {
  */
 #define	ARMV8_IDX_CYCLE_COUNTER	0
 #define	ARMV8_IDX_COUNTER0	1
-
+#define	ARMV8_IDX_CYCLE_COUNTER_USER	32
 
 /*
  * We unconditionally enable ARMv8.5-PMU long event counter support
@@ -376,18 +383,22 @@ static bool armv8pmu_has_long_event(struct arm_pmu *cpu_pmu)
 	return (cpu_pmu->pmuver >= ID_AA64DFR0_PMUVER_8_5);
 }
 
+static inline bool armv8pmu_event_has_user_read(struct perf_event *event)
+{
+	return event->hw.flags & PERF_EVENT_FLAG_USER_READ_CNT;
+}
+
 /*
  * We must chain two programmable counters for 64 bit events,
  * except when we have allocated the 64bit cycle counter (for CPU
- * cycles event). This must be called only when the event has
- * a counter allocated.
+ * cycles event) or when user space counter access is enabled.
  */
 static inline bool armv8pmu_event_is_chained(struct perf_event *event)
 {
 	int idx = event->hw.idx;
 	struct arm_pmu *cpu_pmu = to_arm_pmu(event->pmu);
 
-	return !WARN_ON(idx < 0) &&
+	return !armv8pmu_event_has_user_read(event) &&
 	       armv8pmu_event_is_64bit(event) &&
 	       !armv8pmu_has_long_event(cpu_pmu) &&
 	       (idx != ARMV8_IDX_CYCLE_COUNTER);
@@ -720,6 +731,28 @@ static inline u32 armv8pmu_getreset_flags(void)
 	return value;
 }
 
+static void armv8pmu_disable_user_access(void)
+{
+	write_sysreg(0, pmuserenr_el0);
+}
+
+static void armv8pmu_enable_user_access(struct arm_pmu *cpu_pmu)
+{
+	int i;
+	struct pmu_hw_events *cpuc = this_cpu_ptr(cpu_pmu->hw_events);
+
+	/* Clear any unused counters to avoid leaking their contents */
+	for_each_clear_bit(i, cpuc->used_mask, cpu_pmu->num_events) {
+		if (i == ARMV8_IDX_CYCLE_COUNTER)
+			write_sysreg(0, pmccntr_el0);
+		else
+			armv8pmu_write_evcntr(i, 0);
+	}
+
+	write_sysreg(0, pmuserenr_el0);
+	write_sysreg(ARMV8_PMU_USERENR_ER | ARMV8_PMU_USERENR_CR, pmuserenr_el0);
+}
+
 static void armv8pmu_enable_event(struct perf_event *event)
 {
 	/*
@@ -763,6 +796,14 @@ static void armv8pmu_disable_event(struct perf_event *event)
 
 static void armv8pmu_start(struct arm_pmu *cpu_pmu)
 {
+	struct perf_event_context *task_ctx =
+		this_cpu_ptr(cpu_pmu->pmu.pmu_cpu_context)->task_ctx;
+
+	if (sysctl_perf_user_access && task_ctx && task_ctx->nr_user)
+		armv8pmu_enable_user_access(cpu_pmu);
+	else
+		armv8pmu_disable_user_access();
+
 	/* Enable all counters */
 	armv8pmu_pmcr_write(armv8pmu_pmcr_read() | ARMV8_PMU_PMCR_E);
 }
@@ -880,13 +921,16 @@ static int armv8pmu_get_event_idx(struct pmu_hw_events *cpuc,
 	if (evtype == ARMV8_PMUV3_PERFCTR_CPU_CYCLES) {
 		if (!test_and_set_bit(ARMV8_IDX_CYCLE_COUNTER, cpuc->used_mask))
 			return ARMV8_IDX_CYCLE_COUNTER;
+		else if (armv8pmu_event_is_64bit(event) &&
+			   armv8pmu_event_want_user_access(event) &&
+			   !armv8pmu_has_long_event(cpu_pmu))
+				return -EAGAIN;
 	}
 
 	/*
 	 * Otherwise use events counters
 	 */
-	if (armv8pmu_event_is_64bit(event) &&
-	    !armv8pmu_has_long_event(cpu_pmu))
+	if (armv8pmu_event_is_chained(event))
 		return	armv8pmu_get_chain_idx(cpuc, cpu_pmu);
 	else
 		return armv8pmu_get_single_idx(cpuc, cpu_pmu);
@@ -902,6 +946,22 @@ static void armv8pmu_clear_event_idx(struct pmu_hw_events *cpuc,
 		clear_bit(idx - 1, cpuc->used_mask);
 }
 
+static int armv8pmu_user_event_idx(struct perf_event *event)
+{
+	if (!sysctl_perf_user_access || !armv8pmu_event_has_user_read(event))
+		return 0;
+
+	/*
+	 * We remap the cycle counter index to 32 to
+	 * match the offset applied to the rest of
+	 * the counter indices.
+	 */
+	if (event->hw.idx == ARMV8_IDX_CYCLE_COUNTER)
+		return ARMV8_IDX_CYCLE_COUNTER_USER;
+
+	return event->hw.idx;
+}
+
 /*
  * Add an event filter to a given event.
  */
@@ -998,6 +1058,25 @@ static int __armv8_pmuv3_map_event(struct perf_event *event,
 	if (armv8pmu_event_is_64bit(event))
 		event->hw.flags |= ARMPMU_EVT_64BIT;
 
+	/*
+	 * User events must be allocated into a single counter, and so
+	 * must not be chained.
+	 *
+	 * Most 64-bit events require long counter support, but 64-bit
+	 * CPU_CYCLES events can be placed into the dedicated cycle
+	 * counter when this is free.
+	 */
+	if (armv8pmu_event_want_user_access(event)) {
+		if (!(event->attach_state & PERF_ATTACH_TASK))
+			return -EINVAL;
+		if (armv8pmu_event_is_64bit(event) &&
+		    (hw_event_id != ARMV8_PMUV3_PERFCTR_CPU_CYCLES) &&
+		    !armv8pmu_has_long_event(armpmu))
+			return -EOPNOTSUPP;
+
+		event->hw.flags |= PERF_EVENT_FLAG_USER_READ_CNT;
+	}
+
 	/* Only expose micro/arch events supported by this PMU */
 	if ((hw_event_id > 0) && (hw_event_id < ARMV8_PMUV3_MAX_COMMON_EVENTS)
 	    && test_bit(hw_event_id, armpmu->pmceid_bitmap)) {
@@ -1106,13 +1185,29 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu)
 	return probe.present ? 0 : -ENODEV;
 }
 
+static void armv8pmu_disable_user_access_ipi(void *unused)
+{
+	armv8pmu_disable_user_access();
+}
+
+static int armv8pmu_proc_user_access_handler(struct ctl_table *table, int write,
+		void *buffer, size_t *lenp, loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	if (ret || !write || sysctl_perf_user_access)
+		return ret;
+
+	on_each_cpu(armv8pmu_disable_user_access_ipi, NULL, 1);
+	return 0;
+}
+
 static struct ctl_table armv8_pmu_sysctl_table[] = {
 	{
 		.procname       = "perf_user_access",
 		.data		= &sysctl_perf_user_access,
 		.maxlen		= sizeof(unsigned int),
 		.mode           = 0644,
-		.proc_handler	= proc_dointvec_minmax,
+		.proc_handler	= armv8pmu_proc_user_access_handler,
 		.extra1		= SYSCTL_ZERO,
 		.extra2		= SYSCTL_ONE,
 	},
@@ -1142,6 +1237,8 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->set_event_filter	= armv8pmu_set_event_filter;
 	cpu_pmu->filter_match		= armv8pmu_filter_match;
 
+	cpu_pmu->pmu.event_idx		= armv8pmu_user_event_idx;
+
 	cpu_pmu->name			= name;
 	cpu_pmu->map_event		= map_event;
 	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = events ?
@@ -1318,6 +1415,14 @@ void arch_perf_update_userpage(struct perf_event *event,
 	userpg->cap_user_time = 0;
 	userpg->cap_user_time_zero = 0;
 	userpg->cap_user_time_short = 0;
+	userpg->cap_user_rdpmc = armv8pmu_event_has_user_read(event);
+
+	if (userpg->cap_user_rdpmc) {
+		if (event->hw.flags & ARMPMU_EVT_64BIT)
+			userpg->pmc_width = 64;
+		else
+			userpg->pmc_width = 32;
+	}
 
 	do {
 		rd = sched_clock_read_begin(&seq);

From aa1005d15d2aee10e5b93a25db076c47e05c4efa Mon Sep 17 00:00:00 2001
From: Raphael Gault <raphael.gault@arm.com>
Date: Wed, 8 Dec 2021 14:11:24 -0600
Subject: [PATCH 28/81] Documentation: arm64: Document PMU counters access from
 userspace

Add documentation to describe the access to the pmu hardware counters from
userspace.

Signed-off-by: Raphael Gault <raphael.gault@arm.com>
Signed-off-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211208201124.310740-6-robh@kernel.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 Documentation/arm64/perf.rst | 78 +++++++++++++++++++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/Documentation/arm64/perf.rst b/Documentation/arm64/perf.rst
index b567f177d385..1f87b57c2332 100644
--- a/Documentation/arm64/perf.rst
+++ b/Documentation/arm64/perf.rst
@@ -2,7 +2,10 @@
 
 .. _perf_index:
 
-=====================
+====
+Perf
+====
+
 Perf Event Attributes
 =====================
 
@@ -88,3 +91,76 @@ exclude_host. However when using !exclude_hv there is a small blackout
 window at the guest entry/exit where host events are not captured.
 
 On VHE systems there are no blackout windows.
+
+Perf Userspace PMU Hardware Counter Access
+==========================================
+
+Overview
+--------
+The perf userspace tool relies on the PMU to monitor events. It offers an
+abstraction layer over the hardware counters since the underlying
+implementation is cpu-dependent.
+Arm64 allows userspace tools to have access to the registers storing the
+hardware counters' values directly.
+
+This targets specifically self-monitoring tasks in order to reduce the overhead
+by directly accessing the registers without having to go through the kernel.
+
+How-to
+------
+The focus is set on the armv8 PMUv3 which makes sure that the access to the pmu
+registers is enabled and that the userspace has access to the relevant
+information in order to use them.
+
+In order to have access to the hardware counters, the global sysctl
+kernel/perf_user_access must first be enabled:
+
+.. code-block:: sh
+
+  echo 1 > /proc/sys/kernel/perf_user_access
+
+It is necessary to open the event using the perf tool interface with config1:1
+attr bit set: the sys_perf_event_open syscall returns a fd which can
+subsequently be used with the mmap syscall in order to retrieve a page of memory
+containing information about the event. The PMU driver uses this page to expose
+to the user the hardware counter's index and other necessary data. Using this
+index enables the user to access the PMU registers using the `mrs` instruction.
+Access to the PMU registers is only valid while the sequence lock is unchanged.
+In particular, the PMSELR_EL0 register is zeroed each time the sequence lock is
+changed.
+
+The userspace access is supported in libperf using the perf_evsel__mmap()
+and perf_evsel__read() functions. See `tools/lib/perf/tests/test-evsel.c`_ for
+an example.
+
+About heterogeneous systems
+---------------------------
+On heterogeneous systems such as big.LITTLE, userspace PMU counter access can
+only be enabled when the tasks are pinned to a homogeneous subset of cores and
+the corresponding PMU instance is opened by specifying the 'type' attribute.
+The use of generic event types is not supported in this case.
+
+Have a look at `tools/perf/arch/arm64/tests/user-events.c`_ for an example. It
+can be run using the perf tool to check that the access to the registers works
+correctly from userspace:
+
+.. code-block:: sh
+
+  perf test -v user
+
+About chained events and counter sizes
+--------------------------------------
+The user can request either a 32-bit (config1:0 == 0) or 64-bit (config1:0 == 1)
+counter along with userspace access. The sys_perf_event_open syscall will fail
+if a 64-bit counter is requested and the hardware doesn't support 64-bit
+counters. Chained events are not supported in conjunction with userspace counter
+access. If a 32-bit counter is requested on hardware with 64-bit counters, then
+userspace must treat the upper 32-bits read from the counter as UNKNOWN. The
+'pmc_width' field in the user page will indicate the valid width of the counter
+and should be used to mask the upper bits as needed.
+
+.. Links
+.. _tools/perf/arch/arm64/tests/user-events.c:
+   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/perf/arch/arm64/tests/user-events.c
+.. _tools/lib/perf/tests/test-evsel.c:
+   https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/tools/lib/perf/tests/test-evsel.c

From 56c7c6eaf3eb8ac1ec40d56096c0f2b27250da5f Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:50 +0000
Subject: [PATCH 29/81] perf/arm-cmn: Fix CPU hotplug unregistration

Attempting to migrate the PMU context after we've unregistered the PMU
device, or especially if we never successfully registered it in the
first place, is a woefully bad idea. It's also fundamentally pointless
anyway. Make sure to unregister an instance from the hotplug handler
*without* invoking the teardown callback.

Fixes: 0ba64770a2f2 ("perf: Add Arm CMN-600 PMU driver")
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/2c221d745544774e4b07583b65b5d4d94f7e0fe4.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index bc3cba5f8c5d..400eb7f579dc 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -1561,7 +1561,8 @@ static int arm_cmn_probe(struct platform_device *pdev)
 
 	err = perf_pmu_register(&cmn->pmu, name, -1);
 	if (err)
-		cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node);
+		cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node);
+
 	return err;
 }
 
@@ -1572,7 +1573,7 @@ static int arm_cmn_remove(struct platform_device *pdev)
 	writel_relaxed(0, cmn->dtc[0].base + CMN_DT_DTC_CTL);
 
 	perf_pmu_unregister(&cmn->pmu);
-	cpuhp_state_remove_instance(arm_cmn_hp_state, &cmn->cpuhp_node);
+	cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node);
 	return 0;
 }
 

From 6190741c294d1cad15198d5d2f912868434fa492 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:51 +0000
Subject: [PATCH 30/81] perf/arm-cmn: Account for NUMA affinity

On a system with multiple CMN meshes, ideally we'd want to access each
PMU from within its own mesh, rather than with a long CML round-trip,
wherever feasible. Since such a system is likely to be presented as
multiple NUMA nodes, let's also hope a proximity domain is specified
for each CMN programming interface, and use that to guide our choice
of IRQ affinity to favour a node-local CPU where possible.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/32438b0d016e0649d882d47d30ac2000484287b9.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 51 +++++++++++++++++++++++++++++++-----------
 1 file changed, 38 insertions(+), 13 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 400eb7f579dc..02b898dbba91 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -1147,23 +1147,47 @@ static int arm_cmn_commit_txn(struct pmu *pmu)
 	return 0;
 }
 
-static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+static void arm_cmn_migrate(struct arm_cmn *cmn, unsigned int cpu)
+{
+	unsigned int i;
+
+	perf_pmu_migrate_context(&cmn->pmu, cmn->cpu, cpu);
+	for (i = 0; i < cmn->num_dtcs; i++)
+		irq_set_affinity(cmn->dtc[i].irq, cpumask_of(cpu));
+	cmn->cpu = cpu;
+}
+
+static int arm_cmn_pmu_online_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
 {
 	struct arm_cmn *cmn;
-	unsigned int i, target;
+	int node;
 
-	cmn = hlist_entry_safe(node, struct arm_cmn, cpuhp_node);
+	cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node);
+	node = dev_to_node(cmn->dev);
+	if (node != NUMA_NO_NODE && cpu_to_node(cmn->cpu) != node && cpu_to_node(cpu) == node)
+		arm_cmn_migrate(cmn, cpu);
+	return 0;
+}
+
+static int arm_cmn_pmu_offline_cpu(unsigned int cpu, struct hlist_node *cpuhp_node)
+{
+	struct arm_cmn *cmn;
+	unsigned int target;
+	int node;
+	cpumask_t mask;
+
+	cmn = hlist_entry_safe(cpuhp_node, struct arm_cmn, cpuhp_node);
 	if (cpu != cmn->cpu)
 		return 0;
 
-	target = cpumask_any_but(cpu_online_mask, cpu);
-	if (target >= nr_cpu_ids)
-		return 0;
-
-	perf_pmu_migrate_context(&cmn->pmu, cpu, target);
-	for (i = 0; i < cmn->num_dtcs; i++)
-		irq_set_affinity(cmn->dtc[i].irq, cpumask_of(target));
-	cmn->cpu = target;
+	node = dev_to_node(cmn->dev);
+	if (cpumask_and(&mask, cpumask_of_node(node), cpu_online_mask) &&
+	    cpumask_andnot(&mask, &mask, cpumask_of(cpu)))
+		target = cpumask_any(&mask);
+	else
+		target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target < nr_cpu_ids)
+		arm_cmn_migrate(cmn, target);
 	return 0;
 }
 
@@ -1532,7 +1556,7 @@ static int arm_cmn_probe(struct platform_device *pdev)
 	if (err)
 		return err;
 
-	cmn->cpu = raw_smp_processor_id();
+	cmn->cpu = cpumask_local_spread(0, dev_to_node(cmn->dev));
 	cmn->pmu = (struct pmu) {
 		.module = THIS_MODULE,
 		.attr_groups = arm_cmn_attr_groups,
@@ -1608,7 +1632,8 @@ static int __init arm_cmn_init(void)
 	int ret;
 
 	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
-				      "perf/arm/cmn:online", NULL,
+				      "perf/arm/cmn:online",
+				      arm_cmn_pmu_online_cpu,
 				      arm_cmn_pmu_offline_cpu);
 	if (ret < 0)
 		return ret;

From 82d8ea4b450074e81748830929bbd94eebbaffea Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:52 +0000
Subject: [PATCH 31/81] perf/arm-cmn: Drop compile-test restriction

Although CMN is currently (and overwhelmingly likely to remain) deployed
in arm64-only (modulo userspace) systems, the 64-bit "dependency" for
compile-testing was just laziness due to heavy reliance on readq/writeq
accessors. Since we only need one extra include for robustness in that
regard, let's pull that in, widen the compile-test coverage, and fix up
the smattering of type laziness that that brings to light.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/baee9ee0d0bdad8aaeb70f5a4b98d8fd4b1f5786.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/Kconfig   |  2 +-
 drivers/perf/arm-cmn.c | 25 +++++++++++++------------
 2 files changed, 14 insertions(+), 13 deletions(-)

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 4374af292e6d..1070515d340c 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -43,7 +43,7 @@ config ARM_CCN
 
 config ARM_CMN
 	tristate "Arm CMN-600 PMU support"
-	depends on ARM64 || (COMPILE_TEST && 64BIT)
+	depends on ARM64 || COMPILE_TEST
 	help
 	  Support for PMU events monitoring on the Arm CMN-600 Coherent Mesh
 	  Network interconnect.
diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 02b898dbba91..1d52fcfe3a0d 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -7,6 +7,7 @@
 #include <linux/bitops.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
+#include <linux/io-64-nonatomic-lo-hi.h>
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/module.h>
@@ -122,11 +123,11 @@
 
 
 /* Event attributes */
-#define CMN_CONFIG_TYPE			GENMASK(15, 0)
-#define CMN_CONFIG_EVENTID		GENMASK(23, 16)
-#define CMN_CONFIG_OCCUPID		GENMASK(27, 24)
-#define CMN_CONFIG_BYNODEID		BIT(31)
-#define CMN_CONFIG_NODEID		GENMASK(47, 32)
+#define CMN_CONFIG_TYPE			GENMASK_ULL(15, 0)
+#define CMN_CONFIG_EVENTID		GENMASK_ULL(23, 16)
+#define CMN_CONFIG_OCCUPID		GENMASK_ULL(27, 24)
+#define CMN_CONFIG_BYNODEID		BIT_ULL(31)
+#define CMN_CONFIG_NODEID		GENMASK_ULL(47, 32)
 
 #define CMN_EVENT_TYPE(event)		FIELD_GET(CMN_CONFIG_TYPE, (event)->attr.config)
 #define CMN_EVENT_EVENTID(event)	FIELD_GET(CMN_CONFIG_EVENTID, (event)->attr.config)
@@ -134,13 +135,13 @@
 #define CMN_EVENT_BYNODEID(event)	FIELD_GET(CMN_CONFIG_BYNODEID, (event)->attr.config)
 #define CMN_EVENT_NODEID(event)		FIELD_GET(CMN_CONFIG_NODEID, (event)->attr.config)
 
-#define CMN_CONFIG_WP_COMBINE		GENMASK(27, 24)
-#define CMN_CONFIG_WP_DEV_SEL		BIT(48)
-#define CMN_CONFIG_WP_CHN_SEL		GENMASK(50, 49)
-#define CMN_CONFIG_WP_GRP		BIT(52)
-#define CMN_CONFIG_WP_EXCLUSIVE		BIT(53)
-#define CMN_CONFIG1_WP_VAL		GENMASK(63, 0)
-#define CMN_CONFIG2_WP_MASK		GENMASK(63, 0)
+#define CMN_CONFIG_WP_COMBINE		GENMASK_ULL(27, 24)
+#define CMN_CONFIG_WP_DEV_SEL		BIT_ULL(48)
+#define CMN_CONFIG_WP_CHN_SEL		GENMASK_ULL(50, 49)
+#define CMN_CONFIG_WP_GRP		BIT_ULL(52)
+#define CMN_CONFIG_WP_EXCLUSIVE		BIT_ULL(53)
+#define CMN_CONFIG1_WP_VAL		GENMASK_ULL(63, 0)
+#define CMN_CONFIG2_WP_MASK		GENMASK_ULL(63, 0)
 
 #define CMN_EVENT_WP_COMBINE(event)	FIELD_GET(CMN_CONFIG_WP_COMBINE, (event)->attr.config)
 #define CMN_EVENT_WP_DEV_SEL(event)	FIELD_GET(CMN_CONFIG_WP_DEV_SEL, (event)->attr.config)

From 5f167eab83f153c2c6f80cfe419e269d5f481b09 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:53 +0000
Subject: [PATCH 32/81] perf/arm-cmn: Refactor node ID handling

Add a bit more abstraction for the places where we decompose node IDs.
This will help keep things nice and manageable when we come to add yet
more variables which affect the node ID format. Also use the opportunity
to move the rest of the low-level node management helpers back up to the
logical place they were meant to be - how they ended up buried right in
the middle of the event-related definitions is somewhat of a mystery...

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/a2242a8c3c96056c13a04ae87bf2047e5e64d2d9.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 94 +++++++++++++++++++++++++-----------------
 1 file changed, 56 insertions(+), 38 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 1d52fcfe3a0d..adf50d613734 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -255,6 +255,58 @@ struct arm_cmn {
 
 static int arm_cmn_hp_state;
 
+struct arm_cmn_nodeid {
+	u8 x;
+	u8 y;
+	u8 port;
+	u8 dev;
+};
+
+static int arm_cmn_xyidbits(const struct arm_cmn *cmn)
+{
+	int dim = max(cmn->mesh_x, cmn->mesh_y);
+
+	return dim > 4 ? 3 : 2;
+}
+
+static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id)
+{
+	struct arm_cmn_nodeid nid;
+	int bits = arm_cmn_xyidbits(cmn);
+
+	nid.x = CMN_NODEID_X(id, bits);
+	nid.y = CMN_NODEID_Y(id, bits);
+	nid.port = CMN_NODEID_PID(id);
+	nid.dev = CMN_NODEID_DEVID(id);
+
+	return nid;
+}
+
+static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn,
+				    struct arm_cmn_node *dn)
+{
+	struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
+	int xp_idx = cmn->mesh_x * nid.y + nid.x;
+
+	dn->to_xp = (cmn->xps + xp_idx) - dn;
+}
+
+static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn)
+{
+	return dn->type == CMN_TYPE_XP ? dn : dn + dn->to_xp;
+}
+
+static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn,
+					 enum cmn_node_type type)
+{
+	int i;
+
+	for (i = 0; i < cmn->num_dns; i++)
+		if (cmn->dns[i].type == type)
+			return &cmn->dns[i];
+	return NULL;
+}
+
 struct arm_cmn_hw_event {
 	struct arm_cmn_node *dn;
 	u64 dtm_idx[2];
@@ -295,38 +347,6 @@ struct arm_cmn_format_attr {
 	int config;
 };
 
-static int arm_cmn_xyidbits(const struct arm_cmn *cmn)
-{
-	return cmn->mesh_x > 4 || cmn->mesh_y > 4 ? 3 : 2;
-}
-
-static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn,
-				    struct arm_cmn_node *dn)
-{
-	int bits = arm_cmn_xyidbits(cmn);
-	int x = CMN_NODEID_X(dn->id, bits);
-	int y = CMN_NODEID_Y(dn->id, bits);
-	int xp_idx = cmn->mesh_x * y + x;
-
-	dn->to_xp = (cmn->xps + xp_idx) - dn;
-}
-
-static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn)
-{
-	return dn->type == CMN_TYPE_XP ? dn : dn + dn->to_xp;
-}
-
-static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn,
-					 enum cmn_node_type type)
-{
-	int i;
-
-	for (i = 0; i < cmn->num_dns; i++)
-		if (cmn->dns[i].type == type)
-			return &cmn->dns[i];
-	return NULL;
-}
-
 #define CMN_EVENT_ATTR(_name, _type, _eventid, _occupid)		\
 	(&((struct arm_cmn_event_attr[]) {{				\
 		.attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL),	\
@@ -966,11 +986,10 @@ static int arm_cmn_event_init(struct perf_event *event)
 	}
 
 	if (!hw->num_dns) {
-		int bits = arm_cmn_xyidbits(cmn);
+		struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, nodeid);
 
 		dev_dbg(cmn->dev, "invalid node 0x%x (%d,%d,%d,%d) type 0x%x\n",
-			nodeid, CMN_NODEID_X(nodeid, bits), CMN_NODEID_Y(nodeid, bits),
-			CMN_NODEID_PID(nodeid), CMN_NODEID_DEVID(nodeid), type);
+			nodeid, nid.x, nid.y, nid.port, nid.dev, type);
 		return -EINVAL;
 	}
 	/*
@@ -1068,11 +1087,10 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 			dn->wp_event[wp_idx] = dtc_idx;
 			writel_relaxed(cfg, dn->pmu_base + CMN_DTM_WPn_CONFIG(wp_idx));
 		} else {
-			unsigned int port = CMN_NODEID_PID(dn->id);
-			unsigned int dev = CMN_NODEID_DEVID(dn->id);
+			struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
 
 			input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx +
-				    (port << 4) + (dev << 2);
+				    (nid.port << 4) + (nid.dev << 2);
 
 			if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) {
 				int occupid = CMN_EVENT_OCCUPID(event);

From da5f7d2c8019c9dd053e2d94fdc1b3e7c03c35a5 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:54 +0000
Subject: [PATCH 33/81] perf/arm-cmn: Streamline node iteration

Refactor the places where we scan through the set of nodes to switch
from explicit array indexing to pointer-based iteration. This leads to
slightly simpler object code, but also makes the source less dense and
more pleasant for further development. It also unearths an almost-bug
in arm_cmn_event_init() where we've been depending on the "array index"
of NULL relative to cmn->dns being a sufficiently large number, yuck.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/ee0c9eda9a643f46001ac43aadf3f0b1fd5660dd.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 33 ++++++++++++++++++++-------------
 1 file changed, 20 insertions(+), 13 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index adf50d613734..da7bf779fec2 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -299,11 +299,11 @@ static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn)
 static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn,
 					 enum cmn_node_type type)
 {
-	int i;
+	struct arm_cmn_node *dn;
 
-	for (i = 0; i < cmn->num_dns; i++)
-		if (cmn->dns[i].type == type)
-			return &cmn->dns[i];
+	for (dn = cmn->dns; dn->type; dn++)
+		if (dn->type == type)
+			return dn;
 	return NULL;
 }
 
@@ -941,8 +941,8 @@ static int arm_cmn_event_init(struct perf_event *event)
 {
 	struct arm_cmn *cmn = to_cmn(event->pmu);
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
+	struct arm_cmn_node *dn;
 	enum cmn_node_type type;
-	unsigned int i;
 	bool bynodeid;
 	u16 nodeid, eventid;
 
@@ -974,10 +974,12 @@ static int arm_cmn_event_init(struct perf_event *event)
 	nodeid = CMN_EVENT_NODEID(event);
 
 	hw->dn = arm_cmn_node(cmn, type);
-	for (i = hw->dn - cmn->dns; i < cmn->num_dns && cmn->dns[i].type == type; i++) {
+	if (!hw->dn)
+		return -EINVAL;
+	for (dn = hw->dn; dn->type == type; dn++) {
 		if (!bynodeid) {
 			hw->num_dns++;
-		} else if (cmn->dns[i].id != nodeid) {
+		} else if (dn->id != nodeid) {
 			hw->dn++;
 		} else {
 			hw->num_dns = 1;
@@ -1332,7 +1334,7 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 
 	cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP);
 
-	for (dn = cmn->dns; dn < cmn->dns + cmn->num_dns; dn++) {
+	for (dn = cmn->dns; dn->type; dn++) {
 		if (dn->type != CMN_TYPE_XP)
 			arm_cmn_init_node_to_xp(cmn, dn);
 		else if (cmn->num_dtcs == 1)
@@ -1382,6 +1384,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	u32 xp_offset[CMN_MAX_XPS];
 	u64 reg;
 	int i, j;
+	size_t sz;
 
 	cfg_region = cmn->base + rgn_offset;
 	reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2);
@@ -1408,14 +1411,13 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		cmn->num_dns += FIELD_GET(CMN_CI_CHILD_COUNT, reg);
 	}
 
-	/* Cheeky +1 to help terminate pointer-based iteration */
-	cmn->dns = devm_kcalloc(cmn->dev, cmn->num_dns + 1,
-				sizeof(*cmn->dns), GFP_KERNEL);
-	if (!cmn->dns)
+	/* Cheeky +1 to help terminate pointer-based iteration later */
+	dn = devm_kcalloc(cmn->dev, cmn->num_dns + 1, sizeof(*dn), GFP_KERNEL);
+	if (!dn)
 		return -ENOMEM;
 
 	/* Pass 2: now we can actually populate the nodes */
-	dn = cmn->dns;
+	cmn->dns = dn;
 	for (i = 0; i < cmn->num_xps; i++) {
 		void __iomem *xp_region = cmn->base + xp_offset[i];
 		struct arm_cmn_node *xp = dn++;
@@ -1484,6 +1486,11 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	/* Correct for any nodes we skipped */
 	cmn->num_dns = dn - cmn->dns;
 
+	sz = (void *)(dn + 1) - (void *)cmn->dns;
+	dn = devm_krealloc(cmn->dev, cmn->dns, sz, GFP_KERNEL);
+	if (dn)
+		cmn->dns = dn;
+
 	/*
 	 * If mesh_x wasn't set during discovery then we never saw
 	 * an XP at (0,1), thus we must have an Nx1 configuration.

From 0947c80aba23972987a88e620812d17a7af27297 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:55 +0000
Subject: [PATCH 34/81] perf/arm-cmn: Refactor DTM handling

Untangle DTMs from XPs into a dedicated abstraction. This helps make
things a little more obvious and robust, but primarily paves the way
for further development where new IPs can grow extra DTMs per XP.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/9cca18b1b98f482df7f1aaf3d3213e7f39500423.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 169 +++++++++++++++++++++--------------------
 1 file changed, 87 insertions(+), 82 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index da7bf779fec2..8b98ca9666d0 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -35,14 +35,9 @@
 #define CMN_CHILD_NODE_ADDR		GENMASK(27, 0)
 #define CMN_CHILD_NODE_EXTERNAL		BIT(31)
 
-#define CMN_ADDR_NODE_PTR		GENMASK(27, 14)
-
-#define CMN_NODE_PTR_DEVID(ptr)		(((ptr) >> 2) & 3)
-#define CMN_NODE_PTR_PID(ptr)		((ptr) & 1)
-#define CMN_NODE_PTR_X(ptr, bits)	((ptr) >> (6 + (bits)))
-#define CMN_NODE_PTR_Y(ptr, bits)	(((ptr) >> 6) & ((1U << (bits)) - 1))
-
-#define CMN_MAX_XPS			(8 * 8)
+#define CMN_MAX_DIMENSION		8
+#define CMN_MAX_XPS			(CMN_MAX_DIMENSION * CMN_MAX_DIMENSION)
+#define CMN_MAX_DTMS			CMN_MAX_XPS
 
 /* The CFG node has one other useful purpose */
 #define CMN_CFGM_PERIPH_ID_2		0x0010
@@ -190,32 +185,32 @@ struct arm_cmn_node {
 	u16 id, logid;
 	enum cmn_node_type type;
 
+	int dtm;
 	union {
-		/* Device node */
+		/* DN/HN-F/CXHA */
 		struct {
-			int to_xp;
-			/* DN/HN-F/CXHA */
-			unsigned int occupid_val;
-			unsigned int occupid_count;
+			u8 occupid_val;
+			u8 occupid_count;
 		};
 		/* XP */
-		struct {
-			int dtc;
-			u32 pmu_config_low;
-			union {
-				u8 input_sel[4];
-				__le32 pmu_config_high;
-			};
-			s8 wp_event[4];
-		};
+		int dtc;
 	};
-
 	union {
 		u8 event[4];
 		__le32 event_sel;
 	};
 };
 
+struct arm_cmn_dtm {
+	void __iomem *base;
+	u32 pmu_config_low;
+	union {
+		u8 input_sel[4];
+		__le32 pmu_config_high;
+	};
+	s8 wp_event[4];
+};
+
 struct arm_cmn_dtc {
 	void __iomem *base;
 	int irq;
@@ -241,6 +236,7 @@ struct arm_cmn {
 	struct arm_cmn_node *xps;
 	struct arm_cmn_node *dns;
 
+	struct arm_cmn_dtm *dtms;
 	struct arm_cmn_dtc *dtc;
 	unsigned int num_dtcs;
 
@@ -282,20 +278,14 @@ static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id)
 	return nid;
 }
 
-static void arm_cmn_init_node_to_xp(const struct arm_cmn *cmn,
-				    struct arm_cmn_node *dn)
+static struct arm_cmn_node *arm_cmn_node_to_xp(const struct arm_cmn *cmn,
+					       const struct arm_cmn_node *dn)
 {
 	struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
 	int xp_idx = cmn->mesh_x * nid.y + nid.x;
 
-	dn->to_xp = (cmn->xps + xp_idx) - dn;
+	return cmn->xps + xp_idx;
 }
-
-static struct arm_cmn_node *arm_cmn_node_to_xp(struct arm_cmn_node *dn)
-{
-	return dn->type == CMN_TYPE_XP ? dn : dn + dn->to_xp;
-}
-
 static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn,
 					 enum cmn_node_type type)
 {
@@ -706,9 +696,9 @@ static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw,
 
 	offset = snapshot ? CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT;
 	for_each_hw_dn(hw, dn, i) {
-		struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn);
+		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm];
 		int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
-		u64 reg = readq_relaxed(xp->pmu_base + offset);
+		u64 reg = readq_relaxed(dtm->base + offset);
 		u16 dtm_count = reg >> (dtm_idx * 16);
 
 		count += dtm_count;
@@ -835,9 +825,9 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
 }
 
 struct arm_cmn_val {
-	u8 dtm_count[CMN_MAX_XPS];
-	u8 occupid[CMN_MAX_XPS];
-	u8 wp[CMN_MAX_XPS][4];
+	u8 dtm_count[CMN_MAX_DTMS];
+	u8 occupid[CMN_MAX_DTMS];
+	u8 wp[CMN_MAX_DTMS][4];
 	int dtc_count;
 	bool cycles;
 };
@@ -866,16 +856,16 @@ static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev
 		occupid = 0;
 
 	for_each_hw_dn(hw, dn, i) {
-		int wp_idx, xp = arm_cmn_node_to_xp(dn)->logid;
+		int wp_idx, dtm = dn->dtm;
 
-		val->dtm_count[xp]++;
-		val->occupid[xp] = occupid;
+		val->dtm_count[dtm]++;
+		val->occupid[dtm] = occupid;
 
 		if (type != CMN_TYPE_WP)
 			continue;
 
 		wp_idx = arm_cmn_wp_idx(event);
-		val->wp[xp][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1;
+		val->wp[dtm][wp_idx] = CMN_EVENT_WP_COMBINE(event) + 1;
 	}
 }
 
@@ -914,22 +904,22 @@ static int arm_cmn_validate_group(struct perf_event *event)
 		occupid = 0;
 
 	for_each_hw_dn(hw, dn, i) {
-		int wp_idx, wp_cmb, xp = arm_cmn_node_to_xp(dn)->logid;
+		int wp_idx, wp_cmb, dtm = dn->dtm;
 
-		if (val.dtm_count[xp] == CMN_DTM_NUM_COUNTERS)
+		if (val.dtm_count[dtm] == CMN_DTM_NUM_COUNTERS)
 			return -EINVAL;
 
-		if (occupid && val.occupid[xp] && occupid != val.occupid[xp])
+		if (occupid && val.occupid[dtm] && occupid != val.occupid[dtm])
 			return -EINVAL;
 
 		if (type != CMN_TYPE_WP)
 			continue;
 
 		wp_idx = arm_cmn_wp_idx(event);
-		if (val.wp[xp][wp_idx])
+		if (val.wp[dtm][wp_idx])
 			return -EINVAL;
 
-		wp_cmb = val.wp[xp][wp_idx ^ 1];
+		wp_cmb = val.wp[dtm][wp_idx ^ 1];
 		if (wp_cmb && wp_cmb != CMN_EVENT_WP_COMBINE(event) + 1)
 			return -EINVAL;
 	}
@@ -1010,17 +1000,17 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
 	enum cmn_node_type type = CMN_EVENT_TYPE(event);
 
 	while (i--) {
-		struct arm_cmn_node *xp = arm_cmn_node_to_xp(hw->dn + i);
+		struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm];
 		unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
 
 		if (type == CMN_TYPE_WP)
-			hw->dn[i].wp_event[arm_cmn_wp_idx(event)] = -1;
+			dtm->wp_event[arm_cmn_wp_idx(event)] = -1;
 
 		if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event)))
 			hw->dn[i].occupid_count--;
 
-		xp->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx);
-		writel_relaxed(xp->pmu_config_low, xp->pmu_base + CMN_DTM_PMU_CONFIG);
+		dtm->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx);
+		writel_relaxed(dtm->pmu_config_low, dtm->base + CMN_DTM_PMU_CONFIG);
 	}
 	memset(hw->dtm_idx, 0, sizeof(hw->dtm_idx));
 
@@ -1062,12 +1052,12 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 
 	/* ...then the local counters to feed it. */
 	for_each_hw_dn(hw, dn, i) {
-		struct arm_cmn_node *xp = arm_cmn_node_to_xp(dn);
+		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm];
 		unsigned int dtm_idx, shift;
 		u64 reg;
 
 		dtm_idx = 0;
-		while (xp->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx))
+		while (dtm->pmu_config_low & CMN__PMEVCNT_PAIRED(dtm_idx))
 			if (++dtm_idx == CMN_DTM_NUM_COUNTERS)
 				goto free_dtms;
 
@@ -1077,17 +1067,17 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 			int tmp, wp_idx = arm_cmn_wp_idx(event);
 			u32 cfg = arm_cmn_wp_config(event);
 
-			if (dn->wp_event[wp_idx] >= 0)
+			if (dtm->wp_event[wp_idx] >= 0)
 				goto free_dtms;
 
-			tmp = dn->wp_event[wp_idx ^ 1];
+			tmp = dtm->wp_event[wp_idx ^ 1];
 			if (tmp >= 0 && CMN_EVENT_WP_COMBINE(event) !=
 					CMN_EVENT_WP_COMBINE(dtc->counters[tmp]))
 				goto free_dtms;
 
 			input_sel = CMN__PMEVCNT0_INPUT_SEL_WP + wp_idx;
-			dn->wp_event[wp_idx] = dtc_idx;
-			writel_relaxed(cfg, dn->pmu_base + CMN_DTM_WPn_CONFIG(wp_idx));
+			dtm->wp_event[wp_idx] = dtc_idx;
+			writel_relaxed(cfg, dtm->base + CMN_DTM_WPn_CONFIG(wp_idx));
 		} else {
 			struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
 
@@ -1095,7 +1085,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 				    (nid.port << 4) + (nid.dev << 2);
 
 			if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) {
-				int occupid = CMN_EVENT_OCCUPID(event);
+				u8 occupid = CMN_EVENT_OCCUPID(event);
 
 				if (dn->occupid_count == 0) {
 					dn->occupid_val = occupid;
@@ -1110,13 +1100,13 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 
 		arm_cmn_set_index(hw->dtm_idx, i, dtm_idx);
 
-		xp->input_sel[dtm_idx] = input_sel;
+		dtm->input_sel[dtm_idx] = input_sel;
 		shift = CMN__PMEVCNTn_GLOBAL_NUM_SHIFT(dtm_idx);
-		xp->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift);
-		xp->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift;
-		xp->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx);
-		reg = (u64)le32_to_cpu(xp->pmu_config_high) << 32 | xp->pmu_config_low;
-		writeq_relaxed(reg, xp->pmu_base + CMN_DTM_PMU_CONFIG);
+		dtm->pmu_config_low &= ~(CMN__PMEVCNT0_GLOBAL_NUM << shift);
+		dtm->pmu_config_low |= FIELD_PREP(CMN__PMEVCNT0_GLOBAL_NUM, dtc_idx) << shift;
+		dtm->pmu_config_low |= CMN__PMEVCNT_PAIRED(dtm_idx);
+		reg = (u64)le32_to_cpu(dtm->pmu_config_high) << 32 | dtm->pmu_config_low;
+		writeq_relaxed(reg, dtm->base + CMN_DTM_PMU_CONFIG);
 	}
 
 	/* Go go go! */
@@ -1276,23 +1266,22 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn)
 	return 0;
 }
 
-static void arm_cmn_init_dtm(struct arm_cmn_node *xp)
+static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp)
 {
 	int i;
 
+	dtm->base = xp->pmu_base;
+	dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN;
 	for (i = 0; i < 4; i++) {
-		xp->wp_event[i] = -1;
-		writeq_relaxed(0, xp->pmu_base + CMN_DTM_WPn_MASK(i));
-		writeq_relaxed(~0ULL, xp->pmu_base + CMN_DTM_WPn_VAL(i));
+		dtm->wp_event[i] = -1;
+		writeq_relaxed(0, dtm->base + CMN_DTM_WPn_MASK(i));
+		writeq_relaxed(~0ULL, dtm->base + CMN_DTM_WPn_VAL(i));
 	}
-	xp->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN;
-	xp->dtc = -1;
 }
 
 static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int idx)
 {
 	struct arm_cmn_dtc *dtc = cmn->dtc + idx;
-	struct arm_cmn_node *xp;
 
 	dtc->base = dn->pmu_base - CMN_PMU_OFFSET;
 	dtc->irq = platform_get_irq(to_platform_device(cmn->dev), idx);
@@ -1303,10 +1292,6 @@ static int arm_cmn_init_dtc(struct arm_cmn *cmn, struct arm_cmn_node *dn, int id
 	writel_relaxed(0x1ff, dtc->base + CMN_DT_PMOVSR_CLR);
 	writel_relaxed(CMN_DT_PMCR_OVFL_INTR_EN, dtc->base + CMN_DT_PMCR);
 
-	/* We do at least know that a DTC's XP must be in that DTC's domain */
-	xp = arm_cmn_node_to_xp(dn);
-	xp->dtc = idx;
-
 	return 0;
 }
 
@@ -1323,7 +1308,7 @@ static int arm_cmn_node_cmp(const void *a, const void *b)
 
 static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 {
-	struct arm_cmn_node *dn;
+	struct arm_cmn_node *dn, *xp;
 	int dtc_idx = 0;
 
 	cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL);
@@ -1335,13 +1320,24 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 	cmn->xps = arm_cmn_node(cmn, CMN_TYPE_XP);
 
 	for (dn = cmn->dns; dn->type; dn++) {
-		if (dn->type != CMN_TYPE_XP)
-			arm_cmn_init_node_to_xp(cmn, dn);
-		else if (cmn->num_dtcs == 1)
-			dn->dtc = 0;
+		if (dn->type == CMN_TYPE_XP) {
+			if (dn->dtc < 0 && cmn->num_dtcs == 1)
+				dn->dtc = 0;
+			continue;
+		}
 
-		if (dn->type == CMN_TYPE_DTC)
-			arm_cmn_init_dtc(cmn, dn, dtc_idx++);
+		xp = arm_cmn_node_to_xp(cmn, dn);
+		dn->dtm = xp->dtm;
+
+		if (dn->type == CMN_TYPE_DTC) {
+			int err;
+			/* We do at least know that a DTC's XP must be in that DTC's domain */
+			if (xp->dtc < 0)
+				xp->dtc = dtc_idx;
+			err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
+			if (err)
+				return err;
+		}
 
 		/* To the PMU, RN-Ds don't add anything over RN-Is, so smoosh them together */
 		if (dn->type == CMN_TYPE_RND)
@@ -1380,6 +1376,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 {
 	void __iomem *cfg_region;
 	struct arm_cmn_node cfg, *dn;
+	struct arm_cmn_dtm *dtm;
 	u16 child_count, child_poff;
 	u32 xp_offset[CMN_MAX_XPS];
 	u64 reg;
@@ -1416,14 +1413,18 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	if (!dn)
 		return -ENOMEM;
 
+	dtm = devm_kcalloc(cmn->dev, cmn->num_xps, sizeof(*dtm), GFP_KERNEL);
+	if (!dtm)
+		return -ENOMEM;
+
 	/* Pass 2: now we can actually populate the nodes */
 	cmn->dns = dn;
+	cmn->dtms = dtm;
 	for (i = 0; i < cmn->num_xps; i++) {
 		void __iomem *xp_region = cmn->base + xp_offset[i];
 		struct arm_cmn_node *xp = dn++;
 
 		arm_cmn_init_node_info(cmn, xp_offset[i], xp);
-		arm_cmn_init_dtm(xp);
 		/*
 		 * Thanks to the order in which XP logical IDs seem to be
 		 * assigned, we can handily infer the mesh X dimension by
@@ -1433,6 +1434,10 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		if (xp->id == (1 << 3))
 			cmn->mesh_x = xp->logid;
 
+		xp->dtc = -1;
+		xp->dtm = dtm - cmn->dtms;
+		arm_cmn_init_dtm(dtm++, xp);
+
 		reg = readq_relaxed(xp_region + CMN_CHILD_INFO);
 		child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg);
 		child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg);

From 847eef94e6327dd7690bfac0bd3a81a7ba6aa1ee Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:56 +0000
Subject: [PATCH 35/81] perf/arm-cmn: Optimise DTM counter reads

When multiple nodes of the same type are connected to the same XP
(particularly in CAL configurations), it seems that they are likely
to be consecutive in logical ID. Therefore, we're likely to gain a
small benefit from an easy tweak to optimise out consecutive reads
of the same set of DTM counters for an aggregated event.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/7777d77c2df17693cd3dabb6e268906e15238d82.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 17 +++++++++--------
 1 file changed, 9 insertions(+), 8 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 8b98ca9666d0..005a0d83bcac 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -690,18 +690,19 @@ static void arm_cmn_pmu_disable(struct pmu *pmu)
 static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw,
 			    bool snapshot)
 {
+	struct arm_cmn_dtm *dtm = NULL;
 	struct arm_cmn_node *dn;
-	unsigned int i, offset;
-	u64 count = 0;
+	unsigned int i, offset, dtm_idx;
+	u64 reg, count = 0;
 
 	offset = snapshot ? CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT;
 	for_each_hw_dn(hw, dn, i) {
-		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm];
-		int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
-		u64 reg = readq_relaxed(dtm->base + offset);
-		u16 dtm_count = reg >> (dtm_idx * 16);
-
-		count += dtm_count;
+		if (dtm != &cmn->dtms[dn->dtm]) {
+			dtm = &cmn->dtms[dn->dtm];
+			reg = readq_relaxed(dtm->base + offset);
+		}
+		dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
+		count += (u16)(reg >> (dtm_idx * 16));
 	}
 	return count;
 }

From 4f2c3872dde55090bf39e1f12a8517a32b6cd048 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:57 +0000
Subject: [PATCH 36/81] perf/arm-cmn: Optimise DTC counter accesses

In cases where we do know which DTC domain a node belongs to, we can
skip initialising or reading the global count in DTCs where we know
it won't change. The machinery to achieve that is mostly in place
already, so finish hooking it up by converting the vestigial domain
tracking to propagate suitable bitmaps all the way through to events.

Note that this does not allow allocating such an unused counter to a
different event on that DTC, because that is a flippin' nightmare.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/51d930fd945ef51c81f5889ccca055c302b0a1d0.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 29 ++++++++++++-----------------
 1 file changed, 12 insertions(+), 17 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 005a0d83bcac..acff8683af2c 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -193,7 +193,7 @@ struct arm_cmn_node {
 			u8 occupid_count;
 		};
 		/* XP */
-		int dtc;
+		u8 dtc;
 	};
 	union {
 		u8 event[4];
@@ -968,14 +968,14 @@ static int arm_cmn_event_init(struct perf_event *event)
 	if (!hw->dn)
 		return -EINVAL;
 	for (dn = hw->dn; dn->type == type; dn++) {
-		if (!bynodeid) {
-			hw->num_dns++;
-		} else if (dn->id != nodeid) {
+		if (bynodeid && dn->id != nodeid) {
 			hw->dn++;
-		} else {
-			hw->num_dns = 1;
-			break;
+			continue;
 		}
+		hw->dtcs_used |= arm_cmn_node_to_xp(cmn, dn)->dtc;
+		hw->num_dns++;
+		if (bynodeid)
+			break;
 	}
 
 	if (!hw->num_dns) {
@@ -985,11 +985,6 @@ static int arm_cmn_event_init(struct perf_event *event)
 			nodeid, nid.x, nid.y, nid.port, nid.dev, type);
 		return -EINVAL;
 	}
-	/*
-	 * By assuming events count in all DTC domains, we cunningly avoid
-	 * needing to know anything about how XPs are assigned to domains.
-	 */
-	hw->dtcs_used = (1U << cmn->num_dtcs) - 1;
 
 	return arm_cmn_validate_group(event);
 }
@@ -1311,6 +1306,7 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 {
 	struct arm_cmn_node *dn, *xp;
 	int dtc_idx = 0;
+	u8 dtcs_present = (1 << cmn->num_dtcs) - 1;
 
 	cmn->dtc = devm_kcalloc(cmn->dev, cmn->num_dtcs, sizeof(cmn->dtc[0]), GFP_KERNEL);
 	if (!cmn->dtc)
@@ -1322,8 +1318,7 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 
 	for (dn = cmn->dns; dn->type; dn++) {
 		if (dn->type == CMN_TYPE_XP) {
-			if (dn->dtc < 0 && cmn->num_dtcs == 1)
-				dn->dtc = 0;
+			dn->dtc &= dtcs_present;
 			continue;
 		}
 
@@ -1333,8 +1328,8 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 		if (dn->type == CMN_TYPE_DTC) {
 			int err;
 			/* We do at least know that a DTC's XP must be in that DTC's domain */
-			if (xp->dtc < 0)
-				xp->dtc = dtc_idx;
+			if (xp->dtc == 0xf)
+				xp->dtc = 1 << dtc_idx;
 			err = arm_cmn_init_dtc(cmn, dn, dtc_idx++);
 			if (err)
 				return err;
@@ -1435,7 +1430,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		if (xp->id == (1 << 3))
 			cmn->mesh_x = xp->logid;
 
-		xp->dtc = -1;
+		xp->dtc = 0xf;
 		xp->dtm = dtm - cmn->dtms;
 		arm_cmn_init_dtm(dtm++, xp);
 

From 558a07807038017255005a4820f600da643d8a5f Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:58 +0000
Subject: [PATCH 37/81] perf/arm-cmn: Move group validation data off-stack

With the value of CMN_MAX_DTMS increasing significantly, our validation
data structure is set to get quite big. Technically we could pack it at
least twice as densely, since we only need around 19 bits of information
per DTM, but that makes the code even more mind-bogglingly impenetrable,
and even half of "quite big" may still be uncomfortably large for a
stack frame (~1KB). Just move it to an off-stack allocation instead.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/0cabff2e5839ddc0979e757c55515966f65359e4.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 43 ++++++++++++++++++++++++------------------
 1 file changed, 25 insertions(+), 18 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index acff8683af2c..d2dd02f040b8 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -876,8 +876,8 @@ static int arm_cmn_validate_group(struct perf_event *event)
 	struct arm_cmn_node *dn;
 	struct perf_event *sibling, *leader = event->group_leader;
 	enum cmn_node_type type;
-	struct arm_cmn_val val;
-	int i;
+	struct arm_cmn_val *val;
+	int i, ret = -EINVAL;
 	u8 occupid;
 
 	if (leader == event)
@@ -886,18 +886,22 @@ static int arm_cmn_validate_group(struct perf_event *event)
 	if (event->pmu != leader->pmu && !is_software_event(leader))
 		return -EINVAL;
 
-	memset(&val, 0, sizeof(val));
+	val = kzalloc(sizeof(*val), GFP_KERNEL);
+	if (!val)
+		return -ENOMEM;
 
-	arm_cmn_val_add_event(&val, leader);
+	arm_cmn_val_add_event(val, leader);
 	for_each_sibling_event(sibling, leader)
-		arm_cmn_val_add_event(&val, sibling);
+		arm_cmn_val_add_event(val, sibling);
 
 	type = CMN_EVENT_TYPE(event);
-	if (type == CMN_TYPE_DTC)
-		return val.cycles ? -EINVAL : 0;
+	if (type == CMN_TYPE_DTC) {
+		ret = val->cycles ? -EINVAL : 0;
+		goto done;
+	}
 
-	if (val.dtc_count == CMN_DT_NUM_COUNTERS)
-		return -EINVAL;
+	if (val->dtc_count == CMN_DT_NUM_COUNTERS)
+		goto done;
 
 	if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event)))
 		occupid = CMN_EVENT_OCCUPID(event) + 1;
@@ -907,25 +911,28 @@ static int arm_cmn_validate_group(struct perf_event *event)
 	for_each_hw_dn(hw, dn, i) {
 		int wp_idx, wp_cmb, dtm = dn->dtm;
 
-		if (val.dtm_count[dtm] == CMN_DTM_NUM_COUNTERS)
-			return -EINVAL;
+		if (val->dtm_count[dtm] == CMN_DTM_NUM_COUNTERS)
+			goto done;
 
-		if (occupid && val.occupid[dtm] && occupid != val.occupid[dtm])
-			return -EINVAL;
+		if (occupid && val->occupid[dtm] && occupid != val->occupid[dtm])
+			goto done;
 
 		if (type != CMN_TYPE_WP)
 			continue;
 
 		wp_idx = arm_cmn_wp_idx(event);
-		if (val.wp[dtm][wp_idx])
-			return -EINVAL;
+		if (val->wp[dtm][wp_idx])
+			goto done;
 
-		wp_cmb = val.wp[dtm][wp_idx ^ 1];
+		wp_cmb = val->wp[dtm][wp_idx ^ 1];
 		if (wp_cmb && wp_cmb != CMN_EVENT_WP_COMBINE(event) + 1)
-			return -EINVAL;
+			goto done;
 	}
 
-	return 0;
+	ret = 0;
+done:
+	kfree(val);
+	return ret;
 }
 
 static int arm_cmn_event_init(struct perf_event *event)

From 61ec1d875812046ff9d473183d53e19dcd6b2ada Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:44:59 +0000
Subject: [PATCH 38/81] perf/arm-cmn: Demarcate CMN-600 specifics

In preparation for supporting newer CMN products, let's introduce a
means to differentiate the features and events which are specific to a
particular IP from those which remain common to the whole family. The
newer designs have also smoothed off some of the rough edges in terms
of discoverability, so separate out the parts of the flow which have
effectively now become CMN-600 quirks.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/9f6368cdca4c821d801138939508a5bba54ccabb.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 301 +++++++++++++++++++++--------------------
 1 file changed, 156 insertions(+), 145 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index d2dd02f040b8..ce94f923a607 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -151,7 +151,12 @@
 #define CMN_WP_DOWN			2
 
 
-/* r0px probably don't exist in silicon, thankfully */
+enum cmn_model {
+	CMN_ANY = -1,
+	CMN600 = 1,
+};
+
+/* CMN-600 r0px shouldn't exist in silicon, thankfully */
 enum cmn_revision {
 	CMN600_R1P0,
 	CMN600_R1P1,
@@ -159,6 +164,7 @@ enum cmn_revision {
 	CMN600_R1P3,
 	CMN600_R2P0,
 	CMN600_R3P0,
+	CMN600_R3P1,
 };
 
 enum cmn_node_type {
@@ -229,6 +235,7 @@ struct arm_cmn {
 	void __iomem *base;
 
 	enum cmn_revision rev;
+	enum cmn_model model;
 	u8 mesh_x;
 	u8 mesh_y;
 	u16 num_xps;
@@ -326,6 +333,7 @@ static unsigned int arm_cmn_get_index(u64 x[], unsigned int pos)
 
 struct arm_cmn_event_attr {
 	struct device_attribute attr;
+	enum cmn_model model;
 	enum cmn_node_type type;
 	u8 eventid;
 	u8 occupid;
@@ -337,9 +345,10 @@ struct arm_cmn_format_attr {
 	int config;
 };
 
-#define CMN_EVENT_ATTR(_name, _type, _eventid, _occupid)		\
+#define CMN_EVENT_ATTR(_model, _name, _type, _eventid, _occupid)	\
 	(&((struct arm_cmn_event_attr[]) {{				\
 		.attr = __ATTR(_name, 0444, arm_cmn_event_show, NULL),	\
+		.model = _model,					\
 		.type = _type,						\
 		.eventid = _eventid,					\
 		.occupid = _occupid,					\
@@ -386,12 +395,15 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
 	eattr = container_of(attr, typeof(*eattr), attr.attr);
 	type = eattr->type;
 
+	if (!(eattr->model & cmn->model))
+		return 0;
+
 	/* Watchpoints aren't nodes */
 	if (type == CMN_TYPE_WP)
 		type = CMN_TYPE_XP;
 
 	/* Revision-specific differences */
-	if (cmn->rev < CMN600_R1P2) {
+	if (cmn->model == CMN600 && cmn->rev < CMN600_R1P2) {
 		if (type == CMN_TYPE_HNF && eattr->eventid == 0x1b)
 			return 0;
 	}
@@ -402,25 +414,27 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
 	return attr->mode;
 }
 
-#define _CMN_EVENT_DVM(_name, _event, _occup)			\
-	CMN_EVENT_ATTR(dn_##_name, CMN_TYPE_DVM, _event, _occup)
+#define _CMN_EVENT_DVM(_model, _name, _event, _occup)		\
+	CMN_EVENT_ATTR(_model, dn_##_name, CMN_TYPE_DVM, _event, _occup)
 #define CMN_EVENT_DTC(_name)					\
-	CMN_EVENT_ATTR(dtc_##_name, CMN_TYPE_DTC, 0, 0)
-#define _CMN_EVENT_HNF(_name, _event, _occup)			\
-	CMN_EVENT_ATTR(hnf_##_name, CMN_TYPE_HNF, _event, _occup)
+	CMN_EVENT_ATTR(CMN_ANY, dtc_##_name, CMN_TYPE_DTC, 0, 0)
+#define _CMN_EVENT_HNF(_model, _name, _event, _occup)		\
+	CMN_EVENT_ATTR(_model, hnf_##_name, CMN_TYPE_HNF, _event, _occup)
 #define CMN_EVENT_HNI(_name, _event)				\
-	CMN_EVENT_ATTR(hni_##_name, CMN_TYPE_HNI, _event, 0)
+	CMN_EVENT_ATTR(CMN_ANY, hni_##_name, CMN_TYPE_HNI, _event, 0)
 #define __CMN_EVENT_XP(_name, _event)				\
-	CMN_EVENT_ATTR(mxp_##_name, CMN_TYPE_XP, _event, 0)
-#define CMN_EVENT_SBSX(_name, _event)				\
-	CMN_EVENT_ATTR(sbsx_##_name, CMN_TYPE_SBSX, _event, 0)
-#define CMN_EVENT_RNID(_name, _event)				\
-	CMN_EVENT_ATTR(rnid_##_name, CMN_TYPE_RNI, _event, 0)
+	CMN_EVENT_ATTR(CMN_ANY, mxp_##_name, CMN_TYPE_XP, _event, 0)
+#define CMN_EVENT_SBSX(_model, _name, _event)			\
+	CMN_EVENT_ATTR(_model, sbsx_##_name, CMN_TYPE_SBSX, _event, 0)
+#define CMN_EVENT_RNID(_model, _name, _event)			\
+	CMN_EVENT_ATTR(_model, rnid_##_name, CMN_TYPE_RNI, _event, 0)
+#define CMN_EVENT_MTSX(_name, _event)				\
+	CMN_EVENT_ATTR(CMN_ANY, mtsx_##_name, CMN_TYPE_MTSX, _event, 0)
 
-#define CMN_EVENT_DVM(_name, _event)				\
-	_CMN_EVENT_DVM(_name, _event, 0)
-#define CMN_EVENT_HNF(_name, _event)				\
-	_CMN_EVENT_HNF(_name, _event, 0)
+#define CMN_EVENT_DVM(_model, _name, _event)			\
+	_CMN_EVENT_DVM(_model, _name, _event, 0)
+#define CMN_EVENT_HNF(_model, _name, _event)			\
+	_CMN_EVENT_HNF(_model, _name, _event, 0)
 #define _CMN_EVENT_XP(_name, _event)				\
 	__CMN_EVENT_XP(e_##_name, (_event) | (0 << 2)),		\
 	__CMN_EVENT_XP(w_##_name, (_event) | (1 << 2)),		\
@@ -445,115 +459,115 @@ static struct attribute *arm_cmn_event_attrs[] = {
 	 * slot, but our lazy short-cut of using the DTM counter index for
 	 * the PMU index as well happens to avoid that by construction.
 	 */
-	CMN_EVENT_DVM(rxreq_dvmop,	0x01),
-	CMN_EVENT_DVM(rxreq_dvmsync,	0x02),
-	CMN_EVENT_DVM(rxreq_dvmop_vmid_filtered, 0x03),
-	CMN_EVENT_DVM(rxreq_retried,	0x04),
-	_CMN_EVENT_DVM(rxreq_trk_occupancy_all, 0x05, 0),
-	_CMN_EVENT_DVM(rxreq_trk_occupancy_dvmop, 0x05, 1),
-	_CMN_EVENT_DVM(rxreq_trk_occupancy_dvmsync, 0x05, 2),
+	CMN_EVENT_DVM(CMN600, rxreq_dvmop,		0x01),
+	CMN_EVENT_DVM(CMN600, rxreq_dvmsync,		0x02),
+	CMN_EVENT_DVM(CMN600, rxreq_dvmop_vmid_filtered, 0x03),
+	CMN_EVENT_DVM(CMN600, rxreq_retried,		0x04),
+	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_all, 0x05, 0),
+	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmop, 0x05, 1),
+	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmsync, 0x05, 2),
 
-	CMN_EVENT_HNF(cache_miss,	0x01),
-	CMN_EVENT_HNF(slc_sf_cache_access, 0x02),
-	CMN_EVENT_HNF(cache_fill,	0x03),
-	CMN_EVENT_HNF(pocq_retry,	0x04),
-	CMN_EVENT_HNF(pocq_reqs_recvd,	0x05),
-	CMN_EVENT_HNF(sf_hit,		0x06),
-	CMN_EVENT_HNF(sf_evictions,	0x07),
-	CMN_EVENT_HNF(dir_snoops_sent,	0x08),
-	CMN_EVENT_HNF(brd_snoops_sent,	0x09),
-	CMN_EVENT_HNF(slc_eviction,	0x0a),
-	CMN_EVENT_HNF(slc_fill_invalid_way, 0x0b),
-	CMN_EVENT_HNF(mc_retries,	0x0c),
-	CMN_EVENT_HNF(mc_reqs,		0x0d),
-	CMN_EVENT_HNF(qos_hh_retry,	0x0e),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_all, 0x0f, 0),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_read, 0x0f, 1),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_write, 0x0f, 2),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_atomic, 0x0f, 3),
-	_CMN_EVENT_HNF(qos_pocq_occupancy_stash, 0x0f, 4),
-	CMN_EVENT_HNF(pocq_addrhaz,	0x10),
-	CMN_EVENT_HNF(pocq_atomic_addrhaz, 0x11),
-	CMN_EVENT_HNF(ld_st_swp_adq_full, 0x12),
-	CMN_EVENT_HNF(cmp_adq_full,	0x13),
-	CMN_EVENT_HNF(txdat_stall,	0x14),
-	CMN_EVENT_HNF(txrsp_stall,	0x15),
-	CMN_EVENT_HNF(seq_full,		0x16),
-	CMN_EVENT_HNF(seq_hit,		0x17),
-	CMN_EVENT_HNF(snp_sent,		0x18),
-	CMN_EVENT_HNF(sfbi_dir_snp_sent, 0x19),
-	CMN_EVENT_HNF(sfbi_brd_snp_sent, 0x1a),
-	CMN_EVENT_HNF(snp_sent_untrk,	0x1b),
-	CMN_EVENT_HNF(intv_dirty,	0x1c),
-	CMN_EVENT_HNF(stash_snp_sent,	0x1d),
-	CMN_EVENT_HNF(stash_data_pull,	0x1e),
-	CMN_EVENT_HNF(snp_fwded,	0x1f),
+	CMN_EVENT_HNF(CMN_ANY, cache_miss,		0x01),
+	CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access,	0x02),
+	CMN_EVENT_HNF(CMN_ANY, cache_fill,		0x03),
+	CMN_EVENT_HNF(CMN_ANY, pocq_retry,		0x04),
+	CMN_EVENT_HNF(CMN_ANY, pocq_reqs_recvd,		0x05),
+	CMN_EVENT_HNF(CMN_ANY, sf_hit,			0x06),
+	CMN_EVENT_HNF(CMN_ANY, sf_evictions,		0x07),
+	CMN_EVENT_HNF(CMN_ANY, dir_snoops_sent,		0x08),
+	CMN_EVENT_HNF(CMN_ANY, brd_snoops_sent,		0x09),
+	CMN_EVENT_HNF(CMN_ANY, slc_eviction,		0x0a),
+	CMN_EVENT_HNF(CMN_ANY, slc_fill_invalid_way,	0x0b),
+	CMN_EVENT_HNF(CMN_ANY, mc_retries,		0x0c),
+	CMN_EVENT_HNF(CMN_ANY, mc_reqs,			0x0d),
+	CMN_EVENT_HNF(CMN_ANY, qos_hh_retry,		0x0e),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_all,	0x0f, 0),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_read, 0x0f, 1),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_write, 0x0f, 2),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_atomic, 0x0f, 3),
+	_CMN_EVENT_HNF(CMN_ANY, qos_pocq_occupancy_stash, 0x0f, 4),
+	CMN_EVENT_HNF(CMN_ANY, pocq_addrhaz,		0x10),
+	CMN_EVENT_HNF(CMN_ANY, pocq_atomic_addrhaz,	0x11),
+	CMN_EVENT_HNF(CMN_ANY, ld_st_swp_adq_full,	0x12),
+	CMN_EVENT_HNF(CMN_ANY, cmp_adq_full,		0x13),
+	CMN_EVENT_HNF(CMN_ANY, txdat_stall,		0x14),
+	CMN_EVENT_HNF(CMN_ANY, txrsp_stall,		0x15),
+	CMN_EVENT_HNF(CMN_ANY, seq_full,		0x16),
+	CMN_EVENT_HNF(CMN_ANY, seq_hit,			0x17),
+	CMN_EVENT_HNF(CMN_ANY, snp_sent,		0x18),
+	CMN_EVENT_HNF(CMN_ANY, sfbi_dir_snp_sent,	0x19),
+	CMN_EVENT_HNF(CMN_ANY, sfbi_brd_snp_sent,	0x1a),
+	CMN_EVENT_HNF(CMN_ANY, snp_sent_untrk,		0x1b),
+	CMN_EVENT_HNF(CMN_ANY, intv_dirty,		0x1c),
+	CMN_EVENT_HNF(CMN_ANY, stash_snp_sent,		0x1d),
+	CMN_EVENT_HNF(CMN_ANY, stash_data_pull,		0x1e),
+	CMN_EVENT_HNF(CMN_ANY, snp_fwded,		0x1f),
 
-	CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl, 0x20),
-	CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl, 0x21),
-	CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl, 0x22),
-	CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl, 0x23),
-	CMN_EVENT_HNI(wdb_occ_cnt_ovfl,	0x24),
-	CMN_EVENT_HNI(rrt_rd_alloc,	0x25),
-	CMN_EVENT_HNI(rrt_wr_alloc,	0x26),
-	CMN_EVENT_HNI(rdt_rd_alloc,	0x27),
-	CMN_EVENT_HNI(rdt_wr_alloc,	0x28),
-	CMN_EVENT_HNI(wdb_alloc,	0x29),
-	CMN_EVENT_HNI(txrsp_retryack,	0x2a),
-	CMN_EVENT_HNI(arvalid_no_arready, 0x2b),
-	CMN_EVENT_HNI(arready_no_arvalid, 0x2c),
-	CMN_EVENT_HNI(awvalid_no_awready, 0x2d),
-	CMN_EVENT_HNI(awready_no_awvalid, 0x2e),
-	CMN_EVENT_HNI(wvalid_no_wready,	0x2f),
-	CMN_EVENT_HNI(txdat_stall,	0x30),
-	CMN_EVENT_HNI(nonpcie_serialization, 0x31),
-	CMN_EVENT_HNI(pcie_serialization, 0x32),
+	CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl,		0x20),
+	CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl,		0x21),
+	CMN_EVENT_HNI(rdt_rd_occ_cnt_ovfl,		0x22),
+	CMN_EVENT_HNI(rdt_wr_occ_cnt_ovfl,		0x23),
+	CMN_EVENT_HNI(wdb_occ_cnt_ovfl,			0x24),
+	CMN_EVENT_HNI(rrt_rd_alloc,			0x25),
+	CMN_EVENT_HNI(rrt_wr_alloc,			0x26),
+	CMN_EVENT_HNI(rdt_rd_alloc,			0x27),
+	CMN_EVENT_HNI(rdt_wr_alloc,			0x28),
+	CMN_EVENT_HNI(wdb_alloc,			0x29),
+	CMN_EVENT_HNI(txrsp_retryack,			0x2a),
+	CMN_EVENT_HNI(arvalid_no_arready,		0x2b),
+	CMN_EVENT_HNI(arready_no_arvalid,		0x2c),
+	CMN_EVENT_HNI(awvalid_no_awready,		0x2d),
+	CMN_EVENT_HNI(awready_no_awvalid,		0x2e),
+	CMN_EVENT_HNI(wvalid_no_wready,			0x2f),
+	CMN_EVENT_HNI(txdat_stall,			0x30),
+	CMN_EVENT_HNI(nonpcie_serialization,		0x31),
+	CMN_EVENT_HNI(pcie_serialization,		0x32),
 
-	CMN_EVENT_XP(txflit_valid,	0x01),
-	CMN_EVENT_XP(txflit_stall,	0x02),
-	CMN_EVENT_XP(partial_dat_flit,	0x03),
+	CMN_EVENT_XP(txflit_valid,			0x01),
+	CMN_EVENT_XP(txflit_stall,			0x02),
+	CMN_EVENT_XP(partial_dat_flit,			0x03),
 	/* We treat watchpoints as a special made-up class of XP events */
-	CMN_EVENT_ATTR(watchpoint_up, CMN_TYPE_WP, 0, 0),
-	CMN_EVENT_ATTR(watchpoint_down, CMN_TYPE_WP, 2, 0),
+	CMN_EVENT_ATTR(CMN_ANY, watchpoint_up, CMN_TYPE_WP, CMN_WP_UP, 0),
+	CMN_EVENT_ATTR(CMN_ANY, watchpoint_down, CMN_TYPE_WP, CMN_WP_DOWN, 0),
 
-	CMN_EVENT_SBSX(rd_req,		0x01),
-	CMN_EVENT_SBSX(wr_req,		0x02),
-	CMN_EVENT_SBSX(cmo_req,		0x03),
-	CMN_EVENT_SBSX(txrsp_retryack,	0x04),
-	CMN_EVENT_SBSX(txdat_flitv,	0x05),
-	CMN_EVENT_SBSX(txrsp_flitv,	0x06),
-	CMN_EVENT_SBSX(rd_req_trkr_occ_cnt_ovfl, 0x11),
-	CMN_EVENT_SBSX(wr_req_trkr_occ_cnt_ovfl, 0x12),
-	CMN_EVENT_SBSX(cmo_req_trkr_occ_cnt_ovfl, 0x13),
-	CMN_EVENT_SBSX(wdb_occ_cnt_ovfl, 0x14),
-	CMN_EVENT_SBSX(rd_axi_trkr_occ_cnt_ovfl, 0x15),
-	CMN_EVENT_SBSX(cmo_axi_trkr_occ_cnt_ovfl, 0x16),
-	CMN_EVENT_SBSX(arvalid_no_arready, 0x21),
-	CMN_EVENT_SBSX(awvalid_no_awready, 0x22),
-	CMN_EVENT_SBSX(wvalid_no_wready, 0x23),
-	CMN_EVENT_SBSX(txdat_stall,	0x24),
-	CMN_EVENT_SBSX(txrsp_stall,	0x25),
+	CMN_EVENT_SBSX(CMN_ANY, rd_req,			0x01),
+	CMN_EVENT_SBSX(CMN_ANY, wr_req,			0x02),
+	CMN_EVENT_SBSX(CMN_ANY, cmo_req,		0x03),
+	CMN_EVENT_SBSX(CMN_ANY, txrsp_retryack,		0x04),
+	CMN_EVENT_SBSX(CMN_ANY, txdat_flitv,		0x05),
+	CMN_EVENT_SBSX(CMN_ANY, txrsp_flitv,		0x06),
+	CMN_EVENT_SBSX(CMN_ANY, rd_req_trkr_occ_cnt_ovfl, 0x11),
+	CMN_EVENT_SBSX(CMN_ANY, wr_req_trkr_occ_cnt_ovfl, 0x12),
+	CMN_EVENT_SBSX(CMN_ANY, cmo_req_trkr_occ_cnt_ovfl, 0x13),
+	CMN_EVENT_SBSX(CMN_ANY, wdb_occ_cnt_ovfl,	0x14),
+	CMN_EVENT_SBSX(CMN_ANY, rd_axi_trkr_occ_cnt_ovfl, 0x15),
+	CMN_EVENT_SBSX(CMN_ANY, cmo_axi_trkr_occ_cnt_ovfl, 0x16),
+	CMN_EVENT_SBSX(CMN_ANY, arvalid_no_arready,	0x21),
+	CMN_EVENT_SBSX(CMN_ANY, awvalid_no_awready,	0x22),
+	CMN_EVENT_SBSX(CMN_ANY, wvalid_no_wready,	0x23),
+	CMN_EVENT_SBSX(CMN_ANY, txdat_stall,		0x24),
+	CMN_EVENT_SBSX(CMN_ANY, txrsp_stall,		0x25),
 
-	CMN_EVENT_RNID(s0_rdata_beats,	0x01),
-	CMN_EVENT_RNID(s1_rdata_beats,	0x02),
-	CMN_EVENT_RNID(s2_rdata_beats,	0x03),
-	CMN_EVENT_RNID(rxdat_flits,	0x04),
-	CMN_EVENT_RNID(txdat_flits,	0x05),
-	CMN_EVENT_RNID(txreq_flits_total, 0x06),
-	CMN_EVENT_RNID(txreq_flits_retried, 0x07),
-	CMN_EVENT_RNID(rrt_occ_ovfl,	0x08),
-	CMN_EVENT_RNID(wrt_occ_ovfl,	0x09),
-	CMN_EVENT_RNID(txreq_flits_replayed, 0x0a),
-	CMN_EVENT_RNID(wrcancel_sent,	0x0b),
-	CMN_EVENT_RNID(s0_wdata_beats,	0x0c),
-	CMN_EVENT_RNID(s1_wdata_beats,	0x0d),
-	CMN_EVENT_RNID(s2_wdata_beats,	0x0e),
-	CMN_EVENT_RNID(rrt_alloc,	0x0f),
-	CMN_EVENT_RNID(wrt_alloc,	0x10),
-	CMN_EVENT_RNID(rdb_unord,	0x11),
-	CMN_EVENT_RNID(rdb_replay,	0x12),
-	CMN_EVENT_RNID(rdb_hybrid,	0x13),
-	CMN_EVENT_RNID(rdb_ord,		0x14),
+	CMN_EVENT_RNID(CMN_ANY, s0_rdata_beats,		0x01),
+	CMN_EVENT_RNID(CMN_ANY, s1_rdata_beats,		0x02),
+	CMN_EVENT_RNID(CMN_ANY, s2_rdata_beats,		0x03),
+	CMN_EVENT_RNID(CMN_ANY, rxdat_flits,		0x04),
+	CMN_EVENT_RNID(CMN_ANY, txdat_flits,		0x05),
+	CMN_EVENT_RNID(CMN_ANY, txreq_flits_total,	0x06),
+	CMN_EVENT_RNID(CMN_ANY, txreq_flits_retried,	0x07),
+	CMN_EVENT_RNID(CMN_ANY, rrt_occ_ovfl,		0x08),
+	CMN_EVENT_RNID(CMN_ANY, wrt_occ_ovfl,		0x09),
+	CMN_EVENT_RNID(CMN_ANY, txreq_flits_replayed,	0x0a),
+	CMN_EVENT_RNID(CMN_ANY, wrcancel_sent,		0x0b),
+	CMN_EVENT_RNID(CMN_ANY, s0_wdata_beats,		0x0c),
+	CMN_EVENT_RNID(CMN_ANY, s1_wdata_beats,		0x0d),
+	CMN_EVENT_RNID(CMN_ANY, s2_wdata_beats,		0x0e),
+	CMN_EVENT_RNID(CMN_ANY, rrt_alloc,		0x0f),
+	CMN_EVENT_RNID(CMN_ANY, wrt_alloc,		0x10),
+	CMN_EVENT_RNID(CMN600, rdb_unord,		0x11),
+	CMN_EVENT_RNID(CMN600, rdb_replay,		0x12),
+	CMN_EVENT_RNID(CMN600, rdb_hybrid,		0x13),
+	CMN_EVENT_RNID(CMN600, rdb_ord,			0x14),
 
 	NULL
 };
@@ -1386,15 +1400,14 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	int i, j;
 	size_t sz;
 
-	cfg_region = cmn->base + rgn_offset;
-	reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2);
-	cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg);
-	dev_dbg(cmn->dev, "periph_id_2 revision: %d\n", cmn->rev);
-
 	arm_cmn_init_node_info(cmn, rgn_offset, &cfg);
 	if (cfg.type != CMN_TYPE_CFG)
 		return -ENODEV;
 
+	cfg_region = cmn->base + rgn_offset;
+	reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2);
+	cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg);
+
 	reg = readq_relaxed(cfg_region + CMN_CHILD_INFO);
 	child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg);
 	child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg);
@@ -1507,13 +1520,14 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		cmn->mesh_x = cmn->num_xps;
 	cmn->mesh_y = cmn->num_xps / cmn->mesh_x;
 
+	dev_dbg(cmn->dev, "model %d, periph_id_2 revision %d\n", cmn->model, cmn->rev);
 	dev_dbg(cmn->dev, "mesh %dx%d, ID width %d\n",
 		cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn));
 
 	return 0;
 }
 
-static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn)
+static int arm_cmn600_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn)
 {
 	struct resource *cfg, *root;
 
@@ -1540,21 +1554,11 @@ static int arm_cmn_acpi_probe(struct platform_device *pdev, struct arm_cmn *cmn)
 	return root->start - cfg->start;
 }
 
-static int arm_cmn_of_probe(struct platform_device *pdev, struct arm_cmn *cmn)
+static int arm_cmn600_of_probe(struct device_node *np)
 {
-	struct device_node *np = pdev->dev.of_node;
 	u32 rootnode;
-	int ret;
 
-	cmn->base = devm_platform_ioremap_resource(pdev, 0);
-	if (IS_ERR(cmn->base))
-		return PTR_ERR(cmn->base);
-
-	ret = of_property_read_u32(np, "arm,root-node", &rootnode);
-	if (ret)
-		return ret;
-
-	return rootnode;
+	return of_property_read_u32(np, "arm,root-node", &rootnode) ?: rootnode;
 }
 
 static int arm_cmn_probe(struct platform_device *pdev)
@@ -1569,12 +1573,19 @@ static int arm_cmn_probe(struct platform_device *pdev)
 		return -ENOMEM;
 
 	cmn->dev = &pdev->dev;
+	cmn->model = (unsigned long)device_get_match_data(cmn->dev);
 	platform_set_drvdata(pdev, cmn);
 
-	if (has_acpi_companion(cmn->dev))
-		rootnode = arm_cmn_acpi_probe(pdev, cmn);
-	else
-		rootnode = arm_cmn_of_probe(pdev, cmn);
+	if (cmn->model == CMN600 && has_acpi_companion(cmn->dev)) {
+		rootnode = arm_cmn600_acpi_probe(pdev, cmn);
+	} else {
+		rootnode = 0;
+		cmn->base = devm_platform_ioremap_resource(pdev, 0);
+		if (IS_ERR(cmn->base))
+			return PTR_ERR(cmn->base);
+		if (cmn->model == CMN600)
+			rootnode = arm_cmn600_of_probe(pdev->dev.of_node);
+	}
 	if (rootnode < 0)
 		return rootnode;
 
@@ -1637,7 +1648,7 @@ static int arm_cmn_remove(struct platform_device *pdev)
 
 #ifdef CONFIG_OF
 static const struct of_device_id arm_cmn_of_match[] = {
-	{ .compatible = "arm,cmn-600", },
+	{ .compatible = "arm,cmn-600", .data = (void *)CMN600 },
 	{}
 };
 MODULE_DEVICE_TABLE(of, arm_cmn_of_match);
@@ -1645,7 +1656,7 @@ MODULE_DEVICE_TABLE(of, arm_cmn_of_match);
 
 #ifdef CONFIG_ACPI
 static const struct acpi_device_id arm_cmn_acpi_match[] = {
-	{ "ARMHC600", },
+	{ "ARMHC600", CMN600 },
 	{}
 };
 MODULE_DEVICE_TABLE(acpi, arm_cmn_acpi_match);

From 60d1504070c22c059a1e11bc3fd444953da988c1 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:45:00 +0000
Subject: [PATCH 39/81] perf/arm-cmn: Support new IP features

The second generation of CMN IPs add new node types and significantly
expand the configuration space with options for extra device ports on
edge XPs, either plumbed into the regular DTM or with extra dedicated
DTMs to monitor them, plus larger (and smaller) mesh sizes. Add basic
support for pulling this new information out of the hardware, piping
it around as necessary, and handling (most of) the new choices.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/e58b495bcc7deec3882be4bac910ed0bf6979674.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 222 ++++++++++++++++++++++++++++++++---------
 1 file changed, 173 insertions(+), 49 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index ce94f923a607..0a3f33a83c01 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -24,7 +24,10 @@
 #define CMN_NI_LOGICAL_ID		GENMASK_ULL(47, 32)
 
 #define CMN_NODEID_DEVID(reg)		((reg) & 3)
+#define CMN_NODEID_EXT_DEVID(reg)	((reg) & 1)
 #define CMN_NODEID_PID(reg)		(((reg) >> 2) & 1)
+#define CMN_NODEID_EXT_PID(reg)		(((reg) >> 1) & 3)
+#define CMN_NODEID_1x1_PID(reg)		(((reg) >> 2) & 7)
 #define CMN_NODEID_X(reg, bits)		((reg) >> (3 + (bits)))
 #define CMN_NODEID_Y(reg, bits)		(((reg) >> 3) & ((1U << (bits)) - 1))
 
@@ -37,13 +40,26 @@
 
 #define CMN_MAX_DIMENSION		8
 #define CMN_MAX_XPS			(CMN_MAX_DIMENSION * CMN_MAX_DIMENSION)
-#define CMN_MAX_DTMS			CMN_MAX_XPS
+#define CMN_MAX_DTMS			(CMN_MAX_XPS + (CMN_MAX_DIMENSION - 1) * 4)
 
-/* The CFG node has one other useful purpose */
+/* The CFG node has various info besides the discovery tree */
 #define CMN_CFGM_PERIPH_ID_2		0x0010
 #define CMN_CFGM_PID2_REVISION		GENMASK(7, 4)
 
-/* PMU registers occupy the 3rd 4KB page of each node's 16KB space */
+#define CMN_CFGM_INFO_GLOBAL		0x900
+#define CMN_INFO_MULTIPLE_DTM_EN	BIT_ULL(63)
+#define CMN_INFO_RSP_VC_NUM		GENMASK_ULL(53, 52)
+#define CMN_INFO_DAT_VC_NUM		GENMASK_ULL(51, 50)
+
+/* XPs also have some local topology info which has uses too */
+#define CMN_MXP__CONNECT_INFO_P0	0x0008
+#define CMN_MXP__CONNECT_INFO_P1	0x0010
+#define CMN_MXP__CONNECT_INFO_P2	0x0028
+#define CMN_MXP__CONNECT_INFO_P3	0x0030
+#define CMN_MXP__CONNECT_INFO_P4	0x0038
+#define CMN_MXP__CONNECT_INFO_P5	0x0040
+
+/* PMU registers occupy the 3rd 4KB page of each node's region */
 #define CMN_PMU_OFFSET			0x2000
 
 /* For most nodes, this is all there is */
@@ -53,6 +69,7 @@
 /* DTMs live in the PMU space of XP registers */
 #define CMN_DTM_WPn(n)			(0x1A0 + (n) * 0x18)
 #define CMN_DTM_WPn_CONFIG(n)		(CMN_DTM_WPn(n) + 0x00)
+#define CMN_DTM_WPn_CONFIG_WP_DEV_SEL2	GENMASK_ULL(18,17)
 #define CMN_DTM_WPn_CONFIG_WP_COMBINE	BIT(6)
 #define CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE	BIT(5)
 #define CMN_DTM_WPn_CONFIG_WP_GRP	BIT(4)
@@ -77,7 +94,11 @@
 
 #define CMN_DTM_PMEVCNTSR		0x240
 
+#define CMN_DTM_UNIT_INFO		0x0910
+
 #define CMN_DTM_NUM_COUNTERS		4
+/* Want more local counters? Why not replicate the whole DTM! Ugh... */
+#define CMN_DTM_OFFSET(n)		((n) * 0x200)
 
 /* The DTC node is where the magic happens */
 #define CMN_DT_DTC_CTL			0x0a00
@@ -131,10 +152,10 @@
 #define CMN_EVENT_NODEID(event)		FIELD_GET(CMN_CONFIG_NODEID, (event)->attr.config)
 
 #define CMN_CONFIG_WP_COMBINE		GENMASK_ULL(27, 24)
-#define CMN_CONFIG_WP_DEV_SEL		BIT_ULL(48)
-#define CMN_CONFIG_WP_CHN_SEL		GENMASK_ULL(50, 49)
-#define CMN_CONFIG_WP_GRP		BIT_ULL(52)
-#define CMN_CONFIG_WP_EXCLUSIVE		BIT_ULL(53)
+#define CMN_CONFIG_WP_DEV_SEL		GENMASK_ULL(50, 48)
+#define CMN_CONFIG_WP_CHN_SEL		GENMASK_ULL(55, 51)
+#define CMN_CONFIG_WP_GRP		BIT_ULL(56)
+#define CMN_CONFIG_WP_EXCLUSIVE		BIT_ULL(57)
 #define CMN_CONFIG1_WP_VAL		GENMASK_ULL(63, 0)
 #define CMN_CONFIG2_WP_MASK		GENMASK_ULL(63, 0)
 
@@ -176,9 +197,12 @@ enum cmn_node_type {
 	CMN_TYPE_HNF,
 	CMN_TYPE_XP,
 	CMN_TYPE_SBSX,
-	CMN_TYPE_RNI = 0xa,
+	CMN_TYPE_MPAM_S,
+	CMN_TYPE_MPAM_NS,
+	CMN_TYPE_RNI,
 	CMN_TYPE_RND = 0xd,
 	CMN_TYPE_RNSAM = 0xf,
+	CMN_TYPE_MTSX,
 	CMN_TYPE_CXRA = 0x100,
 	CMN_TYPE_CXHA = 0x101,
 	CMN_TYPE_CXLA = 0x102,
@@ -233,6 +257,7 @@ struct arm_cmn_dtc {
 struct arm_cmn {
 	struct device *dev;
 	void __iomem *base;
+	unsigned int state;
 
 	enum cmn_revision rev;
 	enum cmn_model model;
@@ -240,6 +265,13 @@ struct arm_cmn {
 	u8 mesh_y;
 	u16 num_xps;
 	u16 num_dns;
+	bool multi_dtm;
+	u8 ports_used;
+	struct {
+		unsigned int rsp_vc_num : 2;
+		unsigned int dat_vc_num : 2;
+	};
+
 	struct arm_cmn_node *xps;
 	struct arm_cmn_node *dns;
 
@@ -250,7 +282,6 @@ struct arm_cmn {
 	int cpu;
 	struct hlist_node cpuhp_node;
 
-	unsigned int state;
 	struct pmu pmu;
 };
 
@@ -275,13 +306,25 @@ static int arm_cmn_xyidbits(const struct arm_cmn *cmn)
 static struct arm_cmn_nodeid arm_cmn_nid(const struct arm_cmn *cmn, u16 id)
 {
 	struct arm_cmn_nodeid nid;
-	int bits = arm_cmn_xyidbits(cmn);
 
-	nid.x = CMN_NODEID_X(id, bits);
-	nid.y = CMN_NODEID_Y(id, bits);
-	nid.port = CMN_NODEID_PID(id);
-	nid.dev = CMN_NODEID_DEVID(id);
+	if (cmn->num_xps == 1) {
+		nid.x = 0;
+		nid.y = 0;
+		nid.port = CMN_NODEID_1x1_PID(id);
+		nid.dev = CMN_NODEID_DEVID(id);
+	} else {
+		int bits = arm_cmn_xyidbits(cmn);
 
+		nid.x = CMN_NODEID_X(id, bits);
+		nid.y = CMN_NODEID_Y(id, bits);
+		if (cmn->ports_used & 0xc) {
+			nid.port = CMN_NODEID_EXT_PID(id);
+			nid.dev = CMN_NODEID_EXT_DEVID(id);
+		} else {
+			nid.port = CMN_NODEID_PID(id);
+			nid.dev = CMN_NODEID_DEVID(id);
+		}
+	}
 	return nid;
 }
 
@@ -310,6 +353,7 @@ struct arm_cmn_hw_event {
 	unsigned int dtc_idx;
 	u8 dtcs_used;
 	u8 num_dns;
+	u8 dtm_offset;
 };
 
 #define for_each_hw_dn(hw, dn, i) \
@@ -354,7 +398,8 @@ struct arm_cmn_format_attr {
 		.occupid = _occupid,					\
 	}})[0].attr.attr)
 
-static bool arm_cmn_is_occup_event(enum cmn_node_type type, unsigned int id)
+static bool arm_cmn_is_occup_event(enum cmn_model model,
+				   enum cmn_node_type type, unsigned int id)
 {
 	return (type == CMN_TYPE_DVM && id == 0x05) ||
 	       (type == CMN_TYPE_HNF && id == 0x0f);
@@ -375,7 +420,7 @@ static ssize_t arm_cmn_event_show(struct device *dev,
 				  "type=0x%x,eventid=0x%x,wp_dev_sel=?,wp_chn_sel=?,wp_grp=?,wp_val=?,wp_mask=?\n",
 				  eattr->type, eattr->eventid);
 
-	if (arm_cmn_is_occup_event(eattr->type, eattr->eventid))
+	if (arm_cmn_is_occup_event(eattr->model, eattr->type, eattr->eventid))
 		return sysfs_emit(buf, "type=0x%x,eventid=0x%x,occupid=0x%x\n",
 				  eattr->type, eattr->eventid, eattr->occupid);
 
@@ -390,25 +435,36 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
 	struct device *dev = kobj_to_dev(kobj);
 	struct arm_cmn *cmn = to_cmn(dev_get_drvdata(dev));
 	struct arm_cmn_event_attr *eattr;
-	enum cmn_node_type type;
 
 	eattr = container_of(attr, typeof(*eattr), attr.attr);
-	type = eattr->type;
 
 	if (!(eattr->model & cmn->model))
 		return 0;
 
-	/* Watchpoints aren't nodes */
-	if (type == CMN_TYPE_WP)
-		type = CMN_TYPE_XP;
+	/* Watchpoints aren't nodes, so avoid confusion */
+	if (eattr->type == CMN_TYPE_WP)
+		return attr->mode;
 
-	/* Revision-specific differences */
-	if (cmn->model == CMN600 && cmn->rev < CMN600_R1P2) {
-		if (type == CMN_TYPE_HNF && eattr->eventid == 0x1b)
+	/* Hide XP events for unused interfaces/channels */
+	if (eattr->type == CMN_TYPE_XP) {
+		unsigned int intf = (eattr->eventid >> 2) & 7;
+		unsigned int chan = eattr->eventid >> 5;
+
+		if ((intf & 4) && !(cmn->ports_used & BIT(intf & 3)))
+			return 0;
+
+		if ((chan == 5 && cmn->rsp_vc_num < 2) ||
+		    (chan == 6 && cmn->dat_vc_num < 2))
 			return 0;
 	}
 
-	if (!arm_cmn_node(cmn, type))
+	/* Revision-specific differences */
+	if (cmn->model == CMN600 && cmn->rev < CMN600_R1P2) {
+		if (eattr->type == CMN_TYPE_HNF && eattr->eventid == 0x1b)
+			return 0;
+	}
+
+	if (!arm_cmn_node(cmn, eattr->type))
 		return 0;
 
 	return attr->mode;
@@ -669,7 +725,8 @@ static u32 arm_cmn_wp_config(struct perf_event *event)
 	config = FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL, dev) |
 		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_CHN_SEL, chn) |
 		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_GRP, grp) |
-		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc);
+		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_EXCLUSIVE, exc) |
+		 FIELD_PREP(CMN_DTM_WPn_CONFIG_WP_DEV_SEL2, dev >> 1);
 	if (combine && !grp)
 		config |= CMN_DTM_WPn_CONFIG_WP_COMBINE;
 
@@ -712,7 +769,7 @@ static u64 arm_cmn_read_dtm(struct arm_cmn *cmn, struct arm_cmn_hw_event *hw,
 	offset = snapshot ? CMN_DTM_PMEVCNTSR : CMN_DTM_PMEVCNT;
 	for_each_hw_dn(hw, dn, i) {
 		if (dtm != &cmn->dtms[dn->dtm]) {
-			dtm = &cmn->dtms[dn->dtm];
+			dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
 			reg = readq_relaxed(dtm->base + offset);
 		}
 		dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
@@ -800,8 +857,10 @@ static void arm_cmn_event_start(struct perf_event *event, int flags)
 		u64 mask = CMN_EVENT_WP_MASK(event);
 
 		for_each_hw_dn(hw, dn, i) {
-			writeq_relaxed(val, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx));
-			writeq_relaxed(mask, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx));
+			void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
+
+			writeq_relaxed(val, base + CMN_DTM_WPn_VAL(wp_idx));
+			writeq_relaxed(mask, base + CMN_DTM_WPn_MASK(wp_idx));
 		}
 	} else for_each_hw_dn(hw, dn, i) {
 		int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
@@ -826,8 +885,10 @@ static void arm_cmn_event_stop(struct perf_event *event, int flags)
 		int wp_idx = arm_cmn_wp_idx(event);
 
 		for_each_hw_dn(hw, dn, i) {
-			writeq_relaxed(0, dn->pmu_base + CMN_DTM_WPn_MASK(wp_idx));
-			writeq_relaxed(~0ULL, dn->pmu_base + CMN_DTM_WPn_VAL(wp_idx));
+			void __iomem *base = dn->pmu_base + CMN_DTM_OFFSET(hw->dtm_offset);
+
+			writeq_relaxed(0, base + CMN_DTM_WPn_MASK(wp_idx));
+			writeq_relaxed(~0ULL, base + CMN_DTM_WPn_VAL(wp_idx));
 		}
 	} else for_each_hw_dn(hw, dn, i) {
 		int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
@@ -847,7 +908,8 @@ struct arm_cmn_val {
 	bool cycles;
 };
 
-static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *event)
+static void arm_cmn_val_add_event(struct arm_cmn *cmn, struct arm_cmn_val *val,
+				  struct perf_event *event)
 {
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
 	struct arm_cmn_node *dn;
@@ -865,7 +927,7 @@ static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev
 	}
 
 	val->dtc_count++;
-	if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event)))
+	if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event)))
 		occupid = CMN_EVENT_OCCUPID(event) + 1;
 	else
 		occupid = 0;
@@ -884,7 +946,7 @@ static void arm_cmn_val_add_event(struct arm_cmn_val *val, struct perf_event *ev
 	}
 }
 
-static int arm_cmn_validate_group(struct perf_event *event)
+static int arm_cmn_validate_group(struct arm_cmn *cmn, struct perf_event *event)
 {
 	struct arm_cmn_hw_event *hw = to_cmn_hw(event);
 	struct arm_cmn_node *dn;
@@ -904,9 +966,9 @@ static int arm_cmn_validate_group(struct perf_event *event)
 	if (!val)
 		return -ENOMEM;
 
-	arm_cmn_val_add_event(val, leader);
+	arm_cmn_val_add_event(cmn, val, leader);
 	for_each_sibling_event(sibling, leader)
-		arm_cmn_val_add_event(val, sibling);
+		arm_cmn_val_add_event(cmn, val, sibling);
 
 	type = CMN_EVENT_TYPE(event);
 	if (type == CMN_TYPE_DTC) {
@@ -917,7 +979,7 @@ static int arm_cmn_validate_group(struct perf_event *event)
 	if (val->dtc_count == CMN_DT_NUM_COUNTERS)
 		goto done;
 
-	if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event)))
+	if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event)))
 		occupid = CMN_EVENT_OCCUPID(event) + 1;
 	else
 		occupid = 0;
@@ -980,6 +1042,9 @@ static int arm_cmn_event_init(struct perf_event *event)
 		eventid = CMN_EVENT_EVENTID(event);
 		if (eventid != CMN_WP_UP && eventid != CMN_WP_DOWN)
 			return -EINVAL;
+		/* ...but the DTM may depend on which port we're watching */
+		if (cmn->multi_dtm)
+			hw->dtm_offset = CMN_EVENT_WP_DEV_SEL(event) / 2;
 	}
 
 	bynodeid = CMN_EVENT_BYNODEID(event);
@@ -1007,7 +1072,7 @@ static int arm_cmn_event_init(struct perf_event *event)
 		return -EINVAL;
 	}
 
-	return arm_cmn_validate_group(event);
+	return arm_cmn_validate_group(cmn, event);
 }
 
 static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
@@ -1017,13 +1082,13 @@ static void arm_cmn_event_clear(struct arm_cmn *cmn, struct perf_event *event,
 	enum cmn_node_type type = CMN_EVENT_TYPE(event);
 
 	while (i--) {
-		struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm];
+		struct arm_cmn_dtm *dtm = &cmn->dtms[hw->dn[i].dtm] + hw->dtm_offset;
 		unsigned int dtm_idx = arm_cmn_get_index(hw->dtm_idx, i);
 
 		if (type == CMN_TYPE_WP)
 			dtm->wp_event[arm_cmn_wp_idx(event)] = -1;
 
-		if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event)))
+		if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event)))
 			hw->dn[i].occupid_count--;
 
 		dtm->pmu_config_low &= ~CMN__PMEVCNT_PAIRED(dtm_idx);
@@ -1069,7 +1134,7 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 
 	/* ...then the local counters to feed it. */
 	for_each_hw_dn(hw, dn, i) {
-		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm];
+		struct arm_cmn_dtm *dtm = &cmn->dtms[dn->dtm] + hw->dtm_offset;
 		unsigned int dtm_idx, shift;
 		u64 reg;
 
@@ -1098,10 +1163,13 @@ static int arm_cmn_event_add(struct perf_event *event, int flags)
 		} else {
 			struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
 
+			if (cmn->multi_dtm)
+				nid.port %= 2;
+
 			input_sel = CMN__PMEVCNT0_INPUT_SEL_DEV + dtm_idx +
 				    (nid.port << 4) + (nid.dev << 2);
 
-			if (arm_cmn_is_occup_event(type, CMN_EVENT_EVENTID(event))) {
+			if (arm_cmn_is_occup_event(cmn->model, type, CMN_EVENT_EVENTID(event))) {
 				u8 occupid = CMN_EVENT_OCCUPID(event);
 
 				if (dn->occupid_count == 0) {
@@ -1283,11 +1351,11 @@ static int arm_cmn_init_irqs(struct arm_cmn *cmn)
 	return 0;
 }
 
-static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp)
+static void arm_cmn_init_dtm(struct arm_cmn_dtm *dtm, struct arm_cmn_node *xp, int idx)
 {
 	int i;
 
-	dtm->base = xp->pmu_base;
+	dtm->base = xp->pmu_base + CMN_DTM_OFFSET(idx);
 	dtm->pmu_config_low = CMN_DTM_PMU_CONFIG_PMU_EN;
 	for (i = 0; i < 4; i++) {
 		dtm->wp_event[i] = -1;
@@ -1345,6 +1413,8 @@ static int arm_cmn_init_dtcs(struct arm_cmn *cmn)
 
 		xp = arm_cmn_node_to_xp(cmn, dn);
 		dn->dtm = xp->dtm;
+		if (cmn->multi_dtm)
+			dn->dtm += arm_cmn_nid(cmn, dn->id).port / 2;
 
 		if (dn->type == CMN_TYPE_DTC) {
 			int err;
@@ -1408,6 +1478,11 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	reg = readl_relaxed(cfg_region + CMN_CFGM_PERIPH_ID_2);
 	cmn->rev = FIELD_GET(CMN_CFGM_PID2_REVISION, reg);
 
+	reg = readq_relaxed(cfg_region + CMN_CFGM_INFO_GLOBAL);
+	cmn->multi_dtm = reg & CMN_INFO_MULTIPLE_DTM_EN;
+	cmn->rsp_vc_num = FIELD_GET(CMN_INFO_RSP_VC_NUM, reg);
+	cmn->dat_vc_num = FIELD_GET(CMN_INFO_DAT_VC_NUM, reg);
+
 	reg = readq_relaxed(cfg_region + CMN_CHILD_INFO);
 	child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg);
 	child_poff = FIELD_GET(CMN_CI_CHILD_PTR_OFFSET, reg);
@@ -1429,7 +1504,11 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	if (!dn)
 		return -ENOMEM;
 
-	dtm = devm_kcalloc(cmn->dev, cmn->num_xps, sizeof(*dtm), GFP_KERNEL);
+	/* Initial safe upper bound on DTMs for any possible mesh layout */
+	i = cmn->num_xps;
+	if (cmn->multi_dtm)
+		i += cmn->num_xps + 1;
+	dtm = devm_kcalloc(cmn->dev, i, sizeof(*dtm), GFP_KERNEL);
 	if (!dtm)
 		return -ENOMEM;
 
@@ -1439,6 +1518,7 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	for (i = 0; i < cmn->num_xps; i++) {
 		void __iomem *xp_region = cmn->base + xp_offset[i];
 		struct arm_cmn_node *xp = dn++;
+		unsigned int xp_ports = 0;
 
 		arm_cmn_init_node_info(cmn, xp_offset[i], xp);
 		/*
@@ -1450,9 +1530,39 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		if (xp->id == (1 << 3))
 			cmn->mesh_x = xp->logid;
 
-		xp->dtc = 0xf;
+		if (cmn->model == CMN600)
+			xp->dtc = 0xf;
+		else
+			xp->dtc = 1 << readl_relaxed(xp_region + CMN_DTM_UNIT_INFO);
+
 		xp->dtm = dtm - cmn->dtms;
-		arm_cmn_init_dtm(dtm++, xp);
+		arm_cmn_init_dtm(dtm++, xp, 0);
+		/*
+		 * Keeping track of connected ports will let us filter out
+		 * unnecessary XP events easily. We can also reliably infer the
+		 * "extra device ports" configuration for the node ID format
+		 * from this, since in that case we will see at least one XP
+		 * with port 2 connected, for the HN-D.
+		 */
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P0))
+			xp_ports |= BIT(0);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P1))
+			xp_ports |= BIT(1);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P2))
+			xp_ports |= BIT(2);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P3))
+			xp_ports |= BIT(3);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P4))
+			xp_ports |= BIT(4);
+		if (readq_relaxed(xp_region + CMN_MXP__CONNECT_INFO_P5))
+			xp_ports |= BIT(5);
+
+		if (cmn->multi_dtm && (xp_ports & 0xc))
+			arm_cmn_init_dtm(dtm++, xp, 1);
+		if (cmn->multi_dtm && (xp_ports & 0x30))
+			arm_cmn_init_dtm(dtm++, xp, 2);
+
+		cmn->ports_used |= xp_ports;
 
 		reg = readq_relaxed(xp_region + CMN_CHILD_INFO);
 		child_count = FIELD_GET(CMN_CI_CHILD_COUNT, reg);
@@ -1488,11 +1598,14 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 			case CMN_TYPE_SBSX:
 			case CMN_TYPE_RNI:
 			case CMN_TYPE_RND:
+			case CMN_TYPE_MTSX:
 			case CMN_TYPE_CXRA:
 			case CMN_TYPE_CXHA:
 				dn++;
 				break;
 			/* Nothing to see here */
+			case CMN_TYPE_MPAM_S:
+			case CMN_TYPE_MPAM_NS:
 			case CMN_TYPE_RNSAM:
 			case CMN_TYPE_CXLA:
 				break;
@@ -1512,6 +1625,11 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 	if (dn)
 		cmn->dns = dn;
 
+	sz = (void *)dtm - (void *)cmn->dtms;
+	dtm = devm_krealloc(cmn->dev, cmn->dtms, sz, GFP_KERNEL);
+	if (dtm)
+		cmn->dtms = dtm;
+
 	/*
 	 * If mesh_x wasn't set during discovery then we never saw
 	 * an XP at (0,1), thus we must have an Nx1 configuration.
@@ -1520,9 +1638,15 @@ static int arm_cmn_discover(struct arm_cmn *cmn, unsigned int rgn_offset)
 		cmn->mesh_x = cmn->num_xps;
 	cmn->mesh_y = cmn->num_xps / cmn->mesh_x;
 
+	/* 1x1 config plays havoc with XP event encodings */
+	if (cmn->num_xps == 1)
+		dev_warn(cmn->dev, "1x1 config not fully supported, translate XP events manually\n");
+
 	dev_dbg(cmn->dev, "model %d, periph_id_2 revision %d\n", cmn->model, cmn->rev);
-	dev_dbg(cmn->dev, "mesh %dx%d, ID width %d\n",
-		cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn));
+	reg = cmn->ports_used;
+	dev_dbg(cmn->dev, "mesh %dx%d, ID width %d, ports %6pbl%s\n",
+		cmn->mesh_x, cmn->mesh_y, arm_cmn_xyidbits(cmn), &reg,
+		cmn->multi_dtm ? ", multi-DTM" : "");
 
 	return 0;
 }

From e310644724e195b79cf8114ca5297e4cdb36c955 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:45:01 +0000
Subject: [PATCH 40/81] dt-bindings: perf: arm-cmn: Add CI-700

CI-700 is a new client-level coherent interconnect derived from
the enterprise-level CMN family, and shares the same PMU design.

CC: devicetree@vger.kernel.org
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/5f0b372f808f1468e6d9500cedafbecd10254674.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 .../devicetree/bindings/perf/arm,cmn.yaml     | 21 ++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)

diff --git a/Documentation/devicetree/bindings/perf/arm,cmn.yaml b/Documentation/devicetree/bindings/perf/arm,cmn.yaml
index 42424ccbdd0c..2d4219ec7eda 100644
--- a/Documentation/devicetree/bindings/perf/arm,cmn.yaml
+++ b/Documentation/devicetree/bindings/perf/arm,cmn.yaml
@@ -12,12 +12,14 @@ maintainers:
 
 properties:
   compatible:
-    const: arm,cmn-600
+    enum:
+      - arm,cmn-600
+      - arm,ci-700
 
   reg:
     items:
       - description: Physical address of the base (PERIPHBASE) and
-          size (up to 64MB) of the configuration address space.
+          size of the configuration address space.
 
   interrupts:
     minItems: 1
@@ -31,14 +33,23 @@ properties:
 
   arm,root-node:
     $ref: /schemas/types.yaml#/definitions/uint32
-    description: Offset from PERIPHBASE of the configuration
-      discovery node (see TRM definition of ROOTNODEBASE).
+    description: Offset from PERIPHBASE of CMN-600's configuration
+      discovery node (see TRM definition of ROOTNODEBASE). Not
+      relevant for newer CMN/CI products.
 
 required:
   - compatible
   - reg
   - interrupts
-  - arm,root-node
+
+if:
+  properties:
+    compatible:
+      contains:
+        const: arm,cmn-600
+then:
+  required:
+    - arm,root-node
 
 additionalProperties: false
 

From b2fea780c9282dbaf77ef081e6d97a3f2c0dfc6a Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:45:02 +0000
Subject: [PATCH 41/81] perf/arm-cmn: Add CI-700 Support

Add the identifiers and events for the CI-700 coherent interconnect.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/28f566ab23a83733c6c9ef9414c010b760b4549c.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 57 +++++++++++++++++++++++++++++++++++++++---
 1 file changed, 53 insertions(+), 4 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 0a3f33a83c01..28ab87a6cde4 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -175,6 +175,7 @@
 enum cmn_model {
 	CMN_ANY = -1,
 	CMN600 = 1,
+	CI700 = 2,
 };
 
 /* CMN-600 r0px shouldn't exist in silicon, thankfully */
@@ -186,6 +187,9 @@ enum cmn_revision {
 	CMN600_R2P0,
 	CMN600_R3P0,
 	CMN600_R3P1,
+	CI700_R0P0 = 0,
+	CI700_R1P0,
+	CI700_R2P0,
 };
 
 enum cmn_node_type {
@@ -401,8 +405,10 @@ struct arm_cmn_format_attr {
 static bool arm_cmn_is_occup_event(enum cmn_model model,
 				   enum cmn_node_type type, unsigned int id)
 {
-	return (type == CMN_TYPE_DVM && id == 0x05) ||
-	       (type == CMN_TYPE_HNF && id == 0x0f);
+	if (type == CMN_TYPE_DVM)
+		return (model == CMN600 && id == 0x05) ||
+		       (model == CI700 && id == 0x0c);
+	return type == CMN_TYPE_HNF && id == 0x0f;
 }
 
 static ssize_t arm_cmn_event_show(struct device *dev,
@@ -497,14 +503,19 @@ static umode_t arm_cmn_event_attr_is_visible(struct kobject *kobj,
 	__CMN_EVENT_XP(n_##_name, (_event) | (2 << 2)),		\
 	__CMN_EVENT_XP(s_##_name, (_event) | (3 << 2)),		\
 	__CMN_EVENT_XP(p0_##_name, (_event) | (4 << 2)),	\
-	__CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2))
+	__CMN_EVENT_XP(p1_##_name, (_event) | (5 << 2)),	\
+	__CMN_EVENT_XP(p2_##_name, (_event) | (6 << 2)),	\
+	__CMN_EVENT_XP(p3_##_name, (_event) | (7 << 2))
 
 /* Good thing there are only 3 fundamental XP events... */
 #define CMN_EVENT_XP(_name, _event)				\
 	_CMN_EVENT_XP(req_##_name, (_event) | (0 << 5)),	\
 	_CMN_EVENT_XP(rsp_##_name, (_event) | (1 << 5)),	\
 	_CMN_EVENT_XP(snp_##_name, (_event) | (2 << 5)),	\
-	_CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5))
+	_CMN_EVENT_XP(dat_##_name, (_event) | (3 << 5)),	\
+	_CMN_EVENT_XP(pub_##_name, (_event) | (4 << 5)),	\
+	_CMN_EVENT_XP(rsp2_##_name, (_event) | (5 << 5)),	\
+	_CMN_EVENT_XP(dat2_##_name, (_event) | (6 << 5))
 
 
 static struct attribute *arm_cmn_event_attrs[] = {
@@ -522,6 +533,20 @@ static struct attribute *arm_cmn_event_attrs[] = {
 	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_all, 0x05, 0),
 	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmop, 0x05, 1),
 	_CMN_EVENT_DVM(CMN600, rxreq_trk_occupancy_dvmsync, 0x05, 2),
+	CMN_EVENT_DVM(CI700, dvmop_tlbi,		0x01),
+	CMN_EVENT_DVM(CI700, dvmop_bpi,			0x02),
+	CMN_EVENT_DVM(CI700, dvmop_pici,		0x03),
+	CMN_EVENT_DVM(CI700, dvmop_vici,		0x04),
+	CMN_EVENT_DVM(CI700, dvmsync,			0x05),
+	CMN_EVENT_DVM(CI700, vmid_filtered,		0x06),
+	CMN_EVENT_DVM(CI700, rndop_filtered,		0x07),
+	CMN_EVENT_DVM(CI700, retry,			0x08),
+	CMN_EVENT_DVM(CI700, txsnp_flitv,		0x09),
+	CMN_EVENT_DVM(CI700, txsnp_stall,		0x0a),
+	CMN_EVENT_DVM(CI700, trkfull,			0x0b),
+	_CMN_EVENT_DVM(CI700, trk_occupancy_all,	0x0c, 0),
+	_CMN_EVENT_DVM(CI700, trk_occupancy_dvmop,	0x0c, 1),
+	_CMN_EVENT_DVM(CI700, trk_occupancy_dvmsync,	0x0c, 2),
 
 	CMN_EVENT_HNF(CMN_ANY, cache_miss,		0x01),
 	CMN_EVENT_HNF(CMN_ANY, slc_sf_cache_access,	0x02),
@@ -558,6 +583,9 @@ static struct attribute *arm_cmn_event_attrs[] = {
 	CMN_EVENT_HNF(CMN_ANY, stash_snp_sent,		0x1d),
 	CMN_EVENT_HNF(CMN_ANY, stash_data_pull,		0x1e),
 	CMN_EVENT_HNF(CMN_ANY, snp_fwded,		0x1f),
+	CMN_EVENT_HNF(CI700, atomic_fwd,		0x20),
+	CMN_EVENT_HNF(CI700, mpam_hardlim,		0x21),
+	CMN_EVENT_HNF(CI700, mpam_softlim,		0x22),
 
 	CMN_EVENT_HNI(rrt_rd_occ_cnt_ovfl,		0x20),
 	CMN_EVENT_HNI(rrt_wr_occ_cnt_ovfl,		0x21),
@@ -598,6 +626,7 @@ static struct attribute *arm_cmn_event_attrs[] = {
 	CMN_EVENT_SBSX(CMN_ANY, wdb_occ_cnt_ovfl,	0x14),
 	CMN_EVENT_SBSX(CMN_ANY, rd_axi_trkr_occ_cnt_ovfl, 0x15),
 	CMN_EVENT_SBSX(CMN_ANY, cmo_axi_trkr_occ_cnt_ovfl, 0x16),
+	CMN_EVENT_SBSX(CI700, rdb_occ_cnt_ovfl,		0x17),
 	CMN_EVENT_SBSX(CMN_ANY, arvalid_no_arready,	0x21),
 	CMN_EVENT_SBSX(CMN_ANY, awvalid_no_awready,	0x22),
 	CMN_EVENT_SBSX(CMN_ANY, wvalid_no_wready,	0x23),
@@ -624,6 +653,25 @@ static struct attribute *arm_cmn_event_attrs[] = {
 	CMN_EVENT_RNID(CMN600, rdb_replay,		0x12),
 	CMN_EVENT_RNID(CMN600, rdb_hybrid,		0x13),
 	CMN_EVENT_RNID(CMN600, rdb_ord,			0x14),
+	CMN_EVENT_RNID(CI700, padb_occ_ovfl,		0x11),
+	CMN_EVENT_RNID(CI700, rpdb_occ_ovfl,		0x12),
+	CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice1,	0x13),
+	CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice2,	0x14),
+	CMN_EVENT_RNID(CI700, rrt_occup_ovfl_slice3,	0x15),
+	CMN_EVENT_RNID(CI700, wrt_throttled,		0x16),
+
+	CMN_EVENT_MTSX(tc_lookup,			0x01),
+	CMN_EVENT_MTSX(tc_fill,				0x02),
+	CMN_EVENT_MTSX(tc_miss,				0x03),
+	CMN_EVENT_MTSX(tdb_forward,			0x04),
+	CMN_EVENT_MTSX(tcq_hazard,			0x05),
+	CMN_EVENT_MTSX(tcq_rd_alloc,			0x06),
+	CMN_EVENT_MTSX(tcq_wr_alloc,			0x07),
+	CMN_EVENT_MTSX(tcq_cmo_alloc,			0x08),
+	CMN_EVENT_MTSX(axi_rd_req,			0x09),
+	CMN_EVENT_MTSX(axi_wr_req,			0x0a),
+	CMN_EVENT_MTSX(tcq_occ_cnt_ovfl,		0x0b),
+	CMN_EVENT_MTSX(tdb_occ_cnt_ovfl,		0x0c),
 
 	NULL
 };
@@ -1773,6 +1821,7 @@ static int arm_cmn_remove(struct platform_device *pdev)
 #ifdef CONFIG_OF
 static const struct of_device_id arm_cmn_of_match[] = {
 	{ .compatible = "arm,cmn-600", .data = (void *)CMN600 },
+	{ .compatible = "arm,ci-700", .data = (void *)CI700 },
 	{}
 };
 MODULE_DEVICE_TABLE(of, arm_cmn_of_match);

From a88fa6c28b867a387e3af202d6dbbb754d3aa2f1 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Fri, 3 Dec 2021 11:45:03 +0000
Subject: [PATCH 42/81] perf/arm-cmn: Add debugfs topology info

In general, detailed performance analysis will require knoweldge of the
the SoC beyond the CMN itself - e.g. which actual CPUs/peripherals/etc.
are connected to each node. However for certain development and bringup
tasks it can be useful to have a quick overview of the CMN internal
topology to hand too. Add a debugfs file to map this out.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/159fd4d7e19fb3c8801a8cb64ee73ec50f55903c.1638530442.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm-cmn.c | 151 ++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 148 insertions(+), 3 deletions(-)

diff --git a/drivers/perf/arm-cmn.c b/drivers/perf/arm-cmn.c
index 28ab87a6cde4..0e48adce57ef 100644
--- a/drivers/perf/arm-cmn.c
+++ b/drivers/perf/arm-cmn.c
@@ -5,6 +5,7 @@
 #include <linux/acpi.h>
 #include <linux/bitfield.h>
 #include <linux/bitops.h>
+#include <linux/debugfs.h>
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/io-64-nonatomic-lo-hi.h>
@@ -287,6 +288,7 @@ struct arm_cmn {
 	struct hlist_node cpuhp_node;
 
 	struct pmu pmu;
+	struct dentry *debug;
 };
 
 #define to_cmn(p)	container_of(p, struct arm_cmn, pmu)
@@ -351,6 +353,140 @@ static struct arm_cmn_node *arm_cmn_node(const struct arm_cmn *cmn,
 	return NULL;
 }
 
+struct dentry *arm_cmn_debugfs;
+
+#ifdef CONFIG_DEBUG_FS
+static const char *arm_cmn_device_type(u8 type)
+{
+	switch(type) {
+		case 0x01: return "  RN-I  |";
+		case 0x02: return "  RN-D  |";
+		case 0x04: return " RN-F_B |";
+		case 0x05: return "RN-F_B_E|";
+		case 0x06: return " RN-F_A |";
+		case 0x07: return "RN-F_A_E|";
+		case 0x08: return "  HN-T  |";
+		case 0x09: return "  HN-I  |";
+		case 0x0a: return "  HN-D  |";
+		case 0x0c: return "  SN-F  |";
+		case 0x0d: return "  SBSX  |";
+		case 0x0e: return "  HN-F  |";
+		case 0x0f: return " SN-F_E |";
+		case 0x10: return " SN-F_D |";
+		case 0x11: return "  CXHA  |";
+		case 0x12: return "  CXRA  |";
+		case 0x13: return "  CXRH  |";
+		case 0x14: return " RN-F_D |";
+		case 0x15: return "RN-F_D_E|";
+		case 0x16: return " RN-F_C |";
+		case 0x17: return "RN-F_C_E|";
+		case 0x1c: return "  MTSX  |";
+		default:   return "        |";
+	}
+}
+
+static void arm_cmn_show_logid(struct seq_file *s, int x, int y, int p, int d)
+{
+	struct arm_cmn *cmn = s->private;
+	struct arm_cmn_node *dn;
+
+	for (dn = cmn->dns; dn->type; dn++) {
+		struct arm_cmn_nodeid nid = arm_cmn_nid(cmn, dn->id);
+
+		if (dn->type == CMN_TYPE_XP)
+			continue;
+		/* Ignore the extra components that will overlap on some ports */
+		if (dn->type < CMN_TYPE_HNI)
+			continue;
+
+		if (nid.x != x || nid.y != y || nid.port != p || nid.dev != d)
+			continue;
+
+		seq_printf(s, "   #%-2d  |", dn->logid);
+		return;
+	}
+	seq_puts(s, "        |");
+}
+
+static int arm_cmn_map_show(struct seq_file *s, void *data)
+{
+	struct arm_cmn *cmn = s->private;
+	int x, y, p, pmax = fls(cmn->ports_used);
+
+	seq_puts(s, "     X");
+	for (x = 0; x < cmn->mesh_x; x++)
+		seq_printf(s, "    %d    ", x);
+	seq_puts(s, "\nY P D+");
+	y = cmn->mesh_y;
+	while (y--) {
+		int xp_base = cmn->mesh_x * y;
+		u8 port[6][CMN_MAX_DIMENSION];
+
+		for (x = 0; x < cmn->mesh_x; x++)
+			seq_puts(s, "--------+");
+
+		seq_printf(s, "\n%d    |", y);
+		for (x = 0; x < cmn->mesh_x; x++) {
+			struct arm_cmn_node *xp = cmn->xps + xp_base + x;
+			void __iomem *base = xp->pmu_base - CMN_PMU_OFFSET;
+
+			port[0][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P0);
+			port[1][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P1);
+			port[2][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P2);
+			port[3][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P3);
+			port[4][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P4);
+			port[5][x] = readl_relaxed(base + CMN_MXP__CONNECT_INFO_P5);
+			seq_printf(s, " XP #%-2d |", xp_base + x);
+		}
+
+		seq_puts(s, "\n     |");
+		for (x = 0; x < cmn->mesh_x; x++) {
+			u8 dtc = cmn->xps[xp_base + x].dtc;
+
+			if (dtc & (dtc - 1))
+				seq_puts(s, " DTC ?? |");
+			else
+				seq_printf(s, " DTC %ld  |", __ffs(dtc));
+		}
+		seq_puts(s, "\n     |");
+		for (x = 0; x < cmn->mesh_x; x++)
+			seq_puts(s, "........|");
+
+		for (p = 0; p < pmax; p++) {
+			seq_printf(s, "\n  %d  |", p);
+			for (x = 0; x < cmn->mesh_x; x++)
+				seq_puts(s, arm_cmn_device_type(port[p][x]));
+			seq_puts(s, "\n    0|");
+			for (x = 0; x < cmn->mesh_x; x++)
+				arm_cmn_show_logid(s, x, y, p, 0);
+			seq_puts(s, "\n    1|");
+			for (x = 0; x < cmn->mesh_x; x++)
+				arm_cmn_show_logid(s, x, y, p, 1);
+		}
+		seq_puts(s, "\n-----+");
+	}
+	for (x = 0; x < cmn->mesh_x; x++)
+		seq_puts(s, "--------+");
+	seq_puts(s, "\n");
+	return 0;
+}
+DEFINE_SHOW_ATTRIBUTE(arm_cmn_map);
+
+static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id)
+{
+	const char *name  = "map";
+
+	if (id > 0)
+		name = devm_kasprintf(cmn->dev, GFP_KERNEL, "map_%d", id);
+	if (!name)
+		return;
+
+	cmn->debug = debugfs_create_file(name, 0444, arm_cmn_debugfs, cmn, &arm_cmn_map_fops);
+}
+#else
+static void arm_cmn_debugfs_init(struct arm_cmn *cmn, int id) {}
+#endif
+
 struct arm_cmn_hw_event {
 	struct arm_cmn_node *dn;
 	u64 dtm_idx[2];
@@ -1738,7 +1874,7 @@ static int arm_cmn_probe(struct platform_device *pdev)
 	struct arm_cmn *cmn;
 	const char *name;
 	static atomic_t id;
-	int err, rootnode;
+	int err, rootnode, this_id;
 
 	cmn = devm_kzalloc(&pdev->dev, sizeof(*cmn), GFP_KERNEL);
 	if (!cmn)
@@ -1792,7 +1928,8 @@ static int arm_cmn_probe(struct platform_device *pdev)
 		.cancel_txn = arm_cmn_end_txn,
 	};
 
-	name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", atomic_fetch_inc(&id));
+	this_id = atomic_fetch_inc(&id);
+	name = devm_kasprintf(cmn->dev, GFP_KERNEL, "arm_cmn_%d", this_id);
 	if (!name)
 		return -ENOMEM;
 
@@ -1803,6 +1940,8 @@ static int arm_cmn_probe(struct platform_device *pdev)
 	err = perf_pmu_register(&cmn->pmu, name, -1);
 	if (err)
 		cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node);
+	else
+		arm_cmn_debugfs_init(cmn, this_id);
 
 	return err;
 }
@@ -1815,6 +1954,7 @@ static int arm_cmn_remove(struct platform_device *pdev)
 
 	perf_pmu_unregister(&cmn->pmu);
 	cpuhp_state_remove_instance_nocalls(arm_cmn_hp_state, &cmn->cpuhp_node);
+	debugfs_remove(cmn->debug);
 	return 0;
 }
 
@@ -1857,9 +1997,13 @@ static int __init arm_cmn_init(void)
 		return ret;
 
 	arm_cmn_hp_state = ret;
+	arm_cmn_debugfs = debugfs_create_dir("arm-cmn", NULL);
+
 	ret = platform_driver_register(&arm_cmn_driver);
-	if (ret)
+	if (ret) {
 		cpuhp_remove_multi_state(arm_cmn_hp_state);
+		debugfs_remove(arm_cmn_debugfs);
+	}
 	return ret;
 }
 
@@ -1867,6 +2011,7 @@ static void __exit arm_cmn_exit(void)
 {
 	platform_driver_unregister(&arm_cmn_driver);
 	cpuhp_remove_multi_state(arm_cmn_hp_state);
+	debugfs_remove(arm_cmn_debugfs);
 }
 
 module_init(arm_cmn_init);

From 2704e7594383c0275dcb65e05408260ad34f3e15 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Wed, 17 Nov 2021 14:48:43 +0000
Subject: [PATCH 43/81] dt-bindings: Add Arm SMMUv3 PMCG binding

Add binding for the Arm SMMUv3 PMU. Each node represents a PMCG, and is
placed as a sibling node of the SMMU. Although the PMCGs registers may
be within the SMMU MMIO region, they are separate devices, and there can
be multiple PMCG devices for each SMMU (for example one for the TCU and
one for each TBU).

Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20211117144844.241072-2-jean-philippe@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 .../bindings/perf/arm,smmu-v3-pmcg.yaml       | 70 +++++++++++++++++++
 1 file changed, 70 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/perf/arm,smmu-v3-pmcg.yaml

diff --git a/Documentation/devicetree/bindings/perf/arm,smmu-v3-pmcg.yaml b/Documentation/devicetree/bindings/perf/arm,smmu-v3-pmcg.yaml
new file mode 100644
index 000000000000..a4b53a6a1ebf
--- /dev/null
+++ b/Documentation/devicetree/bindings/perf/arm,smmu-v3-pmcg.yaml
@@ -0,0 +1,70 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/perf/arm,smmu-v3-pmcg.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Arm SMMUv3 Performance Monitor Counter Group
+
+maintainers:
+  - Will Deacon <will@kernel.org>
+  - Robin Murphy <robin.murphy@arm.com>
+
+description: |
+  An SMMUv3 may have several Performance Monitor Counter Group (PMCG).
+  They are standalone performance monitoring units that support both
+  architected and IMPLEMENTATION DEFINED event counters.
+
+properties:
+  $nodename:
+    pattern: "^pmu@[0-9a-f]*"
+  compatible:
+    oneOf:
+      - items:
+          - const: arm,mmu-600-pmcg
+          - const: arm,smmu-v3-pmcg
+      - const: arm,smmu-v3-pmcg
+
+  reg:
+    items:
+      - description: Register page 0
+      - description: Register page 1, if SMMU_PMCG_CFGR.RELOC_CTRS = 1
+    minItems: 1
+
+  interrupts:
+    maxItems: 1
+
+  msi-parent: true
+
+required:
+  - compatible
+  - reg
+
+anyOf:
+  - required:
+      - interrupts
+  - required:
+      - msi-parent
+
+additionalProperties: false
+
+examples:
+  - |
+    #include <dt-bindings/interrupt-controller/arm-gic.h>
+    #include <dt-bindings/interrupt-controller/irq.h>
+
+    pmu@2b420000 {
+            compatible = "arm,smmu-v3-pmcg";
+            reg = <0x2b420000 0x1000>,
+                  <0x2b430000 0x1000>;
+            interrupts = <GIC_SPI 80 IRQ_TYPE_EDGE_RISING>;
+            msi-parent = <&its 0xff0000>;
+    };
+
+    pmu@2b440000 {
+            compatible = "arm,smmu-v3-pmcg";
+            reg = <0x2b440000 0x1000>,
+                  <0x2b450000 0x1000>;
+            interrupts = <GIC_SPI 81 IRQ_TYPE_EDGE_RISING>;
+            msi-parent = <&its 0xff0000>;
+    };

From 3f7be43561766b7ad91661580376913c997613d6 Mon Sep 17 00:00:00 2001
From: Jean-Philippe Brucker <jean-philippe@linaro.org>
Date: Wed, 17 Nov 2021 14:48:44 +0000
Subject: [PATCH 44/81] perf/smmuv3: Add devicetree support

Add device-tree support to the SMMUv3 PMCG driver.

Signed-off-by: Jay Chen <jkchen@linux.alibaba.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/20211117144844.241072-3-jean-philippe@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_smmuv3_pmu.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 226348822ab3..19697617153a 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -47,6 +47,7 @@
 #include <linux/kernel.h>
 #include <linux/list.h>
 #include <linux/msi.h>
+#include <linux/of.h>
 #include <linux/perf_event.h>
 #include <linux/platform_device.h>
 #include <linux/smp.h>
@@ -834,7 +835,8 @@ static int smmu_pmu_probe(struct platform_device *pdev)
 		return -EINVAL;
 	}
 
-	smmu_pmu_get_acpi_options(smmu_pmu);
+	if (!dev->of_node)
+		smmu_pmu_get_acpi_options(smmu_pmu);
 
 	/* Pick one CPU to be the preferred one to use */
 	smmu_pmu->on_cpu = raw_smp_processor_id();
@@ -884,9 +886,16 @@ static void smmu_pmu_shutdown(struct platform_device *pdev)
 	smmu_pmu_disable(&smmu_pmu->pmu);
 }
 
+static const struct of_device_id smmu_pmu_of_match[] = {
+	{ .compatible = "arm,smmu-v3-pmcg" },
+	{}
+};
+MODULE_DEVICE_TABLE(of, smmu_pmu_of_match);
+
 static struct platform_driver smmu_pmu_driver = {
 	.driver = {
 		.name = "arm-smmu-v3-pmcg",
+		.of_match_table = of_match_ptr(smmu_pmu_of_match),
 		.suppress_bind_attrs = true,
 	},
 	.probe = smmu_pmu_probe,

From df457ca973fee9e66d0d688afa6605d053034cd2 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Wed, 17 Nov 2021 14:48:45 +0000
Subject: [PATCH 45/81] perf/smmuv3: Synthesize IIDR from CoreSight ID
 registers

The SMMU_PMCG_IIDR register was not present in older revisions of the
Arm SMMUv3 spec. On Arm Ltd. implementations, the IIDR value consists of
fields from several PIDR registers, allowing us to present a
standardized identifier to userspace.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Jean-Philippe Brucker <jean-philippe@linaro.org>
Link: https://lore.kernel.org/r/20211117144844.241072-4-jean-philippe@linaro.org
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_smmuv3_pmu.c | 55 ++++++++++++++++++++++++++++++++++-
 1 file changed, 54 insertions(+), 1 deletion(-)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 19697617153a..598d6978280d 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -76,6 +76,10 @@
 #define SMMU_PMCG_CR                    0xE04
 #define SMMU_PMCG_CR_ENABLE             BIT(0)
 #define SMMU_PMCG_IIDR                  0xE08
+#define SMMU_PMCG_IIDR_PRODUCTID        GENMASK(31, 20)
+#define SMMU_PMCG_IIDR_VARIANT          GENMASK(19, 16)
+#define SMMU_PMCG_IIDR_REVISION         GENMASK(15, 12)
+#define SMMU_PMCG_IIDR_IMPLEMENTER      GENMASK(11, 0)
 #define SMMU_PMCG_CEID0                 0xE20
 #define SMMU_PMCG_CEID1                 0xE28
 #define SMMU_PMCG_IRQ_CTRL              0xE50
@@ -84,6 +88,20 @@
 #define SMMU_PMCG_IRQ_CFG1              0xE60
 #define SMMU_PMCG_IRQ_CFG2              0xE64
 
+/* IMP-DEF ID registers */
+#define SMMU_PMCG_PIDR0                 0xFE0
+#define SMMU_PMCG_PIDR0_PART_0          GENMASK(7, 0)
+#define SMMU_PMCG_PIDR1                 0xFE4
+#define SMMU_PMCG_PIDR1_DES_0           GENMASK(7, 4)
+#define SMMU_PMCG_PIDR1_PART_1          GENMASK(3, 0)
+#define SMMU_PMCG_PIDR2                 0xFE8
+#define SMMU_PMCG_PIDR2_REVISION        GENMASK(7, 4)
+#define SMMU_PMCG_PIDR2_DES_1           GENMASK(2, 0)
+#define SMMU_PMCG_PIDR3                 0xFEC
+#define SMMU_PMCG_PIDR3_REVAND          GENMASK(7, 4)
+#define SMMU_PMCG_PIDR4                 0xFD0
+#define SMMU_PMCG_PIDR4_DES_2           GENMASK(3, 0)
+
 /* MSI config fields */
 #define MSI_CFG0_ADDR_MASK              GENMASK_ULL(51, 2)
 #define MSI_CFG2_MEMATTR_DEVICE_nGnRE   0x1
@@ -755,6 +773,41 @@ static void smmu_pmu_get_acpi_options(struct smmu_pmu *smmu_pmu)
 	dev_notice(smmu_pmu->dev, "option mask 0x%x\n", smmu_pmu->options);
 }
 
+static bool smmu_pmu_coresight_id_regs(struct smmu_pmu *smmu_pmu)
+{
+	return of_device_is_compatible(smmu_pmu->dev->of_node,
+				       "arm,mmu-600-pmcg");
+}
+
+static void smmu_pmu_get_iidr(struct smmu_pmu *smmu_pmu)
+{
+	u32 iidr = readl_relaxed(smmu_pmu->reg_base + SMMU_PMCG_IIDR);
+
+	if (!iidr && smmu_pmu_coresight_id_regs(smmu_pmu)) {
+		u32 pidr0 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR0);
+		u32 pidr1 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR1);
+		u32 pidr2 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR2);
+		u32 pidr3 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR3);
+		u32 pidr4 = readl(smmu_pmu->reg_base + SMMU_PMCG_PIDR4);
+
+		u32 productid = FIELD_GET(SMMU_PMCG_PIDR0_PART_0, pidr0) |
+				(FIELD_GET(SMMU_PMCG_PIDR1_PART_1, pidr1) << 8);
+		u32 variant = FIELD_GET(SMMU_PMCG_PIDR2_REVISION, pidr2);
+		u32 revision = FIELD_GET(SMMU_PMCG_PIDR3_REVAND, pidr3);
+		u32 implementer =
+			FIELD_GET(SMMU_PMCG_PIDR1_DES_0, pidr1) |
+			(FIELD_GET(SMMU_PMCG_PIDR2_DES_1, pidr2) << 4) |
+			(FIELD_GET(SMMU_PMCG_PIDR4_DES_2, pidr4) << 8);
+
+		iidr = FIELD_PREP(SMMU_PMCG_IIDR_PRODUCTID, productid) |
+		       FIELD_PREP(SMMU_PMCG_IIDR_VARIANT, variant) |
+		       FIELD_PREP(SMMU_PMCG_IIDR_REVISION, revision) |
+		       FIELD_PREP(SMMU_PMCG_IIDR_IMPLEMENTER, implementer);
+	}
+
+	smmu_pmu->iidr = iidr;
+}
+
 static int smmu_pmu_probe(struct platform_device *pdev)
 {
 	struct smmu_pmu *smmu_pmu;
@@ -826,7 +879,7 @@ static int smmu_pmu_probe(struct platform_device *pdev)
 		return err;
 	}
 
-	smmu_pmu->iidr = readl_relaxed(smmu_pmu->reg_base + SMMU_PMCG_IIDR);
+	smmu_pmu_get_iidr(smmu_pmu);
 
 	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "smmuv3_pmcg_%llx",
 			      (res_0->start) >> SMMU_PMCG_PA_SHIFT);

From 2c54b423cf85baed5ad9f9546f6c8ea741774a06 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ardb@kernel.org>
Date: Mon, 13 Dec 2021 15:02:52 +0100
Subject: [PATCH 46/81] arm64/xor: use EOR3 instructions when available

Use the EOR3 instruction to implement xor_blocks() if the instruction is
available, which is the case if the CPU implements the SHA-3 extension.
This is about 20% faster on Apple M1 when using the 5-way version.

Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20211213140252.2856053-1-ardb@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig        |   6 ++
 arch/arm64/Makefile       |   5 ++
 arch/arm64/lib/xor-neon.c | 147 +++++++++++++++++++++++++++++++++++++-
 3 files changed, 157 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c4207cf9bb17..63d41ba4e716 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -1545,6 +1545,12 @@ endmenu
 
 menu "ARMv8.2 architectural features"
 
+config AS_HAS_ARMV8_2
+       def_bool $(cc-option,-Wa$(comma)-march=armv8.2-a)
+
+config AS_HAS_SHA3
+       def_bool $(as-instr,.arch armv8.2-a+sha3)
+
 config ARM64_PMEM
 	bool "Enable support for persistent memory"
 	select ARCH_HAS_PMEM_API
diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile
index e8cfc5868aa8..2f1de88651e6 100644
--- a/arch/arm64/Makefile
+++ b/arch/arm64/Makefile
@@ -58,6 +58,11 @@ stack_protector_prepare: prepare0
 					include/generated/asm-offsets.h))
 endif
 
+ifeq ($(CONFIG_AS_HAS_ARMV8_2), y)
+# make sure to pass the newest target architecture to -march.
+asm-arch := armv8.2-a
+endif
+
 # Ensure that if the compiler supports branch protection we default it
 # off, this will be overridden if we are using branch protection.
 branch-prot-flags-y += $(call cc-option,-mbranch-protection=none)
diff --git a/arch/arm64/lib/xor-neon.c b/arch/arm64/lib/xor-neon.c
index 11bf4f8aca68..d189cf4e70ea 100644
--- a/arch/arm64/lib/xor-neon.c
+++ b/arch/arm64/lib/xor-neon.c
@@ -167,7 +167,7 @@ void xor_arm64_neon_5(unsigned long bytes, unsigned long *p1,
 	} while (--lines > 0);
 }
 
-struct xor_block_template const xor_block_inner_neon = {
+struct xor_block_template xor_block_inner_neon __ro_after_init = {
 	.name	= "__inner_neon__",
 	.do_2	= xor_arm64_neon_2,
 	.do_3	= xor_arm64_neon_3,
@@ -176,6 +176,151 @@ struct xor_block_template const xor_block_inner_neon = {
 };
 EXPORT_SYMBOL(xor_block_inner_neon);
 
+static inline uint64x2_t eor3(uint64x2_t p, uint64x2_t q, uint64x2_t r)
+{
+	uint64x2_t res;
+
+	asm(ARM64_ASM_PREAMBLE ".arch_extension sha3\n"
+	    "eor3 %0.16b, %1.16b, %2.16b, %3.16b"
+	    : "=w"(res) : "w"(p), "w"(q), "w"(r));
+	return res;
+}
+
+static void xor_arm64_eor3_3(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_4(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 */
+		v0 = veorq_u64(v0, vld1q_u64(dp4 + 0));
+		v1 = veorq_u64(v1, vld1q_u64(dp4 + 2));
+		v2 = veorq_u64(v2, vld1q_u64(dp4 + 4));
+		v3 = veorq_u64(v3, vld1q_u64(dp4 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+	} while (--lines > 0);
+}
+
+static void xor_arm64_eor3_5(unsigned long bytes, unsigned long *p1,
+			     unsigned long *p2, unsigned long *p3,
+			     unsigned long *p4, unsigned long *p5)
+{
+	uint64_t *dp1 = (uint64_t *)p1;
+	uint64_t *dp2 = (uint64_t *)p2;
+	uint64_t *dp3 = (uint64_t *)p3;
+	uint64_t *dp4 = (uint64_t *)p4;
+	uint64_t *dp5 = (uint64_t *)p5;
+
+	register uint64x2_t v0, v1, v2, v3;
+	long lines = bytes / (sizeof(uint64x2_t) * 4);
+
+	do {
+		/* p1 ^= p2 ^ p3 */
+		v0 = eor3(vld1q_u64(dp1 + 0), vld1q_u64(dp2 + 0),
+			  vld1q_u64(dp3 + 0));
+		v1 = eor3(vld1q_u64(dp1 + 2), vld1q_u64(dp2 + 2),
+			  vld1q_u64(dp3 + 2));
+		v2 = eor3(vld1q_u64(dp1 + 4), vld1q_u64(dp2 + 4),
+			  vld1q_u64(dp3 + 4));
+		v3 = eor3(vld1q_u64(dp1 + 6), vld1q_u64(dp2 + 6),
+			  vld1q_u64(dp3 + 6));
+
+		/* p1 ^= p4 ^ p5 */
+		v0 = eor3(v0, vld1q_u64(dp4 + 0), vld1q_u64(dp5 + 0));
+		v1 = eor3(v1, vld1q_u64(dp4 + 2), vld1q_u64(dp5 + 2));
+		v2 = eor3(v2, vld1q_u64(dp4 + 4), vld1q_u64(dp5 + 4));
+		v3 = eor3(v3, vld1q_u64(dp4 + 6), vld1q_u64(dp5 + 6));
+
+		/* store */
+		vst1q_u64(dp1 + 0, v0);
+		vst1q_u64(dp1 + 2, v1);
+		vst1q_u64(dp1 + 4, v2);
+		vst1q_u64(dp1 + 6, v3);
+
+		dp1 += 8;
+		dp2 += 8;
+		dp3 += 8;
+		dp4 += 8;
+		dp5 += 8;
+	} while (--lines > 0);
+}
+
+static int __init xor_neon_init(void)
+{
+	if (IS_ENABLED(CONFIG_AS_HAS_SHA3) && cpu_have_named_feature(SHA3)) {
+		xor_block_inner_neon.do_3 = xor_arm64_eor3_3;
+		xor_block_inner_neon.do_4 = xor_arm64_eor3_4;
+		xor_block_inner_neon.do_5 = xor_arm64_eor3_5;
+	}
+	return 0;
+}
+module_init(xor_neon_init);
+
+static void __exit xor_neon_exit(void)
+{
+}
+module_exit(xor_neon_exit);
+
 MODULE_AUTHOR("Jackie Liu <liuyun01@kylinos.cn>");
 MODULE_DESCRIPTION("ARMv8 XOR Extensions");
 MODULE_LICENSE("GPL");

From 036a7584bede317d0df6b854e4f531b7a2dd8b33 Mon Sep 17 00:00:00 2001
From: Bhaskara Budiredla <bbudiredla@marvell.com>
Date: Mon, 15 Nov 2021 10:05:05 +0530
Subject: [PATCH 47/81] drivers: perf: Add LLC-TAD perf counter support

This driver adds support for Last-level cache tag-and-data unit
(LLC-TAD) PMU that is featured in some of the Marvell's CN10K
infrastructure silicons.

The LLC is divided into 2N slices distributed across N Mesh tiles
in a single-socket configuration. The driver always configures the
same counter for all of the TADs. The user would end up effectively
reserving one of eight counters in every TAD to look across all TADs.
The occurrences of events are aggregated and presented to the user
at the end of an application run. The driver does not provide a way
for the user to partition TADs so that different TADs are used for
different applications.

The event counters are zeroed to start event counting to avoid any
rollover issues. TAD perf counters are 64-bit, so it's not currently
possible to overflow event counters at current mesh and core
frequencies.

To measure tad pmu events use perf tool stat command. For instance:

perf stat -e tad_dat_msh_in_dss,tad_req_msh_out_any <application>
perf stat -e tad_alloc_any,tad_hit_any,tad_tag_rd <application>

Signed-off-by: Bhaskara Budiredla <bbudiredla@marvell.com>
Link: https://lore.kernel.org/r/20211115043506.6679-2-bbudiredla@marvell.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/Kconfig                 |   7 +
 drivers/perf/Makefile                |   1 +
 drivers/perf/marvell_cn10k_tad_pmu.c | 429 +++++++++++++++++++++++++++
 3 files changed, 437 insertions(+)
 create mode 100644 drivers/perf/marvell_cn10k_tad_pmu.c

diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig
index 4374af292e6d..8fe825dd0d08 100644
--- a/drivers/perf/Kconfig
+++ b/drivers/perf/Kconfig
@@ -139,6 +139,13 @@ config ARM_DMC620_PMU
 	  Support for PMU events monitoring on the ARM DMC-620 memory
 	  controller.
 
+config MARVELL_CN10K_TAD_PMU
+	tristate "Marvell CN10K LLC-TAD PMU"
+	depends on ARM64 || (COMPILE_TEST && 64BIT)
+	help
+	  Provides support for Last-Level cache Tag-and-data Units (LLC-TAD)
+	  performance monitors on CN10K family silicons.
+
 source "drivers/perf/hisilicon/Kconfig"
 
 endmenu
diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile
index 5260b116c7da..2db5418d5b0a 100644
--- a/drivers/perf/Makefile
+++ b/drivers/perf/Makefile
@@ -14,3 +14,4 @@ obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o
 obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o
 obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o
 obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o
+obj-$(CONFIG_MARVELL_CN10K_TAD_PMU) += marvell_cn10k_tad_pmu.o
diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c
new file mode 100644
index 000000000000..250dd4c52d70
--- /dev/null
+++ b/drivers/perf/marvell_cn10k_tad_pmu.c
@@ -0,0 +1,429 @@
+// SPDX-License-Identifier: GPL-2.0
+/* Marvell CN10K LLC-TAD perf driver
+ *
+ * Copyright (C) 2021 Marvell
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#define pr_fmt(fmt) "tad_pmu: " fmt
+
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/of_device.h>
+#include <linux/cpuhotplug.h>
+#include <linux/perf_event.h>
+#include <linux/platform_device.h>
+
+#define TAD_PFC_OFFSET		0x0
+#define TAD_PFC(counter)	(TAD_PFC_OFFSET | (counter << 3))
+#define TAD_PRF_OFFSET		0x100
+#define TAD_PRF(counter)	(TAD_PRF_OFFSET | (counter << 3))
+#define TAD_PRF_CNTSEL_MASK	0xFF
+#define TAD_MAX_COUNTERS	8
+
+#define to_tad_pmu(p) (container_of(p, struct tad_pmu, pmu))
+
+struct tad_region {
+	void __iomem	*base;
+};
+
+struct tad_pmu {
+	struct pmu pmu;
+	struct tad_region *regions;
+	u32 region_cnt;
+	unsigned int cpu;
+	struct hlist_node node;
+	struct perf_event *events[TAD_MAX_COUNTERS];
+	DECLARE_BITMAP(counters_map, TAD_MAX_COUNTERS);
+};
+
+static int tad_pmu_cpuhp_state;
+
+static void tad_pmu_event_counter_read(struct perf_event *event)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 counter_idx = hwc->idx;
+	u64 prev, new;
+	int i;
+
+	do {
+		prev = local64_read(&hwc->prev_count);
+		for (i = 0, new = 0; i < tad_pmu->region_cnt; i++)
+			new += readq(tad_pmu->regions[i].base +
+				     TAD_PFC(counter_idx));
+	} while (local64_cmpxchg(&hwc->prev_count, prev, new) != prev);
+
+	local64_add(new - prev, &event->count);
+}
+
+static void tad_pmu_event_counter_stop(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 counter_idx = hwc->idx;
+	int i;
+
+	/* TAD()_PFC() stop counting on the write
+	 * which sets TAD()_PRF()[CNTSEL] == 0
+	 */
+	for (i = 0; i < tad_pmu->region_cnt; i++) {
+		writeq_relaxed(0, tad_pmu->regions[i].base +
+			       TAD_PRF(counter_idx));
+	}
+
+	tad_pmu_event_counter_read(event);
+	hwc->state |= PERF_HES_STOPPED | PERF_HES_UPTODATE;
+}
+
+static void tad_pmu_event_counter_start(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u32 event_idx = event->attr.config;
+	u32 counter_idx = hwc->idx;
+	u64 reg_val;
+	int i;
+
+	hwc->state = 0;
+
+	/* Typically TAD_PFC() are zeroed to start counting */
+	for (i = 0; i < tad_pmu->region_cnt; i++)
+		writeq_relaxed(0, tad_pmu->regions[i].base +
+			       TAD_PFC(counter_idx));
+
+	/* TAD()_PFC() start counting on the write
+	 * which sets TAD()_PRF()[CNTSEL] != 0
+	 */
+	for (i = 0; i < tad_pmu->region_cnt; i++) {
+		reg_val = readq_relaxed(tad_pmu->regions[i].base +
+					TAD_PRF(counter_idx));
+		reg_val |= (event_idx & 0xFF);
+		writeq_relaxed(reg_val,	tad_pmu->regions[i].base +
+			       TAD_PRF(counter_idx));
+	}
+}
+
+static void tad_pmu_event_counter_del(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	tad_pmu_event_counter_stop(event, flags | PERF_EF_UPDATE);
+	tad_pmu->events[idx] = NULL;
+	clear_bit(idx, tad_pmu->counters_map);
+}
+
+static int tad_pmu_event_counter_add(struct perf_event *event, int flags)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	/* Get a free counter for this event */
+	idx = find_first_zero_bit(tad_pmu->counters_map, TAD_MAX_COUNTERS);
+	if (idx == TAD_MAX_COUNTERS)
+		return -EAGAIN;
+
+	set_bit(idx, tad_pmu->counters_map);
+
+	hwc->idx = idx;
+	hwc->state = PERF_HES_STOPPED;
+	tad_pmu->events[idx] = event;
+
+	if (flags & PERF_EF_START)
+		tad_pmu_event_counter_start(event, flags);
+
+	return 0;
+}
+
+static int tad_pmu_event_init(struct perf_event *event)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(event->pmu);
+
+	if (!event->attr.disabled)
+		return -EINVAL;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	if (event->state != PERF_EVENT_STATE_OFF)
+		return -EINVAL;
+
+	event->cpu = tad_pmu->cpu;
+	event->hw.idx = -1;
+	event->hw.config_base = event->attr.config;
+
+	return 0;
+}
+
+static ssize_t tad_pmu_event_show(struct device *dev,
+				struct device_attribute *attr, char *page)
+{
+	struct perf_pmu_events_attr *pmu_attr;
+
+	pmu_attr = container_of(attr, struct perf_pmu_events_attr, attr);
+	return sysfs_emit(page, "event=0x%02llx\n", pmu_attr->id);
+}
+
+#define TAD_PMU_EVENT_ATTR(name, config)			\
+	PMU_EVENT_ATTR_ID(name, tad_pmu_event_show, config)
+
+static struct attribute *tad_pmu_event_attrs[] = {
+	TAD_PMU_EVENT_ATTR(tad_none, 0x0),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_in_any, 0x1),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_in_mn, 0x2),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_in_exlmn, 0x3),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_any, 0x4),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_mn, 0x5),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_exlmn, 0x6),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_dss, 0x7),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_in_retry_dss, 0x8),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_in_any, 0x9),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_in_dss, 0xa),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_out_any, 0xb),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_out_dss_rd, 0xc),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_out_dss_wr, 0xd),
+	TAD_PMU_EVENT_ATTR(tad_req_msh_out_evict, 0xe),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_any, 0xf),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_retry_exlmn, 0x10),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_retry_mn, 0x11),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_exlmn, 0x12),
+	TAD_PMU_EVENT_ATTR(tad_rsp_msh_out_mn, 0x13),
+	TAD_PMU_EVENT_ATTR(tad_snp_msh_out_any, 0x14),
+	TAD_PMU_EVENT_ATTR(tad_snp_msh_out_mn, 0x15),
+	TAD_PMU_EVENT_ATTR(tad_snp_msh_out_exlmn, 0x16),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_out_any, 0x17),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_out_fill, 0x18),
+	TAD_PMU_EVENT_ATTR(tad_dat_msh_out_dss, 0x19),
+	TAD_PMU_EVENT_ATTR(tad_alloc_dtg, 0x1a),
+	TAD_PMU_EVENT_ATTR(tad_alloc_ltg, 0x1b),
+	TAD_PMU_EVENT_ATTR(tad_alloc_any, 0x1c),
+	TAD_PMU_EVENT_ATTR(tad_hit_dtg, 0x1d),
+	TAD_PMU_EVENT_ATTR(tad_hit_ltg, 0x1e),
+	TAD_PMU_EVENT_ATTR(tad_hit_any, 0x1f),
+	TAD_PMU_EVENT_ATTR(tad_tag_rd, 0x20),
+	TAD_PMU_EVENT_ATTR(tad_dat_rd, 0x21),
+	TAD_PMU_EVENT_ATTR(tad_dat_rd_byp, 0x22),
+	TAD_PMU_EVENT_ATTR(tad_ifb_occ, 0x23),
+	TAD_PMU_EVENT_ATTR(tad_req_occ, 0x24),
+	NULL
+};
+
+static const struct attribute_group tad_pmu_events_attr_group = {
+	.name = "events",
+	.attrs = tad_pmu_event_attrs,
+};
+
+PMU_FORMAT_ATTR(event, "config:0-7");
+
+static struct attribute *tad_pmu_format_attrs[] = {
+	&format_attr_event.attr,
+	NULL
+};
+
+static struct attribute_group tad_pmu_format_attr_group = {
+	.name = "format",
+	.attrs = tad_pmu_format_attrs,
+};
+
+static ssize_t tad_pmu_cpumask_show(struct device *dev,
+				struct device_attribute *attr, char *buf)
+{
+	struct tad_pmu *tad_pmu = to_tad_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(tad_pmu->cpu));
+}
+
+static DEVICE_ATTR(cpumask, 0444, tad_pmu_cpumask_show, NULL);
+
+static struct attribute *tad_pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static struct attribute_group tad_pmu_cpumask_attr_group = {
+	.attrs = tad_pmu_cpumask_attrs,
+};
+
+static const struct attribute_group *tad_pmu_attr_groups[] = {
+	&tad_pmu_events_attr_group,
+	&tad_pmu_format_attr_group,
+	&tad_pmu_cpumask_attr_group,
+	NULL
+};
+
+static int tad_pmu_probe(struct platform_device *pdev)
+{
+	struct device_node *node = pdev->dev.of_node;
+	struct tad_region *regions;
+	struct tad_pmu *tad_pmu;
+	struct resource *res;
+	u32 tad_pmu_page_size;
+	u32 tad_page_size;
+	u32 tad_cnt;
+	int i, ret;
+	char *name;
+
+	tad_pmu = devm_kzalloc(&pdev->dev, sizeof(*tad_pmu), GFP_KERNEL);
+	if (!tad_pmu)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, tad_pmu);
+
+	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	if (!res) {
+		dev_err(&pdev->dev, "Mem resource not found\n");
+		return -ENODEV;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-page-size",
+				   &tad_page_size);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-page-size property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-pmu-page-size",
+				   &tad_pmu_page_size);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-pmu-page-size property\n");
+		return ret;
+	}
+
+	ret = of_property_read_u32(node, "marvell,tad-cnt", &tad_cnt);
+	if (ret) {
+		dev_err(&pdev->dev, "Can't find tad-cnt property\n");
+		return ret;
+	}
+
+	regions = devm_kcalloc(&pdev->dev, tad_cnt,
+			       sizeof(*regions), GFP_KERNEL);
+	if (!regions)
+		return -ENOMEM;
+
+	/* ioremap the distributed TAD pmu regions */
+	for (i = 0; i < tad_cnt && res->start < res->end; i++) {
+		regions[i].base = devm_ioremap(&pdev->dev,
+					       res->start,
+					       tad_pmu_page_size);
+		if (IS_ERR(regions[i].base)) {
+			dev_err(&pdev->dev, "TAD%d ioremap fail\n", i);
+			return -ENOMEM;
+		}
+		res->start += tad_page_size;
+	}
+
+	tad_pmu->regions = regions;
+	tad_pmu->region_cnt = tad_cnt;
+
+	tad_pmu->pmu = (struct pmu) {
+
+		.module		= THIS_MODULE,
+		.attr_groups	= tad_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE |
+				  PERF_PMU_CAP_NO_INTERRUPT,
+		.task_ctx_nr	= perf_invalid_context,
+
+		.event_init	= tad_pmu_event_init,
+		.add		= tad_pmu_event_counter_add,
+		.del		= tad_pmu_event_counter_del,
+		.start		= tad_pmu_event_counter_start,
+		.stop		= tad_pmu_event_counter_stop,
+		.read		= tad_pmu_event_counter_read,
+	};
+
+	tad_pmu->cpu = raw_smp_processor_id();
+
+	/* Register pmu instance for cpu hotplug */
+	ret = cpuhp_state_add_instance_nocalls(tad_pmu_cpuhp_state,
+					       &tad_pmu->node);
+	if (ret) {
+		dev_err(&pdev->dev, "Error %d registering hotplug\n", ret);
+		return ret;
+	}
+
+	name = "tad";
+	ret = perf_pmu_register(&tad_pmu->pmu, name, -1);
+	if (ret)
+		cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state,
+						    &tad_pmu->node);
+
+	return ret;
+}
+
+static int tad_pmu_remove(struct platform_device *pdev)
+{
+	struct tad_pmu *pmu = platform_get_drvdata(pdev);
+
+	cpuhp_state_remove_instance_nocalls(tad_pmu_cpuhp_state,
+						&pmu->node);
+	perf_pmu_unregister(&pmu->pmu);
+
+	return 0;
+}
+
+static const struct of_device_id tad_pmu_of_match[] = {
+	{ .compatible = "marvell,cn10k-tad-pmu", },
+	{},
+};
+
+static struct platform_driver tad_pmu_driver = {
+	.driver         = {
+		.name   = "cn10k_tad_pmu",
+		.of_match_table = of_match_ptr(tad_pmu_of_match),
+		.suppress_bind_attrs = true,
+	},
+	.probe          = tad_pmu_probe,
+	.remove         = tad_pmu_remove,
+};
+
+static int tad_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct tad_pmu *pmu = hlist_entry_safe(node, struct tad_pmu, node);
+	unsigned int target;
+
+	if (cpu != pmu->cpu)
+		return 0;
+
+	target = cpumask_any_but(cpu_online_mask, cpu);
+	if (target >= nr_cpu_ids)
+		return 0;
+
+	perf_pmu_migrate_context(&pmu->pmu, cpu, target);
+	pmu->cpu = target;
+
+	return 0;
+}
+
+static int __init tad_pmu_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN,
+				      "perf/cn10k/tadpmu:online",
+				      NULL,
+				      tad_pmu_offline_cpu);
+	if (ret < 0)
+		return ret;
+	tad_pmu_cpuhp_state = ret;
+	return platform_driver_register(&tad_pmu_driver);
+}
+
+static void __exit tad_pmu_exit(void)
+{
+	platform_driver_unregister(&tad_pmu_driver);
+	cpuhp_remove_multi_state(tad_pmu_cpuhp_state);
+}
+
+module_init(tad_pmu_init);
+module_exit(tad_pmu_exit);
+
+MODULE_DESCRIPTION("Marvell CN10K LLC-TAD Perf driver");
+MODULE_AUTHOR("Bhaskara Budiredla <bbudiredla@marvell.com>");
+MODULE_LICENSE("GPL v2");

From 4cbf47728f8d9d18e7e43863c0b623351267f203 Mon Sep 17 00:00:00 2001
From: Bhaskara Budiredla <bbudiredla@marvell.com>
Date: Mon, 15 Nov 2021 10:05:06 +0530
Subject: [PATCH 48/81] dt-bindings: perf: Add YAML schemas for Marvell CN10K
 LLC-TAD pmu bindings

Add device tree bindings for Last-level-cache Tag-and-data
(LLC-TAD) unit PMU for Marvell CN10K SoCs.

Signed-off-by: Bhaskara Budiredla <bbudiredla@marvell.com>
Reviewed-by: Rob Herring <robh@kernel.org>
Link: https://lore.kernel.org/r/20211115043506.6679-3-bbudiredla@marvell.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 .../bindings/perf/marvell-cn10k-tad.yaml      | 63 +++++++++++++++++++
 1 file changed, 63 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml

diff --git a/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml b/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml
new file mode 100644
index 000000000000..362142252667
--- /dev/null
+++ b/Documentation/devicetree/bindings/perf/marvell-cn10k-tad.yaml
@@ -0,0 +1,63 @@
+# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause)
+%YAML 1.2
+---
+$id: http://devicetree.org/schemas/perf/marvell-cn10k-tad.yaml#
+$schema: http://devicetree.org/meta-schemas/core.yaml#
+
+title: Marvell CN10K LLC-TAD performance monitor
+
+maintainers:
+  - Bhaskara Budiredla <bbudiredla@marvell.com>
+
+description: |
+  The Tag-and-Data units (TADs) maintain coherence and contain CN10K
+  shared on-chip last level cache (LLC). The tad pmu measures the
+  performance of last-level cache. Each tad pmu supports up to eight
+  counters.
+
+  The DT setup comprises of number of tad blocks, the sizes of pmu
+  regions, tad blocks and overall base address of the HW.
+
+properties:
+  compatible:
+    const: marvell,cn10k-tad-pmu
+
+  reg:
+    maxItems: 1
+
+  marvell,tad-cnt:
+    description: specifies the number of tads on the soc
+    $ref: /schemas/types.yaml#/definitions/uint32
+
+  marvell,tad-page-size:
+    description: specifies the size of each tad page
+    $ref: /schemas/types.yaml#/definitions/uint32
+
+  marvell,tad-pmu-page-size:
+    description: specifies the size of page that the pmu uses
+    $ref: /schemas/types.yaml#/definitions/uint32
+
+required:
+  - compatible
+  - reg
+  - marvell,tad-cnt
+  - marvell,tad-page-size
+  - marvell,tad-pmu-page-size
+
+additionalProperties: false
+
+examples:
+  - |
+
+    tad {
+        #address-cells = <2>;
+        #size-cells = <2>;
+
+        tad_pmu@80000000 {
+            compatible = "marvell,cn10k-tad-pmu";
+            reg = <0x87e2 0x80000000 0x0 0x1000>;
+            marvell,tad-cnt = <1>;
+            marvell,tad-page-size = <0x1000>;
+            marvell,tad-pmu-page-size = <0x1000>;
+        };
+    };

From c8602008e247f5603317c16f076565a96715e1ba Mon Sep 17 00:00:00 2001
From: Qi Liu <liuqi115@huawei.com>
Date: Thu, 2 Dec 2021 16:06:32 +0800
Subject: [PATCH 49/81] docs: perf: Add description for HiSilicon PCIe PMU
 driver

PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported on
HiSilicon HIP09 platform. Document it to provide guidance on how to
use it.

Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Qi Liu <liuqi115@huawei.com>
Reviewed-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Link: https://lore.kernel.org/r/20211202080633.2919-2-liuqi115@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 .../admin-guide/perf/hisi-pcie-pmu.rst        | 106 ++++++++++++++++++
 1 file changed, 106 insertions(+)
 create mode 100644 Documentation/admin-guide/perf/hisi-pcie-pmu.rst

diff --git a/Documentation/admin-guide/perf/hisi-pcie-pmu.rst b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
new file mode 100644
index 000000000000..294ebbdb22af
--- /dev/null
+++ b/Documentation/admin-guide/perf/hisi-pcie-pmu.rst
@@ -0,0 +1,106 @@
+================================================
+HiSilicon PCIe Performance Monitoring Unit (PMU)
+================================================
+
+On Hip09, HiSilicon PCIe Performance Monitoring Unit (PMU) could monitor
+bandwidth, latency, bus utilization and buffer occupancy data of PCIe.
+
+Each PCIe Core has a PMU to monitor multi Root Ports of this PCIe Core and
+all Endpoints downstream these Root Ports.
+
+
+HiSilicon PCIe PMU driver
+=========================
+
+The PCIe PMU driver registers a perf PMU with the name of its sicl-id and PCIe
+Core id.::
+
+  /sys/bus/event_source/hisi_pcie<sicl>_<core>
+
+PMU driver provides description of available events and filter options in sysfs,
+see /sys/bus/event_source/devices/hisi_pcie<sicl>_<core>.
+
+The "format" directory describes all formats of the config (events) and config1
+(filter options) fields of the perf_event_attr structure. The "events" directory
+describes all documented events shown in perf list.
+
+The "identifier" sysfs file allows users to identify the version of the
+PMU hardware device.
+
+The "bus" sysfs file allows users to get the bus number of Root Ports
+monitored by PMU.
+
+Example usage of perf::
+
+  $# perf list
+  hisi_pcie0_0/rx_mwr_latency/ [kernel PMU event]
+  hisi_pcie0_0/rx_mwr_cnt/ [kernel PMU event]
+  ------------------------------------------
+
+  $# perf stat -e hisi_pcie0_0/rx_mwr_latency/
+  $# perf stat -e hisi_pcie0_0/rx_mwr_cnt/
+  $# perf stat -g -e hisi_pcie0_0/rx_mwr_latency/ -e hisi_pcie0_0/rx_mwr_cnt/
+
+The current driver does not support sampling. So "perf record" is unsupported.
+Also attach to a task is unsupported for PCIe PMU.
+
+Filter options
+--------------
+
+1. Target filter
+PMU could only monitor the performance of traffic downstream target Root Ports
+or downstream target Endpoint. PCIe PMU driver support "port" and "bdf"
+interfaces for users, and these two interfaces aren't supported at the same
+time.
+
+-port
+"port" filter can be used in all PCIe PMU events, target Root Port can be
+selected by configuring the 16-bits-bitmap "port". Multi ports can be selected
+for AP-layer-events, and only one port can be selected for TL/DL-layer-events.
+
+For example, if target Root Port is 0000:00:00.0 (x8 lanes), bit0 of bitmap
+should be set, port=0x1; if target Root Port is 0000:00:04.0 (x4 lanes),
+bit8 is set, port=0x100; if these two Root Ports are both monitored, port=0x101.
+
+Example usage of perf::
+
+  $# perf stat -e hisi_pcie0_0/rx_mwr_latency,port=0x1/ sleep 5
+
+-bdf
+
+"bdf" filter can only be used in bandwidth events, target Endpoint is selected
+by configuring BDF to "bdf". Counter only counts the bandwidth of message
+requested by target Endpoint.
+
+For example, "bdf=0x3900" means BDF of target Endpoint is 0000:39:00.0.
+
+Example usage of perf::
+
+  $# perf stat -e hisi_pcie0_0/rx_mrd_flux,bdf=0x3900/ sleep 5
+
+2. Trigger filter
+Event statistics start when the first time TLP length is greater/smaller
+than trigger condition. You can set the trigger condition by writing "trig_len",
+and set the trigger mode by writing "trig_mode". This filter can only be used
+in bandwidth events.
+
+For example, "trig_len=4" means trigger condition is 2^4 DW, "trig_mode=0"
+means statistics start when TLP length > trigger condition, "trig_mode=1"
+means start when TLP length < condition.
+
+Example usage of perf::
+
+  $# perf stat -e hisi_pcie0_0/rx_mrd_flux,trig_len=0x4,trig_mode=1/ sleep 5
+
+3. Threshold filter
+Counter counts when TLP length within the specified range. You can set the
+threshold by writing "thr_len", and set the threshold mode by writing
+"thr_mode". This filter can only be used in bandwidth events.
+
+For example, "thr_len=4" means threshold is 2^4 DW, "thr_mode=0" means
+counter counts when TLP length >= threshold, and "thr_mode=1" means counts
+when TLP length < threshold.
+
+Example usage of perf::
+
+  $# perf stat -e hisi_pcie0_0/rx_mrd_flux,thr_len=0x4,thr_mode=1/ sleep 5

From 8404b0fbc7fbd42e5c5d28cdedd450e70829c77a Mon Sep 17 00:00:00 2001
From: Qi Liu <liuqi115@huawei.com>
Date: Thu, 2 Dec 2021 16:06:33 +0800
Subject: [PATCH 50/81] drivers/perf: hisi: Add driver for HiSilicon PCIe PMU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

PCIe PMU Root Complex Integrated End Point(RCiEP) device is supported
to sample bandwidth, latency, buffer occupation etc.

Each PMU RCiEP device monitors multiple Root Ports, and each RCiEP is
registered as a PMU in /sys/bus/event_source/devices, so users can
select target PMU, and use filter to do further sets.

Filtering options contains:
event     - select the event.
port      - select target Root Ports. Information of Root Ports are
            shown under sysfs.
bdf       - select requester_id of target EP device.
trig_len  - set trigger condition for starting event statistics.
trig_mode - set trigger mode. 0 means starting to statistic when bigger
            than trigger condition, and 1 means smaller.
thr_len   - set threshold for statistics.
thr_mode  - set threshold mode. 0 means count when bigger than threshold,
            and 1 means smaller.

Acked-by: Krzysztof Wilczyński <kw@linux.com>
Reviewed-by: John Garry <john.garry@huawei.com>
Signed-off-by: Qi Liu <liuqi115@huawei.com>
Reviewed-by: Shaokun Zhang <zhangshaokun@hisilicon.com>
Link: https://lore.kernel.org/r/20211202080633.2919-3-liuqi115@huawei.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 MAINTAINERS                            |   2 +
 drivers/perf/hisilicon/Kconfig         |   9 +
 drivers/perf/hisilicon/Makefile        |   2 +
 drivers/perf/hisilicon/hisi_pcie_pmu.c | 948 +++++++++++++++++++++++++
 include/linux/cpuhotplug.h             |   1 +
 5 files changed, 962 insertions(+)
 create mode 100644 drivers/perf/hisilicon/hisi_pcie_pmu.c

diff --git a/MAINTAINERS b/MAINTAINERS
index 360e9aa0205d..90a7d7b21982 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -8607,8 +8607,10 @@ F:	drivers/misc/hisi_hikey_usb.c
 
 HISILICON PMU DRIVER
 M:	Shaokun Zhang <zhangshaokun@hisilicon.com>
+M:	Qi Liu <liuqi115@huawei.com>
 S:	Supported
 W:	http://www.hisilicon.com
+F:	Documentation/admin-guide/perf/hisi-pcie-pmu.rst
 F:	Documentation/admin-guide/perf/hisi-pmu.rst
 F:	drivers/perf/hisilicon
 
diff --git a/drivers/perf/hisilicon/Kconfig b/drivers/perf/hisilicon/Kconfig
index c5d1b7019fff..5546218b5598 100644
--- a/drivers/perf/hisilicon/Kconfig
+++ b/drivers/perf/hisilicon/Kconfig
@@ -5,3 +5,12 @@ config HISI_PMU
 	  help
 	  Support for HiSilicon SoC L3 Cache performance monitor, Hydra Home
 	  Agent performance monitor and DDR Controller performance monitor.
+
+config HISI_PCIE_PMU
+	tristate "HiSilicon PCIE PERF PMU"
+	depends on PCI && ARM64
+	help
+	  Provide support for HiSilicon PCIe performance monitoring unit (PMU)
+	  RCiEP devices.
+	  Adds the PCIe PMU into perf events system for monitoring latency,
+	  bandwidth etc.
diff --git a/drivers/perf/hisilicon/Makefile b/drivers/perf/hisilicon/Makefile
index 7643c9f93e36..506ed39e3266 100644
--- a/drivers/perf/hisilicon/Makefile
+++ b/drivers/perf/hisilicon/Makefile
@@ -2,3 +2,5 @@
 obj-$(CONFIG_HISI_PMU) += hisi_uncore_pmu.o hisi_uncore_l3c_pmu.o \
 			  hisi_uncore_hha_pmu.o hisi_uncore_ddrc_pmu.o hisi_uncore_sllc_pmu.o \
 			  hisi_uncore_pa_pmu.o
+
+obj-$(CONFIG_HISI_PCIE_PMU) += hisi_pcie_pmu.o
diff --git a/drivers/perf/hisilicon/hisi_pcie_pmu.c b/drivers/perf/hisilicon/hisi_pcie_pmu.c
new file mode 100644
index 000000000000..21771708597d
--- /dev/null
+++ b/drivers/perf/hisilicon/hisi_pcie_pmu.c
@@ -0,0 +1,948 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * This driver adds support for PCIe PMU RCiEP device. Related
+ * perf events are bandwidth, latency etc.
+ *
+ * Copyright (C) 2021 HiSilicon Limited
+ * Author: Qi Liu <liuqi115@huawei.com>
+ */
+#include <linux/bitfield.h>
+#include <linux/bitmap.h>
+#include <linux/bug.h>
+#include <linux/device.h>
+#include <linux/err.h>
+#include <linux/interrupt.h>
+#include <linux/irq.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/pci.h>
+#include <linux/perf_event.h>
+
+#define DRV_NAME "hisi_pcie_pmu"
+/* Define registers */
+#define HISI_PCIE_GLOBAL_CTRL		0x00
+#define HISI_PCIE_EVENT_CTRL		0x010
+#define HISI_PCIE_CNT			0x090
+#define HISI_PCIE_EXT_CNT		0x110
+#define HISI_PCIE_INT_STAT		0x150
+#define HISI_PCIE_INT_MASK		0x154
+#define HISI_PCIE_REG_BDF		0xfe0
+#define HISI_PCIE_REG_VERSION		0xfe4
+#define HISI_PCIE_REG_INFO		0xfe8
+
+/* Define command in HISI_PCIE_GLOBAL_CTRL */
+#define HISI_PCIE_GLOBAL_EN		0x01
+#define HISI_PCIE_GLOBAL_NONE		0
+
+/* Define command in HISI_PCIE_EVENT_CTRL */
+#define HISI_PCIE_EVENT_EN		BIT_ULL(20)
+#define HISI_PCIE_RESET_CNT		BIT_ULL(22)
+#define HISI_PCIE_INIT_SET		BIT_ULL(34)
+#define HISI_PCIE_THR_EN		BIT_ULL(26)
+#define HISI_PCIE_TARGET_EN		BIT_ULL(32)
+#define HISI_PCIE_TRIG_EN		BIT_ULL(52)
+
+/* Define offsets in HISI_PCIE_EVENT_CTRL */
+#define HISI_PCIE_EVENT_M		GENMASK_ULL(15, 0)
+#define HISI_PCIE_THR_MODE_M		GENMASK_ULL(27, 27)
+#define HISI_PCIE_THR_M			GENMASK_ULL(31, 28)
+#define HISI_PCIE_TARGET_M		GENMASK_ULL(52, 36)
+#define HISI_PCIE_TRIG_MODE_M		GENMASK_ULL(53, 53)
+#define HISI_PCIE_TRIG_M		GENMASK_ULL(59, 56)
+
+#define HISI_PCIE_MAX_COUNTERS		8
+#define HISI_PCIE_REG_STEP		8
+#define HISI_PCIE_THR_MAX_VAL		10
+#define HISI_PCIE_TRIG_MAX_VAL		10
+#define HISI_PCIE_MAX_PERIOD		(GENMASK_ULL(63, 0))
+#define HISI_PCIE_INIT_VAL		BIT_ULL(63)
+
+struct hisi_pcie_pmu {
+	struct perf_event *hw_events[HISI_PCIE_MAX_COUNTERS];
+	struct hlist_node node;
+	struct pci_dev *pdev;
+	struct pmu pmu;
+	void __iomem *base;
+	int irq;
+	u32 identifier;
+	/* Minimum and maximum BDF of root ports monitored by PMU */
+	u16 bdf_min;
+	u16 bdf_max;
+	int on_cpu;
+};
+
+struct hisi_pcie_reg_pair {
+	u16 lo;
+	u16 hi;
+};
+
+#define to_pcie_pmu(p)  (container_of((p), struct hisi_pcie_pmu, pmu))
+#define GET_PCI_DEVFN(bdf)  ((bdf) & 0xff)
+
+#define HISI_PCIE_PMU_FILTER_ATTR(_name, _config, _hi, _lo)		  \
+	static u64 hisi_pcie_get_##_name(struct perf_event *event)	  \
+	{								  \
+		return FIELD_GET(GENMASK(_hi, _lo), event->attr._config); \
+	}								  \
+
+HISI_PCIE_PMU_FILTER_ATTR(event, config, 16, 0);
+HISI_PCIE_PMU_FILTER_ATTR(thr_len, config1, 3, 0);
+HISI_PCIE_PMU_FILTER_ATTR(thr_mode, config1, 4, 4);
+HISI_PCIE_PMU_FILTER_ATTR(trig_len, config1, 8, 5);
+HISI_PCIE_PMU_FILTER_ATTR(trig_mode, config1, 9, 9);
+HISI_PCIE_PMU_FILTER_ATTR(port, config2, 15, 0);
+HISI_PCIE_PMU_FILTER_ATTR(bdf, config2, 31, 16);
+
+static ssize_t hisi_pcie_format_sysfs_show(struct device *dev, struct device_attribute *attr,
+					   char *buf)
+{
+	struct dev_ext_attribute *eattr;
+
+	eattr = container_of(attr, struct dev_ext_attribute, attr);
+
+	return sysfs_emit(buf, "%s\n", (char *)eattr->var);
+}
+
+static ssize_t hisi_pcie_event_sysfs_show(struct device *dev, struct device_attribute *attr,
+					  char *buf)
+{
+	struct perf_pmu_events_attr *pmu_attr =
+		container_of(attr, struct perf_pmu_events_attr, attr);
+
+	return sysfs_emit(buf, "config=0x%llx\n", pmu_attr->id);
+}
+
+#define HISI_PCIE_PMU_FORMAT_ATTR(_name, _format)                              \
+	(&((struct dev_ext_attribute[]){                                       \
+		{ .attr = __ATTR(_name, 0444, hisi_pcie_format_sysfs_show,     \
+				 NULL),                                        \
+		  .var = (void *)_format }                                     \
+	})[0].attr.attr)
+
+#define HISI_PCIE_PMU_EVENT_ATTR(_name, _id)			\
+	PMU_EVENT_ATTR_ID(_name, hisi_pcie_event_sysfs_show, _id)
+
+static ssize_t cpumask_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return cpumap_print_to_pagebuf(true, buf, cpumask_of(pcie_pmu->on_cpu));
+}
+static DEVICE_ATTR_RO(cpumask);
+
+static ssize_t identifier_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%#x\n", pcie_pmu->identifier);
+}
+static DEVICE_ATTR_RO(identifier);
+
+static ssize_t bus_show(struct device *dev, struct device_attribute *attr, char *buf)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(dev_get_drvdata(dev));
+
+	return sysfs_emit(buf, "%#04x\n", PCI_BUS_NUM(pcie_pmu->bdf_min));
+}
+static DEVICE_ATTR_RO(bus);
+
+static struct hisi_pcie_reg_pair
+hisi_pcie_parse_reg_value(struct hisi_pcie_pmu *pcie_pmu, u32 reg_off)
+{
+	u32 val = readl_relaxed(pcie_pmu->base + reg_off);
+	struct hisi_pcie_reg_pair regs = {
+		.lo = val,
+		.hi = val >> 16,
+	};
+
+	return regs;
+}
+
+/*
+ * Hardware counter and ext_counter work together for bandwidth, latency, bus
+ * utilization and buffer occupancy events. For example, RX memory write latency
+ * events(index = 0x0010), counter counts total delay cycles and ext_counter
+ * counts RX memory write PCIe packets number.
+ *
+ * As we don't want PMU driver to process these two data, "delay cycles" can
+ * be treated as an independent event(index = 0x0010), "RX memory write packets
+ * number" as another(index = 0x10010). BIT 16 is used to distinguish and 0-15
+ * bits are "real" event index, which can be used to set HISI_PCIE_EVENT_CTRL.
+ */
+#define EXT_COUNTER_IS_USED(idx)		((idx) & BIT(16))
+
+static u32 hisi_pcie_get_real_event(struct perf_event *event)
+{
+	return hisi_pcie_get_event(event) & GENMASK(15, 0);
+}
+
+static u32 hisi_pcie_pmu_get_offset(u32 offset, u32 idx)
+{
+	return offset + HISI_PCIE_REG_STEP * idx;
+}
+
+static u32 hisi_pcie_pmu_readl(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset,
+			       u32 idx)
+{
+	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	return readl_relaxed(pcie_pmu->base + offset);
+}
+
+static void hisi_pcie_pmu_writel(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u32 val)
+{
+	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	writel_relaxed(val, pcie_pmu->base + offset);
+}
+
+static u64 hisi_pcie_pmu_readq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx)
+{
+	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	return readq_relaxed(pcie_pmu->base + offset);
+}
+
+static void hisi_pcie_pmu_writeq(struct hisi_pcie_pmu *pcie_pmu, u32 reg_offset, u32 idx, u64 val)
+{
+	u32 offset = hisi_pcie_pmu_get_offset(reg_offset, idx);
+
+	writeq_relaxed(val, pcie_pmu->base + offset);
+}
+
+static void hisi_pcie_pmu_config_filter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	u64 reg = HISI_PCIE_INIT_SET;
+	u64 port, trig_len, thr_len;
+
+	/* Config HISI_PCIE_EVENT_CTRL according to event. */
+	reg |= FIELD_PREP(HISI_PCIE_EVENT_M, hisi_pcie_get_real_event(event));
+
+	/* Config HISI_PCIE_EVENT_CTRL according to root port or EP device. */
+	port = hisi_pcie_get_port(event);
+	if (port)
+		reg |= FIELD_PREP(HISI_PCIE_TARGET_M, port);
+	else
+		reg |= HISI_PCIE_TARGET_EN |
+		       FIELD_PREP(HISI_PCIE_TARGET_M, hisi_pcie_get_bdf(event));
+
+	/* Config HISI_PCIE_EVENT_CTRL according to trigger condition. */
+	trig_len = hisi_pcie_get_trig_len(event);
+	if (trig_len) {
+		reg |= FIELD_PREP(HISI_PCIE_TRIG_M, trig_len);
+		reg |= FIELD_PREP(HISI_PCIE_TRIG_MODE_M, hisi_pcie_get_trig_mode(event));
+		reg |= HISI_PCIE_TRIG_EN;
+	}
+
+	/* Config HISI_PCIE_EVENT_CTRL according to threshold condition. */
+	thr_len = hisi_pcie_get_thr_len(event);
+	if (thr_len) {
+		reg |= FIELD_PREP(HISI_PCIE_THR_M, thr_len);
+		reg |= FIELD_PREP(HISI_PCIE_THR_MODE_M, hisi_pcie_get_thr_mode(event));
+		reg |= HISI_PCIE_THR_EN;
+	}
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, reg);
+}
+
+static void hisi_pcie_pmu_clear_filter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, hwc->idx, HISI_PCIE_INIT_SET);
+}
+
+static bool hisi_pcie_pmu_valid_requester_id(struct hisi_pcie_pmu *pcie_pmu, u32 bdf)
+{
+	struct pci_dev *root_port, *pdev;
+	u16 rp_bdf;
+
+	pdev = pci_get_domain_bus_and_slot(pci_domain_nr(pcie_pmu->pdev->bus), PCI_BUS_NUM(bdf),
+					   GET_PCI_DEVFN(bdf));
+	if (!pdev)
+		return false;
+
+	root_port = pcie_find_root_port(pdev);
+	if (!root_port) {
+		pci_dev_put(pdev);
+		return false;
+	}
+
+	pci_dev_put(pdev);
+	rp_bdf = pci_dev_id(root_port);
+	return rp_bdf >= pcie_pmu->bdf_min && rp_bdf <= pcie_pmu->bdf_max;
+}
+
+static bool hisi_pcie_pmu_valid_filter(struct perf_event *event,
+				       struct hisi_pcie_pmu *pcie_pmu)
+{
+	u32 requester_id = hisi_pcie_get_bdf(event);
+
+	if (hisi_pcie_get_thr_len(event) > HISI_PCIE_THR_MAX_VAL)
+		return false;
+
+	if (hisi_pcie_get_trig_len(event) > HISI_PCIE_TRIG_MAX_VAL)
+		return false;
+
+	if (requester_id) {
+		if (!hisi_pcie_pmu_valid_requester_id(pcie_pmu, requester_id))
+			return false;
+	}
+
+	return true;
+}
+
+static bool hisi_pcie_pmu_cmp_event(struct perf_event *target,
+					struct perf_event *event)
+{
+	return hisi_pcie_get_real_event(target) == hisi_pcie_get_real_event(event);
+}
+
+static bool hisi_pcie_pmu_validate_event_group(struct perf_event *event)
+{
+	struct perf_event *sibling, *leader = event->group_leader;
+	struct perf_event *event_group[HISI_PCIE_MAX_COUNTERS];
+	int counters = 1;
+	int num;
+
+	event_group[0] = leader;
+	if (!is_software_event(leader)) {
+		if (leader->pmu != event->pmu)
+			return false;
+
+		if (leader != event && !hisi_pcie_pmu_cmp_event(leader, event))
+			event_group[counters++] = event;
+	}
+
+	for_each_sibling_event(sibling, event->group_leader) {
+		if (is_software_event(sibling))
+			continue;
+
+		if (sibling->pmu != event->pmu)
+			return false;
+
+		for (num = 0; num < counters; num++) {
+			if (hisi_pcie_pmu_cmp_event(event_group[num], sibling))
+				break;
+		}
+
+		if (num == counters)
+			event_group[counters++] = sibling;
+	}
+
+	return counters <= HISI_PCIE_MAX_COUNTERS;
+}
+
+static int hisi_pcie_pmu_event_init(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	event->cpu = pcie_pmu->on_cpu;
+
+	if (EXT_COUNTER_IS_USED(hisi_pcie_get_event(event)))
+		hwc->event_base = HISI_PCIE_EXT_CNT;
+	else
+		hwc->event_base = HISI_PCIE_CNT;
+
+	if (event->attr.type != event->pmu->type)
+		return -ENOENT;
+
+	/* Sampling is not supported. */
+	if (is_sampling_event(event) || event->attach_state & PERF_ATTACH_TASK)
+		return -EOPNOTSUPP;
+
+	if (!hisi_pcie_pmu_valid_filter(event, pcie_pmu))
+		return -EINVAL;
+
+	if (!hisi_pcie_pmu_validate_event_group(event))
+		return -EINVAL;
+
+	return 0;
+}
+
+static u64 hisi_pcie_pmu_read_counter(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	u32 idx = event->hw.idx;
+
+	return hisi_pcie_pmu_readq(pcie_pmu, event->hw.event_base, idx);
+}
+
+static int hisi_pcie_pmu_find_related_event(struct hisi_pcie_pmu *pcie_pmu,
+					    struct perf_event *event)
+{
+	struct perf_event *sibling;
+	int idx;
+
+	for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) {
+		sibling = pcie_pmu->hw_events[idx];
+		if (!sibling)
+			continue;
+
+		if (!hisi_pcie_pmu_cmp_event(sibling, event))
+			continue;
+
+		/* Related events must be used in group */
+		if (sibling->group_leader == event->group_leader)
+			return idx;
+		else
+			return -EINVAL;
+	}
+
+	return idx;
+}
+
+static int hisi_pcie_pmu_get_event_idx(struct hisi_pcie_pmu *pcie_pmu)
+{
+	int idx;
+
+	for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) {
+		if (!pcie_pmu->hw_events[idx])
+			return idx;
+	}
+
+	return -EINVAL;
+}
+
+static void hisi_pcie_pmu_event_update(struct perf_event *event)
+{
+	struct hw_perf_event *hwc = &event->hw;
+	u64 new_cnt, prev_cnt, delta;
+
+	do {
+		prev_cnt = local64_read(&hwc->prev_count);
+		new_cnt = hisi_pcie_pmu_read_counter(event);
+	} while (local64_cmpxchg(&hwc->prev_count, prev_cnt,
+				 new_cnt) != prev_cnt);
+
+	delta = (new_cnt - prev_cnt) & HISI_PCIE_MAX_PERIOD;
+	local64_add(delta, &event->count);
+}
+
+static void hisi_pcie_pmu_read(struct perf_event *event)
+{
+	hisi_pcie_pmu_event_update(event);
+}
+
+static void hisi_pcie_pmu_set_period(struct perf_event *event)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+
+	local64_set(&hwc->prev_count, HISI_PCIE_INIT_VAL);
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_CNT, idx, HISI_PCIE_INIT_VAL);
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EXT_CNT, idx, HISI_PCIE_INIT_VAL);
+}
+
+static void hisi_pcie_pmu_enable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u64 val;
+
+	val = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx);
+	val |= HISI_PCIE_EVENT_EN;
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, val);
+}
+
+static void hisi_pcie_pmu_disable_counter(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+	u64 val;
+
+	val = hisi_pcie_pmu_readq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx);
+	val &= ~HISI_PCIE_EVENT_EN;
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, val);
+}
+
+static void hisi_pcie_pmu_enable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+
+	hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, idx, 0);
+}
+
+static void hisi_pcie_pmu_disable_int(struct hisi_pcie_pmu *pcie_pmu, struct hw_perf_event *hwc)
+{
+	u32 idx = hwc->idx;
+
+	hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_MASK, idx, 1);
+}
+
+static void hisi_pcie_pmu_reset_counter(struct hisi_pcie_pmu *pcie_pmu, int idx)
+{
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_RESET_CNT);
+	hisi_pcie_pmu_writeq(pcie_pmu, HISI_PCIE_EVENT_CTRL, idx, HISI_PCIE_INIT_SET);
+}
+
+static void hisi_pcie_pmu_start(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx = hwc->idx;
+	u64 prev_cnt;
+
+	if (WARN_ON_ONCE(!(hwc->state & PERF_HES_STOPPED)))
+		return;
+
+	WARN_ON_ONCE(!(hwc->state & PERF_HES_UPTODATE));
+	hwc->state = 0;
+
+	hisi_pcie_pmu_config_filter(event);
+	hisi_pcie_pmu_enable_counter(pcie_pmu, hwc);
+	hisi_pcie_pmu_enable_int(pcie_pmu, hwc);
+	hisi_pcie_pmu_set_period(event);
+
+	if (flags & PERF_EF_RELOAD) {
+		prev_cnt = local64_read(&hwc->prev_count);
+		hisi_pcie_pmu_writeq(pcie_pmu, hwc->event_base, idx, prev_cnt);
+	}
+
+	perf_event_update_userpage(event);
+}
+
+static void hisi_pcie_pmu_stop(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	hisi_pcie_pmu_event_update(event);
+	hisi_pcie_pmu_disable_int(pcie_pmu, hwc);
+	hisi_pcie_pmu_disable_counter(pcie_pmu, hwc);
+	hisi_pcie_pmu_clear_filter(event);
+	WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
+	hwc->state |= PERF_HES_STOPPED;
+
+	if (hwc->state & PERF_HES_UPTODATE)
+		return;
+
+	hwc->state |= PERF_HES_UPTODATE;
+}
+
+static int hisi_pcie_pmu_add(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+	int idx;
+
+	hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE;
+
+	/* Check all working events to find a related event. */
+	idx = hisi_pcie_pmu_find_related_event(pcie_pmu, event);
+	if (idx < 0)
+		return idx;
+
+	/* Current event shares an enabled counter with the related event */
+	if (idx < HISI_PCIE_MAX_COUNTERS) {
+		hwc->idx = idx;
+		goto start_count;
+	}
+
+	idx = hisi_pcie_pmu_get_event_idx(pcie_pmu);
+	if (idx < 0)
+		return idx;
+
+	hwc->idx = idx;
+	pcie_pmu->hw_events[idx] = event;
+	/* Reset Counter to avoid previous statistic interference. */
+	hisi_pcie_pmu_reset_counter(pcie_pmu, idx);
+
+start_count:
+	if (flags & PERF_EF_START)
+		hisi_pcie_pmu_start(event, PERF_EF_RELOAD);
+
+	return 0;
+}
+
+static void hisi_pcie_pmu_del(struct perf_event *event, int flags)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(event->pmu);
+	struct hw_perf_event *hwc = &event->hw;
+
+	hisi_pcie_pmu_stop(event, PERF_EF_UPDATE);
+	pcie_pmu->hw_events[hwc->idx] = NULL;
+	perf_event_update_userpage(event);
+}
+
+static void hisi_pcie_pmu_enable(struct pmu *pmu)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+	int num;
+
+	for (num = 0; num < HISI_PCIE_MAX_COUNTERS; num++) {
+		if (pcie_pmu->hw_events[num])
+			break;
+	}
+
+	if (num == HISI_PCIE_MAX_COUNTERS)
+		return;
+
+	writel(HISI_PCIE_GLOBAL_EN, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL);
+}
+
+static void hisi_pcie_pmu_disable(struct pmu *pmu)
+{
+	struct hisi_pcie_pmu *pcie_pmu = to_pcie_pmu(pmu);
+
+	writel(HISI_PCIE_GLOBAL_NONE, pcie_pmu->base + HISI_PCIE_GLOBAL_CTRL);
+}
+
+static irqreturn_t hisi_pcie_pmu_irq(int irq, void *data)
+{
+	struct hisi_pcie_pmu *pcie_pmu = data;
+	irqreturn_t ret = IRQ_NONE;
+	struct perf_event *event;
+	u32 overflown;
+	int idx;
+
+	for (idx = 0; idx < HISI_PCIE_MAX_COUNTERS; idx++) {
+		overflown = hisi_pcie_pmu_readl(pcie_pmu, HISI_PCIE_INT_STAT, idx);
+		if (!overflown)
+			continue;
+
+		/* Clear status of interrupt. */
+		hisi_pcie_pmu_writel(pcie_pmu, HISI_PCIE_INT_STAT, idx, 1);
+		event = pcie_pmu->hw_events[idx];
+		if (!event)
+			continue;
+
+		hisi_pcie_pmu_event_update(event);
+		hisi_pcie_pmu_set_period(event);
+		ret = IRQ_HANDLED;
+	}
+
+	return ret;
+}
+
+static int hisi_pcie_pmu_irq_register(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	int irq, ret;
+
+	ret = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_MSI);
+	if (ret < 0) {
+		pci_err(pdev, "Failed to enable MSI vectors: %d\n", ret);
+		return ret;
+	}
+
+	irq = pci_irq_vector(pdev, 0);
+	ret = request_irq(irq, hisi_pcie_pmu_irq, IRQF_NOBALANCING | IRQF_NO_THREAD, DRV_NAME,
+			  pcie_pmu);
+	if (ret) {
+		pci_err(pdev, "Failed to register IRQ: %d\n", ret);
+		pci_free_irq_vectors(pdev);
+		return ret;
+	}
+
+	pcie_pmu->irq = irq;
+
+	return 0;
+}
+
+static void hisi_pcie_pmu_irq_unregister(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	free_irq(pcie_pmu->irq, pcie_pmu);
+	pci_free_irq_vectors(pdev);
+}
+
+static int hisi_pcie_pmu_online_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
+
+	if (pcie_pmu->on_cpu == -1) {
+		pcie_pmu->on_cpu = cpu;
+		WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(cpu)));
+	}
+
+	return 0;
+}
+
+static int hisi_pcie_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node)
+{
+	struct hisi_pcie_pmu *pcie_pmu = hlist_entry_safe(node, struct hisi_pcie_pmu, node);
+	unsigned int target;
+
+	/* Nothing to do if this CPU doesn't own the PMU */
+	if (pcie_pmu->on_cpu != cpu)
+		return 0;
+
+	pcie_pmu->on_cpu = -1;
+	/* Choose a new CPU from all online cpus. */
+	target = cpumask_first(cpu_online_mask);
+	if (target >= nr_cpu_ids) {
+		pci_err(pcie_pmu->pdev, "There is no CPU to set\n");
+		return 0;
+	}
+
+	perf_pmu_migrate_context(&pcie_pmu->pmu, cpu, target);
+	/* Use this CPU for event counting */
+	pcie_pmu->on_cpu = target;
+	WARN_ON(irq_set_affinity(pcie_pmu->irq, cpumask_of(target)));
+
+	return 0;
+}
+
+static struct attribute *hisi_pcie_pmu_events_attr[] = {
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_latency, 0x0010),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mwr_cnt, 0x10010),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_latency, 0x0210),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_cnt, 0x10210),
+	HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_latency, 0x0011),
+	HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_cnt, 0x10011),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_flux, 0x1005),
+	HISI_PCIE_PMU_EVENT_ATTR(rx_mrd_time, 0x11005),
+	HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_flux, 0x2004),
+	HISI_PCIE_PMU_EVENT_ATTR(tx_mrd_time, 0x12004),
+	NULL
+};
+
+static struct attribute_group hisi_pcie_pmu_events_group = {
+	.name = "events",
+	.attrs = hisi_pcie_pmu_events_attr,
+};
+
+static struct attribute *hisi_pcie_pmu_format_attr[] = {
+	HISI_PCIE_PMU_FORMAT_ATTR(event, "config:0-16"),
+	HISI_PCIE_PMU_FORMAT_ATTR(thr_len, "config1:0-3"),
+	HISI_PCIE_PMU_FORMAT_ATTR(thr_mode, "config1:4"),
+	HISI_PCIE_PMU_FORMAT_ATTR(trig_len, "config1:5-8"),
+	HISI_PCIE_PMU_FORMAT_ATTR(trig_mode, "config1:9"),
+	HISI_PCIE_PMU_FORMAT_ATTR(port, "config2:0-15"),
+	HISI_PCIE_PMU_FORMAT_ATTR(bdf, "config2:16-31"),
+	NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_format_group = {
+	.name = "format",
+	.attrs = hisi_pcie_pmu_format_attr,
+};
+
+static struct attribute *hisi_pcie_pmu_bus_attrs[] = {
+	&dev_attr_bus.attr,
+	NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_bus_attr_group = {
+	.attrs = hisi_pcie_pmu_bus_attrs,
+};
+
+static struct attribute *hisi_pcie_pmu_cpumask_attrs[] = {
+	&dev_attr_cpumask.attr,
+	NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_cpumask_attr_group = {
+	.attrs = hisi_pcie_pmu_cpumask_attrs,
+};
+
+static struct attribute *hisi_pcie_pmu_identifier_attrs[] = {
+	&dev_attr_identifier.attr,
+	NULL
+};
+
+static const struct attribute_group hisi_pcie_pmu_identifier_attr_group = {
+	.attrs = hisi_pcie_pmu_identifier_attrs,
+};
+
+static const struct attribute_group *hisi_pcie_pmu_attr_groups[] = {
+	&hisi_pcie_pmu_events_group,
+	&hisi_pcie_pmu_format_group,
+	&hisi_pcie_pmu_bus_attr_group,
+	&hisi_pcie_pmu_cpumask_attr_group,
+	&hisi_pcie_pmu_identifier_attr_group,
+	NULL
+};
+
+static int hisi_pcie_alloc_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	struct hisi_pcie_reg_pair regs;
+	u16 sicl_id, core_id;
+	char *name;
+
+	regs = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_BDF);
+	pcie_pmu->bdf_min = regs.lo;
+	pcie_pmu->bdf_max = regs.hi;
+
+	regs = hisi_pcie_parse_reg_value(pcie_pmu, HISI_PCIE_REG_INFO);
+	sicl_id = regs.hi;
+	core_id = regs.lo;
+
+	name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "hisi_pcie%u_core%u", sicl_id, core_id);
+	if (!name)
+		return -ENOMEM;
+
+	pcie_pmu->pdev = pdev;
+	pcie_pmu->on_cpu = -1;
+	pcie_pmu->identifier = readl(pcie_pmu->base + HISI_PCIE_REG_VERSION);
+	pcie_pmu->pmu = (struct pmu) {
+		.name		= name,
+		.module		= THIS_MODULE,
+		.event_init	= hisi_pcie_pmu_event_init,
+		.pmu_enable	= hisi_pcie_pmu_enable,
+		.pmu_disable	= hisi_pcie_pmu_disable,
+		.add		= hisi_pcie_pmu_add,
+		.del		= hisi_pcie_pmu_del,
+		.start		= hisi_pcie_pmu_start,
+		.stop		= hisi_pcie_pmu_stop,
+		.read		= hisi_pcie_pmu_read,
+		.task_ctx_nr	= perf_invalid_context,
+		.attr_groups	= hisi_pcie_pmu_attr_groups,
+		.capabilities	= PERF_PMU_CAP_NO_EXCLUDE,
+	};
+
+	return 0;
+}
+
+static int hisi_pcie_init_pmu(struct pci_dev *pdev, struct hisi_pcie_pmu *pcie_pmu)
+{
+	int ret;
+
+	pcie_pmu->base = pci_ioremap_bar(pdev, 2);
+	if (!pcie_pmu->base) {
+		pci_err(pdev, "Ioremap failed for pcie_pmu resource\n");
+		return -ENOMEM;
+	}
+
+	ret = hisi_pcie_alloc_pmu(pdev, pcie_pmu);
+	if (ret)
+		goto err_iounmap;
+
+	ret = hisi_pcie_pmu_irq_register(pdev, pcie_pmu);
+	if (ret)
+		goto err_iounmap;
+
+	ret = cpuhp_state_add_instance(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+	if (ret) {
+		pci_err(pdev, "Failed to register hotplug: %d\n", ret);
+		goto err_irq_unregister;
+	}
+
+	ret = perf_pmu_register(&pcie_pmu->pmu, pcie_pmu->pmu.name, -1);
+	if (ret) {
+		pci_err(pdev, "Failed to register PCIe PMU: %d\n", ret);
+		goto err_hotplug_unregister;
+	}
+
+	return ret;
+
+err_hotplug_unregister:
+	cpuhp_state_remove_instance_nocalls(
+		CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+
+err_irq_unregister:
+	hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu);
+
+err_iounmap:
+	iounmap(pcie_pmu->base);
+
+	return ret;
+}
+
+static void hisi_pcie_uninit_pmu(struct pci_dev *pdev)
+{
+	struct hisi_pcie_pmu *pcie_pmu = pci_get_drvdata(pdev);
+
+	perf_pmu_unregister(&pcie_pmu->pmu);
+	cpuhp_state_remove_instance_nocalls(
+		CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE, &pcie_pmu->node);
+	hisi_pcie_pmu_irq_unregister(pdev, pcie_pmu);
+	iounmap(pcie_pmu->base);
+}
+
+static int hisi_pcie_init_dev(struct pci_dev *pdev)
+{
+	int ret;
+
+	ret = pcim_enable_device(pdev);
+	if (ret) {
+		pci_err(pdev, "Failed to enable PCI device: %d\n", ret);
+		return ret;
+	}
+
+	ret = pcim_iomap_regions(pdev, BIT(2), DRV_NAME);
+	if (ret < 0) {
+		pci_err(pdev, "Failed to request PCI mem regions: %d\n", ret);
+		return ret;
+	}
+
+	pci_set_master(pdev);
+
+	return 0;
+}
+
+static int hisi_pcie_pmu_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+{
+	struct hisi_pcie_pmu *pcie_pmu;
+	int ret;
+
+	pcie_pmu = devm_kzalloc(&pdev->dev, sizeof(*pcie_pmu), GFP_KERNEL);
+	if (!pcie_pmu)
+		return -ENOMEM;
+
+	ret = hisi_pcie_init_dev(pdev);
+	if (ret)
+		return ret;
+
+	ret = hisi_pcie_init_pmu(pdev, pcie_pmu);
+	if (ret)
+		return ret;
+
+	pci_set_drvdata(pdev, pcie_pmu);
+
+	return ret;
+}
+
+static void hisi_pcie_pmu_remove(struct pci_dev *pdev)
+{
+	hisi_pcie_uninit_pmu(pdev);
+	pci_set_drvdata(pdev, NULL);
+}
+
+static const struct pci_device_id hisi_pcie_pmu_ids[] = {
+	{ PCI_DEVICE(PCI_VENDOR_ID_HUAWEI, 0xa12d) },
+	{ 0, }
+};
+MODULE_DEVICE_TABLE(pci, hisi_pcie_pmu_ids);
+
+static struct pci_driver hisi_pcie_pmu_driver = {
+	.name = DRV_NAME,
+	.id_table = hisi_pcie_pmu_ids,
+	.probe = hisi_pcie_pmu_probe,
+	.remove = hisi_pcie_pmu_remove,
+};
+
+static int __init hisi_pcie_module_init(void)
+{
+	int ret;
+
+	ret = cpuhp_setup_state_multi(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
+				      "AP_PERF_ARM_HISI_PCIE_PMU_ONLINE",
+				      hisi_pcie_pmu_online_cpu,
+				      hisi_pcie_pmu_offline_cpu);
+	if (ret) {
+		pr_err("Failed to setup PCIe PMU hotplug: %d\n", ret);
+		return ret;
+	}
+
+	ret = pci_register_driver(&hisi_pcie_pmu_driver);
+	if (ret)
+		cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE);
+
+	return ret;
+}
+module_init(hisi_pcie_module_init);
+
+static void __exit hisi_pcie_module_exit(void)
+{
+	pci_unregister_driver(&hisi_pcie_pmu_driver);
+	cpuhp_remove_multi_state(CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE);
+}
+module_exit(hisi_pcie_module_exit);
+
+MODULE_DESCRIPTION("HiSilicon PCIe PMU driver");
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Qi Liu <liuqi115@huawei.com>");
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 773c83730906..411a428ace4d 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -225,6 +225,7 @@ enum cpuhp_state {
 	CPUHP_AP_PERF_ARM_HISI_L3_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_PA_ONLINE,
 	CPUHP_AP_PERF_ARM_HISI_SLLC_ONLINE,
+	CPUHP_AP_PERF_ARM_HISI_PCIE_PMU_ONLINE,
 	CPUHP_AP_PERF_ARM_L2X0_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L2_ONLINE,
 	CPUHP_AP_PERF_ARM_QCOM_L3_ONLINE,

From 8e6082e94aac6d0338883b5953631b662a5a9188 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 10 Dec 2021 15:14:06 +0000
Subject: [PATCH 51/81] arm64: atomics: format whitespace consistently

The code for the atomic ops is formatted inconsistently, and while this
is not a functional problem it is rather distracting when working on
them.

Some have ops have consistent indentation, e.g.

| #define ATOMIC_OP_ADD_RETURN(name, mb, cl...)                           \
| static inline int __lse_atomic_add_return##name(int i, atomic_t *v)     \
| {                                                                       \
|         u32 tmp;                                                        \
|                                                                         \
|         asm volatile(                                                   \
|         __LSE_PREAMBLE                                                  \
|         "       ldadd" #mb "    %w[i], %w[tmp], %[v]\n"                 \
|         "       add     %w[i], %w[i], %w[tmp]"                          \
|         : [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)        \
|         : "r" (v)                                                       \
|         : cl);                                                          \
|                                                                         \
|         return i;                                                       \
| }

While others have negative indentation for some lines, and/or have
misaligned trailing backslashes, e.g.

| static inline void __lse_atomic_##op(int i, atomic_t *v)                        \
| {                                                                       \
|         asm volatile(                                                   \
|         __LSE_PREAMBLE                                                  \
| "       " #asm_op "     %w[i], %[v]\n"                                  \
|         : [i] "+r" (i), [v] "+Q" (v->counter)                           \
|         : "r" (v));                                                     \
| }

This patch makes the indentation consistent and also aligns the trailing
backslashes. This makes the code easier to read for those (like myself)
who are easily distracted by these inconsistencies.

This is intended as a cleanup.
There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211210151410.2782645-2-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/atomic_ll_sc.h | 86 +++++++++++++--------------
 arch/arm64/include/asm/atomic_lse.h   | 14 ++---
 2 files changed, 50 insertions(+), 50 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_ll_sc.h b/arch/arm64/include/asm/atomic_ll_sc.h
index 13869b76b58c..fe0db8d416fb 100644
--- a/arch/arm64/include/asm/atomic_ll_sc.h
+++ b/arch/arm64/include/asm/atomic_ll_sc.h
@@ -44,11 +44,11 @@ __ll_sc_atomic_##op(int i, atomic_t *v)					\
 									\
 	asm volatile("// atomic_" #op "\n"				\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ldxr	%w0, %2\n"						\
-"	" #asm_op "	%w0, %w0, %w3\n"				\
-"	stxr	%w1, %w0, %2\n"						\
-"	cbnz	%w1, 1b\n")						\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ldxr	%w0, %2\n"					\
+	"	" #asm_op "	%w0, %w0, %w3\n"			\
+	"	stxr	%w1, %w0, %2\n"					\
+	"	cbnz	%w1, 1b\n")					\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i));				\
 }
@@ -62,12 +62,12 @@ __ll_sc_atomic_##op##_return##name(int i, atomic_t *v)			\
 									\
 	asm volatile("// atomic_" #op "_return" #name "\n"		\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ld" #acq "xr	%w0, %2\n"					\
-"	" #asm_op "	%w0, %w0, %w3\n"				\
-"	st" #rel "xr	%w1, %w0, %2\n"					\
-"	cbnz	%w1, 1b\n"						\
-"	" #mb )								\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ld" #acq "xr	%w0, %2\n"				\
+	"	" #asm_op "	%w0, %w0, %w3\n"			\
+	"	st" #rel "xr	%w1, %w0, %2\n"				\
+	"	cbnz	%w1, 1b\n"					\
+	"	" #mb )							\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -84,12 +84,12 @@ __ll_sc_atomic_fetch_##op##name(int i, atomic_t *v)			\
 									\
 	asm volatile("// atomic_fetch_" #op #name "\n"			\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %3\n"					\
-"1:	ld" #acq "xr	%w0, %3\n"					\
-"	" #asm_op "	%w1, %w0, %w4\n"				\
-"	st" #rel "xr	%w2, %w1, %3\n"					\
-"	cbnz	%w2, 1b\n"						\
-"	" #mb )								\
+	"	prfm	pstl1strm, %3\n"				\
+	"1:	ld" #acq "xr	%w0, %3\n"				\
+	"	" #asm_op "	%w1, %w0, %w4\n"			\
+	"	st" #rel "xr	%w2, %w1, %3\n"				\
+	"	cbnz	%w2, 1b\n"					\
+	"	" #mb )							\
 	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -143,11 +143,11 @@ __ll_sc_atomic64_##op(s64 i, atomic64_t *v)				\
 									\
 	asm volatile("// atomic64_" #op "\n"				\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ldxr	%0, %2\n"						\
-"	" #asm_op "	%0, %0, %3\n"					\
-"	stxr	%w1, %0, %2\n"						\
-"	cbnz	%w1, 1b")						\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ldxr	%0, %2\n"					\
+	"	" #asm_op "	%0, %0, %3\n"				\
+	"	stxr	%w1, %0, %2\n"					\
+	"	cbnz	%w1, 1b")					\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i));				\
 }
@@ -161,12 +161,12 @@ __ll_sc_atomic64_##op##_return##name(s64 i, atomic64_t *v)		\
 									\
 	asm volatile("// atomic64_" #op "_return" #name "\n"		\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %2\n"					\
-"1:	ld" #acq "xr	%0, %2\n"					\
-"	" #asm_op "	%0, %0, %3\n"					\
-"	st" #rel "xr	%w1, %0, %2\n"					\
-"	cbnz	%w1, 1b\n"						\
-"	" #mb )								\
+	"	prfm	pstl1strm, %2\n"				\
+	"1:	ld" #acq "xr	%0, %2\n"				\
+	"	" #asm_op "	%0, %0, %3\n"				\
+	"	st" #rel "xr	%w1, %0, %2\n"				\
+	"	cbnz	%w1, 1b\n"					\
+	"	" #mb )							\
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)		\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -176,19 +176,19 @@ __ll_sc_atomic64_##op##_return##name(s64 i, atomic64_t *v)		\
 
 #define ATOMIC64_FETCH_OP(name, mb, acq, rel, cl, op, asm_op, constraint)\
 static inline long							\
-__ll_sc_atomic64_fetch_##op##name(s64 i, atomic64_t *v)		\
+__ll_sc_atomic64_fetch_##op##name(s64 i, atomic64_t *v)			\
 {									\
 	s64 result, val;						\
 	unsigned long tmp;						\
 									\
 	asm volatile("// atomic64_fetch_" #op #name "\n"		\
 	__LL_SC_FALLBACK(						\
-"	prfm	pstl1strm, %3\n"					\
-"1:	ld" #acq "xr	%0, %3\n"					\
-"	" #asm_op "	%1, %0, %4\n"					\
-"	st" #rel "xr	%w2, %1, %3\n"					\
-"	cbnz	%w2, 1b\n"						\
-"	" #mb )								\
+	"	prfm	pstl1strm, %3\n"				\
+	"1:	ld" #acq "xr	%0, %3\n"				\
+	"	" #asm_op "	%1, %0, %4\n"				\
+	"	st" #rel "xr	%w2, %1, %3\n"				\
+	"	cbnz	%w2, 1b\n"					\
+	"	" #mb )							\
 	: "=&r" (result), "=&r" (val), "=&r" (tmp), "+Q" (v->counter)	\
 	: __stringify(constraint) "r" (i)				\
 	: cl);								\
@@ -241,14 +241,14 @@ __ll_sc_atomic64_dec_if_positive(atomic64_t *v)
 
 	asm volatile("// atomic64_dec_if_positive\n"
 	__LL_SC_FALLBACK(
-"	prfm	pstl1strm, %2\n"
-"1:	ldxr	%0, %2\n"
-"	subs	%0, %0, #1\n"
-"	b.lt	2f\n"
-"	stlxr	%w1, %0, %2\n"
-"	cbnz	%w1, 1b\n"
-"	dmb	ish\n"
-"2:")
+	"	prfm	pstl1strm, %2\n"
+	"1:	ldxr	%0, %2\n"
+	"	subs	%0, %0, #1\n"
+	"	b.lt	2f\n"
+	"	stlxr	%w1, %0, %2\n"
+	"	cbnz	%w1, 1b\n"
+	"	dmb	ish\n"
+	"2:")
 	: "=&r" (result), "=&r" (tmp), "+Q" (v->counter)
 	:
 	: "cc", "memory");
diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index da3280f639cd..ab661375835e 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -11,11 +11,11 @@
 #define __ASM_ATOMIC_LSE_H
 
 #define ATOMIC_OP(op, asm_op)						\
-static inline void __lse_atomic_##op(int i, atomic_t *v)			\
+static inline void __lse_atomic_##op(int i, atomic_t *v)		\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-"	" #asm_op "	%w[i], %[v]\n"					\
+	"	" #asm_op "	%w[i], %[v]\n"				\
 	: [i] "+r" (i), [v] "+Q" (v->counter)				\
 	: "r" (v));							\
 }
@@ -32,7 +32,7 @@ static inline int __lse_atomic_fetch_##op##name(int i, atomic_t *v)	\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-"	" #asm_op #mb "	%w[i], %w[i], %[v]"				\
+	"	" #asm_op #mb "	%w[i], %w[i], %[v]"			\
 	: [i] "+r" (i), [v] "+Q" (v->counter)				\
 	: "r" (v)							\
 	: cl);								\
@@ -130,7 +130,7 @@ static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
 	"	add	%w[i], %w[i], %w[tmp]"				\
 	: [i] "+&r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
 	: "r" (v)							\
-	: cl);							\
+	: cl);								\
 									\
 	return i;							\
 }
@@ -168,7 +168,7 @@ static inline void __lse_atomic64_##op(s64 i, atomic64_t *v)		\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-"	" #asm_op "	%[i], %[v]\n"					\
+	"	" #asm_op "	%[i], %[v]\n"				\
 	: [i] "+r" (i), [v] "+Q" (v->counter)				\
 	: "r" (v));							\
 }
@@ -185,7 +185,7 @@ static inline long __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v)\
 {									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-"	" #asm_op #mb "	%[i], %[i], %[v]"				\
+	"	" #asm_op #mb "	%[i], %[i], %[v]"			\
 	: [i] "+r" (i), [v] "+Q" (v->counter)				\
 	: "r" (v)							\
 	: cl);								\
@@ -272,7 +272,7 @@ static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
 }
 
 #define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)				\
-static inline long __lse_atomic64_sub_return##name(s64 i, atomic64_t *v)	\
+static inline long __lse_atomic64_sub_return##name(s64 i, atomic64_t *v)\
 {									\
 	unsigned long tmp;						\
 									\

From ef53245060989d1cbe4b72ba56e94f217f4fd1ee Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 10 Dec 2021 15:14:07 +0000
Subject: [PATCH 52/81] arm64: atomics lse: define SUBs in terms of ADDs

The FEAT_LSE atomic instructions include atomic ADD instructions
(`stadd*` and `ldadd*`), but do not include atomic SUB instructions, so
we must build all of the SUB operations using the ADD instructions. We
open-code these today, with each SUB op implemented as a copy of the
corresponding ADD op with a leading `neg` instruction in the inline
assembly to negate the `i` argument.

As the compiler has no visibility of the `neg`, this leads to less than
optimal code generation when generating `i` into a register. For
example, __les_atomic_fetch_sub(1, v) can be compiled to:

	mov     w1, #0x1
	neg     w1, w1
	ldaddal w1, w1, [x2]

This patch improves this by replacing the `neg` with negation in C
before the inline assembly block, e.g.

	i = -i;

This allows the compiler to generate `i` into a register more optimally,
e.g.

	mov     w1, #0xffffffff
	ldaddal w1, w1, [x2]

With this change the assembly for each SUB op is identical to the
corresponding ADD op (including barriers and clobbers), so I've removed
the inline assembly and rewritten each SUB op in terms of the
corresponding ADD op, e.g.

| static inline void __lse_atomic_sub(int i, atomic_t *v)
| {
| 	__lse_atomic_add(-i, v);
| }

For clarity I've moved the definition of each SUB op immediately after
the corresponding ADD op, and used a single macro to create the RETURN
forms of both ops.

This is intended as an optimization and cleanup.
There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211210151410.2782645-3-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/atomic_lse.h | 180 +++++++++-------------------
 1 file changed, 58 insertions(+), 122 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index ab661375835e..7454febb6d77 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -25,6 +25,11 @@ ATOMIC_OP(or, stset)
 ATOMIC_OP(xor, steor)
 ATOMIC_OP(add, stadd)
 
+static inline void __lse_atomic_sub(int i, atomic_t *v)
+{
+	__lse_atomic_add(-i, v);
+}
+
 #undef ATOMIC_OP
 
 #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...)			\
@@ -54,7 +59,20 @@ ATOMIC_FETCH_OPS(add, ldadd)
 #undef ATOMIC_FETCH_OP
 #undef ATOMIC_FETCH_OPS
 
-#define ATOMIC_OP_ADD_RETURN(name, mb, cl...)				\
+#define ATOMIC_FETCH_OP_SUB(name)					\
+static inline int __lse_atomic_fetch_sub##name(int i, atomic_t *v)	\
+{									\
+	return __lse_atomic_fetch_add##name(-i, v);			\
+}
+
+ATOMIC_FETCH_OP_SUB(_relaxed)
+ATOMIC_FETCH_OP_SUB(_acquire)
+ATOMIC_FETCH_OP_SUB(_release)
+ATOMIC_FETCH_OP_SUB(        )
+
+#undef ATOMIC_FETCH_OP_SUB
+
+#define ATOMIC_OP_ADD_SUB_RETURN(name, mb, cl...)			\
 static inline int __lse_atomic_add_return##name(int i, atomic_t *v)	\
 {									\
 	u32 tmp;							\
@@ -68,14 +86,19 @@ static inline int __lse_atomic_add_return##name(int i, atomic_t *v)	\
 	: cl);								\
 									\
 	return i;							\
+}									\
+									\
+static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
+{									\
+	return __lse_atomic_add_return##name(-i, v);			\
 }
 
-ATOMIC_OP_ADD_RETURN(_relaxed,   )
-ATOMIC_OP_ADD_RETURN(_acquire,  a, "memory")
-ATOMIC_OP_ADD_RETURN(_release,  l, "memory")
-ATOMIC_OP_ADD_RETURN(        , al, "memory")
+ATOMIC_OP_ADD_SUB_RETURN(_relaxed,   )
+ATOMIC_OP_ADD_SUB_RETURN(_acquire,  a, "memory")
+ATOMIC_OP_ADD_SUB_RETURN(_release,  l, "memory")
+ATOMIC_OP_ADD_SUB_RETURN(        , al, "memory")
 
-#undef ATOMIC_OP_ADD_RETURN
+#undef ATOMIC_OP_ADD_SUB_RETURN
 
 static inline void __lse_atomic_and(int i, atomic_t *v)
 {
@@ -108,61 +131,6 @@ ATOMIC_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC_FETCH_OP_AND
 
-static inline void __lse_atomic_sub(int i, atomic_t *v)
-{
-	asm volatile(
-	__LSE_PREAMBLE
-	"	neg	%w[i], %w[i]\n"
-	"	stadd	%w[i], %[v]"
-	: [i] "+&r" (i), [v] "+Q" (v->counter)
-	: "r" (v));
-}
-
-#define ATOMIC_OP_SUB_RETURN(name, mb, cl...)				\
-static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
-{									\
-	u32 tmp;							\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	neg	%w[i], %w[i]\n"					\
-	"	ldadd" #mb "	%w[i], %w[tmp], %[v]\n"			\
-	"	add	%w[i], %w[i], %w[tmp]"				\
-	: [i] "+&r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
-}
-
-ATOMIC_OP_SUB_RETURN(_relaxed,   )
-ATOMIC_OP_SUB_RETURN(_acquire,  a, "memory")
-ATOMIC_OP_SUB_RETURN(_release,  l, "memory")
-ATOMIC_OP_SUB_RETURN(        , al, "memory")
-
-#undef ATOMIC_OP_SUB_RETURN
-
-#define ATOMIC_FETCH_OP_SUB(name, mb, cl...)				\
-static inline int __lse_atomic_fetch_sub##name(int i, atomic_t *v)	\
-{									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	neg	%w[i], %w[i]\n"					\
-	"	ldadd" #mb "	%w[i], %w[i], %[v]"			\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
-}
-
-ATOMIC_FETCH_OP_SUB(_relaxed,   )
-ATOMIC_FETCH_OP_SUB(_acquire,  a, "memory")
-ATOMIC_FETCH_OP_SUB(_release,  l, "memory")
-ATOMIC_FETCH_OP_SUB(        , al, "memory")
-
-#undef ATOMIC_FETCH_OP_SUB
-
 #define ATOMIC64_OP(op, asm_op)						\
 static inline void __lse_atomic64_##op(s64 i, atomic64_t *v)		\
 {									\
@@ -178,6 +146,11 @@ ATOMIC64_OP(or, stset)
 ATOMIC64_OP(xor, steor)
 ATOMIC64_OP(add, stadd)
 
+static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
+{
+	__lse_atomic64_add(-i, v);
+}
+
 #undef ATOMIC64_OP
 
 #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...)			\
@@ -207,7 +180,20 @@ ATOMIC64_FETCH_OPS(add, ldadd)
 #undef ATOMIC64_FETCH_OP
 #undef ATOMIC64_FETCH_OPS
 
-#define ATOMIC64_OP_ADD_RETURN(name, mb, cl...)				\
+#define ATOMIC64_FETCH_OP_SUB(name)					\
+static inline long __lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v)	\
+{									\
+	return __lse_atomic64_fetch_add##name(-i, v);			\
+}
+
+ATOMIC64_FETCH_OP_SUB(_relaxed)
+ATOMIC64_FETCH_OP_SUB(_acquire)
+ATOMIC64_FETCH_OP_SUB(_release)
+ATOMIC64_FETCH_OP_SUB(        )
+
+#undef ATOMIC64_FETCH_OP_SUB
+
+#define ATOMIC64_OP_ADD_SUB_RETURN(name, mb, cl...)			\
 static inline long __lse_atomic64_add_return##name(s64 i, atomic64_t *v)\
 {									\
 	unsigned long tmp;						\
@@ -221,14 +207,19 @@ static inline long __lse_atomic64_add_return##name(s64 i, atomic64_t *v)\
 	: cl);								\
 									\
 	return i;							\
+}									\
+									\
+static inline long __lse_atomic64_sub_return##name(s64 i, atomic64_t *v)\
+{									\
+	return __lse_atomic64_add_return##name(-i, v);			\
 }
 
-ATOMIC64_OP_ADD_RETURN(_relaxed,   )
-ATOMIC64_OP_ADD_RETURN(_acquire,  a, "memory")
-ATOMIC64_OP_ADD_RETURN(_release,  l, "memory")
-ATOMIC64_OP_ADD_RETURN(        , al, "memory")
+ATOMIC64_OP_ADD_SUB_RETURN(_relaxed,   )
+ATOMIC64_OP_ADD_SUB_RETURN(_acquire,  a, "memory")
+ATOMIC64_OP_ADD_SUB_RETURN(_release,  l, "memory")
+ATOMIC64_OP_ADD_SUB_RETURN(        , al, "memory")
 
-#undef ATOMIC64_OP_ADD_RETURN
+#undef ATOMIC64_OP_ADD_SUB_RETURN
 
 static inline void __lse_atomic64_and(s64 i, atomic64_t *v)
 {
@@ -261,61 +252,6 @@ ATOMIC64_FETCH_OP_AND(        , al, "memory")
 
 #undef ATOMIC64_FETCH_OP_AND
 
-static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
-{
-	asm volatile(
-	__LSE_PREAMBLE
-	"	neg	%[i], %[i]\n"
-	"	stadd	%[i], %[v]"
-	: [i] "+&r" (i), [v] "+Q" (v->counter)
-	: "r" (v));
-}
-
-#define ATOMIC64_OP_SUB_RETURN(name, mb, cl...)				\
-static inline long __lse_atomic64_sub_return##name(s64 i, atomic64_t *v)\
-{									\
-	unsigned long tmp;						\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	neg	%[i], %[i]\n"					\
-	"	ldadd" #mb "	%[i], %x[tmp], %[v]\n"			\
-	"	add	%[i], %[i], %x[tmp]"				\
-	: [i] "+&r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
-}
-
-ATOMIC64_OP_SUB_RETURN(_relaxed,   )
-ATOMIC64_OP_SUB_RETURN(_acquire,  a, "memory")
-ATOMIC64_OP_SUB_RETURN(_release,  l, "memory")
-ATOMIC64_OP_SUB_RETURN(        , al, "memory")
-
-#undef ATOMIC64_OP_SUB_RETURN
-
-#define ATOMIC64_FETCH_OP_SUB(name, mb, cl...)				\
-static inline long __lse_atomic64_fetch_sub##name(s64 i, atomic64_t *v)	\
-{									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	neg	%[i], %[i]\n"					\
-	"	ldadd" #mb "	%[i], %[i], %[v]"			\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
-}
-
-ATOMIC64_FETCH_OP_SUB(_relaxed,   )
-ATOMIC64_FETCH_OP_SUB(_acquire,  a, "memory")
-ATOMIC64_FETCH_OP_SUB(_release,  l, "memory")
-ATOMIC64_FETCH_OP_SUB(        , al, "memory")
-
-#undef ATOMIC64_FETCH_OP_SUB
-
 static inline s64 __lse_atomic64_dec_if_positive(atomic64_t *v)
 {
 	unsigned long tmp;

From 5e9e43c987b2614078b795d68eae6598eea3067a Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 10 Dec 2021 15:14:08 +0000
Subject: [PATCH 53/81] arm64: atomics: lse: define ANDs in terms of ANDNOTs

The FEAT_LSE atomic instructions include atomic bit-clear instructions
(`ldclr*` and `stclr*`) which can be used to directly implement ANDNOT
operations. Each AND op is implemented as a copy of the corresponding
ANDNOT op with a leading `mvn` instruction to apply a bitwise NOT to the
`i` argument.

As the compiler has no visibility of the `mvn`, this leads to less than
optimal code generation when generating `i` into a register. For
example, __lse_atomic_fetch_and(0xf, v) can be compiled to:

	mov     w1, #0xf
	mvn     w1, w1
	ldclral w1, w1, [x2]

This patch improves this by replacing the `mvn` with NOT in C before the
inline assembly block, e.g.

	i = ~i;

This allows the compiler to generate `i` into a register more optimally,
e.g.

	mov     w1, #0xfffffff0
	ldclral w1, w1, [x2]

With this change the assembly for each AND op is identical to the
corresponding ANDNOT op (including barriers and clobbers), so I've
removed the inline assembly and rewritten each AND op in terms of the
corresponding ANDNOT op, e.g.

| static inline void __lse_atomic_and(int i, atomic_t *v)
| {
| 	return __lse_atomic_andnot(~i, v);
| }

This is intended as an optimization and cleanup.
There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211210151410.2782645-4-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/atomic_lse.h | 34 ++++-------------------------
 1 file changed, 4 insertions(+), 30 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index 7454febb6d77..d707eafb7677 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -102,26 +102,13 @@ ATOMIC_OP_ADD_SUB_RETURN(        , al, "memory")
 
 static inline void __lse_atomic_and(int i, atomic_t *v)
 {
-	asm volatile(
-	__LSE_PREAMBLE
-	"	mvn	%w[i], %w[i]\n"
-	"	stclr	%w[i], %[v]"
-	: [i] "+&r" (i), [v] "+Q" (v->counter)
-	: "r" (v));
+	return __lse_atomic_andnot(~i, v);
 }
 
 #define ATOMIC_FETCH_OP_AND(name, mb, cl...)				\
 static inline int __lse_atomic_fetch_and##name(int i, atomic_t *v)	\
 {									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	mvn	%w[i], %w[i]\n"					\
-	"	ldclr" #mb "	%w[i], %w[i], %[v]"			\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
+	return __lse_atomic_fetch_andnot##name(~i, v);			\
 }
 
 ATOMIC_FETCH_OP_AND(_relaxed,   )
@@ -223,26 +210,13 @@ ATOMIC64_OP_ADD_SUB_RETURN(        , al, "memory")
 
 static inline void __lse_atomic64_and(s64 i, atomic64_t *v)
 {
-	asm volatile(
-	__LSE_PREAMBLE
-	"	mvn	%[i], %[i]\n"
-	"	stclr	%[i], %[v]"
-	: [i] "+&r" (i), [v] "+Q" (v->counter)
-	: "r" (v));
+	return __lse_atomic64_andnot(~i, v);
 }
 
 #define ATOMIC64_FETCH_OP_AND(name, mb, cl...)				\
 static inline long __lse_atomic64_fetch_and##name(s64 i, atomic64_t *v)	\
 {									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	mvn	%[i], %[i]\n"					\
-	"	ldclr" #mb "	%[i], %[i], %[v]"			\
-	: [i] "+&r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
+	return __lse_atomic64_fetch_andnot##name(~i, v);		\
 }
 
 ATOMIC64_FETCH_OP_AND(_relaxed,   )

From 8a578a759ad61b13240190a12cdfa8845c5fa949 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 10 Dec 2021 15:14:09 +0000
Subject: [PATCH 54/81] arm64: atomics: lse: improve constraints for simple ops

We have overly conservative assembly constraints for the basic FEAT_LSE
atomic instructions, and using more accurate and permissive constraints
will allow for better code generation.

The FEAT_LSE basic atomic instructions have come in two forms:

	LD{op}{order}{size} <Rs>, <Rt>, [<Rn>]
	ST{op}{order}{size} <Rs>, [<Rn>]

The ST* forms are aliases of the LD* forms where:

	ST{op}{order}{size} <Rs>, [<Rn>]
Is:
	LD{op}{order}{size} <Rs>, XZR, [<Rn>]

For either form, both <Rs> and <Rn> are read but not written back to,
and <Rt> is written with the original value of the memory location.
Where (<Rt> == <Rs>) or (<Rt> == <Rn>), <Rt> is written *after* the
other register value(s) are consumed. There are no UNPREDICTABLE or
CONSTRAINED UNPREDICTABLE behaviours when any pair of <Rs>, <Rt>, or
<Rn> are the same register.

Our current inline assembly always uses <Rs> == <Rt>, treating this
register as both an input and an output (using a '+r' constraint). This
forces the compiler to do some unnecessary register shuffling and/or
redundant value generation.

For example, the compiler cannot reuse the <Rs> value, and currently GCC
11.1.0 will compile:

	__lse_atomic_add(1, a);
	__lse_atomic_add(1, b);
	__lse_atomic_add(1, c);

As:

	mov     w3, #0x1
	mov     w4, w3
	stadd   w4, [x0]
	mov     w0, w3
	stadd   w0, [x1]
	stadd   w3, [x2]

We can improve this with more accurate constraints, separating <Rs> and
<Rt>, where <Rs> is an input-only register ('r'), and <Rt> is an
output-only value ('=r'). As <Rt> is written back after <Rs> is
consumed, it does not need to be earlyclobber ('=&r'), leaving the
compiler free to use the same register for both <Rs> and <Rt> where this
is desirable.

At the same time, the redundant 'r' constraint for `v` is removed, as
the `+Q` constraint is sufficient.

With this change, the above example becomes:

	mov     w3, #0x1
	stadd   w3, [x0]
	stadd   w3, [x1]
	stadd   w3, [x2]

I've made this change for the non-value-returning and FETCH ops. The
RETURN ops have a multi-instruction sequence for which we cannot use the
same constraints, and a subsequent patch will rewrite hte RETURN ops in
terms of the FETCH ops, relying on the ability for the compiler to reuse
the <Rs> value.

This is intended as an optimization.
There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211210151410.2782645-5-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/atomic_lse.h | 30 +++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index d707eafb7677..e4c5c4c34ce6 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -16,8 +16,8 @@ static inline void __lse_atomic_##op(int i, atomic_t *v)		\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
 	"	" #asm_op "	%w[i], %[v]\n"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v));							\
+	: [v] "+Q" (v->counter)						\
+	: [i] "r" (i));							\
 }
 
 ATOMIC_OP(andnot, stclr)
@@ -35,14 +35,17 @@ static inline void __lse_atomic_sub(int i, atomic_t *v)
 #define ATOMIC_FETCH_OP(name, mb, op, asm_op, cl...)			\
 static inline int __lse_atomic_fetch_##op##name(int i, atomic_t *v)	\
 {									\
+	int old;							\
+									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-	"	" #asm_op #mb "	%w[i], %w[i], %[v]"			\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
+	"	" #asm_op #mb "	%w[i], %w[old], %[v]"			\
+	: [v] "+Q" (v->counter),					\
+	  [old] "=r" (old)						\
+	: [i] "r" (i)							\
 	: cl);								\
 									\
-	return i;							\
+	return old;							\
 }
 
 #define ATOMIC_FETCH_OPS(op, asm_op)					\
@@ -124,8 +127,8 @@ static inline void __lse_atomic64_##op(s64 i, atomic64_t *v)		\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
 	"	" #asm_op "	%[i], %[v]\n"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v));							\
+	: [v] "+Q" (v->counter)						\
+	: [i] "r" (i));							\
 }
 
 ATOMIC64_OP(andnot, stclr)
@@ -143,14 +146,17 @@ static inline void __lse_atomic64_sub(s64 i, atomic64_t *v)
 #define ATOMIC64_FETCH_OP(name, mb, op, asm_op, cl...)			\
 static inline long __lse_atomic64_fetch_##op##name(s64 i, atomic64_t *v)\
 {									\
+	s64 old;							\
+									\
 	asm volatile(							\
 	__LSE_PREAMBLE							\
-	"	" #asm_op #mb "	%[i], %[i], %[v]"			\
-	: [i] "+r" (i), [v] "+Q" (v->counter)				\
-	: "r" (v)							\
+	"	" #asm_op #mb "	%[i], %[old], %[v]"			\
+	: [v] "+Q" (v->counter),					\
+	  [old] "=r" (old)						\
+	: [i] "r" (i) 							\
 	: cl);								\
 									\
-	return i;							\
+	return old;							\
 }
 
 #define ATOMIC64_FETCH_OPS(op, asm_op)					\

From 053f58bab331e6c31c486661fdb5f6bad5f413de Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Fri, 10 Dec 2021 15:14:10 +0000
Subject: [PATCH 55/81] arm64: atomics: lse: define RETURN ops in terms of
 FETCH ops

The FEAT_LSE atomic instructions include LD* instructions which return
the original value of a memory location can be used to directly
implement FETCH opertations. Each RETURN op is implemented as a copy of
the corresponding FETCH op with a trailing instruction to generate the
new value of the memory location. We only directly implement
*_fetch_add*(), for which we have a trailing `add` instruction.

As the compiler has no visibility of the `add`, this leads to less than
optimal code generation when consuming the result.

For example, the compiler cannot constant-fold the addition into later
operations, and currently GCC 11.1.0 will compile:

       return __lse_atomic_sub_return(1, v) == 0;

As:

	mov     w1, #0xffffffff
	ldaddal w1, w2, [x0]
	add     w1, w1, w2
	cmp     w1, #0x0
	cset    w0, eq  // eq = none
	ret

This patch improves this by replacing the `add` with C addition after
the inline assembly block, e.g.

	ret += i;

This allows the compiler to manipulate `i`. This permits the compiler to
merge the `add` and `cmp` for the above, e.g.

	mov     w1, #0xffffffff
	ldaddal w1, w1, [x0]
	cmp     w1, #0x1
	cset    w0, eq  // eq = none
	ret

With this change the assembly for each RETURN op is identical to the
corresponding FETCH op (including barriers and clobbers) so I've removed
the inline assembly and rewritten each RETURN op in terms of the
corresponding FETCH op, e.g.

| static inline void __lse_atomic_add_return(int i, atomic_t *v)
| {
|       return __lse_atomic_fetch_add(i, v) + i
| }

The new construction does not adversely affect the common case, and
before and after this patch GCC 11.1.0 can compile:

	__lse_atomic_add_return(i, v)

As:

	ldaddal w0, w2, [x1]
	add     w0, w0, w2

... while having the freedom to do better elsewhere.

This is intended as an optimization and cleanup.
There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Boqun Feng <boqun.feng@gmail.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Will Deacon <will@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Link: https://lore.kernel.org/r/20211210151410.2782645-6-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/atomic_lse.h | 48 +++++++++--------------------
 1 file changed, 14 insertions(+), 34 deletions(-)

diff --git a/arch/arm64/include/asm/atomic_lse.h b/arch/arm64/include/asm/atomic_lse.h
index e4c5c4c34ce6..d955ade5df7c 100644
--- a/arch/arm64/include/asm/atomic_lse.h
+++ b/arch/arm64/include/asm/atomic_lse.h
@@ -75,31 +75,21 @@ ATOMIC_FETCH_OP_SUB(        )
 
 #undef ATOMIC_FETCH_OP_SUB
 
-#define ATOMIC_OP_ADD_SUB_RETURN(name, mb, cl...)			\
+#define ATOMIC_OP_ADD_SUB_RETURN(name)					\
 static inline int __lse_atomic_add_return##name(int i, atomic_t *v)	\
 {									\
-	u32 tmp;							\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	ldadd" #mb "	%w[i], %w[tmp], %[v]\n"			\
-	"	add	%w[i], %w[i], %w[tmp]"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
+	return __lse_atomic_fetch_add##name(i, v) + i;			\
 }									\
 									\
 static inline int __lse_atomic_sub_return##name(int i, atomic_t *v)	\
 {									\
-	return __lse_atomic_add_return##name(-i, v);			\
+	return __lse_atomic_fetch_sub(i, v) - i;			\
 }
 
-ATOMIC_OP_ADD_SUB_RETURN(_relaxed,   )
-ATOMIC_OP_ADD_SUB_RETURN(_acquire,  a, "memory")
-ATOMIC_OP_ADD_SUB_RETURN(_release,  l, "memory")
-ATOMIC_OP_ADD_SUB_RETURN(        , al, "memory")
+ATOMIC_OP_ADD_SUB_RETURN(_relaxed)
+ATOMIC_OP_ADD_SUB_RETURN(_acquire)
+ATOMIC_OP_ADD_SUB_RETURN(_release)
+ATOMIC_OP_ADD_SUB_RETURN(        )
 
 #undef ATOMIC_OP_ADD_SUB_RETURN
 
@@ -186,31 +176,21 @@ ATOMIC64_FETCH_OP_SUB(        )
 
 #undef ATOMIC64_FETCH_OP_SUB
 
-#define ATOMIC64_OP_ADD_SUB_RETURN(name, mb, cl...)			\
+#define ATOMIC64_OP_ADD_SUB_RETURN(name)				\
 static inline long __lse_atomic64_add_return##name(s64 i, atomic64_t *v)\
 {									\
-	unsigned long tmp;						\
-									\
-	asm volatile(							\
-	__LSE_PREAMBLE							\
-	"	ldadd" #mb "	%[i], %x[tmp], %[v]\n"			\
-	"	add	%[i], %[i], %x[tmp]"				\
-	: [i] "+r" (i), [v] "+Q" (v->counter), [tmp] "=&r" (tmp)	\
-	: "r" (v)							\
-	: cl);								\
-									\
-	return i;							\
+	return __lse_atomic64_fetch_add##name(i, v) + i;		\
 }									\
 									\
 static inline long __lse_atomic64_sub_return##name(s64 i, atomic64_t *v)\
 {									\
-	return __lse_atomic64_add_return##name(-i, v);			\
+	return __lse_atomic64_fetch_sub##name(i, v) - i;		\
 }
 
-ATOMIC64_OP_ADD_SUB_RETURN(_relaxed,   )
-ATOMIC64_OP_ADD_SUB_RETURN(_acquire,  a, "memory")
-ATOMIC64_OP_ADD_SUB_RETURN(_release,  l, "memory")
-ATOMIC64_OP_ADD_SUB_RETURN(        , al, "memory")
+ATOMIC64_OP_ADD_SUB_RETURN(_relaxed)
+ATOMIC64_OP_ADD_SUB_RETURN(_acquire)
+ATOMIC64_OP_ADD_SUB_RETURN(_release)
+ATOMIC64_OP_ADD_SUB_RETURN(        )
 
 #undef ATOMIC64_OP_ADD_SUB_RETURN
 

From d4c4844a9b47dc1c7e8ccce0cdd899602f5479fd Mon Sep 17 00:00:00 2001
From: Thierry Reding <treding@nvidia.com>
Date: Tue, 14 Dec 2021 14:16:13 +0000
Subject: [PATCH 56/81] arm64: perf: Support Denver and Carmel PMUs

Add support for the NVIDIA Denver and Carmel PMUs using the generic
PMUv3 event map for now.

Acked-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Thierry Reding <treding@nvidia.com>
[ rm: reorder entries alphabetically ]
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/5f0f69d47acca78a9e479501aa4d8b429e23cf11.1639490264.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index b4044469527e..035da52e04bf 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1247,6 +1247,18 @@ static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu)
 				       armv8_vulcan_map_event);
 }
 
+static int armv8_carmel_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_nvidia_carmel",
+				       armv8_pmuv3_map_event);
+}
+
+static int armv8_denver_pmu_init(struct arm_pmu *cpu_pmu)
+{
+	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_nvidia_denver",
+				       armv8_pmuv3_map_event);
+}
+
 static const struct of_device_id armv8_pmu_of_device_ids[] = {
 	{.compatible = "arm,armv8-pmuv3",	.data = armv8_pmuv3_init},
 	{.compatible = "arm,cortex-a34-pmu",	.data = armv8_a34_pmu_init},
@@ -1265,6 +1277,8 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = {
 	{.compatible = "arm,neoverse-n1-pmu",	.data = armv8_n1_pmu_init},
 	{.compatible = "cavium,thunder-pmu",	.data = armv8_thunder_pmu_init},
 	{.compatible = "brcm,vulcan-pmu",	.data = armv8_vulcan_pmu_init},
+	{.compatible = "nvidia,carmel-pmu",	.data = armv8_carmel_pmu_init},
+	{.compatible = "nvidia,denver-pmu",	.data = armv8_denver_pmu_init},
 	{},
 };
 

From 6ac9f30bd43baf9e05696e9a614fcd7e83c74afa Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 14 Dec 2021 14:16:14 +0000
Subject: [PATCH 57/81] arm64: perf: Simplify registration boilerplate

With the trend for per-core events moving to userspace JSON, registering
names for PMUv3 implementations is increasingly a pure boilerplate
exercise. Let's wrap things a step further so we can generate the basic
PMUv3 init function with a macro invocation, and reduce further new
addition to just 2 lines each.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/b79477ea3b97f685d00511d4ecd2f686184dca34.1639490264.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 113 +++++++++------------------------
 1 file changed, 31 insertions(+), 82 deletions(-)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 035da52e04bf..ed6d8cd2f88f 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1145,17 +1145,26 @@ static int armv8_pmu_init_nogroups(struct arm_pmu *cpu_pmu, char *name,
 	return armv8_pmu_init(cpu_pmu, name, map_event, NULL, NULL, NULL);
 }
 
-static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_pmuv3",
-				       armv8_pmuv3_map_event);
+#define PMUV3_INIT_SIMPLE(name)						\
+static int name##_pmu_init(struct arm_pmu *cpu_pmu)			\
+{									\
+	return armv8_pmu_init_nogroups(cpu_pmu, #name, armv8_pmuv3_map_event);\
 }
 
-static int armv8_a34_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a34",
-				       armv8_pmuv3_map_event);
-}
+PMUV3_INIT_SIMPLE(armv8_pmuv3)
+
+PMUV3_INIT_SIMPLE(armv8_cortex_a34)
+PMUV3_INIT_SIMPLE(armv8_cortex_a55)
+PMUV3_INIT_SIMPLE(armv8_cortex_a65)
+PMUV3_INIT_SIMPLE(armv8_cortex_a75)
+PMUV3_INIT_SIMPLE(armv8_cortex_a76)
+PMUV3_INIT_SIMPLE(armv8_cortex_a77)
+PMUV3_INIT_SIMPLE(armv8_cortex_a78)
+PMUV3_INIT_SIMPLE(armv8_neoverse_e1)
+PMUV3_INIT_SIMPLE(armv8_neoverse_n1)
+
+PMUV3_INIT_SIMPLE(armv8_nvidia_carmel)
+PMUV3_INIT_SIMPLE(armv8_nvidia_denver)
 
 static int armv8_a35_pmu_init(struct arm_pmu *cpu_pmu)
 {
@@ -1169,24 +1178,12 @@ static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu)
 				       armv8_a53_map_event);
 }
 
-static int armv8_a55_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a55",
-				       armv8_pmuv3_map_event);
-}
-
 static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a57",
 				       armv8_a57_map_event);
 }
 
-static int armv8_a65_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a65",
-				       armv8_pmuv3_map_event);
-}
-
 static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a72",
@@ -1199,42 +1196,6 @@ static int armv8_a73_pmu_init(struct arm_pmu *cpu_pmu)
 				       armv8_a73_map_event);
 }
 
-static int armv8_a75_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a75",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_a76_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a76",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_a77_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a77",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_a78_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cortex_a78",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_e1_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_e1",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_n1_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_neoverse_n1",
-				       armv8_pmuv3_map_event);
-}
-
 static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu)
 {
 	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_cavium_thunder",
@@ -1247,38 +1208,26 @@ static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu)
 				       armv8_vulcan_map_event);
 }
 
-static int armv8_carmel_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_nvidia_carmel",
-				       armv8_pmuv3_map_event);
-}
-
-static int armv8_denver_pmu_init(struct arm_pmu *cpu_pmu)
-{
-	return armv8_pmu_init_nogroups(cpu_pmu, "armv8_nvidia_denver",
-				       armv8_pmuv3_map_event);
-}
-
 static const struct of_device_id armv8_pmu_of_device_ids[] = {
-	{.compatible = "arm,armv8-pmuv3",	.data = armv8_pmuv3_init},
-	{.compatible = "arm,cortex-a34-pmu",	.data = armv8_a34_pmu_init},
+	{.compatible = "arm,armv8-pmuv3",	.data = armv8_pmuv3_pmu_init},
+	{.compatible = "arm,cortex-a34-pmu",	.data = armv8_cortex_a34_pmu_init},
 	{.compatible = "arm,cortex-a35-pmu",	.data = armv8_a35_pmu_init},
 	{.compatible = "arm,cortex-a53-pmu",	.data = armv8_a53_pmu_init},
-	{.compatible = "arm,cortex-a55-pmu",	.data = armv8_a55_pmu_init},
+	{.compatible = "arm,cortex-a55-pmu",	.data = armv8_cortex_a55_pmu_init},
 	{.compatible = "arm,cortex-a57-pmu",	.data = armv8_a57_pmu_init},
-	{.compatible = "arm,cortex-a65-pmu",	.data = armv8_a65_pmu_init},
+	{.compatible = "arm,cortex-a65-pmu",	.data = armv8_cortex_a65_pmu_init},
 	{.compatible = "arm,cortex-a72-pmu",	.data = armv8_a72_pmu_init},
 	{.compatible = "arm,cortex-a73-pmu",	.data = armv8_a73_pmu_init},
-	{.compatible = "arm,cortex-a75-pmu",	.data = armv8_a75_pmu_init},
-	{.compatible = "arm,cortex-a76-pmu",	.data = armv8_a76_pmu_init},
-	{.compatible = "arm,cortex-a77-pmu",	.data = armv8_a77_pmu_init},
-	{.compatible = "arm,cortex-a78-pmu",	.data = armv8_a78_pmu_init},
-	{.compatible = "arm,neoverse-e1-pmu",	.data = armv8_e1_pmu_init},
-	{.compatible = "arm,neoverse-n1-pmu",	.data = armv8_n1_pmu_init},
+	{.compatible = "arm,cortex-a75-pmu",	.data = armv8_cortex_a75_pmu_init},
+	{.compatible = "arm,cortex-a76-pmu",	.data = armv8_cortex_a76_pmu_init},
+	{.compatible = "arm,cortex-a77-pmu",	.data = armv8_cortex_a77_pmu_init},
+	{.compatible = "arm,cortex-a78-pmu",	.data = armv8_cortex_a78_pmu_init},
+	{.compatible = "arm,neoverse-e1-pmu",	.data = armv8_neoverse_e1_pmu_init},
+	{.compatible = "arm,neoverse-n1-pmu",	.data = armv8_neoverse_n1_pmu_init},
 	{.compatible = "cavium,thunder-pmu",	.data = armv8_thunder_pmu_init},
 	{.compatible = "brcm,vulcan-pmu",	.data = armv8_vulcan_pmu_init},
-	{.compatible = "nvidia,carmel-pmu",	.data = armv8_carmel_pmu_init},
-	{.compatible = "nvidia,denver-pmu",	.data = armv8_denver_pmu_init},
+	{.compatible = "nvidia,carmel-pmu",	.data = armv8_nvidia_carmel_pmu_init},
+	{.compatible = "nvidia,denver-pmu",	.data = armv8_nvidia_denver_pmu_init},
 	{},
 };
 
@@ -1301,7 +1250,7 @@ static int __init armv8_pmu_driver_init(void)
 	if (acpi_disabled)
 		return platform_driver_register(&armv8_pmu_driver);
 	else
-		return arm_pmu_acpi_probe(armv8_pmuv3_init);
+		return arm_pmu_acpi_probe(armv8_pmuv3_pmu_init);
 }
 device_initcall(armv8_pmu_driver_init)
 

From 893c34b60a59cd0bdb496084f5c096be720bd244 Mon Sep 17 00:00:00 2001
From: Robin Murphy <robin.murphy@arm.com>
Date: Tue, 14 Dec 2021 14:16:15 +0000
Subject: [PATCH 58/81] arm64: perf: Support new DT compatibles

Wire up the new DT compatibles so we can present appropriate
PMU names to userspace for the latest and greatest CPUs.

Signed-off-by: Robin Murphy <robin.murphy@arm.com>
Link: https://lore.kernel.org/r/62d14ba12d847ec7f1fba7cb0b3b881b437e1cc5.1639490264.git.robin.murphy@arm.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index ed6d8cd2f88f..4c1698f63f34 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1160,8 +1160,14 @@ PMUV3_INIT_SIMPLE(armv8_cortex_a75)
 PMUV3_INIT_SIMPLE(armv8_cortex_a76)
 PMUV3_INIT_SIMPLE(armv8_cortex_a77)
 PMUV3_INIT_SIMPLE(armv8_cortex_a78)
+PMUV3_INIT_SIMPLE(armv9_cortex_a510)
+PMUV3_INIT_SIMPLE(armv9_cortex_a710)
+PMUV3_INIT_SIMPLE(armv8_cortex_x1)
+PMUV3_INIT_SIMPLE(armv9_cortex_x2)
 PMUV3_INIT_SIMPLE(armv8_neoverse_e1)
 PMUV3_INIT_SIMPLE(armv8_neoverse_n1)
+PMUV3_INIT_SIMPLE(armv9_neoverse_n2)
+PMUV3_INIT_SIMPLE(armv8_neoverse_v1)
 
 PMUV3_INIT_SIMPLE(armv8_nvidia_carmel)
 PMUV3_INIT_SIMPLE(armv8_nvidia_denver)
@@ -1222,8 +1228,14 @@ static const struct of_device_id armv8_pmu_of_device_ids[] = {
 	{.compatible = "arm,cortex-a76-pmu",	.data = armv8_cortex_a76_pmu_init},
 	{.compatible = "arm,cortex-a77-pmu",	.data = armv8_cortex_a77_pmu_init},
 	{.compatible = "arm,cortex-a78-pmu",	.data = armv8_cortex_a78_pmu_init},
+	{.compatible = "arm,cortex-a510-pmu",	.data = armv9_cortex_a510_pmu_init},
+	{.compatible = "arm,cortex-a710-pmu",	.data = armv9_cortex_a710_pmu_init},
+	{.compatible = "arm,cortex-x1-pmu",	.data = armv8_cortex_x1_pmu_init},
+	{.compatible = "arm,cortex-x2-pmu",	.data = armv9_cortex_x2_pmu_init},
 	{.compatible = "arm,neoverse-e1-pmu",	.data = armv8_neoverse_e1_pmu_init},
 	{.compatible = "arm,neoverse-n1-pmu",	.data = armv8_neoverse_n1_pmu_init},
+	{.compatible = "arm,neoverse-n2-pmu",	.data = armv9_neoverse_n2_pmu_init},
+	{.compatible = "arm,neoverse-v1-pmu",	.data = armv8_neoverse_v1_pmu_init},
 	{.compatible = "cavium,thunder-pmu",	.data = armv8_thunder_pmu_init},
 	{.compatible = "brcm,vulcan-pmu",	.data = armv8_vulcan_pmu_init},
 	{.compatible = "nvidia,carmel-pmu",	.data = armv8_nvidia_carmel_pmu_init},

From 9be34be87cc8d1afe3c3bc2e645b4dee512d9eda Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 14 Dec 2021 15:27:12 +0000
Subject: [PATCH 59/81] arm64: Add macro version of the BTI instruction

BTI is only available from v8.5 so we need to encode it using HINT in
generic code and for older toolchains. Add an assembler macro based on
one written by Mark Rutland which lets us use the mnemonic and update
the existing users.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Will Deacon <will@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20211214152714.2380849-2-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/crypto/aes-modes.S      | 10 +++++-----
 arch/arm64/include/asm/assembler.h | 10 ++++++++++
 arch/arm64/include/asm/linkage.h   |  7 +------
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/arm64/crypto/aes-modes.S b/arch/arm64/crypto/aes-modes.S
index b495de22bb38..ff01f0167ba2 100644
--- a/arch/arm64/crypto/aes-modes.S
+++ b/arch/arm64/crypto/aes-modes.S
@@ -363,15 +363,15 @@ ST5(	mov		v4.16b, vctr.16b		)
 	adr		x16, 1f
 	sub		x16, x16, x12, lsl #3
 	br		x16
-	hint		34			// bti c
+	bti		c
 	mov		v0.d[0], vctr.d[0]
-	hint		34			// bti c
+	bti		c
 	mov		v1.d[0], vctr.d[0]
-	hint		34			// bti c
+	bti		c
 	mov		v2.d[0], vctr.d[0]
-	hint		34			// bti c
+	bti		c
 	mov		v3.d[0], vctr.d[0]
-ST5(	hint		34				)
+ST5(	bti		c				)
 ST5(	mov		v4.d[0], vctr.d[0]		)
 1:	b		2f
 	.previous
diff --git a/arch/arm64/include/asm/assembler.h b/arch/arm64/include/asm/assembler.h
index 136d13f3d6e9..e8bd0af0141c 100644
--- a/arch/arm64/include/asm/assembler.h
+++ b/arch/arm64/include/asm/assembler.h
@@ -790,6 +790,16 @@ alternative_endif
 .Lnoyield_\@:
 	.endm
 
+/*
+ * Branch Target Identifier (BTI)
+ */
+	.macro  bti, targets
+	.equ	.L__bti_targets_c, 34
+	.equ	.L__bti_targets_j, 36
+	.equ	.L__bti_targets_jc,38
+	hint	#.L__bti_targets_\targets
+	.endm
+
 /*
  * This macro emits a program property note section identifying
  * architecture features which require special handling, mainly for
diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index 9906541a6861..c5d0c11d7709 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -6,12 +6,7 @@
 
 #if defined(CONFIG_ARM64_BTI_KERNEL) && defined(__aarch64__)
 
-/*
- * Since current versions of gas reject the BTI instruction unless we
- * set the architecture version to v8.5 we use the hint instruction
- * instead.
- */
-#define BTI_C hint 34 ;
+#define BTI_C bti c ;
 
 /*
  * When using in-kernel BTI we need to ensure that PCS-conformant assembly

From 481ee45ce9e078715b4ca50fcaea518e3aee1aa7 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 14 Dec 2021 15:27:13 +0000
Subject: [PATCH 60/81] arm64: Unconditionally override SYM_FUNC macros

Currently we only override the SYM_FUNC macros when we need to insert
BTI C into them, do this unconditionally to make it more likely that we'll
notice bugs in our override.

Suggested-by: Mark Rutland <mark.rutland@arm.com>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Signed-off-by: Mark Brown <broonie@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20211214152714.2380849-3-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/linkage.h | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index c5d0c11d7709..1cfa8bb33edd 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -8,10 +8,18 @@
 
 #define BTI_C bti c ;
 
+#else
+
+#define BTI_C
+
+#endif
+
 /*
- * When using in-kernel BTI we need to ensure that PCS-conformant assembly
- * functions have suitable annotations.  Override SYM_FUNC_START to insert
- * a BTI landing pad at the start of everything.
+ * When using in-kernel BTI we need to ensure that PCS-conformant
+ * assembly functions have suitable annotations.  Override
+ * SYM_FUNC_START to insert a BTI landing pad at the start of
+ * everything, the override is done unconditionally so we're more
+ * likely to notice any drift from the overridden definitions.
  */
 #define SYM_FUNC_START(name)				\
 	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
@@ -37,8 +45,6 @@
 	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
 	BTI_C
 
-#endif
-
 /*
  * Annotate a function as position independent, i.e., safe to be called before
  * the kernel virtual mapping is activated.

From 742a15b1a23aa43bde2d9d681281ec1925be13fd Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 14 Dec 2021 15:27:14 +0000
Subject: [PATCH 61/81] arm64: Use BTI C directly and unconditionally

Now we have a macro for BTI C that looks like a regular instruction change
all the users of the current BTI_C macro to just emit a BTI C directly and
remove the macro.

This does mean that we now unconditionally BTI annotate all assembly
functions, meaning that they are worse in this respect than code generated
by the compiler. The overhead should be minimal for implementations with a
reasonable HINT implementation.

Signed-off-by: Mark Brown <broonie@kernel.org>
Reviewed-by: Ard Biesheuvel <ardb@kernel.org>
Acked-by: Mark Rutland <mark.rutland@arm.com>
Link: https://lore.kernel.org/r/20211214152714.2380849-4-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/linkage.h | 22 ++++++----------------
 arch/arm64/kernel/entry-ftrace.S |  8 ++------
 arch/arm64/lib/kasan_sw_tags.S   |  4 +---
 3 files changed, 9 insertions(+), 25 deletions(-)

diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index 1cfa8bb33edd..9065e4749b42 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -4,16 +4,6 @@
 #define __ALIGN		.align 2
 #define __ALIGN_STR	".align 2"
 
-#if defined(CONFIG_ARM64_BTI_KERNEL) && defined(__aarch64__)
-
-#define BTI_C bti c ;
-
-#else
-
-#define BTI_C
-
-#endif
-
 /*
  * When using in-kernel BTI we need to ensure that PCS-conformant
  * assembly functions have suitable annotations.  Override
@@ -23,27 +13,27 @@
  */
 #define SYM_FUNC_START(name)				\
 	SYM_START(name, SYM_L_GLOBAL, SYM_A_ALIGN)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_NOALIGN(name)			\
 	SYM_START(name, SYM_L_GLOBAL, SYM_A_NONE)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_LOCAL(name)			\
 	SYM_START(name, SYM_L_LOCAL, SYM_A_ALIGN)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_LOCAL_NOALIGN(name)		\
 	SYM_START(name, SYM_L_LOCAL, SYM_A_NONE)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_WEAK(name)			\
 	SYM_START(name, SYM_L_WEAK, SYM_A_ALIGN)	\
-	BTI_C
+	bti c ;
 
 #define SYM_FUNC_START_WEAK_NOALIGN(name)		\
 	SYM_START(name, SYM_L_WEAK, SYM_A_NONE)		\
-	BTI_C
+	bti c ;
 
 /*
  * Annotate a function as position independent, i.e., safe to be called before
diff --git a/arch/arm64/kernel/entry-ftrace.S b/arch/arm64/kernel/entry-ftrace.S
index 8cf970d219f5..e535480a4069 100644
--- a/arch/arm64/kernel/entry-ftrace.S
+++ b/arch/arm64/kernel/entry-ftrace.S
@@ -77,17 +77,13 @@
 	.endm
 
 SYM_CODE_START(ftrace_regs_caller)
-#ifdef BTI_C
-	BTI_C
-#endif
+	bti	c
 	ftrace_regs_entry	1
 	b	ftrace_common
 SYM_CODE_END(ftrace_regs_caller)
 
 SYM_CODE_START(ftrace_caller)
-#ifdef BTI_C
-	BTI_C
-#endif
+	bti	c
 	ftrace_regs_entry	0
 	b	ftrace_common
 SYM_CODE_END(ftrace_caller)
diff --git a/arch/arm64/lib/kasan_sw_tags.S b/arch/arm64/lib/kasan_sw_tags.S
index 5b04464c045e..20784ce75def 100644
--- a/arch/arm64/lib/kasan_sw_tags.S
+++ b/arch/arm64/lib/kasan_sw_tags.S
@@ -38,9 +38,7 @@
  * incremented by 256 prior to return).
  */
 SYM_CODE_START(__hwasan_tag_mismatch)
-#ifdef BTI_C
-	BTI_C
-#endif
+	bti	c
 	add	x29, sp, #232
 	stp	x2, x3, [sp, #8 * 2]
 	stp	x4, x5, [sp, #8 * 4]

From 97bcbee404e386195614b58e98ab2e7eddd89a5c Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:40:57 +0000
Subject: [PATCH 62/81] arm64/sve: Make sysctl interface for SVE reusable by
 SME

The vector length configuration for SME is very similar to that for SVE
so in order to allow reuse refactor the SVE configuration so that it takes
the vector type from the struct ctl_table. Since there's no dedicated space
for this we repurpose the extra1 field to store the vector type, this is
otherwise unused for integer sysctls.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-2-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/fpsimd.c | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index fa244c426f61..23e575c4e580 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -15,6 +15,7 @@
 #include <linux/compiler.h>
 #include <linux/cpu.h>
 #include <linux/cpu_pm.h>
+#include <linux/ctype.h>
 #include <linux/kernel.h>
 #include <linux/linkage.h>
 #include <linux/irqflags.h>
@@ -406,12 +407,13 @@ static unsigned int find_supported_vector_length(enum vec_type type,
 
 #if defined(CONFIG_ARM64_SVE) && defined(CONFIG_SYSCTL)
 
-static int sve_proc_do_default_vl(struct ctl_table *table, int write,
+static int vec_proc_do_default_vl(struct ctl_table *table, int write,
 				  void *buffer, size_t *lenp, loff_t *ppos)
 {
-	struct vl_info *info = &vl_info[ARM64_VEC_SVE];
+	struct vl_info *info = table->extra1;
+	enum vec_type type = info->type;
 	int ret;
-	int vl = get_sve_default_vl();
+	int vl = get_default_vl(type);
 	struct ctl_table tmp_table = {
 		.data = &vl,
 		.maxlen = sizeof(vl),
@@ -428,7 +430,7 @@ static int sve_proc_do_default_vl(struct ctl_table *table, int write,
 	if (!sve_vl_valid(vl))
 		return -EINVAL;
 
-	set_sve_default_vl(find_supported_vector_length(ARM64_VEC_SVE, vl));
+	set_default_vl(type, find_supported_vector_length(type, vl));
 	return 0;
 }
 
@@ -436,7 +438,8 @@ static struct ctl_table sve_default_vl_table[] = {
 	{
 		.procname	= "sve_default_vector_length",
 		.mode		= 0644,
-		.proc_handler	= sve_proc_do_default_vl,
+		.proc_handler	= vec_proc_do_default_vl,
+		.extra1		= &vl_info[ARM64_VEC_SVE],
 	},
 	{ }
 };
@@ -1107,7 +1110,7 @@ static void fpsimd_flush_thread_vl(enum vec_type type)
 		vl = get_default_vl(type);
 
 	if (WARN_ON(!sve_vl_valid(vl)))
-		vl = SVE_VL_MIN;
+		vl = vl_info[type].min_vl;
 
 	supported_vl = find_supported_vector_length(type, vl);
 	if (WARN_ON(supported_vl != vl))

From 30c43e73b3fa2d75f5d4e72fea345380ba5eb21b Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:40:58 +0000
Subject: [PATCH 63/81] arm64/sve: Generalise vector length configuration
 prctl() for SME

In preparation for adding SME support update the bulk of the implementation
for the vector length configuration prctl() calls to be independent of
vector type.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-3-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/fpsimd.h |  6 ++---
 arch/arm64/kernel/fpsimd.c      | 47 ++++++++++++++++++---------------
 arch/arm64/kernel/ptrace.c      |  4 +--
 arch/arm64/kvm/reset.c          |  8 +++---
 4 files changed, 34 insertions(+), 31 deletions(-)

diff --git a/arch/arm64/include/asm/fpsimd.h b/arch/arm64/include/asm/fpsimd.h
index dbb4b30a5648..cb24385e3632 100644
--- a/arch/arm64/include/asm/fpsimd.h
+++ b/arch/arm64/include/asm/fpsimd.h
@@ -51,8 +51,8 @@ extern void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *state,
 extern void fpsimd_flush_task_state(struct task_struct *target);
 extern void fpsimd_save_and_flush_cpu_state(void);
 
-/* Maximum VL that SVE VL-agnostic software can transparently support */
-#define SVE_VL_ARCH_MAX 0x100
+/* Maximum VL that SVE/SME VL-agnostic software can transparently support */
+#define VL_ARCH_MAX 0x100
 
 /* Offset of FFR in the SVE register dump */
 static inline size_t sve_ffr_offset(int vl)
@@ -122,7 +122,7 @@ extern void fpsimd_sync_to_sve(struct task_struct *task);
 extern void sve_sync_to_fpsimd(struct task_struct *task);
 extern void sve_sync_from_fpsimd_zeropad(struct task_struct *task);
 
-extern int sve_set_vector_length(struct task_struct *task,
+extern int vec_set_vector_length(struct task_struct *task, enum vec_type type,
 				 unsigned long vl, unsigned long flags);
 
 extern int sve_set_current_vl(unsigned long arg);
diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index 23e575c4e580..4a98cc3b1df1 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -632,7 +632,7 @@ void sve_sync_from_fpsimd_zeropad(struct task_struct *task)
 	__fpsimd_to_sve(sst, fst, vq);
 }
 
-int sve_set_vector_length(struct task_struct *task,
+int vec_set_vector_length(struct task_struct *task, enum vec_type type,
 			  unsigned long vl, unsigned long flags)
 {
 	if (flags & ~(unsigned long)(PR_SVE_VL_INHERIT |
@@ -643,33 +643,35 @@ int sve_set_vector_length(struct task_struct *task,
 		return -EINVAL;
 
 	/*
-	 * Clamp to the maximum vector length that VL-agnostic SVE code can
-	 * work with.  A flag may be assigned in the future to allow setting
-	 * of larger vector lengths without confusing older software.
+	 * Clamp to the maximum vector length that VL-agnostic code
+	 * can work with.  A flag may be assigned in the future to
+	 * allow setting of larger vector lengths without confusing
+	 * older software.
 	 */
-	if (vl > SVE_VL_ARCH_MAX)
-		vl = SVE_VL_ARCH_MAX;
+	if (vl > VL_ARCH_MAX)
+		vl = VL_ARCH_MAX;
 
-	vl = find_supported_vector_length(ARM64_VEC_SVE, vl);
+	vl = find_supported_vector_length(type, vl);
 
 	if (flags & (PR_SVE_VL_INHERIT |
 		     PR_SVE_SET_VL_ONEXEC))
-		task_set_sve_vl_onexec(task, vl);
+		task_set_vl_onexec(task, type, vl);
 	else
 		/* Reset VL to system default on next exec: */
-		task_set_sve_vl_onexec(task, 0);
+		task_set_vl_onexec(task, type, 0);
 
 	/* Only actually set the VL if not deferred: */
 	if (flags & PR_SVE_SET_VL_ONEXEC)
 		goto out;
 
-	if (vl == task_get_sve_vl(task))
+	if (vl == task_get_vl(task, type))
 		goto out;
 
 	/*
 	 * To ensure the FPSIMD bits of the SVE vector registers are preserved,
 	 * write any live register state back to task_struct, and convert to a
-	 * non-SVE thread.
+	 * regular FPSIMD thread.  Since the vector length can only be changed
+	 * with a syscall we can't be in streaming mode while reconfiguring.
 	 */
 	if (task == current) {
 		get_cpu_fpsimd_context();
@@ -690,10 +692,10 @@ int sve_set_vector_length(struct task_struct *task,
 	 */
 	sve_free(task);
 
-	task_set_sve_vl(task, vl);
+	task_set_vl(task, type, vl);
 
 out:
-	update_tsk_thread_flag(task, TIF_SVE_VL_INHERIT,
+	update_tsk_thread_flag(task, vec_vl_inherit_flag(type),
 			       flags & PR_SVE_VL_INHERIT);
 
 	return 0;
@@ -701,20 +703,21 @@ out:
 
 /*
  * Encode the current vector length and flags for return.
- * This is only required for prctl(): ptrace has separate fields
+ * This is only required for prctl(): ptrace has separate fields.
+ * SVE and SME use the same bits for _ONEXEC and _INHERIT.
  *
- * flags are as for sve_set_vector_length().
+ * flags are as for vec_set_vector_length().
  */
-static int sve_prctl_status(unsigned long flags)
+static int vec_prctl_status(enum vec_type type, unsigned long flags)
 {
 	int ret;
 
 	if (flags & PR_SVE_SET_VL_ONEXEC)
-		ret = task_get_sve_vl_onexec(current);
+		ret = task_get_vl_onexec(current, type);
 	else
-		ret = task_get_sve_vl(current);
+		ret = task_get_vl(current, type);
 
-	if (test_thread_flag(TIF_SVE_VL_INHERIT))
+	if (test_thread_flag(vec_vl_inherit_flag(type)))
 		ret |= PR_SVE_VL_INHERIT;
 
 	return ret;
@@ -732,11 +735,11 @@ int sve_set_current_vl(unsigned long arg)
 	if (!system_supports_sve() || is_compat_task())
 		return -EINVAL;
 
-	ret = sve_set_vector_length(current, vl, flags);
+	ret = vec_set_vector_length(current, ARM64_VEC_SVE, vl, flags);
 	if (ret)
 		return ret;
 
-	return sve_prctl_status(flags);
+	return vec_prctl_status(ARM64_VEC_SVE, flags);
 }
 
 /* PR_SVE_GET_VL */
@@ -745,7 +748,7 @@ int sve_get_current_vl(void)
 	if (!system_supports_sve() || is_compat_task())
 		return -EINVAL;
 
-	return sve_prctl_status(0);
+	return vec_prctl_status(ARM64_VEC_SVE, 0);
 }
 
 static void vec_probe_vqs(struct vl_info *info,
diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c
index 88a9034fb9b5..716dde289446 100644
--- a/arch/arm64/kernel/ptrace.c
+++ b/arch/arm64/kernel/ptrace.c
@@ -812,9 +812,9 @@ static int sve_set(struct task_struct *target,
 
 	/*
 	 * Apart from SVE_PT_REGS_MASK, all SVE_PT_* flags are consumed by
-	 * sve_set_vector_length(), which will also validate them for us:
+	 * vec_set_vector_length(), which will also validate them for us:
 	 */
-	ret = sve_set_vector_length(target, header.vl,
+	ret = vec_set_vector_length(target, ARM64_VEC_SVE, header.vl,
 		((unsigned long)header.flags & ~SVE_PT_REGS_MASK) << 16);
 	if (ret)
 		goto out;
diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c
index 426bd7fbc3fd..27386f0d81e4 100644
--- a/arch/arm64/kvm/reset.c
+++ b/arch/arm64/kvm/reset.c
@@ -52,10 +52,10 @@ int kvm_arm_init_sve(void)
 		 * The get_sve_reg()/set_sve_reg() ioctl interface will need
 		 * to be extended with multiple register slice support in
 		 * order to support vector lengths greater than
-		 * SVE_VL_ARCH_MAX:
+		 * VL_ARCH_MAX:
 		 */
-		if (WARN_ON(kvm_sve_max_vl > SVE_VL_ARCH_MAX))
-			kvm_sve_max_vl = SVE_VL_ARCH_MAX;
+		if (WARN_ON(kvm_sve_max_vl > VL_ARCH_MAX))
+			kvm_sve_max_vl = VL_ARCH_MAX;
 
 		/*
 		 * Don't even try to make use of vector lengths that
@@ -103,7 +103,7 @@ static int kvm_vcpu_finalize_sve(struct kvm_vcpu *vcpu)
 	 * set_sve_vls().  Double-check here just to be sure:
 	 */
 	if (WARN_ON(!sve_vl_valid(vl) || vl > sve_max_virtualisable_vl() ||
-		    vl > SVE_VL_ARCH_MAX))
+		    vl > VL_ARCH_MAX))
 		return -EIO;
 
 	buf = kzalloc(SVE_SIG_REGS_SIZE(sve_vq_from_vl(vl)), GFP_KERNEL_ACCOUNT);

From aed34d9e52b80b4e7485119272c77c5553a21499 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:40:59 +0000
Subject: [PATCH 64/81] arm64/sve: Minor clarification of ABI documentation

As suggested by Luis for the SME version of this explicitly say that the
vector length should be extracted from the return value of a set vector
length prctl() with a bitwise and rather than just any old and.

Suggested-by: Luis Machado <Luis.Machado@arm.com>
Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-4-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/sve.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/arm64/sve.rst b/Documentation/arm64/sve.rst
index 03137154299e..9d9a4de5bc34 100644
--- a/Documentation/arm64/sve.rst
+++ b/Documentation/arm64/sve.rst
@@ -255,7 +255,7 @@ prctl(PR_SVE_GET_VL)
     vector length change (which would only normally be the case between a
     fork() or vfork() and the corresponding execve() in typical use).
 
-    To extract the vector length from the result, and it with
+    To extract the vector length from the result, bitwise and it with
     PR_SVE_VL_LEN_MASK.
 
     Return value: a nonnegative value on success, or a negative value on error:

From 18edbb6b3259332b577c2b92b6b4d8dfd86ff00e Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:41:00 +0000
Subject: [PATCH 65/81] kselftest/arm64: Parameterise ptrace vector length
 information

SME introduces a new mode called streaming mode in which the SVE registers
have a different vector length. Since the ptrace interface for this is
based on the existing SVE interface prepare for supporting this by moving
the regset specific configuration into  struct and passing that around,
allowing these tests to be reused for streaming mode. As we will also have
to verify the interoperation of the SVE and streaming SVE regsets don't
just iterate over an array.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-5-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/sve-ptrace.c | 221 +++++++++++-------
 1 file changed, 143 insertions(+), 78 deletions(-)

diff --git a/tools/testing/selftests/arm64/fp/sve-ptrace.c b/tools/testing/selftests/arm64/fp/sve-ptrace.c
index c4417bc48d4f..af798b9d232c 100644
--- a/tools/testing/selftests/arm64/fp/sve-ptrace.c
+++ b/tools/testing/selftests/arm64/fp/sve-ptrace.c
@@ -21,16 +21,37 @@
 
 #include "../../kselftest.h"
 
-#define VL_TESTS (((SVE_VQ_MAX - SVE_VQ_MIN) + 1) * 3)
-#define FPSIMD_TESTS 5
-
-#define EXPECTED_TESTS (VL_TESTS + FPSIMD_TESTS)
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
 
 /* <linux/elf.h> and <sys/auxv.h> don't like each other, so: */
 #ifndef NT_ARM_SVE
 #define NT_ARM_SVE 0x405
 #endif
 
+struct vec_type {
+	const char *name;
+	unsigned long hwcap_type;
+	unsigned long hwcap;
+	int regset;
+	int prctl_set;
+};
+
+static const struct vec_type vec_types[] = {
+	{
+		.name = "SVE",
+		.hwcap_type = AT_HWCAP,
+		.hwcap = HWCAP_SVE,
+		.regset = NT_ARM_SVE,
+		.prctl_set = PR_SVE_SET_VL,
+	},
+};
+
+#define VL_TESTS (((SVE_VQ_MAX - SVE_VQ_MIN) + 1) * 3)
+#define FLAG_TESTS 2
+#define FPSIMD_TESTS 3
+
+#define EXPECTED_TESTS ((VL_TESTS + FLAG_TESTS + FPSIMD_TESTS) * ARRAY_SIZE(vec_types))
+
 static void fill_buf(char *buf, size_t size)
 {
 	int i;
@@ -59,7 +80,8 @@ static int get_fpsimd(pid_t pid, struct user_fpsimd_state *fpsimd)
 	return ptrace(PTRACE_GETREGSET, pid, NT_PRFPREG, &iov);
 }
 
-static struct user_sve_header *get_sve(pid_t pid, void **buf, size_t *size)
+static struct user_sve_header *get_sve(pid_t pid, const struct vec_type *type,
+				       void **buf, size_t *size)
 {
 	struct user_sve_header *sve;
 	void *p;
@@ -80,7 +102,7 @@ static struct user_sve_header *get_sve(pid_t pid, void **buf, size_t *size)
 
 		iov.iov_base = *buf;
 		iov.iov_len = sz;
-		if (ptrace(PTRACE_GETREGSET, pid, NT_ARM_SVE, &iov))
+		if (ptrace(PTRACE_GETREGSET, pid, type->regset, &iov))
 			goto error;
 
 		sve = *buf;
@@ -96,17 +118,18 @@ error:
 	return NULL;
 }
 
-static int set_sve(pid_t pid, const struct user_sve_header *sve)
+static int set_sve(pid_t pid, const struct vec_type *type,
+		   const struct user_sve_header *sve)
 {
 	struct iovec iov;
 
 	iov.iov_base = (void *)sve;
 	iov.iov_len = sve->size;
-	return ptrace(PTRACE_SETREGSET, pid, NT_ARM_SVE, &iov);
+	return ptrace(PTRACE_SETREGSET, pid, type->regset, &iov);
 }
 
 /* Validate setting and getting the inherit flag */
-static void ptrace_set_get_inherit(pid_t child)
+static void ptrace_set_get_inherit(pid_t child, const struct vec_type *type)
 {
 	struct user_sve_header sve;
 	struct user_sve_header *new_sve = NULL;
@@ -118,9 +141,10 @@ static void ptrace_set_get_inherit(pid_t child)
 	sve.size = sizeof(sve);
 	sve.vl = sve_vl_from_vq(SVE_VQ_MIN);
 	sve.flags = SVE_PT_VL_INHERIT;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set SVE_PT_VL_INHERIT\n");
+		ksft_test_result_fail("Failed to set %s SVE_PT_VL_INHERIT\n",
+				      type->name);
 		return;
 	}
 
@@ -128,35 +152,39 @@ static void ptrace_set_get_inherit(pid_t child)
 	 * Read back the new register state and verify that we have
 	 * set the flags we expected.
 	 */
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read SVE flags\n");
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s SVE flags\n",
+				      type->name);
 		return;
 	}
 
 	ksft_test_result(new_sve->flags & SVE_PT_VL_INHERIT,
-			 "SVE_PT_VL_INHERIT set\n");
+			 "%s SVE_PT_VL_INHERIT set\n", type->name);
 
 	/* Now clear */
 	sve.flags &= ~SVE_PT_VL_INHERIT;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to clear SVE_PT_VL_INHERIT\n");
+		ksft_test_result_fail("Failed to clear %s SVE_PT_VL_INHERIT\n",
+				      type->name);
 		return;
 	}
 
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read SVE flags\n");
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s SVE flags\n",
+				      type->name);
 		return;
 	}
 
 	ksft_test_result(!(new_sve->flags & SVE_PT_VL_INHERIT),
-			 "SVE_PT_VL_INHERIT cleared\n");
+			 "%s SVE_PT_VL_INHERIT cleared\n", type->name);
 
 	free(new_sve);
 }
 
 /* Validate attempting to set the specfied VL via ptrace */
-static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
+static void ptrace_set_get_vl(pid_t child, const struct vec_type *type,
+			      unsigned int vl, bool *supported)
 {
 	struct user_sve_header sve;
 	struct user_sve_header *new_sve = NULL;
@@ -166,10 +194,10 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	*supported = false;
 
 	/* Check if the VL is supported in this process */
-	prctl_vl = prctl(PR_SVE_SET_VL, vl);
+	prctl_vl = prctl(type->prctl_set, vl);
 	if (prctl_vl == -1)
-		ksft_exit_fail_msg("prctl(PR_SVE_SET_VL) failed: %s (%d)\n",
-				   strerror(errno), errno);
+		ksft_exit_fail_msg("prctl(PR_%s_SET_VL) failed: %s (%d)\n",
+				   type->name, strerror(errno), errno);
 
 	/* If the VL is not supported then a supported VL will be returned */
 	*supported = (prctl_vl == vl);
@@ -178,9 +206,10 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	memset(&sve, 0, sizeof(sve));
 	sve.size = sizeof(sve);
 	sve.vl = vl;
-	ret = set_sve(child, &sve);
+	ret = set_sve(child, type, &sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u\n",
+				      type->name, vl);
 		return;
 	}
 
@@ -188,12 +217,14 @@ static void ptrace_set_get_vl(pid_t child, unsigned int vl, bool *supported)
 	 * Read back the new register state and verify that we have the
 	 * same VL that we got from prctl() on ourselves.
 	 */
-	if (!get_sve(child, (void **)&new_sve, &new_sve_size)) {
-		ksft_test_result_fail("Failed to read VL %u\n", vl);
+	if (!get_sve(child, type, (void **)&new_sve, &new_sve_size)) {
+		ksft_test_result_fail("Failed to read %s VL %u\n",
+				      type->name, vl);
 		return;
 	}
 
-	ksft_test_result(new_sve->vl = prctl_vl, "Set VL %u\n", vl);
+	ksft_test_result(new_sve->vl = prctl_vl, "Set %s VL %u\n",
+			 type->name, vl);
 
 	free(new_sve);
 }
@@ -209,7 +240,7 @@ static void check_u32(unsigned int vl, const char *reg,
 }
 
 /* Access the FPSIMD registers via the SVE regset */
-static void ptrace_sve_fpsimd(pid_t child)
+static void ptrace_sve_fpsimd(pid_t child, const struct vec_type *type)
 {
 	void *svebuf = NULL;
 	size_t svebufsz = 0;
@@ -219,17 +250,18 @@ static void ptrace_sve_fpsimd(pid_t child)
 	unsigned char *p;
 
 	/* New process should start with FPSIMD registers only */
-	sve = get_sve(child, &svebuf, &svebufsz);
+	sve = get_sve(child, type, &svebuf, &svebufsz);
 	if (!sve) {
-		ksft_test_result_fail("get_sve: %s\n", strerror(errno));
+		ksft_test_result_fail("get_sve(%s): %s\n",
+				      type->name, strerror(errno));
 
 		return;
 	} else {
-		ksft_test_result_pass("get_sve(FPSIMD)\n");
+		ksft_test_result_pass("get_sve(%s FPSIMD)\n", type->name);
 	}
 
 	ksft_test_result((sve->flags & SVE_PT_REGS_MASK) == SVE_PT_REGS_FPSIMD,
-			 "Set FPSIMD registers\n");
+			 "Set FPSIMD registers via %s\n", type->name);
 	if ((sve->flags & SVE_PT_REGS_MASK) != SVE_PT_REGS_FPSIMD)
 		goto out;
 
@@ -243,9 +275,9 @@ static void ptrace_sve_fpsimd(pid_t child)
 			p[j] = j;
 	}
 
-	if (set_sve(child, sve)) {
-		ksft_test_result_fail("set_sve(FPSIMD): %s\n",
-				      strerror(errno));
+	if (set_sve(child, type, sve)) {
+		ksft_test_result_fail("set_sve(%s FPSIMD): %s\n",
+				      type->name, strerror(errno));
 
 		goto out;
 	}
@@ -257,16 +289,20 @@ static void ptrace_sve_fpsimd(pid_t child)
 		goto out;
 	}
 	if (memcmp(fpsimd, &new_fpsimd, sizeof(*fpsimd)) == 0)
-		ksft_test_result_pass("get_fpsimd() gave same state\n");
+		ksft_test_result_pass("%s get_fpsimd() gave same state\n",
+				      type->name);
 	else
-		ksft_test_result_fail("get_fpsimd() gave different state\n");
+		ksft_test_result_fail("%s get_fpsimd() gave different state\n",
+				      type->name);
 
 out:
 	free(svebuf);
 }
 
 /* Validate attempting to set SVE data and read SVE data */
-static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
+static void ptrace_set_sve_get_sve_data(pid_t child,
+					const struct vec_type *type,
+					unsigned int vl)
 {
 	void *write_buf;
 	void *read_buf = NULL;
@@ -281,8 +317,8 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 	data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n",
-				      data_size, vl);
+		ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+				      data_size, type->name, vl);
 		return;
 	}
 	write_sve = write_buf;
@@ -306,23 +342,26 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 
 	/* TODO: Generate a valid FFR pattern */
 
-	ret = set_sve(child, write_sve);
+	ret = set_sve(child, type, write_sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u data\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 
 	/* Read the data back */
-	if (!get_sve(child, (void **)&read_buf, &read_sve_size)) {
-		ksft_test_result_fail("Failed to read VL %u data\n", vl);
+	if (!get_sve(child, type, (void **)&read_buf, &read_sve_size)) {
+		ksft_test_result_fail("Failed to read %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 	read_sve = read_buf;
 
 	/* We might read more data if there's extensions we don't know */
 	if (read_sve->size < write_sve->size) {
-		ksft_test_result_fail("Wrote %d bytes, only read %d\n",
-				      write_sve->size, read_sve->size);
+		ksft_test_result_fail("%s wrote %d bytes, only read %d\n",
+				      type->name, write_sve->size,
+				      read_sve->size);
 		goto out_read;
 	}
 
@@ -349,7 +388,8 @@ static void ptrace_set_sve_get_sve_data(pid_t child, unsigned int vl)
 	check_u32(vl, "FPCR", write_buf + SVE_PT_SVE_FPCR_OFFSET(vq),
 		  read_buf + SVE_PT_SVE_FPCR_OFFSET(vq), &errors);
 
-	ksft_test_result(errors == 0, "Set and get SVE data for VL %u\n", vl);
+	ksft_test_result(errors == 0, "Set and get %s data for VL %u\n",
+			 type->name, vl);
 
 out_read:
 	free(read_buf);
@@ -358,7 +398,9 @@ out:
 }
 
 /* Validate attempting to set SVE data and read SVE data */
-static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
+static void ptrace_set_sve_get_fpsimd_data(pid_t child,
+					   const struct vec_type *type,
+					   unsigned int vl)
 {
 	void *write_buf;
 	struct user_sve_header *write_sve;
@@ -376,8 +418,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	data_size = SVE_PT_SVE_OFFSET + SVE_PT_SVE_SIZE(vq, SVE_PT_REGS_SVE);
 	write_buf = malloc(data_size);
 	if (!write_buf) {
-		ksft_test_result_fail("Error allocating %d byte buffer for VL %u\n",
-				      data_size, vl);
+		ksft_test_result_fail("Error allocating %d byte buffer for %s VL %u\n",
+				      data_size, type->name, vl);
 		return;
 	}
 	write_sve = write_buf;
@@ -395,16 +437,17 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	fill_buf(write_buf + SVE_PT_SVE_FPSR_OFFSET(vq), SVE_PT_SVE_FPSR_SIZE);
 	fill_buf(write_buf + SVE_PT_SVE_FPCR_OFFSET(vq), SVE_PT_SVE_FPCR_SIZE);
 
-	ret = set_sve(child, write_sve);
+	ret = set_sve(child, type, write_sve);
 	if (ret != 0) {
-		ksft_test_result_fail("Failed to set VL %u data\n", vl);
+		ksft_test_result_fail("Failed to set %s VL %u data\n",
+				      type->name, vl);
 		goto out;
 	}
 
 	/* Read the data back */
 	if (get_fpsimd(child, &fpsimd_state)) {
-		ksft_test_result_fail("Failed to read VL %u FPSIMD data\n",
-				      vl);
+		ksft_test_result_fail("Failed to read %s VL %u FPSIMD data\n",
+				      type->name, vl);
 		goto out;
 	}
 
@@ -419,7 +462,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 		       sizeof(tmp));
 
 		if (tmp != fpsimd_state.vregs[i]) {
-			printf("# Mismatch in FPSIMD for VL %u Z%d\n", vl, i);
+			printf("# Mismatch in FPSIMD for %s VL %u Z%d\n",
+			       type->name, vl, i);
 			errors++;
 		}
 	}
@@ -429,8 +473,8 @@ static void ptrace_set_sve_get_fpsimd_data(pid_t child, unsigned int vl)
 	check_u32(vl, "FPCR", write_buf + SVE_PT_SVE_FPCR_OFFSET(vq),
 		  &fpsimd_state.fpcr, &errors);
 
-	ksft_test_result(errors == 0, "Set and get FPSIMD data for VL %u\n",
-			 vl);
+	ksft_test_result(errors == 0, "Set and get FPSIMD data for %s VL %u\n",
+			 type->name, vl);
 
 out:
 	free(write_buf);
@@ -440,7 +484,7 @@ static int do_parent(pid_t child)
 {
 	int ret = EXIT_FAILURE;
 	pid_t pid;
-	int status;
+	int status, i;
 	siginfo_t si;
 	unsigned int vq, vl;
 	bool vl_supported;
@@ -499,26 +543,47 @@ static int do_parent(pid_t child)
 		}
 	}
 
-	/* FPSIMD via SVE regset */
-	ptrace_sve_fpsimd(child);
-
-	/* prctl() flags */
-	ptrace_set_get_inherit(child);
-
-	/* Step through every possible VQ */
-	for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
-		vl = sve_vl_from_vq(vq);
-
-		/* First, try to set this vector length */
-		ptrace_set_get_vl(child, vl, &vl_supported);
-
-		/* If the VL is supported validate data set/get */
-		if (vl_supported) {
-			ptrace_set_sve_get_sve_data(child, vl);
-			ptrace_set_sve_get_fpsimd_data(child, vl);
+	for (i = 0; i < ARRAY_SIZE(vec_types); i++) {
+		/* FPSIMD via SVE regset */
+		if (getauxval(vec_types[i].hwcap_type) & vec_types[i].hwcap) {
+			ptrace_sve_fpsimd(child, &vec_types[i]);
 		} else {
-			ksft_test_result_skip("set SVE get SVE for VL %d\n", vl);
-			ksft_test_result_skip("set SVE get FPSIMD for VL %d\n", vl);
+			ksft_test_result_skip("%s FPSIMD get via SVE\n",
+					      vec_types[i].name);
+			ksft_test_result_skip("%s FPSIMD set via SVE\n",
+					      vec_types[i].name);
+			ksft_test_result_skip("%s set read via FPSIMD\n",
+					      vec_types[i].name);
+		}
+
+		/* prctl() flags */
+		ptrace_set_get_inherit(child, &vec_types[i]);
+
+		/* Step through every possible VQ */
+		for (vq = SVE_VQ_MIN; vq <= SVE_VQ_MAX; vq++) {
+			vl = sve_vl_from_vq(vq);
+
+			/* First, try to set this vector length */
+			if (getauxval(vec_types[i].hwcap_type) &
+			    vec_types[i].hwcap) {
+				ptrace_set_get_vl(child, &vec_types[i], vl,
+						  &vl_supported);
+			} else {
+				ksft_test_result_skip("%s get/set VL %d\n",
+						      vec_types[i].name, vl);
+				vl_supported = false;
+			}
+
+			/* If the VL is supported validate data set/get */
+			if (vl_supported) {
+				ptrace_set_sve_get_sve_data(child, &vec_types[i], vl);
+				ptrace_set_sve_get_fpsimd_data(child, &vec_types[i], vl);
+			} else {
+				ksft_test_result_skip("%s set SVE get SVE for VL %d\n",
+						      vec_types[i].name, vl);
+				ksft_test_result_skip("%s set SVE get FPSIMD for VL %d\n",
+						      vec_types[i].name, vl);
+			}
 		}
 	}
 

From 9331a604858a0f399ef8f886a88dd94927f136aa Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:41:01 +0000
Subject: [PATCH 66/81] kselftest/arm64: Allow signal tests to trigger from a
 function

Currently we have the facility to specify custom code to trigger a signal
but none of the tests use it and for some reason the framework requires us
to also specify a signal to send as a trigger in order to make use of a
custom trigger. This doesn't seem to make much sense, instead allow the
use of a custom trigger function without specifying a signal to inject.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-6-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 .../selftests/arm64/signal/test_signals_utils.c      | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/tools/testing/selftests/arm64/signal/test_signals_utils.c b/tools/testing/selftests/arm64/signal/test_signals_utils.c
index 22722abc9dfa..2f8c23af3b5e 100644
--- a/tools/testing/selftests/arm64/signal/test_signals_utils.c
+++ b/tools/testing/selftests/arm64/signal/test_signals_utils.c
@@ -310,14 +310,12 @@ int test_setup(struct tdescr *td)
 
 int test_run(struct tdescr *td)
 {
-	if (td->sig_trig) {
-		if (td->trigger)
-			return td->trigger(td);
-		else
-			return default_trigger(td);
-	} else {
+	if (td->trigger)
+		return td->trigger(td);
+	else if (td->sig_trig)
+		return default_trigger(td);
+	else
 		return td->run(td, NULL, NULL);
-	}
 }
 
 void test_result(struct tdescr *td)

From b77e995e3b9638e759d482e93deba1fa0760cb26 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Fri, 10 Dec 2021 18:41:02 +0000
Subject: [PATCH 67/81] kselftest/arm64: Add a test program to exercise the
 syscall ABI

Currently we don't have any coverage of the syscall ABI so let's add a very
dumb test program which sets up register patterns, does a sysscall and then
checks that the register state after the syscall matches what we expect.
The program is written in an extremely simplistic fashion with the goal of
making it easy to verify that it's doing what it thinks it's doing, it is
not a model of how one should write actual code.

Currently we validate the general purpose, FPSIMD and SVE registers. There
are other thing things that could be covered like FPCR and flags registers,
these can be covered incrementally - my main focus at the minute is
covering the ABI for the SVE registers.

The program repeats the tests for all possible SVE vector lengths in case
some vector length specific optimisation causes issues, as well as testing
FPSIMD only. It tries two syscalls, getpid() and sched_yield(), in an
effort to cover both immediate return to userspace and scheduling another
task though there are no guarantees which cases will be hit.

A new test directory "abi" is added to hold the test, it doesn't seem to
fit well into any of the existing directories.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211210184133.320748-7-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/Makefile        |   2 +-
 tools/testing/selftests/arm64/abi/.gitignore  |   1 +
 tools/testing/selftests/arm64/abi/Makefile    |   8 +
 .../selftests/arm64/abi/syscall-abi-asm.S     | 240 +++++++++++++
 .../testing/selftests/arm64/abi/syscall-abi.c | 318 ++++++++++++++++++
 5 files changed, 568 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/abi/.gitignore
 create mode 100644 tools/testing/selftests/arm64/abi/Makefile
 create mode 100644 tools/testing/selftests/arm64/abi/syscall-abi-asm.S
 create mode 100644 tools/testing/selftests/arm64/abi/syscall-abi.c

diff --git a/tools/testing/selftests/arm64/Makefile b/tools/testing/selftests/arm64/Makefile
index ced910fb4019..1e8d9a8f59df 100644
--- a/tools/testing/selftests/arm64/Makefile
+++ b/tools/testing/selftests/arm64/Makefile
@@ -4,7 +4,7 @@
 ARCH ?= $(shell uname -m 2>/dev/null || echo not)
 
 ifneq (,$(filter $(ARCH),aarch64 arm64))
-ARM64_SUBTARGETS ?= tags signal pauth fp mte bti
+ARM64_SUBTARGETS ?= tags signal pauth fp mte bti abi
 else
 ARM64_SUBTARGETS :=
 endif
diff --git a/tools/testing/selftests/arm64/abi/.gitignore b/tools/testing/selftests/arm64/abi/.gitignore
new file mode 100644
index 000000000000..b79cf5814c23
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/.gitignore
@@ -0,0 +1 @@
+syscall-abi
diff --git a/tools/testing/selftests/arm64/abi/Makefile b/tools/testing/selftests/arm64/abi/Makefile
new file mode 100644
index 000000000000..96eba974ac8d
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/Makefile
@@ -0,0 +1,8 @@
+# SPDX-License-Identifier: GPL-2.0
+# Copyright (C) 2021 ARM Limited
+
+TEST_GEN_PROGS := syscall-abi
+
+include ../../lib.mk
+
+$(OUTPUT)/syscall-abi: syscall-abi.c syscall-abi-asm.S
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi-asm.S b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
new file mode 100644
index 000000000000..983467cfcee0
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/syscall-abi-asm.S
@@ -0,0 +1,240 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021 ARM Limited.
+//
+// Assembly portion of the syscall ABI test
+
+//
+// Load values from memory into registers, invoke a syscall and save the
+// register values back to memory for later checking.  The syscall to be
+// invoked is configured in x8 of the input GPR data.
+//
+// x0:	SVE VL, 0 for FP only
+//
+//	GPRs:	gpr_in, gpr_out
+//	FPRs:	fpr_in, fpr_out
+//	Zn:	z_in, z_out
+//	Pn:	p_in, p_out
+//	FFR:	ffr_in, ffr_out
+
+.arch_extension sve
+
+.globl do_syscall
+do_syscall:
+	// Store callee saved registers x19-x29 (80 bytes) plus x0 and x1
+	stp	x29, x30, [sp, #-112]!
+	mov	x29, sp
+	stp	x0, x1, [sp, #16]
+	stp	x19, x20, [sp, #32]
+	stp	x21, x22, [sp, #48]
+	stp	x23, x24, [sp, #64]
+	stp	x25, x26, [sp, #80]
+	stp	x27, x28, [sp, #96]
+
+	// Load GPRs x8-x28, and save our SP/FP for later comparison
+	ldr	x2, =gpr_in
+	add	x2, x2, #64
+	ldp	x8, x9, [x2], #16
+	ldp	x10, x11, [x2], #16
+	ldp	x12, x13, [x2], #16
+	ldp	x14, x15, [x2], #16
+	ldp	x16, x17, [x2], #16
+	ldp	x18, x19, [x2], #16
+	ldp	x20, x21, [x2], #16
+	ldp	x22, x23, [x2], #16
+	ldp	x24, x25, [x2], #16
+	ldp	x26, x27, [x2], #16
+	ldr	x28, [x2], #8
+	str	x29, [x2], #8		// FP
+	str	x30, [x2], #8		// LR
+
+	// Load FPRs if we're not doing SVE
+	cbnz	x0, 1f
+	ldr	x2, =fpr_in
+	ldp	q0, q1, [x2]
+	ldp	q2, q3, [x2, #16 * 2]
+	ldp	q4, q5, [x2, #16 * 4]
+	ldp	q6, q7, [x2, #16 * 6]
+	ldp	q8, q9, [x2, #16 * 8]
+	ldp	q10, q11, [x2, #16 * 10]
+	ldp	q12, q13, [x2, #16 * 12]
+	ldp	q14, q15, [x2, #16 * 14]
+	ldp	q16, q17, [x2, #16 * 16]
+	ldp	q18, q19, [x2, #16 * 18]
+	ldp	q20, q21, [x2, #16 * 20]
+	ldp	q22, q23, [x2, #16 * 22]
+	ldp	q24, q25, [x2, #16 * 24]
+	ldp	q26, q27, [x2, #16 * 26]
+	ldp	q28, q29, [x2, #16 * 28]
+	ldp	q30, q31, [x2, #16 * 30]
+1:
+
+	// Load the SVE registers if we're doing SVE
+	cbz	x0, 1f
+
+	ldr	x2, =z_in
+	ldr	z0, [x2, #0, MUL VL]
+	ldr	z1, [x2, #1, MUL VL]
+	ldr	z2, [x2, #2, MUL VL]
+	ldr	z3, [x2, #3, MUL VL]
+	ldr	z4, [x2, #4, MUL VL]
+	ldr	z5, [x2, #5, MUL VL]
+	ldr	z6, [x2, #6, MUL VL]
+	ldr	z7, [x2, #7, MUL VL]
+	ldr	z8, [x2, #8, MUL VL]
+	ldr	z9, [x2, #9, MUL VL]
+	ldr	z10, [x2, #10, MUL VL]
+	ldr	z11, [x2, #11, MUL VL]
+	ldr	z12, [x2, #12, MUL VL]
+	ldr	z13, [x2, #13, MUL VL]
+	ldr	z14, [x2, #14, MUL VL]
+	ldr	z15, [x2, #15, MUL VL]
+	ldr	z16, [x2, #16, MUL VL]
+	ldr	z17, [x2, #17, MUL VL]
+	ldr	z18, [x2, #18, MUL VL]
+	ldr	z19, [x2, #19, MUL VL]
+	ldr	z20, [x2, #20, MUL VL]
+	ldr	z21, [x2, #21, MUL VL]
+	ldr	z22, [x2, #22, MUL VL]
+	ldr	z23, [x2, #23, MUL VL]
+	ldr	z24, [x2, #24, MUL VL]
+	ldr	z25, [x2, #25, MUL VL]
+	ldr	z26, [x2, #26, MUL VL]
+	ldr	z27, [x2, #27, MUL VL]
+	ldr	z28, [x2, #28, MUL VL]
+	ldr	z29, [x2, #29, MUL VL]
+	ldr	z30, [x2, #30, MUL VL]
+	ldr	z31, [x2, #31, MUL VL]
+
+	ldr	x2, =ffr_in
+	ldr	p0, [x2, #0]
+	wrffr	p0.b
+
+	ldr	x2, =p_in
+	ldr	p0, [x2, #0, MUL VL]
+	ldr	p1, [x2, #1, MUL VL]
+	ldr	p2, [x2, #2, MUL VL]
+	ldr	p3, [x2, #3, MUL VL]
+	ldr	p4, [x2, #4, MUL VL]
+	ldr	p5, [x2, #5, MUL VL]
+	ldr	p6, [x2, #6, MUL VL]
+	ldr	p7, [x2, #7, MUL VL]
+	ldr	p8, [x2, #8, MUL VL]
+	ldr	p9, [x2, #9, MUL VL]
+	ldr	p10, [x2, #10, MUL VL]
+	ldr	p11, [x2, #11, MUL VL]
+	ldr	p12, [x2, #12, MUL VL]
+	ldr	p13, [x2, #13, MUL VL]
+	ldr	p14, [x2, #14, MUL VL]
+	ldr	p15, [x2, #15, MUL VL]
+1:
+
+	// Do the syscall
+	svc	#0
+
+	// Save GPRs x8-x30
+	ldr	x2, =gpr_out
+	add	x2, x2, #64
+	stp	x8, x9, [x2], #16
+	stp	x10, x11, [x2], #16
+	stp	x12, x13, [x2], #16
+	stp	x14, x15, [x2], #16
+	stp	x16, x17, [x2], #16
+	stp	x18, x19, [x2], #16
+	stp	x20, x21, [x2], #16
+	stp	x22, x23, [x2], #16
+	stp	x24, x25, [x2], #16
+	stp	x26, x27, [x2], #16
+	stp	x28, x29, [x2], #16
+	str	x30, [x2]
+
+	// Restore x0 and x1 for feature checks
+	ldp	x0, x1, [sp, #16]
+
+	// Save FPSIMD state
+	ldr	x2, =fpr_out
+	stp	q0, q1, [x2]
+	stp	q2, q3, [x2, #16 * 2]
+	stp	q4, q5, [x2, #16 * 4]
+	stp	q6, q7, [x2, #16 * 6]
+	stp	q8, q9, [x2, #16 * 8]
+	stp	q10, q11, [x2, #16 * 10]
+	stp	q12, q13, [x2, #16 * 12]
+	stp	q14, q15, [x2, #16 * 14]
+	stp	q16, q17, [x2, #16 * 16]
+	stp	q18, q19, [x2, #16 * 18]
+	stp	q20, q21, [x2, #16 * 20]
+	stp	q22, q23, [x2, #16 * 22]
+	stp	q24, q25, [x2, #16 * 24]
+	stp	q26, q27, [x2, #16 * 26]
+	stp	q28, q29, [x2, #16 * 28]
+	stp	q30, q31, [x2, #16 * 30]
+
+	// Save the SVE state if we have some
+	cbz	x0, 1f
+
+	ldr	x2, =z_out
+	str	z0, [x2, #0, MUL VL]
+	str	z1, [x2, #1, MUL VL]
+	str	z2, [x2, #2, MUL VL]
+	str	z3, [x2, #3, MUL VL]
+	str	z4, [x2, #4, MUL VL]
+	str	z5, [x2, #5, MUL VL]
+	str	z6, [x2, #6, MUL VL]
+	str	z7, [x2, #7, MUL VL]
+	str	z8, [x2, #8, MUL VL]
+	str	z9, [x2, #9, MUL VL]
+	str	z10, [x2, #10, MUL VL]
+	str	z11, [x2, #11, MUL VL]
+	str	z12, [x2, #12, MUL VL]
+	str	z13, [x2, #13, MUL VL]
+	str	z14, [x2, #14, MUL VL]
+	str	z15, [x2, #15, MUL VL]
+	str	z16, [x2, #16, MUL VL]
+	str	z17, [x2, #17, MUL VL]
+	str	z18, [x2, #18, MUL VL]
+	str	z19, [x2, #19, MUL VL]
+	str	z20, [x2, #20, MUL VL]
+	str	z21, [x2, #21, MUL VL]
+	str	z22, [x2, #22, MUL VL]
+	str	z23, [x2, #23, MUL VL]
+	str	z24, [x2, #24, MUL VL]
+	str	z25, [x2, #25, MUL VL]
+	str	z26, [x2, #26, MUL VL]
+	str	z27, [x2, #27, MUL VL]
+	str	z28, [x2, #28, MUL VL]
+	str	z29, [x2, #29, MUL VL]
+	str	z30, [x2, #30, MUL VL]
+	str	z31, [x2, #31, MUL VL]
+
+	ldr	x2, =p_out
+	str	p0, [x2, #0, MUL VL]
+	str	p1, [x2, #1, MUL VL]
+	str	p2, [x2, #2, MUL VL]
+	str	p3, [x2, #3, MUL VL]
+	str	p4, [x2, #4, MUL VL]
+	str	p5, [x2, #5, MUL VL]
+	str	p6, [x2, #6, MUL VL]
+	str	p7, [x2, #7, MUL VL]
+	str	p8, [x2, #8, MUL VL]
+	str	p9, [x2, #9, MUL VL]
+	str	p10, [x2, #10, MUL VL]
+	str	p11, [x2, #11, MUL VL]
+	str	p12, [x2, #12, MUL VL]
+	str	p13, [x2, #13, MUL VL]
+	str	p14, [x2, #14, MUL VL]
+	str	p15, [x2, #15, MUL VL]
+
+	ldr	x2, =ffr_out
+	rdffr	p0.b
+	str	p0, [x2, #0]
+1:
+
+	// Restore callee saved registers x19-x30
+	ldp	x19, x20, [sp, #32]
+	ldp	x21, x22, [sp, #48]
+	ldp	x23, x24, [sp, #64]
+	ldp	x25, x26, [sp, #80]
+	ldp	x27, x28, [sp, #96]
+	ldp	x29, x30, [sp], #112
+
+	ret
diff --git a/tools/testing/selftests/arm64/abi/syscall-abi.c b/tools/testing/selftests/arm64/abi/syscall-abi.c
new file mode 100644
index 000000000000..d8eeeafb50dc
--- /dev/null
+++ b/tools/testing/selftests/arm64/abi/syscall-abi.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 2021 ARM Limited.
+ */
+
+#include <errno.h>
+#include <stdbool.h>
+#include <stddef.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <sys/auxv.h>
+#include <sys/prctl.h>
+#include <asm/hwcap.h>
+#include <asm/sigcontext.h>
+#include <asm/unistd.h>
+
+#include "../../kselftest.h"
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof(a[0]))
+#define NUM_VL ((SVE_VQ_MAX - SVE_VQ_MIN) + 1)
+
+extern void do_syscall(int sve_vl);
+
+static void fill_random(void *buf, size_t size)
+{
+	int i;
+	uint32_t *lbuf = buf;
+
+	/* random() returns a 32 bit number regardless of the size of long */
+	for (i = 0; i < size / sizeof(uint32_t); i++)
+		lbuf[i] = random();
+}
+
+/*
+ * We also repeat the test for several syscalls to try to expose different
+ * behaviour.
+ */
+static struct syscall_cfg {
+	int syscall_nr;
+	const char *name;
+} syscalls[] = {
+	{ __NR_getpid,		"getpid()" },
+	{ __NR_sched_yield,	"sched_yield()" },
+};
+
+#define NUM_GPR 31
+uint64_t gpr_in[NUM_GPR];
+uint64_t gpr_out[NUM_GPR];
+
+static void setup_gpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(gpr_in, sizeof(gpr_in));
+	gpr_in[8] = cfg->syscall_nr;
+	memset(gpr_out, 0, sizeof(gpr_out));
+}
+
+static int check_gpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	/*
+	 * GPR x0-x7 may be clobbered, and all others should be preserved.
+	 */
+	for (i = 9; i < ARRAY_SIZE(gpr_in); i++) {
+		if (gpr_in[i] != gpr_out[i]) {
+			ksft_print_msg("%s SVE VL %d mismatch in GPR %d: %llx != %llx\n",
+				       cfg->name, sve_vl, i,
+				       gpr_in[i], gpr_out[i]);
+			errors++;
+		}
+	}
+
+	return errors;
+}
+
+#define NUM_FPR 32
+uint64_t fpr_in[NUM_FPR * 2];
+uint64_t fpr_out[NUM_FPR * 2];
+
+static void setup_fpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(fpr_in, sizeof(fpr_in));
+	memset(fpr_out, 0, sizeof(fpr_out));
+}
+
+static int check_fpr(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	if (!sve_vl) {
+		for (i = 0; i < ARRAY_SIZE(fpr_in); i++) {
+			if (fpr_in[i] != fpr_out[i]) {
+				ksft_print_msg("%s Q%d/%d mismatch %llx != %llx\n",
+					       cfg->name,
+					       i / 2, i % 2,
+					       fpr_in[i], fpr_out[i]);
+				errors++;
+			}
+		}
+	}
+
+	return errors;
+}
+
+static uint8_t z_zero[__SVE_ZREG_SIZE(SVE_VQ_MAX)];
+uint8_t z_in[SVE_NUM_PREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+uint8_t z_out[SVE_NUM_PREGS * __SVE_ZREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_z(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(z_in, sizeof(z_in));
+	fill_random(z_out, sizeof(z_out));
+}
+
+static int check_z(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vl;
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/*
+	 * After a syscall the low 128 bits of the Z registers should
+	 * be preserved and the rest be zeroed or preserved.
+	 */
+	for (i = 0; i < SVE_NUM_ZREGS; i++) {
+		void *in = &z_in[reg_size * i];
+		void *out = &z_out[reg_size * i];
+
+		if (memcmp(in, out, SVE_VQ_BYTES) != 0) {
+			ksft_print_msg("%s SVE VL %d Z%d low 128 bits changed\n",
+				       cfg->name, sve_vl, i);
+			errors++;
+		}
+	}
+
+	return errors;
+}
+
+uint8_t p_in[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t p_out[SVE_NUM_PREGS * __SVE_PREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_p(struct syscall_cfg *cfg, int sve_vl)
+{
+	fill_random(p_in, sizeof(p_in));
+	fill_random(p_out, sizeof(p_out));
+}
+
+static int check_p(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vq_from_vl(sve_vl) * 2; /* 1 bit per VL byte */
+
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/* After a syscall the P registers should be preserved or zeroed */
+	for (i = 0; i < SVE_NUM_PREGS * reg_size; i++)
+		if (p_out[i] && (p_in[i] != p_out[i]))
+			errors++;
+	if (errors)
+		ksft_print_msg("%s SVE VL %d predicate registers non-zero\n",
+			       cfg->name, sve_vl);
+
+	return errors;
+}
+
+uint8_t ffr_in[__SVE_PREG_SIZE(SVE_VQ_MAX)];
+uint8_t ffr_out[__SVE_PREG_SIZE(SVE_VQ_MAX)];
+
+static void setup_ffr(struct syscall_cfg *cfg, int sve_vl)
+{
+	/*
+	 * It is only valid to set a contiguous set of bits starting
+	 * at 0.  For now since we're expecting this to be cleared by
+	 * a syscall just set all bits.
+	 */
+	memset(ffr_in, 0xff, sizeof(ffr_in));
+	fill_random(ffr_out, sizeof(ffr_out));
+}
+
+static int check_ffr(struct syscall_cfg *cfg, int sve_vl)
+{
+	size_t reg_size = sve_vq_from_vl(sve_vl) * 2;  /* 1 bit per VL byte */
+	int errors = 0;
+	int i;
+
+	if (!sve_vl)
+		return 0;
+
+	/* After a syscall the P registers should be preserved or zeroed */
+	for (i = 0; i < reg_size; i++)
+		if (ffr_out[i] && (ffr_in[i] != ffr_out[i]))
+			errors++;
+	if (errors)
+		ksft_print_msg("%s SVE VL %d FFR non-zero\n",
+			       cfg->name, sve_vl);
+
+	return errors;
+}
+
+typedef void (*setup_fn)(struct syscall_cfg *cfg, int sve_vl);
+typedef int (*check_fn)(struct syscall_cfg *cfg, int sve_vl);
+
+/*
+ * Each set of registers has a setup function which is called before
+ * the syscall to fill values in a global variable for loading by the
+ * test code and a check function which validates that the results are
+ * as expected.  Vector lengths are passed everywhere, a vector length
+ * of 0 should be treated as do not test.
+ */
+static struct {
+	setup_fn setup;
+	check_fn check;
+} regset[] = {
+	{ setup_gpr, check_gpr },
+	{ setup_fpr, check_fpr },
+	{ setup_z, check_z },
+	{ setup_p, check_p },
+	{ setup_ffr, check_ffr },
+};
+
+static bool do_test(struct syscall_cfg *cfg, int sve_vl)
+{
+	int errors = 0;
+	int i;
+
+	for (i = 0; i < ARRAY_SIZE(regset); i++)
+		regset[i].setup(cfg, sve_vl);
+
+	do_syscall(sve_vl);
+
+	for (i = 0; i < ARRAY_SIZE(regset); i++)
+		errors += regset[i].check(cfg, sve_vl);
+
+	return errors == 0;
+}
+
+static void test_one_syscall(struct syscall_cfg *cfg)
+{
+	int sve_vq, sve_vl;
+
+	/* FPSIMD only case */
+	ksft_test_result(do_test(cfg, 0),
+			 "%s FPSIMD\n", cfg->name);
+
+	if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
+		return;
+
+	for (sve_vq = SVE_VQ_MAX; sve_vq > 0; --sve_vq) {
+		sve_vl = prctl(PR_SVE_SET_VL, sve_vq * 16);
+		if (sve_vl == -1)
+			ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
+					   strerror(errno), errno);
+
+		sve_vl &= PR_SVE_VL_LEN_MASK;
+
+		if (sve_vq != sve_vq_from_vl(sve_vl))
+			sve_vq = sve_vq_from_vl(sve_vl);
+
+		ksft_test_result(do_test(cfg, sve_vl),
+				 "%s SVE VL %d\n", cfg->name, sve_vl);
+	}
+}
+
+int sve_count_vls(void)
+{
+	unsigned int vq;
+	int vl_count = 0;
+	int vl;
+
+	if (!(getauxval(AT_HWCAP) & HWCAP_SVE))
+		return 0;
+
+	/*
+	 * Enumerate up to SVE_VQ_MAX vector lengths
+	 */
+	for (vq = SVE_VQ_MAX; vq > 0; --vq) {
+		vl = prctl(PR_SVE_SET_VL, vq * 16);
+		if (vl == -1)
+			ksft_exit_fail_msg("PR_SVE_SET_VL failed: %s (%d)\n",
+					   strerror(errno), errno);
+
+		vl &= PR_SVE_VL_LEN_MASK;
+
+		if (vq != sve_vq_from_vl(vl))
+			vq = sve_vq_from_vl(vl);
+
+		vl_count++;
+	}
+
+	return vl_count;
+}
+
+int main(void)
+{
+	int i;
+
+	srandom(getpid());
+
+	ksft_print_header();
+	ksft_set_plan(ARRAY_SIZE(syscalls) * (sve_count_vls() + 1));
+
+	for (i = 0; i < ARRAY_SIZE(syscalls); i++)
+		test_one_syscall(&syscalls[i]);
+
+	ksft_print_cnts();
+
+	return 0;
+}

From 12b792e5e2344772434ea817572bedde8ad9c172 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Tue, 7 Dec 2021 16:32:50 +0000
Subject: [PATCH 68/81] arm64/fp: Add comments documenting the usage of state
 restore functions

Add comments to help people figure out when fpsimd_bind_state_to_cpu() and
fpsimd_update_current_state() are used.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211207163250.1373542-1-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/fpsimd.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c
index fa244c426f61..aad59051b3b7 100644
--- a/arch/arm64/kernel/fpsimd.c
+++ b/arch/arm64/kernel/fpsimd.c
@@ -1213,7 +1213,8 @@ void fpsimd_bind_state_to_cpu(struct user_fpsimd_state *st, void *sve_state,
 /*
  * Load the userland FPSIMD state of 'current' from memory, but only if the
  * FPSIMD state already held in the registers is /not/ the most recent FPSIMD
- * state of 'current'
+ * state of 'current'.  This is called when we are preparing to return to
+ * userspace to ensure that userspace sees a good register state.
  */
 void fpsimd_restore_current_state(void)
 {
@@ -1244,7 +1245,9 @@ void fpsimd_restore_current_state(void)
 /*
  * Load an updated userland FPSIMD state for 'current' from memory and set the
  * flag that indicates that the FPSIMD register contents are the most recent
- * FPSIMD state of 'current'
+ * FPSIMD state of 'current'. This is used by the signal code to restore the
+ * register state when returning from a signal handler in FPSIMD only cases,
+ * any SVE context will be discarded.
  */
 void fpsimd_update_current_state(struct user_fpsimd_state const *state)
 {

From 2c94ebedc844d64459bfe73b25ae725c3f040526 Mon Sep 17 00:00:00 2001
From: Mark Brown <broonie@kernel.org>
Date: Thu, 2 Dec 2021 16:51:07 +0000
Subject: [PATCH 69/81] kselftest/arm64: Add pidbench for floating point
 syscall cases

Since it's likely to be useful for performance work with SVE let's have a
pidbench that gives us some numbers for consideration. In order to ensure
that we test exactly the scenario we want this is written in assembly - if
system libraries use SVE this would stop us exercising the case where the
process has never used SVE.

We exercise three cases:

 - Never having used SVE.
 - Having used SVE once.
 - Using SVE after each syscall.

by spinning running getpid() for a fixed number of iterations with the
time measured using CNTVCT_EL0 reported on the console. This is obviously
a totally unrealistic benchmark which will show the extremes of any
performance variation but equally given the potential gotchas with use of
FP instructions by system libraries it's good to have some concrete code
shared to make it easier to compare notes on results.

Testing over multiple SVE vector lengths will need to be done with vlset
currently, the test could be extended to iterate over all of them if
desired.

Signed-off-by: Mark Brown <broonie@kernel.org>
Link: https://lore.kernel.org/r/20211202165107.1075259-1-broonie@kernel.org
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 tools/testing/selftests/arm64/fp/.gitignore   |  1 +
 tools/testing/selftests/arm64/fp/Makefile     |  4 +-
 .../testing/selftests/arm64/fp/fp-pidbench.S  | 71 +++++++++++++++++++
 3 files changed, 75 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/arm64/fp/fp-pidbench.S

diff --git a/tools/testing/selftests/arm64/fp/.gitignore b/tools/testing/selftests/arm64/fp/.gitignore
index b67395903b9b..c50d86331ed2 100644
--- a/tools/testing/selftests/arm64/fp/.gitignore
+++ b/tools/testing/selftests/arm64/fp/.gitignore
@@ -1,3 +1,4 @@
+fp-pidbench
 fpsimd-test
 rdvl-sve
 sve-probe-vls
diff --git a/tools/testing/selftests/arm64/fp/Makefile b/tools/testing/selftests/arm64/fp/Makefile
index ba1488c7c315..95f0b877a060 100644
--- a/tools/testing/selftests/arm64/fp/Makefile
+++ b/tools/testing/selftests/arm64/fp/Makefile
@@ -2,13 +2,15 @@
 
 CFLAGS += -I../../../../../usr/include/
 TEST_GEN_PROGS := sve-ptrace sve-probe-vls vec-syscfg
-TEST_PROGS_EXTENDED := fpsimd-test fpsimd-stress \
+TEST_PROGS_EXTENDED := fp-pidbench fpsimd-test fpsimd-stress \
 	rdvl-sve \
 	sve-test sve-stress \
 	vlset
 
 all: $(TEST_GEN_PROGS) $(TEST_PROGS_EXTENDED)
 
+fp-pidbench: fp-pidbench.S asm-utils.o
+	$(CC) -nostdlib $^ -o $@
 fpsimd-test: fpsimd-test.o asm-utils.o
 	$(CC) -nostdlib $^ -o $@
 rdvl-sve: rdvl-sve.o rdvl.o
diff --git a/tools/testing/selftests/arm64/fp/fp-pidbench.S b/tools/testing/selftests/arm64/fp/fp-pidbench.S
new file mode 100644
index 000000000000..16a436389bfc
--- /dev/null
+++ b/tools/testing/selftests/arm64/fp/fp-pidbench.S
@@ -0,0 +1,71 @@
+// SPDX-License-Identifier: GPL-2.0-only
+// Copyright (C) 2021 ARM Limited.
+// Original author: Mark Brown <broonie@kernel.org>
+//
+// Trivial syscall overhead benchmark.
+//
+// This is implemented in asm to ensure that we don't have any issues with
+// system libraries using instructions that disrupt the test.
+
+#include <asm/unistd.h>
+#include "assembler.h"
+
+.arch_extension sve
+
+.macro test_loop per_loop
+	mov	x10, x20
+	mov	x8, #__NR_getpid
+	mrs	x11, CNTVCT_EL0
+1:
+	\per_loop
+	svc	#0
+	sub	x10, x10, #1
+	cbnz	x10, 1b
+
+	mrs	x12, CNTVCT_EL0
+	sub	x0, x12, x11
+	bl	putdec
+	puts	"\n"
+.endm
+
+// Main program entry point
+.globl _start
+function _start
+_start:
+	puts	"Iterations per test: "
+	mov	x20, #10000
+	lsl	x20, x20, #8
+	mov	x0, x20
+	bl	putdec
+	puts	"\n"
+
+	// Test having never used SVE
+	puts	"No SVE: "
+	test_loop
+
+	// Check for SVE support - should use hwcap but that's hard in asm
+	mrs	x0, ID_AA64PFR0_EL1
+	ubfx	x0, x0, #32, #4
+	cbnz	x0, 1f
+	puts	"System does not support SVE\n"
+	b	out
+1:
+
+	// Execute a SVE instruction
+	puts	"SVE VL: "
+	rdvl	x0, #8
+	bl	putdec
+	puts	"\n"
+
+	puts	"SVE used once: "
+	test_loop
+
+	// Use SVE per syscall
+	puts	"SVE used per syscall: "
+	test_loop "rdvl x0, #8"
+
+	//  And we're done
+out:
+	mov	x0, #0
+	mov	x8, #__NR_exit
+	svc	#0

From dd03762ab608e058c8f390ad9cf667e490089796 Mon Sep 17 00:00:00 2001
From: Kefeng Wang <wangkefeng.wang@huawei.com>
Date: Sat, 11 Dec 2021 21:17:34 +0800
Subject: [PATCH 70/81] arm64: Enable KCSAN

This patch enables KCSAN for arm64, with updates to build rules
to not use KCSAN for several incompatible compilation units.

Recent GCC version(at least GCC10) made outline-atomics as the
default option(unlike Clang), which will cause linker errors
for kernel/kcsan/core.o. Disables the out-of-line atomics by
no-outline-atomics to fix the linker errors.

Meanwhile, as Mark said[1], some latent issues are needed to be
fixed which isn't just a KCSAN problem, we make the KCSAN depends
on EXPERT for now.

Tested selftest and kcsan_test(built with GCC11 and Clang 13),
and all passed.

[1] https://lkml.kernel.org/r/YadiUPpJ0gADbiHQ@FVFF77S0Q05N

Acked-by: Marco Elver <elver@google.com> # kernel/kcsan
Tested-by: Joey Gouly <joey.gouly@arm.com>
Signed-off-by: Kefeng Wang <wangkefeng.wang@huawei.com>
Link: https://lore.kernel.org/r/20211211131734.126874-1-wangkefeng.wang@huawei.com
[catalin.marinas@arm.com: added comment to justify EXPERT]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/Kconfig               | 2 ++
 arch/arm64/kernel/vdso/Makefile  | 1 +
 arch/arm64/kvm/hyp/nvhe/Makefile | 1 +
 kernel/kcsan/Makefile            | 1 +
 4 files changed, 5 insertions(+)

diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig
index c4207cf9bb17..1d98e935c4d6 100644
--- a/arch/arm64/Kconfig
+++ b/arch/arm64/Kconfig
@@ -150,6 +150,8 @@ config ARM64
 	select HAVE_ARCH_KASAN_VMALLOC if HAVE_ARCH_KASAN
 	select HAVE_ARCH_KASAN_SW_TAGS if HAVE_ARCH_KASAN
 	select HAVE_ARCH_KASAN_HW_TAGS if (HAVE_ARCH_KASAN && ARM64_MTE)
+	# Some instrumentation may be unsound, hence EXPERT
+	select HAVE_ARCH_KCSAN if EXPERT
 	select HAVE_ARCH_KFENCE
 	select HAVE_ARCH_KGDB
 	select HAVE_ARCH_MMAP_RND_BITS
diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile
index 700767dfd221..60813497a381 100644
--- a/arch/arm64/kernel/vdso/Makefile
+++ b/arch/arm64/kernel/vdso/Makefile
@@ -32,6 +32,7 @@ ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO
 CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) \
 				$(CC_FLAGS_LTO)
 KASAN_SANITIZE			:= n
+KCSAN_SANITIZE			:= n
 UBSAN_SANITIZE			:= n
 OBJECT_FILES_NON_STANDARD	:= y
 KCOV_INSTRUMENT			:= n
diff --git a/arch/arm64/kvm/hyp/nvhe/Makefile b/arch/arm64/kvm/hyp/nvhe/Makefile
index c3c11974fa3b..24b2c2425b38 100644
--- a/arch/arm64/kvm/hyp/nvhe/Makefile
+++ b/arch/arm64/kvm/hyp/nvhe/Makefile
@@ -89,6 +89,7 @@ KBUILD_CFLAGS := $(filter-out $(CC_FLAGS_FTRACE) $(CC_FLAGS_SCS) $(CC_FLAGS_CFI)
 # cause crashes. Just disable it.
 GCOV_PROFILE	:= n
 KASAN_SANITIZE	:= n
+KCSAN_SANITIZE	:= n
 UBSAN_SANITIZE	:= n
 KCOV_INSTRUMENT	:= n
 
diff --git a/kernel/kcsan/Makefile b/kernel/kcsan/Makefile
index c2bb07f5bcc7..e893b0e1d62a 100644
--- a/kernel/kcsan/Makefile
+++ b/kernel/kcsan/Makefile
@@ -8,6 +8,7 @@ CFLAGS_REMOVE_debugfs.o = $(CC_FLAGS_FTRACE)
 CFLAGS_REMOVE_report.o = $(CC_FLAGS_FTRACE)
 
 CFLAGS_core.o := $(call cc-option,-fno-conserve-stack) \
+	$(call cc-option,-mno-outline-atomics) \
 	-fno-stack-protector -DDISABLE_BRANCH_PROFILING
 
 obj-y := core.o debugfs.o report.o

From f2cefc0c2d2a0baf5be40388371c289c41db01b9 Mon Sep 17 00:00:00 2001
From: Yanteng Si <siyanteng01@gmail.com>
Date: Thu, 9 Dec 2021 17:19:22 +0800
Subject: [PATCH 71/81] docs/arm64: delete a space from tagged-address-abi

Since e71e2ace5721("userfaultfd: do not untag user pointers") which
introduced a warning:

linux/Documentation/arm64/tagged-address-abi.rst:52: WARNING: Unexpected indentation.

Let's fix it.

Signed-off-by: Yanteng Si <siyanteng@loongson.cn>
Link: https://lore.kernel.org/r/20211209091922.560979-1-siyanteng@loongson.cn
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/arm64/tagged-address-abi.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Documentation/arm64/tagged-address-abi.rst b/Documentation/arm64/tagged-address-abi.rst
index 0c9120ec58ae..540a1d4fc6c9 100644
--- a/Documentation/arm64/tagged-address-abi.rst
+++ b/Documentation/arm64/tagged-address-abi.rst
@@ -49,7 +49,7 @@ how the user addresses are used by the kernel:
 
    - ``brk()``, ``mmap()`` and the ``new_address`` argument to
      ``mremap()`` as these have the potential to alias with existing
-      user addresses.
+     user addresses.
 
      NOTE: This behaviour changed in v5.6 and so some earlier kernels may
      incorrectly accept valid tagged pointers for the ``brk()``,

From c2c529b27ceb394ff4d3273ed1f552195fc4d555 Mon Sep 17 00:00:00 2001
From: Mark Rutland <mark.rutland@arm.com>
Date: Mon, 6 Dec 2021 12:47:12 +0000
Subject: [PATCH 72/81] arm64: remove __dma_*_area() aliases

The __dma_inv_area() and __dma_clean_area() aliases make cache.S harder
to navigate, but don't gain us anything in practice.

For clarity, let's remove them along with their redundant comments. The
only users are __dma_map_area() and __dma_unmap_area(), which need to be
position independent, and can call __pi_dcache_inval_poc() and
__pi_dcache_clean_poc() directly.

There should be no functional change as a result of this patch.

Signed-off-by: Mark Rutland <mark.rutland@arm.com>
Cc: Ard Biesheuvel <ardb@kernel.org>
Cc: Fuad Tabba <tabba@google.com>
Cc: Marc Zyngier <maz@kernel.org>
Cc: Will Deacon <will@kernel.org>
Acked-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Mark Brown <broonie@kernel.org>
Acked-by: Ard Biesheuvel <ardb@kernel.org>
Link: https://lore.kernel.org/r/20211206124715.4101571-4-mark.rutland@arm.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/mm/cache.S | 24 +++---------------------
 1 file changed, 3 insertions(+), 21 deletions(-)

diff --git a/arch/arm64/mm/cache.S b/arch/arm64/mm/cache.S
index 5051b3c1a4f1..7d0563db4201 100644
--- a/arch/arm64/mm/cache.S
+++ b/arch/arm64/mm/cache.S
@@ -140,15 +140,7 @@ SYM_FUNC_END(dcache_clean_pou)
  *	- start   - kernel start address of region
  *	- end     - kernel end address of region
  */
-SYM_FUNC_START_LOCAL(__dma_inv_area)
 SYM_FUNC_START_PI(dcache_inval_poc)
-	/* FALLTHROUGH */
-
-/*
- *	__dma_inv_area(start, end)
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
 	dcache_line_size x2, x3
 	sub	x3, x2, #1
 	tst	x1, x3				// end cache line aligned?
@@ -167,7 +159,6 @@ SYM_FUNC_START_PI(dcache_inval_poc)
 	dsb	sy
 	ret
 SYM_FUNC_END_PI(dcache_inval_poc)
-SYM_FUNC_END(__dma_inv_area)
 
 /*
  *	dcache_clean_poc(start, end)
@@ -178,19 +169,10 @@ SYM_FUNC_END(__dma_inv_area)
  *	- start   - virtual start address of region
  *	- end     - virtual end address of region
  */
-SYM_FUNC_START_LOCAL(__dma_clean_area)
 SYM_FUNC_START_PI(dcache_clean_poc)
-	/* FALLTHROUGH */
-
-/*
- *	__dma_clean_area(start, end)
- *	- start   - virtual start address of region
- *	- end     - virtual end address of region
- */
 	dcache_by_line_op cvac, sy, x0, x1, x2, x3
 	ret
 SYM_FUNC_END_PI(dcache_clean_poc)
-SYM_FUNC_END(__dma_clean_area)
 
 /*
  *	dcache_clean_pop(start, end)
@@ -232,8 +214,8 @@ SYM_FUNC_END_PI(__dma_flush_area)
 SYM_FUNC_START_PI(__dma_map_area)
 	add	x1, x0, x1
 	cmp	w2, #DMA_FROM_DEVICE
-	b.eq	__dma_inv_area
-	b	__dma_clean_area
+	b.eq	__pi_dcache_inval_poc
+	b	__pi_dcache_clean_poc
 SYM_FUNC_END_PI(__dma_map_area)
 
 /*
@@ -245,6 +227,6 @@ SYM_FUNC_END_PI(__dma_map_area)
 SYM_FUNC_START_PI(__dma_unmap_area)
 	add	x1, x0, x1
 	cmp	w2, #DMA_TO_DEVICE
-	b.ne	__dma_inv_area
+	b.ne	__pi_dcache_inval_poc
 	ret
 SYM_FUNC_END_PI(__dma_unmap_area)

From dd73d18e7fc7adc7e70711ff2c92373ce8862c3f Mon Sep 17 00:00:00 2001
From: Catalin Marinas <catalin.marinas@arm.com>
Date: Fri, 17 Dec 2021 16:20:45 +0000
Subject: [PATCH 73/81] arm64: Ensure that the 'bti' macro is defined where
 linkage.h is included

Not all .S files include asm/assembler.h, however the SYM_FUNC_*
definitions invoke the 'bti' macro. Include asm/assembler.h in
asm/linkage.h.

Fixes: 9be34be87cc8 ("arm64: Add macro version of the BTI instruction")
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/include/asm/linkage.h | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/arch/arm64/include/asm/linkage.h b/arch/arm64/include/asm/linkage.h
index 9065e4749b42..b77e9b3f5371 100644
--- a/arch/arm64/include/asm/linkage.h
+++ b/arch/arm64/include/asm/linkage.h
@@ -1,6 +1,10 @@
 #ifndef __ASM_LINKAGE_H
 #define __ASM_LINKAGE_H
 
+#ifdef __ASSEMBLY__
+#include <asm/assembler.h>
+#endif
+
 #define __ALIGN		.align 2
 #define __ALIGN_STR	".align 2"
 

From d5624bb29f49b849ac8d1e9783dbf9c65cf33457 Mon Sep 17 00:00:00 2001
From: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Date: Tue, 21 Dec 2021 11:55:56 +0800
Subject: [PATCH 74/81] asm-generic: introduce io_stop_wc() and add
 implementation for ARM64

For memory accesses with write-combining attributes (e.g. those returned
by ioremap_wc()), the CPU may wait for prior accesses to be merged with
subsequent ones. But in some situation, such wait is bad for the
performance.

We introduce io_stop_wc() to prevent the merging of write-combining
memory accesses before this macro with those after it.

We add implementation for ARM64 using DGH instruction and provide NOP
implementation for other architectures.

Signed-off-by: Xiongfeng Wang <wangxiongfeng2@huawei.com>
Suggested-by: Will Deacon <will@kernel.org>
Suggested-by: Catalin Marinas <catalin.marinas@arm.com>
Acked-by: Arnd Bergmann <arnd@arndb.de>
Link: https://lore.kernel.org/r/20211221035556.60346-1-wangxiongfeng2@huawei.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 Documentation/memory-barriers.txt |  8 ++++++++
 arch/arm64/include/asm/barrier.h  |  9 +++++++++
 include/asm-generic/barrier.h     | 11 +++++++++++
 3 files changed, 28 insertions(+)

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index 7367ada13208..b12df9137e1c 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -1950,6 +1950,14 @@ There are some more advanced barrier functions:
      For load from persistent memory, existing read memory barriers are sufficient
      to ensure read ordering.
 
+ (*) io_stop_wc();
+
+     For memory accesses with write-combining attributes (e.g. those returned
+     by ioremap_wc(), the CPU may wait for prior accesses to be merged with
+     subsequent ones. io_stop_wc() can be used to prevent the merging of
+     write-combining memory accesses before this macro with those after it when
+     such wait has performance implications.
+
 ===============================
 IMPLICIT KERNEL MEMORY BARRIERS
 ===============================
diff --git a/arch/arm64/include/asm/barrier.h b/arch/arm64/include/asm/barrier.h
index 1c5a00598458..62217be36217 100644
--- a/arch/arm64/include/asm/barrier.h
+++ b/arch/arm64/include/asm/barrier.h
@@ -26,6 +26,14 @@
 #define __tsb_csync()	asm volatile("hint #18" : : : "memory")
 #define csdb()		asm volatile("hint #20" : : : "memory")
 
+/*
+ * Data Gathering Hint:
+ * This instruction prevents merging memory accesses with Normal-NC or
+ * Device-GRE attributes before the hint instruction with any memory accesses
+ * appearing after the hint instruction.
+ */
+#define dgh()		asm volatile("hint #6" : : : "memory")
+
 #ifdef CONFIG_ARM64_PSEUDO_NMI
 #define pmr_sync()						\
 	do {							\
@@ -46,6 +54,7 @@
 #define dma_rmb()	dmb(oshld)
 #define dma_wmb()	dmb(oshst)
 
+#define io_stop_wc()	dgh()
 
 #define tsb_csync()								\
 	do {									\
diff --git a/include/asm-generic/barrier.h b/include/asm-generic/barrier.h
index 640f09479bdf..4c2c1b830344 100644
--- a/include/asm-generic/barrier.h
+++ b/include/asm-generic/barrier.h
@@ -251,5 +251,16 @@ do {									\
 #define pmem_wmb()	wmb()
 #endif
 
+/*
+ * ioremap_wc() maps I/O memory as memory with write-combining attributes. For
+ * this kind of memory accesses, the CPU may wait for prior accesses to be
+ * merged with subsequent ones. In some situation, such wait is bad for the
+ * performance. io_stop_wc() can be used to prevent the merging of
+ * write-combining memory accesses before this macro with those after it.
+ */
+#ifndef io_stop_wc
+#define io_stop_wc do { } while (0)
+#endif
+
 #endif /* !__ASSEMBLY__ */
 #endif /* __ASM_GENERIC_BARRIER_H */

From 31e833b2031232493f2c30e53401e1f5ba293f97 Mon Sep 17 00:00:00 2001
From: "Guilherme G. Piccoli" <gpiccoli@igalia.com>
Date: Tue, 21 Dec 2021 12:52:30 -0300
Subject: [PATCH 75/81] arm64: Unhash early pointer print plus improve comment
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When facing a really early issue on DT parsing we have currently
a message that shows both the physical and virtual address of the FDT.
The printk pointer modifier for the virtual address shows a hashed
address there unless the user provides "no_hash_pointers" parameter in
the command-line. The situation in which this message shows-up is a bit
more serious though: the boot process is broken, nothing can be done
(even an oops is too much for this early stage) so we have this message
as a last resort in order to help debug bootloader issues, for example.
Hence, we hereby change that to "%px" in order to make debugging easy,
there's not much information leak risk in such early boot failure.

Also, we tried to improve a bit the commenting on that function, given
that if kernel fails there, it just hangs forever in a cpu_relax() loop.
The reason we cannot BUG/panic is that is too early to do so; thanks to
Mark Brown for pointing that on IRC and thanks Robin Murphy for the good
pointer hash discussion in the mailing-list.

Cc: Mark Brown <broonie@kernel.org>
Cc: Robin Murphy <robin.murphy@arm.com>
Signed-off-by: Guilherme G. Piccoli <gpiccoli@igalia.com>
Reviewed-by: Robin Murphy <robin.murphy@arm.com>
Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
Link: https://lore.kernel.org/r/20211221155230.1532850-1-gpiccoli@igalia.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/setup.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index be5f85b0a24d..a80430550a73 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -189,11 +189,16 @@ static void __init setup_machine_fdt(phys_addr_t dt_phys)
 
 	if (!dt_virt || !early_init_dt_scan(dt_virt)) {
 		pr_crit("\n"
-			"Error: invalid device tree blob at physical address %pa (virtual address 0x%p)\n"
+			"Error: invalid device tree blob at physical address %pa (virtual address 0x%px)\n"
 			"The dtb must be 8-byte aligned and must not exceed 2 MB in size\n"
 			"\nPlease check your bootloader.",
 			&dt_phys, dt_virt);
 
+		/*
+		 * Note that in this _really_ early stage we cannot even BUG()
+		 * or oops, so the least terrible thing to do is cpu_relax(),
+		 * or else we could end-up printing non-initialized data, etc.
+		 */
 		while (true)
 			cpu_relax();
 	}

From 38e0257e0e6f4fef2aa2966b089b56a8b1cfb75c Mon Sep 17 00:00:00 2001
From: D Scott Phillips <scott@os.amperecomputing.com>
Date: Mon, 20 Dec 2021 15:41:14 -0800
Subject: [PATCH 76/81] arm64: errata: Fix exec handling in erratum 1418040
 workaround

The erratum 1418040 workaround enables CNTVCT_EL1 access trapping in EL0
when executing compat threads. The workaround is applied when switching
between tasks, but the need for the workaround could also change at an
exec(), when a non-compat task execs a compat binary or vice versa. Apply
the workaround in arch_setup_new_exec().

This leaves a small window of time between SET_PERSONALITY and
arch_setup_new_exec where preemption could occur and confuse the old
workaround logic that compares TIF_32BIT between prev and next. Instead, we
can just read cntkctl to make sure it's in the state that the next task
needs. I measured cntkctl read time to be about the same as a mov from a
general-purpose register on N1. Update the workaround logic to examine the
current value of cntkctl instead of the previous task's compat state.

Fixes: d49f7d7376d0 ("arm64: Move handling of erratum 1418040 into C code")
Cc: <stable@vger.kernel.org> # 5.9.x
Signed-off-by: D Scott Phillips <scott@os.amperecomputing.com>
Reviewed-by: Marc Zyngier <maz@kernel.org>
Link: https://lore.kernel.org/r/20211220234114.3926-1-scott@os.amperecomputing.com
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/process.c | 39 +++++++++++++++----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c
index aacf2f5559a8..271d4bbf468e 100644
--- a/arch/arm64/kernel/process.c
+++ b/arch/arm64/kernel/process.c
@@ -439,34 +439,26 @@ static void entry_task_switch(struct task_struct *next)
 
 /*
  * ARM erratum 1418040 handling, affecting the 32bit view of CNTVCT.
- * Assuming the virtual counter is enabled at the beginning of times:
- *
- * - disable access when switching from a 64bit task to a 32bit task
- * - enable access when switching from a 32bit task to a 64bit task
+ * Ensure access is disabled when switching to a 32bit task, ensure
+ * access is enabled when switching to a 64bit task.
  */
-static void erratum_1418040_thread_switch(struct task_struct *prev,
-					  struct task_struct *next)
+static void erratum_1418040_thread_switch(struct task_struct *next)
 {
-	bool prev32, next32;
-	u64 val;
-
-	if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040))
+	if (!IS_ENABLED(CONFIG_ARM64_ERRATUM_1418040) ||
+	    !this_cpu_has_cap(ARM64_WORKAROUND_1418040))
 		return;
 
-	prev32 = is_compat_thread(task_thread_info(prev));
-	next32 = is_compat_thread(task_thread_info(next));
-
-	if (prev32 == next32 || !this_cpu_has_cap(ARM64_WORKAROUND_1418040))
-		return;
-
-	val = read_sysreg(cntkctl_el1);
-
-	if (!next32)
-		val |= ARCH_TIMER_USR_VCT_ACCESS_EN;
+	if (is_compat_thread(task_thread_info(next)))
+		sysreg_clear_set(cntkctl_el1, ARCH_TIMER_USR_VCT_ACCESS_EN, 0);
 	else
-		val &= ~ARCH_TIMER_USR_VCT_ACCESS_EN;
+		sysreg_clear_set(cntkctl_el1, 0, ARCH_TIMER_USR_VCT_ACCESS_EN);
+}
 
-	write_sysreg(val, cntkctl_el1);
+static void erratum_1418040_new_exec(void)
+{
+	preempt_disable();
+	erratum_1418040_thread_switch(current);
+	preempt_enable();
 }
 
 /*
@@ -501,7 +493,7 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev,
 	contextidr_thread_switch(next);
 	entry_task_switch(next);
 	ssbs_thread_switch(next);
-	erratum_1418040_thread_switch(prev, next);
+	erratum_1418040_thread_switch(next);
 	ptrauth_thread_switch_user(next);
 
 	/*
@@ -611,6 +603,7 @@ void arch_setup_new_exec(void)
 	current->mm->context.flags = mmflags;
 	ptrauth_thread_init_user();
 	mte_thread_init_user();
+	erratum_1418040_new_exec();
 
 	if (task_spec_ssb_noexec(current)) {
 		arch_prctl_spec_ctrl_set(current, PR_SPEC_STORE_BYPASS,

From 527a7f52529fd661d200e6c4412b6d2528d1a6e8 Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 4 Jan 2022 13:34:12 +0000
Subject: [PATCH 77/81] perf/smmuv3: Fix unused variable warning when
 CONFIG_OF=n

The kbuild robot reports that building the SMMUv3 PMU driver with
CONFIG_OF=n results in a warning for W=1 builds:

>> drivers/perf/arm_smmuv3_pmu.c:889:34: warning: unused variable 'smmu_pmu_of_match' [-Wunused-const-variable]
   static const struct of_device_id smmu_pmu_of_match[] = {
                                    ^

Guard the match table with #ifdef CONFIG_OF.

Link: https://lore.kernel.org/r/202201041700.01KZEzhb-lkp@intel.com
Fixes: 3f7be4356176 ("perf/smmuv3: Add devicetree support")
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/arm_smmuv3_pmu.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c
index 598d6978280d..1ae19f7301b2 100644
--- a/drivers/perf/arm_smmuv3_pmu.c
+++ b/drivers/perf/arm_smmuv3_pmu.c
@@ -939,11 +939,13 @@ static void smmu_pmu_shutdown(struct platform_device *pdev)
 	smmu_pmu_disable(&smmu_pmu->pmu);
 }
 
+#ifdef CONFIG_OF
 static const struct of_device_id smmu_pmu_of_match[] = {
 	{ .compatible = "arm,smmu-v3-pmcg" },
 	{}
 };
 MODULE_DEVICE_TABLE(of, smmu_pmu_of_match);
+#endif
 
 static struct platform_driver smmu_pmu_driver = {
 	.driver = {

From 2da56881a7f8fdd887e6c2d250db58f1abdad0b6 Mon Sep 17 00:00:00 2001
From: Dan Carpenter <dan.carpenter@oracle.com>
Date: Fri, 17 Dec 2021 17:59:08 +0300
Subject: [PATCH 78/81] drivers: perf: marvell_cn10k: fix an IS_ERR() vs NULL
 check

The devm_ioremap() function does not return error pointers.  It returns
NULL.

Fixes: 036a7584bede ("drivers: perf: Add LLC-TAD perf counter support")
Signed-off-by: Dan Carpenter <dan.carpenter@oracle.com>
Link: https://lore.kernel.org/r/20211217145907.GA16611@kili
Signed-off-by: Will Deacon <will@kernel.org>
---
 drivers/perf/marvell_cn10k_tad_pmu.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/perf/marvell_cn10k_tad_pmu.c b/drivers/perf/marvell_cn10k_tad_pmu.c
index 250dd4c52d70..7f4d292658e3 100644
--- a/drivers/perf/marvell_cn10k_tad_pmu.c
+++ b/drivers/perf/marvell_cn10k_tad_pmu.c
@@ -312,7 +312,7 @@ static int tad_pmu_probe(struct platform_device *pdev)
 		regions[i].base = devm_ioremap(&pdev->dev,
 					       res->start,
 					       tad_pmu_page_size);
-		if (IS_ERR(regions[i].base)) {
+		if (!regions[i].base) {
 			dev_err(&pdev->dev, "TAD%d ioremap fail\n", i);
 			return -ENOMEM;
 		}

From 3da4390bcdf4dcea5eb7961f1ba05f75c642a39d Mon Sep 17 00:00:00 2001
From: Will Deacon <will@kernel.org>
Date: Tue, 4 Jan 2022 14:57:14 +0000
Subject: [PATCH 79/81] arm64: perf: Don't register user access sysctl handler
 multiple times

Commit e2012600810c ("arm64: perf: Add userspace counter access disable
switch") introduced a new 'perf_user_access' sysctl file to enable and
disable direct userspace access to the PMU counters. Sadly, Geert
reports that on his big.LITTLE SoC ('Renesas Salvator-XS w/ R-Car H3'),
the file is created for each PMU type probed, resulting in a splat
during boot:

  | hw perfevents: enabled with armv8_cortex_a53 PMU driver, 7 counters available
  | sysctl duplicate entry: /kernel//perf_user_access
  | CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.16.0-rc3-arm64-renesas-00003-ge2012600810c #1420
  | Hardware name: Renesas Salvator-X 2nd version board based on r8a77951 (DT)
  | Call trace:
  |  dump_backtrace+0x0/0x190
  |  show_stack+0x14/0x20
  |  dump_stack_lvl+0x88/0xb0
  |  dump_stack+0x14/0x2c
  |  __register_sysctl_table+0x384/0x818
  |  register_sysctl+0x20/0x28
  |  armv8_pmu_init.constprop.0+0x118/0x150
  |  armv8_a57_pmu_init+0x1c/0x28
  |  arm_pmu_device_probe+0x1b4/0x558
  |  armv8_pmu_device_probe+0x18/0x20
  |  platform_probe+0x64/0xd0
  |  hw perfevents: enabled with armv8_cortex_a57 PMU driver, 7 counters available

Introduce a state variable to track creation of the sysctl file and
ensure that it is only created once.

Reported-by: Geert Uytterhoeven <geert@linux-m68k.org>
Fixes: e2012600810c ("arm64: perf: Add userspace counter access disable switch")
Link: https://lore.kernel.org/r/CAMuHMdVcDxR9sGzc5pcnORiotonERBgc6dsXZXMd6wTvLGA9iw@mail.gmail.com
Signed-off-by: Will Deacon <will@kernel.org>
---
 arch/arm64/kernel/perf_event.c | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c
index 4c5c8d731fbb..cab678ed6618 100644
--- a/arch/arm64/kernel/perf_event.c
+++ b/arch/arm64/kernel/perf_event.c
@@ -1214,6 +1214,14 @@ static struct ctl_table armv8_pmu_sysctl_table[] = {
 	{ }
 };
 
+static void armv8_pmu_register_sysctl_table(void)
+{
+	static u32 tbl_registered = 0;
+
+	if (!cmpxchg_relaxed(&tbl_registered, 0, 1))
+		register_sysctl("kernel", armv8_pmu_sysctl_table);
+}
+
 static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 			  int (*map_event)(struct perf_event *event),
 			  const struct attribute_group *events,
@@ -1248,8 +1256,7 @@ static int armv8_pmu_init(struct arm_pmu *cpu_pmu, char *name,
 	cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_CAPS] = caps ?
 			caps : &armv8_pmuv3_caps_attr_group;
 
-	register_sysctl("kernel", armv8_pmu_sysctl_table);
-
+	armv8_pmu_register_sysctl_table();
 	return 0;
 }
 

From 89d30b11507db7c24fe54cd4ccc498c145406028 Mon Sep 17 00:00:00 2001
From: Kees Cook <keescook@chromium.org>
Date: Wed, 15 Dec 2021 11:18:35 -0800
Subject: [PATCH 80/81] arm64: Drop outdated links in comments

As started by commit 05a5f51ca566 ("Documentation: Replace lkml.org links
with lore"), an effort was made to replace lkml.org links with lore to
better use a single source that's more likely to stay available long-term.
However, it seems these links don't offer much value here, so just
remove them entirely.

Cc: Joe Perches <joe@perches.com>
Suggested-by: Will Deacon <will@kernel.org>
Link: https://lore.kernel.org/lkml/20210211100213.GA29813@willie-the-truck/
Signed-off-by: Kees Cook <keescook@chromium.org>
Link: https://lore.kernel.org/r/20211215191835.1420010-1-keescook@chromium.org
[catalin.marinas@arm.com: removed the arch/arm changes]
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/hibernate.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/arch/arm64/kernel/hibernate.c b/arch/arm64/kernel/hibernate.c
index 2758f75d6809..6328308be272 100644
--- a/arch/arm64/kernel/hibernate.c
+++ b/arch/arm64/kernel/hibernate.c
@@ -7,10 +7,6 @@
  * Ubuntu project, hibernation support for mach-dove
  * Copyright (C) 2010 Nokia Corporation (Hiroshi Doyu)
  * Copyright (C) 2010 Texas Instruments, Inc. (Teerth Reddy et al.)
- *  https://lkml.org/lkml/2010/6/18/4
- *  https://lists.linux-foundation.org/pipermail/linux-pm/2010-June/027422.html
- *  https://patchwork.kernel.org/patch/96442/
- *
  * Copyright (C) 2006 Rafael J. Wysocki <rjw@sisk.pl>
  */
 #define pr_fmt(x) "hibernate: " x

From daa149dd8cd4ad8dc4f8dd47bd24f15992b3a8c1 Mon Sep 17 00:00:00 2001
From: Huacai Chen <chenhuacai@loongson.cn>
Date: Fri, 22 Oct 2021 15:06:46 +0800
Subject: [PATCH 81/81] arm64: Use correct method to calculate nomap region
 boundaries

Nomap regions are treated as "reserved". When region boundaries are not
page aligned, we usually increase the "reserved" regions rather than
decrease them. So, we should use memblock_region_reserved_base_pfn()/
memblock_region_reserved_end_pfn() instead of memblock_region_memory_
base_pfn()/memblock_region_memory_base_pfn() to calculate boundaries.

Signed-off-by: Huacai Chen <chenhuacai@loongson.cn>
Link: https://lore.kernel.org/r/20211022070646.41923-1-chenhuacai@loongson.cn
Signed-off-by: Catalin Marinas <catalin.marinas@arm.com>
---
 arch/arm64/kernel/setup.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c
index a80430550a73..f70573928f1b 100644
--- a/arch/arm64/kernel/setup.c
+++ b/arch/arm64/kernel/setup.c
@@ -237,12 +237,14 @@ static void __init request_standard_resources(void)
 		if (memblock_is_nomap(region)) {
 			res->name  = "reserved";
 			res->flags = IORESOURCE_MEM;
+			res->start = __pfn_to_phys(memblock_region_reserved_base_pfn(region));
+			res->end = __pfn_to_phys(memblock_region_reserved_end_pfn(region)) - 1;
 		} else {
 			res->name  = "System RAM";
 			res->flags = IORESOURCE_SYSTEM_RAM | IORESOURCE_BUSY;
+			res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
+			res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
 		}
-		res->start = __pfn_to_phys(memblock_region_memory_base_pfn(region));
-		res->end = __pfn_to_phys(memblock_region_memory_end_pfn(region)) - 1;
 
 		request_resource(&iomem_resource, res);