diff --git a/Documentation/arm64/kasan-offsets.sh b/Documentation/arm64/kasan-offsets.sh index 2b7a021db363..2dc5f9e18039 100644 --- a/Documentation/arm64/kasan-offsets.sh +++ b/Documentation/arm64/kasan-offsets.sh @@ -1,12 +1,11 @@ #!/bin/sh # Print out the KASAN_SHADOW_OFFSETS required to place the KASAN SHADOW -# start address at the mid-point of the kernel VA space +# start address at the top of the linear region print_kasan_offset () { printf "%02d\t" $1 printf "0x%08x00000000\n" $(( (0xffffffff & (-1 << ($1 - 1 - 32))) \ - + (1 << ($1 - 32 - $2)) \ - (1 << (64 - 32 - $2)) )) } diff --git a/Documentation/arm64/memory.rst b/Documentation/arm64/memory.rst index cf03b3290800..e7522e5c8322 100644 --- a/Documentation/arm64/memory.rst +++ b/Documentation/arm64/memory.rst @@ -32,17 +32,16 @@ AArch64 Linux memory layout with 4KB pages + 4 levels (48-bit):: ----------------------------------------------------------------------- 0000000000000000 0000ffffffffffff 256TB user ffff000000000000 ffff7fffffffffff 128TB kernel logical memory map - ffff800000000000 ffff9fffffffffff 32TB kasan shadow region - ffffa00000000000 ffffa00007ffffff 128MB bpf jit region - ffffa00008000000 ffffa0000fffffff 128MB modules - ffffa00010000000 fffffdffbffeffff ~93TB vmalloc - fffffdffbfff0000 fffffdfffe5f8fff ~998MB [guard region] - fffffdfffe5f9000 fffffdfffe9fffff 4124KB fixed mappings - fffffdfffea00000 fffffdfffebfffff 2MB [guard region] - fffffdfffec00000 fffffdffffbfffff 16MB PCI I/O space - fffffdffffc00000 fffffdffffdfffff 2MB [guard region] - fffffdffffe00000 ffffffffffdfffff 2TB vmemmap - ffffffffffe00000 ffffffffffffffff 2MB [guard region] + [ffff600000000000 ffff7fffffffffff] 32TB [kasan shadow region] + ffff800000000000 ffff800007ffffff 128MB bpf jit region + ffff800008000000 ffff80000fffffff 128MB modules + ffff800010000000 fffffbffefffffff 124TB vmalloc + fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down) + fffffbfffe000000 fffffbfffe7fffff 8MB [guard region] + fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space + fffffbffff800000 fffffbffffffffff 8MB [guard region] + fffffc0000000000 fffffdffffffffff 2TB vmemmap + fffffe0000000000 ffffffffffffffff 2TB [guard region] AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support):: @@ -50,19 +49,17 @@ AArch64 Linux memory layout with 64KB pages + 3 levels (52-bit with HW support): Start End Size Use ----------------------------------------------------------------------- 0000000000000000 000fffffffffffff 4PB user - fff0000000000000 fff7ffffffffffff 2PB kernel logical memory map - fff8000000000000 fffd9fffffffffff 1440TB [gap] - fffda00000000000 ffff9fffffffffff 512TB kasan shadow region - ffffa00000000000 ffffa00007ffffff 128MB bpf jit region - ffffa00008000000 ffffa0000fffffff 128MB modules - ffffa00010000000 fffff81ffffeffff ~88TB vmalloc - fffff81fffff0000 fffffc1ffe58ffff ~3TB [guard region] - fffffc1ffe590000 fffffc1ffe9fffff 4544KB fixed mappings - fffffc1ffea00000 fffffc1ffebfffff 2MB [guard region] - fffffc1ffec00000 fffffc1fffbfffff 16MB PCI I/O space - fffffc1fffc00000 fffffc1fffdfffff 2MB [guard region] - fffffc1fffe00000 ffffffffffdfffff 3968GB vmemmap - ffffffffffe00000 ffffffffffffffff 2MB [guard region] + fff0000000000000 ffff7fffffffffff ~4PB kernel logical memory map + [fffd800000000000 ffff7fffffffffff] 512TB [kasan shadow region] + ffff800000000000 ffff800007ffffff 128MB bpf jit region + ffff800008000000 ffff80000fffffff 128MB modules + ffff800010000000 fffffbffefffffff 124TB vmalloc + fffffbfff0000000 fffffbfffdffffff 224MB fixed mappings (top down) + fffffbfffe000000 fffffbfffe7fffff 8MB [guard region] + fffffbfffe800000 fffffbffff7fffff 16MB PCI I/O space + fffffbffff800000 fffffbffffffffff 8MB [guard region] + fffffc0000000000 ffffffdfffffffff ~4TB vmemmap + ffffffe000000000 ffffffffffffffff 128GB [guard region] Translation table lookup with 4KB pages:: diff --git a/Documentation/arm64/tagged-pointers.rst b/Documentation/arm64/tagged-pointers.rst index eab4323609b9..19d284b70384 100644 --- a/Documentation/arm64/tagged-pointers.rst +++ b/Documentation/arm64/tagged-pointers.rst @@ -53,12 +53,25 @@ visibility. Preserving tags --------------- -Non-zero tags are not preserved when delivering signals. This means that -signal handlers in applications making use of tags cannot rely on the -tag information for user virtual addresses being maintained for fields -inside siginfo_t. One exception to this rule is for signals raised in -response to watchpoint debug exceptions, where the tag information will -be preserved. +When delivering signals, non-zero tags are not preserved in +siginfo.si_addr unless the flag SA_EXPOSE_TAGBITS was set in +sigaction.sa_flags when the signal handler was installed. This means +that signal handlers in applications making use of tags cannot rely +on the tag information for user virtual addresses being maintained +in these fields unless the flag was set. + +Due to architecture limitations, bits 63:60 of the fault address +are not preserved in response to synchronous tag check faults +(SEGV_MTESERR) even if SA_EXPOSE_TAGBITS was set. Applications should +treat the values of these bits as undefined in order to accommodate +future architecture revisions which may preserve the bits. + +For signals raised in response to watchpoint debug exceptions, the +tag information will be preserved regardless of the SA_EXPOSE_TAGBITS +flag setting. + +Non-zero tags are never preserved in sigcontext.fault_address +regardless of the SA_EXPOSE_TAGBITS flag setting. The architecture prevents the use of a tagged PC, so the upper byte will be set to a sign-extension of bit 55 on exception return. diff --git a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml index 5aad9f4e0b2a..80a92385367e 100644 --- a/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml +++ b/Documentation/devicetree/bindings/perf/fsl-imx-ddr.yaml @@ -15,6 +15,9 @@ properties: - enum: - fsl,imx8-ddr-pmu - fsl,imx8m-ddr-pmu + - fsl,imx8mq-ddr-pmu + - fsl,imx8mm-ddr-pmu + - fsl,imx8mn-ddr-pmu - fsl,imx8mp-ddr-pmu - items: - enum: diff --git a/arch/alpha/include/uapi/asm/signal.h b/arch/alpha/include/uapi/asm/signal.h index 74c750bf1c1a..a69dd8d080a8 100644 --- a/arch/alpha/include/uapi/asm/signal.h +++ b/arch/alpha/include/uapi/asm/signal.h @@ -60,20 +60,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ - #define SA_ONSTACK 0x00000001 #define SA_RESTART 0x00000002 #define SA_NOCLDSTOP 0x00000004 diff --git a/arch/arm/include/asm/signal.h b/arch/arm/include/asm/signal.h index 65530a042009..430be7774402 100644 --- a/arch/arm/include/asm/signal.h +++ b/arch/arm/include/asm/signal.h @@ -17,6 +17,8 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +#define __ARCH_UAPI_SA_FLAGS (SA_THIRTYTWO | SA_RESTORER) + #define __ARCH_HAS_SA_RESTORER #include diff --git a/arch/arm/include/uapi/asm/signal.h b/arch/arm/include/uapi/asm/signal.h index 9b4185ba4f8a..c9a3ea1d8d41 100644 --- a/arch/arm/include/uapi/asm/signal.h +++ b/arch/arm/include/uapi/asm/signal.h @@ -60,33 +60,12 @@ typedef unsigned long sigset_t; #define SIGSWI 32 /* - * SA_FLAGS values: - * - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_SIGINFO deliver the signal with SIGINFO structs - * SA_THIRTYTWO delivers the signal in 32-bit mode, even if the task - * is running in 26-bit. - * SA_ONSTACK allows alternate signal stacks (see sigaltstack(2)). - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NODEFER prevents the current signal from being masked in the handler. - * SA_RESETHAND clears the handler when the signal is delivered. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. + * SA_THIRTYTWO historically meant deliver the signal in 32-bit mode, even if + * the task is running in 26-bit. But since the kernel no longer supports + * 26-bit mode, the flag has no effect. */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 -#define SA_SIGINFO 0x00000004 #define SA_THIRTYTWO 0x02000000 #define SA_RESTORER 0x04000000 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 75aefc9990ea..362edd9385c2 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -171,6 +171,8 @@ config ARM64 select HAVE_NMI select HAVE_PATA_PLATFORM select HAVE_PERF_EVENTS + select HAVE_PERF_EVENTS_NMI if ARM64_PSEUDO_NMI && HW_PERF_EVENTS + select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API @@ -196,7 +198,6 @@ config ARM64 select PCI_SYSCALL if PCI select POWER_RESET select POWER_SUPPLY - select SET_FS select SPARSE_IRQ select SWIOTLB select SYSCTL_EXCEPTION_TRACE @@ -332,16 +333,16 @@ config BROKEN_GAS_INST config KASAN_SHADOW_OFFSET hex depends on KASAN - default 0xdfffa00000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS - default 0xdfffd00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS - default 0xdffffe8000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS - default 0xdfffffd000000000 if ARM64_VA_BITS_39 && !KASAN_SW_TAGS - default 0xdffffffa00000000 if ARM64_VA_BITS_36 && !KASAN_SW_TAGS - default 0xefff900000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && KASAN_SW_TAGS - default 0xefffc80000000000 if ARM64_VA_BITS_47 && KASAN_SW_TAGS - default 0xeffffe4000000000 if ARM64_VA_BITS_42 && KASAN_SW_TAGS - default 0xefffffc800000000 if ARM64_VA_BITS_39 && KASAN_SW_TAGS - default 0xeffffff900000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS + default 0xdfff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && !KASAN_SW_TAGS + default 0xdfffc00000000000 if ARM64_VA_BITS_47 && !KASAN_SW_TAGS + default 0xdffffe0000000000 if ARM64_VA_BITS_42 && !KASAN_SW_TAGS + default 0xdfffffc000000000 if ARM64_VA_BITS_39 && !KASAN_SW_TAGS + default 0xdffffff800000000 if ARM64_VA_BITS_36 && !KASAN_SW_TAGS + default 0xefff800000000000 if (ARM64_VA_BITS_48 || ARM64_VA_BITS_52) && KASAN_SW_TAGS + default 0xefffc00000000000 if ARM64_VA_BITS_47 && KASAN_SW_TAGS + default 0xeffffe0000000000 if ARM64_VA_BITS_42 && KASAN_SW_TAGS + default 0xefffffc000000000 if ARM64_VA_BITS_39 && KASAN_SW_TAGS + default 0xeffffff800000000 if ARM64_VA_BITS_36 && KASAN_SW_TAGS default 0xffffffffffffffff source "arch/arm64/Kconfig.platforms" @@ -1389,6 +1390,9 @@ config ARM64_PAN The feature is detected at runtime, and will remain as a 'nop' instruction if the cpu does not implement the feature. +config AS_HAS_LDAPR + def_bool $(as-instr,.arch_extension rcpc) + config ARM64_LSE_ATOMICS bool default ARM64_USE_LSE_ATOMICS @@ -1426,27 +1430,6 @@ endmenu menu "ARMv8.2 architectural features" -config ARM64_UAO - bool "Enable support for User Access Override (UAO)" - default y - help - User Access Override (UAO; part of the ARMv8.2 Extensions) - causes the 'unprivileged' variant of the load/store instructions to - be overridden to be privileged. - - This option changes get_user() and friends to use the 'unprivileged' - variant of the load/store instructions. This ensures that user-space - really did have access to the supplied memory. When addr_limit is - set to kernel memory the UAO bit will be set, allowing privileged - access to kernel memory. - - Choosing this option will cause copy_to_user() et al to use user-space - memory permissions. - - The feature is detected at runtime, the kernel will use the - regular load/store instructions if the cpu does not implement the - feature. - config ARM64_PMEM bool "Enable support for persistent memory" select ARCH_HAS_PMEM_API @@ -1850,6 +1833,9 @@ config CMDLINE choice prompt "Kernel command line type" if CMDLINE != "" default CMDLINE_FROM_BOOTLOADER + help + Choose how the kernel will handle the provided default kernel + command line string. config CMDLINE_FROM_BOOTLOADER bool "Use bootloader kernel arguments if available" @@ -1866,7 +1852,6 @@ config CMDLINE_EXTEND config CMDLINE_FORCE bool "Always use the default kernel command string" - depends on CMDLINE != "" help Always use the default kernel command string, even if the boot loader passes other arguments to the kernel. diff --git a/arch/arm64/include/asm/alternative-macros.h b/arch/arm64/include/asm/alternative-macros.h new file mode 100644 index 000000000000..5df500dcc627 --- /dev/null +++ b/arch/arm64/include/asm/alternative-macros.h @@ -0,0 +1,217 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ASM_ALTERNATIVE_MACROS_H +#define __ASM_ALTERNATIVE_MACROS_H + +#include + +#define ARM64_CB_PATCH ARM64_NCAPS + +/* A64 instructions are always 32 bits. */ +#define AARCH64_INSN_SIZE 4 + +#ifndef __ASSEMBLY__ + +#include + +#define ALTINSTR_ENTRY(feature) \ + " .word 661b - .\n" /* label */ \ + " .word 663f - .\n" /* new instruction */ \ + " .hword " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* source len */ \ + " .byte 664f-663f\n" /* replacement len */ + +#define ALTINSTR_ENTRY_CB(feature, cb) \ + " .word 661b - .\n" /* label */ \ + " .word " __stringify(cb) "- .\n" /* callback */ \ + " .hword " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* source len */ \ + " .byte 664f-663f\n" /* replacement len */ + +/* + * alternative assembly primitive: + * + * If any of these .org directive fail, it means that insn1 and insn2 + * don't have the same length. This used to be written as + * + * .if ((664b-663b) != (662b-661b)) + * .error "Alternatives instruction length mismatch" + * .endif + * + * but most assemblers die if insn1 or insn2 have a .inst. This should + * be fixed in a binutils release posterior to 2.25.51.0.2 (anything + * containing commit 4e4d08cf7399b606 or c1baaddf8861). + * + * Alternatives with callbacks do not generate replacement instructions. + */ +#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ + ".if "__stringify(cfg_enabled)" == 1\n" \ + "661:\n\t" \ + oldinstr "\n" \ + "662:\n" \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(feature) \ + ".popsection\n" \ + ".subsection 1\n" \ + "663:\n\t" \ + newinstr "\n" \ + "664:\n\t" \ + ".org . - (664b-663b) + (662b-661b)\n\t" \ + ".org . - (662b-661b) + (664b-663b)\n\t" \ + ".previous\n" \ + ".endif\n" + +#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \ + ".if "__stringify(cfg_enabled)" == 1\n" \ + "661:\n\t" \ + oldinstr "\n" \ + "662:\n" \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY_CB(feature, cb) \ + ".popsection\n" \ + "663:\n\t" \ + "664:\n\t" \ + ".endif\n" + +#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ + __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) + +#define ALTERNATIVE_CB(oldinstr, cb) \ + __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb) +#else + +#include + +.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len + .word \orig_offset - . + .word \alt_offset - . + .hword \feature + .byte \orig_len + .byte \alt_len +.endm + +.macro alternative_insn insn1, insn2, cap, enable = 1 + .if \enable +661: \insn1 +662: .pushsection .altinstructions, "a" + altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f + .popsection + .subsection 1 +663: \insn2 +664: .previous + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) + .endif +.endm + +/* + * Alternative sequences + * + * The code for the case where the capability is not present will be + * assembled and linked as normal. There are no restrictions on this + * code. + * + * The code for the case where the capability is present will be + * assembled into a special section to be used for dynamic patching. + * Code for that case must: + * + * 1. Be exactly the same length (in bytes) as the default code + * sequence. + * + * 2. Not contain a branch target that is used outside of the + * alternative sequence it is defined in (branches into an + * alternative sequence are not fixed up). + */ + +/* + * Begin an alternative code sequence. + */ +.macro alternative_if_not cap + .set .Lasm_alt_mode, 0 + .pushsection .altinstructions, "a" + altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f + .popsection +661: +.endm + +.macro alternative_if cap + .set .Lasm_alt_mode, 1 + .pushsection .altinstructions, "a" + altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f + .popsection + .subsection 1 + .align 2 /* So GAS knows label 661 is suitably aligned */ +661: +.endm + +.macro alternative_cb cb + .set .Lasm_alt_mode, 0 + .pushsection .altinstructions, "a" + altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0 + .popsection +661: +.endm + +/* + * Provide the other half of the alternative code sequence. + */ +.macro alternative_else +662: + .if .Lasm_alt_mode==0 + .subsection 1 + .else + .previous + .endif +663: +.endm + +/* + * Complete an alternative code sequence. + */ +.macro alternative_endif +664: + .if .Lasm_alt_mode==0 + .previous + .endif + .org . - (664b-663b) + (662b-661b) + .org . - (662b-661b) + (664b-663b) +.endm + +/* + * Callback-based alternative epilogue + */ +.macro alternative_cb_end +662: +.endm + +/* + * Provides a trivial alternative or default sequence consisting solely + * of NOPs. The number of NOPs is chosen automatically to match the + * previous case. + */ +.macro alternative_else_nop_endif +alternative_else + nops (662b-661b) / AARCH64_INSN_SIZE +alternative_endif +.endm + +#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ + alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) + +.macro user_alt, label, oldinstr, newinstr, cond +9999: alternative_insn "\oldinstr", "\newinstr", \cond + _asm_extable 9999b, \label +.endm + +#endif /* __ASSEMBLY__ */ + +/* + * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); + * + * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); + * N.B. If CONFIG_FOO is specified, but not selected, the whole block + * will be omitted, including oldinstr. + */ +#define ALTERNATIVE(oldinstr, newinstr, ...) \ + _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) + +#endif /* __ASM_ALTERNATIVE_MACROS_H */ diff --git a/arch/arm64/include/asm/alternative.h b/arch/arm64/include/asm/alternative.h index 619db9b4c9d5..a38b92e11811 100644 --- a/arch/arm64/include/asm/alternative.h +++ b/arch/arm64/include/asm/alternative.h @@ -2,17 +2,13 @@ #ifndef __ASM_ALTERNATIVE_H #define __ASM_ALTERNATIVE_H -#include -#include - -#define ARM64_CB_PATCH ARM64_NCAPS +#include #ifndef __ASSEMBLY__ #include #include #include -#include struct alt_instr { s32 orig_offset; /* offset to original instruction */ @@ -35,264 +31,5 @@ void apply_alternatives_module(void *start, size_t length); static inline void apply_alternatives_module(void *start, size_t length) { } #endif -#define ALTINSTR_ENTRY(feature) \ - " .word 661b - .\n" /* label */ \ - " .word 663f - .\n" /* new instruction */ \ - " .hword " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* source len */ \ - " .byte 664f-663f\n" /* replacement len */ - -#define ALTINSTR_ENTRY_CB(feature, cb) \ - " .word 661b - .\n" /* label */ \ - " .word " __stringify(cb) "- .\n" /* callback */ \ - " .hword " __stringify(feature) "\n" /* feature bit */ \ - " .byte 662b-661b\n" /* source len */ \ - " .byte 664f-663f\n" /* replacement len */ - -/* - * alternative assembly primitive: - * - * If any of these .org directive fail, it means that insn1 and insn2 - * don't have the same length. This used to be written as - * - * .if ((664b-663b) != (662b-661b)) - * .error "Alternatives instruction length mismatch" - * .endif - * - * but most assemblers die if insn1 or insn2 have a .inst. This should - * be fixed in a binutils release posterior to 2.25.51.0.2 (anything - * containing commit 4e4d08cf7399b606 or c1baaddf8861). - * - * Alternatives with callbacks do not generate replacement instructions. - */ -#define __ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg_enabled) \ - ".if "__stringify(cfg_enabled)" == 1\n" \ - "661:\n\t" \ - oldinstr "\n" \ - "662:\n" \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY(feature) \ - ".popsection\n" \ - ".subsection 1\n" \ - "663:\n\t" \ - newinstr "\n" \ - "664:\n\t" \ - ".org . - (664b-663b) + (662b-661b)\n\t" \ - ".org . - (662b-661b) + (664b-663b)\n\t" \ - ".previous\n" \ - ".endif\n" - -#define __ALTERNATIVE_CFG_CB(oldinstr, feature, cfg_enabled, cb) \ - ".if "__stringify(cfg_enabled)" == 1\n" \ - "661:\n\t" \ - oldinstr "\n" \ - "662:\n" \ - ".pushsection .altinstructions,\"a\"\n" \ - ALTINSTR_ENTRY_CB(feature, cb) \ - ".popsection\n" \ - "663:\n\t" \ - "664:\n\t" \ - ".endif\n" - -#define _ALTERNATIVE_CFG(oldinstr, newinstr, feature, cfg, ...) \ - __ALTERNATIVE_CFG(oldinstr, newinstr, feature, IS_ENABLED(cfg)) - -#define ALTERNATIVE_CB(oldinstr, cb) \ - __ALTERNATIVE_CFG_CB(oldinstr, ARM64_CB_PATCH, 1, cb) -#else - -#include - -.macro altinstruction_entry orig_offset alt_offset feature orig_len alt_len - .word \orig_offset - . - .word \alt_offset - . - .hword \feature - .byte \orig_len - .byte \alt_len -.endm - -.macro alternative_insn insn1, insn2, cap, enable = 1 - .if \enable -661: \insn1 -662: .pushsection .altinstructions, "a" - altinstruction_entry 661b, 663f, \cap, 662b-661b, 664f-663f - .popsection - .subsection 1 -663: \insn2 -664: .previous - .org . - (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) - .endif -.endm - -/* - * Alternative sequences - * - * The code for the case where the capability is not present will be - * assembled and linked as normal. There are no restrictions on this - * code. - * - * The code for the case where the capability is present will be - * assembled into a special section to be used for dynamic patching. - * Code for that case must: - * - * 1. Be exactly the same length (in bytes) as the default code - * sequence. - * - * 2. Not contain a branch target that is used outside of the - * alternative sequence it is defined in (branches into an - * alternative sequence are not fixed up). - */ - -/* - * Begin an alternative code sequence. - */ -.macro alternative_if_not cap - .set .Lasm_alt_mode, 0 - .pushsection .altinstructions, "a" - altinstruction_entry 661f, 663f, \cap, 662f-661f, 664f-663f - .popsection -661: -.endm - -.macro alternative_if cap - .set .Lasm_alt_mode, 1 - .pushsection .altinstructions, "a" - altinstruction_entry 663f, 661f, \cap, 664f-663f, 662f-661f - .popsection - .subsection 1 - .align 2 /* So GAS knows label 661 is suitably aligned */ -661: -.endm - -.macro alternative_cb cb - .set .Lasm_alt_mode, 0 - .pushsection .altinstructions, "a" - altinstruction_entry 661f, \cb, ARM64_CB_PATCH, 662f-661f, 0 - .popsection -661: -.endm - -/* - * Provide the other half of the alternative code sequence. - */ -.macro alternative_else -662: - .if .Lasm_alt_mode==0 - .subsection 1 - .else - .previous - .endif -663: -.endm - -/* - * Complete an alternative code sequence. - */ -.macro alternative_endif -664: - .if .Lasm_alt_mode==0 - .previous - .endif - .org . - (664b-663b) + (662b-661b) - .org . - (662b-661b) + (664b-663b) -.endm - -/* - * Callback-based alternative epilogue - */ -.macro alternative_cb_end -662: -.endm - -/* - * Provides a trivial alternative or default sequence consisting solely - * of NOPs. The number of NOPs is chosen automatically to match the - * previous case. - */ -.macro alternative_else_nop_endif -alternative_else - nops (662b-661b) / AARCH64_INSN_SIZE -alternative_endif -.endm - -#define _ALTERNATIVE_CFG(insn1, insn2, cap, cfg, ...) \ - alternative_insn insn1, insn2, cap, IS_ENABLED(cfg) - -.macro user_alt, label, oldinstr, newinstr, cond -9999: alternative_insn "\oldinstr", "\newinstr", \cond - _asm_extable 9999b, \label -.endm - -/* - * Generate the assembly for UAO alternatives with exception table entries. - * This is complicated as there is no post-increment or pair versions of the - * unprivileged instructions, and USER() only works for single instructions. - */ -#ifdef CONFIG_ARM64_UAO - .macro uao_ldp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: ldp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - ldtr \reg1, [\addr]; - ldtr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - _asm_extable 8889b,\l; - .endm - - .macro uao_stp l, reg1, reg2, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: stp \reg1, \reg2, [\addr], \post_inc; -8889: nop; - nop; - alternative_else - sttr \reg1, [\addr]; - sttr \reg2, [\addr, #8]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - _asm_extable 8889b,\l; - .endm - - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - alternative_if_not ARM64_HAS_UAO -8888: \inst \reg, [\addr], \post_inc; - nop; - alternative_else - \alt_inst \reg, [\addr]; - add \addr, \addr, \post_inc; - alternative_endif - - _asm_extable 8888b,\l; - .endm -#else - .macro uao_ldp l, reg1, reg2, addr, post_inc - USER(\l, ldp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_stp l, reg1, reg2, addr, post_inc - USER(\l, stp \reg1, \reg2, [\addr], \post_inc) - .endm - .macro uao_user_alternative l, inst, alt_inst, reg, addr, post_inc - USER(\l, \inst \reg, [\addr], \post_inc) - .endm -#endif - -#endif /* __ASSEMBLY__ */ - -/* - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature)); - * - * Usage: asm(ALTERNATIVE(oldinstr, newinstr, feature, CONFIG_FOO)); - * N.B. If CONFIG_FOO is specified, but not selected, the whole block - * will be omitted, including oldinstr. - */ -#define ALTERNATIVE(oldinstr, newinstr, ...) \ - _ALTERNATIVE_CFG(oldinstr, newinstr, __VA_ARGS__, 1) - +#endif /* __ASSEMBLY__ */ #endif /* __ASM_ALTERNATIVE_H */ diff --git a/arch/arm64/include/asm/asm-uaccess.h b/arch/arm64/include/asm/asm-uaccess.h index f68a0e64482a..9990059be106 100644 --- a/arch/arm64/include/asm/asm-uaccess.h +++ b/arch/arm64/include/asm/asm-uaccess.h @@ -2,7 +2,7 @@ #ifndef __ASM_ASM_UACCESS_H #define __ASM_ASM_UACCESS_H -#include +#include #include #include #include @@ -15,10 +15,10 @@ .macro __uaccess_ttbr0_disable, tmp1 mrs \tmp1, ttbr1_el1 // swapper_pg_dir bic \tmp1, \tmp1, #TTBR_ASID_MASK - sub \tmp1, \tmp1, #RESERVED_TTBR0_SIZE // reserved_ttbr0 just before swapper_pg_dir + sub \tmp1, \tmp1, #PAGE_SIZE // reserved_pg_dir just before swapper_pg_dir msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb - add \tmp1, \tmp1, #RESERVED_TTBR0_SIZE + add \tmp1, \tmp1, #PAGE_SIZE msr ttbr1_el1, \tmp1 // set reserved ASID isb .endm @@ -58,4 +58,33 @@ alternative_else_nop_endif .endm #endif +/* + * Generate the assembly for LDTR/STTR with exception table entries. + * This is complicated as there is no post-increment or pair versions of the + * unprivileged instructions, and USER() only works for single instructions. + */ + .macro user_ldp l, reg1, reg2, addr, post_inc +8888: ldtr \reg1, [\addr]; +8889: ldtr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; + .endm + + .macro user_stp l, reg1, reg2, addr, post_inc +8888: sttr \reg1, [\addr]; +8889: sttr \reg2, [\addr, #8]; + add \addr, \addr, \post_inc; + + _asm_extable 8888b,\l; + _asm_extable 8889b,\l; + .endm + + .macro user_ldst l, inst, reg, addr, post_inc +8888: \inst \reg, [\addr]; + add \addr, \addr, \post_inc; + + _asm_extable 8888b,\l; + .endm #endif diff --git a/arch/arm64/include/asm/cpucaps.h b/arch/arm64/include/asm/cpucaps.h index e7d98997c09c..a7242ef2a2cd 100644 --- a/arch/arm64/include/asm/cpucaps.h +++ b/arch/arm64/include/asm/cpucaps.h @@ -16,8 +16,6 @@ #define ARM64_WORKAROUND_CAVIUM_23154 6 #define ARM64_WORKAROUND_834220 7 #define ARM64_HAS_NO_HW_PREFETCH 8 -#define ARM64_HAS_UAO 9 -#define ARM64_ALT_PAN_NOT_UAO 10 #define ARM64_HAS_VIRT_HOST_EXTN 11 #define ARM64_WORKAROUND_CAVIUM_27456 12 #define ARM64_HAS_32BIT_EL0 13 @@ -66,7 +64,8 @@ #define ARM64_HAS_TLB_RANGE 56 #define ARM64_MTE 57 #define ARM64_WORKAROUND_1508412 58 +#define ARM64_HAS_LDAPR 59 -#define ARM64_NCAPS 59 +#define ARM64_NCAPS 60 #endif /* __ASM_CPUCAPS_H */ diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index 6af7973bb5c6..1160bafa40b2 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -681,10 +681,16 @@ static __always_inline bool system_supports_fpsimd(void) return !cpus_have_const_cap(ARM64_HAS_NO_FPSIMD); } +static inline bool system_uses_hw_pan(void) +{ + return IS_ENABLED(CONFIG_ARM64_PAN) && + cpus_have_const_cap(ARM64_HAS_PAN); +} + static inline bool system_uses_ttbr0_pan(void) { return IS_ENABLED(CONFIG_ARM64_SW_TTBR0_PAN) && - !cpus_have_const_cap(ARM64_HAS_PAN); + !system_uses_hw_pan(); } static __always_inline bool system_supports_sve(void) @@ -776,11 +782,26 @@ static inline bool cpu_has_hw_af(void) ID_AA64MMFR1_HADBS_SHIFT); } +static inline bool cpu_has_pan(void) +{ + u64 mmfr1 = read_cpuid(ID_AA64MMFR1_EL1); + return cpuid_feature_extract_unsigned_field(mmfr1, + ID_AA64MMFR1_PAN_SHIFT); +} + #ifdef CONFIG_ARM64_AMU_EXTN /* Check whether the cpu supports the Activity Monitors Unit (AMU) */ extern bool cpu_has_amu_feat(int cpu); +#else +static inline bool cpu_has_amu_feat(int cpu) +{ + return false; +} #endif +/* Get a cpu that supports the Activity Monitors Unit (AMU) */ +extern int get_cpu_with_amu_feat(void); + static inline unsigned int get_vmid_bits(u64 mmfr1) { int vmid_bits; diff --git a/arch/arm64/include/asm/exception.h b/arch/arm64/include/asm/exception.h index 0756191f44f6..78537393b650 100644 --- a/arch/arm64/include/asm/exception.h +++ b/arch/arm64/include/asm/exception.h @@ -37,7 +37,7 @@ asmlinkage void enter_from_user_mode(void); asmlinkage void exit_to_user_mode(void); void arm64_enter_nmi(struct pt_regs *regs); void arm64_exit_nmi(struct pt_regs *regs); -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs); +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs); void do_undefinstr(struct pt_regs *regs); void do_bti(struct pt_regs *regs); asmlinkage void bad_mode(struct pt_regs *regs, int reason, unsigned int esr); diff --git a/arch/arm64/include/asm/exec.h b/arch/arm64/include/asm/exec.h index 1aae6f9962fc..9a1c22ce664b 100644 --- a/arch/arm64/include/asm/exec.h +++ b/arch/arm64/include/asm/exec.h @@ -10,6 +10,5 @@ #include extern unsigned long arch_align_stack(unsigned long sp); -void uao_thread_switch(struct task_struct *next); #endif /* __ASM_EXEC_H */ diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index 97f6a63810ec..8e41faa37c69 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -16,7 +16,7 @@ do { \ unsigned int loops = FUTEX_MAX_LOOPS; \ \ - uaccess_enable(); \ + uaccess_enable_privileged(); \ asm volatile( \ " prfm pstl1strm, %2\n" \ "1: ldxr %w1, %2\n" \ @@ -39,7 +39,7 @@ do { \ "+r" (loops) \ : "r" (oparg), "Ir" (-EFAULT), "Ir" (-EAGAIN) \ : "memory"); \ - uaccess_disable(); \ + uaccess_disable_privileged(); \ } while (0) static inline int @@ -95,7 +95,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, return -EFAULT; uaddr = __uaccess_mask_ptr(_uaddr); - uaccess_enable(); + uaccess_enable_privileged(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" " prfm pstl1strm, %2\n" "1: ldxr %w1, %2\n" @@ -118,7 +118,7 @@ futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, : "+r" (ret), "=&r" (val), "+Q" (*uaddr), "=&r" (tmp), "+r" (loops) : "r" (oldval), "r" (newval), "Ir" (-EFAULT), "Ir" (-EAGAIN) : "memory"); - uaccess_disable(); + uaccess_disable_privileged(); if (!ret) *uval = val; diff --git a/arch/arm64/include/asm/insn.h b/arch/arm64/include/asm/insn.h index 4b39293d0f72..4ebb9c054ccc 100644 --- a/arch/arm64/include/asm/insn.h +++ b/arch/arm64/include/asm/insn.h @@ -10,8 +10,7 @@ #include #include -/* A64 instructions are always 32 bits. */ -#define AARCH64_INSN_SIZE 4 +#include #ifndef __ASSEMBLY__ /* diff --git a/arch/arm64/include/asm/kernel-pgtable.h b/arch/arm64/include/asm/kernel-pgtable.h index 19ca76ea60d9..587c504a4c8b 100644 --- a/arch/arm64/include/asm/kernel-pgtable.h +++ b/arch/arm64/include/asm/kernel-pgtable.h @@ -89,12 +89,6 @@ #define INIT_DIR_SIZE (PAGE_SIZE * EARLY_PAGES(KIMAGE_VADDR, _end)) #define IDMAP_DIR_SIZE (IDMAP_PGTABLE_LEVELS * PAGE_SIZE) -#ifdef CONFIG_ARM64_SW_TTBR0_PAN -#define RESERVED_TTBR0_SIZE (PAGE_SIZE) -#else -#define RESERVED_TTBR0_SIZE (0) -#endif - /* Initial memory map size */ #if ARM64_SWAPPER_USES_SECTION_MAPS #define SWAPPER_BLOCK_SHIFT SECTION_SHIFT diff --git a/arch/arm64/include/asm/kprobes.h b/arch/arm64/include/asm/kprobes.h index 8699ce30f587..5d38ff4a4806 100644 --- a/arch/arm64/include/asm/kprobes.h +++ b/arch/arm64/include/asm/kprobes.h @@ -28,18 +28,11 @@ struct prev_kprobe { unsigned int status; }; -/* Single step context for kprobe */ -struct kprobe_step_ctx { - unsigned long ss_pending; - unsigned long match_addr; -}; - /* per-cpu kprobe control block */ struct kprobe_ctlblk { unsigned int kprobe_status; unsigned long saved_irqflag; struct prev_kprobe prev_kprobe; - struct kprobe_step_ctx ss_ctx; }; void arch_remove_kprobe(struct kprobe *); diff --git a/arch/arm64/include/asm/memory.h b/arch/arm64/include/asm/memory.h index cd61239bae8c..556cb2d62b5b 100644 --- a/arch/arm64/include/asm/memory.h +++ b/arch/arm64/include/asm/memory.h @@ -30,8 +30,8 @@ * keep a constant PAGE_OFFSET and "fallback" to using the higher end * of the VMEMMAP where 52-bit support is not available in hardware. */ -#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) \ - >> (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT)) +#define VMEMMAP_SHIFT (PAGE_SHIFT - STRUCT_PAGE_MAX_SHIFT) +#define VMEMMAP_SIZE ((_PAGE_END(VA_BITS_MIN) - PAGE_OFFSET) >> VMEMMAP_SHIFT) /* * PAGE_OFFSET - the virtual address of the start of the linear map, at the @@ -44,17 +44,17 @@ #define _PAGE_OFFSET(va) (-(UL(1) << (va))) #define PAGE_OFFSET (_PAGE_OFFSET(VA_BITS)) #define KIMAGE_VADDR (MODULES_END) -#define BPF_JIT_REGION_START (KASAN_SHADOW_END) +#define BPF_JIT_REGION_START (_PAGE_END(VA_BITS_MIN)) #define BPF_JIT_REGION_SIZE (SZ_128M) #define BPF_JIT_REGION_END (BPF_JIT_REGION_START + BPF_JIT_REGION_SIZE) #define MODULES_END (MODULES_VADDR + MODULES_VSIZE) #define MODULES_VADDR (BPF_JIT_REGION_END) #define MODULES_VSIZE (SZ_128M) -#define VMEMMAP_START (-VMEMMAP_SIZE - SZ_2M) +#define VMEMMAP_START (-(UL(1) << (VA_BITS - VMEMMAP_SHIFT))) #define VMEMMAP_END (VMEMMAP_START + VMEMMAP_SIZE) -#define PCI_IO_END (VMEMMAP_START - SZ_2M) +#define PCI_IO_END (VMEMMAP_START - SZ_8M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) -#define FIXADDR_TOP (PCI_IO_START - SZ_2M) +#define FIXADDR_TOP (VMEMMAP_START - SZ_32M) #if VA_BITS > 48 #define VA_BITS_MIN (48) @@ -76,10 +76,11 @@ #define KASAN_SHADOW_OFFSET _AC(CONFIG_KASAN_SHADOW_OFFSET, UL) #define KASAN_SHADOW_END ((UL(1) << (64 - KASAN_SHADOW_SCALE_SHIFT)) \ + KASAN_SHADOW_OFFSET) +#define PAGE_END (KASAN_SHADOW_END - (1UL << (vabits_actual - KASAN_SHADOW_SCALE_SHIFT))) #define KASAN_THREAD_SHIFT 1 #else #define KASAN_THREAD_SHIFT 0 -#define KASAN_SHADOW_END (_PAGE_END(VA_BITS_MIN)) +#define PAGE_END (_PAGE_END(VA_BITS_MIN)) #endif /* CONFIG_KASAN */ #define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) @@ -167,7 +168,6 @@ #include extern u64 vabits_actual; -#define PAGE_END (_PAGE_END(vabits_actual)) extern s64 memstart_addr; /* PHYS_OFFSET - the physical address of the start of memory. */ @@ -238,11 +238,9 @@ static inline const void *__tag_set(const void *addr, u8 tag) /* - * The linear kernel range starts at the bottom of the virtual address - * space. Testing the top bit for the start of the region is a - * sufficient check and avoids having to worry about the tag. + * The linear kernel range starts at the bottom of the virtual address space. */ -#define __is_lm_address(addr) (!(((u64)addr) & BIT(vabits_actual - 1))) +#define __is_lm_address(addr) (((u64)(addr) & ~PAGE_OFFSET) < (PAGE_END - PAGE_OFFSET)) #define __lm_to_phys(addr) (((addr) & ~PAGE_OFFSET) + PHYS_OFFSET) #define __kimg_to_phys(addr) ((addr) - kimage_voffset) diff --git a/arch/arm64/include/asm/mmu_context.h b/arch/arm64/include/asm/mmu_context.h index 0672236e1aea..5c72c20bd300 100644 --- a/arch/arm64/include/asm/mmu_context.h +++ b/arch/arm64/include/asm/mmu_context.h @@ -36,11 +36,11 @@ static inline void contextidr_thread_switch(struct task_struct *next) } /* - * Set TTBR0 to empty_zero_page. No translations will be possible via TTBR0. + * Set TTBR0 to reserved_pg_dir. No translations will be possible via TTBR0. */ static inline void cpu_set_reserved_ttbr0(void) { - unsigned long ttbr = phys_to_ttbr(__pa_symbol(empty_zero_page)); + unsigned long ttbr = phys_to_ttbr(__pa_symbol(reserved_pg_dir)); write_sysreg(ttbr, ttbr0_el1); isb(); @@ -195,7 +195,7 @@ static inline void update_saved_ttbr0(struct task_struct *tsk, return; if (mm == &init_mm) - ttbr = __pa_symbol(empty_zero_page); + ttbr = __pa_symbol(reserved_pg_dir); else ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48; diff --git a/arch/arm64/include/asm/pgtable-hwdef.h b/arch/arm64/include/asm/pgtable-hwdef.h index 01a96d07ae74..42442a0ae2ab 100644 --- a/arch/arm64/include/asm/pgtable-hwdef.h +++ b/arch/arm64/include/asm/pgtable-hwdef.h @@ -260,6 +260,7 @@ #define TCR_TBI1 (UL(1) << 38) #define TCR_HA (UL(1) << 39) #define TCR_HD (UL(1) << 40) +#define TCR_TBID1 (UL(1) << 52) #define TCR_NFD0 (UL(1) << 53) #define TCR_NFD1 (UL(1) << 54) #define TCR_E0PD0 (UL(1) << 55) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 5628289b9d5e..005eb035fbfa 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -22,7 +22,7 @@ * and fixed mappings */ #define VMALLOC_START (MODULES_END) -#define VMALLOC_END (- PUD_SIZE - VMEMMAP_SIZE - SZ_64K) +#define VMALLOC_END (VMEMMAP_START - SZ_256M) #define vmemmap ((struct page *)VMEMMAP_START - (memstart_addr >> PAGE_SHIFT)) @@ -527,6 +527,7 @@ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_end[]; extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; +extern pgd_t reserved_pg_dir[PTRS_PER_PGD]; extern void set_swapper_pgd(pgd_t *pgdp, pgd_t pgd); diff --git a/arch/arm64/include/asm/processor.h b/arch/arm64/include/asm/processor.h index 9c5efcc6e7f1..3ba97cf8902f 100644 --- a/arch/arm64/include/asm/processor.h +++ b/arch/arm64/include/asm/processor.h @@ -8,9 +8,6 @@ #ifndef __ASM_PROCESSOR_H #define __ASM_PROCESSOR_H -#define KERNEL_DS UL(-1) -#define USER_DS ((UL(1) << VA_BITS) - 1) - /* * On arm64 systems, unaligned accesses by the CPU are cheap, and so there is * no point in shifting all network buffers by 2 bytes just to make some IP @@ -49,6 +46,7 @@ #define DEFAULT_MAP_WINDOW_64 (UL(1) << VA_BITS_MIN) #define TASK_SIZE_64 (UL(1) << vabits_actual) +#define TASK_SIZE_MAX (UL(1) << VA_BITS) #ifdef CONFIG_COMPAT #if defined(CONFIG_ARM64_64K_PAGES) && defined(CONFIG_KUSER_HELPERS) diff --git a/arch/arm64/include/asm/ptrace.h b/arch/arm64/include/asm/ptrace.h index 28c85b87b8cd..e58bca832dff 100644 --- a/arch/arm64/include/asm/ptrace.h +++ b/arch/arm64/include/asm/ptrace.h @@ -16,6 +16,11 @@ #define CurrentEL_EL1 (1 << 2) #define CurrentEL_EL2 (2 << 2) +#define INIT_PSTATE_EL1 \ + (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL1h) +#define INIT_PSTATE_EL2 \ + (PSR_D_BIT | PSR_A_BIT | PSR_I_BIT | PSR_F_BIT | PSR_MODE_EL2h) + /* * PMR values used to mask/unmask interrupts. * @@ -188,8 +193,7 @@ struct pt_regs { s32 syscallno; u32 unused2; #endif - - u64 orig_addr_limit; + u64 sdei_ttbr1; /* Only valid when ARM64_HAS_IRQ_PRIO_MASKING is enabled. */ u64 pmr_save; u64 stackframe[2]; diff --git a/arch/arm64/include/asm/rwonce.h b/arch/arm64/include/asm/rwonce.h new file mode 100644 index 000000000000..1bce62fa908a --- /dev/null +++ b/arch/arm64/include/asm/rwonce.h @@ -0,0 +1,73 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (C) 2020 Google LLC. + */ +#ifndef __ASM_RWONCE_H +#define __ASM_RWONCE_H + +#ifdef CONFIG_LTO + +#include +#include + +#ifndef BUILD_VDSO + +#ifdef CONFIG_AS_HAS_LDAPR +#define __LOAD_RCPC(sfx, regs...) \ + ALTERNATIVE( \ + "ldar" #sfx "\t" #regs, \ + ".arch_extension rcpc\n" \ + "ldapr" #sfx "\t" #regs, \ + ARM64_HAS_LDAPR) +#else +#define __LOAD_RCPC(sfx, regs...) "ldar" #sfx "\t" #regs +#endif /* CONFIG_AS_HAS_LDAPR */ + +/* + * When building with LTO, there is an increased risk of the compiler + * converting an address dependency headed by a READ_ONCE() invocation + * into a control dependency and consequently allowing for harmful + * reordering by the CPU. + * + * Ensure that such transformations are harmless by overriding the generic + * READ_ONCE() definition with one that provides RCpc acquire semantics + * when building with LTO. + */ +#define __READ_ONCE(x) \ +({ \ + typeof(&(x)) __x = &(x); \ + int atomic = 1; \ + union { __unqual_scalar_typeof(*__x) __val; char __c[1]; } __u; \ + switch (sizeof(x)) { \ + case 1: \ + asm volatile(__LOAD_RCPC(b, %w0, %1) \ + : "=r" (*(__u8 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 2: \ + asm volatile(__LOAD_RCPC(h, %w0, %1) \ + : "=r" (*(__u16 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 4: \ + asm volatile(__LOAD_RCPC(, %w0, %1) \ + : "=r" (*(__u32 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + case 8: \ + asm volatile(__LOAD_RCPC(, %0, %1) \ + : "=r" (*(__u64 *)__u.__c) \ + : "Q" (*__x) : "memory"); \ + break; \ + default: \ + atomic = 0; \ + } \ + atomic ? (typeof(*__x))__u.__val : (*(volatile typeof(__x))__x);\ +}) + +#endif /* !BUILD_VDSO */ +#endif /* CONFIG_LTO */ + +#include + +#endif /* __ASM_RWONCE_H */ diff --git a/arch/arm64/include/asm/signal.h b/arch/arm64/include/asm/signal.h new file mode 100644 index 000000000000..ef449f5f4ba8 --- /dev/null +++ b/arch/arm64/include/asm/signal.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __ARM64_ASM_SIGNAL_H +#define __ARM64_ASM_SIGNAL_H + +#include +#include +#include + +static inline void __user *arch_untagged_si_addr(void __user *addr, + unsigned long sig, + unsigned long si_code) +{ + /* + * For historical reasons, all bits of the fault address are exposed as + * address bits for watchpoint exceptions. New architectures should + * handle the tag bits consistently. + */ + if (sig == SIGTRAP && si_code == TRAP_BRKPT) + return addr; + + return untagged_addr(addr); +} +#define arch_untagged_si_addr arch_untagged_si_addr + +#endif diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index 801861d05426..cf7922f23808 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -98,6 +98,10 @@ #define SET_PSTATE_SSBS(x) __emit_inst(0xd500401f | PSTATE_SSBS | ((!!x) << PSTATE_Imm_shift)) #define SET_PSTATE_TCO(x) __emit_inst(0xd500401f | PSTATE_TCO | ((!!x) << PSTATE_Imm_shift)) +#define set_pstate_pan(x) asm volatile(SET_PSTATE_PAN(x)) +#define set_pstate_uao(x) asm volatile(SET_PSTATE_UAO(x)) +#define set_pstate_ssbs(x) asm volatile(SET_PSTATE_SSBS(x)) + #define __SYS_BARRIER_INSN(CRm, op2, Rt) \ __emit_inst(0xd5000000 | sys_insn(0, 3, 3, (CRm), (op2)) | ((Rt) & 0x1f)) @@ -582,6 +586,9 @@ #define ENDIAN_SET_EL2 0 #endif +#define INIT_SCTLR_EL2_MMU_OFF \ + (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) + /* SCTLR_EL1 specific flags. */ #define SCTLR_EL1_ATA0 (BIT(42)) @@ -615,12 +622,15 @@ #define ENDIAN_SET_EL1 0 #endif -#define SCTLR_EL1_SET (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA |\ - SCTLR_EL1_SA0 | SCTLR_EL1_SED | SCTLR_ELx_I |\ - SCTLR_EL1_DZE | SCTLR_EL1_UCT |\ - SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN |\ - SCTLR_ELx_ITFSB| SCTLR_ELx_ATA | SCTLR_EL1_ATA0 |\ - ENDIAN_SET_EL1 | SCTLR_EL1_UCI | SCTLR_EL1_RES1) +#define INIT_SCTLR_EL1_MMU_OFF \ + (ENDIAN_SET_EL1 | SCTLR_EL1_RES1) + +#define INIT_SCTLR_EL1_MMU_ON \ + (SCTLR_ELx_M | SCTLR_ELx_C | SCTLR_ELx_SA | SCTLR_EL1_SA0 | \ + SCTLR_EL1_SED | SCTLR_ELx_I | SCTLR_EL1_DZE | SCTLR_EL1_UCT | \ + SCTLR_EL1_NTWE | SCTLR_ELx_IESB | SCTLR_EL1_SPAN | SCTLR_ELx_ITFSB | \ + SCTLR_ELx_ATA | SCTLR_EL1_ATA0 | ENDIAN_SET_EL1 | SCTLR_EL1_UCI | \ + SCTLR_EL1_RES1) /* MAIR_ELx memory attributes (used by Linux) */ #define MAIR_ATTR_DEVICE_nGnRnE UL(0x00) diff --git a/arch/arm64/include/asm/system_misc.h b/arch/arm64/include/asm/system_misc.h index d02c5e5ea015..305a7157c6a6 100644 --- a/arch/arm64/include/asm/system_misc.h +++ b/arch/arm64/include/asm/system_misc.h @@ -22,7 +22,7 @@ void die(const char *msg, struct pt_regs *regs, int err); struct siginfo; void arm64_notify_die(const char *str, struct pt_regs *regs, - int signo, int sicode, void __user *addr, + int signo, int sicode, unsigned long far, int err); void hook_debug_fault_code(int nr, int (*fn)(unsigned long, unsigned int, diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index 4a967f417fdd..2c8cce12f01f 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -18,14 +18,11 @@ struct task_struct; #include #include -typedef unsigned long mm_segment_t; - /* * low level task data that entry.S needs immediate access to. */ struct thread_info { unsigned long flags; /* low level flags */ - mm_segment_t addr_limit; /* address limit */ #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif @@ -66,9 +63,8 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ -#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ -#define TIF_MTE_ASYNC_FAULT 6 /* MTE Asynchronous Tag Check Fault */ -#define TIF_CHECK_32BIT_AFFINITY 7 /* Check thread affinity for asymmetric AArch32 */ +#define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ +#define TIF_CHECK_32BIT_AFFINITY 6 /* Check thread affinity for asymmetric AArch32 */ #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -94,7 +90,6 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_SYSCALL_EMU (1 << TIF_SYSCALL_EMU) #define _TIF_UPROBE (1 << TIF_UPROBE) -#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_CHECK_32BIT_AFFINITY (1 << TIF_CHECK_32BIT_AFFINITY) @@ -103,9 +98,8 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE | _TIF_FSCHECK | \ - _TIF_MTE_ASYNC_FAULT | \ - _TIF_CHECK_32BIT_AFFINITY) + _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ + _TIF_CHECK_32BIT_AFFINITY) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ @@ -123,7 +117,6 @@ void arch_release_task_struct(struct task_struct *tsk); { \ .flags = _TIF_FOREIGN_FPSTATE, \ .preempt_count = INIT_PREEMPT_COUNT, \ - .addr_limit = KERNEL_DS, \ INIT_SCS \ } diff --git a/arch/arm64/include/asm/topology.h b/arch/arm64/include/asm/topology.h index 11a465243f66..3b8dca4eb08d 100644 --- a/arch/arm64/include/asm/topology.h +++ b/arch/arm64/include/asm/topology.h @@ -16,12 +16,14 @@ int pcibus_to_node(struct pci_bus *bus); #include +void update_freq_counters_refs(void); +void topology_scale_freq_tick(void); + #ifdef CONFIG_ARM64_AMU_EXTN /* * Replace task scheduler's default counter-based * frequency-invariance scale factor setting. */ -void topology_scale_freq_tick(void); #define arch_scale_freq_tick topology_scale_freq_tick #endif /* CONFIG_ARM64_AMU_EXTN */ diff --git a/arch/arm64/include/asm/traps.h b/arch/arm64/include/asm/traps.h index d96dc2c7c09d..54f32a0675df 100644 --- a/arch/arm64/include/asm/traps.h +++ b/arch/arm64/include/asm/traps.h @@ -26,9 +26,9 @@ void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); void force_signal_inject(int signal, int code, unsigned long address, unsigned int err); void arm64_notify_segfault(unsigned long addr); -void arm64_force_sig_fault(int signo, int code, void __user *addr, const char *str); -void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, const char *str); -void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, const char *str); +void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str); +void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str); +void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str); /* * Move regs->pc to next instruction and do necessary setup before it diff --git a/arch/arm64/include/asm/uaccess.h b/arch/arm64/include/asm/uaccess.h index 991dd5f031e4..abb31aa1f8ca 100644 --- a/arch/arm64/include/asm/uaccess.h +++ b/arch/arm64/include/asm/uaccess.h @@ -24,44 +24,18 @@ #include #include -#define get_fs() (current_thread_info()->addr_limit) - -static inline void set_fs(mm_segment_t fs) -{ - current_thread_info()->addr_limit = fs; - - /* - * Prevent a mispredicted conditional call to set_fs from forwarding - * the wrong address limit to access_ok under speculation. - */ - spec_bar(); - - /* On user-mode return, check fs is correct */ - set_thread_flag(TIF_FSCHECK); - - /* - * Enable/disable UAO so that copy_to_user() etc can access - * kernel memory with the unprivileged instructions. - */ - if (IS_ENABLED(CONFIG_ARM64_UAO) && fs == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO, - CONFIG_ARM64_UAO)); -} - -#define uaccess_kernel() (get_fs() == KERNEL_DS) +#define HAVE_GET_KERNEL_NOFAULT /* * Test whether a block of memory is a valid user space address. * Returns 1 if the range is valid, 0 otherwise. * * This is equivalent to the following test: - * (u65)addr + (u65)size <= (u65)current->addr_limit + 1 + * (u65)addr + (u65)size <= (u65)TASK_SIZE_MAX */ static inline unsigned long __range_ok(const void __user *addr, unsigned long size) { - unsigned long ret, limit = current_thread_info()->addr_limit; + unsigned long ret, limit = TASK_SIZE_MAX - 1; /* * Asynchronous I/O running in a kernel thread does not have the @@ -94,7 +68,6 @@ static inline unsigned long __range_ok(const void __user *addr, unsigned long si } #define access_ok(addr, size) __range_ok(addr, size) -#define user_addr_max get_fs #define _ASM_EXTABLE(from, to) \ " .pushsection __ex_table, \"a\"\n" \ @@ -113,8 +86,8 @@ static inline void __uaccess_ttbr0_disable(void) local_irq_save(flags); ttbr = read_sysreg(ttbr1_el1); ttbr &= ~TTBR_ASID_MASK; - /* reserved_ttbr0 placed before swapper_pg_dir */ - write_sysreg(ttbr - RESERVED_TTBR0_SIZE, ttbr0_el1); + /* reserved_pg_dir placed before swapper_pg_dir */ + write_sysreg(ttbr - PAGE_SIZE, ttbr0_el1); isb(); /* Set reserved ASID */ write_sysreg(ttbr, ttbr1_el1); @@ -186,47 +159,26 @@ static inline void __uaccess_enable_hw_pan(void) CONFIG_ARM64_PAN)); } -#define __uaccess_disable(alt) \ -do { \ - if (!uaccess_ttbr0_disable()) \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(1), alt, \ - CONFIG_ARM64_PAN)); \ -} while (0) - -#define __uaccess_enable(alt) \ -do { \ - if (!uaccess_ttbr0_enable()) \ - asm(ALTERNATIVE("nop", SET_PSTATE_PAN(0), alt, \ - CONFIG_ARM64_PAN)); \ -} while (0) - -static inline void uaccess_disable(void) +static inline void uaccess_disable_privileged(void) { - __uaccess_disable(ARM64_HAS_PAN); + if (uaccess_ttbr0_disable()) + return; + + __uaccess_enable_hw_pan(); } -static inline void uaccess_enable(void) +static inline void uaccess_enable_privileged(void) { - __uaccess_enable(ARM64_HAS_PAN); + if (uaccess_ttbr0_enable()) + return; + + __uaccess_disable_hw_pan(); } /* - * These functions are no-ops when UAO is present. - */ -static inline void uaccess_disable_not_uao(void) -{ - __uaccess_disable(ARM64_ALT_PAN_NOT_UAO); -} - -static inline void uaccess_enable_not_uao(void) -{ - __uaccess_enable(ARM64_ALT_PAN_NOT_UAO); -} - -/* - * Sanitise a uaccess pointer such that it becomes NULL if above the - * current addr_limit. In case the pointer is tagged (has the top byte set), - * untag the pointer before checking. + * Sanitise a uaccess pointer such that it becomes NULL if above the maximum + * user address. In case the pointer is tagged (has the top byte set), untag + * the pointer before checking. */ #define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr) static inline void __user *__uaccess_mask_ptr(const void __user *ptr) @@ -237,7 +189,7 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) " bics xzr, %3, %2\n" " csel %0, %1, xzr, eq\n" : "=&r" (safe_ptr) - : "r" (ptr), "r" (current_thread_info()->addr_limit), + : "r" (ptr), "r" (TASK_SIZE_MAX - 1), "r" (untagged_addr(ptr)) : "cc"); @@ -253,10 +205,9 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) * The "__xxx_error" versions set the third argument to -EFAULT if an error * occurs, and leave it unchanged on success. */ -#define __get_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ +#define __get_mem_asm(load, reg, x, addr, err) \ asm volatile( \ - "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ - alt_instr " " reg "1, [%2]\n", feature) \ + "1: " load " " reg "1, [%2]\n" \ "2:\n" \ " .section .fixup, \"ax\"\n" \ " .align 2\n" \ @@ -268,35 +219,36 @@ static inline void __user *__uaccess_mask_ptr(const void __user *ptr) : "+r" (err), "=&r" (x) \ : "r" (addr), "i" (-EFAULT)) -#define __raw_get_user(x, ptr, err) \ +#define __raw_get_mem(ldr, x, ptr, err) \ do { \ unsigned long __gu_val; \ - __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ - __get_user_asm("ldrb", "ldtrb", "%w", __gu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __get_mem_asm(ldr "b", "%w", __gu_val, (ptr), (err)); \ break; \ case 2: \ - __get_user_asm("ldrh", "ldtrh", "%w", __gu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __get_mem_asm(ldr "h", "%w", __gu_val, (ptr), (err)); \ break; \ case 4: \ - __get_user_asm("ldr", "ldtr", "%w", __gu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __get_mem_asm(ldr, "%w", __gu_val, (ptr), (err)); \ break; \ case 8: \ - __get_user_asm("ldr", "ldtr", "%x", __gu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __get_mem_asm(ldr, "%x", __gu_val, (ptr), (err)); \ break; \ default: \ BUILD_BUG(); \ } \ - uaccess_disable_not_uao(); \ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) +#define __raw_get_user(x, ptr, err) \ +do { \ + __chk_user_ptr(ptr); \ + uaccess_ttbr0_enable(); \ + __raw_get_mem("ldtr", x, ptr, err); \ + uaccess_ttbr0_disable(); \ +} while (0) + #define __get_user_error(x, ptr, err) \ do { \ __typeof__(*(ptr)) __user *__p = (ptr); \ @@ -318,10 +270,19 @@ do { \ #define get_user __get_user -#define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ +#define __get_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __gkn_err = 0; \ + \ + __raw_get_mem("ldr", *((type *)(dst)), \ + (__force type *)(src), __gkn_err); \ + if (unlikely(__gkn_err)) \ + goto err_label; \ +} while (0) + +#define __put_mem_asm(store, reg, x, addr, err) \ asm volatile( \ - "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ - alt_instr " " reg "1, [%2]\n", feature) \ + "1: " store " " reg "1, [%2]\n" \ "2:\n" \ " .section .fixup,\"ax\"\n" \ " .align 2\n" \ @@ -332,32 +293,33 @@ do { \ : "+r" (err) \ : "r" (x), "r" (addr), "i" (-EFAULT)) -#define __raw_put_user(x, ptr, err) \ +#define __raw_put_mem(str, x, ptr, err) \ do { \ __typeof__(*(ptr)) __pu_val = (x); \ - __chk_user_ptr(ptr); \ - uaccess_enable_not_uao(); \ switch (sizeof(*(ptr))) { \ case 1: \ - __put_user_asm("strb", "sttrb", "%w", __pu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __put_mem_asm(str "b", "%w", __pu_val, (ptr), (err)); \ break; \ case 2: \ - __put_user_asm("strh", "sttrh", "%w", __pu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __put_mem_asm(str "h", "%w", __pu_val, (ptr), (err)); \ break; \ case 4: \ - __put_user_asm("str", "sttr", "%w", __pu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __put_mem_asm(str, "%w", __pu_val, (ptr), (err)); \ break; \ case 8: \ - __put_user_asm("str", "sttr", "%x", __pu_val, (ptr), \ - (err), ARM64_HAS_UAO); \ + __put_mem_asm(str, "%x", __pu_val, (ptr), (err)); \ break; \ default: \ BUILD_BUG(); \ } \ - uaccess_disable_not_uao(); \ +} while (0) + +#define __raw_put_user(x, ptr, err) \ +do { \ + __chk_user_ptr(ptr); \ + uaccess_ttbr0_enable(); \ + __raw_put_mem("sttr", x, ptr, err); \ + uaccess_ttbr0_disable(); \ } while (0) #define __put_user_error(x, ptr, err) \ @@ -381,14 +343,24 @@ do { \ #define put_user __put_user +#define __put_kernel_nofault(dst, src, type, err_label) \ +do { \ + int __pkn_err = 0; \ + \ + __raw_put_mem("str", *((type *)(src)), \ + (__force type *)(dst), __pkn_err); \ + if (unlikely(__pkn_err)) \ + goto err_label; \ +} while(0) + extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); #define raw_copy_from_user(to, from, n) \ ({ \ unsigned long __acfu_ret; \ - uaccess_enable_not_uao(); \ + uaccess_ttbr0_enable(); \ __acfu_ret = __arch_copy_from_user((to), \ __uaccess_mask_ptr(from), (n)); \ - uaccess_disable_not_uao(); \ + uaccess_ttbr0_disable(); \ __acfu_ret; \ }) @@ -396,10 +368,10 @@ extern unsigned long __must_check __arch_copy_to_user(void __user *to, const voi #define raw_copy_to_user(to, from, n) \ ({ \ unsigned long __actu_ret; \ - uaccess_enable_not_uao(); \ + uaccess_ttbr0_enable(); \ __actu_ret = __arch_copy_to_user(__uaccess_mask_ptr(to), \ (from), (n)); \ - uaccess_disable_not_uao(); \ + uaccess_ttbr0_disable(); \ __actu_ret; \ }) @@ -407,10 +379,10 @@ extern unsigned long __must_check __arch_copy_in_user(void __user *to, const voi #define raw_copy_in_user(to, from, n) \ ({ \ unsigned long __aciu_ret; \ - uaccess_enable_not_uao(); \ + uaccess_ttbr0_enable(); \ __aciu_ret = __arch_copy_in_user(__uaccess_mask_ptr(to), \ __uaccess_mask_ptr(from), (n)); \ - uaccess_disable_not_uao(); \ + uaccess_ttbr0_disable(); \ __aciu_ret; \ }) @@ -421,9 +393,9 @@ extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned lo static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n) { if (access_ok(to, n)) { - uaccess_enable_not_uao(); + uaccess_ttbr0_enable(); n = __arch_clear_user(__uaccess_mask_ptr(to), n); - uaccess_disable_not_uao(); + uaccess_ttbr0_disable(); } return n; } diff --git a/arch/arm64/kernel/Makefile b/arch/arm64/kernel/Makefile index bbaf0bc4ad60..86364ab6f13f 100644 --- a/arch/arm64/kernel/Makefile +++ b/arch/arm64/kernel/Makefile @@ -58,7 +58,6 @@ obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_CRASH_CORE) += crash_core.o obj-$(CONFIG_ARM_SDE_INTERFACE) += sdei.o obj-$(CONFIG_ARM64_PTR_AUTH) += pointer_auth.o -obj-$(CONFIG_SHADOW_CALL_STACK) += scs.o obj-$(CONFIG_ARM64_MTE) += mte.o obj-y += vdso/ probes/ diff --git a/arch/arm64/kernel/alternative.c b/arch/arm64/kernel/alternative.c index 73039949b5ce..a57cffb752e8 100644 --- a/arch/arm64/kernel/alternative.c +++ b/arch/arm64/kernel/alternative.c @@ -21,7 +21,8 @@ #define ALT_ORIG_PTR(a) __ALT_PTR(a, orig_offset) #define ALT_REPL_PTR(a) __ALT_PTR(a, alt_offset) -static int all_alternatives_applied; +/* Volatile, as we may be patching the guts of READ_ONCE() */ +static volatile int all_alternatives_applied; static DECLARE_BITMAP(applied_alternatives, ARM64_NCAPS); @@ -205,7 +206,7 @@ static int __apply_alternatives_multi_stop(void *unused) /* We always have a CPU 0 at this point (__init) */ if (smp_processor_id()) { - while (!READ_ONCE(all_alternatives_applied)) + while (!all_alternatives_applied) cpu_relax(); isb(); } else { @@ -217,7 +218,7 @@ static int __apply_alternatives_multi_stop(void *unused) BUG_ON(all_alternatives_applied); __apply_alternatives(®ion, false, remaining_capabilities); /* Barriers provided by the cache flushing */ - WRITE_ONCE(all_alternatives_applied, 1); + all_alternatives_applied = 1; } return 0; diff --git a/arch/arm64/kernel/armv8_deprecated.c b/arch/arm64/kernel/armv8_deprecated.c index 7364de008bab..0e86e8b9cedd 100644 --- a/arch/arm64/kernel/armv8_deprecated.c +++ b/arch/arm64/kernel/armv8_deprecated.c @@ -277,7 +277,7 @@ static void __init register_insn_emulation_sysctl(void) #define __user_swpX_asm(data, addr, res, temp, temp2, B) \ do { \ - uaccess_enable(); \ + uaccess_enable_privileged(); \ __asm__ __volatile__( \ " mov %w3, %w7\n" \ "0: ldxr"B" %w2, [%4]\n" \ @@ -302,7 +302,7 @@ do { \ "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ : "memory"); \ - uaccess_disable(); \ + uaccess_disable_privileged(); \ } while (0) #define __user_swp_asm(data, addr, res, temp, temp2) \ diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 7d32fc959b1a..679b19b8a7ff 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -30,7 +30,6 @@ int main(void) BLANK(); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); - DEFINE(TSK_TI_ADDR_LIMIT, offsetof(struct task_struct, thread_info.addr_limit)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif @@ -70,7 +69,7 @@ int main(void) DEFINE(S_PSTATE, offsetof(struct pt_regs, pstate)); DEFINE(S_PC, offsetof(struct pt_regs, pc)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); - DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_SDEI_TTBR1, offsetof(struct pt_regs, sdei_ttbr1)); DEFINE(S_PMR_SAVE, offsetof(struct pt_regs, pmr_save)); DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index 2d20cec1ded6..bc616a085905 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -152,10 +152,6 @@ EXPORT_SYMBOL(cpu_hwcap_keys); .width = 0, \ } -/* meta feature for alternatives */ -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused); - static void cpu_enable_cnp(struct arm64_cpu_capabilities const *cap); static bool __system_matches_cap(unsigned int n); @@ -1532,8 +1528,10 @@ bool cpu_has_amu_feat(int cpu) return cpumask_test_cpu(cpu, &amu_cpus); } -/* Initialize the use of AMU counters for frequency invariance */ -extern void init_cpu_freq_invariance_counters(void); +int get_cpu_with_amu_feat(void) +{ + return cpumask_any(&amu_cpus); +} static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) { @@ -1541,7 +1539,7 @@ static void cpu_amu_enable(struct arm64_cpu_capabilities const *cap) pr_info("detected CPU%d: Activity Monitors Unit (AMU)\n", smp_processor_id()); cpumask_set_cpu(smp_processor_id(), &amu_cpus); - init_cpu_freq_invariance_counters(); + update_freq_counters_refs(); } } @@ -1563,6 +1561,11 @@ static bool has_amu(const struct arm64_cpu_capabilities *cap, return true; } +#else +int get_cpu_with_amu_feat(void) +{ + return nr_cpu_ids; +} #endif #ifdef CONFIG_ARM64_VHE @@ -1604,7 +1607,7 @@ static void cpu_enable_pan(const struct arm64_cpu_capabilities *__unused) WARN_ON_ONCE(in_interrupt()); sysreg_clear_set(sctlr_el1, SCTLR_EL1_SPAN, 0); - asm(SET_PSTATE_PAN(1)); + set_pstate_pan(1); } #endif /* CONFIG_ARM64_PAN */ @@ -1784,28 +1787,6 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .type = ARM64_CPUCAP_WEAK_LOCAL_CPU_FEATURE, .matches = has_no_hw_prefetch, }, -#ifdef CONFIG_ARM64_UAO - { - .desc = "User Access Override", - .capability = ARM64_HAS_UAO, - .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = has_cpuid_feature, - .sys_reg = SYS_ID_AA64MMFR2_EL1, - .field_pos = ID_AA64MMFR2_UAO_SHIFT, - .min_field_value = 1, - /* - * We rely on stop_machine() calling uao_thread_switch() to set - * UAO immediately after patching. - */ - }, -#endif /* CONFIG_ARM64_UAO */ -#ifdef CONFIG_ARM64_PAN - { - .capability = ARM64_ALT_PAN_NOT_UAO, - .type = ARM64_CPUCAP_SYSTEM_FEATURE, - .matches = cpufeature_pan_not_uao, - }, -#endif /* CONFIG_ARM64_PAN */ #ifdef CONFIG_ARM64_VHE { .desc = "Virtualization Host Extensions", @@ -2157,6 +2138,16 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .cpu_enable = cpu_enable_mte, }, #endif /* CONFIG_ARM64_MTE */ + { + .desc = "RCpc load-acquire (LDAPR)", + .capability = ARM64_HAS_LDAPR, + .type = ARM64_CPUCAP_SYSTEM_FEATURE, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .sign = FTR_UNSIGNED, + .field_pos = ID_AA64ISAR1_LRCPC_SHIFT, + .matches = has_cpuid_feature, + .min_field_value = 1, + }, {}, }; @@ -2672,7 +2663,7 @@ bool this_cpu_has_cap(unsigned int n) * - The SYSTEM_FEATURE cpu_hwcaps may not have been set. * In all other cases cpus_have_{const_}cap() should be used. */ -static bool __system_matches_cap(unsigned int n) +static bool __maybe_unused __system_matches_cap(unsigned int n) { if (n < ARM64_NCAPS) { const struct arm64_cpu_capabilities *cap = cpu_hwcaps_ptrs[n]; @@ -2752,12 +2743,6 @@ void __init setup_cpu_features(void) ARCH_DMA_MINALIGN); } -static bool __maybe_unused -cpufeature_pan_not_uao(const struct arm64_cpu_capabilities *entry, int __unused) -{ - return (__system_matches_cap(ARM64_HAS_PAN) && !__system_matches_cap(ARM64_HAS_UAO)); -} - static void __maybe_unused cpu_enable_cnp(struct arm64_cpu_capabilities const *cap) { cpu_replace_ttbr1(lm_alias(swapper_pg_dir)); diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 1e14bfa8c8a4..852abc57c2ce 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -234,9 +234,8 @@ static void send_user_sigtrap(int si_code) if (interrupts_enabled(regs)) local_irq_enable(); - arm64_force_sig_fault(SIGTRAP, si_code, - (void __user *)instruction_pointer(regs), - "User debug trap"); + arm64_force_sig_fault(SIGTRAP, si_code, instruction_pointer(regs), + "User debug trap"); } static int single_step_handler(unsigned long unused, unsigned int esr, diff --git a/arch/arm64/kernel/efi-header.S b/arch/arm64/kernel/efi-header.S index a71844fb923e..28d8a5dca5f1 100644 --- a/arch/arm64/kernel/efi-header.S +++ b/arch/arm64/kernel/efi-header.S @@ -7,30 +7,48 @@ #include #include + .macro efi_signature_nop +#ifdef CONFIG_EFI +.L_head: + /* + * This ccmp instruction has no meaningful effect except that + * its opcode forms the magic "MZ" signature required by UEFI. + */ + ccmp x18, #0, #0xd, pl +#else + /* + * Bootloaders may inspect the opcode at the start of the kernel + * image to decide if the kernel is capable of booting via UEFI. + * So put an ordinary NOP here, not the "MZ.." pseudo-nop above. + */ + nop +#endif + .endm + .macro __EFI_PE_HEADER +#ifdef CONFIG_EFI + .set .Lpe_header_offset, . - .L_head .long PE_MAGIC -coff_header: .short IMAGE_FILE_MACHINE_ARM64 // Machine - .short section_count // NumberOfSections + .short .Lsection_count // NumberOfSections .long 0 // TimeDateStamp .long 0 // PointerToSymbolTable .long 0 // NumberOfSymbols - .short section_table - optional_header // SizeOfOptionalHeader + .short .Lsection_table - .Loptional_header // SizeOfOptionalHeader .short IMAGE_FILE_DEBUG_STRIPPED | \ IMAGE_FILE_EXECUTABLE_IMAGE | \ IMAGE_FILE_LINE_NUMS_STRIPPED // Characteristics -optional_header: +.Loptional_header: .short PE_OPT_MAGIC_PE32PLUS // PE32+ format .byte 0x02 // MajorLinkerVersion .byte 0x14 // MinorLinkerVersion - .long __initdata_begin - efi_header_end // SizeOfCode + .long __initdata_begin - .Lefi_header_end // SizeOfCode .long __pecoff_data_size // SizeOfInitializedData .long 0 // SizeOfUninitializedData - .long __efistub_efi_pe_entry - _head // AddressOfEntryPoint - .long efi_header_end - _head // BaseOfCode + .long __efistub_efi_pe_entry - .L_head // AddressOfEntryPoint + .long .Lefi_header_end - .L_head // BaseOfCode -extra_header_fields: .quad 0 // ImageBase .long SEGMENT_ALIGN // SectionAlignment .long PECOFF_FILE_ALIGNMENT // FileAlignment @@ -42,10 +60,10 @@ extra_header_fields: .short 0 // MinorSubsystemVersion .long 0 // Win32VersionValue - .long _end - _head // SizeOfImage + .long _end - .L_head // SizeOfImage // Everything before the kernel image is considered part of the header - .long efi_header_end - _head // SizeOfHeaders + .long .Lefi_header_end - .L_head // SizeOfHeaders .long 0 // CheckSum .short IMAGE_SUBSYSTEM_EFI_APPLICATION // Subsystem .short 0 // DllCharacteristics @@ -54,7 +72,7 @@ extra_header_fields: .quad 0 // SizeOfHeapReserve .quad 0 // SizeOfHeapCommit .long 0 // LoaderFlags - .long (section_table - .) / 8 // NumberOfRvaAndSizes + .long (.Lsection_table - .) / 8 // NumberOfRvaAndSizes .quad 0 // ExportTable .quad 0 // ImportTable @@ -64,17 +82,17 @@ extra_header_fields: .quad 0 // BaseRelocationTable #ifdef CONFIG_DEBUG_EFI - .long efi_debug_table - _head // DebugTable - .long efi_debug_table_size + .long .Lefi_debug_table - .L_head // DebugTable + .long .Lefi_debug_table_size #endif // Section table -section_table: +.Lsection_table: .ascii ".text\0\0\0" - .long __initdata_begin - efi_header_end // VirtualSize - .long efi_header_end - _head // VirtualAddress - .long __initdata_begin - efi_header_end // SizeOfRawData - .long efi_header_end - _head // PointerToRawData + .long __initdata_begin - .Lefi_header_end // VirtualSize + .long .Lefi_header_end - .L_head // VirtualAddress + .long __initdata_begin - .Lefi_header_end // SizeOfRawData + .long .Lefi_header_end - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -86,9 +104,9 @@ section_table: .ascii ".data\0\0\0" .long __pecoff_data_size // VirtualSize - .long __initdata_begin - _head // VirtualAddress + .long __initdata_begin - .L_head // VirtualAddress .long __pecoff_data_rawsize // SizeOfRawData - .long __initdata_begin - _head // PointerToRawData + .long __initdata_begin - .L_head // PointerToRawData .long 0 // PointerToRelocations .long 0 // PointerToLineNumbers @@ -98,7 +116,7 @@ section_table: IMAGE_SCN_MEM_READ | \ IMAGE_SCN_MEM_WRITE // Characteristics - .set section_count, (. - section_table) / 40 + .set .Lsection_count, (. - .Lsection_table) / 40 #ifdef CONFIG_DEBUG_EFI /* @@ -114,21 +132,21 @@ section_table: __INITRODATA .align 2 -efi_debug_table: +.Lefi_debug_table: // EFI_IMAGE_DEBUG_DIRECTORY_ENTRY .long 0 // Characteristics .long 0 // TimeDateStamp .short 0 // MajorVersion .short 0 // MinorVersion .long IMAGE_DEBUG_TYPE_CODEVIEW // Type - .long efi_debug_entry_size // SizeOfData + .long .Lefi_debug_entry_size // SizeOfData .long 0 // RVA - .long efi_debug_entry - _head // FileOffset + .long .Lefi_debug_entry - .L_head // FileOffset - .set efi_debug_table_size, . - efi_debug_table + .set .Lefi_debug_table_size, . - .Lefi_debug_table .previous -efi_debug_entry: +.Lefi_debug_entry: // EFI_IMAGE_DEBUG_CODEVIEW_NB10_ENTRY .ascii "NB10" // Signature .long 0 // Unknown @@ -137,16 +155,12 @@ efi_debug_entry: .asciz VMLINUX_PATH - .set efi_debug_entry_size, . - efi_debug_entry + .set .Lefi_debug_entry_size, . - .Lefi_debug_entry #endif - /* - * EFI will load .text onwards at the 4k section alignment - * described in the PE/COFF header. To ensure that instruction - * sequences using an adrp and a :lo12: immediate will function - * correctly at this alignment, we must ensure that .text is - * placed at a 4k boundary in the Image to begin with. - */ .balign SEGMENT_ALIGN -efi_header_end: +.Lefi_header_end: +#else + .set .Lpe_header_offset, 0x0 +#endif .endm diff --git a/arch/arm64/kernel/entry-common.c b/arch/arm64/kernel/entry-common.c index 70e0a7591245..5346953e4382 100644 --- a/arch/arm64/kernel/entry-common.c +++ b/arch/arm64/kernel/entry-common.c @@ -115,7 +115,6 @@ static void noinstr el1_abort(struct pt_regs *regs, unsigned long esr) enter_from_kernel_mode(regs); local_daif_inherit(regs); - far = untagged_addr(far); do_mem_abort(far, esr, regs); local_daif_mask(); exit_to_kernel_mode(regs); @@ -256,7 +255,6 @@ static void noinstr el0_da(struct pt_regs *regs, unsigned long esr) enter_from_user_mode(); local_daif_restore(DAIF_PROCCTX); - far = untagged_addr(far); do_mem_abort(far, esr, regs); } diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index d72c818b019c..51c762156099 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -216,12 +216,6 @@ alternative_else_nop_endif .else add x21, sp, #S_FRAME_SIZE get_current_task tsk - /* Save the task's original addr_limit and set USER_DS */ - ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] - str x20, [sp, #S_ORIG_ADDR_LIMIT] - mov x20, #USER_DS - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ .endif /* \el == 0 */ mrs x22, elr_el1 mrs x23, spsr_el1 @@ -279,12 +273,6 @@ alternative_else_nop_endif .macro kernel_exit, el .if \el != 0 disable_daif - - /* Restore the task's original addr_limit. */ - ldr x20, [sp, #S_ORIG_ADDR_LIMIT] - str x20, [tsk, #TSK_TI_ADDR_LIMIT] - - /* No need to restore UAO, it will be restored from SPSR_EL1 */ .endif /* Restore pmr */ @@ -438,7 +426,7 @@ SYM_CODE_END(__swpan_exit_el0) #ifdef CONFIG_SHADOW_CALL_STACK /* also switch to the irq shadow stack */ - adr_this_cpu scs_sp, irq_shadow_call_stack, x26 + ldr_this_cpu scs_sp, irq_shadow_call_stack_ptr, x26 #endif 9998: @@ -773,9 +761,10 @@ SYM_CODE_END(ret_to_user) */ .pushsection ".entry.tramp.text", "ax" + // Move from tramp_pg_dir to swapper_pg_dir .macro tramp_map_kernel, tmp mrs \tmp, ttbr1_el1 - add \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) + add \tmp, \tmp, #(2 * PAGE_SIZE) bic \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 @@ -792,9 +781,10 @@ alternative_else_nop_endif #endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */ .endm + // Move from swapper_pg_dir to tramp_pg_dir .macro tramp_unmap_kernel, tmp mrs \tmp, ttbr1_el1 - sub \tmp, \tmp, #(PAGE_SIZE + RESERVED_TTBR0_SIZE) + sub \tmp, \tmp, #(2 * PAGE_SIZE) orr \tmp, \tmp, #USER_ASID_FLAG msr ttbr1_el1, \tmp /* @@ -965,10 +955,9 @@ SYM_CODE_START(__sdei_asm_entry_trampoline) mov x4, xzr /* - * Use reg->interrupted_regs.addr_limit to remember whether to unmap - * the kernel on exit. + * Remember whether to unmap the kernel on exit. */ -1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] +1: str x4, [x1, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] #ifdef CONFIG_RANDOMIZE_BASE adr x4, tramp_vectors + PAGE_SIZE @@ -989,7 +978,7 @@ NOKPROBE(__sdei_asm_entry_trampoline) * x4: struct sdei_registered_event argument from registration time. */ SYM_CODE_START(__sdei_asm_exit_trampoline) - ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_ORIG_ADDR_LIMIT)] + ldr x4, [x4, #(SDEI_EVENT_INTREGS + S_SDEI_TTBR1)] cbnz x4, 1f tramp_unmap_kernel tmp=x4 @@ -1063,9 +1052,9 @@ SYM_CODE_START(__sdei_asm_handler) #ifdef CONFIG_SHADOW_CALL_STACK /* Use a separate shadow call stack for normal and critical events */ cbnz w4, 3f - adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal, tmp=x6 + ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_normal_ptr, tmp=x6 b 4f -3: adr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical, tmp=x6 +3: ldr_this_cpu dst=scs_sp, sym=sdei_shadow_call_stack_critical_ptr, tmp=x6 4: #endif diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index d8d9caf02834..f2eb206920a2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -58,21 +58,11 @@ * in the entry routines. */ __HEAD -_head: /* * DO NOT MODIFY. Image header expected by Linux boot-loaders. */ -#ifdef CONFIG_EFI - /* - * This add instruction has no meaningful effect except that - * its opcode forms the magic "MZ" signature required by UEFI. - */ - add x13, x18, #0x16 - b primary_entry -#else + efi_signature_nop // special NOP to identity as PE/COFF executable b primary_entry // branch to kernel start, magic - .long 0 // reserved -#endif .quad 0 // Image load offset from start of RAM, little-endian le64sym _kernel_size_le // Effective size of kernel image, little-endian le64sym _kernel_flags_le // Informative flags, little-endian @@ -80,14 +70,9 @@ _head: .quad 0 // reserved .quad 0 // reserved .ascii ARM64_IMAGE_MAGIC // Magic number -#ifdef CONFIG_EFI - .long pe_header - _head // Offset to the PE header. + .long .Lpe_header_offset // Offset to the PE header. -pe_header: __EFI_PE_HEADER -#else - .long 0 // reserved -#endif __INIT @@ -104,7 +89,7 @@ pe_header: */ SYM_CODE_START(primary_entry) bl preserve_boot_args - bl el2_setup // Drop to EL1, w0=cpu_boot_mode + bl init_kernel_el // w0=cpu_boot_mode adrp x23, __PHYS_OFFSET and x23, x23, MIN_KIMG_ALIGN - 1 // KASLR offset, defaults to 0 bl set_cpu_boot_mode_flag @@ -482,24 +467,33 @@ EXPORT_SYMBOL(kimage_vaddr) .section ".idmap.text","awx" /* - * If we're fortunate enough to boot at EL2, ensure that the world is - * sane before dropping to EL1. + * Starting from EL2 or EL1, configure the CPU to execute at the highest + * reachable EL supported by the kernel in a chosen default state. If dropping + * from EL2 to EL1, configure EL2 before configuring EL1. + * + * Since we cannot always rely on ERET synchronizing writes to sysregs (e.g. if + * SCTLR_ELx.EOS is clear), we place an ISB prior to ERET. * * Returns either BOOT_CPU_MODE_EL1 or BOOT_CPU_MODE_EL2 in w0 if * booted in EL1 or EL2 respectively. */ -SYM_FUNC_START(el2_setup) - msr SPsel, #1 // We want to use SP_EL{1,2} +SYM_FUNC_START(init_kernel_el) mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 - b.eq 1f - mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) - msr sctlr_el1, x0 - mov w0, #BOOT_CPU_MODE_EL1 // This cpu booted in EL1 - isb - ret + b.eq init_el2 -1: mov_q x0, (SCTLR_EL2_RES1 | ENDIAN_SET_EL2) +SYM_INNER_LABEL(init_el1, SYM_L_LOCAL) + mov_q x0, INIT_SCTLR_EL1_MMU_OFF + msr sctlr_el1, x0 + isb + mov_q x0, INIT_PSTATE_EL1 + msr spsr_el1, x0 + msr elr_el1, lr + mov w0, #BOOT_CPU_MODE_EL1 + eret + +SYM_INNER_LABEL(init_el2, SYM_L_LOCAL) + mov_q x0, INIT_SCTLR_EL2_MMU_OFF msr sctlr_el2, x0 #ifdef CONFIG_ARM64_VHE @@ -608,9 +602,12 @@ set_hcr: cbz x2, install_el2_stub - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 isb - ret + mov_q x0, INIT_PSTATE_EL2 + msr spsr_el2, x0 + msr elr_el2, lr + mov w0, #BOOT_CPU_MODE_EL2 + eret SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) /* @@ -620,7 +617,7 @@ SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) * requires no configuration, and all non-hyp-specific EL2 setup * will be done via the _EL1 system register aliases in __cpu_setup. */ - mov_q x0, (SCTLR_EL1_RES1 | ENDIAN_SET_EL1) + mov_q x0, INIT_SCTLR_EL1_MMU_OFF msr sctlr_el1, x0 /* Coprocessor traps. */ @@ -642,14 +639,13 @@ SYM_INNER_LABEL(install_el2_stub, SYM_L_LOCAL) 7: adr_l x0, __hyp_stub_vectors msr vbar_el2, x0 - /* spsr */ - mov x0, #(PSR_F_BIT | PSR_I_BIT | PSR_A_BIT | PSR_D_BIT |\ - PSR_MODE_EL1h) + isb + mov x0, #INIT_PSTATE_EL1 msr spsr_el2, x0 msr elr_el2, lr - mov w0, #BOOT_CPU_MODE_EL2 // This CPU booted in EL2 + mov w0, #BOOT_CPU_MODE_EL2 eret -SYM_FUNC_END(el2_setup) +SYM_FUNC_END(init_kernel_el) /* * Sets the __boot_cpu_mode flag depending on the CPU boot mode passed @@ -699,7 +695,7 @@ SYM_DATA_END(__early_cpu_boot_status) * cores are held until we're ready for them to initialise. */ SYM_FUNC_START(secondary_holding_pen) - bl el2_setup // Drop to EL1, w0=cpu_boot_mode + bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag mrs x0, mpidr_el1 mov_q x1, MPIDR_HWID_BITMASK @@ -717,7 +713,7 @@ SYM_FUNC_END(secondary_holding_pen) * be used where CPUs are brought online dynamically by the kernel. */ SYM_FUNC_START(secondary_entry) - bl el2_setup // Drop to EL1 + bl init_kernel_el // w0=cpu_boot_mode bl set_cpu_boot_mode_flag b secondary_startup SYM_FUNC_END(secondary_entry) diff --git a/arch/arm64/kernel/irq.c b/arch/arm64/kernel/irq.c index 60456a62da11..dfb1feab867d 100644 --- a/arch/arm64/kernel/irq.c +++ b/arch/arm64/kernel/irq.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -27,6 +28,25 @@ DEFINE_PER_CPU(struct nmi_ctx, nmi_contexts); DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); + +DECLARE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, irq_shadow_call_stack_ptr); +#endif + +static void init_irq_scs(void) +{ + int cpu; + + if (!IS_ENABLED(CONFIG_SHADOW_CALL_STACK)) + return; + + for_each_possible_cpu(cpu) + per_cpu(irq_shadow_call_stack_ptr, cpu) = + scs_alloc(cpu_to_node(cpu)); +} + #ifdef CONFIG_VMAP_STACK static void init_irq_stacks(void) { @@ -54,6 +74,7 @@ static void init_irq_stacks(void) void __init init_IRQ(void) { init_irq_stacks(); + init_irq_scs(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); diff --git a/arch/arm64/kernel/kaslr.c b/arch/arm64/kernel/kaslr.c index b181e0544b79..0921aa1520b0 100644 --- a/arch/arm64/kernel/kaslr.c +++ b/arch/arm64/kernel/kaslr.c @@ -50,10 +50,16 @@ static __init u64 get_kaslr_seed(void *fdt) return ret; } -static __init const u8 *kaslr_get_cmdline(void *fdt) +static __init bool cmdline_contains_nokaslr(const u8 *cmdline) { - static __initconst const u8 default_cmdline[] = CONFIG_CMDLINE; + const u8 *str; + str = strstr(cmdline, "nokaslr"); + return str == cmdline || (str > cmdline && *(str - 1) == ' '); +} + +static __init bool is_kaslr_disabled_cmdline(void *fdt) +{ if (!IS_ENABLED(CONFIG_CMDLINE_FORCE)) { int node; const u8 *prop; @@ -65,10 +71,17 @@ static __init const u8 *kaslr_get_cmdline(void *fdt) prop = fdt_getprop(fdt, node, "bootargs", NULL); if (!prop) goto out; - return prop; + + if (cmdline_contains_nokaslr(prop)) + return true; + + if (IS_ENABLED(CONFIG_CMDLINE_EXTEND)) + goto out; + + return false; } out: - return default_cmdline; + return cmdline_contains_nokaslr(CONFIG_CMDLINE); } /* @@ -83,7 +96,6 @@ u64 __init kaslr_early_init(u64 dt_phys) { void *fdt; u64 seed, offset, mask, module_range; - const u8 *cmdline, *str; unsigned long raw; int size; @@ -115,9 +127,7 @@ u64 __init kaslr_early_init(u64 dt_phys) * Check if 'nokaslr' appears on the command line, and * return 0 if that is the case. */ - cmdline = kaslr_get_cmdline(fdt); - str = strstr(cmdline, "nokaslr"); - if (str == cmdline || (str > cmdline && *(str - 1) == ' ')) { + if (is_kaslr_disabled_cmdline(fdt)) { kaslr_status = KASLR_DISABLED_CMDLINE; return 0; } diff --git a/arch/arm64/kernel/mte.c b/arch/arm64/kernel/mte.c index 52a0638ed967..ef15c8a2a49d 100644 --- a/arch/arm64/kernel/mte.c +++ b/arch/arm64/kernel/mte.c @@ -189,7 +189,8 @@ long get_mte_ctrl(struct task_struct *task) switch (task->thread.sctlr_tcf0) { case SCTLR_EL1_TCF0_NONE: - return PR_MTE_TCF_NONE; + ret |= PR_MTE_TCF_NONE; + break; case SCTLR_EL1_TCF0_SYNC: ret |= PR_MTE_TCF_SYNC; break; diff --git a/arch/arm64/kernel/perf_event.c b/arch/arm64/kernel/perf_event.c index 3605f77ad4df..38bb07eff872 100644 --- a/arch/arm64/kernel/perf_event.c +++ b/arch/arm64/kernel/perf_event.c @@ -23,6 +23,8 @@ #include #include #include +#include +#include /* ARMv8 Cortex-A53 specific event types. */ #define ARMV8_A53_PERFCTR_PREF_LINEFILL 0xC2 @@ -1248,10 +1250,21 @@ static struct platform_driver armv8_pmu_driver = { static int __init armv8_pmu_driver_init(void) { + int ret; + if (acpi_disabled) - return platform_driver_register(&armv8_pmu_driver); + ret = platform_driver_register(&armv8_pmu_driver); else - return arm_pmu_acpi_probe(armv8_pmuv3_init); + ret = arm_pmu_acpi_probe(armv8_pmuv3_init); + + /* + * Try to re-initialize lockup detector after PMU init in + * case PMU events are triggered via NMIs. + */ + if (ret == 0 && arm_pmu_irq_is_nmi()) + lockup_detector_init(); + + return ret; } device_initcall(armv8_pmu_driver_init) @@ -1309,3 +1322,27 @@ void arch_perf_update_userpage(struct perf_event *event, userpg->cap_user_time_zero = 1; userpg->cap_user_time_short = 1; } + +#ifdef CONFIG_HARDLOCKUP_DETECTOR_PERF +/* + * Safe maximum CPU frequency in case a particular platform doesn't implement + * cpufreq driver. Although, architecture doesn't put any restrictions on + * maximum frequency but 5 GHz seems to be safe maximum given the available + * Arm CPUs in the market which are clocked much less than 5 GHz. On the other + * hand, we can't make it much higher as it would lead to a large hard-lockup + * detection timeout on parts which are running slower (eg. 1GHz on + * Developerbox) and doesn't possess a cpufreq driver. + */ +#define SAFE_MAX_CPU_FREQ 5000000000UL // 5 GHz +u64 hw_nmi_get_sample_period(int watchdog_thresh) +{ + unsigned int cpu = smp_processor_id(); + unsigned long max_cpu_freq; + + max_cpu_freq = cpufreq_get_hw_max_freq(cpu) * 1000UL; + if (!max_cpu_freq) + max_cpu_freq = SAFE_MAX_CPU_FREQ; + + return (u64)max_cpu_freq * watchdog_thresh; +} +#endif diff --git a/arch/arm64/kernel/probes/kprobes.c b/arch/arm64/kernel/probes/kprobes.c index f11a1a1f7026..89c64ada8732 100644 --- a/arch/arm64/kernel/probes/kprobes.c +++ b/arch/arm64/kernel/probes/kprobes.c @@ -34,7 +34,7 @@ DEFINE_PER_CPU(struct kprobe *, current_kprobe) = NULL; DEFINE_PER_CPU(struct kprobe_ctlblk, kprobe_ctlblk); static void __kprobes -post_kprobe_handler(struct kprobe_ctlblk *, struct pt_regs *); +post_kprobe_handler(struct kprobe *, struct kprobe_ctlblk *, struct pt_regs *); static void __kprobes arch_prepare_ss_slot(struct kprobe *p) { @@ -68,7 +68,7 @@ static void __kprobes arch_simulate_insn(struct kprobe *p, struct pt_regs *regs) p->ainsn.api.handler((u32)p->opcode, (long)p->addr, regs); /* single step simulated, now go for post processing */ - post_kprobe_handler(kcb, regs); + post_kprobe_handler(p, kcb, regs); } int __kprobes arch_prepare_kprobe(struct kprobe *p) @@ -177,19 +177,6 @@ static void __kprobes kprobes_restore_local_irqflag(struct kprobe_ctlblk *kcb, regs->pstate |= kcb->saved_irqflag; } -static void __kprobes -set_ss_context(struct kprobe_ctlblk *kcb, unsigned long addr) -{ - kcb->ss_ctx.ss_pending = true; - kcb->ss_ctx.match_addr = addr + sizeof(kprobe_opcode_t); -} - -static void __kprobes clear_ss_context(struct kprobe_ctlblk *kcb) -{ - kcb->ss_ctx.ss_pending = false; - kcb->ss_ctx.match_addr = 0; -} - static void __kprobes setup_singlestep(struct kprobe *p, struct pt_regs *regs, struct kprobe_ctlblk *kcb, int reenter) @@ -209,7 +196,6 @@ static void __kprobes setup_singlestep(struct kprobe *p, /* prepare for single stepping */ slot = (unsigned long)p->ainsn.api.insn; - set_ss_context(kcb, slot); /* mark pending ss */ kprobes_save_local_irqflag(kcb, regs); instruction_pointer_set(regs, slot); } else { @@ -243,13 +229,8 @@ static int __kprobes reenter_kprobe(struct kprobe *p, } static void __kprobes -post_kprobe_handler(struct kprobe_ctlblk *kcb, struct pt_regs *regs) +post_kprobe_handler(struct kprobe *cur, struct kprobe_ctlblk *kcb, struct pt_regs *regs) { - struct kprobe *cur = kprobe_running(); - - if (!cur) - return; - /* return addr restore if non-branching insn */ if (cur->ainsn.api.restore != 0) instruction_pointer_set(regs, cur->ainsn.api.restore); @@ -364,33 +345,23 @@ static void __kprobes kprobe_handler(struct pt_regs *regs) */ } -static int __kprobes -kprobe_ss_hit(struct kprobe_ctlblk *kcb, unsigned long addr) -{ - if ((kcb->ss_ctx.ss_pending) - && (kcb->ss_ctx.match_addr == addr)) { - clear_ss_context(kcb); /* clear pending ss */ - return DBG_HOOK_HANDLED; - } - /* not ours, kprobes should ignore it */ - return DBG_HOOK_ERROR; -} - static int __kprobes kprobe_breakpoint_ss_handler(struct pt_regs *regs, unsigned int esr) { struct kprobe_ctlblk *kcb = get_kprobe_ctlblk(); - int retval; + unsigned long addr = instruction_pointer(regs); + struct kprobe *cur = kprobe_running(); - /* return error if this is not our step */ - retval = kprobe_ss_hit(kcb, instruction_pointer(regs)); - - if (retval == DBG_HOOK_HANDLED) { + if (cur && (kcb->kprobe_status == KPROBE_HIT_SS) + && ((unsigned long)&cur->ainsn.api.insn[1] == addr)) { kprobes_restore_local_irqflag(kcb, regs); - post_kprobe_handler(kcb, regs); + post_kprobe_handler(cur, kcb, regs); + + return DBG_HOOK_HANDLED; } - return retval; + /* not ours, kprobes should ignore it */ + return DBG_HOOK_ERROR; } static struct break_hook kprobes_break_ss_hook = { diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 577a63bc5994..8c73445993da 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -419,16 +419,15 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, if (clone_flags & CLONE_SETTLS) p->thread.uw.tp_value = tls; } else { + /* + * A kthread has no context to ERET to, so ensure any buggy + * ERET is treated as an illegal exception return. + * + * When a user task is created from a kthread, childregs will + * be initialized by start_thread() or start_compat_thread(). + */ memset(childregs, 0, sizeof(struct pt_regs)); - childregs->pstate = PSR_MODE_EL1h; - if (IS_ENABLED(CONFIG_ARM64_UAO) && - cpus_have_const_cap(ARM64_HAS_UAO)) - childregs->pstate |= PSR_UAO_BIT; - - spectre_v4_enable_task_mitigation(p); - - if (system_uses_irq_prio_masking()) - childregs->pmr_save = GIC_PRIO_IRQON; + childregs->pstate = PSR_MODE_EL1h | PSR_IL_BIT; p->thread.cpu_context.x19 = stack_start; p->thread.cpu_context.x20 = stk_sz; @@ -458,17 +457,6 @@ static void tls_thread_switch(struct task_struct *next) write_sysreg(*task_user_tls(next), tpidr_el0); } -/* Restore the UAO state depending on next's addr_limit */ -void uao_thread_switch(struct task_struct *next) -{ - if (IS_ENABLED(CONFIG_ARM64_UAO)) { - if (task_thread_info(next)->addr_limit == KERNEL_DS) - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(1), ARM64_HAS_UAO)); - else - asm(ALTERNATIVE("nop", SET_PSTATE_UAO(0), ARM64_HAS_UAO)); - } -} - /* * Force SSBS state on context-switch, since it may be lost after migrating * from a CPU which treats the bit as RES0 in a heterogeneous system. @@ -560,7 +548,6 @@ __notrace_funcgraph struct task_struct *__switch_to(struct task_struct *prev, hw_breakpoint_thread_switch(next); contextidr_thread_switch(next); entry_task_switch(next); - uao_thread_switch(next); ssbs_thread_switch(next); erratum_1418040_thread_switch(prev, next); aarch32_thread_switch(next); diff --git a/arch/arm64/kernel/proton-pack.c b/arch/arm64/kernel/proton-pack.c index f6e4e3737405..4c25c008504f 100644 --- a/arch/arm64/kernel/proton-pack.c +++ b/arch/arm64/kernel/proton-pack.c @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -538,12 +539,12 @@ static enum mitigation_state spectre_v4_enable_hw_mitigation(void) if (spectre_v4_mitigations_off()) { sysreg_clear_set(sctlr_el1, 0, SCTLR_ELx_DSSBS); - asm volatile(SET_PSTATE_SSBS(1)); + set_pstate_ssbs(1); return SPECTRE_VULNERABLE; } /* SCTLR_EL1.DSSBS was initialised to 0 during boot */ - asm volatile(SET_PSTATE_SSBS(0)); + set_pstate_ssbs(0); return SPECTRE_MITIGATED; } diff --git a/arch/arm64/kernel/ptrace.c b/arch/arm64/kernel/ptrace.c index f49b349e16a3..8ac487c84e37 100644 --- a/arch/arm64/kernel/ptrace.c +++ b/arch/arm64/kernel/ptrace.c @@ -192,14 +192,11 @@ static void ptrace_hbptriggered(struct perf_event *bp, break; } } - arm64_force_sig_ptrace_errno_trap(si_errno, - (void __user *)bkpt->trigger, + arm64_force_sig_ptrace_errno_trap(si_errno, bkpt->trigger, desc); } #endif - arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, - (void __user *)(bkpt->trigger), - desc); + arm64_force_sig_fault(SIGTRAP, TRAP_HWBKPT, bkpt->trigger, desc); } /* diff --git a/arch/arm64/kernel/scs.c b/arch/arm64/kernel/scs.c deleted file mode 100644 index e8f7ff45dd8f..000000000000 --- a/arch/arm64/kernel/scs.c +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Shadow Call Stack support. - * - * Copyright (C) 2019 Google LLC - */ - -#include -#include - -DEFINE_SCS(irq_shadow_call_stack); - -#ifdef CONFIG_ARM_SDE_INTERFACE -DEFINE_SCS(sdei_shadow_call_stack_normal); -DEFINE_SCS(sdei_shadow_call_stack_critical); -#endif diff --git a/arch/arm64/kernel/sdei.c b/arch/arm64/kernel/sdei.c index 793c46d6a447..2c7ca449dd51 100644 --- a/arch/arm64/kernel/sdei.c +++ b/arch/arm64/kernel/sdei.c @@ -7,6 +7,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,14 @@ DEFINE_PER_CPU(unsigned long *, sdei_stack_normal_ptr); DEFINE_PER_CPU(unsigned long *, sdei_stack_critical_ptr); #endif +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DECLARE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); + +#ifdef CONFIG_SHADOW_CALL_STACK +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_normal_ptr); +DEFINE_PER_CPU(unsigned long *, sdei_shadow_call_stack_critical_ptr); +#endif + static void _free_sdei_stack(unsigned long * __percpu *ptr, int cpu) { unsigned long *p; @@ -53,6 +62,9 @@ static void free_sdei_stacks(void) { int cpu; + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return; + for_each_possible_cpu(cpu) { _free_sdei_stack(&sdei_stack_normal_ptr, cpu); _free_sdei_stack(&sdei_stack_critical_ptr, cpu); @@ -76,6 +88,9 @@ static int init_sdei_stacks(void) int cpu; int err = 0; + if (!IS_ENABLED(CONFIG_VMAP_STACK)) + return 0; + for_each_possible_cpu(cpu) { err = _init_sdei_stack(&sdei_stack_normal_ptr, cpu); if (err) @@ -91,6 +106,62 @@ static int init_sdei_stacks(void) return err; } +static void _free_sdei_scs(unsigned long * __percpu *ptr, int cpu) +{ + void *s; + + s = per_cpu(*ptr, cpu); + if (s) { + per_cpu(*ptr, cpu) = NULL; + scs_free(s); + } +} + +static void free_sdei_scs(void) +{ + int cpu; + + for_each_possible_cpu(cpu) { + _free_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + _free_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + } +} + +static int _init_sdei_scs(unsigned long * __percpu *ptr, int cpu) +{ + void *s; + + s = scs_alloc(cpu_to_node(cpu)); + if (!s) + return -ENOMEM; + per_cpu(*ptr, cpu) = s; + + return 0; +} + +static int init_sdei_scs(void) +{ + int cpu; + int err = 0; + + if (!IS_ENABLED(CONFIG_SHADOW_CALL_STACK)) + return 0; + + for_each_possible_cpu(cpu) { + err = _init_sdei_scs(&sdei_shadow_call_stack_normal_ptr, cpu); + if (err) + break; + err = _init_sdei_scs(&sdei_shadow_call_stack_critical_ptr, cpu); + if (err) + break; + } + + if (err) + free_sdei_scs(); + + return err; +} + static bool on_sdei_normal_stack(unsigned long sp, struct stack_info *info) { unsigned long low = (unsigned long)raw_cpu_read(sdei_stack_normal_ptr); @@ -131,13 +202,14 @@ unsigned long sdei_arch_get_entry_point(int conduit) */ if (is_hyp_mode_available() && !is_kernel_in_hyp_mode()) { pr_err("Not supported on this hardware/boot configuration\n"); - return 0; + goto out_err; } - if (IS_ENABLED(CONFIG_VMAP_STACK)) { - if (init_sdei_stacks()) - return 0; - } + if (init_sdei_stacks()) + goto out_err; + + if (init_sdei_scs()) + goto out_err_free_stacks; sdei_exit_mode = (conduit == SMCCC_CONDUIT_HVC) ? SDEI_EXIT_HVC : SDEI_EXIT_SMC; @@ -152,6 +224,10 @@ unsigned long sdei_arch_get_entry_point(int conduit) #endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ return (unsigned long)__sdei_asm_handler; +out_err_free_stacks: + free_sdei_stacks(); +out_err: + return 0; } /* @@ -179,12 +255,6 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, sdei_api_event_context(i, ®s->regs[i]); } - /* - * We didn't take an exception to get here, set PAN. UAO will be cleared - * by sdei_event_handler()s force_uaccess_begin() call. - */ - __uaccess_enable_hw_pan(); - err = sdei_event_handler(regs, arg); if (err) return SDEI_EV_FAILED; @@ -223,12 +293,39 @@ static __kprobes unsigned long _sdei_handler(struct pt_regs *regs, return vbar + 0x480; } +static void __kprobes notrace __sdei_pstate_entry(void) +{ + /* + * The original SDEI spec (ARM DEN 0054A) can be read ambiguously as to + * whether PSTATE bits are inherited unchanged or generated from + * scratch, and the TF-A implementation always clears PAN and always + * clears UAO. There are no other known implementations. + * + * Subsequent revisions (ARM DEN 0054B) follow the usual rules for how + * PSTATE is modified upon architectural exceptions, and so PAN is + * either inherited or set per SCTLR_ELx.SPAN, and UAO is always + * cleared. + * + * We must explicitly reset PAN to the expected state, including + * clearing it when the host isn't using it, in case a VM had it set. + */ + if (system_uses_hw_pan()) + set_pstate_pan(1); + else if (cpu_has_pan()) + set_pstate_pan(0); +} asmlinkage noinstr unsigned long __sdei_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { unsigned long ret; + /* + * We didn't take an exception to get here, so the HW hasn't + * set/cleared bits in PSTATE that we may rely on. Initialize PAN. + */ + __sdei_pstate_entry(); + arm64_enter_nmi(regs); ret = _sdei_handler(regs, arg); diff --git a/arch/arm64/kernel/setup.c b/arch/arm64/kernel/setup.c index 133257ffd859..1a57a76e1cc2 100644 --- a/arch/arm64/kernel/setup.c +++ b/arch/arm64/kernel/setup.c @@ -206,7 +206,7 @@ static void __init request_standard_resources(void) unsigned long i = 0; size_t res_size; - kernel_code.start = __pa_symbol(_text); + kernel_code.start = __pa_symbol(_stext); kernel_code.end = __pa_symbol(__init_begin - 1); kernel_data.start = __pa_symbol(_sdata); kernel_data.end = __pa_symbol(_end - 1); @@ -283,7 +283,7 @@ u64 cpu_logical_map(int cpu) void __init __no_sanitize_address setup_arch(char **cmdline_p) { - init_mm.start_code = (unsigned long) _text; + init_mm.start_code = (unsigned long) _stext; init_mm.end_code = (unsigned long) _etext; init_mm.end_data = (unsigned long) _edata; init_mm.brk = (unsigned long) _end; @@ -366,7 +366,7 @@ void __init __no_sanitize_address setup_arch(char **cmdline_p) * faults in case uaccess_enable() is inadvertently called by the init * thread. */ - init_task.thread_info.ttbr0 = __pa_symbol(empty_zero_page); + init_task.thread_info.ttbr0 = __pa_symbol(reserved_pg_dir); #endif if (boot_args[1] || boot_args[2] || boot_args[3]) { diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index d7f4623fa208..a6058fb7a2f4 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -955,9 +955,6 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, trace_hardirqs_off(); do { - /* Check valid user FS if needed */ - addr_limit_user_check(); - if (thread_flags & _TIF_NEED_RESCHED) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); diff --git a/arch/arm64/kernel/sleep.S b/arch/arm64/kernel/sleep.S index ba40d57757d6..4be7f7eed875 100644 --- a/arch/arm64/kernel/sleep.S +++ b/arch/arm64/kernel/sleep.S @@ -99,7 +99,7 @@ SYM_FUNC_END(__cpu_suspend_enter) .pushsection ".idmap.text", "awx" SYM_CODE_START(cpu_resume) - bl el2_setup // if in EL2 drop to EL1 cleanly + bl init_kernel_el bl __cpu_setup /* enable the MMU early - so we can access sleep_save_stash by va */ adrp x1, swapper_pg_dir diff --git a/arch/arm64/kernel/smp.c b/arch/arm64/kernel/smp.c index 91e2fd11e70f..b9e3fae282fb 100644 --- a/arch/arm64/kernel/smp.c +++ b/arch/arm64/kernel/smp.c @@ -844,14 +844,13 @@ void __init smp_prepare_cpus(unsigned int max_cpus) } static const char *ipi_types[NR_IPI] __tracepoint_string = { -#define S(x,s) [x] = s - S(IPI_RESCHEDULE, "Rescheduling interrupts"), - S(IPI_CALL_FUNC, "Function call interrupts"), - S(IPI_CPU_STOP, "CPU stop interrupts"), - S(IPI_CPU_CRASH_STOP, "CPU stop (for crash dump) interrupts"), - S(IPI_TIMER, "Timer broadcast interrupts"), - S(IPI_IRQ_WORK, "IRQ work interrupts"), - S(IPI_WAKEUP, "CPU wake-up interrupts"), + [IPI_RESCHEDULE] = "Rescheduling interrupts", + [IPI_CALL_FUNC] = "Function call interrupts", + [IPI_CPU_STOP] = "CPU stop interrupts", + [IPI_CPU_CRASH_STOP] = "CPU stop (for crash dump) interrupts", + [IPI_TIMER] = "Timer broadcast interrupts", + [IPI_IRQ_WORK] = "IRQ work interrupts", + [IPI_WAKEUP] = "CPU wake-up interrupts", }; static void smp_cross_call(const struct cpumask *target, unsigned int ipinr); diff --git a/arch/arm64/kernel/suspend.c b/arch/arm64/kernel/suspend.c index 96cd347c7a46..a67b37a7a47e 100644 --- a/arch/arm64/kernel/suspend.c +++ b/arch/arm64/kernel/suspend.c @@ -58,7 +58,6 @@ void notrace __cpu_suspend_exit(void) * features that might not have been set correctly. */ __uaccess_enable_hw_pan(); - uao_thread_switch(current); /* * Restore HW breakpoint registers to sane values diff --git a/arch/arm64/kernel/sys_compat.c b/arch/arm64/kernel/sys_compat.c index 3c18c2454089..265fe3eb1069 100644 --- a/arch/arm64/kernel/sys_compat.c +++ b/arch/arm64/kernel/sys_compat.c @@ -68,7 +68,7 @@ do_compat_cache_op(unsigned long start, unsigned long end, int flags) */ long compat_arm_syscall(struct pt_regs *regs, int scno) { - void __user *addr; + unsigned long addr; switch (scno) { /* @@ -111,8 +111,7 @@ long compat_arm_syscall(struct pt_regs *regs, int scno) break; } - addr = (void __user *)instruction_pointer(regs) - - (compat_thumb_mode(regs) ? 2 : 4); + addr = instruction_pointer(regs) - (compat_thumb_mode(regs) ? 2 : 4); arm64_notify_die("Oops - bad compat syscall(2)", regs, SIGILL, ILL_ILLTRP, addr, scno); diff --git a/arch/arm64/kernel/syscall.c b/arch/arm64/kernel/syscall.c index f8f758e4a306..f61e9d8cc55a 100644 --- a/arch/arm64/kernel/syscall.c +++ b/arch/arm64/kernel/syscall.c @@ -122,7 +122,7 @@ static void el0_svc_common(struct pt_regs *regs, int scno, int sc_nr, cortex_a76_erratum_1463225_svc_handler(); local_daif_restore(DAIF_PROCCTX); - if (system_supports_mte() && (flags & _TIF_MTE_ASYNC_FAULT)) { + if (flags & _TIF_MTE_ASYNC_FAULT) { /* * Process the asynchronous tag check fault before the actual * syscall. do_notify_resume() will send a signal to userspace diff --git a/arch/arm64/kernel/topology.c b/arch/arm64/kernel/topology.c index 543c67cae02f..b8026ec684ba 100644 --- a/arch/arm64/kernel/topology.c +++ b/arch/arm64/kernel/topology.c @@ -124,6 +124,12 @@ int __init parse_acpi_topology(void) #endif #ifdef CONFIG_ARM64_AMU_EXTN +#define read_corecnt() read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0) +#define read_constcnt() read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0) +#else +#define read_corecnt() (0UL) +#define read_constcnt() (0UL) +#endif #undef pr_fmt #define pr_fmt(fmt) "AMU: " fmt @@ -133,54 +139,58 @@ static DEFINE_PER_CPU(u64, arch_const_cycles_prev); static DEFINE_PER_CPU(u64, arch_core_cycles_prev); static cpumask_var_t amu_fie_cpus; -/* Initialize counter reference per-cpu variables for the current CPU */ -void init_cpu_freq_invariance_counters(void) +void update_freq_counters_refs(void) { - this_cpu_write(arch_core_cycles_prev, - read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0)); - this_cpu_write(arch_const_cycles_prev, - read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0)); + this_cpu_write(arch_core_cycles_prev, read_corecnt()); + this_cpu_write(arch_const_cycles_prev, read_constcnt()); } -static int validate_cpu_freq_invariance_counters(int cpu) +static inline bool freq_counters_valid(int cpu) { - u64 max_freq_hz, ratio; + if ((cpu >= nr_cpu_ids) || !cpumask_test_cpu(cpu, cpu_present_mask)) + return false; if (!cpu_has_amu_feat(cpu)) { pr_debug("CPU%d: counters are not supported.\n", cpu); - return -EINVAL; + return false; } if (unlikely(!per_cpu(arch_const_cycles_prev, cpu) || !per_cpu(arch_core_cycles_prev, cpu))) { pr_debug("CPU%d: cycle counters are not enabled.\n", cpu); - return -EINVAL; + return false; } - /* Convert maximum frequency from KHz to Hz and validate */ - max_freq_hz = cpufreq_get_hw_max_freq(cpu) * 1000; - if (unlikely(!max_freq_hz)) { - pr_debug("CPU%d: invalid maximum frequency.\n", cpu); + return true; +} + +static int freq_inv_set_max_ratio(int cpu, u64 max_rate, u64 ref_rate) +{ + u64 ratio; + + if (unlikely(!max_rate || !ref_rate)) { + pr_debug("CPU%d: invalid maximum or reference frequency.\n", + cpu); return -EINVAL; } /* * Pre-compute the fixed ratio between the frequency of the constant - * counter and the maximum frequency of the CPU. + * reference counter and the maximum frequency of the CPU. * - * const_freq - * arch_max_freq_scale = ---------------- * SCHED_CAPACITY_SCALE² - * cpuinfo_max_freq + * ref_rate + * arch_max_freq_scale = ---------- * SCHED_CAPACITY_SCALE² + * max_rate * * We use a factor of 2 * SCHED_CAPACITY_SHIFT -> SCHED_CAPACITY_SCALE² * in order to ensure a good resolution for arch_max_freq_scale for - * very low arch timer frequencies (down to the KHz range which should + * very low reference frequencies (down to the KHz range which should * be unlikely). */ - ratio = (u64)arch_timer_get_rate() << (2 * SCHED_CAPACITY_SHIFT); - ratio = div64_u64(ratio, max_freq_hz); + ratio = ref_rate << (2 * SCHED_CAPACITY_SHIFT); + ratio = div64_u64(ratio, max_rate); if (!ratio) { - WARN_ONCE(1, "System timer frequency too low.\n"); + WARN_ONCE(1, "Reference frequency too low.\n"); return -EINVAL; } @@ -227,8 +237,12 @@ static int __init init_amu_fie(void) } for_each_present_cpu(cpu) { - if (validate_cpu_freq_invariance_counters(cpu)) + if (!freq_counters_valid(cpu) || + freq_inv_set_max_ratio(cpu, + cpufreq_get_hw_max_freq(cpu) * 1000, + arch_timer_get_rate())) continue; + cpumask_set_cpu(cpu, valid_cpus); have_policy |= enable_policy_freq_counters(cpu, valid_cpus); } @@ -280,11 +294,14 @@ void topology_scale_freq_tick(void) if (!cpumask_test_cpu(cpu, amu_fie_cpus)) return; - const_cnt = read_sysreg_s(SYS_AMEVCNTR0_CONST_EL0); - core_cnt = read_sysreg_s(SYS_AMEVCNTR0_CORE_EL0); prev_const_cnt = this_cpu_read(arch_const_cycles_prev); prev_core_cnt = this_cpu_read(arch_core_cycles_prev); + update_freq_counters_refs(); + + const_cnt = this_cpu_read(arch_const_cycles_prev); + core_cnt = this_cpu_read(arch_core_cycles_prev); + if (unlikely(core_cnt <= prev_core_cnt || const_cnt <= prev_const_cnt)) goto store_and_exit; @@ -309,4 +326,71 @@ store_and_exit: this_cpu_write(arch_core_cycles_prev, core_cnt); this_cpu_write(arch_const_cycles_prev, const_cnt); } -#endif /* CONFIG_ARM64_AMU_EXTN */ + +#ifdef CONFIG_ACPI_CPPC_LIB +#include + +static void cpu_read_corecnt(void *val) +{ + *(u64 *)val = read_corecnt(); +} + +static void cpu_read_constcnt(void *val) +{ + *(u64 *)val = read_constcnt(); +} + +static inline +int counters_read_on_cpu(int cpu, smp_call_func_t func, u64 *val) +{ + /* + * Abort call on counterless CPU or when interrupts are + * disabled - can lead to deadlock in smp sync call. + */ + if (!cpu_has_amu_feat(cpu)) + return -EOPNOTSUPP; + + if (WARN_ON_ONCE(irqs_disabled())) + return -EPERM; + + smp_call_function_single(cpu, func, val, 1); + + return 0; +} + +/* + * Refer to drivers/acpi/cppc_acpi.c for the description of the functions + * below. + */ +bool cpc_ffh_supported(void) +{ + return freq_counters_valid(get_cpu_with_amu_feat()); +} + +int cpc_read_ffh(int cpu, struct cpc_reg *reg, u64 *val) +{ + int ret = -EOPNOTSUPP; + + switch ((u64)reg->address) { + case 0x0: + ret = counters_read_on_cpu(cpu, cpu_read_corecnt, val); + break; + case 0x1: + ret = counters_read_on_cpu(cpu, cpu_read_constcnt, val); + break; + } + + if (!ret) { + *val &= GENMASK_ULL(reg->bit_offset + reg->bit_width - 1, + reg->bit_offset); + *val >>= reg->bit_offset; + } + + return ret; +} + +int cpc_write_ffh(int cpunum, struct cpc_reg *reg, u64 val) +{ + return -EOPNOTSUPP; +} +#endif /* CONFIG_ACPI_CPPC_LIB */ diff --git a/arch/arm64/kernel/traps.c b/arch/arm64/kernel/traps.c index 2059d8f43f55..08156be75569 100644 --- a/arch/arm64/kernel/traps.c +++ b/arch/arm64/kernel/traps.c @@ -171,32 +171,32 @@ static void arm64_show_signal(int signo, const char *str) __show_regs(regs); } -void arm64_force_sig_fault(int signo, int code, void __user *addr, +void arm64_force_sig_fault(int signo, int code, unsigned long far, const char *str) { arm64_show_signal(signo, str); if (signo == SIGKILL) force_sig(SIGKILL); else - force_sig_fault(signo, code, addr); + force_sig_fault(signo, code, (void __user *)far); } -void arm64_force_sig_mceerr(int code, void __user *addr, short lsb, +void arm64_force_sig_mceerr(int code, unsigned long far, short lsb, const char *str) { arm64_show_signal(SIGBUS, str); - force_sig_mceerr(code, addr, lsb); + force_sig_mceerr(code, (void __user *)far, lsb); } -void arm64_force_sig_ptrace_errno_trap(int errno, void __user *addr, +void arm64_force_sig_ptrace_errno_trap(int errno, unsigned long far, const char *str) { arm64_show_signal(SIGTRAP, str); - force_sig_ptrace_errno_trap(errno, addr); + force_sig_ptrace_errno_trap(errno, (void __user *)far); } void arm64_notify_die(const char *str, struct pt_regs *regs, - int signo, int sicode, void __user *addr, + int signo, int sicode, unsigned long far, int err) { if (user_mode(regs)) { @@ -204,7 +204,7 @@ void arm64_notify_die(const char *str, struct pt_regs *regs, current->thread.fault_address = 0; current->thread.fault_code = err; - arm64_force_sig_fault(signo, sicode, addr, str); + arm64_force_sig_fault(signo, sicode, far, str); } else { die(str, regs, err); } @@ -375,7 +375,7 @@ void force_signal_inject(int signal, int code, unsigned long address, unsigned i signal = SIGKILL; } - arm64_notify_die(desc, regs, signal, code, (void __user *)address, err); + arm64_notify_die(desc, regs, signal, code, address, err); } /* @@ -386,7 +386,7 @@ void arm64_notify_segfault(unsigned long addr) int code; mmap_read_lock(current->mm); - if (find_vma(current->mm, addr) == NULL) + if (find_vma(current->mm, untagged_addr(addr)) == NULL) code = SEGV_MAPERR; else code = SEGV_ACCERR; @@ -449,12 +449,13 @@ NOKPROBE_SYMBOL(do_ptrauth_fault); static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) { - unsigned long address; + unsigned long tagged_address, address; int rt = ESR_ELx_SYS64_ISS_RT(esr); int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; - address = untagged_addr(pt_regs_read_reg(regs, rt)); + tagged_address = pt_regs_read_reg(regs, rt); + address = untagged_addr(tagged_address); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ @@ -481,7 +482,7 @@ static void user_cache_maint_handler(unsigned int esr, struct pt_regs *regs) } if (ret) - arm64_notify_segfault(address); + arm64_notify_segfault(tagged_address); else arm64_skip_faulting_instruction(regs, AARCH64_INSN_SIZE); } @@ -775,7 +776,7 @@ asmlinkage void notrace bad_mode(struct pt_regs *regs, int reason, unsigned int */ void bad_el0_sync(struct pt_regs *regs, int reason, unsigned int esr) { - void __user *pc = (void __user *)instruction_pointer(regs); + unsigned long pc = instruction_pointer(regs); current->thread.fault_address = 0; current->thread.fault_code = esr; diff --git a/arch/arm64/kernel/vdso/Makefile b/arch/arm64/kernel/vdso/Makefile index d65f52264aba..a8f8e409e2bf 100644 --- a/arch/arm64/kernel/vdso/Makefile +++ b/arch/arm64/kernel/vdso/Makefile @@ -28,7 +28,7 @@ ldflags-y := -shared -nostdlib -soname=linux-vdso.so.1 --hash-style=sysv \ $(btildflags-y) -T ccflags-y := -fno-common -fno-builtin -fno-stack-protector -ffixed-x18 -ccflags-y += -DDISABLE_BRANCH_PROFILING +ccflags-y += -DDISABLE_BRANCH_PROFILING -DBUILD_VDSO CFLAGS_REMOVE_vgettimeofday.o = $(CC_FLAGS_FTRACE) -Os $(CC_FLAGS_SCS) $(GCC_PLUGINS_CFLAGS) KASAN_SANITIZE := n diff --git a/arch/arm64/kernel/vdso32/Makefile b/arch/arm64/kernel/vdso32/Makefile index 79280c53b9a6..a1e0f91e6cea 100644 --- a/arch/arm64/kernel/vdso32/Makefile +++ b/arch/arm64/kernel/vdso32/Makefile @@ -48,7 +48,7 @@ cc32-as-instr = $(call try-run,\ # As a result we set our own flags here. # KBUILD_CPPFLAGS and NOSTDINC_FLAGS from top-level Makefile -VDSO_CPPFLAGS := -D__KERNEL__ -nostdinc -isystem $(shell $(CC_COMPAT) -print-file-name=include) +VDSO_CPPFLAGS := -DBUILD_VDSO -D__KERNEL__ -nostdinc -isystem $(shell $(CC_COMPAT) -print-file-name=include) VDSO_CPPFLAGS += $(LINUXINCLUDE) # Common C and assembly flags diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 1bda604f4c70..5d5857c5b025 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -121,7 +121,7 @@ SECTIONS _text = .; HEAD_TEXT } - .text : { /* Real text segment */ + .text : ALIGN(SEGMENT_ALIGN) { /* Real text segment */ _stext = .; /* Text and read-only data */ IRQENTRY_TEXT SOFTIRQENTRY_TEXT @@ -164,13 +164,11 @@ SECTIONS . += PAGE_SIZE; #endif -#ifdef CONFIG_ARM64_SW_TTBR0_PAN - reserved_ttbr0 = .; - . += RESERVED_TTBR0_SIZE; -#endif + reserved_pg_dir = .; + . += PAGE_SIZE; + swapper_pg_dir = .; . += PAGE_SIZE; - swapper_pg_end = .; . = ALIGN(SEGMENT_ALIGN); __init_begin = .; @@ -201,7 +199,7 @@ SECTIONS INIT_CALLS CON_INITCALL INIT_RAM_FS - *(.init.rodata.* .init.bss) /* from the EFI stub */ + *(.init.altinstructions .init.bss) /* from the EFI stub */ } .exit.data : { EXIT_DATA diff --git a/arch/arm64/lib/clear_user.S b/arch/arm64/lib/clear_user.S index 48a3a26eff66..af9afcbec92c 100644 --- a/arch/arm64/lib/clear_user.S +++ b/arch/arm64/lib/clear_user.S @@ -24,20 +24,20 @@ SYM_FUNC_START(__arch_clear_user) subs x1, x1, #8 b.mi 2f 1: -uao_user_alternative 9f, str, sttr, xzr, x0, 8 +user_ldst 9f, sttr, xzr, x0, 8 subs x1, x1, #8 b.pl 1b 2: adds x1, x1, #4 b.mi 3f -uao_user_alternative 9f, str, sttr, wzr, x0, 4 +user_ldst 9f, sttr, wzr, x0, 4 sub x1, x1, #4 3: adds x1, x1, #2 b.mi 4f -uao_user_alternative 9f, strh, sttrh, wzr, x0, 2 +user_ldst 9f, sttrh, wzr, x0, 2 sub x1, x1, #2 4: adds x1, x1, #1 b.mi 5f -uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 +user_ldst 9f, sttrb, wzr, x0, 0 5: mov x0, #0 ret SYM_FUNC_END(__arch_clear_user) diff --git a/arch/arm64/lib/copy_from_user.S b/arch/arm64/lib/copy_from_user.S index 0f8a3a9e3795..95cd62d67371 100644 --- a/arch/arm64/lib/copy_from_user.S +++ b/arch/arm64/lib/copy_from_user.S @@ -21,7 +21,7 @@ */ .macro ldrb1 reg, ptr, val - uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val + user_ldst 9998f, ldtrb, \reg, \ptr, \val .endm .macro strb1 reg, ptr, val @@ -29,7 +29,7 @@ .endm .macro ldrh1 reg, ptr, val - uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val + user_ldst 9998f, ldtrh, \reg, \ptr, \val .endm .macro strh1 reg, ptr, val @@ -37,7 +37,7 @@ .endm .macro ldr1 reg, ptr, val - uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val + user_ldst 9998f, ldtr, \reg, \ptr, \val .endm .macro str1 reg, ptr, val @@ -45,7 +45,7 @@ .endm .macro ldp1 reg1, reg2, ptr, val - uao_ldp 9998f, \reg1, \reg2, \ptr, \val + user_ldp 9998f, \reg1, \reg2, \ptr, \val .endm .macro stp1 reg1, reg2, ptr, val diff --git a/arch/arm64/lib/copy_in_user.S b/arch/arm64/lib/copy_in_user.S index 80e37ada0ee1..1f61cd0df062 100644 --- a/arch/arm64/lib/copy_in_user.S +++ b/arch/arm64/lib/copy_in_user.S @@ -22,35 +22,35 @@ * x0 - bytes not copied */ .macro ldrb1 reg, ptr, val - uao_user_alternative 9998f, ldrb, ldtrb, \reg, \ptr, \val + user_ldst 9998f, ldtrb, \reg, \ptr, \val .endm .macro strb1 reg, ptr, val - uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val + user_ldst 9998f, sttrb, \reg, \ptr, \val .endm .macro ldrh1 reg, ptr, val - uao_user_alternative 9998f, ldrh, ldtrh, \reg, \ptr, \val + user_ldst 9998f, ldtrh, \reg, \ptr, \val .endm .macro strh1 reg, ptr, val - uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val + user_ldst 9998f, sttrh, \reg, \ptr, \val .endm .macro ldr1 reg, ptr, val - uao_user_alternative 9998f, ldr, ldtr, \reg, \ptr, \val + user_ldst 9998f, ldtr, \reg, \ptr, \val .endm .macro str1 reg, ptr, val - uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val + user_ldst 9998f, sttr, \reg, \ptr, \val .endm .macro ldp1 reg1, reg2, ptr, val - uao_ldp 9998f, \reg1, \reg2, \ptr, \val + user_ldp 9998f, \reg1, \reg2, \ptr, \val .endm .macro stp1 reg1, reg2, ptr, val - uao_stp 9998f, \reg1, \reg2, \ptr, \val + user_stp 9998f, \reg1, \reg2, \ptr, \val .endm end .req x5 diff --git a/arch/arm64/lib/copy_to_user.S b/arch/arm64/lib/copy_to_user.S index 4ec59704b8f2..043da90f5dd7 100644 --- a/arch/arm64/lib/copy_to_user.S +++ b/arch/arm64/lib/copy_to_user.S @@ -24,7 +24,7 @@ .endm .macro strb1 reg, ptr, val - uao_user_alternative 9998f, strb, sttrb, \reg, \ptr, \val + user_ldst 9998f, sttrb, \reg, \ptr, \val .endm .macro ldrh1 reg, ptr, val @@ -32,7 +32,7 @@ .endm .macro strh1 reg, ptr, val - uao_user_alternative 9998f, strh, sttrh, \reg, \ptr, \val + user_ldst 9998f, sttrh, \reg, \ptr, \val .endm .macro ldr1 reg, ptr, val @@ -40,7 +40,7 @@ .endm .macro str1 reg, ptr, val - uao_user_alternative 9998f, str, sttr, \reg, \ptr, \val + user_ldst 9998f, sttr, \reg, \ptr, \val .endm .macro ldp1 reg1, reg2, ptr, val @@ -48,7 +48,7 @@ .endm .macro stp1 reg1, reg2, ptr, val - uao_stp 9998f, \reg1, \reg2, \ptr, \val + user_stp 9998f, \reg1, \reg2, \ptr, \val .endm end .req x5 diff --git a/arch/arm64/lib/mte.S b/arch/arm64/lib/mte.S index 03ca6d8b8670..351537c12f36 100644 --- a/arch/arm64/lib/mte.S +++ b/arch/arm64/lib/mte.S @@ -4,7 +4,7 @@ */ #include -#include +#include #include #include #include @@ -67,7 +67,7 @@ SYM_FUNC_START(mte_copy_tags_from_user) mov x3, x1 cbz x2, 2f 1: - uao_user_alternative 2f, ldrb, ldtrb, w4, x1, 0 + user_ldst 2f, ldtrb, w4, x1, 0 lsl x4, x4, #MTE_TAG_SHIFT stg x4, [x0], #MTE_GRANULE_SIZE add x1, x1, #1 @@ -94,7 +94,7 @@ SYM_FUNC_START(mte_copy_tags_to_user) 1: ldg x4, [x1] ubfx x4, x4, #MTE_TAG_SHIFT, #MTE_TAG_SIZE - uao_user_alternative 2f, strb, sttrb, w4, x0, 0 + user_ldst 2f, sttrb, w4, x0, 0 add x0, x0, #1 add x1, x1, #MTE_GRANULE_SIZE subs x2, x2, #1 diff --git a/arch/arm64/lib/uaccess_flushcache.c b/arch/arm64/lib/uaccess_flushcache.c index bfa30b75b2b8..c83bb5a4aad2 100644 --- a/arch/arm64/lib/uaccess_flushcache.c +++ b/arch/arm64/lib/uaccess_flushcache.c @@ -30,9 +30,7 @@ unsigned long __copy_user_flushcache(void *to, const void __user *from, { unsigned long rc; - uaccess_enable_not_uao(); - rc = __arch_copy_from_user(to, from, n); - uaccess_disable_not_uao(); + rc = raw_copy_from_user(to, from, n); /* See above */ __clean_dcache_area_pop(to, n - rc); diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 795d224f184f..2848952b178d 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -40,7 +40,7 @@ #include struct fault_info { - int (*fn)(unsigned long addr, unsigned int esr, + int (*fn)(unsigned long far, unsigned int esr, struct pt_regs *regs); int sig; int code; @@ -385,8 +385,11 @@ static void set_thread_esr(unsigned long address, unsigned int esr) current->thread.fault_code = esr; } -static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static void do_bad_area(unsigned long far, unsigned int esr, + struct pt_regs *regs) { + unsigned long addr = untagged_addr(far); + /* * If we are in kernel mode at this point, we have no context to * handle this fault with. @@ -395,8 +398,7 @@ static void do_bad_area(unsigned long addr, unsigned int esr, struct pt_regs *re const struct fault_info *inf = esr_to_fault_info(esr); set_thread_esr(addr, esr); - arm64_force_sig_fault(inf->sig, inf->code, (void __user *)addr, - inf->name); + arm64_force_sig_fault(inf->sig, inf->code, far, inf->name); } else { __do_kernel_fault(addr, esr, regs); } @@ -448,7 +450,7 @@ static bool is_write_abort(unsigned int esr) return (esr & ESR_ELx_WNR) && !(esr & ESR_ELx_CM); } -static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, +static int __kprobes do_page_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; @@ -456,6 +458,7 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, vm_fault_t fault; unsigned long vm_flags = VM_ACCESS_FLAGS; unsigned int mm_flags = FAULT_FLAG_DEFAULT; + unsigned long addr = untagged_addr(far); if (kprobe_page_fault(regs, esr)) return 0; @@ -479,11 +482,6 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, } if (is_ttbr0_addr(addr) && is_el1_permission_fault(addr, esr, regs)) { - /* regs->orig_addr_limit may be 0 if we entered from EL0 */ - if (regs->orig_addr_limit == KERNEL_DS) - die_kernel_fault("access to user memory with fs=KERNEL_DS", - addr, esr, regs); - if (is_el1_instruction_abort(esr)) die_kernel_fault("execution of user memory", addr, esr, regs); @@ -567,8 +565,7 @@ retry: * We had some memory, but were unable to successfully fix up * this page fault. */ - arm64_force_sig_fault(SIGBUS, BUS_ADRERR, (void __user *)addr, - inf->name); + arm64_force_sig_fault(SIGBUS, BUS_ADRERR, far, inf->name); } else if (fault & (VM_FAULT_HWPOISON_LARGE | VM_FAULT_HWPOISON)) { unsigned int lsb; @@ -576,8 +573,7 @@ retry: if (fault & VM_FAULT_HWPOISON_LARGE) lsb = hstate_index_to_shift(VM_FAULT_GET_HINDEX(fault)); - arm64_force_sig_mceerr(BUS_MCEERR_AR, (void __user *)addr, lsb, - inf->name); + arm64_force_sig_mceerr(BUS_MCEERR_AR, far, lsb, inf->name); } else { /* * Something tried to access memory that isn't in our memory @@ -585,8 +581,7 @@ retry: */ arm64_force_sig_fault(SIGSEGV, fault == VM_FAULT_BADACCESS ? SEGV_ACCERR : SEGV_MAPERR, - (void __user *)addr, - inf->name); + far, inf->name); } return 0; @@ -596,33 +591,35 @@ no_context: return 0; } -static int __kprobes do_translation_fault(unsigned long addr, +static int __kprobes do_translation_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { - if (is_ttbr0_addr(addr)) - return do_page_fault(addr, esr, regs); + unsigned long addr = untagged_addr(far); - do_bad_area(addr, esr, regs); + if (is_ttbr0_addr(addr)) + return do_page_fault(far, esr, regs); + + do_bad_area(far, esr, regs); return 0; } -static int do_alignment_fault(unsigned long addr, unsigned int esr, +static int do_alignment_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { - do_bad_area(addr, esr, regs); + do_bad_area(far, esr, regs); return 0; } -static int do_bad(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static int do_bad(unsigned long far, unsigned int esr, struct pt_regs *regs) { return 1; /* "fault" */ } -static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) +static int do_sea(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf; - void __user *siaddr; + unsigned long siaddr; inf = esr_to_fault_info(esr); @@ -634,19 +631,30 @@ static int do_sea(unsigned long addr, unsigned int esr, struct pt_regs *regs) return 0; } - if (esr & ESR_ELx_FnV) - siaddr = NULL; - else - siaddr = (void __user *)addr; + if (esr & ESR_ELx_FnV) { + siaddr = 0; + } else { + /* + * The architecture specifies that the tag bits of FAR_EL1 are + * UNKNOWN for synchronous external aborts. Mask them out now + * so that userspace doesn't see them. + */ + siaddr = untagged_addr(far); + } arm64_notify_die(inf->name, regs, inf->sig, inf->code, siaddr, esr); return 0; } -static int do_tag_check_fault(unsigned long addr, unsigned int esr, +static int do_tag_check_fault(unsigned long far, unsigned int esr, struct pt_regs *regs) { - do_bad_area(addr, esr, regs); + /* + * The architecture specifies that bits 63:60 of FAR_EL1 are UNKNOWN for tag + * check faults. Mask them out now so that userspace doesn't see them. + */ + far &= (1UL << 60) - 1; + do_bad_area(far, esr, regs); return 0; } @@ -717,11 +725,12 @@ static const struct fault_info fault_info[] = { { do_bad, SIGKILL, SI_KERNEL, "unknown 63" }, }; -void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) +void do_mem_abort(unsigned long far, unsigned int esr, struct pt_regs *regs) { const struct fault_info *inf = esr_to_fault_info(esr); + unsigned long addr = untagged_addr(far); - if (!inf->fn(addr, esr, regs)) + if (!inf->fn(far, esr, regs)) return; if (!user_mode(regs)) { @@ -730,8 +739,12 @@ void do_mem_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) show_pte(addr); } - arm64_notify_die(inf->name, regs, - inf->sig, inf->code, (void __user *)addr, esr); + /* + * At this point we have an unrecognized fault type whose tag bits may + * have been defined as UNKNOWN. Therefore we only expose the untagged + * address to the signal handler. + */ + arm64_notify_die(inf->name, regs, inf->sig, inf->code, addr, esr); } NOKPROBE_SYMBOL(do_mem_abort); @@ -744,8 +757,8 @@ NOKPROBE_SYMBOL(do_el0_irq_bp_hardening); void do_sp_pc_abort(unsigned long addr, unsigned int esr, struct pt_regs *regs) { - arm64_notify_die("SP/PC alignment exception", regs, - SIGBUS, BUS_ADRALN, (void __user *)addr, esr); + arm64_notify_die("SP/PC alignment exception", regs, SIGBUS, BUS_ADRALN, + addr, esr); } NOKPROBE_SYMBOL(do_sp_pc_abort); @@ -846,8 +859,7 @@ void do_debug_exception(unsigned long addr_if_watchpoint, unsigned int esr, arm64_apply_bp_hardening(); if (inf->fn(addr_if_watchpoint, esr, regs)) { - arm64_notify_die(inf->name, regs, - inf->sig, inf->code, (void __user *)pc, esr); + arm64_notify_die(inf->name, regs, inf->sig, inf->code, pc, esr); } debug_exception_exit(regs); diff --git a/arch/arm64/mm/init.c b/arch/arm64/mm/init.c index 93458820bc53..fbd452e12397 100644 --- a/arch/arm64/mm/init.c +++ b/arch/arm64/mm/init.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -42,8 +43,6 @@ #include #include -#define ARM64_ZONE_DMA_BITS 30 - /* * We need to be able to catch inadvertent references to memstart_addr * that occur (potentially in generic code) before arm64_memblock_init() @@ -175,21 +174,34 @@ static void __init reserve_elfcorehdr(void) #endif /* CONFIG_CRASH_DUMP */ /* - * Return the maximum physical address for a zone with a given address size - * limit. It currently assumes that for memory starting above 4G, 32-bit - * devices will use a DMA offset. + * Return the maximum physical address for a zone accessible by the given bits + * limit. If DRAM starts above 32-bit, expand the zone to the maximum + * available memory, otherwise cap it at 32-bit. */ static phys_addr_t __init max_zone_phys(unsigned int zone_bits) { - phys_addr_t offset = memblock_start_of_DRAM() & GENMASK_ULL(63, zone_bits); - return min(offset + (1ULL << zone_bits), memblock_end_of_DRAM()); + phys_addr_t zone_mask = DMA_BIT_MASK(zone_bits); + phys_addr_t phys_start = memblock_start_of_DRAM(); + + if (phys_start > U32_MAX) + zone_mask = PHYS_ADDR_MAX; + else if (phys_start > zone_mask) + zone_mask = U32_MAX; + + return min(zone_mask, memblock_end_of_DRAM() - 1) + 1; } static void __init zone_sizes_init(unsigned long min, unsigned long max) { unsigned long max_zone_pfns[MAX_NR_ZONES] = {0}; + unsigned int __maybe_unused acpi_zone_dma_bits; + unsigned int __maybe_unused dt_zone_dma_bits; #ifdef CONFIG_ZONE_DMA + acpi_zone_dma_bits = fls64(acpi_iort_dma_get_max_cpu_address()); + dt_zone_dma_bits = fls64(of_dma_get_max_cpu_address(NULL)); + zone_dma_bits = min3(32U, dt_zone_dma_bits, acpi_zone_dma_bits); + arm64_dma_phys_limit = max_zone_phys(zone_dma_bits); max_zone_pfns[ZONE_DMA] = PFN_DOWN(arm64_dma_phys_limit); #endif #ifdef CONFIG_ZONE_DMA32 @@ -269,7 +281,7 @@ static void __init fdt_enforce_memory_region(void) void __init arm64_memblock_init(void) { - const s64 linear_region_size = BIT(vabits_actual - 1); + const s64 linear_region_size = PAGE_END - _PAGE_OFFSET(vabits_actual); /* Handle linux,usable-memory-range property */ fdt_enforce_memory_region(); @@ -370,7 +382,7 @@ void __init arm64_memblock_init(void) * Register the kernel text, kernel data, initrd, and initial * pagetables with memblock. */ - memblock_reserve(__pa_symbol(_text), _end - _text); + memblock_reserve(__pa_symbol(_stext), _end - _stext); if (IS_ENABLED(CONFIG_BLK_DEV_INITRD) && phys_initrd_size) { /* the generic initrd code expects virtual addresses */ initrd_start = __phys_to_virt(phys_initrd_start); @@ -379,18 +391,11 @@ void __init arm64_memblock_init(void) early_init_fdt_scan_reserved_mem(); - if (IS_ENABLED(CONFIG_ZONE_DMA)) { - zone_dma_bits = ARM64_ZONE_DMA_BITS; - arm64_dma_phys_limit = max_zone_phys(ARM64_ZONE_DMA_BITS); - } - if (IS_ENABLED(CONFIG_ZONE_DMA32)) arm64_dma32_phys_limit = max_zone_phys(32); else arm64_dma32_phys_limit = PHYS_MASK + 1; - reserve_crashkernel(); - reserve_elfcorehdr(); high_memory = __va(memblock_end_of_DRAM() - 1) + 1; @@ -430,6 +435,12 @@ void __init bootmem_init(void) sparse_init(); zone_sizes_init(min, max); + /* + * request_standard_resources() depends on crashkernel's memory being + * reserved, so do it here. + */ + reserve_crashkernel(); + memblock_dump_all(); } diff --git a/arch/arm64/mm/mmu.c b/arch/arm64/mm/mmu.c index ca692a815731..ae0c3d023824 100644 --- a/arch/arm64/mm/mmu.c +++ b/arch/arm64/mm/mmu.c @@ -464,20 +464,35 @@ void __init mark_linear_text_alias_ro(void) /* * Remove the write permissions from the linear alias of .text/.rodata */ - update_mapping_prot(__pa_symbol(_text), (unsigned long)lm_alias(_text), - (unsigned long)__init_begin - (unsigned long)_text, + update_mapping_prot(__pa_symbol(_stext), (unsigned long)lm_alias(_stext), + (unsigned long)__init_begin - (unsigned long)_stext, PAGE_KERNEL_RO); } +static bool crash_mem_map __initdata; + +static int __init enable_crash_mem_map(char *arg) +{ + /* + * Proper parameter parsing is done by reserve_crashkernel(). We only + * need to know if the linear map has to avoid block mappings so that + * the crashkernel reservations can be unmapped later. + */ + crash_mem_map = true; + + return 0; +} +early_param("crashkernel", enable_crash_mem_map); + static void __init map_mem(pgd_t *pgdp) { - phys_addr_t kernel_start = __pa_symbol(_text); + phys_addr_t kernel_start = __pa_symbol(_stext); phys_addr_t kernel_end = __pa_symbol(__init_begin); phys_addr_t start, end; int flags = 0; u64 i; - if (rodata_full || debug_pagealloc_enabled()) + if (rodata_full || crash_mem_map || debug_pagealloc_enabled()) flags = NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS; /* @@ -487,11 +502,6 @@ static void __init map_mem(pgd_t *pgdp) * the following for-loop */ memblock_mark_nomap(kernel_start, kernel_end - kernel_start); -#ifdef CONFIG_KEXEC_CORE - if (crashk_res.end) - memblock_mark_nomap(crashk_res.start, - resource_size(&crashk_res)); -#endif /* map all the memory banks */ for_each_mem_range(i, &start, &end) { @@ -506,7 +516,7 @@ static void __init map_mem(pgd_t *pgdp) } /* - * Map the linear alias of the [_text, __init_begin) interval + * Map the linear alias of the [_stext, __init_begin) interval * as non-executable now, and remove the write permission in * mark_linear_text_alias_ro() below (which will be called after * alternative patching has completed). This makes the contents @@ -518,21 +528,6 @@ static void __init map_mem(pgd_t *pgdp) __map_memblock(pgdp, kernel_start, kernel_end, PAGE_KERNEL, NO_CONT_MAPPINGS); memblock_clear_nomap(kernel_start, kernel_end - kernel_start); - -#ifdef CONFIG_KEXEC_CORE - /* - * Use page-level mappings here so that we can shrink the region - * in page granularity and put back unused memory to buddy system - * through /sys/kernel/kexec_crash_size interface. - */ - if (crashk_res.end) { - __map_memblock(pgdp, crashk_res.start, crashk_res.end + 1, - PAGE_KERNEL, - NO_BLOCK_MAPPINGS | NO_CONT_MAPPINGS); - memblock_clear_nomap(crashk_res.start, - resource_size(&crashk_res)); - } -#endif } void mark_rodata_ro(void) @@ -665,7 +660,7 @@ static void __init map_kernel(pgd_t *pgdp) * Only rodata will be remapped with different permissions later on, * all other segments are allowed to use contiguous mappings. */ - map_kernel_segment(pgdp, _text, _etext, text_prot, &vmlinux_text, 0, + map_kernel_segment(pgdp, _stext, _etext, text_prot, &vmlinux_text, 0, VM_NO_GUARD); map_kernel_segment(pgdp, __start_rodata, __inittext_begin, PAGE_KERNEL, &vmlinux_rodata, NO_CONT_MAPPINGS, VM_NO_GUARD); @@ -1132,8 +1127,11 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node, void *p = NULL; p = vmemmap_alloc_block_buf(PMD_SIZE, node, altmap); - if (!p) - return -ENOMEM; + if (!p) { + if (vmemmap_populate_basepages(addr, next, node, altmap)) + return -ENOMEM; + continue; + } pmd_set_huge(pmdp, __pa(p), __pgprot(PROT_SECT_NORMAL)); } else @@ -1510,13 +1508,43 @@ static int prevent_bootmem_remove_notifier(struct notifier_block *nb, unsigned long end_pfn = arg->start_pfn + arg->nr_pages; unsigned long pfn = arg->start_pfn; - if (action != MEM_GOING_OFFLINE) + if ((action != MEM_GOING_OFFLINE) && (action != MEM_OFFLINE)) return NOTIFY_OK; for (; pfn < end_pfn; pfn += PAGES_PER_SECTION) { + unsigned long start = PFN_PHYS(pfn); + unsigned long end = start + (1UL << PA_SECTION_SHIFT); + ms = __pfn_to_section(pfn); - if (early_section(ms)) + if (!early_section(ms)) + continue; + + if (action == MEM_GOING_OFFLINE) { + /* + * Boot memory removal is not supported. Prevent + * it via blocking any attempted offline request + * for the boot memory and just report it. + */ + pr_warn("Boot memory [%lx %lx] offlining attempted\n", start, end); return NOTIFY_BAD; + } else if (action == MEM_OFFLINE) { + /* + * This should have never happened. Boot memory + * offlining should have been prevented by this + * very notifier. Probably some memory removal + * procedure might have changed which would then + * require further debug. + */ + pr_err("Boot memory [%lx %lx] offlined\n", start, end); + + /* + * Core memory hotplug does not process a return + * code from the notifier for MEM_OFFLINE events. + * The error condition has been reported. Return + * from here as if ignored. + */ + return NOTIFY_DONE; + } } return NOTIFY_OK; } @@ -1525,9 +1553,66 @@ static struct notifier_block prevent_bootmem_remove_nb = { .notifier_call = prevent_bootmem_remove_notifier, }; +/* + * This ensures that boot memory sections on the platform are online + * from early boot. Memory sections could not be prevented from being + * offlined, unless for some reason they are not online to begin with. + * This helps validate the basic assumption on which the above memory + * event notifier works to prevent boot memory section offlining and + * its possible removal. + */ +static void validate_bootmem_online(void) +{ + phys_addr_t start, end, addr; + struct mem_section *ms; + u64 i; + + /* + * Scanning across all memblock might be expensive + * on some big memory systems. Hence enable this + * validation only with DEBUG_VM. + */ + if (!IS_ENABLED(CONFIG_DEBUG_VM)) + return; + + for_each_mem_range(i, &start, &end) { + for (addr = start; addr < end; addr += (1UL << PA_SECTION_SHIFT)) { + ms = __pfn_to_section(PHYS_PFN(addr)); + + /* + * All memory ranges in the system at this point + * should have been marked as early sections. + */ + WARN_ON(!early_section(ms)); + + /* + * Memory notifier mechanism here to prevent boot + * memory offlining depends on the fact that each + * early section memory on the system is initially + * online. Otherwise a given memory section which + * is already offline will be overlooked and can + * be removed completely. Call out such sections. + */ + if (!online_section(ms)) + pr_err("Boot memory [%llx %llx] is offline, can be removed\n", + addr, addr + (1UL << PA_SECTION_SHIFT)); + } + } +} + static int __init prevent_bootmem_remove_init(void) { - return register_memory_notifier(&prevent_bootmem_remove_nb); + int ret = 0; + + if (!IS_ENABLED(CONFIG_MEMORY_HOTREMOVE)) + return ret; + + validate_bootmem_online(); + ret = register_memory_notifier(&prevent_bootmem_remove_nb); + if (ret) + pr_err("%s: Notifier registration failed %d\n", __func__, ret); + + return ret; } -device_initcall(prevent_bootmem_remove_init); +early_initcall(prevent_bootmem_remove_init); #endif diff --git a/arch/arm64/mm/proc.S b/arch/arm64/mm/proc.S index 23c326a06b2d..a0831bf8a018 100644 --- a/arch/arm64/mm/proc.S +++ b/arch/arm64/mm/proc.S @@ -40,7 +40,7 @@ #define TCR_CACHE_FLAGS TCR_IRGN_WBWA | TCR_ORGN_WBWA #ifdef CONFIG_KASAN_SW_TAGS -#define TCR_KASAN_FLAGS TCR_TBI1 +#define TCR_KASAN_FLAGS TCR_TBI1 | TCR_TBID1 #else #define TCR_KASAN_FLAGS 0 #endif @@ -168,7 +168,7 @@ SYM_FUNC_END(cpu_do_resume) .pushsection ".idmap.text", "awx" .macro __idmap_cpu_set_reserved_ttbr1, tmp1, tmp2 - adrp \tmp1, empty_zero_page + adrp \tmp1, reserved_pg_dir phys_to_ttbr \tmp2, \tmp1 offset_ttbr1 \tmp2, \tmp1 msr ttbr1_el1, \tmp2 @@ -489,6 +489,6 @@ SYM_FUNC_START(__cpu_setup) /* * Prepare SCTLR */ - mov_q x0, SCTLR_EL1_SET + mov_q x0, INIT_SCTLR_EL1_MMU_ON ret // return to head.S SYM_FUNC_END(__cpu_setup) diff --git a/arch/h8300/include/uapi/asm/signal.h b/arch/h8300/include/uapi/asm/signal.h index e15521037348..2cd0dce2b6a6 100644 --- a/arch/h8300/include/uapi/asm/signal.h +++ b/arch/h8300/include/uapi/asm/signal.h @@ -57,30 +57,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 /* not supported yet */ -#define SA_SIGINFO 0x00000004 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 #define MINSIGSTKSZ 2048 diff --git a/arch/ia64/include/uapi/asm/signal.h b/arch/ia64/include/uapi/asm/signal.h index aa98ff1b9e22..38166a88e4c9 100644 --- a/arch/ia64/include/uapi/asm/signal.h +++ b/arch/ia64/include/uapi/asm/signal.h @@ -53,30 +53,6 @@ #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 -#define SA_SIGINFO 0x00000004 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 /* diff --git a/arch/m68k/include/uapi/asm/signal.h b/arch/m68k/include/uapi/asm/signal.h index 915cc755a184..4619291df601 100644 --- a/arch/m68k/include/uapi/asm/signal.h +++ b/arch/m68k/include/uapi/asm/signal.h @@ -57,30 +57,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 -#define SA_SIGINFO 0x00000004 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 diff --git a/arch/mips/include/uapi/asm/signal.h b/arch/mips/include/uapi/asm/signal.h index 53104b10aae2..e6c78a15cb2f 100644 --- a/arch/mips/include/uapi/asm/signal.h +++ b/arch/mips/include/uapi/asm/signal.h @@ -62,18 +62,6 @@ typedef unsigned long old_sigset_t; /* at least 32 bits */ #define SIGRTMAX _NSIG /* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - * * SA_RESTORER used to be defined as 0x04000000 but only the O32 ABI ever * supported its use and no libc was using it, so the entire sa-restorer * functionality was removed with lmo commit 39bffc12c3580ab for 2.5.48 diff --git a/arch/parisc/include/asm/signal.h b/arch/parisc/include/asm/signal.h index 715c96ba2ec8..30dd1e43ef88 100644 --- a/arch/parisc/include/asm/signal.h +++ b/arch/parisc/include/asm/signal.h @@ -21,6 +21,8 @@ typedef struct { unsigned long sig[_NSIG_WORDS]; } sigset_t; +#define __ARCH_UAPI_SA_FLAGS _SA_SIGGFAULT + #include #endif /* !__ASSEMBLY */ diff --git a/arch/parisc/include/uapi/asm/signal.h b/arch/parisc/include/uapi/asm/signal.h index e605197b462c..e5a2657477ac 100644 --- a/arch/parisc/include/uapi/asm/signal.h +++ b/arch/parisc/include/uapi/asm/signal.h @@ -41,19 +41,6 @@ #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ #define SA_ONSTACK 0x00000001 #define SA_RESETHAND 0x00000004 #define SA_NOCLDSTOP 0x00000008 @@ -68,14 +55,7 @@ #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 - -#define SIG_BLOCK 0 /* for blocking signals */ -#define SIG_UNBLOCK 1 /* for unblocking signals */ -#define SIG_SETMASK 2 /* for setting the signal mask */ - -#define SIG_DFL ((__sighandler_t)0) /* default signal handling */ -#define SIG_IGN ((__sighandler_t)1) /* ignore signal */ -#define SIG_ERR ((__sighandler_t)-1) /* error return from signal */ +#include # ifndef __ASSEMBLY__ @@ -84,18 +64,6 @@ /* Avoid too many header ordering problems. */ struct siginfo; -/* Type of a signal handler. */ -#if defined(__LP64__) -/* function pointers on 64-bit parisc are pointers to little structs and the - * compiler doesn't support code which changes or tests the address of - * the function in the little struct. This is really ugly -PB - */ -typedef char __user *__sighandler_t; -#else -typedef void __signalfn_t(int); -typedef __signalfn_t __user *__sighandler_t; -#endif - typedef struct sigaltstack { void __user *ss_sp; int ss_flags; diff --git a/arch/powerpc/include/uapi/asm/signal.h b/arch/powerpc/include/uapi/asm/signal.h index 85b0a7aa43e7..04873dd311c2 100644 --- a/arch/powerpc/include/uapi/asm/signal.h +++ b/arch/powerpc/include/uapi/asm/signal.h @@ -60,30 +60,6 @@ typedef struct { #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK is not currently supported, but will allow sigaltstack(2). - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001U -#define SA_NOCLDWAIT 0x00000002U -#define SA_SIGINFO 0x00000004U -#define SA_ONSTACK 0x08000000U -#define SA_RESTART 0x10000000U -#define SA_NODEFER 0x40000000U -#define SA_RESETHAND 0x80000000U - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000U #define MINSIGSTKSZ 2048 diff --git a/arch/s390/include/uapi/asm/signal.h b/arch/s390/include/uapi/asm/signal.h index 9a14a611ed82..0189f326aac5 100644 --- a/arch/s390/include/uapi/asm/signal.h +++ b/arch/s390/include/uapi/asm/signal.h @@ -65,30 +65,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 -#define SA_SIGINFO 0x00000004 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 #define MINSIGSTKSZ 2048 diff --git a/arch/sparc/include/uapi/asm/signal.h b/arch/sparc/include/uapi/asm/signal.h index ff9505923b9a..53758d53ac0e 100644 --- a/arch/sparc/include/uapi/asm/signal.h +++ b/arch/sparc/include/uapi/asm/signal.h @@ -137,13 +137,11 @@ struct sigstack { #define SA_STACK _SV_SSTACK #define SA_ONSTACK _SV_SSTACK #define SA_RESTART _SV_INTR -#define SA_ONESHOT _SV_RESET +#define SA_RESETHAND _SV_RESET #define SA_NODEFER 0x20u #define SA_NOCLDWAIT 0x100u #define SA_SIGINFO 0x200u -#define SA_NOMASK SA_NODEFER - #define SIG_BLOCK 0x01 /* for blocking signals */ #define SIG_UNBLOCK 0x02 /* for unblocking signals */ #define SIG_SETMASK 0x04 /* for setting the signal mask */ diff --git a/arch/x86/include/uapi/asm/signal.h b/arch/x86/include/uapi/asm/signal.h index e5745d593dc7..164a22a72984 100644 --- a/arch/x86/include/uapi/asm/signal.h +++ b/arch/x86/include/uapi/asm/signal.h @@ -62,30 +62,6 @@ typedef unsigned long sigset_t; #define SIGRTMIN 32 #define SIGRTMAX _NSIG -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001u -#define SA_NOCLDWAIT 0x00000002u -#define SA_SIGINFO 0x00000004u -#define SA_ONSTACK 0x08000000u -#define SA_RESTART 0x10000000u -#define SA_NODEFER 0x40000000u -#define SA_RESETHAND 0x80000000u - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 #define MINSIGSTKSZ 2048 diff --git a/arch/x86/kernel/signal_compat.c b/arch/x86/kernel/signal_compat.c index a7f3e12cfbdb..ddfd919be46c 100644 --- a/arch/x86/kernel/signal_compat.c +++ b/arch/x86/kernel/signal_compat.c @@ -165,16 +165,9 @@ void sigaction_compat_abi(struct k_sigaction *act, struct k_sigaction *oact) { signal_compat_build_tests(); - /* Don't leak in-kernel non-uapi flags to user-space */ - if (oact) - oact->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (!act) return; - /* Don't let flags to be set from userspace */ - act->sa.sa_flags &= ~(SA_IA32_ABI | SA_X32_ABI); - if (in_ia32_syscall()) act->sa.sa_flags |= SA_IA32_ABI; if (in_x32_syscall()) diff --git a/arch/xtensa/include/uapi/asm/signal.h b/arch/xtensa/include/uapi/asm/signal.h index 005dec5bfde4..79ddabaa4e5d 100644 --- a/arch/xtensa/include/uapi/asm/signal.h +++ b/arch/xtensa/include/uapi/asm/signal.h @@ -72,30 +72,6 @@ typedef struct { #define SIGRTMIN 32 #define SIGRTMAX (_NSIG-1) -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 /* not supported yet */ -#define SA_SIGINFO 0x00000004 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - #define SA_RESTORER 0x04000000 #define MINSIGSTKSZ 2048 diff --git a/drivers/acpi/arm64/iort.c b/drivers/acpi/arm64/iort.c index 770d84071a32..d4eac6d7e9fb 100644 --- a/drivers/acpi/arm64/iort.c +++ b/drivers/acpi/arm64/iort.c @@ -1720,3 +1720,58 @@ void __init acpi_iort_init(void) iort_init_platform_devices(); } + +#ifdef CONFIG_ZONE_DMA +/* + * Extract the highest CPU physical address accessible to all DMA masters in + * the system. PHYS_ADDR_MAX is returned when no constrained device is found. + */ +phys_addr_t __init acpi_iort_dma_get_max_cpu_address(void) +{ + phys_addr_t limit = PHYS_ADDR_MAX; + struct acpi_iort_node *node, *end; + struct acpi_table_iort *iort; + acpi_status status; + int i; + + if (acpi_disabled) + return limit; + + status = acpi_get_table(ACPI_SIG_IORT, 0, + (struct acpi_table_header **)&iort); + if (ACPI_FAILURE(status)) + return limit; + + node = ACPI_ADD_PTR(struct acpi_iort_node, iort, iort->node_offset); + end = ACPI_ADD_PTR(struct acpi_iort_node, iort, iort->header.length); + + for (i = 0; i < iort->node_count; i++) { + if (node >= end) + break; + + switch (node->type) { + struct acpi_iort_named_component *ncomp; + struct acpi_iort_root_complex *rc; + phys_addr_t local_limit; + + case ACPI_IORT_NODE_NAMED_COMPONENT: + ncomp = (struct acpi_iort_named_component *)node->node_data; + local_limit = DMA_BIT_MASK(ncomp->memory_address_limit); + limit = min_not_zero(limit, local_limit); + break; + + case ACPI_IORT_NODE_PCI_ROOT_COMPLEX: + if (node->revision < 1) + break; + + rc = (struct acpi_iort_root_complex *)node->node_data; + local_limit = DMA_BIT_MASK(rc->memory_address_limit); + limit = min_not_zero(limit, local_limit); + break; + } + node = ACPI_ADD_PTR(struct acpi_iort_node, node, node->length); + } + acpi_put_table(&iort->header); + return limit; +} +#endif diff --git a/drivers/firmware/arm_sdei.c b/drivers/firmware/arm_sdei.c index 840754dcc6ca..a7e762c352f9 100644 --- a/drivers/firmware/arm_sdei.c +++ b/drivers/firmware/arm_sdei.c @@ -31,7 +31,6 @@ #include #include #include -#include /* * The call to use to reach the firmware. @@ -1092,26 +1091,13 @@ int sdei_event_handler(struct pt_regs *regs, struct sdei_registered_event *arg) { int err; - mm_segment_t orig_addr_limit; u32 event_num = arg->event_num; - /* - * Save restore 'fs'. - * The architecture's entry code save/restores 'fs' when taking an - * exception from the kernel. This ensures addr_limit isn't inherited - * if you interrupted something that allowed the uaccess routines to - * access kernel memory. - * Do the same here because this doesn't come via the same entry code. - */ - orig_addr_limit = force_uaccess_begin(); - err = arg->callback(event_num, regs, arg->callback_arg); if (err) pr_err_ratelimited("event %u on CPU %u failed with error: %d\n", event_num, smp_processor_id(), err); - force_uaccess_end(orig_addr_limit); - return err; } NOKPROBE_SYMBOL(sdei_event_handler); diff --git a/drivers/of/address.c b/drivers/of/address.c index 1c3257a2d4e3..73ddf2540f3f 100644 --- a/drivers/of/address.c +++ b/drivers/of/address.c @@ -1024,6 +1024,48 @@ out: } #endif /* CONFIG_HAS_DMA */ +/** + * of_dma_get_max_cpu_address - Gets highest CPU address suitable for DMA + * @np: The node to start searching from or NULL to start from the root + * + * Gets the highest CPU physical address that is addressable by all DMA masters + * in the sub-tree pointed by np, or the whole tree if NULL is passed. If no + * DMA constrained device is found, it returns PHYS_ADDR_MAX. + */ +phys_addr_t __init of_dma_get_max_cpu_address(struct device_node *np) +{ + phys_addr_t max_cpu_addr = PHYS_ADDR_MAX; + struct of_range_parser parser; + phys_addr_t subtree_max_addr; + struct device_node *child; + struct of_range range; + const __be32 *ranges; + u64 cpu_end = 0; + int len; + + if (!np) + np = of_root; + + ranges = of_get_property(np, "dma-ranges", &len); + if (ranges && len) { + of_dma_range_parser_init(&parser, np); + for_each_of_range(&parser, &range) + if (range.cpu_addr + range.size > cpu_end) + cpu_end = range.cpu_addr + range.size - 1; + + if (max_cpu_addr > cpu_end) + max_cpu_addr = cpu_end; + } + + for_each_available_child_of_node(np, child) { + subtree_max_addr = of_dma_get_max_cpu_address(child); + if (max_cpu_addr > subtree_max_addr) + max_cpu_addr = subtree_max_addr; + } + + return max_cpu_addr; +} + /** * of_dma_is_coherent - Check if device is coherent * @np: device node diff --git a/drivers/of/unittest.c b/drivers/of/unittest.c index 06cc988faf78..eb51bc147440 100644 --- a/drivers/of/unittest.c +++ b/drivers/of/unittest.c @@ -869,6 +869,26 @@ static void __init of_unittest_changeset(void) #endif } +static void __init of_unittest_dma_get_max_cpu_address(void) +{ + struct device_node *np; + phys_addr_t cpu_addr; + + if (!IS_ENABLED(CONFIG_OF_ADDRESS)) + return; + + np = of_find_node_by_path("/testcase-data/address-tests"); + if (!np) { + pr_err("missing testcase data\n"); + return; + } + + cpu_addr = of_dma_get_max_cpu_address(np); + unittest(cpu_addr == 0x4fffffff, + "of_dma_get_max_cpu_address: wrong CPU addr %pad (expecting %x)\n", + &cpu_addr, 0x4fffffff); +} + static void __init of_unittest_dma_ranges_one(const char *path, u64 expect_dma_addr, u64 expect_paddr) { @@ -3266,6 +3286,7 @@ static int __init of_unittest(void) of_unittest_changeset(); of_unittest_parse_interrupts(); of_unittest_parse_interrupts_extended(); + of_unittest_dma_get_max_cpu_address(); of_unittest_parse_dma_ranges(); of_unittest_pci_dma_ranges(); of_unittest_match_node(); diff --git a/drivers/perf/Kconfig b/drivers/perf/Kconfig index 130327ff0b0e..3075cf171f78 100644 --- a/drivers/perf/Kconfig +++ b/drivers/perf/Kconfig @@ -130,6 +130,13 @@ config ARM_SPE_PMU Extension, which provides periodic sampling of operations in the CPU pipeline and reports this via the perf AUX interface. +config ARM_DMC620_PMU + tristate "Enable PMU support for the ARM DMC-620 memory controller" + depends on (ARM64 && ACPI) || COMPILE_TEST + help + Support for PMU events monitoring on the ARM DMC-620 memory + controller. + source "drivers/perf/hisilicon/Kconfig" endmenu diff --git a/drivers/perf/Makefile b/drivers/perf/Makefile index 5365fd56f88f..5260b116c7da 100644 --- a/drivers/perf/Makefile +++ b/drivers/perf/Makefile @@ -13,3 +13,4 @@ obj-$(CONFIG_QCOM_L3_PMU) += qcom_l3_pmu.o obj-$(CONFIG_THUNDERX2_PMU) += thunderx2_pmu.o obj-$(CONFIG_XGENE_PMU) += xgene_pmu.o obj-$(CONFIG_ARM_SPE_PMU) += arm_spe_pmu.o +obj-$(CONFIG_ARM_DMC620_PMU) += arm_dmc620_pmu.o diff --git a/drivers/perf/arm_dmc620_pmu.c b/drivers/perf/arm_dmc620_pmu.c new file mode 100644 index 000000000000..004930eb4bbb --- /dev/null +++ b/drivers/perf/arm_dmc620_pmu.c @@ -0,0 +1,748 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * ARM DMC-620 memory controller PMU driver + * + * Copyright (C) 2020 Ampere Computing LLC. + */ + +#define DMC620_PMUNAME "arm_dmc620" +#define DMC620_DRVNAME DMC620_PMUNAME "_pmu" +#define pr_fmt(fmt) DMC620_DRVNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define DMC620_PA_SHIFT 12 +#define DMC620_CNT_INIT 0x80000000 +#define DMC620_CNT_MAX_PERIOD 0xffffffff +#define DMC620_PMU_CLKDIV2_MAX_COUNTERS 8 +#define DMC620_PMU_CLK_MAX_COUNTERS 2 +#define DMC620_PMU_MAX_COUNTERS \ + (DMC620_PMU_CLKDIV2_MAX_COUNTERS + DMC620_PMU_CLK_MAX_COUNTERS) + +/* + * The PMU registers start at 0xA00 in the DMC-620 memory map, and these + * offsets are relative to that base. + * + * Each counter has a group of control/value registers, and the + * DMC620_PMU_COUNTERn offsets are within a counter group. + * + * The counter registers groups start at 0xA10. + */ +#define DMC620_PMU_OVERFLOW_STATUS_CLKDIV2 0x8 +#define DMC620_PMU_OVERFLOW_STATUS_CLKDIV2_MASK \ + (DMC620_PMU_CLKDIV2_MAX_COUNTERS - 1) +#define DMC620_PMU_OVERFLOW_STATUS_CLK 0xC +#define DMC620_PMU_OVERFLOW_STATUS_CLK_MASK \ + (DMC620_PMU_CLK_MAX_COUNTERS - 1) +#define DMC620_PMU_COUNTERS_BASE 0x10 +#define DMC620_PMU_COUNTERn_MASK_31_00 0x0 +#define DMC620_PMU_COUNTERn_MASK_63_32 0x4 +#define DMC620_PMU_COUNTERn_MATCH_31_00 0x8 +#define DMC620_PMU_COUNTERn_MATCH_63_32 0xC +#define DMC620_PMU_COUNTERn_CONTROL 0x10 +#define DMC620_PMU_COUNTERn_CONTROL_ENABLE BIT(0) +#define DMC620_PMU_COUNTERn_CONTROL_INVERT BIT(1) +#define DMC620_PMU_COUNTERn_CONTROL_EVENT_MUX GENMASK(6, 2) +#define DMC620_PMU_COUNTERn_CONTROL_INCR_MUX GENMASK(8, 7) +#define DMC620_PMU_COUNTERn_VALUE 0x20 +/* Offset of the registers for a given counter, relative to 0xA00 */ +#define DMC620_PMU_COUNTERn_OFFSET(n) \ + (DMC620_PMU_COUNTERS_BASE + 0x28 * (n)) + +static LIST_HEAD(dmc620_pmu_irqs); +static DEFINE_MUTEX(dmc620_pmu_irqs_lock); + +struct dmc620_pmu_irq { + struct hlist_node node; + struct list_head pmus_node; + struct list_head irqs_node; + refcount_t refcount; + unsigned int irq_num; + unsigned int cpu; +}; + +struct dmc620_pmu { + struct pmu pmu; + + void __iomem *base; + struct dmc620_pmu_irq *irq; + struct list_head pmus_node; + + /* + * We put all clkdiv2 and clk counters to a same array. + * The first DMC620_PMU_CLKDIV2_MAX_COUNTERS bits belong to + * clkdiv2 counters, the last DMC620_PMU_CLK_MAX_COUNTERS + * belong to clk counters. + */ + DECLARE_BITMAP(used_mask, DMC620_PMU_MAX_COUNTERS); + struct perf_event *events[DMC620_PMU_MAX_COUNTERS]; +}; + +#define to_dmc620_pmu(p) (container_of(p, struct dmc620_pmu, pmu)) + +static int cpuhp_state_num; + +struct dmc620_pmu_event_attr { + struct device_attribute attr; + u8 clkdiv2; + u8 eventid; +}; + +static ssize_t +dmc620_pmu_event_show(struct device *dev, + struct device_attribute *attr, char *page) +{ + struct dmc620_pmu_event_attr *eattr; + + eattr = container_of(attr, typeof(*eattr), attr); + + return sprintf(page, "event=0x%x,clkdiv2=0x%x\n", eattr->eventid, eattr->clkdiv2); +} + +#define DMC620_PMU_EVENT_ATTR(_name, _eventid, _clkdiv2) \ + (&((struct dmc620_pmu_event_attr[]) {{ \ + .attr = __ATTR(_name, 0444, dmc620_pmu_event_show, NULL), \ + .clkdiv2 = _clkdiv2, \ + .eventid = _eventid, \ + }})[0].attr.attr) + +static struct attribute *dmc620_pmu_events_attrs[] = { + /* clkdiv2 events list */ + DMC620_PMU_EVENT_ATTR(clkdiv2_cycle_count, 0x0, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_allocate, 0x1, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_queue_depth, 0x2, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_waiting_for_wr_data, 0x3, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_read_backlog, 0x4, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_waiting_for_mi, 0x5, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_hazard_resolution, 0x6, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_enqueue, 0x7, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_arbitrate, 0x8, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_lrank_turnaround_activate, 0x9, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_prank_turnaround_activate, 0xa, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_read_depth, 0xb, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_write_depth, 0xc, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_highigh_qos_depth, 0xd, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_high_qos_depth, 0xe, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_medium_qos_depth, 0xf, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_low_qos_depth, 0x10, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_activate, 0x11, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_rdwr, 0x12, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_refresh, 0x13, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_training_request, 0x14, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_t_mac_tracker, 0x15, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_bk_fsm_tracker, 0x16, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_bk_open_tracker, 0x17, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_ranks_in_pwr_down, 0x18, 1), + DMC620_PMU_EVENT_ATTR(clkdiv2_ranks_in_sref, 0x19, 1), + + /* clk events list */ + DMC620_PMU_EVENT_ATTR(clk_cycle_count, 0x0, 0), + DMC620_PMU_EVENT_ATTR(clk_request, 0x1, 0), + DMC620_PMU_EVENT_ATTR(clk_upload_stall, 0x2, 0), + NULL, +}; + +static struct attribute_group dmc620_pmu_events_attr_group = { + .name = "events", + .attrs = dmc620_pmu_events_attrs, +}; + +/* User ABI */ +#define ATTR_CFG_FLD_mask_CFG config +#define ATTR_CFG_FLD_mask_LO 0 +#define ATTR_CFG_FLD_mask_HI 44 +#define ATTR_CFG_FLD_match_CFG config1 +#define ATTR_CFG_FLD_match_LO 0 +#define ATTR_CFG_FLD_match_HI 44 +#define ATTR_CFG_FLD_invert_CFG config2 +#define ATTR_CFG_FLD_invert_LO 0 +#define ATTR_CFG_FLD_invert_HI 0 +#define ATTR_CFG_FLD_incr_CFG config2 +#define ATTR_CFG_FLD_incr_LO 1 +#define ATTR_CFG_FLD_incr_HI 2 +#define ATTR_CFG_FLD_event_CFG config2 +#define ATTR_CFG_FLD_event_LO 3 +#define ATTR_CFG_FLD_event_HI 8 +#define ATTR_CFG_FLD_clkdiv2_CFG config2 +#define ATTR_CFG_FLD_clkdiv2_LO 9 +#define ATTR_CFG_FLD_clkdiv2_HI 9 + +#define __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + (lo) == (hi) ? #cfg ":" #lo "\n" : #cfg ":" #lo "-" #hi + +#define _GEN_PMU_FORMAT_ATTR(cfg, lo, hi) \ + __GEN_PMU_FORMAT_ATTR(cfg, lo, hi) + +#define GEN_PMU_FORMAT_ATTR(name) \ + PMU_FORMAT_ATTR(name, \ + _GEN_PMU_FORMAT_ATTR(ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI)) + +#define _ATTR_CFG_GET_FLD(attr, cfg, lo, hi) \ + ((((attr)->cfg) >> lo) & GENMASK_ULL(hi - lo, 0)) + +#define ATTR_CFG_GET_FLD(attr, name) \ + _ATTR_CFG_GET_FLD(attr, \ + ATTR_CFG_FLD_##name##_CFG, \ + ATTR_CFG_FLD_##name##_LO, \ + ATTR_CFG_FLD_##name##_HI) + +GEN_PMU_FORMAT_ATTR(mask); +GEN_PMU_FORMAT_ATTR(match); +GEN_PMU_FORMAT_ATTR(invert); +GEN_PMU_FORMAT_ATTR(incr); +GEN_PMU_FORMAT_ATTR(event); +GEN_PMU_FORMAT_ATTR(clkdiv2); + +static struct attribute *dmc620_pmu_formats_attrs[] = { + &format_attr_mask.attr, + &format_attr_match.attr, + &format_attr_invert.attr, + &format_attr_incr.attr, + &format_attr_event.attr, + &format_attr_clkdiv2.attr, + NULL, +}; + +static struct attribute_group dmc620_pmu_format_attr_group = { + .name = "format", + .attrs = dmc620_pmu_formats_attrs, +}; + +static const struct attribute_group *dmc620_pmu_attr_groups[] = { + &dmc620_pmu_events_attr_group, + &dmc620_pmu_format_attr_group, + NULL, +}; + +static inline +u32 dmc620_pmu_creg_read(struct dmc620_pmu *dmc620_pmu, + unsigned int idx, unsigned int reg) +{ + return readl(dmc620_pmu->base + DMC620_PMU_COUNTERn_OFFSET(idx) + reg); +} + +static inline +void dmc620_pmu_creg_write(struct dmc620_pmu *dmc620_pmu, + unsigned int idx, unsigned int reg, u32 val) +{ + writel(val, dmc620_pmu->base + DMC620_PMU_COUNTERn_OFFSET(idx) + reg); +} + +static +unsigned int dmc620_event_to_counter_control(struct perf_event *event) +{ + struct perf_event_attr *attr = &event->attr; + unsigned int reg = 0; + + reg |= FIELD_PREP(DMC620_PMU_COUNTERn_CONTROL_INVERT, + ATTR_CFG_GET_FLD(attr, invert)); + reg |= FIELD_PREP(DMC620_PMU_COUNTERn_CONTROL_EVENT_MUX, + ATTR_CFG_GET_FLD(attr, event)); + reg |= FIELD_PREP(DMC620_PMU_COUNTERn_CONTROL_INCR_MUX, + ATTR_CFG_GET_FLD(attr, incr)); + + return reg; +} + +static int dmc620_get_event_idx(struct perf_event *event) +{ + struct dmc620_pmu *dmc620_pmu = to_dmc620_pmu(event->pmu); + int idx, start_idx, end_idx; + + if (ATTR_CFG_GET_FLD(&event->attr, clkdiv2)) { + start_idx = 0; + end_idx = DMC620_PMU_CLKDIV2_MAX_COUNTERS; + } else { + start_idx = DMC620_PMU_CLKDIV2_MAX_COUNTERS; + end_idx = DMC620_PMU_MAX_COUNTERS; + } + + for (idx = start_idx; idx < end_idx; ++idx) { + if (!test_and_set_bit(idx, dmc620_pmu->used_mask)) + return idx; + } + + /* The counters are all in use. */ + return -EAGAIN; +} + +static inline +u64 dmc620_pmu_read_counter(struct perf_event *event) +{ + struct dmc620_pmu *dmc620_pmu = to_dmc620_pmu(event->pmu); + + return dmc620_pmu_creg_read(dmc620_pmu, + event->hw.idx, DMC620_PMU_COUNTERn_VALUE); +} + +static void dmc620_pmu_event_update(struct perf_event *event) +{ + struct hw_perf_event *hwc = &event->hw; + u64 delta, prev_count, new_count; + + do { + /* We may also be called from the irq handler */ + prev_count = local64_read(&hwc->prev_count); + new_count = dmc620_pmu_read_counter(event); + } while (local64_cmpxchg(&hwc->prev_count, + prev_count, new_count) != prev_count); + delta = (new_count - prev_count) & DMC620_CNT_MAX_PERIOD; + local64_add(delta, &event->count); +} + +static void dmc620_pmu_event_set_period(struct perf_event *event) +{ + struct dmc620_pmu *dmc620_pmu = to_dmc620_pmu(event->pmu); + + local64_set(&event->hw.prev_count, DMC620_CNT_INIT); + dmc620_pmu_creg_write(dmc620_pmu, + event->hw.idx, DMC620_PMU_COUNTERn_VALUE, DMC620_CNT_INIT); +} + +static void dmc620_pmu_enable_counter(struct perf_event *event) +{ + struct dmc620_pmu *dmc620_pmu = to_dmc620_pmu(event->pmu); + u32 reg; + + reg = dmc620_event_to_counter_control(event) | DMC620_PMU_COUNTERn_CONTROL_ENABLE; + dmc620_pmu_creg_write(dmc620_pmu, + event->hw.idx, DMC620_PMU_COUNTERn_CONTROL, reg); +} + +static void dmc620_pmu_disable_counter(struct perf_event *event) +{ + struct dmc620_pmu *dmc620_pmu = to_dmc620_pmu(event->pmu); + + dmc620_pmu_creg_write(dmc620_pmu, + event->hw.idx, DMC620_PMU_COUNTERn_CONTROL, 0); +} + +static irqreturn_t dmc620_pmu_handle_irq(int irq_num, void *data) +{ + struct dmc620_pmu_irq *irq = data; + struct dmc620_pmu *dmc620_pmu; + irqreturn_t ret = IRQ_NONE; + + rcu_read_lock(); + list_for_each_entry_rcu(dmc620_pmu, &irq->pmus_node, pmus_node) { + unsigned long status; + struct perf_event *event; + unsigned int idx; + + /* + * HW doesn't provide a control to atomically disable all counters. + * To prevent race condition (overflow happens while clearing status register), + * disable all events before continuing + */ + for (idx = 0; idx < DMC620_PMU_MAX_COUNTERS; idx++) { + event = dmc620_pmu->events[idx]; + if (!event) + continue; + dmc620_pmu_disable_counter(event); + } + + status = readl(dmc620_pmu->base + DMC620_PMU_OVERFLOW_STATUS_CLKDIV2); + status |= (readl(dmc620_pmu->base + DMC620_PMU_OVERFLOW_STATUS_CLK) << + DMC620_PMU_CLKDIV2_MAX_COUNTERS); + if (status) { + for_each_set_bit(idx, &status, + DMC620_PMU_MAX_COUNTERS) { + event = dmc620_pmu->events[idx]; + if (WARN_ON_ONCE(!event)) + continue; + dmc620_pmu_event_update(event); + dmc620_pmu_event_set_period(event); + } + + if (status & DMC620_PMU_OVERFLOW_STATUS_CLKDIV2_MASK) + writel(0, dmc620_pmu->base + DMC620_PMU_OVERFLOW_STATUS_CLKDIV2); + + if ((status >> DMC620_PMU_CLKDIV2_MAX_COUNTERS) & + DMC620_PMU_OVERFLOW_STATUS_CLK_MASK) + writel(0, dmc620_pmu->base + DMC620_PMU_OVERFLOW_STATUS_CLK); + } + + for (idx = 0; idx < DMC620_PMU_MAX_COUNTERS; idx++) { + event = dmc620_pmu->events[idx]; + if (!event) + continue; + if (!(event->hw.state & PERF_HES_STOPPED)) + dmc620_pmu_enable_counter(event); + } + + ret = IRQ_HANDLED; + } + rcu_read_unlock(); + + return ret; +} + +static struct dmc620_pmu_irq *__dmc620_pmu_get_irq(int irq_num) +{ + struct dmc620_pmu_irq *irq; + int ret; + + list_for_each_entry(irq, &dmc620_pmu_irqs, irqs_node) + if (irq->irq_num == irq_num && refcount_inc_not_zero(&irq->refcount)) + return irq; + + irq = kzalloc(sizeof(*irq), GFP_KERNEL); + if (!irq) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&irq->pmus_node); + + /* Pick one CPU to be the preferred one to use */ + irq->cpu = raw_smp_processor_id(); + refcount_set(&irq->refcount, 1); + + ret = request_irq(irq_num, dmc620_pmu_handle_irq, + IRQF_NOBALANCING | IRQF_NO_THREAD, + "dmc620-pmu", irq); + if (ret) + goto out_free_aff; + + ret = irq_set_affinity_hint(irq_num, cpumask_of(irq->cpu)); + if (ret) + goto out_free_irq; + + ret = cpuhp_state_add_instance_nocalls(cpuhp_state_num, &irq->node); + if (ret) + goto out_free_irq; + + irq->irq_num = irq_num; + list_add(&irq->irqs_node, &dmc620_pmu_irqs); + + return irq; + +out_free_irq: + free_irq(irq_num, irq); +out_free_aff: + kfree(irq); + return ERR_PTR(ret); +} + +static int dmc620_pmu_get_irq(struct dmc620_pmu *dmc620_pmu, int irq_num) +{ + struct dmc620_pmu_irq *irq; + + mutex_lock(&dmc620_pmu_irqs_lock); + irq = __dmc620_pmu_get_irq(irq_num); + mutex_unlock(&dmc620_pmu_irqs_lock); + + if (IS_ERR(irq)) + return PTR_ERR(irq); + + dmc620_pmu->irq = irq; + mutex_lock(&dmc620_pmu_irqs_lock); + list_add_rcu(&dmc620_pmu->pmus_node, &irq->pmus_node); + mutex_unlock(&dmc620_pmu_irqs_lock); + + return 0; +} + +static void dmc620_pmu_put_irq(struct dmc620_pmu *dmc620_pmu) +{ + struct dmc620_pmu_irq *irq = dmc620_pmu->irq; + + mutex_lock(&dmc620_pmu_irqs_lock); + list_del_rcu(&dmc620_pmu->pmus_node); + + if (!refcount_dec_and_test(&irq->refcount)) { + mutex_unlock(&dmc620_pmu_irqs_lock); + return; + } + + list_del(&irq->irqs_node); + mutex_unlock(&dmc620_pmu_irqs_lock); + + WARN_ON(irq_set_affinity_hint(irq->irq_num, NULL)); + free_irq(irq->irq_num, irq); + cpuhp_state_remove_instance_nocalls(cpuhp_state_num, &irq->node); + kfree(irq); +} + +static int dmc620_pmu_event_init(struct perf_event *event) +{ + struct dmc620_pmu *dmc620_pmu = to_dmc620_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + struct perf_event *sibling; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* + * DMC 620 PMUs are shared across all cpus and cannot + * support task bound and sampling events. + */ + if (is_sampling_event(event) || + event->attach_state & PERF_ATTACH_TASK) { + dev_dbg(dmc620_pmu->pmu.dev, + "Can't support per-task counters\n"); + return -EOPNOTSUPP; + } + + /* + * Many perf core operations (eg. events rotation) operate on a + * single CPU context. This is obvious for CPU PMUs, where one + * expects the same sets of events being observed on all CPUs, + * but can lead to issues for off-core PMUs, where each + * event could be theoretically assigned to a different CPU. To + * mitigate this, we enforce CPU assignment to one, selected + * processor. + */ + event->cpu = dmc620_pmu->irq->cpu; + if (event->cpu < 0) + return -EINVAL; + + /* + * We can't atomically disable all HW counters so only one event allowed, + * although software events are acceptable. + */ + if (event->group_leader != event && + !is_software_event(event->group_leader)) + return -EINVAL; + + for_each_sibling_event(sibling, event->group_leader) { + if (sibling != event && + !is_software_event(sibling)) + return -EINVAL; + } + + hwc->idx = -1; + return 0; +} + +static void dmc620_pmu_read(struct perf_event *event) +{ + dmc620_pmu_event_update(event); +} + +static void dmc620_pmu_start(struct perf_event *event, int flags) +{ + event->hw.state = 0; + dmc620_pmu_event_set_period(event); + dmc620_pmu_enable_counter(event); +} + +static void dmc620_pmu_stop(struct perf_event *event, int flags) +{ + if (event->hw.state & PERF_HES_STOPPED) + return; + + dmc620_pmu_disable_counter(event); + dmc620_pmu_event_update(event); + event->hw.state |= PERF_HES_STOPPED | PERF_HES_UPTODATE; +} + +static int dmc620_pmu_add(struct perf_event *event, int flags) +{ + struct dmc620_pmu *dmc620_pmu = to_dmc620_pmu(event->pmu); + struct perf_event_attr *attr = &event->attr; + struct hw_perf_event *hwc = &event->hw; + int idx; + u64 reg; + + idx = dmc620_get_event_idx(event); + if (idx < 0) + return idx; + + hwc->idx = idx; + dmc620_pmu->events[idx] = event; + hwc->state = PERF_HES_STOPPED | PERF_HES_UPTODATE; + + reg = ATTR_CFG_GET_FLD(attr, mask); + dmc620_pmu_creg_write(dmc620_pmu, + idx, DMC620_PMU_COUNTERn_MASK_31_00, lower_32_bits(reg)); + dmc620_pmu_creg_write(dmc620_pmu, + idx, DMC620_PMU_COUNTERn_MASK_63_32, upper_32_bits(reg)); + + reg = ATTR_CFG_GET_FLD(attr, match); + dmc620_pmu_creg_write(dmc620_pmu, + idx, DMC620_PMU_COUNTERn_MATCH_31_00, lower_32_bits(reg)); + dmc620_pmu_creg_write(dmc620_pmu, + idx, DMC620_PMU_COUNTERn_MATCH_63_32, upper_32_bits(reg)); + + if (flags & PERF_EF_START) + dmc620_pmu_start(event, PERF_EF_RELOAD); + + perf_event_update_userpage(event); + return 0; +} + +static void dmc620_pmu_del(struct perf_event *event, int flags) +{ + struct dmc620_pmu *dmc620_pmu = to_dmc620_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + int idx = hwc->idx; + + dmc620_pmu_stop(event, PERF_EF_UPDATE); + dmc620_pmu->events[idx] = NULL; + clear_bit(idx, dmc620_pmu->used_mask); + perf_event_update_userpage(event); +} + +static int dmc620_pmu_cpu_teardown(unsigned int cpu, + struct hlist_node *node) +{ + struct dmc620_pmu_irq *irq; + struct dmc620_pmu *dmc620_pmu; + unsigned int target; + + irq = hlist_entry_safe(node, struct dmc620_pmu_irq, node); + if (cpu != irq->cpu) + return 0; + + target = cpumask_any_but(cpu_online_mask, cpu); + if (target >= nr_cpu_ids) + return 0; + + /* We're only reading, but this isn't the place to be involving RCU */ + mutex_lock(&dmc620_pmu_irqs_lock); + list_for_each_entry(dmc620_pmu, &irq->pmus_node, pmus_node) + perf_pmu_migrate_context(&dmc620_pmu->pmu, irq->cpu, target); + mutex_unlock(&dmc620_pmu_irqs_lock); + + WARN_ON(irq_set_affinity_hint(irq->irq_num, cpumask_of(target))); + irq->cpu = target; + + return 0; +} + +static int dmc620_pmu_device_probe(struct platform_device *pdev) +{ + struct dmc620_pmu *dmc620_pmu; + struct resource *res; + char *name; + int irq_num; + int i, ret; + + dmc620_pmu = devm_kzalloc(&pdev->dev, + sizeof(struct dmc620_pmu), GFP_KERNEL); + if (!dmc620_pmu) + return -ENOMEM; + + platform_set_drvdata(pdev, dmc620_pmu); + + dmc620_pmu->pmu = (struct pmu) { + .module = THIS_MODULE, + .capabilities = PERF_PMU_CAP_NO_EXCLUDE, + .task_ctx_nr = perf_invalid_context, + .event_init = dmc620_pmu_event_init, + .add = dmc620_pmu_add, + .del = dmc620_pmu_del, + .start = dmc620_pmu_start, + .stop = dmc620_pmu_stop, + .read = dmc620_pmu_read, + .attr_groups = dmc620_pmu_attr_groups, + }; + + res = platform_get_resource(pdev, IORESOURCE_MEM, 0); + dmc620_pmu->base = devm_ioremap_resource(&pdev->dev, res); + if (IS_ERR(dmc620_pmu->base)) + return PTR_ERR(dmc620_pmu->base); + + /* Make sure device is reset before enabling interrupt */ + for (i = 0; i < DMC620_PMU_MAX_COUNTERS; i++) + dmc620_pmu_creg_write(dmc620_pmu, i, DMC620_PMU_COUNTERn_CONTROL, 0); + writel(0, dmc620_pmu->base + DMC620_PMU_OVERFLOW_STATUS_CLKDIV2); + writel(0, dmc620_pmu->base + DMC620_PMU_OVERFLOW_STATUS_CLK); + + irq_num = platform_get_irq(pdev, 0); + if (irq_num < 0) + return irq_num; + + ret = dmc620_pmu_get_irq(dmc620_pmu, irq_num); + if (ret) + return ret; + + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, + "%s_%llx", DMC620_PMUNAME, + (u64)(res->start >> DMC620_PA_SHIFT)); + if (!name) { + dev_err(&pdev->dev, + "Create name failed, PMU @%pa\n", &res->start); + goto out_teardown_dev; + } + + ret = perf_pmu_register(&dmc620_pmu->pmu, name, -1); + if (ret) + goto out_teardown_dev; + + return 0; + +out_teardown_dev: + dmc620_pmu_put_irq(dmc620_pmu); + synchronize_rcu(); + return ret; +} + +static int dmc620_pmu_device_remove(struct platform_device *pdev) +{ + struct dmc620_pmu *dmc620_pmu = platform_get_drvdata(pdev); + + dmc620_pmu_put_irq(dmc620_pmu); + + /* perf will synchronise RCU before devres can free dmc620_pmu */ + perf_pmu_unregister(&dmc620_pmu->pmu); + + return 0; +} + +static const struct acpi_device_id dmc620_acpi_match[] = { + { "ARMHD620", 0}, + {}, +}; +MODULE_DEVICE_TABLE(acpi, dmc620_acpi_match); +static struct platform_driver dmc620_pmu_driver = { + .driver = { + .name = DMC620_DRVNAME, + .acpi_match_table = dmc620_acpi_match, + }, + .probe = dmc620_pmu_device_probe, + .remove = dmc620_pmu_device_remove, +}; + +static int __init dmc620_pmu_init(void) +{ + cpuhp_state_num = cpuhp_setup_state_multi(CPUHP_AP_ONLINE_DYN, + DMC620_DRVNAME, + NULL, + dmc620_pmu_cpu_teardown); + if (cpuhp_state_num < 0) + return cpuhp_state_num; + + return platform_driver_register(&dmc620_pmu_driver); +} + +static void __exit dmc620_pmu_exit(void) +{ + platform_driver_unregister(&dmc620_pmu_driver); + cpuhp_remove_multi_state(cpuhp_state_num); +} + +module_init(dmc620_pmu_init); +module_exit(dmc620_pmu_exit); + +MODULE_DESCRIPTION("Perf driver for the ARM DMC-620 memory controller"); +MODULE_AUTHOR("Tuan Phan dev, &dsu_pmu->associated_cpus); else if (is_acpi_device_node(fwnode)) diff --git a/drivers/perf/arm_pmu.c b/drivers/perf/arm_pmu.c index cb2f55f450e4..794a37d50853 100644 --- a/drivers/perf/arm_pmu.c +++ b/drivers/perf/arm_pmu.c @@ -726,6 +726,11 @@ static int armpmu_get_cpu_irq(struct arm_pmu *pmu, int cpu) return per_cpu(hw_events->irq, cpu); } +bool arm_pmu_irq_is_nmi(void) +{ + return has_nmi; +} + /* * PMU hardware loses all context when a CPU goes offline. * When a CPU is hotplugged back in, since some hardware registers are diff --git a/drivers/perf/arm_smmuv3_pmu.c b/drivers/perf/arm_smmuv3_pmu.c index 5274f7fe359e..74474bb322c3 100644 --- a/drivers/perf/arm_smmuv3_pmu.c +++ b/drivers/perf/arm_smmuv3_pmu.c @@ -74,6 +74,7 @@ #define SMMU_PMCG_CFGR_NCTR GENMASK(5, 0) #define SMMU_PMCG_CR 0xE04 #define SMMU_PMCG_CR_ENABLE BIT(0) +#define SMMU_PMCG_IIDR 0xE08 #define SMMU_PMCG_CEID0 0xE20 #define SMMU_PMCG_CEID1 0xE28 #define SMMU_PMCG_IRQ_CTRL 0xE50 @@ -112,6 +113,7 @@ struct smmu_pmu { void __iomem *reloc_base; u64 counter_mask; u32 options; + u32 iidr; bool global_filter; }; @@ -552,6 +554,40 @@ static struct attribute_group smmu_pmu_events_group = { .is_visible = smmu_pmu_event_is_visible, }; +static ssize_t smmu_pmu_identifier_attr_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct smmu_pmu *smmu_pmu = to_smmu_pmu(dev_get_drvdata(dev)); + + return snprintf(page, PAGE_SIZE, "0x%08x\n", smmu_pmu->iidr); +} + +static umode_t smmu_pmu_identifier_attr_visible(struct kobject *kobj, + struct attribute *attr, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct smmu_pmu *smmu_pmu = to_smmu_pmu(dev_get_drvdata(dev)); + + if (!smmu_pmu->iidr) + return 0; + return attr->mode; +} + +static struct device_attribute smmu_pmu_identifier_attr = + __ATTR(identifier, 0444, smmu_pmu_identifier_attr_show, NULL); + +static struct attribute *smmu_pmu_identifier_attrs[] = { + &smmu_pmu_identifier_attr.attr, + NULL +}; + +static struct attribute_group smmu_pmu_identifier_group = { + .attrs = smmu_pmu_identifier_attrs, + .is_visible = smmu_pmu_identifier_attr_visible, +}; + /* Formats */ PMU_FORMAT_ATTR(event, "config:0-15"); PMU_FORMAT_ATTR(filter_stream_id, "config1:0-31"); @@ -575,6 +611,7 @@ static const struct attribute_group *smmu_pmu_attr_grps[] = { &smmu_pmu_cpumask_group, &smmu_pmu_events_group, &smmu_pmu_format_group, + &smmu_pmu_identifier_group, NULL }; @@ -795,6 +832,8 @@ static int smmu_pmu_probe(struct platform_device *pdev) return err; } + smmu_pmu->iidr = readl_relaxed(smmu_pmu->reg_base + SMMU_PMCG_IIDR); + name = devm_kasprintf(&pdev->dev, GFP_KERNEL, "smmuv3_pmcg_%llx", (res_0->start) >> SMMU_PMCG_PA_SHIFT); if (!name) { diff --git a/drivers/perf/fsl_imx8_ddr_perf.c b/drivers/perf/fsl_imx8_ddr_perf.c index 397540a4b799..a11bfd8a0823 100644 --- a/drivers/perf/fsl_imx8_ddr_perf.c +++ b/drivers/perf/fsl_imx8_ddr_perf.c @@ -50,6 +50,7 @@ static DEFINE_IDA(ddr_ida); struct fsl_ddr_devtype_data { unsigned int quirks; /* quirks needed for different DDR Perf core */ + const char *identifier; /* system PMU identifier for userspace */ }; static const struct fsl_ddr_devtype_data imx8_devtype_data; @@ -58,13 +59,32 @@ static const struct fsl_ddr_devtype_data imx8m_devtype_data = { .quirks = DDR_CAP_AXI_ID_FILTER, }; +static const struct fsl_ddr_devtype_data imx8mq_devtype_data = { + .quirks = DDR_CAP_AXI_ID_FILTER, + .identifier = "i.MX8MQ", +}; + +static const struct fsl_ddr_devtype_data imx8mm_devtype_data = { + .quirks = DDR_CAP_AXI_ID_FILTER, + .identifier = "i.MX8MM", +}; + +static const struct fsl_ddr_devtype_data imx8mn_devtype_data = { + .quirks = DDR_CAP_AXI_ID_FILTER, + .identifier = "i.MX8MN", +}; + static const struct fsl_ddr_devtype_data imx8mp_devtype_data = { .quirks = DDR_CAP_AXI_ID_FILTER_ENHANCED, + .identifier = "i.MX8MP", }; static const struct of_device_id imx_ddr_pmu_dt_ids[] = { { .compatible = "fsl,imx8-ddr-pmu", .data = &imx8_devtype_data}, { .compatible = "fsl,imx8m-ddr-pmu", .data = &imx8m_devtype_data}, + { .compatible = "fsl,imx8mq-ddr-pmu", .data = &imx8mq_devtype_data}, + { .compatible = "fsl,imx8mm-ddr-pmu", .data = &imx8mm_devtype_data}, + { .compatible = "fsl,imx8mn-ddr-pmu", .data = &imx8mn_devtype_data}, { .compatible = "fsl,imx8mp-ddr-pmu", .data = &imx8mp_devtype_data}, { /* sentinel */ } }; @@ -84,6 +104,40 @@ struct ddr_pmu { int id; }; +static ssize_t ddr_perf_identifier_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct ddr_pmu *pmu = dev_get_drvdata(dev); + + return sprintf(page, "%s\n", pmu->devtype_data->identifier); +} + +static umode_t ddr_perf_identifier_attr_visible(struct kobject *kobj, + struct attribute *attr, + int n) +{ + struct device *dev = kobj_to_dev(kobj); + struct ddr_pmu *pmu = dev_get_drvdata(dev); + + if (!pmu->devtype_data->identifier) + return 0; + return attr->mode; +}; + +static struct device_attribute ddr_perf_identifier_attr = + __ATTR(identifier, 0444, ddr_perf_identifier_show, NULL); + +static struct attribute *ddr_perf_identifier_attrs[] = { + &ddr_perf_identifier_attr.attr, + NULL, +}; + +static struct attribute_group ddr_perf_identifier_attr_group = { + .attrs = ddr_perf_identifier_attrs, + .is_visible = ddr_perf_identifier_attr_visible, +}; + enum ddr_perf_filter_capabilities { PERF_CAP_AXI_ID_FILTER = 0, PERF_CAP_AXI_ID_FILTER_ENHANCED, @@ -237,6 +291,7 @@ static const struct attribute_group *attr_groups[] = { &ddr_perf_format_attr_group, &ddr_perf_cpumask_attr_group, &ddr_perf_filter_cap_attr_group, + &ddr_perf_identifier_attr_group, NULL, }; @@ -361,25 +416,6 @@ static int ddr_perf_event_init(struct perf_event *event) return 0; } - -static void ddr_perf_event_update(struct perf_event *event) -{ - struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); - struct hw_perf_event *hwc = &event->hw; - u64 delta, prev_raw_count, new_raw_count; - int counter = hwc->idx; - - do { - prev_raw_count = local64_read(&hwc->prev_count); - new_raw_count = ddr_perf_read_counter(pmu, counter); - } while (local64_cmpxchg(&hwc->prev_count, prev_raw_count, - new_raw_count) != prev_raw_count); - - delta = (new_raw_count - prev_raw_count) & 0xFFFFFFFF; - - local64_add(delta, &event->count); -} - static void ddr_perf_counter_enable(struct ddr_pmu *pmu, int config, int counter, bool enable) { @@ -404,6 +440,56 @@ static void ddr_perf_counter_enable(struct ddr_pmu *pmu, int config, } } +static bool ddr_perf_counter_overflow(struct ddr_pmu *pmu, int counter) +{ + int val; + + val = readl_relaxed(pmu->base + counter * 4 + COUNTER_CNTL); + + return val & CNTL_OVER; +} + +static void ddr_perf_counter_clear(struct ddr_pmu *pmu, int counter) +{ + u8 reg = counter * 4 + COUNTER_CNTL; + int val; + + val = readl_relaxed(pmu->base + reg); + val &= ~CNTL_CLEAR; + writel(val, pmu->base + reg); + + val |= CNTL_CLEAR; + writel(val, pmu->base + reg); +} + +static void ddr_perf_event_update(struct perf_event *event) +{ + struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); + struct hw_perf_event *hwc = &event->hw; + u64 new_raw_count; + int counter = hwc->idx; + int ret; + + new_raw_count = ddr_perf_read_counter(pmu, counter); + local64_add(new_raw_count, &event->count); + + /* + * For legacy SoCs: event counter continue counting when overflow, + * no need to clear the counter. + * For new SoCs: event counter stop counting when overflow, need + * clear counter to let it count again. + */ + if (counter != EVENT_CYCLES_COUNTER) { + ret = ddr_perf_counter_overflow(pmu, counter); + if (ret) + dev_warn_ratelimited(pmu->dev, "events lost due to counter overflow (config 0x%llx)\n", + event->attr.config); + } + + /* clear counter every time for both cycle counter and event counter */ + ddr_perf_counter_clear(pmu, counter); +} + static void ddr_perf_event_start(struct perf_event *event, int flags) { struct ddr_pmu *pmu = to_ddr_pmu(event->pmu); @@ -537,7 +623,7 @@ static irqreturn_t ddr_perf_irq_handler(int irq, void *p) { int i; struct ddr_pmu *pmu = (struct ddr_pmu *) p; - struct perf_event *event, *cycle_event = NULL; + struct perf_event *event; /* all counter will stop if cycle counter disabled */ ddr_perf_counter_enable(pmu, @@ -547,7 +633,9 @@ static irqreturn_t ddr_perf_irq_handler(int irq, void *p) /* * When the cycle counter overflows, all counters are stopped, * and an IRQ is raised. If any other counter overflows, it - * continues counting, and no IRQ is raised. + * continues counting, and no IRQ is raised. But for new SoCs, + * such as i.MX8MP, event counter would stop when overflow, so + * we need use cycle counter to stop overflow of event counter. * * Cycles occur at least 4 times as often as other events, so we * can update all events on a cycle counter overflow and not @@ -562,17 +650,12 @@ static irqreturn_t ddr_perf_irq_handler(int irq, void *p) event = pmu->events[i]; ddr_perf_event_update(event); - - if (event->hw.idx == EVENT_CYCLES_COUNTER) - cycle_event = event; } ddr_perf_counter_enable(pmu, EVENT_CYCLES_ID, EVENT_CYCLES_COUNTER, true); - if (cycle_event) - ddr_perf_event_update(cycle_event); return IRQ_HANDLED; } diff --git a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c index 5e3645c96443..5ac6c9113767 100644 --- a/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_ddrc_pmu.c @@ -33,6 +33,7 @@ #define DDRC_INT_MASK 0x6c8 #define DDRC_INT_STATUS 0x6cc #define DDRC_INT_CLEAR 0x6d0 +#define DDRC_VERSION 0x710 /* DDRC has 8-counters */ #define DDRC_NR_COUNTERS 0x8 @@ -267,6 +268,8 @@ static int hisi_ddrc_pmu_init_data(struct platform_device *pdev, return PTR_ERR(ddrc_pmu->base); } + ddrc_pmu->identifier = readl(ddrc_pmu->base + DDRC_VERSION); + return 0; } @@ -308,10 +311,23 @@ static const struct attribute_group hisi_ddrc_pmu_cpumask_attr_group = { .attrs = hisi_ddrc_pmu_cpumask_attrs, }; +static struct device_attribute hisi_ddrc_pmu_identifier_attr = + __ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL); + +static struct attribute *hisi_ddrc_pmu_identifier_attrs[] = { + &hisi_ddrc_pmu_identifier_attr.attr, + NULL +}; + +static struct attribute_group hisi_ddrc_pmu_identifier_group = { + .attrs = hisi_ddrc_pmu_identifier_attrs, +}; + static const struct attribute_group *hisi_ddrc_pmu_attr_groups[] = { &hisi_ddrc_pmu_format_group, &hisi_ddrc_pmu_events_group, &hisi_ddrc_pmu_cpumask_attr_group, + &hisi_ddrc_pmu_identifier_group, NULL, }; diff --git a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c index 5eb8168029c0..41b2dceb5f26 100644 --- a/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_hha_pmu.c @@ -23,6 +23,7 @@ #define HHA_INT_MASK 0x0804 #define HHA_INT_STATUS 0x0808 #define HHA_INT_CLEAR 0x080C +#define HHA_VERSION 0x1cf0 #define HHA_PERF_CTRL 0x1E00 #define HHA_EVENT_CTRL 0x1E04 #define HHA_EVENT_TYPE0 0x1E80 @@ -261,6 +262,8 @@ static int hisi_hha_pmu_init_data(struct platform_device *pdev, return PTR_ERR(hha_pmu->base); } + hha_pmu->identifier = readl(hha_pmu->base + HHA_VERSION); + return 0; } @@ -320,10 +323,23 @@ static const struct attribute_group hisi_hha_pmu_cpumask_attr_group = { .attrs = hisi_hha_pmu_cpumask_attrs, }; +static struct device_attribute hisi_hha_pmu_identifier_attr = + __ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL); + +static struct attribute *hisi_hha_pmu_identifier_attrs[] = { + &hisi_hha_pmu_identifier_attr.attr, + NULL +}; + +static struct attribute_group hisi_hha_pmu_identifier_group = { + .attrs = hisi_hha_pmu_identifier_attrs, +}; + static const struct attribute_group *hisi_hha_pmu_attr_groups[] = { &hisi_hha_pmu_format_group, &hisi_hha_pmu_events_group, &hisi_hha_pmu_cpumask_attr_group, + &hisi_hha_pmu_identifier_group, NULL, }; diff --git a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c index 3e8b5eab5514..705501d18d03 100644 --- a/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_l3c_pmu.c @@ -25,6 +25,7 @@ #define L3C_INT_STATUS 0x0808 #define L3C_INT_CLEAR 0x080c #define L3C_EVENT_CTRL 0x1c00 +#define L3C_VERSION 0x1cf0 #define L3C_EVENT_TYPE0 0x1d00 /* * Each counter is 48-bits and [48:63] are reserved @@ -264,6 +265,8 @@ static int hisi_l3c_pmu_init_data(struct platform_device *pdev, return PTR_ERR(l3c_pmu->base); } + l3c_pmu->identifier = readl(l3c_pmu->base + L3C_VERSION); + return 0; } @@ -310,10 +313,23 @@ static const struct attribute_group hisi_l3c_pmu_cpumask_attr_group = { .attrs = hisi_l3c_pmu_cpumask_attrs, }; +static struct device_attribute hisi_l3c_pmu_identifier_attr = + __ATTR(identifier, 0444, hisi_uncore_pmu_identifier_attr_show, NULL); + +static struct attribute *hisi_l3c_pmu_identifier_attrs[] = { + &hisi_l3c_pmu_identifier_attr.attr, + NULL +}; + +static struct attribute_group hisi_l3c_pmu_identifier_group = { + .attrs = hisi_l3c_pmu_identifier_attrs, +}; + static const struct attribute_group *hisi_l3c_pmu_attr_groups[] = { &hisi_l3c_pmu_format_group, &hisi_l3c_pmu_events_group, &hisi_l3c_pmu_cpumask_attr_group, + &hisi_l3c_pmu_identifier_group, NULL, }; diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.c b/drivers/perf/hisilicon/hisi_uncore_pmu.c index 97aff877a4e7..9dbdc3fc3bb4 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.c +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.c @@ -119,6 +119,16 @@ int hisi_uncore_pmu_get_event_idx(struct perf_event *event) } EXPORT_SYMBOL_GPL(hisi_uncore_pmu_get_event_idx); +ssize_t hisi_uncore_pmu_identifier_attr_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct hisi_pmu *hisi_pmu = to_hisi_pmu(dev_get_drvdata(dev)); + + return snprintf(page, PAGE_SIZE, "0x%08x\n", hisi_pmu->identifier); +} +EXPORT_SYMBOL_GPL(hisi_uncore_pmu_identifier_attr_show); + static void hisi_uncore_pmu_clear_event_idx(struct hisi_pmu *hisi_pmu, int idx) { if (!hisi_uncore_pmu_counter_valid(hisi_pmu, idx)) { diff --git a/drivers/perf/hisilicon/hisi_uncore_pmu.h b/drivers/perf/hisilicon/hisi_uncore_pmu.h index b59ec22169ab..25b7cbe1f818 100644 --- a/drivers/perf/hisilicon/hisi_uncore_pmu.h +++ b/drivers/perf/hisilicon/hisi_uncore_pmu.h @@ -75,6 +75,7 @@ struct hisi_pmu { int counter_bits; /* check event code range */ int check_event; + u32 identifier; }; int hisi_uncore_pmu_counter_valid(struct hisi_pmu *hisi_pmu, int idx); @@ -97,4 +98,10 @@ ssize_t hisi_cpumask_sysfs_show(struct device *dev, struct device_attribute *attr, char *buf); int hisi_uncore_pmu_online_cpu(unsigned int cpu, struct hlist_node *node); int hisi_uncore_pmu_offline_cpu(unsigned int cpu, struct hlist_node *node); + +ssize_t hisi_uncore_pmu_identifier_attr_show(struct device *dev, + struct device_attribute *attr, + char *page); + + #endif /* __HISI_UNCORE_PMU_H__ */ diff --git a/include/linux/acpi_iort.h b/include/linux/acpi_iort.h index 20a32120bb88..1a12baa58e40 100644 --- a/include/linux/acpi_iort.h +++ b/include/linux/acpi_iort.h @@ -38,6 +38,7 @@ void iort_dma_setup(struct device *dev, u64 *dma_addr, u64 *size); const struct iommu_ops *iort_iommu_configure_id(struct device *dev, const u32 *id_in); int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head); +phys_addr_t acpi_iort_dma_get_max_cpu_address(void); #else static inline void acpi_iort_init(void) { } static inline u32 iort_msi_map_id(struct device *dev, u32 id) @@ -55,6 +56,9 @@ static inline const struct iommu_ops *iort_iommu_configure_id( static inline int iort_iommu_msi_get_resv_regions(struct device *dev, struct list_head *head) { return 0; } + +static inline phys_addr_t acpi_iort_dma_get_max_cpu_address(void) +{ return PHYS_ADDR_MAX; } #endif #endif /* __ACPI_IORT_H__ */ diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 1df06a7f10dd..14c235d902ad 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -354,26 +354,6 @@ enum zone_type { * DMA mask is assumed when ZONE_DMA32 is defined. Some 64-bit * platforms may need both zones as they support peripherals with * different DMA addressing limitations. - * - * Some examples: - * - * - i386 and x86_64 have a fixed 16M ZONE_DMA and ZONE_DMA32 for the - * rest of the lower 4G. - * - * - arm only uses ZONE_DMA, the size, up to 4G, may vary depending on - * the specific device. - * - * - arm64 has a fixed 1G ZONE_DMA and ZONE_DMA32 for the rest of the - * lower 4G. - * - * - powerpc only uses ZONE_DMA, the size, up to 2G, may vary - * depending on the specific device. - * - * - s390 uses ZONE_DMA fixed to the lower 2G. - * - * - ia64 and riscv only use ZONE_DMA32. - * - * - parisc uses neither. */ #ifdef CONFIG_ZONE_DMA ZONE_DMA, diff --git a/include/linux/of.h b/include/linux/of.h index 5d51891cbf1a..9ed5b8532c30 100644 --- a/include/linux/of.h +++ b/include/linux/of.h @@ -558,6 +558,8 @@ int of_map_id(struct device_node *np, u32 id, const char *map_name, const char *map_mask_name, struct device_node **target, u32 *id_out); +phys_addr_t of_dma_get_max_cpu_address(struct device_node *np); + #else /* CONFIG_OF */ static inline void of_core_init(void) @@ -995,6 +997,11 @@ static inline int of_map_id(struct device_node *np, u32 id, return -EINVAL; } +static inline phys_addr_t of_dma_get_max_cpu_address(struct device_node *np) +{ + return PHYS_ADDR_MAX; +} + #define of_match_ptr(_ptr) NULL #define of_match_node(_matches, _node) NULL #endif /* CONFIG_OF */ diff --git a/include/linux/perf/arm_pmu.h b/include/linux/perf/arm_pmu.h index 505480217cf1..bf7966776c55 100644 --- a/include/linux/perf/arm_pmu.h +++ b/include/linux/perf/arm_pmu.h @@ -163,6 +163,8 @@ int arm_pmu_acpi_probe(armpmu_init_fn init_fn); static inline int arm_pmu_acpi_probe(armpmu_init_fn init_fn) { return 0; } #endif +bool arm_pmu_irq_is_nmi(void); + /* Internal functions only for core arm_pmu code */ struct arm_pmu *armpmu_alloc(void); struct arm_pmu *armpmu_alloc_atomic(void); diff --git a/include/linux/scs.h b/include/linux/scs.h index 6dec390cf154..18122d9e17ff 100644 --- a/include/linux/scs.h +++ b/include/linux/scs.h @@ -15,24 +15,18 @@ #ifdef CONFIG_SHADOW_CALL_STACK -/* - * In testing, 1 KiB shadow stack size (i.e. 128 stack frames on a 64-bit - * architecture) provided ~40% safety margin on stack usage while keeping - * memory allocation overhead reasonable. - */ -#define SCS_SIZE SZ_1K +#define SCS_ORDER 0 +#define SCS_SIZE (PAGE_SIZE << SCS_ORDER) #define GFP_SCS (GFP_KERNEL | __GFP_ZERO) /* An illegal pointer value to mark the end of the shadow stack. */ #define SCS_END_MAGIC (0x5f6UL + POISON_POINTER_DELTA) -/* Allocate a static per-CPU shadow stack */ -#define DEFINE_SCS(name) \ - DEFINE_PER_CPU(unsigned long [SCS_SIZE/sizeof(long)], name) \ - #define task_scs(tsk) (task_thread_info(tsk)->scs_base) #define task_scs_sp(tsk) (task_thread_info(tsk)->scs_sp) +void *scs_alloc(int node); +void scs_free(void *s); void scs_init(void); int scs_prepare(struct task_struct *tsk, int node); void scs_release(struct task_struct *tsk); @@ -61,6 +55,8 @@ static inline bool task_scs_end_corrupted(struct task_struct *tsk) #else /* CONFIG_SHADOW_CALL_STACK */ +static inline void *scs_alloc(int node) { return NULL; } +static inline void scs_free(void *s) {} static inline void scs_init(void) {} static inline void scs_task_reset(struct task_struct *tsk) {} static inline int scs_prepare(struct task_struct *tsk, int node) { return 0; } diff --git a/include/linux/signal.h b/include/linux/signal.h index b256f9c65661..205526c4003a 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -469,4 +469,18 @@ struct seq_file; extern void render_sigset_t(struct seq_file *, const char *, sigset_t *); #endif +#ifndef arch_untagged_si_addr +/* + * Given a fault address and a signal and si_code which correspond to the + * _sigfault union member, returns the address that must appear in si_addr if + * the signal handler does not have SA_EXPOSE_TAGBITS enabled in sa_flags. + */ +static inline void __user *arch_untagged_si_addr(void __user *addr, + unsigned long sig, + unsigned long si_code) +{ + return addr; +} +#endif + #endif /* _LINUX_SIGNAL_H */ diff --git a/include/linux/signal_types.h b/include/linux/signal_types.h index f8a90ae9c6ec..68e06c75c5b2 100644 --- a/include/linux/signal_types.h +++ b/include/linux/signal_types.h @@ -68,4 +68,16 @@ struct ksignal { int sig; }; +#ifndef __ARCH_UAPI_SA_FLAGS +#ifdef SA_RESTORER +#define __ARCH_UAPI_SA_FLAGS SA_RESTORER +#else +#define __ARCH_UAPI_SA_FLAGS 0 +#endif +#endif + +#define UAPI_SA_FLAGS \ + (SA_NOCLDSTOP | SA_NOCLDWAIT | SA_SIGINFO | SA_ONSTACK | SA_RESTART | \ + SA_NODEFER | SA_RESETHAND | SA_EXPOSE_TAGBITS | __ARCH_UAPI_SA_FLAGS) + #endif /* _LINUX_SIGNAL_TYPES_H */ diff --git a/include/uapi/asm-generic/signal-defs.h b/include/uapi/asm-generic/signal-defs.h index e9304c95ceea..fe929e7b77ca 100644 --- a/include/uapi/asm-generic/signal-defs.h +++ b/include/uapi/asm-generic/signal-defs.h @@ -4,6 +4,69 @@ #include +/* + * SA_FLAGS values: + * + * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. + * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. + * SA_SIGINFO delivers the signal with SIGINFO structs. + * SA_ONSTACK indicates that a registered stack_t will be used. + * SA_RESTART flag to get restarting signals (which were the default long ago) + * SA_NODEFER prevents the current signal from being masked in the handler. + * SA_RESETHAND clears the handler when the signal is delivered. + * SA_UNSUPPORTED is a flag bit that will never be supported. Kernels from + * before the introduction of SA_UNSUPPORTED did not clear unknown bits from + * sa_flags when read using the oldact argument to sigaction and rt_sigaction, + * so this bit allows flag bit support to be detected from userspace while + * allowing an old kernel to be distinguished from a kernel that supports every + * flag bit. + * SA_EXPOSE_TAGBITS exposes an architecture-defined set of tag bits in + * siginfo.si_addr. + * + * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single + * Unix names RESETHAND and NODEFER respectively. + */ +#ifndef SA_NOCLDSTOP +#define SA_NOCLDSTOP 0x00000001 +#endif +#ifndef SA_NOCLDWAIT +#define SA_NOCLDWAIT 0x00000002 +#endif +#ifndef SA_SIGINFO +#define SA_SIGINFO 0x00000004 +#endif +/* 0x00000008 used on alpha, mips, parisc */ +/* 0x00000010 used on alpha, parisc */ +/* 0x00000020 used on alpha, parisc, sparc */ +/* 0x00000040 used on alpha, parisc */ +/* 0x00000080 used on parisc */ +/* 0x00000100 used on sparc */ +/* 0x00000200 used on sparc */ +#define SA_UNSUPPORTED 0x00000400 +#define SA_EXPOSE_TAGBITS 0x00000800 +/* 0x00010000 used on mips */ +/* 0x01000000 used on x86 */ +/* 0x02000000 used on x86 */ +/* + * New architectures should not define the obsolete + * SA_RESTORER 0x04000000 + */ +#ifndef SA_ONSTACK +#define SA_ONSTACK 0x08000000 +#endif +#ifndef SA_RESTART +#define SA_RESTART 0x10000000 +#endif +#ifndef SA_NODEFER +#define SA_NODEFER 0x40000000 +#endif +#ifndef SA_RESETHAND +#define SA_RESETHAND 0x80000000 +#endif + +#define SA_NOMASK SA_NODEFER +#define SA_ONESHOT SA_RESETHAND + #ifndef SIG_BLOCK #define SIG_BLOCK 0 /* for blocking signals */ #endif diff --git a/include/uapi/asm-generic/signal.h b/include/uapi/asm-generic/signal.h index 5c716a952cbe..f634822906e4 100644 --- a/include/uapi/asm-generic/signal.h +++ b/include/uapi/asm-generic/signal.h @@ -52,35 +52,6 @@ #define SIGRTMAX _NSIG #endif -/* - * SA_FLAGS values: - * - * SA_ONSTACK indicates that a registered stack_t will be used. - * SA_RESTART flag to get restarting signals (which were the default long ago) - * SA_NOCLDSTOP flag to turn off SIGCHLD when children stop. - * SA_RESETHAND clears the handler when the signal is delivered. - * SA_NOCLDWAIT flag on SIGCHLD to inhibit zombies. - * SA_NODEFER prevents the current signal from being masked in the handler. - * - * SA_ONESHOT and SA_NOMASK are the historical Linux names for the Single - * Unix names RESETHAND and NODEFER respectively. - */ -#define SA_NOCLDSTOP 0x00000001 -#define SA_NOCLDWAIT 0x00000002 -#define SA_SIGINFO 0x00000004 -#define SA_ONSTACK 0x08000000 -#define SA_RESTART 0x10000000 -#define SA_NODEFER 0x40000000 -#define SA_RESETHAND 0x80000000 - -#define SA_NOMASK SA_NODEFER -#define SA_ONESHOT SA_RESETHAND - -/* - * New architectures should not define the obsolete - * SA_RESTORER 0x04000000 - */ - #if !defined MINSIGSTKSZ || !defined SIGSTKSZ #define MINSIGSTKSZ 2048 #define SIGSTKSZ 8192 diff --git a/kernel/scs.c b/kernel/scs.c index 4ff4a7ba0094..e2a71fc82fa0 100644 --- a/kernel/scs.c +++ b/kernel/scs.c @@ -5,26 +5,49 @@ * Copyright (C) 2019 Google LLC */ +#include #include #include #include -#include +#include #include -static struct kmem_cache *scs_cache; - static void __scs_account(void *s, int account) { - struct page *scs_page = virt_to_page(s); + struct page *scs_page = vmalloc_to_page(s); mod_node_page_state(page_pgdat(scs_page), NR_KERNEL_SCS_KB, account * (SCS_SIZE / SZ_1K)); } -static void *scs_alloc(int node) -{ - void *s = kmem_cache_alloc_node(scs_cache, GFP_SCS, node); +/* Matches NR_CACHED_STACKS for VMAP_STACK */ +#define NR_CACHED_SCS 2 +static DEFINE_PER_CPU(void *, scs_cache[NR_CACHED_SCS]); +static void *__scs_alloc(int node) +{ + int i; + void *s; + + for (i = 0; i < NR_CACHED_SCS; i++) { + s = this_cpu_xchg(scs_cache[i], NULL); + if (s) { + kasan_unpoison_vmalloc(s, SCS_SIZE); + memset(s, 0, SCS_SIZE); + return s; + } + } + + return __vmalloc_node_range(SCS_SIZE, 1, VMALLOC_START, VMALLOC_END, + GFP_SCS, PAGE_KERNEL, 0, node, + __builtin_return_address(0)); +} + +void *scs_alloc(int node) +{ + void *s; + + s = __scs_alloc(node); if (!s) return NULL; @@ -34,21 +57,47 @@ static void *scs_alloc(int node) * Poison the allocation to catch unintentional accesses to * the shadow stack when KASAN is enabled. */ - kasan_poison_object_data(scs_cache, s); + kasan_poison_vmalloc(s, SCS_SIZE); __scs_account(s, 1); return s; } -static void scs_free(void *s) +void scs_free(void *s) { + int i; + __scs_account(s, -1); - kasan_unpoison_object_data(scs_cache, s); - kmem_cache_free(scs_cache, s); + + /* + * We cannot sleep as this can be called in interrupt context, + * so use this_cpu_cmpxchg to update the cache, and vfree_atomic + * to free the stack. + */ + + for (i = 0; i < NR_CACHED_SCS; i++) + if (this_cpu_cmpxchg(scs_cache[i], 0, s) == NULL) + return; + + vfree_atomic(s); +} + +static int scs_cleanup(unsigned int cpu) +{ + int i; + void **cache = per_cpu_ptr(scs_cache, cpu); + + for (i = 0; i < NR_CACHED_SCS; i++) { + vfree(cache[i]); + cache[i] = NULL; + } + + return 0; } void __init scs_init(void) { - scs_cache = kmem_cache_create("scs_cache", SCS_SIZE, 0, 0, NULL); + cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "scs:scs_cache", NULL, + scs_cleanup); } int scs_prepare(struct task_struct *tsk, int node) diff --git a/kernel/signal.c b/kernel/signal.c index dd83edcfaf0a..2af3f5170e4f 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2525,6 +2525,26 @@ static int ptrace_signal(int signr, kernel_siginfo_t *info) return signr; } +static void hide_si_addr_tag_bits(struct ksignal *ksig) +{ + switch (siginfo_layout(ksig->sig, ksig->info.si_code)) { + case SIL_FAULT: + case SIL_FAULT_MCEERR: + case SIL_FAULT_BNDERR: + case SIL_FAULT_PKUERR: + ksig->info.si_addr = arch_untagged_si_addr( + ksig->info.si_addr, ksig->sig, ksig->info.si_code); + break; + case SIL_KILL: + case SIL_TIMER: + case SIL_POLL: + case SIL_CHLD: + case SIL_RT: + case SIL_SYS: + break; + } +} + bool get_signal(struct ksignal *ksig) { struct sighand_struct *sighand = current->sighand; @@ -2762,6 +2782,10 @@ relock: spin_unlock_irq(&sighand->siglock); ksig->sig = signr; + + if (!(ksig->ka.sa.sa_flags & SA_EXPOSE_TAGBITS)) + hide_si_addr_tag_bits(ksig); + return ksig->sig > 0; } @@ -3986,6 +4010,22 @@ int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) if (oact) *oact = *k; + /* + * Make sure that we never accidentally claim to support SA_UNSUPPORTED, + * e.g. by having an architecture use the bit in their uapi. + */ + BUILD_BUG_ON(UAPI_SA_FLAGS & SA_UNSUPPORTED); + + /* + * Clear unknown flag bits in order to allow userspace to detect missing + * support for flag bits and to allow the kernel to use non-uapi bits + * internally. + */ + if (act) + act->sa.sa_flags &= UAPI_SA_FLAGS; + if (oact) + oact->sa.sa_flags &= UAPI_SA_FLAGS; + sigaction_compat_abi(act, oact); if (act) {