From f154066b61dfde618d98fdafc8cadde076c7f222 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 6 Feb 2022 09:20:08 -0800 Subject: [PATCH 01/30] gcc-plugins/stackleak: Provide verbose mode In order to compare instrumentation between builds, make the verbose mode of the plugin available during the build. This is rarely needed (behind EXPERT) and very noisy (disabled for COMPILE_TEST). Cc: Alexander Popov Signed-off-by: Kees Cook --- scripts/Makefile.gcc-plugins | 2 ++ security/Kconfig.hardening | 10 ++++++++++ 2 files changed, 12 insertions(+) diff --git a/scripts/Makefile.gcc-plugins b/scripts/Makefile.gcc-plugins index 1d16ca1b78c9..f67153b260c0 100644 --- a/scripts/Makefile.gcc-plugins +++ b/scripts/Makefile.gcc-plugins @@ -37,6 +37,8 @@ gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ += -fplugin-arg-stackleak_plugin-track-min-size=$(CONFIG_STACKLEAK_TRACK_MIN_SIZE) gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK) \ += -fplugin-arg-stackleak_plugin-arch=$(SRCARCH) +gcc-plugin-cflags-$(CONFIG_GCC_PLUGIN_STACKLEAK_VERBOSE) \ + += -fplugin-arg-stackleak_plugin-verbose ifdef CONFIG_GCC_PLUGIN_STACKLEAK DISABLE_STACKLEAK_PLUGIN += -fplugin-arg-stackleak_plugin-disable endif diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index d051f8ceefdd..ded4d7c0d132 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -174,6 +174,16 @@ config GCC_PLUGIN_STACKLEAK * https://grsecurity.net/ * https://pax.grsecurity.net/ +config GCC_PLUGIN_STACKLEAK_VERBOSE + bool "Report stack depth analysis instrumentation" if EXPERT + depends on GCC_PLUGIN_STACKLEAK + depends on !COMPILE_TEST # too noisy + help + This option will cause a warning to be printed each time the + stackleak plugin finds a function it thinks needs to be + instrumented. This is useful for comparing coverage between + builds. + config STACKLEAK_TRACK_MIN_SIZE int "Minimum stack frame size of functions tracked by STACKLEAK" default 100 From 27e9faf415dbf94af19b9c827842435edbc1fbbc Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 6 Feb 2022 09:08:20 -0800 Subject: [PATCH 02/30] gcc-plugins/stackleak: Exactly match strings instead of prefixes Since STRING_CST may not be NUL terminated, strncmp() was used for check for equality. However, this may lead to mismatches for longer section names where the start matches the tested-for string. Test for exact equality by checking for the presences of NUL termination. Cc: Alexander Popov Signed-off-by: Kees Cook --- scripts/gcc-plugins/stackleak_plugin.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index e9db7dcb3e5f..b04aa8e91a41 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -429,6 +429,23 @@ static unsigned int stackleak_cleanup_execute(void) return 0; } +/* + * STRING_CST may or may not be NUL terminated: + * https://gcc.gnu.org/onlinedocs/gccint/Constant-expressions.html + */ +static inline bool string_equal(tree node, const char *string, int length) +{ + if (TREE_STRING_LENGTH(node) < length) + return false; + if (TREE_STRING_LENGTH(node) > length + 1) + return false; + if (TREE_STRING_LENGTH(node) == length + 1 && + TREE_STRING_POINTER(node)[length] != '\0') + return false; + return !memcmp(TREE_STRING_POINTER(node), string, length); +} +#define STRING_EQUAL(node, str) string_equal(node, str, strlen(str)) + static bool stackleak_gate(void) { tree section; @@ -438,13 +455,13 @@ static bool stackleak_gate(void) if (section && TREE_VALUE(section)) { section = TREE_VALUE(TREE_VALUE(section)); - if (!strncmp(TREE_STRING_POINTER(section), ".init.text", 10)) + if (STRING_EQUAL(section, ".init.text")) return false; - if (!strncmp(TREE_STRING_POINTER(section), ".devinit.text", 13)) + if (STRING_EQUAL(section, ".devinit.text")) return false; - if (!strncmp(TREE_STRING_POINTER(section), ".cpuinit.text", 13)) + if (STRING_EQUAL(section, ".cpuinit.text")) return false; - if (!strncmp(TREE_STRING_POINTER(section), ".meminit.text", 13)) + if (STRING_EQUAL(section, ".meminit.text")) return false; } From ae978009fc013e3166c9f523f8b17e41a3c0286e Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sun, 6 Feb 2022 09:12:50 -0800 Subject: [PATCH 03/30] gcc-plugins/stackleak: Ignore .noinstr.text and .entry.text The .noinstr.text section functions may not have "current()" sanely available. Similarly true for .entry.text, though such a check is currently redundant. Add a check for both. In an x86_64 defconfig build, the following functions no longer receive stackleak instrumentation: __do_fast_syscall_32() do_int80_syscall_32() do_machine_check() do_syscall_64() exc_general_protection() fixup_bad_iret() Suggested-by: Peter Zijlstra Cc: Alexander Popov Signed-off-by: Kees Cook --- scripts/gcc-plugins/stackleak_plugin.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/scripts/gcc-plugins/stackleak_plugin.c b/scripts/gcc-plugins/stackleak_plugin.c index b04aa8e91a41..42f0252ee2a4 100644 --- a/scripts/gcc-plugins/stackleak_plugin.c +++ b/scripts/gcc-plugins/stackleak_plugin.c @@ -463,6 +463,10 @@ static bool stackleak_gate(void) return false; if (STRING_EQUAL(section, ".meminit.text")) return false; + if (STRING_EQUAL(section, ".noinstr.text")) + return false; + if (STRING_EQUAL(section, ".entry.text")) + return false; } return track_frame_size >= 0; From 8cb37a5974a48569aab8a1736d21399fddbdbdb2 Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 10:05:20 +0100 Subject: [PATCH 04/30] stack: Introduce CONFIG_RANDOMIZE_KSTACK_OFFSET The randomize_kstack_offset feature is unconditionally compiled in when the architecture supports it. To add constraints on compiler versions, we require a dedicated Kconfig variable. Therefore, introduce RANDOMIZE_KSTACK_OFFSET. Furthermore, this option is now also configurable by EXPERT kernels: while the feature is supposed to have zero performance overhead when disabled, due to its use of static branches, there are few cases where giving a distribution the option to disable the feature entirely makes sense. For example, in very resource constrained environments, which would never enable the feature to begin with, in which case the additional kernel code size increase would be redundant. Signed-off-by: Marco Elver Reviewed-by: Nathan Chancellor Acked-by: Peter Zijlstra (Intel) Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220131090521.1947110-1-elver@google.com --- arch/Kconfig | 23 ++++++++++++++++++----- include/linux/randomize_kstack.h | 5 +++++ init/main.c | 2 +- 3 files changed, 24 insertions(+), 6 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 678a80713b21..2cde48d9b77c 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1159,16 +1159,29 @@ config HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET to the compiler, so it will attempt to add canary checks regardless of the static branch state. -config RANDOMIZE_KSTACK_OFFSET_DEFAULT - bool "Randomize kernel stack offset on syscall entry" +config RANDOMIZE_KSTACK_OFFSET + bool "Support for randomizing kernel stack offset on syscall entry" if EXPERT + default y depends on HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET help The kernel stack offset can be randomized (after pt_regs) by roughly 5 bits of entropy, frustrating memory corruption attacks that depend on stack address determinism or - cross-syscall address exposures. This feature is controlled - by kernel boot param "randomize_kstack_offset=on/off", and this - config chooses the default boot state. + cross-syscall address exposures. + + The feature is controlled via the "randomize_kstack_offset=on/off" + kernel boot param, and if turned off has zero overhead due to its use + of static branches (see JUMP_LABEL). + + If unsure, say Y. + +config RANDOMIZE_KSTACK_OFFSET_DEFAULT + bool "Default state of kernel stack offset randomization" + depends on RANDOMIZE_KSTACK_OFFSET + help + Kernel stack offset randomization is controlled by kernel boot param + "randomize_kstack_offset=on/off", and this config chooses the default + boot state. config ARCH_OPTIONAL_KERNEL_RWX def_bool n diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index bebc911161b6..91f1b990a3c3 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -2,6 +2,7 @@ #ifndef _LINUX_RANDOMIZE_KSTACK_H #define _LINUX_RANDOMIZE_KSTACK_H +#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET #include #include #include @@ -50,5 +51,9 @@ void *__builtin_alloca(size_t size); raw_cpu_write(kstack_offset, offset); \ } \ } while (0) +#else /* CONFIG_RANDOMIZE_KSTACK_OFFSET */ +#define add_random_kstack_offset() do { } while (0) +#define choose_random_kstack_offset(rand) do { } while (0) +#endif /* CONFIG_RANDOMIZE_KSTACK_OFFSET */ #endif diff --git a/init/main.c b/init/main.c index 65fa2e41a9c0..560f45c27ffe 100644 --- a/init/main.c +++ b/init/main.c @@ -853,7 +853,7 @@ static void __init mm_init(void) pti_init(); } -#ifdef CONFIG_HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET +#ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET DEFINE_STATIC_KEY_MAYBE_RO(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, randomize_kstack_offset); DEFINE_PER_CPU(u32, kstack_offset); From efa90c11f62e6b7252fb75efe2787056872a627c Mon Sep 17 00:00:00 2001 From: Marco Elver Date: Mon, 31 Jan 2022 10:05:21 +0100 Subject: [PATCH 05/30] stack: Constrain and fix stack offset randomization with Clang builds All supported versions of Clang perform auto-init of __builtin_alloca() when stack auto-init is on (CONFIG_INIT_STACK_ALL_{ZERO,PATTERN}). add_random_kstack_offset() uses __builtin_alloca() to add a stack offset. This means, when CONFIG_INIT_STACK_ALL_{ZERO,PATTERN} is enabled, add_random_kstack_offset() will auto-init that unused portion of the stack used to add an offset. There are several problems with this: 1. These offsets can be as large as 1023 bytes. Performing memset() on them isn't exactly cheap, and this is done on every syscall entry. 2. Architectures adding add_random_kstack_offset() to syscall entry implemented in C require them to be 'noinstr' (e.g. see x86 and s390). The potential problem here is that a call to memset may occur, which is not noinstr. A x86_64 defconfig kernel with Clang 11 and CONFIG_VMLINUX_VALIDATION shows: | vmlinux.o: warning: objtool: do_syscall_64()+0x9d: call to memset() leaves .noinstr.text section | vmlinux.o: warning: objtool: do_int80_syscall_32()+0xab: call to memset() leaves .noinstr.text section | vmlinux.o: warning: objtool: __do_fast_syscall_32()+0xe2: call to memset() leaves .noinstr.text section | vmlinux.o: warning: objtool: fixup_bad_iret()+0x2f: call to memset() leaves .noinstr.text section Clang 14 (unreleased) will introduce a way to skip alloca initialization via __builtin_alloca_uninitialized() (https://reviews.llvm.org/D115440). Constrain RANDOMIZE_KSTACK_OFFSET to only be enabled if no stack auto-init is enabled, the compiler is GCC, or Clang is version 14+. Use __builtin_alloca_uninitialized() if the compiler provides it, as is done by Clang 14. Link: https://lkml.kernel.org/r/YbHTKUjEejZCLyhX@elver.google.com Fixes: 39218ff4c625 ("stack: Optionally randomize kernel stack offset each syscall") Signed-off-by: Marco Elver Reviewed-by: Nathan Chancellor Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220131090521.1947110-2-elver@google.com --- arch/Kconfig | 1 + include/linux/randomize_kstack.h | 16 ++++++++++++++-- 2 files changed, 15 insertions(+), 2 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 2cde48d9b77c..c5b50bfe31c1 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1163,6 +1163,7 @@ config RANDOMIZE_KSTACK_OFFSET bool "Support for randomizing kernel stack offset on syscall entry" if EXPERT default y depends on HAVE_ARCH_RANDOMIZE_KSTACK_OFFSET + depends on INIT_STACK_NONE || !CC_IS_CLANG || CLANG_VERSION >= 140000 help The kernel stack offset can be randomized (after pt_regs) by roughly 5 bits of entropy, frustrating memory corruption diff --git a/include/linux/randomize_kstack.h b/include/linux/randomize_kstack.h index 91f1b990a3c3..1468caf001c0 100644 --- a/include/linux/randomize_kstack.h +++ b/include/linux/randomize_kstack.h @@ -17,8 +17,20 @@ DECLARE_PER_CPU(u32, kstack_offset); * alignment. Also, since this use is being explicitly masked to a max of * 10 bits, stack-clash style attacks are unlikely. For more details see * "VLAs" in Documentation/process/deprecated.rst + * + * The normal __builtin_alloca() is initialized with INIT_STACK_ALL (currently + * only with Clang and not GCC). Initializing the unused area on each syscall + * entry is expensive, and generating an implicit call to memset() may also be + * problematic (such as in noinstr functions). Therefore, if the compiler + * supports it (which it should if it initializes allocas), always use the + * "uninitialized" variant of the builtin. */ -void *__builtin_alloca(size_t size); +#if __has_builtin(__builtin_alloca_uninitialized) +#define __kstack_alloca __builtin_alloca_uninitialized +#else +#define __kstack_alloca __builtin_alloca +#endif + /* * Use, at most, 10 bits of entropy. We explicitly cap this to keep the * "VLA" from being unbounded (see above). 10 bits leaves enough room for @@ -37,7 +49,7 @@ void *__builtin_alloca(size_t size); if (static_branch_maybe(CONFIG_RANDOMIZE_KSTACK_OFFSET_DEFAULT, \ &randomize_kstack_offset)) { \ u32 offset = raw_cpu_read(kstack_offset); \ - u8 *ptr = __builtin_alloca(KSTACK_OFFSET_MAX(offset)); \ + u8 *ptr = __kstack_alloca(KSTACK_OFFSET_MAX(offset)); \ /* Keep allocation even after "ptr" loses scope. */ \ asm volatile("" :: "r"(ptr) : "memory"); \ } \ From 2792d84e6da5e0fd7d3b22fd70bc69b7ee263609 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 16 Feb 2022 12:05:28 -0800 Subject: [PATCH 06/30] usercopy: Check valid lifetime via stack depth One of the things that CONFIG_HARDENED_USERCOPY sanity-checks is whether an object that is about to be copied to/from userspace is overlapping the stack at all. If it is, it performs a number of inexpensive bounds checks. One of the finer-grained checks is whether an object crosses stack frames within the stack region. Doing this on x86 with CONFIG_FRAME_POINTER was cheap/easy. Doing it with ORC was deemed too heavy, and was left out (a while ago), leaving the courser whole-stack check. The LKDTM tests USERCOPY_STACK_FRAME_TO and USERCOPY_STACK_FRAME_FROM try to exercise these cross-frame cases to validate the defense is working. They have been failing ever since ORC was added (which was expected). While Muhammad was investigating various LKDTM failures[1], he asked me for additional details on them, and I realized that when exact stack frame boundary checking is not available (i.e. everything except x86 with FRAME_POINTER), it could check if a stack object is at least "current depth valid", in the sense that any object within the stack region but not between start-of-stack and current_stack_pointer should be considered unavailable (i.e. its lifetime is from a call no longer present on the stack). Introduce ARCH_HAS_CURRENT_STACK_POINTER to track which architectures have actually implemented the common global register alias. Additionally report usercopy bounds checking failures with an offset from current_stack_pointer, which may assist with diagnosing failures. The LKDTM USERCOPY_STACK_FRAME_TO and USERCOPY_STACK_FRAME_FROM tests (once slightly adjusted in a separate patch) pass again with this fixed. [1] https://github.com/kernelci/kernelci-project/issues/84 Cc: Matthew Wilcox (Oracle) Cc: Josh Poimboeuf Cc: Andrew Morton Cc: linux-mm@kvack.org Reported-by: Muhammad Usama Anjum Signed-off-by: Kees Cook --- v1: https://lore.kernel.org/lkml/20220216201449.2087956-1-keescook@chromium.org v2: https://lore.kernel.org/lkml/20220224060342.1855457-1-keescook@chromium.org v3: https://lore.kernel.org/lkml/20220225173345.3358109-1-keescook@chromium.org v4: - improve commit log (akpm) --- arch/arm/Kconfig | 1 + arch/arm64/Kconfig | 1 + arch/powerpc/Kconfig | 1 + arch/s390/Kconfig | 1 + arch/sh/Kconfig | 1 + arch/x86/Kconfig | 1 + mm/Kconfig | 9 +++++++++ mm/usercopy.c | 23 +++++++++++++++++++++-- 8 files changed, 36 insertions(+), 2 deletions(-) diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 4c97cb40eebb..a7a09eef1852 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -5,6 +5,7 @@ config ARM select ARCH_32BIT_OFF_T select ARCH_CORRECT_STACKTRACE_ON_KRETPROBE if HAVE_KRETPROBES && FRAME_POINTER && !ARM_UNWIND select ARCH_HAS_BINFMT_FLAT + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL if MMU select ARCH_HAS_DMA_WRITE_COMBINE if !ARM_DMA_MEM_BUFFERABLE select ARCH_HAS_ELF_RANDOMIZE diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index f2b5a4abef21..b8ab790555c8 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -18,6 +18,7 @@ config ARM64 select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 select ARCH_ENABLE_THP_MIGRATION if TRANSPARENT_HUGEPAGE select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DMA_PREP_COHERENT diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index b779603978e1..7e7387bd7d53 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -108,6 +108,7 @@ config PPC select ARCH_ENABLE_MEMORY_HOTPLUG select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_HAS_COPY_MC if PPC64 + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX if STRICT_KERNEL_RWX diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index be9f39fd06df..4845ab549dd1 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -60,6 +60,7 @@ config S390 select ARCH_ENABLE_MEMORY_HOTPLUG if SPARSEMEM select ARCH_ENABLE_MEMORY_HOTREMOVE select ARCH_ENABLE_SPLIT_PMD_PTLOCK if PGTABLE_LEVELS > 2 + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VM_PGTABLE select ARCH_HAS_DEBUG_WX select ARCH_HAS_DEVMEM_IS_ALLOWED diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2474a04ceac4..1c2b53bf3093 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -7,6 +7,7 @@ config SUPERH select ARCH_HAVE_CUSTOM_GPIO_H select ARCH_HAVE_NMI_SAFE_CMPXCHG if (GUSA_RB || CPU_SH4A) select ARCH_HAS_BINFMT_FLAT if !MMU + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_GIGANTIC_PAGE select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PTE_SPECIAL diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 9f5bd41bf660..90494fba3620 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,6 +69,7 @@ config X86 select ARCH_ENABLE_THP_MIGRATION if X86_64 && TRANSPARENT_HUGEPAGE select ARCH_HAS_ACPI_TABLE_UPGRADE if ACPI select ARCH_HAS_CACHE_LINE_SIZE + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DEBUG_VIRTUAL select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE select ARCH_HAS_DEVMEM_IS_ALLOWED diff --git a/mm/Kconfig b/mm/Kconfig index 3326ee3903f3..c349599601f8 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -744,6 +744,15 @@ config IDLE_PAGE_TRACKING config ARCH_HAS_CACHE_LINE_SIZE bool +config ARCH_HAS_CURRENT_STACK_POINTER + bool + help + In support of HARDENED_USERCOPY performing stack variable lifetime + checking, an architecture-agnostic way to find the stack pointer + is needed. Once an architecture defines an unsigned long global + register alias named "current_stack_pointer", this config can be + selected. + config ARCH_HAS_PTE_DEVMAP bool diff --git a/mm/usercopy.c b/mm/usercopy.c index d0d268135d96..5d34c40c16c2 100644 --- a/mm/usercopy.c +++ b/mm/usercopy.c @@ -29,7 +29,7 @@ * Returns: * NOT_STACK: not at all on the stack * GOOD_FRAME: fully within a valid stack frame - * GOOD_STACK: fully on the stack (when can't do frame-checking) + * GOOD_STACK: within the current stack (when can't frame-check exactly) * BAD_STACK: error condition (invalid stack position or bad stack frame) */ static noinline int check_stack_object(const void *obj, unsigned long len) @@ -55,6 +55,17 @@ static noinline int check_stack_object(const void *obj, unsigned long len) if (ret) return ret; + /* Finally, check stack depth if possible. */ +#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER + if (IS_ENABLED(CONFIG_STACK_GROWSUP)) { + if ((void *)current_stack_pointer < obj + len) + return BAD_STACK; + } else { + if (obj < (void *)current_stack_pointer) + return BAD_STACK; + } +#endif + return GOOD_STACK; } @@ -280,7 +291,15 @@ void __check_object_size(const void *ptr, unsigned long n, bool to_user) */ return; default: - usercopy_abort("process stack", NULL, to_user, 0, n); + usercopy_abort("process stack", NULL, to_user, +#ifdef CONFIG_ARCH_HAS_CURRENT_STACK_POINTER + IS_ENABLED(CONFIG_STACK_GROWSUP) ? + ptr - (void *)current_stack_pointer : + (void *)current_stack_pointer - ptr, +#else + 0, +#endif + n); } /* Check for bad heap object. */ From 92652cf986441b18282db6e5bd82afc74e8ed5e9 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 19 Feb 2022 17:26:00 -0800 Subject: [PATCH 07/30] xtensa: Implement "current_stack_pointer" To follow the existing per-arch conventions replace open-coded uses of asm "sp" as "current_stack_pointer". This will let it be used in non-arch places (like HARDENED_USERCOPY). Cc: Chris Zankel Cc: Marc Zyngier Cc: linux-xtensa@linux-xtensa.org Signed-off-by: Kees Cook Acked-by: Max Filippov Link: https://lore.kernel.org/lkml/CAMo8BfJFJE-n3=AF+pb9_6oF3gzxX7a+7aBrASHjjNX5byqDqw@mail.gmail.com --- arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/current.h | 2 ++ arch/xtensa/include/asm/stacktrace.h | 8 ++++---- arch/xtensa/kernel/irq.c | 3 +-- 4 files changed, 8 insertions(+), 6 deletions(-) diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 8ac599aa6d99..887432327613 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -3,6 +3,7 @@ config XTENSA def_bool y select ARCH_32BIT_OFF_T select ARCH_HAS_BINFMT_FLAT if !MMU + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DMA_PREP_COHERENT if MMU select ARCH_HAS_SYNC_DMA_FOR_CPU if MMU select ARCH_HAS_SYNC_DMA_FOR_DEVICE if MMU diff --git a/arch/xtensa/include/asm/current.h b/arch/xtensa/include/asm/current.h index 5d98a7ad4251..08010dbf5e09 100644 --- a/arch/xtensa/include/asm/current.h +++ b/arch/xtensa/include/asm/current.h @@ -26,6 +26,8 @@ static inline struct task_struct *get_current(void) #define current get_current() +register unsigned long current_stack_pointer __asm__("a1"); + #else #define GET_CURRENT(reg,sp) \ diff --git a/arch/xtensa/include/asm/stacktrace.h b/arch/xtensa/include/asm/stacktrace.h index fe06e8ed162b..a85e785a6288 100644 --- a/arch/xtensa/include/asm/stacktrace.h +++ b/arch/xtensa/include/asm/stacktrace.h @@ -19,14 +19,14 @@ struct stackframe { static __always_inline unsigned long *stack_pointer(struct task_struct *task) { - unsigned long *sp; + unsigned long sp; if (!task || task == current) - __asm__ __volatile__ ("mov %0, a1\n" : "=a"(sp)); + sp = current_stack_pointer; else - sp = (unsigned long *)task->thread.sp; + sp = task->thread.sp; - return sp; + return (unsigned long *)sp; } void walk_stackframe(unsigned long *sp, diff --git a/arch/xtensa/kernel/irq.c b/arch/xtensa/kernel/irq.c index 15051a8a1539..529fe9245821 100644 --- a/arch/xtensa/kernel/irq.c +++ b/arch/xtensa/kernel/irq.c @@ -36,9 +36,8 @@ asmlinkage void do_IRQ(int hwirq, struct pt_regs *regs) #ifdef CONFIG_DEBUG_STACKOVERFLOW /* Debugging check for stack overflow: is there less than 1KB free? */ { - unsigned long sp; + unsigned long sp = current_stack_pointer; - __asm__ __volatile__ ("mov %0, a1\n" : "=a" (sp)); sp &= THREAD_SIZE - 1; if (unlikely(sp < (sizeof(thread_info) + 1024))) From 575d6b77fa2697afd2b3a443f7f879faa65ae0ca Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Sat, 19 Feb 2022 17:20:02 -0800 Subject: [PATCH 08/30] m68k: Implement "current_stack_pointer" To follow the existing per-arch conventions, add asm "sp" as "current_stack_pointer". This will let it be used in non-arch places (like HARDENED_USERCOPY). Cc: linux-m68k@lists.linux-m68k.org Signed-off-by: Kees Cook Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/lkml/CAMuHMdU6msvi0j=mS28GFYbm+uMRk7PkYe+zOM4sDmOVxeibLQ@mail.gmail.com --- arch/m68k/Kconfig | 1 + arch/m68k/include/asm/current.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 936e1803c7c7..f0eac0e2f123 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -4,6 +4,7 @@ config M68K default y select ARCH_32BIT_OFF_T select ARCH_HAS_BINFMT_FLAT + select ARCH_HAS_CURRENT_STACK_POINTER select ARCH_HAS_DMA_PREP_COHERENT if HAS_DMA && MMU && !COLDFIRE select ARCH_HAS_SYNC_DMA_FOR_DEVICE if HAS_DMA select ARCH_HAVE_NMI_SAFE_CMPXCHG if RMW_INSNS diff --git a/arch/m68k/include/asm/current.h b/arch/m68k/include/asm/current.h index 6390ef2f7f86..c117907e1276 100644 --- a/arch/m68k/include/asm/current.h +++ b/arch/m68k/include/asm/current.h @@ -24,6 +24,8 @@ static inline struct task_struct *get_current(void) #define current get_current() -#endif /* CONFNIG_MMU */ +#endif /* CONFIG_MMU */ + +register unsigned long current_stack_pointer __asm__("sp"); #endif /* !(_M68K_CURRENT_H) */ From 023bbde3db41078780f5d57e5354212f974c4bca Mon Sep 17 00:00:00 2001 From: Vincent Whitchurch Date: Tue, 1 Mar 2022 15:49:32 +0100 Subject: [PATCH 09/30] pstore: Add prefix to ECC messages The "No errors detected" message from the ECC code is shown at the end of the pstore log and can be confusing or misleading, especially since it usually appears just after a kernel crash log which normally means quite the opposite of "no errors". Prefix the message to clarify that this message is only about ECC-detected errors. Signed-off-by: Vincent Whitchurch Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220301144932.89549-1-vincent.whitchurch@axis.com --- fs/pstore/ram_core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/pstore/ram_core.c b/fs/pstore/ram_core.c index fe5305028c6e..a89e33719fcf 100644 --- a/fs/pstore/ram_core.c +++ b/fs/pstore/ram_core.c @@ -263,10 +263,10 @@ ssize_t persistent_ram_ecc_string(struct persistent_ram_zone *prz, if (prz->corrected_bytes || prz->bad_blocks) ret = snprintf(str, len, "" - "\n%d Corrected bytes, %d unrecoverable blocks\n", + "\nECC: %d Corrected bytes, %d unrecoverable blocks\n", prz->corrected_bytes, prz->bad_blocks); else - ret = snprintf(str, len, "\nNo errors detected\n"); + ret = snprintf(str, len, "\nECC: No errors detected\n"); return ret; } From 10b19249192ae28ee9092ceca327a83d27d88bd0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 3 Oct 2021 15:11:24 +0300 Subject: [PATCH 10/30] ELF: fix overflow in total mapping size calculation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Kernel assumes that ELF program headers are ordered by mapping address, but doesn't enforce it. It is possible to make mapping size extremely huge by simply shuffling first and last PT_LOAD segments. As long as PT_LOAD segments do not overlap, it is silly to require sorting by v_addr anyway because mmap() doesn't care. Don't assume PT_LOAD segments are sorted and calculate min and max addresses correctly. Signed-off-by: Alexey Dobriyan Tested-by: "Magnus Groß" Link: https://lore.kernel.org/all/Yfqm7HbucDjPbES+@fractal.localdomain/ Signed-off-by: Kees Cook Link: https://lore.kernel.org/lkml/YVmd7D0M6G%2FDcP4O@localhost.localdomain --- fs/binfmt_elf.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index c36b1ec357fb..65ea5381c5c7 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -93,7 +93,7 @@ static int elf_core_dump(struct coredump_params *cprm); #define ELF_CORE_EFLAGS 0 #endif -#define ELF_PAGESTART(_v) ((_v) & ~(unsigned long)(ELF_MIN_ALIGN-1)) +#define ELF_PAGESTART(_v) ((_v) & ~(int)(ELF_MIN_ALIGN-1)) #define ELF_PAGEOFFSET(_v) ((_v) & (ELF_MIN_ALIGN-1)) #define ELF_PAGEALIGN(_v) (((_v) + ELF_MIN_ALIGN - 1) & ~(ELF_MIN_ALIGN - 1)) @@ -399,22 +399,21 @@ static unsigned long elf_map(struct file *filep, unsigned long addr, return(map_addr); } -static unsigned long total_mapping_size(const struct elf_phdr *cmds, int nr) +static unsigned long total_mapping_size(const struct elf_phdr *phdr, int nr) { - int i, first_idx = -1, last_idx = -1; + elf_addr_t min_addr = -1; + elf_addr_t max_addr = 0; + bool pt_load = false; + int i; for (i = 0; i < nr; i++) { - if (cmds[i].p_type == PT_LOAD) { - last_idx = i; - if (first_idx == -1) - first_idx = i; + if (phdr[i].p_type == PT_LOAD) { + min_addr = min(min_addr, ELF_PAGESTART(phdr[i].p_vaddr)); + max_addr = max(max_addr, phdr[i].p_vaddr + phdr[i].p_memsz); + pt_load = true; } } - if (first_idx == -1) - return 0; - - return cmds[last_idx].p_vaddr + cmds[last_idx].p_memsz - - ELF_PAGESTART(cmds[first_idx].p_vaddr); + return pt_load ? (max_addr - min_addr) : 0; } static int elf_read(struct file *file, void *buf, size_t len, loff_t pos) From 84158b7f6a0624b81800b4e7c90f7fb7fdecf66c Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Wed, 26 Jan 2022 03:57:39 +0100 Subject: [PATCH 11/30] coredump: Also dump first pages of non-executable ELF libraries When I rewrote the VMA dumping logic for coredumps, I changed it to recognize ELF library mappings based on the file being executable instead of the mapping having an ELF header. But turns out, distros ship many ELF libraries as non-executable, so the heuristic goes wrong... Restore the old behavior where FILTER(ELF_HEADERS) dumps the first page of any offset-0 readable mapping that starts with the ELF magic. This fix is technically layer-breaking a bit, because it checks for something ELF-specific in fs/coredump.c; but since we probably want to share this between standard ELF and FDPIC ELF anyway, I guess it's fine? And this also keeps the change small for backporting. Cc: stable@vger.kernel.org Fixes: 429a22e776a2 ("coredump: rework elf/elf_fdpic vma_dump_size() into common helper") Reported-by: Bill Messmer Signed-off-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220126025739.2014888-1-jannh@google.com --- fs/coredump.c | 39 ++++++++++++++++++++++++++++++++++----- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 1c060c0a2d72..b73817712dd2 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include @@ -980,6 +981,8 @@ static bool always_dump_vma(struct vm_area_struct *vma) return false; } +#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1 + /* * Decide how much of @vma's contents should be included in a core dump. */ @@ -1039,9 +1042,20 @@ static unsigned long vma_dump_size(struct vm_area_struct *vma, * dump the first page to aid in determining what was mapped here. */ if (FILTER(ELF_HEADERS) && - vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) && - (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) - return PAGE_SIZE; + vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) { + if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0) + return PAGE_SIZE; + + /* + * ELF libraries aren't always executable. + * We'll want to check whether the mapping starts with the ELF + * magic, but not now - we're holding the mmap lock, + * so copy_from_user() doesn't work here. + * Use a placeholder instead, and fix it up later in + * dump_vma_snapshot(). + */ + return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER; + } #undef FILTER @@ -1116,8 +1130,6 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); - - vma_data_size += m->dump_size; } mmap_write_unlock(mm); @@ -1127,6 +1139,23 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, return -EFAULT; } + for (i = 0; i < *vma_count; i++) { + struct core_vma_metadata *m = (*vma_meta) + i; + + if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { + char elfmag[SELFMAG]; + + if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) || + memcmp(elfmag, ELFMAG, SELFMAG) != 0) { + m->dump_size = 0; + } else { + m->dump_size = PAGE_SIZE; + } + } + + vma_data_size += m->dump_size; + } + *vma_data_size_ptr = vma_data_size; return 0; } From dcd46d897adb70d63e025f175a00a89797d31a43 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 31 Jan 2022 16:09:47 -0800 Subject: [PATCH 12/30] exec: Force single empty string when argv is empty Quoting[1] Ariadne Conill: "In several other operating systems, it is a hard requirement that the second argument to execve(2) be the name of a program, thus prohibiting a scenario where argc < 1. POSIX 2017 also recommends this behaviour, but it is not an explicit requirement[2]: The argument arg0 should point to a filename string that is associated with the process being started by one of the exec functions. ... Interestingly, Michael Kerrisk opened an issue about this in 2008[3], but there was no consensus to support fixing this issue then. Hopefully now that CVE-2021-4034 shows practical exploitative use[4] of this bug in a shellcode, we can reconsider. This issue is being tracked in the KSPP issue tracker[5]." While the initial code searches[6][7] turned up what appeared to be mostly corner case tests, trying to that just reject argv == NULL (or an immediately terminated pointer list) quickly started tripping[8] existing userspace programs. The next best approach is forcing a single empty string into argv and adjusting argc to match. The number of programs depending on argc == 0 seems a smaller set than those calling execve with a NULL argv. Account for the additional stack space in bprm_stack_limits(). Inject an empty string when argc == 0 (and set argc = 1). Warn about the case so userspace has some notice about the change: process './argc0' launched './argc0' with NULL argv: empty string added Additionally WARN() and reject NULL argv usage for kernel threads. [1] https://lore.kernel.org/lkml/20220127000724.15106-1-ariadne@dereferenced.org/ [2] https://pubs.opengroup.org/onlinepubs/9699919799/functions/exec.html [3] https://bugzilla.kernel.org/show_bug.cgi?id=8408 [4] https://www.qualys.com/2022/01/25/cve-2021-4034/pwnkit.txt [5] https://github.com/KSPP/linux/issues/176 [6] https://codesearch.debian.net/search?q=execve%5C+*%5C%28%5B%5E%2C%5D%2B%2C+*NULL&literal=0 [7] https://codesearch.debian.net/search?q=execlp%3F%5Cs*%5C%28%5B%5E%2C%5D%2B%2C%5Cs*NULL&literal=0 [8] https://lore.kernel.org/lkml/20220131144352.GE16385@xsang-OptiPlex-9020/ Reported-by: Ariadne Conill Reported-by: Michael Kerrisk Cc: Matthew Wilcox Cc: Christian Brauner Cc: Rich Felker Cc: Eric Biederman Cc: Alexander Viro Cc: linux-fsdevel@vger.kernel.org Cc: stable@vger.kernel.org Signed-off-by: Kees Cook Acked-by: Christian Brauner Acked-by: Ariadne Conill Acked-by: Andy Lutomirski Link: https://lore.kernel.org/r/20220201000947.2453721-1-keescook@chromium.org --- fs/exec.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index 79f2c9483302..40b1008fb0f7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -495,8 +495,14 @@ static int bprm_stack_limits(struct linux_binprm *bprm) * the stack. They aren't stored until much later when we can't * signal to the parent that the child has run out of stack space. * Instead, calculate it here so it's possible to fail gracefully. + * + * In the case of argc = 0, make sure there is space for adding a + * empty string (which will bump argc to 1), to ensure confused + * userspace programs don't start processing from argv[1], thinking + * argc can never be 0, to keep them from walking envp by accident. + * See do_execveat_common(). */ - ptr_size = (bprm->argc + bprm->envc) * sizeof(void *); + ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *); if (limit <= ptr_size) return -E2BIG; limit -= ptr_size; @@ -1897,6 +1903,9 @@ static int do_execveat_common(int fd, struct filename *filename, } retval = count(argv, MAX_ARG_STRINGS); + if (retval == 0) + pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n", + current->comm, bprm->filename); if (retval < 0) goto out_free; bprm->argc = retval; @@ -1923,6 +1932,19 @@ static int do_execveat_common(int fd, struct filename *filename, if (retval < 0) goto out_free; + /* + * When argv is empty, add an empty string ("") as argv[0] to + * ensure confused userspace programs that start processing + * from argv[1] won't end up walking envp. See also + * bprm_stack_limits(). + */ + if (bprm->argc == 0) { + retval = copy_string_kernel("", bprm); + if (retval < 0) + goto out_free; + bprm->argc = 1; + } + retval = bprm_execve(bprm, fd, filename, flags); out_free: free_bprm(bprm); @@ -1951,6 +1973,8 @@ int kernel_execve(const char *kernel_filename, } retval = count_strings_kernel(argv); + if (WARN_ON_ONCE(retval == 0)) + retval = -EINVAL; if (retval < 0) goto out_free; bprm->argc = retval; From 9132c3947b09a6c67372424ff69f867f2cee82f8 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Mon, 31 Jan 2022 17:16:37 -0800 Subject: [PATCH 13/30] selftests/exec: Test for empty string on NULL argv Test for the NULL argv argument producing a single empty string on exec. Cc: Eric Biederman Cc: Shuah Khan Cc: Yang Yingliang Cc: Andrew Morton Cc: linux-kselftest@vger.kernel.org Signed-off-by: Kees Cook Link: https://lore.kernel.org/lkml/20220201011637.2457646-1-keescook@chromium.org --- tools/testing/selftests/exec/Makefile | 1 + tools/testing/selftests/exec/null-argv.c | 78 ++++++++++++++++++++++++ 2 files changed, 79 insertions(+) create mode 100644 tools/testing/selftests/exec/null-argv.c diff --git a/tools/testing/selftests/exec/Makefile b/tools/testing/selftests/exec/Makefile index 12c5e27d32c1..551affb437fe 100644 --- a/tools/testing/selftests/exec/Makefile +++ b/tools/testing/selftests/exec/Makefile @@ -10,6 +10,7 @@ TEST_GEN_FILES := execveat.symlink execveat.denatured script subdir TEST_FILES := Makefile TEST_GEN_PROGS += recursion-depth +TEST_GEN_PROGS += null-argv EXTRA_CLEAN := $(OUTPUT)/subdir.moved $(OUTPUT)/execveat.moved $(OUTPUT)/xxxxx* \ $(OUTPUT)/S_I*.test diff --git a/tools/testing/selftests/exec/null-argv.c b/tools/testing/selftests/exec/null-argv.c new file mode 100644 index 000000000000..c19726e710d1 --- /dev/null +++ b/tools/testing/selftests/exec/null-argv.c @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Test that empty argvs are swapped out for a single empty string. */ +#include +#include +#include +#include + +#include "../kselftest.h" + +#define FORK(exec) \ +do { \ + pid = fork(); \ + if (pid == 0) { \ + /* Child */ \ + exec; /* Some kind of exec */ \ + perror("# " #exec); \ + return 1; \ + } \ + check_result(pid, #exec); \ +} while (0) + +void check_result(pid_t pid, const char *msg) +{ + int wstatus; + + if (pid == (pid_t)-1) { + perror("# fork"); + ksft_test_result_fail("fork failed: %s\n", msg); + return; + } + if (waitpid(pid, &wstatus, 0) < 0) { + perror("# waitpid"); + ksft_test_result_fail("waitpid failed: %s\n", msg); + return; + } + if (!WIFEXITED(wstatus)) { + ksft_test_result_fail("child did not exit: %s\n", msg); + return; + } + if (WEXITSTATUS(wstatus) != 0) { + ksft_test_result_fail("non-zero exit: %s\n", msg); + return; + } + ksft_test_result_pass("%s\n", msg); +} + +int main(int argc, char *argv[], char *envp[]) +{ + pid_t pid; + static char * const args[] = { NULL }; + static char * const str[] = { "", NULL }; + + /* argc counting checks */ + if (argc < 1) { + fprintf(stderr, "# FAIL: saw argc == 0 (old kernel?)\n"); + return 1; + } + if (argc != 1) { + fprintf(stderr, "# FAIL: unknown argc (%d)\n", argc); + return 1; + } + if (argv[0][0] == '\0') { + /* Good, we found a NULL terminated string at argv[0]! */ + return 0; + } + + /* Test runner. */ + ksft_print_header(); + ksft_set_plan(5); + + FORK(execve(argv[0], str, NULL)); + FORK(execve(argv[0], NULL, NULL)); + FORK(execve(argv[0], NULL, envp)); + FORK(execve(argv[0], args, NULL)); + FORK(execve(argv[0], args, envp)); + + ksft_exit(ksft_cnt.ksft_pass == ksft_plan); +} From d65bc29be0ae4ca2368df25dc6f6247aefb57f07 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 13 Feb 2022 22:25:20 +0300 Subject: [PATCH 14/30] binfmt: move more stuff undef CONFIG_COREDUMP struct linux_binfmt::core_dump and struct min_coredump::min_coredump are used under CONFIG_COREDUMP only. Shrink those embedded configs a bit. Signed-off-by: Alexey Dobriyan Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/YglbIFyN+OtwVyjW@localhost.localdomain --- fs/binfmt_elf.c | 2 ++ fs/binfmt_elf_fdpic.c | 2 +- fs/binfmt_flat.c | 2 ++ include/linux/binfmts.h | 2 ++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 65ea5381c5c7..d0c1703b9576 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -101,8 +101,10 @@ static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, .load_shlib = load_elf_library, +#ifdef CONFIG_COREDUMP .core_dump = elf_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, +#endif }; #define BAD_ADDR(x) (unlikely((unsigned long)(x) >= TASK_SIZE)) diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c6f588dc4a9d..7fa6e6632d9d 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -83,8 +83,8 @@ static struct linux_binfmt elf_fdpic_format = { .load_binary = load_elf_fdpic_binary, #ifdef CONFIG_ELF_CORE .core_dump = elf_fdpic_core_dump, -#endif .min_coredump = ELF_EXEC_PAGESIZE, +#endif }; static int __init init_elf_fdpic_binfmt(void) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5d776f80ee50..5f0bf24bb3b8 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -102,8 +102,10 @@ static int flat_core_dump(struct coredump_params *cprm); static struct linux_binfmt flat_format = { .module = THIS_MODULE, .load_binary = load_flat_binary, +#ifdef CONFIG_COREDUMP .core_dump = flat_core_dump, .min_coredump = PAGE_SIZE +#endif }; /****************************************************************************/ diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 049cf9421d83..5d651c219c99 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -98,8 +98,10 @@ struct linux_binfmt { struct module *module; int (*load_binary)(struct linux_binprm *); int (*load_shlib)(struct file *); +#ifdef CONFIG_COREDUMP int (*core_dump)(struct coredump_params *cprm); unsigned long min_coredump; /* minimal dump size */ +#endif } __randomize_layout; extern void __register_binfmt(struct linux_binfmt *fmt, int insert); From 0da1d5002745cdc721bc018b582a8a9704d56c42 Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Thu, 27 Jan 2022 21:40:16 +0900 Subject: [PATCH 15/30] fs/binfmt_elf: Fix AT_PHDR for unusual ELF files BugLink: https://bugzilla.kernel.org/show_bug.cgi?id=197921 As pointed out in the discussion of buglink, we cannot calculate AT_PHDR as the sum of load_addr and exec->e_phoff. : The AT_PHDR of ELF auxiliary vectors should point to the memory address : of program header. But binfmt_elf.c calculates this address as follows: : : NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); : : which is wrong since e_phoff is the file offset of program header and : load_addr is the memory base address from PT_LOAD entry. : : The ld.so uses AT_PHDR as the memory address of program header. In normal : case, since the e_phoff is usually 64 and in the first PT_LOAD region, it : is the correct program header address. : : But if the address of program header isn't equal to the first PT_LOAD : address + e_phoff (e.g. Put the program header in other non-consecutive : PT_LOAD region), ld.so will try to read program header from wrong address : then crash or use incorrect program header. This is because exec->e_phoff is the offset of PHDRs in the file and the address of PHDRs in the memory may differ from it. This patch fixes the bug by calculating the address of program headers from PT_LOADs directly. Signed-off-by: Akira Kawata Reported-by: kernel test robot Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220127124014.338760-2-akirakawata1@gmail.com --- fs/binfmt_elf.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index d0c1703b9576..600cec2173c8 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -172,8 +172,8 @@ static int padzero(unsigned long elf_bss) static int create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, - unsigned long load_addr, unsigned long interp_load_addr, - unsigned long e_entry) + unsigned long interp_load_addr, + unsigned long e_entry, unsigned long phdr_addr) { struct mm_struct *mm = current->mm; unsigned long p = bprm->p; @@ -259,7 +259,7 @@ create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec, NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP); NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE); NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC); - NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff); + NEW_AUX_ENT(AT_PHDR, phdr_addr); NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr)); NEW_AUX_ENT(AT_PHNUM, exec->e_phnum); NEW_AUX_ENT(AT_BASE, interp_load_addr); @@ -824,7 +824,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr = 0, load_bias = 0; + unsigned long load_addr, load_bias = 0, phdr_addr = 0; int load_addr_set = 0; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; @@ -1181,6 +1181,17 @@ out_free_interp: reloc_func_desc = load_bias; } } + + /* + * Figure out which segment in the file contains the Program + * Header table, and map to the associated memory address. + */ + if (elf_ppnt->p_offset <= elf_ex->e_phoff && + elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) { + phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset + + elf_ppnt->p_vaddr; + } + k = elf_ppnt->p_vaddr; if ((elf_ppnt->p_flags & PF_X) && k < start_code) start_code = k; @@ -1216,6 +1227,7 @@ out_free_interp: } e_entry = elf_ex->e_entry + load_bias; + phdr_addr += load_bias; elf_bss += load_bias; elf_brk += load_bias; start_code += load_bias; @@ -1279,8 +1291,8 @@ out_free_interp: goto out; #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ - retval = create_elf_tables(bprm, elf_ex, - load_addr, interp_load_addr, e_entry); + retval = create_elf_tables(bprm, elf_ex, interp_load_addr, + e_entry, phdr_addr); if (retval < 0) goto out; From 2b4bfbe0967697c9b5de83dbaf5b49db4b367ec0 Mon Sep 17 00:00:00 2001 From: Akira Kawata Date: Thu, 27 Jan 2022 21:40:17 +0900 Subject: [PATCH 16/30] fs/binfmt_elf: Refactor load_elf_binary function I delete load_addr because it is not used anymore. And I rename load_addr_set to first_pt_load because it is used only to capture the first iteration of the loop. Signed-off-by: Akira Kawata Acked-by: Kees Cook Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220127124014.338760-3-akirakawata1@gmail.com --- fs/binfmt_elf.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 600cec2173c8..4c5a2175f605 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -824,8 +824,8 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, static int load_elf_binary(struct linux_binprm *bprm) { struct file *interpreter = NULL; /* to shut gcc up */ - unsigned long load_addr, load_bias = 0, phdr_addr = 0; - int load_addr_set = 0; + unsigned long load_bias = 0, phdr_addr = 0; + int first_pt_load = 1; unsigned long error; struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; @@ -1075,12 +1075,12 @@ out_free_interp: vaddr = elf_ppnt->p_vaddr; /* - * The first time through the loop, load_addr_set is false: + * The first time through the loop, first_pt_load is true: * layout will be calculated. Once set, use MAP_FIXED since * we know we've already safely mapped the entire region with * MAP_FIXED_NOREPLACE in the once-per-binary logic following. */ - if (load_addr_set) { + if (!first_pt_load) { elf_flags |= MAP_FIXED; } else if (elf_ex->e_type == ET_EXEC) { /* @@ -1171,13 +1171,11 @@ out_free_interp: goto out_free_dentry; } - if (!load_addr_set) { - load_addr_set = 1; - load_addr = (elf_ppnt->p_vaddr - elf_ppnt->p_offset); + if (first_pt_load) { + first_pt_load = 0; if (elf_ex->e_type == ET_DYN) { load_bias += error - ELF_PAGESTART(load_bias + vaddr); - load_addr += load_bias; reloc_func_desc = load_bias; } } From b452722e6ff39a1b8111d5b36b8562aaead1726e Mon Sep 17 00:00:00 2001 From: Tom Rix Date: Fri, 11 Feb 2022 08:09:40 -0800 Subject: [PATCH 17/30] exec: cleanup comments Remove the second 'from'. Replace 'backwords' with 'backwards'. Replace 'visibile' with 'visible'. Signed-off-by: Tom Rix Acked-by: Randy Dunlap Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220211160940.2516243-1-trix@redhat.com --- fs/exec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 40b1008fb0f7..8256e8bb9ad3 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -118,7 +118,7 @@ bool path_noexec(const struct path *path) * Note that a shared library must be both readable and executable due to * security reasons. * - * Also note that we take the address to load from from the file itself. + * Also note that we take the address to load from the file itself. */ SYSCALL_DEFINE1(uselib, const char __user *, library) { @@ -542,7 +542,7 @@ static int copy_strings(int argc, struct user_arg_ptr argv, if (!valid_arg_len(bprm, len)) goto out; - /* We're going to work our way backwords. */ + /* We're going to work our way backwards. */ pos = bprm->p; str += len; bprm->p -= len; @@ -1275,7 +1275,7 @@ int begin_new_exec(struct linux_binprm * bprm) /* * Must be called _before_ exec_mmap() as bprm->mm is - * not visibile until then. This also enables the update + * not visible until then. This also enables the update * to be lockless. */ retval = set_mm_exe_file(bprm->mm, bprm->file); From 7dc6ea7c56bd76ab88d09504a90879479e3851ef Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 23 Feb 2022 20:38:53 -0800 Subject: [PATCH 18/30] MAINTAINERS: Update execve entry with more details The UAPI elf.h header was missed in the original MAINTAINER entry. Add it. Include linux-mm mailing list since that's where execve has traditionally been discussed. Note that this area is Supported, and aim at the git tree. Cc: Eric Biederman Signed-off-by: Kees Cook --- MAINTAINERS | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 69a2935daf6c..2a4de0619c49 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7225,6 +7225,9 @@ F: net/core/of_net.c EXEC & BINFMT API R: Eric Biederman R: Kees Cook +L: linux-mm@kvack.org +S: Supported +T: git git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git for-next/execve F: arch/alpha/kernel/binfmt_loader.c F: arch/x86/ia32/ia32_aout.c F: fs/*binfmt_*.c @@ -7232,6 +7235,7 @@ F: fs/exec.c F: include/linux/binfmts.h F: include/linux/elf.h F: include/uapi/linux/binfmts.h +F: include/uapi/linux/elf.h F: tools/testing/selftests/exec/ N: asm/elf.h N: binfmt From 4f0bfdfd8323e5b461fc1042143a1097dba9fced Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 23 Feb 2022 19:34:31 -0800 Subject: [PATCH 19/30] ELF: Properly redefine PT_GNU_* in terms of PT_LOOS The PT_GNU_* program header types are actually offsets from PT_LOOS, so redefine them as such, reorder them, and add the missing PT_GNU_RELRO. Cc: Eric Biederman Cc: Peter Collingbourne Cc: Catalin Marinas Cc: Dave Martin Signed-off-by: Kees Cook --- include/uapi/linux/elf.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/uapi/linux/elf.h b/include/uapi/linux/elf.h index 61bf4774b8f2..6438d55529bf 100644 --- a/include/uapi/linux/elf.h +++ b/include/uapi/linux/elf.h @@ -35,10 +35,11 @@ typedef __s64 Elf64_Sxword; #define PT_HIOS 0x6fffffff /* OS-specific */ #define PT_LOPROC 0x70000000 #define PT_HIPROC 0x7fffffff -#define PT_GNU_EH_FRAME 0x6474e550 -#define PT_GNU_PROPERTY 0x6474e553 - +#define PT_GNU_EH_FRAME (PT_LOOS + 0x474e550) #define PT_GNU_STACK (PT_LOOS + 0x474e551) +#define PT_GNU_RELRO (PT_LOOS + 0x474e552) +#define PT_GNU_PROPERTY (PT_LOOS + 0x474e553) + /* * Extended Numbering From 9e1a3ce0a952450a1163cc93ab1df6d4fa8c8155 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Wed, 23 Feb 2022 19:32:10 -0800 Subject: [PATCH 20/30] binfmt_elf: Introduce KUnit test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds simple KUnit test for some binfmt_elf internals: specifically a regression test for the problem fixed by commit 8904d9cd90ee ("ELF: fix overflow in total mapping size calculation"). $ ./tools/testing/kunit/kunit.py run --arch x86_64 \ --kconfig_add CONFIG_IA32_EMULATION=y '*binfmt_elf' ... [19:41:08] ================== binfmt_elf (1 subtest) ================== [19:41:08] [PASSED] total_mapping_size_test [19:41:08] =================== [PASSED] binfmt_elf ==================== [19:41:08] ============== compat_binfmt_elf (1 subtest) =============== [19:41:08] [PASSED] total_mapping_size_test [19:41:08] ================ [PASSED] compat_binfmt_elf ================ [19:41:08] ============================================================ [19:41:08] Testing complete. Passed: 2, Failed: 0, Crashed: 0, Skipped: 0, Errors: 0 Cc: Eric Biederman Cc: David Gow Cc: Alexey Dobriyan Cc: "Magnus Groß" Cc: kunit-dev@googlegroups.com Cc: linux-fsdevel@vger.kernel.org Signed-off-by: Kees Cook --- v1: https://lore.kernel.org/lkml/20220224054332.1852813-1-keescook@chromium.org v2: - improve commit log - fix comment URL (Daniel) - drop redundant KUnit Kconfig help info (Daniel) - note in Kconfig help that COMPAT builds add a compat test (David) --- fs/Kconfig.binfmt | 10 +++++++ fs/binfmt_elf.c | 4 +++ fs/binfmt_elf_test.c | 64 ++++++++++++++++++++++++++++++++++++++++++ fs/compat_binfmt_elf.c | 2 ++ 4 files changed, 80 insertions(+) create mode 100644 fs/binfmt_elf_test.c diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index 4d5ae61580aa..f7065e825b7f 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -28,6 +28,16 @@ config BINFMT_ELF ld.so (check the file for location and latest version). +config BINFMT_ELF_KUNIT_TEST + bool "Build KUnit tests for ELF binary support" if !KUNIT_ALL_TESTS + depends on KUNIT=y && BINFMT_ELF=y + default KUNIT_ALL_TESTS + help + This builds the ELF loader KUnit tests, which try to gather + prior bug fixes into a regression test collection. This is really + only needed for debugging. Note that with CONFIG_COMPAT=y, the + compat_binfmt_elf KUnit test is also created. + config COMPAT_BINFMT_ELF def_bool y depends on COMPAT && BINFMT_ELF diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 4c5a2175f605..eaf39b1bdbbb 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2346,3 +2346,7 @@ static void __exit exit_elf_binfmt(void) core_initcall(init_elf_binfmt); module_exit(exit_elf_binfmt); MODULE_LICENSE("GPL"); + +#ifdef CONFIG_BINFMT_ELF_KUNIT_TEST +#include "binfmt_elf_test.c" +#endif diff --git a/fs/binfmt_elf_test.c b/fs/binfmt_elf_test.c new file mode 100644 index 000000000000..11d734fec366 --- /dev/null +++ b/fs/binfmt_elf_test.c @@ -0,0 +1,64 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include + +static void total_mapping_size_test(struct kunit *test) +{ + struct elf_phdr empty[] = { + { .p_type = PT_LOAD, .p_vaddr = 0, .p_memsz = 0, }, + { .p_type = PT_INTERP, .p_vaddr = 10, .p_memsz = 999999, }, + }; + /* + * readelf -lW /bin/mount | grep '^ .*0x0' | awk '{print "\t\t{ .p_type = PT_" \ + * $1 ", .p_vaddr = " $3 ", .p_memsz = " $6 ", },"}' + */ + struct elf_phdr mount[] = { + { .p_type = PT_PHDR, .p_vaddr = 0x00000040, .p_memsz = 0x0002d8, }, + { .p_type = PT_INTERP, .p_vaddr = 0x00000318, .p_memsz = 0x00001c, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, }, + { .p_type = PT_DYNAMIC, .p_vaddr = 0x0000d928, .p_memsz = 0x000200, }, + { .p_type = PT_NOTE, .p_vaddr = 0x00000338, .p_memsz = 0x000030, }, + { .p_type = PT_NOTE, .p_vaddr = 0x00000368, .p_memsz = 0x000044, }, + { .p_type = PT_GNU_PROPERTY, .p_vaddr = 0x00000338, .p_memsz = 0x000030, }, + { .p_type = PT_GNU_EH_FRAME, .p_vaddr = 0x0000b490, .p_memsz = 0x0001ec, }, + { .p_type = PT_GNU_STACK, .p_vaddr = 0x00000000, .p_memsz = 0x000000, }, + { .p_type = PT_GNU_RELRO, .p_vaddr = 0x0000d330, .p_memsz = 0x000cd0, }, + }; + size_t mount_size = 0xE070; + /* https://lore.kernel.org/linux-fsdevel/YfF18Dy85mCntXrx@fractal.localdomain */ + struct elf_phdr unordered[] = { + { .p_type = PT_LOAD, .p_vaddr = 0x00000000, .p_memsz = 0x0033a8, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000d330, .p_memsz = 0x000d40, }, + { .p_type = PT_LOAD, .p_vaddr = 0x00004000, .p_memsz = 0x005c91, }, + { .p_type = PT_LOAD, .p_vaddr = 0x0000a000, .p_memsz = 0x0022f8, }, + }; + + /* No headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(NULL, 0), 0); + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 0), 0); + /* Empty headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 1), 0); + /* No PT_LOAD headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(&empty[1], 1), 0); + /* Empty PT_LOAD and non-PT_LOAD headers, no size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(empty, 2), 0); + + /* Normal set of PT_LOADS, and expected size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(mount, ARRAY_SIZE(mount)), mount_size); + /* Unordered PT_LOADs result in same size. */ + KUNIT_EXPECT_EQ(test, total_mapping_size(unordered, ARRAY_SIZE(unordered)), mount_size); +} + +static struct kunit_case binfmt_elf_test_cases[] = { + KUNIT_CASE(total_mapping_size_test), + {}, +}; + +static struct kunit_suite binfmt_elf_test_suite = { + .name = KBUILD_MODNAME, + .test_cases = binfmt_elf_test_cases, +}; + +kunit_test_suite(binfmt_elf_test_suite); diff --git a/fs/compat_binfmt_elf.c b/fs/compat_binfmt_elf.c index 95e72d271b95..8f0af4f62631 100644 --- a/fs/compat_binfmt_elf.c +++ b/fs/compat_binfmt_elf.c @@ -135,6 +135,8 @@ #define elf_format compat_elf_format #define init_elf_binfmt init_compat_elf_binfmt #define exit_elf_binfmt exit_compat_elf_binfmt +#define binfmt_elf_test_cases compat_binfmt_elf_test_cases +#define binfmt_elf_test_suite compat_binfmt_elf_test_suite /* * We share all the actual code with the native (64-bit) version. From a99a3e2efaf1f4454eb5c9176f47e66de075b134 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jan 2022 11:50:46 -0600 Subject: [PATCH 21/30] coredump: Move definition of struct coredump_params into coredump.h Move the definition of struct coredump_params into coredump.h where it belongs. Remove the slightly errorneous comment explaining why struct coredump_params was declared in binfmts.h. Signed-off-by: "Eric W. Biederman" --- fs/binfmt_flat.c | 1 + include/linux/binfmts.h | 13 +------------ include/linux/coredump.h | 12 +++++++++++- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 5f0bf24bb3b8..208cdce16de1 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include diff --git a/include/linux/binfmts.h b/include/linux/binfmts.h index 5d651c219c99..3dc20c4f394c 100644 --- a/include/linux/binfmts.h +++ b/include/linux/binfmts.h @@ -8,6 +8,7 @@ #include struct filename; +struct coredump_params; #define CORENAME_MAX_SIZE 128 @@ -77,18 +78,6 @@ struct linux_binprm { #define BINPRM_FLAGS_PRESERVE_ARGV0_BIT 3 #define BINPRM_FLAGS_PRESERVE_ARGV0 (1 << BINPRM_FLAGS_PRESERVE_ARGV0_BIT) -/* Function parameter for binfmt->coredump */ -struct coredump_params { - const kernel_siginfo_t *siginfo; - struct pt_regs *regs; - struct file *file; - unsigned long limit; - unsigned long mm_flags; - loff_t written; - loff_t pos; - loff_t to_skip; -}; - /* * This structure defines the functions that are used to load the binary formats that * linux accepts. diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 248a68c668b4..2ee1460a1d66 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -14,11 +14,21 @@ struct core_vma_metadata { unsigned long dump_size; }; +struct coredump_params { + const kernel_siginfo_t *siginfo; + struct pt_regs *regs; + struct file *file; + unsigned long limit; + unsigned long mm_flags; + loff_t written; + loff_t pos; + loff_t to_skip; +}; + /* * These are the only things you should do on a core-file: use only these * functions to write out all the necessary info. */ -struct coredump_params; extern void dump_skip_to(struct coredump_params *cprm, unsigned long to); extern void dump_skip(struct coredump_params *cprm, size_t nr); extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); From 95c5436a4883841588dae86fb0b9325f47ba5ad3 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 8 Mar 2022 12:55:29 -0600 Subject: [PATCH 22/30] coredump: Snapshot the vmas in do_coredump Move the call of dump_vma_snapshot and kvfree(vma_meta) out of the individual coredump routines into do_coredump itself. This makes the code less error prone and easier to maintain. Make the vma snapshot available to the coredump routines in struct coredump_params. This makes it easier to change and update what is captures in the vma snapshot and will be needed for fixing fill_file_notes. Reviewed-by: Jann Horn Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- fs/binfmt_elf.c | 20 +++++++------------- fs/binfmt_elf_fdpic.c | 18 ++++++------------ fs/coredump.c | 41 ++++++++++++++++++++++------------------ include/linux/coredump.h | 6 +++--- 4 files changed, 39 insertions(+), 46 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index eaf39b1bdbbb..5710467cf4b6 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2191,8 +2191,7 @@ static void fill_extnum_info(struct elfhdr *elf, struct elf_shdr *shdr4extnum, static int elf_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs, i; - size_t vma_data_size; + int segs, i; struct elfhdr elf; loff_t offset = 0, dataoff; struct elf_note_info info = { }; @@ -2200,16 +2199,12 @@ static int elf_core_dump(struct coredump_params *cprm) struct elf_shdr *shdr4extnum = NULL; Elf_Half e_phnum; elf_addr_t e_shoff; - struct core_vma_metadata *vma_meta; - - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - return 0; /* * The number of segs are recored into ELF header as 16bit value. * Please check DEFAULT_MAX_MAP_COUNT definition when you modify here. */ - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -2248,7 +2243,7 @@ static int elf_core_dump(struct coredump_params *cprm) dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -2268,8 +2263,8 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; phdr.p_type = PT_LOAD; @@ -2306,8 +2301,8 @@ static int elf_core_dump(struct coredump_params *cprm) /* Align to page */ dump_skip_to(cprm, dataoff); - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; @@ -2324,7 +2319,6 @@ static int elf_core_dump(struct coredump_params *cprm) end_coredump: free_note_info(&info); kfree(shdr4extnum); - kvfree(vma_meta); kfree(phdr4note); return has_dumped; } diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 7fa6e6632d9d..08d0c8797828 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1465,7 +1465,7 @@ static bool elf_fdpic_dump_segments(struct coredump_params *cprm, static int elf_fdpic_core_dump(struct coredump_params *cprm) { int has_dumped = 0; - int vma_count, segs; + int segs; int i; struct elfhdr *elf = NULL; loff_t offset = 0, dataoff; @@ -1480,8 +1480,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) elf_addr_t e_shoff; struct core_thread *ct; struct elf_thread_status *tmp; - struct core_vma_metadata *vma_meta = NULL; - size_t vma_data_size; /* alloc memory for large data structures: too large to be on stack */ elf = kmalloc(sizeof(*elf), GFP_KERNEL); @@ -1491,9 +1489,6 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) if (!psinfo) goto end_coredump; - if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size)) - goto end_coredump; - for (ct = current->signal->core_state->dumper.next; ct; ct = ct->next) { tmp = elf_dump_thread_status(cprm->siginfo->si_signo, @@ -1513,7 +1508,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) tmp->next = thread_list; thread_list = tmp; - segs = vma_count + elf_core_extra_phdrs(); + segs = cprm->vma_count + elf_core_extra_phdrs(); /* for notes section */ segs++; @@ -1558,7 +1553,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) /* Page-align dumped data */ dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE); - offset += vma_data_size; + offset += cprm->vma_data_size; offset += elf_core_extra_data_size(); e_shoff = offset; @@ -1578,8 +1573,8 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; /* write program headers for segments dump */ - for (i = 0; i < vma_count; i++) { - struct core_vma_metadata *meta = vma_meta + i; + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *meta = cprm->vma_meta + i; struct elf_phdr phdr; size_t sz; @@ -1628,7 +1623,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) dump_skip_to(cprm, dataoff); - if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) + if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count)) goto end_coredump; if (!elf_core_write_extra_data(cprm)) @@ -1652,7 +1647,6 @@ end_coredump: thread_list = thread_list->next; kfree(tmp); } - kvfree(vma_meta); kfree(phdr4note); kfree(elf); kfree(psinfo); diff --git a/fs/coredump.c b/fs/coredump.c index b73817712dd2..88ac67994d03 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -54,6 +54,8 @@ #include +static bool dump_vma_snapshot(struct coredump_params *cprm); + static int core_uses_pid; static unsigned int core_pipe_limit; static char core_pattern[CORENAME_MAX_SIZE] = "core"; @@ -532,6 +534,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) * by any locks. */ .mm_flags = mm->flags, + .vma_meta = NULL, }; audit_core_dumps(siginfo->si_signo); @@ -746,6 +749,9 @@ void do_coredump(const kernel_siginfo_t *siginfo) pr_info("Core dump to |%s disabled\n", cn.corename); goto close_fail; } + if (!dump_vma_snapshot(&cprm)) + goto close_fail; + file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); /* @@ -759,6 +765,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) dump_emit(&cprm, "", 1); } file_end_write(cprm.file); + kvfree(cprm.vma_meta); } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -1096,14 +1103,11 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. */ -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr) +static bool dump_vma_snapshot(struct coredump_params *cprm) { struct vm_area_struct *vma, *gate_vma; struct mm_struct *mm = current->mm; int i; - size_t vma_data_size = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1111,20 +1115,21 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, * mmap_lock in read mode. */ if (mmap_write_lock_killable(mm)) - return -EINTR; + return false; + cprm->vma_data_size = 0; gate_vma = get_gate_vma(mm); - *vma_count = mm->map_count + (gate_vma ? 1 : 0); + cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0); - *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL); - if (!*vma_meta) { + cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL); + if (!cprm->vma_meta) { mmap_write_unlock(mm); - return -ENOMEM; + return false; } for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; vma = next_vma(vma, gate_vma), i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; m->end = vma->vm_end; @@ -1134,13 +1139,14 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, mmap_write_unlock(mm); - if (WARN_ON(i != *vma_count)) { - kvfree(*vma_meta); - return -EFAULT; + if (WARN_ON(i != cprm->vma_count)) { + kvfree(cprm->vma_meta); + return false; } - for (i = 0; i < *vma_count; i++) { - struct core_vma_metadata *m = (*vma_meta) + i; + + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = cprm->vma_meta + i; if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) { char elfmag[SELFMAG]; @@ -1153,9 +1159,8 @@ int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, } } - vma_data_size += m->dump_size; + cprm->vma_data_size += m->dump_size; } - *vma_data_size_ptr = vma_data_size; - return 0; + return true; } diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 2ee1460a1d66..7d05370e555e 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -23,6 +23,9 @@ struct coredump_params { loff_t written; loff_t pos; loff_t to_skip; + int vma_count; + size_t vma_data_size; + struct core_vma_metadata *vma_meta; }; /* @@ -35,9 +38,6 @@ extern int dump_emit(struct coredump_params *cprm, const void *addr, int nr); extern int dump_align(struct coredump_params *cprm, int align); int dump_user_range(struct coredump_params *cprm, unsigned long start, unsigned long len); -int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count, - struct core_vma_metadata **vma_meta, - size_t *vma_data_size_ptr); extern void do_coredump(const kernel_siginfo_t *siginfo); #else static inline void do_coredump(const kernel_siginfo_t *siginfo) {} From 49c1866348f364478a0c4d3dd13fd08bb82d3a5b Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 8 Mar 2022 13:01:19 -0600 Subject: [PATCH 23/30] coredump: Remove the WARN_ON in dump_vma_snapshot The condition is impossible and to the best of my knowledge has never triggered. We are in deep trouble if that conditions happens and we walk past the end of our allocated array. So delete the WARN_ON and the code that makes it look like the kernel can handle the case of walking past the end of it's vma_meta array. Reviewed-by: Jann Horn Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- fs/coredump.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 88ac67994d03..7f100a637264 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1139,12 +1139,6 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) mmap_write_unlock(mm); - if (WARN_ON(i != cprm->vma_count)) { - kvfree(cprm->vma_meta); - return false; - } - - for (i = 0; i < cprm->vma_count; i++) { struct core_vma_metadata *m = cprm->vma_meta + i; From 9ec7d3230717b4fe9b6c7afeb4811909c23fa1d7 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Mon, 31 Jan 2022 12:17:38 -0600 Subject: [PATCH 24/30] coredump/elf: Pass coredump_params into fill_note_info Instead of individually passing cprm->siginfo and cprm->regs into fill_note_info pass all of struct coredump_params. This is preparation to allow fill_files_note to use the existing vma snapshot. Reviewed-by: Jann Horn Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- fs/binfmt_elf.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 5710467cf4b6..7f0c391832cf 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1822,7 +1822,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct task_struct *dump_task = current; const struct user_regset_view *view = task_user_regset_view(dump_task); @@ -1894,7 +1894,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Now fill in each thread's information. */ for (t = info->thread; t != NULL; t = t->next) - if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size)) + if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, &info->size)) return 0; /* @@ -1903,7 +1903,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm); info->size += notesize(&info->psinfo); - fill_siginfo_note(&info->signote, &info->csigdata, siginfo); + fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo); info->size += notesize(&info->signote); fill_auxv_note(&info->auxv, current->mm); @@ -2051,7 +2051,7 @@ static int elf_note_info_init(struct elf_note_info *info) static int fill_note_info(struct elfhdr *elf, int phdrs, struct elf_note_info *info, - const kernel_siginfo_t *siginfo, struct pt_regs *regs) + struct coredump_params *cprm) { struct core_thread *ct; struct elf_thread_status *ets; @@ -2072,13 +2072,13 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, list_for_each_entry(ets, &info->thread_list, list) { int sz; - sz = elf_dump_thread_status(siginfo->si_signo, ets); + sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets); info->thread_status_size += sz; } /* now collect the dump for the current */ memset(info->prstatus, 0, sizeof(*info->prstatus)); - fill_prstatus(&info->prstatus->common, current, siginfo->si_signo); - elf_core_copy_regs(&info->prstatus->pr_reg, regs); + fill_prstatus(&info->prstatus->common, current, cprm->siginfo->si_signo); + elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs); /* Set up header */ fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS); @@ -2094,7 +2094,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_note(info->notes + 1, "CORE", NT_PRPSINFO, sizeof(*info->psinfo), info->psinfo); - fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo); + fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo); fill_auxv_note(info->notes + 3, current->mm); info->numnote = 4; @@ -2104,8 +2104,8 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, } /* Try to dump the FPU. */ - info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs, - info->fpu); + info->prstatus->pr_fpvalid = + elf_core_copy_task_fpregs(current, cprm->regs, info->fpu); if (info->prstatus->pr_fpvalid) fill_note(info->notes + info->numnote++, "CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu); @@ -2218,7 +2218,7 @@ static int elf_core_dump(struct coredump_params *cprm) * Collect all the non-memory information about the process for the * notes. This also sets up the file header. */ - if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs)) + if (!fill_note_info(&elf, e_phnum, &info, cprm)) goto end_coredump; has_dumped = 1; From 390031c942116d4733310f0684beb8db19885fe6 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Tue, 8 Mar 2022 13:04:19 -0600 Subject: [PATCH 25/30] coredump: Use the vma snapshot in fill_files_note Matthew Wilcox reported that there is a missing mmap_lock in file_files_note that could possibly lead to a user after free. Solve this by using the existing vma snapshot for consistency and to avoid the need to take the mmap_lock anywhere in the coredump code except for dump_vma_snapshot. Update the dump_vma_snapshot to capture vm_pgoff and vm_file that are neeeded by fill_files_note. Add free_vma_snapshot to free the captured values of vm_file. Reported-by: Matthew Wilcox Link: https://lkml.kernel.org/r/20220131153740.2396974-1-willy@infradead.org Cc: stable@vger.kernel.org Fixes: a07279c9a8cd ("binfmt_elf, binfmt_elf_fdpic: use a VMA list snapshot") Fixes: 2aa362c49c31 ("coredump: extend core dump note section to contain file names of mapped files") Reviewed-by: Kees Cook Signed-off-by: "Eric W. Biederman" --- fs/binfmt_elf.c | 24 ++++++++++++------------ fs/coredump.c | 22 +++++++++++++++++++++- include/linux/coredump.h | 2 ++ 3 files changed, 35 insertions(+), 13 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 7f0c391832cf..ca5296cae979 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1641,17 +1641,16 @@ static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, * long file_ofs * followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL... */ -static int fill_files_note(struct memelfnote *note) +static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm) { - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; unsigned count, size, names_ofs, remaining, n; user_long_t *data; user_long_t *start_end_ofs; char *name_base, *name_curpos; + int i; /* *Estimated* file count and total data size needed */ - count = mm->map_count; + count = cprm->vma_count; if (count > UINT_MAX / 64) return -EINVAL; size = count * 64; @@ -1673,11 +1672,12 @@ static int fill_files_note(struct memelfnote *note) name_base = name_curpos = ((char *)data) + names_ofs; remaining = size - names_ofs; count = 0; - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for (i = 0; i < cprm->vma_count; i++) { + struct core_vma_metadata *m = &cprm->vma_meta[i]; struct file *file; const char *filename; - file = vma->vm_file; + file = m->file; if (!file) continue; filename = file_path(file, name_curpos, remaining); @@ -1697,9 +1697,9 @@ static int fill_files_note(struct memelfnote *note) memmove(name_curpos, filename, n); name_curpos += n; - *start_end_ofs++ = vma->vm_start; - *start_end_ofs++ = vma->vm_end; - *start_end_ofs++ = vma->vm_pgoff; + *start_end_ofs++ = m->start; + *start_end_ofs++ = m->end; + *start_end_ofs++ = m->pgoff; count++; } @@ -1710,7 +1710,7 @@ static int fill_files_note(struct memelfnote *note) * Count usually is less than mm->map_count, * we need to move filenames down. */ - n = mm->map_count - count; + n = cprm->vma_count - count; if (n != 0) { unsigned shift_bytes = n * 3 * sizeof(data[0]); memmove(name_base - shift_bytes, name_base, @@ -1909,7 +1909,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_auxv_note(&info->auxv, current->mm); info->size += notesize(&info->auxv); - if (fill_files_note(&info->files) == 0) + if (fill_files_note(&info->files, cprm) == 0) info->size += notesize(&info->files); return 1; @@ -2098,7 +2098,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, fill_auxv_note(info->notes + 3, current->mm); info->numnote = 4; - if (fill_files_note(info->notes + info->numnote) == 0) { + if (fill_files_note(info->notes + info->numnote, cprm) == 0) { info->notes_files = info->notes + info->numnote; info->numnote++; } diff --git a/fs/coredump.c b/fs/coredump.c index 7f100a637264..7ed7d601e5e0 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -55,6 +55,7 @@ #include static bool dump_vma_snapshot(struct coredump_params *cprm); +static void free_vma_snapshot(struct coredump_params *cprm); static int core_uses_pid; static unsigned int core_pipe_limit; @@ -765,7 +766,7 @@ void do_coredump(const kernel_siginfo_t *siginfo) dump_emit(&cprm, "", 1); } file_end_write(cprm.file); - kvfree(cprm.vma_meta); + free_vma_snapshot(&cprm); } if (ispipe && core_pipe_limit) wait_for_dump_helpers(cprm.file); @@ -1099,6 +1100,20 @@ static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, return gate_vma; } +static void free_vma_snapshot(struct coredump_params *cprm) +{ + if (cprm->vma_meta) { + int i; + for (i = 0; i < cprm->vma_count; i++) { + struct file *file = cprm->vma_meta[i].file; + if (file) + fput(file); + } + kvfree(cprm->vma_meta); + cprm->vma_meta = NULL; + } +} + /* * Under the mmap_lock, take a snapshot of relevant information about the task's * VMAs. @@ -1135,6 +1150,11 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) m->end = vma->vm_end; m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); + m->pgoff = vma->vm_pgoff; + + m->file = vma->vm_file; + if (m->file) + get_file(m->file); } mmap_write_unlock(mm); diff --git a/include/linux/coredump.h b/include/linux/coredump.h index 7d05370e555e..08a1d3e7e46d 100644 --- a/include/linux/coredump.h +++ b/include/linux/coredump.h @@ -12,6 +12,8 @@ struct core_vma_metadata { unsigned long start, end; unsigned long flags; unsigned long dump_size; + unsigned long pgoff; + struct file *file; }; struct coredump_params { From f833116ad2c3eabf9c739946170e07825cca67ed Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 Mar 2022 10:37:07 -0600 Subject: [PATCH 26/30] coredump: Don't compile flat_core_dump when coredumps are disabled Recently the kernel test robot reported: > In file included from include/linux/kernel.h:29, > from fs/binfmt_flat.c:21: > fs/binfmt_flat.c: In function 'flat_core_dump': > >> fs/binfmt_flat.c:121:50: error: invalid use of undefined type 'struct coredump_params' > 121 | current->comm, current->pid, cprm->siginfo->si_signo); > | ^~ > include/linux/printk.h:418:33: note: in definition of macro 'printk_index_wrap' > 418 | _p_func(_fmt, ##__VA_ARGS__); \ > | ^~~~~~~~~~~ > include/linux/printk.h:499:9: note: in expansion of macro 'printk' > 499 | printk(KERN_WARNING pr_fmt(fmt), ##__VA_ARGS__) > | ^~~~~~ > fs/binfmt_flat.c:120:9: note: in expansion of macro 'pr_warn' > 120 | pr_warn("Process %s:%d received signr %d and should have core dumped\n", > | ^~~~~~~ > At top level: > fs/binfmt_flat.c:118:12: warning: 'flat_core_dump' defined but not used [-Wunused-function] > 118 | static int flat_core_dump(struct coredump_params *cprm) > | ^~~~~~~~~~~~~~ The little dinky do nothing function flat_core_dump has always been compiled unconditionally. With my change to move coredump_params into coredump.h coredump_params reasonably becomes unavailable when coredump support is not compiled in. Fix this old issue by simply not compiling flat_core_dump when coredump support is not supported. Fixes: a99a3e2efaf1 ("coredump: Move definition of struct coredump_params into coredump.h") Reported-by: kernel test robot Signed-off-by: "Eric W. Biederman" --- fs/binfmt_flat.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index 208cdce16de1..626898150011 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -98,7 +98,9 @@ static int load_flat_shared_library(int id, struct lib_info *p); #endif static int load_flat_binary(struct linux_binprm *); +#ifdef CONFIG_COREDUMP static int flat_core_dump(struct coredump_params *cprm); +#endif static struct linux_binfmt flat_format = { .module = THIS_MODULE, @@ -115,12 +117,14 @@ static struct linux_binfmt flat_format = { * Currently only a stub-function. */ +#ifdef CONFIG_COREDUMP static int flat_core_dump(struct coredump_params *cprm) { pr_warn("Process %s:%d received signr %d and should have core dumped\n", current->comm, current->pid, cprm->siginfo->si_signo); return 1; } +#endif /****************************************************************************/ /* From 19e8b701e258701b52b1e6aea99572518d69a2fb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Wed, 9 Mar 2022 14:03:42 -0600 Subject: [PATCH 27/30] a.out: Stop building a.out/osf1 support on alpha and m68k There has been repeated discussion on removing a.out support, the most recent was[1]. Having read through a bunch of the discussion it looks like no one has see any reason why we need to keep a.out support. The m68k maintainer has even come out in favor of removing a.out support[2]. At a practical level with only two rarely used architectures building a.out support, it gets increasingly hard to test and to care about. Which means the code will almost certainly bit-rot. Let's see if anyone cares about a.out support on the last two architectures that build it, by disabling the build of the support in Kconfig. If anyone cares, this can be easily reverted, and we can then have a discussion about what it is going to take to support a.out binaries in the long term. [1] https://lkml.kernel.org/r/20220113160115.5375-1-bp@alien8.de [2] https://lkml.kernel.org/r/CAMuHMdUbTNNr16YY1TFe=-uRLjg6yGzgw_RqtAFpyhnOMM5Pvw@mail.gmail.com Cc: Richard Henderson Cc: Ivan Kokshaysky Cc: Matt Turner Cc: linux-alpha@vger.kernel.org Cc: Geert Uytterhoeven Cc: linux-m68k@lists.linux-m68k.org Signed-off-by: "Eric W. Biederman" Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/87ilsmdhb5.fsf_-_@email.froward.int.ebiederm.org Acked-by: Geert Uytterhoeven Link: https://lore.kernel.org/lkml/CAMuHMdVLyu6LNONJa1QcMGv__bWSCRvVq9haD7=fOm1k5O3Pnw@mail.gmail.com --- arch/alpha/Kconfig | 1 - arch/m68k/Kconfig | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/alpha/Kconfig b/arch/alpha/Kconfig index 4e87783c90ad..14c97acea351 100644 --- a/arch/alpha/Kconfig +++ b/arch/alpha/Kconfig @@ -12,7 +12,6 @@ config ALPHA select FORCE_PCI if !ALPHA_JENSEN select PCI_DOMAINS if PCI select PCI_SYSCALL if PCI - select HAVE_AOUT select HAVE_ASM_MODVERSIONS select HAVE_PCSPKR_PLATFORM select HAVE_PERF_EVENTS diff --git a/arch/m68k/Kconfig b/arch/m68k/Kconfig index 936e1803c7c7..268b3860d40d 100644 --- a/arch/m68k/Kconfig +++ b/arch/m68k/Kconfig @@ -17,7 +17,6 @@ config M68K select GENERIC_CPU_DEVICES select GENERIC_IOMAP select GENERIC_IRQ_SHOW - select HAVE_AOUT if MMU select HAVE_ASM_MODVERSIONS select HAVE_DEBUG_BUGVERBOSE select HAVE_EFFICIENT_UNALIGNED_ACCESS if !CPU_HAS_NO_UNALIGNED From afcf5441b9ff22ac57244cd45ff102ebc2e32d1a Mon Sep 17 00:00:00 2001 From: Dan Li Date: Wed, 2 Mar 2022 23:43:23 -0800 Subject: [PATCH 28/30] arm64: Add gcc Shadow Call Stack support Shadow call stacks will be available in GCC >= 12, this patch makes the corresponding kernel configuration available when compiling the kernel with the gcc. Note that the implementation in GCC is slightly different from Clang. With SCS enabled, functions will only pop x30 once in the epilogue, like: str x30, [x18], #8 stp x29, x30, [sp, #-16]! ...... - ldp x29, x30, [sp], #16 //clang + ldr x29, [sp], #16 //GCC ldr x30, [x18, #-8]! Link: https://gcc.gnu.org/git/?p=gcc.git;a=commit;h=ce09ab17ddd21f73ff2caf6eec3b0ee9b0e1a11e Reviewed-by: Nathan Chancellor Reviewed-by: Kees Cook Reviewed-by: Nick Desaulniers Signed-off-by: Dan Li Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220303074323.86282-1-ashimida@linux.alibaba.com --- arch/Kconfig | 19 ++++++++++--------- arch/arm64/Kconfig | 2 +- include/linux/compiler-gcc.h | 4 ++++ 3 files changed, 15 insertions(+), 10 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index c5b50bfe31c1..cabfac22f2fb 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -599,21 +599,22 @@ config STACKPROTECTOR_STRONG config ARCH_SUPPORTS_SHADOW_CALL_STACK bool help - An architecture should select this if it supports Clang's Shadow - Call Stack and implements runtime support for shadow stack + An architecture should select this if it supports the compiler's + Shadow Call Stack and implements runtime support for shadow stack switching. config SHADOW_CALL_STACK - bool "Clang Shadow Call Stack" - depends on CC_IS_CLANG && ARCH_SUPPORTS_SHADOW_CALL_STACK + bool "Shadow Call Stack" + depends on ARCH_SUPPORTS_SHADOW_CALL_STACK depends on DYNAMIC_FTRACE_WITH_REGS || !FUNCTION_GRAPH_TRACER help - This option enables Clang's Shadow Call Stack, which uses a - shadow stack to protect function return addresses from being - overwritten by an attacker. More information can be found in - Clang's documentation: + This option enables the compiler's Shadow Call Stack, which + uses a shadow stack to protect function return addresses from + being overwritten by an attacker. More information can be found + in the compiler's documentation: - https://clang.llvm.org/docs/ShadowCallStack.html + - Clang: https://clang.llvm.org/docs/ShadowCallStack.html + - GCC: https://gcc.gnu.org/onlinedocs/gcc/Instrumentation-Options.html#Instrumentation-Options Note that security guarantees in the kernel differ from the ones documented for user space. The kernel must store addresses diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b8ab790555c8..a6733b972212 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -1239,7 +1239,7 @@ config HW_PERF_EVENTS config ARCH_HAS_FILTER_PGPROT def_bool y -# Supported by clang >= 7.0 +# Supported by clang >= 7.0 or GCC >= 12.0.0 config CC_HAVE_SHADOW_CALL_STACK def_bool $(cc-option, -fsanitize=shadow-call-stack -ffixed-x18) diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index ccbbd31b3aae..deff5b308470 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -97,6 +97,10 @@ #define KASAN_ABI_VERSION 4 #endif +#ifdef CONFIG_SHADOW_CALL_STACK +#define __noscs __attribute__((__no_sanitize__("shadow-call-stack"))) +#endif + #if __has_attribute(__no_sanitize_address__) #define __no_sanitize_address __attribute__((no_sanitize_address)) #else From 8126b1c73108bc691f5643df19071a59a69d0bc6 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Mon, 14 Mar 2022 19:59:53 +0100 Subject: [PATCH 29/30] pstore: Don't use semaphores in always-atomic-context code pstore_dump() is *always* invoked in atomic context (nowadays in an RCU read-side critical section, before that under a spinlock). It doesn't make sense to try to use semaphores here. This is mostly a revert of commit ea84b580b955 ("pstore: Convert buf_lock to semaphore"), except that two parts aren't restored back exactly as they were: - keep the lock initialization in pstore_register - in efi_pstore_write(), always set the "block" flag to false - omit "is_locked", that was unnecessary since commit 959217c84c27 ("pstore: Actually give up during locking failure") - fix the bailout message The actual problem that the buggy commit was trying to address may have been that the use of preemptible() in efi_pstore_write() was wrong - it only looks at preempt_count() and the state of IRQs, but __rcu_read_lock() doesn't touch either of those under CONFIG_PREEMPT_RCU. (Sidenote: CONFIG_PREEMPT_RCU means that the scheduler can preempt tasks in RCU read-side critical sections, but you're not allowed to actively block/reschedule.) Lockdep probably never caught the problem because it's very rare that you actually hit the contended case, so lockdep always just sees the down_trylock(), not the down_interruptible(), and so it can't tell that there's a problem. Fixes: ea84b580b955 ("pstore: Convert buf_lock to semaphore") Cc: stable@vger.kernel.org Acked-by: Sebastian Andrzej Siewior Signed-off-by: Jann Horn Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220314185953.2068993-1-jannh@google.com --- drivers/firmware/efi/efi-pstore.c | 2 +- fs/pstore/platform.c | 38 +++++++++++++++---------------- include/linux/pstore.h | 6 ++--- 3 files changed, 22 insertions(+), 24 deletions(-) diff --git a/drivers/firmware/efi/efi-pstore.c b/drivers/firmware/efi/efi-pstore.c index 0ef086e43090..7e771c56c13c 100644 --- a/drivers/firmware/efi/efi-pstore.c +++ b/drivers/firmware/efi/efi-pstore.c @@ -266,7 +266,7 @@ static int efi_pstore_write(struct pstore_record *record) efi_name[i] = name[i]; ret = efivar_entry_set_safe(efi_name, vendor, PSTORE_EFI_ATTRIBUTES, - preemptible(), record->size, record->psi->buf); + false, record->size, record->psi->buf); if (record->reason == KMSG_DUMP_OOPS && try_module_get(THIS_MODULE)) if (!schedule_work(&efivar_work)) diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f243cb5e6a4f..e26162f102ff 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -143,21 +143,22 @@ static void pstore_timer_kick(void) mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms)); } -/* - * Should pstore_dump() wait for a concurrent pstore_dump()? If - * not, the current pstore_dump() will report a failure to dump - * and return. - */ -static bool pstore_cannot_wait(enum kmsg_dump_reason reason) +static bool pstore_cannot_block_path(enum kmsg_dump_reason reason) { - /* In NMI path, pstore shouldn't block regardless of reason. */ + /* + * In case of NMI path, pstore shouldn't be blocked + * regardless of reason. + */ if (in_nmi()) return true; switch (reason) { /* In panic case, other cpus are stopped by smp_send_stop(). */ case KMSG_DUMP_PANIC: - /* Emergency restart shouldn't be blocked. */ + /* + * Emergency restart shouldn't be blocked by spinning on + * pstore_info::buf_lock. + */ case KMSG_DUMP_EMERG: return true; default: @@ -389,21 +390,19 @@ static void pstore_dump(struct kmsg_dumper *dumper, unsigned long total = 0; const char *why; unsigned int part = 1; + unsigned long flags = 0; int ret; why = kmsg_dump_reason_str(reason); - if (down_trylock(&psinfo->buf_lock)) { - /* Failed to acquire lock: give up if we cannot wait. */ - if (pstore_cannot_wait(reason)) { - pr_err("dump skipped in %s path: may corrupt error record\n", - in_nmi() ? "NMI" : why); - return; - } - if (down_interruptible(&psinfo->buf_lock)) { - pr_err("could not grab semaphore?!\n"); + if (pstore_cannot_block_path(reason)) { + if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) { + pr_err("dump skipped in %s path because of concurrent dump\n", + in_nmi() ? "NMI" : why); return; } + } else { + spin_lock_irqsave(&psinfo->buf_lock, flags); } kmsg_dump_rewind(&iter); @@ -467,8 +466,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, total += record.size; part++; } - - up(&psinfo->buf_lock); + spin_unlock_irqrestore(&psinfo->buf_lock, flags); } static struct kmsg_dumper pstore_dumper = { @@ -594,7 +592,7 @@ int pstore_register(struct pstore_info *psi) psi->write_user = pstore_write_user_compat; psinfo = psi; mutex_init(&psinfo->read_mutex); - sema_init(&psinfo->buf_lock, 1); + spin_lock_init(&psinfo->buf_lock); if (psi->flags & PSTORE_FLAGS_DMESG) allocate_buf_for_compression(); diff --git a/include/linux/pstore.h b/include/linux/pstore.h index eb93a54cff31..e97a8188f0fd 100644 --- a/include/linux/pstore.h +++ b/include/linux/pstore.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include @@ -87,7 +87,7 @@ struct pstore_record { * @owner: module which is responsible for this backend driver * @name: name of the backend driver * - * @buf_lock: semaphore to serialize access to @buf + * @buf_lock: spinlock to serialize access to @buf * @buf: preallocated crash dump buffer * @bufsize: size of @buf available for crash dump bytes (must match * smallest number of bytes available for writing to a @@ -178,7 +178,7 @@ struct pstore_info { struct module *owner; const char *name; - struct semaphore buf_lock; + spinlock_t buf_lock; char *buf; size_t bufsize; From dd664099002db909912a23215f8775c97f7f4f10 Mon Sep 17 00:00:00 2001 From: Rick Edgecombe Date: Thu, 17 Mar 2022 12:20:13 -0700 Subject: [PATCH 30/30] binfmt_elf: Don't write past end of notes for regset gap In fill_thread_core_info() the ptrace accessible registers are collected to be written out as notes in a core file. The note array is allocated from a size calculated by iterating the user regset view, and counting the regsets that have a non-zero core_note_type. However, this only allows for there to be non-zero core_note_type at the end of the regset view. If there are any gaps in the middle, fill_thread_core_info() will overflow the note allocation, as it iterates over the size of the view and the allocation would be smaller than that. There doesn't appear to be any arch that has gaps such that they exceed the notes allocation, but the code is brittle and tries to support something it doesn't. It could be fixed by increasing the allocation size, but instead just have the note collecting code utilize the array better. This way the allocation can stay smaller. Even in the case of no arch's that have gaps in their regset views, this introduces a change in the resulting indicies of t->notes. It does not introduce any changes to the core file itself, because any blank notes are skipped in write_note_info(). In case, the allocation logic between fill_note_info() and fill_thread_core_info() ever diverges from the usage logic, warn and skip writing any notes that would overflow the array. This fix is derrived from an earlier one[0] by Yu-cheng Yu. [0] https://lore.kernel.org/lkml/20180717162502.32274-1-yu-cheng.yu@intel.com/ Co-developed-by: Yu-cheng Yu Signed-off-by: Yu-cheng Yu Signed-off-by: Rick Edgecombe Signed-off-by: Kees Cook Link: https://lore.kernel.org/r/20220317192013.13655-4-rick.p.edgecombe@intel.com --- fs/binfmt_elf.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index ca5296cae979..37d9c455d535 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -1766,9 +1766,9 @@ static void do_thread_regset_writeback(struct task_struct *task, static int fill_thread_core_info(struct elf_thread_core_info *t, const struct user_regset_view *view, - long signr, size_t *total) + long signr, struct elf_note_info *info) { - unsigned int i; + unsigned int note_iter, view_iter; /* * NT_PRSTATUS is the one special case, because the regset data @@ -1782,17 +1782,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_note(&t->notes[0], "CORE", NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); - *total += notesize(&t->notes[0]); + info->size += notesize(&t->notes[0]); do_thread_regset_writeback(t->task, &view->regsets[0]); /* * Each other regset might generate a note too. For each regset - * that has no core_note_type or is inactive, we leave t->notes[i] - * all zero and we'll know to skip writing it later. + * that has no core_note_type or is inactive, skip it. */ - for (i = 1; i < view->n; ++i) { - const struct user_regset *regset = &view->regsets[i]; + note_iter = 1; + for (view_iter = 1; view_iter < view->n; ++view_iter) { + const struct user_regset *regset = &view->regsets[view_iter]; int note_type = regset->core_note_type; bool is_fpreg = note_type == NT_PRFPREG; void *data; @@ -1808,13 +1808,17 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (ret < 0) continue; + if (WARN_ON_ONCE(note_iter >= info->thread_notes)) + break; + if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[i], is_fpreg ? "CORE" : "LINUX", + fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX", note_type, ret, data); - *total += notesize(&t->notes[i]); + info->size += notesize(&t->notes[note_iter]); + note_iter++; } return 1; @@ -1894,7 +1898,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, * Now fill in each thread's information. */ for (t = info->thread; t != NULL; t = t->next) - if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, &info->size)) + if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, info)) return 0; /*