diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 8d7f141c6fc7..061744c436d9 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data). The "pathname" shows the name associated file for this mapping. If the mapping is not associated with a file: - ======= ==================================== + ============= ==================================== [heap] the heap of the program [stack] the stack of the main process [vdso] the "virtual dynamic shared object", the kernel system call handler - ======= ==================================== + [anon:] an anonymous mapping that has been + named by userspace + ============= ==================================== or if empty, the mapping is anonymous. diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index a34430b7af4a..dea9fe8a56cc 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -753,7 +753,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a) return true; if ((pte_flags(a) & _PAGE_PROTNONE) && - mm_tlb_flush_pending(mm)) + atomic_read(&mm->tlb_flush_pending)) return true; return false; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ad667dbc96f5..18f8c3acbb85 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 #include #include +#include #include #include #include @@ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) name = arch_vma_name(vma); if (!name) { + const char *anon_name; + if (!mm) { name = "[vdso]"; goto done; @@ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma) goto done; } - if (is_stack(vma)) + if (is_stack(vma)) { name = "[stack]"; + goto done; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + seq_pad(m, ' '); + seq_printf(m, "[anon:%s]", anon_name); + } } done: diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 22bf14ab2d16..e26b10132d47 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -15,6 +15,7 @@ #include #include #include +#include #include #include #include @@ -877,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) vma = prev; else @@ -1436,7 +1437,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - ((struct vm_userfaultfd_ctx){ ctx })); + ((struct vm_userfaultfd_ctx){ ctx }), + vma_anon_name(vma)); if (prev) { vma = prev; goto next; @@ -1613,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, vma_policy(vma), - NULL_VM_UFFD_CTX); + NULL_VM_UFFD_CTX, vma_anon_name(vma)); if (prev) { vma = prev; goto next; diff --git a/include/linux/mm.h b/include/linux/mm.h index c768a7c81b0b..34a92d9e8059 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -424,51 +424,6 @@ extern unsigned int kobjsize(const void *objp); */ extern pgprot_t protection_map[16]; -/** - * enum fault_flag - Fault flag definitions. - * @FAULT_FLAG_WRITE: Fault was a write fault. - * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. - * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. - * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. - * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. - * @FAULT_FLAG_TRIED: The fault has been tried once. - * @FAULT_FLAG_USER: The fault originated in userspace. - * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. - * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. - * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. - * - * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify - * whether we would allow page faults to retry by specifying these two - * fault flags correctly. Currently there can be three legal combinations: - * - * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and - * this is the first try - * - * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and - * we've already tried at least once - * - * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry - * - * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never - * be used. Note that page faults can be allowed to retry for multiple times, - * in which case we'll have an initial fault with flags (a) then later on - * continuous faults with flags (b). We should always try to detect pending - * signals before a retry to make sure the continuous page faults can still be - * interrupted if necessary. - */ -enum fault_flag { - FAULT_FLAG_WRITE = 1 << 0, - FAULT_FLAG_MKWRITE = 1 << 1, - FAULT_FLAG_ALLOW_RETRY = 1 << 2, - FAULT_FLAG_RETRY_NOWAIT = 1 << 3, - FAULT_FLAG_KILLABLE = 1 << 4, - FAULT_FLAG_TRIED = 1 << 5, - FAULT_FLAG_USER = 1 << 6, - FAULT_FLAG_REMOTE = 1 << 7, - FAULT_FLAG_INSTRUCTION = 1 << 8, - FAULT_FLAG_INTERRUPTIBLE = 1 << 9, -}; - /* * The default fault flags that should be used by most of the * arch-specific page fault handlers. @@ -577,6 +532,10 @@ enum page_entry_size { */ struct vm_operations_struct { void (*open)(struct vm_area_struct * area); + /** + * @close: Called when the VMA is being removed from the MM. + * Context: User context. May sleep. Caller holds mmap_lock. + */ void (*close)(struct vm_area_struct * area); /* Called any time before splitting to check if it's allowed */ int (*may_split)(struct vm_area_struct *area, unsigned long addr); @@ -2644,7 +2603,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start, extern struct vm_area_struct *vma_merge(struct mm_struct *, struct vm_area_struct *prev, unsigned long addr, unsigned long end, unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t, - struct mempolicy *, struct vm_userfaultfd_ctx); + struct mempolicy *, struct vm_userfaultfd_ctx, const char *); extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *); extern int __split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); @@ -3390,5 +3349,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma) return 0; } +#ifdef CONFIG_ANON_VMA_NAME +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name); +#else +static inline int +madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) { + return 0; +} +#endif + #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index e2ec68b0515c..b725839dfe71 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -2,8 +2,10 @@ #ifndef LINUX_MM_INLINE_H #define LINUX_MM_INLINE_H +#include #include #include +#include /** * folio_is_file_lru - Should the folio be on a file LRU or anon LRU? @@ -135,4 +137,138 @@ static __always_inline void del_page_from_lru_list(struct page *page, { lruvec_del_folio(lruvec, page_folio(page)); } + +#ifdef CONFIG_ANON_VMA_NAME +/* + * mmap_lock should be read-locked when calling vma_anon_name() and while using + * the returned pointer. + */ +extern const char *vma_anon_name(struct vm_area_struct *vma); + +/* + * mmap_lock should be read-locked for orig_vma->vm_mm. + * mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be + * isolated. + */ +extern void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma); + +/* + * mmap_lock should be write-locked or vma should have been isolated under + * write-locked mmap_lock protection. + */ +extern void free_vma_anon_name(struct vm_area_struct *vma); + +/* mmap_lock should be read-locked */ +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + const char *vma_name = vma_anon_name(vma); + + /* either both NULL, or pointers to same string */ + if (vma_name == name) + return true; + + return name && vma_name && !strcmp(name, vma_name); +} +#else /* CONFIG_ANON_VMA_NAME */ +static inline const char *vma_anon_name(struct vm_area_struct *vma) +{ + return NULL; +} +static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) {} +static inline void free_vma_anon_name(struct vm_area_struct *vma) {} +static inline bool is_same_vma_anon_name(struct vm_area_struct *vma, + const char *name) +{ + return true; +} +#endif /* CONFIG_ANON_VMA_NAME */ + +static inline void init_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_set(&mm->tlb_flush_pending, 0); +} + +static inline void inc_tlb_flush_pending(struct mm_struct *mm) +{ + atomic_inc(&mm->tlb_flush_pending); + /* + * The only time this value is relevant is when there are indeed pages + * to flush. And we'll only flush pages after changing them, which + * requires the PTL. + * + * So the ordering here is: + * + * atomic_inc(&mm->tlb_flush_pending); + * spin_lock(&ptl); + * ... + * set_pte_at(); + * spin_unlock(&ptl); + * + * spin_lock(&ptl) + * mm_tlb_flush_pending(); + * .... + * spin_unlock(&ptl); + * + * flush_tlb_range(); + * atomic_dec(&mm->tlb_flush_pending); + * + * Where the increment if constrained by the PTL unlock, it thus + * ensures that the increment is visible if the PTE modification is + * visible. After all, if there is no PTE modification, nobody cares + * about TLB flushes either. + * + * This very much relies on users (mm_tlb_flush_pending() and + * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and + * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc + * locks (PPC) the unlock of one doesn't order against the lock of + * another PTL. + * + * The decrement is ordered by the flush_tlb_range(), such that + * mm_tlb_flush_pending() will not return false unless all flushes have + * completed. + */ +} + +static inline void dec_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * See inc_tlb_flush_pending(). + * + * This cannot be smp_mb__before_atomic() because smp_mb() simply does + * not order against TLB invalidate completion, which is what we need. + * + * Therefore we must rely on tlb_flush_*() to guarantee order. + */ + atomic_dec(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_pending(struct mm_struct *mm) +{ + /* + * Must be called after having acquired the PTL; orders against that + * PTLs release and therefore ensures that if we observe the modified + * PTE we must also observe the increment from inc_tlb_flush_pending(). + * + * That is, it only guarantees to return true if there is a flush + * pending for _this_ PTL. + */ + return atomic_read(&mm->tlb_flush_pending); +} + +static inline bool mm_tlb_flush_nested(struct mm_struct *mm) +{ + /* + * Similar to mm_tlb_flush_pending(), we must have acquired the PTL + * for which there is a TLB flush pending in order to guarantee + * we've seen both that PTE modification and the increment. + * + * (no requirement on actually still holding the PTL, that is irrelevant) + */ + return atomic_read(&mm->tlb_flush_pending) > 1; +} + + #endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1ae3537c7920..07bfd6f99ca0 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -5,6 +5,7 @@ #include #include +#include #include #include #include @@ -386,6 +387,12 @@ struct vm_userfaultfd_ctx { struct vm_userfaultfd_ctx {}; #endif /* CONFIG_USERFAULTFD */ +struct anon_vma_name { + struct kref kref; + /* The name needs to be at the end because it is dynamically sized. */ + char name[]; +}; + /* * This struct describes a virtual memory area. There is one of these * per VM-area/task. A VM area is any part of the process virtual memory @@ -426,11 +433,19 @@ struct vm_area_struct { /* * For areas with an address space and backing store, * linkage into the address_space->i_mmap interval tree. + * + * For private anonymous mappings, a pointer to a null terminated string + * containing the name given to the vma, or NULL if unnamed. */ - struct { - struct rb_node rb; - unsigned long rb_subtree_last; - } shared; + + union { + struct { + struct rb_node rb; + unsigned long rb_subtree_last; + } shared; + /* Serialized by mmap_sem. */ + struct anon_vma_name *anon_name; + }; /* * A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma @@ -677,90 +692,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); extern void tlb_finish_mmu(struct mmu_gather *tlb); -static inline void init_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_set(&mm->tlb_flush_pending, 0); -} - -static inline void inc_tlb_flush_pending(struct mm_struct *mm) -{ - atomic_inc(&mm->tlb_flush_pending); - /* - * The only time this value is relevant is when there are indeed pages - * to flush. And we'll only flush pages after changing them, which - * requires the PTL. - * - * So the ordering here is: - * - * atomic_inc(&mm->tlb_flush_pending); - * spin_lock(&ptl); - * ... - * set_pte_at(); - * spin_unlock(&ptl); - * - * spin_lock(&ptl) - * mm_tlb_flush_pending(); - * .... - * spin_unlock(&ptl); - * - * flush_tlb_range(); - * atomic_dec(&mm->tlb_flush_pending); - * - * Where the increment if constrained by the PTL unlock, it thus - * ensures that the increment is visible if the PTE modification is - * visible. After all, if there is no PTE modification, nobody cares - * about TLB flushes either. - * - * This very much relies on users (mm_tlb_flush_pending() and - * mm_tlb_flush_nested()) only caring about _specific_ PTEs (and - * therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc - * locks (PPC) the unlock of one doesn't order against the lock of - * another PTL. - * - * The decrement is ordered by the flush_tlb_range(), such that - * mm_tlb_flush_pending() will not return false unless all flushes have - * completed. - */ -} - -static inline void dec_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * See inc_tlb_flush_pending(). - * - * This cannot be smp_mb__before_atomic() because smp_mb() simply does - * not order against TLB invalidate completion, which is what we need. - * - * Therefore we must rely on tlb_flush_*() to guarantee order. - */ - atomic_dec(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_pending(struct mm_struct *mm) -{ - /* - * Must be called after having acquired the PTL; orders against that - * PTLs release and therefore ensures that if we observe the modified - * PTE we must also observe the increment from inc_tlb_flush_pending(). - * - * That is, it only guarantees to return true if there is a flush - * pending for _this_ PTL. - */ - return atomic_read(&mm->tlb_flush_pending); -} - -static inline bool mm_tlb_flush_nested(struct mm_struct *mm) -{ - /* - * Similar to mm_tlb_flush_pending(), we must have acquired the PTL - * for which there is a TLB flush pending in order to guarantee - * we've seen both that PTE modification and the increment. - * - * (no requirement on actually still holding the PTL, that is irrelevant) - */ - return atomic_read(&mm->tlb_flush_pending) > 1; -} - struct vm_fault; /** @@ -875,4 +806,49 @@ typedef struct { unsigned long val; } swp_entry_t; +/** + * enum fault_flag - Fault flag definitions. + * @FAULT_FLAG_WRITE: Fault was a write fault. + * @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE. + * @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked. + * @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying. + * @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region. + * @FAULT_FLAG_TRIED: The fault has been tried once. + * @FAULT_FLAG_USER: The fault originated in userspace. + * @FAULT_FLAG_REMOTE: The fault is not for current task/mm. + * @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch. + * @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals. + * + * About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify + * whether we would allow page faults to retry by specifying these two + * fault flags correctly. Currently there can be three legal combinations: + * + * (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and + * this is the first try + * + * (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and + * we've already tried at least once + * + * (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry + * + * The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never + * be used. Note that page faults can be allowed to retry for multiple times, + * in which case we'll have an initial fault with flags (a) then later on + * continuous faults with flags (b). We should always try to detect pending + * signals before a retry to make sure the continuous page faults can still be + * interrupted if necessary. + */ +enum fault_flag { + FAULT_FLAG_WRITE = 1 << 0, + FAULT_FLAG_MKWRITE = 1 << 1, + FAULT_FLAG_ALLOW_RETRY = 1 << 2, + FAULT_FLAG_RETRY_NOWAIT = 1 << 3, + FAULT_FLAG_KILLABLE = 1 << 4, + FAULT_FLAG_TRIED = 1 << 5, + FAULT_FLAG_USER = 1 << 6, + FAULT_FLAG_REMOTE = 1 << 7, + FAULT_FLAG_INSTRUCTION = 1 << 8, + FAULT_FLAG_INTERRUPTIBLE = 1 << 9, +}; + #endif /* _LINUX_MM_TYPES_H */ diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index bb73e9a0b24f..e998764f0262 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -272,4 +272,7 @@ struct prctl_mm_map { # define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1 # define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2 +#define PR_SET_VMA 0x53564d41 +# define PR_SET_VMA_ANON_NAME 0 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index 90f9365d3139..c959163d5e6c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -371,12 +372,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); new->vm_next = new->vm_prev = NULL; + dup_vma_anon_name(orig, new); } return new; } void vm_area_free(struct vm_area_struct *vma) { + free_vma_anon_name(vma); kmem_cache_free(vm_area_cachep, vma); } diff --git a/kernel/sys.c b/kernel/sys.c index d5d933504e0c..ddc173344d48 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2263,6 +2263,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, #define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE) +#ifdef CONFIG_ANON_VMA_NAME + +#define ANON_VMA_NAME_MAX_LEN 80 +#define ANON_VMA_NAME_INVALID_CHARS "\\`$[]" + +static inline bool is_valid_name_char(char ch) +{ + /* printable ascii characters, excluding ANON_VMA_NAME_INVALID_CHARS */ + return ch > 0x1f && ch < 0x7f && + !strchr(ANON_VMA_NAME_INVALID_CHARS, ch); +} + +static int prctl_set_vma(unsigned long opt, unsigned long addr, + unsigned long size, unsigned long arg) +{ + struct mm_struct *mm = current->mm; + const char __user *uname; + char *name, *pch; + int error; + + switch (opt) { + case PR_SET_VMA_ANON_NAME: + uname = (const char __user *)arg; + if (uname) { + name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN); + + if (IS_ERR(name)) + return PTR_ERR(name); + + for (pch = name; *pch != '\0'; pch++) { + if (!is_valid_name_char(*pch)) { + kfree(name); + return -EINVAL; + } + } + } else { + /* Reset the name */ + name = NULL; + } + + mmap_write_lock(mm); + error = madvise_set_anon_name(mm, addr, size, name); + mmap_write_unlock(mm); + kfree(name); + break; + default: + error = -EINVAL; + } + + return error; +} + +#else /* CONFIG_ANON_VMA_NAME */ +static int prctl_set_vma(unsigned long opt, unsigned long start, + unsigned long size, unsigned long arg) +{ + return -EINVAL; +} +#endif /* CONFIG_ANON_VMA_NAME */ + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2532,6 +2592,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, error = sched_core_share_pid(arg2, arg3, arg4, arg5); break; #endif + case PR_SET_VMA: + error = prctl_set_vma(arg2, arg3, arg4, arg5); + break; default: error = -EINVAL; break; diff --git a/mm/Kconfig b/mm/Kconfig index 356f4f2c779e..53d7485fc38f 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -900,6 +900,20 @@ config IO_MAPPING config SECRETMEM def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED +config ANON_VMA_NAME + bool "Anonymous VMA name support" + depends on PROC_FS && ADVISE_SYSCALLS && MMU + + help + Allow naming anonymous virtual memory areas. + + This feature allows assigning names to virtual memory areas. Assigned + names can be later retrieved from /proc/pid/maps and /proc/pid/smaps + and help identifying individual anonymous memory areas. + Assigning a name to anonymous virtual memory area might prevent that + area from being merged with adjacent virtual memory areas due to the + difference in their name. + source "mm/damon/Kconfig" endmenu diff --git a/mm/ksm.c b/mm/ksm.c index 0662093237e4..f34476ac0a41 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -15,6 +15,7 @@ #include #include +#include #include #include #include diff --git a/mm/madvise.c b/mm/madvise.c index 4b9c5509990c..5604064df464 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -18,6 +18,8 @@ #include #include #include +#include +#include #include #include #include @@ -62,19 +64,114 @@ static int madvise_need_mmap_write(int behavior) } } +#ifdef CONFIG_ANON_VMA_NAME +static struct anon_vma_name *anon_vma_name_alloc(const char *name) +{ + struct anon_vma_name *anon_name; + size_t count; + + /* Add 1 for NUL terminator at the end of the anon_name->name */ + count = strlen(name) + 1; + anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); + if (anon_name) { + kref_init(&anon_name->kref); + memcpy(anon_name->name, name, count); + } + + return anon_name; +} + +static void vma_anon_name_free(struct kref *kref) +{ + struct anon_vma_name *anon_name = + container_of(kref, struct anon_vma_name, kref); + kfree(anon_name); +} + +static inline bool has_vma_anon_name(struct vm_area_struct *vma) +{ + return !vma->vm_file && vma->anon_name; +} + +const char *vma_anon_name(struct vm_area_struct *vma) +{ + if (!has_vma_anon_name(vma)) + return NULL; + + mmap_assert_locked(vma->vm_mm); + + return vma->anon_name->name; +} + +void dup_vma_anon_name(struct vm_area_struct *orig_vma, + struct vm_area_struct *new_vma) +{ + if (!has_vma_anon_name(orig_vma)) + return; + + kref_get(&orig_vma->anon_name->kref); + new_vma->anon_name = orig_vma->anon_name; +} + +void free_vma_anon_name(struct vm_area_struct *vma) +{ + struct anon_vma_name *anon_name; + + if (!has_vma_anon_name(vma)) + return; + + anon_name = vma->anon_name; + vma->anon_name = NULL; + kref_put(&anon_name->kref, vma_anon_name_free); +} + +/* mmap_lock should be write-locked */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + const char *anon_name; + + if (!name) { + free_vma_anon_name(vma); + return 0; + } + + anon_name = vma_anon_name(vma); + if (anon_name) { + /* Same name, nothing to do here */ + if (!strcmp(name, anon_name)) + return 0; + + free_vma_anon_name(vma); + } + vma->anon_name = anon_vma_name_alloc(name); + if (!vma->anon_name) + return -ENOMEM; + + return 0; +} +#else /* CONFIG_ANON_VMA_NAME */ +static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name) +{ + if (name) + return -EINVAL; + + return 0; +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * Update the vm_flags on region of a vma, splitting it or merging it as * necessary. Must be called with mmap_sem held for writing; */ static int madvise_update_vma(struct vm_area_struct *vma, struct vm_area_struct **prev, unsigned long start, - unsigned long end, unsigned long new_flags) + unsigned long end, unsigned long new_flags, + const char *name) { struct mm_struct *mm = vma->vm_mm; int error; pgoff_t pgoff; - if (new_flags == vma->vm_flags) { + if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) { *prev = vma; return 0; } @@ -82,7 +179,7 @@ static int madvise_update_vma(struct vm_area_struct *vma, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, name); if (*prev) { vma = *prev; goto success; @@ -111,6 +208,11 @@ success: * vm_flags is protected by the mmap_lock held in write mode. */ vma->vm_flags = new_flags; + if (!vma->vm_file) { + error = replace_vma_anon_name(vma, name); + if (error) + return error; + } return 0; } @@ -938,7 +1040,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, break; } - error = madvise_update_vma(vma, prev, start, end, new_flags); + error = madvise_update_vma(vma, prev, start, end, new_flags, + vma_anon_name(vma)); out: /* @@ -1118,6 +1221,55 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, return unmapped_error; } +#ifdef CONFIG_ANON_VMA_NAME +static int madvise_vma_anon_name(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end, + unsigned long name) +{ + int error; + + /* Only anonymous mappings can be named */ + if (vma->vm_file) + return -EBADF; + + error = madvise_update_vma(vma, prev, start, end, vma->vm_flags, + (const char *)name); + + /* + * madvise() returns EAGAIN if kernel resources, such as + * slab, are temporarily unavailable. + */ + if (error == -ENOMEM) + error = -EAGAIN; + return error; +} + +int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, + unsigned long len_in, const char *name) +{ + unsigned long end; + unsigned long len; + + if (start & ~PAGE_MASK) + return -EINVAL; + len = (len_in + ~PAGE_MASK) & PAGE_MASK; + + /* Check to see whether len was rounded up from small -ve to zero */ + if (len_in && !len) + return -EINVAL; + + end = start + len; + if (end < start) + return -EINVAL; + + if (end == start) + return 0; + + return madvise_walk_vmas(mm, start, end, (unsigned long)name, + madvise_vma_anon_name); +} +#endif /* CONFIG_ANON_VMA_NAME */ /* * The madvise(2) system call. * diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c index ea734f248fce..1b0ab8fcfd8b 100644 --- a/mm/mapping_dirty_helpers.c +++ b/mm/mapping_dirty_helpers.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include diff --git a/mm/memory.c b/mm/memory.c index d672ee14f8f5..a95b66c7fc11 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -41,6 +41,7 @@ #include #include +#include #include #include #include diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f6248affaf38..679f47b3a079 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -810,7 +810,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, ((vmstart - vma->vm_start) >> PAGE_SHIFT); prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, - new_pol, vma->vm_userfaultfd_ctx); + new_pol, vma->vm_userfaultfd_ctx, + vma_anon_name(vma)); if (prev) { vma = prev; next = vma->vm_next; diff --git a/mm/mlock.c b/mm/mlock.c index e263d62ae2d0..8f584eddd305 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*prev) { vma = *prev; goto success; diff --git a/mm/mmap.c b/mm/mmap.c index bfb0ea164a90..3f48d0928e6b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include @@ -1029,7 +1030,8 @@ again: */ static inline int is_mergeable_vma(struct vm_area_struct *vma, struct file *file, unsigned long vm_flags, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { /* * VM_SOFTDIRTY should not prevent from VMA merging, if we @@ -1047,6 +1049,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma, return 0; if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx)) return 0; + if (!is_same_vma_anon_name(vma, anon_name)) + return 0; return 1; } @@ -1079,9 +1083,10 @@ static int can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { if (vma->vm_pgoff == vm_pgoff) return 1; @@ -1100,9 +1105,10 @@ static int can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t vm_pgoff, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { - if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) && + if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) { pgoff_t vm_pglen; vm_pglen = vma_pages(vma); @@ -1113,9 +1119,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags, } /* - * Given a mapping request (addr,end,vm_flags,file,pgoff), figure out - * whether that can be merged with its predecessor or its successor. - * Or both (it neatly fills a hole). + * Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name), + * figure out whether that can be merged with its predecessor or its + * successor. Or both (it neatly fills a hole). * * In most cases - when called for mmap, brk or mremap - [addr,end) is * certain not to be mapped by the time vma_merge is called; but when @@ -1160,7 +1166,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, unsigned long end, unsigned long vm_flags, struct anon_vma *anon_vma, struct file *file, pgoff_t pgoff, struct mempolicy *policy, - struct vm_userfaultfd_ctx vm_userfaultfd_ctx) + struct vm_userfaultfd_ctx vm_userfaultfd_ctx, + const char *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; struct vm_area_struct *area, *next; @@ -1190,7 +1197,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { /* * OK, it can. Can we now merge in the successor as well? */ @@ -1199,7 +1206,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx) && + vm_userfaultfd_ctx, anon_name) && is_mergeable_anon_vma(prev->anon_vma, next->anon_vma, NULL)) { /* cases 1, 6 */ @@ -1222,7 +1229,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, - vm_userfaultfd_ctx)) { + vm_userfaultfd_ctx, anon_name)) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, addr, prev->vm_pgoff, NULL, next); @@ -1754,7 +1761,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * Can we just expand an old mapping? */ vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -1803,7 +1810,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { /* ->mmap() can change vma->vm_file and fput the original file. So * fput the vma->vm_file here or we would add an extra fput for file @@ -3056,7 +3063,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla /* Can we just expand an old private anonymous mapping? */ vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX); + NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (vma) goto out; @@ -3142,25 +3149,27 @@ void exit_mmap(struct mm_struct *mm) * to mmu_notifier_release(mm) ensures mmu notifier callbacks in * __oom_reap_task_mm() will not block. * - * This needs to be done before calling munlock_vma_pages_all(), + * This needs to be done before calling unlock_range(), * which clears VM_LOCKED, otherwise the oom reaper cannot * reliably test it. */ (void)__oom_reap_task_mm(mm); set_bit(MMF_OOM_SKIP, &mm->flags); - mmap_write_lock(mm); - mmap_write_unlock(mm); } + mmap_write_lock(mm); if (mm->locked_vm) unlock_range(mm->mmap, ULONG_MAX); arch_exit_mmap(mm); vma = mm->mmap; - if (!vma) /* Can happen if dup_mmap() received an OOM */ + if (!vma) { + /* Can happen if dup_mmap() received an OOM */ + mmap_write_unlock(mm); return; + } lru_add_drain(); flush_cache_mm(mm); @@ -3171,16 +3180,14 @@ void exit_mmap(struct mm_struct *mm) free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* - * Walk the list again, actually closing and freeing it, - * with preemption enabled, without holding any MM locks. - */ + /* Walk the list again, actually closing and freeing it. */ while (vma) { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); vma = remove_vma(vma); cond_resched(); } + mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3249,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (new_vma) { /* * Source vma may have been merged into new_vma diff --git a/mm/mmu_gather.c b/mm/mmu_gather.c index 1b9837419bf9..afb7185ffdc4 100644 --- a/mm/mmu_gather.c +++ b/mm/mmu_gather.c @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include diff --git a/mm/mprotect.c b/mm/mprotect.c index e552f5e0ccbd..0138dfcdb1d8 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT); *pprev = vma_merge(mm, *pprev, start, end, newflags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), - vma->vm_userfaultfd_ctx); + vma->vm_userfaultfd_ctx, vma_anon_name(vma)); if (*pprev) { vma = *pprev; VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index e52ce0b1465d..3390316c8a32 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1170,15 +1170,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) goto put_task; } - if (mmget_not_zero(p->mm)) { - mm = p->mm; - if (task_will_free_mem(p)) - reap = true; - else { - /* Error only if the work has not been done already */ - if (!test_bit(MMF_OOM_SKIP, &mm->flags)) - ret = -EINVAL; - } + mm = p->mm; + mmgrab(mm); + + if (task_will_free_mem(p)) + reap = true; + else { + /* Error only if the work has not been done already */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags)) + ret = -EINVAL; } task_unlock(p); @@ -1189,13 +1189,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags) ret = -EINTR; goto drop_mm; } - if (!__oom_reap_task_mm(mm)) + /* + * Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure + * possible change in exit_mmap is seen + */ + if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm)) ret = -EAGAIN; mmap_read_unlock(mm); drop_mm: - if (mm) - mmput(mm); + mmdrop(mm); put_task: put_task_struct(task); return ret; diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 4e640baf9794..6523fda274e5 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -10,6 +10,7 @@ #include #include #include +#include #include /*