Merge ba535c1caf ("mm/oom_kill: allow process_mrelease to run under mmap_lock protection") into android-mainline

Steps on the way to 5.17-rc1

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I29d376595033f4ef7895319d9398c2cb415de883
This commit is contained in:
Greg Kroah-Hartman
2022-02-14 09:09:17 +01:00
22 changed files with 532 additions and 184 deletions

View File

@@ -426,12 +426,14 @@ with the memory region, as the case would be with BSS (uninitialized data).
The "pathname" shows the name of the file associated with this mapping. If the mapping
is not associated with a file:
======= ====================================
============= ====================================
[heap] the heap of the program
[stack] the stack of the main process
[vdso] the "virtual dynamic shared object",
the kernel system call handler
======= ====================================
[anon:<name>] an anonymous mapping that has been
named by userspace
============= ====================================
or if empty, the mapping is anonymous.

View File

@@ -753,7 +753,7 @@ static inline bool pte_accessible(struct mm_struct *mm, pte_t a)
return true;
if ((pte_flags(a) & _PAGE_PROTNONE) &&
mm_tlb_flush_pending(mm))
atomic_read(&mm->tlb_flush_pending))
return true;
return false;

View File

@@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/pagewalk.h>
#include <linux/vmacache.h>
#include <linux/mm_inline.h>
#include <linux/hugetlb.h>
#include <linux/huge_mm.h>
#include <linux/mount.h>
@@ -308,6 +309,8 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
name = arch_vma_name(vma);
if (!name) {
const char *anon_name;
if (!mm) {
name = "[vdso]";
goto done;
@@ -319,8 +322,16 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma)
goto done;
}
if (is_stack(vma))
if (is_stack(vma)) {
name = "[stack]";
goto done;
}
anon_name = vma_anon_name(vma);
if (anon_name) {
seq_pad(m, ' ');
seq_printf(m, "[anon:%s]", anon_name);
}
}
done:

View File

@@ -15,6 +15,7 @@
#include <linux/sched/signal.h>
#include <linux/sched/mm.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/mmu_notifier.h>
#include <linux/poll.h>
#include <linux/slab.h>
@@ -877,7 +878,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file)
new_flags, vma->anon_vma,
vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX);
NULL_VM_UFFD_CTX, vma_anon_name(vma));
if (prev)
vma = prev;
else
@@ -1436,7 +1437,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
((struct vm_userfaultfd_ctx){ ctx }));
((struct vm_userfaultfd_ctx){ ctx }),
vma_anon_name(vma));
if (prev) {
vma = prev;
goto next;
@@ -1613,7 +1615,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx,
prev = vma_merge(mm, prev, start, vma_end, new_flags,
vma->anon_vma, vma->vm_file, vma->vm_pgoff,
vma_policy(vma),
NULL_VM_UFFD_CTX);
NULL_VM_UFFD_CTX, vma_anon_name(vma));
if (prev) {
vma = prev;
goto next;

View File

@@ -424,51 +424,6 @@ extern unsigned int kobjsize(const void *objp);
*/
extern pgprot_t protection_map[16];
/**
* enum fault_flag - Fault flag definitions.
* @FAULT_FLAG_WRITE: Fault was a write fault.
* @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
* @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
* @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
* @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
* @FAULT_FLAG_TRIED: The fault has been tried once.
* @FAULT_FLAG_USER: The fault originated in userspace.
* @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
* @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
* @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
*
* About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
* whether we would allow page faults to retry by specifying these two
* fault flags correctly. Currently there can be three legal combinations:
*
* (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and
* this is the first try
*
* (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and
* we've already tried at least once
*
* (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
*
* The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
* be used. Note that page faults can be allowed to retry multiple times,
* in which case we'll have an initial fault with flags (a) followed by
* subsequent faults with flags (b). We should always check for pending
* signals before a retry so that repeated page faults can still be
* interrupted if necessary.
*/
enum fault_flag {
FAULT_FLAG_WRITE = 1 << 0,
FAULT_FLAG_MKWRITE = 1 << 1,
FAULT_FLAG_ALLOW_RETRY = 1 << 2,
FAULT_FLAG_RETRY_NOWAIT = 1 << 3,
FAULT_FLAG_KILLABLE = 1 << 4,
FAULT_FLAG_TRIED = 1 << 5,
FAULT_FLAG_USER = 1 << 6,
FAULT_FLAG_REMOTE = 1 << 7,
FAULT_FLAG_INSTRUCTION = 1 << 8,
FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
};
/*
* The default fault flags that should be used by most of the
* arch-specific page fault handlers.
@@ -577,6 +532,10 @@ enum page_entry_size {
*/
struct vm_operations_struct {
void (*open)(struct vm_area_struct * area);
/**
* @close: Called when the VMA is being removed from the MM.
* Context: User context. May sleep. Caller holds mmap_lock.
*/
void (*close)(struct vm_area_struct * area);
/* Called any time before splitting to check if it's allowed */
int (*may_split)(struct vm_area_struct *area, unsigned long addr);
@@ -2644,7 +2603,7 @@ static inline int vma_adjust(struct vm_area_struct *vma, unsigned long start,
extern struct vm_area_struct *vma_merge(struct mm_struct *,
struct vm_area_struct *prev, unsigned long addr, unsigned long end,
unsigned long vm_flags, struct anon_vma *, struct file *, pgoff_t,
struct mempolicy *, struct vm_userfaultfd_ctx);
struct mempolicy *, struct vm_userfaultfd_ctx, const char *);
extern struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *);
extern int __split_vma(struct mm_struct *, struct vm_area_struct *,
unsigned long addr, int new_below);
@@ -3390,5 +3349,16 @@ static inline int seal_check_future_write(int seals, struct vm_area_struct *vma)
return 0;
}
#ifdef CONFIG_ANON_VMA_NAME
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
unsigned long len_in, const char *name);
#else
/*
* Stub used when CONFIG_ANON_VMA_NAME is not set: all arguments are ignored
* and 0 is returned.
*/
static inline int
madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
unsigned long len_in, const char *name) {
return 0;
}
#endif
#endif /* __KERNEL__ */
#endif /* _LINUX_MM_H */

View File

@@ -2,8 +2,10 @@
#ifndef LINUX_MM_INLINE_H
#define LINUX_MM_INLINE_H
#include <linux/atomic.h>
#include <linux/huge_mm.h>
#include <linux/swap.h>
#include <linux/string.h>
/**
* folio_is_file_lru - Should the folio be on a file LRU or anon LRU?
@@ -135,4 +137,138 @@ static __always_inline void del_page_from_lru_list(struct page *page,
{
lruvec_del_folio(lruvec, page_folio(page));
}
#ifdef CONFIG_ANON_VMA_NAME
/*
* mmap_lock should be read-locked when calling vma_anon_name() and while using
* the returned pointer.
*/
extern const char *vma_anon_name(struct vm_area_struct *vma);
/*
* mmap_lock should be read-locked for orig_vma->vm_mm.
* mmap_lock should be write-locked for new_vma->vm_mm or new_vma should be
* isolated.
*/
extern void dup_vma_anon_name(struct vm_area_struct *orig_vma,
struct vm_area_struct *new_vma);
/*
* mmap_lock should be write-locked or vma should have been isolated under
* write-locked mmap_lock protection.
*/
extern void free_vma_anon_name(struct vm_area_struct *vma);
/*
 * Report whether the anonymous name currently attached to @vma matches @name.
 * Two absent (NULL) names are treated as equal.
 *
 * mmap_lock should be read-locked.
 */
static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
					 const char *name)
{
	const char *cur = vma_anon_name(vma);

	/* Identical pointers: either both NULL or the very same string. */
	if (cur == name)
		return true;

	/* Exactly one side is unnamed, so they cannot match. */
	if (!cur || !name)
		return false;

	return strcmp(name, cur) == 0;
}
#else /* CONFIG_ANON_VMA_NAME */
/* Stub for !CONFIG_ANON_VMA_NAME: a VMA never has a name, so return NULL. */
static inline const char *vma_anon_name(struct vm_area_struct *vma)
{
return NULL;
}
/* Stub for !CONFIG_ANON_VMA_NAME: there is no name to duplicate. */
static inline void dup_vma_anon_name(struct vm_area_struct *orig_vma,
struct vm_area_struct *new_vma) {}
/* Stub for !CONFIG_ANON_VMA_NAME: there is no name to release. */
static inline void free_vma_anon_name(struct vm_area_struct *vma) {}
/*
* Stub for !CONFIG_ANON_VMA_NAME: every comparison reports a match,
* regardless of @vma and @name.
*/
static inline bool is_same_vma_anon_name(struct vm_area_struct *vma,
const char *name)
{
return true;
}
#endif /* CONFIG_ANON_VMA_NAME */
/* Reset @mm's tlb_flush_pending counter to zero. */
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
atomic_set(&mm->tlb_flush_pending, 0);
}
/*
* Atomically increment @mm's tlb_flush_pending counter. The comment in the
* body explains the ordering this relies on.
*/
static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
atomic_inc(&mm->tlb_flush_pending);
/*
* The only time this value is relevant is when there are indeed pages
* to flush. And we'll only flush pages after changing them, which
* requires the PTL.
*
* So the ordering here is:
*
* atomic_inc(&mm->tlb_flush_pending);
* spin_lock(&ptl);
* ...
* set_pte_at();
* spin_unlock(&ptl);
*
* spin_lock(&ptl)
* mm_tlb_flush_pending();
* ....
* spin_unlock(&ptl);
*
* flush_tlb_range();
* atomic_dec(&mm->tlb_flush_pending);
*
* Where the increment is constrained by the PTL unlock, it thus
* ensures that the increment is visible if the PTE modification is
* visible. After all, if there is no PTE modification, nobody cares
* about TLB flushes either.
*
* This very much relies on users (mm_tlb_flush_pending() and
* mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
* therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
* locks (PPC) the unlock of one doesn't order against the lock of
* another PTL.
*
* The decrement is ordered by the flush_tlb_range(), such that
* mm_tlb_flush_pending() will not return false unless all flushes have
* completed.
*/
}
/* Atomically decrement @mm's tlb_flush_pending counter. */
static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
/*
* See inc_tlb_flush_pending().
*
* This cannot be smp_mb__before_atomic() because smp_mb() simply does
* not order against TLB invalidate completion, which is what we need.
*
* Therefore we must rely on tlb_flush_*() to guarantee order.
*/
atomic_dec(&mm->tlb_flush_pending);
}
/* Return true when @mm's tlb_flush_pending counter is non-zero. */
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
/*
* Must be called after having acquired the PTL; orders against that
* PTLs release and therefore ensures that if we observe the modified
* PTE we must also observe the increment from inc_tlb_flush_pending().
*
* That is, it only guarantees to return true if there is a flush
* pending for _this_ PTL.
*/
return atomic_read(&mm->tlb_flush_pending);
}
/* Return true when @mm's tlb_flush_pending counter is greater than one. */
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
/*
* Similar to mm_tlb_flush_pending(), we must have acquired the PTL
* for which there is a TLB flush pending in order to guarantee
* we've seen both that PTE modification and the increment.
*
* (no requirement on actually still holding the PTL, that is irrelevant)
*/
return atomic_read(&mm->tlb_flush_pending) > 1;
}
#endif

View File

@@ -5,6 +5,7 @@
#include <linux/mm_types_task.h>
#include <linux/auxvec.h>
#include <linux/kref.h>
#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/rbtree.h>
@@ -386,6 +387,12 @@ struct vm_userfaultfd_ctx {
struct vm_userfaultfd_ctx {};
#endif /* CONFIG_USERFAULTFD */
/*
* Reference-counted, variable-length name that can be attached to an
* anonymous VMA through vm_area_struct::anon_name.
*/
struct anon_vma_name {
/* Reference count for this name object. */
struct kref kref;
/* The name needs to be at the end because it is dynamically sized. */
char name[];
};
/*
* This struct describes a virtual memory area. There is one of these
* per VM-area/task. A VM area is any part of the process virtual memory
@@ -426,11 +433,19 @@ struct vm_area_struct {
/*
* For areas with an address space and backing store,
* linkage into the address_space->i_mmap interval tree.
*
* For private anonymous mappings, a pointer to a null terminated string
* containing the name given to the vma, or NULL if unnamed.
*/
struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;
union {
struct {
struct rb_node rb;
unsigned long rb_subtree_last;
} shared;
/* Serialized by mmap_sem. */
struct anon_vma_name *anon_name;
};
/*
* A file's MAP_PRIVATE vma can be in both i_mmap tree and anon_vma
@@ -677,90 +692,6 @@ extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm);
extern void tlb_finish_mmu(struct mmu_gather *tlb);
static inline void init_tlb_flush_pending(struct mm_struct *mm)
{
atomic_set(&mm->tlb_flush_pending, 0);
}
static inline void inc_tlb_flush_pending(struct mm_struct *mm)
{
atomic_inc(&mm->tlb_flush_pending);
/*
* The only time this value is relevant is when there are indeed pages
* to flush. And we'll only flush pages after changing them, which
* requires the PTL.
*
* So the ordering here is:
*
* atomic_inc(&mm->tlb_flush_pending);
* spin_lock(&ptl);
* ...
* set_pte_at();
* spin_unlock(&ptl);
*
* spin_lock(&ptl)
* mm_tlb_flush_pending();
* ....
* spin_unlock(&ptl);
*
* flush_tlb_range();
* atomic_dec(&mm->tlb_flush_pending);
*
* Where the increment is constrained by the PTL unlock, it thus
* ensures that the increment is visible if the PTE modification is
* visible. After all, if there is no PTE modification, nobody cares
* about TLB flushes either.
*
* This very much relies on users (mm_tlb_flush_pending() and
* mm_tlb_flush_nested()) only caring about _specific_ PTEs (and
* therefore specific PTLs), because with SPLIT_PTE_PTLOCKS and RCpc
* locks (PPC) the unlock of one doesn't order against the lock of
* another PTL.
*
* The decrement is ordered by the flush_tlb_range(), such that
* mm_tlb_flush_pending() will not return false unless all flushes have
* completed.
*/
}
static inline void dec_tlb_flush_pending(struct mm_struct *mm)
{
/*
* See inc_tlb_flush_pending().
*
* This cannot be smp_mb__before_atomic() because smp_mb() simply does
* not order against TLB invalidate completion, which is what we need.
*
* Therefore we must rely on tlb_flush_*() to guarantee order.
*/
atomic_dec(&mm->tlb_flush_pending);
}
static inline bool mm_tlb_flush_pending(struct mm_struct *mm)
{
/*
* Must be called after having acquired the PTL; orders against that
* PTLs release and therefore ensures that if we observe the modified
* PTE we must also observe the increment from inc_tlb_flush_pending().
*
* That is, it only guarantees to return true if there is a flush
* pending for _this_ PTL.
*/
return atomic_read(&mm->tlb_flush_pending);
}
static inline bool mm_tlb_flush_nested(struct mm_struct *mm)
{
/*
* Similar to mm_tlb_flush_pending(), we must have acquired the PTL
* for which there is a TLB flush pending in order to guarantee
* we've seen both that PTE modification and the increment.
*
* (no requirement on actually still holding the PTL, that is irrelevant)
*/
return atomic_read(&mm->tlb_flush_pending) > 1;
}
struct vm_fault;
/**
@@ -875,4 +806,49 @@ typedef struct {
unsigned long val;
} swp_entry_t;
/**
* enum fault_flag - Fault flag definitions.
* @FAULT_FLAG_WRITE: Fault was a write fault.
* @FAULT_FLAG_MKWRITE: Fault was mkwrite of existing PTE.
* @FAULT_FLAG_ALLOW_RETRY: Allow to retry the fault if blocked.
* @FAULT_FLAG_RETRY_NOWAIT: Don't drop mmap_lock and wait when retrying.
* @FAULT_FLAG_KILLABLE: The fault task is in SIGKILL killable region.
* @FAULT_FLAG_TRIED: The fault has been tried once.
* @FAULT_FLAG_USER: The fault originated in userspace.
* @FAULT_FLAG_REMOTE: The fault is not for current task/mm.
* @FAULT_FLAG_INSTRUCTION: The fault was during an instruction fetch.
* @FAULT_FLAG_INTERRUPTIBLE: The fault can be interrupted by non-fatal signals.
*
* About @FAULT_FLAG_ALLOW_RETRY and @FAULT_FLAG_TRIED: we can specify
* whether we would allow page faults to retry by specifying these two
* fault flags correctly. Currently there can be three legal combinations:
*
* (a) ALLOW_RETRY and !TRIED: this means the page fault allows retry, and
* this is the first try
*
* (b) ALLOW_RETRY and TRIED: this means the page fault allows retry, and
* we've already tried at least once
*
* (c) !ALLOW_RETRY and !TRIED: this means the page fault does not allow retry
*
* The unlisted combination (!ALLOW_RETRY && TRIED) is illegal and should never
* be used. Note that page faults can be allowed to retry multiple times,
* in which case we'll have an initial fault with flags (a) followed by
* subsequent faults with flags (b). We should always check for pending
* signals before a retry so that repeated page faults can still be
* interrupted if necessary.
*/
enum fault_flag {
FAULT_FLAG_WRITE = 1 << 0,
FAULT_FLAG_MKWRITE = 1 << 1,
FAULT_FLAG_ALLOW_RETRY = 1 << 2,
FAULT_FLAG_RETRY_NOWAIT = 1 << 3,
FAULT_FLAG_KILLABLE = 1 << 4,
FAULT_FLAG_TRIED = 1 << 5,
FAULT_FLAG_USER = 1 << 6,
FAULT_FLAG_REMOTE = 1 << 7,
FAULT_FLAG_INSTRUCTION = 1 << 8,
FAULT_FLAG_INTERRUPTIBLE = 1 << 9,
};
#endif /* _LINUX_MM_TYPES_H */

View File

@@ -272,4 +272,7 @@ struct prctl_mm_map {
# define PR_SCHED_CORE_SCOPE_THREAD_GROUP 1
# define PR_SCHED_CORE_SCOPE_PROCESS_GROUP 2
#define PR_SET_VMA 0x53564d41
# define PR_SET_VMA_ANON_NAME 0
#endif /* _LINUX_PRCTL_H */

View File

@@ -42,6 +42,7 @@
#include <linux/mmu_notifier.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/nsproxy.h>
#include <linux/capability.h>
@@ -371,12 +372,14 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig)
*new = data_race(*orig);
INIT_LIST_HEAD(&new->anon_vma_chain);
new->vm_next = new->vm_prev = NULL;
dup_vma_anon_name(orig, new);
}
return new;
}
/*
* Free @vma: first drop its anonymous-name reference (a no-op when the VMA
* has no name), then hand the structure back to vm_area_cachep.
*/
void vm_area_free(struct vm_area_struct *vma)
{
free_vma_anon_name(vma);
kmem_cache_free(vm_area_cachep, vma);
}

View File

@@ -2263,6 +2263,66 @@ int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which,
#define PR_IO_FLUSHER (PF_MEMALLOC_NOIO | PF_LOCAL_THROTTLE)
#ifdef CONFIG_ANON_VMA_NAME
#define ANON_VMA_NAME_MAX_LEN		80
#define ANON_VMA_NAME_INVALID_CHARS	"\\`$[]"

/*
 * Decide whether @ch may appear in an anonymous VMA name.
 *
 * Accepted characters are the printable ASCII range 0x20..0x7e, minus the
 * characters listed in ANON_VMA_NAME_INVALID_CHARS.
 */
static inline bool is_valid_name_char(char ch)
{
	/* Control characters, DEL and anything outside 7-bit ASCII. */
	if (ch <= 0x1f || ch >= 0x7f)
		return false;

	return strchr(ANON_VMA_NAME_INVALID_CHARS, ch) == NULL;
}
/*
 * Handle prctl(PR_SET_VMA, ...).
 *
 * @opt:  sub-operation; only PR_SET_VMA_ANON_NAME is recognised.
 * @addr: start of the address range to operate on.
 * @size: length of the range.
 * @arg:  userspace pointer to the name string, or 0 to reset the name.
 *
 * The name is copied from userspace with strndup_user(), bounded by
 * ANON_VMA_NAME_MAX_LEN, and every character is checked with
 * is_valid_name_char() before the range is updated under the mmap write lock.
 *
 * Returns -EINVAL for an unknown @opt or an invalid character, the
 * strndup_user() error if the copy fails, otherwise the result of
 * madvise_set_anon_name().
 */
static int prctl_set_vma(unsigned long opt, unsigned long addr,
			 unsigned long size, unsigned long arg)
{
	const char __user *uname = (const char __user *)arg;
	struct mm_struct *mm = current->mm;
	char *name = NULL;	/* NULL means "reset the name" */
	const char *p;
	int error;

	if (opt != PR_SET_VMA_ANON_NAME)
		return -EINVAL;

	if (uname) {
		name = strndup_user(uname, ANON_VMA_NAME_MAX_LEN);
		if (IS_ERR(name))
			return PTR_ERR(name);

		for (p = name; *p; p++) {
			if (is_valid_name_char(*p))
				continue;
			kfree(name);
			return -EINVAL;
		}
	}

	mmap_write_lock(mm);
	error = madvise_set_anon_name(mm, addr, size, name);
	mmap_write_unlock(mm);

	/* The temporary copy is no longer needed once the range is updated. */
	kfree(name);

	return error;
}
#else /* CONFIG_ANON_VMA_NAME */
/*
* Stub for !CONFIG_ANON_VMA_NAME: no PR_SET_VMA sub-operation is supported,
* so every request fails with -EINVAL.
*/
static int prctl_set_vma(unsigned long opt, unsigned long start,
unsigned long size, unsigned long arg)
{
return -EINVAL;
}
#endif /* CONFIG_ANON_VMA_NAME */
SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
unsigned long, arg4, unsigned long, arg5)
{
@@ -2532,6 +2592,9 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
error = sched_core_share_pid(arg2, arg3, arg4, arg5);
break;
#endif
case PR_SET_VMA:
error = prctl_set_vma(arg2, arg3, arg4, arg5);
break;
default:
error = -EINVAL;
break;

View File

@@ -900,6 +900,20 @@ config IO_MAPPING
config SECRETMEM
def_bool ARCH_HAS_SET_DIRECT_MAP && !EMBEDDED
config ANON_VMA_NAME
bool "Anonymous VMA name support"
depends on PROC_FS && ADVISE_SYSCALLS && MMU
help
Allow naming anonymous virtual memory areas.
This feature allows assigning names to virtual memory areas. Assigned
names can later be retrieved from /proc/pid/maps and /proc/pid/smaps
and help identify individual anonymous memory areas.
Assigning a name to an anonymous virtual memory area might prevent that
area from being merged with adjacent virtual memory areas due to the
difference in their names.
source "mm/damon/Kconfig"
endmenu

View File

@@ -15,6 +15,7 @@
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/fs.h>
#include <linux/mman.h>
#include <linux/sched.h>

View File

@@ -18,6 +18,8 @@
#include <linux/fadvise.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/mm_inline.h>
#include <linux/string.h>
#include <linux/uio.h>
#include <linux/ksm.h>
#include <linux/fs.h>
@@ -62,19 +64,114 @@ static int madvise_need_mmap_write(int behavior)
}
}
#ifdef CONFIG_ANON_VMA_NAME
/*
 * Allocate a new anon_vma_name holding a private copy of @name, with its
 * reference count initialised by kref_init().
 *
 * Returns the new object, or NULL if the allocation fails.
 */
static struct anon_vma_name *anon_vma_name_alloc(const char *name)
{
	/* Copy length includes the terminating NUL. */
	size_t len = strlen(name) + 1;
	struct anon_vma_name *anon_name;

	anon_name = kmalloc(struct_size(anon_name, name, len), GFP_KERNEL);
	if (!anon_name)
		return NULL;

	kref_init(&anon_name->kref);
	memcpy(anon_name->name, name, len);

	return anon_name;
}
/*
 * kref release callback: free the anon_vma_name that embeds @kref.
 * Passed to kref_put() by free_vma_anon_name().
 */
static void vma_anon_name_free(struct kref *kref)
{
	kfree(container_of(kref, struct anon_vma_name, kref));
}
/*
 * Report whether @vma carries an anonymous name: it must have no backing
 * file and a non-NULL anon_name pointer.
 */
static inline bool has_vma_anon_name(struct vm_area_struct *vma)
{
	if (vma->vm_file)
		return false;

	return vma->anon_name != NULL;
}
/*
 * Return the name string attached to @vma, or NULL if it has none.
 *
 * When a name is present, mmap_assert_locked() is applied to @vma->vm_mm
 * before the string is returned.
 */
const char *vma_anon_name(struct vm_area_struct *vma)
{
	if (has_vma_anon_name(vma)) {
		mmap_assert_locked(vma->vm_mm);
		return vma->anon_name->name;
	}

	return NULL;
}
/*
 * Make @new_vma share @orig_vma's name: take an extra reference on the name
 * object and point @new_vma at it. Does nothing if @orig_vma is unnamed.
 */
void dup_vma_anon_name(struct vm_area_struct *orig_vma,
		       struct vm_area_struct *new_vma)
{
	if (has_vma_anon_name(orig_vma)) {
		kref_get(&orig_vma->anon_name->kref);
		new_vma->anon_name = orig_vma->anon_name;
	}
}
/*
 * Detach the name from @vma and drop the reference it held. The name object
 * is freed through vma_anon_name_free() once the last reference is gone.
 * Does nothing if @vma is unnamed.
 */
void free_vma_anon_name(struct vm_area_struct *vma)
{
	if (has_vma_anon_name(vma)) {
		struct anon_vma_name *old = vma->anon_name;

		/* Clear the pointer before the reference is dropped. */
		vma->anon_name = NULL;
		kref_put(&old->kref, vma_anon_name_free);
	}
}
/*
 * Set, change or clear the anonymous name of @vma.
 *
 * A NULL @name drops the current name. Otherwise, unless @vma already
 * carries an equal name, a new anon_vma_name is allocated for it.
 *
 * The replacement is allocated before the old name is released, so an
 * allocation failure leaves @vma's existing name untouched instead of
 * silently stripping it.
 *
 * mmap_lock should be write-locked.
 *
 * Returns 0 on success, or -ENOMEM if the new name cannot be allocated.
 */
static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
{
	struct anon_vma_name *new_name;
	const char *cur_name;

	if (!name) {
		free_vma_anon_name(vma);
		return 0;
	}

	cur_name = vma_anon_name(vma);
	/* Same name, nothing to do here */
	if (cur_name && !strcmp(name, cur_name))
		return 0;

	new_name = anon_vma_name_alloc(name);
	if (!new_name)
		return -ENOMEM;

	/* Only release the old name once the new one is guaranteed. */
	if (cur_name)
		free_vma_anon_name(vma);
	vma->anon_name = new_name;

	return 0;
}
#else /* CONFIG_ANON_VMA_NAME */
/*
 * Stub for !CONFIG_ANON_VMA_NAME: clearing a name (NULL @name) trivially
 * succeeds, while any attempt to set one is rejected with -EINVAL.
 */
static int replace_vma_anon_name(struct vm_area_struct *vma, const char *name)
{
	return name ? -EINVAL : 0;
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
* Update the vm_flags and/or the anonymous name on a region of a vma,
* splitting it or merging it as necessary. Must be called with mmap_lock
* held for writing.
*/
static int madvise_update_vma(struct vm_area_struct *vma,
struct vm_area_struct **prev, unsigned long start,
unsigned long end, unsigned long new_flags)
unsigned long end, unsigned long new_flags,
const char *name)
{
struct mm_struct *mm = vma->vm_mm;
int error;
pgoff_t pgoff;
if (new_flags == vma->vm_flags) {
if (new_flags == vma->vm_flags && is_same_vma_anon_name(vma, name)) {
*prev = vma;
return 0;
}
@@ -82,7 +179,7 @@ static int madvise_update_vma(struct vm_area_struct *vma,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, new_flags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, name);
if (*prev) {
vma = *prev;
goto success;
@@ -111,6 +208,11 @@ success:
* vm_flags is protected by the mmap_lock held in write mode.
*/
vma->vm_flags = new_flags;
if (!vma->vm_file) {
error = replace_vma_anon_name(vma, name);
if (error)
return error;
}
return 0;
}
@@ -938,7 +1040,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
break;
}
error = madvise_update_vma(vma, prev, start, end, new_flags);
error = madvise_update_vma(vma, prev, start, end, new_flags,
vma_anon_name(vma));
out:
/*
@@ -1118,6 +1221,55 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start,
return unmapped_error;
}
#ifdef CONFIG_ANON_VMA_NAME
/*
 * Per-VMA callback handed to madvise_walk_vmas() by madvise_set_anon_name():
 * apply the name carried in @name (a const char * cast to unsigned long) to
 * the [@start, @end) part of @vma, leaving vm_flags unchanged.
 *
 * Returns -EBADF for file-backed mappings and -EAGAIN where
 * madvise_update_vma() reported -ENOMEM; any other result is passed through.
 */
static int madvise_vma_anon_name(struct vm_area_struct *vma,
				 struct vm_area_struct **prev,
				 unsigned long start, unsigned long end,
				 unsigned long name)
{
	const char *new_name = (const char *)name;
	int error;

	/* Only anonymous mappings can be named */
	if (vma->vm_file)
		return -EBADF;

	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   new_name);

	/*
	 * madvise() returns EAGAIN if kernel resources, such as
	 * slab, are temporarily unavailable.
	 */
	return error == -ENOMEM ? -EAGAIN : error;
}
/*
 * Apply @name (or clear the name when @name is NULL) to every VMA in the
 * range starting at @start and spanning @len_in bytes, rounded up to a whole
 * number of pages.
 *
 * Returns -EINVAL if @start is not page aligned or the range wraps around,
 * 0 for an empty range, otherwise the result of madvise_walk_vmas().
 */
int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
			  unsigned long len_in, const char *name)
{
	unsigned long len, end;

	if (start & ~PAGE_MASK)
		return -EINVAL;

	/* Round the length up to the next page boundary. */
	len = (len_in + ~PAGE_MASK) & PAGE_MASK;

	/* Check to see whether len was rounded up from small -ve to zero */
	if (!len && len_in)
		return -EINVAL;

	end = start + len;
	if (end < start)
		return -EINVAL;

	/* Empty range: nothing to name. */
	if (end == start)
		return 0;

	return madvise_walk_vmas(mm, start, end, (unsigned long)name,
				 madvise_vma_anon_name);
}
#endif /* CONFIG_ANON_VMA_NAME */
/*
* The madvise(2) system call.
*

View File

@@ -3,6 +3,7 @@
#include <linux/hugetlb.h>
#include <linux/bitops.h>
#include <linux/mmu_notifier.h>
#include <linux/mm_inline.h>
#include <asm/cacheflush.h>
#include <asm/tlbflush.h>

View File

@@ -41,6 +41,7 @@
#include <linux/kernel_stat.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/sched/mm.h>
#include <linux/sched/coredump.h>
#include <linux/sched/numa_balancing.h>

View File

@@ -810,7 +810,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start,
((vmstart - vma->vm_start) >> PAGE_SHIFT);
prev = vma_merge(mm, prev, vmstart, vmend, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff,
new_pol, vma->vm_userfaultfd_ctx);
new_pol, vma->vm_userfaultfd_ctx,
vma_anon_name(vma));
if (prev) {
vma = prev;
next = vma->vm_next;

View File

@@ -512,7 +512,7 @@ static int mlock_fixup(struct vm_area_struct *vma, struct vm_area_struct **prev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*prev = vma_merge(mm, *prev, start, end, newflags, vma->anon_vma,
vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (*prev) {
vma = *prev;
goto success;

View File

@@ -13,6 +13,7 @@
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/mm.h>
#include <linux/mm_inline.h>
#include <linux/vmacache.h>
#include <linux/shm.h>
#include <linux/mman.h>
@@ -1029,7 +1030,8 @@ again:
*/
static inline int is_mergeable_vma(struct vm_area_struct *vma,
struct file *file, unsigned long vm_flags,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
const char *anon_name)
{
/*
* VM_SOFTDIRTY should not prevent from VMA merging, if we
@@ -1047,6 +1049,8 @@ static inline int is_mergeable_vma(struct vm_area_struct *vma,
return 0;
if (!is_mergeable_vm_userfaultfd_ctx(vma, vm_userfaultfd_ctx))
return 0;
if (!is_same_vma_anon_name(vma, anon_name))
return 0;
return 1;
}
@@ -1079,9 +1083,10 @@ static int
can_vma_merge_before(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
const char *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
if (vma->vm_pgoff == vm_pgoff)
return 1;
@@ -1100,9 +1105,10 @@ static int
can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t vm_pgoff,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
const char *anon_name)
{
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx) &&
if (is_mergeable_vma(vma, file, vm_flags, vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(anon_vma, vma->anon_vma, vma)) {
pgoff_t vm_pglen;
vm_pglen = vma_pages(vma);
@@ -1113,9 +1119,9 @@ can_vma_merge_after(struct vm_area_struct *vma, unsigned long vm_flags,
}
/*
* Given a mapping request (addr,end,vm_flags,file,pgoff), figure out
* whether that can be merged with its predecessor or its successor.
* Or both (it neatly fills a hole).
* Given a mapping request (addr,end,vm_flags,file,pgoff,anon_name),
* figure out whether that can be merged with its predecessor or its
* successor. Or both (it neatly fills a hole).
*
* In most cases - when called for mmap, brk or mremap - [addr,end) is
* certain not to be mapped by the time vma_merge is called; but when
@@ -1160,7 +1166,8 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
unsigned long end, unsigned long vm_flags,
struct anon_vma *anon_vma, struct file *file,
pgoff_t pgoff, struct mempolicy *policy,
struct vm_userfaultfd_ctx vm_userfaultfd_ctx)
struct vm_userfaultfd_ctx vm_userfaultfd_ctx,
const char *anon_name)
{
pgoff_t pglen = (end - addr) >> PAGE_SHIFT;
struct vm_area_struct *area, *next;
@@ -1190,7 +1197,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(vma_policy(prev), policy) &&
can_vma_merge_after(prev, vm_flags,
anon_vma, file, pgoff,
vm_userfaultfd_ctx)) {
vm_userfaultfd_ctx, anon_name)) {
/*
* OK, it can. Can we now merge in the successor as well?
*/
@@ -1199,7 +1206,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
can_vma_merge_before(next, vm_flags,
anon_vma, file,
pgoff+pglen,
vm_userfaultfd_ctx) &&
vm_userfaultfd_ctx, anon_name) &&
is_mergeable_anon_vma(prev->anon_vma,
next->anon_vma, NULL)) {
/* cases 1, 6 */
@@ -1222,7 +1229,7 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm,
mpol_equal(policy, vma_policy(next)) &&
can_vma_merge_before(next, vm_flags,
anon_vma, file, pgoff+pglen,
vm_userfaultfd_ctx)) {
vm_userfaultfd_ctx, anon_name)) {
if (prev && addr < prev->vm_end) /* case 4 */
err = __vma_adjust(prev, prev->vm_start,
addr, prev->vm_pgoff, NULL, next);
@@ -1754,7 +1761,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
* Can we just expand an old mapping?
*/
vma = vma_merge(mm, prev, addr, addr + len, vm_flags,
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX);
NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -1803,7 +1810,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
*/
if (unlikely(vm_flags != vma->vm_flags && prev)) {
merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags,
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX);
NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (merge) {
/* ->mmap() can change vma->vm_file and fput the original file. So
* fput the vma->vm_file here or we would add an extra fput for file
@@ -3056,7 +3063,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla
/* Can we just expand an old private anonymous mapping? */
vma = vma_merge(mm, prev, addr, addr + len, flags,
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX);
NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL);
if (vma)
goto out;
@@ -3142,25 +3149,27 @@ void exit_mmap(struct mm_struct *mm)
* to mmu_notifier_release(mm) ensures mmu notifier callbacks in
* __oom_reap_task_mm() will not block.
*
* This needs to be done before calling munlock_vma_pages_all(),
* This needs to be done before calling unlock_range(),
* which clears VM_LOCKED, otherwise the oom reaper cannot
* reliably test it.
*/
(void)__oom_reap_task_mm(mm);
set_bit(MMF_OOM_SKIP, &mm->flags);
mmap_write_lock(mm);
mmap_write_unlock(mm);
}
mmap_write_lock(mm);
if (mm->locked_vm)
unlock_range(mm->mmap, ULONG_MAX);
arch_exit_mmap(mm);
vma = mm->mmap;
if (!vma) /* Can happen if dup_mmap() received an OOM */
if (!vma) {
/* Can happen if dup_mmap() received an OOM */
mmap_write_unlock(mm);
return;
}
lru_add_drain();
flush_cache_mm(mm);
@@ -3171,16 +3180,14 @@ void exit_mmap(struct mm_struct *mm)
free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING);
tlb_finish_mmu(&tlb);
/*
* Walk the list again, actually closing and freeing it,
* with preemption enabled, without holding any MM locks.
*/
/* Walk the list again, actually closing and freeing it. */
while (vma) {
if (vma->vm_flags & VM_ACCOUNT)
nr_accounted += vma_pages(vma);
vma = remove_vma(vma);
cond_resched();
}
mmap_write_unlock(mm);
vm_unacct_memory(nr_accounted);
}
@@ -3249,7 +3256,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap,
return NULL; /* should never get here */
new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (new_vma) {
/*
* Source vma may have been merged into new_vma

View File

@@ -3,6 +3,7 @@
#include <linux/kernel.h>
#include <linux/mmdebug.h>
#include <linux/mm_types.h>
#include <linux/mm_inline.h>
#include <linux/pagemap.h>
#include <linux/rcupdate.h>
#include <linux/smp.h>

View File

@@ -464,7 +464,7 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
pgoff = vma->vm_pgoff + ((start - vma->vm_start) >> PAGE_SHIFT);
*pprev = vma_merge(mm, *pprev, start, end, newflags,
vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx);
vma->vm_userfaultfd_ctx, vma_anon_name(vma));
if (*pprev) {
vma = *pprev;
VM_WARN_ON((vma->vm_flags ^ newflags) & ~VM_SOFTDIRTY);

View File

@@ -1170,15 +1170,15 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
goto put_task;
}
if (mmget_not_zero(p->mm)) {
mm = p->mm;
if (task_will_free_mem(p))
reap = true;
else {
/* Error only if the work has not been done already */
if (!test_bit(MMF_OOM_SKIP, &mm->flags))
ret = -EINVAL;
}
mm = p->mm;
mmgrab(mm);
if (task_will_free_mem(p))
reap = true;
else {
/* Error only if the work has not been done already */
if (!test_bit(MMF_OOM_SKIP, &mm->flags))
ret = -EINVAL;
}
task_unlock(p);
@@ -1189,13 +1189,16 @@ SYSCALL_DEFINE2(process_mrelease, int, pidfd, unsigned int, flags)
ret = -EINTR;
goto drop_mm;
}
if (!__oom_reap_task_mm(mm))
/*
* Check MMF_OOM_SKIP again under mmap_read_lock protection to ensure
* possible change in exit_mmap is seen
*/
if (!test_bit(MMF_OOM_SKIP, &mm->flags) && !__oom_reap_task_mm(mm))
ret = -EAGAIN;
mmap_read_unlock(mm);
drop_mm:
if (mm)
mmput(mm);
mmdrop(mm);
put_task:
put_task_struct(task);
return ret;

View File

@@ -10,6 +10,7 @@
#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/mm_inline.h>
#include <asm/tlb.h>
/*