mirror of
https://github.com/hardkernel/linux.git
synced 2026-06-06 02:50:49 +09:00
Merge 57eb60c04d ("tools/vm/page_owner_sort: fix -f option") into android-mainline
Steps on the way to 6.1-rc1 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: I106856c416425a16c8aa8851e19d2f4f9e9efc51
This commit is contained in:
@@ -5,10 +5,10 @@ CMA Debugfs Interface
|
||||
The CMA debugfs interface is useful to retrieve basic information out of the
|
||||
different CMA areas and to test allocation/release in each of the areas.
|
||||
|
||||
Each CMA zone represents a directory under <debugfs>/cma/, indexed by the
|
||||
kernel's CMA index. So the first CMA zone would be:
|
||||
Each CMA area is represented by a directory under <debugfs>/cma/, identified by
|
||||
its CMA name like below:
|
||||
|
||||
<debugfs>/cma/cma-0
|
||||
<debugfs>/cma/<cma_name>
|
||||
|
||||
The structure of the files created under that directory is as follows:
|
||||
|
||||
@@ -18,8 +18,8 @@ The structure of the files created under that directory is as follows:
|
||||
- [RO] bitmap: The bitmap of page states in the zone.
|
||||
- [WO] alloc: Allocate N pages from that CMA area. For example::
|
||||
|
||||
echo 5 > <debugfs>/cma/cma-2/alloc
|
||||
echo 5 > <debugfs>/cma/<cma_name>/alloc
|
||||
|
||||
would try to allocate 5 pages from the cma-2 area.
|
||||
would try to allocate 5 pages from the 'cma_name' area.
|
||||
|
||||
- [WO] free: Free N pages from that CMA area, similar to the above.
|
||||
|
||||
@@ -17,7 +17,10 @@ of the ``PROT_NONE+SIGSEGV`` trick.
|
||||
Design
|
||||
======
|
||||
|
||||
Userfaults are delivered and resolved through the ``userfaultfd`` syscall.
|
||||
Userspace creates a new userfaultfd, initializes it, and registers one or more
|
||||
regions of virtual memory with it. Then, any page faults which occur within the
|
||||
region(s) result in a message being delivered to the userfaultfd, notifying
|
||||
userspace of the fault.
|
||||
|
||||
The ``userfaultfd`` (aside from registering and unregistering virtual
|
||||
memory ranges) provides two primary functionalities:
|
||||
@@ -34,12 +37,11 @@ The real advantage of userfaults if compared to regular virtual memory
|
||||
management of mremap/mprotect is that the userfaults in all their
|
||||
operations never involve heavyweight structures like vmas (in fact the
|
||||
``userfaultfd`` runtime load never takes the mmap_lock for writing).
|
||||
|
||||
Vmas are not suitable for page- (or hugepage) granular fault tracking
|
||||
when dealing with virtual address spaces that could span
|
||||
Terabytes. Too many vmas would be needed for that.
|
||||
|
||||
The ``userfaultfd`` once opened by invoking the syscall, can also be
|
||||
The ``userfaultfd``, once created, can also be
|
||||
passed using unix domain sockets to a manager process, so the same
|
||||
manager process could handle the userfaults of a multitude of
|
||||
different processes without them being aware of what is going on
|
||||
@@ -50,6 +52,39 @@ is a corner case that would currently return ``-EBUSY``).
|
||||
API
|
||||
===
|
||||
|
||||
Creating a userfaultfd
|
||||
----------------------
|
||||
|
||||
There are two ways to create a new userfaultfd, each of which provides ways to
|
||||
restrict access to this functionality (since historically userfaultfds which
|
||||
handle kernel page faults have been a useful tool for exploiting the kernel).
|
||||
|
||||
The first way, supported since userfaultfd was introduced, is the
|
||||
userfaultfd(2) syscall. Access to this is controlled in several ways:
|
||||
|
||||
- Any user can always create a userfaultfd which traps userspace page faults
|
||||
only. Such a userfaultfd can be created using the userfaultfd(2) syscall
|
||||
with the flag UFFD_USER_MODE_ONLY.
|
||||
|
||||
- In order to also trap kernel page faults for the address space, either the
|
||||
process needs the CAP_SYS_PTRACE capability, or the system must have
|
||||
vm.unprivileged_userfaultfd set to 1. By default, vm.unprivileged_userfaultfd
|
||||
is set to 0.
|
||||
|
||||
The second way, added to the kernel more recently, is by opening
|
||||
/dev/userfaultfd and issuing a USERFAULTFD_IOC_NEW ioctl to it. This method
|
||||
yields equivalent userfaultfds to the userfaultfd(2) syscall.
|
||||
|
||||
Unlike userfaultfd(2), access to /dev/userfaultfd is controlled via normal
|
||||
filesystem permissions (user/group/mode), which gives fine-grained access to
|
||||
userfaultfd specifically, without also granting other unrelated privileges at
|
||||
the same time (as e.g. granting CAP_SYS_PTRACE would do). Users who have access
|
||||
to /dev/userfaultfd can always create userfaultfds that trap kernel page faults;
|
||||
vm.unprivileged_userfaultfd is not considered.
|
||||
|
||||
Initializing a userfaultfd
|
||||
--------------------------
|
||||
|
||||
When first opened, the ``userfaultfd`` must be enabled by invoking the
|
||||
``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or
|
||||
a later API version) which will specify the ``read/POLLIN`` protocol
|
||||
|
||||
@@ -926,6 +926,9 @@ calls without any restrictions.
|
||||
|
||||
The default value is 0.
|
||||
|
||||
Another way to control permissions for userfaultfd is to use
|
||||
/dev/userfaultfd instead of userfaultfd(2). See
|
||||
Documentation/admin-guide/mm/userfaultfd.rst.
|
||||
|
||||
user_reserve_kbytes
|
||||
===================
|
||||
|
||||
@@ -76,6 +76,8 @@
|
||||
|
||||
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
||||
|
||||
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
|
||||
@@ -103,6 +103,8 @@
|
||||
|
||||
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
||||
|
||||
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
|
||||
@@ -70,6 +70,8 @@
|
||||
#define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */
|
||||
#define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */
|
||||
|
||||
#define MADV_COLLAPSE 73 /* Synchronous hugepage collapse */
|
||||
|
||||
#define MADV_HWPOISON 100 /* poison a page for testing */
|
||||
#define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */
|
||||
|
||||
|
||||
@@ -111,6 +111,8 @@
|
||||
|
||||
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
||||
|
||||
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
|
||||
@@ -864,7 +864,7 @@ static int show_smap(struct seq_file *m, void *v)
|
||||
__show_smap(m, &mss, false);
|
||||
|
||||
seq_printf(m, "THPeligible: %d\n",
|
||||
hugepage_vma_check(vma, vma->vm_flags, true, false));
|
||||
hugepage_vma_check(vma, vma->vm_flags, true, false, true));
|
||||
|
||||
if (arch_pkeys_enabled())
|
||||
seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma));
|
||||
|
||||
@@ -30,6 +30,7 @@
|
||||
#include <linux/security.h>
|
||||
#include <linux/hugetlb.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/miscdevice.h>
|
||||
|
||||
int sysctl_unprivileged_userfaultfd __read_mostly;
|
||||
|
||||
@@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason)
|
||||
|
||||
if (ctx->features & UFFD_FEATURE_SIGBUS)
|
||||
goto out;
|
||||
if ((vmf->flags & FAULT_FLAG_USER) == 0 &&
|
||||
ctx->flags & UFFD_USER_MODE_ONLY) {
|
||||
printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
|
||||
"sysctl knob to 1 if kernel faults must be handled "
|
||||
"without obtaining CAP_SYS_PTRACE capability\n");
|
||||
if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY))
|
||||
goto out;
|
||||
}
|
||||
|
||||
/*
|
||||
* If it's already released don't get it. This avoids to loop
|
||||
@@ -2056,20 +2052,11 @@ static void init_once_userfaultfd_ctx(void *mem)
|
||||
seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock);
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE1(userfaultfd, int, flags)
|
||||
static int new_userfaultfd(int flags)
|
||||
{
|
||||
struct userfaultfd_ctx *ctx;
|
||||
int fd;
|
||||
|
||||
if (!sysctl_unprivileged_userfaultfd &&
|
||||
(flags & UFFD_USER_MODE_ONLY) == 0 &&
|
||||
!capable(CAP_SYS_PTRACE)) {
|
||||
printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd "
|
||||
"sysctl knob to 1 if kernel faults must be handled "
|
||||
"without obtaining CAP_SYS_PTRACE capability\n");
|
||||
return -EPERM;
|
||||
}
|
||||
|
||||
BUG_ON(!current->mm);
|
||||
|
||||
/* Check the UFFD_* constants for consistency. */
|
||||
@@ -2102,8 +2089,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags)
|
||||
return fd;
|
||||
}
|
||||
|
||||
static inline bool userfaultfd_syscall_allowed(int flags)
|
||||
{
|
||||
/* Userspace-only page faults are always allowed */
|
||||
if (flags & UFFD_USER_MODE_ONLY)
|
||||
return true;
|
||||
|
||||
/*
|
||||
* The user is requesting a userfaultfd which can handle kernel faults.
|
||||
* Privileged users are always allowed to do this.
|
||||
*/
|
||||
if (capable(CAP_SYS_PTRACE))
|
||||
return true;
|
||||
|
||||
/* Otherwise, access to kernel fault handling is sysctl controlled. */
|
||||
return sysctl_unprivileged_userfaultfd;
|
||||
}
|
||||
|
||||
SYSCALL_DEFINE1(userfaultfd, int, flags)
|
||||
{
|
||||
if (!userfaultfd_syscall_allowed(flags))
|
||||
return -EPERM;
|
||||
|
||||
return new_userfaultfd(flags);
|
||||
}
|
||||
|
||||
static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags)
|
||||
{
|
||||
if (cmd != USERFAULTFD_IOC_NEW)
|
||||
return -EINVAL;
|
||||
|
||||
return new_userfaultfd(flags);
|
||||
}
|
||||
|
||||
static const struct file_operations userfaultfd_dev_fops = {
|
||||
.unlocked_ioctl = userfaultfd_dev_ioctl,
|
||||
.compat_ioctl = userfaultfd_dev_ioctl,
|
||||
.owner = THIS_MODULE,
|
||||
.llseek = noop_llseek,
|
||||
};
|
||||
|
||||
static struct miscdevice userfaultfd_misc = {
|
||||
.minor = MISC_DYNAMIC_MINOR,
|
||||
.name = "userfaultfd",
|
||||
.fops = &userfaultfd_dev_fops
|
||||
};
|
||||
|
||||
static int __init userfaultfd_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
ret = misc_register(&userfaultfd_misc);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache",
|
||||
sizeof(struct userfaultfd_ctx),
|
||||
0,
|
||||
|
||||
@@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma)
|
||||
!inode_is_open_for_write(inode) && S_ISREG(inode->i_mode);
|
||||
}
|
||||
|
||||
bool hugepage_vma_check(struct vm_area_struct *vma,
|
||||
unsigned long vm_flags,
|
||||
bool smaps, bool in_pf);
|
||||
bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
|
||||
bool smaps, bool in_pf, bool enforce_sysfs);
|
||||
|
||||
#define transparent_hugepage_use_zero_page() \
|
||||
(transparent_hugepage_flags & \
|
||||
@@ -219,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud,
|
||||
|
||||
int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags,
|
||||
int advice);
|
||||
int madvise_collapse(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end);
|
||||
void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start,
|
||||
unsigned long end, long adjust_next);
|
||||
spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma);
|
||||
@@ -321,8 +323,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma,
|
||||
}
|
||||
|
||||
static inline bool hugepage_vma_check(struct vm_area_struct *vma,
|
||||
unsigned long vm_flags,
|
||||
bool smaps, bool in_pf)
|
||||
unsigned long vm_flags, bool smaps,
|
||||
bool in_pf, bool enforce_sysfs)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
@@ -362,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma,
|
||||
static inline int hugepage_madvise(struct vm_area_struct *vma,
|
||||
unsigned long *vm_flags, int advice)
|
||||
{
|
||||
BUG();
|
||||
return 0;
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static inline int madvise_collapse(struct vm_area_struct *vma,
|
||||
struct vm_area_struct **prev,
|
||||
unsigned long start, unsigned long end)
|
||||
{
|
||||
return -EINVAL;
|
||||
}
|
||||
|
||||
static inline void vma_adjust_trans_huge(struct vm_area_struct *vma,
|
||||
unsigned long start,
|
||||
unsigned long end,
|
||||
|
||||
@@ -151,13 +151,6 @@ extern bool mempolicy_in_oom_domain(struct task_struct *tsk,
|
||||
const nodemask_t *mask);
|
||||
extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy);
|
||||
|
||||
static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
|
||||
{
|
||||
struct mempolicy *mpol = get_task_policy(current);
|
||||
|
||||
return policy_nodemask(gfp, mpol);
|
||||
}
|
||||
|
||||
extern unsigned int mempolicy_slab_node(void);
|
||||
|
||||
extern enum zone_type policy_zone;
|
||||
@@ -189,6 +182,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol)
|
||||
return (pol->mode == MPOL_PREFERRED_MANY);
|
||||
}
|
||||
|
||||
extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone);
|
||||
|
||||
#else
|
||||
|
||||
@@ -294,11 +288,6 @@ static inline void mpol_put_task_policy(struct task_struct *task)
|
||||
{
|
||||
}
|
||||
|
||||
static inline nodemask_t *policy_nodemask_current(gfp_t gfp)
|
||||
{
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static inline bool mpol_is_preferred_many(struct mempolicy *pol)
|
||||
{
|
||||
return false;
|
||||
|
||||
@@ -305,6 +305,8 @@ static inline bool is_active_lru(enum lru_list lru)
|
||||
return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE);
|
||||
}
|
||||
|
||||
#define WORKINGSET_ANON 0
|
||||
#define WORKINGSET_FILE 1
|
||||
#define ANON_AND_FILE 2
|
||||
|
||||
enum lruvec_flags {
|
||||
|
||||
@@ -11,6 +11,7 @@
|
||||
EM( SCAN_FAIL, "failed") \
|
||||
EM( SCAN_SUCCEED, "succeeded") \
|
||||
EM( SCAN_PMD_NULL, "pmd_null") \
|
||||
EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \
|
||||
EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \
|
||||
EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \
|
||||
EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \
|
||||
|
||||
@@ -77,6 +77,8 @@
|
||||
|
||||
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
||||
|
||||
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
|
||||
@@ -12,6 +12,10 @@
|
||||
|
||||
#include <linux/types.h>
|
||||
|
||||
/* ioctls for /dev/userfaultfd */
|
||||
#define USERFAULTFD_IOC 0xAA
|
||||
#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00)
|
||||
|
||||
/*
|
||||
* If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and
|
||||
* UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In
|
||||
|
||||
@@ -163,11 +163,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n");
|
||||
static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry)
|
||||
{
|
||||
struct dentry *tmp;
|
||||
char name[CMA_MAX_NAME];
|
||||
|
||||
scnprintf(name, sizeof(name), "cma-%s", cma->name);
|
||||
|
||||
tmp = debugfs_create_dir(name, root_dentry);
|
||||
tmp = debugfs_create_dir(cma->name, root_dentry);
|
||||
|
||||
debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops);
|
||||
debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops);
|
||||
|
||||
@@ -1053,7 +1053,7 @@ static int __init __damon_dbgfs_init(void)
|
||||
fops[i]);
|
||||
dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]);
|
||||
|
||||
dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL);
|
||||
dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL);
|
||||
if (!dbgfs_dirs) {
|
||||
debugfs_remove(dbgfs_root);
|
||||
return -ENOMEM;
|
||||
|
||||
16
mm/filemap.c
16
mm/filemap.c
@@ -1633,24 +1633,26 @@ EXPORT_SYMBOL(folio_end_writeback);
|
||||
*/
|
||||
void page_endio(struct page *page, bool is_write, int err)
|
||||
{
|
||||
struct folio *folio = page_folio(page);
|
||||
|
||||
if (!is_write) {
|
||||
if (!err) {
|
||||
SetPageUptodate(page);
|
||||
folio_mark_uptodate(folio);
|
||||
} else {
|
||||
ClearPageUptodate(page);
|
||||
SetPageError(page);
|
||||
folio_clear_uptodate(folio);
|
||||
folio_set_error(folio);
|
||||
}
|
||||
unlock_page(page);
|
||||
folio_unlock(folio);
|
||||
} else {
|
||||
if (err) {
|
||||
struct address_space *mapping;
|
||||
|
||||
SetPageError(page);
|
||||
mapping = page_mapping(page);
|
||||
folio_set_error(folio);
|
||||
mapping = folio_mapping(folio);
|
||||
if (mapping)
|
||||
mapping_set_error(mapping, err);
|
||||
}
|
||||
end_page_writeback(page);
|
||||
folio_end_writeback(folio);
|
||||
}
|
||||
}
|
||||
EXPORT_SYMBOL_GPL(page_endio);
|
||||
|
||||
@@ -70,9 +70,8 @@ static atomic_t huge_zero_refcount;
|
||||
struct page *huge_zero_page __read_mostly;
|
||||
unsigned long huge_zero_pfn __read_mostly = ~0UL;
|
||||
|
||||
bool hugepage_vma_check(struct vm_area_struct *vma,
|
||||
unsigned long vm_flags,
|
||||
bool smaps, bool in_pf)
|
||||
bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags,
|
||||
bool smaps, bool in_pf, bool enforce_sysfs)
|
||||
{
|
||||
if (!vma->vm_mm) /* vdso */
|
||||
return false;
|
||||
@@ -121,11 +120,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma,
|
||||
if (!in_pf && shmem_file(vma->vm_file))
|
||||
return shmem_huge_enabled(vma);
|
||||
|
||||
if (!hugepage_flags_enabled())
|
||||
return false;
|
||||
|
||||
/* THP settings require madvise. */
|
||||
if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always())
|
||||
/* Enforce sysfs THP requirements as necessary */
|
||||
if (enforce_sysfs &&
|
||||
(!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) &&
|
||||
!hugepage_flags_always())))
|
||||
return false;
|
||||
|
||||
/* Only regular file is valid */
|
||||
@@ -2288,25 +2286,11 @@ out:
|
||||
void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address,
|
||||
bool freeze, struct folio *folio)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd;
|
||||
pmd_t *pmd = mm_find_pmd(vma->vm_mm, address);
|
||||
|
||||
pgd = pgd_offset(vma->vm_mm, address);
|
||||
if (!pgd_present(*pgd))
|
||||
if (!pmd)
|
||||
return;
|
||||
|
||||
p4d = p4d_offset(pgd, address);
|
||||
if (!p4d_present(*p4d))
|
||||
return;
|
||||
|
||||
pud = pud_offset(p4d, address);
|
||||
if (!pud_present(*pud))
|
||||
return;
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
|
||||
__split_huge_pmd(vma, pmd, address, freeze, folio);
|
||||
}
|
||||
|
||||
|
||||
24
mm/hugetlb.c
24
mm/hugetlb.c
@@ -4332,18 +4332,34 @@ static int __init default_hugepagesz_setup(char *s)
|
||||
}
|
||||
__setup("default_hugepagesz=", default_hugepagesz_setup);
|
||||
|
||||
static nodemask_t *policy_mbind_nodemask(gfp_t gfp)
|
||||
{
|
||||
#ifdef CONFIG_NUMA
|
||||
struct mempolicy *mpol = get_task_policy(current);
|
||||
|
||||
/*
|
||||
* Only enforce MPOL_BIND policy which overlaps with cpuset policy
|
||||
* (from policy_nodemask) specifically for hugetlb case
|
||||
*/
|
||||
if (mpol->mode == MPOL_BIND &&
|
||||
(apply_policy_zone(mpol, gfp_zone(gfp)) &&
|
||||
cpuset_nodemask_valid_mems_allowed(&mpol->nodes)))
|
||||
return &mpol->nodes;
|
||||
#endif
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static unsigned int allowed_mems_nr(struct hstate *h)
|
||||
{
|
||||
int node;
|
||||
unsigned int nr = 0;
|
||||
nodemask_t *mpol_allowed;
|
||||
nodemask_t *mbind_nodemask;
|
||||
unsigned int *array = h->free_huge_pages_node;
|
||||
gfp_t gfp_mask = htlb_alloc_mask(h);
|
||||
|
||||
mpol_allowed = policy_nodemask_current(gfp_mask);
|
||||
|
||||
mbind_nodemask = policy_mbind_nodemask(gfp_mask);
|
||||
for_each_node_mask(node, cpuset_current_mems_allowed) {
|
||||
if (!mpol_allowed || node_isset(node, *mpol_allowed))
|
||||
if (!mbind_nodemask || node_isset(node, *mbind_nodemask))
|
||||
nr += array[node];
|
||||
}
|
||||
|
||||
|
||||
@@ -187,7 +187,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason
|
||||
/*
|
||||
* in mm/rmap.c:
|
||||
*/
|
||||
extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
|
||||
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address);
|
||||
|
||||
/*
|
||||
* in mm/page_alloc.c
|
||||
|
||||
763
mm/khugepaged.c
763
mm/khugepaged.c
File diff suppressed because it is too large
Load Diff
10
mm/ksm.c
10
mm/ksm.c
@@ -1134,6 +1134,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
|
||||
{
|
||||
struct mm_struct *mm = vma->vm_mm;
|
||||
pmd_t *pmd;
|
||||
pmd_t pmde;
|
||||
pte_t *ptep;
|
||||
pte_t newpte;
|
||||
spinlock_t *ptl;
|
||||
@@ -1148,6 +1149,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page,
|
||||
pmd = mm_find_pmd(mm, addr);
|
||||
if (!pmd)
|
||||
goto out;
|
||||
/*
|
||||
* Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
|
||||
* without holding anon_vma lock for write. So when looking for a
|
||||
* genuine pmde (in which to find pte), test present and !THP together.
|
||||
*/
|
||||
pmde = *pmd;
|
||||
barrier();
|
||||
if (!pmd_present(pmde) || pmd_trans_huge(pmde))
|
||||
goto out;
|
||||
|
||||
mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr,
|
||||
addr + PAGE_SIZE);
|
||||
|
||||
@@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior)
|
||||
case MADV_FREE:
|
||||
case MADV_POPULATE_READ:
|
||||
case MADV_POPULATE_WRITE:
|
||||
case MADV_COLLAPSE:
|
||||
return 0;
|
||||
default:
|
||||
/* be safe, default to 1. list exceptions explicitly */
|
||||
@@ -1060,6 +1061,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma,
|
||||
if (error)
|
||||
goto out;
|
||||
break;
|
||||
case MADV_COLLAPSE:
|
||||
return madvise_collapse(vma, prev, start, end);
|
||||
}
|
||||
|
||||
anon_name = anon_vma_name(vma);
|
||||
@@ -1153,6 +1156,7 @@ madvise_behavior_valid(int behavior)
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
case MADV_HUGEPAGE:
|
||||
case MADV_NOHUGEPAGE:
|
||||
case MADV_COLLAPSE:
|
||||
#endif
|
||||
case MADV_DONTDUMP:
|
||||
case MADV_DODUMP:
|
||||
@@ -1169,13 +1173,13 @@ madvise_behavior_valid(int behavior)
|
||||
}
|
||||
}
|
||||
|
||||
static bool
|
||||
process_madvise_behavior_valid(int behavior)
|
||||
static bool process_madvise_behavior_valid(int behavior)
|
||||
{
|
||||
switch (behavior) {
|
||||
case MADV_COLD:
|
||||
case MADV_PAGEOUT:
|
||||
case MADV_WILLNEED:
|
||||
case MADV_COLLAPSE:
|
||||
return true;
|
||||
default:
|
||||
return false;
|
||||
@@ -1342,6 +1346,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start,
|
||||
* MADV_NOHUGEPAGE - mark the given range as not worth being backed by
|
||||
* transparent huge pages so the existing pages will not be
|
||||
* coalesced into THP and new pages will not be allocated as THP.
|
||||
* MADV_COLLAPSE - synchronously coalesce pages into new THP.
|
||||
* MADV_DONTDUMP - the application wants to prevent pages in the given range
|
||||
* from being included in its core dump.
|
||||
* MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump.
|
||||
|
||||
@@ -1529,20 +1529,18 @@ static int identify_page_state(unsigned long pfn, struct page *p,
|
||||
return page_action(ps, p, pfn);
|
||||
}
|
||||
|
||||
static int try_to_split_thp_page(struct page *page, const char *msg)
|
||||
static int try_to_split_thp_page(struct page *page)
|
||||
{
|
||||
lock_page(page);
|
||||
if (unlikely(split_huge_page(page))) {
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
int ret;
|
||||
|
||||
unlock_page(page);
|
||||
pr_info("%s: %#lx: thp split failed\n", msg, pfn);
|
||||
put_page(page);
|
||||
return -EBUSY;
|
||||
}
|
||||
lock_page(page);
|
||||
ret = split_huge_page(page);
|
||||
unlock_page(page);
|
||||
|
||||
return 0;
|
||||
if (unlikely(ret))
|
||||
put_page(page);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn,
|
||||
@@ -2084,7 +2082,7 @@ try_again:
|
||||
* page is a valid handlable page.
|
||||
*/
|
||||
SetPageHasHWPoisoned(hpage);
|
||||
if (try_to_split_thp_page(p, "Memory Failure") < 0) {
|
||||
if (try_to_split_thp_page(p) < 0) {
|
||||
action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
|
||||
res = -EBUSY;
|
||||
goto unlock_mutex;
|
||||
@@ -2510,8 +2508,11 @@ static int soft_offline_in_use_page(struct page *page)
|
||||
struct page *hpage = compound_head(page);
|
||||
|
||||
if (!PageHuge(page) && PageTransHuge(hpage))
|
||||
if (try_to_split_thp_page(page, "soft offline") < 0)
|
||||
if (try_to_split_thp_page(page) < 0) {
|
||||
pr_info("soft offline: %#lx: thp split failed\n",
|
||||
page_to_pfn(page));
|
||||
return -EBUSY;
|
||||
}
|
||||
return __soft_offline_page(page);
|
||||
}
|
||||
|
||||
|
||||
@@ -4992,7 +4992,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma,
|
||||
return VM_FAULT_OOM;
|
||||
retry_pud:
|
||||
if (pud_none(*vmf.pud) &&
|
||||
hugepage_vma_check(vma, vm_flags, false, true)) {
|
||||
hugepage_vma_check(vma, vm_flags, false, true, true)) {
|
||||
ret = create_huge_pud(&vmf);
|
||||
if (!(ret & VM_FAULT_FALLBACK))
|
||||
return ret;
|
||||
@@ -5026,7 +5026,7 @@ retry_pud:
|
||||
goto retry_pud;
|
||||
|
||||
if (pmd_none(*vmf.pmd) &&
|
||||
hugepage_vma_check(vma, vm_flags, false, true)) {
|
||||
hugepage_vma_check(vma, vm_flags, false, true, true)) {
|
||||
ret = create_huge_pmd(&vmf);
|
||||
if (!(ret & VM_FAULT_FALLBACK))
|
||||
return ret;
|
||||
|
||||
@@ -853,12 +853,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags,
|
||||
goto out;
|
||||
}
|
||||
|
||||
task_lock(current);
|
||||
ret = mpol_set_nodemask(new, nodes, scratch);
|
||||
if (ret) {
|
||||
task_unlock(current);
|
||||
mpol_put(new);
|
||||
goto out;
|
||||
}
|
||||
task_lock(current);
|
||||
|
||||
old = current->mempolicy;
|
||||
current->mempolicy = new;
|
||||
if (new && new->mode == MPOL_INTERLEAVE)
|
||||
@@ -1803,7 +1805,7 @@ bool vma_policy_mof(struct vm_area_struct *vma)
|
||||
return pol->flags & MPOL_F_MOF;
|
||||
}
|
||||
|
||||
static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
|
||||
bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone)
|
||||
{
|
||||
enum zone_type dynamic_policy_zone = policy_zone;
|
||||
|
||||
|
||||
@@ -2232,6 +2232,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len,
|
||||
*/
|
||||
pgoff = 0;
|
||||
get_area = shmem_get_unmapped_area;
|
||||
} else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) {
|
||||
/* Ensures that larger anonymous mappings are THP aligned. */
|
||||
get_area = thp_get_unmapped_area;
|
||||
}
|
||||
|
||||
addr = get_area(file, addr, len, pgoff, flags);
|
||||
|
||||
@@ -3010,7 +3010,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
|
||||
* i.e. orders < pageblock_order. If there are no local zones free,
|
||||
* the zonelists will be reiterated without ALLOC_NOFRAGMENT.
|
||||
*/
|
||||
if (alloc_flags & ALLOC_NOFRAGMENT)
|
||||
if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT)
|
||||
min_order = pageblock_order;
|
||||
|
||||
/*
|
||||
@@ -9023,7 +9023,7 @@ void *__init alloc_large_system_hash(const char *tablename,
|
||||
{
|
||||
unsigned long long max = high_limit;
|
||||
unsigned long log2qty, size;
|
||||
void *table = NULL;
|
||||
void *table;
|
||||
gfp_t gfp_flags;
|
||||
bool virt;
|
||||
bool huge;
|
||||
|
||||
@@ -336,7 +336,7 @@ static int __meminit online_page_ext(unsigned long start_pfn,
|
||||
}
|
||||
|
||||
static int __meminit offline_page_ext(unsigned long start_pfn,
|
||||
unsigned long nr_pages, int nid)
|
||||
unsigned long nr_pages)
|
||||
{
|
||||
unsigned long start, end, pfn;
|
||||
|
||||
@@ -362,11 +362,11 @@ static int __meminit page_ext_callback(struct notifier_block *self,
|
||||
break;
|
||||
case MEM_OFFLINE:
|
||||
offline_page_ext(mn->start_pfn,
|
||||
mn->nr_pages, mn->status_change_nid);
|
||||
mn->nr_pages);
|
||||
break;
|
||||
case MEM_CANCEL_ONLINE:
|
||||
offline_page_ext(mn->start_pfn,
|
||||
mn->nr_pages, mn->status_change_nid);
|
||||
mn->nr_pages);
|
||||
break;
|
||||
case MEM_GOING_OFFLINE:
|
||||
break;
|
||||
|
||||
@@ -28,7 +28,7 @@
|
||||
#include <linux/delayacct.h>
|
||||
#include "swap.h"
|
||||
|
||||
void end_swap_bio_write(struct bio *bio)
|
||||
static void end_swap_bio_write(struct bio *bio)
|
||||
{
|
||||
struct page *page = bio_first_page_all(bio);
|
||||
|
||||
@@ -202,7 +202,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc)
|
||||
end_page_writeback(page);
|
||||
goto out;
|
||||
}
|
||||
ret = __swap_writepage(page, wbc, end_swap_bio_write);
|
||||
ret = __swap_writepage(page, wbc);
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
@@ -332,8 +332,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc)
|
||||
return 0;
|
||||
}
|
||||
|
||||
int __swap_writepage(struct page *page, struct writeback_control *wbc,
|
||||
bio_end_io_t end_write_func)
|
||||
int __swap_writepage(struct page *page, struct writeback_control *wbc)
|
||||
{
|
||||
struct bio *bio;
|
||||
int ret;
|
||||
@@ -358,7 +357,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc,
|
||||
REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc),
|
||||
GFP_NOIO);
|
||||
bio->bi_iter.bi_sector = swap_page_sector(page);
|
||||
bio->bi_end_io = end_write_func;
|
||||
bio->bi_end_io = end_swap_bio_write;
|
||||
bio_add_page(bio, page, thp_size(page), 0);
|
||||
|
||||
bio_associate_blkg_from_page(bio, page);
|
||||
|
||||
15
mm/rmap.c
15
mm/rmap.c
@@ -770,13 +770,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma)
|
||||
return vma_address(page, vma);
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns the actual pmd_t* where we expect 'address' to be mapped from, or
|
||||
* NULL if it doesn't exist. No guarantees / checks on what the pmd_t*
|
||||
* represents.
|
||||
*/
|
||||
pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
|
||||
{
|
||||
pgd_t *pgd;
|
||||
p4d_t *p4d;
|
||||
pud_t *pud;
|
||||
pmd_t *pmd = NULL;
|
||||
pmd_t pmde;
|
||||
|
||||
pgd = pgd_offset(mm, address);
|
||||
if (!pgd_present(*pgd))
|
||||
@@ -791,15 +795,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address)
|
||||
goto out;
|
||||
|
||||
pmd = pmd_offset(pud, address);
|
||||
/*
|
||||
* Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at()
|
||||
* without holding anon_vma lock for write. So when looking for a
|
||||
* genuine pmde (in which to find pte), test present and !THP together.
|
||||
*/
|
||||
pmde = *pmd;
|
||||
barrier();
|
||||
if (!pmd_present(pmde) || pmd_trans_huge(pmde))
|
||||
pmd = NULL;
|
||||
out:
|
||||
return pmd;
|
||||
}
|
||||
|
||||
@@ -18,9 +18,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug)
|
||||
}
|
||||
void swap_write_unplug(struct swap_iocb *sio);
|
||||
int swap_writepage(struct page *page, struct writeback_control *wbc);
|
||||
void end_swap_bio_write(struct bio *bio);
|
||||
int __swap_writepage(struct page *page, struct writeback_control *wbc,
|
||||
bio_end_io_t end_write_func);
|
||||
int __swap_writepage(struct page *page, struct writeback_control *wbc);
|
||||
|
||||
/* linux/mm/swap_state.c */
|
||||
/* One swap address space for each 64M swap space */
|
||||
|
||||
@@ -859,10 +859,10 @@ int folio_mapcount(struct folio *folio)
|
||||
return atomic_read(&folio->_mapcount) + 1;
|
||||
|
||||
compound = folio_entire_mapcount(folio);
|
||||
nr = folio_nr_pages(folio);
|
||||
if (folio_test_hugetlb(folio))
|
||||
return compound;
|
||||
ret = compound;
|
||||
nr = folio_nr_pages(folio);
|
||||
for (i = 0; i < nr; i++)
|
||||
ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1;
|
||||
/* File pages have compound_mapcount included in _mapcount */
|
||||
|
||||
@@ -3238,7 +3238,7 @@ again:
|
||||
|
||||
refaults = lruvec_page_state(target_lruvec,
|
||||
WORKINGSET_ACTIVATE_ANON);
|
||||
if (refaults != target_lruvec->refaults[0] ||
|
||||
if (refaults != target_lruvec->refaults[WORKINGSET_ANON] ||
|
||||
inactive_is_low(target_lruvec, LRU_INACTIVE_ANON))
|
||||
sc->may_deactivate |= DEACTIVATE_ANON;
|
||||
else
|
||||
@@ -3251,7 +3251,7 @@ again:
|
||||
*/
|
||||
refaults = lruvec_page_state(target_lruvec,
|
||||
WORKINGSET_ACTIVATE_FILE);
|
||||
if (refaults != target_lruvec->refaults[1] ||
|
||||
if (refaults != target_lruvec->refaults[WORKINGSET_FILE] ||
|
||||
inactive_is_low(target_lruvec, LRU_INACTIVE_FILE))
|
||||
sc->may_deactivate |= DEACTIVATE_FILE;
|
||||
else
|
||||
@@ -3567,9 +3567,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat)
|
||||
|
||||
target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat);
|
||||
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON);
|
||||
target_lruvec->refaults[0] = refaults;
|
||||
target_lruvec->refaults[WORKINGSET_ANON] = refaults;
|
||||
refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE);
|
||||
target_lruvec->refaults[1] = refaults;
|
||||
target_lruvec->refaults[WORKINGSET_FILE] = refaults;
|
||||
}
|
||||
|
||||
/*
|
||||
|
||||
@@ -1555,6 +1555,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
|
||||
d_off += size;
|
||||
d_size -= size;
|
||||
|
||||
/*
|
||||
* Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic()
|
||||
* calls must occur in reverse order of calls to kmap_atomic().
|
||||
* So, to call kunmap_atomic(s_addr) we should first call
|
||||
* kunmap_atomic(d_addr). For more details see
|
||||
* https://lore.kernel.org/linux-mm/5512421D.4000603@samsung.com/
|
||||
*/
|
||||
if (s_off >= PAGE_SIZE) {
|
||||
kunmap_atomic(d_addr);
|
||||
kunmap_atomic(s_addr);
|
||||
@@ -2103,8 +2110,6 @@ unsigned long zs_compact(struct zs_pool *pool)
|
||||
|
||||
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
|
||||
class = pool->size_class[i];
|
||||
if (!class)
|
||||
continue;
|
||||
if (class->index != i)
|
||||
continue;
|
||||
pages_freed += __zs_compact(pool, class);
|
||||
@@ -2149,8 +2154,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker,
|
||||
|
||||
for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) {
|
||||
class = pool->size_class[i];
|
||||
if (!class)
|
||||
continue;
|
||||
if (class->index != i)
|
||||
continue;
|
||||
|
||||
@@ -2308,9 +2311,6 @@ void zs_destroy_pool(struct zs_pool *pool)
|
||||
int fg;
|
||||
struct size_class *class = pool->size_class[i];
|
||||
|
||||
if (!class)
|
||||
continue;
|
||||
|
||||
if (class->index != i)
|
||||
continue;
|
||||
|
||||
|
||||
@@ -1026,7 +1026,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
|
||||
SetPageReclaim(page);
|
||||
|
||||
/* start writeback */
|
||||
__swap_writepage(page, &wbc, end_swap_bio_write);
|
||||
__swap_writepage(page, &wbc);
|
||||
put_page(page);
|
||||
zswap_written_back_pages++;
|
||||
|
||||
|
||||
@@ -77,6 +77,8 @@
|
||||
|
||||
#define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */
|
||||
|
||||
#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */
|
||||
|
||||
/* compatibility flags */
|
||||
#define MAP_FILE 0
|
||||
|
||||
|
||||
@@ -14,6 +14,9 @@
|
||||
#ifndef MADV_PAGEOUT
|
||||
#define MADV_PAGEOUT 21
|
||||
#endif
|
||||
#ifndef MADV_COLLAPSE
|
||||
#define MADV_COLLAPSE 25
|
||||
#endif
|
||||
|
||||
#define BASE_ADDR ((void *)(1UL << 30))
|
||||
static unsigned long hpage_pmd_size;
|
||||
@@ -23,6 +26,11 @@ static int hpage_pmd_nr;
|
||||
#define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/"
|
||||
#define PID_SMAPS "/proc/self/smaps"
|
||||
|
||||
/*
 * Describes the collapse mechanism a test case is run against.
 */
struct collapse_context {
	/*
	 * Attempt to collapse @nr_hpages huge-page-sized ranges starting at
	 * @p, print @msg as the progress label, and report success or failure
	 * depending on whether the outcome matches @expect.
	 */
	void (*collapse)(const char *msg, char *p, int nr_hpages, bool expect);

	/*
	 * When true, tests expect collapse to be refused once a max_ptes_*
	 * limit is exceeded; when false, they expect it to go ahead anyway.
	 */
	bool enforce_pte_scan_limits;
};
|
||||
|
||||
enum thp_enabled {
|
||||
THP_ALWAYS,
|
||||
THP_MADVISE,
|
||||
@@ -90,18 +98,6 @@ struct settings {
|
||||
struct khugepaged_settings khugepaged;
|
||||
};
|
||||
|
||||
static struct settings default_settings = {
|
||||
.thp_enabled = THP_MADVISE,
|
||||
.thp_defrag = THP_DEFRAG_ALWAYS,
|
||||
.shmem_enabled = SHMEM_NEVER,
|
||||
.use_zero_page = 0,
|
||||
.khugepaged = {
|
||||
.defrag = 1,
|
||||
.alloc_sleep_millisecs = 10,
|
||||
.scan_sleep_millisecs = 10,
|
||||
},
|
||||
};
|
||||
|
||||
static struct settings saved_settings;
|
||||
static bool skip_settings_restore;
|
||||
|
||||
@@ -279,6 +275,39 @@ static void write_settings(struct settings *settings)
|
||||
write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan);
|
||||
}
|
||||
|
||||
#define MAX_SETTINGS_DEPTH 4
|
||||
static struct settings settings_stack[MAX_SETTINGS_DEPTH];
|
||||
static int settings_index;
|
||||
|
||||
static struct settings *current_settings(void)
|
||||
{
|
||||
if (!settings_index) {
|
||||
printf("Fail: No settings set");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
return settings_stack + settings_index - 1;
|
||||
}
|
||||
|
||||
static void push_settings(struct settings *settings)
|
||||
{
|
||||
if (settings_index >= MAX_SETTINGS_DEPTH) {
|
||||
printf("Fail: Settings stack exceeded");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
settings_stack[settings_index++] = *settings;
|
||||
write_settings(current_settings());
|
||||
}
|
||||
|
||||
static void pop_settings(void)
|
||||
{
|
||||
if (settings_index <= 0) {
|
||||
printf("Fail: Settings stack empty");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
--settings_index;
|
||||
write_settings(current_settings());
|
||||
}
|
||||
|
||||
static void restore_settings(int sig)
|
||||
{
|
||||
if (skip_settings_restore)
|
||||
@@ -322,14 +351,6 @@ static void save_settings(void)
|
||||
signal(SIGQUIT, restore_settings);
|
||||
}
|
||||
|
||||
static void adjust_settings(void)
|
||||
{
|
||||
|
||||
printf("Adjust settings...");
|
||||
write_settings(&default_settings);
|
||||
success("OK");
|
||||
}
|
||||
|
||||
#define MAX_LINE_LENGTH 500
|
||||
|
||||
static bool check_for_pattern(FILE *fp, char *pattern, char *buf)
|
||||
@@ -341,7 +362,7 @@ static bool check_for_pattern(FILE *fp, char *pattern, char *buf)
|
||||
return false;
|
||||
}
|
||||
|
||||
static bool check_huge(void *addr)
|
||||
static bool check_huge(void *addr, int nr_hpages)
|
||||
{
|
||||
bool thp = false;
|
||||
int ret;
|
||||
@@ -366,7 +387,7 @@ static bool check_huge(void *addr)
|
||||
goto err_out;
|
||||
|
||||
ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB",
|
||||
hpage_pmd_size >> 10);
|
||||
nr_hpages * (hpage_pmd_size >> 10));
|
||||
if (ret >= MAX_LINE_LENGTH) {
|
||||
printf("%s: Pattern is too long\n", __func__);
|
||||
exit(EXIT_FAILURE);
|
||||
@@ -434,12 +455,12 @@ err_out:
|
||||
return swap;
|
||||
}
|
||||
|
||||
static void *alloc_mapping(void)
|
||||
static void *alloc_mapping(int nr)
|
||||
{
|
||||
void *p;
|
||||
|
||||
p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE,
|
||||
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||
p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE,
|
||||
MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
|
||||
if (p != BASE_ADDR) {
|
||||
printf("Failed to allocate VMA at %p\n", BASE_ADDR);
|
||||
exit(EXIT_FAILURE);
|
||||
@@ -456,6 +477,25 @@ static void fill_memory(int *p, unsigned long start, unsigned long end)
|
||||
p[i * page_size / sizeof(*p)] = i + 0xdead0000;
|
||||
}
|
||||
|
||||
/*
|
||||
* Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with
|
||||
* validate_memory()'able contents.
|
||||
*/
|
||||
static void *alloc_hpage(void)
|
||||
{
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping(1);
|
||||
printf("Allocate huge page...");
|
||||
madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
if (check_huge(p, 1))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
return p;
|
||||
}
|
||||
|
||||
static void validate_memory(int *p, unsigned long start, unsigned long end)
|
||||
{
|
||||
int i;
|
||||
@@ -469,26 +509,59 @@ static void validate_memory(int *p, unsigned long start, unsigned long end)
|
||||
}
|
||||
}
|
||||
|
||||
static void madvise_collapse(const char *msg, char *p, int nr_hpages,
|
||||
bool expect)
|
||||
{
|
||||
int ret;
|
||||
struct settings settings = *current_settings();
|
||||
|
||||
printf("%s...", msg);
|
||||
/* Sanity check */
|
||||
if (!check_huge(p, 0)) {
|
||||
printf("Unexpected huge page\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
/*
|
||||
* Prevent khugepaged interference and tests that MADV_COLLAPSE
|
||||
* ignores /sys/kernel/mm/transparent_hugepage/enabled
|
||||
*/
|
||||
settings.thp_enabled = THP_NEVER;
|
||||
push_settings(&settings);
|
||||
|
||||
/* Clear VM_NOHUGEPAGE */
|
||||
madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
|
||||
ret = madvise(p, nr_hpages * hpage_pmd_size, MADV_COLLAPSE);
|
||||
if (((bool)ret) == expect)
|
||||
fail("Fail: Bad return value");
|
||||
else if (check_huge(p, nr_hpages) != expect)
|
||||
fail("Fail: check_huge()");
|
||||
else
|
||||
success("OK");
|
||||
|
||||
pop_settings();
|
||||
}
|
||||
|
||||
#define TICK 500000
|
||||
static bool wait_for_scan(const char *msg, char *p)
|
||||
static bool wait_for_scan(const char *msg, char *p, int nr_hpages)
|
||||
{
|
||||
int full_scans;
|
||||
int timeout = 6; /* 3 seconds */
|
||||
|
||||
/* Sanity check */
|
||||
if (check_huge(p)) {
|
||||
if (!check_huge(p, 0)) {
|
||||
printf("Unexpected huge page\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
|
||||
madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
|
||||
madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE);
|
||||
|
||||
/* Wait until the second full_scan completed */
|
||||
full_scans = read_num("khugepaged/full_scans") + 2;
|
||||
|
||||
printf("%s...", msg);
|
||||
while (timeout--) {
|
||||
if (check_huge(p))
|
||||
if (check_huge(p, nr_hpages))
|
||||
break;
|
||||
if (read_num("khugepaged/full_scans") >= full_scans)
|
||||
break;
|
||||
@@ -496,121 +569,121 @@ static bool wait_for_scan(const char *msg, char *p)
|
||||
usleep(TICK);
|
||||
}
|
||||
|
||||
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
|
||||
madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE);
|
||||
|
||||
return timeout == -1;
|
||||
}
|
||||
|
||||
/*
 * Collapse backend that waits for khugepaged to do the work.
 *
 * @msg:       progress label handed to wait_for_scan()
 * @p:         start of the region under test
 * @nr_hpages: number of huge-page-sized units covered by the region
 * @expect:    true if the region is expected to end up collapsed
 *
 * A timeout from wait_for_scan() counts as a pass only when no collapse
 * was expected.
 */
static void khugepaged_collapse(const char *msg, char *p, int nr_hpages,
				bool expect)
{
	bool timed_out = wait_for_scan(msg, p, nr_hpages);

	if (timed_out) {
		if (!expect)
			success("OK");
		else
			fail("Timeout");
		return;
	}

	if (check_huge(p, nr_hpages) != expect)
		fail("Fail");
	else
		success("OK");
}
|
||||
|
||||
static void alloc_at_fault(void)
|
||||
{
|
||||
struct settings settings = default_settings;
|
||||
struct settings settings = *current_settings();
|
||||
char *p;
|
||||
|
||||
settings.thp_enabled = THP_ALWAYS;
|
||||
write_settings(&settings);
|
||||
push_settings(&settings);
|
||||
|
||||
p = alloc_mapping();
|
||||
p = alloc_mapping(1);
|
||||
*p = 1;
|
||||
printf("Allocate huge page on fault...");
|
||||
if (check_huge(p))
|
||||
if (check_huge(p, 1))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
write_settings(&default_settings);
|
||||
pop_settings();
|
||||
|
||||
madvise(p, page_size, MADV_DONTNEED);
|
||||
printf("Split huge PMD on MADV_DONTNEED...");
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_full(void)
|
||||
static void collapse_full(struct collapse_context *c)
|
||||
{
|
||||
void *p;
|
||||
int nr_hpages = 4;
|
||||
unsigned long size = nr_hpages * hpage_pmd_size;
|
||||
|
||||
p = alloc_mapping(nr_hpages);
|
||||
fill_memory(p, 0, size);
|
||||
c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages,
|
||||
true);
|
||||
validate_memory(p, 0, size);
|
||||
munmap(p, size);
|
||||
}
|
||||
|
||||
static void collapse_empty(struct collapse_context *c)
|
||||
{
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
if (wait_for_scan("Collapse fully populated PTE table", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
p = alloc_mapping(1);
|
||||
c->collapse("Do not collapse empty PTE table", p, 1, false);
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_empty(void)
|
||||
static void collapse_single_pte_entry(struct collapse_context *c)
|
||||
{
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
if (wait_for_scan("Do not collapse empty PTE table", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
fail("Fail");
|
||||
else
|
||||
success("OK");
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_single_pte_entry(void)
|
||||
{
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
p = alloc_mapping(1);
|
||||
fill_memory(p, 0, page_size);
|
||||
if (wait_for_scan("Collapse PTE table with single PTE entry present", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Collapse PTE table with single PTE entry present", p,
|
||||
1, true);
|
||||
validate_memory(p, 0, page_size);
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_max_ptes_none(void)
|
||||
static void collapse_max_ptes_none(struct collapse_context *c)
|
||||
{
|
||||
int max_ptes_none = hpage_pmd_nr / 2;
|
||||
struct settings settings = default_settings;
|
||||
struct settings settings = *current_settings();
|
||||
void *p;
|
||||
|
||||
settings.khugepaged.max_ptes_none = max_ptes_none;
|
||||
write_settings(&settings);
|
||||
push_settings(&settings);
|
||||
|
||||
p = alloc_mapping();
|
||||
p = alloc_mapping(1);
|
||||
|
||||
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
|
||||
if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
fail("Fail");
|
||||
else
|
||||
success("OK");
|
||||
c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1,
|
||||
!c->enforce_pte_scan_limits);
|
||||
validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size);
|
||||
|
||||
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
|
||||
if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
|
||||
if (c->enforce_pte_scan_limits) {
|
||||
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size);
|
||||
c->collapse("Collapse with max_ptes_none PTEs empty", p, 1,
|
||||
true);
|
||||
validate_memory(p, 0,
|
||||
(hpage_pmd_nr - max_ptes_none) * page_size);
|
||||
}
|
||||
|
||||
munmap(p, hpage_pmd_size);
|
||||
write_settings(&default_settings);
|
||||
pop_settings();
|
||||
}
|
||||
|
||||
static void collapse_swapin_single_pte(void)
|
||||
static void collapse_swapin_single_pte(struct collapse_context *c)
|
||||
{
|
||||
void *p;
|
||||
p = alloc_mapping();
|
||||
p = alloc_mapping(1);
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
|
||||
printf("Swapout one page...");
|
||||
@@ -625,23 +698,18 @@ static void collapse_swapin_single_pte(void)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (wait_for_scan("Collapse with swapping in single PTE entry", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Collapse with swapping in single PTE entry", p, 1, true);
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
out:
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_max_ptes_swap(void)
|
||||
static void collapse_max_ptes_swap(struct collapse_context *c)
|
||||
{
|
||||
int max_ptes_swap = read_num("khugepaged/max_ptes_swap");
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
p = alloc_mapping(1);
|
||||
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr);
|
||||
@@ -656,115 +724,83 @@ static void collapse_max_ptes_swap(void)
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
fail("Fail");
|
||||
else
|
||||
success("OK");
|
||||
c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1,
|
||||
!c->enforce_pte_scan_limits);
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr);
|
||||
if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
|
||||
perror("madvise(MADV_PAGEOUT)");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (check_swap(p, max_ptes_swap * page_size)) {
|
||||
success("OK");
|
||||
} else {
|
||||
fail("Fail");
|
||||
goto out;
|
||||
}
|
||||
if (c->enforce_pte_scan_limits) {
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
printf("Swapout %d of %d pages...", max_ptes_swap,
|
||||
hpage_pmd_nr);
|
||||
if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) {
|
||||
perror("madvise(MADV_PAGEOUT)");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
if (check_swap(p, max_ptes_swap * page_size)) {
|
||||
success("OK");
|
||||
} else {
|
||||
fail("Fail");
|
||||
goto out;
|
||||
}
|
||||
|
||||
if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
c->collapse("Collapse with max_ptes_swap pages swapped out", p,
|
||||
1, true);
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
}
|
||||
out:
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_single_pte_entry_compound(void)
|
||||
static void collapse_single_pte_entry_compound(struct collapse_context *c)
|
||||
{
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
|
||||
printf("Allocate huge page...");
|
||||
madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
p = alloc_hpage();
|
||||
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
|
||||
|
||||
printf("Split huge page leaving single PTE mapping compound page...");
|
||||
madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED);
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Collapse PTE table with single PTE mapping compound page",
|
||||
p, 1, true);
|
||||
validate_memory(p, 0, page_size);
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_full_of_compound(void)
|
||||
static void collapse_full_of_compound(struct collapse_context *c)
|
||||
{
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
|
||||
printf("Allocate huge page...");
|
||||
madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
p = alloc_hpage();
|
||||
printf("Split huge page leaving single PTE page table full of compound pages...");
|
||||
madvise(p, page_size, MADV_NOHUGEPAGE);
|
||||
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
if (wait_for_scan("Collapse PTE table full of compound pages", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Collapse PTE table full of compound pages", p, 1, true);
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_compound_extreme(void)
|
||||
static void collapse_compound_extreme(struct collapse_context *c)
|
||||
{
|
||||
void *p;
|
||||
int i;
|
||||
|
||||
p = alloc_mapping();
|
||||
p = alloc_mapping(1);
|
||||
for (i = 0; i < hpage_pmd_nr; i++) {
|
||||
printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...",
|
||||
i + 1, hpage_pmd_nr);
|
||||
|
||||
madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE);
|
||||
fill_memory(BASE_ADDR, 0, hpage_pmd_size);
|
||||
if (!check_huge(BASE_ADDR)) {
|
||||
if (!check_huge(BASE_ADDR, 1)) {
|
||||
printf("Failed to allocate huge page\n");
|
||||
exit(EXIT_FAILURE);
|
||||
}
|
||||
@@ -793,32 +829,28 @@ static void collapse_compound_extreme(void)
|
||||
|
||||
munmap(BASE_ADDR, hpage_pmd_size);
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
if (wait_for_scan("Collapse PTE table full of different compound pages", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Collapse PTE table full of different compound pages", p, 1,
|
||||
true);
|
||||
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_fork(void)
|
||||
static void collapse_fork(struct collapse_context *c)
|
||||
{
|
||||
int wstatus;
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
p = alloc_mapping(1);
|
||||
|
||||
printf("Allocate small page...");
|
||||
fill_memory(p, 0, page_size);
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
@@ -829,19 +861,14 @@ static void collapse_fork(void)
|
||||
skip_settings_restore = true;
|
||||
exit_status = 0;
|
||||
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
fill_memory(p, page_size, 2 * page_size);
|
||||
|
||||
if (wait_for_scan("Collapse PTE table with single page shared with parent process", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Collapse PTE table with single page shared with parent process",
|
||||
p, 1, true);
|
||||
|
||||
validate_memory(p, 0, page_size);
|
||||
munmap(p, hpage_pmd_size);
|
||||
@@ -852,7 +879,7 @@ static void collapse_fork(void)
|
||||
exit_status += WEXITSTATUS(wstatus);
|
||||
|
||||
printf("Check if parent still has small page...");
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
@@ -860,28 +887,19 @@ static void collapse_fork(void)
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_fork_compound(void)
|
||||
static void collapse_fork_compound(struct collapse_context *c)
|
||||
{
|
||||
int wstatus;
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
|
||||
printf("Allocate huge page...");
|
||||
madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
p = alloc_hpage();
|
||||
printf("Share huge page over fork()...");
|
||||
if (!fork()) {
|
||||
/* Do not touch settings on child exit */
|
||||
skip_settings_restore = true;
|
||||
exit_status = 0;
|
||||
|
||||
if (check_huge(p))
|
||||
if (check_huge(p, 1))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
@@ -889,21 +907,17 @@ static void collapse_fork_compound(void)
|
||||
printf("Split huge page PMD in child process...");
|
||||
madvise(p, page_size, MADV_NOHUGEPAGE);
|
||||
madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE);
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
fill_memory(p, 0, page_size);
|
||||
|
||||
write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1);
|
||||
if (wait_for_scan("Collapse PTE table full of compound pages in child", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Collapse PTE table full of compound pages in child",
|
||||
p, 1, true);
|
||||
write_num("khugepaged/max_ptes_shared",
|
||||
default_settings.khugepaged.max_ptes_shared);
|
||||
current_settings()->khugepaged.max_ptes_shared);
|
||||
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
munmap(p, hpage_pmd_size);
|
||||
@@ -914,7 +928,7 @@ static void collapse_fork_compound(void)
|
||||
exit_status += WEXITSTATUS(wstatus);
|
||||
|
||||
printf("Check if parent still has huge page...");
|
||||
if (check_huge(p))
|
||||
if (check_huge(p, 1))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
@@ -922,29 +936,20 @@ static void collapse_fork_compound(void)
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
static void collapse_max_ptes_shared()
|
||||
static void collapse_max_ptes_shared(struct collapse_context *c)
|
||||
{
|
||||
int max_ptes_shared = read_num("khugepaged/max_ptes_shared");
|
||||
int wstatus;
|
||||
void *p;
|
||||
|
||||
p = alloc_mapping();
|
||||
|
||||
printf("Allocate huge page...");
|
||||
madvise(p, hpage_pmd_size, MADV_HUGEPAGE);
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
p = alloc_hpage();
|
||||
printf("Share huge page over fork()...");
|
||||
if (!fork()) {
|
||||
/* Do not touch settings on child exit */
|
||||
skip_settings_restore = true;
|
||||
exit_status = 0;
|
||||
|
||||
if (check_huge(p))
|
||||
if (check_huge(p, 1))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
@@ -952,33 +957,27 @@ static void collapse_max_ptes_shared()
|
||||
printf("Trigger CoW on page %d of %d...",
|
||||
hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr);
|
||||
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size);
|
||||
if (!check_huge(p))
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p))
|
||||
fail("Timeout");
|
||||
else if (!check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Maybe collapse with max_ptes_shared exceeded", p,
|
||||
1, !c->enforce_pte_scan_limits);
|
||||
|
||||
printf("Trigger CoW on page %d of %d...",
|
||||
hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
|
||||
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size);
|
||||
if (!check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
if (c->enforce_pte_scan_limits) {
|
||||
printf("Trigger CoW on page %d of %d...",
|
||||
hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr);
|
||||
fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) *
|
||||
page_size);
|
||||
if (check_huge(p, 0))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
|
||||
|
||||
if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p))
|
||||
fail("Timeout");
|
||||
else if (check_huge(p))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
c->collapse("Collapse with max_ptes_shared PTEs shared",
|
||||
p, 1, true);
|
||||
}
|
||||
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
munmap(p, hpage_pmd_size);
|
||||
@@ -989,7 +988,7 @@ static void collapse_max_ptes_shared()
|
||||
exit_status += WEXITSTATUS(wstatus);
|
||||
|
||||
printf("Check if parent still has huge page...");
|
||||
if (check_huge(p))
|
||||
if (check_huge(p, 1))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
@@ -997,8 +996,52 @@ static void collapse_max_ptes_shared()
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
int main(void)
|
||||
static void madvise_collapse_existing_thps(void)
|
||||
{
|
||||
void *p;
|
||||
int err;
|
||||
|
||||
p = alloc_mapping(1);
|
||||
fill_memory(p, 0, hpage_pmd_size);
|
||||
|
||||
printf("Collapse fully populated PTE table...");
|
||||
/*
|
||||
* Note that we don't set MADV_HUGEPAGE here, which
|
||||
* also tests that VM_HUGEPAGE isn't required for
|
||||
* MADV_COLLAPSE in "madvise" mode.
|
||||
*/
|
||||
err = madvise(p, hpage_pmd_size, MADV_COLLAPSE);
|
||||
if (err == 0 && check_huge(p, 1)) {
|
||||
success("OK");
|
||||
printf("Re-collapse PMD-mapped hugepage");
|
||||
err = madvise(p, hpage_pmd_size, MADV_COLLAPSE);
|
||||
if (err == 0 && check_huge(p, 1))
|
||||
success("OK");
|
||||
else
|
||||
fail("Fail");
|
||||
} else {
|
||||
fail("Fail");
|
||||
}
|
||||
validate_memory(p, 0, hpage_pmd_size);
|
||||
munmap(p, hpage_pmd_size);
|
||||
}
|
||||
|
||||
int main(int argc, const char **argv)
|
||||
{
|
||||
struct collapse_context c;
|
||||
struct settings default_settings = {
|
||||
.thp_enabled = THP_MADVISE,
|
||||
.thp_defrag = THP_DEFRAG_ALWAYS,
|
||||
.shmem_enabled = SHMEM_NEVER,
|
||||
.use_zero_page = 0,
|
||||
.khugepaged = {
|
||||
.defrag = 1,
|
||||
.alloc_sleep_millisecs = 10,
|
||||
.scan_sleep_millisecs = 10,
|
||||
},
|
||||
};
|
||||
const char *tests = argc == 1 ? "all" : argv[1];
|
||||
|
||||
setbuf(stdout, NULL);
|
||||
|
||||
page_size = getpagesize();
|
||||
@@ -1011,21 +1054,47 @@ int main(void)
|
||||
default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8;
|
||||
|
||||
save_settings();
|
||||
adjust_settings();
|
||||
push_settings(&default_settings);
|
||||
|
||||
alloc_at_fault();
|
||||
collapse_full();
|
||||
collapse_empty();
|
||||
collapse_single_pte_entry();
|
||||
collapse_max_ptes_none();
|
||||
collapse_swapin_single_pte();
|
||||
collapse_max_ptes_swap();
|
||||
collapse_single_pte_entry_compound();
|
||||
collapse_full_of_compound();
|
||||
collapse_compound_extreme();
|
||||
collapse_fork();
|
||||
collapse_fork_compound();
|
||||
collapse_max_ptes_shared();
|
||||
|
||||
if (!strcmp(tests, "khugepaged") || !strcmp(tests, "all")) {
|
||||
printf("\n*** Testing context: khugepaged ***\n");
|
||||
c.collapse = &khugepaged_collapse;
|
||||
c.enforce_pte_scan_limits = true;
|
||||
|
||||
collapse_full(&c);
|
||||
collapse_empty(&c);
|
||||
collapse_single_pte_entry(&c);
|
||||
collapse_max_ptes_none(&c);
|
||||
collapse_swapin_single_pte(&c);
|
||||
collapse_max_ptes_swap(&c);
|
||||
collapse_single_pte_entry_compound(&c);
|
||||
collapse_full_of_compound(&c);
|
||||
collapse_compound_extreme(&c);
|
||||
collapse_fork(&c);
|
||||
collapse_fork_compound(&c);
|
||||
collapse_max_ptes_shared(&c);
|
||||
}
|
||||
if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) {
|
||||
printf("\n*** Testing context: madvise ***\n");
|
||||
c.collapse = &madvise_collapse;
|
||||
c.enforce_pte_scan_limits = false;
|
||||
|
||||
collapse_full(&c);
|
||||
collapse_empty(&c);
|
||||
collapse_single_pte_entry(&c);
|
||||
collapse_max_ptes_none(&c);
|
||||
collapse_swapin_single_pte(&c);
|
||||
collapse_max_ptes_swap(&c);
|
||||
collapse_single_pte_entry_compound(&c);
|
||||
collapse_full_of_compound(&c);
|
||||
collapse_compound_extreme(&c);
|
||||
collapse_fork(&c);
|
||||
collapse_fork_compound(&c);
|
||||
collapse_max_ptes_shared(&c);
|
||||
madvise_collapse_existing_thps();
|
||||
}
|
||||
|
||||
restore_settings(0);
|
||||
}
|
||||
|
||||
@@ -120,11 +120,16 @@ run_test ./gup_test -a
|
||||
# Dump pages 0, 19, and 4096, using pin_user_pages:
|
||||
run_test ./gup_test -ct -F 0x1 0 19 0x1000
|
||||
|
||||
run_test ./userfaultfd anon 20 16
|
||||
# Test requires source and destination huge pages. Size of source
|
||||
# (half_ufd_size_MB) is passed as argument to test.
|
||||
run_test ./userfaultfd hugetlb "$half_ufd_size_MB" 32
|
||||
run_test ./userfaultfd shmem 20 16
|
||||
uffd_mods=("" ":dev")
|
||||
for mod in "${uffd_mods[@]}"; do
|
||||
run_test ./userfaultfd anon${mod} 20 16
|
||||
# Hugetlb tests require source and destination huge pages. Pass in half
|
||||
# the size ($half_ufd_size_MB), which is used for *each*.
|
||||
run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32
|
||||
run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 "$mnt"/uffd-test
|
||||
rm -f "$mnt"/uffd-test
|
||||
run_test ./userfaultfd shmem${mod} 20 16
|
||||
done
|
||||
|
||||
#cleanup
|
||||
umount "$mnt"
|
||||
|
||||
@@ -77,6 +77,11 @@ static int bounces;
|
||||
#define TEST_SHMEM 3
|
||||
static int test_type;
|
||||
|
||||
#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY)
|
||||
|
||||
/* test using /dev/userfaultfd, instead of userfaultfd(2) */
|
||||
static bool test_dev_userfaultfd;
|
||||
|
||||
/* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */
|
||||
#define ALARM_INTERVAL_SECS 10
|
||||
static volatile bool test_uffdio_copy_eexist = true;
|
||||
@@ -125,6 +130,8 @@ struct uffd_stats {
|
||||
const char *examples =
|
||||
"# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
|
||||
"./userfaultfd anon 100 99999\n\n"
|
||||
"# Run the same anonymous memory test, but using /dev/userfaultfd:\n"
|
||||
"./userfaultfd anon:dev 100 99999\n\n"
|
||||
"# Run share memory test on 1GiB region with 99 bounces:\n"
|
||||
"./userfaultfd shmem 1000 99\n\n"
|
||||
"# Run hugetlb memory test on 256MiB region with 50 bounces:\n"
|
||||
@@ -141,6 +148,14 @@ static void usage(void)
|
||||
"[hugetlbfs_file]\n\n");
|
||||
fprintf(stderr, "Supported <test type>: anon, hugetlb, "
|
||||
"hugetlb_shared, shmem\n\n");
|
||||
fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. "
|
||||
"Supported mods:\n");
|
||||
fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n");
|
||||
fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n");
|
||||
fprintf(stderr, "\nExample test mod usage:\n");
|
||||
fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n");
|
||||
fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n");
|
||||
|
||||
fprintf(stderr, "Examples:\n\n");
|
||||
fprintf(stderr, "%s", examples);
|
||||
exit(1);
|
||||
@@ -154,12 +169,14 @@ static void usage(void)
|
||||
ret, __LINE__); \
|
||||
} while (0)
|
||||
|
||||
#define err(fmt, ...) \
|
||||
#define errexit(exitcode, fmt, ...) \
|
||||
do { \
|
||||
_err(fmt, ##__VA_ARGS__); \
|
||||
exit(1); \
|
||||
exit(exitcode); \
|
||||
} while (0)
|
||||
|
||||
#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__)
|
||||
|
||||
static void uffd_stats_reset(struct uffd_stats *uffd_stats,
|
||||
unsigned long n_cpus)
|
||||
{
|
||||
@@ -383,13 +400,34 @@ static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls)
|
||||
}
|
||||
}
|
||||
|
||||
/*
 * Create a userfaultfd through the /dev/userfaultfd device node rather than
 * through the userfaultfd(2) syscall.
 *
 * Opens /dev/userfaultfd read-write with O_CLOEXEC and issues the
 * USERFAULTFD_IOC_NEW ioctl on it, passing UFFD_FLAGS as the argument.
 *
 * Returns the file descriptor produced by the ioctl. This function does not
 * return on failure: it terminates the process via errexit(), using KSFT_SKIP
 * when the device node cannot be opened or when the ioctl fails with ENOTTY,
 * and exit code 1 for any other ioctl failure.
 */
static int __userfaultfd_open_dev(void)
|
||||
{
|
||||
int fd, _uffd;
|
||||
|
||||
fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
|
||||
if (fd < 0)
|
||||
errexit(KSFT_SKIP, "opening /dev/userfaultfd failed");
|
||||
|
||||
_uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS);
|
||||
if (_uffd < 0)
|
||||
/* NOTE(review): ENOTTY presumably means the ioctl is not supported here, hence skip rather than fail - confirm. */
errexit(errno == ENOTTY ? KSFT_SKIP : 1,
|
||||
"creating userfaultfd failed");
|
||||
/* The device fd was only needed to issue the ioctl; the caller gets _uffd. */
close(fd);
|
||||
return _uffd;
|
||||
}
|
||||
|
||||
static void userfaultfd_open(uint64_t *features)
|
||||
{
|
||||
struct uffdio_api uffdio_api;
|
||||
|
||||
uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY);
|
||||
if (uffd < 0)
|
||||
err("userfaultfd syscall not available in this kernel");
|
||||
if (test_dev_userfaultfd)
|
||||
uffd = __userfaultfd_open_dev();
|
||||
else {
|
||||
uffd = syscall(__NR_userfaultfd, UFFD_FLAGS);
|
||||
if (uffd < 0)
|
||||
errexit(errno == ENOSYS ? KSFT_SKIP : 1,
|
||||
"creating userfaultfd failed");
|
||||
}
|
||||
uffd_flags = fcntl(uffd, F_GETFD, NULL);
|
||||
|
||||
uffdio_api.api = UFFD_API;
|
||||
@@ -1584,8 +1622,6 @@ unsigned long default_huge_page_size(void)
|
||||
|
||||
static void set_test_type(const char *type)
|
||||
{
|
||||
uint64_t features = UFFD_API_FEATURES;
|
||||
|
||||
if (!strcmp(type, "anon")) {
|
||||
test_type = TEST_ANON;
|
||||
uffd_test_ops = &anon_uffd_test_ops;
|
||||
@@ -1603,9 +1639,29 @@ static void set_test_type(const char *type)
|
||||
test_type = TEST_SHMEM;
|
||||
uffd_test_ops = &shmem_uffd_test_ops;
|
||||
test_uffdio_minor = true;
|
||||
} else {
|
||||
err("Unknown test type: %s", type);
|
||||
}
|
||||
}
|
||||
|
||||
static void parse_test_type_arg(const char *raw_type)
|
||||
{
|
||||
char *buf = strdup(raw_type);
|
||||
uint64_t features = UFFD_API_FEATURES;
|
||||
|
||||
while (buf) {
|
||||
const char *token = strsep(&buf, ":");
|
||||
|
||||
if (!test_type)
|
||||
set_test_type(token);
|
||||
else if (!strcmp(token, "dev"))
|
||||
test_dev_userfaultfd = true;
|
||||
else if (!strcmp(token, "syscall"))
|
||||
test_dev_userfaultfd = false;
|
||||
else
|
||||
err("unrecognized test mod '%s'", token);
|
||||
}
|
||||
|
||||
if (!test_type)
|
||||
err("failed to parse test type argument: '%s'", raw_type);
|
||||
|
||||
if (test_type == TEST_HUGETLB)
|
||||
page_size = default_huge_page_size();
|
||||
@@ -1653,7 +1709,7 @@ int main(int argc, char **argv)
|
||||
err("failed to arm SIGALRM");
|
||||
alarm(ALARM_INTERVAL_SECS);
|
||||
|
||||
set_test_type(argv[1]);
|
||||
parse_test_type_arg(argv[1]);
|
||||
|
||||
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
|
||||
nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
|
||||
|
||||
@@ -470,7 +470,12 @@ static bool match_str_list(const char *str, char **list, int list_size)
|
||||
|
||||
static bool is_need(char *buf)
|
||||
{
|
||||
if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0)
|
||||
__u64 ts_nsec, free_ts_nsec;
|
||||
|
||||
ts_nsec = get_ts_nsec(buf);
|
||||
free_ts_nsec = get_free_ts_nsec(buf);
|
||||
|
||||
if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec)
|
||||
return false;
|
||||
if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size))
|
||||
return false;
|
||||
|
||||
Reference in New Issue
Block a user