diff --git a/Documentation/admin-guide/mm/cma_debugfs.rst b/Documentation/admin-guide/mm/cma_debugfs.rst index 4e06ffabd78a..7367e6294ef6 100644 --- a/Documentation/admin-guide/mm/cma_debugfs.rst +++ b/Documentation/admin-guide/mm/cma_debugfs.rst @@ -5,10 +5,10 @@ CMA Debugfs Interface The CMA debugfs interface is useful to retrieve basic information out of the different CMA areas and to test allocation/release in each of the areas. -Each CMA zone represents a directory under /cma/, indexed by the -kernel's CMA index. So the first CMA zone would be: +Each CMA area represents a directory under /cma/, represented by +its CMA name like below: - /cma/cma-0 + /cma/ The structure of the files created under that directory is as follows: @@ -18,8 +18,8 @@ The structure of the files created under that directory is as follows: - [RO] bitmap: The bitmap of page states in the zone. - [WO] alloc: Allocate N pages from that CMA area. For example:: - echo 5 > /cma/cma-2/alloc + echo 5 > /cma//alloc -would try to allocate 5 pages from the cma-2 area. +would try to allocate 5 pages from the 'cma_name' area. - [WO] free: Free N pages from that CMA area, similar to the above. diff --git a/Documentation/admin-guide/mm/userfaultfd.rst b/Documentation/admin-guide/mm/userfaultfd.rst index 6528036093e1..83f31919ebb3 100644 --- a/Documentation/admin-guide/mm/userfaultfd.rst +++ b/Documentation/admin-guide/mm/userfaultfd.rst @@ -17,7 +17,10 @@ of the ``PROT_NONE+SIGSEGV`` trick. Design ====== -Userfaults are delivered and resolved through the ``userfaultfd`` syscall. +Userspace creates a new userfaultfd, initializes it, and registers one or more +regions of virtual memory with it. Then, any page faults which occur within the +region(s) result in a message being delivered to the userfaultfd, notifying +userspace of the fault. The ``userfaultfd`` (aside from registering and unregistering virtual memory ranges) provides two primary functionalities: @@ -34,12 +37,11 @@ The real advantage of userfaults if compared to regular virtual memory management of mremap/mprotect is that the userfaults in all their operations never involve heavyweight structures like vmas (in fact the ``userfaultfd`` runtime load never takes the mmap_lock for writing). - Vmas are not suitable for page- (or hugepage) granular fault tracking when dealing with virtual address spaces that could span Terabytes. Too many vmas would be needed for that. -The ``userfaultfd`` once opened by invoking the syscall, can also be +The ``userfaultfd``, once created, can also be passed using unix domain sockets to a manager process, so the same manager process could handle the userfaults of a multitude of different processes without them being aware about what is going on @@ -50,6 +52,39 @@ is a corner case that would currently return ``-EBUSY``). API === +Creating a userfaultfd +---------------------- + +There are two ways to create a new userfaultfd, each of which provide ways to +restrict access to this functionality (since historically userfaultfds which +handle kernel page faults have been a useful tool for exploiting the kernel). + +The first way, supported since userfaultfd was introduced, is the +userfaultfd(2) syscall. Access to this is controlled in several ways: + +- Any user can always create a userfaultfd which traps userspace page faults + only. Such a userfaultfd can be created using the userfaultfd(2) syscall + with the flag UFFD_USER_MODE_ONLY. + +- In order to also trap kernel page faults for the address space, either the + process needs the CAP_SYS_PTRACE capability, or the system must have + vm.unprivileged_userfaultfd set to 1. By default, vm.unprivileged_userfaultfd + is set to 0. + +The second way, added to the kernel more recently, is by opening +/dev/userfaultfd and issuing a USERFAULTFD_IOC_NEW ioctl to it. This method +yields equivalent userfaultfds to the userfaultfd(2) syscall. + +Unlike userfaultfd(2), access to /dev/userfaultfd is controlled via normal +filesystem permissions (user/group/mode), which gives fine grained access to +userfaultfd specifically, without also granting other unrelated privileges at +the same time (as e.g. granting CAP_SYS_PTRACE would do). Users who have access +to /dev/userfaultfd can always create userfaultfds that trap kernel page faults; +vm.unprivileged_userfaultfd is not considered. + +Initializing a userfaultfd +-------------------------- + When first opened the ``userfaultfd`` must be enabled invoking the ``UFFDIO_API`` ioctl specifying a ``uffdio_api.api`` value set to ``UFFD_API`` (or a later API version) which will specify the ``read/POLLIN`` protocol diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 9b833e439f09..988f6a4c8084 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -926,6 +926,9 @@ calls without any restrictions. The default value is 0. +Another way to control permissions for userfaultfd is to use +/dev/userfaultfd instead of userfaultfd(2). See +Documentation/admin-guide/mm/userfaultfd.rst. user_reserve_kbytes =================== diff --git a/arch/alpha/include/uapi/asm/mman.h b/arch/alpha/include/uapi/asm/mman.h index 4aa996423b0d..763929e814e9 100644 --- a/arch/alpha/include/uapi/asm/mman.h +++ b/arch/alpha/include/uapi/asm/mman.h @@ -76,6 +76,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/mips/include/uapi/asm/mman.h b/arch/mips/include/uapi/asm/mman.h index 1be428663c10..c6e1fc77c996 100644 --- a/arch/mips/include/uapi/asm/mman.h +++ b/arch/mips/include/uapi/asm/mman.h @@ -103,6 +103,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/arch/parisc/include/uapi/asm/mman.h b/arch/parisc/include/uapi/asm/mman.h index a7ea3204a5fa..22133a6a506e 100644 --- a/arch/parisc/include/uapi/asm/mman.h +++ b/arch/parisc/include/uapi/asm/mman.h @@ -70,6 +70,8 @@ #define MADV_WIPEONFORK 71 /* Zero memory on fork, child only */ #define MADV_KEEPONFORK 72 /* Undo MADV_WIPEONFORK */ +#define MADV_COLLAPSE 73 /* Synchronous hugepage collapse */ + #define MADV_HWPOISON 100 /* poison a page for testing */ #define MADV_SOFT_OFFLINE 101 /* soft offline page for testing */ diff --git a/arch/xtensa/include/uapi/asm/mman.h b/arch/xtensa/include/uapi/asm/mman.h index 7966a58af472..1ff0c858544f 100644 --- a/arch/xtensa/include/uapi/asm/mman.h +++ b/arch/xtensa/include/uapi/asm/mman.h @@ -111,6 +111,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4e0023643f8b..482f91577f8c 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -864,7 +864,7 @@ static int show_smap(struct seq_file *m, void *v) __show_smap(m, &mss, false); seq_printf(m, "THPeligible: %d\n", - hugepage_vma_check(vma, vma->vm_flags, true, false)); + hugepage_vma_check(vma, vma->vm_flags, true, false, true)); if (arch_pkeys_enabled()) seq_printf(m, "ProtectionKey: %8u\n", vma_pkey(vma)); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 0c1d33c4f74c..d94891e4ab1f 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -30,6 +30,7 @@ #include #include #include +#include int sysctl_unprivileged_userfaultfd __read_mostly; @@ -415,13 +416,8 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (ctx->features & UFFD_FEATURE_SIGBUS) goto out; - if ((vmf->flags & FAULT_FLAG_USER) == 0 && - ctx->flags & UFFD_USER_MODE_ONLY) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); + if (!(vmf->flags & FAULT_FLAG_USER) && (ctx->flags & UFFD_USER_MODE_ONLY)) goto out; - } /* * If it's already released don't get it. This avoids to loop @@ -2056,20 +2052,11 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_spinlock_init(&ctx->refile_seq, &ctx->fault_pending_wqh.lock); } -SYSCALL_DEFINE1(userfaultfd, int, flags) +static int new_userfaultfd(int flags) { struct userfaultfd_ctx *ctx; int fd; - if (!sysctl_unprivileged_userfaultfd && - (flags & UFFD_USER_MODE_ONLY) == 0 && - !capable(CAP_SYS_PTRACE)) { - printk_once(KERN_WARNING "uffd: Set unprivileged_userfaultfd " - "sysctl knob to 1 if kernel faults must be handled " - "without obtaining CAP_SYS_PTRACE capability\n"); - return -EPERM; - } - BUG_ON(!current->mm); /* Check the UFFD_* constants for consistency. */ @@ -2102,8 +2089,60 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) return fd; } +static inline bool userfaultfd_syscall_allowed(int flags) +{ + /* Userspace-only page faults are always allowed */ + if (flags & UFFD_USER_MODE_ONLY) + return true; + + /* + * The user is requesting a userfaultfd which can handle kernel faults. + * Privileged users are always allowed to do this. + */ + if (capable(CAP_SYS_PTRACE)) + return true; + + /* Otherwise, access to kernel fault handling is sysctl controlled. */ + return sysctl_unprivileged_userfaultfd; +} + +SYSCALL_DEFINE1(userfaultfd, int, flags) +{ + if (!userfaultfd_syscall_allowed(flags)) + return -EPERM; + + return new_userfaultfd(flags); +} + +static long userfaultfd_dev_ioctl(struct file *file, unsigned int cmd, unsigned long flags) +{ + if (cmd != USERFAULTFD_IOC_NEW) + return -EINVAL; + + return new_userfaultfd(flags); +} + +static const struct file_operations userfaultfd_dev_fops = { + .unlocked_ioctl = userfaultfd_dev_ioctl, + .compat_ioctl = userfaultfd_dev_ioctl, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice userfaultfd_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "userfaultfd", + .fops = &userfaultfd_dev_fops +}; + static int __init userfaultfd_init(void) { + int ret; + + ret = misc_register(&userfaultfd_misc); + if (ret) + return ret; + userfaultfd_ctx_cachep = kmem_cache_create("userfaultfd_ctx_cache", sizeof(struct userfaultfd_ctx), 0, diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 768e5261fdae..38265f9f782e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -168,9 +168,8 @@ static inline bool file_thp_enabled(struct vm_area_struct *vma) !inode_is_open_for_write(inode) && S_ISREG(inode->i_mode); } -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf); +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs); #define transparent_hugepage_use_zero_page() \ (transparent_hugepage_flags & \ @@ -219,6 +218,9 @@ void __split_huge_pud(struct vm_area_struct *vma, pud_t *pud, int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice); +int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end); void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma); @@ -321,8 +323,8 @@ static inline bool transhuge_vma_suitable(struct vm_area_struct *vma, } static inline bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) + unsigned long vm_flags, bool smaps, + bool in_pf, bool enforce_sysfs) { return false; } @@ -362,9 +364,16 @@ static inline void split_huge_pmd_address(struct vm_area_struct *vma, static inline int hugepage_madvise(struct vm_area_struct *vma, unsigned long *vm_flags, int advice) { - BUG(); - return 0; + return -EINVAL; } + +static inline int madvise_collapse(struct vm_area_struct *vma, + struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + return -EINVAL; +} + static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 668389b4b53d..d232de7cdc56 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -151,13 +151,6 @@ extern bool mempolicy_in_oom_domain(struct task_struct *tsk, const nodemask_t *mask); extern nodemask_t *policy_nodemask(gfp_t gfp, struct mempolicy *policy); -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - struct mempolicy *mpol = get_task_policy(current); - - return policy_nodemask(gfp, mpol); -} - extern unsigned int mempolicy_slab_node(void); extern enum zone_type policy_zone; @@ -189,6 +182,7 @@ static inline bool mpol_is_preferred_many(struct mempolicy *pol) return (pol->mode == MPOL_PREFERRED_MANY); } +extern bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone); #else @@ -294,11 +288,6 @@ static inline void mpol_put_task_policy(struct task_struct *task) { } -static inline nodemask_t *policy_nodemask_current(gfp_t gfp) -{ - return NULL; -} - static inline bool mpol_is_preferred_many(struct mempolicy *pol) { return false; diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 5a18c518dca1..52b41d391be4 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -305,6 +305,8 @@ static inline bool is_active_lru(enum lru_list lru) return (lru == LRU_ACTIVE_ANON || lru == LRU_ACTIVE_FILE); } +#define WORKINGSET_ANON 0 +#define WORKINGSET_FILE 1 #define ANON_AND_FILE 2 enum lruvec_flags { diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index d651f3437367..55392bf30a03 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -11,6 +11,7 @@ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ + EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ diff --git a/include/uapi/asm-generic/mman-common.h b/include/uapi/asm-generic/mman-common.h index 6c1aa92a92e4..6ce1f1ceb432 100644 --- a/include/uapi/asm-generic/mman-common.h +++ b/include/uapi/asm-generic/mman-common.h @@ -77,6 +77,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/include/uapi/linux/userfaultfd.h b/include/uapi/linux/userfaultfd.h index 7d32b1e797fb..005e5e306266 100644 --- a/include/uapi/linux/userfaultfd.h +++ b/include/uapi/linux/userfaultfd.h @@ -12,6 +12,10 @@ #include +/* ioctls for /dev/userfaultfd */ +#define USERFAULTFD_IOC 0xAA +#define USERFAULTFD_IOC_NEW _IO(USERFAULTFD_IOC, 0x00) + /* * If the UFFDIO_API is upgraded someday, the UFFDIO_UNREGISTER and * UFFDIO_WAKE ioctls should be defined as _IOW and not as _IOR. In diff --git a/mm/cma_debug.c b/mm/cma_debug.c index c3ffe253e055..602fff89b15f 100644 --- a/mm/cma_debug.c +++ b/mm/cma_debug.c @@ -163,11 +163,8 @@ DEFINE_DEBUGFS_ATTRIBUTE(cma_alloc_fops, NULL, cma_alloc_write, "%llu\n"); static void cma_debugfs_add_one(struct cma *cma, struct dentry *root_dentry) { struct dentry *tmp; - char name[CMA_MAX_NAME]; - scnprintf(name, sizeof(name), "cma-%s", cma->name); - - tmp = debugfs_create_dir(name, root_dentry); + tmp = debugfs_create_dir(cma->name, root_dentry); debugfs_create_file("alloc", 0200, tmp, cma, &cma_alloc_fops); debugfs_create_file("free", 0200, tmp, cma, &cma_free_fops); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 4e51466c4e74..652a94deafe3 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -1053,7 +1053,7 @@ static int __init __damon_dbgfs_init(void) fops[i]); dbgfs_fill_ctx_dir(dbgfs_root, dbgfs_ctxs[0]); - dbgfs_dirs = kmalloc_array(1, sizeof(dbgfs_root), GFP_KERNEL); + dbgfs_dirs = kmalloc(sizeof(dbgfs_root), GFP_KERNEL); if (!dbgfs_dirs) { debugfs_remove(dbgfs_root); return -ENOMEM; diff --git a/mm/filemap.c b/mm/filemap.c index c943d1b90cc2..c8d7ef70e48c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1633,24 +1633,26 @@ EXPORT_SYMBOL(folio_end_writeback); */ void page_endio(struct page *page, bool is_write, int err) { + struct folio *folio = page_folio(page); + if (!is_write) { if (!err) { - SetPageUptodate(page); + folio_mark_uptodate(folio); } else { - ClearPageUptodate(page); - SetPageError(page); + folio_clear_uptodate(folio); + folio_set_error(folio); } - unlock_page(page); + folio_unlock(folio); } else { if (err) { struct address_space *mapping; - SetPageError(page); - mapping = page_mapping(page); + folio_set_error(folio); + mapping = folio_mapping(folio); if (mapping) mapping_set_error(mapping, err); } - end_page_writeback(page); + folio_end_writeback(folio); } } EXPORT_SYMBOL_GPL(page_endio); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f42bb51e023a..196031d28a12 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -70,9 +70,8 @@ static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; unsigned long huge_zero_pfn __read_mostly = ~0UL; -bool hugepage_vma_check(struct vm_area_struct *vma, - unsigned long vm_flags, - bool smaps, bool in_pf) +bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, + bool smaps, bool in_pf, bool enforce_sysfs) { if (!vma->vm_mm) /* vdso */ return false; @@ -121,11 +120,10 @@ bool hugepage_vma_check(struct vm_area_struct *vma, if (!in_pf && shmem_file(vma->vm_file)) return shmem_huge_enabled(vma); - if (!hugepage_flags_enabled()) - return false; - - /* THP settings require madvise. */ - if (!(vm_flags & VM_HUGEPAGE) && !hugepage_flags_always()) + /* Enforce sysfs THP requirements as necessary */ + if (enforce_sysfs && + (!hugepage_flags_enabled() || (!(vm_flags & VM_HUGEPAGE) && + !hugepage_flags_always()))) return false; /* Only regular file is valid */ @@ -2288,25 +2286,11 @@ out: void split_huge_pmd_address(struct vm_area_struct *vma, unsigned long address, bool freeze, struct folio *folio) { - pgd_t *pgd; - p4d_t *p4d; - pud_t *pud; - pmd_t *pmd; + pmd_t *pmd = mm_find_pmd(vma->vm_mm, address); - pgd = pgd_offset(vma->vm_mm, address); - if (!pgd_present(*pgd)) + if (!pmd) return; - p4d = p4d_offset(pgd, address); - if (!p4d_present(*p4d)) - return; - - pud = pud_offset(p4d, address); - if (!pud_present(*pud)) - return; - - pmd = pmd_offset(pud, address); - __split_huge_pmd(vma, pmd, address, freeze, folio); } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0bdfc7e1c933..ca0e8d53a26d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4332,18 +4332,34 @@ static int __init default_hugepagesz_setup(char *s) } __setup("default_hugepagesz=", default_hugepagesz_setup); +static nodemask_t *policy_mbind_nodemask(gfp_t gfp) +{ +#ifdef CONFIG_NUMA + struct mempolicy *mpol = get_task_policy(current); + + /* + * Only enforce MPOL_BIND policy which overlaps with cpuset policy + * (from policy_nodemask) specifically for hugetlb case + */ + if (mpol->mode == MPOL_BIND && + (apply_policy_zone(mpol, gfp_zone(gfp)) && + cpuset_nodemask_valid_mems_allowed(&mpol->nodes))) + return &mpol->nodes; +#endif + return NULL; +} + static unsigned int allowed_mems_nr(struct hstate *h) { int node; unsigned int nr = 0; - nodemask_t *mpol_allowed; + nodemask_t *mbind_nodemask; unsigned int *array = h->free_huge_pages_node; gfp_t gfp_mask = htlb_alloc_mask(h); - mpol_allowed = policy_nodemask_current(gfp_mask); - + mbind_nodemask = policy_mbind_nodemask(gfp_mask); for_each_node_mask(node, cpuset_current_mems_allowed) { - if (!mpol_allowed || node_isset(node, *mpol_allowed)) + if (!mbind_nodemask || node_isset(node, *mbind_nodemask)) nr += array[node]; } diff --git a/mm/internal.h b/mm/internal.h index 785409805ed7..55ce10e4d0c0 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -187,7 +187,7 @@ extern void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason /* * in mm/rmap.c: */ -extern pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); +pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address); /* * in mm/page_alloc.c diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 01dbc6dbd599..28448e2234cc 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -28,6 +28,7 @@ enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, SCAN_EXCEED_SHARED_PTE, @@ -73,6 +74,8 @@ static DECLARE_WAIT_QUEUE_HEAD(khugepaged_wait); * default collapse hugepages if there is at least one pte mapped like * it would have happened if the vma was large enough during page * fault. + * + * Note that these are only respected if collapse was initiated by khugepaged. */ static unsigned int khugepaged_max_ptes_none __read_mostly; static unsigned int khugepaged_max_ptes_swap __read_mostly; @@ -85,6 +88,16 @@ static struct kmem_cache *mm_slot_cache __read_mostly; #define MAX_PTE_MAPPED_THP 8 +struct collapse_control { + bool is_khugepaged; + + /* Num pages scanned per node */ + u32 node_load[MAX_NUMNODES]; + + /* Last target selected in hpage_collapse_find_target_node() */ + int last_target_node; +}; + /** * struct mm_slot - hash lookup from mm to mm_slot * @hash: hash collision list @@ -425,7 +438,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); } -static inline int khugepaged_test_exit(struct mm_struct *mm) +static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; } @@ -440,7 +453,7 @@ void __khugepaged_enter(struct mm_struct *mm) return; /* __khugepaged_exit() must not run from under us */ - VM_BUG_ON_MM(khugepaged_test_exit(mm), mm); + VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { free_mm_slot(mm_slot); return; @@ -466,7 +479,7 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, { if (!test_bit(MMF_VM_HUGEPAGE, &vma->vm_mm->flags) && hugepage_flags_enabled()) { - if (hugepage_vma_check(vma, vm_flags, false, false)) + if (hugepage_vma_check(vma, vm_flags, false, false, true)) __khugepaged_enter(vma->vm_mm); } } @@ -492,11 +505,10 @@ void __khugepaged_exit(struct mm_struct *mm) } else if (mm_slot) { /* * This is required to serialize against - * khugepaged_test_exit() (which is guaranteed to run - * under mmap sem read mode). Stop here (after we - * return all pagetables will be destroyed) until - * khugepaged has finished working on the pagetables - * under the mmap_lock. + * hpage_collapse_test_exit() (which is guaranteed to run + * under mmap sem read mode). Stop here (after we return all + * pagetables will be destroyed) until khugepaged has finished + * working on the pagetables under the mmap_lock. */ mmap_write_lock(mm); mmap_write_unlock(mm); @@ -546,11 +558,12 @@ static bool is_refcount_suitable(struct page *page) static int __collapse_huge_page_isolate(struct vm_area_struct *vma, unsigned long address, pte_t *pte, + struct collapse_control *cc, struct list_head *compound_pagelist) { struct page *page = NULL; pte_t *_pte; - int none_or_zero = 0, shared = 0, result = 0, referenced = 0; + int none_or_zero = 0, shared = 0, result = SCAN_FAIL, referenced = 0; bool writable = false; for (_pte = pte; _pte < pte + HPAGE_PMD_NR; @@ -558,8 +571,10 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, pte_t pteval = *_pte; if (pte_none(pteval) || (pte_present(pteval) && is_zero_pfn(pte_pfn(pteval)))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -579,11 +594,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, VM_BUG_ON_PAGE(!PageAnon(page), page); - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out; + } } if (PageCompound(page)) { @@ -646,10 +664,14 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (PageCompound(page)) list_add_tail(&page->lru, compound_pagelist); next: - /* There should be enough young pte to collapse the page */ - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; if (pte_write(pteval)) @@ -658,19 +680,19 @@ next: if (unlikely(!writable)) { result = SCAN_PAGE_RO; - } else if (unlikely(!referenced)) { + } else if (unlikely(cc->is_khugepaged && !referenced)) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 1; + return result; } out: release_pte_pages(pte, _pte, compound_pagelist); trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); - return 0; + return result; } static void __collapse_huge_page_copy(pte_t *pte, struct page *page, @@ -735,9 +757,12 @@ static void khugepaged_alloc_sleep(void) remove_wait_queue(&khugepaged_wait, &wait); } -static int khugepaged_node_load[MAX_NUMNODES]; +struct collapse_control khugepaged_collapse_control = { + .is_khugepaged = true, + .last_target_node = NUMA_NO_NODE, +}; -static bool khugepaged_scan_abort(int nid) +static bool hpage_collapse_scan_abort(int nid, struct collapse_control *cc) { int i; @@ -749,11 +774,11 @@ static bool khugepaged_scan_abort(int nid) return false; /* If there is a count for this node already, it must be acceptable */ - if (khugepaged_node_load[nid]) + if (cc->node_load[nid]) return false; for (i = 0; i < MAX_NUMNODES; i++) { - if (!khugepaged_node_load[i]) + if (!cc->node_load[i]) continue; if (node_distance(nid, i) > node_reclaim_distance) return true; @@ -772,146 +797,62 @@ static inline gfp_t alloc_hugepage_khugepaged_gfpmask(void) } #ifdef CONFIG_NUMA -static int khugepaged_find_target_node(void) +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - static int last_khugepaged_target_node = NUMA_NO_NODE; int nid, target_node = 0, max_value = 0; /* find first node with max normal pages hit */ for (nid = 0; nid < MAX_NUMNODES; nid++) - if (khugepaged_node_load[nid] > max_value) { - max_value = khugepaged_node_load[nid]; + if (cc->node_load[nid] > max_value) { + max_value = cc->node_load[nid]; target_node = nid; } /* do some balance if several nodes have the same hit record */ - if (target_node <= last_khugepaged_target_node) - for (nid = last_khugepaged_target_node + 1; nid < MAX_NUMNODES; - nid++) - if (max_value == khugepaged_node_load[nid]) { + if (target_node <= cc->last_target_node) + for (nid = cc->last_target_node + 1; nid < MAX_NUMNODES; + nid++) + if (max_value == cc->node_load[nid]) { target_node = nid; break; } - last_khugepaged_target_node = target_node; + cc->last_target_node = target_node; return target_node; } - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) +#else +static int hpage_collapse_find_target_node(struct collapse_control *cc) { - if (IS_ERR(*hpage)) { - if (!*wait) - return false; - - *wait = false; - *hpage = NULL; - khugepaged_alloc_sleep(); - } else if (*hpage) { - put_page(*hpage); - *hpage = NULL; - } - - return true; + return 0; } +#endif -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) +static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) { - VM_BUG_ON_PAGE(*hpage, *hpage); - *hpage = __alloc_pages_node(node, gfp, HPAGE_PMD_ORDER); if (unlikely(!*hpage)) { count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - *hpage = ERR_PTR(-ENOMEM); - return NULL; + return false; } prep_transhuge_page(*hpage); count_vm_event(THP_COLLAPSE_ALLOC); - return *hpage; -} -#else -static int khugepaged_find_target_node(void) -{ - return 0; -} - -static inline struct page *alloc_khugepaged_hugepage(void) -{ - struct page *page; - - page = alloc_pages(alloc_hugepage_khugepaged_gfpmask(), - HPAGE_PMD_ORDER); - if (page) - prep_transhuge_page(page); - return page; -} - -static struct page *khugepaged_alloc_hugepage(bool *wait) -{ - struct page *hpage; - - do { - hpage = alloc_khugepaged_hugepage(); - if (!hpage) { - count_vm_event(THP_COLLAPSE_ALLOC_FAILED); - if (!*wait) - return NULL; - - *wait = false; - khugepaged_alloc_sleep(); - } else - count_vm_event(THP_COLLAPSE_ALLOC); - } while (unlikely(!hpage) && likely(hugepage_flags_enabled())); - - return hpage; -} - -static bool khugepaged_prealloc_page(struct page **hpage, bool *wait) -{ - /* - * If the hpage allocated earlier was briefly exposed in page cache - * before collapse_file() failed, it is possible that racing lookups - * have not yet completed, and would then be unpleasantly surprised by - * finding the hpage reused for the same mapping at a different offset. - * Just release the previous allocation if there is any danger of that. - */ - if (*hpage && page_count(*hpage) > 1) { - put_page(*hpage); - *hpage = NULL; - } - - if (!*hpage) - *hpage = khugepaged_alloc_hugepage(wait); - - if (unlikely(!*hpage)) - return false; - return true; } -static struct page * -khugepaged_alloc_page(struct page **hpage, gfp_t gfp, int node) -{ - VM_BUG_ON(!*hpage); - - return *hpage; -} -#endif - /* * If mmap_lock temporarily dropped, revalidate vma * before taking mmap_lock. - * Return 0 if succeeds, otherwise return none-zero - * value (scan code). + * Returns enum scan_result value. */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, - struct vm_area_struct **vmap) + struct vm_area_struct **vmap, + struct collapse_control *cc) { struct vm_area_struct *vma; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) return SCAN_ANY_PROCESS; *vmap = vma = find_vma(mm, address); @@ -920,7 +861,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, if (!transhuge_vma_suitable(vma, address)) return SCAN_ADDRESS_RANGE; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, + cc->is_khugepaged)) return SCAN_VMA_CHECK; /* * Anon VMA expected, the address may be unmapped then @@ -931,21 +873,60 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, */ if (!vma->anon_vma || !vma_is_anonymous(vma)) return SCAN_VMA_CHECK; - return 0; + return SCAN_SUCCEED; +} + +static int find_pmd_or_thp_or_none(struct mm_struct *mm, + unsigned long address, + pmd_t **pmd) +{ + pmd_t pmde; + + *pmd = mm_find_pmd(mm, address); + if (!*pmd) + return SCAN_PMD_NULL; + + pmde = pmd_read_atomic(*pmd); + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ + barrier(); +#endif + if (!pmd_present(pmde)) + return SCAN_PMD_NULL; + if (pmd_trans_huge(pmde)) + return SCAN_PMD_MAPPED; + if (pmd_bad(pmde)) + return SCAN_PMD_NULL; + return SCAN_SUCCEED; +} + +static int check_pmd_still_valid(struct mm_struct *mm, + unsigned long address, + pmd_t *pmd) +{ + pmd_t *new_pmd; + int result = find_pmd_or_thp_or_none(mm, address, &new_pmd); + + if (result != SCAN_SUCCEED) + return result; + if (new_pmd != pmd) + return SCAN_FAIL; + return SCAN_SUCCEED; } /* * Bring missing pages in from swap, to complete THP collapse. - * Only done if khugepaged_scan_pmd believes it is worthwhile. + * Only done if hpage_collapse_scan_pmd believes it is worthwhile. * * Called and returns without pte mapped or spinlocks held. * Note that if false is returned, mmap_lock will be released. */ -static bool __collapse_huge_page_swapin(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long haddr, pmd_t *pmd, - int referenced) +static int __collapse_huge_page_swapin(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long haddr, pmd_t *pmd, + int referenced) { int swapped_in = 0; vm_fault_t ret = 0; @@ -976,12 +957,13 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, */ if (ret & VM_FAULT_RETRY) { trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + /* Likely, but not guaranteed, that page lock failed */ + return SCAN_PAGE_LOCK; } if (ret & VM_FAULT_ERROR) { mmap_read_unlock(mm); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 0); - return false; + return SCAN_FAIL; } swapped_in++; } @@ -991,30 +973,41 @@ static bool __collapse_huge_page_swapin(struct mm_struct *mm, lru_add_drain(); trace_mm_collapse_huge_page_swapin(mm, swapped_in, referenced, 1); - return true; + return SCAN_SUCCEED; } -static void collapse_huge_page(struct mm_struct *mm, - unsigned long address, - struct page **hpage, - int node, int referenced, int unmapped) +static int alloc_charge_hpage(struct page **hpage, struct mm_struct *mm, + struct collapse_control *cc) +{ + /* Only allocate from the target node */ + gfp_t gfp = (cc->is_khugepaged ? alloc_hugepage_khugepaged_gfpmask() : + GFP_TRANSHUGE) | __GFP_THISNODE; + int node = hpage_collapse_find_target_node(cc); + + if (!hpage_collapse_alloc_page(hpage, gfp, node)) + return SCAN_ALLOC_HUGE_PAGE_FAIL; + if (unlikely(mem_cgroup_charge(page_folio(*hpage), mm, gfp))) + return SCAN_CGROUP_CHARGE_FAIL; + count_memcg_page_event(*hpage, THP_COLLAPSE_ALLOC); + return SCAN_SUCCEED; +} + +static int collapse_huge_page(struct mm_struct *mm, unsigned long address, + int referenced, int unmapped, + struct collapse_control *cc) { LIST_HEAD(compound_pagelist); pmd_t *pmd, _pmd; pte_t *pte; pgtable_t pgtable; - struct page *new_page; + struct page *hpage; spinlock_t *pmd_ptl, *pte_ptl; - int isolated = 0, result = 0; + int result = SCAN_FAIL; struct vm_area_struct *vma; struct mmu_notifier_range range; - gfp_t gfp; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - /* * Before allocating the hugepage, release the mmap_lock read lock. * The allocation can take potentially a long time if it involves @@ -1022,40 +1015,34 @@ static void collapse_huge_page(struct mm_struct *mm, * that. We will recheck the vma after taking it again in write mode. */ mmap_read_unlock(mm); - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; - goto out_nolock; - } - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out_nolock; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) { + result = hugepage_vma_revalidate(mm, address, &vma, cc); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; } - /* - * __collapse_huge_page_swapin will return with mmap_lock released - * when it fails. So we jump out_nolock directly in that case. - * Continuing to collapse causes inconsistency. - */ - if (unmapped && !__collapse_huge_page_swapin(mm, vma, address, - pmd, referenced)) { - goto out_nolock; + if (unmapped) { + /* + * __collapse_huge_page_swapin will return with mmap_lock + * released when it fails. So we jump out_nolock directly in + * that case. Continuing to collapse causes inconsistency. + */ + result = __collapse_huge_page_swapin(mm, vma, address, pmd, + referenced); + if (result != SCAN_SUCCEED) + goto out_nolock; } mmap_read_unlock(mm); @@ -1065,11 +1052,12 @@ static void collapse_huge_page(struct mm_struct *mm, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma); - if (result) + result = hugepage_vma_revalidate(mm, address, &vma, cc); + if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ - if (mm_find_pmd(mm, address) != pmd) + result = check_pmd_still_valid(mm, address, pmd); + if (result != SCAN_SUCCEED) goto out_up_write; anon_vma_lock_write(vma->anon_vma); @@ -1095,11 +1083,11 @@ static void collapse_huge_page(struct mm_struct *mm, mmu_notifier_invalidate_range_end(&range); spin_lock(pte_ptl); - isolated = __collapse_huge_page_isolate(vma, address, pte, - &compound_pagelist); + result = __collapse_huge_page_isolate(vma, address, pte, cc, + &compound_pagelist); spin_unlock(pte_ptl); - if (unlikely(!isolated)) { + if (unlikely(result != SCAN_SUCCEED)) { pte_unmap(pte); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); @@ -1111,7 +1099,6 @@ static void collapse_huge_page(struct mm_struct *mm, pmd_populate(mm, pmd, pmd_pgtable(_pmd)); spin_unlock(pmd_ptl); anon_vma_unlock_write(vma->anon_vma); - result = SCAN_FAIL; goto out_up_write; } @@ -1121,8 +1108,8 @@ static void collapse_huge_page(struct mm_struct *mm, */ anon_vma_unlock_write(vma->anon_vma); - __collapse_huge_page_copy(pte, new_page, vma, address, pte_ptl, - &compound_pagelist); + __collapse_huge_page_copy(pte, hpage, vma, address, pte_ptl, + &compound_pagelist); pte_unmap(pte); /* * spin_lock() below is not the equivalent of smp_wmb(), but @@ -1130,42 +1117,43 @@ static void collapse_huge_page(struct mm_struct *mm, * avoid the copy_huge_page writes to become visible after * the set_pmd_at() write. */ - __SetPageUptodate(new_page); + __SetPageUptodate(hpage); pgtable = pmd_pgtable(_pmd); - _pmd = mk_huge_pmd(new_page, vma->vm_page_prot); + _pmd = mk_huge_pmd(hpage, vma->vm_page_prot); _pmd = maybe_pmd_mkwrite(pmd_mkdirty(_pmd), vma); spin_lock(pmd_ptl); BUG_ON(!pmd_none(*pmd)); - page_add_new_anon_rmap(new_page, vma, address); - lru_cache_add_inactive_or_unevictable(new_page, vma); + page_add_new_anon_rmap(hpage, vma, address); + lru_cache_add_inactive_or_unevictable(hpage, vma); pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, address, pmd, _pmd); update_mmu_cache_pmd(vma, address, pmd); spin_unlock(pmd_ptl); - *hpage = NULL; + hpage = NULL; - khugepaged_pages_collapsed++; result = SCAN_SUCCEED; out_up_write: mmap_write_unlock(mm); out_nolock: - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); - trace_mm_collapse_huge_page(mm, isolated, result); - return; + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } + trace_mm_collapse_huge_page(mm, result == SCAN_SUCCEED, result); + return result; } -static int khugepaged_scan_pmd(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long address, - struct page **hpage) +static int hpage_collapse_scan_pmd(struct mm_struct *mm, + struct vm_area_struct *vma, + unsigned long address, bool *mmap_locked, + struct collapse_control *cc) { pmd_t *pmd; pte_t *pte, *_pte; - int ret = 0, result = 0, referenced = 0; + int result = SCAN_FAIL, referenced = 0; int none_or_zero = 0, shared = 0; struct page *page = NULL; unsigned long _address; @@ -1175,19 +1163,19 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, VM_BUG_ON(address & ~HPAGE_PMD_MASK); - pmd = mm_find_pmd(mm, address); - if (!pmd) { - result = SCAN_PMD_NULL; + result = find_pmd_or_thp_or_none(mm, address, &pmd); + if (result != SCAN_SUCCEED) goto out; - } - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); pte = pte_offset_map_lock(mm, pmd, address, &ptl); for (_address = address, _pte = pte; _pte < pte + HPAGE_PMD_NR; _pte++, _address += PAGE_SIZE) { pte_t pteval = *_pte; if (is_swap_pte(pteval)) { - if (++unmapped <= khugepaged_max_ptes_swap) { + ++unmapped; + if (!cc->is_khugepaged || + unmapped <= khugepaged_max_ptes_swap) { /* * Always be strict with uffd-wp * enabled swap entries. Please see @@ -1205,8 +1193,10 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, } } if (pte_none(pteval) || is_zero_pfn(pte_pfn(pteval))) { + ++none_or_zero; if (!userfaultfd_armed(vma) && - ++none_or_zero <= khugepaged_max_ptes_none) { + (!cc->is_khugepaged || + none_or_zero <= khugepaged_max_ptes_none)) { continue; } else { result = SCAN_EXCEED_NONE_PTE; @@ -1236,27 +1226,30 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, goto out_unmap; } - if (page_mapcount(page) > 1 && - ++shared > khugepaged_max_ptes_shared) { - result = SCAN_EXCEED_SHARED_PTE; - count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); - goto out_unmap; + if (page_mapcount(page) > 1) { + ++shared; + if (cc->is_khugepaged && + shared > khugepaged_max_ptes_shared) { + result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); + goto out_unmap; + } } page = compound_head(page); /* * Record which node the original page is from and save this - * information to khugepaged_node_load[]. + * information to cc->node_load[]. * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; goto out_unmap; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; goto out_unmap; @@ -1291,31 +1284,38 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, result = SCAN_PAGE_COUNT; goto out_unmap; } - if (pte_young(pteval) || - page_is_young(page) || PageReferenced(page) || - mmu_notifier_test_young(vma->vm_mm, address)) + + /* + * If collapse was initiated by khugepaged, check that there is + * enough young pte to justify collapsing the page + */ + if (cc->is_khugepaged && + (pte_young(pteval) || page_is_young(page) || + PageReferenced(page) || mmu_notifier_test_young(vma->vm_mm, + address))) referenced++; } if (!writable) { result = SCAN_PAGE_RO; - } else if (!referenced || (unmapped && referenced < HPAGE_PMD_NR/2)) { + } else if (cc->is_khugepaged && + (!referenced || + (unmapped && referenced < HPAGE_PMD_NR / 2))) { result = SCAN_LACK_REFERENCED_PAGE; } else { result = SCAN_SUCCEED; - ret = 1; } out_unmap: pte_unmap_unlock(pte, ptl); - if (ret) { - node = khugepaged_find_target_node(); + if (result == SCAN_SUCCEED) { + result = collapse_huge_page(mm, address, referenced, + unmapped, cc); /* collapse_huge_page will return with the mmap_lock released */ - collapse_huge_page(mm, address, hpage, node, - referenced, unmapped); + *mmap_locked = false; } out: trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result, unmapped); - return ret; + return result; } static void collect_mm_slot(struct mm_slot *mm_slot) @@ -1324,7 +1324,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) lockdep_assert_held(&khugepaged_mm_lock); - if (khugepaged_test_exit(mm)) { + if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); @@ -1402,12 +1402,13 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) return; /* - * This vm_flags may not have VM_HUGEPAGE if the page was not - * collapsed by this mm. But we can still collapse if the page is - * the valid THP. Add extra VM_HUGEPAGE so hugepage_vma_check() - * will not fail the vma for missing VM_HUGEPAGE + * If we are here, we've succeeded in replacing all the native pages + * in the page cache with a single hugepage. If a mm were to fault-in + * this memory (mapped by a suitably aligned VMA), we'd get the hugepage + * and map it by a PMD, regardless of sysfs THP settings. As such, let's + * analogously elide sysfs THP settings here. */ - if (!hugepage_vma_check(vma, vma->vm_flags | VM_HUGEPAGE, false, false)) + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ @@ -1422,8 +1423,7 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!PageHead(hpage)) goto drop_hpage; - pmd = mm_find_pmd(mm, haddr); - if (!pmd) + if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) goto drop_hpage; start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); @@ -1497,7 +1497,7 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) if (!mmap_write_trylock(mm)) return; - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) @@ -1541,8 +1541,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) if (vma->vm_end < addr + HPAGE_PMD_SIZE) continue; mm = vma->vm_mm; - pmd = mm_find_pmd(mm, addr); - if (!pmd) + if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) continue; /* * We need exclusive mmap_lock to retract page table. @@ -1560,7 +1559,8 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. */ - if (!khugepaged_test_exit(mm) && !userfaultfd_wp(vma)) + if (!hpage_collapse_test_exit(mm) && + !userfaultfd_wp(vma)) collapse_and_free_pmd(mm, vma, addr, pmd); mmap_write_unlock(mm); } else { @@ -1577,8 +1577,7 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * @mm: process address space where collapse happens * @file: file that collapse on * @start: collapse start address - * @hpage: new allocated huge page for collapse - * @node: appointed node the new huge page allocate from + * @cc: collapse context and scratchpad * * Basic scheme is simple, details are more complex: * - allocate and lock a new huge page; @@ -1595,13 +1594,11 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static void collapse_file(struct mm_struct *mm, - struct file *file, pgoff_t start, - struct page **hpage, int node) +static int collapse_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; - gfp_t gfp; - struct page *new_page; + struct page *hpage; pgoff_t index, end = start + HPAGE_PMD_NR; LIST_HEAD(pagelist); XA_STATE_ORDER(xas, &mapping->i_pages, start, HPAGE_PMD_ORDER); @@ -1612,20 +1609,9 @@ static void collapse_file(struct mm_struct *mm, VM_BUG_ON(!IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) && !is_shmem); VM_BUG_ON(start & (HPAGE_PMD_NR - 1)); - /* Only allocate from the target node */ - gfp = alloc_hugepage_khugepaged_gfpmask() | __GFP_THISNODE; - - new_page = khugepaged_alloc_page(hpage, gfp, node); - if (!new_page) { - result = SCAN_ALLOC_HUGE_PAGE_FAIL; + result = alloc_charge_hpage(&hpage, mm, cc); + if (result != SCAN_SUCCEED) goto out; - } - - if (unlikely(mem_cgroup_charge(page_folio(new_page), mm, gfp))) { - result = SCAN_CGROUP_CHARGE_FAIL; - goto out; - } - count_memcg_page_event(new_page, THP_COLLAPSE_ALLOC); /* * Ensure we have slots for all the pages in the range. This is @@ -1643,14 +1629,14 @@ static void collapse_file(struct mm_struct *mm, } } while (1); - __SetPageLocked(new_page); + __SetPageLocked(hpage); if (is_shmem) - __SetPageSwapBacked(new_page); - new_page->index = start; - new_page->mapping = mapping; + __SetPageSwapBacked(hpage); + hpage->index = start; + hpage->mapping = mapping; /* - * At this point the new_page is locked and not up-to-date. + * At this point the hpage is locked and not up-to-date. * It's safe to insert it into the page cache, because nobody would * be able to map it or use it in another way until we unlock it. */ @@ -1678,7 +1664,7 @@ static void collapse_file(struct mm_struct *mm, result = SCAN_FAIL; goto xa_locked; } - xas_store(&xas, new_page); + xas_store(&xas, hpage); nr_none++; continue; } @@ -1820,19 +1806,19 @@ static void collapse_file(struct mm_struct *mm, list_add_tail(&page->lru, &pagelist); /* Finally, replace with the new page. */ - xas_store(&xas, new_page); + xas_store(&xas, hpage); continue; out_unlock: unlock_page(page); put_page(page); goto xa_unlocked; } - nr = thp_nr_pages(new_page); + nr = thp_nr_pages(hpage); if (is_shmem) - __mod_lruvec_page_state(new_page, NR_SHMEM_THPS, nr); + __mod_lruvec_page_state(hpage, NR_SHMEM_THPS, nr); else { - __mod_lruvec_page_state(new_page, NR_FILE_THPS, nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, nr); filemap_nr_thps_inc(mapping); /* * Paired with smp_mb() in do_dentry_open() to ensure @@ -1843,21 +1829,21 @@ out_unlock: smp_mb(); if (inode_is_open_for_write(mapping->host)) { result = SCAN_FAIL; - __mod_lruvec_page_state(new_page, NR_FILE_THPS, -nr); + __mod_lruvec_page_state(hpage, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); goto xa_locked; } } if (nr_none) { - __mod_lruvec_page_state(new_page, NR_FILE_PAGES, nr_none); + __mod_lruvec_page_state(hpage, NR_FILE_PAGES, nr_none); /* nr_none is always 0 for non-shmem. */ - __mod_lruvec_page_state(new_page, NR_SHMEM, nr_none); + __mod_lruvec_page_state(hpage, NR_SHMEM, nr_none); } /* Join all the small entries into a single multi-index entry */ xas_set_order(&xas, start, HPAGE_PMD_ORDER); - xas_store(&xas, new_page); + xas_store(&xas, hpage); xa_locked: xas_unlock_irq(&xas); xa_unlocked: @@ -1879,11 +1865,11 @@ xa_unlocked: index = start; list_for_each_entry_safe(page, tmp, &pagelist, lru) { while (index < page->index) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - copy_highpage(new_page + (page->index % HPAGE_PMD_NR), - page); + copy_highpage(hpage + (page->index % HPAGE_PMD_NR), + page); list_del(&page->lru); page->mapping = NULL; page_ref_unfreeze(page, 1); @@ -1894,23 +1880,22 @@ xa_unlocked: index++; } while (index < end) { - clear_highpage(new_page + (index % HPAGE_PMD_NR)); + clear_highpage(hpage + (index % HPAGE_PMD_NR)); index++; } - SetPageUptodate(new_page); - page_ref_add(new_page, HPAGE_PMD_NR - 1); + SetPageUptodate(hpage); + page_ref_add(hpage, HPAGE_PMD_NR - 1); if (is_shmem) - set_page_dirty(new_page); - lru_cache_add(new_page); + set_page_dirty(hpage); + lru_cache_add(hpage); /* * Remove pte page tables, so we can re-fault the page as huge. */ retract_page_tables(mapping, start); - *hpage = NULL; - - khugepaged_pages_collapsed++; + unlock_page(hpage); + hpage = NULL; } else { struct page *page; @@ -1949,19 +1934,23 @@ xa_unlocked: VM_BUG_ON(nr_none); xas_unlock_irq(&xas); - new_page->mapping = NULL; + hpage->mapping = NULL; } - unlock_page(new_page); + if (hpage) + unlock_page(hpage); out: VM_BUG_ON(!list_empty(&pagelist)); - if (!IS_ERR_OR_NULL(*hpage)) - mem_cgroup_uncharge(page_folio(*hpage)); + if (hpage) { + mem_cgroup_uncharge(page_folio(hpage)); + put_page(hpage); + } /* TODO: tracepoints */ + return result; } -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -1972,14 +1961,16 @@ static void khugepaged_scan_file(struct mm_struct *mm, present = 0; swap = 0; - memset(khugepaged_node_load, 0, sizeof(khugepaged_node_load)); + memset(cc->node_load, 0, sizeof(cc->node_load)); rcu_read_lock(); xas_for_each(&xas, page, start + HPAGE_PMD_NR - 1) { if (xas_retry(&xas, page)) continue; if (xa_is_value(page)) { - if (++swap > khugepaged_max_ptes_swap) { + ++swap; + if (cc->is_khugepaged && + swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; @@ -1997,11 +1988,11 @@ static void khugepaged_scan_file(struct mm_struct *mm, } node = page_to_nid(page); - if (khugepaged_scan_abort(node)) { + if (hpage_collapse_scan_abort(node, cc)) { result = SCAN_SCAN_ABORT; break; } - khugepaged_node_load[node]++; + cc->node_load[node]++; if (!PageLRU(page)) { result = SCAN_PAGE_LRU; @@ -2030,20 +2021,21 @@ static void khugepaged_scan_file(struct mm_struct *mm, rcu_read_unlock(); if (result == SCAN_SUCCEED) { - if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { + if (cc->is_khugepaged && + present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - node = khugepaged_find_target_node(); - collapse_file(mm, file, start, hpage, node); + result = collapse_file(mm, file, start, cc); } } /* TODO: tracepoints */ + return result; } #else -static void khugepaged_scan_file(struct mm_struct *mm, - struct file *file, pgoff_t start, struct page **hpage) +static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, + pgoff_t start, struct collapse_control *cc) { BUILD_BUG(); } @@ -2053,8 +2045,8 @@ static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) } #endif -static unsigned int khugepaged_scan_mm_slot(unsigned int pages, - struct page **hpage) +static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, + struct collapse_control *cc) __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { @@ -2065,6 +2057,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, VM_BUG_ON(!pages); lockdep_assert_held(&khugepaged_mm_lock); + *result = SCAN_FAIL; if (khugepaged_scan.mm_slot) mm_slot = khugepaged_scan.mm_slot; @@ -2085,7 +2078,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!khugepaged_test_exit(mm))) + if (likely(!hpage_collapse_test_exit(mm))) vma = find_vma(mm, khugepaged_scan.address); progress++; @@ -2093,11 +2086,11 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, unsigned long hstart, hend; cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) { + if (unlikely(hpage_collapse_test_exit(mm))) { progress++; break; } - if (!hugepage_vma_check(vma, vma->vm_flags, false, false)) { + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, true)) { skip: progress++; continue; @@ -2111,9 +2104,10 @@ skip: VM_BUG_ON(khugepaged_scan.address & ~HPAGE_PMD_MASK); while (khugepaged_scan.address < hend) { - int ret; + bool mmap_locked = true; + cond_resched(); - if (unlikely(khugepaged_test_exit(mm))) + if (unlikely(hpage_collapse_test_exit(mm))) goto breakouterloop; VM_BUG_ON(khugepaged_scan.address < hstart || @@ -2125,19 +2119,29 @@ skip: khugepaged_scan.address); mmap_read_unlock(mm); - ret = 1; - khugepaged_scan_file(mm, file, pgoff, hpage); + *result = khugepaged_scan_file(mm, file, pgoff, + cc); + mmap_locked = false; fput(file); } else { - ret = khugepaged_scan_pmd(mm, vma, - khugepaged_scan.address, - hpage); + *result = hpage_collapse_scan_pmd(mm, vma, + khugepaged_scan.address, + &mmap_locked, + cc); } + if (*result == SCAN_SUCCEED) + ++khugepaged_pages_collapsed; /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; - if (ret) - /* we released mmap_lock so break loop */ + if (!mmap_locked) + /* + * We released mmap_lock so break loop. Note + * that we drop mmap_lock before all hugepage + * allocations, so if allocation fails, we are + * guaranteed to break here and report the + * correct result back to caller. + */ goto breakouterloop_mmap_lock; if (progress >= pages) goto breakouterloop; @@ -2153,7 +2157,7 @@ breakouterloop_mmap_lock: * Release the current mm_slot if this mm is about to die, or * if we scanned all vmas of this mm. */ - if (khugepaged_test_exit(mm) || !vma) { + if (hpage_collapse_test_exit(mm) || !vma) { /* * Make sure that if mm_users is reaching zero while * khugepaged runs here, khugepaged_exit will find @@ -2187,19 +2191,16 @@ static int khugepaged_wait_event(void) kthread_should_stop(); } -static void khugepaged_do_scan(void) +static void khugepaged_do_scan(struct collapse_control *cc) { - struct page *hpage = NULL; unsigned int progress = 0, pass_through_head = 0; unsigned int pages = READ_ONCE(khugepaged_pages_to_scan); bool wait = true; + int result = SCAN_SUCCEED; lru_add_drain_all(); - while (progress < pages) { - if (!khugepaged_prealloc_page(&hpage, &wait)) - break; - + while (true) { cond_resched(); if (unlikely(kthread_should_stop() || try_to_freeze())) @@ -2211,14 +2212,25 @@ static void khugepaged_do_scan(void) if (khugepaged_has_work() && pass_through_head < 2) progress += khugepaged_scan_mm_slot(pages - progress, - &hpage); + &result, cc); else progress = pages; spin_unlock(&khugepaged_mm_lock); - } - if (!IS_ERR_OR_NULL(hpage)) - put_page(hpage); + if (progress >= pages) + break; + + if (result == SCAN_ALLOC_HUGE_PAGE_FAIL) { + /* + * If fail to allocate the first time, try to sleep for + * a while. When hit again, cancel the scan. + */ + if (!wait) + break; + wait = false; + khugepaged_alloc_sleep(); + } + } } static bool khugepaged_should_wakeup(void) @@ -2255,7 +2267,7 @@ static int khugepaged(void *none) set_user_nice(current, MAX_NICE); while (!kthread_should_stop()) { - khugepaged_do_scan(); + khugepaged_do_scan(&khugepaged_collapse_control); khugepaged_wait_work(); } @@ -2354,3 +2366,120 @@ void khugepaged_min_free_kbytes_update(void) set_recommended_min_free_kbytes(); mutex_unlock(&khugepaged_mutex); } + +static int madvise_collapse_errno(enum scan_result r) +{ + /* + * MADV_COLLAPSE breaks from existing madvise(2) conventions to provide + * actionable feedback to caller, so they may take an appropriate + * fallback measure depending on the nature of the failure. + */ + switch (r) { + case SCAN_ALLOC_HUGE_PAGE_FAIL: + return -ENOMEM; + case SCAN_CGROUP_CHARGE_FAIL: + return -EBUSY; + /* Resource temporary unavailable - trying again might succeed */ + case SCAN_PAGE_LOCK: + case SCAN_PAGE_LRU: + return -EAGAIN; + /* + * Other: Trying again likely not to succeed / error intrinsic to + * specified memory range. khugepaged likely won't be able to collapse + * either. + */ + default: + return -EINVAL; + } +} + +int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, + unsigned long start, unsigned long end) +{ + struct collapse_control *cc; + struct mm_struct *mm = vma->vm_mm; + unsigned long hstart, hend, addr; + int thps = 0, last_fail = SCAN_FAIL; + bool mmap_locked = true; + + BUG_ON(vma->vm_start > start); + BUG_ON(vma->vm_end < end); + + *prev = vma; + + /* TODO: Support file/shmem */ + if (!vma->anon_vma || !vma_is_anonymous(vma)) + return -EINVAL; + + if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) + return -EINVAL; + + cc = kmalloc(sizeof(*cc), GFP_KERNEL); + if (!cc) + return -ENOMEM; + cc->is_khugepaged = false; + cc->last_target_node = NUMA_NO_NODE; + + mmgrab(mm); + lru_add_drain_all(); + + hstart = (start + ~HPAGE_PMD_MASK) & HPAGE_PMD_MASK; + hend = end & HPAGE_PMD_MASK; + + for (addr = hstart; addr < hend; addr += HPAGE_PMD_SIZE) { + int result = SCAN_FAIL; + + if (!mmap_locked) { + cond_resched(); + mmap_read_lock(mm); + mmap_locked = true; + result = hugepage_vma_revalidate(mm, addr, &vma, cc); + if (result != SCAN_SUCCEED) { + last_fail = result; + goto out_nolock; + } + } + mmap_assert_locked(mm); + memset(cc->node_load, 0, sizeof(cc->node_load)); + result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, + cc); + if (!mmap_locked) + *prev = NULL; /* Tell caller we dropped mmap_lock */ + + switch (result) { + case SCAN_SUCCEED: + case SCAN_PMD_MAPPED: + ++thps; + break; + /* Whitelisted set of results where continuing OK */ + case SCAN_PMD_NULL: + case SCAN_PTE_NON_PRESENT: + case SCAN_PTE_UFFD_WP: + case SCAN_PAGE_RO: + case SCAN_LACK_REFERENCED_PAGE: + case SCAN_PAGE_NULL: + case SCAN_PAGE_COUNT: + case SCAN_PAGE_LOCK: + case SCAN_PAGE_COMPOUND: + case SCAN_PAGE_LRU: + last_fail = result; + break; + default: + last_fail = result; + /* Other error, exit */ + goto out_maybelock; + } + } + +out_maybelock: + /* Caller expects us to hold mmap_lock on return */ + if (!mmap_locked) + mmap_read_lock(mm); +out_nolock: + mmap_assert_locked(mm); + mmdrop(mm); + kfree(cc); + + return thps == ((hend - hstart) >> HPAGE_PMD_SHIFT) ? 0 + : madvise_collapse_errno(last_fail); +} diff --git a/mm/ksm.c b/mm/ksm.c index 42ab153335a2..2f315c69fa2c 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1134,6 +1134,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, { struct mm_struct *mm = vma->vm_mm; pmd_t *pmd; + pmd_t pmde; pte_t *ptep; pte_t newpte; spinlock_t *ptl; @@ -1148,6 +1149,15 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, pmd = mm_find_pmd(mm, addr); if (!pmd) goto out; + /* + * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() + * without holding anon_vma lock for write. So when looking for a + * genuine pmde (in which to find pte), test present and !THP together. + */ + pmde = *pmd; + barrier(); + if (!pmd_present(pmde) || pmd_trans_huge(pmde)) + goto out; mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, addr, addr + PAGE_SIZE); diff --git a/mm/madvise.c b/mm/madvise.c index 9ff51650f4f0..4f86eb7f554d 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -59,6 +59,7 @@ static int madvise_need_mmap_write(int behavior) case MADV_FREE: case MADV_POPULATE_READ: case MADV_POPULATE_WRITE: + case MADV_COLLAPSE: return 0; default: /* be safe, default to 1. list exceptions explicitly */ @@ -1060,6 +1061,8 @@ static int madvise_vma_behavior(struct vm_area_struct *vma, if (error) goto out; break; + case MADV_COLLAPSE: + return madvise_collapse(vma, prev, start, end); } anon_name = anon_vma_name(vma); @@ -1153,6 +1156,7 @@ madvise_behavior_valid(int behavior) #ifdef CONFIG_TRANSPARENT_HUGEPAGE case MADV_HUGEPAGE: case MADV_NOHUGEPAGE: + case MADV_COLLAPSE: #endif case MADV_DONTDUMP: case MADV_DODUMP: @@ -1169,13 +1173,13 @@ madvise_behavior_valid(int behavior) } } -static bool -process_madvise_behavior_valid(int behavior) +static bool process_madvise_behavior_valid(int behavior) { switch (behavior) { case MADV_COLD: case MADV_PAGEOUT: case MADV_WILLNEED: + case MADV_COLLAPSE: return true; default: return false; @@ -1342,6 +1346,7 @@ int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, * MADV_NOHUGEPAGE - mark the given range as not worth being backed by * transparent huge pages so the existing pages will not be * coalesced into THP and new pages will not be allocated as THP. + * MADV_COLLAPSE - synchronously coalesce pages into new THP. * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e7ac570dda75..e6713a979e42 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1529,20 +1529,18 @@ static int identify_page_state(unsigned long pfn, struct page *p, return page_action(ps, p, pfn); } -static int try_to_split_thp_page(struct page *page, const char *msg) +static int try_to_split_thp_page(struct page *page) { - lock_page(page); - if (unlikely(split_huge_page(page))) { - unsigned long pfn = page_to_pfn(page); + int ret; - unlock_page(page); - pr_info("%s: %#lx: thp split failed\n", msg, pfn); - put_page(page); - return -EBUSY; - } + lock_page(page); + ret = split_huge_page(page); unlock_page(page); - return 0; + if (unlikely(ret)) + put_page(page); + + return ret; } static void unmap_and_kill(struct list_head *to_kill, unsigned long pfn, @@ -2084,7 +2082,7 @@ try_again: * page is a valid handlable page. */ SetPageHasHWPoisoned(hpage); - if (try_to_split_thp_page(p, "Memory Failure") < 0) { + if (try_to_split_thp_page(p) < 0) { action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); res = -EBUSY; goto unlock_mutex; @@ -2510,8 +2508,11 @@ static int soft_offline_in_use_page(struct page *page) struct page *hpage = compound_head(page); if (!PageHuge(page) && PageTransHuge(hpage)) - if (try_to_split_thp_page(page, "soft offline") < 0) + if (try_to_split_thp_page(page) < 0) { + pr_info("soft offline: %#lx: thp split failed\n", + page_to_pfn(page)); return -EBUSY; + } return __soft_offline_page(page); } diff --git a/mm/memory.c b/mm/memory.c index 861a3f67997c..e775728f15be 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -4992,7 +4992,7 @@ static vm_fault_t __handle_mm_fault(struct vm_area_struct *vma, return VM_FAULT_OOM; retry_pud: if (pud_none(*vmf.pud) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pud(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; @@ -5026,7 +5026,7 @@ retry_pud: goto retry_pud; if (pmd_none(*vmf.pmd) && - hugepage_vma_check(vma, vm_flags, false, true)) { + hugepage_vma_check(vma, vm_flags, false, true, true)) { ret = create_huge_pmd(&vmf); if (!(ret & VM_FAULT_FALLBACK)) return ret; diff --git a/mm/mempolicy.c b/mm/mempolicy.c index b73d3248d976..a88fd94e18d6 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -853,12 +853,14 @@ static long do_set_mempolicy(unsigned short mode, unsigned short flags, goto out; } + task_lock(current); ret = mpol_set_nodemask(new, nodes, scratch); if (ret) { + task_unlock(current); mpol_put(new); goto out; } - task_lock(current); + old = current->mempolicy; current->mempolicy = new; if (new && new->mode == MPOL_INTERLEAVE) @@ -1803,7 +1805,7 @@ bool vma_policy_mof(struct vm_area_struct *vma) return pol->flags & MPOL_F_MOF; } -static int apply_policy_zone(struct mempolicy *policy, enum zone_type zone) +bool apply_policy_zone(struct mempolicy *policy, enum zone_type zone) { enum zone_type dynamic_policy_zone = policy_zone; diff --git a/mm/mmap.c b/mm/mmap.c index 9d780f415be3..dd25a2aa94f7 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2232,6 +2232,9 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, */ pgoff = 0; get_area = shmem_get_unmapped_area; + } else if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE)) { + /* Ensures that larger anonymous mappings are THP aligned. */ + get_area = thp_get_unmapped_area; } addr = get_area(file, addr, len, pgoff, flags); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 08522a831c7a..7e16873b4b6f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3010,7 +3010,7 @@ __rmqueue_fallback(struct zone *zone, int order, int start_migratetype, * i.e. orders < pageblock_order. If there are no local zones free, * the zonelists will be reiterated without ALLOC_NOFRAGMENT. */ - if (alloc_flags & ALLOC_NOFRAGMENT) + if (order < pageblock_order && alloc_flags & ALLOC_NOFRAGMENT) min_order = pageblock_order; /* @@ -9023,7 +9023,7 @@ void *__init alloc_large_system_hash(const char *tablename, { unsigned long long max = high_limit; unsigned long log2qty, size; - void *table = NULL; + void *table; gfp_t gfp_flags; bool virt; bool huge; diff --git a/mm/page_ext.c b/mm/page_ext.c index 3dc715d7ac29..e22a928dd66a 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -336,7 +336,7 @@ static int __meminit online_page_ext(unsigned long start_pfn, } static int __meminit offline_page_ext(unsigned long start_pfn, - unsigned long nr_pages, int nid) + unsigned long nr_pages) { unsigned long start, end, pfn; @@ -362,11 +362,11 @@ static int __meminit page_ext_callback(struct notifier_block *self, break; case MEM_OFFLINE: offline_page_ext(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); + mn->nr_pages); break; case MEM_CANCEL_ONLINE: offline_page_ext(mn->start_pfn, - mn->nr_pages, mn->status_change_nid); + mn->nr_pages); break; case MEM_GOING_OFFLINE: break; diff --git a/mm/page_io.c b/mm/page_io.c index 68318134dc92..68d53fc27598 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -28,7 +28,7 @@ #include #include "swap.h" -void end_swap_bio_write(struct bio *bio) +static void end_swap_bio_write(struct bio *bio) { struct page *page = bio_first_page_all(bio); @@ -202,7 +202,7 @@ int swap_writepage(struct page *page, struct writeback_control *wbc) end_page_writeback(page); goto out; } - ret = __swap_writepage(page, wbc, end_swap_bio_write); + ret = __swap_writepage(page, wbc); out: return ret; } @@ -332,8 +332,7 @@ static int swap_writepage_fs(struct page *page, struct writeback_control *wbc) return 0; } -int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func) +int __swap_writepage(struct page *page, struct writeback_control *wbc) { struct bio *bio; int ret; @@ -358,7 +357,7 @@ int __swap_writepage(struct page *page, struct writeback_control *wbc, REQ_OP_WRITE | REQ_SWAP | wbc_to_write_flags(wbc), GFP_NOIO); bio->bi_iter.bi_sector = swap_page_sector(page); - bio->bi_end_io = end_write_func; + bio->bi_end_io = end_swap_bio_write; bio_add_page(bio, page, thp_size(page), 0); bio_associate_blkg_from_page(bio, page); diff --git a/mm/rmap.c b/mm/rmap.c index 93d5a6f793d2..9af08343ce55 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -770,13 +770,17 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) return vma_address(page, vma); } +/* + * Returns the actual pmd_t* where we expect 'address' to be mapped from, or + * NULL if it doesn't exist. No guarantees / checks on what the pmd_t* + * represents. + */ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) { pgd_t *pgd; p4d_t *p4d; pud_t *pud; pmd_t *pmd = NULL; - pmd_t pmde; pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) @@ -791,15 +795,6 @@ pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) goto out; pmd = pmd_offset(pud, address); - /* - * Some THP functions use the sequence pmdp_huge_clear_flush(), set_pmd_at() - * without holding anon_vma lock for write. So when looking for a - * genuine pmde (in which to find pte), test present and !THP together. - */ - pmde = *pmd; - barrier(); - if (!pmd_present(pmde) || pmd_trans_huge(pmde)) - pmd = NULL; out: return pmd; } diff --git a/mm/swap.h b/mm/swap.h index 17936e068c1c..0ffa5b478051 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -18,9 +18,7 @@ static inline void swap_read_unplug(struct swap_iocb *plug) } void swap_write_unplug(struct swap_iocb *sio); int swap_writepage(struct page *page, struct writeback_control *wbc); -void end_swap_bio_write(struct bio *bio); -int __swap_writepage(struct page *page, struct writeback_control *wbc, - bio_end_io_t end_write_func); +int __swap_writepage(struct page *page, struct writeback_control *wbc); /* linux/mm/swap_state.c */ /* One swap address space for each 64M swap space */ diff --git a/mm/util.c b/mm/util.c index 7c00973424cd..a5b5a747c305 100644 --- a/mm/util.c +++ b/mm/util.c @@ -859,10 +859,10 @@ int folio_mapcount(struct folio *folio) return atomic_read(&folio->_mapcount) + 1; compound = folio_entire_mapcount(folio); - nr = folio_nr_pages(folio); if (folio_test_hugetlb(folio)) return compound; ret = compound; + nr = folio_nr_pages(folio); for (i = 0; i < nr; i++) ret += atomic_read(&folio_page(folio, i)->_mapcount) + 1; /* File pages has compound_mapcount included in _mapcount */ diff --git a/mm/vmscan.c b/mm/vmscan.c index 9d04f40de4f0..bf3a2e616756 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3238,7 +3238,7 @@ again: refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - if (refaults != target_lruvec->refaults[0] || + if (refaults != target_lruvec->refaults[WORKINGSET_ANON] || inactive_is_low(target_lruvec, LRU_INACTIVE_ANON)) sc->may_deactivate |= DEACTIVATE_ANON; else @@ -3251,7 +3251,7 @@ again: */ refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); - if (refaults != target_lruvec->refaults[1] || + if (refaults != target_lruvec->refaults[WORKINGSET_FILE] || inactive_is_low(target_lruvec, LRU_INACTIVE_FILE)) sc->may_deactivate |= DEACTIVATE_FILE; else @@ -3567,9 +3567,9 @@ static void snapshot_refaults(struct mem_cgroup *target_memcg, pg_data_t *pgdat) target_lruvec = mem_cgroup_lruvec(target_memcg, pgdat); refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_ANON); - target_lruvec->refaults[0] = refaults; + target_lruvec->refaults[WORKINGSET_ANON] = refaults; refaults = lruvec_page_state(target_lruvec, WORKINGSET_ACTIVATE_FILE); - target_lruvec->refaults[1] = refaults; + target_lruvec->refaults[WORKINGSET_FILE] = refaults; } /* diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 907c9b1e1e61..7b3bffc06078 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1555,6 +1555,13 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, d_off += size; d_size -= size; + /* + * Calling kunmap_atomic(d_addr) is necessary. kunmap_atomic() + * calls must occurs in reverse order of calls to kmap_atomic(). + * So, to call kunmap_atomic(s_addr) we should first call + * kunmap_atomic(d_addr). For more details see + * https://lore.kernel.org/linux-mm/5512421D.4000603@samsung.com/ + */ if (s_off >= PAGE_SIZE) { kunmap_atomic(d_addr); kunmap_atomic(s_addr); @@ -2103,8 +2110,6 @@ unsigned long zs_compact(struct zs_pool *pool) for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; - if (!class) - continue; if (class->index != i) continue; pages_freed += __zs_compact(pool, class); @@ -2149,8 +2154,6 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, for (i = ZS_SIZE_CLASSES - 1; i >= 0; i--) { class = pool->size_class[i]; - if (!class) - continue; if (class->index != i) continue; @@ -2308,9 +2311,6 @@ void zs_destroy_pool(struct zs_pool *pool) int fg; struct size_class *class = pool->size_class[i]; - if (!class) - continue; - if (class->index != i) continue; diff --git a/mm/zswap.c b/mm/zswap.c index 104835b379ec..2d48fd59cc7a 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1026,7 +1026,7 @@ static int zswap_writeback_entry(struct zpool *pool, unsigned long handle) SetPageReclaim(page); /* start writeback */ - __swap_writepage(page, &wbc, end_swap_bio_write); + __swap_writepage(page, &wbc); put_page(page); zswap_written_back_pages++; diff --git a/tools/include/uapi/asm-generic/mman-common.h b/tools/include/uapi/asm-generic/mman-common.h index 6c1aa92a92e4..6ce1f1ceb432 100644 --- a/tools/include/uapi/asm-generic/mman-common.h +++ b/tools/include/uapi/asm-generic/mman-common.h @@ -77,6 +77,8 @@ #define MADV_DONTNEED_LOCKED 24 /* like DONTNEED, but drop locked pages too */ +#define MADV_COLLAPSE 25 /* Synchronous hugepage collapse */ + /* compatibility flags */ #define MAP_FILE 0 diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 155120b67a16..b77b1e28cdb3 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -14,6 +14,9 @@ #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif +#ifndef MADV_COLLAPSE +#define MADV_COLLAPSE 25 +#endif #define BASE_ADDR ((void *)(1UL << 30)) static unsigned long hpage_pmd_size; @@ -23,6 +26,11 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +struct collapse_context { + void (*collapse)(const char *msg, char *p, int nr_hpages, bool expect); + bool enforce_pte_scan_limits; +}; + enum thp_enabled { THP_ALWAYS, THP_MADVISE, @@ -90,18 +98,6 @@ struct settings { struct khugepaged_settings khugepaged; }; -static struct settings default_settings = { - .thp_enabled = THP_MADVISE, - .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_NEVER, - .use_zero_page = 0, - .khugepaged = { - .defrag = 1, - .alloc_sleep_millisecs = 10, - .scan_sleep_millisecs = 10, - }, -}; - static struct settings saved_settings; static bool skip_settings_restore; @@ -279,6 +275,39 @@ static void write_settings(struct settings *settings) write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); } +#define MAX_SETTINGS_DEPTH 4 +static struct settings settings_stack[MAX_SETTINGS_DEPTH]; +static int settings_index; + +static struct settings *current_settings(void) +{ + if (!settings_index) { + printf("Fail: No settings set"); + exit(EXIT_FAILURE); + } + return settings_stack + settings_index - 1; +} + +static void push_settings(struct settings *settings) +{ + if (settings_index >= MAX_SETTINGS_DEPTH) { + printf("Fail: Settings stack exceeded"); + exit(EXIT_FAILURE); + } + settings_stack[settings_index++] = *settings; + write_settings(current_settings()); +} + +static void pop_settings(void) +{ + if (settings_index <= 0) { + printf("Fail: Settings stack empty"); + exit(EXIT_FAILURE); + } + --settings_index; + write_settings(current_settings()); +} + static void restore_settings(int sig) { if (skip_settings_restore) @@ -322,14 +351,6 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -static void adjust_settings(void) -{ - - printf("Adjust settings..."); - write_settings(&default_settings); - success("OK"); -} - #define MAX_LINE_LENGTH 500 static bool check_for_pattern(FILE *fp, char *pattern, char *buf) @@ -341,7 +362,7 @@ static bool check_for_pattern(FILE *fp, char *pattern, char *buf) return false; } -static bool check_huge(void *addr) +static bool check_huge(void *addr, int nr_hpages) { bool thp = false; int ret; @@ -366,7 +387,7 @@ static bool check_huge(void *addr) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", - hpage_pmd_size >> 10); + nr_hpages * (hpage_pmd_size >> 10)); if (ret >= MAX_LINE_LENGTH) { printf("%s: Pattern is too long\n", __func__); exit(EXIT_FAILURE); @@ -434,12 +455,12 @@ err_out: return swap; } -static void *alloc_mapping(void) +static void *alloc_mapping(int nr) { void *p; - p = mmap(BASE_ADDR, hpage_pmd_size, PROT_READ | PROT_WRITE, - MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + p = mmap(BASE_ADDR, nr * hpage_pmd_size, PROT_READ | PROT_WRITE, + MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); if (p != BASE_ADDR) { printf("Failed to allocate VMA at %p\n", BASE_ADDR); exit(EXIT_FAILURE); @@ -456,6 +477,25 @@ static void fill_memory(int *p, unsigned long start, unsigned long end) p[i * page_size / sizeof(*p)] = i + 0xdead0000; } +/* + * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with + * validate_memory()'able contents. + */ +static void *alloc_hpage(void) +{ + void *p; + + p = alloc_mapping(1); + printf("Allocate huge page..."); + madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + fill_memory(p, 0, hpage_pmd_size); + if (check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + return p; +} + static void validate_memory(int *p, unsigned long start, unsigned long end) { int i; @@ -469,26 +509,59 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + bool expect) +{ + int ret; + struct settings settings = *current_settings(); + + printf("%s...", msg); + /* Sanity check */ + if (!check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + + /* + * Prevent khugepaged interference and tests that MADV_COLLAPSE + * ignores /sys/kernel/mm/transparent_hugepage/enabled + */ + settings.thp_enabled = THP_NEVER; + push_settings(&settings); + + /* Clear VM_NOHUGEPAGE */ + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); + ret = madvise(p, nr_hpages * hpage_pmd_size, MADV_COLLAPSE); + if (((bool)ret) == expect) + fail("Fail: Bad return value"); + else if (check_huge(p, nr_hpages) != expect) + fail("Fail: check_huge()"); + else + success("OK"); + + pop_settings(); +} + #define TICK 500000 -static bool wait_for_scan(const char *msg, char *p) +static bool wait_for_scan(const char *msg, char *p, int nr_hpages) { int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (check_huge(p)) { + if (!check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); /* Wait until the second full_scan completed */ full_scans = read_num("khugepaged/full_scans") + 2; printf("%s...", msg); while (timeout--) { - if (check_huge(p)) + if (check_huge(p, nr_hpages)) break; if (read_num("khugepaged/full_scans") >= full_scans) break; @@ -496,121 +569,121 @@ static bool wait_for_scan(const char *msg, char *p) usleep(TICK); } - madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); + madvise(p, nr_hpages * hpage_pmd_size, MADV_NOHUGEPAGE); return timeout == -1; } +static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, + bool expect) +{ + if (wait_for_scan(msg, p, nr_hpages)) { + if (expect) + fail("Timeout"); + else + success("OK"); + return; + } else if (check_huge(p, nr_hpages) == expect) { + success("OK"); + } else { + fail("Fail"); + } +} + static void alloc_at_fault(void) { - struct settings settings = default_settings; + struct settings settings = *current_settings(); char *p; settings.thp_enabled = THP_ALWAYS; - write_settings(&settings); + push_settings(&settings); - p = alloc_mapping(); + p = alloc_mapping(1); *p = 1; printf("Allocate huge page on fault..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); - write_settings(&default_settings); + pop_settings(); madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); munmap(p, hpage_pmd_size); } -static void collapse_full(void) +static void collapse_full(struct collapse_context *c) +{ + void *p; + int nr_hpages = 4; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, + true); + validate_memory(p, 0, size); + munmap(p, size); +} + +static void collapse_empty(struct collapse_context *c) { void *p; - p = alloc_mapping(); - fill_memory(p, 0, hpage_pmd_size); - if (wait_for_scan("Collapse fully populated PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); + p = alloc_mapping(1); + c->collapse("Do not collapse empty PTE table", p, 1, false); munmap(p, hpage_pmd_size); } -static void collapse_empty(void) +static void collapse_single_pte_entry(struct collapse_context *c) { void *p; - p = alloc_mapping(); - if (wait_for_scan("Do not collapse empty PTE table", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); - munmap(p, hpage_pmd_size); -} - -static void collapse_single_pte_entry(void) -{ - void *p; - - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, page_size); - if (wait_for_scan("Collapse PTE table with single PTE entry present", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single PTE entry present", p, + 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_none(void) +static void collapse_max_ptes_none(struct collapse_context *c) { int max_ptes_none = hpage_pmd_nr / 2; - struct settings settings = default_settings; + struct settings settings = *current_settings(); void *p; settings.khugepaged.max_ptes_none = max_ptes_none; - write_settings(&settings); + push_settings(&settings); - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - if (wait_for_scan("Do not collapse with max_ptes_none exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, + !c->enforce_pte_scan_limits); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - if (wait_for_scan("Collapse with max_ptes_none PTEs empty", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + if (c->enforce_pte_scan_limits) { + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, + true); + validate_memory(p, 0, + (hpage_pmd_nr - max_ptes_none) * page_size); + } munmap(p, hpage_pmd_size); - write_settings(&default_settings); + pop_settings(); } -static void collapse_swapin_single_pte(void) +static void collapse_swapin_single_pte(struct collapse_context *c) { void *p; - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, hpage_pmd_size); printf("Swapout one page..."); @@ -625,23 +698,18 @@ static void collapse_swapin_single_pte(void) goto out; } - if (wait_for_scan("Collapse with swapping in single PTE entry", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse with swapping in single PTE entry", p, 1, true); validate_memory(p, 0, hpage_pmd_size); out: munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_swap(void) +static void collapse_max_ptes_swap(struct collapse_context *c) { int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; - p = alloc_mapping(); + p = alloc_mapping(1); fill_memory(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); @@ -656,115 +724,83 @@ static void collapse_max_ptes_swap(void) goto out; } - if (wait_for_scan("Do not collapse with max_ptes_swap exceeded", p)) - fail("Timeout"); - else if (check_huge(p)) - fail("Fail"); - else - success("OK"); + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, + !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); - if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { - perror("madvise(MADV_PAGEOUT)"); - exit(EXIT_FAILURE); - } - if (check_swap(p, max_ptes_swap * page_size)) { - success("OK"); - } else { - fail("Fail"); - goto out; - } + if (c->enforce_pte_scan_limits) { + fill_memory(p, 0, hpage_pmd_size); + printf("Swapout %d of %d pages...", max_ptes_swap, + hpage_pmd_nr); + if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { + perror("madvise(MADV_PAGEOUT)"); + exit(EXIT_FAILURE); + } + if (check_swap(p, max_ptes_swap * page_size)) { + success("OK"); + } else { + fail("Fail"); + goto out; + } - if (wait_for_scan("Collapse with max_ptes_swap pages swapped out", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); - validate_memory(p, 0, hpage_pmd_size); + c->collapse("Collapse with max_ptes_swap pages swapped out", p, + 1, true); + validate_memory(p, 0, hpage_pmd_size); + } out: munmap(p, hpage_pmd_size); } -static void collapse_single_pte_entry_compound(void) +static void collapse_single_pte_entry_compound(struct collapse_context *c) { void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); + p = alloc_hpage(); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table with single PTE mapping compound page", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single PTE mapping compound page", + p, 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); } -static void collapse_full_of_compound(void) +static void collapse_full_of_compound(struct collapse_context *c) { void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of compound pages", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages", p, 1, true); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); } -static void collapse_compound_extreme(void) +static void collapse_compound_extreme(struct collapse_context *c) { void *p; int i; - p = alloc_mapping(); + p = alloc_mapping(1); for (i = 0; i < hpage_pmd_nr; i++) { printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", i + 1, hpage_pmd_nr); madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); fill_memory(BASE_ADDR, 0, hpage_pmd_size); - if (!check_huge(BASE_ADDR)) { + if (!check_huge(BASE_ADDR, 1)) { printf("Failed to allocate huge page\n"); exit(EXIT_FAILURE); } @@ -793,32 +829,28 @@ static void collapse_compound_extreme(void) munmap(BASE_ADDR, hpage_pmd_size); fill_memory(p, 0, hpage_pmd_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Collapse PTE table full of different compound pages", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of different compound pages", p, 1, + true); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); } -static void collapse_fork(void) +static void collapse_fork(struct collapse_context *c) { int wstatus; void *p; - p = alloc_mapping(); + p = alloc_mapping(1); printf("Allocate small page..."); fill_memory(p, 0, page_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -829,19 +861,14 @@ static void collapse_fork(void) skip_settings_restore = true; exit_status = 0; - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); fill_memory(p, page_size, 2 * page_size); - - if (wait_for_scan("Collapse PTE table with single page shared with parent process", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table with single page shared with parent process", + p, 1, true); validate_memory(p, 0, page_size); munmap(p, hpage_pmd_size); @@ -852,7 +879,7 @@ static void collapse_fork(void) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has small page..."); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -860,28 +887,19 @@ static void collapse_fork(void) munmap(p, hpage_pmd_size); } -static void collapse_fork_compound(void) +static void collapse_fork_compound(struct collapse_context *c) { int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -889,21 +907,17 @@ static void collapse_fork_compound(void) printf("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); fill_memory(p, 0, page_size); write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); - if (wait_for_scan("Collapse PTE table full of compound pages in child", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse PTE table full of compound pages in child", + p, 1, true); write_num("khugepaged/max_ptes_shared", - default_settings.khugepaged.max_ptes_shared); + current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); @@ -914,7 +928,7 @@ static void collapse_fork_compound(void) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -922,29 +936,20 @@ static void collapse_fork_compound(void) munmap(p, hpage_pmd_size); } -static void collapse_max_ptes_shared() +static void collapse_max_ptes_shared(struct collapse_context *c) { int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; - p = alloc_mapping(); - - printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p)) - success("OK"); - else - fail("Fail"); - + p = alloc_hpage(); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -952,33 +957,27 @@ static void collapse_max_ptes_shared() printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (!check_huge(p)) + if (check_huge(p, 0)) success("OK"); else fail("Fail"); - if (wait_for_scan("Do not collapse with max_ptes_shared exceeded", p)) - fail("Timeout"); - else if (!check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Maybe collapse with max_ptes_shared exceeded", p, + 1, !c->enforce_pte_scan_limits); - printf("Trigger CoW on page %d of %d...", - hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); - if (!check_huge(p)) - success("OK"); - else - fail("Fail"); + if (c->enforce_pte_scan_limits) { + printf("Trigger CoW on page %d of %d...", + hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); + fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * + page_size); + if (check_huge(p, 0)) + success("OK"); + else + fail("Fail"); - - if (wait_for_scan("Collapse with max_ptes_shared PTEs shared", p)) - fail("Timeout"); - else if (check_huge(p)) - success("OK"); - else - fail("Fail"); + c->collapse("Collapse with max_ptes_shared PTEs shared", + p, 1, true); + } validate_memory(p, 0, hpage_pmd_size); munmap(p, hpage_pmd_size); @@ -989,7 +988,7 @@ static void collapse_max_ptes_shared() exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p)) + if (check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -997,8 +996,52 @@ static void collapse_max_ptes_shared() munmap(p, hpage_pmd_size); } -int main(void) +static void madvise_collapse_existing_thps(void) { + void *p; + int err; + + p = alloc_mapping(1); + fill_memory(p, 0, hpage_pmd_size); + + printf("Collapse fully populated PTE table..."); + /* + * Note that we don't set MADV_HUGEPAGE here, which + * also tests that VM_HUGEPAGE isn't required for + * MADV_COLLAPSE in "madvise" mode. + */ + err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); + if (err == 0 && check_huge(p, 1)) { + success("OK"); + printf("Re-collapse PMD-mapped hugepage"); + err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); + if (err == 0 && check_huge(p, 1)) + success("OK"); + else + fail("Fail"); + } else { + fail("Fail"); + } + validate_memory(p, 0, hpage_pmd_size); + munmap(p, hpage_pmd_size); +} + +int main(int argc, const char **argv) +{ + struct collapse_context c; + struct settings default_settings = { + .thp_enabled = THP_MADVISE, + .thp_defrag = THP_DEFRAG_ALWAYS, + .shmem_enabled = SHMEM_NEVER, + .use_zero_page = 0, + .khugepaged = { + .defrag = 1, + .alloc_sleep_millisecs = 10, + .scan_sleep_millisecs = 10, + }, + }; + const char *tests = argc == 1 ? "all" : argv[1]; + setbuf(stdout, NULL); page_size = getpagesize(); @@ -1011,21 +1054,47 @@ int main(void) default_settings.khugepaged.pages_to_scan = hpage_pmd_nr * 8; save_settings(); - adjust_settings(); + push_settings(&default_settings); alloc_at_fault(); - collapse_full(); - collapse_empty(); - collapse_single_pte_entry(); - collapse_max_ptes_none(); - collapse_swapin_single_pte(); - collapse_max_ptes_swap(); - collapse_single_pte_entry_compound(); - collapse_full_of_compound(); - collapse_compound_extreme(); - collapse_fork(); - collapse_fork_compound(); - collapse_max_ptes_shared(); + + if (!strcmp(tests, "khugepaged") || !strcmp(tests, "all")) { + printf("\n*** Testing context: khugepaged ***\n"); + c.collapse = &khugepaged_collapse; + c.enforce_pte_scan_limits = true; + + collapse_full(&c); + collapse_empty(&c); + collapse_single_pte_entry(&c); + collapse_max_ptes_none(&c); + collapse_swapin_single_pte(&c); + collapse_max_ptes_swap(&c); + collapse_single_pte_entry_compound(&c); + collapse_full_of_compound(&c); + collapse_compound_extreme(&c); + collapse_fork(&c); + collapse_fork_compound(&c); + collapse_max_ptes_shared(&c); + } + if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { + printf("\n*** Testing context: madvise ***\n"); + c.collapse = &madvise_collapse; + c.enforce_pte_scan_limits = false; + + collapse_full(&c); + collapse_empty(&c); + collapse_single_pte_entry(&c); + collapse_max_ptes_none(&c); + collapse_swapin_single_pte(&c); + collapse_max_ptes_swap(&c); + collapse_single_pte_entry_compound(&c); + collapse_full_of_compound(&c); + collapse_compound_extreme(&c); + collapse_fork(&c); + collapse_fork_compound(&c); + collapse_max_ptes_shared(&c); + madvise_collapse_existing_thps(); + } restore_settings(0); } diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index de86983b8a0f..e780e76c26b8 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -120,11 +120,16 @@ run_test ./gup_test -a # Dump pages 0, 19, and 4096, using pin_user_pages: run_test ./gup_test -ct -F 0x1 0 19 0x1000 -run_test ./userfaultfd anon 20 16 -# Test requires source and destination huge pages. Size of source -# (half_ufd_size_MB) is passed as argument to test. -run_test ./userfaultfd hugetlb "$half_ufd_size_MB" 32 -run_test ./userfaultfd shmem 20 16 +uffd_mods=("" ":dev") +for mod in "${uffd_mods[@]}"; do + run_test ./userfaultfd anon${mod} 20 16 + # Hugetlb tests require source and destination huge pages. Pass in half + # the size ($half_ufd_size_MB), which is used for *each*. + run_test ./userfaultfd hugetlb${mod} "$half_ufd_size_MB" 32 + run_test ./userfaultfd hugetlb_shared${mod} "$half_ufd_size_MB" 32 "$mnt"/uffd-test + rm -f "$mnt"/uffd-test + run_test ./userfaultfd shmem${mod} 20 16 +done #cleanup umount "$mnt" diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7c3f1b0ab468..7be709d9eed0 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -77,6 +77,11 @@ static int bounces; #define TEST_SHMEM 3 static int test_type; +#define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) + +/* test using /dev/userfaultfd, instead of userfaultfd(2) */ +static bool test_dev_userfaultfd; + /* exercise the test_uffdio_*_eexist every ALARM_INTERVAL_SECS */ #define ALARM_INTERVAL_SECS 10 static volatile bool test_uffdio_copy_eexist = true; @@ -125,6 +130,8 @@ struct uffd_stats { const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" + "# Run the same anonymous memory test, but using /dev/userfaultfd:\n" + "./userfaultfd anon:dev 100 99999\n\n" "# Run share memory test on 1GiB region with 99 bounces:\n" "./userfaultfd shmem 1000 99\n\n" "# Run hugetlb memory test on 256MiB region with 50 bounces:\n" @@ -141,6 +148,14 @@ static void usage(void) "[hugetlbfs_file]\n\n"); fprintf(stderr, "Supported : anon, hugetlb, " "hugetlb_shared, shmem\n\n"); + fprintf(stderr, "'Test mods' can be joined to the test type string with a ':'. " + "Supported mods:\n"); + fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); + fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\nExample test mod usage:\n"); + fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); + fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); + fprintf(stderr, "Examples:\n\n"); fprintf(stderr, "%s", examples); exit(1); @@ -154,12 +169,14 @@ static void usage(void) ret, __LINE__); \ } while (0) -#define err(fmt, ...) \ +#define errexit(exitcode, fmt, ...) \ do { \ _err(fmt, ##__VA_ARGS__); \ - exit(1); \ + exit(exitcode); \ } while (0) +#define err(fmt, ...) errexit(1, fmt, ##__VA_ARGS__) + static void uffd_stats_reset(struct uffd_stats *uffd_stats, unsigned long n_cpus) { @@ -383,13 +400,34 @@ static void assert_expected_ioctls_present(uint64_t mode, uint64_t ioctls) } } +static int __userfaultfd_open_dev(void) +{ + int fd, _uffd; + + fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC); + if (fd < 0) + errexit(KSFT_SKIP, "opening /dev/userfaultfd failed"); + + _uffd = ioctl(fd, USERFAULTFD_IOC_NEW, UFFD_FLAGS); + if (_uffd < 0) + errexit(errno == ENOTTY ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + close(fd); + return _uffd; +} + static void userfaultfd_open(uint64_t *features) { struct uffdio_api uffdio_api; - uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); - if (uffd < 0) - err("userfaultfd syscall not available in this kernel"); + if (test_dev_userfaultfd) + uffd = __userfaultfd_open_dev(); + else { + uffd = syscall(__NR_userfaultfd, UFFD_FLAGS); + if (uffd < 0) + errexit(errno == ENOSYS ? KSFT_SKIP : 1, + "creating userfaultfd failed"); + } uffd_flags = fcntl(uffd, F_GETFD, NULL); uffdio_api.api = UFFD_API; @@ -1584,8 +1622,6 @@ unsigned long default_huge_page_size(void) static void set_test_type(const char *type) { - uint64_t features = UFFD_API_FEATURES; - if (!strcmp(type, "anon")) { test_type = TEST_ANON; uffd_test_ops = &anon_uffd_test_ops; @@ -1603,9 +1639,29 @@ static void set_test_type(const char *type) test_type = TEST_SHMEM; uffd_test_ops = &shmem_uffd_test_ops; test_uffdio_minor = true; - } else { - err("Unknown test type: %s", type); } +} + +static void parse_test_type_arg(const char *raw_type) +{ + char *buf = strdup(raw_type); + uint64_t features = UFFD_API_FEATURES; + + while (buf) { + const char *token = strsep(&buf, ":"); + + if (!test_type) + set_test_type(token); + else if (!strcmp(token, "dev")) + test_dev_userfaultfd = true; + else if (!strcmp(token, "syscall")) + test_dev_userfaultfd = false; + else + err("unrecognized test mod '%s'", token); + } + + if (!test_type) + err("failed to parse test type argument: '%s'", raw_type); if (test_type == TEST_HUGETLB) page_size = default_huge_page_size(); @@ -1653,7 +1709,7 @@ int main(int argc, char **argv) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); - set_test_type(argv[1]); + parse_test_type_arg(argv[1]); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / diff --git a/tools/vm/page_owner_sort.c b/tools/vm/page_owner_sort.c index ec2e67c85b84..ce860ab94162 100644 --- a/tools/vm/page_owner_sort.c +++ b/tools/vm/page_owner_sort.c @@ -470,7 +470,12 @@ static bool match_str_list(const char *str, char **list, int list_size) static bool is_need(char *buf) { - if ((filter & FILTER_UNRELEASE) && get_free_ts_nsec(buf) != 0) + __u64 ts_nsec, free_ts_nsec; + + ts_nsec = get_ts_nsec(buf); + free_ts_nsec = get_free_ts_nsec(buf); + + if ((filter & FILTER_UNRELEASE) && free_ts_nsec != 0 && ts_nsec < free_ts_nsec) return false; if ((filter & FILTER_PID) && !match_num_list(get_pid(buf), fc.pids, fc.pids_size)) return false;