From 524e00b36e8c547f5582eef3fb645a8d9fc5e3df Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:48 +0000 Subject: [PATCH 001/350] mm: remove rb tree. Remove the RB tree and start using the maple tree for vm_area_struct tracking. Drop validate_mm() calls in expand_upwards() and expand_downwards() as the lock is not held. Link: https://lkml.kernel.org/r/20220906194824.2110408-18-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/kernel/tboot.c | 1 - drivers/firmware/efi/efi.c | 1 - include/linux/mm.h | 2 - include/linux/mm_types.h | 14 - kernel/fork.c | 8 - mm/init-mm.c | 2 - mm/mmap.c | 506 ++++++++----------------------------- mm/nommu.c | 87 ++----- mm/util.c | 10 +- 9 files changed, 144 insertions(+), 487 deletions(-) diff --git a/arch/x86/kernel/tboot.c b/arch/x86/kernel/tboot.c index e01544202651..4c1bcb6053fc 100644 --- a/arch/x86/kernel/tboot.c +++ b/arch/x86/kernel/tboot.c @@ -95,7 +95,6 @@ void __init tboot_probe(void) static pgd_t *tboot_pg_dir; static struct mm_struct tboot_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, tboot_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 7b6a815b79d3..042a3ef4db1c 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -57,7 +57,6 @@ static unsigned long __initdata mem_reserve = EFI_INVALID_TABLE_ADDR; static unsigned long __initdata rt_prop = EFI_INVALID_TABLE_ADDR; struct mm_struct efi_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, efi_mm.mmap_lock), .mm_users = ATOMIC_INIT(2), .mm_count = ATOMIC_INIT(1), diff --git a/include/linux/mm.h b/include/linux/mm.h index 646ea4d3bd74..dfce1aaa7a64 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2654,8 +2654,6 @@ extern int __split_vma(struct mm_struct *, struct vm_area_struct *, extern int split_vma(struct mm_struct *, struct vm_area_struct *, unsigned long addr, int new_below); extern int insert_vm_struct(struct mm_struct *, struct vm_area_struct *); -extern void __vma_link_rb(struct mm_struct *, struct vm_area_struct *, - struct rb_node **, struct rb_node *); extern void unlink_file_vma(struct vm_area_struct *); extern struct vm_area_struct *copy_vma(struct vm_area_struct **, unsigned long addr, unsigned long len, pgoff_t pgoff, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index d0b51fbdf5d4..ac747273c4d6 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -410,19 +410,6 @@ struct vm_area_struct { /* linked list of VM areas per task, sorted by address */ struct vm_area_struct *vm_next, *vm_prev; - - struct rb_node vm_rb; - - /* - * Largest free memory gap in bytes to the left of this VMA. - * Either between this VMA and vma->vm_prev, or between one of the - * VMAs below us in the VMA rbtree and its ->vm_prev. This helps - * get_unmapped_area find a free area of the right size. - */ - unsigned long rb_subtree_gap; - - /* Second cache line starts here. */ - struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -488,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - struct rb_root mm_rb; u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, diff --git a/kernel/fork.c b/kernel/fork.c index 16970c346b5b..5f81c009bb20 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -581,7 +581,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; int retval; unsigned long charge = 0; LIST_HEAD(uf); @@ -608,8 +607,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) @@ -701,10 +698,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, tmp->vm_prev = prev; prev = tmp; - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - /* Link the vma into the MT */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; @@ -1133,7 +1126,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { mm->mmap = NULL; - mm->mm_rb = RB_ROOT; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); mm->vmacache_seqnum = 0; diff --git a/mm/init-mm.c b/mm/init-mm.c index b912b0f2eced..c9327abb771c 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include @@ -29,7 +28,6 @@ * and size this cpu_bitmask to NR_CPUS. */ struct mm_struct init_mm = { - .mm_rb = RB_ROOT, .mm_mt = MTREE_INIT_EXT(mm_mt, MM_MT_FLAGS, init_mm.mmap_lock), .pgd = swapper_pg_dir, .mm_users = ATOMIC_INIT(2), diff --git a/mm/mmap.c b/mm/mmap.c index 68ee2958c0be..f60d83c7f233 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include @@ -247,93 +246,6 @@ out: return origbrk; } -static inline unsigned long vma_compute_gap(struct vm_area_struct *vma) -{ - unsigned long gap, prev_end; - - /* - * Note: in the rare case of a VM_GROWSDOWN above a VM_GROWSUP, we - * allow two stack_guard_gaps between them here, and when choosing - * an unmapped area; whereas when expanding we only require one. - * That's a little inconsistent, but keeps the code here simpler. - */ - gap = vm_start_gap(vma); - if (vma->vm_prev) { - prev_end = vm_end_gap(vma->vm_prev); - if (gap > prev_end) - gap -= prev_end; - else - gap = 0; - } - return gap; -} - -#ifdef CONFIG_DEBUG_VM_RB -static unsigned long vma_compute_subtree_gap(struct vm_area_struct *vma) -{ - unsigned long max = vma_compute_gap(vma), subtree_gap; - if (vma->vm_rb.rb_left) { - subtree_gap = rb_entry(vma->vm_rb.rb_left, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - if (vma->vm_rb.rb_right) { - subtree_gap = rb_entry(vma->vm_rb.rb_right, - struct vm_area_struct, vm_rb)->rb_subtree_gap; - if (subtree_gap > max) - max = subtree_gap; - } - return max; -} - -static int browse_rb(struct mm_struct *mm) -{ - struct rb_root *root = &mm->mm_rb; - int i = 0, j, bug = 0; - struct rb_node *nd, *pn = NULL; - unsigned long prev = 0, pend = 0; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - if (vma->vm_start < prev) { - pr_emerg("vm_start %lx < prev %lx\n", - vma->vm_start, prev); - bug = 1; - } - if (vma->vm_start < pend) { - pr_emerg("vm_start %lx < pend %lx\n", - vma->vm_start, pend); - bug = 1; - } - if (vma->vm_start > vma->vm_end) { - pr_emerg("vm_start %lx > vm_end %lx\n", - vma->vm_start, vma->vm_end); - bug = 1; - } - spin_lock(&mm->page_table_lock); - if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_emerg("free gap %lx, correct %lx\n", - vma->rb_subtree_gap, - vma_compute_subtree_gap(vma)); - bug = 1; - } - spin_unlock(&mm->page_table_lock); - i++; - pn = nd; - prev = vma->vm_start; - pend = vma->vm_end; - } - j = 0; - for (nd = pn; nd; nd = rb_prev(nd)) - j++; - if (i != j) { - pr_emerg("backwards %d, forwards %d\n", j, i); - bug = 1; - } - return bug ? -1 : i; -} #if defined(CONFIG_DEBUG_VM_MAPLE_TREE) extern void mt_validate(struct maple_tree *mt); extern void mt_dump(const struct maple_tree *mt); @@ -361,19 +273,25 @@ static void validate_mm_mt(struct mm_struct *mm) (vma->vm_end - 1 != mas.last)) { pr_emerg("issue in %s\n", current->comm); dump_stack(); -#ifdef CONFIG_DEBUG_VM dump_vma(vma_mt); - pr_emerg("and next in rb\n"); + pr_emerg("and vm_next\n"); dump_vma(vma->vm_next); -#endif pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, mas.index, mas.last); pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, vma_mt->vm_start, vma_mt->vm_end); - pr_emerg("rb vma: %p %lu - %lu\n", vma, + if (vma->vm_prev) { + pr_emerg("ll prev: %p %lu - %lu\n", + vma->vm_prev, vma->vm_prev->vm_start, + vma->vm_prev->vm_end); + } + pr_emerg("ll vma: %p %lu - %lu\n", vma, vma->vm_start, vma->vm_end); - pr_emerg("rb->next = %p %lu - %lu\n", vma->vm_next, - vma->vm_next->vm_start, vma->vm_next->vm_end); + if (vma->vm_next) { + pr_emerg("ll next: %p %lu - %lu\n", + vma->vm_next, vma->vm_next->vm_start, + vma->vm_next->vm_end); + } mt_dump(mas.tree); if (vma_mt->vm_end != mas.last + 1) { @@ -396,21 +314,6 @@ static void validate_mm_mt(struct mm_struct *mm) } VM_BUG_ON(vma); } -#else -#define validate_mm_mt(root) do { } while (0) -#endif -static void validate_mm_rb(struct rb_root *root, struct vm_area_struct *ignore) -{ - struct rb_node *nd; - - for (nd = rb_first(root); nd; nd = rb_next(nd)) { - struct vm_area_struct *vma; - vma = rb_entry(nd, struct vm_area_struct, vm_rb); - VM_BUG_ON_VMA(vma != ignore && - vma->rb_subtree_gap != vma_compute_subtree_gap(vma), - vma); - } -} static void validate_mm(struct mm_struct *mm) { @@ -419,7 +322,10 @@ static void validate_mm(struct mm_struct *mm) unsigned long highest_address = 0; struct vm_area_struct *vma = mm->mmap; + validate_mm_mt(mm); + while (vma) { +#ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; @@ -429,6 +335,7 @@ static void validate_mm(struct mm_struct *mm) anon_vma_interval_tree_verify(avc); anon_vma_unlock_read(anon_vma); } +#endif highest_address = vm_end_gap(vma); vma = vma->vm_next; @@ -443,80 +350,13 @@ static void validate_mm(struct mm_struct *mm) mm->highest_vm_end, highest_address); bug = 1; } - i = browse_rb(mm); - if (i != mm->map_count) { - if (i != -1) - pr_emerg("map_count %d rb %d\n", mm->map_count, i); - bug = 1; - } VM_BUG_ON_MM(bug, mm); } -#else -#define validate_mm_rb(root, ignore) do { } while (0) + +#else /* !CONFIG_DEBUG_VM_MAPLE_TREE */ #define validate_mm_mt(root) do { } while (0) #define validate_mm(mm) do { } while (0) -#endif - -RB_DECLARE_CALLBACKS_MAX(static, vma_gap_callbacks, - struct vm_area_struct, vm_rb, - unsigned long, rb_subtree_gap, vma_compute_gap) - -/* - * Update augmented rbtree rb_subtree_gap values after vma->vm_start or - * vma->vm_prev->vm_end values changed, without modifying the vma's position - * in the rbtree. - */ -static void vma_gap_update(struct vm_area_struct *vma) -{ - /* - * As it turns out, RB_DECLARE_CALLBACKS_MAX() already created - * a callback function that does exactly what we want. - */ - vma_gap_callbacks_propagate(&vma->vm_rb, NULL); -} - -static inline void vma_rb_insert(struct vm_area_struct *vma, - struct rb_root *root) -{ - /* All rb_subtree_gap values must be consistent prior to insertion */ - validate_mm_rb(root, NULL); - - rb_insert_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static void __vma_rb_erase(struct vm_area_struct *vma, struct rb_root *root) -{ - /* - * Note rb_erase_augmented is a fairly large inline function, - * so make sure we instantiate it only once with our desired - * augmented rbtree callbacks. - */ - rb_erase_augmented(&vma->vm_rb, root, &vma_gap_callbacks); -} - -static __always_inline void vma_rb_erase_ignore(struct vm_area_struct *vma, - struct rb_root *root, - struct vm_area_struct *ignore) -{ - /* - * All rb_subtree_gap values must be consistent prior to erase, - * with the possible exception of - * - * a. the "next" vma being erased if next->vm_start was reduced in - * __vma_adjust() -> __vma_unlink() - * b. the vma being erased in detach_vmas_to_be_unmapped() -> - * vma_rb_erase() - */ - validate_mm_rb(root, ignore); - - __vma_rb_erase(vma, root); -} - -static __always_inline void vma_rb_erase(struct vm_area_struct *vma, - struct rb_root *root) -{ - vma_rb_erase_ignore(vma, root, vma); -} +#endif /* CONFIG_DEBUG_VM_MAPLE_TREE */ /* * vma has some anon_vma assigned, and is already inserted on that @@ -550,39 +390,26 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -static int find_vma_links(struct mm_struct *mm, unsigned long addr, - unsigned long end, struct vm_area_struct **pprev, - struct rb_node ***rb_link, struct rb_node **rb_parent) +/* + * range_has_overlap() - Check the @start - @end range for overlapping VMAs and + * sets up a pointer to the previous VMA + * @mm: the mm struct + * @start: the start address of the range + * @end: the end address of the range + * @pprev: the pointer to the pointer of the previous VMA + * + * Returns: True if there is an overlapping VMA, false otherwise + */ +static inline +bool range_has_overlap(struct mm_struct *mm, unsigned long start, + unsigned long end, struct vm_area_struct **pprev) { - struct rb_node **__rb_link, *__rb_parent, *rb_prev; + struct vm_area_struct *existing; - mmap_assert_locked(mm); - __rb_link = &mm->mm_rb.rb_node; - rb_prev = __rb_parent = NULL; - - while (*__rb_link) { - struct vm_area_struct *vma_tmp; - - __rb_parent = *__rb_link; - vma_tmp = rb_entry(__rb_parent, struct vm_area_struct, vm_rb); - - if (vma_tmp->vm_end > addr) { - /* Fail if an existing vma overlaps the area */ - if (vma_tmp->vm_start < end) - return -ENOMEM; - __rb_link = &__rb_parent->rb_left; - } else { - rb_prev = __rb_parent; - __rb_link = &__rb_parent->rb_right; - } - } - - *pprev = NULL; - if (rb_prev) - *pprev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - *rb_link = __rb_link; - *rb_parent = __rb_parent; - return 0; + MA_STATE(mas, &mm->mm_mt, start, start); + existing = mas_find(&mas, end - 1); + *pprev = mas_prev(&mas, 0); + return existing ? true : false; } /* @@ -609,8 +436,6 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, * @start: The start of the range. * @len: The length of the range. * @pprev: pointer to the pointer that will be set to previous vm_area_struct - * @rb_link: the rb_node - * @rb_parent: the parent rb_node * * Find all the vm_area_struct that overlap from @start to * @end and munmap them. Set @pprev to the previous vm_area_struct. @@ -619,14 +444,11 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, */ static inline int munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, - struct vm_area_struct **pprev, struct rb_node ***link, - struct rb_node **parent, struct list_head *uf) + struct vm_area_struct **pprev, struct list_head *uf) { - - while (find_vma_links(mm, start, start + len, pprev, link, parent)) + while (range_has_overlap(mm, start, start + len, pprev)) if (do_munmap(mm, start, len, uf)) return -ENOMEM; - return 0; } @@ -647,30 +469,6 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, return nr_pages; } -void __vma_link_rb(struct mm_struct *mm, struct vm_area_struct *vma, - struct rb_node **rb_link, struct rb_node *rb_parent) -{ - /* Update tracking information for the gap following the new vma. */ - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else - mm->highest_vm_end = vm_end_gap(vma); - - /* - * vma->vm_prev wasn't known when we followed the rbtree to find the - * correct insertion point for that vma. As a result, we could not - * update the vma vm_rb parents rb_subtree_gap values on the way down. - * So, we first insert the vma with a zero rb_subtree_gap value - * (to be consistent with what we did on the way down), and then - * immediately update the gap to the correct value. Finally we - * rebalance the rbtree after all augmented values have been set. - */ - rb_link_node(&vma->vm_rb, rb_parent, rb_link); - vma->rb_subtree_gap = 0; - vma_gap_update(vma); - vma_rb_insert(vma, &mm->mm_rb); -} - static void __vma_link_file(struct vm_area_struct *vma) { struct file *file; @@ -738,18 +536,8 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, mas_store_prealloc(mas, NULL); } -static void -__vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) -{ - __vma_link_list(mm, vma, prev); - __vma_link_rb(mm, vma, rb_link, rb_parent); -} - static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev, struct rb_node **rb_link, - struct rb_node *rb_parent) + struct vm_area_struct *prev) { MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; @@ -763,7 +551,7 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, } vma_mas_store(vma, &mas); - __vma_link(mm, vma, prev, rb_link, rb_parent); + __vma_link_list(mm, vma, prev); __vma_link_file(vma); if (mapping) @@ -776,34 +564,20 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, /* * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and rbtree. It has already been inserted into the interval tree. + * mm's list and the mm tree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, struct vm_area_struct *vma) { struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; - - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - BUG(); + mas_set(mas, vma->vm_start); + prev = mas_prev(mas, 0); vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); - __vma_link_rb(mm, vma, rb_link, rb_parent); mm->map_count++; } -static __always_inline void __vma_unlink(struct mm_struct *mm, - struct vm_area_struct *vma, - struct vm_area_struct *ignore) -{ - vma_rb_erase_ignore(vma, &mm->mm_rb, ignore); - __vma_unlink_list(mm, vma); - /* Kill the cache */ - vmacache_invalidate(mm); -} - /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -816,21 +590,18 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, struct vm_area_struct *expand) { struct mm_struct *mm = vma->vm_mm; - struct vm_area_struct *next = vma->vm_next, *orig_vma = vma; - struct vm_area_struct *next_next; + struct vm_area_struct *next_next, *next = find_vma(mm, vma->vm_end); + struct vm_area_struct *orig_vma = vma; struct address_space *mapping = NULL; struct rb_root_cached *root = NULL; struct anon_vma *anon_vma = NULL; struct file *file = vma->vm_file; - bool start_changed = false, end_changed = false; + bool vma_changed = false; long adjust_next = 0; int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; - validate_mm(mm); - validate_mm_mt(mm); - if (next && !insert) { if (end >= next->vm_end) { /* @@ -957,21 +728,21 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { - unsigned long old_start = vma->vm_start; + if (vma->vm_start < start) + vma_mas_szero(&mas, vma->vm_start, start); + vma_changed = true; vma->vm_start = start; - if (old_start < start) - vma_mas_szero(&mas, old_start, start); - start_changed = true; } if (end != vma->vm_end) { - unsigned long old_end = vma->vm_end; + if (vma->vm_end > end) + vma_mas_szero(&mas, end, vma->vm_end); + vma_changed = true; vma->vm_end = end; - if (old_end > end) - vma_mas_szero(&mas, end, old_end); - end_changed = true; + if (!next) + mm->highest_vm_end = vm_end_gap(vma); } - if (end_changed || start_changed) + if (vma_changed) vma_mas_store(vma, &mas); vma->vm_pgoff = pgoff; @@ -995,22 +766,12 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * Since we have expanded over this vma, the maple tree will * have overwritten by storing the value */ - if (remove_next != 3) { - __vma_unlink(mm, next, next); - if (remove_next == 2) - __vma_unlink(mm, next_next, next_next); - } else { - /* - * vma is not before next if they've been - * swapped. - * - * pre-swap() next->vm_start was reduced so - * tell validate_mm_rb to ignore pre-swap() - * "next" (which is stored in post-swap() - * "vma"). - */ - __vma_unlink(mm, next, vma); - } + __vma_unlink_list(mm, next); + if (remove_next == 2) + __vma_unlink_list(mm, next_next); + /* Kill the cache */ + vmacache_invalidate(mm); + if (file) { __remove_shared_vm_struct(next, file, mapping); if (remove_next == 2) @@ -1023,15 +784,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * (it may either follow vma or precede it). */ __insert_vm_struct(mm, &mas, insert); - } else { - if (start_changed) - vma_gap_update(vma); - if (end_changed) { - if (!next) - mm->highest_vm_end = vm_end_gap(vma); - else if (!adjust_next) - vma_gap_update(next); - } } if (anon_vma) { @@ -1059,7 +811,10 @@ again: anon_vma_merge(vma, next); mm->map_count--; mpol_put(vma_policy(next)); + if (remove_next != 2) + BUG_ON(vma->vm_end < next->vm_end); vm_area_free(next); + /* * In mprotect's case 6 (see comments on vma_merge), * we must remove another next too. It would clutter @@ -1089,10 +844,7 @@ again: if (remove_next == 2) { remove_next = 1; goto again; - } - else if (next) - vma_gap_update(next); - else { + } else if (!next) { /* * If remove_next == 2 we obviously can't * reach this path. @@ -1119,8 +871,6 @@ again: uprobe_mmap(insert); validate_mm(mm); - validate_mm_mt(mm); - return 0; } @@ -1273,7 +1023,6 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct vm_area_struct *area, *next; int err; - validate_mm_mt(mm); /* * We later require that vma->vm_flags == vm_flags, * so this tests vma->vm_flags & VM_SPECIAL, too. @@ -1349,7 +1098,6 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, khugepaged_enter_vma(area, vm_flags); return area; } - validate_mm_mt(mm); return NULL; } @@ -1519,6 +1267,7 @@ unsigned long do_mmap(struct file *file, unsigned long addr, vm_flags_t vm_flags; int pkey = 0; + validate_mm(mm); *populate = 0; if (!len) @@ -1829,10 +1578,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev, *merge; int error; - struct rb_node **rb_link, *rb_parent; unsigned long charged = 0; - validate_mm_mt(mm); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { unsigned long nr_pages; @@ -1848,8 +1595,8 @@ unsigned long mmap_region(struct file *file, unsigned long addr, return -ENOMEM; } - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + /* Clear old maps, set up prev and uf */ + if (munmap_vma_range(mm, addr, len, &prev, uf)) return -ENOMEM; /* * Private writable mapping: check memory availability @@ -1947,7 +1694,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + if (vma_link(mm, vma, prev)) { error = -ENOMEM; if (file) goto unmap_and_free_vma; @@ -1993,7 +1740,6 @@ out: vma_set_page_prot(vma); - validate_mm_mt(mm); return addr; unmap_and_free_vma: @@ -2009,7 +1755,6 @@ free_vma: unacct_error: if (charged) vm_unacct_memory(charged); - validate_mm_mt(mm); return error; } @@ -2367,7 +2112,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int error = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); - validate_mm_mt(mm); if (!(vma->vm_flags & VM_GROWSUP)) return -EFAULT; @@ -2419,15 +2163,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2438,9 +2180,7 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Overwrite old entry in mtree. */ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - if (vma->vm_next) - vma_gap_update(vma->vm_next); - else + if (!vma->vm_next) mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); @@ -2450,8 +2190,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); - validate_mm_mt(mm); mas_destroy(&mas); return error; } @@ -2460,15 +2198,13 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* * vma is the first one with address < vma->vm_start. Have to extend vma. */ -int expand_downwards(struct vm_area_struct *vma, - unsigned long address) +int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *prev; int error = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); - validate_mm(mm); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM; @@ -2510,15 +2246,13 @@ int expand_downwards(struct vm_area_struct *vma, error = acct_stack_growth(vma, size, grow); if (!error) { /* - * vma_gap_update() doesn't support concurrent - * updates, but we only hold a shared mmap_lock - * lock here, so we need to protect against - * concurrent vma expansions. - * anon_vma_lock_write() doesn't help here, as - * we don't guarantee that all growable vmas - * in a mm share the same root anon vma. - * So, we reuse mm->page_table_lock to guard - * against concurrent vma expansions. + * We only hold a shared mmap_lock lock here, so + * we need to protect against concurrent vma + * expansions. anon_vma_lock_write() doesn't + * help here, as we don't guarantee that all + * growable vmas in a mm share the same root + * anon vma. So, we reuse mm->page_table_lock + * to guard against concurrent vma expansions. */ spin_lock(&mm->page_table_lock); if (vma->vm_flags & VM_LOCKED) @@ -2530,7 +2264,6 @@ int expand_downwards(struct vm_area_struct *vma, /* Overwrite old entry in mtree. */ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - vma_gap_update(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2539,7 +2272,6 @@ int expand_downwards(struct vm_area_struct *vma, } anon_vma_unlock_write(vma->anon_vma); khugepaged_enter_vma(vma, vma->vm_flags); - validate_mm(mm); mas_destroy(&mas); return error; } @@ -2671,10 +2403,8 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, insertion_point = (prev ? &prev->vm_next : &mm->mmap); vma->vm_prev = NULL; - mas_set_range(mas, vma->vm_start, end - 1); - mas_store_prealloc(mas, NULL); + vma_mas_szero(mas, vma->vm_start, end); do { - vma_rb_erase(vma, &mm->mm_rb); if (vma->vm_flags & VM_LOCKED) mm->locked_vm -= vma_pages(vma); mm->map_count--; @@ -2682,10 +2412,9 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, vma = vma->vm_next; } while (vma && vma->vm_start < end); *insertion_point = vma; - if (vma) { + if (vma) vma->vm_prev = prev; - vma_gap_update(vma); - } else + else mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; @@ -2807,11 +2536,7 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (len == 0) return -EINVAL; - /* - * arch_unmap() might do unmaps itself. It must be called - * and finish any rbtree manipulation before this code - * runs and also starts to manipulate the rbtree. - */ + /* arch_unmap() might do unmaps itself. */ arch_unmap(mm, start, end); /* Find the first overlapping VMA where start < vma->vm_end */ @@ -2822,6 +2547,11 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, if (mas_preallocate(&mas, vma, GFP_KERNEL)) return -ENOMEM; prev = vma->vm_prev; + /* we have start < vma->vm_end */ + + /* if it doesn't overlap, we have nothing.. */ + if (vma->vm_start >= end) + return 0; /* * If we need to split any vma, do it now to save pain later. @@ -2882,6 +2612,8 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, /* Fix up all other VM information */ remove_vma_list(mm, vma); + + validate_mm(mm); return downgrade ? 1 : 0; map_count_exceeded: @@ -3020,11 +2752,11 @@ out: * anonymous maps. eventually we may be able to do some * brk-specific accounting here. */ -static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long flags, struct list_head *uf) +static int do_brk_flags(unsigned long addr, unsigned long len, + unsigned long flags, struct list_head *uf) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *prev; - struct rb_node **rb_link, *rb_parent; pgoff_t pgoff = addr >> PAGE_SHIFT; int error; unsigned long mapped_addr; @@ -3043,8 +2775,8 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla if (error) return error; - /* Clear old maps, set up prev, rb_link, rb_parent, and uf */ - if (munmap_vma_range(mm, addr, len, &prev, &rb_link, &rb_parent, uf)) + /* Clear old maps, set up prev and uf */ + if (munmap_vma_range(mm, addr, len, &prev, uf)) return -ENOMEM; /* Check against address space limits *after* clearing old maps... */ @@ -3078,7 +2810,7 @@ static int do_brk_flags(unsigned long addr, unsigned long len, unsigned long fla vma->vm_pgoff = pgoff; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - if (vma_link(mm, vma, prev, rb_link, rb_parent)) + if (vma_link(mm, vma, prev)) goto no_vma_link; out: @@ -3197,29 +2929,12 @@ void exit_mmap(struct mm_struct *mm) int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { struct vm_area_struct *prev; - struct rb_node **rb_link, *rb_parent; - unsigned long start = vma->vm_start; - struct vm_area_struct *overlap = NULL; unsigned long charged = vma_pages(vma); - if (find_vma_links(mm, vma->vm_start, vma->vm_end, - &prev, &rb_link, &rb_parent)) - if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) + if (range_has_overlap(mm, vma->vm_start, vma->vm_end, &prev)) return -ENOMEM; - overlap = mt_find(&mm->mm_mt, &start, vma->vm_end - 1); - if (overlap) { - - pr_err("Found vma ending at %lu\n", start - 1); - pr_err("vma : %lu => %lu-%lu\n", (unsigned long)overlap, - overlap->vm_start, overlap->vm_end - 1); -#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) - mt_dump(&mm->mm_mt); -#endif - BUG(); - } - if ((vma->vm_flags & VM_ACCOUNT) && security_vm_enough_memory_mm(mm, charged)) return -ENOMEM; @@ -3241,7 +2956,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - if (vma_link(mm, vma, prev, rb_link, rb_parent)) { + if (vma_link(mm, vma, prev)) { vm_unacct_memory(charged); return -ENOMEM; } @@ -3261,9 +2976,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, unsigned long vma_start = vma->vm_start; struct mm_struct *mm = vma->vm_mm; struct vm_area_struct *new_vma, *prev; - struct rb_node **rb_link, *rb_parent; bool faulted_in_anon_vma = true; - unsigned long index = addr; validate_mm_mt(mm); /* @@ -3275,10 +2988,9 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - if (find_vma_links(mm, addr, addr + len, &prev, &rb_link, &rb_parent)) + if (range_has_overlap(mm, addr, addr + len, &prev)) return NULL; /* should never get here */ - if (mt_find(&mm->mm_mt, &index, addr+len - 1)) - BUG(); + new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, vma->anon_vma, vma->vm_file, pgoff, vma_policy(vma), vma->vm_userfaultfd_ctx, anon_vma_name(vma)); @@ -3319,12 +3031,16 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - vma_link(mm, new_vma, prev, rb_link, rb_parent); + if (vma_link(mm, new_vma, prev)) + goto out_vma_link; *need_rmap_locks = false; } validate_mm_mt(mm); return new_vma; +out_vma_link: + if (new_vma->vm_ops && new_vma->vm_ops->close) + new_vma->vm_ops->close(new_vma); out_free_mempol: mpol_put(vma_policy(new_vma)); out_free_vma: diff --git a/mm/nommu.c b/mm/nommu.c index c63793c53a82..321c7e6718a8 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -566,9 +566,9 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) */ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *pvma, *prev; struct address_space *mapping; - struct rb_node **p, *parent, *rb_prev; + struct vm_area_struct *prev; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); BUG_ON(!vma->vm_region); @@ -586,42 +586,10 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) i_mmap_unlock_write(mapping); } + prev = mas_prev(&mas, 0); + mas_reset(&mas); /* add the VMA to the tree */ - parent = rb_prev = NULL; - p = &mm->mm_rb.rb_node; - while (*p) { - parent = *p; - pvma = rb_entry(parent, struct vm_area_struct, vm_rb); - - /* sort by: start addr, end addr, VMA struct addr in that order - * (the latter is necessary as we may get identical VMAs) */ - if (vma->vm_start < pvma->vm_start) - p = &(*p)->rb_left; - else if (vma->vm_start > pvma->vm_start) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma->vm_end < pvma->vm_end) - p = &(*p)->rb_left; - else if (vma->vm_end > pvma->vm_end) { - rb_prev = parent; - p = &(*p)->rb_right; - } else if (vma < pvma) - p = &(*p)->rb_left; - else if (vma > pvma) { - rb_prev = parent; - p = &(*p)->rb_right; - } else - BUG(); - } - - rb_link_node(&vma->vm_rb, parent, p); - rb_insert_color(&vma->vm_rb, &mm->mm_rb); - - /* add VMA to the VMA list also */ - prev = NULL; - if (rb_prev) - prev = rb_entry(rb_prev, struct vm_area_struct, vm_rb); - + vma_mas_store(vma, &mas); __vma_link_list(mm, vma, prev); } @@ -634,6 +602,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) struct address_space *mapping; struct mm_struct *mm = vma->vm_mm; struct task_struct *curr = current; + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); mm->map_count--; for (i = 0; i < VMACACHE_SIZE; i++) { @@ -656,8 +625,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) } /* remove from the MM's tree and list */ - rb_erase(&vma->vm_rb, &mm->mm_rb); - + vma_mas_remove(vma, &mas); __vma_unlink_list(mm, vma); } @@ -681,24 +649,19 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, addr, addr); /* check the cache first */ vma = vmacache_find(mm, addr); if (likely(vma)) return vma; - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end > addr) { - vmacache_update(addr, vma); - return vma; - } - } + vma = mas_walk(&mas); - return NULL; + if (vma) + vmacache_update(addr, vma); + + return vma; } EXPORT_SYMBOL(find_vma); @@ -730,26 +693,23 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long end = addr + len; + MA_STATE(mas, &mm->mm_mt, addr, addr); /* check the cache first */ vma = vmacache_find_exact(mm, addr, end); if (vma) return vma; - /* trawl the list (there may be multiple mappings in which addr - * resides) */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { - if (vma->vm_start < addr) - continue; - if (vma->vm_start > addr) - return NULL; - if (vma->vm_end == end) { - vmacache_update(addr, vma); - return vma; - } - } + vma = mas_walk(&mas); + if (!vma) + return NULL; + if (vma->vm_start != addr) + return NULL; + if (vma->vm_end != end) + return NULL; - return NULL; + vmacache_update(addr, vma); + return vma; } /* @@ -1546,6 +1506,7 @@ void exit_mmap(struct mm_struct *mm) delete_vma(mm, vma); cond_resched(); } + __mt_destroy(&mm->mm_mt); } int vm_brk(unsigned long addr, unsigned long len) diff --git a/mm/util.c b/mm/util.c index 8d944ce71e94..10effe256dfa 100644 --- a/mm/util.c +++ b/mm/util.c @@ -288,6 +288,8 @@ void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, vma->vm_next = next; if (next) next->vm_prev = vma; + else + mm->highest_vm_end = vm_end_gap(vma); } void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) @@ -300,8 +302,14 @@ void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) prev->vm_next = next; else mm->mmap = next; - if (next) + if (next) { next->vm_prev = prev; + } else { + if (prev) + mm->highest_vm_end = vm_end_gap(prev); + else + mm->highest_vm_end = 0; + } } /* Check if the vma is being used as a stack by this task */ From 3b0e81a1cdc9afbddb0543d08e38edb4e33c4baf Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: [PATCH 002/350] mmap: change zeroing of maple tree in __vma_adjust() Only write to the maple tree if we are not inserting or the insert isn't going to overwrite the area to clear. This avoids spanning writes and node coealescing when unnecessary. The change requires a custom search for the linked list addition to find the correct VMA for the prev link. Link: https://lkml.kernel.org/r/20220906194824.2110408-19-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 30 ++++++++++++++++++++++-------- 1 file changed, 22 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index f60d83c7f233..52a774e70e5b 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -567,11 +567,11 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, * mm's list and the mm tree. It has already been inserted into the interval tree. */ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma) + struct vm_area_struct *vma, unsigned long location) { struct vm_area_struct *prev; - mas_set(mas, vma->vm_start); + mas_set(mas, location); prev = mas_prev(mas, 0); vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); @@ -601,6 +601,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; + unsigned long ll_prev = vma->vm_start; /* linked list prev. */ if (next && !insert) { if (end >= next->vm_end) { @@ -728,15 +729,27 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, } if (start != vma->vm_start) { - if (vma->vm_start < start) + if ((vma->vm_start < start) && + (!insert || (insert->vm_end != start))) { vma_mas_szero(&mas, vma->vm_start, start); - vma_changed = true; + VM_WARN_ON(insert && insert->vm_start > vma->vm_start); + } else { + vma_changed = true; + } vma->vm_start = start; } if (end != vma->vm_end) { - if (vma->vm_end > end) - vma_mas_szero(&mas, end, vma->vm_end); - vma_changed = true; + if (vma->vm_end > end) { + if (!insert || (insert->vm_start != end)) { + vma_mas_szero(&mas, end, vma->vm_end); + VM_WARN_ON(insert && + insert->vm_end < vma->vm_end); + } else if (insert->vm_start == end) { + ll_prev = vma->vm_end; + } + } else { + vma_changed = true; + } vma->vm_end = end; if (!next) mm->highest_vm_end = vm_end_gap(vma); @@ -783,7 +796,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, &mas, insert); + __insert_vm_struct(mm, &mas, insert, ll_prev); } if (anon_vma) { @@ -870,6 +883,7 @@ again: if (insert && file) uprobe_mmap(insert); + mas_destroy(&mas); validate_mm(mm); return 0; } From 7ccf089b262be2c7f2cdc6c08412a1939a812102 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: [PATCH 003/350] xen: use vma_lookup() in privcmd_ioctl_mmap() vma_lookup() walks the VMA tree for a specific value, find_vma() will search the tree after walking to a specific value. It is more efficient to only walk to the requested value since privcmd_ioctl_mmap() will exit the loop if vm_start != msg->va. Link: https://lkml.kernel.org/r/20220906194824.2110408-20-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/xen/privcmd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/privcmd.c b/drivers/xen/privcmd.c index e88e8f6f0a33..fae50a24630b 100644 --- a/drivers/xen/privcmd.c +++ b/drivers/xen/privcmd.c @@ -282,7 +282,7 @@ static long privcmd_ioctl_mmap(struct file *file, void __user *udata) struct page, lru); struct privcmd_mmap_entry *msg = page_address(page); - vma = find_vma(mm, msg->va); + vma = vma_lookup(mm, msg->va); rc = -EINVAL; if (!vma || (msg->va != vma->vm_start) || vma->vm_private_data) From dc8635b25e87232f62276c02899b9d21dd0793c2 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:49 +0000 Subject: [PATCH 004/350] mm: optimize find_exact_vma() to use vma_lookup() Use vma_lookup() to walk the tree to the start value requested. If the vma at the start does not match, then the answer is NULL and there is no need to look at the next vma the way that find_vma() would. Link: https://lkml.kernel.org/r/20220906194824.2110408-21-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index dfce1aaa7a64..a80083091f53 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2850,7 +2850,7 @@ static inline unsigned long vma_pages(struct vm_area_struct *vma) static inline struct vm_area_struct *find_exact_vma(struct mm_struct *mm, unsigned long vm_start, unsigned long vm_end) { - struct vm_area_struct *vma = find_vma(mm, vm_start); + struct vm_area_struct *vma = vma_lookup(mm, vm_start); if (vma && (vma->vm_start != vm_start || vma->vm_end != vm_end)) vma = NULL; From 94d815b2798bad12d0aec912add7665e69a48400 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:50 +0000 Subject: [PATCH 005/350] mm/khugepaged: optimize collapse_pte_mapped_thp() by using vma_lookup() vma_lookup() will walk the vma tree once and not continue to look for the next vma. Since the exact vma is checked below, this is a more optimal way of searching. Link: https://lkml.kernel.org/r/20220906194824.2110408-22-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index dc09cfe76e1f..9ff3d39b286f 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1389,7 +1389,7 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { unsigned long haddr = addr & HPAGE_PMD_MASK; - struct vm_area_struct *vma = find_vma(mm, haddr); + struct vm_area_struct *vma = vma_lookup(mm, haddr); struct page *hpage; pte_t *start_pte, *pte; pmd_t *pmd; From 2e7ce7d354f2fae4c9becb8af799cbedf4f71665 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:50 +0000 Subject: [PATCH 006/350] mm/mmap: change do_brk_flags() to expand existing VMA and add do_brk_munmap() Avoid allocating a new VMA when it a vma modification can occur. When a brk() can expand or contract a VMA, then the single store operation will only modify one index of the maple tree instead of causing a node to split or coalesce. This avoids unnecessary allocations/frees of maple tree nodes and VMAs. Move some limit & flag verifications out of the do_brk_flags() function to use only relevant checks in the code path of bkr() and vm_brk_flags(). Set the vma to check if it can expand in vm_brk_flags() if extra criteria are met. Drop userfaultfd from do_brk_flags() path and only use it in vm_brk_flags() path since that is the only place a munmap will happen. Add a wraper for munmap for the brk case called do_brk_munmap(). Link: https://lkml.kernel.org/r/20220906194824.2110408-23-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 237 ++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 177 insertions(+), 60 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 52a774e70e5b..0baa2ca5b0bf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -147,17 +147,40 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) return next; } -static int do_brk_flags(unsigned long addr, unsigned long request, unsigned long flags, - struct list_head *uf); +/* + * check_brk_limits() - Use platform specific check of range & verify mlock + * limits. + * @addr: The address to check + * @len: The size of increase. + * + * Return: 0 on success. + */ +static int check_brk_limits(unsigned long addr, unsigned long len) +{ + unsigned long mapped_addr; + + mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (IS_ERR_VALUE(mapped_addr)) + return mapped_addr; + + return mlock_future_check(current->mm, current->mm->def_flags, len); +} +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf); +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, + unsigned long addr, unsigned long request, + unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; struct mm_struct *mm = current->mm; - struct vm_area_struct *next; + struct vm_area_struct *brkvma, *next = NULL; unsigned long min_brk; bool populate; bool downgraded = false; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, 0, 0); if (mmap_write_lock_killable(mm)) return -EINTR; @@ -199,35 +222,52 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) /* * Always allow shrinking brk. - * __do_munmap() may downgrade mmap_lock to read. + * do_brk_munmap() may downgrade mmap_lock to read. */ if (brk <= mm->brk) { int ret; + /* Search one past newbrk */ + mas_set(&mas, newbrk); + brkvma = mas_find(&mas, oldbrk); + BUG_ON(brkvma == NULL); + if (brkvma->vm_start >= oldbrk) + goto out; /* mapping intersects with an existing non-brk vma. */ /* - * mm->brk must to be protected by write mmap_lock so update it - * before downgrading mmap_lock. When __do_munmap() fails, - * mm->brk will be restored from origbrk. + * mm->brk must be protected by write mmap_lock. + * do_brk_munmap() may downgrade the lock, so update it + * before calling do_brk_munmap(). */ mm->brk = brk; - ret = __do_munmap(mm, newbrk, oldbrk-newbrk, &uf, true); - if (ret < 0) { - mm->brk = origbrk; - goto out; - } else if (ret == 1) { + mas.last = oldbrk - 1; + ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); + if (ret == 1) { downgraded = true; - } - goto success; + goto success; + } else if (!ret) + goto success; + + mm->brk = origbrk; + goto out; } - /* Check against existing mmap mappings. */ - next = find_vma(mm, oldbrk); + if (check_brk_limits(oldbrk, newbrk - oldbrk)) + goto out; + + /* + * Only check if the next VMA is within the stack_guard_gap of the + * expansion area + */ + mas_set(&mas, oldbrk); + next = mas_find(&mas, newbrk - 1 + PAGE_SIZE + stack_guard_gap); if (next && newbrk + PAGE_SIZE > vm_start_gap(next)) goto out; + brkvma = mas_prev(&mas, mm->start_brk); /* Ok, looks good - let it rip. */ - if (do_brk_flags(oldbrk, newbrk-oldbrk, 0, &uf) < 0) + if (do_brk_flags(&mas, brkvma, oldbrk, newbrk - oldbrk, 0) < 0) goto out; + mm->brk = brk; success: @@ -2762,38 +2802,55 @@ out: } /* - * this is really a simplified "do_mmap". it only handles - * anonymous maps. eventually we may be able to do some - * brk-specific accounting here. + * brk_munmap() - Unmap a parital vma. + * @mas: The maple tree state. + * @vma: The vma to be modified + * @newbrk: the start of the address to unmap + * @oldbrk: The end of the address to unmap + * @uf: The userfaultfd list_head + * + * Returns: 1 on success. + * unmaps a partial VMA mapping. Does not handle alignment, downgrades lock if + * possible. */ -static int do_brk_flags(unsigned long addr, unsigned long len, - unsigned long flags, struct list_head *uf) +static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long newbrk, unsigned long oldbrk, + struct list_head *uf) +{ + struct mm_struct *mm = vma->vm_mm; + int ret; + + arch_unmap(mm, newbrk, oldbrk); + ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true); + validate_mm_mt(mm); + return ret; +} + +/* + * do_brk_flags() - Increase the brk vma if the flags match. + * @mas: The maple tree state. + * @addr: The start address + * @len: The length of the increase + * @vma: The vma, + * @flags: The VMA Flags + * + * Extend the brk VMA from addr to addr + len. If the VMA is NULL or the flags + * do not match then create a new anonymous VMA. Eventually we may be able to + * do some brk-specific accounting here. + */ +static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long addr, unsigned long len, + unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev; - pgoff_t pgoff = addr >> PAGE_SHIFT; - int error; - unsigned long mapped_addr; + struct vm_area_struct *prev = NULL; + validate_mm_mt(mm); - - /* Until we need other flags, refuse anything except VM_EXEC. */ - if ((flags & (~VM_EXEC)) != 0) - return -EINVAL; + /* + * Check against address space limits by the changed size + * Note: This happens *after* clearing old mappings in some code paths. + */ flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; - - mapped_addr = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); - if (IS_ERR_VALUE(mapped_addr)) - return mapped_addr; - - error = mlock_future_check(mm, mm->def_flags, len); - if (error) - return error; - - /* Clear old maps, set up prev and uf */ - if (munmap_vma_range(mm, addr, len, &prev, uf)) - return -ENOMEM; - - /* Check against address space limits *after* clearing old maps... */ if (!may_expand_vm(mm, flags, len >> PAGE_SHIFT)) return -ENOMEM; @@ -2803,30 +2860,54 @@ static int do_brk_flags(unsigned long addr, unsigned long len, if (security_vm_enough_memory_mm(mm, len >> PAGE_SHIFT)) return -ENOMEM; - /* Can we just expand an old private anonymous mapping? */ - vma = vma_merge(mm, prev, addr, addr + len, flags, - NULL, NULL, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; - /* - * create a vma struct for an anonymous mapping + * Expand the existing vma if possible; Note that singular lists do not + * occur after forking, so the expand will only happen on new VMAs. */ - vma = vm_area_alloc(mm); - if (!vma) { - vm_unacct_memory(len >> PAGE_SHIFT); - return -ENOMEM; + if (vma && + (!vma->anon_vma || list_is_singular(&vma->anon_vma_chain)) && + ((vma->vm_flags & ~VM_SOFTDIRTY) == flags)) { + mas->index = vma->vm_start; + mas->last = addr + len - 1; + vma_adjust_trans_huge(vma, addr, addr + len, 0); + if (vma->anon_vma) { + anon_vma_lock_write(vma->anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + vma->vm_end = addr + len; + vma->vm_flags |= VM_SOFTDIRTY; + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_expand_failed; + + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } + khugepaged_enter_vma(vma, flags); + goto out; } + prev = vma; + + /* create a vma struct for an anonymous mapping */ + vma = vm_area_alloc(mm); + if (!vma) + goto vma_alloc_fail; vma_set_anonymous(vma); vma->vm_start = addr; vma->vm_end = addr + len; - vma->vm_pgoff = pgoff; + vma->vm_pgoff = addr >> PAGE_SHIFT; vma->vm_flags = flags; vma->vm_page_prot = vm_get_page_prot(flags); - if (vma_link(mm, vma, prev)) - goto no_vma_link; + mas_set_range(mas, vma->vm_start, addr + len - 1); + if (mas_store_gfp(mas, vma, GFP_KERNEL)) + goto mas_store_fail; + if (!prev) + prev = mas_prev(mas, 0); + + __vma_link_list(mm, vma, prev); + mm->map_count++; out: perf_event_mmap(vma); mm->total_vm += len >> PAGE_SHIFT; @@ -2837,18 +2918,29 @@ out: validate_mm_mt(mm); return 0; -no_vma_link: +mas_store_fail: vm_area_free(vma); +vma_alloc_fail: + vm_unacct_memory(len >> PAGE_SHIFT); + return -ENOMEM; + +mas_expand_failed: + if (vma->anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(vma->anon_vma); + } return -ENOMEM; } int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) { struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; unsigned long len; int ret; bool populate; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, addr, addr); len = PAGE_ALIGN(request); if (len < request) @@ -2859,13 +2951,38 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (mmap_write_lock_killable(mm)) return -EINTR; - ret = do_brk_flags(addr, len, flags, &uf); + /* Until we need other flags, refuse anything except VM_EXEC. */ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + + ret = check_brk_limits(addr, len); + if (ret) + goto limits_failed; + + if (find_vma_intersection(mm, addr, addr + len)) + ret = do_munmap(mm, addr, len, &uf); + + if (ret) + goto munmap_failed; + + vma = mas_prev(&mas, 0); + if (!vma || vma->vm_end != addr || vma_policy(vma) || + !can_vma_merge_after(vma, flags, NULL, NULL, + addr >> PAGE_SHIFT, NULL_VM_UFFD_CTX, NULL)) + vma = NULL; + + ret = do_brk_flags(&mas, vma, addr, len, flags); populate = ((mm->def_flags & VM_LOCKED) != 0); mmap_write_unlock(mm); userfaultfd_unmap_complete(mm, &uf); if (populate && !ret) mm_populate(addr, len); return ret; + +munmap_failed: +limits_failed: + mmap_write_unlock(mm); + return ret; } EXPORT_SYMBOL(vm_brk_flags); From abdba2dda0c477ca708a939b02f9b2e74666ed2d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:50 +0000 Subject: [PATCH 007/350] mm: use maple tree operations for find_vma_intersection() Move find_vma_intersection() to mmap.c and change implementation to maple tree. When searching for a vma within a range, it is easier to use the maple tree interface. Exported find_vma_intersection() for kvm module. Link: https://lkml.kernel.org/r/20220906194824.2110408-24-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 22 ++++------------------ mm/mmap.c | 29 +++++++++++++++++++++++++++++ mm/nommu.c | 11 +++++++++++ 3 files changed, 44 insertions(+), 18 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a80083091f53..06a6b8db75b7 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2778,26 +2778,12 @@ extern struct vm_area_struct * find_vma(struct mm_struct * mm, unsigned long add extern struct vm_area_struct * find_vma_prev(struct mm_struct * mm, unsigned long addr, struct vm_area_struct **pprev); -/** - * find_vma_intersection() - Look up the first VMA which intersects the interval - * @mm: The process address space. - * @start_addr: The inclusive start user address. - * @end_addr: The exclusive end user address. - * - * Returns: The first VMA within the provided range, %NULL otherwise. Assumes - * start_addr < end_addr. +/* + * Look up the first VMA which intersects the interval [start_addr, end_addr) + * NULL if none. Assume start_addr < end_addr. */ -static inline struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, - unsigned long start_addr, - unsigned long end_addr) -{ - struct vm_area_struct *vma = find_vma(mm, start_addr); - - if (vma && end_addr <= vma->vm_start) - vma = NULL; - return vma; -} + unsigned long start_addr, unsigned long end_addr); /** * vma_lookup() - Find a VMA at a specific address diff --git a/mm/mmap.c b/mm/mmap.c index 0baa2ca5b0bf..699af34c3573 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2061,6 +2061,35 @@ get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, EXPORT_SYMBOL(get_unmapped_area); +/** + * find_vma_intersection() - Look up the first VMA which intersects the interval + * @mm: The process address space. + * @start_addr: The inclusive start user address. + * @end_addr: The exclusive end user address. + * + * Returns: The first VMA within the provided range, %NULL otherwise. Assumes + * start_addr < end_addr. + */ +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + struct vm_area_struct *vma; + unsigned long index = start_addr; + + mmap_assert_locked(mm); + /* Check the cache first. */ + vma = vmacache_find(mm, start_addr); + if (likely(vma)) + return vma; + + vma = mt_find(&mm->mm_mt, &index, end_addr - 1); + if (vma) + vmacache_update(start_addr, vma); + return vma; +} +EXPORT_SYMBOL(find_vma_intersection); + /** * find_vma() - Find the VMA for a given address, or the next VMA. * @mm: The mm_struct to check diff --git a/mm/nommu.c b/mm/nommu.c index 321c7e6718a8..2702790d05d3 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -642,6 +642,17 @@ static void delete_vma(struct mm_struct *mm, struct vm_area_struct *vma) vm_area_free(vma); } +struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, + unsigned long start_addr, + unsigned long end_addr) +{ + unsigned long index = start_addr; + + mmap_assert_locked(mm); + return mt_find(&mm->mm_mt, &index, end_addr - 1); +} +EXPORT_SYMBOL(find_vma_intersection); + /* * look up the first VMA in which addr resides, NULL if none * - should be called with mm->mmap_lock at least held readlocked From 4dd1b84140c1b87a89d69a683bebbbdaeb620e39 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: [PATCH 008/350] mm/mmap: use advanced maple tree API for mmap_region() Changing mmap_region() to use the maple tree state and the advanced maple tree interface allows for a lot less tree walking. This change removes the last caller of munmap_vma_range(), so drop this unused function. Add vma_expand() to expand a VMA if possible by doing the necessary hugepage check, uprobe_munmap of files, dcache flush, modifications then undoing the detaches, etc. Link: https://lkml.kernel.org/r/20220906194824.2110408-25-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 251 +++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 203 insertions(+), 48 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 699af34c3573..7a1adc916957 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -470,28 +470,6 @@ static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, return vma->vm_next; } -/* - * munmap_vma_range() - munmap VMAs that overlap a range. - * @mm: The mm struct - * @start: The start of the range. - * @len: The length of the range. - * @pprev: pointer to the pointer that will be set to previous vm_area_struct - * - * Find all the vm_area_struct that overlap from @start to - * @end and munmap them. Set @pprev to the previous vm_area_struct. - * - * Returns: -ENOMEM on munmap failure or 0 on success. - */ -static inline int -munmap_vma_range(struct mm_struct *mm, unsigned long start, unsigned long len, - struct vm_area_struct **pprev, struct list_head *uf) -{ - while (range_has_overlap(mm, start, start + len, pprev)) - if (do_munmap(mm, start, len, uf)) - return -ENOMEM; - return 0; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -618,6 +596,129 @@ static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, mm->map_count++; } +/* + * vma_expand - Expand an existing VMA + * + * @mas: The maple state + * @vma: The vma to expand + * @start: The start of the vma + * @end: The exclusive end of the vma + * @pgoff: The page offset of vma + * @next: The current of next vma. + * + * Expand @vma to @start and @end. Can expand off the start and end. Will + * expand over @next if it's different from @vma and @end == @next->vm_end. + * Checking if the @vma can expand and merge with @next needs to be handled by + * the caller. + * + * Returns: 0 on success + */ +inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, + unsigned long start, unsigned long end, pgoff_t pgoff, + struct vm_area_struct *next) +{ + struct mm_struct *mm = vma->vm_mm; + struct address_space *mapping = NULL; + struct rb_root_cached *root = NULL; + struct anon_vma *anon_vma = vma->anon_vma; + struct file *file = vma->vm_file; + bool remove_next = false; + + if (next && (vma != next) && (end == next->vm_end)) { + remove_next = true; + if (next->anon_vma && !vma->anon_vma) { + int error; + + anon_vma = next->anon_vma; + vma->anon_vma = anon_vma; + error = anon_vma_clone(vma, next); + if (error) + return error; + } + } + + /* Not merging but overwriting any part of next is not handled. */ + VM_BUG_ON(next && !remove_next && next != vma && end > next->vm_start); + /* Only handles expanding */ + VM_BUG_ON(vma->vm_start < start || vma->vm_end > end); + + if (mas_preallocate(mas, vma, GFP_KERNEL)) + goto nomem; + + vma_adjust_trans_huge(vma, start, end, 0); + + if (file) { + mapping = file->f_mapping; + root = &mapping->i_mmap; + uprobe_munmap(vma, vma->vm_start, vma->vm_end); + i_mmap_lock_write(mapping); + } + + if (anon_vma) { + anon_vma_lock_write(anon_vma); + anon_vma_interval_tree_pre_update_vma(vma); + } + + if (file) { + flush_dcache_mmap_lock(mapping); + vma_interval_tree_remove(vma, root); + } + + vma->vm_start = start; + vma->vm_end = end; + vma->vm_pgoff = pgoff; + /* Note: mas must be pointing to the expanding VMA */ + vma_mas_store(vma, mas); + + if (file) { + vma_interval_tree_insert(vma, root); + flush_dcache_mmap_unlock(mapping); + } + + /* Expanding over the next vma */ + if (remove_next) { + /* Remove from mm linked list - also updates highest_vm_end */ + __vma_unlink_list(mm, next); + + /* Kill the cache */ + vmacache_invalidate(mm); + + if (file) + __remove_shared_vm_struct(next, file, mapping); + + } else if (!next) { + mm->highest_vm_end = vm_end_gap(vma); + } + + if (anon_vma) { + anon_vma_interval_tree_post_update_vma(vma); + anon_vma_unlock_write(anon_vma); + } + + if (file) { + i_mmap_unlock_write(mapping); + uprobe_mmap(vma); + } + + if (remove_next) { + if (file) { + uprobe_munmap(next, next->vm_start, next->vm_end); + fput(file); + } + if (next->anon_vma) + anon_vma_merge(vma, next); + mm->map_count--; + mpol_put(vma_policy(next)); + vm_area_free(next); + } + + validate_mm(mm); + return 0; + +nomem: + return -ENOMEM; +} + /* * We cannot adjust vm_start, vm_end, vm_pgoff fields of a vma that * is already present in an i_mmap tree without adjusting the tree. @@ -1630,9 +1731,15 @@ unsigned long mmap_region(struct file *file, unsigned long addr, struct list_head *uf) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma, *prev, *merge; - int error; + struct vm_area_struct *vma = NULL; + struct vm_area_struct *next, *prev, *merge; + pgoff_t pglen = len >> PAGE_SHIFT; unsigned long charged = 0; + unsigned long end = addr + len; + unsigned long merge_start = addr, merge_end = end; + pgoff_t vm_pgoff; + int error; + MA_STATE(mas, &mm->mm_mt, addr, end - 1); /* Check against address space limit. */ if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { @@ -1642,16 +1749,17 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * MAP_FIXED may remove pages of mappings that intersects with * requested mapping. Account for the pages it would unmap. */ - nr_pages = count_vma_pages_range(mm, addr, addr + len); + nr_pages = count_vma_pages_range(mm, addr, end); if (!may_expand_vm(mm, vm_flags, (len >> PAGE_SHIFT) - nr_pages)) return -ENOMEM; } - /* Clear old maps, set up prev and uf */ - if (munmap_vma_range(mm, addr, len, &prev, uf)) + /* Unmap any existing mapping in the area */ + if (do_munmap(mm, addr, len, uf)) return -ENOMEM; + /* * Private writable mapping: check memory availability */ @@ -1662,14 +1770,43 @@ unsigned long mmap_region(struct file *file, unsigned long addr, vm_flags |= VM_ACCOUNT; } - /* - * Can we just expand an old mapping? - */ - vma = vma_merge(mm, prev, addr, addr + len, vm_flags, - NULL, file, pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (vma) - goto out; + next = mas_next(&mas, ULONG_MAX); + prev = mas_prev(&mas, 0); + if (vm_flags & VM_SPECIAL) + goto cannot_expand; + /* Attempt to expand an old mapping */ + /* Check next */ + if (next && next->vm_start == end && !vma_policy(next) && + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, + NULL_VM_UFFD_CTX, NULL)) { + merge_end = next->vm_end; + vma = next; + vm_pgoff = next->vm_pgoff - pglen; + } + + /* Check prev */ + if (prev && prev->vm_end == addr && !vma_policy(prev) && + (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, + pgoff, vma->vm_userfaultfd_ctx, NULL) : + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, + NULL_VM_UFFD_CTX, NULL))) { + merge_start = prev->vm_start; + vma = prev; + vm_pgoff = prev->vm_pgoff; + } + + + /* Actually expand, if possible */ + if (vma && + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + khugepaged_enter_vma(vma, vm_flags); + goto expanded; + } + + mas.index = addr; + mas.last = end - 1; +cannot_expand: /* * Determine the object being mapped and call the appropriate * specific mapper. the address has already been validated, but @@ -1682,7 +1819,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } vma->vm_start = addr; - vma->vm_end = addr + len; + vma->vm_end = end; vma->vm_flags = vm_flags; vma->vm_page_prot = vm_get_page_prot(vm_flags); vma->vm_pgoff = pgoff; @@ -1703,28 +1840,32 @@ unsigned long mmap_region(struct file *file, unsigned long addr, * * Answer: Yes, several device drivers can do it in their * f_op->mmap method. -DaveM - * Bug: If addr is changed, prev, rb_link, rb_parent should - * be updated for vma_link() */ WARN_ON_ONCE(addr != vma->vm_start); addr = vma->vm_start; + mas_reset(&mas); - /* If vm_flags changed after call_mmap(), we should try merge vma again - * as we may succeed this time. + /* + * If vm_flags changed after call_mmap(), we should try merge + * vma again as we may succeed this time. */ if (unlikely(vm_flags != vma->vm_flags && prev)) { merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); if (merge) { - /* ->mmap() can change vma->vm_file and fput the original file. So - * fput the vma->vm_file here or we would add an extra fput for file - * and cause general protection fault ultimately. + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. */ fput(vma->vm_file); vm_area_free(vma); vma = merge; /* Update vm_flags to pick up the change. */ + addr = vma->vm_start; vm_flags = vma->vm_flags; goto unmap_writable; } @@ -1748,7 +1889,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } - if (vma_link(mm, vma, prev)) { + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { error = -ENOMEM; if (file) goto unmap_and_free_vma; @@ -1756,6 +1897,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, goto free_vma; } + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + + vma_mas_store(vma, &mas); + __vma_link_list(mm, vma, prev); + mm->map_count++; + if (vma->vm_file) { + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + /* * vma_merge() calls khugepaged_enter_vma() either, the below * call covers the non-merge case. @@ -1767,7 +1924,7 @@ unmap_writable: if (file && vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); file = vma->vm_file; -out: +expanded: perf_event_mmap(vma); vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); @@ -1794,6 +1951,7 @@ out: vma_set_page_prot(vma); + validate_mm(mm); return addr; unmap_and_free_vma: @@ -1809,6 +1967,7 @@ free_vma: unacct_error: if (charged) vm_unacct_memory(charged); + validate_mm(mm); return error; } @@ -2632,10 +2791,6 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, prev = vma->vm_prev; /* we have start < vma->vm_end */ - /* if it doesn't overlap, we have nothing.. */ - if (vma->vm_start >= end) - return 0; - /* * If we need to split any vma, do it now to save pain later. * From 7964cf8caa4dfa42c4149f3833d3878713cda3dc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: [PATCH 009/350] mm: remove vmacache By using the maple tree and the maple tree state, the vmacache is no longer beneficial and is complicating the VMA code. Remove the vmacache to reduce the work in keeping it up to date and code complexity. Link: https://lkml.kernel.org/r/20220906194824.2110408-26-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/exec.c | 3 - fs/proc/task_mmu.c | 1 - include/linux/mm_types.h | 1 - include/linux/mm_types_task.h | 12 ---- include/linux/sched.h | 1 - include/linux/vm_event_item.h | 4 -- include/linux/vmacache.h | 28 -------- include/linux/vmstat.h | 6 -- kernel/debug/debug_core.c | 12 ---- kernel/fork.c | 5 -- lib/Kconfig.debug | 8 --- mm/Makefile | 2 +- mm/debug.c | 4 +- mm/mmap.c | 31 +-------- mm/nommu.c | 37 ++--------- mm/vmacache.c | 117 ---------------------------------- mm/vmstat.c | 4 -- 17 files changed, 9 insertions(+), 267 deletions(-) delete mode 100644 include/linux/vmacache.h delete mode 100644 mm/vmacache.c diff --git a/fs/exec.c b/fs/exec.c index 507a317d54db..2b919b30dc97 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include #include @@ -1027,8 +1026,6 @@ static int exec_mmap(struct mm_struct *mm) activate_mm(active_mm, mm); if (IS_ENABLED(CONFIG_ARCH_WANT_IRQS_OFF_ACTIVATE_MM)) local_irq_enable(); - tsk->mm->vmacache_seqnum = 0; - vmacache_flush(tsk); task_unlock(tsk); lru_gen_use_mm(mm); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index db2f3a2946a0..9f70bc1c2766 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1,6 +1,5 @@ // SPDX-License-Identifier: GPL-2.0 #include -#include #include #include #include diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index ac747273c4d6..4541b74b1bdb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -475,7 +475,6 @@ struct mm_struct { struct { struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; - u64 vmacache_seqnum; /* per-thread vmacache */ #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, unsigned long addr, unsigned long len, diff --git a/include/linux/mm_types_task.h b/include/linux/mm_types_task.h index c1bc6731125c..0bb4b6da9993 100644 --- a/include/linux/mm_types_task.h +++ b/include/linux/mm_types_task.h @@ -24,18 +24,6 @@ IS_ENABLED(CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK)) #define ALLOC_SPLIT_PTLOCKS (SPINLOCK_SIZE > BITS_PER_LONG/8) -/* - * The per task VMA cache array: - */ -#define VMACACHE_BITS 2 -#define VMACACHE_SIZE (1U << VMACACHE_BITS) -#define VMACACHE_MASK (VMACACHE_SIZE - 1) - -struct vmacache { - u64 seqnum; - struct vm_area_struct *vmas[VMACACHE_SIZE]; -}; - /* * When updating this, please also update struct resident_page_types[] in * kernel/fork.c diff --git a/include/linux/sched.h b/include/linux/sched.h index a2dcfb91df03..fbac3c19fe35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -861,7 +861,6 @@ struct task_struct { struct mm_struct *active_mm; /* Per-thread vma caching: */ - struct vmacache vmacache; #ifdef SPLIT_RSS_COUNTING struct task_rss_stat rss_stat; diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index f3fc36cd2276..3518dba1e02f 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -129,10 +129,6 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, NR_TLB_LOCAL_FLUSH_ALL, NR_TLB_LOCAL_FLUSH_ONE, #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - VMACACHE_FIND_CALLS, - VMACACHE_FIND_HITS, -#endif #ifdef CONFIG_SWAP SWAP_RA, SWAP_RA_HIT, diff --git a/include/linux/vmacache.h b/include/linux/vmacache.h deleted file mode 100644 index 6fce268a4588..000000000000 --- a/include/linux/vmacache.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __LINUX_VMACACHE_H -#define __LINUX_VMACACHE_H - -#include -#include - -static inline void vmacache_flush(struct task_struct *tsk) -{ - memset(tsk->vmacache.vmas, 0, sizeof(tsk->vmacache.vmas)); -} - -extern void vmacache_update(unsigned long addr, struct vm_area_struct *newvma); -extern struct vm_area_struct *vmacache_find(struct mm_struct *mm, - unsigned long addr); - -#ifndef CONFIG_MMU -extern struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end); -#endif - -static inline void vmacache_invalidate(struct mm_struct *mm) -{ - mm->vmacache_seqnum++; -} - -#endif /* __LINUX_VMACACHE_H */ diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index bfe38869498d..19cf5b6892ce 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -125,12 +125,6 @@ static inline void vm_events_fold_cpu(int cpu) #define count_vm_tlb_events(x, y) do { (void)(y); } while (0) #endif -#ifdef CONFIG_DEBUG_VM_VMACACHE -#define count_vm_vmacache_event(x) count_vm_event(x) -#else -#define count_vm_vmacache_event(x) do {} while (0) -#endif - #define __count_zid_vm_events(item, zid, delta) \ __count_vm_events(item##_NORMAL - ZONE_NORMAL + zid, delta) diff --git a/kernel/debug/debug_core.c b/kernel/debug/debug_core.c index 7beceb447211..d5e9ccde3ab8 100644 --- a/kernel/debug/debug_core.c +++ b/kernel/debug/debug_core.c @@ -50,7 +50,6 @@ #include #include #include -#include #include #include #include @@ -283,17 +282,6 @@ static void kgdb_flush_swbreak_addr(unsigned long addr) if (!CACHE_FLUSH_IS_SAFE) return; - if (current->mm) { - int i; - - for (i = 0; i < VMACACHE_SIZE; i++) { - if (!current->vmacache.vmas[i]) - continue; - flush_cache_range(current->vmacache.vmas[i], - addr, addr + BREAK_INSTR_SIZE); - } - } - /* Force flush instruction cache if it was outside the mm */ flush_icache_range(addr, addr + BREAK_INSTR_SIZE); } diff --git a/kernel/fork.c b/kernel/fork.c index 5f81c009bb20..430f63cd7a37 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -43,7 +43,6 @@ #include #include #include -#include #include #include #include @@ -1128,7 +1127,6 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->mmap = NULL; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); - mm->vmacache_seqnum = 0; atomic_set(&mm->mm_users, 1); atomic_set(&mm->mm_count, 1); seqcount_init(&mm->write_protect_seq); @@ -1585,9 +1583,6 @@ static int copy_mm(unsigned long clone_flags, struct task_struct *tsk) if (!oldmm) return 0; - /* initialize the new vmacache entries */ - vmacache_flush(tsk); - if (clone_flags & CLONE_VM) { mmget(oldmm); mm = oldmm; diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 2becf60995e1..6d1544d9201e 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -812,14 +812,6 @@ config DEBUG_VM If unsure, say N. -config DEBUG_VM_VMACACHE - bool "Debug VMA caching" - depends on DEBUG_VM - help - Enable this to turn on VMA caching debug information. Doing so - can cause significant overhead, so only enable it in non-production - environments. - config DEBUG_VM_MAPLE_TREE bool "Debug VM maple trees" depends on DEBUG_VM diff --git a/mm/Makefile b/mm/Makefile index 488f604e77e0..a731d1decbb1 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -52,7 +52,7 @@ obj-y := filemap.o mempool.o oom_kill.o fadvise.o \ readahead.o swap.o truncate.o vmscan.o shmem.o \ util.o mmzone.o vmstat.o backing-dev.o \ mm_init.o percpu.o slab_common.o \ - compaction.o vmacache.o \ + compaction.o \ interval_tree.o list_lru.o workingset.o \ debug.o gup.o mmap_lock.o $(mmu-y) diff --git a/mm/debug.c b/mm/debug.c index bef329bf28f0..2d625ca0e326 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -155,7 +155,7 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px seqnum %llu task_size %lu\n" + pr_emerg("mm %px mmap %px task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif @@ -183,7 +183,7 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, (long long) mm->vmacache_seqnum, mm->task_size, + mm, mm->mmap, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif diff --git a/mm/mmap.c b/mm/mmap.c index 7a1adc916957..7872642e8993 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -14,7 +14,6 @@ #include #include #include -#include #include #include #include @@ -680,9 +679,6 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, /* Remove from mm linked list - also updates highest_vm_end */ __vma_unlink_list(mm, next); - /* Kill the cache */ - vmacache_invalidate(mm); - if (file) __remove_shared_vm_struct(next, file, mapping); @@ -923,8 +919,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, __vma_unlink_list(mm, next); if (remove_next == 2) __vma_unlink_list(mm, next_next); - /* Kill the cache */ - vmacache_invalidate(mm); if (file) { __remove_shared_vm_struct(next, file, mapping); @@ -2233,19 +2227,10 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, unsigned long start_addr, unsigned long end_addr) { - struct vm_area_struct *vma; unsigned long index = start_addr; mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, start_addr); - if (likely(vma)) - return vma; - - vma = mt_find(&mm->mm_mt, &index, end_addr - 1); - if (vma) - vmacache_update(start_addr, vma); - return vma; + return mt_find(&mm->mm_mt, &index, end_addr - 1); } EXPORT_SYMBOL(find_vma_intersection); @@ -2259,19 +2244,10 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma; unsigned long index = addr; mmap_assert_locked(mm); - /* Check the cache first. */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - vma = mt_find(&mm->mm_mt, &index, ULONG_MAX); - if (vma) - vmacache_update(addr, vma); - return vma; + return mt_find(&mm->mm_mt, &index, ULONG_MAX); } EXPORT_SYMBOL(find_vma); @@ -2660,9 +2636,6 @@ detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; tail_vma->vm_next = NULL; - /* Kill the cache */ - vmacache_invalidate(mm); - /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or * VM_GROWSUP VMA. Such VMAs can change their size under diff --git a/mm/nommu.c b/mm/nommu.c index 2702790d05d3..265a444a2cc2 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -598,23 +597,12 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) */ static void delete_vma_from_mm(struct vm_area_struct *vma) { - int i; - struct address_space *mapping; - struct mm_struct *mm = vma->vm_mm; - struct task_struct *curr = current; MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); - mm->map_count--; - for (i = 0; i < VMACACHE_SIZE; i++) { - /* if the vma is cached, invalidate the entire cache */ - if (curr->vmacache.vmas[i] == vma) { - vmacache_invalidate(mm); - break; - } - } - + vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { + struct address_space *mapping; mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); @@ -626,7 +614,7 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); - __vma_unlink_list(mm, vma); + __vma_unlink_list(vma->vm_mm, vma); } /* @@ -659,20 +647,9 @@ EXPORT_SYMBOL(find_vma_intersection); */ struct vm_area_struct *find_vma(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma; MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = vmacache_find(mm, addr); - if (likely(vma)) - return vma; - - vma = mas_walk(&mas); - - if (vma) - vmacache_update(addr, vma); - - return vma; + return mas_walk(&mas); } EXPORT_SYMBOL(find_vma); @@ -706,11 +683,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, unsigned long end = addr + len; MA_STATE(mas, &mm->mm_mt, addr, addr); - /* check the cache first */ - vma = vmacache_find_exact(mm, addr, end); - if (vma) - return vma; - vma = mas_walk(&mas); if (!vma) return NULL; @@ -719,7 +691,6 @@ static struct vm_area_struct *find_vma_exact(struct mm_struct *mm, if (vma->vm_end != end) return NULL; - vmacache_update(addr, vma); return vma; } diff --git a/mm/vmacache.c b/mm/vmacache.c deleted file mode 100644 index 01a6e6688ec1..000000000000 --- a/mm/vmacache.c +++ /dev/null @@ -1,117 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2014 Davidlohr Bueso. - */ -#include -#include -#include -#include - -/* - * Hash based on the pmd of addr if configured with MMU, which provides a good - * hit rate for workloads with spatial locality. Otherwise, use pages. - */ -#ifdef CONFIG_MMU -#define VMACACHE_SHIFT PMD_SHIFT -#else -#define VMACACHE_SHIFT PAGE_SHIFT -#endif -#define VMACACHE_HASH(addr) ((addr >> VMACACHE_SHIFT) & VMACACHE_MASK) - -/* - * This task may be accessing a foreign mm via (for example) - * get_user_pages()->find_vma(). The vmacache is task-local and this - * task's vmacache pertains to a different mm (ie, its own). There is - * nothing we can do here. - * - * Also handle the case where a kernel thread has adopted this mm via - * kthread_use_mm(). That kernel thread's vmacache is not applicable to this mm. - */ -static inline bool vmacache_valid_mm(struct mm_struct *mm) -{ - return current->mm == mm && !(current->flags & PF_KTHREAD); -} - -void vmacache_update(unsigned long addr, struct vm_area_struct *newvma) -{ - if (vmacache_valid_mm(newvma->vm_mm)) - current->vmacache.vmas[VMACACHE_HASH(addr)] = newvma; -} - -static bool vmacache_valid(struct mm_struct *mm) -{ - struct task_struct *curr; - - if (!vmacache_valid_mm(mm)) - return false; - - curr = current; - if (mm->vmacache_seqnum != curr->vmacache.seqnum) { - /* - * First attempt will always be invalid, initialize - * the new cache for this task here. - */ - curr->vmacache.seqnum = mm->vmacache_seqnum; - vmacache_flush(curr); - return false; - } - return true; -} - -struct vm_area_struct *vmacache_find(struct mm_struct *mm, unsigned long addr) -{ - int idx = VMACACHE_HASH(addr); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma) { -#ifdef CONFIG_DEBUG_VM_VMACACHE - if (WARN_ON_ONCE(vma->vm_mm != mm)) - break; -#endif - if (vma->vm_start <= addr && vma->vm_end > addr) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} - -#ifndef CONFIG_MMU -struct vm_area_struct *vmacache_find_exact(struct mm_struct *mm, - unsigned long start, - unsigned long end) -{ - int idx = VMACACHE_HASH(start); - int i; - - count_vm_vmacache_event(VMACACHE_FIND_CALLS); - - if (!vmacache_valid(mm)) - return NULL; - - for (i = 0; i < VMACACHE_SIZE; i++) { - struct vm_area_struct *vma = current->vmacache.vmas[idx]; - - if (vma && vma->vm_start == start && vma->vm_end == end) { - count_vm_vmacache_event(VMACACHE_FIND_HITS); - return vma; - } - if (++idx == VMACACHE_SIZE) - idx = 0; - } - - return NULL; -} -#endif diff --git a/mm/vmstat.c b/mm/vmstat.c index 779f1ea6e8ea..bd8040f25c27 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1389,10 +1389,6 @@ const char * const vmstat_text[] = { "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ -#ifdef CONFIG_DEBUG_VM_VMACACHE - "vmacache_find_calls", - "vmacache_find_hits", -#endif #ifdef CONFIG_SWAP "swap_ra", "swap_ra_hit", From d7c62295570f012e1d386ae6ed472b36baf037ad Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:51 +0000 Subject: [PATCH 010/350] mm: convert vma_lookup() to use mtree_load() Unlike the rbtree, the Maple Tree will return a NULL if there's nothing at a particular address. Since the previous commit dropped the vmacache, it is now possible to consult the tree directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-27-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 06a6b8db75b7..49a58807719b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2795,12 +2795,7 @@ struct vm_area_struct *find_vma_intersection(struct mm_struct *mm, static inline struct vm_area_struct *vma_lookup(struct mm_struct *mm, unsigned long addr) { - struct vm_area_struct *vma = find_vma(mm, addr); - - if (vma && addr < vma->vm_start) - vma = NULL; - - return vma; + return mtree_load(&mm->mm_mt, addr); } static inline unsigned long vm_start_gap(struct vm_area_struct *vma) From e99668a56430a25a871113bcd3989ed20eae1cfc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:52 +0000 Subject: [PATCH 011/350] mm/mmap: move mmap_region() below do_munmap() Relocation of code for the next commit. There should be no changes here. Link: https://lkml.kernel.org/r/20220906194824.2110408-28-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 490 +++++++++++++++++++++++++++--------------------------- 1 file changed, 245 insertions(+), 245 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 7872642e8993..8c9e526994be 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1720,251 +1720,6 @@ static inline int accountable_mapping(struct file *file, vm_flags_t vm_flags) return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; } -unsigned long mmap_region(struct file *file, unsigned long addr, - unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, - struct list_head *uf) -{ - struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = NULL; - struct vm_area_struct *next, *prev, *merge; - pgoff_t pglen = len >> PAGE_SHIFT; - unsigned long charged = 0; - unsigned long end = addr + len; - unsigned long merge_start = addr, merge_end = end; - pgoff_t vm_pgoff; - int error; - MA_STATE(mas, &mm->mm_mt, addr, end - 1); - - /* Check against address space limit. */ - if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { - unsigned long nr_pages; - - /* - * MAP_FIXED may remove pages of mappings that intersects with - * requested mapping. Account for the pages it would unmap. - */ - nr_pages = count_vma_pages_range(mm, addr, end); - - if (!may_expand_vm(mm, vm_flags, - (len >> PAGE_SHIFT) - nr_pages)) - return -ENOMEM; - } - - /* Unmap any existing mapping in the area */ - if (do_munmap(mm, addr, len, uf)) - return -ENOMEM; - - /* - * Private writable mapping: check memory availability - */ - if (accountable_mapping(file, vm_flags)) { - charged = len >> PAGE_SHIFT; - if (security_vm_enough_memory_mm(mm, charged)) - return -ENOMEM; - vm_flags |= VM_ACCOUNT; - } - - next = mas_next(&mas, ULONG_MAX); - prev = mas_prev(&mas, 0); - if (vm_flags & VM_SPECIAL) - goto cannot_expand; - - /* Attempt to expand an old mapping */ - /* Check next */ - if (next && next->vm_start == end && !vma_policy(next) && - can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, - NULL_VM_UFFD_CTX, NULL)) { - merge_end = next->vm_end; - vma = next; - vm_pgoff = next->vm_pgoff - pglen; - } - - /* Check prev */ - if (prev && prev->vm_end == addr && !vma_policy(prev) && - (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, - pgoff, vma->vm_userfaultfd_ctx, NULL) : - can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, - NULL_VM_UFFD_CTX, NULL))) { - merge_start = prev->vm_start; - vma = prev; - vm_pgoff = prev->vm_pgoff; - } - - - /* Actually expand, if possible */ - if (vma && - !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { - khugepaged_enter_vma(vma, vm_flags); - goto expanded; - } - - mas.index = addr; - mas.last = end - 1; -cannot_expand: - /* - * Determine the object being mapped and call the appropriate - * specific mapper. the address has already been validated, but - * not unmapped, but the maps are removed from the list. - */ - vma = vm_area_alloc(mm); - if (!vma) { - error = -ENOMEM; - goto unacct_error; - } - - vma->vm_start = addr; - vma->vm_end = end; - vma->vm_flags = vm_flags; - vma->vm_page_prot = vm_get_page_prot(vm_flags); - vma->vm_pgoff = pgoff; - - if (file) { - if (vm_flags & VM_SHARED) { - error = mapping_map_writable(file->f_mapping); - if (error) - goto free_vma; - } - - vma->vm_file = get_file(file); - error = call_mmap(file, vma); - if (error) - goto unmap_and_free_vma; - - /* Can addr have changed?? - * - * Answer: Yes, several device drivers can do it in their - * f_op->mmap method. -DaveM - */ - WARN_ON_ONCE(addr != vma->vm_start); - - addr = vma->vm_start; - mas_reset(&mas); - - /* - * If vm_flags changed after call_mmap(), we should try merge - * vma again as we may succeed this time. - */ - if (unlikely(vm_flags != vma->vm_flags && prev)) { - merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, - NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); - if (merge) { - /* - * ->mmap() can change vma->vm_file and fput - * the original file. So fput the vma->vm_file - * here or we would add an extra fput for file - * and cause general protection fault - * ultimately. - */ - fput(vma->vm_file); - vm_area_free(vma); - vma = merge; - /* Update vm_flags to pick up the change. */ - addr = vma->vm_start; - vm_flags = vma->vm_flags; - goto unmap_writable; - } - } - - vm_flags = vma->vm_flags; - } else if (vm_flags & VM_SHARED) { - error = shmem_zero_setup(vma); - if (error) - goto free_vma; - } else { - vma_set_anonymous(vma); - } - - /* Allow architectures to sanity-check the vm_flags */ - if (!arch_validate_flags(vma->vm_flags)) { - error = -EINVAL; - if (file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - if (mas_preallocate(&mas, vma, GFP_KERNEL)) { - error = -ENOMEM; - if (file) - goto unmap_and_free_vma; - else - goto free_vma; - } - - if (vma->vm_file) - i_mmap_lock_write(vma->vm_file->f_mapping); - - vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); - mm->map_count++; - if (vma->vm_file) { - if (vma->vm_flags & VM_SHARED) - mapping_allow_writable(vma->vm_file->f_mapping); - - flush_dcache_mmap_lock(vma->vm_file->f_mapping); - vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); - flush_dcache_mmap_unlock(vma->vm_file->f_mapping); - i_mmap_unlock_write(vma->vm_file->f_mapping); - } - - /* - * vma_merge() calls khugepaged_enter_vma() either, the below - * call covers the non-merge case. - */ - khugepaged_enter_vma(vma, vma->vm_flags); - - /* Once vma denies write, undo our temporary denial count */ -unmap_writable: - if (file && vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); - file = vma->vm_file; -expanded: - perf_event_mmap(vma); - - vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); - if (vm_flags & VM_LOCKED) { - if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || - is_vm_hugetlb_page(vma) || - vma == get_gate_vma(current->mm)) - vma->vm_flags &= VM_LOCKED_CLEAR_MASK; - else - mm->locked_vm += (len >> PAGE_SHIFT); - } - - if (file) - uprobe_mmap(vma); - - /* - * New (or expanded) vma always get soft dirty status. - * Otherwise user-space soft-dirty page tracker won't - * be able to distinguish situation when vma area unmapped, - * then new mapped in-place (which must be aimed as - * a completely new data area). - */ - vma->vm_flags |= VM_SOFTDIRTY; - - vma_set_page_prot(vma); - - validate_mm(mm); - return addr; - -unmap_and_free_vma: - fput(vma->vm_file); - vma->vm_file = NULL; - - /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); - if (vm_flags & VM_SHARED) - mapping_unmap_writable(file->f_mapping); -free_vma: - vm_area_free(vma); -unacct_error: - if (charged) - vm_unacct_memory(charged); - validate_mm(mm); - return error; -} - /** * unmapped_area() - Find an area between the low_limit and the high_limit with * the correct alignment and offset, all from @info. Note: current->mm is used @@ -2840,6 +2595,251 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, return __do_munmap(mm, start, len, uf, false); } +unsigned long mmap_region(struct file *file, unsigned long addr, + unsigned long len, vm_flags_t vm_flags, unsigned long pgoff, + struct list_head *uf) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma = NULL; + struct vm_area_struct *next, *prev, *merge; + pgoff_t pglen = len >> PAGE_SHIFT; + unsigned long charged = 0; + unsigned long end = addr + len; + unsigned long merge_start = addr, merge_end = end; + pgoff_t vm_pgoff; + int error; + MA_STATE(mas, &mm->mm_mt, addr, end - 1); + + /* Check against address space limit. */ + if (!may_expand_vm(mm, vm_flags, len >> PAGE_SHIFT)) { + unsigned long nr_pages; + + /* + * MAP_FIXED may remove pages of mappings that intersects with + * requested mapping. Account for the pages it would unmap. + */ + nr_pages = count_vma_pages_range(mm, addr, end); + + if (!may_expand_vm(mm, vm_flags, + (len >> PAGE_SHIFT) - nr_pages)) + return -ENOMEM; + } + + /* Unmap any existing mapping in the area */ + if (do_munmap(mm, addr, len, uf)) + return -ENOMEM; + + /* + * Private writable mapping: check memory availability + */ + if (accountable_mapping(file, vm_flags)) { + charged = len >> PAGE_SHIFT; + if (security_vm_enough_memory_mm(mm, charged)) + return -ENOMEM; + vm_flags |= VM_ACCOUNT; + } + + next = mas_next(&mas, ULONG_MAX); + prev = mas_prev(&mas, 0); + if (vm_flags & VM_SPECIAL) + goto cannot_expand; + + /* Attempt to expand an old mapping */ + /* Check next */ + if (next && next->vm_start == end && !vma_policy(next) && + can_vma_merge_before(next, vm_flags, NULL, file, pgoff+pglen, + NULL_VM_UFFD_CTX, NULL)) { + merge_end = next->vm_end; + vma = next; + vm_pgoff = next->vm_pgoff - pglen; + } + + /* Check prev */ + if (prev && prev->vm_end == addr && !vma_policy(prev) && + (vma ? can_vma_merge_after(prev, vm_flags, vma->anon_vma, file, + pgoff, vma->vm_userfaultfd_ctx, NULL) : + can_vma_merge_after(prev, vm_flags, NULL, file, pgoff, + NULL_VM_UFFD_CTX, NULL))) { + merge_start = prev->vm_start; + vma = prev; + vm_pgoff = prev->vm_pgoff; + } + + + /* Actually expand, if possible */ + if (vma && + !vma_expand(&mas, vma, merge_start, merge_end, vm_pgoff, next)) { + khugepaged_enter_vma(vma, vm_flags); + goto expanded; + } + + mas.index = addr; + mas.last = end - 1; +cannot_expand: + /* + * Determine the object being mapped and call the appropriate + * specific mapper. the address has already been validated, but + * not unmapped, but the maps are removed from the list. + */ + vma = vm_area_alloc(mm); + if (!vma) { + error = -ENOMEM; + goto unacct_error; + } + + vma->vm_start = addr; + vma->vm_end = end; + vma->vm_flags = vm_flags; + vma->vm_page_prot = vm_get_page_prot(vm_flags); + vma->vm_pgoff = pgoff; + + if (file) { + if (vm_flags & VM_SHARED) { + error = mapping_map_writable(file->f_mapping); + if (error) + goto free_vma; + } + + vma->vm_file = get_file(file); + error = call_mmap(file, vma); + if (error) + goto unmap_and_free_vma; + + /* Can addr have changed?? + * + * Answer: Yes, several device drivers can do it in their + * f_op->mmap method. -DaveM + */ + WARN_ON_ONCE(addr != vma->vm_start); + + addr = vma->vm_start; + mas_reset(&mas); + + /* + * If vm_flags changed after call_mmap(), we should try merge + * vma again as we may succeed this time. + */ + if (unlikely(vm_flags != vma->vm_flags && prev)) { + merge = vma_merge(mm, prev, vma->vm_start, vma->vm_end, vma->vm_flags, + NULL, vma->vm_file, vma->vm_pgoff, NULL, NULL_VM_UFFD_CTX, NULL); + if (merge) { + /* + * ->mmap() can change vma->vm_file and fput + * the original file. So fput the vma->vm_file + * here or we would add an extra fput for file + * and cause general protection fault + * ultimately. + */ + fput(vma->vm_file); + vm_area_free(vma); + vma = merge; + /* Update vm_flags to pick up the change. */ + addr = vma->vm_start; + vm_flags = vma->vm_flags; + goto unmap_writable; + } + } + + vm_flags = vma->vm_flags; + } else if (vm_flags & VM_SHARED) { + error = shmem_zero_setup(vma); + if (error) + goto free_vma; + } else { + vma_set_anonymous(vma); + } + + /* Allow architectures to sanity-check the vm_flags */ + if (!arch_validate_flags(vma->vm_flags)) { + error = -EINVAL; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + error = -ENOMEM; + if (file) + goto unmap_and_free_vma; + else + goto free_vma; + } + + if (vma->vm_file) + i_mmap_lock_write(vma->vm_file->f_mapping); + + vma_mas_store(vma, &mas); + __vma_link_list(mm, vma, prev); + mm->map_count++; + if (vma->vm_file) { + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(vma->vm_file->f_mapping); + + flush_dcache_mmap_lock(vma->vm_file->f_mapping); + vma_interval_tree_insert(vma, &vma->vm_file->f_mapping->i_mmap); + flush_dcache_mmap_unlock(vma->vm_file->f_mapping); + i_mmap_unlock_write(vma->vm_file->f_mapping); + } + + /* + * vma_merge() calls khugepaged_enter_vma() either, the below + * call covers the non-merge case. + */ + khugepaged_enter_vma(vma, vma->vm_flags); + + /* Once vma denies write, undo our temporary denial count */ +unmap_writable: + if (file && vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); + file = vma->vm_file; +expanded: + perf_event_mmap(vma); + + vm_stat_account(mm, vm_flags, len >> PAGE_SHIFT); + if (vm_flags & VM_LOCKED) { + if ((vm_flags & VM_SPECIAL) || vma_is_dax(vma) || + is_vm_hugetlb_page(vma) || + vma == get_gate_vma(current->mm)) + vma->vm_flags &= VM_LOCKED_CLEAR_MASK; + else + mm->locked_vm += (len >> PAGE_SHIFT); + } + + if (file) + uprobe_mmap(vma); + + /* + * New (or expanded) vma always get soft dirty status. + * Otherwise user-space soft-dirty page tracker won't + * be able to distinguish situation when vma area unmapped, + * then new mapped in-place (which must be aimed as + * a completely new data area). + */ + vma->vm_flags |= VM_SOFTDIRTY; + + vma_set_page_prot(vma); + + validate_mm(mm); + return addr; + +unmap_and_free_vma: + fput(vma->vm_file); + vma->vm_file = NULL; + + /* Undo any partial mapping done by a device driver. */ + unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + if (vm_flags & VM_SHARED) + mapping_unmap_writable(file->f_mapping); +free_vma: + vm_area_free(vma); +unacct_error: + if (charged) + vm_unacct_memory(charged); + validate_mm(mm); + return error; +} + static int __vm_munmap(unsigned long start, size_t len, bool downgrade) { int ret; From 11f9a21ab65542189372b7d64bb2d2937dfdc9dc Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:52 +0000 Subject: [PATCH 012/350] mm/mmap: reorganize munmap to use maple states Remove __do_munmap() in favour of do_munmap(), do_mas_munmap(), and do_mas_align_munmap(). do_munmap() is a wrapper to create a maple state for any callers that have not been converted to the maple tree. do_mas_munmap() takes a maple state to mumap a range. This is just a small function which checks for error conditions and aligns the end of the range. do_mas_align_munmap() uses the aligned range to mumap a range. do_mas_align_munmap() starts with the first VMA in the range, then finds the last VMA in the range. Both start and end are split if necessary. Then the VMAs are removed from the linked list and the mm mlock count is updated at the same time. Followed by a single tree operation of overwriting the area in with a NULL. Finally, the detached list is unmapped and freed. By reorganizing the munmap calls as outlined, it is now possible to avoid extra work of aligning pre-aligned callers which are known to be safe, avoid extra VMA lookups or tree walks for modifications. detach_vmas_to_be_unmapped() is no longer used, so drop this code. vm_brk_flags() can just call the do_mas_munmap() as it checks for intersecting VMAs directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-29-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +- mm/mmap.c | 236 ++++++++++++++++++++++++++++----------------- mm/mremap.c | 17 ++-- 3 files changed, 162 insertions(+), 96 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 49a58807719b..579449d6c23b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2710,8 +2710,9 @@ extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long do_mmap(struct file *file, unsigned long addr, unsigned long len, unsigned long prot, unsigned long flags, unsigned long pgoff, unsigned long *populate, struct list_head *uf); -extern int __do_munmap(struct mm_struct *, unsigned long, size_t, - struct list_head *uf, bool downgrade); +extern int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade); extern int do_munmap(struct mm_struct *, unsigned long, size_t, struct list_head *uf); extern int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior); diff --git a/mm/mmap.c b/mm/mmap.c index 8c9e526994be..6e587f4e3a7d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2362,47 +2362,6 @@ static void unmap_region(struct mm_struct *mm, tlb_finish_mmu(&tlb); } -/* - * Create a list of vma's touched by the unmap, removing them from the mm's - * vma list as we go.. - */ -static bool -detach_vmas_to_be_unmapped(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long end) -{ - struct vm_area_struct **insertion_point; - struct vm_area_struct *tail_vma = NULL; - - insertion_point = (prev ? &prev->vm_next : &mm->mmap); - vma->vm_prev = NULL; - vma_mas_szero(mas, vma->vm_start, end); - do { - if (vma->vm_flags & VM_LOCKED) - mm->locked_vm -= vma_pages(vma); - mm->map_count--; - tail_vma = vma; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); - *insertion_point = vma; - if (vma) - vma->vm_prev = prev; - else - mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; - tail_vma->vm_next = NULL; - - /* - * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or - * VM_GROWSUP VMA. Such VMAs can change their size under - * down_read(mmap_lock) and collide with the VMA we are about to unmap. - */ - if (vma && (vma->vm_flags & VM_GROWSDOWN)) - return false; - if (prev && (prev->vm_flags & VM_GROWSUP)) - return false; - return true; -} - /* * __split_vma() bypasses sysctl_max_map_count checking. We use this where it * has already been checked or doesn't make sense to fail. @@ -2485,40 +2444,51 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -/* Munmap is split into 2 main parts -- this part which finds - * what needs doing, and the areas themselves, which do the - * work. This now handles partial unmappings. - * Jeremy Fitzhardinge - */ -int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, - struct list_head *uf, bool downgrade) +static inline int +unlock_range(struct vm_area_struct *start, struct vm_area_struct **tail, + unsigned long limit) { - unsigned long end; - struct vm_area_struct *vma, *prev, *last; + struct mm_struct *mm = start->vm_mm; + struct vm_area_struct *tmp = start; + int count = 0; + + while (tmp && tmp->vm_start < limit) { + *tail = tmp; + count++; + if (tmp->vm_flags & VM_LOCKED) + mm->locked_vm -= vma_pages(tmp); + + tmp = tmp->vm_next; + } + + return count; +} + +/* + * do_mas_align_munmap() - munmap the aligned region from @start to @end. + * @mas: The maple_state, ideally set up to alter the correct tree location. + * @vma: The starting vm_area_struct + * @mm: The mm_struct + * @start: The aligned start address to munmap. + * @end: The aligned end address to munmap. + * @uf: The userfaultfd list_head + * @downgrade: Set to true to attempt a write downgrade of the mmap_sem + * + * If @downgrade is true, check return code for potential release of the lock. + */ +static int +do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, + struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf, bool downgrade) +{ + struct vm_area_struct *prev, *last; int error = -ENOMEM; - MA_STATE(mas, &mm->mm_mt, 0, 0); - - if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) - return -EINVAL; - - len = PAGE_ALIGN(len); - end = start + len; - if (len == 0) - return -EINVAL; - - /* arch_unmap() might do unmaps itself. */ - arch_unmap(mm, start, end); - - /* Find the first overlapping VMA where start < vma->vm_end */ - vma = find_vma_intersection(mm, start, end); - if (!vma) - return 0; - - if (mas_preallocate(&mas, vma, GFP_KERNEL)) - return -ENOMEM; - prev = vma->vm_prev; /* we have start < vma->vm_end */ + if (mas_preallocate(mas, vma, GFP_KERNEL)) + return -ENOMEM; + + mas->last = end - 1; /* * If we need to split any vma, do it now to save pain later. * @@ -2539,17 +2509,31 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, error = __split_vma(mm, vma, start, 0); if (error) goto split_failed; + prev = vma; + vma = __vma_next(mm, prev); + mas->index = start; + mas_reset(mas); + } else { + prev = vma->vm_prev; } + if (vma->vm_end >= end) + last = vma; + else + last = find_vma_intersection(mm, end - 1, end); + /* Does it split the last one? */ - last = find_vma(mm, end); - if (last && end > last->vm_start) { + if (last && end < last->vm_end) { error = __split_vma(mm, last, end, 1); + if (error) goto split_failed; + + if (vma == last) + vma = __vma_next(mm, prev); + mas_reset(mas); } - vma = __vma_next(mm, prev); if (unlikely(uf)) { /* @@ -2562,16 +2546,46 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, * failure that it's not worth optimizing it for. */ error = userfaultfd_unmap_prep(vma, start, end, uf); + if (error) goto userfaultfd_error; } - /* Detach vmas from rbtree */ - if (!detach_vmas_to_be_unmapped(mm, &mas, vma, prev, end)) - downgrade = false; + /* + * unlock any mlock()ed ranges before detaching vmas, count the number + * of VMAs to be dropped, and return the tail entry of the affected + * area. + */ + mm->map_count -= unlock_range(vma, &last, end); + /* Drop removed area from the tree */ + mas_store_prealloc(mas, NULL); - if (downgrade) - mmap_write_downgrade(mm); + /* Detach vmas from the MM linked list */ + vma->vm_prev = NULL; + if (prev) + prev->vm_next = last->vm_next; + else + mm->mmap = last->vm_next; + + if (last->vm_next) { + last->vm_next->vm_prev = prev; + last->vm_next = NULL; + } else + mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; + + /* + * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or + * VM_GROWSUP VMA. Such VMAs can change their size under + * down_read(mmap_lock) and collide with the VMA we are about to unmap. + */ + if (downgrade) { + if (last && (last->vm_flags & VM_GROWSDOWN)) + downgrade = false; + else if (prev && (prev->vm_flags & VM_GROWSUP)) + downgrade = false; + else + mmap_write_downgrade(mm); + } unmap_region(mm, vma, prev, start, end); @@ -2585,14 +2599,63 @@ int __do_munmap(struct mm_struct *mm, unsigned long start, size_t len, map_count_exceeded: split_failed: userfaultfd_error: - mas_destroy(&mas); + mas_destroy(mas); return error; } +/* + * do_mas_munmap() - munmap a given range. + * @mas: The maple state + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length of the range to munmap + * @uf: The userfaultfd list_head + * @downgrade: set to true if the user wants to attempt to write_downgrade the + * mmap_sem + * + * This function takes a @mas that is either pointing to the previous VMA or set + * to MA_START and sets it up to remove the mapping(s). The @len will be + * aligned and any arch_unmap work will be preformed. + * + * Returns: -EINVAL on failure, 1 on success and unlock, 0 otherwise. + */ +int do_mas_munmap(struct ma_state *mas, struct mm_struct *mm, + unsigned long start, size_t len, struct list_head *uf, + bool downgrade) +{ + unsigned long end; + struct vm_area_struct *vma; + + if ((offset_in_page(start)) || start > TASK_SIZE || len > TASK_SIZE-start) + return -EINVAL; + + end = start + PAGE_ALIGN(len); + if (end == start) + return -EINVAL; + + /* arch_unmap() might do unmaps itself. */ + arch_unmap(mm, start, end); + + /* Find the first overlapping VMA */ + vma = mas_find(mas, end - 1); + if (!vma) + return 0; + + return do_mas_align_munmap(mas, vma, mm, start, end, uf, downgrade); +} + +/* do_munmap() - Wrapper function for non-maple tree aware do_munmap() calls. + * @mm: The mm_struct + * @start: The start address to munmap + * @len: The length to be munmapped. + * @uf: The userfaultfd list_head + */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { - return __do_munmap(mm, start, len, uf, false); + MA_STATE(mas, &mm->mm_mt, start, start); + + return do_mas_munmap(&mas, mm, start, len, uf, false); } unsigned long mmap_region(struct file *file, unsigned long addr, @@ -2626,7 +2689,7 @@ unsigned long mmap_region(struct file *file, unsigned long addr, } /* Unmap any existing mapping in the area */ - if (do_munmap(mm, addr, len, uf)) + if (do_mas_munmap(&mas, mm, addr, len, uf, false)) return -ENOMEM; /* @@ -2845,11 +2908,12 @@ static int __vm_munmap(unsigned long start, size_t len, bool downgrade) int ret; struct mm_struct *mm = current->mm; LIST_HEAD(uf); + MA_STATE(mas, &mm->mm_mt, start, start); if (mmap_write_lock_killable(mm)) return -EINTR; - ret = __do_munmap(mm, start, len, &uf, downgrade); + ret = do_mas_munmap(&mas, mm, start, len, &uf, downgrade); /* * Returning 1 indicates mmap_lock is downgraded. * But 1 is not legal return value of vm_munmap() and munmap(), reset @@ -2978,7 +3042,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = __do_munmap(mm, newbrk, oldbrk - newbrk, uf, true); + ret = do_mas_munmap(mas, mm, newbrk, oldbrk-newbrk, uf, true); validate_mm_mt(mm); return ret; } @@ -3116,9 +3180,7 @@ int vm_brk_flags(unsigned long addr, unsigned long request, unsigned long flags) if (ret) goto limits_failed; - if (find_vma_intersection(mm, addr, addr + len)) - ret = do_munmap(mm, addr, len, &uf); - + ret = do_mas_munmap(&mas, mm, addr, len, &uf, 0); if (ret) goto munmap_failed; diff --git a/mm/mremap.c b/mm/mremap.c index b522cd0259a0..e0fba9004246 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -975,20 +975,23 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* * Always allow a shrinking remap: that just unmaps * the unnecessary pages.. - * __do_munmap does all the needed commit accounting, and + * do_mas_munmap does all the needed commit accounting, and * downgrades mmap_lock to read if so directed. */ if (old_len >= new_len) { int retval; + MA_STATE(mas, &mm->mm_mt, addr + new_len, addr + new_len); - retval = __do_munmap(mm, addr+new_len, old_len - new_len, - &uf_unmap, true); - if (retval < 0 && old_len != new_len) { + retval = do_mas_munmap(&mas, mm, addr + new_len, + old_len - new_len, &uf_unmap, true); + /* Returning 1 indicates mmap_lock is downgraded to read. */ + if (retval == 1) { + downgraded = true; + } else if (retval < 0 && old_len != new_len) { ret = retval; goto out; - /* Returning 1 indicates mmap_lock is downgraded to read. */ - } else if (retval == 1) - downgraded = true; + } + ret = addr; goto out; } From 67e7c16764c3cbf84a57d441fba3474217ac08d6 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:52 +0000 Subject: [PATCH 013/350] mm/mmap: change do_brk_munmap() to use do_mas_align_munmap() do_brk_munmap() has already aligned the address and has a maple tree state to be used. Use the new do_mas_align_munmap() to avoid unnecessary alignment and error checks. Link: https://lkml.kernel.org/r/20220906194824.2110408-30-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/mmap.c b/mm/mmap.c index 6e587f4e3a7d..8b7e9d5afd38 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3042,7 +3042,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, int ret; arch_unmap(mm, newbrk, oldbrk); - ret = do_mas_munmap(mas, mm, newbrk, oldbrk-newbrk, uf, true); + ret = do_mas_align_munmap(mas, vma, mm, newbrk, oldbrk, uf, true); validate_mm_mt(mm); return ret; } From de2b84d24b87172913754bca6db85d5c5998213b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:53 +0000 Subject: [PATCH 014/350] arm64: remove mmap linked list from vdso Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-31-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/arm64/kernel/vdso.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kernel/vdso.c b/arch/arm64/kernel/vdso.c index a61fc4f989b3..a8388af62b99 100644 --- a/arch/arm64/kernel/vdso.c +++ b/arch/arm64/kernel/vdso.c @@ -136,10 +136,11 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, vdso_info[VDSO_ABI_AA64].dm)) From ef770d180ebae967b19a3964bc1cc026f3082f9a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:53 +0000 Subject: [PATCH 015/350] arm64: Change elfcore for_each_mte_vma() to use VMA iterator Rework for_each_mte_vma() to use a VMA iterator instead of an explicit linked-list. Link: https://lkml.kernel.org/r/20220906194824.2110408-32-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Catalin Marinas Link: https://lore.kernel.org/r/20220218023650.672072-1-Liam.Howlett@oracle.com Signed-off-by: Will Deacon Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/arm64/kernel/elfcore.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 98d67444a5b6..27ef7ad3ffd2 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -8,9 +8,9 @@ #include #include -#define for_each_mte_vma(tsk, vma) \ +#define for_each_mte_vma(vmi, vma) \ if (system_supports_mte()) \ - for (vma = tsk->mm->mmap; vma; vma = vma->vm_next) \ + for_each_vma(vmi, vma) \ if (vma->vm_flags & VM_MTE) static unsigned long mte_vma_tag_dump_size(struct vm_area_struct *vma) @@ -81,8 +81,9 @@ Elf_Half elf_core_extra_phdrs(void) { struct vm_area_struct *vma; int vma_count = 0; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) + for_each_mte_vma(vmi, vma) vma_count++; return vma_count; @@ -91,8 +92,9 @@ Elf_Half elf_core_extra_phdrs(void) int elf_core_write_extra_phdrs(struct coredump_params *cprm, loff_t offset) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) { + for_each_mte_vma(vmi, vma) { struct elf_phdr phdr; phdr.p_type = PT_AARCH64_MEMTAG_MTE; @@ -116,8 +118,9 @@ size_t elf_core_extra_data_size(void) { struct vm_area_struct *vma; size_t data_size = 0; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) + for_each_mte_vma(vmi, vma) data_size += mte_vma_tag_dump_size(vma); return data_size; @@ -126,8 +129,9 @@ size_t elf_core_extra_data_size(void) int elf_core_write_extra_data(struct coredump_params *cprm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, current->mm, 0); - for_each_mte_vma(current, vma) { + for_each_mte_vma(vmi, vma) { if (vma->vm_flags & VM_DONTDUMP) continue; From 70fa203165d96ae03abb83cf60d30c44e6b81a12 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:53 +0000 Subject: [PATCH 016/350] parisc: remove mmap linked list from cache handling Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-33-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/parisc/kernel/cache.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/parisc/kernel/cache.c b/arch/parisc/kernel/cache.c index 3feb7694e0ca..1d3b8bc8a623 100644 --- a/arch/parisc/kernel/cache.c +++ b/arch/parisc/kernel/cache.c @@ -657,15 +657,20 @@ static inline unsigned long mm_total_size(struct mm_struct *mm) { struct vm_area_struct *vma; unsigned long usize = 0; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma && usize < parisc_cache_flush_threshold; vma = vma->vm_next) + for_each_vma(vmi, vma) { + if (usize >= parisc_cache_flush_threshold) + break; usize += vma->vm_end - vma->vm_start; + } return usize; } void flush_cache_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); /* * Flushing the whole cache on each cpu takes forever on @@ -685,7 +690,7 @@ void flush_cache_mm(struct mm_struct *mm) } /* Flush mm */ - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) flush_cache_pages(vma, vma->vm_start, vma->vm_end); } From 405e669172e20be9a42cecf8be0fbed089fab045 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:53 +0000 Subject: [PATCH 017/350] powerpc: remove mmap linked list walks Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-34-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/powerpc/kernel/vdso.c | 6 +++--- arch/powerpc/mm/book3s32/tlb.c | 11 ++++++----- arch/powerpc/mm/book3s64/subpage_prot.c | 13 ++----------- 3 files changed, 11 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kernel/vdso.c b/arch/powerpc/kernel/vdso.c index 0da287544054..94a8fa5017c3 100644 --- a/arch/powerpc/kernel/vdso.c +++ b/arch/powerpc/kernel/vdso.c @@ -113,18 +113,18 @@ struct vdso_data *arch_get_vdso_data(void *vvar_page) int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; mmap_read_lock(mm); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, &vvar_spec)) zap_page_range(vma, vma->vm_start, size); } - mmap_read_unlock(mm); + return 0; } diff --git a/arch/powerpc/mm/book3s32/tlb.c b/arch/powerpc/mm/book3s32/tlb.c index 19f0ef950d77..9ad6b56bfec9 100644 --- a/arch/powerpc/mm/book3s32/tlb.c +++ b/arch/powerpc/mm/book3s32/tlb.c @@ -81,14 +81,15 @@ EXPORT_SYMBOL(hash__flush_range); void hash__flush_tlb_mm(struct mm_struct *mm) { struct vm_area_struct *mp; + VMA_ITERATOR(vmi, mm, 0); /* - * It is safe to go down the mm's list of vmas when called - * from dup_mmap, holding mmap_lock. It would also be safe from - * unmap_region or exit_mmap, but not from vmtruncate on SMP - - * but it seems dup_mmap is the only SMP case which gets here. + * It is safe to iterate the vmas when called from dup_mmap, + * holding mmap_lock. It would also be safe from unmap_region + * or exit_mmap, but not from vmtruncate on SMP - but it seems + * dup_mmap is the only SMP case which gets here. */ - for (mp = mm->mmap; mp != NULL; mp = mp->vm_next) + for_each_vma(vmi, mp) hash__flush_range(mp->vm_mm, mp->vm_start, mp->vm_end); } EXPORT_SYMBOL(hash__flush_tlb_mm); diff --git a/arch/powerpc/mm/book3s64/subpage_prot.c b/arch/powerpc/mm/book3s64/subpage_prot.c index 60c6ea16a972..d73b3b4176e8 100644 --- a/arch/powerpc/mm/book3s64/subpage_prot.c +++ b/arch/powerpc/mm/book3s64/subpage_prot.c @@ -149,24 +149,15 @@ static void subpage_mark_vma_nohuge(struct mm_struct *mm, unsigned long addr, unsigned long len) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, addr); /* * We don't try too hard, we just mark all the vma in that range * VM_NOHUGEPAGE and split them. */ - vma = find_vma(mm, addr); - /* - * If the range is in unmapped range, just return - */ - if (vma && ((addr + len) <= vma->vm_start)) - return; - - while (vma) { - if (vma->vm_start >= (addr + len)) - break; + for_each_vma_range(vmi, vma, addr + len) { vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &subpage_walk_ops, NULL); - vma = vma->vm_next; } } #else From e7b6b990e524f60994da70cf5a22159b1e88ce57 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:54 +0000 Subject: [PATCH 018/350] s390: remove vma linked list walks Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-35-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/s390/kernel/vdso.c | 3 ++- arch/s390/mm/gmap.c | 6 ++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c index 5075cde77b29..535099f2736d 100644 --- a/arch/s390/kernel/vdso.c +++ b/arch/s390/kernel/vdso.c @@ -69,10 +69,11 @@ static struct page *find_timens_vvar_page(struct vm_area_struct *vma) int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (!vma_is_special_mapping(vma, &vvar_mapping)) diff --git a/arch/s390/mm/gmap.c b/arch/s390/mm/gmap.c index 62758cb5872f..02d15c8dc92e 100644 --- a/arch/s390/mm/gmap.c +++ b/arch/s390/mm/gmap.c @@ -2515,8 +2515,9 @@ static const struct mm_walk_ops thp_split_walk_ops = { static inline void thp_split_mm(struct mm_struct *mm) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) { + for_each_vma(vmi, vma) { vma->vm_flags &= ~VM_HUGEPAGE; vma->vm_flags |= VM_NOHUGEPAGE; walk_page_vma(vma, &thp_split_walk_ops, NULL); @@ -2584,8 +2585,9 @@ int gmap_mark_unmergeable(void) struct mm_struct *mm = current->mm; struct vm_area_struct *vma; int ret; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { ret = ksm_madvise(vma, vma->vm_start, vma->vm_end, MADV_UNMERGEABLE, &vma->vm_flags); if (ret) From a3884621163b7d7fab89b44461b2a48a29c5cc9a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:54 +0000 Subject: [PATCH 019/350] x86: remove vma linked list walks Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-36-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/x86/entry/vdso/vma.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c index 1000d457c332..6292b960037b 100644 --- a/arch/x86/entry/vdso/vma.c +++ b/arch/x86/entry/vdso/vma.c @@ -127,17 +127,17 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, &vvar_mapping)) zap_page_range(vma, vma->vm_start, size); } - mmap_read_unlock(mm); + return 0; } #else @@ -354,6 +354,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); /* @@ -363,7 +364,7 @@ int map_vdso_once(const struct vdso_image *image, unsigned long addr) * We could search vma near context.vdso, but it's a slowpath, * so let's explicitly check all VMAs to be completely sure. */ - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma_is_special_mapping(vma, &vdso_mapping) || vma_is_special_mapping(vma, &vvar_mapping)) { mmap_write_unlock(mm); From 49c40fb4b826c90036f04abf583bb4cb5ba3d203 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:55 +0000 Subject: [PATCH 020/350] xtensa: remove vma linked list walks Use the VMA iterator instead. Since VMA can no longer be NULL in the loop, then deal with out-of-memory outside the loop. This means a slightly longer run time in the failure case (-ENOMEM) - it will run to the end of the VMAs before erroring instead of in the middle of the loop. Link: https://lkml.kernel.org/r/20220906194824.2110408-37-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/xtensa/kernel/syscall.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/xtensa/kernel/syscall.c b/arch/xtensa/kernel/syscall.c index 201356faa7e6..b3c2450d6f23 100644 --- a/arch/xtensa/kernel/syscall.c +++ b/arch/xtensa/kernel/syscall.c @@ -58,6 +58,7 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { struct vm_area_struct *vmm; + struct vma_iterator vmi; if (flags & MAP_FIXED) { /* We do not accept a shared mapping if it would violate @@ -79,15 +80,20 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, else addr = PAGE_ALIGN(addr); - for (vmm = find_vma(current->mm, addr); ; vmm = vmm->vm_next) { - /* At this point: (!vmm || addr < vmm->vm_end). */ - if (TASK_SIZE - len < addr) - return -ENOMEM; - if (!vmm || addr + len <= vm_start_gap(vmm)) - return addr; + vma_iter_init(&vmi, current->mm, addr); + for_each_vma(vmi, vmm) { + /* At this point: (addr < vmm->vm_end). */ + if (addr + len <= vm_start_gap(vmm)) + break; + addr = vmm->vm_end; if (flags & MAP_SHARED) addr = COLOUR_ALIGN(addr, pgoff); } + + if (TASK_SIZE - len < addr) + return -ENOMEM; + + return addr; } #endif From d9fa0e37cdd47cfb71f5fa7a599ef5b9e32c55ed Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:55 +0000 Subject: [PATCH 021/350] cxl: remove vma linked list walk Use the VMA iterator instead. This requires a little restructuring of the surrounding code to hoist the mm to the caller. That turns cxl_prefault_one() into a trivial function, so call cxl_fault_segment() directly. Link: https://lkml.kernel.org/r/20220906194824.2110408-38-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/misc/cxl/fault.c | 45 ++++++++++++++-------------------------- 1 file changed, 15 insertions(+), 30 deletions(-) diff --git a/drivers/misc/cxl/fault.c b/drivers/misc/cxl/fault.c index 60c829113299..2c64f55cf01f 100644 --- a/drivers/misc/cxl/fault.c +++ b/drivers/misc/cxl/fault.c @@ -280,22 +280,6 @@ void cxl_handle_fault(struct work_struct *fault_work) mmput(mm); } -static void cxl_prefault_one(struct cxl_context *ctx, u64 ea) -{ - struct mm_struct *mm; - - mm = get_mem_context(ctx); - if (mm == NULL) { - pr_devel("cxl_prefault_one unable to get mm %i\n", - pid_nr(ctx->pid)); - return; - } - - cxl_fault_segment(ctx, mm, ea); - - mmput(mm); -} - static u64 next_segment(u64 ea, u64 vsid) { if (vsid & SLB_VSID_B_1T) @@ -306,23 +290,16 @@ static u64 next_segment(u64 ea, u64 vsid) return ea + 1; } -static void cxl_prefault_vma(struct cxl_context *ctx) +static void cxl_prefault_vma(struct cxl_context *ctx, struct mm_struct *mm) { u64 ea, last_esid = 0; struct copro_slb slb; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int rc; - struct mm_struct *mm; - - mm = get_mem_context(ctx); - if (mm == NULL) { - pr_devel("cxl_prefault_vm unable to get mm %i\n", - pid_nr(ctx->pid)); - return; - } mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { for (ea = vma->vm_start; ea < vma->vm_end; ea = next_segment(ea, slb.vsid)) { rc = copro_calculate_slb(mm, ea, &slb); @@ -337,20 +314,28 @@ static void cxl_prefault_vma(struct cxl_context *ctx) } } mmap_read_unlock(mm); - - mmput(mm); } void cxl_prefault(struct cxl_context *ctx, u64 wed) { + struct mm_struct *mm = get_mem_context(ctx); + + if (mm == NULL) { + pr_devel("cxl_prefault unable to get mm %i\n", + pid_nr(ctx->pid)); + return; + } + switch (ctx->afu->prefault_mode) { case CXL_PREFAULT_WED: - cxl_prefault_one(ctx, wed); + cxl_fault_segment(ctx, mm, wed); break; case CXL_PREFAULT_ALL: - cxl_prefault_vma(ctx); + cxl_prefault_vma(ctx, mm); break; default: break; } + + mmput(mm); } From df724cedcfd7ce6638f40903144902a3e29fcec7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:55 +0000 Subject: [PATCH 022/350] optee: remove vma linked list walk Use the VMA iterator instead. Change the calling convention of __check_mem_type() to pass in the mm instead of the first vma in the range. Link: https://lkml.kernel.org/r/20220906194824.2110408-39-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/tee/optee/call.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/drivers/tee/optee/call.c b/drivers/tee/optee/call.c index 28f87cd8b3ed..290b1bb0e9cd 100644 --- a/drivers/tee/optee/call.c +++ b/drivers/tee/optee/call.c @@ -492,15 +492,18 @@ static bool is_normal_memory(pgprot_t p) #endif } -static int __check_mem_type(struct vm_area_struct *vma, unsigned long end) +static int __check_mem_type(struct mm_struct *mm, unsigned long start, + unsigned long end) { - while (vma && is_normal_memory(vma->vm_page_prot)) { - if (vma->vm_end >= end) - return 0; - vma = vma->vm_next; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, start); + + for_each_vma_range(vmi, vma, end) { + if (!is_normal_memory(vma->vm_page_prot)) + return -EINVAL; } - return -EINVAL; + return 0; } int optee_check_mem_type(unsigned long start, size_t num_pages) @@ -516,8 +519,7 @@ int optee_check_mem_type(unsigned long start, size_t num_pages) return 0; mmap_read_lock(mm); - rc = __check_mem_type(find_vma(mm, start), - start + num_pages * PAGE_SIZE); + rc = __check_mem_type(mm, start, start + num_pages * PAGE_SIZE); mmap_read_unlock(mm); return rc; From cbd43755ad15687cf8c925793a0b6c60c6181615 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:56 +0000 Subject: [PATCH 023/350] um: remove vma linked list walk Use the VMA iterator instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-40-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/um/kernel/tlb.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/um/kernel/tlb.c b/arch/um/kernel/tlb.c index bc38f79ca3a3..ad449173a1a1 100644 --- a/arch/um/kernel/tlb.c +++ b/arch/um/kernel/tlb.c @@ -584,21 +584,19 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, void flush_tlb_mm(struct mm_struct *mm) { - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - while (vma != NULL) { + for_each_vma(vmi, vma) fix_range(mm, vma->vm_start, vma->vm_end, 0); - vma = vma->vm_next; - } } void force_flush_all(void) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - while (vma != NULL) { + for_each_vma(vmi, vma) fix_range(mm, vma->vm_start, vma->vm_end, 1); - vma = vma->vm_next; - } } From 182ea1d71750ff9a41e7f8225c842246a4375983 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:56 +0000 Subject: [PATCH 024/350] coredump: remove vma linked list walk Use the Maple Tree iterator instead. This is too complicated for the VMA iterator to handle, so let's open-code it for now. If this turns out to be a common pattern, we can migrate it to common code. Link: https://lkml.kernel.org/r/20220906194824.2110408-41-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/coredump.c | 34 ++++++++++++---------------------- 1 file changed, 12 insertions(+), 22 deletions(-) diff --git a/fs/coredump.c b/fs/coredump.c index 9f4aae202109..35f2af85b9bc 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1072,30 +1072,20 @@ whole: return vma->vm_end - vma->vm_start; } -static struct vm_area_struct *first_vma(struct task_struct *tsk, - struct vm_area_struct *gate_vma) -{ - struct vm_area_struct *ret = tsk->mm->mmap; - - if (ret) - return ret; - return gate_vma; -} - /* * Helper function for iterating across a vma list. It ensures that the caller * will visit `gate_vma' prior to terminating the search. */ -static struct vm_area_struct *next_vma(struct vm_area_struct *this_vma, +static struct vm_area_struct *coredump_next_vma(struct ma_state *mas, + struct vm_area_struct *vma, struct vm_area_struct *gate_vma) { - struct vm_area_struct *ret; - - ret = this_vma->vm_next; - if (ret) - return ret; - if (this_vma == gate_vma) + if (gate_vma && (vma == gate_vma)) return NULL; + + vma = mas_next(mas, ULONG_MAX); + if (vma) + return vma; return gate_vma; } @@ -1119,9 +1109,10 @@ static void free_vma_snapshot(struct coredump_params *cprm) */ static bool dump_vma_snapshot(struct coredump_params *cprm) { - struct vm_area_struct *vma, *gate_vma; + struct vm_area_struct *gate_vma, *vma = NULL; struct mm_struct *mm = current->mm; - int i; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int i = 0; /* * Once the stack expansion code is fixed to not change VMA bounds @@ -1141,8 +1132,7 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) return false; } - for (i = 0, vma = first_vma(current, gate_vma); vma != NULL; - vma = next_vma(vma, gate_vma), i++) { + while ((vma = coredump_next_vma(&mas, vma, gate_vma)) != NULL) { struct core_vma_metadata *m = cprm->vma_meta + i; m->start = vma->vm_start; @@ -1150,10 +1140,10 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) m->flags = vma->vm_flags; m->dump_size = vma_dump_size(vma, cprm->mm_flags); m->pgoff = vma->vm_pgoff; - m->file = vma->vm_file; if (m->file) get_file(m->file); + i++; } mmap_write_unlock(mm); From 19066e58682ec156aac8d6cf94b79ab2f122a556 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:56 +0000 Subject: [PATCH 025/350] exec: use VMA iterator instead of linked list Remove a use of the vm_next list by doing the initial lookup with the VMA iterator and then using it to find the next entry. Link: https://lkml.kernel.org/r/20220906194824.2110408-42-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/exec.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 2b919b30dc97..afe55d0c3bcf 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -683,6 +683,8 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) unsigned long length = old_end - old_start; unsigned long new_start = old_start - shift; unsigned long new_end = old_end - shift; + VMA_ITERATOR(vmi, mm, new_start); + struct vm_area_struct *next; struct mmu_gather tlb; BUG_ON(new_start > new_end); @@ -691,7 +693,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * ensure there are no vmas between where we want to go * and where we are */ - if (vma != find_vma(mm, new_start)) + if (vma != vma_next(&vmi)) return -EFAULT; /* @@ -710,12 +712,13 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) lru_add_drain(); tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); if (new_end > old_start) { /* * when the old and new regions overlap clear from new_end. */ free_pgd_range(&tlb, new_end, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } else { /* * otherwise, clean from old_start; this is done to not touch @@ -724,7 +727,7 @@ static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift) * for the others its just a little faster. */ free_pgd_range(&tlb, old_start, old_end, new_end, - vma->vm_next ? vma->vm_next->vm_start : USER_PGTABLES_CEILING); + next ? next->vm_start : USER_PGTABLES_CEILING); } tlb_finish_mmu(&tlb); From 5f14b9246e8944243c70253b28830de619800d31 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:56 +0000 Subject: [PATCH 026/350] fs/proc/base: use the vma iterators in place of linked list Use the vma iterator instead of a for loop across the linked list. The link list of vmas will be removed in this patch set. Link: https://lkml.kernel.org/r/20220906194824.2110408-43-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/proc/base.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 93f7e3d971e4..12885a75913f 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2350,6 +2350,7 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) GENRADIX(struct map_files_info) fa; struct map_files_info *p; int ret; + struct vma_iterator vmi; genradix_init(&fa); @@ -2388,7 +2389,9 @@ proc_map_files_readdir(struct file *file, struct dir_context *ctx) * routine might require mmap_lock taken in might_fault(). */ - for (vma = mm->mmap, pos = 2; vma; vma = vma->vm_next) { + pos = 2; + vma_iter_init(&vmi, mm, 0); + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (++pos <= ctx->pos) From c4c84f06285e48f80e9843d0775ad92714ffc35a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:57 +0000 Subject: [PATCH 027/350] fs/proc/task_mmu: stop using linked list and highest_vm_end Remove references to mm_struct linked list and highest_vm_end for when they are removed Link: https://lkml.kernel.org/r/20220906194824.2110408-44-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/proc/internal.h | 2 +- fs/proc/task_mmu.c | 73 ++++++++++++++++++++++++++-------------------- 2 files changed, 42 insertions(+), 33 deletions(-) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 06a80f78433d..f03000764ce5 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -285,7 +285,7 @@ struct proc_maps_private { struct task_struct *task; struct mm_struct *mm; #ifdef CONFIG_MMU - struct vm_area_struct *tail_vma; + struct vma_iterator iter; #endif #ifdef CONFIG_NUMA struct mempolicy *task_mempolicy; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 9f70bc1c2766..8b4f3073f8f5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -123,12 +123,26 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif +static struct vm_area_struct *proc_get_vma(struct proc_maps_private *priv, + loff_t *ppos) +{ + struct vm_area_struct *vma = vma_next(&priv->iter); + + if (vma) { + *ppos = vma->vm_start; + } else { + *ppos = -2UL; + vma = get_gate_vma(priv->mm); + } + + return vma; +} + static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = *ppos; struct mm_struct *mm; - struct vm_area_struct *vma; /* See m_next(). Zero at the start or after lseek. */ if (last_addr == -1UL) @@ -152,31 +166,21 @@ static void *m_start(struct seq_file *m, loff_t *ppos) return ERR_PTR(-EINTR); } + vma_iter_init(&priv->iter, mm, last_addr); hold_task_mempolicy(priv); - priv->tail_vma = get_gate_vma(mm); + if (last_addr == -2UL) + return get_gate_vma(mm); - vma = find_vma(mm, last_addr); - if (vma) - return vma; - - return priv->tail_vma; + return proc_get_vma(priv, ppos); } static void *m_next(struct seq_file *m, void *v, loff_t *ppos) { - struct proc_maps_private *priv = m->private; - struct vm_area_struct *next, *vma = v; - - if (vma == priv->tail_vma) - next = NULL; - else if (vma->vm_next) - next = vma->vm_next; - else - next = priv->tail_vma; - - *ppos = next ? next->vm_start : -1UL; - - return next; + if (*ppos == -2UL) { + *ppos = -1UL; + return NULL; + } + return proc_get_vma(m->private, ppos); } static void m_stop(struct seq_file *m, void *v) @@ -876,16 +880,16 @@ static int show_smaps_rollup(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; struct mem_size_stats mss; - struct mm_struct *mm; + struct mm_struct *mm = priv->mm; struct vm_area_struct *vma; - unsigned long last_vma_end = 0; + unsigned long vma_start = 0, last_vma_end = 0; int ret = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); priv->task = get_proc_task(priv->inode); if (!priv->task) return -ESRCH; - mm = priv->mm; if (!mm || !mmget_not_zero(mm)) { ret = -ESRCH; goto out_put_task; @@ -898,8 +902,13 @@ static int show_smaps_rollup(struct seq_file *m, void *v) goto out_put_mm; hold_task_mempolicy(priv); + vma = mas_find(&mas, 0); - for (vma = priv->mm->mmap; vma;) { + if (unlikely(!vma)) + goto empty_set; + + vma_start = vma->vm_start; + do { smap_gather_stats(vma, &mss, 0); last_vma_end = vma->vm_end; @@ -908,6 +917,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * access it for write request. */ if (mmap_lock_is_contended(mm)) { + mas_pause(&mas); mmap_read_unlock(mm); ret = mmap_read_lock_killable(mm); if (ret) { @@ -951,7 +961,7 @@ static int show_smaps_rollup(struct seq_file *m, void *v) * contains last_vma_end. * Iterate VMA' from last_vma_end. */ - vma = find_vma(mm, last_vma_end - 1); + vma = mas_find(&mas, ULONG_MAX); /* Case 3 above */ if (!vma) break; @@ -965,11 +975,10 @@ static int show_smaps_rollup(struct seq_file *m, void *v) smap_gather_stats(vma, &mss, last_vma_end); } /* Case 2 above */ - vma = vma->vm_next; - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); - show_vma_header_prefix(m, priv->mm->mmap->vm_start, - last_vma_end, 0, 0, 0, 0); +empty_set: + show_vma_header_prefix(m, vma_start, last_vma_end, 0, 0, 0, 0); seq_pad(m, ' '); seq_puts(m, "[rollup]\n"); @@ -1262,6 +1271,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, return -ESRCH; mm = get_task_mm(task); if (mm) { + MA_STATE(mas, &mm->mm_mt, 0, 0); struct mmu_notifier_range range; struct clear_refs_private cp = { .type = type, @@ -1281,7 +1291,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } if (type == CLEAR_REFS_SOFT_DIRTY) { - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (!(vma->vm_flags & VM_SOFTDIRTY)) continue; vma->vm_flags &= ~VM_SOFTDIRTY; @@ -1293,8 +1303,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, 0, NULL, mm, 0, -1UL); mmu_notifier_invalidate_range_start(&range); } - walk_page_range(mm, 0, mm->highest_vm_end, &clear_refs_walk_ops, - &cp); + walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp); if (type == CLEAR_REFS_SOFT_DIRTY) { mmu_notifier_invalidate_range_end(&range); flush_tlb_mm(mm); From 69dbe6daf1041e32e003f966d71f70f20c63af53 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:57 +0000 Subject: [PATCH 028/350] userfaultfd: use maple tree iterator to iterate VMAs Don't use the mm_struct linked list or the vma->vm_next in prep for removal. Link: https://lkml.kernel.org/r/20220906194824.2110408-45-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 62 ++++++++++++++++++++++++----------- include/linux/userfaultfd_k.h | 7 ++-- mm/mmap.c | 2 +- 3 files changed, 46 insertions(+), 25 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 4de91ba9e85e..091d95ddf9a0 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -611,14 +611,16 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, if (release_new_ctx) { struct vm_area_struct *vma; struct mm_struct *mm = release_new_ctx->mm; + VMA_ITERATOR(vmi, mm, 0); /* the various vma->vm_userfaultfd_ctx still points to it */ mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) { if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; vma->vm_flags &= ~__VM_UFFD_FLAGS; } + } mmap_write_unlock(mm); userfaultfd_ctx_put(release_new_ctx); @@ -799,11 +801,13 @@ static bool has_unmap_ctx(struct userfaultfd_ctx *ctx, struct list_head *unmaps, return false; } -int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *unmaps) +int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *unmaps) { - for ( ; vma && vma->vm_start < end; vma = vma->vm_next) { + VMA_ITERATOR(vmi, mm, start); + struct vm_area_struct *vma; + + for_each_vma_range(vmi, vma, end) { struct userfaultfd_unmap_ctx *unmap_ctx; struct userfaultfd_ctx *ctx = vma->vm_userfaultfd_ctx.ctx; @@ -853,6 +857,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) /* len == 0 means wake all */ struct userfaultfd_wake_range range = { .len = 0, }; unsigned long new_flags; + MA_STATE(mas, &mm->mm_mt, 0, 0); WRITE_ONCE(ctx->released, true); @@ -869,7 +874,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) */ mmap_write_lock(mm); prev = NULL; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { cond_resched(); BUG_ON(!!vma->vm_userfaultfd_ctx.ctx ^ !!(vma->vm_flags & __VM_UFFD_FLAGS)); @@ -883,10 +888,13 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_file, vma->vm_pgoff, vma_policy(vma), NULL_VM_UFFD_CTX, anon_vma_name(vma)); - if (prev) + if (prev) { + mas_pause(&mas); vma = prev; - else + } else { prev = vma; + } + vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } @@ -1268,6 +1276,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, bool found; bool basic_ioctls; unsigned long start, end, vma_end; + MA_STATE(mas, &mm->mm_mt, 0, 0); user_uffdio_register = (struct uffdio_register __user *) arg; @@ -1310,7 +1319,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1335,7 +1345,7 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, */ found = false; basic_ioctls = false; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1395,8 +1405,10 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1426,6 +1438,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ((struct vm_userfaultfd_ctx){ ctx }), anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto next; } @@ -1433,11 +1447,15 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, ret = split_vma(mm, vma, start, 1); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end > end) { ret = split_vma(mm, vma, end, 0); if (ret) break; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } next: /* @@ -1454,8 +1472,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); @@ -1499,6 +1517,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, bool found; unsigned long start, end, vma_end; const void __user *buf = (void __user *)arg; + MA_STATE(mas, &mm->mm_mt, 0, 0); ret = -EFAULT; if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) @@ -1517,7 +1536,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; mmap_write_lock(mm); - vma = find_vma_prev(mm, start, &prev); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) goto out_unlock; @@ -1542,7 +1562,7 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, */ found = false; ret = -EINVAL; - for (cur = vma; cur && cur->vm_start < end; cur = cur->vm_next) { + for (cur = vma; cur; cur = mas_next(&mas, end - 1)) { cond_resched(); BUG_ON(!!cur->vm_userfaultfd_ctx.ctx ^ @@ -1562,8 +1582,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, } BUG_ON(!found); - if (vma->vm_start < start) - prev = vma; + mas_set(&mas, start); + prev = mas_prev(&mas, 0); + if (prev != vma) + mas_next(&mas, ULONG_MAX); ret = 0; do { @@ -1632,8 +1654,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, skip: prev = vma; start = vma->vm_end; - vma = vma->vm_next; - } while (vma && vma->vm_start < end); + vma = mas_next(&mas, end - 1); + } while (vma); out_unlock: mmap_write_unlock(mm); mmput(mm); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index e1b8a915e9e9..f07e6998bb68 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -175,9 +175,8 @@ extern bool userfaultfd_remove(struct vm_area_struct *vma, unsigned long start, unsigned long end); -extern int userfaultfd_unmap_prep(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - struct list_head *uf); +extern int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, + unsigned long end, struct list_head *uf); extern void userfaultfd_unmap_complete(struct mm_struct *mm, struct list_head *uf); @@ -258,7 +257,7 @@ static inline bool userfaultfd_remove(struct vm_area_struct *vma, return true; } -static inline int userfaultfd_unmap_prep(struct vm_area_struct *vma, +static inline int userfaultfd_unmap_prep(struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf) { diff --git a/mm/mmap.c b/mm/mmap.c index 8b7e9d5afd38..aabd4f986ccf 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -2545,7 +2545,7 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, * split, despite we could. This is unlikely enough * failure that it's not worth optimizing it for. */ - error = userfaultfd_unmap_prep(vma, start, end, uf); + error = userfaultfd_unmap_prep(mm, start, end, uf); if (error) goto userfaultfd_error; From 01293a62bae2fa55c09cebf5a771eab7219171c3 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:58 +0000 Subject: [PATCH 029/350] ipc/shm: use VMA iterator instead of linked list The VMA iterator is faster than the linked llist, and it can be walked even when VMAs are being removed from the address space, so there's no need to keep track of 'next'. Link: https://lkml.kernel.org/r/20220906194824.2110408-46-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- ipc/shm.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/ipc/shm.c b/ipc/shm.c index b3048ebd5c31..7d86f058fb86 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -1721,7 +1721,7 @@ long ksys_shmdt(char __user *shmaddr) #ifdef CONFIG_MMU loff_t size = 0; struct file *file; - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, addr); #endif if (addr & ~PAGE_MASK) @@ -1751,12 +1751,9 @@ long ksys_shmdt(char __user *shmaddr) * match the usual checks anyway. So assume all vma's are * above the starting address given. */ - vma = find_vma(mm, addr); #ifdef CONFIG_MMU - while (vma) { - next = vma->vm_next; - + for_each_vma(vmi, vma) { /* * Check if the starting address would match, i.e. it's * a fragment created by mprotect() and/or munmap(), or it @@ -1774,6 +1771,7 @@ long ksys_shmdt(char __user *shmaddr) file = vma->vm_file; size = i_size_read(file_inode(vma->vm_file)); do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); + mas_pause(&vmi.mas); /* * We discovered the size of the shm segment, so * break out of here and fall through to the next @@ -1781,10 +1779,9 @@ long ksys_shmdt(char __user *shmaddr) * searching for matching vma's. */ retval = 0; - vma = next; + vma = vma_next(&vmi); break; } - vma = next; } /* @@ -1794,17 +1791,19 @@ long ksys_shmdt(char __user *shmaddr) */ size = PAGE_ALIGN(size); while (vma && (loff_t)(vma->vm_end - addr) <= size) { - next = vma->vm_next; - /* finding a matching vma now does not alter retval */ if ((vma->vm_ops == &shm_vm_ops) && ((vma->vm_start - addr)/PAGE_SIZE == vma->vm_pgoff) && - (vma->vm_file == file)) + (vma->vm_file == file)) { do_munmap(mm, vma->vm_start, vma->vm_end - vma->vm_start, NULL); - vma = next; + mas_pause(&vmi.mas); + } + + vma = vma_next(&vmi); } #else /* CONFIG_MMU */ + vma = vma_lookup(mm, addr); /* under NOMMU conditions, the exact address to be destroyed must be * given */ From 160c820023bbfe7c478ed3041cc50604d664f047 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:58 +0000 Subject: [PATCH 030/350] acct: use VMA iterator instead of linked list The VMA iterator is faster than the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-47-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/acct.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/kernel/acct.c b/kernel/acct.c index 13706356ec54..62200d799b9b 100644 --- a/kernel/acct.c +++ b/kernel/acct.c @@ -555,15 +555,14 @@ void acct_collect(long exitcode, int group_dead) unsigned long vsize = 0; if (group_dead && current->mm) { + struct mm_struct *mm = current->mm; + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - mmap_read_lock(current->mm); - vma = current->mm->mmap; - while (vma) { + mmap_read_lock(mm); + for_each_vma(vmi, vma) vsize += vma->vm_end - vma->vm_start; - vma = vma->vm_next; - } - mmap_read_unlock(current->mm); + mmap_read_unlock(mm); } spin_lock_irq(¤t->sighand->siglock); From fcb72a585aaa4caced555e98f8444e6162912cb7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:58 +0000 Subject: [PATCH 031/350] perf: use VMA iterator The VMA iterator is faster than the linked list and removing the linked list will shrink the vm_area_struct. Link: https://lkml.kernel.org/r/20220906194824.2110408-48-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/events/core.c | 3 ++- kernel/events/uprobes.c | 9 ++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 2621fd24ad26..101c5912c3fc 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -10229,8 +10229,9 @@ static void perf_addr_filter_apply(struct perf_addr_filter *filter, struct perf_addr_filter_range *fr) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 2eaa327f8158..401bc2d24ce0 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -349,9 +349,10 @@ static bool valid_ref_ctr_vma(struct uprobe *uprobe, static struct vm_area_struct * find_ref_ctr_vma(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *tmp; - for (tmp = mm->mmap; tmp; tmp = tmp->vm_next) + for_each_vma(vmi, tmp) if (valid_ref_ctr_vma(uprobe, tmp)) return tmp; @@ -1231,11 +1232,12 @@ int uprobe_apply(struct inode *inode, loff_t offset, static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; int err = 0; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long vaddr; loff_t offset; @@ -1983,9 +1985,10 @@ bool uprobe_deny_signal(void) static void mmf_recalc_uprobes(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!valid_vma(vma, false)) continue; /* From 0cd4d02c32123afc25647f1d7123bc13b51ac56b Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:59 +0000 Subject: [PATCH 032/350] sched: use maple tree iterator to walk VMAs The linked list is slower than walking the VMAs using the maple tree. We can't use the VMA iterator here because it doesn't support moving to an earlier position. Link: https://lkml.kernel.org/r/20220906194824.2110408-49-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/sched/fair.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 0e3e08a093d4..ff49f28391ea 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -2930,6 +2930,7 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; + MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2986,13 +2987,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = mas_find(&mas, ULONG_MAX)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; From fa5e587679f034530e8c14bc1c466490053b2ff2 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:48:59 +0000 Subject: [PATCH 033/350] fork: use VMA iterator The VMA iterator is faster than the linked list and removing the linked list will shrink the vm_area_struct. Link: https://lkml.kernel.org/r/20220906194824.2110408-50-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/fork.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 430f63cd7a37..49e4ab6f5208 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1301,13 +1301,16 @@ int replace_mm_exe_file(struct mm_struct *mm, struct file *new_exe_file) /* Forbid mm->exe_file change if old file still mapped. */ old_exe_file = get_mm_exe_file(mm); if (old_exe_file) { + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma && !ret; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!vma->vm_file) continue; if (path_equal(&vma->vm_file->f_path, - &old_exe_file->f_path)) + &old_exe_file->f_path)) { ret = -EBUSY; + break; + } } mmap_read_unlock(mm); fput(old_exe_file); From becc8cdb6cb28d9fd3ecf890d1d6e59118a6a53d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:48:59 +0000 Subject: [PATCH 034/350] bpf: remove VMA linked list Use vma_next() and remove reference to the start of the linked list Link: https://lkml.kernel.org/r/20220906194824.2110408-51-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- kernel/bpf/task_iter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/bpf/task_iter.c b/kernel/bpf/task_iter.c index 8c921799def4..1c8debd42dc9 100644 --- a/kernel/bpf/task_iter.c +++ b/kernel/bpf/task_iter.c @@ -299,8 +299,8 @@ struct bpf_iter_seq_task_vma_info { }; enum bpf_task_vma_iter_find_op { - task_vma_iter_first_vma, /* use mm->mmap */ - task_vma_iter_next_vma, /* use curr_vma->vm_next */ + task_vma_iter_first_vma, /* use find_vma() with addr 0 */ + task_vma_iter_next_vma, /* use vma_next() with curr_vma */ task_vma_iter_find_vma, /* use find_vma() to find next vma */ }; @@ -400,10 +400,10 @@ again: switch (op) { case task_vma_iter_first_vma: - curr_vma = curr_task->mm->mmap; + curr_vma = find_vma(curr_task->mm, 0); break; case task_vma_iter_next_vma: - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; case task_vma_iter_find_vma: /* We dropped mmap_lock so it is necessary to use find_vma @@ -417,7 +417,7 @@ again: if (curr_vma && curr_vma->vm_start == info->prev_vm_start && curr_vma->vm_end == info->prev_vm_end) - curr_vma = curr_vma->vm_next; + curr_vma = find_vma(curr_task->mm, curr_vma->vm_end); break; } if (!curr_vma) { From c4d1a92d0d3ada8a4073b8af8eff462d689d64c5 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:00 +0000 Subject: [PATCH 035/350] mm/gup: use maple tree navigation instead of linked list Use find_vma_intersection() to locate the VMAs in __mm_populate() instead of using find_vma() and the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-52-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/gup.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index d4f706dc245f..6e49fe5da513 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1674,10 +1674,11 @@ int __mm_populate(unsigned long start, unsigned long len, int ignore_errors) if (!locked) { locked = 1; mmap_read_lock(mm); - vma = find_vma(mm, nstart); + vma = find_vma_intersection(mm, nstart, end); } else if (nstart >= vma->vm_end) - vma = vma->vm_next; - if (!vma || vma->vm_start >= end) + vma = find_vma_intersection(mm, vma->vm_end, end); + + if (!vma) break; /* * Set [nstart; nend) to intersection of desired address From 685405020b9f24ec979d41e6c27207be97c000cf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:00 +0000 Subject: [PATCH 036/350] mm/khugepaged: stop using vma linked list Use vma iterator & find_vma() instead of vma linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-53-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/huge_memory.c | 4 ++-- mm/khugepaged.c | 9 ++++++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 534d30cff9d7..63b4d8ff4b55 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2341,11 +2341,11 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, split_huge_pmd_if_needed(vma, end); /* - * If we're also updating the vma->vm_next->vm_start, + * If we're also updating the next vma vm_start, * check if we need to split it. */ if (adjust_next > 0) { - struct vm_area_struct *next = vma->vm_next; + struct vm_area_struct *next = find_vma(vma->vm_mm, vma->vm_end); unsigned long nstart = next->vm_start; nstart += adjust_next; split_huge_pmd_if_needed(next, nstart); diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 9ff3d39b286f..7c13d65aeb14 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2050,6 +2050,7 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, __releases(&khugepaged_mm_lock) __acquires(&khugepaged_mm_lock) { + struct vma_iterator vmi; struct mm_slot *mm_slot; struct mm_struct *mm; struct vm_area_struct *vma; @@ -2078,11 +2079,13 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, vma = NULL; if (unlikely(!mmap_read_trylock(mm))) goto breakouterloop_mmap_lock; - if (likely(!hpage_collapse_test_exit(mm))) - vma = find_vma(mm, khugepaged_scan.address); progress++; - for (; vma; vma = vma->vm_next) { + if (unlikely(hpage_collapse_test_exit(mm))) + goto breakouterloop; + + vma_iter_init(&vmi, mm, khugepaged_scan.address); + for_each_vma(vmi, vma) { unsigned long hstart, hend; cond_resched(); From a5f18ba0727656bd1fe3bcdb0d563f81790f9a04 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:01 +0000 Subject: [PATCH 037/350] mm/ksm: use vma iterators instead of vma linked list Remove the use of the linked list for eventual removal. Link: https://lkml.kernel.org/r/20220906194824.2110408-54-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/ksm.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index fd6d03cb0463..533ede86b4b9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -981,11 +981,13 @@ static int unmerge_and_remove_all_rmap_items(void) struct mm_slot, mm_list); spin_unlock(&ksm_mmlist_lock); - for (mm_slot = ksm_scan.mm_slot; - mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { + for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; + mm_slot = ksm_scan.mm_slot) { + VMA_ITERATOR(vmi, mm_slot->mm, 0); + mm = mm_slot->mm; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (ksm_test_exit(mm)) break; if (!(vma->vm_flags & VM_MERGEABLE) || !vma->anon_vma) @@ -2243,6 +2245,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) struct mm_slot *slot; struct vm_area_struct *vma; struct rmap_item *rmap_item; + struct vma_iterator vmi; int nid; if (list_empty(&ksm_mm_head.mm_list)) @@ -2301,13 +2304,13 @@ next_mm: } mm = slot->mm; + vma_iter_init(&vmi, mm, ksm_scan.address); + mmap_read_lock(mm); if (ksm_test_exit(mm)) - vma = NULL; - else - vma = find_vma(mm, ksm_scan.address); + goto no_vmas; - for (; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (!(vma->vm_flags & VM_MERGEABLE)) continue; if (ksm_scan.address < vma->vm_start) @@ -2345,6 +2348,7 @@ next_mm: } if (ksm_test_exit(mm)) { +no_vmas: ksm_scan.address = 0; ksm_scan.rmap_list = &slot->rmap_list; } From 3547481831acd99d6f9c3b2cef16f269e6eaad9c Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:01 +0000 Subject: [PATCH 038/350] mm/madvise: use vma_find() instead of vma linked list madvise_walk_vmas() no longer uses linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-55-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/madvise.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/madvise.c b/mm/madvise.c index 4f86eb7f554d..a3fc4cd32ed3 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -1245,7 +1245,7 @@ int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, if (start >= end) break; if (prev) - vma = prev->vm_next; + vma = find_vma(mm, prev->vm_end); else /* madvise_remove dropped mmap_lock */ vma = find_vma(mm, start); } From ba0aff8ea6ff0ba4dacfc896facadf3d91c8cd8a Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:01 +0000 Subject: [PATCH 039/350] mm/memcontrol: stop using mm->highest_vm_end Pass through ULONG_MAX instead. Link: https://lkml.kernel.org/r/20220906194824.2110408-56-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/memcontrol.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 392b1fd1e8c4..e804056422db 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5879,7 +5879,7 @@ static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm) unsigned long precharge; mmap_read_lock(mm); - walk_page_range(mm, 0, mm->highest_vm_end, &precharge_walk_ops, NULL); + walk_page_range(mm, 0, ULONG_MAX, &precharge_walk_ops, NULL); mmap_read_unlock(mm); precharge = mc.precharge; @@ -6177,9 +6177,7 @@ retry: * When we have consumed all precharges and failed in doing * additional charge, the page walk just aborts. */ - walk_page_range(mc.mm, 0, mc.mm->highest_vm_end, &charge_walk_ops, - NULL); - + walk_page_range(mc.mm, 0, ULONG_MAX, &charge_walk_ops, NULL); mmap_read_unlock(mc.mm); atomic_dec(&mc.from->moving_account); } From 66850be55e8e5f371db2c091751a932a656c5f4d Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:02 +0000 Subject: [PATCH 040/350] mm/mempolicy: use vma iterator & maple state instead of vma linked list Reworked the way mbind_range() finds the first VMA to reuse the maple state and limit the number of tree walks needed. Note, this drops the VM_BUG_ON(!vma) call, which would catch a start address higher than the last VMA. The code was written in a way that allowed no VMA updates to occur and still return success. There should be no functional change to this scenario with the new code. Link: https://lkml.kernel.org/r/20220906194824.2110408-57-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Matthew Wilcox (Oracle) Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mempolicy.c | 56 ++++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 24 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index a88fd94e18d6..143e2eaaa6ec 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -381,9 +381,10 @@ void mpol_rebind_task(struct task_struct *tsk, const nodemask_t *new) void mpol_rebind_mm(struct mm_struct *mm, nodemask_t *new) { struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); mmap_write_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) + for_each_vma(vmi, vma) mpol_rebind_policy(vma->vm_policy, new); mmap_write_unlock(mm); } @@ -654,7 +655,7 @@ static unsigned long change_prot_numa(struct vm_area_struct *vma, static int queue_pages_test_walk(unsigned long start, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma = walk->vma; + struct vm_area_struct *next, *vma = walk->vma; struct queue_pages *qp = walk->private; unsigned long endvma = vma->vm_end; unsigned long flags = qp->flags; @@ -669,9 +670,10 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end, /* hole at head side of range */ return -EFAULT; } + next = find_vma(vma->vm_mm, vma->vm_end); if (!(flags & MPOL_MF_DISCONTIG_OK) && ((vma->vm_end < qp->end) && - (!vma->vm_next || vma->vm_end < vma->vm_next->vm_start))) + (!next || vma->vm_end < next->vm_start))) /* hole at middle or tail of range */ return -EFAULT; @@ -785,26 +787,24 @@ static int vma_replace_policy(struct vm_area_struct *vma, static int mbind_range(struct mm_struct *mm, unsigned long start, unsigned long end, struct mempolicy *new_pol) { + MA_STATE(mas, &mm->mm_mt, start - 1, start - 1); struct vm_area_struct *prev; struct vm_area_struct *vma; int err = 0; pgoff_t pgoff; - unsigned long vmstart; - unsigned long vmend; - vma = find_vma(mm, start); - VM_BUG_ON(!vma); + prev = mas_find_rev(&mas, 0); + if (prev && (start < prev->vm_end)) + vma = prev; + else + vma = mas_next(&mas, end - 1); - prev = vma->vm_prev; - if (start > vma->vm_start) - prev = vma; - - for (; vma && vma->vm_start < end; prev = vma, vma = vma->vm_next) { - vmstart = max(start, vma->vm_start); - vmend = min(end, vma->vm_end); + for (; vma; vma = mas_next(&mas, end - 1)) { + unsigned long vmstart = max(start, vma->vm_start); + unsigned long vmend = min(end, vma->vm_end); if (mpol_equal(vma_policy(vma), new_pol)) - continue; + goto next; pgoff = vma->vm_pgoff + ((vmstart - vma->vm_start) >> PAGE_SHIFT); @@ -813,6 +813,8 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, new_pol, vma->vm_userfaultfd_ctx, anon_vma_name(vma)); if (prev) { + /* vma_merge() invalidated the mas */ + mas_pause(&mas); vma = prev; goto replace; } @@ -820,19 +822,25 @@ static int mbind_range(struct mm_struct *mm, unsigned long start, err = split_vma(vma->vm_mm, vma, vmstart, 1); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } if (vma->vm_end != vmend) { err = split_vma(vma->vm_mm, vma, vmend, 0); if (err) goto out; + /* split_vma() invalidated the mas */ + mas_pause(&mas); } - replace: +replace: err = vma_replace_policy(vma, new_pol); if (err) goto out; +next: + prev = vma; } - out: +out: return err; } @@ -1049,6 +1057,7 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, int flags) { nodemask_t nmask; + struct vm_area_struct *vma; LIST_HEAD(pagelist); int err = 0; struct migration_target_control mtc = { @@ -1064,8 +1073,9 @@ static int migrate_to_node(struct mm_struct *mm, int source, int dest, * need migration. Between passing in the full user address * space range and MPOL_MF_DISCONTIG_OK, this call can not fail. */ + vma = find_vma(mm, 0); VM_BUG_ON(!(flags & (MPOL_MF_MOVE | MPOL_MF_MOVE_ALL))); - queue_pages_range(mm, mm->mmap->vm_start, mm->task_size, &nmask, + queue_pages_range(mm, vma->vm_start, mm->task_size, &nmask, flags | MPOL_MF_DISCONTIG_OK, &pagelist); if (!list_empty(&pagelist)) { @@ -1195,14 +1205,13 @@ static struct page *new_page(struct page *page, unsigned long start) struct folio *dst, *src = page_folio(page); struct vm_area_struct *vma; unsigned long address; + VMA_ITERATOR(vmi, current->mm, start); gfp_t gfp = GFP_HIGHUSER_MOVABLE | __GFP_RETRY_MAYFAIL; - vma = find_vma(current->mm, start); - while (vma) { + for_each_vma(vmi, vma) { address = page_address_in_vma(page, vma); if (address != -EFAULT) break; - vma = vma->vm_next; } if (folio_test_hugetlb(src)) @@ -1480,6 +1489,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le unsigned long vmend; unsigned long end; int err = -ENOENT; + VMA_ITERATOR(vmi, mm, start); start = untagged_addr(start); if (start & ~PAGE_MASK) @@ -1505,9 +1515,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (end == start) return 0; mmap_write_lock(mm); - vma = find_vma(mm, start); - for (; vma && vma->vm_start < end; vma = vma->vm_next) { - + for_each_vma_range(vmi, vma, end) { vmstart = max(start, vma->vm_start); vmend = min(end, vma->vm_end); new = mpol_dup(vma_policy(vma)); From 33108b05f39b78137c38c677b7a2d0fb7defed14 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:02 +0000 Subject: [PATCH 041/350] mm/mlock: use vma iterator and maple state instead of vma linked list Handle overflow checking in count_mm_mlocked_page_nr() differently. Link: https://lkml.kernel.org/r/20220906194824.2110408-58-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mlock.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index b14e929084cc..43d19a1f28eb 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -471,6 +471,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, unsigned long nstart, end, tmp; struct vm_area_struct *vma, *prev; int error; + MA_STATE(mas, ¤t->mm->mm_mt, start, start); VM_BUG_ON(offset_in_page(start)); VM_BUG_ON(len != PAGE_ALIGN(len)); @@ -479,13 +480,14 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, return -EINVAL; if (end == start) return 0; - vma = find_vma(current->mm, start); - if (!vma || vma->vm_start > start) + vma = mas_walk(&mas); + if (!vma) return -ENOMEM; - prev = vma->vm_prev; if (start > vma->vm_start) prev = vma; + else + prev = mas_prev(&mas, 0); for (nstart = start ; ; ) { vm_flags_t newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -505,7 +507,7 @@ static int apply_vma_lock_flags(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(prev->vm_mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; @@ -526,24 +528,23 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, { struct vm_area_struct *vma; unsigned long count = 0; + unsigned long end; + VMA_ITERATOR(vmi, mm, start); if (mm == NULL) mm = current->mm; - vma = find_vma(mm, start); - if (vma == NULL) - return 0; - - for (; vma ; vma = vma->vm_next) { - if (start >= vma->vm_end) - continue; - if (start + len <= vma->vm_start) - break; + /* Don't overflow past ULONG_MAX */ + if (unlikely(ULONG_MAX - len < start)) + end = ULONG_MAX; + else + end = start + len; + for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) count -= (start - vma->vm_start); - if (start + len < vma->vm_end) { - count += start + len - vma->vm_start; + if (end < vma->vm_end) { + count += end - vma->vm_start; break; } count += vma->vm_end - vma->vm_start; @@ -659,6 +660,7 @@ SYSCALL_DEFINE2(munlock, unsigned long, start, size_t, len) */ static int apply_mlockall_flags(int flags) { + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); struct vm_area_struct *vma, *prev = NULL; vm_flags_t to_add = 0; @@ -679,7 +681,7 @@ static int apply_mlockall_flags(int flags) to_add |= VM_LOCKONFAULT; } - for (vma = current->mm->mmap; vma ; vma = prev->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { vm_flags_t newflags; newflags = vma->vm_flags & VM_LOCKED_CLEAR_MASK; @@ -687,6 +689,7 @@ static int apply_mlockall_flags(int flags) /* Ignore errors */ mlock_fixup(vma, &prev, vma->vm_start, vma->vm_end, newflags); + mas_pause(&mas); cond_resched(); } out: From 70821e0b89dd477109c42a92d571f6dc6f6aa956 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:02 +0000 Subject: [PATCH 042/350] mm/mprotect: use maple tree navigation instead of VMA linked list Switch to navigating the VMA list with the maple tree operators in preparation for removing the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-59-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mprotect.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/mprotect.c b/mm/mprotect.c index 55ed4a889990..461dcbd4f21a 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -676,6 +676,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, const bool rier = (current->personality & READ_IMPLIES_EXEC) && (prot & PROT_READ); struct mmu_gather tlb; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); start = untagged_addr(start); @@ -707,7 +708,8 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if ((pkey != -1) && !mm_pkey_is_allocated(current->mm, pkey)) goto out; - vma = find_vma(current->mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); error = -ENOMEM; if (!vma) goto out; @@ -733,7 +735,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (start > vma->vm_start) prev = vma; else - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); tlb_gather_mmu(&tlb, current->mm); for (nstart = start ; ; ) { @@ -796,7 +798,7 @@ static int do_mprotect_pkey(unsigned long start, size_t len, if (nstart >= end) break; - vma = prev->vm_next; + vma = find_vma(current->mm, prev->vm_end); if (!vma || vma->vm_start != nstart) { error = -ENOMEM; break; From 396a44cc58910317b03dd73a93ab4fe6b76df658 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:03 +0000 Subject: [PATCH 043/350] mm/mremap: use vma_find_intersection() instead of vma linked list Using the vma_find_intersection() call allows for cleaner code and removes linked list users in preparation of the linked list removal. Also remove one user of the linked list at the same time in favour of find_vma(). Link: https://lkml.kernel.org/r/20220906194824.2110408-60-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mremap.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index e0fba9004246..8644ff278f02 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -716,7 +716,7 @@ static unsigned long move_vma(struct vm_area_struct *vma, if (excess) { vma->vm_flags |= VM_ACCOUNT; if (split) - vma->vm_next->vm_flags |= VM_ACCOUNT; + find_vma(mm, vma->vm_end)->vm_flags |= VM_ACCOUNT; } return new_addr; @@ -866,9 +866,10 @@ out: static int vma_expandable(struct vm_area_struct *vma, unsigned long delta) { unsigned long end = vma->vm_end + delta; + if (end < vma->vm_end) /* overflow */ return 0; - if (vma->vm_next && vma->vm_next->vm_start < end) /* intersection */ + if (find_vma_intersection(vma->vm_mm, vma->vm_end, end)) return 0; if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start, 0, MAP_FIXED) & ~PAGE_MASK) From 4267d1fd7825454ed41ebf53af62e7cedd779f83 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:03 +0000 Subject: [PATCH 044/350] mm/msync: use vma_find() instead of vma linked list Remove a single use of the vma linked list in preparation for the removal of the linked list. Uses find_vma() to get the next element. Link: https://lkml.kernel.org/r/20220906194824.2110408-61-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/msync.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/msync.c b/mm/msync.c index 137d1c104f3e..ac4c9bfea2e7 100644 --- a/mm/msync.c +++ b/mm/msync.c @@ -104,7 +104,7 @@ SYSCALL_DEFINE3(msync, unsigned long, start, size_t, len, int, flags) error = 0; goto out_unlock; } - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); } } out_unlock: From e1c2c775d448be0503a3ac90681d86980919bad0 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:03 +0000 Subject: [PATCH 045/350] mm/oom_kill: use vma iterators instead of vma linked list Use vma iterator in preparation of removing the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-62-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/oom_kill.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3c6cf9e3cd66..3996301450e8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -513,6 +513,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; + VMA_ITERATOR(vmi, mm, 0); /* * Tell all users of get_user/copy_from_user etc... that the content @@ -522,7 +523,7 @@ bool __oom_reap_task_mm(struct mm_struct *mm) */ set_bit(MMF_UNSTABLE, &mm->flags); - for (vma = mm->mmap ; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->vm_flags & (VM_HUGETLB|VM_PFNMAP)) continue; From 9ec08f30f86d70b8891c25642df7d1f16647fde4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:04 +0000 Subject: [PATCH 046/350] mm/pagewalk: use vma_find() instead of vma linked list walk_page_range() no longer uses the one vma linked list reference. Link: https://lkml.kernel.org/r/20220906194824.2110408-63-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/pagewalk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 908ec1577f40..131b2b335b2c 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -456,7 +456,7 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, } else { /* inside vma */ walk.vma = vma; next = min(end, vma->vm_end); - vma = vma->vm_next; + vma = find_vma(mm, vma->vm_end); err = walk_page_test(start, next, &walk); if (err > 0) { From 208c09db6d88f4442fb755d20cfb237a37a49f48 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:04 +0000 Subject: [PATCH 047/350] mm/swapfile: use vma iterator instead of vma linked list unuse_mm() no longer needs to reference the linked list. Link: https://lkml.kernel.org/r/20220906194824.2110408-64-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/swapfile.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 263b19e693cf..469d9af86be2 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1994,14 +1994,16 @@ static int unuse_mm(struct mm_struct *mm, unsigned int type) { struct vm_area_struct *vma; int ret = 0; + VMA_ITERATOR(vmi, mm, 0); mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { if (vma->anon_vma) { ret = unuse_vma(vma, type); if (ret) break; } + cond_resched(); } mmap_read_unlock(mm); From f683b9d613193362ceb954c216f663a43c027302 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:04 +0000 Subject: [PATCH 048/350] i915: use the VMA iterator Replace the linked list in probe_range() with the VMA iterator. Link: https://lkml.kernel.org/r/20220906194824.2110408-65-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- drivers/gpu/drm/i915/gem/i915_gem_userptr.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c index 8423df021b71..d4398948f016 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_userptr.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_userptr.c @@ -426,12 +426,11 @@ static const struct drm_i915_gem_object_ops i915_gem_userptr_ops = { static int probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) { - const unsigned long end = addr + len; + VMA_ITERATOR(vmi, mm, addr); struct vm_area_struct *vma; - int ret = -EFAULT; mmap_read_lock(mm); - for (vma = find_vma(mm, addr); vma; vma = vma->vm_next) { + for_each_vma_range(vmi, vma, addr + len) { /* Check for holes, note that we also update the addr below */ if (vma->vm_start > addr) break; @@ -439,16 +438,13 @@ probe_range(struct mm_struct *mm, unsigned long addr, unsigned long len) if (vma->vm_flags & (VM_PFNMAP | VM_MIXEDMAP)) break; - if (vma->vm_end >= end) { - ret = 0; - break; - } - addr = vma->vm_end; } mmap_read_unlock(mm); - return ret; + if (vma) + return -EFAULT; + return 0; } /* From 8220543df1489ef96c3d4e8b0b3b03c340e3943e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Tue, 6 Sep 2022 19:49:05 +0000 Subject: [PATCH 049/350] nommu: remove uses of VMA linked list Use the maple tree or VMA iterator instead. This is faster and will allow us to shrink the VMA. Link: https://lkml.kernel.org/r/20220906194824.2110408-66-Liam.Howlett@oracle.com Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/nommu.c | 146 +++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 109 insertions(+), 37 deletions(-) diff --git a/mm/nommu.c b/mm/nommu.c index 265a444a2cc2..269df51e9226 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -557,26 +557,14 @@ void vma_mas_remove(struct vm_area_struct *vma, struct ma_state *mas) mas_store_prealloc(mas, NULL); } -/* - * add a VMA into a process's mm_struct in the appropriate place in the list - * and tree and add to the address space's page tree also if not an anonymous - * page - * - should be called with mm->mmap_lock held writelocked - */ -static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) +static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) { - struct address_space *mapping; - struct vm_area_struct *prev; - MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); - - BUG_ON(!vma->vm_region); - mm->map_count++; vma->vm_mm = mm; /* add the VMA to the mapping */ if (vma->vm_file) { - mapping = vma->vm_file->f_mapping; + struct address_space *mapping = vma->vm_file->f_mapping; i_mmap_lock_write(mapping); flush_dcache_mmap_lock(mapping); @@ -584,21 +572,52 @@ static void add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} - prev = mas_prev(&mas, 0); - mas_reset(&mas); +/* + * mas_add_vma_to_mm() - Maple state variant of add_mas_to_mm(). + * @mas: The maple state with preallocations. + * @mm: The mm_struct + * @vma: The vma to add + * + */ +static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, + struct vm_area_struct *vma) +{ + struct vm_area_struct *prev; + + BUG_ON(!vma->vm_region); + + setup_vma_to_mm(vma, mm); + + prev = mas_prev(mas, 0); + mas_reset(mas); /* add the VMA to the tree */ - vma_mas_store(vma, &mas); + vma_mas_store(vma, mas); __vma_link_list(mm, vma, prev); } /* - * delete a VMA from its owning mm_struct and address space + * add a VMA into a process's mm_struct in the appropriate place in the list + * and tree and add to the address space's page tree also if not an anonymous + * page + * - should be called with mm->mmap_lock held writelocked */ -static void delete_vma_from_mm(struct vm_area_struct *vma) +static int add_vma_to_mm(struct mm_struct *mm, struct vm_area_struct *vma) { - MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + mas_add_vma_to_mm(&mas, mm, vma); + return 0; +} + +static void cleanup_vma_from_mm(struct vm_area_struct *vma) +{ vma->vm_mm->map_count--; /* remove the VMA from the mapping */ if (vma->vm_file) { @@ -611,10 +630,25 @@ static void delete_vma_from_mm(struct vm_area_struct *vma) flush_dcache_mmap_unlock(mapping); i_mmap_unlock_write(mapping); } +} +/* + * delete a VMA from its owning mm_struct and address space + */ +static int delete_vma_from_mm(struct vm_area_struct *vma) +{ + MA_STATE(mas, &vma->vm_mm->mm_mt, 0, 0); + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + return -ENOMEM; + } + cleanup_vma_from_mm(vma); /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); __vma_unlink_list(vma->vm_mm, vma); + return 0; } /* @@ -1024,6 +1058,7 @@ unsigned long do_mmap(struct file *file, vm_flags_t vm_flags; unsigned long capabilities, result; int ret; + MA_STATE(mas, ¤t->mm->mm_mt, 0, 0); *populate = 0; @@ -1042,6 +1077,7 @@ unsigned long do_mmap(struct file *file, * now know into VMA flags */ vm_flags = determine_vm_flags(file, prot, flags, capabilities); + /* we're going to need to record the mapping */ region = kmem_cache_zalloc(vm_region_jar, GFP_KERNEL); if (!region) @@ -1051,6 +1087,9 @@ unsigned long do_mmap(struct file *file, if (!vma) goto error_getting_vma; + if (mas_preallocate(&mas, vma, GFP_KERNEL)) + goto error_maple_preallocate; + region->vm_usage = 1; region->vm_flags = vm_flags; region->vm_pgoff = pgoff; @@ -1191,7 +1230,7 @@ unsigned long do_mmap(struct file *file, current->mm->total_vm += len >> PAGE_SHIFT; share: - add_vma_to_mm(current->mm, vma); + mas_add_vma_to_mm(&mas, current->mm, vma); /* we flush the region from the icache only when the first executable * mapping of it is made */ @@ -1217,6 +1256,7 @@ error: sharing_violation: up_write(&nommu_region_sem); + mas_destroy(&mas); pr_warn("Attempt to share mismatched mappings\n"); ret = -EINVAL; goto error; @@ -1233,6 +1273,14 @@ error_getting_region: len, current->pid); show_free_areas(0, NULL); return -ENOMEM; + +error_maple_preallocate: + kmem_cache_free(vm_region_jar, region); + vm_area_free(vma); + pr_warn("Allocation of vma tree for process %d failed\n", current->pid); + show_free_areas(0, NULL); + return -ENOMEM; + } unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, @@ -1298,6 +1346,7 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, struct vm_area_struct *new; struct vm_region *region; unsigned long npages; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_end); /* we're only permitted to split anonymous regions (these should have * only a single usage on the region) */ @@ -1312,9 +1361,13 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return -ENOMEM; new = vm_area_dup(vma); - if (!new) { - kmem_cache_free(vm_region_jar, region); - return -ENOMEM; + if (!new) + goto err_vma_dup; + + if (mas_preallocate(&mas, vma, GFP_KERNEL)) { + pr_warn("Allocation of vma tree for process %d failed\n", + current->pid); + goto err_mas_preallocate; } /* most fields are the same, copy all, and then fixup */ @@ -1333,7 +1386,6 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, if (new->vm_ops && new->vm_ops->open) new->vm_ops->open(new); - delete_vma_from_mm(vma); down_write(&nommu_region_sem); delete_nommu_region(vma->vm_region); if (new_below) { @@ -1346,9 +1398,19 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, add_nommu_region(vma->vm_region); add_nommu_region(new->vm_region); up_write(&nommu_region_sem); - add_vma_to_mm(mm, vma); - add_vma_to_mm(mm, new); + + setup_vma_to_mm(vma, mm); + setup_vma_to_mm(new, mm); + mas_set_range(&mas, vma->vm_start, vma->vm_end - 1); + mas_store(&mas, vma); + vma_mas_store(new, &mas); return 0; + +err_mas_preallocate: + vm_area_free(new); +err_vma_dup: + kmem_cache_free(vm_region_jar, region); + return -ENOMEM; } /* @@ -1363,12 +1425,14 @@ static int shrink_vma(struct mm_struct *mm, /* adjust the VMA's pointers, which may reposition it in the MM's tree * and list */ - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + return -ENOMEM; if (from > vma->vm_start) vma->vm_end = from; else vma->vm_start = to; - add_vma_to_mm(mm, vma); + if (add_vma_to_mm(mm, vma)) + return -ENOMEM; /* cut the backing region down to size */ region = vma->vm_region; @@ -1396,9 +1460,10 @@ static int shrink_vma(struct mm_struct *mm, */ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list_head *uf) { + MA_STATE(mas, &mm->mm_mt, start, start); struct vm_area_struct *vma; unsigned long end; - int ret; + int ret = 0; len = PAGE_ALIGN(len); if (len == 0) @@ -1407,7 +1472,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list end = start + len; /* find the first potentially overlapping VMA */ - vma = find_vma(mm, start); + vma = mas_find(&mas, end - 1); if (!vma) { static int limit; if (limit < 5) { @@ -1426,7 +1491,7 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list return -EINVAL; if (end == vma->vm_end) goto erase_whole_vma; - vma = vma->vm_next; + vma = mas_next(&mas, end - 1); } while (vma); return -EINVAL; } else { @@ -1448,9 +1513,10 @@ int do_munmap(struct mm_struct *mm, unsigned long start, size_t len, struct list } erase_whole_vma: - delete_vma_from_mm(vma); + if (delete_vma_from_mm(vma)) + ret = -ENOMEM; delete_vma(mm, vma); - return 0; + return ret; } int vm_munmap(unsigned long addr, size_t len) @@ -1475,6 +1541,7 @@ SYSCALL_DEFINE2(munmap, unsigned long, addr, size_t, len) */ void exit_mmap(struct mm_struct *mm) { + VMA_ITERATOR(vmi, mm, 0); struct vm_area_struct *vma; if (!mm) @@ -1482,13 +1549,18 @@ void exit_mmap(struct mm_struct *mm) mm->total_vm = 0; - while ((vma = mm->mmap)) { - mm->mmap = vma->vm_next; - delete_vma_from_mm(vma); + /* + * Lock the mm to avoid assert complaining even though this is the only + * user of the mm + */ + mmap_write_lock(mm); + for_each_vma(vmi, vma) { + cleanup_vma_from_mm(vma); delete_vma(mm, vma); cond_resched(); } __mt_destroy(&mm->mm_mt); + mmap_write_unlock(mm); } int vm_brk(unsigned long addr, unsigned long len) From 9b580a1d60de0e1c8957e63859aa989196952e63 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:05 +0000 Subject: [PATCH 050/350] riscv: use vma iterator for vdso Remove the linked list use in favour of the vma iterator. Link: https://lkml.kernel.org/r/20220906194824.2110408-67-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Reviewed-by: Davidlohr Bueso Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- arch/riscv/kernel/vdso.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index 69b05b6c181b..692e7ae3dcb8 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -114,11 +114,12 @@ int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) { struct mm_struct *mm = task->mm; struct vm_area_struct *vma; + VMA_ITERATOR(vmi, mm, 0); struct __vdso_info *vdso_info = mm->context.vdso_info; mmap_read_lock(mm); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + for_each_vma(vmi, vma) { unsigned long size = vma->vm_end - vma->vm_start; if (vma_is_special_mapping(vma, vdso_info->dm)) From 78ba531ff3ec2a444001853d8636ff39ed11ca28 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:05 +0000 Subject: [PATCH 051/350] mm/vmscan: use vma iterator instead of vm_next Use the vma iterator in in get_next_vma() instead of the linked list. [yuzhao@google.com: mm/vmscan: use the proper VMA iterator] Link: https://lkml.kernel.org/r/Yx+QGOgHg1Wk8tGK@google.com Link: https://lkml.kernel.org/r/20220906194824.2110408-68-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Yu Zhao Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/vmscan.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 809df16c7c0d..3ba9423b141d 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3774,23 +3774,17 @@ static bool get_next_vma(unsigned long mask, unsigned long size, struct mm_walk { unsigned long start = round_up(*vm_end, size); unsigned long end = (start | ~mask) + 1; + VMA_ITERATOR(vmi, args->mm, start); VM_WARN_ON_ONCE(mask & size); VM_WARN_ON_ONCE((start & mask) != (*vm_start & mask)); - while (args->vma) { - if (start >= args->vma->vm_end) { - args->vma = args->vma->vm_next; - continue; - } - + for_each_vma(vmi, args->vma) { if (end && end <= args->vma->vm_start) return false; - if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) { - args->vma = args->vma->vm_next; + if (should_skip_vma(args->vma->vm_start, args->vma->vm_end, args)) continue; - } *vm_start = max(start, args->vma->vm_start); *vm_end = min(end - 1, args->vma->vm_end - 1) + 1; From 763ecb035029f500d7e6dc99acd1ad299b7726a1 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:06 +0000 Subject: [PATCH 052/350] mm: remove the vma linked list Replace any vm_next use with vma_find(). Update free_pgtables(), unmap_vmas(), and zap_page_range() to use the maple tree. Use the new free_pgtables() and unmap_vmas() in do_mas_align_munmap(). At the same time, alter the loop to be more compact. Now that free_pgtables() and unmap_vmas() take a maple tree as an argument, rearrange do_mas_align_munmap() to use the new tree to hold the vmas to remove. Remove __vma_link_list() and __vma_unlink_list() as they are exclusively used to update the linked list. Drop linked list update from __insert_vm_struct(). Rework validation of tree as it was depending on the linked list. [yang.lee@linux.alibaba.com: fix one kernel-doc comment] Link: https://bugzilla.openanolis.cn/show_bug.cgi?id=1949 Link: https://lkml.kernel.org/r/20220824021918.94116-1-yang.lee@linux.alibaba.comLink: https://lkml.kernel.org/r/20220906194824.2110408-69-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Signed-off-by: Yang Li Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- include/linux/mm.h | 5 +- include/linux/mm_types.h | 4 - kernel/fork.c | 19 +- mm/debug.c | 14 +- mm/internal.h | 8 +- mm/memory.c | 34 ++- mm/mmap.c | 469 ++++++++++++++++----------------------- mm/nommu.c | 6 - mm/util.c | 40 ---- 9 files changed, 225 insertions(+), 374 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 579449d6c23b..37384a84f71a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1857,8 +1857,9 @@ void zap_vma_ptes(struct vm_area_struct *vma, unsigned long address, unsigned long size); void zap_page_range(struct vm_area_struct *vma, unsigned long address, unsigned long size); -void unmap_vmas(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long start, unsigned long end); +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long start, + unsigned long end); struct mmu_notifier_range; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4541b74b1bdb..5e32211cb5a9 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -408,8 +408,6 @@ struct vm_area_struct { unsigned long vm_end; /* The first byte after our end address within vm_mm. */ - /* linked list of VM areas per task, sorted by address */ - struct vm_area_struct *vm_next, *vm_prev; struct mm_struct *vm_mm; /* The address space we belong to. */ /* @@ -473,7 +471,6 @@ struct vm_area_struct { struct kioctx_table; struct mm_struct { struct { - struct vm_area_struct *mmap; /* list of VMAs */ struct maple_tree mm_mt; #ifdef CONFIG_MMU unsigned long (*get_unmapped_area) (struct file *filp, @@ -488,7 +485,6 @@ struct mm_struct { unsigned long mmap_compat_legacy_base; #endif unsigned long task_size; /* size of task vm space */ - unsigned long highest_vm_end; /* highest vma end address */ pgd_t * pgd; #ifdef CONFIG_MEMBARRIER diff --git a/kernel/fork.c b/kernel/fork.c index 49e4ab6f5208..50460330306a 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -474,7 +474,6 @@ struct vm_area_struct *vm_area_dup(struct vm_area_struct *orig) */ *new = data_race(*orig); INIT_LIST_HEAD(&new->anon_vma_chain); - new->vm_next = new->vm_prev = NULL; dup_anon_vma_name(orig, new); } return new; @@ -579,7 +578,7 @@ static void dup_mm_exe_file(struct mm_struct *mm, struct mm_struct *oldmm) static __latent_entropy int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) { - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct vm_area_struct *mpnt, *tmp; int retval; unsigned long charge = 0; LIST_HEAD(uf); @@ -606,18 +605,11 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, mm->exec_vm = oldmm->exec_vm; mm->stack_vm = oldmm->stack_vm; - pprev = &mm->mmap; retval = ksm_fork(mm, oldmm); if (retval) goto out; khugepaged_fork(mm, oldmm); - retval = mas_expected_entries(&mas, oldmm->map_count); - if (retval) - goto out; - - prev = NULL; - retval = mas_expected_entries(&mas, oldmm->map_count); if (retval) goto out; @@ -689,14 +681,6 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, if (is_vm_hugetlb_page(tmp)) reset_vma_resv_huge_pages(tmp); - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - /* Link the vma into the MT */ mas.index = tmp->vm_start; mas.last = tmp->vm_end - 1; @@ -1124,7 +1108,6 @@ static void mm_init_uprobes_state(struct mm_struct *mm) static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, struct user_namespace *user_ns) { - mm->mmap = NULL; mt_init_flags(&mm->mm_mt, MM_MT_FLAGS); mt_set_external_lock(&mm->mm_mt, &mm->mmap_lock); atomic_set(&mm->mm_users, 1); diff --git a/mm/debug.c b/mm/debug.c index 2d625ca0e326..0fd15ba70d16 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -139,13 +139,11 @@ EXPORT_SYMBOL(dump_page); void dump_vma(const struct vm_area_struct *vma) { - pr_emerg("vma %px start %px end %px\n" - "next %px prev %px mm %px\n" + pr_emerg("vma %px start %px end %px mm %px\n" "prot %lx anon_vma %px vm_ops %px\n" "pgoff %lx file %px private_data %px\n" "flags: %#lx(%pGv)\n", - vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_next, - vma->vm_prev, vma->vm_mm, + vma, (void *)vma->vm_start, (void *)vma->vm_end, vma->vm_mm, (unsigned long)pgprot_val(vma->vm_page_prot), vma->anon_vma, vma->vm_ops, vma->vm_pgoff, vma->vm_file, vma->vm_private_data, @@ -155,11 +153,11 @@ EXPORT_SYMBOL(dump_vma); void dump_mm(const struct mm_struct *mm) { - pr_emerg("mm %px mmap %px task_size %lu\n" + pr_emerg("mm %px task_size %lu\n" #ifdef CONFIG_MMU "get_unmapped_area %px\n" #endif - "mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n" + "mmap_base %lu mmap_legacy_base %lu\n" "pgd %px mm_users %d mm_count %d pgtables_bytes %lu map_count %d\n" "hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n" "pinned_vm %llx data_vm %lx exec_vm %lx stack_vm %lx\n" @@ -183,11 +181,11 @@ void dump_mm(const struct mm_struct *mm) "tlb_flush_pending %d\n" "def_flags: %#lx(%pGv)\n", - mm, mm->mmap, mm->task_size, + mm, mm->task_size, #ifdef CONFIG_MMU mm->get_unmapped_area, #endif - mm->mmap_base, mm->mmap_legacy_base, mm->highest_vm_end, + mm->mmap_base, mm->mmap_legacy_base, mm->pgd, atomic_read(&mm->mm_users), atomic_read(&mm->mm_count), mm_pgtables_bytes(mm), diff --git a/mm/internal.h b/mm/internal.h index cf134d58fd6d..0f106a3982e7 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -85,8 +85,9 @@ bool __folio_end_writeback(struct folio *folio); void deactivate_file_folio(struct folio *folio); void folio_activate(struct folio *folio); -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *start_vma, - unsigned long floor, unsigned long ceiling); +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *start_vma, unsigned long floor, + unsigned long ceiling); void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte); struct zap_details; @@ -480,9 +481,6 @@ static inline bool is_data_mapping(vm_flags_t flags) } /* mm/util.c */ -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev); -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma); struct anon_vma *folio_anon_vma(struct folio *folio); #ifdef CONFIG_MMU diff --git a/mm/memory.c b/mm/memory.c index cb955c0b7738..e49faa0a1f9a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -392,12 +392,21 @@ void free_pgd_range(struct mmu_gather *tlb, } while (pgd++, addr = next, addr != end); } -void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, - unsigned long floor, unsigned long ceiling) +void free_pgtables(struct mmu_gather *tlb, struct maple_tree *mt, + struct vm_area_struct *vma, unsigned long floor, + unsigned long ceiling) { - while (vma) { - struct vm_area_struct *next = vma->vm_next; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); + + do { unsigned long addr = vma->vm_start; + struct vm_area_struct *next; + + /* + * Note: USER_PGTABLES_CEILING may be passed as ceiling and may + * be 0. This will underflow and is okay. + */ + next = mas_find(&mas, ceiling - 1); /* * Hide vma from rmap and truncate_pagecache before freeing @@ -416,7 +425,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, while (next && next->vm_start <= vma->vm_end + PMD_SIZE && !is_vm_hugetlb_page(next)) { vma = next; - next = vma->vm_next; + next = mas_find(&mas, ceiling - 1); unlink_anon_vmas(vma); unlink_file_vma(vma); } @@ -424,7 +433,7 @@ void free_pgtables(struct mmu_gather *tlb, struct vm_area_struct *vma, floor, next ? next->vm_start : ceiling); } vma = next; - } + } while (vma); } void pmd_install(struct mm_struct *mm, pmd_t *pmd, pgtable_t *pte) @@ -1688,6 +1697,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, /** * unmap_vmas - unmap a range of memory covered by a list of vma's * @tlb: address of the caller's struct mmu_gather + * @mt: the maple tree * @vma: the starting vma * @start_addr: virtual address at which to start unmapping * @end_addr: virtual address at which to end unmapping @@ -1703,7 +1713,7 @@ static void unmap_single_vma(struct mmu_gather *tlb, * ensure that any thus-far unmapped pages are flushed before unmap_vmas() * drops the lock and schedules. */ -void unmap_vmas(struct mmu_gather *tlb, +void unmap_vmas(struct mmu_gather *tlb, struct maple_tree *mt, struct vm_area_struct *vma, unsigned long start_addr, unsigned long end_addr) { @@ -1713,12 +1723,14 @@ void unmap_vmas(struct mmu_gather *tlb, /* Careful - we need to zap private pages too! */ .even_cows = true, }; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); mmu_notifier_range_init(&range, MMU_NOTIFY_UNMAP, 0, vma, vma->vm_mm, start_addr, end_addr); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < end_addr; vma = vma->vm_next) + do { unmap_single_vma(tlb, vma, start_addr, end_addr, &details); + } while ((vma = mas_find(&mas, end_addr - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); } @@ -1733,8 +1745,11 @@ void unmap_vmas(struct mmu_gather *tlb, void zap_page_range(struct vm_area_struct *vma, unsigned long start, unsigned long size) { + struct maple_tree *mt = &vma->vm_mm->mm_mt; + unsigned long end = start + size; struct mmu_notifier_range range; struct mmu_gather tlb; + MA_STATE(mas, mt, vma->vm_end, vma->vm_end); lru_add_drain(); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, @@ -1742,8 +1757,9 @@ void zap_page_range(struct vm_area_struct *vma, unsigned long start, tlb_gather_mmu(&tlb, vma->vm_mm); update_hiwater_rss(vma->vm_mm); mmu_notifier_invalidate_range_start(&range); - for ( ; vma && vma->vm_start < range.end; vma = vma->vm_next) + do { unmap_single_vma(&tlb, vma, start, range.end, NULL); + } while ((vma = mas_find(&mas, end - 1)) != NULL); mmu_notifier_invalidate_range_end(&range); tlb_finish_mmu(&tlb); } diff --git a/mm/mmap.c b/mm/mmap.c index aabd4f986ccf..4441f7ed197a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -75,9 +75,10 @@ int mmap_rnd_compat_bits __read_mostly = CONFIG_ARCH_MMAP_RND_COMPAT_BITS; static bool ignore_rlimit_data; core_param(ignore_rlimit_data, ignore_rlimit_data, bool, 0644); -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, - unsigned long start, unsigned long end); + struct vm_area_struct *next, unsigned long start, + unsigned long end); static pgprot_t vm_pgprot_modify(pgprot_t oldprot, unsigned long vm_flags) { @@ -130,12 +131,10 @@ void unlink_file_vma(struct vm_area_struct *vma) } /* - * Close a vm structure and free it, returning the next. + * Close a vm structure and free it. */ -static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) +static void remove_vma(struct vm_area_struct *vma) { - struct vm_area_struct *next = vma->vm_next; - might_sleep(); if (vma->vm_ops && vma->vm_ops->close) vma->vm_ops->close(vma); @@ -143,7 +142,6 @@ static struct vm_area_struct *remove_vma(struct vm_area_struct *vma) fput(vma->vm_file); mpol_put(vma_policy(vma)); vm_area_free(vma); - return next; } /* @@ -168,8 +166,7 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, unsigned long newbrk, unsigned long oldbrk, struct list_head *uf); static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *brkvma, - unsigned long addr, unsigned long request, - unsigned long flags); + unsigned long addr, unsigned long request, unsigned long flags); SYSCALL_DEFINE1(brk, unsigned long, brk) { unsigned long newbrk, oldbrk, origbrk; @@ -238,7 +235,6 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) * before calling do_brk_munmap(). */ mm->brk = brk; - mas.last = oldbrk - 1; ret = do_brk_munmap(&mas, brkvma, newbrk, oldbrk, &uf); if (ret == 1) { downgraded = true; @@ -293,44 +289,21 @@ extern void mt_dump(const struct maple_tree *mt); static void validate_mm_mt(struct mm_struct *mm) { struct maple_tree *mt = &mm->mm_mt; - struct vm_area_struct *vma_mt, *vma = mm->mmap; + struct vm_area_struct *vma_mt; MA_STATE(mas, mt, 0, 0); mt_validate(&mm->mm_mt); mas_for_each(&mas, vma_mt, ULONG_MAX) { - if (xa_is_zero(vma_mt)) - continue; - - if (!vma) - break; - - if ((vma != vma_mt) || - (vma->vm_start != vma_mt->vm_start) || - (vma->vm_end != vma_mt->vm_end) || - (vma->vm_start != mas.index) || - (vma->vm_end - 1 != mas.last)) { + if ((vma_mt->vm_start != mas.index) || + (vma_mt->vm_end - 1 != mas.last)) { pr_emerg("issue in %s\n", current->comm); dump_stack(); dump_vma(vma_mt); - pr_emerg("and vm_next\n"); - dump_vma(vma->vm_next); pr_emerg("mt piv: %p %lu - %lu\n", vma_mt, mas.index, mas.last); pr_emerg("mt vma: %p %lu - %lu\n", vma_mt, vma_mt->vm_start, vma_mt->vm_end); - if (vma->vm_prev) { - pr_emerg("ll prev: %p %lu - %lu\n", - vma->vm_prev, vma->vm_prev->vm_start, - vma->vm_prev->vm_end); - } - pr_emerg("ll vma: %p %lu - %lu\n", vma, - vma->vm_start, vma->vm_end); - if (vma->vm_next) { - pr_emerg("ll next: %p %lu - %lu\n", - vma->vm_next, vma->vm_next->vm_start, - vma->vm_next->vm_end); - } mt_dump(mas.tree); if (vma_mt->vm_end != mas.last + 1) { @@ -347,23 +320,19 @@ static void validate_mm_mt(struct mm_struct *mm) } VM_BUG_ON_MM(vma_mt->vm_start != mas.index, mm); } - VM_BUG_ON(vma != vma_mt); - vma = vma->vm_next; - } - VM_BUG_ON(vma); } static void validate_mm(struct mm_struct *mm) { int bug = 0; int i = 0; - unsigned long highest_address = 0; - struct vm_area_struct *vma = mm->mmap; + struct vm_area_struct *vma; + MA_STATE(mas, &mm->mm_mt, 0, 0); validate_mm_mt(mm); - while (vma) { + mas_for_each(&mas, vma, ULONG_MAX) { #ifdef CONFIG_DEBUG_VM_RB struct anon_vma *anon_vma = vma->anon_vma; struct anon_vma_chain *avc; @@ -375,18 +344,10 @@ static void validate_mm(struct mm_struct *mm) anon_vma_unlock_read(anon_vma); } #endif - - highest_address = vm_end_gap(vma); - vma = vma->vm_next; i++; } if (i != mm->map_count) { - pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); - bug = 1; - } - if (highest_address != mm->highest_vm_end) { - pr_emerg("mm->highest_vm_end %lx, found %lx\n", - mm->highest_vm_end, highest_address); + pr_emerg("map_count %d mas_for_each %d\n", mm->map_count, i); bug = 1; } VM_BUG_ON_MM(bug, mm); @@ -446,29 +407,13 @@ bool range_has_overlap(struct mm_struct *mm, unsigned long start, struct vm_area_struct *existing; MA_STATE(mas, &mm->mm_mt, start, start); + rcu_read_lock(); existing = mas_find(&mas, end - 1); *pprev = mas_prev(&mas, 0); + rcu_read_unlock(); return existing ? true : false; } -/* - * __vma_next() - Get the next VMA. - * @mm: The mm_struct. - * @vma: The current vma. - * - * If @vma is NULL, return the first vma in the mm. - * - * Returns: The next VMA after @vma. - */ -static inline struct vm_area_struct *__vma_next(struct mm_struct *mm, - struct vm_area_struct *vma) -{ - if (!vma) - return mm->mmap; - - return vma->vm_next; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -553,8 +498,7 @@ static inline void vma_mas_szero(struct ma_state *mas, unsigned long start, mas_store_prealloc(mas, NULL); } -static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) +static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) { MA_STATE(mas, &mm->mm_mt, 0, 0); struct address_space *mapping = NULL; @@ -568,7 +512,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, } vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); __vma_link_file(vma); if (mapping) @@ -579,22 +522,6 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma, return 0; } -/* - * Helper for vma_adjust() in the split_vma insert case: insert a vma into the - * mm's list and the mm tree. It has already been inserted into the interval tree. - */ -static void __insert_vm_struct(struct mm_struct *mm, struct ma_state *mas, - struct vm_area_struct *vma, unsigned long location) -{ - struct vm_area_struct *prev; - - mas_set(mas, location); - prev = mas_prev(mas, 0); - vma_mas_store(vma, mas); - __vma_link_list(mm, vma, prev); - mm->map_count++; -} - /* * vma_expand - Expand an existing VMA * @@ -675,15 +602,8 @@ inline int vma_expand(struct ma_state *mas, struct vm_area_struct *vma, } /* Expanding over the next vma */ - if (remove_next) { - /* Remove from mm linked list - also updates highest_vm_end */ - __vma_unlink_list(mm, next); - - if (file) - __remove_shared_vm_struct(next, file, mapping); - - } else if (!next) { - mm->highest_vm_end = vm_end_gap(vma); + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); } if (anon_vma) { @@ -738,7 +658,6 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, int remove_next = 0; MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *exporter = NULL, *importer = NULL; - unsigned long ll_prev = vma->vm_start; /* linked list prev. */ if (next && !insert) { if (end >= next->vm_end) { @@ -773,7 +692,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, next_next = find_vma(mm, next->vm_end); VM_WARN_ON(remove_next == 2 && - end != next->vm_next->vm_end); + end != next_next->vm_end); } exporter = next; @@ -784,7 +703,7 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, * next, if the vma overlaps with it. */ if (remove_next == 2 && !next->anon_vma) - exporter = next->vm_next; + exporter = next_next; } else if (end > next->vm_start) { /* @@ -879,17 +798,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, if (vma->vm_end > end) { if (!insert || (insert->vm_start != end)) { vma_mas_szero(&mas, end, vma->vm_end); + mas_reset(&mas); VM_WARN_ON(insert && insert->vm_end < vma->vm_end); - } else if (insert->vm_start == end) { - ll_prev = vma->vm_end; } } else { vma_changed = true; } vma->vm_end = end; - if (!next) - mm->highest_vm_end = vm_end_gap(vma); } if (vma_changed) @@ -909,29 +825,19 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, flush_dcache_mmap_unlock(mapping); } - if (remove_next) { - /* - * vma_merge has merged next into vma, and needs - * us to remove next before dropping the locks. - * Since we have expanded over this vma, the maple tree will - * have overwritten by storing the value - */ - __vma_unlink_list(mm, next); + if (remove_next && file) { + __remove_shared_vm_struct(next, file, mapping); if (remove_next == 2) - __vma_unlink_list(mm, next_next); - - if (file) { - __remove_shared_vm_struct(next, file, mapping); - if (remove_next == 2) - __remove_shared_vm_struct(next_next, file, mapping); - } + __remove_shared_vm_struct(next_next, file, mapping); } else if (insert) { /* * split_vma has split insert from vma, and needs * us to insert it before dropping the locks * (it may either follow vma or precede it). */ - __insert_vm_struct(mm, &mas, insert, ll_prev); + mas_reset(&mas); + vma_mas_store(insert, &mas); + mm->map_count++; } if (anon_vma) { @@ -965,54 +871,12 @@ again: /* * In mprotect's case 6 (see comments on vma_merge), - * we must remove another next too. It would clutter - * up the code too much to do both in one go. + * we must remove next_next too. */ - if (remove_next != 3) { - /* - * If "next" was removed and vma->vm_end was - * expanded (up) over it, in turn - * "next->vm_prev->vm_end" changed and the - * "vma->vm_next" gap must be updated. - */ - next = next_next; - } else { - /* - * For the scope of the comment "next" and - * "vma" considered pre-swap(): if "vma" was - * removed, next->vm_start was expanded (down) - * over it and the "next" gap must be updated. - * Because of the swap() the post-swap() "vma" - * actually points to pre-swap() "next" - * (post-swap() "next" as opposed is now a - * dangling pointer). - */ - next = vma; - } if (remove_next == 2) { remove_next = 1; + next = next_next; goto again; - } else if (!next) { - /* - * If remove_next == 2 we obviously can't - * reach this path. - * - * If remove_next == 3 we can't reach this - * path because pre-swap() next is always not - * NULL. pre-swap() "next" is not being - * removed and its next->vm_end is not altered - * (and furthermore "end" already matches - * next->vm_end in remove_next == 3). - * - * We reach this only in the remove_next == 1 - * case if the "next" vma that was removed was - * the highest vma of the mm. However in such - * case next->vm_end == "end" and the extended - * "vma" has vma->vm_end == next->vm_end so - * mm->highest_vm_end doesn't need any update - * in remove_next == 1 case. - */ - VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); } } if (insert && file) @@ -1020,6 +884,7 @@ again: mas_destroy(&mas); validate_mm(mm); + return 0; } @@ -1179,10 +1044,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, if (vm_flags & VM_SPECIAL) return NULL; - next = __vma_next(mm, prev); + next = find_vma(mm, prev ? prev->vm_end : 0); area = next; if (area && area->vm_end == end) /* cases 6, 7, 8 */ - next = next->vm_next; + next = find_vma(mm, next->vm_end); /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); @@ -1316,18 +1181,24 @@ static struct anon_vma *reusable_anon_vma(struct vm_area_struct *old, struct vm_ */ struct anon_vma *find_mergeable_anon_vma(struct vm_area_struct *vma) { + MA_STATE(mas, &vma->vm_mm->mm_mt, vma->vm_end, vma->vm_end); struct anon_vma *anon_vma = NULL; + struct vm_area_struct *prev, *next; /* Try next first. */ - if (vma->vm_next) { - anon_vma = reusable_anon_vma(vma->vm_next, vma, vma->vm_next); + next = mas_walk(&mas); + if (next) { + anon_vma = reusable_anon_vma(next, vma, next); if (anon_vma) return anon_vma; } + prev = mas_prev(&mas, 0); + VM_BUG_ON_VMA(prev != vma, vma); + prev = mas_prev(&mas, 0); /* Try prev next. */ - if (vma->vm_prev) - anon_vma = reusable_anon_vma(vma->vm_prev, vma->vm_prev, vma); + if (prev) + anon_vma = reusable_anon_vma(prev, prev, vma); /* * We might reach here with anon_vma == NULL if we can't find @@ -2101,8 +1972,8 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) if (gap_addr < address || gap_addr > TASK_SIZE) gap_addr = TASK_SIZE; - next = vma->vm_next; - if (next && next->vm_start < gap_addr && vma_is_accessible(next)) { + next = find_vma_intersection(mm, vma->vm_end, gap_addr); + if (next && vma_is_accessible(next)) { if (!(next->vm_flags & VM_GROWSUP)) return -ENOMEM; /* Check that both stack segments have the same anon_vma? */ @@ -2153,8 +2024,6 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) /* Overwrite old entry in mtree. */ vma_mas_store(vma, &mas); anon_vma_interval_tree_post_update_vma(vma); - if (!vma->vm_next) - mm->highest_vm_end = vm_end_gap(vma); spin_unlock(&mm->page_table_lock); perf_event_mmap(vma); @@ -2174,16 +2043,16 @@ int expand_upwards(struct vm_area_struct *vma, unsigned long address) int expand_downwards(struct vm_area_struct *vma, unsigned long address) { struct mm_struct *mm = vma->vm_mm; + MA_STATE(mas, &mm->mm_mt, vma->vm_start, vma->vm_start); struct vm_area_struct *prev; int error = 0; - MA_STATE(mas, &mm->mm_mt, 0, 0); address &= PAGE_MASK; if (address < mmap_min_addr) return -EPERM; /* Enforce stack_guard_gap */ - prev = vma->vm_prev; + prev = mas_prev(&mas, 0); /* Check that both stack segments have the same anon_vma? */ if (prev && !(prev->vm_flags & VM_GROWSDOWN) && vma_is_accessible(prev)) { @@ -2318,25 +2187,26 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) EXPORT_SYMBOL_GPL(find_extend_vma); /* - * Ok - we have the memory areas we should free on the vma list, - * so release them, and do the vma updates. + * Ok - we have the memory areas we should free on a maple tree so release them, + * and do the vma updates. * * Called with the mm semaphore held. */ -static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) +static inline void remove_mt(struct mm_struct *mm, struct ma_state *mas) { unsigned long nr_accounted = 0; + struct vm_area_struct *vma; /* Update high watermark before we lower total_vm */ update_hiwater_vm(mm); - do { + mas_for_each(mas, vma, ULONG_MAX) { long nrpages = vma_pages(vma); if (vma->vm_flags & VM_ACCOUNT) nr_accounted += nrpages; vm_stat_account(mm, vma->vm_flags, -nrpages); - vma = remove_vma(vma); - } while (vma); + remove_vma(vma); + } vm_unacct_memory(nr_accounted); validate_mm(mm); } @@ -2346,18 +2216,18 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) * * Called with the mm semaphore held. */ -static void unmap_region(struct mm_struct *mm, +static void unmap_region(struct mm_struct *mm, struct maple_tree *mt, struct vm_area_struct *vma, struct vm_area_struct *prev, + struct vm_area_struct *next, unsigned long start, unsigned long end) { - struct vm_area_struct *next = __vma_next(mm, prev); struct mmu_gather tlb; lru_add_drain(); tlb_gather_mmu(&tlb, mm); update_hiwater_rss(mm); - unmap_vmas(&tlb, vma, start, end); - free_pgtables(&tlb, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, + unmap_vmas(&tlb, mt, vma, start, end); + free_pgtables(&tlb, mt, vma, prev ? prev->vm_end : FIRST_USER_ADDRESS, next ? next->vm_start : USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); } @@ -2444,24 +2314,17 @@ int split_vma(struct mm_struct *mm, struct vm_area_struct *vma, return __split_vma(mm, vma, addr, new_below); } -static inline int -unlock_range(struct vm_area_struct *start, struct vm_area_struct **tail, - unsigned long limit) +static inline int munmap_sidetree(struct vm_area_struct *vma, + struct ma_state *mas_detach) { - struct mm_struct *mm = start->vm_mm; - struct vm_area_struct *tmp = start; - int count = 0; + mas_set_range(mas_detach, vma->vm_start, vma->vm_end - 1); + if (mas_store_gfp(mas_detach, vma, GFP_KERNEL)) + return -ENOMEM; - while (tmp && tmp->vm_start < limit) { - *tail = tmp; - count++; - if (tmp->vm_flags & VM_LOCKED) - mm->locked_vm -= vma_pages(tmp); + if (vma->vm_flags & VM_LOCKED) + vma->vm_mm->locked_vm -= vma_pages(vma); - tmp = tmp->vm_next; - } - - return count; + return 0; } /* @@ -2481,9 +2344,13 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, struct mm_struct *mm, unsigned long start, unsigned long end, struct list_head *uf, bool downgrade) { - struct vm_area_struct *prev, *last; + struct vm_area_struct *prev, *next = NULL; + struct maple_tree mt_detach; + int count = 0; int error = -ENOMEM; - /* we have start < vma->vm_end */ + MA_STATE(mas_detach, &mt_detach, 0, 0); + mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN); + mt_set_external_lock(&mt_detach, &mm->mmap_lock); if (mas_preallocate(mas, vma, GFP_KERNEL)) return -ENOMEM; @@ -2496,6 +2363,8 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, * unmapped vm_area_struct will remain in use: so lower split_vma * places tmp vma above, and higher split_vma places tmp vma below. */ + + /* Does it split the first one? */ if (start > vma->vm_start) { /* @@ -2506,35 +2375,60 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, if (end < vma->vm_end && mm->map_count >= sysctl_max_map_count) goto map_count_exceeded; + /* + * mas_pause() is not needed since mas->index needs to be set + * differently than vma->vm_end anyways. + */ error = __split_vma(mm, vma, start, 0); if (error) - goto split_failed; + goto start_split_failed; - prev = vma; - vma = __vma_next(mm, prev); - mas->index = start; - mas_reset(mas); - } else { - prev = vma->vm_prev; + mas_set(mas, start); + vma = mas_walk(mas); } - if (vma->vm_end >= end) - last = vma; - else - last = find_vma_intersection(mm, end - 1, end); + prev = mas_prev(mas, 0); + if (unlikely((!prev))) + mas_set(mas, start); - /* Does it split the last one? */ - if (last && end < last->vm_end) { - error = __split_vma(mm, last, end, 1); + /* + * Detach a range of VMAs from the mm. Using next as a temp variable as + * it is always overwritten. + */ + mas_for_each(mas, next, end - 1) { + /* Does it split the end? */ + if (next->vm_end > end) { + struct vm_area_struct *split; + error = __split_vma(mm, next, end, 1); + if (error) + goto end_split_failed; + + mas_set(mas, end); + split = mas_prev(mas, 0); + error = munmap_sidetree(split, &mas_detach); + if (error) + goto munmap_sidetree_failed; + + count++; + if (vma == next) + vma = split; + break; + } + error = munmap_sidetree(next, &mas_detach); if (error) - goto split_failed; + goto munmap_sidetree_failed; - if (vma == last) - vma = __vma_next(mm, prev); - mas_reset(mas); + count++; +#ifdef CONFIG_DEBUG_VM_MAPLE_TREE + BUG_ON(next->vm_start < start); + BUG_ON(next->vm_start > end); +#endif } + if (!next) + next = mas_next(mas, ULONG_MAX); + if (unlikely(uf)) { /* * If userfaultfd_unmap_prep returns an error the vmas @@ -2551,35 +2445,36 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, goto userfaultfd_error; } - /* - * unlock any mlock()ed ranges before detaching vmas, count the number - * of VMAs to be dropped, and return the tail entry of the affected - * area. - */ - mm->map_count -= unlock_range(vma, &last, end); - /* Drop removed area from the tree */ + /* Point of no return */ + mas_set_range(mas, start, end - 1); +#if defined(CONFIG_DEBUG_VM_MAPLE_TREE) + /* Make sure no VMAs are about to be lost. */ + { + MA_STATE(test, &mt_detach, start, end - 1); + struct vm_area_struct *vma_mas, *vma_test; + int test_count = 0; + + rcu_read_lock(); + vma_test = mas_find(&test, end - 1); + mas_for_each(mas, vma_mas, end - 1) { + BUG_ON(vma_mas != vma_test); + test_count++; + vma_test = mas_next(&test, end - 1); + } + rcu_read_unlock(); + BUG_ON(count != test_count); + mas_set_range(mas, start, end - 1); + } +#endif mas_store_prealloc(mas, NULL); - - /* Detach vmas from the MM linked list */ - vma->vm_prev = NULL; - if (prev) - prev->vm_next = last->vm_next; - else - mm->mmap = last->vm_next; - - if (last->vm_next) { - last->vm_next->vm_prev = prev; - last->vm_next = NULL; - } else - mm->highest_vm_end = prev ? vm_end_gap(prev) : 0; - + mm->map_count -= count; /* * Do not downgrade mmap_lock if we are next to VM_GROWSDOWN or * VM_GROWSUP VMA. Such VMAs can change their size under * down_read(mmap_lock) and collide with the VMA we are about to unmap. */ if (downgrade) { - if (last && (last->vm_flags & VM_GROWSDOWN)) + if (next && (next->vm_flags & VM_GROWSDOWN)) downgrade = false; else if (prev && (prev->vm_flags & VM_GROWSUP)) downgrade = false; @@ -2587,18 +2482,22 @@ do_mas_align_munmap(struct ma_state *mas, struct vm_area_struct *vma, mmap_write_downgrade(mm); } - unmap_region(mm, vma, prev, start, end); - - /* Fix up all other VM information */ - remove_vma_list(mm, vma); + unmap_region(mm, &mt_detach, vma, prev, next, start, end); + /* Statistics and freeing VMAs */ + mas_set(&mas_detach, start); + remove_mt(mm, &mas_detach); + __mt_destroy(&mt_detach); validate_mm(mm); return downgrade ? 1 : 0; -map_count_exceeded: -split_failed: userfaultfd_error: +munmap_sidetree_failed: +end_split_failed: + __mt_destroy(&mt_detach); +start_split_failed: +map_count_exceeded: mas_destroy(mas); return error; } @@ -2833,7 +2732,6 @@ cannot_expand: i_mmap_lock_write(vma->vm_file->f_mapping); vma_mas_store(vma, &mas); - __vma_link_list(mm, vma, prev); mm->map_count++; if (vma->vm_file) { if (vma->vm_flags & VM_SHARED) @@ -2891,7 +2789,7 @@ unmap_and_free_vma: vma->vm_file = NULL; /* Undo any partial mapping done by a device driver. */ - unmap_region(mm, vma, prev, vma->vm_start, vma->vm_end); + unmap_region(mm, mas.tree, vma, prev, next, vma->vm_start, vma->vm_end); if (vm_flags & VM_SHARED) mapping_unmap_writable(file->f_mapping); free_vma: @@ -2979,11 +2877,12 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, goto out; if (start + size > vma->vm_end) { - struct vm_area_struct *next; + VMA_ITERATOR(vmi, mm, vma->vm_end); + struct vm_area_struct *next, *prev = vma; - for (next = vma->vm_next; next; next = next->vm_next) { + for_each_vma_range(vmi, next, start + size) { /* hole between vmas ? */ - if (next->vm_start != next->vm_prev->vm_end) + if (next->vm_start != prev->vm_end) goto out; if (next->vm_file != vma->vm_file) @@ -2992,8 +2891,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, if (next->vm_flags != vma->vm_flags) goto out; - if (start + size <= next->vm_end) - break; + prev = next; } if (!next) @@ -3060,11 +2958,9 @@ static int do_brk_munmap(struct ma_state *mas, struct vm_area_struct *vma, * do some brk-specific accounting here. */ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, - unsigned long addr, unsigned long len, - unsigned long flags) + unsigned long addr, unsigned long len, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *prev = NULL; validate_mm_mt(mm); /* @@ -3107,7 +3003,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, khugepaged_enter_vma(vma, flags); goto out; } - prev = vma; /* create a vma struct for an anonymous mapping */ vma = vm_area_alloc(mm); @@ -3124,10 +3019,6 @@ static int do_brk_flags(struct ma_state *mas, struct vm_area_struct *vma, if (mas_store_gfp(mas, vma, GFP_KERNEL)) goto mas_store_fail; - if (!prev) - prev = mas_prev(mas, 0); - - __vma_link_list(mm, vma, prev); mm->map_count++; out: perf_event_mmap(vma); @@ -3136,7 +3027,7 @@ out: if (flags & VM_LOCKED) mm->locked_vm += (len >> PAGE_SHIFT); vma->vm_flags |= VM_SOFTDIRTY; - validate_mm_mt(mm); + validate_mm(mm); return 0; mas_store_fail: @@ -3217,6 +3108,8 @@ void exit_mmap(struct mm_struct *mm) struct mmu_gather tlb; struct vm_area_struct *vma; unsigned long nr_accounted = 0; + MA_STATE(mas, &mm->mm_mt, 0, 0); + int count = 0; /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); @@ -3241,7 +3134,7 @@ void exit_mmap(struct mm_struct *mm) mmap_write_lock(mm); arch_exit_mmap(mm); - vma = mm->mmap; + vma = mas_find(&mas, ULONG_MAX); if (!vma) { /* Can happen if dup_mmap() received an OOM */ mmap_write_unlock(mm); @@ -3252,22 +3145,29 @@ void exit_mmap(struct mm_struct *mm) flush_cache_mm(mm); tlb_gather_mmu_fullmm(&tlb, mm); /* update_hiwater_rss(mm) here? but nobody should be looking */ - /* Use -1 here to ensure all VMAs in the mm are unmapped */ - unmap_vmas(&tlb, vma, 0, -1); - free_pgtables(&tlb, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); + /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ + unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, + USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); - /* Walk the list again, actually closing and freeing it. */ - while (vma) { + /* + * Walk the list again, actually closing and freeing it, with preemption + * enabled, without holding any MM locks besides the unreachable + * mmap_write_lock. + */ + do { if (vma->vm_flags & VM_ACCOUNT) nr_accounted += vma_pages(vma); - vma = remove_vma(vma); + remove_vma(vma); + count++; cond_resched(); - } + } while ((vma = mas_find(&mas, ULONG_MAX)) != NULL); + + BUG_ON(count != mm->map_count); trace_exit_mmap(mm); __mt_destroy(&mm->mm_mt); - mm->mmap = NULL; mmap_write_unlock(mm); vm_unacct_memory(nr_accounted); } @@ -3306,7 +3206,7 @@ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) vma->vm_pgoff = vma->vm_start >> PAGE_SHIFT; } - if (vma_link(mm, vma, prev)) { + if (vma_link(mm, vma)) { vm_unacct_memory(charged); return -ENOMEM; } @@ -3338,7 +3238,8 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, faulted_in_anon_vma = false; } - if (range_has_overlap(mm, addr, addr + len, &prev)) + new_vma = find_vma_prev(mm, addr, &prev); + if (new_vma && new_vma->vm_start < addr + len) return NULL; /* should never get here */ new_vma = vma_merge(mm, prev, addr, addr + len, vma->vm_flags, @@ -3381,7 +3282,7 @@ struct vm_area_struct *copy_vma(struct vm_area_struct **vmap, get_file(new_vma->vm_file); if (new_vma->vm_ops && new_vma->vm_ops->open) new_vma->vm_ops->open(new_vma); - if (vma_link(mm, new_vma, prev)) + if (vma_link(mm, new_vma)) goto out_vma_link; *need_rmap_locks = false; } @@ -3686,12 +3587,13 @@ int mm_take_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); mutex_lock(&mm_all_locks_mutex); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3699,7 +3601,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->vm_file && vma->vm_file->f_mapping && @@ -3707,7 +3610,8 @@ int mm_take_all_locks(struct mm_struct *mm) vm_lock_mapping(mm, vma->vm_file->f_mapping); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_set(&mas, 0); + mas_for_each(&mas, vma, ULONG_MAX) { if (signal_pending(current)) goto out_unlock; if (vma->anon_vma) @@ -3766,11 +3670,12 @@ void mm_drop_all_locks(struct mm_struct *mm) { struct vm_area_struct *vma; struct anon_vma_chain *avc; + MA_STATE(mas, &mm->mm_mt, 0, 0); mmap_assert_write_locked(mm); BUG_ON(!mutex_is_locked(&mm_all_locks_mutex)); - for (vma = mm->mmap; vma; vma = vma->vm_next) { + mas_for_each(&mas, vma, ULONG_MAX) { if (vma->anon_vma) list_for_each_entry(avc, &vma->anon_vma_chain, same_vma) vm_unlock_anon_vma(avc->anon_vma); diff --git a/mm/nommu.c b/mm/nommu.c index 269df51e9226..214c70e1d059 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -584,17 +584,12 @@ static void setup_vma_to_mm(struct vm_area_struct *vma, struct mm_struct *mm) static void mas_add_vma_to_mm(struct ma_state *mas, struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *prev; - BUG_ON(!vma->vm_region); setup_vma_to_mm(vma, mm); - prev = mas_prev(mas, 0); - mas_reset(mas); /* add the VMA to the tree */ vma_mas_store(vma, mas); - __vma_link_list(mm, vma, prev); } /* @@ -647,7 +642,6 @@ static int delete_vma_from_mm(struct vm_area_struct *vma) /* remove from the MM's tree and list */ vma_mas_remove(vma, &mas); - __vma_unlink_list(vma->vm_mm, vma); return 0; } diff --git a/mm/util.c b/mm/util.c index 10effe256dfa..5cd3f7910f2c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -272,46 +272,6 @@ void *memdup_user_nul(const void __user *src, size_t len) } EXPORT_SYMBOL(memdup_user_nul); -void __vma_link_list(struct mm_struct *mm, struct vm_area_struct *vma, - struct vm_area_struct *prev) -{ - struct vm_area_struct *next; - - vma->vm_prev = prev; - if (prev) { - next = prev->vm_next; - prev->vm_next = vma; - } else { - next = mm->mmap; - mm->mmap = vma; - } - vma->vm_next = next; - if (next) - next->vm_prev = vma; - else - mm->highest_vm_end = vm_end_gap(vma); -} - -void __vma_unlink_list(struct mm_struct *mm, struct vm_area_struct *vma) -{ - struct vm_area_struct *prev, *next; - - next = vma->vm_next; - prev = vma->vm_prev; - if (prev) - prev->vm_next = next; - else - mm->mmap = next; - if (next) { - next->vm_prev = prev; - } else { - if (prev) - mm->highest_vm_end = vm_end_gap(prev); - else - mm->highest_vm_end = 0; - } -} - /* Check if the vma is being used as a stack by this task */ int vma_is_stack_for_current(struct vm_area_struct *vma) { From d0601a500c35856f9c134126b2423c9cfc86c701 Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:06 +0000 Subject: [PATCH 053/350] mm/mmap: drop range_has_overlap() function Since there is no longer a linked list, the range_has_overlap() function is identical to the find_vma_intersection() function. Link: https://lkml.kernel.org/r/20220906194824.2110408-70-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Acked-by: Vlastimil Babka Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 27 +-------------------------- 1 file changed, 1 insertion(+), 26 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 4441f7ed197a..5070af64b99d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -390,30 +390,6 @@ anon_vma_interval_tree_post_update_vma(struct vm_area_struct *vma) anon_vma_interval_tree_insert(avc, &avc->anon_vma->rb_root); } -/* - * range_has_overlap() - Check the @start - @end range for overlapping VMAs and - * sets up a pointer to the previous VMA - * @mm: the mm struct - * @start: the start address of the range - * @end: the end address of the range - * @pprev: the pointer to the pointer of the previous VMA - * - * Returns: True if there is an overlapping VMA, false otherwise - */ -static inline -bool range_has_overlap(struct mm_struct *mm, unsigned long start, - unsigned long end, struct vm_area_struct **pprev) -{ - struct vm_area_struct *existing; - - MA_STATE(mas, &mm->mm_mt, start, start); - rcu_read_lock(); - existing = mas_find(&mas, end - 1); - *pprev = mas_prev(&mas, 0); - rcu_read_unlock(); - return existing ? true : false; -} - static unsigned long count_vma_pages_range(struct mm_struct *mm, unsigned long addr, unsigned long end) { @@ -3178,11 +3154,10 @@ void exit_mmap(struct mm_struct *mm) */ int insert_vm_struct(struct mm_struct *mm, struct vm_area_struct *vma) { - struct vm_area_struct *prev; unsigned long charged = vma_pages(vma); - if (range_has_overlap(mm, vma->vm_start, vma->vm_end, &prev)) + if (find_vma_intersection(mm, vma->vm_start, vma->vm_end)) return -ENOMEM; if ((vma->vm_flags & VM_ACCOUNT) && From c154124fe925a451e471233aa7d1ab9a91f0a5ad Mon Sep 17 00:00:00 2001 From: "Liam R. Howlett" Date: Tue, 6 Sep 2022 19:49:06 +0000 Subject: [PATCH 054/350] mm/mmap.c: pass in mapping to __vma_link_file() __vma_link_file() resolves the mapping from the file, if there is one. Pass through the mapping and check the vm_file externally since most places already have the required information and check of vm_file. Link: https://lkml.kernel.org/r/20220906194824.2110408-71-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Tested-by: Yu Zhao Cc: Catalin Marinas Cc: David Hildenbrand Cc: David Howells Cc: Davidlohr Bueso Cc: "Matthew Wilcox (Oracle)" Cc: SeongJae Park Cc: Sven Schnelle Cc: Vlastimil Babka Cc: Will Deacon Signed-off-by: Andrew Morton --- mm/mmap.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 5070af64b99d..fbe8b52a90a3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -407,21 +407,15 @@ static unsigned long count_vma_pages_range(struct mm_struct *mm, return nr_pages; } -static void __vma_link_file(struct vm_area_struct *vma) +static void __vma_link_file(struct vm_area_struct *vma, + struct address_space *mapping) { - struct file *file; + if (vma->vm_flags & VM_SHARED) + mapping_allow_writable(mapping); - file = vma->vm_file; - if (file) { - struct address_space *mapping = file->f_mapping; - - if (vma->vm_flags & VM_SHARED) - mapping_allow_writable(mapping); - - flush_dcache_mmap_lock(mapping); - vma_interval_tree_insert(vma, &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - } + flush_dcache_mmap_lock(mapping); + vma_interval_tree_insert(vma, &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); } /* @@ -488,10 +482,11 @@ static int vma_link(struct mm_struct *mm, struct vm_area_struct *vma) } vma_mas_store(vma, &mas); - __vma_link_file(vma); - if (mapping) + if (mapping) { + __vma_link_file(vma, mapping); i_mmap_unlock_write(mapping); + } mm->map_count++; validate_mm(mm); @@ -730,14 +725,14 @@ int __vma_adjust(struct vm_area_struct *vma, unsigned long start, uprobe_munmap(next, next->vm_start, next->vm_end); i_mmap_lock_write(mapping); - if (insert) { + if (insert && insert->vm_file) { /* * Put into interval tree now, so instantiated pages * are visible to arm/parisc __flush_dcache_page * throughout; but we cannot insert into address * space until vma start or end is updated. */ - __vma_link_file(insert); + __vma_link_file(insert, insert->vm_file->f_mapping); } } From 66071896cdfe096fcd4aef55a5efbd5216fa15de Mon Sep 17 00:00:00 2001 From: Liam Howlett Date: Wed, 15 Jun 2022 17:40:58 +0000 Subject: [PATCH 055/350] mm/mlock: drop dead code in count_mm_mlocked_page_nr() The check for mm being null has never been needed since the only caller has always passed in current->mm. Remove the check from count_mm_mlocked_page_nr(). Link: https://lkml.kernel.org/r/20220615174050.738523-1-Liam.Howlett@oracle.com Signed-off-by: Liam R. Howlett Suggested-by: Lukas Bulwahn Signed-off-by: Andrew Morton --- mm/mlock.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/mlock.c b/mm/mlock.c index 43d19a1f28eb..7032f6dd0ce1 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -531,14 +531,12 @@ static unsigned long count_mm_mlocked_page_nr(struct mm_struct *mm, unsigned long end; VMA_ITERATOR(vmi, mm, start); - if (mm == NULL) - mm = current->mm; - /* Don't overflow past ULONG_MAX */ if (unlikely(ULONG_MAX - len < start)) end = ULONG_MAX; else end = start + len; + for_each_vma_range(vmi, vma, end) { if (vma->vm_flags & VM_LOCKED) { if (start > vma->vm_start) From bf3980c85212fc71512d27a46f5aab66f46ca284 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:30:59 -0700 Subject: [PATCH 056/350] mm: drop oom code from exit_mmap The primary reason to invoke the oom reaper from the exit_mmap path used to be a prevention of an excessive oom killing if the oom victim exit races with the oom reaper (see [1] for more details). The invocation has moved around since then because of the interaction with the munlock logic but the underlying reason has remained the same (see [2]). Munlock code is no longer a problem since [3] and there shouldn't be any blocking operation before the memory is unmapped by exit_mmap so the oom reaper invocation can be dropped. The unmapping part can be done with the non-exclusive mmap_sem and the exclusive one is only required when page tables are freed. Remove the oom_reaper from exit_mmap which will make the code easier to read. This is really unlikely to make any observable difference although some microbenchmarks could benefit from one less branch that needs to be evaluated even though it almost never is true. [1] 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently") [2] 27ae357fa82b ("mm, oom: fix concurrent munlock and oom reaper unmap, v3") [3] a213e5cf71cb ("mm/munlock: delete munlock_vma_pages_all(), allow oomreap") [akpm@linux-foundation.org: restore Suren's mmap_read_lock() optimization] Link: https://lkml.kernel.org/r/20220531223100.510392-1-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: David Hildenbrand Cc: David Rientjes Cc: Jann Horn Cc: Johannes Weiner Cc: John Hubbard Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Minchan Kim Cc: Oleg Nesterov Cc: Peter Xu Cc: Roman Gushchin Cc: Shakeel Butt Cc: Shuah Khan Signed-off-by: Andrew Morton --- include/linux/oom.h | 2 -- mm/mmap.c | 30 +++++++++++------------------- mm/oom_kill.c | 2 +- 3 files changed, 12 insertions(+), 22 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 02d1e7bbd8cd..6cdde62b078b 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -106,8 +106,6 @@ static inline vm_fault_t check_stable_address_space(struct mm_struct *mm) return 0; } -bool __oom_reap_task_mm(struct mm_struct *mm); - long oom_badness(struct task_struct *p, unsigned long totalpages); diff --git a/mm/mmap.c b/mm/mmap.c index fbe8b52a90a3..be111bbe8075 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3085,30 +3085,13 @@ void exit_mmap(struct mm_struct *mm) /* mm's last user has gone, and its about to be pulled down */ mmu_notifier_release(mm); - if (unlikely(mm_is_oom_victim(mm))) { - /* - * Manually reap the mm to free as much memory as possible. - * Then, as the oom reaper does, set MMF_OOM_SKIP to disregard - * this mm from further consideration. Taking mm->mmap_lock for - * write after setting MMF_OOM_SKIP will guarantee that the oom - * reaper will not run on this mm again after mmap_lock is - * dropped. - * - * Nothing can be holding mm->mmap_lock here and the above call - * to mmu_notifier_release(mm) ensures mmu notifier callbacks in - * __oom_reap_task_mm() will not block. - */ - (void)__oom_reap_task_mm(mm); - set_bit(MMF_OOM_SKIP, &mm->flags); - } - - mmap_write_lock(mm); + mmap_read_lock(mm); arch_exit_mmap(mm); vma = mas_find(&mas, ULONG_MAX); if (!vma) { /* Can happen if dup_mmap() received an OOM */ - mmap_write_unlock(mm); + mmap_read_unlock(mm); return; } @@ -3118,6 +3101,15 @@ void exit_mmap(struct mm_struct *mm) /* update_hiwater_rss(mm) here? but nobody should be looking */ /* Use ULONG_MAX here to ensure all VMAs in the mm are unmapped */ unmap_vmas(&tlb, &mm->mm_mt, vma, 0, ULONG_MAX); + mmap_read_unlock(mm); + + /* + * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper + * because the memory has been already freed. Do not bother checking + * mm_is_oom_victim because setting a bit unconditionally is cheaper. + */ + set_bit(MMF_OOM_SKIP, &mm->flags); + mmap_write_lock(mm); free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS, USER_PGTABLES_CEILING); tlb_finish_mmu(&tlb); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3996301450e8..decb21474c6c 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -509,7 +509,7 @@ static DECLARE_WAIT_QUEUE_HEAD(oom_reaper_wait); static struct task_struct *oom_reaper_list; static DEFINE_SPINLOCK(oom_reaper_lock); -bool __oom_reap_task_mm(struct mm_struct *mm) +static bool __oom_reap_task_mm(struct mm_struct *mm) { struct vm_area_struct *vma; bool ret = true; From b3541d912a84dc40cabb516f2deeac9ae6fa30da Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 31 May 2022 15:31:00 -0700 Subject: [PATCH 057/350] mm: delete unused MMF_OOM_VICTIM flag With the last usage of MMF_OOM_VICTIM in exit_mmap gone, this flag is now unused and can be removed. [akpm@linux-foundation.org: remove comment about now-removed mm_is_oom_victim()] Link: https://lkml.kernel.org/r/20220531223100.510392-2-surenb@google.com Signed-off-by: Suren Baghdasaryan Acked-by: Michal Hocko Cc: David Rientjes Cc: Matthew Wilcox Cc: Johannes Weiner Cc: Roman Gushchin Cc: Minchan Kim Cc: "Kirill A . Shutemov" Cc: Andrea Arcangeli Cc: Christian Brauner (Microsoft) Cc: Christoph Hellwig Cc: Oleg Nesterov Cc: David Hildenbrand Cc: Jann Horn Cc: Shakeel Butt Cc: Peter Xu Cc: John Hubbard Cc: Shuah Khan Cc: Liam Howlett Signed-off-by: Andrew Morton --- include/linux/oom.h | 9 --------- include/linux/sched/coredump.h | 7 +++---- mm/mmap.c | 3 +-- mm/oom_kill.c | 4 +--- 4 files changed, 5 insertions(+), 18 deletions(-) diff --git a/include/linux/oom.h b/include/linux/oom.h index 6cdde62b078b..7d0c9c48a0c5 100644 --- a/include/linux/oom.h +++ b/include/linux/oom.h @@ -77,15 +77,6 @@ static inline bool tsk_is_oom_victim(struct task_struct * tsk) return tsk->signal->oom_mm; } -/* - * Use this helper if tsk->mm != mm and the victim mm needs a special - * handling. This is guaranteed to stay true after once set. - */ -static inline bool mm_is_oom_victim(struct mm_struct *mm) -{ - return test_bit(MMF_OOM_VICTIM, &mm->flags); -} - /* * Checks whether a page fault on the given mm is still reliable. * This is no longer true if the oom reaper started to reap the diff --git a/include/linux/sched/coredump.h b/include/linux/sched/coredump.h index 4d0a5be28b70..8270ad7ae14c 100644 --- a/include/linux/sched/coredump.h +++ b/include/linux/sched/coredump.h @@ -71,9 +71,8 @@ static inline int get_dumpable(struct mm_struct *mm) #define MMF_UNSTABLE 22 /* mm is unstable for copy_from_user */ #define MMF_HUGE_ZERO_PAGE 23 /* mm has ever used the global huge zero page */ #define MMF_DISABLE_THP 24 /* disable THP for all VMAs */ -#define MMF_OOM_VICTIM 25 /* mm is the oom victim */ -#define MMF_OOM_REAP_QUEUED 26 /* mm was queued for oom_reaper */ -#define MMF_MULTIPROCESS 27 /* mm is shared between processes */ +#define MMF_OOM_REAP_QUEUED 25 /* mm was queued for oom_reaper */ +#define MMF_MULTIPROCESS 26 /* mm is shared between processes */ /* * MMF_HAS_PINNED: Whether this mm has pinned any pages. This can be either * replaced in the future by mm.pinned_vm when it becomes stable, or grow into @@ -81,7 +80,7 @@ static inline int get_dumpable(struct mm_struct *mm) * pinned pages were unpinned later on, we'll still keep this bit set for the * lifecycle of this mm, just for simplicity. */ -#define MMF_HAS_PINNED 28 /* FOLL_PIN has run, never cleared */ +#define MMF_HAS_PINNED 27 /* FOLL_PIN has run, never cleared */ #define MMF_DISABLE_THP_MASK (1 << MMF_DISABLE_THP) #define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK |\ diff --git a/mm/mmap.c b/mm/mmap.c index be111bbe8075..2a62d589d3c2 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3105,8 +3105,7 @@ void exit_mmap(struct mm_struct *mm) /* * Set MMF_OOM_SKIP to hide this task from the oom killer/reaper - * because the memory has been already freed. Do not bother checking - * mm_is_oom_victim because setting a bit unconditionally is cheaper. + * because the memory has been already freed. */ set_bit(MMF_OOM_SKIP, &mm->flags); mmap_write_lock(mm); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index decb21474c6c..35ec75cdfee2 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -765,10 +765,8 @@ static void mark_oom_victim(struct task_struct *tsk) return; /* oom_mm is bound to the signal struct life time. */ - if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) { + if (!cmpxchg(&tsk->signal->oom_mm, NULL, mm)) mmgrab(tsk->signal->oom_mm); - set_bit(MMF_OOM_VICTIM, &mm->flags); - } /* * Make sure that the task is woken up from uninterruptible sleep From eef199440df950942b3c7ef2e2de507fd6ced031 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Mat=C4=9Bna?= Date: Fri, 3 Jun 2022 16:57:18 +0200 Subject: [PATCH 058/350] mm: refactor of vma_merge() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Refactor of vma_merge and new merge call", v4. I am currently working on my master's thesis trying to increase number of merges of VMAs currently failing because of page offset incompatibility and difference in their anon_vmas. The following refactor and added merge call included in this series is just two smaller upgrades I created along the way. This patch (of 2): Refactor vma_merge() to make it shorter and more understandable. Main change is the elimination of code duplicity in the case of merge next check. This is done by first doing checks and caching the results before executing the merge itself. The variable 'area' is divided into 'mid' and 'res' as previously it was used for two purposes, as the middle VMA between prev and next and also as the result of the merge itself. Exit paths are also unified. Link: https://lkml.kernel.org/r/20220603145719.1012094-1-matenajakub@gmail.com Link: https://lkml.kernel.org/r/20220603145719.1012094-2-matenajakub@gmail.com Signed-off-by: Jakub MatÄ›na Reviewed-by: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Matthew Wilcox Cc: Liam Howlett Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Rik van Riel Cc: Steven Rostedt Cc: Peter Zijlstra (Intel) Signed-off-by: Andrew Morton --- mm/mmap.c | 87 +++++++++++++++++++++++-------------------------------- 1 file changed, 37 insertions(+), 50 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index 2a62d589d3c2..6e447544f07d 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1005,8 +1005,10 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, struct anon_vma_name *anon_name) { pgoff_t pglen = (end - addr) >> PAGE_SHIFT; - struct vm_area_struct *area, *next; - int err; + struct vm_area_struct *mid, *next, *res; + int err = -1; + bool merge_prev = false; + bool merge_next = false; /* * We later require that vma->vm_flags == vm_flags, @@ -1016,75 +1018,60 @@ struct vm_area_struct *vma_merge(struct mm_struct *mm, return NULL; next = find_vma(mm, prev ? prev->vm_end : 0); - area = next; - if (area && area->vm_end == end) /* cases 6, 7, 8 */ + mid = next; + if (next && next->vm_end == end) /* cases 6, 7, 8 */ next = find_vma(mm, next->vm_end); /* verify some invariant that must be enforced by the caller */ VM_WARN_ON(prev && addr <= prev->vm_start); - VM_WARN_ON(area && end > area->vm_end); + VM_WARN_ON(mid && end > mid->vm_end); VM_WARN_ON(addr >= end); - /* - * Can it merge with the predecessor? - */ + /* Can we merge the predecessor? */ if (prev && prev->vm_end == addr && mpol_equal(vma_policy(prev), policy) && can_vma_merge_after(prev, vm_flags, anon_vma, file, pgoff, vm_userfaultfd_ctx, anon_name)) { - /* - * OK, it can. Can we now merge in the successor as well? - */ - if (next && end == next->vm_start && - mpol_equal(policy, vma_policy(next)) && - can_vma_merge_before(next, vm_flags, - anon_vma, file, - pgoff+pglen, - vm_userfaultfd_ctx, anon_name) && - is_mergeable_anon_vma(prev->anon_vma, - next->anon_vma, NULL)) { - /* cases 1, 6 */ - err = __vma_adjust(prev, prev->vm_start, - next->vm_end, prev->vm_pgoff, NULL, - prev); - } else /* cases 2, 5, 7 */ - err = __vma_adjust(prev, prev->vm_start, - end, prev->vm_pgoff, NULL, prev); - if (err) - return NULL; - khugepaged_enter_vma(prev, vm_flags); - return prev; + merge_prev = true; } - - /* - * Can this new request be merged in front of next? - */ + /* Can we merge the successor? */ if (next && end == next->vm_start && mpol_equal(policy, vma_policy(next)) && can_vma_merge_before(next, vm_flags, anon_vma, file, pgoff+pglen, vm_userfaultfd_ctx, anon_name)) { + merge_next = true; + } + /* Can we merge both the predecessor and the successor? */ + if (merge_prev && merge_next && + is_mergeable_anon_vma(prev->anon_vma, + next->anon_vma, NULL)) { /* cases 1, 6 */ + err = __vma_adjust(prev, prev->vm_start, + next->vm_end, prev->vm_pgoff, NULL, + prev); + res = prev; + } else if (merge_prev) { /* cases 2, 5, 7 */ + err = __vma_adjust(prev, prev->vm_start, + end, prev->vm_pgoff, NULL, prev); + res = prev; + } else if (merge_next) { if (prev && addr < prev->vm_end) /* case 4 */ err = __vma_adjust(prev, prev->vm_start, - addr, prev->vm_pgoff, NULL, next); - else { /* cases 3, 8 */ - err = __vma_adjust(area, addr, next->vm_end, - next->vm_pgoff - pglen, NULL, next); - /* - * In case 3 area is already equal to next and - * this is a noop, but in case 8 "area" has - * been removed and next was expanded over it. - */ - area = next; - } - if (err) - return NULL; - khugepaged_enter_vma(area, vm_flags); - return area; + addr, prev->vm_pgoff, NULL, next); + else /* cases 3, 8 */ + err = __vma_adjust(mid, addr, next->vm_end, + next->vm_pgoff - pglen, NULL, next); + res = next; } - return NULL; + /* + * Cannot merge with predecessor or successor or error in __vma_adjust? + */ + if (err) + return NULL; + khugepaged_enter_vma(res, vm_flags); + return res; } /* From ca3d76b0aa808a06997297d123b66d17b81e5285 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Mat=C4=9Bna?= Date: Fri, 3 Jun 2022 16:57:19 +0200 Subject: [PATCH 059/350] mm: add merging after mremap resize MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When mremap call results in expansion, it might be possible to merge the VMA with the next VMA which might become adjacent. This patch adds vma_merge call after the expansion is done to try and merge. [akpm@linux-foundation.org: coding-style cleanups] Link: https://lkml.kernel.org/r/20220603145719.1012094-3-matenajakub@gmail.com Signed-off-by: Jakub MatÄ›na Reviewed-by: Vlastimil Babka Cc: Hugh Dickins Cc: "Kirill A . Shutemov" Cc: Liam Howlett Cc: Matthew Wilcox Cc: Mel Gorman Cc: Michal Hocko Cc: Peter Zijlstra (Intel) Cc: Rik van Riel Cc: Steven Rostedt Signed-off-by: Andrew Morton --- mm/mremap.c | 19 ++++++++- tools/testing/selftests/vm/mremap_test.c | 49 +++++++++++++++++++++++- 2 files changed, 65 insertions(+), 3 deletions(-) diff --git a/mm/mremap.c b/mm/mremap.c index 8644ff278f02..e465ffe279bb 100644 --- a/mm/mremap.c +++ b/mm/mremap.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -23,6 +24,7 @@ #include #include #include +#include #include #include @@ -1012,6 +1014,9 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, /* can we just expand the current mapping? */ if (vma_expandable(vma, new_len - old_len)) { long pages = (new_len - old_len) >> PAGE_SHIFT; + unsigned long extension_start = addr + old_len; + unsigned long extension_end = addr + new_len; + pgoff_t extension_pgoff = vma->vm_pgoff + (old_len >> PAGE_SHIFT); if (vma->vm_flags & VM_ACCOUNT) { if (security_vm_enough_memory_mm(mm, pages)) { @@ -1020,8 +1025,18 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, } } - if (vma_adjust(vma, vma->vm_start, addr + new_len, - vma->vm_pgoff, NULL)) { + /* + * Function vma_merge() is called on the extension we are adding to + * the already existing vma, vma_merge() will merge this extension with + * the already existing vma (expand operation itself) and possibly also + * with the next vma if it becomes adjacent to the expanded vma and + * otherwise compatible. + */ + vma = vma_merge(mm, vma, extension_start, extension_end, + vma->vm_flags, vma->anon_vma, vma->vm_file, + extension_pgoff, vma_policy(vma), + vma->vm_userfaultfd_ctx, anon_vma_name(vma)); + if (!vma) { vm_unacct_memory(pages); ret = -ENOMEM; goto out; diff --git a/tools/testing/selftests/vm/mremap_test.c b/tools/testing/selftests/vm/mremap_test.c index db0270127aeb..9496346973d4 100644 --- a/tools/testing/selftests/vm/mremap_test.c +++ b/tools/testing/selftests/vm/mremap_test.c @@ -118,6 +118,50 @@ static unsigned long long get_mmap_min_addr(void) return addr; } +/* + * This test validates that merge is called when expanding a mapping. + * Mapping containing three pages is created, middle page is unmapped + * and then the mapping containing the first page is expanded so that + * it fills the created hole. The two parts should merge creating + * single mapping with three pages. + */ +static void mremap_expand_merge(unsigned long page_size) +{ + char *test_name = "mremap expand merge"; + FILE *fp; + char *line = NULL; + size_t len = 0; + bool success = false; + char *start = mmap(NULL, 3 * page_size, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + + munmap(start + page_size, page_size); + mremap(start, page_size, 2 * page_size, 0); + + fp = fopen("/proc/self/maps", "r"); + if (fp == NULL) { + ksft_test_result_fail("%s\n", test_name); + return; + } + + while (getline(&line, &len, fp) != -1) { + char *first = strtok(line, "- "); + void *first_val = (void *)strtol(first, NULL, 16); + char *second = strtok(NULL, "- "); + void *second_val = (void *) strtol(second, NULL, 16); + + if (first_val == start && second_val == start + 3 * page_size) { + success = true; + break; + } + } + if (success) + ksft_test_result_pass("%s\n", test_name); + else + ksft_test_result_fail("%s\n", test_name); + fclose(fp); +} + /* * Returns the start address of the mapping on success, else returns * NULL on failure. @@ -336,6 +380,7 @@ int main(int argc, char **argv) int i, run_perf_tests; unsigned int threshold_mb = VALIDATION_DEFAULT_THRESHOLD; unsigned int pattern_seed; + int num_expand_tests = 1; struct test test_cases[MAX_TEST]; struct test perf_test_cases[MAX_PERF_TEST]; int page_size; @@ -407,12 +452,14 @@ int main(int argc, char **argv) (threshold_mb * _1MB >= _1GB); ksft_set_plan(ARRAY_SIZE(test_cases) + (run_perf_tests ? - ARRAY_SIZE(perf_test_cases) : 0)); + ARRAY_SIZE(perf_test_cases) : 0) + num_expand_tests); for (i = 0; i < ARRAY_SIZE(test_cases); i++) run_mremap_test_case(test_cases[i], &failures, threshold_mb, pattern_seed); + mremap_expand_merge(page_size); + if (run_perf_tests) { ksft_print_msg("\n%s\n", "mremap HAVE_MOVE_PMD/PUD optimization time comparison for 1GB region:"); From f7091ed64ec8311b0c35865875f8c3e04e5ea532 Mon Sep 17 00:00:00 2001 From: Haiyue Wang Date: Tue, 23 Aug 2022 21:58:41 +0800 Subject: [PATCH 060/350] mm: fix the handling Non-LRU pages returned by follow_page The handling Non-LRU pages returned by follow_page() jumps directly, it doesn't call put_page() to handle the reference count, since 'FOLL_GET' flag for follow_page() has get_page() called. Fix the zone device page check by handling the page reference count correctly before returning. And as David reviewed, "device pages are never PageKsm pages". Drop this zone device page check for break_ksm(). Since the zone device page can't be a transparent huge page, so drop the redundant zone device page check for split_huge_pages_pid(). (by Miaohe) Link: https://lkml.kernel.org/r/20220823135841.934465-3-haiyue.wang@intel.com Fixes: 3218f8712d6b ("mm: handling Non-LRU pages returned by vm_normal_pages") Signed-off-by: Haiyue Wang Reviewed-by: "Huang, Ying" Reviewed-by: Felix Kuehling Reviewed-by: Alistair Popple Reviewed-by: Miaohe Lin Acked-by: David Hildenbrand Cc: Alex Sierra Cc: Gerald Schaefer Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/huge_memory.c | 2 +- mm/ksm.c | 12 +++++++++--- mm/migrate.c | 19 ++++++++++++------- 3 files changed, 22 insertions(+), 11 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 63b4d8ff4b55..135acf87d24d 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -3001,7 +3001,7 @@ static int split_huge_pages_pid(int pid, unsigned long vaddr_start, /* FOLL_DUMP to ignore special (like zero) pages */ page = follow_page(vma, addr, FOLL_GET | FOLL_DUMP); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) continue; if (!is_transparent_hugepage(page)) diff --git a/mm/ksm.c b/mm/ksm.c index 533ede86b4b9..1fafd531f669 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -475,7 +475,7 @@ static int break_ksm(struct vm_area_struct *vma, unsigned long addr) cond_resched(); page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) break; if (PageKsm(page)) ret = handle_mm_fault(vma, addr, @@ -560,12 +560,15 @@ static struct page *get_mergeable_page(struct rmap_item *rmap_item) goto out; page = follow_page(vma, addr, FOLL_GET); - if (IS_ERR_OR_NULL(page) || is_zone_device_page(page)) + if (IS_ERR_OR_NULL(page)) goto out; + if (is_zone_device_page(page)) + goto out_putpage; if (PageAnon(page)) { flush_anon_page(vma, page, addr); flush_dcache_page(page); } else { +out_putpage: put_page(page); out: page = NULL; @@ -2322,11 +2325,13 @@ next_mm: if (ksm_test_exit(mm)) break; *page = follow_page(vma, ksm_scan.address, FOLL_GET); - if (IS_ERR_OR_NULL(*page) || is_zone_device_page(*page)) { + if (IS_ERR_OR_NULL(*page)) { ksm_scan.address += PAGE_SIZE; cond_resched(); continue; } + if (is_zone_device_page(*page)) + goto next_page; if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); @@ -2341,6 +2346,7 @@ next_mm: mmap_read_unlock(mm); return rmap_item; } +next_page: put_page(*page); ksm_scan.address += PAGE_SIZE; cond_resched(); diff --git a/mm/migrate.c b/mm/migrate.c index d74573c36573..eb594b0db806 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1691,9 +1691,12 @@ static int add_page_for_migration(struct mm_struct *mm, unsigned long addr, goto out; err = -ENOENT; - if (!page || is_zone_device_page(page)) + if (!page) goto out; + if (is_zone_device_page(page)) + goto out_putpage; + err = 0; if (page_to_nid(page) == node) goto out_putpage; @@ -1891,13 +1894,15 @@ static void do_pages_stat_array(struct mm_struct *mm, unsigned long nr_pages, if (IS_ERR(page)) goto set_status; - if (page && !is_zone_device_page(page)) { + err = -ENOENT; + if (!page) + goto set_status; + + if (!is_zone_device_page(page)) err = page_to_nid(page); - if (foll_flags & FOLL_GET) - put_page(page); - } else { - err = -ENOENT; - } + + if (foll_flags & FOLL_GET) + put_page(page); set_status: *status = err; From 474098edac262ae26bfab1c48445877075a31cbd Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:57 +0200 Subject: [PATCH 061/350] mm/gup: replace FOLL_NUMA by gup_can_follow_protnone() Patch series "mm: minor cleanups around NUMA hinting". Working on some GUP cleanups (e.g., getting rid of some FOLL_ flags) and preparing for other GUP changes (getting rid of FOLL_FORCE|FOLL_WRITE for for taking a R/O longterm pin), this is something I can easily send out independently. Get rid of FOLL_NUMA, allow FOLL_FORCE access to PROT_NONE mapped pages in GUP-fast, and fixup some documentation around NUMA hinting. This patch (of 3): No need for a special flag that is not even properly documented to be internal-only. Let's just factor this check out and get rid of this flag. The separate function has the nice benefit that we can centralize comments. Link: https://lkml.kernel.org/r/20220825164659.89824-2-david@redhat.com Link: https://lkml.kernel.org/r/20220825164659.89824-1-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm.h | 16 +++++++++++++++- mm/gup.c | 12 ++---------- mm/huge_memory.c | 2 +- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 37384a84f71a..eb25cae06c55 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2933,7 +2933,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, * and return without waiting upon it */ #define FOLL_NOFAULT 0x80 /* do not fault in pages */ #define FOLL_HWPOISON 0x100 /* check page is hwpoisoned */ -#define FOLL_NUMA 0x200 /* force NUMA hinting page fault */ #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ @@ -3054,6 +3053,21 @@ static inline bool gup_must_unshare(unsigned int flags, struct page *page) return !PageAnonExclusive(page); } +/* + * Indicates whether GUP can follow a PROT_NONE mapped page, or whether + * a (NUMA hinting) fault is required. + */ +static inline bool gup_can_follow_protnone(unsigned int flags) +{ + /* + * FOLL_FORCE has to be able to make progress even if the VMA is + * inaccessible. Further, FOLL_FORCE access usually does not represent + * application behaviour and we should avoid triggering NUMA hinting + * faults. + */ + return flags & FOLL_FORCE; +} + typedef int (*pte_fn_t)(pte_t *pte, unsigned long addr, void *data); extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); diff --git a/mm/gup.c b/mm/gup.c index 6e49fe5da513..f06770e03549 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -561,7 +561,7 @@ retry: migration_entry_wait(mm, pmd, address); goto retry; } - if ((flags & FOLL_NUMA) && pte_protnone(pte)) + if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto no_page; page = vm_normal_page(vma, address, pte); @@ -714,7 +714,7 @@ retry: if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags, &ctx->pgmap); - if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) + if (pmd_protnone(pmdval) && !gup_can_follow_protnone(flags)) return no_page_table(vma, flags); retry_locked: @@ -1160,14 +1160,6 @@ static long __get_user_pages(struct mm_struct *mm, VM_BUG_ON(!!pages != !!(gup_flags & (FOLL_GET | FOLL_PIN))); - /* - * If FOLL_FORCE is set then do not force a full fault as the hinting - * fault information is unrelated to the reference behaviour of a task - * using the address space - */ - if (!(gup_flags & FOLL_FORCE)) - gup_flags |= FOLL_NUMA; - do { struct page *page; unsigned int foll_flags = gup_flags; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 135acf87d24d..84bf1d5f6b7e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1447,7 +1447,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, return ERR_PTR(-EFAULT); /* Full NUMA hinting faults to serialise migration in fault paths */ - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if (pmd_protnone(*pmd) && !gup_can_follow_protnone(flags)) return NULL; if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) From 0cf459866a91c741eb14a92f3633638d1dd9db59 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:58 +0200 Subject: [PATCH 062/350] mm/gup: use gup_can_follow_protnone() also in GUP-fast There seems to be no reason why FOLL_FORCE during GUP-fast would have to fallback to the slow path when stumbling over a PROT_NONE mapped page. We only have to trigger hinting faults in case FOLL_FORCE is not set, and any kind of fault handling naturally happens from the slow path -- where NUMA hinting accounting/handling would be performed. Note that the comment regarding THP migration is outdated: commit 2b4847e73004 ("mm: numa: serialise parallel get_user_page against THP migration") described that this was required for THP due to lack of PMD migration entries. Nowadays, we do have proper PMD migration entries in place -- see set_pmd_migration_entry(), which does a proper pmdp_invalidate() when placing the migration entry. So let's just reuse gup_can_follow_protnone() here to make it consistent and drop the somewhat outdated comments. Link: https://lkml.kernel.org/r/20220825164659.89824-3-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/gup.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index f06770e03549..ce00a4c40da8 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -2420,11 +2420,7 @@ static int gup_pte_range(pmd_t pmd, pmd_t *pmdp, unsigned long addr, struct page *page; struct folio *folio; - /* - * Similar to the PMD case below, NUMA hinting must take slow - * path using the pte_protnone check. - */ - if (pte_protnone(pte)) + if (pte_protnone(pte) && !gup_can_follow_protnone(flags)) goto pte_unmap; if (!pte_access_permitted(pte, flags & FOLL_WRITE)) @@ -2808,12 +2804,8 @@ static int gup_pmd_range(pud_t *pudp, pud_t pud, unsigned long addr, unsigned lo if (unlikely(pmd_trans_huge(pmd) || pmd_huge(pmd) || pmd_devmap(pmd))) { - /* - * NUMA hinting faults need to be handled in the GUP - * slowpath for accounting purposes and so that they - * can be serialised against THP migration. - */ - if (pmd_protnone(pmd)) + if (pmd_protnone(pmd) && + !gup_can_follow_protnone(flags)) return 0; if (!gup_huge_pmd(pmd, pmdp, addr, next, flags, From 7014887a01587d8c50871d5985cd572ca08b29c0 Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 25 Aug 2022 18:46:59 +0200 Subject: [PATCH 063/350] mm: fixup documentation regarding pte_numa() and PROT_NUMA pte_numa() no longer exists -- replaced by pte_protnone() -- and PROT_NUMA probably never existed: MM_CP_PROT_NUMA also ends up using PROT_NONE. Let's fixup the doc. Link: https://lkml.kernel.org/r/20220825164659.89824-4-david@redhat.com Signed-off-by: David Hildenbrand Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Jason Gunthorpe Cc: John Hubbard Cc: Matthew Wilcox Cc: Mel Gorman Cc: Peter Xu Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5e32211cb5a9..26573ba485f3 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -614,22 +614,22 @@ struct mm_struct { #endif #ifdef CONFIG_NUMA_BALANCING /* - * numa_next_scan is the next time that the PTEs will be marked - * pte_numa. NUMA hinting faults will gather statistics and - * migrate pages to new nodes if necessary. + * numa_next_scan is the next time that PTEs will be remapped + * PROT_NONE to trigger NUMA hinting faults; such faults gather + * statistics and migrate pages to new nodes if necessary. */ unsigned long numa_next_scan; - /* Restart point for scanning and setting pte_numa */ + /* Restart point for scanning and remapping PTEs. */ unsigned long numa_scan_offset; - /* numa_scan_seq prevents two threads setting pte_numa */ + /* numa_scan_seq prevents two threads remapping PTEs. */ int numa_scan_seq; #endif /* * An operation with batched TLB flushing is going on. Anything * that can move process memory needs to flush the TLB when - * moving a PROT_NONE or PROT_NUMA mapped page. + * moving a PROT_NONE mapped page. */ atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH From 974f4367dd315acc15ad4a6453f8304aea60dfbd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Tue, 23 Aug 2022 11:22:30 +0200 Subject: [PATCH 064/350] mm: reduce noise in show_mem for lowmem allocations While discussing early DMA pool pre-allocation failure with Christoph [1] I have realized that the allocation failure warning is rather noisy for constrained allocations like GFP_DMA{32}. Those zones are usually not populated on all nodes very often as their memory ranges are constrained. This is an attempt to reduce the ballast that doesn't provide any relevant information for those allocation failures investigation. Please note that I have only compile tested it (in my default config setup) and I am throwing it mostly to see what people think about it. [1] http://lkml.kernel.org/r/20220817060647.1032426-1-hch@lst.de [mhocko@suse.com: update] Link: https://lkml.kernel.org/r/Yw29bmJTIkKogTiW@dhcp22.suse.cz [mhocko@suse.com: fix build] [akpm@linux-foundation.org: fix it for mapletree] [akpm@linux-foundation.org: update it for Michal's update] [mhocko@suse.com: fix arch/powerpc/xmon/xmon.c] Link: https://lkml.kernel.org/r/Ywh3C4dKB9B93jIy@dhcp22.suse.cz [akpm@linux-foundation.org: fix arch/sparc/kernel/setup_32.c] Link: https://lkml.kernel.org/r/YwScVmVofIZkopkF@dhcp22.suse.cz Signed-off-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Cc: Christoph Hellwig Cc: Mel Gorman Cc: Dan Carpenter Signed-off-by: Andrew Morton --- include/linux/mm.h | 13 +++++++++++-- lib/show_mem.c | 4 ++-- mm/oom_kill.c | 2 +- mm/page_alloc.c | 21 +++++++++++++++++++-- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index eb25cae06c55..e56dd8f7eae1 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1838,7 +1838,11 @@ extern void pagefault_out_of_memory(void); */ #define SHOW_MEM_FILTER_NODES (0x0001u) /* disallowed nodes */ -extern void show_free_areas(unsigned int flags, nodemask_t *nodemask); +extern void __show_free_areas(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static void __maybe_unused show_free_areas(unsigned int flags, nodemask_t *nodemask) +{ + __show_free_areas(flags, nodemask, MAX_NR_ZONES - 1); +} #ifdef CONFIG_MMU extern bool can_do_mlock(void); @@ -2578,7 +2582,12 @@ extern void calculate_min_free_kbytes(void); extern int __meminit init_per_zone_wmark_min(void); extern void mem_init(void); extern void __init mmap_init(void); -extern void show_mem(unsigned int flags, nodemask_t *nodemask); + +extern void __show_mem(unsigned int flags, nodemask_t *nodemask, int max_zone_idx); +static inline void show_mem(unsigned int flags, nodemask_t *nodemask) +{ + __show_mem(flags, nodemask, MAX_NR_ZONES - 1); +} extern long si_mem_available(void); extern void si_meminfo(struct sysinfo * val); extern void si_meminfo_node(struct sysinfo *val, int nid); diff --git a/lib/show_mem.c b/lib/show_mem.c index 1c26c14ffbb9..0d7585cde2a6 100644 --- a/lib/show_mem.c +++ b/lib/show_mem.c @@ -8,13 +8,13 @@ #include #include -void show_mem(unsigned int filter, nodemask_t *nodemask) +void __show_mem(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { pg_data_t *pgdat; unsigned long total = 0, reserved = 0, highmem = 0; printk("Mem-Info:\n"); - show_free_areas(filter, nodemask); + __show_free_areas(filter, nodemask, max_zone_idx); for_each_online_pgdat(pgdat) { int zoneid; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 35ec75cdfee2..1276e49b31b0 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -461,7 +461,7 @@ static void dump_header(struct oom_control *oc, struct task_struct *p) if (is_memcg_oom(oc)) mem_cgroup_print_oom_meminfo(oc->memcg); else { - show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask); + __show_mem(SHOW_MEM_FILTER_NODES, oc->nodemask, gfp_zone(oc->gfp_mask)); if (should_dump_unreclaim_slab()) dump_unreclaimable_slab(); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 262896bd1a90..44f3c9364316 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4322,7 +4322,7 @@ static void warn_alloc_show_mem(gfp_t gfp_mask, nodemask_t *nodemask) if (!in_task() || !(gfp_mask & __GFP_DIRECT_RECLAIM)) filter &= ~SHOW_MEM_FILTER_NODES; - show_mem(filter, nodemask); + __show_mem(filter, nodemask, gfp_zone(gfp_mask)); } void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) @@ -6050,6 +6050,15 @@ static void show_migration_types(unsigned char type) printk(KERN_CONT "(%s) ", tmp); } +static bool node_has_managed_zones(pg_data_t *pgdat, int max_zone_idx) +{ + int zone_idx; + for (zone_idx = 0; zone_idx <= max_zone_idx; zone_idx++) + if (zone_managed_pages(pgdat->node_zones + zone_idx)) + return true; + return false; +} + /* * Show free area list (used inside shift_scroll-lock stuff) * We also calculate the percentage fragmentation. We do this by counting the @@ -6059,7 +6068,7 @@ static void show_migration_types(unsigned char type) * SHOW_MEM_FILTER_NODES: suppress nodes that are not allowed by current's * cpuset. */ -void show_free_areas(unsigned int filter, nodemask_t *nodemask) +void __show_free_areas(unsigned int filter, nodemask_t *nodemask, int max_zone_idx) { unsigned long free_pcp = 0; int cpu, nid; @@ -6067,6 +6076,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) pg_data_t *pgdat; for_each_populated_zone(zone) { + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; @@ -6104,6 +6115,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) for_each_online_pgdat(pgdat) { if (show_mem_node_skip(filter, pgdat->node_id, nodemask)) continue; + if (!node_has_managed_zones(pgdat, max_zone_idx)) + continue; printk("Node %d" " active_anon:%lukB" @@ -6160,6 +6173,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) for_each_populated_zone(zone) { int i; + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; @@ -6221,6 +6236,8 @@ void show_free_areas(unsigned int filter, nodemask_t *nodemask) unsigned long nr[MAX_ORDER], flags, total = 0; unsigned char types[MAX_ORDER]; + if (zone_idx(zone) > max_zone_idx) + continue; if (show_mem_node_skip(filter, zone_to_nid(zone), nodemask)) continue; show_node(zone); From e6ad640bc404eb298dd1880113131768ddf5c6a8 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Fri, 26 Aug 2022 23:06:42 +0000 Subject: [PATCH 065/350] mm: deduplicate cacheline padding code There are three users (mmzone.h, memcontrol.h, page_counter.h) using similar code for forcing cacheline padding between fields of different structures. Dedup that code. Link: https://lkml.kernel.org/r/20220826230642.566725-1-shakeelb@google.com Signed-off-by: Shakeel Butt Suggested-by: Feng Tang Reviewed-by: Feng Tang Acked-by: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/cache.h | 13 +++++++++++++ include/linux/memcontrol.h | 13 ++----------- include/linux/mmzone.h | 24 +++++------------------- include/linux/page_counter.h | 13 ++----------- 4 files changed, 22 insertions(+), 41 deletions(-) diff --git a/include/linux/cache.h b/include/linux/cache.h index d742c57eaee5..5da1bbd96154 100644 --- a/include/linux/cache.h +++ b/include/linux/cache.h @@ -85,4 +85,17 @@ #define cache_line_size() L1_CACHE_BYTES #endif +/* + * Helper to add padding within a struct to ensure data fall into separate + * cachelines. + */ +#if defined(CONFIG_SMP) +struct cacheline_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define CACHELINE_PADDING(name) struct cacheline_padding name +#else +#define CACHELINE_PADDING(name) +#endif + #endif /* __LINUX_CACHE_H */ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 344022f102c2..60545e4a1c03 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -185,15 +185,6 @@ struct mem_cgroup_thresholds { struct mem_cgroup_threshold_ary *spare; }; -#if defined(CONFIG_SMP) -struct memcg_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define MEMCG_PADDING(name) struct memcg_padding name -#else -#define MEMCG_PADDING(name) -#endif - /* * Remember four most recent foreign writebacks with dirty pages in this * cgroup. Inode sharing is expected to be uncommon and, even if we miss @@ -304,7 +295,7 @@ struct mem_cgroup { spinlock_t move_lock; unsigned long move_lock_flags; - MEMCG_PADDING(_pad1_); + CACHELINE_PADDING(_pad1_); /* memory.stat */ struct memcg_vmstats vmstats; @@ -326,7 +317,7 @@ struct mem_cgroup { struct list_head objcg_list; #endif - MEMCG_PADDING(_pad2_); + CACHELINE_PADDING(_pad2_); /* * set > 0 if pages under this cgroup are moving to other cgroup. diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index e335a492c2eb..c69c08156822 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -121,20 +121,6 @@ static inline bool free_area_empty(struct free_area *area, int migratetype) struct pglist_data; -/* - * Add a wild amount of padding here to ensure data fall into separate - * cachelines. There are very few zone structures in the machine, so space - * consumption is not a concern here. - */ -#if defined(CONFIG_SMP) -struct zone_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define ZONE_PADDING(name) struct zone_padding name; -#else -#define ZONE_PADDING(name) -#endif - #ifdef CONFIG_NUMA enum numa_stat_item { NUMA_HIT, /* allocated in intended node */ @@ -837,7 +823,7 @@ struct zone { int initialized; /* Write-intensive fields used from the page allocator */ - ZONE_PADDING(_pad1_) + CACHELINE_PADDING(_pad1_); /* free areas of different sizes */ struct free_area free_area[MAX_ORDER]; @@ -849,7 +835,7 @@ struct zone { spinlock_t lock; /* Write-intensive fields used by compaction and vmstats. */ - ZONE_PADDING(_pad2_) + CACHELINE_PADDING(_pad2_); /* * When free pages are below this point, additional steps are taken @@ -886,7 +872,7 @@ struct zone { bool contiguous; - ZONE_PADDING(_pad3_) + CACHELINE_PADDING(_pad3_); /* Zone statistics */ atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS]; atomic_long_t vm_numa_event[NR_VM_NUMA_EVENT_ITEMS]; @@ -1196,7 +1182,7 @@ typedef struct pglist_data { #endif /* CONFIG_NUMA */ /* Write-intensive fields used by page reclaim */ - ZONE_PADDING(_pad1_) + CACHELINE_PADDING(_pad1_); #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* @@ -1241,7 +1227,7 @@ typedef struct pglist_data { struct lru_gen_mm_walk mm_walk; #endif - ZONE_PADDING(_pad2_) + CACHELINE_PADDING(_pad2_); /* Per-node vmstats */ struct per_cpu_nodestat __percpu *per_cpu_nodestats; diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 78a1c934e416..c141ea9a95ef 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,22 +7,13 @@ #include #include -#if defined(CONFIG_SMP) -struct pc_padding { - char x[0]; -} ____cacheline_internodealigned_in_smp; -#define PC_PADDING(name) struct pc_padding name -#else -#define PC_PADDING(name) -#endif - struct page_counter { /* * Make sure 'usage' does not share cacheline with any other field. The * memcg->memory.usage is a hot member of struct mem_cgroup. */ atomic_long_t usage; - PC_PADDING(_pad1_); + CACHELINE_PADDING(_pad1_); /* effective memory.min and memory.min usage tracking */ unsigned long emin; @@ -38,7 +29,7 @@ struct page_counter { unsigned long failcnt; /* Keep all the read most fields in a separete cacheline. */ - PC_PADDING(_pad2_); + CACHELINE_PADDING(_pad2_); unsigned long min; unsigned long low; From cb4df4cae4f2bd8cf7a32eff81178fce31600f7c Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 30 Aug 2022 14:38:38 +0000 Subject: [PATCH 066/350] ksm: count allocated ksm rmap_items for each process Patch series "ksm: count allocated rmap_items and update documentation", v5. KSM can save memory by merging identical pages, but also can consume additional memory, because it needs to generate rmap_items to save each scanned page's brief rmap information. To determine how beneficial the ksm-policy (like madvise), they are using brings, so we add a new interface /proc//ksm_stat for each process The value "ksm_rmap_items" in it indicates the total allocated ksm rmap_items of this process. The detailed description can be seen in the following patches' commit message. This patch (of 2): KSM can save memory by merging identical pages, but also can consume additional memory, because it needs to generate rmap_items to save each scanned page's brief rmap information. Some of these pages may be merged, but some may not be abled to be merged after being checked several times, which are unprofitable memory consumed. The information about whether KSM save memory or consume memory in system-wide range can be determined by the comprehensive calculation of pages_sharing, pages_shared, pages_unshared and pages_volatile. A simple approximate calculation: profit =~ pages_sharing * sizeof(page) - (all_rmap_items) * sizeof(rmap_item); where all_rmap_items equals to the sum of pages_sharing, pages_shared, pages_unshared and pages_volatile. But we cannot calculate this kind of ksm profit inner single-process wide because the information of ksm rmap_item's number of a process is lacked. For user applications, if this kind of information could be obtained, it helps upper users know how beneficial the ksm-policy (like madvise) they are using brings, and then optimize their app code. For example, one application madvise 1000 pages as MERGEABLE, while only a few pages are really merged, then it's not cost-efficient. So we add a new interface /proc//ksm_stat for each process in which the value of ksm_rmap_itmes is only shown now and so more values can be added in future. So similarly, we can calculate the ksm profit approximately for a single process by: profit =~ ksm_merging_pages * sizeof(page) - ksm_rmap_items * sizeof(rmap_item); where ksm_merging_pages is shown at /proc//ksm_merging_pages, and ksm_rmap_items is shown in /proc//ksm_stat. Link: https://lkml.kernel.org/r/20220830143731.299702-1-xu.xin16@zte.com.cn Link: https://lkml.kernel.org/r/20220830143838.299758-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Signed-off-by: CGEL ZTE Cc: Alexey Dobriyan Cc: Bagas Sanjaya Cc: Hugh Dickins Cc: Izik Eidus Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- fs/proc/base.c | 15 +++++++++++++++ include/linux/mm_types.h | 5 +++++ mm/ksm.c | 2 ++ 3 files changed, 22 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index 12885a75913f..ca3e836377e8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3199,6 +3199,19 @@ static int proc_pid_ksm_merging_pages(struct seq_file *m, struct pid_namespace * return 0; } +static int proc_pid_ksm_stat(struct seq_file *m, struct pid_namespace *ns, + struct pid *pid, struct task_struct *task) +{ + struct mm_struct *mm; + + mm = get_task_mm(task); + if (mm) { + seq_printf(m, "ksm_rmap_items %lu\n", mm->ksm_rmap_items); + mmput(mm); + } + + return 0; +} #endif /* CONFIG_KSM */ #ifdef CONFIG_STACKLEAK_METRICS @@ -3334,6 +3347,7 @@ static const struct pid_entry tgid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; @@ -3671,6 +3685,7 @@ static const struct pid_entry tid_base_stuff[] = { #endif #ifdef CONFIG_KSM ONE("ksm_merging_pages", S_IRUSR, proc_pid_ksm_merging_pages), + ONE("ksm_stat", S_IRUSR, proc_pid_ksm_stat), #endif }; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 26573ba485f3..8f30f262431c 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -654,6 +654,11 @@ struct mm_struct { * merging. */ unsigned long ksm_merging_pages; + /* + * Represent how many pages are checked for ksm merging + * including merged and not merged. + */ + unsigned long ksm_rmap_items; #endif #ifdef CONFIG_LRU_GEN struct { diff --git a/mm/ksm.c b/mm/ksm.c index 1fafd531f669..0cd2f4b62334 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -387,6 +387,7 @@ static inline struct rmap_item *alloc_rmap_item(void) static inline void free_rmap_item(struct rmap_item *rmap_item) { ksm_rmap_items--; + rmap_item->mm->ksm_rmap_items--; rmap_item->mm = NULL; /* debug safety */ kmem_cache_free(rmap_item_cache, rmap_item); } @@ -2235,6 +2236,7 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, if (rmap_item) { /* It has already been zeroed */ rmap_item->mm = mm_slot->mm; + rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; *rmap_list = rmap_item; From 21b7bdb504ae6b0a795c8d63818611ce02b532c1 Mon Sep 17 00:00:00 2001 From: xu xin Date: Tue, 30 Aug 2022 14:40:03 +0000 Subject: [PATCH 067/350] ksm: add profit monitoring documentation Add the description of KSM profit and how to determine it separately in system-wide range and inner a single process. Link: https://lkml.kernel.org/r/20220830144003.299870-1-xu.xin16@zte.com.cn Signed-off-by: xu xin Reviewed-by: Xiaokai Ran Reviewed-by: Yang Yang Reviewed-by: Bagas Sanjaya Cc: Alexey Dobriyan Cc: Hugh Dickins Cc: Izik Eidus Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/ksm.rst | 36 ++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/Documentation/admin-guide/mm/ksm.rst b/Documentation/admin-guide/mm/ksm.rst index b244f0202a03..fb6ba2002a4b 100644 --- a/Documentation/admin-guide/mm/ksm.rst +++ b/Documentation/admin-guide/mm/ksm.rst @@ -184,6 +184,42 @@ The maximum possible ``pages_sharing/pages_shared`` ratio is limited by the ``max_page_sharing`` tunable. To increase the ratio ``max_page_sharing`` must be increased accordingly. +Monitoring KSM profit +===================== + +KSM can save memory by merging identical pages, but also can consume +additional memory, because it needs to generate a number of rmap_items to +save each scanned page's brief rmap information. Some of these pages may +be merged, but some may not be abled to be merged after being checked +several times, which are unprofitable memory consumed. + +1) How to determine whether KSM save memory or consume memory in system-wide + range? Here is a simple approximate calculation for reference:: + + general_profit =~ pages_sharing * sizeof(page) - (all_rmap_items) * + sizeof(rmap_item); + + where all_rmap_items can be easily obtained by summing ``pages_sharing``, + ``pages_shared``, ``pages_unshared`` and ``pages_volatile``. + +2) The KSM profit inner a single process can be similarly obtained by the + following approximate calculation:: + + process_profit =~ ksm_merging_pages * sizeof(page) - + ksm_rmap_items * sizeof(rmap_item). + + where ksm_merging_pages is shown under the directory ``/proc//``, + and ksm_rmap_items is shown in ``/proc//ksm_stat``. + +From the perspective of application, a high ratio of ``ksm_rmap_items`` to +``ksm_merging_pages`` means a bad madvise-applied policy, so developers or +administrators have to rethink how to change madvise policy. Giving an example +for reference, a page's size is usually 4K, and the rmap_item's size is +separately 32B on 32-bit CPU architecture and 64B on 64-bit CPU architecture. +so if the ``ksm_rmap_items/ksm_merging_pages`` ratio exceeds 64 on 64-bit CPU +or exceeds 128 on 32-bit CPU, then the app's madvise policy should be dropped, +because the ksm profit is approximately zero or negative. + Monitoring KSM events ===================== From 7e736b8e36ff87080890690670c90c91d6d80091 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:45 +0800 Subject: [PATCH 068/350] mm: introduce common struct mm_slot Patch series "add common struct mm_slot and use it in THP and KSM", v2. At present, both THP and KSM module have similar structures mm_slot for organizing and recording the information required for scanning mm, and each defines the following exactly the same operation functions: - alloc_mm_slot - free_mm_slot - get_mm_slot - insert_to_mm_slots_hash In order to de-duplicate these codes, this patchset introduces a common struct mm_slot, and lets THP and KSM to use it. This patch (of 7): At present, both THP and KSM module have similar structures mm_slot for organizing and recording the information required for scanning mm, and each defines the following exactly the same operation functions: - alloc_mm_slot - free_mm_slot - get_mm_slot - insert_to_mm_slots_hash In order to de-duplicate these codes, this patch introduces a common struct mm_slot, and subsequent patches will let THP and KSM to use it. Link: https://lkml.kernel.org/r/20220831031951.43152-1-zhengqi.arch@bytedance.com Link: https://lkml.kernel.org/r/20220831031951.43152-2-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/mm_slot.h | 55 ++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) create mode 100644 mm/mm_slot.h diff --git a/mm/mm_slot.h b/mm/mm_slot.h new file mode 100644 index 000000000000..83f18ed1c4bd --- /dev/null +++ b/mm/mm_slot.h @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _LINUX_MM_SLOT_H +#define _LINUX_MM_SLOT_H + +#include +#include + +/* + * struct mm_slot - hash lookup from mm to mm_slot + * @hash: link to the mm_slots hash list + * @mm_node: link into the mm_slots list + * @mm: the mm that this information is valid for + */ +struct mm_slot { + struct hlist_node hash; + struct list_head mm_node; + struct mm_struct *mm; +}; + +#define mm_slot_entry(ptr, type, member) \ + container_of(ptr, type, member) + +static inline void *mm_slot_alloc(struct kmem_cache *cache) +{ + if (!cache) /* initialization failed */ + return NULL; + return kmem_cache_zalloc(cache, GFP_KERNEL); +} + +static inline void mm_slot_free(struct kmem_cache *cache, void *objp) +{ + kmem_cache_free(cache, objp); +} + +#define mm_slot_lookup(_hashtable, _mm) \ +({ \ + struct mm_slot *tmp_slot, *mm_slot = NULL; \ + \ + hash_for_each_possible(_hashtable, tmp_slot, hash, (unsigned long)_mm) \ + if (_mm == tmp_slot->mm) { \ + mm_slot = tmp_slot; \ + break; \ + } \ + \ + mm_slot; \ +}) + +#define mm_slot_insert(_hashtable, _mm, _mm_slot) \ +({ \ + _mm_slot->mm = _mm; \ + hash_add(_hashtable, &_mm_slot->hash, (unsigned long)_mm); \ +}) + +#endif /* _LINUX_MM_SLOT_H */ From b26e27015ec9a47eed3c960b7e3065c8ba8d16d7 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:46 +0800 Subject: [PATCH 069/350] mm: thp: convert to use common struct mm_slot Rename private struct mm_slot to struct khugepaged_mm_slot and convert to use common struct mm_slot with no functional change. [zhengqi.arch@bytedance.com: fix build error with CONFIG_SHMEM disabled] Link: https://lkml.kernel.org/r/639fa8d5-8e5b-2333-69dc-40ed46219364@bytedance.com Link: https://lkml.kernel.org/r/20220831031951.43152-3-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 123 ++++++++++++++++++++---------------------------- 1 file changed, 52 insertions(+), 71 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 7c13d65aeb14..1e59fe7bfae3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -23,6 +23,7 @@ #include #include #include "internal.h" +#include "mm_slot.h" enum scan_result { SCAN_FAIL, @@ -99,17 +100,13 @@ struct collapse_control { }; /** - * struct mm_slot - hash lookup from mm to mm_slot - * @hash: hash collision list - * @mm_node: khugepaged scan list headed in khugepaged_scan.mm_head - * @mm: the mm that this information is valid for + * struct khugepaged_mm_slot - khugepaged information per mm that is being scanned + * @slot: hash lookup from mm to mm_slot * @nr_pte_mapped_thp: number of pte mapped THP * @pte_mapped_thp: address array corresponding pte mapped THP */ -struct mm_slot { - struct hlist_node hash; - struct list_head mm_node; - struct mm_struct *mm; +struct khugepaged_mm_slot { + struct mm_slot slot; /* pte-mapped THP in this mm */ int nr_pte_mapped_thp; @@ -126,7 +123,7 @@ struct mm_slot { */ struct khugepaged_scan { struct list_head mm_head; - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; unsigned long address; }; @@ -390,8 +387,9 @@ int hugepage_madvise(struct vm_area_struct *vma, int __init khugepaged_init(void) { mm_slot_cache = kmem_cache_create("khugepaged_mm_slot", - sizeof(struct mm_slot), - __alignof__(struct mm_slot), 0, NULL); + sizeof(struct khugepaged_mm_slot), + __alignof__(struct khugepaged_mm_slot), + 0, NULL); if (!mm_slot_cache) return -ENOMEM; @@ -408,36 +406,6 @@ void __init khugepaged_destroy(void) kmem_cache_destroy(mm_slot_cache); } -static inline struct mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct mm_slot *mm_slot; - - hash_for_each_possible(mm_slots_hash, mm_slot, hash, (unsigned long)mm) - if (mm == mm_slot->mm) - return mm_slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (long)mm); -} - static inline int hpage_collapse_test_exit(struct mm_struct *mm) { return atomic_read(&mm->mm_users) == 0; @@ -445,28 +413,31 @@ static inline int hpage_collapse_test_exit(struct mm_struct *mm) void __khugepaged_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return; + slot = &mm_slot->slot; + /* __khugepaged_exit() must not run from under us */ VM_BUG_ON_MM(hpage_collapse_test_exit(mm), mm); if (unlikely(test_and_set_bit(MMF_VM_HUGEPAGE, &mm->flags))) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); return; } spin_lock(&khugepaged_mm_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * Insert just behind the scanning cursor, to let the area settle * down a little. */ wakeup = list_empty(&khugepaged_scan.mm_head); - list_add_tail(&mm_slot->mm_node, &khugepaged_scan.mm_head); + list_add_tail(&slot->mm_node, &khugepaged_scan.mm_head); spin_unlock(&khugepaged_mm_lock); mmgrab(mm); @@ -486,21 +457,23 @@ void khugepaged_enter_vma(struct vm_area_struct *vma, void __khugepaged_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; int free = 0; spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (mm_slot && khugepaged_scan.mm_slot != mm_slot) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); free = 1; } spin_unlock(&khugepaged_mm_lock); if (free) { clear_bit(MMF_VM_HUGEPAGE, &mm->flags); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } else if (mm_slot) { /* @@ -1318,16 +1291,17 @@ out: return result; } -static void collect_mm_slot(struct mm_slot *mm_slot) +static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; lockdep_assert_held(&khugepaged_mm_lock); if (hpage_collapse_test_exit(mm)) { /* free mm_slot */ - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); /* * Not strictly needed because the mm exited already. @@ -1336,7 +1310,7 @@ static void collect_mm_slot(struct mm_slot *mm_slot) */ /* khugepaged_mm_lock actually not necessary for the below */ - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); mmdrop(mm); } } @@ -1349,12 +1323,14 @@ static void collect_mm_slot(struct mm_slot *mm_slot) static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); spin_lock(&khugepaged_mm_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; spin_unlock(&khugepaged_mm_lock); @@ -1486,9 +1462,10 @@ abort: goto drop_hpage; } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { - struct mm_struct *mm = mm_slot->mm; + struct mm_slot *slot = &mm_slot->slot; + struct mm_struct *mm = slot->mm; int i; if (likely(mm_slot->nr_pte_mapped_thp == 0)) @@ -2040,7 +2017,7 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, BUILD_BUG(); } -static void khugepaged_collapse_pte_mapped_thps(struct mm_slot *mm_slot) +static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { } #endif @@ -2051,7 +2028,8 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, __acquires(&khugepaged_mm_lock) { struct vma_iterator vmi; - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int progress = 0; @@ -2060,18 +2038,20 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, lockdep_assert_held(&khugepaged_mm_lock); *result = SCAN_FAIL; - if (khugepaged_scan.mm_slot) + if (khugepaged_scan.mm_slot) { mm_slot = khugepaged_scan.mm_slot; - else { - mm_slot = list_entry(khugepaged_scan.mm_head.next, + slot = &mm_slot->slot; + } else { + slot = list_entry(khugepaged_scan.mm_head.next, struct mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; khugepaged_scan.mm_slot = mm_slot; } spin_unlock(&khugepaged_mm_lock); khugepaged_collapse_pte_mapped_thps(mm_slot); - mm = mm_slot->mm; + mm = slot->mm; /* * Don't wait for semaphore (to avoid long wait times). Just move to * the next mm on the list. @@ -2166,10 +2146,11 @@ breakouterloop_mmap_lock: * khugepaged runs here, khugepaged_exit will find * mm_slot not pointing to the exiting mm. */ - if (mm_slot->mm_node.next != &khugepaged_scan.mm_head) { - khugepaged_scan.mm_slot = list_entry( - mm_slot->mm_node.next, - struct mm_slot, mm_node); + if (slot->mm_node.next != &khugepaged_scan.mm_head) { + slot = list_entry(slot->mm_node.next, + struct mm_slot, mm_node); + khugepaged_scan.mm_slot = + mm_slot_entry(slot, struct khugepaged_mm_slot, slot); khugepaged_scan.address = 0; } else { khugepaged_scan.mm_slot = NULL; @@ -2264,7 +2245,7 @@ static void khugepaged_wait_work(void) static int khugepaged(void *none) { - struct mm_slot *mm_slot; + struct khugepaged_mm_slot *mm_slot; set_freezable(); set_user_nice(current, MAX_NICE); From 79e1119b7e0099c6c9379ca3129ffb7aa2a1c249 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:47 +0800 Subject: [PATCH 070/350] ksm: remove redundant declarations in ksm.h Currently, for struct stable_node, no one uses it in both the include/linux/ksm.h file and the file that contains it. For struct mem_cgroup, it's also not used in ksm.h. So they're all redundant, just remove them. Link: https://lkml.kernel.org/r/20220831031951.43152-4-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/ksm.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 0b4f17418f64..7e232ba59b86 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -15,9 +15,6 @@ #include #include -struct stable_node; -struct mem_cgroup; - #ifdef CONFIG_KSM int ksm_madvise(struct vm_area_struct *vma, unsigned long start, unsigned long end, int advice, unsigned long *vm_flags); From 21fbd59136e0773e0b920371860d9b6757cdb250 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:48 +0800 Subject: [PATCH 071/350] ksm: add the ksm prefix to the names of the ksm private structures In order to prevent the name of the private structure of ksm from being the same as the name of the common structure used in subsequent patches, prefix their names with ksm in advance. Link: https://lkml.kernel.org/r/20220831031951.43152-5-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- Documentation/mm/ksm.rst | 2 +- mm/ksm.c | 216 +++++++++++++++++++-------------------- 2 files changed, 109 insertions(+), 109 deletions(-) diff --git a/Documentation/mm/ksm.rst b/Documentation/mm/ksm.rst index 9e37add068e6..f83cfbc12f4c 100644 --- a/Documentation/mm/ksm.rst +++ b/Documentation/mm/ksm.rst @@ -26,7 +26,7 @@ tree. If a KSM page is shared between less than ``max_page_sharing`` VMAs, the node of the stable tree that represents such KSM page points to a -list of struct rmap_item and the ``page->mapping`` of the +list of struct ksm_rmap_item and the ``page->mapping`` of the KSM page points to the stable tree node. When the sharing passes this threshold, KSM adds a second dimension to diff --git a/mm/ksm.c b/mm/ksm.c index 0cd2f4b62334..de61946106ce 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -82,7 +82,7 @@ * different KSM page copy of that content * * Internally, the regular nodes, "dups" and "chains" are represented - * using the same struct stable_node structure. + * using the same struct ksm_stable_node structure. * * In addition to the stable tree, KSM uses a second data structure called the * unstable tree: this tree holds pointers to pages which have been found to @@ -112,16 +112,16 @@ */ /** - * struct mm_slot - ksm information per mm that is being scanned + * struct ksm_mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list * @mm_list: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ -struct mm_slot { +struct ksm_mm_slot { struct hlist_node link; struct list_head mm_list; - struct rmap_item *rmap_list; + struct ksm_rmap_item *rmap_list; struct mm_struct *mm; }; @@ -135,14 +135,14 @@ struct mm_slot { * There is only the one ksm_scan instance of this cursor structure. */ struct ksm_scan { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; unsigned long address; - struct rmap_item **rmap_list; + struct ksm_rmap_item **rmap_list; unsigned long seqnr; }; /** - * struct stable_node - node of the stable rbtree + * struct ksm_stable_node - node of the stable rbtree * @node: rb node of this ksm page in the stable tree * @head: (overlaying parent) &migrate_nodes indicates temporarily on that list * @hlist_dup: linked into the stable_node->hlist with a stable_node chain @@ -153,7 +153,7 @@ struct ksm_scan { * @rmap_hlist_len: number of rmap_item entries in hlist or STABLE_NODE_CHAIN * @nid: NUMA node id of stable tree in which linked (may not match kpfn) */ -struct stable_node { +struct ksm_stable_node { union { struct rb_node node; /* when node of stable tree */ struct { /* when listed for migration */ @@ -182,7 +182,7 @@ struct stable_node { }; /** - * struct rmap_item - reverse mapping item for virtual addresses + * struct ksm_rmap_item - reverse mapping item for virtual addresses * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree * @nid: NUMA node id of unstable tree in which linked (may not match page) @@ -193,8 +193,8 @@ struct stable_node { * @head: pointer to stable_node heading this list in the stable tree * @hlist: link into hlist of rmap_items hanging off that stable_node */ -struct rmap_item { - struct rmap_item *rmap_list; +struct ksm_rmap_item { + struct ksm_rmap_item *rmap_list; union { struct anon_vma *anon_vma; /* when stable */ #ifdef CONFIG_NUMA @@ -207,7 +207,7 @@ struct rmap_item { union { struct rb_node node; /* when node of unstable tree */ struct { /* when listed from stable tree */ - struct stable_node *head; + struct ksm_stable_node *head; struct hlist_node hlist; }; }; @@ -230,7 +230,7 @@ static LIST_HEAD(migrate_nodes); #define MM_SLOTS_HASH_BITS 10 static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); -static struct mm_slot ksm_mm_head = { +static struct ksm_mm_slot ksm_mm_head = { .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), }; static struct ksm_scan ksm_scan = { @@ -298,21 +298,21 @@ static DECLARE_WAIT_QUEUE_HEAD(ksm_iter_wait); static DEFINE_MUTEX(ksm_thread_mutex); static DEFINE_SPINLOCK(ksm_mmlist_lock); -#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("ksm_"#__struct,\ +#define KSM_KMEM_CACHE(__struct, __flags) kmem_cache_create(#__struct,\ sizeof(struct __struct), __alignof__(struct __struct),\ (__flags), NULL) static int __init ksm_slab_init(void) { - rmap_item_cache = KSM_KMEM_CACHE(rmap_item, 0); + rmap_item_cache = KSM_KMEM_CACHE(ksm_rmap_item, 0); if (!rmap_item_cache) goto out; - stable_node_cache = KSM_KMEM_CACHE(stable_node, 0); + stable_node_cache = KSM_KMEM_CACHE(ksm_stable_node, 0); if (!stable_node_cache) goto out_free1; - mm_slot_cache = KSM_KMEM_CACHE(mm_slot, 0); + mm_slot_cache = KSM_KMEM_CACHE(ksm_mm_slot, 0); if (!mm_slot_cache) goto out_free2; @@ -334,18 +334,18 @@ static void __init ksm_slab_free(void) mm_slot_cache = NULL; } -static __always_inline bool is_stable_node_chain(struct stable_node *chain) +static __always_inline bool is_stable_node_chain(struct ksm_stable_node *chain) { return chain->rmap_hlist_len == STABLE_NODE_CHAIN; } -static __always_inline bool is_stable_node_dup(struct stable_node *dup) +static __always_inline bool is_stable_node_dup(struct ksm_stable_node *dup) { return dup->head == STABLE_NODE_DUP_HEAD; } -static inline void stable_node_chain_add_dup(struct stable_node *dup, - struct stable_node *chain) +static inline void stable_node_chain_add_dup(struct ksm_stable_node *dup, + struct ksm_stable_node *chain) { VM_BUG_ON(is_stable_node_dup(dup)); dup->head = STABLE_NODE_DUP_HEAD; @@ -354,14 +354,14 @@ static inline void stable_node_chain_add_dup(struct stable_node *dup, ksm_stable_node_dups++; } -static inline void __stable_node_dup_del(struct stable_node *dup) +static inline void __stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(!is_stable_node_dup(dup)); hlist_del(&dup->hlist_dup); ksm_stable_node_dups--; } -static inline void stable_node_dup_del(struct stable_node *dup) +static inline void stable_node_dup_del(struct ksm_stable_node *dup) { VM_BUG_ON(is_stable_node_chain(dup)); if (is_stable_node_dup(dup)) @@ -373,9 +373,9 @@ static inline void stable_node_dup_del(struct stable_node *dup) #endif } -static inline struct rmap_item *alloc_rmap_item(void) +static inline struct ksm_rmap_item *alloc_rmap_item(void) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN); @@ -384,7 +384,7 @@ static inline struct rmap_item *alloc_rmap_item(void) return rmap_item; } -static inline void free_rmap_item(struct rmap_item *rmap_item) +static inline void free_rmap_item(struct ksm_rmap_item *rmap_item) { ksm_rmap_items--; rmap_item->mm->ksm_rmap_items--; @@ -392,7 +392,7 @@ static inline void free_rmap_item(struct rmap_item *rmap_item) kmem_cache_free(rmap_item_cache, rmap_item); } -static inline struct stable_node *alloc_stable_node(void) +static inline struct ksm_stable_node *alloc_stable_node(void) { /* * The allocation can take too long with GFP_KERNEL when memory is under @@ -402,28 +402,28 @@ static inline struct stable_node *alloc_stable_node(void) return kmem_cache_alloc(stable_node_cache, GFP_KERNEL | __GFP_HIGH); } -static inline void free_stable_node(struct stable_node *stable_node) +static inline void free_stable_node(struct ksm_stable_node *stable_node) { VM_BUG_ON(stable_node->rmap_hlist_len && !is_stable_node_chain(stable_node)); kmem_cache_free(stable_node_cache, stable_node); } -static inline struct mm_slot *alloc_mm_slot(void) +static inline struct ksm_mm_slot *alloc_mm_slot(void) { if (!mm_slot_cache) /* initialization failed */ return NULL; return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); } -static inline void free_mm_slot(struct mm_slot *mm_slot) +static inline void free_mm_slot(struct ksm_mm_slot *mm_slot) { kmem_cache_free(mm_slot_cache, mm_slot); } -static struct mm_slot *get_mm_slot(struct mm_struct *mm) +static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) { - struct mm_slot *slot; + struct ksm_mm_slot *slot; hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) if (slot->mm == mm) @@ -433,7 +433,7 @@ static struct mm_slot *get_mm_slot(struct mm_struct *mm) } static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct mm_slot *mm_slot) + struct ksm_mm_slot *mm_slot) { mm_slot->mm = mm; hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); @@ -529,7 +529,7 @@ static struct vm_area_struct *find_mergeable_vma(struct mm_struct *mm, return vma; } -static void break_cow(struct rmap_item *rmap_item) +static void break_cow(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; @@ -548,7 +548,7 @@ static void break_cow(struct rmap_item *rmap_item) mmap_read_unlock(mm); } -static struct page *get_mergeable_page(struct rmap_item *rmap_item) +static struct page *get_mergeable_page(struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; unsigned long addr = rmap_item->address; @@ -589,10 +589,10 @@ static inline int get_kpfn_nid(unsigned long kpfn) return ksm_merge_across_nodes ? 0 : NUMA(pfn_to_nid(kpfn)); } -static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, +static struct ksm_stable_node *alloc_stable_node_chain(struct ksm_stable_node *dup, struct rb_root *root) { - struct stable_node *chain = alloc_stable_node(); + struct ksm_stable_node *chain = alloc_stable_node(); VM_BUG_ON(is_stable_node_chain(dup)); if (likely(chain)) { INIT_HLIST_HEAD(&chain->hlist); @@ -622,7 +622,7 @@ static struct stable_node *alloc_stable_node_chain(struct stable_node *dup, return chain; } -static inline void free_stable_node_chain(struct stable_node *chain, +static inline void free_stable_node_chain(struct ksm_stable_node *chain, struct rb_root *root) { rb_erase(&chain->node, root); @@ -630,9 +630,9 @@ static inline void free_stable_node_chain(struct stable_node *chain, ksm_stable_node_chains--; } -static void remove_node_from_stable_tree(struct stable_node *stable_node) +static void remove_node_from_stable_tree(struct ksm_stable_node *stable_node) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; /* check it's not STABLE_NODE_CHAIN or negative */ BUG_ON(stable_node->rmap_hlist_len < 0); @@ -694,7 +694,7 @@ enum get_ksm_page_flags { * a page to put something that might look like our key in page->mapping. * is on its way to being freed; but it is an anomaly to bear in mind. */ -static struct page *get_ksm_page(struct stable_node *stable_node, +static struct page *get_ksm_page(struct ksm_stable_node *stable_node, enum get_ksm_page_flags flags) { struct page *page; @@ -773,10 +773,10 @@ stale: * Removing rmap_item from stable or unstable tree. * This function will clean the information from the stable/unstable tree. */ -static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) +static void remove_rmap_item_from_tree(struct ksm_rmap_item *rmap_item) { if (rmap_item->address & STABLE_FLAG) { - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; struct page *page; stable_node = rmap_item->head; @@ -823,10 +823,10 @@ out: cond_resched(); /* we're called from many long loops */ } -static void remove_trailing_rmap_items(struct rmap_item **rmap_list) +static void remove_trailing_rmap_items(struct ksm_rmap_item **rmap_list) { while (*rmap_list) { - struct rmap_item *rmap_item = *rmap_list; + struct ksm_rmap_item *rmap_item = *rmap_list; *rmap_list = rmap_item->rmap_list; remove_rmap_item_from_tree(rmap_item); free_rmap_item(rmap_item); @@ -863,18 +863,18 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } -static inline struct stable_node *folio_stable_node(struct folio *folio) +static inline struct ksm_stable_node *folio_stable_node(struct folio *folio) { return folio_test_ksm(folio) ? folio_raw_mapping(folio) : NULL; } -static inline struct stable_node *page_stable_node(struct page *page) +static inline struct ksm_stable_node *page_stable_node(struct page *page) { return folio_stable_node(page_folio(page)); } static inline void set_page_stable_node(struct page *page, - struct stable_node *stable_node) + struct ksm_stable_node *stable_node) { VM_BUG_ON_PAGE(PageAnon(page) && PageAnonExclusive(page), page); page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); @@ -884,7 +884,7 @@ static inline void set_page_stable_node(struct page *page, /* * Only called through the sysfs control interface: */ -static int remove_stable_node(struct stable_node *stable_node) +static int remove_stable_node(struct ksm_stable_node *stable_node) { struct page *page; int err; @@ -922,10 +922,10 @@ static int remove_stable_node(struct stable_node *stable_node) return err; } -static int remove_stable_node_chain(struct stable_node *stable_node, +static int remove_stable_node_chain(struct ksm_stable_node *stable_node, struct rb_root *root) { - struct stable_node *dup; + struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { @@ -949,14 +949,14 @@ static int remove_stable_node_chain(struct stable_node *stable_node, static int remove_all_stable_nodes(void) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; int nid; int err = 0; for (nid = 0; nid < ksm_nr_node_ids; nid++) { while (root_stable_tree[nid].rb_node) { stable_node = rb_entry(root_stable_tree[nid].rb_node, - struct stable_node, node); + struct ksm_stable_node, node); if (remove_stable_node_chain(stable_node, root_stable_tree + nid)) { err = -EBUSY; @@ -975,14 +975,14 @@ static int remove_all_stable_nodes(void) static int unmerge_and_remove_all_rmap_items(void) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; @@ -1007,7 +1007,7 @@ static int unmerge_and_remove_all_rmap_items(void) spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); if (ksm_test_exit(mm)) { hash_del(&mm_slot->link); list_del(&mm_slot->mm_list); @@ -1295,7 +1295,7 @@ out: * * This function returns 0 if the pages were merged, -EFAULT otherwise. */ -static int try_to_merge_with_ksm_page(struct rmap_item *rmap_item, +static int try_to_merge_with_ksm_page(struct ksm_rmap_item *rmap_item, struct page *page, struct page *kpage) { struct mm_struct *mm = rmap_item->mm; @@ -1332,9 +1332,9 @@ out: * Note that this function upgrades page to ksm page: if one of the pages * is already a ksm page, try_to_merge_with_ksm_page should be used. */ -static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, +static struct page *try_to_merge_two_pages(struct ksm_rmap_item *rmap_item, struct page *page, - struct rmap_item *tree_rmap_item, + struct ksm_rmap_item *tree_rmap_item, struct page *tree_page) { int err; @@ -1354,7 +1354,7 @@ static struct page *try_to_merge_two_pages(struct rmap_item *rmap_item, } static __always_inline -bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) +bool __is_page_sharing_candidate(struct ksm_stable_node *stable_node, int offset) { VM_BUG_ON(stable_node->rmap_hlist_len < 0); /* @@ -1368,17 +1368,17 @@ bool __is_page_sharing_candidate(struct stable_node *stable_node, int offset) } static __always_inline -bool is_page_sharing_candidate(struct stable_node *stable_node) +bool is_page_sharing_candidate(struct ksm_stable_node *stable_node) { return __is_page_sharing_candidate(stable_node, 0); } -static struct page *stable_node_dup(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, +static struct page *stable_node_dup(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *dup, *found = NULL, *stable_node = *_stable_node; + struct ksm_stable_node *dup, *found = NULL, *stable_node = *_stable_node; struct hlist_node *hlist_safe; struct page *_tree_page, *tree_page = NULL; int nr = 0; @@ -1492,7 +1492,7 @@ static struct page *stable_node_dup(struct stable_node **_stable_node_dup, return tree_page; } -static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, +static struct ksm_stable_node *stable_node_dup_any(struct ksm_stable_node *stable_node, struct rb_root *root) { if (!is_stable_node_chain(stable_node)) @@ -1519,12 +1519,12 @@ static struct stable_node *stable_node_dup_any(struct stable_node *stable_node, * function and will be overwritten in all cases, the caller doesn't * need to initialize it. */ -static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, - struct stable_node **_stable_node, +static struct page *__stable_node_chain(struct ksm_stable_node **_stable_node_dup, + struct ksm_stable_node **_stable_node, struct rb_root *root, bool prune_stale_stable_nodes) { - struct stable_node *stable_node = *_stable_node; + struct ksm_stable_node *stable_node = *_stable_node; if (!is_stable_node_chain(stable_node)) { if (is_page_sharing_candidate(stable_node)) { *_stable_node_dup = stable_node; @@ -1541,18 +1541,18 @@ static struct page *__stable_node_chain(struct stable_node **_stable_node_dup, prune_stale_stable_nodes); } -static __always_inline struct page *chain_prune(struct stable_node **s_n_d, - struct stable_node **s_n, +static __always_inline struct page *chain_prune(struct ksm_stable_node **s_n_d, + struct ksm_stable_node **s_n, struct rb_root *root) { return __stable_node_chain(s_n_d, s_n, root, true); } -static __always_inline struct page *chain(struct stable_node **s_n_d, - struct stable_node *s_n, +static __always_inline struct page *chain(struct ksm_stable_node **s_n_d, + struct ksm_stable_node *s_n, struct rb_root *root) { - struct stable_node *old_stable_node = s_n; + struct ksm_stable_node *old_stable_node = s_n; struct page *tree_page; tree_page = __stable_node_chain(s_n_d, &s_n, root, false); @@ -1576,8 +1576,8 @@ static struct page *stable_tree_search(struct page *page) struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node, *stable_node_dup, *stable_node_any; - struct stable_node *page_node; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *page_node; page_node = page_stable_node(page); if (page_node && page_node->head != &migrate_nodes) { @@ -1597,7 +1597,7 @@ again: int ret; cond_resched(); - stable_node = rb_entry(*new, struct stable_node, node); + stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; tree_page = chain_prune(&stable_node_dup, &stable_node, root); /* @@ -1820,14 +1820,14 @@ chain_append: * This function returns the stable tree node just allocated on success, * NULL otherwise. */ -static struct stable_node *stable_tree_insert(struct page *kpage) +static struct ksm_stable_node *stable_tree_insert(struct page *kpage) { int nid; unsigned long kpfn; struct rb_root *root; struct rb_node **new; struct rb_node *parent; - struct stable_node *stable_node, *stable_node_dup, *stable_node_any; + struct ksm_stable_node *stable_node, *stable_node_dup, *stable_node_any; bool need_chain = false; kpfn = page_to_pfn(kpage); @@ -1842,7 +1842,7 @@ again: int ret; cond_resched(); - stable_node = rb_entry(*new, struct stable_node, node); + stable_node = rb_entry(*new, struct ksm_stable_node, node); stable_node_any = NULL; tree_page = chain(&stable_node_dup, stable_node, root); if (!stable_node_dup) { @@ -1911,7 +1911,7 @@ again: rb_insert_color(&stable_node_dup->node, root); } else { if (!is_stable_node_chain(stable_node)) { - struct stable_node *orig = stable_node; + struct ksm_stable_node *orig = stable_node; /* chain is missing so create it */ stable_node = alloc_stable_node_chain(orig, root); if (!stable_node) { @@ -1940,7 +1940,7 @@ again: * the same walking algorithm in an rbtree. */ static -struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, +struct ksm_rmap_item *unstable_tree_search_insert(struct ksm_rmap_item *rmap_item, struct page *page, struct page **tree_pagep) { @@ -1954,12 +1954,12 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, new = &root->rb_node; while (*new) { - struct rmap_item *tree_rmap_item; + struct ksm_rmap_item *tree_rmap_item; struct page *tree_page; int ret; cond_resched(); - tree_rmap_item = rb_entry(*new, struct rmap_item, node); + tree_rmap_item = rb_entry(*new, struct ksm_rmap_item, node); tree_page = get_mergeable_page(tree_rmap_item); if (!tree_page) return NULL; @@ -2011,8 +2011,8 @@ struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, * rmap_items hanging off a given node of the stable tree, all sharing * the same ksm page. */ -static void stable_tree_append(struct rmap_item *rmap_item, - struct stable_node *stable_node, +static void stable_tree_append(struct ksm_rmap_item *rmap_item, + struct ksm_stable_node *stable_node, bool max_page_sharing_bypass) { /* @@ -2054,12 +2054,12 @@ static void stable_tree_append(struct rmap_item *rmap_item, * @page: the page that we are searching identical page to. * @rmap_item: the reverse mapping into the virtual address of this page */ -static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) +static void cmp_and_merge_page(struct page *page, struct ksm_rmap_item *rmap_item) { struct mm_struct *mm = rmap_item->mm; - struct rmap_item *tree_rmap_item; + struct ksm_rmap_item *tree_rmap_item; struct page *tree_page = NULL; - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; struct page *kpage; unsigned int checksum; int err; @@ -2215,11 +2215,11 @@ static void cmp_and_merge_page(struct page *page, struct rmap_item *rmap_item) } } -static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, - struct rmap_item **rmap_list, +static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, + struct ksm_rmap_item **rmap_list, unsigned long addr) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; while (*rmap_list) { rmap_item = *rmap_list; @@ -2244,12 +2244,12 @@ static struct rmap_item *get_next_rmap_item(struct mm_slot *mm_slot, return rmap_item; } -static struct rmap_item *scan_get_next_rmap_item(struct page **page) +static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; - struct mm_slot *slot; + struct ksm_mm_slot *slot; struct vm_area_struct *vma; - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; @@ -2277,7 +2277,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) * so prune them once before each full scan. */ if (!ksm_merge_across_nodes) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; struct page *page; list_for_each_entry_safe(stable_node, next, @@ -2294,7 +2294,7 @@ static struct rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct mm_slot, mm_list); + slot = list_entry(slot->mm_list.next, struct ksm_mm_slot, mm_list); ksm_scan.mm_slot = slot; spin_unlock(&ksm_mmlist_lock); /* @@ -2368,7 +2368,7 @@ no_vmas: spin_lock(&ksm_mmlist_lock); ksm_scan.mm_slot = list_entry(slot->mm_list.next, - struct mm_slot, mm_list); + struct ksm_mm_slot, mm_list); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2414,7 +2414,7 @@ no_vmas: */ static void ksm_do_scan(unsigned int scan_npages) { - struct rmap_item *rmap_item; + struct ksm_rmap_item *rmap_item; struct page *page; while (scan_npages-- && likely(!freezing(current))) { @@ -2518,7 +2518,7 @@ EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; int needs_wakeup; mm_slot = alloc_mm_slot(); @@ -2557,7 +2557,7 @@ int __ksm_enter(struct mm_struct *mm) void __ksm_exit(struct mm_struct *mm) { - struct mm_slot *mm_slot; + struct ksm_mm_slot *mm_slot; int easy_to_free = 0; /* @@ -2635,8 +2635,8 @@ struct page *ksm_might_need_to_copy(struct page *page, void rmap_walk_ksm(struct folio *folio, struct rmap_walk_control *rwc) { - struct stable_node *stable_node; - struct rmap_item *rmap_item; + struct ksm_stable_node *stable_node; + struct ksm_rmap_item *rmap_item; int search_new_forks = 0; VM_BUG_ON_FOLIO(!folio_test_ksm(folio), folio); @@ -2706,7 +2706,7 @@ again: #ifdef CONFIG_MIGRATION void folio_migrate_ksm(struct folio *newfolio, struct folio *folio) { - struct stable_node *stable_node; + struct ksm_stable_node *stable_node; VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_FOLIO(!folio_test_locked(newfolio), newfolio); @@ -2739,7 +2739,7 @@ static void wait_while_offlining(void) } } -static bool stable_node_dup_remove_range(struct stable_node *stable_node, +static bool stable_node_dup_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn) { @@ -2755,12 +2755,12 @@ static bool stable_node_dup_remove_range(struct stable_node *stable_node, return false; } -static bool stable_node_chain_remove_range(struct stable_node *stable_node, +static bool stable_node_chain_remove_range(struct ksm_stable_node *stable_node, unsigned long start_pfn, unsigned long end_pfn, struct rb_root *root) { - struct stable_node *dup; + struct ksm_stable_node *dup; struct hlist_node *hlist_safe; if (!is_stable_node_chain(stable_node)) { @@ -2784,14 +2784,14 @@ static bool stable_node_chain_remove_range(struct stable_node *stable_node, static void ksm_check_stable_tree(unsigned long start_pfn, unsigned long end_pfn) { - struct stable_node *stable_node, *next; + struct ksm_stable_node *stable_node, *next; struct rb_node *node; int nid; for (nid = 0; nid < ksm_nr_node_ids; nid++) { node = rb_first(root_stable_tree + nid); while (node) { - stable_node = rb_entry(node, struct stable_node, node); + stable_node = rb_entry(node, struct ksm_stable_node, node); if (stable_node_chain_remove_range(stable_node, start_pfn, end_pfn, root_stable_tree + From 23f746e412b405fbd6fb9652c0f7c33818713c43 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:49 +0800 Subject: [PATCH 072/350] ksm: convert ksm_mm_slot.mm_list to ksm_mm_slot.mm_node In order to use common struct mm_slot, convert ksm_mm_slot.mm_list to ksm_mm_slot.mm_node in advance, no functional change. Link: https://lkml.kernel.org/r/20220831031951.43152-6-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index de61946106ce..f9cd502233f0 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -114,13 +114,13 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned * @link: link to the mm_slots hash list - * @mm_list: link into the mm_slots list, rooted in ksm_mm_head + * @mm_node: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ struct ksm_mm_slot { struct hlist_node link; - struct list_head mm_list; + struct list_head mm_node; struct ksm_rmap_item *rmap_list; struct mm_struct *mm; }; @@ -231,7 +231,7 @@ static LIST_HEAD(migrate_nodes); static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { - .mm_list = LIST_HEAD_INIT(ksm_mm_head.mm_list), + .mm_node = LIST_HEAD_INIT(ksm_mm_head.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, @@ -981,8 +981,8 @@ static int unmerge_and_remove_all_rmap_items(void) int err = 0; spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_node.next, + struct ksm_mm_slot, mm_node); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; @@ -1006,11 +1006,11 @@ static int unmerge_and_remove_all_rmap_items(void) mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(mm_slot->mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, + struct ksm_mm_slot, mm_node); if (ksm_test_exit(mm)) { hash_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + list_del(&mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); free_mm_slot(mm_slot); @@ -2253,7 +2253,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) struct vma_iterator vmi; int nid; - if (list_empty(&ksm_mm_head.mm_list)) + if (list_empty(&ksm_mm_head.mm_node)) return NULL; slot = ksm_scan.mm_slot; @@ -2294,7 +2294,7 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_list.next, struct ksm_mm_slot, mm_list); + slot = list_entry(slot->mm_node.next, struct ksm_mm_slot, mm_node); ksm_scan.mm_slot = slot; spin_unlock(&ksm_mmlist_lock); /* @@ -2367,8 +2367,8 @@ no_vmas: remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(slot->mm_list.next, - struct ksm_mm_slot, mm_list); + ksm_scan.mm_slot = list_entry(slot->mm_node.next, + struct ksm_mm_slot, mm_node); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2380,7 +2380,7 @@ no_vmas: * mmap_lock then protects against race with MADV_MERGEABLE). */ hash_del(&slot->link); - list_del(&slot->mm_list); + list_del(&slot->mm_node); spin_unlock(&ksm_mmlist_lock); free_mm_slot(slot); @@ -2429,7 +2429,7 @@ static void ksm_do_scan(unsigned int scan_npages) static int ksmd_should_run(void) { - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_list); + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_node); } static int ksm_scan_thread(void *nothing) @@ -2526,7 +2526,7 @@ int __ksm_enter(struct mm_struct *mm) return -ENOMEM; /* Check ksm_run too? Would need tighter locking */ - needs_wakeup = list_empty(&ksm_mm_head.mm_list); + needs_wakeup = list_empty(&ksm_mm_head.mm_node); spin_lock(&ksm_mmlist_lock); insert_to_mm_slots_hash(mm, mm_slot); @@ -2541,9 +2541,9 @@ int __ksm_enter(struct mm_struct *mm) * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) - list_add_tail(&mm_slot->mm_list, &ksm_mm_head.mm_list); + list_add_tail(&mm_slot->mm_node, &ksm_mm_head.mm_node); else - list_add_tail(&mm_slot->mm_list, &ksm_scan.mm_slot->mm_list); + list_add_tail(&mm_slot->mm_node, &ksm_scan.mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -2574,11 +2574,11 @@ void __ksm_exit(struct mm_struct *mm) if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { hash_del(&mm_slot->link); - list_del(&mm_slot->mm_list); + list_del(&mm_slot->mm_node); easy_to_free = 1; } else { - list_move(&mm_slot->mm_list, - &ksm_scan.mm_slot->mm_list); + list_move(&mm_slot->mm_node, + &ksm_scan.mm_slot->mm_node); } } spin_unlock(&ksm_mmlist_lock); From 79b09941563737fad52a6b5ce9b9f0e1abf01bec Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:50 +0800 Subject: [PATCH 073/350] ksm: convert ksm_mm_slot.link to ksm_mm_slot.hash In order to use common struct mm_slot, convert ksm_mm_slot.link to ksm_mm_slot.hash in advance, no functional change. Link: https://lkml.kernel.org/r/20220831031951.43152-7-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index f9cd502233f0..9300e7a48e88 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -113,13 +113,13 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned - * @link: link to the mm_slots hash list + * @hash: link to the mm_slots hash list * @mm_node: link into the mm_slots list, rooted in ksm_mm_head * @rmap_list: head for this mm_slot's singly-linked list of rmap_items * @mm: the mm that this information is valid for */ struct ksm_mm_slot { - struct hlist_node link; + struct hlist_node hash; struct list_head mm_node; struct ksm_rmap_item *rmap_list; struct mm_struct *mm; @@ -425,7 +425,7 @@ static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) { struct ksm_mm_slot *slot; - hash_for_each_possible(mm_slots_hash, slot, link, (unsigned long)mm) + hash_for_each_possible(mm_slots_hash, slot, hash, (unsigned long)mm) if (slot->mm == mm) return slot; @@ -436,7 +436,7 @@ static void insert_to_mm_slots_hash(struct mm_struct *mm, struct ksm_mm_slot *mm_slot) { mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->link, (unsigned long)mm); + hash_add(mm_slots_hash, &mm_slot->hash, (unsigned long)mm); } /* @@ -1009,7 +1009,7 @@ static int unmerge_and_remove_all_rmap_items(void) ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, struct ksm_mm_slot, mm_node); if (ksm_test_exit(mm)) { - hash_del(&mm_slot->link); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); spin_unlock(&ksm_mmlist_lock); @@ -2379,7 +2379,7 @@ no_vmas: * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ - hash_del(&slot->link); + hash_del(&slot->hash); list_del(&slot->mm_node); spin_unlock(&ksm_mmlist_lock); @@ -2573,7 +2573,7 @@ void __ksm_exit(struct mm_struct *mm) mm_slot = get_mm_slot(mm); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hash_del(&mm_slot->link); + hash_del(&mm_slot->hash); list_del(&mm_slot->mm_node); easy_to_free = 1; } else { From 58730ab6c7cab4e8525b7492ac369ccbfff5093a Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Wed, 31 Aug 2022 11:19:51 +0800 Subject: [PATCH 074/350] ksm: convert to use common struct mm_slot Convert to use common struct mm_slot, no functional change. Link: https://lkml.kernel.org/r/20220831031951.43152-8-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Johannes Weiner Cc: Matthew Wilcox Cc: Mike Rapoport Cc: Minchan Kim Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/ksm.c | 132 +++++++++++++++++++++++-------------------------------- 1 file changed, 56 insertions(+), 76 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index 9300e7a48e88..c3edb5836a44 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -42,6 +42,7 @@ #include #include "internal.h" +#include "mm_slot.h" #ifdef CONFIG_NUMA #define NUMA(x) (x) @@ -113,16 +114,12 @@ /** * struct ksm_mm_slot - ksm information per mm that is being scanned - * @hash: link to the mm_slots hash list - * @mm_node: link into the mm_slots list, rooted in ksm_mm_head + * @slot: hash lookup from mm to mm_slot * @rmap_list: head for this mm_slot's singly-linked list of rmap_items - * @mm: the mm that this information is valid for */ struct ksm_mm_slot { - struct hlist_node hash; - struct list_head mm_node; + struct mm_slot slot; struct ksm_rmap_item *rmap_list; - struct mm_struct *mm; }; /** @@ -231,7 +228,7 @@ static LIST_HEAD(migrate_nodes); static DEFINE_HASHTABLE(mm_slots_hash, MM_SLOTS_HASH_BITS); static struct ksm_mm_slot ksm_mm_head = { - .mm_node = LIST_HEAD_INIT(ksm_mm_head.mm_node), + .slot.mm_node = LIST_HEAD_INIT(ksm_mm_head.slot.mm_node), }; static struct ksm_scan ksm_scan = { .mm_slot = &ksm_mm_head, @@ -409,36 +406,6 @@ static inline void free_stable_node(struct ksm_stable_node *stable_node) kmem_cache_free(stable_node_cache, stable_node); } -static inline struct ksm_mm_slot *alloc_mm_slot(void) -{ - if (!mm_slot_cache) /* initialization failed */ - return NULL; - return kmem_cache_zalloc(mm_slot_cache, GFP_KERNEL); -} - -static inline void free_mm_slot(struct ksm_mm_slot *mm_slot) -{ - kmem_cache_free(mm_slot_cache, mm_slot); -} - -static struct ksm_mm_slot *get_mm_slot(struct mm_struct *mm) -{ - struct ksm_mm_slot *slot; - - hash_for_each_possible(mm_slots_hash, slot, hash, (unsigned long)mm) - if (slot->mm == mm) - return slot; - - return NULL; -} - -static void insert_to_mm_slots_hash(struct mm_struct *mm, - struct ksm_mm_slot *mm_slot) -{ - mm_slot->mm = mm; - hash_add(mm_slots_hash, &mm_slot->hash, (unsigned long)mm); -} - /* * ksmd, and unmerge_and_remove_all_rmap_items(), must not touch an mm's * page tables after it has passed through ksm_exit() - which, if necessary, @@ -976,20 +943,22 @@ static int remove_all_stable_nodes(void) static int unmerge_and_remove_all_rmap_items(void) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; struct mm_struct *mm; struct vm_area_struct *vma; int err = 0; spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(ksm_mm_head.mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(ksm_mm_head.slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); spin_unlock(&ksm_mmlist_lock); for (mm_slot = ksm_scan.mm_slot; mm_slot != &ksm_mm_head; mm_slot = ksm_scan.mm_slot) { - VMA_ITERATOR(vmi, mm_slot->mm, 0); + VMA_ITERATOR(vmi, mm_slot->slot.mm, 0); - mm = mm_slot->mm; + mm = mm_slot->slot.mm; mmap_read_lock(mm); for_each_vma(vmi, vma) { if (ksm_test_exit(mm)) @@ -1006,14 +975,15 @@ static int unmerge_and_remove_all_rmap_items(void) mmap_read_unlock(mm); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(mm_slot->mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_test_exit(mm)) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else @@ -2235,7 +2205,7 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, rmap_item = alloc_rmap_item(); if (rmap_item) { /* It has already been zeroed */ - rmap_item->mm = mm_slot->mm; + rmap_item->mm = mm_slot->slot.mm; rmap_item->mm->ksm_rmap_items++; rmap_item->address = addr; rmap_item->rmap_list = *rmap_list; @@ -2247,17 +2217,18 @@ static struct ksm_rmap_item *get_next_rmap_item(struct ksm_mm_slot *mm_slot, static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) { struct mm_struct *mm; - struct ksm_mm_slot *slot; + struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; struct vm_area_struct *vma; struct ksm_rmap_item *rmap_item; struct vma_iterator vmi; int nid; - if (list_empty(&ksm_mm_head.mm_node)) + if (list_empty(&ksm_mm_head.slot.mm_node)) return NULL; - slot = ksm_scan.mm_slot; - if (slot == &ksm_mm_head) { + mm_slot = ksm_scan.mm_slot; + if (mm_slot == &ksm_mm_head) { /* * A number of pages can hang around indefinitely on per-cpu * pagevecs, raised page count preventing write_protect_page @@ -2294,20 +2265,23 @@ static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page) root_unstable_tree[nid] = RB_ROOT; spin_lock(&ksm_mmlist_lock); - slot = list_entry(slot->mm_node.next, struct ksm_mm_slot, mm_node); - ksm_scan.mm_slot = slot; + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); + ksm_scan.mm_slot = mm_slot; spin_unlock(&ksm_mmlist_lock); /* * Although we tested list_empty() above, a racing __ksm_exit * of the last mm on the list may have removed it since then. */ - if (slot == &ksm_mm_head) + if (mm_slot == &ksm_mm_head) return NULL; next_mm: ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; + ksm_scan.rmap_list = &mm_slot->rmap_list; } + slot = &mm_slot->slot; mm = slot->mm; vma_iter_init(&vmi, mm, ksm_scan.address); @@ -2337,7 +2311,7 @@ next_mm: if (PageAnon(*page)) { flush_anon_page(vma, *page, ksm_scan.address); flush_dcache_page(*page); - rmap_item = get_next_rmap_item(slot, + rmap_item = get_next_rmap_item(mm_slot, ksm_scan.rmap_list, ksm_scan.address); if (rmap_item) { ksm_scan.rmap_list = @@ -2358,7 +2332,7 @@ next_page: if (ksm_test_exit(mm)) { no_vmas: ksm_scan.address = 0; - ksm_scan.rmap_list = &slot->rmap_list; + ksm_scan.rmap_list = &mm_slot->rmap_list; } /* * Nuke all the rmap_items that are above this current rmap: @@ -2367,8 +2341,9 @@ no_vmas: remove_trailing_rmap_items(ksm_scan.rmap_list); spin_lock(&ksm_mmlist_lock); - ksm_scan.mm_slot = list_entry(slot->mm_node.next, - struct ksm_mm_slot, mm_node); + slot = list_entry(mm_slot->slot.mm_node.next, + struct mm_slot, mm_node); + ksm_scan.mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (ksm_scan.address == 0) { /* * We've completed a full scan of all vmas, holding mmap_lock @@ -2379,11 +2354,11 @@ no_vmas: * or when all VM_MERGEABLE areas have been unmapped (and * mmap_lock then protects against race with MADV_MERGEABLE). */ - hash_del(&slot->hash); - list_del(&slot->mm_node); + hash_del(&mm_slot->slot.hash); + list_del(&mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); - free_mm_slot(slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmap_read_unlock(mm); mmdrop(mm); @@ -2400,8 +2375,8 @@ no_vmas: } /* Repeat until we've completed scanning the whole list */ - slot = ksm_scan.mm_slot; - if (slot != &ksm_mm_head) + mm_slot = ksm_scan.mm_slot; + if (mm_slot != &ksm_mm_head) goto next_mm; ksm_scan.seqnr++; @@ -2429,7 +2404,7 @@ static void ksm_do_scan(unsigned int scan_npages) static int ksmd_should_run(void) { - return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.mm_node); + return (ksm_run & KSM_RUN_MERGE) && !list_empty(&ksm_mm_head.slot.mm_node); } static int ksm_scan_thread(void *nothing) @@ -2519,17 +2494,20 @@ EXPORT_SYMBOL_GPL(ksm_madvise); int __ksm_enter(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; int needs_wakeup; - mm_slot = alloc_mm_slot(); + mm_slot = mm_slot_alloc(mm_slot_cache); if (!mm_slot) return -ENOMEM; + slot = &mm_slot->slot; + /* Check ksm_run too? Would need tighter locking */ - needs_wakeup = list_empty(&ksm_mm_head.mm_node); + needs_wakeup = list_empty(&ksm_mm_head.slot.mm_node); spin_lock(&ksm_mmlist_lock); - insert_to_mm_slots_hash(mm, mm_slot); + mm_slot_insert(mm_slots_hash, mm, slot); /* * When KSM_RUN_MERGE (or KSM_RUN_STOP), * insert just behind the scanning cursor, to let the area settle @@ -2541,9 +2519,9 @@ int __ksm_enter(struct mm_struct *mm) * missed: then we might as well insert at the end of the list. */ if (ksm_run & KSM_RUN_UNMERGE) - list_add_tail(&mm_slot->mm_node, &ksm_mm_head.mm_node); + list_add_tail(&slot->mm_node, &ksm_mm_head.slot.mm_node); else - list_add_tail(&mm_slot->mm_node, &ksm_scan.mm_slot->mm_node); + list_add_tail(&slot->mm_node, &ksm_scan.mm_slot->slot.mm_node); spin_unlock(&ksm_mmlist_lock); set_bit(MMF_VM_MERGEABLE, &mm->flags); @@ -2558,6 +2536,7 @@ int __ksm_enter(struct mm_struct *mm) void __ksm_exit(struct mm_struct *mm) { struct ksm_mm_slot *mm_slot; + struct mm_slot *slot; int easy_to_free = 0; /* @@ -2570,21 +2549,22 @@ void __ksm_exit(struct mm_struct *mm) */ spin_lock(&ksm_mmlist_lock); - mm_slot = get_mm_slot(mm); + slot = mm_slot_lookup(mm_slots_hash, mm); + mm_slot = mm_slot_entry(slot, struct ksm_mm_slot, slot); if (mm_slot && ksm_scan.mm_slot != mm_slot) { if (!mm_slot->rmap_list) { - hash_del(&mm_slot->hash); - list_del(&mm_slot->mm_node); + hash_del(&slot->hash); + list_del(&slot->mm_node); easy_to_free = 1; } else { - list_move(&mm_slot->mm_node, - &ksm_scan.mm_slot->mm_node); + list_move(&slot->mm_node, + &ksm_scan.mm_slot->slot.mm_node); } } spin_unlock(&ksm_mmlist_lock); if (easy_to_free) { - free_mm_slot(mm_slot); + mm_slot_free(mm_slot_cache, mm_slot); clear_bit(MMF_VM_MERGEABLE, &mm->flags); mmdrop(mm); } else if (mm_slot) { From 49fd9b6df54e610d817f04ab0f94919f5c1a4f66 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:57 +0100 Subject: [PATCH 075/350] mm/vmscan: fix a lot of comments Patch series "MM folio changes for 6.1", v2. My focus this round has been on shmem. I believe it is now fully converted to folios. Of course, shmem interacts with a lot of the swap cache and other parts of the kernel, so there are patches all over the MM. This patch series survives a round of xfstests on tmpfs, which is nice, but hardly an exhaustive test. Hugh was nice enough to run a round of tests on it and found a bug which is fixed in this edition. This patch (of 57): A lot of comments mention pages when they should say folios. Fix them up. [akpm@linux-foundation.org: fixups for mglru additions] Link: https://lkml.kernel.org/r/20220902194653.1739778-1-willy@infradead.org Link: https://lkml.kernel.org/r/20220902194653.1739778-2-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Cc: Hugh Dickins Signed-off-by: Andrew Morton --- mm/vmscan.c | 261 ++++++++++++++++++++++++++-------------------------- 1 file changed, 129 insertions(+), 132 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3ba9423b141d..9ce6cc74d9ea 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -90,7 +90,7 @@ struct scan_control { unsigned long anon_cost; unsigned long file_cost; - /* Can active pages be deactivated as part of reclaim? */ + /* Can active folios be deactivated as part of reclaim? */ #define DEACTIVATE_ANON 1 #define DEACTIVATE_FILE 2 unsigned int may_deactivate:2; @@ -100,10 +100,10 @@ struct scan_control { /* Writepage batching in laptop mode; RECLAIM_WRITE */ unsigned int may_writepage:1; - /* Can mapped pages be reclaimed? */ + /* Can mapped folios be reclaimed? */ unsigned int may_unmap:1; - /* Can pages be swapped as part of reclaim? */ + /* Can folios be swapped as part of reclaim? */ unsigned int may_swap:1; /* Proactive reclaim invoked by userspace through memory.reclaim */ @@ -128,7 +128,7 @@ struct scan_control { /* There is easily reclaimable cold cache in the current node */ unsigned int cache_trim_mode:1; - /* The file pages on the current node are dangerously low */ + /* The file folios on the current node are dangerously low */ unsigned int file_is_tiny:1; /* Always discard instead of demoting to lower tier memory */ @@ -146,7 +146,7 @@ struct scan_control { /* Scan (total_size >> priority) pages at once */ s8 priority; - /* The highest zone to isolate pages for reclaim from */ + /* The highest zone to isolate folios for reclaim from */ s8 reclaim_idx; /* This context's GFP mask */ @@ -454,7 +454,7 @@ static bool cgroup_reclaim(struct scan_control *sc) * * The normal page dirty throttling mechanism in balance_dirty_pages() is * completely broken with the legacy memcg and direct stalling in - * shrink_page_list() is used for throttling instead, which lacks all the + * shrink_folio_list() is used for throttling instead, which lacks all the * niceties such as fairness, adaptive pausing, bandwidth proportional * allocation and configurability. * @@ -575,9 +575,9 @@ static inline bool can_reclaim_anon_pages(struct mem_cgroup *memcg, } /* - * This misses isolated pages which are not accounted for to save counters. + * This misses isolated folios which are not accounted for to save counters. * As the data only determines if reclaim or compaction continues, it is - * not expected that isolated pages will be a dominating factor. + * not expected that isolated folios will be a dominating factor. */ unsigned long zone_reclaimable_pages(struct zone *zone) { @@ -1050,9 +1050,9 @@ void drop_slab(void) static inline int is_page_cache_freeable(struct folio *folio) { /* - * A freeable page cache page is referenced only by the caller - * that isolated the page, the page cache and optional buffer - * heads at page->private. + * A freeable page cache folio is referenced only by the caller + * that isolated the folio, the page cache and optional filesystem + * private data at folio->private. */ return folio_ref_count(folio) - folio_test_private(folio) == 1 + folio_nr_pages(folio); @@ -1092,8 +1092,8 @@ static bool skip_throttle_noprogress(pg_data_t *pgdat) return true; /* - * If there are a lot of dirty/writeback pages then do not - * throttle as throttling will occur when the pages cycle + * If there are a lot of dirty/writeback folios then do not + * throttle as throttling will occur when the folios cycle * towards the end of the LRU if still under writeback. */ for (i = 0; i < MAX_NR_ZONES; i++) { @@ -1136,7 +1136,7 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) * short. Failing to make progress or waiting on writeback are * potentially long-lived events so use a longer timeout. This is shaky * logic as a failure to make progress could be due to anything from - * writeback to a slow device to excessive references pages at the tail + * writeback to a slow device to excessive referenced folios at the tail * of the inactive LRU. */ switch(reason) { @@ -1182,8 +1182,8 @@ void reclaim_throttle(pg_data_t *pgdat, enum vmscan_throttle_state reason) } /* - * Account for pages written if tasks are throttled waiting on dirty - * pages to clean. If enough pages have been cleaned since throttling + * Account for folios written if tasks are throttled waiting on dirty + * folios to clean. If enough folios have been cleaned since throttling * started then wakeup the throttled tasks. */ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, @@ -1209,18 +1209,18 @@ void __acct_reclaim_writeback(pg_data_t *pgdat, struct folio *folio, /* possible outcome of pageout() */ typedef enum { - /* failed to write page out, page is locked */ + /* failed to write folio out, folio is locked */ PAGE_KEEP, - /* move page to the active list, page is locked */ + /* move folio to the active list, folio is locked */ PAGE_ACTIVATE, - /* page has been sent to the disk successfully, page is unlocked */ + /* folio has been sent to the disk successfully, folio is unlocked */ PAGE_SUCCESS, - /* page is clean and locked */ + /* folio is clean and locked */ PAGE_CLEAN, } pageout_t; /* - * pageout is called by shrink_page_list() for each dirty page. + * pageout is called by shrink_folio_list() for each dirty folio. * Calls ->writepage(). */ static pageout_t pageout(struct folio *folio, struct address_space *mapping, @@ -1294,7 +1294,7 @@ static pageout_t pageout(struct folio *folio, struct address_space *mapping, } /* - * Same as remove_mapping, but if the page is removed from the mapping, it + * Same as remove_mapping, but if the folio is removed from the mapping, it * gets returned with a refcount of 0. */ static int __remove_mapping(struct address_space *mapping, struct folio *folio, @@ -1310,34 +1310,34 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, spin_lock(&mapping->host->i_lock); xa_lock_irq(&mapping->i_pages); /* - * The non racy check for a busy page. + * The non racy check for a busy folio. * * Must be careful with the order of the tests. When someone has - * a ref to the page, it may be possible that they dirty it then - * drop the reference. So if PageDirty is tested before page_count - * here, then the following race may occur: + * a ref to the folio, it may be possible that they dirty it then + * drop the reference. So if the dirty flag is tested before the + * refcount here, then the following race may occur: * * get_user_pages(&page); * [user mapping goes away] * write_to(page); - * !PageDirty(page) [good] - * SetPageDirty(page); - * put_page(page); - * !page_count(page) [good, discard it] + * !folio_test_dirty(folio) [good] + * folio_set_dirty(folio); + * folio_put(folio); + * !refcount(folio) [good, discard it] * * [oops, our write_to data is lost] * * Reversing the order of the tests ensures such a situation cannot - * escape unnoticed. The smp_rmb is needed to ensure the page->flags - * load is not satisfied before that of page->_refcount. + * escape unnoticed. The smp_rmb is needed to ensure the folio->flags + * load is not satisfied before that of folio->_refcount. * - * Note that if SetPageDirty is always performed via set_page_dirty, + * Note that if the dirty flag is always set via folio_mark_dirty, * and thus under the i_pages lock, then this ordering is not required. */ refcount = 1 + folio_nr_pages(folio); if (!folio_ref_freeze(folio, refcount)) goto cannot_free; - /* note: atomic_cmpxchg in page_ref_freeze provides the smp_rmb */ + /* note: atomic_cmpxchg in folio_ref_freeze provides the smp_rmb */ if (unlikely(folio_test_dirty(folio))) { folio_ref_unfreeze(folio, refcount); goto cannot_free; @@ -1368,7 +1368,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, * back. * * We also don't store shadows for DAX mappings because the - * only page cache pages found in these are zero pages + * only page cache folios found in these are zero pages * covering holes, and because we don't want to mix DAX * exceptional entries and shadow exceptional entries in the * same address_space. @@ -1436,14 +1436,14 @@ void folio_putback_lru(struct folio *folio) folio_put(folio); /* drop ref from isolate */ } -enum page_references { - PAGEREF_RECLAIM, - PAGEREF_RECLAIM_CLEAN, - PAGEREF_KEEP, - PAGEREF_ACTIVATE, +enum folio_references { + FOLIOREF_RECLAIM, + FOLIOREF_RECLAIM_CLEAN, + FOLIOREF_KEEP, + FOLIOREF_ACTIVATE, }; -static enum page_references folio_check_references(struct folio *folio, +static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { int referenced_ptes, referenced_folio; @@ -1458,11 +1458,11 @@ static enum page_references folio_check_references(struct folio *folio, * Let the folio, now marked Mlocked, be moved to the unevictable list. */ if (vm_flags & VM_LOCKED) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; /* rmap lock contention: rotate */ if (referenced_ptes == -1) - return PAGEREF_KEEP; + return FOLIOREF_KEEP; if (referenced_ptes) { /* @@ -1482,34 +1482,34 @@ static enum page_references folio_check_references(struct folio *folio, folio_set_referenced(folio); if (referenced_folio || referenced_ptes > 1) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; /* * Activate file-backed executable folios after first usage. */ if ((vm_flags & VM_EXEC) && folio_is_file_lru(folio)) - return PAGEREF_ACTIVATE; + return FOLIOREF_ACTIVATE; - return PAGEREF_KEEP; + return FOLIOREF_KEEP; } /* Reclaim if clean, defer dirty folios to writeback */ if (referenced_folio && folio_is_file_lru(folio)) - return PAGEREF_RECLAIM_CLEAN; + return FOLIOREF_RECLAIM_CLEAN; - return PAGEREF_RECLAIM; + return FOLIOREF_RECLAIM; } -/* Check if a page is dirty or under writeback */ +/* Check if a folio is dirty or under writeback */ static void folio_check_dirty_writeback(struct folio *folio, bool *dirty, bool *writeback) { struct address_space *mapping; /* - * Anonymous pages are not handled by flushers and must be written + * Anonymous folios are not handled by flushers and must be written * from reclaim context. Do not stall reclaim based on them. - * MADV_FREE anonymous pages are put into inactive file list too. + * MADV_FREE anonymous folios are put into inactive file list too. * They could be mistakenly treated as file lru. So further anon * test is needed. */ @@ -1564,11 +1564,10 @@ static struct page *alloc_demote_page(struct page *page, unsigned long private) } /* - * Take pages on @demote_list and attempt to demote them to - * another node. Pages which are not demoted are left on - * @demote_pages. + * Take folios on @demote_folios and attempt to demote them to another node. + * Folios which are not demoted are left on @demote_folios. */ -static unsigned int demote_page_list(struct list_head *demote_pages, +static unsigned int demote_folio_list(struct list_head *demote_folios, struct pglist_data *pgdat) { int target_nid = next_demotion_node(pgdat->node_id); @@ -1587,7 +1586,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages, .nmask = &allowed_mask }; - if (list_empty(demote_pages)) + if (list_empty(demote_folios)) return 0; if (target_nid == NUMA_NO_NODE) @@ -1596,7 +1595,7 @@ static unsigned int demote_page_list(struct list_head *demote_pages, node_get_allowed_targets(pgdat, &allowed_mask); /* Demotion ignores all cpuset and mempolicy settings */ - migrate_pages(demote_pages, alloc_demote_page, NULL, + migrate_pages(demote_folios, alloc_demote_page, NULL, (unsigned long)&mtc, MIGRATE_ASYNC, MR_DEMOTION, &nr_succeeded); @@ -1625,17 +1624,15 @@ static bool may_enter_fs(struct folio *folio, gfp_t gfp_mask) } /* - * shrink_page_list() returns the number of reclaimed pages + * shrink_folio_list() returns the number of reclaimed pages */ -static unsigned int shrink_page_list(struct list_head *page_list, - struct pglist_data *pgdat, - struct scan_control *sc, - struct reclaim_stat *stat, - bool ignore_references) +static unsigned int shrink_folio_list(struct list_head *folio_list, + struct pglist_data *pgdat, struct scan_control *sc, + struct reclaim_stat *stat, bool ignore_references) { - LIST_HEAD(ret_pages); - LIST_HEAD(free_pages); - LIST_HEAD(demote_pages); + LIST_HEAD(ret_folios); + LIST_HEAD(free_folios); + LIST_HEAD(demote_folios); unsigned int nr_reclaimed = 0; unsigned int pgactivate = 0; bool do_demote_pass; @@ -1646,16 +1643,16 @@ static unsigned int shrink_page_list(struct list_head *page_list, do_demote_pass = can_demote(pgdat->node_id, sc); retry: - while (!list_empty(page_list)) { + while (!list_empty(folio_list)) { struct address_space *mapping; struct folio *folio; - enum page_references references = PAGEREF_RECLAIM; + enum folio_references references = FOLIOREF_RECLAIM; bool dirty, writeback; unsigned int nr_pages; cond_resched(); - folio = lru_to_folio(page_list); + folio = lru_to_folio(folio_list); list_del(&folio->lru); if (!folio_trylock(folio)) @@ -1779,7 +1776,7 @@ retry: folio_unlock(folio); folio_wait_writeback(folio); /* then go back and try same folio again */ - list_add_tail(&folio->lru, page_list); + list_add_tail(&folio->lru, folio_list); continue; } } @@ -1788,13 +1785,13 @@ retry: references = folio_check_references(folio, sc); switch (references) { - case PAGEREF_ACTIVATE: + case FOLIOREF_ACTIVATE: goto activate_locked; - case PAGEREF_KEEP: + case FOLIOREF_KEEP: stat->nr_ref_keep += nr_pages; goto keep_locked; - case PAGEREF_RECLAIM: - case PAGEREF_RECLAIM_CLEAN: + case FOLIOREF_RECLAIM: + case FOLIOREF_RECLAIM_CLEAN: ; /* try to reclaim the folio below */ } @@ -1804,7 +1801,7 @@ retry: */ if (do_demote_pass && (thp_migration_supported() || !folio_test_large(folio))) { - list_add(&folio->lru, &demote_pages); + list_add(&folio->lru, &demote_folios); folio_unlock(folio); continue; } @@ -1831,7 +1828,7 @@ retry: */ if (!folio_entire_mapcount(folio) && split_folio_to_list(folio, - page_list)) + folio_list)) goto activate_locked; } if (!add_to_swap(folio)) { @@ -1839,7 +1836,7 @@ retry: goto activate_locked_split; /* Fallback to swap normal pages */ if (split_folio_to_list(folio, - page_list)) + folio_list)) goto activate_locked; #ifdef CONFIG_TRANSPARENT_HUGEPAGE count_vm_event(THP_SWPOUT_FALLBACK); @@ -1851,7 +1848,7 @@ retry: } else if (folio_test_swapbacked(folio) && folio_test_large(folio)) { /* Split shmem folio */ - if (split_folio_to_list(folio, page_list)) + if (split_folio_to_list(folio, folio_list)) goto keep_locked; } @@ -1916,7 +1913,7 @@ retry: goto activate_locked; } - if (references == PAGEREF_RECLAIM_CLEAN) + if (references == FOLIOREF_RECLAIM_CLEAN) goto keep_locked; if (!may_enter_fs(folio, sc->gfp_mask)) goto keep_locked; @@ -2029,13 +2026,13 @@ free_it: nr_reclaimed += nr_pages; /* - * Is there need to periodically free_page_list? It would + * Is there need to periodically free_folio_list? It would * appear not as the counts should be low */ if (unlikely(folio_test_large(folio))) destroy_large_folio(folio); else - list_add(&folio->lru, &free_pages); + list_add(&folio->lru, &free_folios); continue; activate_locked_split: @@ -2063,29 +2060,29 @@ activate_locked: keep_locked: folio_unlock(folio); keep: - list_add(&folio->lru, &ret_pages); + list_add(&folio->lru, &ret_folios); VM_BUG_ON_FOLIO(folio_test_lru(folio) || folio_test_unevictable(folio), folio); } - /* 'page_list' is always empty here */ + /* 'folio_list' is always empty here */ /* Migrate folios selected for demotion */ - nr_reclaimed += demote_page_list(&demote_pages, pgdat); - /* Folios that could not be demoted are still in @demote_pages */ - if (!list_empty(&demote_pages)) { - /* Folios which weren't demoted go back on @page_list for retry: */ - list_splice_init(&demote_pages, page_list); + nr_reclaimed += demote_folio_list(&demote_folios, pgdat); + /* Folios that could not be demoted are still in @demote_folios */ + if (!list_empty(&demote_folios)) { + /* Folios which weren't demoted go back on @folio_list for retry: */ + list_splice_init(&demote_folios, folio_list); do_demote_pass = false; goto retry; } pgactivate = stat->nr_activate[0] + stat->nr_activate[1]; - mem_cgroup_uncharge_list(&free_pages); + mem_cgroup_uncharge_list(&free_folios); try_to_unmap_flush(); - free_unref_page_list(&free_pages); + free_unref_page_list(&free_folios); - list_splice(&ret_pages, page_list); + list_splice(&ret_folios, folio_list); count_vm_events(PGACTIVATE, pgactivate); if (plug) @@ -2094,7 +2091,7 @@ keep: } unsigned int reclaim_clean_pages_from_list(struct zone *zone, - struct list_head *folio_list) + struct list_head *folio_list) { struct scan_control sc = { .gfp_mask = GFP_KERNEL, @@ -2122,7 +2119,7 @@ unsigned int reclaim_clean_pages_from_list(struct zone *zone, * change in the future. */ noreclaim_flag = memalloc_noreclaim_save(); - nr_reclaimed = shrink_page_list(&clean_folios, zone->zone_pgdat, &sc, + nr_reclaimed = shrink_folio_list(&clean_folios, zone->zone_pgdat, &sc, &stat, true); memalloc_noreclaim_restore(noreclaim_flag); @@ -2181,7 +2178,7 @@ static __always_inline void update_lru_sizes(struct lruvec *lruvec, * * returns how many pages were moved onto *@dst. */ -static unsigned long isolate_lru_pages(unsigned long nr_to_scan, +static unsigned long isolate_lru_folios(unsigned long nr_to_scan, struct lruvec *lruvec, struct list_head *dst, unsigned long *nr_scanned, struct scan_control *sc, enum lru_list lru) @@ -2288,8 +2285,8 @@ move: * * Context: * - * (1) Must be called with an elevated refcount on the page. This is a - * fundamental difference from isolate_lru_pages() (which is called + * (1) Must be called with an elevated refcount on the folio. This is a + * fundamental difference from isolate_lru_folios() (which is called * without a stable reference). * (2) The lru_lock must not be held. * (3) Interrupts must be enabled. @@ -2361,13 +2358,13 @@ static int too_many_isolated(struct pglist_data *pgdat, int file, } /* - * move_pages_to_lru() moves folios from private @list to appropriate LRU list. + * move_folios_to_lru() moves folios from private @list to appropriate LRU list. * On return, @list is reused as a list of folios to be freed by the caller. * * Returns the number of pages moved to the given lruvec. */ -static unsigned int move_pages_to_lru(struct lruvec *lruvec, - struct list_head *list) +static unsigned int move_folios_to_lru(struct lruvec *lruvec, + struct list_head *list) { int nr_pages, nr_moved = 0; LIST_HEAD(folios_to_free); @@ -2387,7 +2384,7 @@ static unsigned int move_pages_to_lru(struct lruvec *lruvec, /* * The folio_set_lru needs to be kept here for list integrity. * Otherwise: - * #0 move_pages_to_lru #1 release_pages + * #0 move_folios_to_lru #1 release_pages * if (!folio_put_testzero()) * if (folio_put_testzero()) * !lru //skip lru_lock @@ -2444,11 +2441,11 @@ static int current_may_throttle(void) * shrink_inactive_list() is a helper for shrink_node(). It returns the number * of reclaimed pages */ -static unsigned long -shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, - struct scan_control *sc, enum lru_list lru) +static unsigned long shrink_inactive_list(unsigned long nr_to_scan, + struct lruvec *lruvec, struct scan_control *sc, + enum lru_list lru) { - LIST_HEAD(page_list); + LIST_HEAD(folio_list); unsigned long nr_scanned; unsigned int nr_reclaimed = 0; unsigned long nr_taken; @@ -2475,7 +2472,7 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_lock_irq(&lruvec->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &page_list, + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &folio_list, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); @@ -2490,10 +2487,10 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, if (nr_taken == 0) return 0; - nr_reclaimed = shrink_page_list(&page_list, pgdat, sc, &stat, false); + nr_reclaimed = shrink_folio_list(&folio_list, pgdat, sc, &stat, false); spin_lock_irq(&lruvec->lru_lock); - move_pages_to_lru(lruvec, &page_list); + move_folios_to_lru(lruvec, &folio_list); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, -nr_taken); item = current_is_kswapd() ? PGSTEAL_KSWAPD : PGSTEAL_DIRECT; @@ -2504,16 +2501,16 @@ shrink_inactive_list(unsigned long nr_to_scan, struct lruvec *lruvec, spin_unlock_irq(&lruvec->lru_lock); lru_note_cost(lruvec, file, stat.nr_pageout); - mem_cgroup_uncharge_list(&page_list); - free_unref_page_list(&page_list); + mem_cgroup_uncharge_list(&folio_list); + free_unref_page_list(&folio_list); /* - * If dirty pages are scanned that are not queued for IO, it + * If dirty folios are scanned that are not queued for IO, it * implies that flushers are not doing their job. This can - * happen when memory pressure pushes dirty pages to the end of + * happen when memory pressure pushes dirty folios to the end of * the LRU before the dirty limits are breached and the dirty * data has expired. It can also happen when the proportion of - * dirty pages grows not through writes but through memory + * dirty folios grows not through writes but through memory * pressure reclaiming all the clean cache. And in some cases, * the flushers simply cannot keep up with the allocation * rate. Nudge the flusher threads in case they are asleep. @@ -2572,7 +2569,7 @@ static void shrink_active_list(unsigned long nr_to_scan, spin_lock_irq(&lruvec->lru_lock); - nr_taken = isolate_lru_pages(nr_to_scan, lruvec, &l_hold, + nr_taken = isolate_lru_folios(nr_to_scan, lruvec, &l_hold, &nr_scanned, sc, lru); __mod_node_page_state(pgdat, NR_ISOLATED_ANON + file, nr_taken); @@ -2632,8 +2629,8 @@ static void shrink_active_list(unsigned long nr_to_scan, */ spin_lock_irq(&lruvec->lru_lock); - nr_activate = move_pages_to_lru(lruvec, &l_active); - nr_deactivate = move_pages_to_lru(lruvec, &l_inactive); + nr_activate = move_folios_to_lru(lruvec, &l_active); + nr_deactivate = move_folios_to_lru(lruvec, &l_inactive); /* Keep all free folios in l_active list */ list_splice(&l_inactive, &l_active); @@ -2649,7 +2646,7 @@ static void shrink_active_list(unsigned long nr_to_scan, nr_deactivate, nr_rotated, sc->priority, file); } -static unsigned int reclaim_page_list(struct list_head *page_list, +static unsigned int reclaim_folio_list(struct list_head *folio_list, struct pglist_data *pgdat) { struct reclaim_stat dummy_stat; @@ -2663,9 +2660,9 @@ static unsigned int reclaim_page_list(struct list_head *page_list, .no_demotion = 1, }; - nr_reclaimed = shrink_page_list(page_list, pgdat, &sc, &dummy_stat, false); - while (!list_empty(page_list)) { - folio = lru_to_folio(page_list); + nr_reclaimed = shrink_folio_list(folio_list, pgdat, &sc, &dummy_stat, false); + while (!list_empty(folio_list)) { + folio = lru_to_folio(folio_list); list_del(&folio->lru); folio_putback_lru(folio); } @@ -2695,11 +2692,11 @@ unsigned long reclaim_pages(struct list_head *folio_list) continue; } - nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); nid = folio_nid(lru_to_folio(folio_list)); } while (!list_empty(folio_list)); - nr_reclaimed += reclaim_page_list(&node_folio_list, NODE_DATA(nid)); + nr_reclaimed += reclaim_folio_list(&node_folio_list, NODE_DATA(nid)); memalloc_noreclaim_restore(noreclaim_flag); @@ -2729,13 +2726,13 @@ static unsigned long shrink_list(enum lru_list lru, unsigned long nr_to_scan, * but large enough to avoid thrashing the aggregate readahead window. * * Both inactive lists should also be large enough that each inactive - * page has a chance to be referenced again before it is reclaimed. + * folio has a chance to be referenced again before it is reclaimed. * * If that fails and refaulting is observed, the inactive list grows. * - * The inactive_ratio is the target ratio of ACTIVE to INACTIVE pages + * The inactive_ratio is the target ratio of ACTIVE to INACTIVE folios * on this LRU, maintained by the pageout code. An inactive_ratio - * of 3 means 3:1 or 25% of the pages are kept on the inactive list. + * of 3 means 3:1 or 25% of the folios are kept on the inactive list. * * total target max * memory ratio inactive @@ -2884,8 +2881,8 @@ static void prepare_scan_count(pg_data_t *pgdat, struct scan_control *sc) * Determine how aggressively the anon and file LRU lists should be * scanned. * - * nr[0] = anon inactive pages to scan; nr[1] = anon active pages to scan - * nr[2] = file inactive pages to scan; nr[3] = file active pages to scan + * nr[0] = anon inactive folios to scan; nr[1] = anon active folios to scan + * nr[2] = file inactive folios to scan; nr[3] = file active folios to scan */ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long *nr) @@ -2900,7 +2897,7 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, unsigned long ap, fp; enum lru_list lru; - /* If we have no swap space, do not bother scanning anon pages. */ + /* If we have no swap space, do not bother scanning anon folios. */ if (!sc->may_swap || !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) { scan_balance = SCAN_FILE; goto out; @@ -3647,7 +3644,7 @@ static int folio_update_gen(struct folio *folio, int gen) do { /* lru_gen_del_folio() has isolated this page? */ if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_page_list() */ + /* for shrink_folio_list() */ new_flags = old_flags | BIT(PG_referenced); continue; } @@ -4574,7 +4571,7 @@ static void lru_gen_age_node(struct pglist_data *pgdat, struct scan_control *sc) } /* - * This function exploits spatial locality when shrink_page_list() walks the + * This function exploits spatial locality when shrink_folio_list() walks the * rmap. It scans the adjacent PTEs of a young PTE and promotes hot pages. If * the scan was done cacheline efficiently, it adds the PMD entry pointing to * the PTE table to the Bloom filter. This forms a feedback loop between the @@ -4795,7 +4792,7 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca if (!folio_test_referenced(folio)) set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, 0); - /* for shrink_page_list() */ + /* for shrink_folio_list() */ folio_clear_reclaim(folio); folio_clear_referenced(folio); @@ -4998,7 +4995,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap if (list_empty(&list)) return scanned; - reclaimed = shrink_page_list(&list, pgdat, sc, &stat, false); + reclaimed = shrink_folio_list(&list, pgdat, sc, &stat, false); list_for_each_entry(folio, &list, lru) { /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ @@ -5015,7 +5012,7 @@ static int evict_folios(struct lruvec *lruvec, struct scan_control *sc, int swap spin_lock_irq(&lruvec->lru_lock); - move_pages_to_lru(lruvec, &list); + move_folios_to_lru(lruvec, &list); walk = current->reclaim_state->mm_walk; if (walk && walk->batched) From 379708ffde1b049bc41084e0a0572c44c8a1d2c4 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:58 +0100 Subject: [PATCH 076/350] mm: add the first tail page to struct folio Some of the static checkers get confused by extracting the page from the folio and referring to fields in the first tail page. Adding these fields to struct folio lets us avoid doing that. It has the risk that people will refer to those fields without checking that the folio is actually a large folio, so prefix them with underscores and document the preferred function to use instead. Link: https://lkml.kernel.org/r/20220902194653.1739778-3-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm_types.h | 30 +++++++++++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8f30f262431c..5c87d0f292a2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -245,6 +245,13 @@ struct page { * @_refcount: Do not access this member directly. Use folio_ref_count() * to find how many references there are to this folio. * @memcg_data: Memory Control Group data. + * @_flags_1: For large folios, additional page flags. + * @__head: Points to the folio. Do not use. + * @_folio_dtor: Which destructor to use for this folio. + * @_folio_order: Do not use directly, call folio_order(). + * @_total_mapcount: Do not use directly, call folio_entire_mapcount(). + * @_pincount: Do not use directly, call folio_maybe_dma_pinned(). + * @_folio_nr_pages: Do not use directly, call folio_nr_pages(). * * A folio is a physically, virtually and logically contiguous set * of bytes. It is a power-of-two in size, and it is aligned to that @@ -283,9 +290,17 @@ struct folio { }; struct page page; }; + unsigned long _flags_1; + unsigned long __head; + unsigned char _folio_dtor; + unsigned char _folio_order; + atomic_t _total_mapcount; + atomic_t _pincount; +#ifdef CONFIG_64BIT + unsigned int _folio_nr_pages; +#endif }; -static_assert(sizeof(struct page) == sizeof(struct folio)); #define FOLIO_MATCH(pg, fl) \ static_assert(offsetof(struct page, pg) == offsetof(struct folio, fl)) FOLIO_MATCH(flags, flags); @@ -300,6 +315,19 @@ FOLIO_MATCH(_refcount, _refcount); FOLIO_MATCH(memcg_data, memcg_data); #endif #undef FOLIO_MATCH +#define FOLIO_MATCH(pg, fl) \ + static_assert(offsetof(struct folio, fl) == \ + offsetof(struct page, pg) + sizeof(struct page)) +FOLIO_MATCH(flags, _flags_1); +FOLIO_MATCH(compound_head, __head); +FOLIO_MATCH(compound_dtor, _folio_dtor); +FOLIO_MATCH(compound_order, _folio_order); +FOLIO_MATCH(compound_mapcount, _total_mapcount); +FOLIO_MATCH(compound_pincount, _pincount); +#ifdef CONFIG_64BIT +FOLIO_MATCH(compound_nr, _folio_nr_pages); +#endif +#undef FOLIO_MATCH static inline atomic_t *folio_mapcount_ptr(struct folio *folio) { From c3a15bff46cb5149aeae4c8ae69443d791fa6578 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:45:59 +0100 Subject: [PATCH 077/350] mm: reimplement folio_order() and folio_nr_pages() Instead of calling compound_order() and compound_nr_pages(), use the folio directly. Saves 1905 bytes from mm/filemap.o due to folio_test_large() now being a cheaper check than PageHead(). Link: https://lkml.kernel.org/r/20220902194653.1739778-4-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/mm.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index e56dd8f7eae1..a37c8a29c49b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -729,7 +729,9 @@ static inline unsigned int compound_order(struct page *page) */ static inline unsigned int folio_order(struct folio *folio) { - return compound_order(&folio->page); + if (!folio_test_large(folio)) + return 0; + return folio->_folio_order; } #include @@ -1659,7 +1661,13 @@ static inline void set_page_links(struct page *page, enum zone_type zone, */ static inline long folio_nr_pages(struct folio *folio) { - return compound_nr(&folio->page); + if (!folio_test_large(folio)) + return 1; +#ifdef CONFIG_64BIT + return folio->_folio_nr_pages; +#else + return 1L << folio->_folio_order; +#endif } /** From d788f5b374c2ba204fed57e39acf2452acc24812 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:00 +0100 Subject: [PATCH 078/350] mm: add split_folio() This wrapper removes a need to use split_huge_page(&folio->page). Convert two callers. Link: https://lkml.kernel.org/r/20220902194653.1739778-5-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/huge_mm.h | 5 +++++ mm/shmem.c | 2 +- mm/truncate.c | 2 +- 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index 38265f9f782e..a1341fdcf666 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -444,6 +444,11 @@ static inline int split_folio_to_list(struct folio *folio, return split_huge_page_to_list(&folio->page, list); } +static inline int split_folio(struct folio *folio) +{ + return split_folio_to_list(folio, NULL); +} + /* * archs that select ARCH_WANTS_THP_SWAP but don't support THP_SWP due to * limitations in the implementation like arm64 MTE can override this to diff --git a/mm/shmem.c b/mm/shmem.c index 42e5888bf84d..674bde8b3085 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -629,7 +629,7 @@ next: goto move_back; } - ret = split_huge_page(&folio->page); + ret = split_folio(folio); folio_unlock(folio); folio_put(folio); diff --git a/mm/truncate.c b/mm/truncate.c index 0b0708bf935f..c0be77e5c008 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -240,7 +240,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end) folio_invalidate(folio, offset, length); if (!folio_test_large(folio)) return true; - if (split_huge_page(&folio->page) == 0) + if (split_folio(folio) == 0) return true; if (folio_test_dirty(folio)) return false; From 681ecf6301786cb06942b57f0ef7103b07ae6813 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:01 +0100 Subject: [PATCH 079/350] mm: add folio_add_lru_vma() Convert lru_cache_add_inactive_or_unevictable() to folio_add_lru_vma() and add a compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-6-willy@infradead.org Signed-off-by: "Matthew Wilcox (Oracle)" Signed-off-by: Andrew Morton --- include/linux/swap.h | 10 +++++----- mm/folio-compat.c | 6 ++++++ mm/swap.c | 19 +++++++++---------- 3 files changed, 20 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 6308150b234a..2ede1e3695d9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -379,11 +379,11 @@ extern unsigned long totalreserve_pages; /* linux/mm/swap.c */ -extern void lru_note_cost(struct lruvec *lruvec, bool file, - unsigned int nr_pages); -extern void lru_note_cost_folio(struct folio *); -extern void folio_add_lru(struct folio *); -extern void lru_cache_add(struct page *); +void lru_note_cost(struct lruvec *lruvec, bool file, unsigned int nr_pages); +void lru_note_cost_folio(struct folio *); +void folio_add_lru(struct folio *); +void folio_add_lru_vma(struct folio *, struct vm_area_struct *); +void lru_cache_add(struct page *); void mark_page_accessed(struct page *); void folio_mark_accessed(struct folio *); diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 458618c7302c..e1e23b4947d7 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -88,6 +88,12 @@ void lru_cache_add(struct page *page) } EXPORT_SYMBOL(lru_cache_add); +void lru_cache_add_inactive_or_unevictable(struct page *page, + struct vm_area_struct *vma) +{ + folio_add_lru_vma(page_folio(page), vma); +} + int add_to_page_cache_lru(struct page *page, struct address_space *mapping, pgoff_t index, gfp_t gfp) { diff --git a/mm/swap.c b/mm/swap.c index 0a3871a70952..955930f41d20 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -537,22 +537,21 @@ void folio_add_lru(struct folio *folio) EXPORT_SYMBOL(folio_add_lru); /** - * lru_cache_add_inactive_or_unevictable - * @page: the page to be added to LRU - * @vma: vma in which page is mapped for determining reclaimability + * folio_add_lru_vma() - Add a folio to the appropate LRU list for this VMA. + * @folio: The folio to be added to the LRU. + * @vma: VMA in which the folio is mapped. * - * Place @page on the inactive or unevictable LRU list, depending on its - * evictability. + * If the VMA is mlocked, @folio is added to the unevictable list. + * Otherwise, it is treated the same way as folio_add_lru(). */ -void lru_cache_add_inactive_or_unevictable(struct page *page, - struct vm_area_struct *vma) +void folio_add_lru_vma(struct folio *folio, struct vm_area_struct *vma) { - VM_BUG_ON_PAGE(PageLRU(page), page); + VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); if (unlikely((vma->vm_flags & (VM_LOCKED | VM_SPECIAL)) == VM_LOCKED)) - mlock_new_page(page); + mlock_new_page(&folio->page); else - lru_cache_add(page); + folio_add_lru(folio); } /* From f530ed0e2d01aafc4d0e3cf8ab6b64bbdb7696a7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:02 +0100 Subject: [PATCH 080/350] shmem: convert shmem_writepage() to use a folio throughout Even though we will split any large folio that comes in, write the code to handle large folios so as to not leave a trap for whoever tries to handle large folios in the swap cache. Link: https://lkml.kernel.org/r/20220902194653.1739778-7-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 47 ++++++++++++++++++++++++----------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 674bde8b3085..3d2d35728793 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1328,17 +1328,18 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) * "force", drivers/gpu/drm/i915/gem/i915_gem_shmem.c gets huge pages, * and its shmem_writeback() needs them to be split when swapping. */ - if (PageTransCompound(page)) { + if (folio_test_large(folio)) { /* Ensure the subpages are still dirty */ - SetPageDirty(page); + folio_test_set_dirty(folio); if (split_huge_page(page) < 0) goto redirty; - ClearPageDirty(page); + folio = page_folio(page); + folio_clear_dirty(folio); } - BUG_ON(!PageLocked(page)); - mapping = page->mapping; - index = page->index; + BUG_ON(!folio_test_locked(folio)); + mapping = folio->mapping; + index = folio->index; inode = mapping->host; info = SHMEM_I(inode); if (info->flags & VM_LOCKED) @@ -1361,15 +1362,15 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * This is somewhat ridiculous, but without plumbing a SWAP_MAP_FALLOC * value into swapfile.c, the only way we can correctly account for a - * fallocated page arriving here is now to initialize it and write it. + * fallocated folio arriving here is now to initialize it and write it. * - * That's okay for a page already fallocated earlier, but if we have + * That's okay for a folio already fallocated earlier, but if we have * not yet completed the fallocation, then (a) we want to keep track - * of this page in case we have to undo it, and (b) it may not be a + * of this folio in case we have to undo it, and (b) it may not be a * good idea to continue anyway, once we're pushing into swap. So - * reactivate the page, and let shmem_fallocate() quit when too many. + * reactivate the folio, and let shmem_fallocate() quit when too many. */ - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (inode->i_private) { struct shmem_falloc *shmem_falloc; spin_lock(&inode->i_lock); @@ -1385,9 +1386,9 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (shmem_falloc) goto redirty; } - clear_highpage(page); - flush_dcache_page(page); - SetPageUptodate(page); + folio_zero_range(folio, 0, folio_size(folio)); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } swap = folio_alloc_swap(folio); @@ -1396,7 +1397,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) /* * Add inode to shmem_unuse()'s list of swapped-out inodes, - * if it's not already there. Do it now before the page is + * if it's not already there. Do it now before the folio is * moved to swap cache, when its pagelock no longer protects * the inode from eviction. But don't unlock the mutex until * we've incremented swapped, because shmem_unuse_inode() will @@ -1406,7 +1407,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(page, swap, + if (add_to_swap_cache(&folio->page, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { spin_lock_irq(&info->lock); @@ -1415,21 +1416,21 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(&folio->page, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); - BUG_ON(page_mapped(page)); - swap_writepage(page, wbc); + BUG_ON(folio_mapped(folio)); + swap_writepage(&folio->page, wbc); return 0; } mutex_unlock(&shmem_swaplist_mutex); - put_swap_page(page, swap); + put_swap_page(&folio->page, swap); redirty: - set_page_dirty(page); + folio_mark_dirty(folio); if (wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; /* Return with page locked */ - unlock_page(page); + return AOP_WRITEPAGE_ACTIVATE; /* Return with folio locked */ + folio_unlock(folio); return 0; } From 4cd400fd1f55dde1fa430a706828042daed94c43 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:03 +0100 Subject: [PATCH 081/350] shmem: convert shmem_delete_from_page_cache() to take a folio Remove the assertion that the page is not Compound as this function now handles large folios correctly. Link: https://lkml.kernel.org/r/20220902194653.1739778-8-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 3d2d35728793..9e851fc87601 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -763,23 +763,22 @@ error: } /* - * Like delete_from_page_cache, but substitutes swap for page. + * Like delete_from_page_cache, but substitutes swap for @folio. */ -static void shmem_delete_from_page_cache(struct page *page, void *radswap) +static void shmem_delete_from_page_cache(struct folio *folio, void *radswap) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = folio->mapping; + long nr = folio_nr_pages(folio); int error; - VM_BUG_ON_PAGE(PageCompound(page), page); - xa_lock_irq(&mapping->i_pages); - error = shmem_replace_entry(mapping, page->index, page, radswap); - page->mapping = NULL; - mapping->nrpages--; - __dec_lruvec_page_state(page, NR_FILE_PAGES); - __dec_lruvec_page_state(page, NR_SHMEM); + error = shmem_replace_entry(mapping, folio->index, folio, radswap); + folio->mapping = NULL; + mapping->nrpages -= nr; + __lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr); + __lruvec_stat_mod_folio(folio, NR_SHMEM, -nr); xa_unlock_irq(&mapping->i_pages); - put_page(page); + folio_put(folio); BUG_ON(error); } @@ -1416,7 +1415,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) spin_unlock_irq(&info->lock); swap_shmem_alloc(swap); - shmem_delete_from_page_cache(&folio->page, swp_to_radix_entry(swap)); + shmem_delete_from_page_cache(folio, swp_to_radix_entry(swap)); mutex_unlock(&shmem_swaplist_mutex); BUG_ON(folio_mapped(folio)); From 907ea17eb2b436f07332c935476d77893abae735 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:04 +0100 Subject: [PATCH 082/350] shmem: convert shmem_replace_page() to use folios throughout Introduce folio_set_swap_entry() to abstract how both folio->private and swp_entry_t work. Use swap_address_space() directly instead of indirecting through folio_mapping(). Include an assertion that the old folio is not large as we only allocate a single-page folio to replace it. Use folio_put_refs() instead of calling folio_put() twice. Link: https://lkml.kernel.org/r/20220902194653.1739778-9-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 5 ++++ mm/shmem.c | 67 +++++++++++++++++++++----------------------- 2 files changed, 37 insertions(+), 35 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 2ede1e3695d9..61e13d1a4cab 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -355,6 +355,11 @@ static inline swp_entry_t folio_swap_entry(struct folio *folio) return entry; } +static inline void folio_set_swap_entry(struct folio *folio, swp_entry_t entry) +{ + folio->private = (void *)entry.val; +} + /* linux/mm/workingset.c */ void workingset_age_nonresident(struct lruvec *lruvec, unsigned long nr_pages); void *workingset_eviction(struct folio *folio, struct mem_cgroup *target_memcg); diff --git a/mm/shmem.c b/mm/shmem.c index 9e851fc87601..4113f1b9d4a8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1560,12 +1560,6 @@ static struct folio *shmem_alloc_folio(gfp_t gfp, return folio; } -static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return &shmem_alloc_folio(gfp, info, index)->page; -} - static struct folio *shmem_alloc_and_acct_folio(gfp_t gfp, struct inode *inode, pgoff_t index, bool huge) { @@ -1617,51 +1611,49 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) static int shmem_replace_page(struct page **pagep, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { - struct page *oldpage, *newpage; struct folio *old, *new; struct address_space *swap_mapping; swp_entry_t entry; pgoff_t swap_index; int error; - oldpage = *pagep; - entry.val = page_private(oldpage); + old = page_folio(*pagep); + entry = folio_swap_entry(old); swap_index = swp_offset(entry); - swap_mapping = page_mapping(oldpage); + swap_mapping = swap_address_space(entry); /* * We have arrived here because our zones are constrained, so don't * limit chance of success by further cpuset and node constraints. */ gfp &= ~GFP_CONSTRAINT_MASK; - newpage = shmem_alloc_page(gfp, info, index); - if (!newpage) + VM_BUG_ON_FOLIO(folio_test_large(old), old); + new = shmem_alloc_folio(gfp, info, index); + if (!new) return -ENOMEM; - get_page(newpage); - copy_highpage(newpage, oldpage); - flush_dcache_page(newpage); + folio_get(new); + folio_copy(new, old); + flush_dcache_folio(new); - __SetPageLocked(newpage); - __SetPageSwapBacked(newpage); - SetPageUptodate(newpage); - set_page_private(newpage, entry.val); - SetPageSwapCache(newpage); + __folio_set_locked(new); + __folio_set_swapbacked(new); + folio_mark_uptodate(new); + folio_set_swap_entry(new, entry); + folio_set_swapcache(new); /* * Our caller will very soon move newpage out of swapcache, but it's * a nice clean interface for us to replace oldpage by newpage there. */ xa_lock_irq(&swap_mapping->i_pages); - error = shmem_replace_entry(swap_mapping, swap_index, oldpage, newpage); + error = shmem_replace_entry(swap_mapping, swap_index, old, new); if (!error) { - old = page_folio(oldpage); - new = page_folio(newpage); mem_cgroup_migrate(old, new); - __inc_lruvec_page_state(newpage, NR_FILE_PAGES); - __inc_lruvec_page_state(newpage, NR_SHMEM); - __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); - __dec_lruvec_page_state(oldpage, NR_SHMEM); + __lruvec_stat_mod_folio(new, NR_FILE_PAGES, 1); + __lruvec_stat_mod_folio(new, NR_SHMEM, 1); + __lruvec_stat_mod_folio(old, NR_FILE_PAGES, -1); + __lruvec_stat_mod_folio(old, NR_SHMEM, -1); } xa_unlock_irq(&swap_mapping->i_pages); @@ -1671,18 +1663,17 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, * both PageSwapCache and page_private after getting page lock; * but be defensive. Reverse old to newpage for clear and free. */ - oldpage = newpage; + old = new; } else { - lru_cache_add(newpage); - *pagep = newpage; + folio_add_lru(new); + *pagep = &new->page; } - ClearPageSwapCache(oldpage); - set_page_private(oldpage, 0); + folio_clear_swapcache(old); + old->private = NULL; - unlock_page(oldpage); - put_page(oldpage); - put_page(oldpage); + folio_unlock(old); + folio_put_refs(old, 2); return error; } @@ -2383,6 +2374,12 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, } #ifdef CONFIG_USERFAULTFD +static struct page *shmem_alloc_page(gfp_t gfp, + struct shmem_inode_info *info, pgoff_t index) +{ + return &shmem_alloc_folio(gfp, info, index)->page; +} + int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, From 14d01ee9fcb901c9e020f2dcd71c500f10c3bd03 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:05 +0100 Subject: [PATCH 083/350] mm/swapfile: remove page_swapcount() By restructuring folio_swapped(), it can use swap_swapcount() instead of page_swapcount(). It's even a little more efficient. Link: https://lkml.kernel.org/r/20220902194653.1739778-10-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 46 +++++++++++++--------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 469d9af86be2..e0aaeac5c829 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1431,30 +1431,6 @@ void swapcache_free_entries(swp_entry_t *entries, int n) spin_unlock(&p->lock); } -/* - * How many references to page are currently swapped out? - * This does not give an exact answer when swap count is continued, - * but does include the high COUNT_CONTINUED flag to allow for that. - */ -static int page_swapcount(struct page *page) -{ - int count = 0; - struct swap_info_struct *p; - struct swap_cluster_info *ci; - swp_entry_t entry; - unsigned long offset; - - entry.val = page_private(page); - p = _swap_info_get(entry); - if (p) { - offset = swp_offset(entry); - ci = lock_cluster_or_swap_info(p, offset); - count = swap_count(p->swap_map[offset]); - unlock_cluster_or_swap_info(p, ci); - } - return count; -} - int __swap_count(swp_entry_t entry) { struct swap_info_struct *si; @@ -1469,11 +1445,16 @@ int __swap_count(swp_entry_t entry) return count; } +/* + * How many references to @entry are currently swapped out? + * This does not give an exact answer when swap count is continued, + * but does include the high COUNT_CONTINUED flag to allow for that. + */ static int swap_swapcount(struct swap_info_struct *si, swp_entry_t entry) { - int count = 0; pgoff_t offset = swp_offset(entry); struct swap_cluster_info *ci; + int count; ci = lock_cluster_or_swap_info(si, offset); count = swap_count(si->swap_map[offset]); @@ -1574,17 +1555,16 @@ unlock_out: static bool folio_swapped(struct folio *folio) { - swp_entry_t entry; - struct swap_info_struct *si; + swp_entry_t entry = folio_swap_entry(folio); + struct swap_info_struct *si = _swap_info_get(entry); + + if (!si) + return false; if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!folio_test_large(folio))) - return page_swapcount(&folio->page) != 0; + return swap_swapcount(si, entry) != 0; - entry = folio_swap_entry(folio); - si = _swap_info_get(entry); - if (si) - return swap_page_trans_huge_swapped(si, entry); - return false; + return swap_page_trans_huge_swapped(si, entry); } /* From bdb0ed54a4768dc3c2613d4c45f94c887d43cd7a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:06 +0100 Subject: [PATCH 084/350] mm/swapfile: convert try_to_free_swap() to folio_free_swap() Add kernel-doc for folio_free_swap() and make it return bool. Add a try_to_free_swap() compatibility wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-11-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ++++++ mm/folio-compat.c | 7 +++++++ mm/swapfile.c | 32 ++++++++++++++++++-------------- mm/vmscan.c | 2 +- 4 files changed, 32 insertions(+), 15 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 61e13d1a4cab..dac6308d878e 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -490,6 +490,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); +bool folio_free_swap(struct folio *folio); extern void put_swap_page(struct page *page, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); @@ -606,6 +607,11 @@ static inline swp_entry_t folio_alloc_swap(struct folio *folio) return entry; } +static inline bool folio_free_swap(struct folio *folio) +{ + return false; +} + static inline int add_swap_extent(struct swap_info_struct *sis, unsigned long start_page, unsigned long nr_pages, sector_t start_block) diff --git a/mm/folio-compat.c b/mm/folio-compat.c index e1e23b4947d7..06d47f00609b 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -146,3 +146,10 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } + +#ifdef CONFIG_SWAP +int try_to_free_swap(struct page *page) +{ + return folio_free_swap(page_folio(page)); +} +#endif diff --git a/mm/swapfile.c b/mm/swapfile.c index e0aaeac5c829..f2a446799a39 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1567,43 +1567,47 @@ static bool folio_swapped(struct folio *folio) return swap_page_trans_huge_swapped(si, entry); } -/* - * If swap is getting full, or if there are no more mappings of this page, - * then try_to_free_swap is called to free its swap space. +/** + * folio_free_swap() - Free the swap space used for this folio. + * @folio: The folio to remove. + * + * If swap is getting full, or if there are no more mappings of this folio, + * then call folio_free_swap to free its swap space. + * + * Return: true if we were able to release the swap space. */ -int try_to_free_swap(struct page *page) +bool folio_free_swap(struct folio *folio) { - struct folio *folio = page_folio(page); VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (!folio_test_swapcache(folio)) - return 0; + return false; if (folio_test_writeback(folio)) - return 0; + return false; if (folio_swapped(folio)) - return 0; + return false; /* * Once hibernation has begun to create its image of memory, - * there's a danger that one of the calls to try_to_free_swap() + * there's a danger that one of the calls to folio_free_swap() * - most probably a call from __try_to_reclaim_swap() while * hibernation is allocating its own swap pages for the image, * but conceivably even a call from memory reclaim - will free - * the swap from a page which has already been recorded in the - * image as a clean swapcache page, and then reuse its swap for + * the swap from a folio which has already been recorded in the + * image as a clean swapcache folio, and then reuse its swap for * another page of the image. On waking from hibernation, the - * original page might be freed under memory pressure, then + * original folio might be freed under memory pressure, then * later read back in from swap, now with the wrong data. * * Hibernation suspends storage while it is writing the image * to disk so check that here. */ if (pm_suspended_storage()) - return 0; + return false; delete_from_swap_cache(folio); folio_set_dirty(folio); - return 1; + return true; } /* diff --git a/mm/vmscan.c b/mm/vmscan.c index 9ce6cc74d9ea..9268e64590e4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2049,7 +2049,7 @@ activate_locked: if (folio_test_swapcache(folio) && (mem_cgroup_swap_full(&folio->page) || folio_test_mlocked(folio))) - try_to_free_swap(&folio->page); + folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { int type = folio_is_file_lru(folio); From a0d3374b070776e985bbd7b165b178fa688bf37a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:07 +0100 Subject: [PATCH 085/350] mm/swap: convert __read_swap_cache_async() to use a folio Remove a few hidden (and one visible) calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-12-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap_state.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 41afa6d45b23..b1e181fc5268 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -411,7 +411,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, bool *new_page_allocated) { struct swap_info_struct *si; - struct page *page; + struct folio *folio; void *shadow = NULL; *new_page_allocated = false; @@ -426,11 +426,11 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, si = get_swap_device(entry); if (!si) return NULL; - page = find_get_page(swap_address_space(entry), - swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), + swp_offset(entry)); put_swap_device(si); - if (page) - return page; + if (folio) + return folio_file_page(folio, swp_offset(entry)); /* * Just skip read ahead for unused swap slot. @@ -448,8 +448,8 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * before marking swap_map SWAP_HAS_CACHE, when -EEXIST will * cause any racers to loop around until we add it to cache. */ - page = alloc_page_vma(gfp_mask, vma, addr); - if (!page) + folio = vma_alloc_folio(gfp_mask, 0, vma, addr, false); + if (!folio) return NULL; /* @@ -459,7 +459,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, if (!err) break; - put_page(page); + folio_put(folio); if (err != -EEXIST) return NULL; @@ -477,30 +477,30 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, * The swap entry is ours to swap in. Prepare the new page. */ - __SetPageLocked(page); - __SetPageSwapBacked(page); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(page, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_page(&folio->page, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(&folio->page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); if (shadow) - workingset_refault(page_folio(page), shadow); + workingset_refault(folio, shadow); - /* Caller will initiate read into locked page */ - lru_cache_add(page); + /* Caller will initiate read into locked folio */ + folio_add_lru(folio); *new_page_allocated = true; - return page; + return &folio->page; fail_unlock: - put_swap_page(page, entry); - unlock_page(page); - put_page(page); + put_swap_page(&folio->page, entry); + folio_unlock(folio); + folio_put(folio); return NULL; } From a4c366f01f10073e0220656561b875627ff7cd90 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:08 +0100 Subject: [PATCH 086/350] mm/swap: convert add_to_swap_cache() to take a folio With all callers using folios, we can convert add_to_swap_cache() to take a folio and use it throughout. Link: https://lkml.kernel.org/r/20220902194653.1739778-13-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 2 +- mm/swap.h | 4 ++-- mm/swap_state.c | 34 +++++++++++++++++----------------- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4113f1b9d4a8..ced76c229b96 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1406,7 +1406,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (list_empty(&info->swaplist)) list_add(&info->swaplist, &shmem_swaplist); - if (add_to_swap_cache(&folio->page, swap, + if (add_to_swap_cache(folio, swap, __GFP_HIGH | __GFP_NOMEMALLOC | __GFP_NOWARN, NULL) == 0) { spin_lock_irq(&info->lock); diff --git a/mm/swap.h b/mm/swap.h index 0ffa5b478051..29e38f3d82d0 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -32,7 +32,7 @@ extern struct address_space *swapper_spaces[]; void show_swap_cache_info(void); bool add_to_swap(struct folio *folio); void *get_shadow_from_swap_cache(swp_entry_t entry); -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp); void __delete_from_swap_cache(struct folio *folio, swp_entry_t entry, void *shadow); @@ -122,7 +122,7 @@ static inline void *get_shadow_from_swap_cache(swp_entry_t entry) return NULL; } -static inline int add_to_swap_cache(struct page *page, swp_entry_t entry, +static inline int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp_mask, void **shadowp) { return -1; diff --git a/mm/swap_state.c b/mm/swap_state.c index b1e181fc5268..ecf1accc2fb1 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -85,21 +85,21 @@ void *get_shadow_from_swap_cache(swp_entry_t entry) * add_to_swap_cache resembles filemap_add_folio on swapper_space, * but sets SwapCache flag and private instead of mapping and index. */ -int add_to_swap_cache(struct page *page, swp_entry_t entry, +int add_to_swap_cache(struct folio *folio, swp_entry_t entry, gfp_t gfp, void **shadowp) { struct address_space *address_space = swap_address_space(entry); pgoff_t idx = swp_offset(entry); - XA_STATE_ORDER(xas, &address_space->i_pages, idx, compound_order(page)); - unsigned long i, nr = thp_nr_pages(page); + XA_STATE_ORDER(xas, &address_space->i_pages, idx, folio_order(folio)); + unsigned long i, nr = folio_nr_pages(folio); void *old; - VM_BUG_ON_PAGE(!PageLocked(page), page); - VM_BUG_ON_PAGE(PageSwapCache(page), page); - VM_BUG_ON_PAGE(!PageSwapBacked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(folio_test_swapcache(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_swapbacked(folio), folio); - page_ref_add(page, nr); - SetPageSwapCache(page); + folio_ref_add(folio, nr); + folio_set_swapcache(folio); do { xas_lock_irq(&xas); @@ -107,19 +107,19 @@ int add_to_swap_cache(struct page *page, swp_entry_t entry, if (xas_error(&xas)) goto unlock; for (i = 0; i < nr; i++) { - VM_BUG_ON_PAGE(xas.xa_index != idx + i, page); + VM_BUG_ON_FOLIO(xas.xa_index != idx + i, folio); old = xas_load(&xas); if (xa_is_value(old)) { if (shadowp) *shadowp = old; } - set_page_private(page + i, entry.val + i); - xas_store(&xas, page); + set_page_private(folio_page(folio, i), entry.val + i); + xas_store(&xas, folio); xas_next(&xas); } address_space->nrpages += nr; - __mod_node_page_state(page_pgdat(page), NR_FILE_PAGES, nr); - __mod_lruvec_page_state(page, NR_SWAPCACHE, nr); + __node_stat_mod_folio(folio, NR_FILE_PAGES, nr); + __lruvec_stat_mod_folio(folio, NR_SWAPCACHE, nr); unlock: xas_unlock_irq(&xas); } while (xas_nomem(&xas, gfp)); @@ -127,8 +127,8 @@ unlock: if (!xas_error(&xas)) return 0; - ClearPageSwapCache(page); - page_ref_sub(page, nr); + folio_clear_swapcache(folio); + folio_ref_sub(folio, nr); return xas_error(&xas); } @@ -194,7 +194,7 @@ bool add_to_swap(struct folio *folio) /* * Add it to the swap cache. */ - err = add_to_swap_cache(&folio->page, entry, + err = add_to_swap_cache(folio, entry, __GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL); if (err) /* @@ -484,7 +484,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ - if (add_to_swap_cache(&folio->page, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) + if (add_to_swap_cache(folio, entry, gfp_mask & GFP_RECLAIM_MASK, &shadow)) goto fail_unlock; mem_cgroup_swapin_uncharge_swap(entry); From 4081f7446d95a9d3ced12dc04ff02c187a761e90 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:09 +0100 Subject: [PATCH 087/350] mm/swap: convert put_swap_page() to put_swap_folio() With all callers now using a folio, we can convert this function. Link: https://lkml.kernel.org/r/20220902194653.1739778-14-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/shmem.c | 2 +- mm/swap_slots.c | 2 +- mm/swap_state.c | 6 +++--- mm/swapfile.c | 4 ++-- mm/vmscan.c | 2 +- 6 files changed, 10 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index dac6308d878e..42cbef554de6 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -491,7 +491,7 @@ static inline long get_nr_swap_pages(void) extern void si_swapinfo(struct sysinfo *); swp_entry_t folio_alloc_swap(struct folio *folio); bool folio_free_swap(struct folio *folio); -extern void put_swap_page(struct page *page, swp_entry_t entry); +void put_swap_folio(struct folio *folio, swp_entry_t entry); extern swp_entry_t get_swap_page_of_type(int); extern int get_swap_pages(int n, swp_entry_t swp_entries[], int entry_size); extern int add_swap_count_continuation(swp_entry_t, gfp_t); @@ -576,7 +576,7 @@ static inline void swap_free(swp_entry_t swp) { } -static inline void put_swap_page(struct page *page, swp_entry_t swp) +static inline void put_swap_folio(struct folio *folio, swp_entry_t swp) { } diff --git a/mm/shmem.c b/mm/shmem.c index ced76c229b96..56cabf9bb947 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1424,7 +1424,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); - put_swap_page(&folio->page, swap); + put_swap_folio(folio, swap); redirty: folio_mark_dirty(folio); if (wbc->for_reclaim) diff --git a/mm/swap_slots.c b/mm/swap_slots.c index 10b94d64cc25..0bec1f705f8e 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -343,7 +343,7 @@ repeat: get_swap_pages(1, &entry, 1); out: if (mem_cgroup_try_charge_swap(folio, entry)) { - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); entry.val = 0; } return entry; diff --git a/mm/swap_state.c b/mm/swap_state.c index ecf1accc2fb1..ea354efd3735 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -218,7 +218,7 @@ bool add_to_swap(struct folio *folio) return true; fail: - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); return false; } @@ -237,7 +237,7 @@ void delete_from_swap_cache(struct folio *folio) __delete_from_swap_cache(folio, entry, NULL); xa_unlock_irq(&address_space->i_pages); - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); folio_ref_sub(folio, folio_nr_pages(folio)); } @@ -498,7 +498,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, return &folio->page; fail_unlock: - put_swap_page(&folio->page, entry); + put_swap_folio(folio, entry); folio_unlock(folio); folio_put(folio); return NULL; diff --git a/mm/swapfile.c b/mm/swapfile.c index f2a446799a39..aafe739dc2a6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1332,7 +1332,7 @@ void swap_free(swp_entry_t entry) /* * Called after dropping swapcache to decrease refcnt to swap entries. */ -void put_swap_page(struct page *page, swp_entry_t entry) +void put_swap_folio(struct folio *folio, swp_entry_t entry) { unsigned long offset = swp_offset(entry); unsigned long idx = offset / SWAPFILE_CLUSTER; @@ -1341,7 +1341,7 @@ void put_swap_page(struct page *page, swp_entry_t entry) unsigned char *map; unsigned int i, free_entries = 0; unsigned char val; - int size = swap_entry_size(thp_nr_pages(page)); + int size = swap_entry_size(folio_nr_pages(folio)); si = _swap_info_get(entry); if (!si) diff --git a/mm/vmscan.c b/mm/vmscan.c index 9268e64590e4..1707e3bfcfe4 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1352,7 +1352,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio, mem_cgroup_swapout(folio, swap); __delete_from_swap_cache(folio, swap, shadow); xa_unlock_irq(&mapping->i_pages); - put_swap_page(&folio->page, swap); + put_swap_folio(folio, swap); } else { void (*free_folio)(struct folio *); From 63ad4add3823051aeb1fcd1ba981f6efd07086bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:10 +0100 Subject: [PATCH 088/350] mm: convert do_swap_page() to use a folio Removes quite a lot of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-15-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 57 +++++++++++++++++++++++++++++++---------------------- 1 file changed, 33 insertions(+), 24 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index e49faa0a1f9a..04f54abdf9d2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3724,6 +3724,7 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; + struct folio *folio; struct page *page = NULL, *swapcache; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; @@ -3768,19 +3769,23 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = lookup_swap_cache(entry, vma, vmf->address); swapcache = page; + if (page) + folio = page_folio(page); if (!page) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ - page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, - vmf->address); - if (page) { - __SetPageLocked(page); - __SetPageSwapBacked(page); + folio = vma_alloc_folio(GFP_HIGHUSER_MOVABLE, 0, + vma, vmf->address, false); + page = &folio->page; + if (folio) { + __folio_set_locked(folio); + __folio_set_swapbacked(folio); if (mem_cgroup_swapin_charge_page(page, - vma->vm_mm, GFP_KERNEL, entry)) { + vma->vm_mm, GFP_KERNEL, + entry)) { ret = VM_FAULT_OOM; goto out_page; } @@ -3788,20 +3793,21 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) shadow = get_shadow_from_swap_cache(entry); if (shadow) - workingset_refault(page_folio(page), - shadow); + workingset_refault(folio, shadow); - lru_cache_add(page); + folio_add_lru(folio); /* To provide entry to swap_readpage() */ - set_page_private(page, entry.val); + folio_set_swap_entry(folio, entry); swap_readpage(page, true, NULL); - set_page_private(page, 0); + folio->private = NULL; } } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); swapcache = page; + if (page) + folio = page_folio(page); } if (!page) { @@ -3844,7 +3850,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache, we need to check that the page's swap has not * changed. */ - if (unlikely(!PageSwapCache(page) || + if (unlikely(!folio_test_swapcache(folio) || page_private(page) != entry.val)) goto out_page; @@ -3859,6 +3865,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = swapcache; goto out_page; } + folio = page_folio(page); /* * If we want to map a page that's in the swapcache writable, we @@ -3867,7 +3874,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * pagevecs if required. */ if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && - !PageKsm(page) && !PageLRU(page)) + !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } @@ -3881,7 +3888,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!pte_same(*vmf->pte, vmf->orig_pte))) goto out_nomap; - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { ret = VM_FAULT_SIGBUS; goto out_nomap; } @@ -3894,14 +3901,14 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * check after taking the PT lock and making sure that nobody * concurrently faulted in this page and set PG_anon_exclusive. */ - BUG_ON(!PageAnon(page) && PageMappedToDisk(page)); - BUG_ON(PageAnon(page) && PageAnonExclusive(page)); + BUG_ON(!folio_test_anon(folio) && folio_test_mappedtodisk(folio)); + BUG_ON(folio_test_anon(folio) && PageAnonExclusive(page)); /* * Check under PT lock (to protect against concurrent fork() sharing * the swap entry concurrently) for certainly exclusive pages. */ - if (!PageKsm(page)) { + if (!folio_test_ksm(folio)) { /* * Note that pte_swp_exclusive() == false for architectures * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. @@ -3913,7 +3920,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * swapcache -> certainly exclusive. */ exclusive = true; - } else if (exclusive && PageWriteback(page) && + } else if (exclusive && folio_test_writeback(folio) && data_race(si->flags & SWP_STABLE_WRITES)) { /* * This is tricky: not all swap backends support @@ -3956,7 +3963,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * exposing them to the swapcache or because the swap entry indicates * exclusivity. */ - if (!PageKsm(page) && (exclusive || page_count(page) == 1)) { + if (!folio_test_ksm(folio) && + (exclusive || folio_ref_count(folio) == 1)) { if (vmf->flags & FAULT_FLAG_WRITE) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; @@ -3976,16 +3984,17 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) /* ksm created a completely new copy */ if (unlikely(page != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address); - lru_cache_add_inactive_or_unevictable(page, vma); + folio_add_lru_vma(folio, vma); } else { page_add_anon_rmap(page, vma, vmf->address, rmap_flags); } - VM_BUG_ON(!PageAnon(page) || (pte_write(pte) && !PageAnonExclusive(page))); + VM_BUG_ON(!folio_test_anon(folio) || + (pte_write(pte) && !PageAnonExclusive(page))); set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); - unlock_page(page); + folio_unlock(folio); if (page != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused @@ -4017,9 +4026,9 @@ out: out_nomap: pte_unmap_unlock(vmf->pte, vmf->ptl); out_page: - unlock_page(page); + folio_unlock(folio); out_release: - put_page(page); + folio_put(folio); if (page != swapcache && swapcache) { unlock_page(swapcache); put_page(swapcache); From d4f9565ae598bd6b6ffbd8b4dfbf97a9e339da2d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:11 +0100 Subject: [PATCH 089/350] mm: convert do_swap_page()'s swapcache variable to a folio The 'swapcache' variable is used to track whether the page is from the swapcache or not. It can do this equally well by being the folio of the page rather than the page itself, and this saves a number of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-16-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 04f54abdf9d2..1e114438f606 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3724,8 +3724,8 @@ static vm_fault_t handle_pte_marker(struct vm_fault *vmf) vm_fault_t do_swap_page(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; - struct folio *folio; - struct page *page = NULL, *swapcache; + struct folio *swapcache, *folio = NULL; + struct page *page; struct swap_info_struct *si = NULL; rmap_t rmap_flags = RMAP_NONE; bool exclusive = false; @@ -3768,11 +3768,11 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out; page = lookup_swap_cache(entry, vma, vmf->address); - swapcache = page; if (page) folio = page_folio(page); + swapcache = folio; - if (!page) { + if (!folio) { if (data_race(si->flags & SWP_SYNCHRONOUS_IO) && __swap_count(entry) == 1) { /* skip swapcache */ @@ -3805,12 +3805,12 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) } else { page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, vmf); - swapcache = page; if (page) folio = page_folio(page); + swapcache = folio; } - if (!page) { + if (!folio) { /* * Back out if somebody else faulted in this pte * while we released the pte lock. @@ -3862,7 +3862,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) page = ksm_might_need_to_copy(page, vma, vmf->address); if (unlikely(!page)) { ret = VM_FAULT_OOM; - page = swapcache; goto out_page; } folio = page_folio(page); @@ -3873,7 +3872,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * owner. Try removing the extra reference from the local LRU * pagevecs if required. */ - if ((vmf->flags & FAULT_FLAG_WRITE) && page == swapcache && + if ((vmf->flags & FAULT_FLAG_WRITE) && folio == swapcache && !folio_test_ksm(folio) && !folio_test_lru(folio)) lru_add_drain(); } @@ -3914,7 +3913,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * without __HAVE_ARCH_PTE_SWP_EXCLUSIVE. */ exclusive = pte_swp_exclusive(vmf->orig_pte); - if (page != swapcache) { + if (folio != swapcache) { /* * We have a fresh page that is not exposed to the * swapcache -> certainly exclusive. @@ -3982,7 +3981,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) vmf->orig_pte = pte; /* ksm created a completely new copy */ - if (unlikely(page != swapcache && swapcache)) { + if (unlikely(folio != swapcache && swapcache)) { page_add_new_anon_rmap(page, vma, vmf->address); folio_add_lru_vma(folio, vma); } else { @@ -3995,7 +3994,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); folio_unlock(folio); - if (page != swapcache && swapcache) { + if (folio != swapcache && swapcache) { /* * Hold the lock to avoid the swap entry to be reused * until we take the PT lock for the pte_same() check @@ -4004,8 +4003,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * so that the swap count won't change under a * parallel locked swapcache. */ - unlock_page(swapcache); - put_page(swapcache); + folio_unlock(swapcache); + folio_put(swapcache); } if (vmf->flags & FAULT_FLAG_WRITE) { @@ -4029,9 +4028,9 @@ out_page: folio_unlock(folio); out_release: folio_put(folio); - if (page != swapcache && swapcache) { - unlock_page(swapcache); - put_page(swapcache); + if (folio != swapcache && swapcache) { + folio_unlock(swapcache); + folio_put(swapcache); } if (si) put_swap_device(si); From 6599591816f522c1cc8ec4eb5cea75738963756a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:12 +0100 Subject: [PATCH 090/350] memcg: convert mem_cgroup_swapin_charge_page() to mem_cgroup_swapin_charge_folio() All callers now have a folio, so pass it in here and remove an unnecessary call to page_folio(). Link: https://lkml.kernel.org/r/20220902194653.1739778-17-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 4 ++-- mm/memcontrol.c | 13 ++++++------- mm/memory.c | 2 +- mm/swap_state.c | 2 +- 4 files changed, 10 insertions(+), 11 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 60545e4a1c03..ca0df42662ad 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -688,7 +688,7 @@ static inline int mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, return __mem_cgroup_charge(folio, mm, gfp); } -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry); void mem_cgroup_swapin_uncharge_swap(swp_entry_t entry); @@ -1254,7 +1254,7 @@ static inline int mem_cgroup_charge(struct folio *folio, return 0; } -static inline int mem_cgroup_swapin_charge_page(struct page *page, +static inline int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { return 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e804056422db..621b4472c409 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6844,21 +6844,20 @@ int __mem_cgroup_charge(struct folio *folio, struct mm_struct *mm, gfp_t gfp) } /** - * mem_cgroup_swapin_charge_page - charge a newly allocated page for swapin - * @page: page to charge + * mem_cgroup_swapin_charge_folio - Charge a newly allocated folio for swapin. + * @folio: folio to charge. * @mm: mm context of the victim * @gfp: reclaim mode - * @entry: swap entry for which the page is allocated + * @entry: swap entry for which the folio is allocated * - * This function charges a page allocated for swapin. Please call this before - * adding the page to the swapcache. + * This function charges a folio allocated for swapin. Please call this before + * adding the folio to the swapcache. * * Returns 0 on success. Otherwise, an error code is returned. */ -int mem_cgroup_swapin_charge_page(struct page *page, struct mm_struct *mm, +int mem_cgroup_swapin_charge_folio(struct folio *folio, struct mm_struct *mm, gfp_t gfp, swp_entry_t entry) { - struct folio *folio = page_folio(page); struct mem_cgroup *memcg; unsigned short id; int ret; diff --git a/mm/memory.c b/mm/memory.c index 1e114438f606..b36b177e0ea9 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3783,7 +3783,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(page, + if (mem_cgroup_swapin_charge_folio(folio, vma->vm_mm, GFP_KERNEL, entry)) { ret = VM_FAULT_OOM; diff --git a/mm/swap_state.c b/mm/swap_state.c index ea354efd3735..a7e0438902dd 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -480,7 +480,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, __folio_set_locked(folio); __folio_set_swapbacked(folio); - if (mem_cgroup_swapin_charge_page(&folio->page, NULL, gfp_mask, entry)) + if (mem_cgroup_swapin_charge_folio(folio, NULL, gfp_mask, entry)) goto fail_unlock; /* May fail (-ENOMEM) if XArray node allocation failed. */ From 7a7256d5f512b6c17957df7f59cf5e281b3ddba3 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:13 +0100 Subject: [PATCH 091/350] shmem: convert shmem_mfill_atomic_pte() to use a folio Assert that this is a single-page folio as there are several assumptions in here that it's exactly PAGE_SIZE bytes large. Saves several calls to compound_head() and removes the last caller of shmem_alloc_page(). Link: https://lkml.kernel.org/r/20220902194653.1739778-18-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 45 +++++++++++++++++++-------------------------- 1 file changed, 19 insertions(+), 26 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 56cabf9bb947..8754e2b4800a 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2374,12 +2374,6 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, } #ifdef CONFIG_USERFAULTFD -static struct page *shmem_alloc_page(gfp_t gfp, - struct shmem_inode_info *info, pgoff_t index) -{ - return &shmem_alloc_folio(gfp, info, index)->page; -} - int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pmd_t *dst_pmd, struct vm_area_struct *dst_vma, @@ -2395,7 +2389,6 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); void *page_kaddr; struct folio *folio; - struct page *page; int ret; pgoff_t max_off; @@ -2414,53 +2407,53 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, if (!*pagep) { ret = -ENOMEM; - page = shmem_alloc_page(gfp, info, pgoff); - if (!page) + folio = shmem_alloc_folio(gfp, info, pgoff); + if (!folio) goto out_unacct_blocks; if (!zeropage) { /* COPY */ - page_kaddr = kmap_atomic(page); + page_kaddr = kmap_local_folio(folio, 0); ret = copy_from_user(page_kaddr, (const void __user *)src_addr, PAGE_SIZE); - kunmap_atomic(page_kaddr); + kunmap_local(page_kaddr); /* fallback to copy_from_user outside mmap_lock */ if (unlikely(ret)) { - *pagep = page; + *pagep = &folio->page; ret = -ENOENT; /* don't free the page */ goto out_unacct_blocks; } - flush_dcache_page(page); + flush_dcache_folio(folio); } else { /* ZEROPAGE */ - clear_user_highpage(page, dst_addr); + clear_user_highpage(&folio->page, dst_addr); } } else { - page = *pagep; + folio = page_folio(*pagep); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); *pagep = NULL; } - VM_BUG_ON(PageLocked(page)); - VM_BUG_ON(PageSwapBacked(page)); - __SetPageLocked(page); - __SetPageSwapBacked(page); - __SetPageUptodate(page); + VM_BUG_ON(folio_test_locked(folio)); + VM_BUG_ON(folio_test_swapbacked(folio)); + __folio_set_locked(folio); + __folio_set_swapbacked(folio); + __folio_mark_uptodate(folio); ret = -EFAULT; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(pgoff >= max_off)) goto out_release; - folio = page_folio(page); ret = shmem_add_to_page_cache(folio, mapping, pgoff, NULL, gfp & GFP_RECLAIM_MASK, dst_mm); if (ret) goto out_release; ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, - page, true, wp_copy); + &folio->page, true, wp_copy); if (ret) goto out_delete_from_cache; @@ -2470,13 +2463,13 @@ int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, shmem_recalc_inode(inode); spin_unlock_irq(&info->lock); - unlock_page(page); + folio_unlock(folio); return 0; out_delete_from_cache: - delete_from_page_cache(page); + filemap_remove_folio(folio); out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); out_unacct_blocks: shmem_inode_unacct_blocks(inode, 1); return ret; From 0d698e257241436e01182508d93fc290987eb37d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:14 +0100 Subject: [PATCH 092/350] shmem: convert shmem_replace_page() to shmem_replace_folio() The caller has a folio, so convert the calling convention and rename the function. Link: https://lkml.kernel.org/r/20220902194653.1739778-19-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8754e2b4800a..2bb6f5cfdc11 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1608,7 +1608,7 @@ static bool shmem_should_replace_folio(struct folio *folio, gfp_t gfp) return folio_zonenum(folio) > gfp_zone(gfp); } -static int shmem_replace_page(struct page **pagep, gfp_t gfp, +static int shmem_replace_folio(struct folio **foliop, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct folio *old, *new; @@ -1617,7 +1617,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, pgoff_t swap_index; int error; - old = page_folio(*pagep); + old = *foliop; entry = folio_swap_entry(old); swap_index = swp_offset(entry); swap_mapping = swap_address_space(entry); @@ -1666,7 +1666,7 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, old = new; } else { folio_add_lru(new); - *pagep = &new->page; + *foliop = new; } folio_clear_swapcache(old); @@ -1772,8 +1772,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, arch_swap_restore(swap, folio); if (shmem_should_replace_folio(folio, gfp)) { - error = shmem_replace_page(&page, gfp, info, index); - folio = page_folio(page); + error = shmem_replace_folio(&folio, gfp, info, index); if (error) goto failed; } From c9edc242811d4c4b939b283f4f40b89f9c5b3b5a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:15 +0100 Subject: [PATCH 093/350] swap: add swap_cache_get_folio() Convert lookup_swap_cache() into swap_cache_get_folio() and add a lookup_swap_cache() wrapper around it. [akpm@linux-foundation.org: add CONFIG_SWAP=n stub for swap_cache_get_folio()] Link: https://lkml.kernel.org/r/20220902194653.1739778-20-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap.h | 8 ++++++++ mm/swap_state.c | 32 +++++++++++++++++++++----------- 2 files changed, 29 insertions(+), 11 deletions(-) diff --git a/mm/swap.h b/mm/swap.h index 29e38f3d82d0..ccd8d9a9ad36 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -39,6 +39,8 @@ void __delete_from_swap_cache(struct folio *folio, void delete_from_swap_cache(struct folio *folio); void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr); struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); @@ -99,6 +101,12 @@ static inline int swap_writepage(struct page *p, struct writeback_control *wbc) return 0; } +static inline struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) +{ + return NULL; +} + static inline struct page *lookup_swap_cache(swp_entry_t swp, struct vm_area_struct *vma, unsigned long addr) diff --git a/mm/swap_state.c b/mm/swap_state.c index a7e0438902dd..b96bf4ec8b5b 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -317,24 +317,24 @@ static inline bool swap_use_vma_readahead(void) } /* - * Lookup a swap entry in the swap cache. A found page will be returned + * Lookup a swap entry in the swap cache. A found folio will be returned * unlocked and with its refcount incremented - we rely on the kernel - * lock getting page table operations atomic even if we drop the page + * lock getting page table operations atomic even if we drop the folio * lock before returning. */ -struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, - unsigned long addr) +struct folio *swap_cache_get_folio(swp_entry_t entry, + struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct folio *folio; struct swap_info_struct *si; si = get_swap_device(entry); if (!si) return NULL; - page = find_get_page(swap_address_space(entry), swp_offset(entry)); + folio = filemap_get_folio(swap_address_space(entry), swp_offset(entry)); put_swap_device(si); - if (page) { + if (folio) { bool vma_ra = swap_use_vma_readahead(); bool readahead; @@ -342,10 +342,10 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, * At the moment, we don't support PG_readahead for anon THP * so let's bail out rather than confusing the readahead stat. */ - if (unlikely(PageTransCompound(page))) - return page; + if (unlikely(folio_test_large(folio))) + return folio; - readahead = TestClearPageReadahead(page); + readahead = folio_test_clear_readahead(folio); if (vma && vma_ra) { unsigned long ra_val; int win, hits; @@ -366,7 +366,17 @@ struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, } } - return page; + return folio; +} + +struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, + unsigned long addr) +{ + struct folio *folio = swap_cache_get_folio(entry, vma, addr); + + if (!folio) + return NULL; + return folio_file_page(folio, swp_offset(entry)); } /** From 5739a81cf89f2bbbfff691439b8fcdf3c8d33f5d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:16 +0100 Subject: [PATCH 094/350] shmem: eliminate struct page from shmem_swapin_folio() Convert shmem_swapin() to return a folio and use swap_cache_get_folio(), removing all uses of struct page in this function. Link: https://lkml.kernel.org/r/20220902194653.1739778-21-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 2bb6f5cfdc11..b685acd9f149 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1486,7 +1486,7 @@ static void shmem_pseudo_vma_destroy(struct vm_area_struct *vma) mpol_cond_put(vma->vm_policy); } -static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, +static struct folio *shmem_swapin(swp_entry_t swap, gfp_t gfp, struct shmem_inode_info *info, pgoff_t index) { struct vm_area_struct pvma; @@ -1499,7 +1499,9 @@ static struct page *shmem_swapin(swp_entry_t swap, gfp_t gfp, page = swap_cluster_readahead(swap, gfp, &vmf); shmem_pseudo_vma_destroy(&pvma); - return page; + if (!page) + return NULL; + return page_folio(page); } /* @@ -1721,7 +1723,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); struct mm_struct *charge_mm = vma ? vma->vm_mm : NULL; - struct page *page; struct folio *folio = NULL; swp_entry_t swap; int error; @@ -1734,8 +1735,8 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, return -EIO; /* Look it up and read it in.. */ - page = lookup_swap_cache(swap, NULL, 0); - if (!page) { + folio = swap_cache_get_folio(swap, NULL, 0); + if (!folio) { /* Or update major stats only when swapin succeeds?? */ if (fault_type) { *fault_type |= VM_FAULT_MAJOR; @@ -1743,13 +1744,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, count_memcg_event_mm(charge_mm, PGMAJFAULT); } /* Here we actually start the io */ - page = shmem_swapin(swap, gfp, info, index); - if (!page) { + folio = shmem_swapin(swap, gfp, info, index); + if (!folio) { error = -ENOMEM; goto failed; } } - folio = page_folio(page); /* We have to do this with folio locked to prevent races */ folio_lock(folio); From fc26babbc7d45a98607918d336744269bc59d7b5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:17 +0100 Subject: [PATCH 095/350] shmem: convert shmem_getpage_gfp() to shmem_get_folio_gfp() Add a shmem_getpage_gfp() wrapper for compatibility with current users. Link: https://lkml.kernel.org/r/20220902194653.1739778-22-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 70 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 41 insertions(+), 29 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index b685acd9f149..89536091928f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -139,17 +139,6 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp, gfp_t gfp, struct vm_area_struct *vma, vm_fault_t *fault_type); -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, vm_fault_t *fault_type); - -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); -} static inline struct shmem_sb_info *SHMEM_SB(struct super_block *sb) { @@ -1595,7 +1584,7 @@ failed: /* * When a page is moved from swapcache to shmem filecache (either by the - * usual swapin of shmem_getpage_gfp(), or by the less common swapoff of + * usual swapin of shmem_get_folio_gfp(), or by the less common swapoff of * shmem_unuse_inode()), it may have been read in earlier from swap, in * ignorance of the mapping it belongs to. If that mapping has special * constraints (like the gma500 GEM driver, which requires RAM below 4GB), @@ -1812,7 +1801,7 @@ unlock: } /* - * shmem_getpage_gfp - find page in cache, or get from swap, or allocate + * shmem_get_folio_gfp - find page in cache, or get from swap, or allocate * * If we allocate a new one we do not mark it dirty. That's up to the * vm. If we swap it in we mark it dirty since we also free the swap @@ -1821,10 +1810,10 @@ unlock: * vma, vmf, and fault_type are only supplied by shmem_fault: * otherwise they are NULL. */ -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, gfp_t gfp, - struct vm_area_struct *vma, struct vm_fault *vmf, - vm_fault_t *fault_type) +static int shmem_get_folio_gfp(struct inode *inode, pgoff_t index, + struct folio **foliop, enum sgp_type sgp, gfp_t gfp, + struct vm_area_struct *vma, struct vm_fault *vmf, + vm_fault_t *fault_type) { struct address_space *mapping = inode->i_mapping; struct shmem_inode_info *info = SHMEM_I(inode); @@ -1864,7 +1853,7 @@ repeat: if (error == -EEXIST) goto repeat; - *pagep = &folio->page; + *foliop = folio; return error; } @@ -1874,7 +1863,7 @@ repeat: folio_mark_accessed(folio); if (folio_test_uptodate(folio)) goto out; - /* fallocated page */ + /* fallocated folio */ if (sgp != SGP_READ) goto clear; folio_unlock(folio); @@ -1882,10 +1871,10 @@ repeat: } /* - * SGP_READ: succeed on hole, with NULL page, letting caller zero. - * SGP_NOALLOC: fail on hole, with NULL page, letting caller fail. + * SGP_READ: succeed on hole, with NULL folio, letting caller zero. + * SGP_NOALLOC: fail on hole, with NULL folio, letting caller fail. */ - *pagep = NULL; + *foliop = NULL; if (sgp == SGP_READ) return 0; if (sgp == SGP_NOALLOC) @@ -1918,7 +1907,7 @@ alloc_nohuge: if (error != -ENOSPC) goto unlock; /* - * Try to reclaim some space by splitting a huge page + * Try to reclaim some space by splitting a large folio * beyond i_size on the filesystem. */ while (retry--) { @@ -1954,9 +1943,9 @@ alloc_nohuge: if (folio_test_pmd_mappable(folio) && DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE) < - hindex + HPAGE_PMD_NR - 1) { + folio_next_index(folio) - 1) { /* - * Part of the huge page is beyond i_size: subject + * Part of the large folio is beyond i_size: subject * to shrink under memory pressure. */ spin_lock(&sbinfo->shrinklist_lock); @@ -1973,14 +1962,14 @@ alloc_nohuge: } /* - * Let SGP_FALLOC use the SGP_WRITE optimization on a new page. + * Let SGP_FALLOC use the SGP_WRITE optimization on a new folio. */ if (sgp == SGP_FALLOC) sgp = SGP_WRITE; clear: /* - * Let SGP_WRITE caller clear ends if write does not fill page; - * but SGP_FALLOC on a page fallocated earlier must initialize + * Let SGP_WRITE caller clear ends if write does not fill folio; + * but SGP_FALLOC on a folio fallocated earlier must initialize * it now, lest undo on failure cancel our earlier guarantee. */ if (sgp != SGP_WRITE && !folio_test_uptodate(folio)) { @@ -2006,7 +1995,7 @@ clear: goto unlock; } out: - *pagep = folio_page(folio, index - hindex); + *foliop = folio; return 0; /* @@ -2036,6 +2025,29 @@ unlock: return error; } +static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp, + gfp_t gfp, struct vm_area_struct *vma, + struct vm_fault *vmf, vm_fault_t *fault_type) +{ + struct folio *folio = NULL; + int ret = shmem_get_folio_gfp(inode, index, &folio, sgp, gfp, vma, + vmf, fault_type); + + if (folio) + *pagep = folio_file_page(folio, index); + else + *pagep = NULL; + return ret; +} + +int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp) +{ + return shmem_getpage_gfp(inode, index, pagep, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the From 68a541001a31856fb99614861de1c03109d2ea4d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:18 +0100 Subject: [PATCH 096/350] shmem: convert shmem_fault() to use shmem_get_folio_gfp() No particular advantage for this function, but necessary to remove shmem_getpage_gfp(). [hughd@google.com: fix crash] Link: https://lkml.kernel.org/r/7693a84-bdc2-27b5-2695-d0fe8566571f@google.com Link: https://lkml.kernel.org/r/20220902194653.1739778-23-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 89536091928f..154432dc847b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2065,6 +2065,7 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); + struct folio *folio = NULL; int err; vm_fault_t ret = VM_FAULT_LOCKED; @@ -2127,10 +2128,12 @@ static vm_fault_t shmem_fault(struct vm_fault *vmf) spin_unlock(&inode->i_lock); } - err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, SGP_CACHE, + err = shmem_get_folio_gfp(inode, vmf->pgoff, &folio, SGP_CACHE, gfp, vma, vmf, &ret); if (err) return vmf_error(err); + if (folio) + vmf->page = folio_file_page(folio, vmf->pgoff); return ret; } From a3a9c39704f4fec403ef173e62e069558b7eb85a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:19 +0100 Subject: [PATCH 097/350] shmem: convert shmem_read_mapping_page_gfp() to use shmem_get_folio_gfp() Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-24-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 154432dc847b..c3e2a65a65fc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -4270,18 +4270,20 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, { #ifdef CONFIG_SHMEM struct inode *inode = mapping->host; + struct folio *folio; struct page *page; int error; BUG_ON(!shmem_mapping(mapping)); - error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, + error = shmem_get_folio_gfp(inode, index, &folio, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) return ERR_PTR(error); - unlock_page(page); + folio_unlock(folio); + page = folio_file_page(folio, index); if (PageHWPoison(page)) { - put_page(page); + folio_put(folio); return ERR_PTR(-EIO); } From 4e1fc793ad9892cec67b40c9f67583160e08f695 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:20 +0100 Subject: [PATCH 098/350] shmem: add shmem_get_folio() With no remaining callers of shmem_getpage_gfp(), add shmem_get_folio() and reimplement shmem_getpage() as a call to shmem_get_folio(). Link: https://lkml.kernel.org/r/20220902194653.1739778-25-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 2 ++ mm/shmem.c | 23 ++++++++++------------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index ff0b990de83d..f4bd50b08a91 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -113,6 +113,8 @@ enum sgp_type { extern int shmem_getpage(struct inode *inode, pgoff_t index, struct page **pagep, enum sgp_type sgp); +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp); static inline struct page *shmem_read_mapping_page( struct address_space *mapping, pgoff_t index) diff --git a/mm/shmem.c b/mm/shmem.c index c3e2a65a65fc..32afc8039e66 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2025,14 +2025,18 @@ unlock: return error; } -static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp, - gfp_t gfp, struct vm_area_struct *vma, - struct vm_fault *vmf, vm_fault_t *fault_type) +int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, + enum sgp_type sgp) +{ + return shmem_get_folio_gfp(inode, index, foliop, sgp, + mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); +} + +int shmem_getpage(struct inode *inode, pgoff_t index, + struct page **pagep, enum sgp_type sgp) { struct folio *folio = NULL; - int ret = shmem_get_folio_gfp(inode, index, &folio, sgp, gfp, vma, - vmf, fault_type); + int ret = shmem_get_folio(inode, index, &folio, sgp); if (folio) *pagep = folio_file_page(folio, index); @@ -2041,13 +2045,6 @@ static int shmem_getpage_gfp(struct inode *inode, pgoff_t index, return ret; } -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - return shmem_getpage_gfp(inode, index, pagep, sgp, - mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); -} - /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the From a7f5862cc0624ca6b21da5a634ff232dc65776b5 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:21 +0100 Subject: [PATCH 099/350] shmem: convert shmem_get_partial_folio() to use shmem_get_folio() Get rid of an unnecessary folio->page->folio conversion. Link: https://lkml.kernel.org/r/20220902194653.1739778-26-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 32afc8039e66..772a30593fcc 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -874,10 +874,9 @@ void shmem_unlock_mapping(struct address_space *mapping) static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) { struct folio *folio; - struct page *page; /* - * At first avoid shmem_getpage(,,,SGP_READ): that fails + * At first avoid shmem_get_folio(,,,SGP_READ): that fails * beyond i_size, and reports fallocated pages as holes. */ folio = __filemap_get_folio(inode->i_mapping, index, @@ -888,9 +887,9 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index) * But read a page back from swap if any of it is within i_size * (although in some cases this is just a waste of time). */ - page = NULL; - shmem_getpage(inode, index, &page, SGP_READ); - return page ? page_folio(page) : NULL; + folio = NULL; + shmem_get_folio(inode, index, &folio, SGP_READ); + return folio; } /* From eff1f906c2dcd83ce7cbd38d2b853d2c49027f39 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:22 +0100 Subject: [PATCH 100/350] shmem: convert shmem_write_begin() to use shmem_get_folio() Use a folio throughout this function, saving a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-27-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 772a30593fcc..c69b53602a1d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2498,6 +2498,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + struct folio *folio; int ret = 0; /* i_rwsem is held by caller */ @@ -2509,14 +2510,15 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_get_folio(inode, index, &folio, SGP_WRITE); if (ret) return ret; + *pagep = folio_file_page(folio, index); if (PageHWPoison(*pagep)) { - unlock_page(*pagep); - put_page(*pagep); + folio_unlock(folio); + folio_put(folio); *pagep = NULL; return -EIO; } From 4601e2fc8b57840660ce1a1ee98aea873fa15eee Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:23 +0100 Subject: [PATCH 101/350] shmem: convert shmem_file_read_iter() to use shmem_get_folio() Use a folio throughout, saving five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-28-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c69b53602a1d..0f8119312847 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2577,6 +2577,7 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) offset = *ppos & ~PAGE_MASK; for (;;) { + struct folio *folio = NULL; struct page *page = NULL; pgoff_t end_index; unsigned long nr, ret; @@ -2591,17 +2592,18 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) break; } - error = shmem_getpage(inode, index, &page, SGP_READ); + error = shmem_get_folio(inode, index, &folio, SGP_READ); if (error) { if (error == -EINVAL) error = 0; break; } - if (page) { - unlock_page(page); + if (folio) { + folio_unlock(folio); + page = folio_file_page(folio, index); if (PageHWPoison(page)) { - put_page(page); + folio_put(folio); error = -EIO; break; } @@ -2617,14 +2619,14 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (index == end_index) { nr = i_size & ~PAGE_MASK; if (nr <= offset) { - if (page) - put_page(page); + if (folio) + folio_put(folio); break; } } nr -= offset; - if (page) { + if (folio) { /* * If users can be writing to this page using arbitrary * virtual addresses, take care about potential aliasing @@ -2636,13 +2638,13 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) * Mark the page accessed if we read the beginning. */ if (!offset) - mark_page_accessed(page); + folio_mark_accessed(folio); /* * Ok, we have the page, and it's up-to-date, so * now we can copy it to user space... */ ret = copy_page_to_iter(page, offset, nr, to); - put_page(page); + folio_put(folio); } else if (user_backed_iter(to)) { /* From b0802b22a97581608df3d2db2e705fe599777b18 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:24 +0100 Subject: [PATCH 102/350] shmem: convert shmem_fallocate() to use a folio Call shmem_get_folio() and use the folio APIs instead of the page APIs. Saves several calls to compound_head() and removes assumptions about the size of a large folio. Link: https://lkml.kernel.org/r/20220902194653.1739778-29-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 0f8119312847..c2016a7cfc29 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2787,7 +2787,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, info->fallocend = end; for (index = start; index < end; ) { - struct page *page; + struct folio *folio; /* * Good, the fallocate(2) manpage permits EINTR: we may have @@ -2798,10 +2798,11 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, else if (shmem_falloc.nr_unswapped > shmem_falloc.nr_falloced) error = -ENOMEM; else - error = shmem_getpage(inode, index, &page, SGP_FALLOC); + error = shmem_get_folio(inode, index, &folio, + SGP_FALLOC); if (error) { info->fallocend = undo_fallocend; - /* Remove the !PageUptodate pages we added */ + /* Remove the !uptodate folios we added */ if (index > start) { shmem_undo_range(inode, (loff_t)start << PAGE_SHIFT, @@ -2810,37 +2811,34 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, goto undone; } - index++; /* * Here is a more important optimization than it appears: - * a second SGP_FALLOC on the same huge page will clear it, - * making it PageUptodate and un-undoable if we fail later. + * a second SGP_FALLOC on the same large folio will clear it, + * making it uptodate and un-undoable if we fail later. */ - if (PageTransCompound(page)) { - index = round_up(index, HPAGE_PMD_NR); - /* Beware 32-bit wraparound */ - if (!index) - index--; - } + index = folio_next_index(folio); + /* Beware 32-bit wraparound */ + if (!index) + index--; /* * Inform shmem_writepage() how far we have reached. * No need for lock or barrier: we have the page lock. */ - if (!PageUptodate(page)) + if (!folio_test_uptodate(folio)) shmem_falloc.nr_falloced += index - shmem_falloc.next; shmem_falloc.next = index; /* - * If !PageUptodate, leave it that way so that freeable pages + * If !uptodate, leave it that way so that freeable folios * can be recognized if we need to rollback on error later. - * But set_page_dirty so that memory pressure will swap rather - * than free the pages we are allocating (and SGP_CACHE pages + * But mark it dirty so that memory pressure will swap rather + * than free the folios we are allocating (and SGP_CACHE folios * might still be clean: we now need to mark those dirty too). */ - set_page_dirty(page); - unlock_page(page); - put_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); cond_resched(); } From 7ad0414bded6e8678840368be5cc72b9957a4478 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:25 +0100 Subject: [PATCH 103/350] shmem: convert shmem_symlink() to use a folio While symlinks will always be < PAGE_SIZE, using the folio APIs gets rid of unnecessary calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-30-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index c2016a7cfc29..4948ceffcc9f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3093,7 +3093,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, int error; int len; struct inode *inode; - struct page *page; + struct folio *folio; len = strlen(symname) + 1; if (len > PAGE_SIZE) @@ -3121,18 +3121,18 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &shmem_short_symlink_operations; } else { inode_nohighmem(inode); - error = shmem_getpage(inode, 0, &page, SGP_WRITE); + error = shmem_get_folio(inode, 0, &folio, SGP_WRITE); if (error) { iput(inode); return error; } inode->i_mapping->a_ops = &shmem_aops; inode->i_op = &shmem_symlink_inode_operations; - memcpy(page_address(page), symname, len); - SetPageUptodate(page); - set_page_dirty(page); - unlock_page(page); - put_page(page); + memcpy(folio_address(folio), symname, len); + folio_mark_uptodate(folio); + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); From e4b57722d0e6be8820039a7d506378640aee5073 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:26 +0100 Subject: [PATCH 104/350] shmem: convert shmem_get_link() to use a folio Symlinks will never use a large folio, but using the folio API removes a lot of unnecessary folio->page->folio conversions. Link: https://lkml.kernel.org/r/20220902194653.1739778-31-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/shmem.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 4948ceffcc9f..e6e934adeed7 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3143,40 +3143,41 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, static void shmem_put_link(void *arg) { - mark_page_accessed(arg); - put_page(arg); + folio_mark_accessed(arg); + folio_put(arg); } static const char *shmem_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page = NULL; + struct folio *folio = NULL; int error; + if (!dentry) { - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(page) || - !PageUptodate(page)) { - put_page(page); + if (PageHWPoison(&folio->page) || + !folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { - error = shmem_getpage(inode, 0, &page, SGP_READ); + error = shmem_get_folio(inode, 0, &folio, SGP_READ); if (error) return ERR_PTR(error); - if (!page) + if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(page)) { - unlock_page(page); - put_page(page); + if (PageHWPoison(&folio->page)) { + folio_unlock(folio); + folio_put(folio); return ERR_PTR(-ECHILD); } - unlock_page(page); + folio_unlock(folio); } - set_delayed_call(done, shmem_put_link, page); - return page_address(page); + set_delayed_call(done, shmem_put_link, folio); + return folio_address(folio); } #ifdef CONFIG_TMPFS_XATTR From 7459c149ae9ca7d6f241b3a3764aa81b9c405a0e Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:27 +0100 Subject: [PATCH 105/350] khugepaged: call shmem_get_folio() shmem_getpage() is being removed, so call its replacement and find the precise page ourselves. Link: https://lkml.kernel.org/r/20220902194653.1739778-32-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/khugepaged.c | 7 +++++-- mm/shmem.c | 4 ++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1e59fe7bfae3..57af2c841b41 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1647,13 +1647,16 @@ static int collapse_file(struct mm_struct *mm, struct file *file, } if (xa_is_value(page) || !PageUptodate(page)) { + struct folio *folio; + xas_unlock_irq(&xas); /* swap in or instantiate fallocated page */ - if (shmem_getpage(mapping->host, index, &page, - SGP_NOALLOC)) { + if (shmem_get_folio(mapping->host, index, + &folio, SGP_NOALLOC)) { result = SCAN_FAIL; goto xa_unlocked; } + page = folio_file_page(folio, index); } else if (trylock_page(page)) { get_page(page); xas_unlock_irq(&xas); diff --git a/mm/shmem.c b/mm/shmem.c index e6e934adeed7..909149b25d98 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3158,7 +3158,7 @@ static const char *shmem_get_link(struct dentry *dentry, folio = filemap_get_folio(inode->i_mapping, 0); if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(&folio->page) || + if (PageHWPoison(folio_page(folio, 0)) || !folio_test_uptodate(folio)) { folio_put(folio); return ERR_PTR(-ECHILD); @@ -3169,7 +3169,7 @@ static const char *shmem_get_link(struct dentry *dentry, return ERR_PTR(error); if (!folio) return ERR_PTR(-ECHILD); - if (PageHWPoison(&folio->page)) { + if (PageHWPoison(folio_page(folio, 0))) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ECHILD); From 12acf4fbc4f78b24822317888b9406d56dc9ad2a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:28 +0100 Subject: [PATCH 106/350] userfaultfd: convert mcontinue_atomic_pte() to use a folio shmem_getpage() is being replaced by shmem_get_folio() so use a folio throughout this function. Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-33-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/userfaultfd.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 7327b2573f7c..9c035be2148b 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -243,20 +243,22 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, { struct inode *inode = file_inode(dst_vma->vm_file); pgoff_t pgoff = linear_page_index(dst_vma, dst_addr); + struct folio *folio; struct page *page; int ret; - ret = shmem_getpage(inode, pgoff, &page, SGP_NOALLOC); - /* Our caller expects us to return -EFAULT if we failed to find page. */ + ret = shmem_get_folio(inode, pgoff, &folio, SGP_NOALLOC); + /* Our caller expects us to return -EFAULT if we failed to find folio */ if (ret == -ENOENT) ret = -EFAULT; if (ret) goto out; - if (!page) { + if (!folio) { ret = -EFAULT; goto out; } + page = folio_file_page(folio, pgoff); if (PageHWPoison(page)) { ret = -EIO; goto out_release; @@ -267,13 +269,13 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, if (ret) goto out_release; - unlock_page(page); + folio_unlock(folio); ret = 0; out: return ret; out_release: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } From 923e2f0e7c30db5c1ee5d680050ab781e6c114fb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:29 +0100 Subject: [PATCH 107/350] shmem: remove shmem_getpage() With all callers removed, remove this wrapper function. The flags are now mysteriously called SGP, but I think we can live with that. Link: https://lkml.kernel.org/r/20220902194653.1739778-34-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 4 +--- mm/shmem.c | 15 +-------------- 2 files changed, 2 insertions(+), 17 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f4bd50b08a91..f24071e3c826 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -102,7 +102,7 @@ extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end); -/* Flag allocation requirements to shmem_getpage */ +/* Flag allocation requirements to shmem_get_folio */ enum sgp_type { SGP_READ, /* don't exceed i_size, don't allocate page */ SGP_NOALLOC, /* similar, but fail on hole or use fallocated page */ @@ -111,8 +111,6 @@ enum sgp_type { SGP_FALLOC, /* like SGP_WRITE, but make existing page Uptodate */ }; -extern int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp); int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, enum sgp_type sgp); diff --git a/mm/shmem.c b/mm/shmem.c index 909149b25d98..3d0b729fcc5e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -179,7 +179,7 @@ static inline int shmem_reacct_size(unsigned long flags, /* * ... whereas tmpfs objects are accounted incrementally as * pages are allocated, in order to allow large sparse files. - * shmem_getpage reports shmem_acct_block failure as -ENOSPC not -ENOMEM, + * shmem_get_folio reports shmem_acct_block failure as -ENOSPC not -ENOMEM, * so that a failure on a sparse tmpfs mapping will give SIGBUS not OOM. */ static inline int shmem_acct_block(unsigned long flags, long pages) @@ -2031,19 +2031,6 @@ int shmem_get_folio(struct inode *inode, pgoff_t index, struct folio **foliop, mapping_gfp_mask(inode->i_mapping), NULL, NULL, NULL); } -int shmem_getpage(struct inode *inode, pgoff_t index, - struct page **pagep, enum sgp_type sgp) -{ - struct folio *folio = NULL; - int ret = shmem_get_folio(inode, index, &folio, sgp); - - if (folio) - *pagep = folio_file_page(folio, index); - else - *pagep = NULL; - return ret; -} - /* * This is like autoremove_wake_function, but it removes the wait queue * entry unconditionally - even if something else had already woken the From 000085b9af9f3ca13dd672a753f815ac0cb45d0a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:30 +0100 Subject: [PATCH 108/350] swapfile: convert try_to_unuse() to use a folio Saves five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-35-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index aafe739dc2a6..23cdbe8e47cf 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2032,7 +2032,7 @@ static int try_to_unuse(unsigned int type) struct list_head *p; int retval = 0; struct swap_info_struct *si = swap_info[type]; - struct page *page; + struct folio *folio; swp_entry_t entry; unsigned int i; @@ -2082,21 +2082,21 @@ retry: (i = find_next_to_unuse(si, i)) != 0) { entry = swp_entry(type, i); - page = find_get_page(swap_address_space(entry), i); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), i); + if (!folio) continue; /* - * It is conceivable that a racing task removed this page from - * swap cache just before we acquired the page lock. The page + * It is conceivable that a racing task removed this folio from + * swap cache just before we acquired the page lock. The folio * might even be back in swap cache on another swap area. But - * that is okay, try_to_free_swap() only removes stale pages. + * that is okay, folio_free_swap() only removes stale folios. */ - lock_page(page); - wait_on_page_writeback(page); - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_lock(folio); + folio_wait_writeback(folio); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); } /* From 2c3f6194b008b23e52a8e135bdd56b67fdaa55ca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:31 +0100 Subject: [PATCH 109/350] swapfile: convert __try_to_reclaim_swap() to use a folio Saves five calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-36-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 23cdbe8e47cf..e3e1bd3d20b1 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -132,27 +132,27 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, unsigned long offset, unsigned long flags) { swp_entry_t entry = swp_entry(si->type, offset); - struct page *page; + struct folio *folio; int ret = 0; - page = find_get_page(swap_address_space(entry), offset); - if (!page) + folio = filemap_get_folio(swap_address_space(entry), offset); + if (!folio) return 0; /* * When this function is called from scan_swap_map_slots() and it's - * called by vmscan.c at reclaiming pages. So, we hold a lock on a page, + * called by vmscan.c at reclaiming folios. So we hold a folio lock * here. We have to use trylock for avoiding deadlock. This is a special - * case and you should use try_to_free_swap() with explicit lock_page() + * case and you should use folio_free_swap() with explicit folio_lock() * in usual operations. */ - if (trylock_page(page)) { + if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || - ((flags & TTRS_UNMAPPED) && !page_mapped(page)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(page))) - ret = try_to_free_swap(page); - unlock_page(page); + ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || + ((flags & TTRS_FULL) && mem_cgroup_swap_full(&folio->page))) + ret = folio_free_swap(folio); + folio_unlock(folio); } - put_page(page); + folio_put(folio); return ret; } From f102cd8b173e066179b472fb6e3b18e31a1cc394 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:32 +0100 Subject: [PATCH 110/350] swapfile: convert unuse_pte_range() to use a folio Delay fetching the precise page from the folio until we're in unuse_pte(). Saves many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-37-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swapfile.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index e3e1bd3d20b1..3820b5ab64d9 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1758,8 +1758,9 @@ static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) * force COW, vm_page_prot omits write permission from any private vma. */ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, - unsigned long addr, swp_entry_t entry, struct page *page) + unsigned long addr, swp_entry_t entry, struct folio *folio) { + struct page *page = folio_file_page(folio, swp_offset(entry)); struct page *swapcache; spinlock_t *ptl; pte_t *pte, new_pte; @@ -1831,17 +1832,18 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, unsigned long addr, unsigned long end, unsigned int type) { - struct page *page; swp_entry_t entry; pte_t *pte; struct swap_info_struct *si; - unsigned long offset; int ret = 0; volatile unsigned char *swap_map; si = swap_info[type]; pte = pte_offset_map(pmd, addr); do { + struct folio *folio; + unsigned long offset; + if (!is_swap_pte(*pte)) continue; @@ -1852,8 +1854,9 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, offset = swp_offset(entry); pte_unmap(pte); swap_map = &si->swap_map[offset]; - page = lookup_swap_cache(entry, vma, addr); - if (!page) { + folio = swap_cache_get_folio(entry, vma, addr); + if (!folio) { + struct page *page; struct vm_fault vmf = { .vma = vma, .address = addr, @@ -1863,25 +1866,27 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd, page = swapin_readahead(entry, GFP_HIGHUSER_MOVABLE, &vmf); + if (page) + folio = page_folio(page); } - if (!page) { + if (!folio) { if (*swap_map == 0 || *swap_map == SWAP_MAP_BAD) goto try_next; return -ENOMEM; } - lock_page(page); - wait_on_page_writeback(page); - ret = unuse_pte(vma, pmd, addr, entry, page); + folio_lock(folio); + folio_wait_writeback(folio); + ret = unuse_pte(vma, pmd, addr, entry, folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); goto out; } - try_to_free_swap(page); - unlock_page(page); - put_page(page); + folio_free_swap(folio); + folio_unlock(folio); + folio_put(folio); try_next: pte = pte_offset_map(pmd, addr); } while (pte++, addr += PAGE_SIZE, addr != end); From 5a423081b2465d38baf2fcbbc19f77d211507061 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:33 +0100 Subject: [PATCH 111/350] mm: convert do_swap_page() to use swap_cache_get_folio() Saves a folio->page->folio conversion. Link: https://lkml.kernel.org/r/20220902194653.1739778-38-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b36b177e0ea9..0018df3f0cc2 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3767,9 +3767,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (unlikely(!si)) goto out; - page = lookup_swap_cache(entry, vma, vmf->address); - if (page) - folio = page_folio(page); + folio = swap_cache_get_folio(entry, vma, vmf->address); + if (folio) + page = folio_file_page(folio, swp_offset(entry)); swapcache = folio; if (!folio) { From cb691e2f28bc63b1a872aa593dd542ee796e8364 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:34 +0100 Subject: [PATCH 112/350] mm: remove lookup_swap_cache() All callers have now been converted to swap_cache_get_folio(), so we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-39-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- mm/swap.h | 10 ---------- mm/swap_state.c | 12 +----------- 3 files changed, 2 insertions(+), 22 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 621b4472c409..9863fb588972 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5569,7 +5569,7 @@ static struct page *mc_handle_swap_pte(struct vm_area_struct *vma, return NULL; /* - * Because lookup_swap_cache() updates some statistics counter, + * Because swap_cache_get_folio() updates some statistics counter, * we call find_get_page() with swapper_space directly. */ page = find_get_page(swap_address_space(ent), swp_offset(ent)); diff --git a/mm/swap.h b/mm/swap.h index ccd8d9a9ad36..cc08c459c619 100644 --- a/mm/swap.h +++ b/mm/swap.h @@ -41,9 +41,6 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, unsigned long end); struct folio *swap_cache_get_folio(swp_entry_t entry, struct vm_area_struct *vma, unsigned long addr); -struct page *lookup_swap_cache(swp_entry_t entry, - struct vm_area_struct *vma, - unsigned long addr); struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index); struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, @@ -107,13 +104,6 @@ static inline struct folio *swap_cache_get_folio(swp_entry_t entry, return NULL; } -static inline struct page *lookup_swap_cache(swp_entry_t swp, - struct vm_area_struct *vma, - unsigned long addr) -{ - return NULL; -} - static inline struct page *find_get_incore_page(struct address_space *mapping, pgoff_t index) { diff --git a/mm/swap_state.c b/mm/swap_state.c index b96bf4ec8b5b..4af135a7b53c 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -369,16 +369,6 @@ struct folio *swap_cache_get_folio(swp_entry_t entry, return folio; } -struct page *lookup_swap_cache(swp_entry_t entry, struct vm_area_struct *vma, - unsigned long addr) -{ - struct folio *folio = swap_cache_get_folio(entry, vma, addr); - - if (!folio) - return NULL; - return folio_file_page(folio, swp_offset(entry)); -} - /** * find_get_incore_page - Find and get a page from the page or swap caches. * @mapping: The address_space to search. @@ -430,7 +420,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask, int err; /* * First check the swap cache. Since this is normally - * called after lookup_swap_cache() failed, re-calling + * called after swap_cache_get_folio() failed, re-calling * that would confuse statistics. */ si = get_swap_device(entry); From aedd74d4397a2b1a4882215b6169b47d139c0319 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:35 +0100 Subject: [PATCH 113/350] swap_state: convert free_swap_cache() to use a folio Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-40-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/swap_state.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/mm/swap_state.c b/mm/swap_state.c index 4af135a7b53c..438d0676c5be 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -272,16 +272,19 @@ void clear_shadow_from_swap_cache(int type, unsigned long begin, /* * If we are the only user, then try to free up the swap cache. * - * Its ok to check for PageSwapCache without the page lock + * Its ok to check the swapcache flag without the folio lock * here because we are going to recheck again inside - * try_to_free_swap() _with_ the lock. + * folio_free_swap() _with_ the lock. * - Marcelo */ void free_swap_cache(struct page *page) { - if (PageSwapCache(page) && !page_mapped(page) && trylock_page(page)) { - try_to_free_swap(page); - unlock_page(page); + struct folio *folio = page_folio(page); + + if (folio_test_swapcache(folio) && !folio_mapped(folio) && + folio_trylock(folio)) { + folio_free_swap(folio); + folio_unlock(folio); } } From 71fa1a533d2e027a3df98fd065605bebab42d7bf Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:36 +0100 Subject: [PATCH 114/350] swap: convert swap_writepage() to use a folio Removes many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-41-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/page_io.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/page_io.c b/mm/page_io.c index fc6b3fb1f7c5..2af34dd8fa4d 100644 --- a/mm/page_io.c +++ b/mm/page_io.c @@ -180,29 +180,30 @@ bad_bmap: */ int swap_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); int ret = 0; - if (try_to_free_swap(page)) { - unlock_page(page); + if (folio_free_swap(folio)) { + folio_unlock(folio); goto out; } /* * Arch code may have to preserve more data than just the page * contents, e.g. memory tags. */ - ret = arch_prepare_to_swap(page); + ret = arch_prepare_to_swap(&folio->page); if (ret) { - set_page_dirty(page); - unlock_page(page); + folio_mark_dirty(folio); + folio_unlock(folio); goto out; } - if (frontswap_store(page) == 0) { - set_page_writeback(page); - unlock_page(page); - end_page_writeback(page); + if (frontswap_store(&folio->page) == 0) { + folio_start_writeback(folio); + folio_unlock(folio); + folio_end_writeback(folio); goto out; } - ret = __swap_writepage(page, wbc); + ret = __swap_writepage(&folio->page, wbc); out: return ret; } From e4a2ed94908cc0104b8826ed8d831661ed1c3ea1 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:37 +0100 Subject: [PATCH 115/350] mm: convert do_wp_page() to use a folio Saves many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-42-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 40 ++++++++++++++++++++-------------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 0018df3f0cc2..2f1397b7c77d 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3368,6 +3368,7 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; + struct folio *folio; VM_BUG_ON(unshare && (vmf->flags & FAULT_FLAG_WRITE)); VM_BUG_ON(!unshare && !(vmf->flags & FAULT_FLAG_WRITE)); @@ -3414,48 +3415,47 @@ static vm_fault_t do_wp_page(struct vm_fault *vmf) * Take out anonymous pages first, anonymous shared vmas are * not dirty accountable. */ - if (PageAnon(vmf->page)) { - struct page *page = vmf->page; - + folio = page_folio(vmf->page); + if (folio_test_anon(folio)) { /* * If the page is exclusive to this process we must reuse the * page without further checks. */ - if (PageAnonExclusive(page)) + if (PageAnonExclusive(vmf->page)) goto reuse; /* - * We have to verify under page lock: these early checks are - * just an optimization to avoid locking the page and freeing + * We have to verify under folio lock: these early checks are + * just an optimization to avoid locking the folio and freeing * the swapcache if there is little hope that we can reuse. * - * PageKsm() doesn't necessarily raise the page refcount. + * KSM doesn't necessarily raise the folio refcount. */ - if (PageKsm(page) || page_count(page) > 3) + if (folio_test_ksm(folio) || folio_ref_count(folio) > 3) goto copy; - if (!PageLRU(page)) + if (!folio_test_lru(folio)) /* * Note: We cannot easily detect+handle references from - * remote LRU pagevecs or references to PageLRU() pages. + * remote LRU pagevecs or references to LRU folios. */ lru_add_drain(); - if (page_count(page) > 1 + PageSwapCache(page)) + if (folio_ref_count(folio) > 1 + folio_test_swapcache(folio)) goto copy; - if (!trylock_page(page)) + if (!folio_trylock(folio)) goto copy; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (PageKsm(page) || page_count(page) != 1) { - unlock_page(page); + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_test_ksm(folio) || folio_ref_count(folio) != 1) { + folio_unlock(folio); goto copy; } /* - * Ok, we've got the only page reference from our mapping - * and the page is locked, it's dark out, and we're wearing + * Ok, we've got the only folio reference from our mapping + * and the folio is locked, it's dark out, and we're wearing * sunglasses. Hit it. */ - page_move_anon_rmap(page, vma); - unlock_page(page); + page_move_anon_rmap(vmf->page, vma); + folio_unlock(folio); reuse: if (unlikely(unshare)) { pte_unmap_unlock(vmf->pte, vmf->ptl); From 2fad3d14b9ebc8e42977bfb34a8165bb61a7c3f7 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:38 +0100 Subject: [PATCH 116/350] huge_memory: convert do_huge_pmd_wp_page() to use a folio Removes many calls to compound_head(). Does not remove the assumption that a folio may not be larger than a PMD. Link: https://lkml.kernel.org/r/20220902194653.1739778-43-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 84bf1d5f6b7e..1181e623bf5b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1305,6 +1305,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) { const bool unshare = vmf->flags & FAULT_FLAG_UNSHARE; struct vm_area_struct *vma = vmf->vma; + struct folio *folio; struct page *page; unsigned long haddr = vmf->address & HPAGE_PMD_MASK; pmd_t orig_pmd = vmf->orig_pmd; @@ -1326,46 +1327,48 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) } page = pmd_page(orig_pmd); + folio = page_folio(page); VM_BUG_ON_PAGE(!PageHead(page), page); /* Early check when only holding the PT lock. */ if (PageAnonExclusive(page)) goto reuse; - if (!trylock_page(page)) { - get_page(page); + if (!folio_trylock(folio)) { + folio_get(folio); spin_unlock(vmf->ptl); - lock_page(page); + folio_lock(folio); spin_lock(vmf->ptl); if (unlikely(!pmd_same(*vmf->pmd, orig_pmd))) { spin_unlock(vmf->ptl); - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return 0; } - put_page(page); + folio_put(folio); } /* Recheck after temporarily dropping the PT lock. */ if (PageAnonExclusive(page)) { - unlock_page(page); + folio_unlock(folio); goto reuse; } /* - * See do_wp_page(): we can only reuse the page exclusively if there are - * no additional references. Note that we always drain the LRU - * pagevecs immediately after adding a THP. + * See do_wp_page(): we can only reuse the folio exclusively if + * there are no additional references. Note that we always drain + * the LRU pagevecs immediately after adding a THP. */ - if (page_count(page) > 1 + PageSwapCache(page) * thp_nr_pages(page)) + if (folio_ref_count(folio) > + 1 + folio_test_swapcache(folio) * folio_nr_pages(folio)) goto unlock_fallback; - if (PageSwapCache(page)) - try_to_free_swap(page); - if (page_count(page) == 1) { + if (folio_test_swapcache(folio)) + folio_free_swap(folio); + if (folio_ref_count(folio) == 1) { pmd_t entry; page_move_anon_rmap(page, vma); - unlock_page(page); + folio_unlock(folio); reuse: if (unlikely(unshare)) { spin_unlock(vmf->ptl); @@ -1380,7 +1383,7 @@ reuse: } unlock_fallback: - unlock_page(page); + folio_unlock(folio); spin_unlock(vmf->ptl); fallback: __split_huge_pmd(vma, vmf->pmd, vmf->address, false, NULL); From 98b211d6415f9538b81e50da71f09d195ce2afe6 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:39 +0100 Subject: [PATCH 117/350] madvise: convert madvise_free_pte_range() to use a folio Saves a lot of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-44-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/madvise.c | 49 +++++++++++++++++++++++++------------------------ 1 file changed, 25 insertions(+), 24 deletions(-) diff --git a/mm/madvise.c b/mm/madvise.c index a3fc4cd32ed3..2baa93ca2310 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -601,6 +601,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte, *pte, ptent; + struct folio *folio; struct page *page; int nr_swap = 0; unsigned long next; @@ -645,56 +646,56 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, page = vm_normal_page(vma, addr, ptent); if (!page || is_zone_device_page(page)) continue; + folio = page_folio(page); /* - * If pmd isn't transhuge but the page is THP and + * If pmd isn't transhuge but the folio is large and * is owned by only this process, split it and * deactivate all pages. */ - if (PageTransCompound(page)) { - if (page_mapcount(page) != 1) + if (folio_test_large(folio)) { + if (folio_mapcount(folio) != 1) goto out; - get_page(page); - if (!trylock_page(page)) { - put_page(page); + folio_get(folio); + if (!folio_trylock(folio)) { + folio_put(folio); goto out; } pte_unmap_unlock(orig_pte, ptl); - if (split_huge_page(page)) { - unlock_page(page); - put_page(page); + if (split_folio(folio)) { + folio_unlock(folio); + folio_put(folio); orig_pte = pte_offset_map_lock(mm, pmd, addr, &ptl); goto out; } - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl); pte--; addr -= PAGE_SIZE; continue; } - VM_BUG_ON_PAGE(PageTransCompound(page), page); - - if (PageSwapCache(page) || PageDirty(page)) { - if (!trylock_page(page)) + if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { + if (!folio_trylock(folio)) continue; /* - * If page is shared with others, we couldn't clear - * PG_dirty of the page. + * If folio is shared with others, we mustn't clear + * the folio's dirty flag. */ - if (page_mapcount(page) != 1) { - unlock_page(page); + if (folio_mapcount(folio) != 1) { + folio_unlock(folio); continue; } - if (PageSwapCache(page) && !try_to_free_swap(page)) { - unlock_page(page); + if (folio_test_swapcache(folio) && + !folio_free_swap(folio)) { + folio_unlock(folio); continue; } - ClearPageDirty(page); - unlock_page(page); + folio_clear_dirty(folio); + folio_unlock(folio); } if (pte_young(ptent) || pte_dirty(ptent)) { @@ -712,7 +713,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, set_pte_at(mm, addr, pte, ptent); tlb_remove_tlb_entry(tlb, pte, addr); } - mark_page_lazyfree(page); + mark_page_lazyfree(&folio->page); } out: if (nr_swap) { From 5fcd079af9ed4e69cca0a2f77c6255d0eb8a8cca Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:40 +0100 Subject: [PATCH 118/350] uprobes: use folios more widely in __replace_page() Remove a few hidden calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-45-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 401bc2d24ce0..70375c7c0c4b 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -19,7 +19,7 @@ #include #include /* anon_vma_prepare */ #include /* set_pte_at_notify */ -#include /* try_to_free_swap */ +#include /* folio_free_swap */ #include /* user_enable_single_step */ #include /* notifier mechanism */ #include "../../mm/internal.h" /* munlock_vma_page */ @@ -154,8 +154,9 @@ static loff_t vaddr_to_offset(struct vm_area_struct *vma, unsigned long vaddr) static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { + struct folio *old_folio = page_folio(old_page); struct mm_struct *mm = vma->vm_mm; - DEFINE_FOLIO_VMA_WALK(pvmw, page_folio(old_page), vma, addr, 0); + DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; struct mmu_notifier_range range; @@ -169,8 +170,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, return err; } - /* For try_to_free_swap() below */ - lock_page(old_page); + /* For folio_free_swap() below */ + folio_lock(old_folio); mmu_notifier_invalidate_range_start(&range); err = -EAGAIN; @@ -186,7 +187,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); - if (!PageAnon(old_page)) { + if (!folio_test_anon(old_folio)) { dec_mm_counter(mm, mm_counter_file(old_page)); inc_mm_counter(mm, MM_ANONPAGES); } @@ -198,15 +199,15 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, mk_pte(new_page, vma->vm_page_prot)); page_remove_rmap(old_page, vma, false); - if (!page_mapped(old_page)) - try_to_free_swap(old_page); + if (!folio_mapped(old_folio)) + folio_free_swap(old_folio); page_vma_mapped_walk_done(&pvmw); - put_page(old_page); + folio_put(old_folio); err = 0; unlock: mmu_notifier_invalidate_range_end(&range); - unlock_page(old_page); + folio_unlock(old_folio); return err; } From b4e6f66e45b43aed0903731b6c0700573f88282a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:41 +0100 Subject: [PATCH 119/350] ksm: use a folio in replace_page() Replace three calls to compound_head() with one. Link: https://lkml.kernel.org/r/20220902194653.1739778-46-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/ksm.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/ksm.c b/mm/ksm.c index c3edb5836a44..c19fcca9bc03 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -1110,6 +1110,7 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, struct page *kpage, pte_t orig_pte) { struct mm_struct *mm = vma->vm_mm; + struct folio *folio; pmd_t *pmd; pmd_t pmde; pte_t *ptep; @@ -1178,10 +1179,11 @@ static int replace_page(struct vm_area_struct *vma, struct page *page, ptep_clear_flush(vma, addr, ptep); set_pte_at_notify(mm, addr, ptep, newpte); + folio = page_folio(page); page_remove_rmap(page, vma, false); - if (!page_mapped(page)) - try_to_free_swap(page); - put_page(page); + if (!folio_mapped(folio)) + folio_free_swap(folio); + folio_put(folio); pte_unmap_unlock(ptep, ptl); err = 0; From a160e5377b55bc5c1925a7456b656aabfc07261f Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:42 +0100 Subject: [PATCH 120/350] mm: convert do_swap_page() to use folio_free_swap() Also convert should_try_to_free_swap() to use a folio. This removes a few calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-47-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/memory.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 2f1397b7c77d..b8e4dae18ac1 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3641,14 +3641,14 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) return 0; } -static inline bool should_try_to_free_swap(struct page *page, +static inline bool should_try_to_free_swap(struct folio *folio, struct vm_area_struct *vma, unsigned int fault_flags) { - if (!PageSwapCache(page)) + if (!folio_test_swapcache(folio)) return false; - if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || - PageMlocked(page)) + if (mem_cgroup_swap_full(&folio->page) || (vma->vm_flags & VM_LOCKED) || + folio_test_mlocked(folio)) return true; /* * If we want to map a page that's in the swapcache writable, we @@ -3656,8 +3656,8 @@ static inline bool should_try_to_free_swap(struct page *page, * user. Try freeing the swapcache to get rid of the swapcache * reference only in case it's likely that we'll be the exlusive user. */ - return (fault_flags & FAULT_FLAG_WRITE) && !PageKsm(page) && - page_count(page) == 2; + return (fault_flags & FAULT_FLAG_WRITE) && !folio_test_ksm(folio) && + folio_ref_count(folio) == 2; } static vm_fault_t pte_marker_clear(struct vm_fault *vmf) @@ -3949,8 +3949,8 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) * yet. */ swap_free(entry); - if (should_try_to_free_swap(page, vma, vmf->flags)) - try_to_free_swap(page); + if (should_try_to_free_swap(folio, vma, vmf->flags)) + folio_free_swap(folio); inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); From 9202d527b715f67bcdccbb9b712b65fe053f8109 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:43 +0100 Subject: [PATCH 121/350] memcg: convert mem_cgroup_swap_full() to take a folio All callers now have a folio, so convert the function to take a folio. Saves a couple of calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-48-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 4 ++-- mm/memcontrol.c | 6 +++--- mm/memory.c | 2 +- mm/swapfile.c | 2 +- mm/vmscan.c | 3 +-- 5 files changed, 8 insertions(+), 9 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 42cbef554de6..d8bd6401c3e7 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -692,7 +692,7 @@ static inline void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_p } extern long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg); -extern bool mem_cgroup_swap_full(struct page *page); +extern bool mem_cgroup_swap_full(struct folio *folio); #else static inline void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) { @@ -714,7 +714,7 @@ static inline long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return get_nr_swap_pages(); } -static inline bool mem_cgroup_swap_full(struct page *page) +static inline bool mem_cgroup_swap_full(struct folio *folio) { return vm_swap_full(); } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9863fb588972..632402001bca 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7406,18 +7406,18 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; } -bool mem_cgroup_swap_full(struct page *page) +bool mem_cgroup_swap_full(struct folio *folio) { struct mem_cgroup *memcg; - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); if (vm_swap_full()) return true; if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; - memcg = page_memcg(page); + memcg = folio_memcg(folio); if (!memcg) return false; diff --git a/mm/memory.c b/mm/memory.c index b8e4dae18ac1..2f1a6da7f1e6 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3647,7 +3647,7 @@ static inline bool should_try_to_free_swap(struct folio *folio, { if (!folio_test_swapcache(folio)) return false; - if (mem_cgroup_swap_full(&folio->page) || (vma->vm_flags & VM_LOCKED) || + if (mem_cgroup_swap_full(folio) || (vma->vm_flags & VM_LOCKED) || folio_test_mlocked(folio)) return true; /* diff --git a/mm/swapfile.c b/mm/swapfile.c index 3820b5ab64d9..4efcfe34e45b 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -148,7 +148,7 @@ static int __try_to_reclaim_swap(struct swap_info_struct *si, if (folio_trylock(folio)) { if ((flags & TTRS_ANYWAY) || ((flags & TTRS_UNMAPPED) && !folio_mapped(folio)) || - ((flags & TTRS_FULL) && mem_cgroup_swap_full(&folio->page))) + ((flags & TTRS_FULL) && mem_cgroup_swap_full(folio))) ret = folio_free_swap(folio); folio_unlock(folio); } diff --git a/mm/vmscan.c b/mm/vmscan.c index 1707e3bfcfe4..c5a4bff11da6 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2047,8 +2047,7 @@ activate_locked_split: activate_locked: /* Not a candidate for swapping, so reclaim swap space. */ if (folio_test_swapcache(folio) && - (mem_cgroup_swap_full(&folio->page) || - folio_test_mlocked(folio))) + (mem_cgroup_swap_full(folio) || folio_test_mlocked(folio))) folio_free_swap(folio); VM_BUG_ON_FOLIO(folio_test_active(folio), folio); if (!folio_test_mlocked(folio)) { From 3b344157c0c15b8f9588e3021dfb22ee25f4508a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:44 +0100 Subject: [PATCH 122/350] mm: remove try_to_free_swap() All callers have now been converted to folio_free_swap() and we can remove this wrapper. Link: https://lkml.kernel.org/r/20220902194653.1739778-49-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/swap.h | 6 ------ mm/folio-compat.c | 7 ------- mm/memory.c | 2 +- 3 files changed, 1 insertion(+), 14 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index d8bd6401c3e7..fc8d98660326 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -510,7 +510,6 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); extern void exit_swap_address_space(unsigned int type); @@ -595,11 +594,6 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -static inline int try_to_free_swap(struct page *page) -{ - return 0; -} - static inline swp_entry_t folio_alloc_swap(struct folio *folio) { swp_entry_t entry; diff --git a/mm/folio-compat.c b/mm/folio-compat.c index 06d47f00609b..e1e23b4947d7 100644 --- a/mm/folio-compat.c +++ b/mm/folio-compat.c @@ -146,10 +146,3 @@ void putback_lru_page(struct page *page) { folio_putback_lru(page_folio(page)); } - -#ifdef CONFIG_SWAP -int try_to_free_swap(struct page *page) -{ - return folio_free_swap(page_folio(page)); -} -#endif diff --git a/mm/memory.c b/mm/memory.c index 2f1a6da7f1e6..6e568f190e7a 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3844,7 +3844,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) if (swapcache) { /* - * Make sure try_to_free_swap or swapoff did not release the + * Make sure folio_free_swap() or swapoff did not release the * swapcache from under us. The page pin, and pte_same test * below, are not enough to exclude that. Even if it is still * swapcache, we need to check that the page's swap has not From 595af4c9368aba88c45831ef80ed686b602fe3fe Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:45 +0100 Subject: [PATCH 123/350] rmap: convert page_move_anon_rmap() to use a folio Removes one call to compound_head() and a reference to page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-50-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/rmap.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/mm/rmap.c b/mm/rmap.c index 2ff17b9aabd9..d44ff516a208 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1099,22 +1099,20 @@ int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff, */ void page_move_anon_rmap(struct page *page, struct vm_area_struct *vma) { - struct anon_vma *anon_vma = vma->anon_vma; - struct page *subpage = page; + void *anon_vma = vma->anon_vma; + struct folio *folio = page_folio(page); - page = compound_head(page); - - VM_BUG_ON_PAGE(!PageLocked(page), page); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); VM_BUG_ON_VMA(!anon_vma, vma); - anon_vma = (void *) anon_vma + PAGE_MAPPING_ANON; + anon_vma += PAGE_MAPPING_ANON; /* * Ensure that anon_vma and the PAGE_MAPPING_ANON bit are written * simultaneously, so a concurrent reader (eg folio_referenced()'s * folio_test_anon()) will not see one without the other. */ - WRITE_ONCE(page->mapping, (struct address_space *) anon_vma); - SetPageAnonExclusive(subpage); + WRITE_ONCE(folio->mapping, anon_vma); + SetPageAnonExclusive(page); } /** From 682a71a1b6b363bff71440f4eca6498f827a839d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:46 +0100 Subject: [PATCH 124/350] migrate: convert __unmap_and_move() to use folios Removes a lot of calls to compound_head(). Also remove a VM_BUG_ON that can never trigger as the PageAnon bit is the bottom bit of page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-51-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 75 ++++++++++++++++++++++++++-------------------------- 1 file changed, 37 insertions(+), 38 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index eb594b0db806..1ea149f14f84 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -993,17 +993,15 @@ out: return rc; } -static int __unmap_and_move(struct page *page, struct page *newpage, +static int __unmap_and_move(struct folio *src, struct folio *dst, int force, enum migrate_mode mode) { - struct folio *folio = page_folio(page); - struct folio *dst = page_folio(newpage); int rc = -EAGAIN; bool page_was_mapped = false; struct anon_vma *anon_vma = NULL; - bool is_lru = !__PageMovable(page); + bool is_lru = !__PageMovable(&src->page); - if (!trylock_page(page)) { + if (!folio_trylock(src)) { if (!force || mode == MIGRATE_ASYNC) goto out; @@ -1023,10 +1021,10 @@ static int __unmap_and_move(struct page *page, struct page *newpage, if (current->flags & PF_MEMALLOC) goto out; - lock_page(page); + folio_lock(src); } - if (PageWriteback(page)) { + if (folio_test_writeback(src)) { /* * Only in the case of a full synchronous migration is it * necessary to wait for PageWriteback. In the async case, @@ -1043,12 +1041,12 @@ static int __unmap_and_move(struct page *page, struct page *newpage, } if (!force) goto out_unlock; - wait_on_page_writeback(page); + folio_wait_writeback(src); } /* - * By try_to_migrate(), page->mapcount goes down to 0 here. In this case, - * we cannot notice that anon_vma is freed while we migrates a page. + * By try_to_migrate(), src->mapcount goes down to 0 here. In this case, + * we cannot notice that anon_vma is freed while we migrate a page. * This get_anon_vma() delays freeing anon_vma pointer until the end * of migration. File cache pages are no problem because of page_lock() * File Caches may use write_page() or lock_page() in migration, then, @@ -1060,22 +1058,22 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ - if (PageAnon(page) && !PageKsm(page)) - anon_vma = page_get_anon_vma(page); + if (folio_test_anon(src) && !folio_test_ksm(src)) + anon_vma = page_get_anon_vma(&src->page); /* * Block others from accessing the new page when we get around to * establishing additional references. We are usually the only one - * holding a reference to newpage at this point. We used to have a BUG - * here if trylock_page(newpage) fails, but would like to allow for - * cases where there might be a race with the previous use of newpage. + * holding a reference to dst at this point. We used to have a BUG + * here if folio_trylock(dst) fails, but would like to allow for + * cases where there might be a race with the previous use of dst. * This is much like races on refcount of oldpage: just don't BUG(). */ - if (unlikely(!trylock_page(newpage))) + if (unlikely(!folio_trylock(dst))) goto out_unlock; if (unlikely(!is_lru)) { - rc = move_to_new_folio(dst, folio, mode); + rc = move_to_new_folio(dst, src, mode); goto out_unlock_both; } @@ -1083,7 +1081,7 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * Corner case handling: * 1. When a new swap-cache page is read into, it is added to the LRU * and treated as swapcache but it has no rmap yet. - * Calling try_to_unmap() against a page->mapping==NULL page will + * Calling try_to_unmap() against a src->mapping==NULL page will * trigger a BUG. So handle it here. * 2. An orphaned page (see truncate_cleanup_page) might have * fs-private metadata. The page can be picked up due to memory @@ -1091,57 +1089,56 @@ static int __unmap_and_move(struct page *page, struct page *newpage, * invisible to the vm, so the page can not be migrated. So try to * free the metadata, so the page can be freed. */ - if (!page->mapping) { - VM_BUG_ON_PAGE(PageAnon(page), page); - if (page_has_private(page)) { - try_to_free_buffers(folio); + if (!src->mapping) { + if (folio_test_private(src)) { + try_to_free_buffers(src); goto out_unlock_both; } - } else if (page_mapped(page)) { + } else if (folio_mapped(src)) { /* Establish migration ptes */ - VM_BUG_ON_PAGE(PageAnon(page) && !PageKsm(page) && !anon_vma, - page); - try_to_migrate(folio, 0); + VM_BUG_ON_FOLIO(folio_test_anon(src) && + !folio_test_ksm(src) && !anon_vma, src); + try_to_migrate(src, 0); page_was_mapped = true; } - if (!page_mapped(page)) - rc = move_to_new_folio(dst, folio, mode); + if (!folio_mapped(src)) + rc = move_to_new_folio(dst, src, mode); /* - * When successful, push newpage to LRU immediately: so that if it + * When successful, push dst to LRU immediately: so that if it * turns out to be an mlocked page, remove_migration_ptes() will - * automatically build up the correct newpage->mlock_count for it. + * automatically build up the correct dst->mlock_count for it. * * We would like to do something similar for the old page, when * unsuccessful, and other cases when a page has been temporarily * isolated from the unevictable LRU: but this case is the easiest. */ if (rc == MIGRATEPAGE_SUCCESS) { - lru_cache_add(newpage); + folio_add_lru(dst); if (page_was_mapped) lru_add_drain(); } if (page_was_mapped) - remove_migration_ptes(folio, - rc == MIGRATEPAGE_SUCCESS ? dst : folio, false); + remove_migration_ptes(src, + rc == MIGRATEPAGE_SUCCESS ? dst : src, false); out_unlock_both: - unlock_page(newpage); + folio_unlock(dst); out_unlock: /* Drop an anon_vma reference if we took one */ if (anon_vma) put_anon_vma(anon_vma); - unlock_page(page); + folio_unlock(src); out: /* - * If migration is successful, decrease refcount of the newpage, + * If migration is successful, decrease refcount of dst, * which will not free the page because new page owner increased * refcounter. */ if (rc == MIGRATEPAGE_SUCCESS) - put_page(newpage); + folio_put(dst); return rc; } @@ -1157,6 +1154,7 @@ static int unmap_and_move(new_page_t get_new_page, enum migrate_reason reason, struct list_head *ret) { + struct folio *dst, *src = page_folio(page); int rc = MIGRATEPAGE_SUCCESS; struct page *newpage = NULL; @@ -1174,9 +1172,10 @@ static int unmap_and_move(new_page_t get_new_page, newpage = get_new_page(page, private); if (!newpage) return -ENOMEM; + dst = page_folio(newpage); newpage->private = 0; - rc = __unmap_and_move(page, newpage, force, mode); + rc = __unmap_and_move(src, dst, force, mode); if (rc == MIGRATEPAGE_SUCCESS) set_page_owner_migrate_reason(newpage, reason); From c33db29231ad242b0c381c60b1603f5e1dec7e46 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:47 +0100 Subject: [PATCH 125/350] migrate: convert unmap_and_move_huge_page() to use folios Saves several calls to compound_head() and removes a couple of uses of page->lru. Link: https://lkml.kernel.org/r/20220902194653.1739778-52-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/migrate.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 1ea149f14f84..c1c2d9d9032b 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1263,7 +1263,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, if (!hugepage_migration_supported(page_hstate(hpage))) return -ENOSYS; - if (page_count(hpage) == 1) { + if (folio_ref_count(src) == 1) { /* page was freed from under us. So we are done. */ putback_active_hugepage(hpage); return MIGRATEPAGE_SUCCESS; @@ -1274,7 +1274,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, return -ENOMEM; dst = page_folio(new_hpage); - if (!trylock_page(hpage)) { + if (!folio_trylock(src)) { if (!force) goto out; switch (mode) { @@ -1284,29 +1284,29 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, default: goto out; } - lock_page(hpage); + folio_lock(src); } /* * Check for pages which are in the process of being freed. Without - * page_mapping() set, hugetlbfs specific move page routine will not + * folio_mapping() set, hugetlbfs specific move page routine will not * be called and we could leak usage counts for subpools. */ - if (hugetlb_page_subpool(hpage) && !page_mapping(hpage)) { + if (hugetlb_page_subpool(hpage) && !folio_mapping(src)) { rc = -EBUSY; goto out_unlock; } - if (PageAnon(hpage)) - anon_vma = page_get_anon_vma(hpage); + if (folio_test_anon(src)) + anon_vma = page_get_anon_vma(&src->page); - if (unlikely(!trylock_page(new_hpage))) + if (unlikely(!folio_trylock(dst))) goto put_anon; - if (page_mapped(hpage)) { + if (folio_mapped(src)) { enum ttu_flags ttu = 0; - if (!PageAnon(hpage)) { + if (!folio_test_anon(src)) { /* * In shared mappings, try_to_unmap could potentially * call huge_pmd_unshare. Because of this, take @@ -1327,7 +1327,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, i_mmap_unlock_write(mapping); } - if (!page_mapped(hpage)) + if (!folio_mapped(src)) rc = move_to_new_folio(dst, src, mode); if (page_was_mapped) @@ -1335,7 +1335,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, rc == MIGRATEPAGE_SUCCESS ? dst : src, false); unlock_put_anon: - unlock_page(new_hpage); + folio_unlock(dst); put_anon: if (anon_vma) @@ -1347,12 +1347,12 @@ put_anon: } out_unlock: - unlock_page(hpage); + folio_unlock(src); out: if (rc == MIGRATEPAGE_SUCCESS) putback_active_hugepage(hpage); else if (rc != -EAGAIN) - list_move_tail(&hpage->lru, ret); + list_move_tail(&src->lru, ret); /* * If migration was not successful and there's a freeing callback, use From 3e9a13daa61253e28a1c7d8f366931e0a58a2b5a Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:48 +0100 Subject: [PATCH 126/350] huge_memory: convert split_huge_page_to_list() to use a folio Saves many calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-53-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 49 ++++++++++++++++++++++++------------------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 1181e623bf5b..bb8266b099f5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2622,27 +2622,26 @@ bool can_split_folio(struct folio *folio, int *pextra_pins) int split_huge_page_to_list(struct page *page, struct list_head *list) { struct folio *folio = page_folio(page); - struct page *head = &folio->page; - struct deferred_split *ds_queue = get_deferred_split_queue(head); - XA_STATE(xas, &head->mapping->i_pages, head->index); + struct deferred_split *ds_queue = get_deferred_split_queue(&folio->page); + XA_STATE(xas, &folio->mapping->i_pages, folio->index); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; int extra_pins, ret; pgoff_t end; bool is_hzp; - VM_BUG_ON_PAGE(!PageLocked(head), head); - VM_BUG_ON_PAGE(!PageCompound(head), head); + VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); - is_hzp = is_huge_zero_page(head); - VM_WARN_ON_ONCE_PAGE(is_hzp, head); + is_hzp = is_huge_zero_page(&folio->page); + VM_WARN_ON_ONCE_FOLIO(is_hzp, folio); if (is_hzp) return -EBUSY; - if (PageWriteback(head)) + if (folio_test_writeback(folio)) return -EBUSY; - if (PageAnon(head)) { + if (folio_test_anon(folio)) { /* * The caller does not necessarily hold an mmap_lock that would * prevent the anon_vma disappearing so we first we take a @@ -2651,7 +2650,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = page_get_anon_vma(head); + anon_vma = page_get_anon_vma(&folio->page); if (!anon_vma) { ret = -EBUSY; goto out; @@ -2662,7 +2661,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } else { gfp_t gfp; - mapping = head->mapping; + mapping = folio->mapping; /* Truncated ? */ if (!mapping) { @@ -2679,7 +2678,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - xas_split_alloc(&xas, head, compound_order(head), gfp); + xas_split_alloc(&xas, folio, folio_order(folio), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; @@ -2693,7 +2692,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * but on 32-bit, i_size_read() takes an irq-unsafe seqlock, * which cannot be nested inside the page tree lock. So note * end now: i_size itself may be changed at any moment, but - * head page lock is good enough to serialize the trimming. + * folio lock is good enough to serialize the trimming. */ end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE); if (shmem_mapping(mapping)) @@ -2709,38 +2708,38 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - unmap_page(head); + unmap_page(&folio->page); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); if (mapping) { /* - * Check if the head page is present in page cache. - * We assume all tail are present too, if head is there. + * Check if the folio is present in page cache. + * We assume all tail are present too, if folio is there. */ xas_lock(&xas); xas_reset(&xas); - if (xas_load(&xas) != head) + if (xas_load(&xas) != folio) goto fail; } /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - if (page_ref_freeze(head, 1 + extra_pins)) { - if (!list_empty(page_deferred_list(head))) { + if (folio_ref_freeze(folio, 1 + extra_pins)) { + if (!list_empty(page_deferred_list(&folio->page))) { ds_queue->split_queue_len--; - list_del(page_deferred_list(head)); + list_del(page_deferred_list(&folio->page)); } spin_unlock(&ds_queue->split_queue_lock); if (mapping) { - int nr = thp_nr_pages(head); + int nr = folio_nr_pages(folio); - xas_split(&xas, head, thp_order(head)); - if (PageSwapBacked(head)) { - __mod_lruvec_page_state(head, NR_SHMEM_THPS, + xas_split(&xas, folio, folio_order(folio)); + if (folio_test_swapbacked(folio)) { + __lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr); } else { - __mod_lruvec_page_state(head, NR_FILE_THPS, + __lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr); filemap_nr_thps_dec(mapping); } From 684555aacc90d70e6a4b96b3b238f1d9ea87408d Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:49 +0100 Subject: [PATCH 127/350] huge_memory: convert unmap_page() to unmap_folio() Remove a folio->page->folio conversion. Link: https://lkml.kernel.org/r/20220902194653.1739778-54-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- mm/huge_memory.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index bb8266b099f5..22949ff6df13 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2355,13 +2355,12 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, } } -static void unmap_page(struct page *page) +static void unmap_folio(struct folio *folio) { - struct folio *folio = page_folio(page); enum ttu_flags ttu_flags = TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD | TTU_SYNC; - VM_BUG_ON_PAGE(!PageHead(page), page); + VM_BUG_ON_FOLIO(!folio_test_large(folio), folio); /* * Anon pages need migration entries to preserve them, but file @@ -2378,7 +2377,7 @@ static void remap_page(struct folio *folio, unsigned long nr) { int i = 0; - /* If unmap_page() uses try_to_migrate() on file, remove this check */ + /* If unmap_folio() uses try_to_migrate() on file, remove this check */ if (!folio_test_anon(folio)) return; for (;;) { @@ -2428,7 +2427,7 @@ static void __split_huge_page_tail(struct page *head, int tail, * for example lock_page() which set PG_waiters. * * Note that for mapped sub-pages of an anonymous THP, - * PG_anon_exclusive has been cleared in unmap_page() and is stored in + * PG_anon_exclusive has been cleared in unmap_folio() and is stored in * the migration entry instead from where remap_page() will restore it. * We can still have PG_anon_exclusive set on effectively unmapped and * unreferenced sub-pages of an anonymous THP: we can simply drop @@ -2700,7 +2699,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } /* - * Racy check if we can split the page, before unmap_page() will + * Racy check if we can split the page, before unmap_folio() will * split PMDs */ if (!can_split_folio(folio, &extra_pins)) { @@ -2708,7 +2707,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out_unlock; } - unmap_page(&folio->page); + unmap_folio(folio); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); From 29eea9b5a9c9ecf21164a082a42bfabe06fdcb30 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:50 +0100 Subject: [PATCH 128/350] mm: convert page_get_anon_vma() to folio_get_anon_vma() With all callers now passing in a folio, rename the function and convert all callers. Removes a couple of calls to compound_head() and a reference to page->mapping. Link: https://lkml.kernel.org/r/20220902194653.1739778-55-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 2 +- mm/huge_memory.c | 2 +- mm/migrate.c | 6 +++--- mm/rmap.c | 14 +++++++------- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 72b2bcc37f73..3d56e3712bb2 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -163,7 +163,7 @@ static inline void anon_vma_merge(struct vm_area_struct *vma, unlink_anon_vmas(next); } -struct anon_vma *page_get_anon_vma(struct page *page); +struct anon_vma *folio_get_anon_vma(struct folio *folio); /* RMAP flags, currently only relevant for some anon rmap operations. */ typedef int __bitwise rmap_t; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 22949ff6df13..36ef79b85195 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2649,7 +2649,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) * is taken to serialise against parallel split or collapse * operations. */ - anon_vma = page_get_anon_vma(&folio->page); + anon_vma = folio_get_anon_vma(folio); if (!anon_vma) { ret = -EBUSY; goto out; diff --git a/mm/migrate.c b/mm/migrate.c index c1c2d9d9032b..c228afba0963 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1052,14 +1052,14 @@ static int __unmap_and_move(struct folio *src, struct folio *dst, * File Caches may use write_page() or lock_page() in migration, then, * just care Anon page here. * - * Only page_get_anon_vma() understands the subtleties of + * Only folio_get_anon_vma() understands the subtleties of * getting a hold on an anon_vma from outside one of its mms. * But if we cannot get anon_vma, then we won't need it anyway, * because that implies that the anon page is no longer mapped * (and cannot be remapped so long as we hold the page lock). */ if (folio_test_anon(src) && !folio_test_ksm(src)) - anon_vma = page_get_anon_vma(&src->page); + anon_vma = folio_get_anon_vma(src); /* * Block others from accessing the new page when we get around to @@ -1298,7 +1298,7 @@ static int unmap_and_move_huge_page(new_page_t get_new_page, } if (folio_test_anon(src)) - anon_vma = page_get_anon_vma(&src->page); + anon_vma = folio_get_anon_vma(src); if (unlikely(!folio_trylock(dst))) goto put_anon; diff --git a/mm/rmap.c b/mm/rmap.c index d44ff516a208..86511e633fcd 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -486,16 +486,16 @@ void __init anon_vma_init(void) * if there is a mapcount, we can dereference the anon_vma after observing * those. */ -struct anon_vma *page_get_anon_vma(struct page *page) +struct anon_vma *folio_get_anon_vma(struct folio *folio) { struct anon_vma *anon_vma = NULL; unsigned long anon_mapping; rcu_read_lock(); - anon_mapping = (unsigned long)READ_ONCE(page->mapping); + anon_mapping = (unsigned long)READ_ONCE(folio->mapping); if ((anon_mapping & PAGE_MAPPING_FLAGS) != PAGE_MAPPING_ANON) goto out; - if (!page_mapped(page)) + if (!folio_mapped(folio)) goto out; anon_vma = (struct anon_vma *) (anon_mapping - PAGE_MAPPING_ANON); @@ -505,13 +505,13 @@ struct anon_vma *page_get_anon_vma(struct page *page) } /* - * If this page is still mapped, then its anon_vma cannot have been + * If this folio is still mapped, then its anon_vma cannot have been * freed. But if it has been unmapped, we have no security against the * anon_vma structure being freed and reused (for another anon_vma: * SLAB_TYPESAFE_BY_RCU guarantees that - so the atomic_inc_not_zero() * above cannot corrupt). */ - if (!page_mapped(page)) { + if (!folio_mapped(folio)) { rcu_read_unlock(); put_anon_vma(anon_vma); return NULL; @@ -523,11 +523,11 @@ out: } /* - * Similar to page_get_anon_vma() except it locks the anon_vma. + * Similar to folio_get_anon_vma() except it locks the anon_vma. * * Its a little more complex as it tries to keep the fast path to a single * atomic op -- the trylock. If we fail the trylock, we fall back to getting a - * reference like with page_get_anon_vma() and then block on the mutex + * reference like with folio_get_anon_vma() and then block on the mutex * on !rwc->try_lock case. */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, From 0c826c0b6a176b9ed5ace7106fd1770bb48f1898 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:51 +0100 Subject: [PATCH 129/350] rmap: remove page_unlock_anon_vma_read() This was simply an alias for anon_vma_unlock_read() since 2011. Link: https://lkml.kernel.org/r/20220902194653.1739778-56-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/rmap.h | 5 ----- mm/memory-failure.c | 2 +- mm/rmap.c | 5 ----- 3 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/linux/rmap.h b/include/linux/rmap.h index 3d56e3712bb2..ca3e4ba6c58c 100644 --- a/include/linux/rmap.h +++ b/include/linux/rmap.h @@ -458,13 +458,8 @@ struct rmap_walk_control { void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc); void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc); - -/* - * Called by memory-failure.c to kill processes. - */ struct anon_vma *folio_lock_anon_vma_read(struct folio *folio, struct rmap_walk_control *rwc); -void page_unlock_anon_vma_read(struct anon_vma *anon_vma); #else /* !CONFIG_MMU */ diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e554f9f583ca..145bb561ddb3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -529,7 +529,7 @@ static void collect_procs_anon(struct page *page, struct list_head *to_kill, } } read_unlock(&tasklist_lock); - page_unlock_anon_vma_read(av); + anon_vma_unlock_read(av); } /* diff --git a/mm/rmap.c b/mm/rmap.c index 86511e633fcd..0b9264e58d25 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -599,11 +599,6 @@ out: return anon_vma; } -void page_unlock_anon_vma_read(struct anon_vma *anon_vma) -{ - anon_vma_unlock_read(anon_vma); -} - #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* * Flush TLB entries for recently unmapped pages from remote CPUs. It is From 82e66bf76173a1525db9866455a7fdbc07b57297 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:52 +0100 Subject: [PATCH 130/350] uprobes: use new_folio in __replace_page() Saves several calls to compound_head(). Link: https://lkml.kernel.org/r/20220902194653.1739778-57-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- kernel/events/uprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index 70375c7c0c4b..e0a9b945e7bc 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -155,6 +155,7 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, struct page *old_page, struct page *new_page) { struct folio *old_folio = page_folio(old_page); + struct folio *new_folio; struct mm_struct *mm = vma->vm_mm; DEFINE_FOLIO_VMA_WALK(pvmw, old_folio, vma, addr, 0); int err; @@ -164,8 +165,8 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, addr + PAGE_SIZE); if (new_page) { - err = mem_cgroup_charge(page_folio(new_page), vma->vm_mm, - GFP_KERNEL); + new_folio = page_folio(new_page); + err = mem_cgroup_charge(new_folio, vma->vm_mm, GFP_KERNEL); if (err) return err; } @@ -180,9 +181,9 @@ static int __replace_page(struct vm_area_struct *vma, unsigned long addr, VM_BUG_ON_PAGE(addr != pvmw.address, old_page); if (new_page) { - get_page(new_page); + folio_get(new_folio); page_add_new_anon_rmap(new_page, vma, addr); - lru_cache_add_inactive_or_unevictable(new_page, vma); + folio_add_lru_vma(new_folio, vma); } else /* no new page, just dec_mm_counter for old_page */ dec_mm_counter(mm, MM_ANONPAGES); From 19672a9e4a75252871cba319f4e3b859b8fdf671 Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 2 Sep 2022 20:46:53 +0100 Subject: [PATCH 131/350] mm: convert lock_page_or_retry() to folio_lock_or_retry() Remove a call to compound_head() in each of the two callers. Link: https://lkml.kernel.org/r/20220902194653.1739778-58-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 9 +++------ mm/memory.c | 10 +++++----- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 09de43e36a64..32846b6306db 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -989,19 +989,16 @@ static inline int lock_page_killable(struct page *page) } /* - * lock_page_or_retry - Lock the page, unless this would block and the + * folio_lock_or_retry - Lock the folio, unless this would block and the * caller indicated that it can handle a retry. * * Return value and mmap_lock implications depend on flags; see * __folio_lock_or_retry(). */ -static inline bool lock_page_or_retry(struct page *page, struct mm_struct *mm, - unsigned int flags) +static inline bool folio_lock_or_retry(struct folio *folio, + struct mm_struct *mm, unsigned int flags) { - struct folio *folio; might_sleep(); - - folio = page_folio(page); return folio_trylock(folio) || __folio_lock_or_retry(folio, mm, flags); } diff --git a/mm/memory.c b/mm/memory.c index 6e568f190e7a..d671ad367d67 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3618,11 +3618,11 @@ EXPORT_SYMBOL(unmap_mapping_range); */ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) { - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct vm_area_struct *vma = vmf->vma; struct mmu_notifier_range range; - if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) + if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) return VM_FAULT_RETRY; mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0, vma, vma->vm_mm, vmf->address & PAGE_MASK, @@ -3632,10 +3632,10 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf) vmf->pte = pte_offset_map_lock(vma->vm_mm, vmf->pmd, vmf->address, &vmf->ptl); if (likely(pte_same(*vmf->pte, vmf->orig_pte))) - restore_exclusive_pte(vma, page, vmf->address, vmf->pte); + restore_exclusive_pte(vma, vmf->page, vmf->address, vmf->pte); pte_unmap_unlock(vmf->pte, vmf->ptl); - unlock_page(page); + folio_unlock(folio); mmu_notifier_invalidate_range_end(&range); return 0; @@ -3835,7 +3835,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) goto out_release; } - locked = lock_page_or_retry(page, vma->vm_mm, vmf->flags); + locked = folio_lock_or_retry(folio, vma->vm_mm, vmf->flags); if (!locked) { ret |= VM_FAULT_RETRY; From 8eeda55fe08944421cf57f6185fe37b069829e7b Mon Sep 17 00:00:00 2001 From: Li zeming Date: Mon, 5 Sep 2022 10:09:18 +0800 Subject: [PATCH 132/350] mm/hugetlb.c: remove unnecessary initialization of local `err' Link: https://lkml.kernel.org/r/20220905020918.3552-1-zeming@nfschina.com Signed-off-by: Li zeming Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/hugetlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2ca4e8c3163e..008955d8f411 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3718,7 +3718,7 @@ static ssize_t demote_store(struct kobject *kobj, unsigned long nr_available; nodemask_t nodes_allowed, *n_mask; struct hstate *h; - int err = 0; + int err; int nid; err = kstrtoul(buf, 10, &nr_demote); From c274cd5c9bf5ded4b3f2a4e99f76223c8f006051 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sun, 4 Sep 2022 22:36:06 +0800 Subject: [PATCH 133/350] mm/damon/sysfs: simplify the judgement whether kdamonds are busy It is unnecessary to get the number of the running kdamond to judge whether kdamonds are busy. Here we can use the damon_sysfs_kdamond_running() helper and return -EBUSY directly when finding a running kdamond. Meanwhile, merging with the judgement that a kdamond has current sysfs command callback request to make the code more clear. Link: https://lkml.kernel.org/r/1662302166-13216-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 7488e27c87c3..fe6c6870cf86 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2657,23 +2657,18 @@ static void damon_sysfs_kdamonds_rm_dirs(struct damon_sysfs_kdamonds *kdamonds) kdamonds->kdamonds_arr = NULL; } -static int damon_sysfs_nr_running_ctxs(struct damon_sysfs_kdamond **kdamonds, +static bool damon_sysfs_kdamonds_busy(struct damon_sysfs_kdamond **kdamonds, int nr_kdamonds) { - int nr_running_ctxs = 0; int i; for (i = 0; i < nr_kdamonds; i++) { - struct damon_ctx *ctx = kdamonds[i]->damon_ctx; - - if (!ctx) - continue; - mutex_lock(&ctx->kdamond_lock); - if (ctx->kdamond) - nr_running_ctxs++; - mutex_unlock(&ctx->kdamond_lock); + if (damon_sysfs_kdamond_running(kdamonds[i]) || + damon_sysfs_cmd_request.kdamond == kdamonds[i]) + return true; } - return nr_running_ctxs; + + return false; } static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, @@ -2682,15 +2677,9 @@ static int damon_sysfs_kdamonds_add_dirs(struct damon_sysfs_kdamonds *kdamonds, struct damon_sysfs_kdamond **kdamonds_arr, *kdamond; int err, i; - if (damon_sysfs_nr_running_ctxs(kdamonds->kdamonds_arr, kdamonds->nr)) + if (damon_sysfs_kdamonds_busy(kdamonds->kdamonds_arr, kdamonds->nr)) return -EBUSY; - for (i = 0; i < kdamonds->nr; i++) { - if (damon_sysfs_cmd_request.kdamond == - kdamonds->kdamonds_arr[i]) - return -EBUSY; - } - damon_sysfs_kdamonds_rm_dirs(kdamonds); if (!nr_kdamonds) return 0; From 710bb68c2e3a24512e2d2bae470960d7488e97b1 Mon Sep 17 00:00:00 2001 From: Matthias Goergens Date: Mon, 5 Sep 2022 11:19:04 +0800 Subject: [PATCH 134/350] hugetlb_encode.h: fix undefined behaviour (34 << 26) Left-shifting past the size of your datatype is undefined behaviour in C. The literal 34 gets the type `int`, and that one is not big enough to be left shifted by 26 bits. An `unsigned` is long enough (on any machine that has at least 32 bits for their ints.) For uniformity, we mark all the literals as unsigned. But it's only really needed for HUGETLB_FLAG_ENCODE_16GB. Thanks to Randy Dunlap for an initial review and suggestion. Link: https://lkml.kernel.org/r/20220905031904.150925-1-matthias.goergens@gmail.com Signed-off-by: Matthias Goergens Acked-by: Randy Dunlap Cc: Mike Kravetz Cc: Muchun Song Signed-off-by: Andrew Morton --- include/uapi/asm-generic/hugetlb_encode.h | 26 +++++++++++----------- tools/include/asm-generic/hugetlb_encode.h | 26 +++++++++++----------- 2 files changed, 26 insertions(+), 26 deletions(-) diff --git a/include/uapi/asm-generic/hugetlb_encode.h b/include/uapi/asm-generic/hugetlb_encode.h index 4f3d5aaa11f5..de687009bfe5 100644 --- a/include/uapi/asm-generic/hugetlb_encode.h +++ b/include/uapi/asm-generic/hugetlb_encode.h @@ -20,18 +20,18 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f -#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16KB (14U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_64KB (16U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34U << HUGETLB_FLAG_ENCODE_SHIFT) #endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ diff --git a/tools/include/asm-generic/hugetlb_encode.h b/tools/include/asm-generic/hugetlb_encode.h index 4f3d5aaa11f5..de687009bfe5 100644 --- a/tools/include/asm-generic/hugetlb_encode.h +++ b/tools/include/asm-generic/hugetlb_encode.h @@ -20,18 +20,18 @@ #define HUGETLB_FLAG_ENCODE_SHIFT 26 #define HUGETLB_FLAG_ENCODE_MASK 0x3f -#define HUGETLB_FLAG_ENCODE_16KB (14 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_64KB (16 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512KB (19 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1MB (20 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2MB (21 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_8MB (23 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16MB (24 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_32MB (25 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_256MB (28 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_512MB (29 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_1GB (30 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_2GB (31 << HUGETLB_FLAG_ENCODE_SHIFT) -#define HUGETLB_FLAG_ENCODE_16GB (34 << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16KB (14U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_64KB (16U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512KB (19U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1MB (20U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2MB (21U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_8MB (23U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16MB (24U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_32MB (25U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_256MB (28U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_512MB (29U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_1GB (30U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_2GB (31U << HUGETLB_FLAG_ENCODE_SHIFT) +#define HUGETLB_FLAG_ENCODE_16GB (34U << HUGETLB_FLAG_ENCODE_SHIFT) #endif /* _ASM_GENERIC_HUGETLB_ENCODE_H_ */ From b05f41a1aa56fd646f2aa048ee446b6a2edb80d3 Mon Sep 17 00:00:00 2001 From: "Vishal Moola (Oracle)" Date: Mon, 5 Sep 2022 14:45:57 -0700 Subject: [PATCH 135/350] filemap: convert filemap_range_has_writeback() to use folios Removes 3 calls to compound_head(). Link: https://lkml.kernel.org/r/20220905214557.868606-1-vishal.moola@gmail.com Signed-off-by: Vishal Moola (Oracle) Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/filemap.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/mm/filemap.c b/mm/filemap.c index 68bd70fe71d5..aab125d423b8 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -632,22 +632,23 @@ bool filemap_range_has_writeback(struct address_space *mapping, { XA_STATE(xas, &mapping->i_pages, start_byte >> PAGE_SHIFT); pgoff_t max = end_byte >> PAGE_SHIFT; - struct page *page; + struct folio *folio; if (end_byte < start_byte) return false; rcu_read_lock(); - xas_for_each(&xas, page, max) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, max) { + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) continue; - if (PageDirty(page) || PageLocked(page) || PageWriteback(page)) + if (folio_test_dirty(folio) || folio_test_locked(folio) || + folio_test_writeback(folio)) break; } rcu_read_unlock(); - return page != NULL; + return folio != NULL; } EXPORT_SYMBOL_GPL(filemap_range_has_writeback); From ca77f290cff1dfa095d71ae16cc7cda8ee6df495 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:16 +0200 Subject: [PATCH 136/350] kasan: check KASAN_NO_FREE_META in __kasan_metadata_size Patch series "kasan: switch tag-based modes to stack ring from per-object metadata", v3. This series makes the tag-based KASAN modes use a ring buffer for storing stack depot handles for alloc/free stack traces for slab objects instead of per-object metadata. This ring buffer is referred to as the stack ring. On each alloc/free of a slab object, the tagged address of the object and the current stack trace are recorded in the stack ring. On each bug report, if the accessed address belongs to a slab object, the stack ring is scanned for matching entries. The newest entries are used to print the alloc/free stack traces in the report: one entry for alloc and one for free. The advantages of this approach over storing stack trace handles in per-object metadata with the tag-based KASAN modes: - Allows to find relevant stack traces for use-after-free bugs without using quarantine for freed memory. (Currently, if the object was reallocated multiple times, the report contains the latest alloc/free stack traces, not necessarily the ones relevant to the buggy allocation.) - Allows to better identify and mark use-after-free bugs, effectively making the CONFIG_KASAN_TAGS_IDENTIFY functionality always-on. - Has fixed memory overhead. The disadvantage: - If the affected object was allocated/freed long before the bug happened and the stack trace events were purged from the stack ring, the report will have no stack traces. Discussion ========== The proposed implementation of the stack ring uses a single ring buffer for the whole kernel. This might lead to contention due to atomic accesses to the ring buffer index on multicore systems. At this point, it is unknown whether the performance impact from this contention would be significant compared to the slowdown introduced by collecting stack traces due to the planned changes to the latter part, see the section below. For now, the proposed implementation is deemed to be good enough, but this might need to be revisited once the stack collection becomes faster. A considered alternative is to keep a separate ring buffer for each CPU and then iterate over all of them when printing a bug report. This approach requires somehow figuring out which of the stack rings has the freshest stack traces for an object if multiple stack rings have them. Further plans ============= This series is a part of an effort to make KASAN stack trace collection suitable for production. This requires stack trace collection to be fast and memory-bounded. The planned steps are: 1. Speed up stack trace collection (potentially, by using SCS; patches on-hold until steps #2 and #3 are completed). 2. Keep stack trace handles in the stack ring (this series). 3. Add a memory-bounded mode to stack depot or provide an alternative memory-bounded stack storage. 4. Potentially, implement stack trace collection sampling to minimize the performance impact. This patch (of 34): __kasan_metadata_size() calculates the size of the redzone for objects in a slab cache. When accounting for presence of kasan_free_meta in the redzone, this function only compares free_meta_offset with 0. But free_meta_offset could also be equal to KASAN_NO_FREE_META, which indicates that kasan_free_meta is not present at all. Add a comparison with KASAN_NO_FREE_META into __kasan_metadata_size(). Link: https://lkml.kernel.org/r/cover.1662411799.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/c7b316d30d90e5947eb8280f4dc78856a49298cf.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 69f583855c8b..f6a6c7d0d8b8 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -224,8 +224,9 @@ size_t __kasan_metadata_size(struct kmem_cache *cache) return 0; return (cache->kasan_info.alloc_meta_offset ? sizeof(struct kasan_alloc_meta) : 0) + - (cache->kasan_info.free_meta_offset ? - sizeof(struct kasan_free_meta) : 0); + ((cache->kasan_info.free_meta_offset && + cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? + sizeof(struct kasan_free_meta) : 0); } struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, From c249f9af85ee006976c0fae584daf947cc959931 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:17 +0200 Subject: [PATCH 137/350] kasan: rename kasan_set_*_info to kasan_save_*_info Rename set_alloc_info() and kasan_set_free_info() to save_alloc_info() and kasan_save_free_info(). The new names make more sense. Link: https://lkml.kernel.org/r/9f04777a15cb9d96bf00331da98e021d732fe1c9.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 8 ++++---- mm/kasan/generic.c | 2 +- mm/kasan/kasan.h | 2 +- mm/kasan/tags.c | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index f6a6c7d0d8b8..90b6cadd2dac 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -365,7 +365,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (kasan_stack_collection_enabled()) - kasan_set_free_info(cache, object, tag); + kasan_save_free_info(cache, object, tag); return kasan_quarantine_put(cache, object); } @@ -424,7 +424,7 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void set_alloc_info(struct kmem_cache *cache, void *object, +static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags, bool is_kmalloc) { struct kasan_alloc_meta *alloc_meta; @@ -468,7 +468,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, false); + save_alloc_info(cache, (void *)object, flags, false); return tagged_object; } @@ -514,7 +514,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * This also rewrites the alloc info when called from kasan_krealloc(). */ if (kasan_stack_collection_enabled()) - set_alloc_info(cache, (void *)object, flags, true); + save_alloc_info(cache, (void *)object, flags, true); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 437fcc7e77cf..03a3770cfeae 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -358,7 +358,7 @@ void kasan_record_aux_stack_noalloc(void *addr) return __kasan_record_aux_stack(addr, false); } -void kasan_set_free_info(struct kmem_cache *cache, +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 01c03e45acd4..bf16a74dc027 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -285,7 +285,7 @@ struct slab *kasan_addr_to_slab(const void *addr); depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); -void kasan_set_free_info(struct kmem_cache *cache, void *object, u8 tag); +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 8f48b9502a17..b453a353bc86 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,7 +17,7 @@ #include "kasan.h" -void kasan_set_free_info(struct kmem_cache *cache, +void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; From 196894a6e20273d78479bdf76eec3a741e72d31c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:18 +0200 Subject: [PATCH 138/350] kasan: move is_kmalloc check out of save_alloc_info Move kasan_info.is_kmalloc check out of save_alloc_info(). This is a preparatory change that simplifies the following patches in this series. Link: https://lkml.kernel.org/r/df89f1915b788f9a10319905af6d0202a3b30c30.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 90b6cadd2dac..6a75237ed308 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -424,15 +424,10 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void save_alloc_info(struct kmem_cache *cache, void *object, - gfp_t flags, bool is_kmalloc) +static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { struct kasan_alloc_meta *alloc_meta; - /* Don't save alloc info for kmalloc caches in kasan_slab_alloc(). */ - if (cache->kasan_info.is_kmalloc && !is_kmalloc) - return; - alloc_meta = kasan_get_alloc_meta(cache, object); if (alloc_meta) kasan_set_track(&alloc_meta->alloc_track, flags); @@ -467,8 +462,8 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, kasan_unpoison(tagged_object, cache->object_size, init); /* Save alloc info (if possible) for non-kmalloc() allocations. */ - if (kasan_stack_collection_enabled()) - save_alloc_info(cache, (void *)object, flags, false); + if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) + save_alloc_info(cache, (void *)object, flags); return tagged_object; } @@ -513,8 +508,8 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * Save alloc info (if possible) for kmalloc() allocations. * This also rewrites the alloc info when called from kasan_krealloc(). */ - if (kasan_stack_collection_enabled()) - save_alloc_info(cache, (void *)object, flags, true); + if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) + save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; From ccf643e6dacf33ec618bd64e10eb0347173ad482 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:19 +0200 Subject: [PATCH 139/350] kasan: split save_alloc_info implementations Provide standalone implementations of save_alloc_info() for the Generic and tag-based modes. For now, the implementations are the same, but they will diverge later in the series. Link: https://lkml.kernel.org/r/77f1a078489c1e859aedb5403f772e5e1f7410a0.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 13 ++----------- mm/kasan/generic.c | 9 +++++++++ mm/kasan/kasan.h | 1 + mm/kasan/tags.c | 9 +++++++++ 4 files changed, 21 insertions(+), 11 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6a75237ed308..93e64e1b4413 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -424,15 +424,6 @@ void __kasan_slab_free_mempool(void *ptr, unsigned long ip) } } -static void save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) -{ - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); -} - void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, void *object, gfp_t flags, bool init) { @@ -463,7 +454,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) - save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, (void *)object, flags); return tagged_object; } @@ -509,7 +500,7 @@ static inline void *____kasan_kmalloc(struct kmem_cache *cache, * This also rewrites the alloc info when called from kasan_krealloc(). */ if (kasan_stack_collection_enabled() && cache->kasan_info.is_kmalloc) - save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, (void *)object, flags); /* Keep the tag that was set by kasan_slab_alloc(). */ return (void *)object; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 03a3770cfeae..98c451a3b01f 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -358,6 +358,15 @@ void kasan_record_aux_stack_noalloc(void *addr) return __kasan_record_aux_stack(addr, false); } +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index bf16a74dc027..d401fb770f67 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -285,6 +285,7 @@ struct slab *kasan_addr_to_slab(const void *addr); depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index b453a353bc86..1ba3c8399f72 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,6 +17,15 @@ #include "kasan.h" +void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + kasan_set_track(&alloc_meta->alloc_track, flags); +} + void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { From 687c85afa67a635dae683cf0ab6012e76333065b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:20 +0200 Subject: [PATCH 140/350] kasan: drop CONFIG_KASAN_TAGS_IDENTIFY Drop CONFIG_KASAN_TAGS_IDENTIFY and related code to simplify making changes to the reporting code. The dropped functionality will be restored in the following patches in this series. Link: https://lkml.kernel.org/r/4c66ba98eb237e9ed9312c19d423bbcf4ecf88f8.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- lib/Kconfig.kasan | 8 -------- mm/kasan/kasan.h | 12 +----------- mm/kasan/report_tags.c | 28 ---------------------------- mm/kasan/tags.c | 21 ++------------------- 4 files changed, 3 insertions(+), 66 deletions(-) diff --git a/lib/Kconfig.kasan b/lib/Kconfig.kasan index f0973da583e0..ca09b1cf8ee9 100644 --- a/lib/Kconfig.kasan +++ b/lib/Kconfig.kasan @@ -167,14 +167,6 @@ config KASAN_STACK as well, as it adds inline-style instrumentation that is run unconditionally. -config KASAN_TAGS_IDENTIFY - bool "Memory corruption type identification" - depends on KASAN_SW_TAGS || KASAN_HW_TAGS - help - Enables best-effort identification of the bug types (use-after-free - or out-of-bounds) at the cost of increased memory consumption. - Only applicable for the tag-based KASAN modes. - config KASAN_VMALLOC bool "Check accesses to vmalloc allocations" depends on HAVE_ARCH_KASAN_VMALLOC diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index d401fb770f67..15c718782c1f 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -169,23 +169,13 @@ struct kasan_track { depot_stack_handle_t stack; }; -#if defined(CONFIG_KASAN_TAGS_IDENTIFY) && defined(CONFIG_KASAN_SW_TAGS) -#define KASAN_NR_FREE_STACKS 5 -#else -#define KASAN_NR_FREE_STACKS 1 -#endif - struct kasan_alloc_meta { struct kasan_track alloc_track; /* Generic mode stores free track in kasan_free_meta. */ #ifdef CONFIG_KASAN_GENERIC depot_stack_handle_t aux_stack[2]; #else - struct kasan_track free_track[KASAN_NR_FREE_STACKS]; -#endif -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - u8 free_pointer_tag[KASAN_NR_FREE_STACKS]; - u8 free_track_idx; + struct kasan_track free_track; #endif }; diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index e25d2166e813..35cf3cae4aa4 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -5,37 +5,9 @@ */ #include "kasan.h" -#include "../slab.h" const char *kasan_get_bug_type(struct kasan_report_info *info) { -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - struct kasan_alloc_meta *alloc_meta; - struct kmem_cache *cache; - struct slab *slab; - const void *addr; - void *object; - u8 tag; - int i; - - tag = get_tag(info->access_addr); - addr = kasan_reset_tag(info->access_addr); - slab = kasan_addr_to_slab(addr); - if (slab) { - cache = slab->slab_cache; - object = nearest_obj(cache, slab, (void *)addr); - alloc_meta = kasan_get_alloc_meta(cache, object); - - if (alloc_meta) { - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - return "use-after-free"; - } - } - return "out-of-bounds"; - } -#endif - /* * If access_size is a negative number, then it has reason to be * defined as out-of-bounds bug type. diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 1ba3c8399f72..e0e5de8ce834 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -30,39 +30,22 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; - u8 idx = 0; alloc_meta = kasan_get_alloc_meta(cache, object); if (!alloc_meta) return; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - idx = alloc_meta->free_track_idx; - alloc_meta->free_pointer_tag[idx] = tag; - alloc_meta->free_track_idx = (idx + 1) % KASAN_NR_FREE_STACKS; -#endif - - kasan_set_track(&alloc_meta->free_track[idx], GFP_NOWAIT); + kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; - int i = 0; alloc_meta = kasan_get_alloc_meta(cache, object); if (!alloc_meta) return NULL; -#ifdef CONFIG_KASAN_TAGS_IDENTIFY - for (i = 0; i < KASAN_NR_FREE_STACKS; i++) { - if (alloc_meta->free_pointer_tag[i] == tag) - break; - } - if (i == KASAN_NR_FREE_STACKS) - i = alloc_meta->free_track_idx; -#endif - - return &alloc_meta->free_track[i]; + return &alloc_meta->free_track; } From 88f29765ae3b00f8b9362f299f6140cd9b988f75 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:21 +0200 Subject: [PATCH 141/350] kasan: introduce kasan_print_aux_stacks Add a kasan_print_aux_stacks() helper that prints the auxiliary stack traces for the Generic mode. This change hides references to alloc_meta from the common reporting code. This is desired as only the Generic mode will be using per-object metadata after this series. Link: https://lkml.kernel.org/r/67c7a9ea6615533762b1f8ccc267cd7f9bafb749.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 6 ++++++ mm/kasan/report.c | 15 +-------------- mm/kasan/report_generic.c | 20 ++++++++++++++++++++ 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 15c718782c1f..30ff341b6d35 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -266,6 +266,12 @@ void kasan_print_address_stack_frame(const void *addr); static inline void kasan_print_address_stack_frame(const void *addr) { } #endif +#ifdef CONFIG_KASAN_GENERIC +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) { } +#endif + bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); diff --git a/mm/kasan/report.c b/mm/kasan/report.c index fe3f606b3a98..cd9f5c7fc6db 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -270,20 +270,7 @@ static void describe_object_stacks(struct kmem_cache *cache, void *object, pr_err("\n"); } -#ifdef CONFIG_KASAN_GENERIC - if (!alloc_meta) - return; - if (alloc_meta->aux_stack[0]) { - pr_err("Last potentially related work creation:\n"); - stack_depot_print(alloc_meta->aux_stack[0]); - pr_err("\n"); - } - if (alloc_meta->aux_stack[1]) { - pr_err("Second to last potentially related work creation:\n"); - stack_depot_print(alloc_meta->aux_stack[1]); - pr_err("\n"); - } -#endif + kasan_print_aux_stacks(cache, object); } static void describe_object(struct kmem_cache *cache, void *object, diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 6689fb9a919b..348dc207d462 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -132,6 +132,26 @@ void kasan_metadata_fetch_row(char *buffer, void *row) memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); } +void kasan_print_aux_stacks(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return; + + if (alloc_meta->aux_stack[0]) { + pr_err("Last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[0]); + pr_err("\n"); + } + if (alloc_meta->aux_stack[1]) { + pr_err("Second to last potentially related work creation:\n"); + stack_depot_print(alloc_meta->aux_stack[1]); + pr_err("\n"); + } +} + #ifdef CONFIG_KASAN_STACK static bool __must_check tokenize_frame_descr(const char **frame_descr, char *token, size_t max_tok_len, From f3647cbfe5a34af1a22f2627dda5fb078a47f0d3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:22 +0200 Subject: [PATCH 142/350] kasan: introduce kasan_get_alloc_track Add a kasan_get_alloc_track() helper that fetches alloc_track for a slab object and use this helper in the common reporting code. For now, the implementations of this helper are the same for the Generic and tag-based modes, but they will diverge later in the series. This change hides references to alloc_meta from the common reporting code. This is desired as only the Generic mode will be using per-object metadata after this series. Link: https://lkml.kernel.org/r/0c365a35f4a833fff46f9d42c3212b32f7166556.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 14 +++++++++++++- mm/kasan/kasan.h | 4 +++- mm/kasan/report.c | 8 ++++---- mm/kasan/tags.c | 14 +++++++++++++- 4 files changed, 33 insertions(+), 7 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 98c451a3b01f..f212b9ae57b5 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -381,8 +381,20 @@ void kasan_save_free_info(struct kmem_cache *cache, *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) + void *object, u8 tag) { if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) return NULL; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 30ff341b6d35..b65a51349c51 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -283,8 +283,10 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag); + void *object, u8 tag); #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cd9f5c7fc6db..5d225d7d9c4c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -255,12 +255,12 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, static void describe_object_stacks(struct kmem_cache *cache, void *object, const void *addr, u8 tag) { - struct kasan_alloc_meta *alloc_meta; + struct kasan_track *alloc_track; struct kasan_track *free_track; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) { - print_track(&alloc_meta->alloc_track, "Allocated"); + alloc_track = kasan_get_alloc_track(cache, object); + if (alloc_track) { + print_track(alloc_track, "Allocated"); pr_err("\n"); } diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index e0e5de8ce834..7b1fc8e7c99c 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -38,8 +38,20 @@ void kasan_save_free_info(struct kmem_cache *cache, kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) + void *object, u8 tag) { struct kasan_alloc_meta *alloc_meta; From 836daba099472baaa8b6a57772e8bb2d55f1f9d7 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:23 +0200 Subject: [PATCH 143/350] kasan: introduce kasan_init_object_meta Add a kasan_init_object_meta() helper that initializes metadata for a slab object and use it in the common code. For now, the implementations of this helper are the same for the Generic and tag-based modes, but they will diverge later in the series. This change hides references to alloc_meta from the common code. This is desired as only the Generic mode will be using per-object metadata after this series. Link: https://lkml.kernel.org/r/47c12938fc7f8105e7aaa592527c0e9d3c81fc37.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 10 +++------- mm/kasan/generic.c | 9 +++++++++ mm/kasan/kasan.h | 2 ++ mm/kasan/tags.c | 9 +++++++++ 4 files changed, 23 insertions(+), 7 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 93e64e1b4413..18107675a7fe 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -313,13 +313,9 @@ static inline u8 assign_tag(struct kmem_cache *cache, void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_meta; - - if (kasan_stack_collection_enabled()) { - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - __memset(alloc_meta, 0, sizeof(*alloc_meta)); - } + /* Initialize per-object metadata if it is present. */ + if (kasan_stack_collection_enabled()) + kasan_init_object_meta(cache, object); /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ object = set_tag(object, assign_tag(cache, object, true)); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index f212b9ae57b5..5462ddbc21e6 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,15 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct slab *slab = kasan_addr_to_slab(addr); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b65a51349c51..2c8c3cce7bc6 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -279,6 +279,8 @@ void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 7b1fc8e7c99c..2e200969a4b8 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,6 +17,15 @@ #include "kasan.h" +void kasan_init_object_meta(struct kmem_cache *cache, const void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (alloc_meta) + __memset(alloc_meta, 0, sizeof(*alloc_meta)); +} + void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { struct kasan_alloc_meta *alloc_meta; From 74984e79071aafd528f03b8418657c05011b94f3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:24 +0200 Subject: [PATCH 144/350] kasan: clear metadata functions for tag-based modes Remove implementations of the metadata-related functions for the tag-based modes. The following patches in the series will provide alternative implementations. As of this patch, the tag-based modes no longer collect alloc and free stack traces. This functionality will be restored later in the series. Link: https://lkml.kernel.org/r/470fbe5d15e8015092e76e395de354be18ccceab.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/tags.c | 33 ++------------------------------- 1 file changed, 2 insertions(+), 31 deletions(-) diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 2e200969a4b8..f11c89505c77 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -19,54 +19,25 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - __memset(alloc_meta, 0, sizeof(*alloc_meta)); } void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (alloc_meta) - kasan_set_track(&alloc_meta->alloc_track, flags); } void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return; - - kasan_set_track(&alloc_meta->free_track, GFP_NOWAIT); } struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, void *object) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->alloc_track; + return NULL; } struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, void *object, u8 tag) { - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->free_track; + return NULL; } From 2f3568017268fc34eb0b6b4b3163c0f2e619fde6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:25 +0200 Subject: [PATCH 145/350] kasan: move kasan_get_*_meta to generic.c Move the implementations of kasan_get_alloc/free_meta() to generic.c, as the common KASAN code does not use these functions anymore. Also drop kasan_reset_tag() from the implementation, as the Generic mode does not tag pointers. Link: https://lkml.kernel.org/r/ffcfc0ad654d78a2ef4ca054c943ddb4e5ca477b.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 19 ------------------- mm/kasan/generic.c | 17 +++++++++++++++++ mm/kasan/kasan.h | 14 +++++++------- 3 files changed, 24 insertions(+), 26 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 18107675a7fe..19ddc0ed0e7b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -229,25 +229,6 @@ size_t __kasan_metadata_size(struct kmem_cache *cache) sizeof(struct kasan_free_meta) : 0); } -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object) -{ - if (!cache->kasan_info.alloc_meta_offset) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.alloc_meta_offset; -} - -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object) -{ - BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); - if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) - return NULL; - return kasan_reset_tag(object) + cache->kasan_info.free_meta_offset; -} -#endif - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5462ddbc21e6..fa654cb96a0d 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,23 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object) +{ + if (!cache->kasan_info.alloc_meta_offset) + return NULL; + return (void *)object + cache->kasan_info.alloc_meta_offset; +} + +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object) +{ + BUILD_BUG_ON(sizeof(struct kasan_free_meta) > 32); + if (cache->kasan_info.free_meta_offset == KASAN_NO_FREE_META) + return NULL; + return (void *)object + cache->kasan_info.free_meta_offset; +} + void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { struct kasan_alloc_meta *alloc_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 2c8c3cce7bc6..fdd577f3eb9d 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -209,13 +209,6 @@ struct kunit_kasan_status { }; #endif -struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, - const void *object); -#ifdef CONFIG_KASAN_GENERIC -struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, - const void *object); -#endif - #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) static inline const void *kasan_shadow_to_mem(const void *shadow_addr) @@ -281,6 +274,13 @@ struct slab *kasan_addr_to_slab(const void *addr); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); +#ifdef CONFIG_KASAN_GENERIC +struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, + const void *object); +struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, + const void *object); +#endif + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); From 284f8590a1dfbe1c33b50bf6e8f8dc714e61bfd3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:26 +0200 Subject: [PATCH 146/350] kasan: introduce kasan_requires_meta Add a kasan_requires_meta() helper that indicates whether the enabled KASAN mode requires per-object metadata and use this helper in the common code. Also hide kasan_init_object_meta() under CONFIG_KASAN_GENERIC ifdef check, as Generic is the only mode that uses per-object metadata. To allow for a potential future change that makes Generic KASAN support the kasan.stacktrace command-line parameter, let kasan_requires_meta() return kasan_stack_collection_enabled() instead of simply returning true. Link: https://lkml.kernel.org/r/cf837e9996246aaaeebf704ccf8ec26a34fcf64f.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 13 +++++-------- mm/kasan/kasan.h | 33 +++++++++++++++++++++++++++++---- mm/kasan/tags.c | 4 ---- 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 19ddc0ed0e7b..d0300954d76b 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -88,13 +88,10 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -/* - * Only allow cache merging when stack collection is disabled and no metadata - * is present. - */ +/* Only allow cache merging when no per-object metadata is present. */ slab_flags_t __kasan_never_merge(void) { - if (kasan_stack_collection_enabled()) + if (kasan_requires_meta()) return SLAB_KASAN; return 0; } @@ -152,7 +149,7 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, */ *flags |= SLAB_KASAN; - if (!kasan_stack_collection_enabled()) + if (!kasan_requires_meta()) return; ok_size = *size; @@ -220,7 +217,7 @@ void __kasan_cache_create_kmalloc(struct kmem_cache *cache) size_t __kasan_metadata_size(struct kmem_cache *cache) { - if (!kasan_stack_collection_enabled()) + if (!kasan_requires_meta()) return 0; return (cache->kasan_info.alloc_meta_offset ? sizeof(struct kasan_alloc_meta) : 0) + @@ -295,7 +292,7 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, const void *object) { /* Initialize per-object metadata if it is present. */ - if (kasan_stack_collection_enabled()) + if (kasan_requires_meta()) kasan_init_object_meta(cache, object); /* Tag is ignored in set_tag() without CONFIG_KASAN_SW/HW_TAGS */ diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index fdd577f3eb9d..1736abd661b6 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -43,7 +43,7 @@ static inline bool kasan_sync_fault_possible(void) return kasan_mode == KASAN_MODE_SYNC || kasan_mode == KASAN_MODE_ASYMM; } -#else +#else /* CONFIG_KASAN_HW_TAGS */ static inline bool kasan_stack_collection_enabled(void) { @@ -60,7 +60,31 @@ static inline bool kasan_sync_fault_possible(void) return true; } -#endif +#endif /* CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_GENERIC + +/* Generic KASAN uses per-object metadata to store stack traces. */ +static inline bool kasan_requires_meta(void) +{ + /* + * Technically, Generic KASAN always collects stack traces right now. + * However, let's use kasan_stack_collection_enabled() in case the + * kasan.stacktrace command-line argument is changed to affect + * Generic KASAN. + */ + return kasan_stack_collection_enabled(); +} + +#else /* CONFIG_KASAN_GENERIC */ + +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline bool kasan_requires_meta(void) +{ + return false; +} + +#endif /* CONFIG_KASAN_GENERIC */ #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) #define KASAN_GRANULE_SIZE (1UL << KASAN_SHADOW_SCALE_SHIFT) @@ -272,13 +296,14 @@ void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); -void kasan_init_object_meta(struct kmem_cache *cache, const void *object); - #ifdef CONFIG_KASAN_GENERIC +void kasan_init_object_meta(struct kmem_cache *cache, const void *object); struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object); struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); +#else +static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } #endif depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index f11c89505c77..4f24669085e9 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -17,10 +17,6 @@ #include "kasan.h" -void kasan_init_object_meta(struct kmem_cache *cache, const void *object) -{ -} - void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { } From 5935143d118569cdbccbae182763d2b451120c40 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:27 +0200 Subject: [PATCH 147/350] kasan: introduce kasan_init_cache_meta Add a kasan_init_cache_meta() helper that initializes metadata-related cache parameters and use this helper in the common KASAN code. Put the implementation of this new helper into generic.c, as only the Generic mode uses per-object metadata. Link: https://lkml.kernel.org/r/a6d7ea01876eb36472c9879f7b23f1b24766276e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 80 ++-------------------------------------------- mm/kasan/generic.c | 79 +++++++++++++++++++++++++++++++++++++++++++++ mm/kasan/kasan.h | 2 ++ 3 files changed, 83 insertions(+), 78 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index d0300954d76b..b6a74fe5e740 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -118,28 +118,9 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -/* - * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. - * For larger allocations larger redzones are used. - */ -static inline unsigned int optimal_redzone(unsigned int object_size) -{ - return - object_size <= 64 - 16 ? 16 : - object_size <= 128 - 32 ? 32 : - object_size <= 512 - 64 ? 64 : - object_size <= 4096 - 128 ? 128 : - object_size <= (1 << 14) - 256 ? 256 : - object_size <= (1 << 15) - 512 ? 512 : - object_size <= (1 << 16) - 1024 ? 1024 : 2048; -} - void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) { - unsigned int ok_size; - unsigned int optimal_size; - /* * SLAB_KASAN is used to mark caches as ones that are sanitized by * KASAN. Currently this flag is used in two places: @@ -149,65 +130,8 @@ void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, */ *flags |= SLAB_KASAN; - if (!kasan_requires_meta()) - return; - - ok_size = *size; - - /* Add alloc meta into redzone. */ - cache->kasan_info.alloc_meta_offset = *size; - *size += sizeof(struct kasan_alloc_meta); - - /* - * If alloc meta doesn't fit, don't add it. - * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal - * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for - * larger sizes. - */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.alloc_meta_offset = 0; - *size = ok_size; - /* Continue, since free meta might still fit. */ - } - - /* Only the generic mode uses free meta or flexible redzones. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - return; - } - - /* - * Add free meta into redzone when it's not possible to store - * it in the object. This is the case when: - * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can - * be touched after it was freed, or - * 2. Object has a constructor, which means it's expected to - * retain its content until the next allocation, or - * 3. Object is too small. - * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. - */ - if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || - cache->object_size < sizeof(struct kasan_free_meta)) { - ok_size = *size; - - cache->kasan_info.free_meta_offset = *size; - *size += sizeof(struct kasan_free_meta); - - /* If free meta doesn't fit, don't add it. */ - if (*size > KMALLOC_MAX_SIZE) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - *size = ok_size; - } - } - - /* Calculate size with optimal redzone. */ - optimal_size = cache->object_size + optimal_redzone(cache->object_size); - /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ - if (optimal_size > KMALLOC_MAX_SIZE) - optimal_size = KMALLOC_MAX_SIZE; - /* Use optimal size if the size with added metas is not large enough. */ - if (*size < optimal_size) - *size = optimal_size; + if (kasan_requires_meta()) + kasan_init_cache_meta(cache, size); } void __kasan_cache_create_kmalloc(struct kmem_cache *cache) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index fa654cb96a0d..73aea784040a 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,85 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +/* + * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. + * For larger allocations larger redzones are used. + */ +static inline unsigned int optimal_redzone(unsigned int object_size) +{ + return + object_size <= 64 - 16 ? 16 : + object_size <= 128 - 32 ? 32 : + object_size <= 512 - 64 ? 64 : + object_size <= 4096 - 128 ? 128 : + object_size <= (1 << 14) - 256 ? 256 : + object_size <= (1 << 15) - 512 ? 512 : + object_size <= (1 << 16) - 1024 ? 1024 : 2048; +} + +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) +{ + unsigned int ok_size; + unsigned int optimal_size; + + ok_size = *size; + + /* Add alloc meta into redzone. */ + cache->kasan_info.alloc_meta_offset = *size; + *size += sizeof(struct kasan_alloc_meta); + + /* + * If alloc meta doesn't fit, don't add it. + * This can only happen with SLAB, as it has KMALLOC_MAX_SIZE equal + * to KMALLOC_MAX_CACHE_SIZE and doesn't fall back to page_alloc for + * larger sizes. + */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.alloc_meta_offset = 0; + *size = ok_size; + /* Continue, since free meta might still fit. */ + } + + /* Only the generic mode uses free meta or flexible redzones. */ + if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + return; + } + + /* + * Add free meta into redzone when it's not possible to store + * it in the object. This is the case when: + * 1. Object is SLAB_TYPESAFE_BY_RCU, which means that it can + * be touched after it was freed, or + * 2. Object has a constructor, which means it's expected to + * retain its content until the next allocation, or + * 3. Object is too small. + * Otherwise cache->kasan_info.free_meta_offset = 0 is implied. + */ + if ((cache->flags & SLAB_TYPESAFE_BY_RCU) || cache->ctor || + cache->object_size < sizeof(struct kasan_free_meta)) { + ok_size = *size; + + cache->kasan_info.free_meta_offset = *size; + *size += sizeof(struct kasan_free_meta); + + /* If free meta doesn't fit, don't add it. */ + if (*size > KMALLOC_MAX_SIZE) { + cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; + *size = ok_size; + } + } + + /* Calculate size with optimal redzone. */ + optimal_size = cache->object_size + optimal_redzone(cache->object_size); + /* Limit it with KMALLOC_MAX_SIZE (relevant for SLAB only). */ + if (optimal_size > KMALLOC_MAX_SIZE) + optimal_size = KMALLOC_MAX_SIZE; + /* Use optimal size if the size with added metas is not large enough. */ + if (*size < optimal_size) + *size = optimal_size; +} + struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object) { diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 1736abd661b6..6da35370ba37 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -297,12 +297,14 @@ struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); #ifdef CONFIG_KASAN_GENERIC +void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size); void kasan_init_object_meta(struct kmem_cache *cache, const void *object); struct kasan_alloc_meta *kasan_get_alloc_meta(struct kmem_cache *cache, const void *object); struct kasan_free_meta *kasan_get_free_meta(struct kmem_cache *cache, const void *object); #else +static inline void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) { } static inline void kasan_init_object_meta(struct kmem_cache *cache, const void *object) { } #endif From 02856beb2d801423f88f2e8cb2eed0d6f14a4f92 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:28 +0200 Subject: [PATCH 148/350] kasan: drop CONFIG_KASAN_GENERIC check from kasan_init_cache_meta As kasan_init_cache_meta() is only defined for the Generic mode, it does not require the CONFIG_KASAN_GENERIC check. Link: https://lkml.kernel.org/r/211f8f2b213aa91e9148ca63342990b491c4917a.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 73aea784040a..5125fad76f70 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -367,12 +367,6 @@ void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) /* Continue, since free meta might still fit. */ } - /* Only the generic mode uses free meta or flexible redzones. */ - if (!IS_ENABLED(CONFIG_KASAN_GENERIC)) { - cache->kasan_info.free_meta_offset = KASAN_NO_FREE_META; - return; - } - /* * Add free meta into redzone when it's not possible to store * it in the object. This is the case when: From f372bde922e2ced8e0b5a928887b4cf587cc4453 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:29 +0200 Subject: [PATCH 149/350] kasan: only define kasan_metadata_size for Generic mode KASAN provides a helper for calculating the size of per-object metadata stored in the redzone. As now only the Generic mode uses per-object metadata, only define kasan_metadata_size() for this mode. Link: https://lkml.kernel.org/r/8f81d4938b80446bc72538a08217009f328a3e23.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 17 ++++++++--------- mm/kasan/common.c | 11 ----------- mm/kasan/generic.c | 11 +++++++++++ 3 files changed, 19 insertions(+), 20 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index b092277bf48d..027df7599573 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -150,14 +150,6 @@ static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) __kasan_cache_create_kmalloc(cache); } -size_t __kasan_metadata_size(struct kmem_cache *cache); -static __always_inline size_t kasan_metadata_size(struct kmem_cache *cache) -{ - if (kasan_enabled()) - return __kasan_metadata_size(cache); - return 0; -} - void __kasan_poison_slab(struct slab *slab); static __always_inline void kasan_poison_slab(struct slab *slab) { @@ -282,7 +274,6 @@ static inline void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} -static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, void *object) {} @@ -333,6 +324,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC +size_t kasan_metadata_size(struct kmem_cache *cache); + void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); void kasan_record_aux_stack(void *ptr); @@ -340,6 +333,12 @@ void kasan_record_aux_stack_noalloc(void *ptr); #else /* CONFIG_KASAN_GENERIC */ +/* Tag-based KASAN modes do not use per-object metadata. */ +static inline size_t kasan_metadata_size(struct kmem_cache *cache) +{ + return 0; +} + static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} static inline void kasan_record_aux_stack(void *ptr) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index b6a74fe5e740..7c79c560315d 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -139,17 +139,6 @@ void __kasan_cache_create_kmalloc(struct kmem_cache *cache) cache->kasan_info.is_kmalloc = true; } -size_t __kasan_metadata_size(struct kmem_cache *cache) -{ - if (!kasan_requires_meta()) - return 0; - return (cache->kasan_info.alloc_meta_offset ? - sizeof(struct kasan_alloc_meta) : 0) + - ((cache->kasan_info.free_meta_offset && - cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? - sizeof(struct kasan_free_meta) : 0); -} - void __kasan_poison_slab(struct slab *slab) { struct page *page = slab_page(slab); diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 5125fad76f70..806ab92032c3 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -427,6 +427,17 @@ void kasan_init_object_meta(struct kmem_cache *cache, const void *object) __memset(alloc_meta, 0, sizeof(*alloc_meta)); } +size_t kasan_metadata_size(struct kmem_cache *cache) +{ + if (!kasan_requires_meta()) + return 0; + return (cache->kasan_info.alloc_meta_offset ? + sizeof(struct kasan_alloc_meta) : 0) + + ((cache->kasan_info.free_meta_offset && + cache->kasan_info.free_meta_offset != KASAN_NO_FREE_META) ? + sizeof(struct kasan_free_meta) : 0); +} + static void __kasan_record_aux_stack(void *addr, bool can_alloc) { struct slab *slab = kasan_addr_to_slab(addr); From 3b7f8813e9ecf7fe91f2f8dc3b581a111cd374a5 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:30 +0200 Subject: [PATCH 150/350] kasan: only define kasan_never_merge for Generic mode KASAN prevents merging of slab caches whose objects have per-object metadata stored in redzones. As now only the Generic mode uses per-object metadata, define kasan_never_merge() only for this mode. Link: https://lkml.kernel.org/r/81ed01f29ff3443580b7e2fe362a8b47b1e8006d.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ mm/kasan/common.c | 8 -------- mm/kasan/generic.c | 8 ++++++++ 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 027df7599573..9743d4b3a918 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -103,14 +103,6 @@ struct kasan_cache { bool is_kmalloc; }; -slab_flags_t __kasan_never_merge(void); -static __always_inline slab_flags_t kasan_never_merge(void) -{ - if (kasan_enabled()) - return __kasan_never_merge(); - return 0; -} - void __kasan_unpoison_range(const void *addr, size_t size); static __always_inline void kasan_unpoison_range(const void *addr, size_t size) { @@ -261,10 +253,6 @@ static __always_inline bool kasan_check_byte(const void *addr) #else /* CONFIG_KASAN */ -static inline slab_flags_t kasan_never_merge(void) -{ - return 0; -} static inline void kasan_unpoison_range(const void *address, size_t size) {} static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} @@ -325,6 +313,7 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} #ifdef CONFIG_KASAN_GENERIC size_t kasan_metadata_size(struct kmem_cache *cache); +slab_flags_t kasan_never_merge(void); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -338,6 +327,11 @@ static inline size_t kasan_metadata_size(struct kmem_cache *cache) { return 0; } +/* And thus nothing prevents cache merging. */ +static inline slab_flags_t kasan_never_merge(void) +{ + return 0; +} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 7c79c560315d..c2690e938030 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -88,14 +88,6 @@ asmlinkage void kasan_unpoison_task_stack_below(const void *watermark) } #endif /* CONFIG_KASAN_STACK */ -/* Only allow cache merging when no per-object metadata is present. */ -slab_flags_t __kasan_never_merge(void) -{ - if (kasan_requires_meta()) - return SLAB_KASAN; - return 0; -} - void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 806ab92032c3..25333bf3c99f 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -328,6 +328,14 @@ DEFINE_ASAN_SET_SHADOW(f3); DEFINE_ASAN_SET_SHADOW(f5); DEFINE_ASAN_SET_SHADOW(f8); +/* Only allow cache merging when no per-object metadata is present. */ +slab_flags_t kasan_never_merge(void) +{ + if (!kasan_requires_meta()) + return 0; + return SLAB_KASAN; +} + /* * Adaptive redzone policy taken from the userspace AddressSanitizer runtime. * For larger allocations larger redzones are used. From 26f21f3ac76df6cf3b447e8231f8754991165475 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:31 +0200 Subject: [PATCH 151/350] kasan: only define metadata offsets for Generic mode Hide the definitions of alloc_meta_offset and free_meta_offset under an ifdef CONFIG_KASAN_GENERIC check, as these fields are now only used when the Generic mode is enabled. Link: https://lkml.kernel.org/r/d4bafa0534facafd1a23c465a94261e64f366493.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index 9743d4b3a918..a212c2e3f32d 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -98,8 +98,10 @@ static inline bool kasan_has_integrated_init(void) #ifdef CONFIG_KASAN struct kasan_cache { +#ifdef CONFIG_KASAN_GENERIC int alloc_meta_offset; int free_meta_offset; +#endif bool is_kmalloc; }; From be95e13fcc6ded156c65ece01486d9cc33d22dc8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:32 +0200 Subject: [PATCH 152/350] kasan: only define metadata structs for Generic mode Hide the definitions of kasan_alloc_meta and kasan_free_meta under an ifdef CONFIG_KASAN_GENERIC check, as these structures are now only used when the Generic mode is enabled. Link: https://lkml.kernel.org/r/8d2aabff8c227c444a3f62edf87d5630beb77640.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 6da35370ba37..cae60e4d8842 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -193,14 +193,12 @@ struct kasan_track { depot_stack_handle_t stack; }; +#ifdef CONFIG_KASAN_GENERIC + struct kasan_alloc_meta { struct kasan_track alloc_track; - /* Generic mode stores free track in kasan_free_meta. */ -#ifdef CONFIG_KASAN_GENERIC + /* Free track is stored in kasan_free_meta. */ depot_stack_handle_t aux_stack[2]; -#else - struct kasan_track free_track; -#endif }; struct qlist_node { @@ -219,12 +217,12 @@ struct qlist_node { * After that, slab allocator stores the freelist pointer in the object. */ struct kasan_free_meta { -#ifdef CONFIG_KASAN_GENERIC struct qlist_node quarantine_link; struct kasan_track free_track; -#endif }; +#endif /* CONFIG_KASAN_GENERIC */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) /* Used in KUnit-compatible KASAN tests. */ struct kunit_kasan_status { From 682ed08924407b719fa0b1123a26971748d76ace Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:33 +0200 Subject: [PATCH 153/350] kasan: only define kasan_cache_create for Generic mode Right now, kasan_cache_create() assigns SLAB_KASAN for all KASAN modes and then sets up metadata-related cache parameters for the Generic mode. SLAB_KASAN is used in two places: 1. In slab_ksize() to account for per-object metadata when calculating the size of the accessible memory within the object. 2. In slab_common.c via kasan_never_merge() to prevent merging of caches with per-object metadata. Both cases are only relevant when per-object metadata is present, which is only the case with the Generic mode. Thus, assign SLAB_KASAN and define kasan_cache_create() only for the Generic mode. Also update the SLAB_KASAN-related comment. Link: https://lkml.kernel.org/r/61faa2aa1906e2d02c97d00ddf99ce8911dda095.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- include/linux/kasan.h | 18 ++++++------------ include/linux/slab.h | 2 +- mm/kasan/common.c | 16 ---------------- mm/kasan/generic.c | 17 ++++++++++++++++- 4 files changed, 23 insertions(+), 30 deletions(-) diff --git a/include/linux/kasan.h b/include/linux/kasan.h index a212c2e3f32d..d811b3d7d2a1 100644 --- a/include/linux/kasan.h +++ b/include/linux/kasan.h @@ -128,15 +128,6 @@ static __always_inline void kasan_unpoison_pages(struct page *page, __kasan_unpoison_pages(page, order, init); } -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags); -static __always_inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, slab_flags_t *flags) -{ - if (kasan_enabled()) - __kasan_cache_create(cache, size, flags); -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache); static __always_inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) { @@ -260,9 +251,6 @@ static inline void kasan_poison_pages(struct page *page, unsigned int order, bool init) {} static inline void kasan_unpoison_pages(struct page *page, unsigned int order, bool init) {} -static inline void kasan_cache_create(struct kmem_cache *cache, - unsigned int *size, - slab_flags_t *flags) {} static inline void kasan_cache_create_kmalloc(struct kmem_cache *cache) {} static inline void kasan_poison_slab(struct slab *slab) {} static inline void kasan_unpoison_object_data(struct kmem_cache *cache, @@ -316,6 +304,8 @@ static inline void kasan_unpoison_task_stack(struct task_struct *task) {} size_t kasan_metadata_size(struct kmem_cache *cache); slab_flags_t kasan_never_merge(void); +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags); void kasan_cache_shrink(struct kmem_cache *cache); void kasan_cache_shutdown(struct kmem_cache *cache); @@ -334,6 +324,10 @@ static inline slab_flags_t kasan_never_merge(void) { return 0; } +/* And no cache-related metadata initialization is required. */ +static inline void kasan_cache_create(struct kmem_cache *cache, + unsigned int *size, + slab_flags_t *flags) {} static inline void kasan_cache_shrink(struct kmem_cache *cache) {} static inline void kasan_cache_shutdown(struct kmem_cache *cache) {} diff --git a/include/linux/slab.h b/include/linux/slab.h index 352e3f082acc..617a39f7db46 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -106,7 +106,7 @@ # define SLAB_ACCOUNT 0 #endif -#ifdef CONFIG_KASAN +#ifdef CONFIG_KASAN_GENERIC #define SLAB_KASAN ((slab_flags_t __force)0x08000000U) #else #define SLAB_KASAN 0 diff --git a/mm/kasan/common.c b/mm/kasan/common.c index c2690e938030..8efa63190951 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -110,22 +110,6 @@ void __kasan_poison_pages(struct page *page, unsigned int order, bool init) KASAN_PAGE_FREE, init); } -void __kasan_cache_create(struct kmem_cache *cache, unsigned int *size, - slab_flags_t *flags) -{ - /* - * SLAB_KASAN is used to mark caches as ones that are sanitized by - * KASAN. Currently this flag is used in two places: - * 1. In slab_ksize() when calculating the size of the accessible - * memory within the object. - * 2. In slab_common.c to prevent merging of sanitized caches. - */ - *flags |= SLAB_KASAN; - - if (kasan_requires_meta()) - kasan_init_cache_meta(cache, size); -} - void __kasan_cache_create_kmalloc(struct kmem_cache *cache) { cache->kasan_info.is_kmalloc = true; diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index 25333bf3c99f..f6bef347de87 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -352,11 +352,26 @@ static inline unsigned int optimal_redzone(unsigned int object_size) object_size <= (1 << 16) - 1024 ? 1024 : 2048; } -void kasan_init_cache_meta(struct kmem_cache *cache, unsigned int *size) +void kasan_cache_create(struct kmem_cache *cache, unsigned int *size, + slab_flags_t *flags) { unsigned int ok_size; unsigned int optimal_size; + if (!kasan_requires_meta()) + return; + + /* + * SLAB_KASAN is used to mark caches that are sanitized by KASAN + * and that thus have per-object metadata. + * Currently this flag is used in two places: + * 1. In slab_ksize() to account for per-object metadata when + * calculating the size of the accessible memory within the object. + * 2. In slab_common.c via kasan_never_merge() to prevent merging of + * caches with per-object metadata. + */ + *flags |= SLAB_KASAN; + ok_size = *size; /* Add alloc meta into redzone. */ From 6b07434980a1926780cb5c5644fb198fb9c3997b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:34 +0200 Subject: [PATCH 154/350] kasan: pass tagged pointers to kasan_save_alloc/free_info Pass tagged pointers to kasan_save_alloc/free_info(). This is a preparatory patch to simplify other changes in the series. Link: https://lkml.kernel.org/r/d5bc48cfcf0dca8269dc3ed863047e4d4d2030f1.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 6 ++---- mm/kasan/generic.c | 3 +-- mm/kasan/kasan.h | 2 +- mm/kasan/tags.c | 3 +-- 4 files changed, 5 insertions(+), 9 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 8efa63190951..f8e16a242197 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -193,13 +193,11 @@ void * __must_check __kasan_init_slab_obj(struct kmem_cache *cache, static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, unsigned long ip, bool quarantine, bool init) { - u8 tag; void *tagged_object; if (!kasan_arch_is_ready()) return false; - tag = get_tag(object); tagged_object = object; object = kasan_reset_tag(object); @@ -228,7 +226,7 @@ static inline bool ____kasan_slab_free(struct kmem_cache *cache, void *object, return false; if (kasan_stack_collection_enabled()) - kasan_save_free_info(cache, object, tag); + kasan_save_free_info(cache, tagged_object); return kasan_quarantine_put(cache, object); } @@ -317,7 +315,7 @@ void * __must_check __kasan_slab_alloc(struct kmem_cache *cache, /* Save alloc info (if possible) for non-kmalloc() allocations. */ if (kasan_stack_collection_enabled() && !cache->kasan_info.is_kmalloc) - kasan_save_alloc_info(cache, (void *)object, flags); + kasan_save_alloc_info(cache, tagged_object, flags); return tagged_object; } diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index f6bef347de87..aff39af3c532 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -500,8 +500,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) kasan_set_track(&alloc_meta->alloc_track, flags); } -void kasan_save_free_info(struct kmem_cache *cache, - void *object, u8 tag) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { struct kasan_free_meta *free_meta; diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cae60e4d8842..cca49ab029f1 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -309,7 +309,7 @@ static inline void kasan_init_object_meta(struct kmem_cache *cache, const void * depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); -void kasan_save_free_info(struct kmem_cache *cache, void *object, u8 tag); +void kasan_save_free_info(struct kmem_cache *cache, void *object); struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, void *object); struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 4f24669085e9..fd11d10a4ffc 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -21,8 +21,7 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { } -void kasan_save_free_info(struct kmem_cache *cache, - void *object, u8 tag) +void kasan_save_free_info(struct kmem_cache *cache, void *object) { } From b89933e9a54d3e7c4da081bc0b986341b62cdab6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:35 +0200 Subject: [PATCH 155/350] kasan: move kasan_get_alloc/free_track definitions Move the definitions of kasan_get_alloc/free_track() to report_*.c, as they belong with other the reporting code. Link: https://lkml.kernel.org/r/0cb15423956889b3905a0174b58782633bbbd72e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/generic.c | 21 --------------------- mm/kasan/report_generic.c | 21 +++++++++++++++++++++ mm/kasan/report_tags.c | 12 ++++++++++++ mm/kasan/tags.c | 12 ------------ 4 files changed, 33 insertions(+), 33 deletions(-) diff --git a/mm/kasan/generic.c b/mm/kasan/generic.c index aff39af3c532..d8b5590f9484 100644 --- a/mm/kasan/generic.c +++ b/mm/kasan/generic.c @@ -512,24 +512,3 @@ void kasan_save_free_info(struct kmem_cache *cache, void *object) /* The object was freed and has free track set. */ *(u8 *)kasan_mem_to_shadow(object) = KASAN_SLAB_FREETRACK; } - -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) -{ - struct kasan_alloc_meta *alloc_meta; - - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; - - return &alloc_meta->alloc_track; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) - return NULL; - /* Free meta must be present with KASAN_SLAB_FREETRACK. */ - return &kasan_get_free_meta(cache, object)->free_track; -} diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 348dc207d462..74d21786ef09 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -127,6 +127,27 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return get_wild_bug_type(info); } +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + struct kasan_alloc_meta *alloc_meta; + + alloc_meta = kasan_get_alloc_meta(cache, object); + if (!alloc_meta) + return NULL; + + return &alloc_meta->alloc_track; +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) + return NULL; + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + return &kasan_get_free_meta(cache, object)->free_track; +} + void kasan_metadata_fetch_row(char *buffer, void *row) { memcpy(buffer, kasan_mem_to_shadow(row), META_BYTES_PER_ROW); diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 35cf3cae4aa4..79b6497d8a81 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -21,3 +21,15 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return "invalid-access"; } + +struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, + void *object) +{ + return NULL; +} + +struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, + void *object, u8 tag) +{ + return NULL; +} diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index fd11d10a4ffc..39a0481e5228 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -24,15 +24,3 @@ void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) void kasan_save_free_info(struct kmem_cache *cache, void *object) { } - -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) -{ - return NULL; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - return NULL; -} From 9ef08d265e3f02b3266a46f684de5741724bd7f8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:36 +0200 Subject: [PATCH 156/350] kasan: cosmetic changes in report.c Do a few non-functional style fixes for the code in report.c. Link: https://lkml.kernel.org/r/b728eae71f3ea505a885449724de21cf3f476a7b.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 5d225d7d9c4c..83f420a28c0b 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -200,25 +200,22 @@ static void print_error_description(struct kasan_report_info *info) static void print_track(struct kasan_track *track, const char *prefix) { pr_err("%s by task %u:\n", prefix, track->pid); - if (track->stack) { + if (track->stack) stack_depot_print(track->stack); - } else { + else pr_err("(stack is not available)\n"); - } } struct page *kasan_addr_to_page(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) + if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) return virt_to_head_page(addr); return NULL; } struct slab *kasan_addr_to_slab(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && - (addr < high_memory)) + if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) return virt_to_slab(addr); return NULL; } From 2c9fb1fd1dd0b17cf8f48935a9c3ecea066f10e8 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:37 +0200 Subject: [PATCH 157/350] kasan: use virt_addr_valid in kasan_addr_to_page/slab Instead of open-coding the validity checks for addr in kasan_addr_to_page/slab(), use the virt_addr_valid() helper. Link: https://lkml.kernel.org/r/c22a4850d74d7430f8a6c08216fd55c2860a2b9e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 83f420a28c0b..570f9419b90c 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -208,14 +208,14 @@ static void print_track(struct kasan_track *track, const char *prefix) struct page *kasan_addr_to_page(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) + if (virt_addr_valid(addr)) return virt_to_head_page(addr); return NULL; } struct slab *kasan_addr_to_slab(const void *addr) { - if ((addr >= (void *)PAGE_OFFSET) && (addr < high_memory)) + if (virt_addr_valid(addr)) return virt_to_slab(addr); return NULL; } From 0f282f15dcc479b1f70ef4c2324db8a6df670fcb Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:38 +0200 Subject: [PATCH 158/350] kasan: use kasan_addr_to_slab in print_address_description Use the kasan_addr_to_slab() helper in print_address_description() instead of separately invoking PageSlab() and page_slab(). Link: https://lkml.kernel.org/r/8b744fbf8c3c7fc5d34329ec70b60ee5c8dba66c.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/common.c | 7 +++++++ mm/kasan/report.c | 11 ++--------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/mm/kasan/common.c b/mm/kasan/common.c index f8e16a242197..50f4338b477f 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -30,6 +30,13 @@ #include "kasan.h" #include "../slab.h" +struct slab *kasan_addr_to_slab(const void *addr) +{ + if (virt_addr_valid(addr)) + return virt_to_slab(addr); + return NULL; +} + depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) { unsigned long entries[KASAN_STACK_DEPTH]; diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 570f9419b90c..cd31b3b89ca1 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -213,13 +213,6 @@ struct page *kasan_addr_to_page(const void *addr) return NULL; } -struct slab *kasan_addr_to_slab(const void *addr) -{ - if (virt_addr_valid(addr)) - return virt_to_slab(addr); - return NULL; -} - static void describe_object_addr(struct kmem_cache *cache, void *object, const void *addr) { @@ -297,12 +290,12 @@ static inline bool init_task_stack_addr(const void *addr) static void print_address_description(void *addr, u8 tag) { struct page *page = kasan_addr_to_page(addr); + struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); pr_err("\n"); - if (page && PageSlab(page)) { - struct slab *slab = page_slab(page); + if (slab) { struct kmem_cache *cache = slab->slab_cache; void *object = nearest_obj(cache, slab, addr); From 559756e8a2e153f0f2ddf29c0ed9ac7b88345fb6 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:39 +0200 Subject: [PATCH 159/350] kasan: make kasan_addr_to_page static As kasan_addr_to_page() is only used in report.c, rename it to addr_to_page() and make it static. Link: https://lkml.kernel.org/r/66c1267200fe0c16e2ac8847a9315fda041918cb.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 1 - mm/kasan/report.c | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cca49ab029f1..4fddfdb08abf 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -291,7 +291,6 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, unsigned long ip); void kasan_report_invalid_free(void *object, unsigned long ip, enum kasan_report_type type); -struct page *kasan_addr_to_page(const void *addr); struct slab *kasan_addr_to_slab(const void *addr); #ifdef CONFIG_KASAN_GENERIC diff --git a/mm/kasan/report.c b/mm/kasan/report.c index cd31b3b89ca1..ac526c10ebff 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -206,7 +206,7 @@ static void print_track(struct kasan_track *track, const char *prefix) pr_err("(stack is not available)\n"); } -struct page *kasan_addr_to_page(const void *addr) +static inline struct page *addr_to_page(const void *addr) { if (virt_addr_valid(addr)) return virt_to_head_page(addr); @@ -289,7 +289,7 @@ static inline bool init_task_stack_addr(const void *addr) static void print_address_description(void *addr, u8 tag) { - struct page *page = kasan_addr_to_page(addr); + struct page *page = addr_to_page(addr); struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); From a794898a0e17c1c563fcce614efbd3644d48fa2e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:40 +0200 Subject: [PATCH 160/350] kasan: simplify print_report To simplify reading the implementation of print_report(), remove the tagged_addr variable and rename untagged_addr to addr. Link: https://lkml.kernel.org/r/f64f5f1093b3c06896bf0f850c5d9e661313fcb2.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ac526c10ebff..dc38ada86f85 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -397,17 +397,16 @@ static void print_memory_metadata(const void *addr) static void print_report(struct kasan_report_info *info) { - void *tagged_addr = info->access_addr; - void *untagged_addr = kasan_reset_tag(tagged_addr); - u8 tag = get_tag(tagged_addr); + void *addr = kasan_reset_tag(info->access_addr); + u8 tag = get_tag(info->access_addr); print_error_description(info); - if (addr_has_metadata(untagged_addr)) + if (addr_has_metadata(addr)) kasan_print_tags(tag, info->first_bad_addr); pr_err("\n"); - if (addr_has_metadata(untagged_addr)) { - print_address_description(untagged_addr, tag); + if (addr_has_metadata(addr)) { + print_address_description(addr, tag); print_memory_metadata(info->first_bad_addr); } else { dump_stack_lvl(KERN_ERR); From 015b109f1f7a799a51def6be37a53b650c4a8fda Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:41 +0200 Subject: [PATCH 161/350] kasan: introduce complete_report_info Introduce a complete_report_info() function that fills in the first_bad_addr field of kasan_report_info instead of doing it in kasan_report_*(). This function will be extended in the next patch. Link: https://lkml.kernel.org/r/8eb1a9bd01f5d31eab4524da54a101b8720b469e.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 5 ++++- mm/kasan/report.c | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 4fddfdb08abf..7e07115873d3 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -153,12 +153,15 @@ enum kasan_report_type { }; struct kasan_report_info { + /* Filled in by kasan_report_*(). */ enum kasan_report_type type; void *access_addr; - void *first_bad_addr; size_t access_size; bool is_write; unsigned long ip; + + /* Filled in by the common reporting code. */ + void *first_bad_addr; }; /* Do not change the struct layout: compiler ABI. */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index dc38ada86f85..0c2e7a58095d 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -413,6 +413,17 @@ static void print_report(struct kasan_report_info *info) } } +static void complete_report_info(struct kasan_report_info *info) +{ + void *addr = kasan_reset_tag(info->access_addr); + + if (info->type == KASAN_REPORT_ACCESS) + info->first_bad_addr = kasan_find_first_bad_addr( + info->access_addr, info->access_size); + else + info->first_bad_addr = addr; +} + void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) { unsigned long flags; @@ -430,11 +441,12 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty info.type = type; info.access_addr = ptr; - info.first_bad_addr = kasan_reset_tag(ptr); info.access_size = 0; info.is_write = false; info.ip = ip; + complete_report_info(&info); + print_report(&info); end_report(&flags, ptr); @@ -463,11 +475,12 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, info.type = KASAN_REPORT_ACCESS; info.access_addr = ptr; - info.first_bad_addr = kasan_find_first_bad_addr(ptr, size); info.access_size = size; info.is_write = is_write; info.ip = ip; + complete_report_info(&info); + print_report(&info); end_report(&irq_flags, ptr); From 7fae3dd08e3e88491f06e22e648913e3f8cf30f0 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:42 +0200 Subject: [PATCH 162/350] kasan: fill in cache and object in complete_report_info Add cache and object fields to kasan_report_info and fill them in in complete_report_info() instead of fetching them in the middle of the report printing code. This allows the reporting code to get access to the object information before starting printing the report. One of the following patches uses this information to determine the bug type with the tag-based modes. Link: https://lkml.kernel.org/r/23264572cb2cbb8f0efbb51509b6757eb3cc1fc9.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 2 ++ mm/kasan/report.c | 21 +++++++++++++-------- 2 files changed, 15 insertions(+), 8 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7e07115873d3..b8fa1e50f3d4 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -162,6 +162,8 @@ struct kasan_report_info { /* Filled in by the common reporting code. */ void *first_bad_addr; + struct kmem_cache *cache; + void *object; }; /* Do not change the struct layout: compiler ABI. */ diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 0c2e7a58095d..763de8e68887 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -287,19 +287,16 @@ static inline bool init_task_stack_addr(const void *addr) sizeof(init_thread_union.stack)); } -static void print_address_description(void *addr, u8 tag) +static void print_address_description(void *addr, u8 tag, + struct kasan_report_info *info) { struct page *page = addr_to_page(addr); - struct slab *slab = kasan_addr_to_slab(addr); dump_stack_lvl(KERN_ERR); pr_err("\n"); - if (slab) { - struct kmem_cache *cache = slab->slab_cache; - void *object = nearest_obj(cache, slab, addr); - - describe_object(cache, object, addr, tag); + if (info->cache && info->object) { + describe_object(info->cache, info->object, addr, tag); pr_err("\n"); } @@ -406,7 +403,7 @@ static void print_report(struct kasan_report_info *info) pr_err("\n"); if (addr_has_metadata(addr)) { - print_address_description(addr, tag); + print_address_description(addr, tag, info); print_memory_metadata(info->first_bad_addr); } else { dump_stack_lvl(KERN_ERR); @@ -416,12 +413,20 @@ static void print_report(struct kasan_report_info *info) static void complete_report_info(struct kasan_report_info *info) { void *addr = kasan_reset_tag(info->access_addr); + struct slab *slab; if (info->type == KASAN_REPORT_ACCESS) info->first_bad_addr = kasan_find_first_bad_addr( info->access_addr, info->access_size); else info->first_bad_addr = addr; + + slab = kasan_addr_to_slab(addr); + if (slab) { + info->cache = slab->slab_cache; + info->object = nearest_obj(info->cache, slab, addr); + } else + info->cache = info->object = NULL; } void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) From 92a38eacd6412bb09f98245ba5b3aa89e3dd6656 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:43 +0200 Subject: [PATCH 163/350] kasan: rework function arguments in report.c Pass a pointer to kasan_report_info to describe_object() and describe_object_stacks(), instead of passing the structure's fields. The untagged pointer and the tag are still passed as separate arguments to some of the functions to avoid duplicating the untagging logic. This is preparatory change for the next patch. Link: https://lkml.kernel.org/r/2e0cdb91524ab528a3c2b12b6d8bcb69512fc4af.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 763de8e68887..ec018f849992 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -213,8 +213,8 @@ static inline struct page *addr_to_page(const void *addr) return NULL; } -static void describe_object_addr(struct kmem_cache *cache, void *object, - const void *addr) +static void describe_object_addr(const void *addr, struct kmem_cache *cache, + void *object) { unsigned long access_addr = (unsigned long)addr; unsigned long object_addr = (unsigned long)object; @@ -242,33 +242,32 @@ static void describe_object_addr(struct kmem_cache *cache, void *object, (void *)(object_addr + cache->object_size)); } -static void describe_object_stacks(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object_stacks(u8 tag, struct kasan_report_info *info) { struct kasan_track *alloc_track; struct kasan_track *free_track; - alloc_track = kasan_get_alloc_track(cache, object); + alloc_track = kasan_get_alloc_track(info->cache, info->object); if (alloc_track) { print_track(alloc_track, "Allocated"); pr_err("\n"); } - free_track = kasan_get_free_track(cache, object, tag); + free_track = kasan_get_free_track(info->cache, info->object, tag); if (free_track) { print_track(free_track, "Freed"); pr_err("\n"); } - kasan_print_aux_stacks(cache, object); + kasan_print_aux_stacks(info->cache, info->object); } -static void describe_object(struct kmem_cache *cache, void *object, - const void *addr, u8 tag) +static void describe_object(const void *addr, u8 tag, + struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) - describe_object_stacks(cache, object, addr, tag); - describe_object_addr(cache, object, addr); + describe_object_stacks(tag, info); + describe_object_addr(addr, info->cache, info->object); } static inline bool kernel_or_module_addr(const void *addr) @@ -296,7 +295,7 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); if (info->cache && info->object) { - describe_object(info->cache, info->object, addr, tag); + describe_object(addr, tag, info); pr_err("\n"); } From 59e6e098d1c156f7c449af903c3b48a5470f6120 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:44 +0200 Subject: [PATCH 164/350] kasan: introduce kasan_complete_mode_report_info Add bug_type and alloc/free_track fields to kasan_report_info and add a kasan_complete_mode_report_info() function that fills in these fields. This function is implemented differently for different KASAN mode. Change the reporting code to use the filled in fields instead of invoking kasan_get_bug_type() and kasan_get_alloc/free_track(). For the Generic mode, kasan_complete_mode_report_info() invokes these functions instead. For the tag-based modes, only the bug_type field is filled in; alloc/free_track are handled in the next patch. Using a single function that fills in these fields is required for the tag-based modes, as the values for all three fields are determined in a single procedure implemented in the following patch. Link: https://lkml.kernel.org/r/8432b861054fa8d0cee79a8877dedeaf3b677ca8.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 33 +++++++++++++++++---------------- mm/kasan/report.c | 30 ++++++++++++++---------------- mm/kasan/report_generic.c | 32 +++++++++++++++++--------------- mm/kasan/report_tags.c | 13 +++---------- 4 files changed, 51 insertions(+), 57 deletions(-) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index b8fa1e50f3d4..7df107dc400a 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -146,6 +146,13 @@ static inline bool kasan_requires_meta(void) #define META_MEM_BYTES_PER_ROW (META_BYTES_PER_ROW * KASAN_GRANULE_SIZE) #define META_ROWS_AROUND_ADDR 2 +#define KASAN_STACK_DEPTH 64 + +struct kasan_track { + u32 pid; + depot_stack_handle_t stack; +}; + enum kasan_report_type { KASAN_REPORT_ACCESS, KASAN_REPORT_INVALID_FREE, @@ -164,6 +171,11 @@ struct kasan_report_info { void *first_bad_addr; struct kmem_cache *cache; void *object; + + /* Filled in by the mode-specific reporting code. */ + const char *bug_type; + struct kasan_track alloc_track; + struct kasan_track free_track; }; /* Do not change the struct layout: compiler ABI. */ @@ -189,14 +201,7 @@ struct kasan_global { #endif }; -/* Structures for keeping alloc and free tracks. */ - -#define KASAN_STACK_DEPTH 64 - -struct kasan_track { - u32 pid; - depot_stack_handle_t stack; -}; +/* Structures for keeping alloc and free meta. */ #ifdef CONFIG_KASAN_GENERIC @@ -270,16 +275,16 @@ static inline bool addr_has_metadata(const void *addr) #endif /* CONFIG_KASAN_GENERIC || CONFIG_KASAN_SW_TAGS */ +void *kasan_find_first_bad_addr(void *addr, size_t size); +void kasan_complete_mode_report_info(struct kasan_report_info *info); +void kasan_metadata_fetch_row(char *buffer, void *row); + #if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) void kasan_print_tags(u8 addr_tag, const void *addr); #else static inline void kasan_print_tags(u8 addr_tag, const void *addr) { } #endif -void *kasan_find_first_bad_addr(void *addr, size_t size); -const char *kasan_get_bug_type(struct kasan_report_info *info); -void kasan_metadata_fetch_row(char *buffer, void *row); - #if defined(CONFIG_KASAN_STACK) void kasan_print_address_stack_frame(const void *addr); #else @@ -314,10 +319,6 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc); void kasan_set_track(struct kasan_track *track, gfp_t flags); void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags); void kasan_save_free_info(struct kmem_cache *cache, void *object); -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object); -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag); #if defined(CONFIG_KASAN_GENERIC) && \ (defined(CONFIG_SLAB) || defined(CONFIG_SLUB)) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index ec018f849992..39e8e5a80b82 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -185,8 +185,7 @@ static void print_error_description(struct kasan_report_info *info) return; } - pr_err("BUG: KASAN: %s in %pS\n", - kasan_get_bug_type(info), (void *)info->ip); + pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -242,31 +241,25 @@ static void describe_object_addr(const void *addr, struct kmem_cache *cache, (void *)(object_addr + cache->object_size)); } -static void describe_object_stacks(u8 tag, struct kasan_report_info *info) +static void describe_object_stacks(struct kasan_report_info *info) { - struct kasan_track *alloc_track; - struct kasan_track *free_track; - - alloc_track = kasan_get_alloc_track(info->cache, info->object); - if (alloc_track) { - print_track(alloc_track, "Allocated"); + if (info->alloc_track.stack) { + print_track(&info->alloc_track, "Allocated"); pr_err("\n"); } - free_track = kasan_get_free_track(info->cache, info->object, tag); - if (free_track) { - print_track(free_track, "Freed"); + if (info->free_track.stack) { + print_track(&info->free_track, "Freed"); pr_err("\n"); } kasan_print_aux_stacks(info->cache, info->object); } -static void describe_object(const void *addr, u8 tag, - struct kasan_report_info *info) +static void describe_object(const void *addr, struct kasan_report_info *info) { if (kasan_stack_collection_enabled()) - describe_object_stacks(tag, info); + describe_object_stacks(info); describe_object_addr(addr, info->cache, info->object); } @@ -295,7 +288,7 @@ static void print_address_description(void *addr, u8 tag, pr_err("\n"); if (info->cache && info->object) { - describe_object(addr, tag, info); + describe_object(addr, info); pr_err("\n"); } @@ -426,6 +419,9 @@ static void complete_report_info(struct kasan_report_info *info) info->object = nearest_obj(info->cache, slab, addr); } else info->cache = info->object = NULL; + + /* Fill in mode-specific report info fields. */ + kasan_complete_mode_report_info(info); } void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_type type) @@ -443,6 +439,7 @@ void kasan_report_invalid_free(void *ptr, unsigned long ip, enum kasan_report_ty start_report(&flags, true); + memset(&info, 0, sizeof(info)); info.type = type; info.access_addr = ptr; info.access_size = 0; @@ -477,6 +474,7 @@ bool kasan_report(unsigned long addr, size_t size, bool is_write, start_report(&irq_flags, true); + memset(&info, 0, sizeof(info)); info.type = KASAN_REPORT_ACCESS; info.access_addr = ptr; info.access_size = size; diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 74d21786ef09..087c1d8c8145 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -109,7 +109,7 @@ static const char *get_wild_bug_type(struct kasan_report_info *info) return bug_type; } -const char *kasan_get_bug_type(struct kasan_report_info *info) +static const char *get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -127,25 +127,27 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return get_wild_bug_type(info); } -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) +void kasan_complete_mode_report_info(struct kasan_report_info *info) { struct kasan_alloc_meta *alloc_meta; + struct kasan_free_meta *free_meta; - alloc_meta = kasan_get_alloc_meta(cache, object); - if (!alloc_meta) - return NULL; + info->bug_type = get_bug_type(info); - return &alloc_meta->alloc_track; -} + if (!info->cache || !info->object) + return; -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - if (*(u8 *)kasan_mem_to_shadow(object) != KASAN_SLAB_FREETRACK) - return NULL; - /* Free meta must be present with KASAN_SLAB_FREETRACK. */ - return &kasan_get_free_meta(cache, object)->free_track; + alloc_meta = kasan_get_alloc_meta(info->cache, info->object); + if (alloc_meta) + memcpy(&info->alloc_track, &alloc_meta->alloc_track, + sizeof(info->alloc_track)); + + if (*(u8 *)kasan_mem_to_shadow(info->object) == KASAN_SLAB_FREETRACK) { + /* Free meta must be present with KASAN_SLAB_FREETRACK. */ + free_meta = kasan_get_free_meta(info->cache, info->object); + memcpy(&info->free_track, &free_meta->free_track, + sizeof(info->free_track)); + } } void kasan_metadata_fetch_row(char *buffer, void *row) diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 79b6497d8a81..5cbac2cdb177 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -6,7 +6,7 @@ #include "kasan.h" -const char *kasan_get_bug_type(struct kasan_report_info *info) +static const char *get_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -22,14 +22,7 @@ const char *kasan_get_bug_type(struct kasan_report_info *info) return "invalid-access"; } -struct kasan_track *kasan_get_alloc_track(struct kmem_cache *cache, - void *object) +void kasan_complete_mode_report_info(struct kasan_report_info *info) { - return NULL; -} - -struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, - void *object, u8 tag) -{ - return NULL; + info->bug_type = get_bug_type(info); } From 7bc0584e5d2a687c0855a1b3dec9a6d6857d757b Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:45 +0200 Subject: [PATCH 165/350] kasan: implement stack ring for tag-based modes Implement storing stack depot handles for alloc/free stack traces for slab objects for the tag-based KASAN modes in a ring buffer. This ring buffer is referred to as the stack ring. On each alloc/free of a slab object, the tagged address of the object and the current stack trace are recorded in the stack ring. On each bug report, if the accessed address belongs to a slab object, the stack ring is scanned for matching entries. The newest entries are used to print the alloc/free stack traces in the report: one entry for alloc and one for free. The number of entries in the stack ring is fixed in this patch, but one of the following patches adds a command-line argument to control it. [andreyknvl@google.com: initialize read-write lock in stack ring] Link: https://lkml.kernel.org/r/576182d194e27531e8090bad809e4136953895f4.1663700262.git.andreyknvl@google.com Link: https://lkml.kernel.org/r/692de14b6b6a1bc817fd55e4ad92fc1f83c1ab59.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/kasan.h | 21 +++++++++++++ mm/kasan/report_tags.c | 71 ++++++++++++++++++++++++++++++++++++++++++ mm/kasan/tags.c | 52 +++++++++++++++++++++++++++++++ 3 files changed, 144 insertions(+) diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 7df107dc400a..cfff81139d67 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -2,6 +2,7 @@ #ifndef __MM_KASAN_KASAN_H #define __MM_KASAN_KASAN_H +#include #include #include #include @@ -233,6 +234,26 @@ struct kasan_free_meta { #endif /* CONFIG_KASAN_GENERIC */ +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) + +struct kasan_stack_ring_entry { + void *ptr; + size_t size; + u32 pid; + depot_stack_handle_t stack; + bool is_free; +}; + +#define KASAN_STACK_RING_SIZE (32 << 10) + +struct kasan_stack_ring { + rwlock_t lock; + atomic64_t pos; + struct kasan_stack_ring_entry entries[KASAN_STACK_RING_SIZE]; +}; + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) /* Used in KUnit-compatible KASAN tests. */ struct kunit_kasan_status { diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 5cbac2cdb177..1b78136542bb 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -4,8 +4,12 @@ * Copyright (c) 2020 Google, Inc. */ +#include + #include "kasan.h" +extern struct kasan_stack_ring stack_ring; + static const char *get_bug_type(struct kasan_report_info *info) { /* @@ -24,5 +28,72 @@ static const char *get_bug_type(struct kasan_report_info *info) void kasan_complete_mode_report_info(struct kasan_report_info *info) { + unsigned long flags; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *ptr; + u32 pid; + depot_stack_handle_t stack; + bool is_free; + bool alloc_found = false, free_found = false; + info->bug_type = get_bug_type(info); + + if (!info->cache || !info->object) + return; + } + + write_lock_irqsave(&stack_ring.lock, flags); + + pos = atomic64_read(&stack_ring.pos); + + /* + * The loop below tries to find stack ring entries relevant to the + * buggy object. This is a best-effort process. + * + * First, another object with the same tag can be allocated in place of + * the buggy object. Also, since the number of entries is limited, the + * entries relevant to the buggy object can be overwritten. + */ + + for (u64 i = pos - 1; i != pos - 1 - KASAN_STACK_RING_SIZE; i--) { + if (alloc_found && free_found) + break; + + entry = &stack_ring.entries[i % KASAN_STACK_RING_SIZE]; + + /* Paired with smp_store_release() in save_stack_info(). */ + ptr = (void *)smp_load_acquire(&entry->ptr); + + if (kasan_reset_tag(ptr) != info->object || + get_tag(ptr) != get_tag(info->access_addr)) + continue; + + pid = READ_ONCE(entry->pid); + stack = READ_ONCE(entry->stack); + is_free = READ_ONCE(entry->is_free); + + if (is_free) { + /* + * Second free of the same object. + * Give up on trying to find the alloc entry. + */ + if (free_found) + break; + + info->free_track.pid = pid; + info->free_track.stack = stack; + free_found = true; + } else { + /* Second alloc of the same object. Give up. */ + if (alloc_found) + break; + + info->alloc_track.pid = pid; + info->alloc_track.stack = stack; + alloc_found = true; + } + } + + write_unlock_irqrestore(&stack_ring.lock, flags); } diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index 39a0481e5228..a0524e037f49 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -6,6 +6,7 @@ * Copyright (c) 2020 Google, Inc. */ +#include #include #include #include @@ -16,11 +17,62 @@ #include #include "kasan.h" +#include "../slab.h" + +/* Non-zero, as initial pointer values are 0. */ +#define STACK_RING_BUSY_PTR ((void *)1) + +struct kasan_stack_ring stack_ring = { + .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) +}; + +static void save_stack_info(struct kmem_cache *cache, void *object, + gfp_t gfp_flags, bool is_free) +{ + unsigned long flags; + depot_stack_handle_t stack; + u64 pos; + struct kasan_stack_ring_entry *entry; + void *old_ptr; + + stack = kasan_save_stack(gfp_flags, true); + + /* + * Prevent save_stack_info() from modifying stack ring + * when kasan_complete_mode_report_info() is walking it. + */ + read_lock_irqsave(&stack_ring.lock, flags); + +next: + pos = atomic64_fetch_add(1, &stack_ring.pos); + entry = &stack_ring.entries[pos % KASAN_STACK_RING_SIZE]; + + /* Detect stack ring entry slots that are being written to. */ + old_ptr = READ_ONCE(entry->ptr); + if (old_ptr == STACK_RING_BUSY_PTR) + goto next; /* Busy slot. */ + if (!try_cmpxchg(&entry->ptr, &old_ptr, STACK_RING_BUSY_PTR)) + goto next; /* Busy slot. */ + + WRITE_ONCE(entry->size, cache->object_size); + WRITE_ONCE(entry->pid, current->pid); + WRITE_ONCE(entry->stack, stack); + WRITE_ONCE(entry->is_free, is_free); + + /* + * Paired with smp_load_acquire() in kasan_complete_mode_report_info(). + */ + smp_store_release(&entry->ptr, (s64)object); + + read_unlock_irqrestore(&stack_ring.lock, flags); +} void kasan_save_alloc_info(struct kmem_cache *cache, void *object, gfp_t flags) { + save_stack_info(cache, object, flags, false); } void kasan_save_free_info(struct kmem_cache *cache, void *object) { + save_stack_info(cache, object, GFP_NOWAIT, true); } From 7ebfce33125100e3f0c5e059845a019a1401433d Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:46 +0200 Subject: [PATCH 166/350] kasan: support kasan.stacktrace for SW_TAGS Add support for the kasan.stacktrace command-line argument for Software Tag-Based KASAN. The following patch adds a command-line argument for selecting the stack ring size, and, as the stack ring is supported by both the Software and the Hardware Tag-Based KASAN modes, it is natural that both of them have support for kasan.stacktrace too. Link: https://lkml.kernel.org/r/3b43059103faa7f8796017847b7d674b658f11b5.1662411799.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 15 ++++++----- mm/kasan/hw_tags.c | 39 +--------------------------- mm/kasan/kasan.h | 36 +++++++++++++++++--------- mm/kasan/sw_tags.c | 5 +++- mm/kasan/tags.c | 43 +++++++++++++++++++++++++++++++ 5 files changed, 81 insertions(+), 57 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 1772fd457fed..7bd38c181018 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -111,9 +111,15 @@ parameter can be used to control panic and reporting behaviour: report or also panic the kernel (default: ``report``). The panic happens even if ``kasan_multi_shot`` is enabled. -Hardware Tag-Based KASAN mode (see the section about various modes below) is -intended for use in production as a security mitigation. Therefore, it supports -additional boot parameters that allow disabling KASAN or controlling features: +Software and Hardware Tag-Based KASAN modes (see the section about various +modes below) support disabling stack trace collection: + +- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack + traces collection (default: ``on``). + +Hardware Tag-Based KASAN mode is intended for use in production as a security +mitigation. Therefore, it supports additional boot parameters that allow +disabling KASAN altogether or controlling its features: - ``kasan=off`` or ``=on`` controls whether KASAN is enabled (default: ``on``). @@ -132,9 +138,6 @@ additional boot parameters that allow disabling KASAN or controlling features: - ``kasan.vmalloc=off`` or ``=on`` disables or enables tagging of vmalloc allocations (default: ``on``). -- ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack - traces collection (default: ``on``). - Error reports ~~~~~~~~~~~~~ diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 9ad8eff71b28..b22c4f461cb0 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -38,16 +38,9 @@ enum kasan_arg_vmalloc { KASAN_ARG_VMALLOC_ON, }; -enum kasan_arg_stacktrace { - KASAN_ARG_STACKTRACE_DEFAULT, - KASAN_ARG_STACKTRACE_OFF, - KASAN_ARG_STACKTRACE_ON, -}; - static enum kasan_arg kasan_arg __ro_after_init; static enum kasan_arg_mode kasan_arg_mode __ro_after_init; static enum kasan_arg_vmalloc kasan_arg_vmalloc __initdata; -static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; /* * Whether KASAN is enabled at all. @@ -66,9 +59,6 @@ EXPORT_SYMBOL_GPL(kasan_mode); /* Whether to enable vmalloc tagging. */ DEFINE_STATIC_KEY_TRUE(kasan_flag_vmalloc); -/* Whether to collect alloc/free stack traces. */ -DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); - /* kasan=off/on */ static int __init early_kasan_flag(char *arg) { @@ -122,23 +112,6 @@ static int __init early_kasan_flag_vmalloc(char *arg) } early_param("kasan.vmalloc", early_kasan_flag_vmalloc); -/* kasan.stacktrace=off/on */ -static int __init early_kasan_flag_stacktrace(char *arg) -{ - if (!arg) - return -EINVAL; - - if (!strcmp(arg, "off")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; - else if (!strcmp(arg, "on")) - kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; - else - return -EINVAL; - - return 0; -} -early_param("kasan.stacktrace", early_kasan_flag_stacktrace); - static inline const char *kasan_mode_info(void) { if (kasan_mode == KASAN_MODE_ASYNC) @@ -213,17 +186,7 @@ void __init kasan_init_hw_tags(void) break; } - switch (kasan_arg_stacktrace) { - case KASAN_ARG_STACKTRACE_DEFAULT: - /* Default is specified by kasan_flag_stacktrace definition. */ - break; - case KASAN_ARG_STACKTRACE_OFF: - static_branch_disable(&kasan_flag_stacktrace); - break; - case KASAN_ARG_STACKTRACE_ON: - static_branch_enable(&kasan_flag_stacktrace); - break; - } + kasan_init_tags(); /* KASAN is now initialized, enable it. */ static_branch_enable(&kasan_flag_enabled); diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index cfff81139d67..447baf1a7a2e 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -8,13 +8,31 @@ #include #include -#ifdef CONFIG_KASAN_HW_TAGS +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) #include + +DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + +static inline bool kasan_stack_collection_enabled(void) +{ + return static_branch_unlikely(&kasan_flag_stacktrace); +} + +#else /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +static inline bool kasan_stack_collection_enabled(void) +{ + return true; +} + +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + +#ifdef CONFIG_KASAN_HW_TAGS + #include "../slab.h" DECLARE_STATIC_KEY_TRUE(kasan_flag_vmalloc); -DECLARE_STATIC_KEY_TRUE(kasan_flag_stacktrace); enum kasan_mode { KASAN_MODE_SYNC, @@ -29,11 +47,6 @@ static inline bool kasan_vmalloc_enabled(void) return static_branch_likely(&kasan_flag_vmalloc); } -static inline bool kasan_stack_collection_enabled(void) -{ - return static_branch_unlikely(&kasan_flag_stacktrace); -} - static inline bool kasan_async_fault_possible(void) { return kasan_mode == KASAN_MODE_ASYNC || kasan_mode == KASAN_MODE_ASYMM; @@ -46,11 +59,6 @@ static inline bool kasan_sync_fault_possible(void) #else /* CONFIG_KASAN_HW_TAGS */ -static inline bool kasan_stack_collection_enabled(void) -{ - return true; -} - static inline bool kasan_async_fault_possible(void) { return false; @@ -410,6 +418,10 @@ static inline void kasan_enable_tagging(void) { } #endif /* CONFIG_KASAN_HW_TAGS */ +#if defined(CONFIG_KASAN_SW_TAGS) || defined(CONFIG_KASAN_HW_TAGS) +void __init kasan_init_tags(void); +#endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ + #if defined(CONFIG_KASAN_HW_TAGS) && IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_force_async_fault(void); diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 77f13f391b57..a3afaf2ad1b1 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -42,7 +42,10 @@ void __init kasan_init_sw_tags(void) for_each_possible_cpu(cpu) per_cpu(prng_state, cpu) = (u32)get_cycles(); - pr_info("KernelAddressSanitizer initialized (sw-tags)\n"); + kasan_init_tags(); + + pr_info("KernelAddressSanitizer initialized (sw-tags, stacktrace=%s)\n", + kasan_stack_collection_enabled() ? "on" : "off"); } /* diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index a0524e037f49..dd929ab166fb 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -19,6 +19,17 @@ #include "kasan.h" #include "../slab.h" +enum kasan_arg_stacktrace { + KASAN_ARG_STACKTRACE_DEFAULT, + KASAN_ARG_STACKTRACE_OFF, + KASAN_ARG_STACKTRACE_ON, +}; + +static enum kasan_arg_stacktrace kasan_arg_stacktrace __initdata; + +/* Whether to collect alloc/free stack traces. */ +DEFINE_STATIC_KEY_TRUE(kasan_flag_stacktrace); + /* Non-zero, as initial pointer values are 0. */ #define STACK_RING_BUSY_PTR ((void *)1) @@ -26,6 +37,38 @@ struct kasan_stack_ring stack_ring = { .lock = __RW_LOCK_UNLOCKED(stack_ring.lock) }; +/* kasan.stacktrace=off/on */ +static int __init early_kasan_flag_stacktrace(char *arg) +{ + if (!arg) + return -EINVAL; + + if (!strcmp(arg, "off")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_OFF; + else if (!strcmp(arg, "on")) + kasan_arg_stacktrace = KASAN_ARG_STACKTRACE_ON; + else + return -EINVAL; + + return 0; +} +early_param("kasan.stacktrace", early_kasan_flag_stacktrace); + +void __init kasan_init_tags(void) +{ + switch (kasan_arg_stacktrace) { + case KASAN_ARG_STACKTRACE_DEFAULT: + /* Default is specified by kasan_flag_stacktrace definition. */ + break; + case KASAN_ARG_STACKTRACE_OFF: + static_branch_disable(&kasan_flag_stacktrace); + break; + case KASAN_ARG_STACKTRACE_ON: + static_branch_enable(&kasan_flag_stacktrace); + break; + } +} + static void save_stack_info(struct kmem_cache *cache, void *object, gfp_t gfp_flags, bool is_free) { From 80b92bfe3bb75aa6688f58af9df356757a46f659 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:47 +0200 Subject: [PATCH 167/350] kasan: dynamically allocate stack ring entries Instead of using a large static array, allocate the stack ring dynamically via memblock_alloc(). The size of the stack ring is controlled by a new kasan.stack_ring_size command-line parameter. When kasan.stack_ring_size is not provided, the default value of 32 << 10 is used. When the stack trace collection is disabled via kasan.stacktrace=off, the stack ring is not allocated. Link: https://lkml.kernel.org/r/03b82ab60db53427e9818e0b0c1971baa10c3cbc.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- Documentation/dev-tools/kasan.rst | 4 +++- mm/kasan/kasan.h | 5 ++--- mm/kasan/report_tags.c | 4 ++-- mm/kasan/tags.c | 25 ++++++++++++++++++++++++- 4 files changed, 31 insertions(+), 7 deletions(-) diff --git a/Documentation/dev-tools/kasan.rst b/Documentation/dev-tools/kasan.rst index 7bd38c181018..5c93ab915049 100644 --- a/Documentation/dev-tools/kasan.rst +++ b/Documentation/dev-tools/kasan.rst @@ -112,10 +112,12 @@ parameter can be used to control panic and reporting behaviour: if ``kasan_multi_shot`` is enabled. Software and Hardware Tag-Based KASAN modes (see the section about various -modes below) support disabling stack trace collection: +modes below) support altering stack trace collection behavior: - ``kasan.stacktrace=off`` or ``=on`` disables or enables alloc and free stack traces collection (default: ``on``). +- ``kasan.stack_ring_size=`` specifies the number of entries + in the stack ring (default: ``32768``). Hardware Tag-Based KASAN mode is intended for use in production as a security mitigation. Therefore, it supports additional boot parameters that allow diff --git a/mm/kasan/kasan.h b/mm/kasan/kasan.h index 447baf1a7a2e..abbcc1b0eec5 100644 --- a/mm/kasan/kasan.h +++ b/mm/kasan/kasan.h @@ -252,12 +252,11 @@ struct kasan_stack_ring_entry { bool is_free; }; -#define KASAN_STACK_RING_SIZE (32 << 10) - struct kasan_stack_ring { rwlock_t lock; + size_t size; atomic64_t pos; - struct kasan_stack_ring_entry entries[KASAN_STACK_RING_SIZE]; + struct kasan_stack_ring_entry *entries; }; #endif /* CONFIG_KASAN_SW_TAGS || CONFIG_KASAN_HW_TAGS */ diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 1b78136542bb..57f7355377f1 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -56,11 +56,11 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) * entries relevant to the buggy object can be overwritten. */ - for (u64 i = pos - 1; i != pos - 1 - KASAN_STACK_RING_SIZE; i--) { + for (u64 i = pos - 1; i != pos - 1 - stack_ring.size; i--) { if (alloc_found && free_found) break; - entry = &stack_ring.entries[i % KASAN_STACK_RING_SIZE]; + entry = &stack_ring.entries[i % stack_ring.size]; /* Paired with smp_store_release() in save_stack_info(). */ ptr = (void *)smp_load_acquire(&entry->ptr); diff --git a/mm/kasan/tags.c b/mm/kasan/tags.c index dd929ab166fb..67a222586846 100644 --- a/mm/kasan/tags.c +++ b/mm/kasan/tags.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -19,6 +20,8 @@ #include "kasan.h" #include "../slab.h" +#define KASAN_STACK_RING_SIZE_DEFAULT (32 << 10) + enum kasan_arg_stacktrace { KASAN_ARG_STACKTRACE_DEFAULT, KASAN_ARG_STACKTRACE_OFF, @@ -54,6 +57,16 @@ static int __init early_kasan_flag_stacktrace(char *arg) } early_param("kasan.stacktrace", early_kasan_flag_stacktrace); +/* kasan.stack_ring_size= */ +static int __init early_kasan_flag_stack_ring_size(char *arg) +{ + if (!arg) + return -EINVAL; + + return kstrtoul(arg, 0, &stack_ring.size); +} +early_param("kasan.stack_ring_size", early_kasan_flag_stack_ring_size); + void __init kasan_init_tags(void) { switch (kasan_arg_stacktrace) { @@ -67,6 +80,16 @@ void __init kasan_init_tags(void) static_branch_enable(&kasan_flag_stacktrace); break; } + + if (kasan_stack_collection_enabled()) { + if (!stack_ring.size) + stack_ring.size = KASAN_STACK_RING_SIZE_DEFAULT; + stack_ring.entries = memblock_alloc( + sizeof(stack_ring.entries[0]) * stack_ring.size, + SMP_CACHE_BYTES); + if (WARN_ON(!stack_ring.entries)) + static_branch_disable(&kasan_flag_stacktrace); + } } static void save_stack_info(struct kmem_cache *cache, void *object, @@ -88,7 +111,7 @@ static void save_stack_info(struct kmem_cache *cache, void *object, next: pos = atomic64_fetch_add(1, &stack_ring.pos); - entry = &stack_ring.entries[pos % KASAN_STACK_RING_SIZE]; + entry = &stack_ring.entries[pos % stack_ring.size]; /* Detect stack ring entry slots that are being written to. */ old_ptr = READ_ONCE(entry->ptr); From 1f538e1f2d294cf8a9486fb1a7d4d4f0d16e2b01 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:48 +0200 Subject: [PATCH 168/350] kasan: better identify bug types for tag-based modes Identify the bug type for the tag-based modes based on the stack trace entries found in the stack ring. If a free entry is found first (meaning that it was added last), mark the bug as use-after-free. If an alloc entry is found first, mark the bug as slab-out-of-bounds. Otherwise, assign the common bug type. This change returns the functionalify of the previously dropped CONFIG_KASAN_TAGS_IDENTIFY. Link: https://lkml.kernel.org/r/13ce7fa07d9d995caedd1439dfae4d51401842f2.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- mm/kasan/report_tags.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index 57f7355377f1..d3510424d29b 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -10,7 +10,7 @@ extern struct kasan_stack_ring stack_ring; -static const char *get_bug_type(struct kasan_report_info *info) +static const char *get_common_bug_type(struct kasan_report_info *info) { /* * If access_size is a negative number, then it has reason to be @@ -37,9 +37,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) bool is_free; bool alloc_found = false, free_found = false; - info->bug_type = get_bug_type(info); - - if (!info->cache || !info->object) + if (!info->cache || !info->object) { + info->bug_type = get_common_bug_type(info); return; } @@ -84,6 +83,13 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) info->free_track.pid = pid; info->free_track.stack = stack; free_found = true; + + /* + * If a free entry is found first, the bug is likely + * a use-after-free. + */ + if (!info->bug_type) + info->bug_type = "use-after-free"; } else { /* Second alloc of the same object. Give up. */ if (alloc_found) @@ -92,8 +98,19 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) info->alloc_track.pid = pid; info->alloc_track.stack = stack; alloc_found = true; + + /* + * If an alloc entry is found first, the bug is likely + * an out-of-bounds. + */ + if (!info->bug_type) + info->bug_type = "slab-out-of-bounds"; } } write_unlock_irqrestore(&stack_ring.lock, flags); + + /* Assign the common bug type if no entries were found. */ + if (!info->bug_type) + info->bug_type = get_common_bug_type(info); } From 34b592ce5cc2dbd7d94812bff12ec32d3ec6f65c Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Mon, 5 Sep 2022 23:05:49 +0200 Subject: [PATCH 169/350] kasan: add another use-after-free test Add a new use-after-free test that checks that KASAN detects use-after-free when another object was allocated in the same slot. This test is mainly relevant for the tag-based modes, which do not use quarantine. Once [1] is resolved, this test can be extended to check that the stack traces in the report point to the proper kmalloc/kfree calls. [1] https://bugzilla.kernel.org/show_bug.cgi?id=212203 Link: https://lkml.kernel.org/r/0659cfa15809dd38faa02bc0a59d0b5dbbd81211.1662411800.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Acked-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Evgenii Stepanov Cc: Peter Collingbourne Signed-off-by: Andrew Morton --- lib/test_kasan.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/lib/test_kasan.c b/lib/test_kasan.c index 58c1b01ccfe2..505f77ffad27 100644 --- a/lib/test_kasan.c +++ b/lib/test_kasan.c @@ -612,6 +612,29 @@ again: kfree(ptr2); } +/* + * Check that KASAN detects use-after-free when another object was allocated in + * the same slot. Relevant for the tag-based modes, which do not use quarantine. + */ +static void kmalloc_uaf3(struct kunit *test) +{ + char *ptr1, *ptr2; + size_t size = 100; + + /* This test is specifically crafted for tag-based modes. */ + KASAN_TEST_NEEDS_CONFIG_OFF(test, CONFIG_KASAN_GENERIC); + + ptr1 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr1); + kfree(ptr1); + + ptr2 = kmalloc(size, GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, ptr2); + kfree(ptr2); + + KUNIT_EXPECT_KASAN_FAIL(test, ((volatile char *)ptr1)[8]); +} + static void kfree_via_page(struct kunit *test) { char *ptr; @@ -1382,6 +1405,7 @@ static struct kunit_case kasan_kunit_test_cases[] = { KUNIT_CASE(kmalloc_uaf), KUNIT_CASE(kmalloc_uaf_memset), KUNIT_CASE(kmalloc_uaf2), + KUNIT_CASE(kmalloc_uaf3), KUNIT_CASE(kfree_via_page), KUNIT_CASE(kfree_via_phys), KUNIT_CASE(kmem_cache_oob), From f7e01ab828fd4bf6d25b1f143a3994241e8572bf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Tue, 6 Sep 2022 00:18:36 +0200 Subject: [PATCH 170/350] kasan: move tests to mm/kasan/ Move KASAN tests to mm/kasan/ to keep the test code alongside the implementation. Link: https://lkml.kernel.org/r/676398f0aeecd47d2f8e3369ea0e95563f641a36.1662416260.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Konovalov Cc: Andrey Ryabinin Cc: Dmitry Vyukov Cc: Marco Elver Signed-off-by: Andrew Morton --- MAINTAINERS | 1 - lib/Makefile | 5 ----- mm/kasan/Makefile | 8 ++++++++ lib/test_kasan.c => mm/kasan/kasan_test.c | 2 +- lib/test_kasan_module.c => mm/kasan/kasan_test_module.c | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) rename lib/test_kasan.c => mm/kasan/kasan_test.c (99%) rename lib/test_kasan_module.c => mm/kasan/kasan_test_module.c (99%) diff --git a/MAINTAINERS b/MAINTAINERS index c66b63ad83d8..6f1033f3c1ed 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10938,7 +10938,6 @@ F: arch/*/include/asm/*kasan.h F: arch/*/mm/kasan_init* F: include/linux/kasan*.h F: lib/Kconfig.kasan -F: lib/test_kasan*.c F: mm/kasan/ F: scripts/Makefile.kasan diff --git a/lib/Makefile b/lib/Makefile index 6dc0d6f8e57d..d7d94102991b 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -65,11 +65,6 @@ obj-$(CONFIG_TEST_SYSCTL) += test_sysctl.o obj-$(CONFIG_TEST_SIPHASH) += test_siphash.o obj-$(CONFIG_HASH_KUNIT_TEST) += test_hash.o obj-$(CONFIG_TEST_IDA) += test_ida.o -obj-$(CONFIG_KASAN_KUNIT_TEST) += test_kasan.o -CFLAGS_test_kasan.o += -fno-builtin -CFLAGS_test_kasan.o += $(call cc-disable-warning, vla) -obj-$(CONFIG_KASAN_MODULE_TEST) += test_kasan_module.o -CFLAGS_test_kasan_module.o += -fno-builtin obj-$(CONFIG_TEST_UBSAN) += test_ubsan.o CFLAGS_test_ubsan.o += $(call cc-disable-warning, vla) UBSAN_SANITIZE_test_ubsan.o := y diff --git a/mm/kasan/Makefile b/mm/kasan/Makefile index 1f84df9c302e..d4837bff3b60 100644 --- a/mm/kasan/Makefile +++ b/mm/kasan/Makefile @@ -35,7 +35,15 @@ CFLAGS_shadow.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_hw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) CFLAGS_sw_tags.o := $(CC_FLAGS_KASAN_RUNTIME) +CFLAGS_KASAN_TEST := $(CFLAGS_KASAN) -fno-builtin $(call cc-disable-warning, vla) + +CFLAGS_kasan_test.o := $(CFLAGS_KASAN_TEST) +CFLAGS_kasan_test_module.o := $(CFLAGS_KASAN_TEST) + obj-y := common.o report.o obj-$(CONFIG_KASAN_GENERIC) += init.o generic.o report_generic.o shadow.o quarantine.o obj-$(CONFIG_KASAN_HW_TAGS) += hw_tags.o report_hw_tags.o tags.o report_tags.o obj-$(CONFIG_KASAN_SW_TAGS) += init.o report_sw_tags.o shadow.o sw_tags.o tags.o report_tags.o + +obj-$(CONFIG_KASAN_KUNIT_TEST) += kasan_test.o +obj-$(CONFIG_KASAN_MODULE_TEST) += kasan_test_module.o diff --git a/lib/test_kasan.c b/mm/kasan/kasan_test.c similarity index 99% rename from lib/test_kasan.c rename to mm/kasan/kasan_test.c index 505f77ffad27..f25692def781 100644 --- a/lib/test_kasan.c +++ b/mm/kasan/kasan_test.c @@ -25,7 +25,7 @@ #include -#include "../mm/kasan/kasan.h" +#include "kasan.h" #define OOB_TAG_OFF (IS_ENABLED(CONFIG_KASAN_GENERIC) ? 0 : KASAN_GRANULE_SIZE) diff --git a/lib/test_kasan_module.c b/mm/kasan/kasan_test_module.c similarity index 99% rename from lib/test_kasan_module.c rename to mm/kasan/kasan_test_module.c index b112cbc835e9..e4ca82dc2c16 100644 --- a/lib/test_kasan_module.c +++ b/mm/kasan/kasan_test_module.c @@ -13,7 +13,7 @@ #include #include -#include "../mm/kasan/kasan.h" +#include "kasan.h" static noinline void __init copy_user_test(void) { From dcc579663f607392ade99a2301278239e819f57e Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Sun, 11 Sep 2022 01:25:30 +0200 Subject: [PATCH 171/350] kasan: better invalid/double-free report header Update the report header for invalid- and double-free bugs to contain the address being freed: BUG: KASAN: invalid-free in kfree+0x280/0x2a8 Free of addr ffff00000beac001 by task kunit_try_catch/99 Link: https://lkml.kernel.org/r/fce40f8dbd160972fe01a1ff39d0c426c310e4b7.1662852281.git.andreyknvl@google.com Signed-off-by: Andrey Konovalov Reviewed-by: Marco Elver Cc: Alexander Potapenko Cc: Andrey Ryabinin Cc: Dmitry Vyukov Signed-off-by: Andrew Morton --- mm/kasan/report.c | 29 +++++++++++++++++++---------- mm/kasan/report_generic.c | 3 ++- mm/kasan/report_tags.c | 2 +- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/mm/kasan/report.c b/mm/kasan/report.c index 39e8e5a80b82..df3602062bfd 100644 --- a/mm/kasan/report.c +++ b/mm/kasan/report.c @@ -175,17 +175,14 @@ static void end_report(unsigned long *flags, void *addr) static void print_error_description(struct kasan_report_info *info) { - if (info->type == KASAN_REPORT_INVALID_FREE) { - pr_err("BUG: KASAN: invalid-free in %pS\n", (void *)info->ip); - return; - } - - if (info->type == KASAN_REPORT_DOUBLE_FREE) { - pr_err("BUG: KASAN: double-free in %pS\n", (void *)info->ip); - return; - } - pr_err("BUG: KASAN: %s in %pS\n", info->bug_type, (void *)info->ip); + + if (info->type != KASAN_REPORT_ACCESS) { + pr_err("Free of addr %px by task %s/%d\n", + info->access_addr, current->comm, task_pid_nr(current)); + return; + } + if (info->access_size) pr_err("%s of size %zu at addr %px by task %s/%d\n", info->is_write ? "Write" : "Read", info->access_size, @@ -420,6 +417,18 @@ static void complete_report_info(struct kasan_report_info *info) } else info->cache = info->object = NULL; + switch (info->type) { + case KASAN_REPORT_INVALID_FREE: + info->bug_type = "invalid-free"; + break; + case KASAN_REPORT_DOUBLE_FREE: + info->bug_type = "double-free"; + break; + default: + /* bug_type filled in by kasan_complete_mode_report_info. */ + break; + } + /* Fill in mode-specific report info fields. */ kasan_complete_mode_report_info(info); } diff --git a/mm/kasan/report_generic.c b/mm/kasan/report_generic.c index 087c1d8c8145..043c94b04605 100644 --- a/mm/kasan/report_generic.c +++ b/mm/kasan/report_generic.c @@ -132,7 +132,8 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) struct kasan_alloc_meta *alloc_meta; struct kasan_free_meta *free_meta; - info->bug_type = get_bug_type(info); + if (!info->bug_type) + info->bug_type = get_bug_type(info); if (!info->cache || !info->object) return; diff --git a/mm/kasan/report_tags.c b/mm/kasan/report_tags.c index d3510424d29b..ecede06ef374 100644 --- a/mm/kasan/report_tags.c +++ b/mm/kasan/report_tags.c @@ -37,7 +37,7 @@ void kasan_complete_mode_report_info(struct kasan_report_info *info) bool is_free; bool alloc_found = false, free_found = false; - if (!info->cache || !info->object) { + if ((!info->cache || !info->object) && !info->bug_type) { info->bug_type = get_common_bug_type(info); return; } From 6a760f58c792b6f7411f886271bb03f697464433 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mika=20Penttil=C3=A4?= Date: Fri, 26 Aug 2022 08:06:31 +0300 Subject: [PATCH 172/350] mm/hmm/test: use char dev with struct device to get device node MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit HMM selftests use an in-kernel pseudo device to emulate device memory. The pseudo device registers a major device range for two or four pseudo device instances. User space has a script that reads /proc/devices in order to find the assigned major number, and sends that to mknod(1), once for each node. Change this to properly use cdev and struct device APIs. Delete the /proc/devices parsing from the user-space test script, now that it is unnecessary. Also, delete an unused field in struct dmirror_device: devmem. Link: https://lkml.kernel.org/r/20220826050631.25771-1-mpenttil@redhat.com Signed-off-by: Mika Penttilä Reviewed-by: John Hubbard Reviewed-by: Christoph Hellwig Reviewed-by: Jason Gunthorpe Cc: Alistair Popple Cc: Ralph Campbell Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/test_hmm.c | 13 ++++++++++--- tools/testing/selftests/vm/test_hmm.sh | 10 ---------- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e3965cafd27c..6a33f6b1b465 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -107,8 +107,8 @@ struct dmirror_chunk { */ struct dmirror_device { struct cdev cdevice; - struct hmm_devmem *devmem; unsigned int zone_device_type; + struct device device; unsigned int devmem_capacity; unsigned int devmem_count; @@ -1390,7 +1390,14 @@ static int dmirror_device_init(struct dmirror_device *mdevice, int id) cdev_init(&mdevice->cdevice, &dmirror_fops); mdevice->cdevice.owner = THIS_MODULE; - ret = cdev_add(&mdevice->cdevice, dev, 1); + device_initialize(&mdevice->device); + mdevice->device.devt = dev; + + ret = dev_set_name(&mdevice->device, "hmm_dmirror%u", id); + if (ret) + return ret; + + ret = cdev_device_add(&mdevice->cdevice, &mdevice->device); if (ret) return ret; @@ -1416,7 +1423,7 @@ static void dmirror_device_remove(struct dmirror_device *mdevice) kfree(mdevice->devmem_chunks); } - cdev_del(&mdevice->cdevice); + cdev_device_del(&mdevice->cdevice, &mdevice->device); } static int __init hmm_dmirror_init(void) diff --git a/tools/testing/selftests/vm/test_hmm.sh b/tools/testing/selftests/vm/test_hmm.sh index 539c9371e592..46e19b5d648d 100755 --- a/tools/testing/selftests/vm/test_hmm.sh +++ b/tools/testing/selftests/vm/test_hmm.sh @@ -52,21 +52,11 @@ load_driver() usage fi fi - if [ $? == 0 ]; then - major=$(awk "\$2==\"HMM_DMIRROR\" {print \$1}" /proc/devices) - mknod /dev/hmm_dmirror0 c $major 0 - mknod /dev/hmm_dmirror1 c $major 1 - if [ $# -eq 2 ]; then - mknod /dev/hmm_dmirror2 c $major 2 - mknod /dev/hmm_dmirror3 c $major 3 - fi - fi } unload_driver() { modprobe -r $DRIVER > /dev/null 2>&1 - rm -f /dev/hmm_dmirror? } run_smoke() From 36001cba4f728e7fa2a58bc69fece22eaeef5cca Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 6 Sep 2022 23:18:47 +0800 Subject: [PATCH 173/350] mm/damon/core: iterate the regions list from current point in damon_set_regions() We iterate the whole regions list every time to get the first/last regions intersecting with the specific range in damon_set_regions(), in order to add new region or resize existing regions to fit in the specific range. Actually, it is unnecessary to iterate the new added regions and the front regions that have been checked. Just iterate the regions list from the current point using list_for_each_entry_from() every time to improve performance. The kunit tests passed: [PASSED] damon_test_apply_three_regions1 [PASSED] damon_test_apply_three_regions2 [PASSED] damon_test_apply_three_regions3 [PASSED] damon_test_apply_three_regions4 Link: https://lkml.kernel.org/r/1662477527-13003-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 8 ++++++++ mm/damon/core.c | 3 ++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 7b1f4a488230..d54acec048d6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -463,9 +463,17 @@ static inline struct damon_region *damon_last_region(struct damon_target *t) return list_last_entry(&t->regions_list, struct damon_region, list); } +static inline struct damon_region *damon_first_region(struct damon_target *t) +{ + return list_first_entry(&t->regions_list, struct damon_region, list); +} + #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) +#define damon_for_each_region_from(r, t) \ + list_for_each_entry_from(r, &t->regions_list, list) + #define damon_for_each_region_safe(r, next, t) \ list_for_each_entry_safe(r, next, &t->regions_list, list) diff --git a/mm/damon/core.c b/mm/damon/core.c index 9964b9d00768..5e00c04ceef0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -195,6 +195,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, damon_destroy_region(r, t); } + r = damon_first_region(t); /* Add new regions or resize existing regions to fit in the ranges */ for (i = 0; i < nr_ranges; i++) { struct damon_region *first = NULL, *last, *newr; @@ -202,7 +203,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, range = &ranges[i]; /* Get the first/last regions intersecting with the range */ - damon_for_each_region(r, t) { + damon_for_each_region_from(r, t) { if (damon_intersect(r, range)) { if (!first) first = r; From 61768a1b37c664faf028d925e6b7825768afcc00 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Wed, 7 Sep 2022 16:41:16 +0800 Subject: [PATCH 174/350] mm/damon: simplify damon_ctx check in damon_sysfs_before_terminate In damon_sysfs_before_terminate(), it needs to check whether ctx->ops.id supports 'DAMON_OPS_VADDR' or 'DAMON_OPS_FVADDR', there we can use damon_target_has_pid() instead. Link: https://lkml.kernel.org/r/20220907084116.62053-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index fe6c6870cf86..1719bb3531e3 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2309,7 +2309,7 @@ static void damon_sysfs_before_terminate(struct damon_ctx *ctx) { struct damon_target *t, *next; - if (ctx->ops.id != DAMON_OPS_VADDR && ctx->ops.id != DAMON_OPS_FVADDR) + if (!damon_target_has_pid(ctx)) return; mutex_lock(&ctx->kdamond_lock); From 0bba9af03d55d2cc1aa7616a8b9e522ceb49d180 Mon Sep 17 00:00:00 2001 From: Zhenhua Huang Date: Wed, 7 Sep 2022 16:01:13 +0800 Subject: [PATCH 175/350] mm/page_owner.c: remove redundant drain_all_pages Remove an expensive and unnecessary operation as PCP pages are safely skipped when reading page owner.PCP pages can be skipped because PAGE_EXT_OWNER_ALLOCATED is cleared. With draining PCP pages, these pages are moved to buddy list so they can be identified as buddy pages and skipped quickly. Although it improved efficiency of PFN walker, the drain is guaranteed expensive that is unlikely to be offset by a slight increase in efficiency when skipping free pages. PAGE_EXT_OWNER_ALLOCATED is cleared in the page owner reset path below: free_unref_page -> free_unref_page_prepare -> free_pcp_prepare -> free_pages_prepare which do page owner reset -> free_unref_page_commit which add pages into pcp list Link: https://lkml.kernel.org/r/1662704326-15899-1-git-send-email-quic_zhenhuah@quicinc.com Link: https://lkml.kernel.org/r/1662633204-10044-1-git-send-email-quic_zhenhuah@quicinc.com Link: https://lkml.kernel.org/r/1662537673-9392-1-git-send-email-quic_zhenhuah@quicinc.com Signed-off-by: Zhenhua Huang Acked-by: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_owner.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 90023f938c19..54f3e039fb48 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -524,8 +524,6 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; - drain_all_pages(NULL); - /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { /* From 4f9bc69ac5ce34071a9a51343bc81ca76cb2e3f1 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:42 +0800 Subject: [PATCH 176/350] mm: reuse pageblock_start/end_pfn() macro Move pageblock_start_pfn/pageblock_end_pfn() into pageblock-flags.h, then they could be used somewhere else, not only in compaction, also use ALIGN_DOWN() instead of round_down() to be pair with ALIGN(), which should be same for pageblock usage. Link: https://lkml.kernel.org/r/20220907060844.126891-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 2 ++ mm/compaction.c | 2 -- mm/memblock.c | 2 +- mm/page_alloc.c | 13 ++++++------- mm/page_isolation.c | 11 +++++------ mm/page_owner.c | 4 ++-- 6 files changed, 16 insertions(+), 18 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 83c7248053a1..a09b7fe6bbf8 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,8 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) +#define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) /* Forward declaration */ struct page; diff --git a/mm/compaction.c b/mm/compaction.c index 262c4676b32c..9cbe8562b63a 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -52,8 +52,6 @@ static inline void count_compact_events(enum vm_event_item item, long delta) #define block_start_pfn(pfn, order) round_down(pfn, 1UL << (order)) #define block_end_pfn(pfn, order) ALIGN((pfn) + 1, 1UL << (order)) -#define pageblock_start_pfn(pfn) block_start_pfn(pfn, pageblock_order) -#define pageblock_end_pfn(pfn) block_end_pfn(pfn, pageblock_order) /* * Page order with-respect-to which proactive compaction diff --git a/mm/memblock.c b/mm/memblock.c index b5d3026979fc..46fe7575f03c 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2000,7 +2000,7 @@ static void __init free_unused_memmap(void) * presume that there are no holes in the memory map inside * a pageblock */ - start = round_down(start, pageblock_nr_pages); + start = pageblock_start_pfn(start); /* * If we had a previous bank, and there is a space diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 44f3c9364316..1637db90472e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -544,7 +544,7 @@ static inline int pfn_to_bitidx(const struct page *page, unsigned long pfn) #ifdef CONFIG_SPARSEMEM pfn &= (PAGES_PER_SECTION-1); #else - pfn = pfn - round_down(page_zone(page)->zone_start_pfn, pageblock_nr_pages); + pfn = pfn - pageblock_start_pfn(page_zone(page)->zone_start_pfn); #endif /* CONFIG_SPARSEMEM */ return (pfn >> pageblock_order) * NR_PAGEBLOCK_BITS; } @@ -1857,7 +1857,7 @@ void set_zone_contiguous(struct zone *zone) unsigned long block_start_pfn = zone->zone_start_pfn; unsigned long block_end_pfn; - block_end_pfn = ALIGN(block_start_pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(block_start_pfn); for (; block_start_pfn < zone_end_pfn(zone); block_start_pfn = block_end_pfn, block_end_pfn += pageblock_nr_pages) { @@ -2653,8 +2653,8 @@ int move_freepages_block(struct zone *zone, struct page *page, *num_movable = 0; pfn = page_to_pfn(page); - start_pfn = pfn & ~(pageblock_nr_pages - 1); - end_pfn = start_pfn + pageblock_nr_pages - 1; + start_pfn = pageblock_start_pfn(pfn); + end_pfn = pageblock_end_pfn(pfn) - 1; /* Do not cross zone boundaries */ if (!zone_spans_pfn(zone, start_pfn)) @@ -6934,9 +6934,8 @@ static void __init init_unavailable_range(unsigned long spfn, u64 pgcnt = 0; for (pfn = spfn; pfn < epfn; pfn++) { - if (!pfn_valid(ALIGN_DOWN(pfn, pageblock_nr_pages))) { - pfn = ALIGN_DOWN(pfn, pageblock_nr_pages) - + pageblock_nr_pages - 1; + if (!pfn_valid(pageblock_start_pfn(pfn))) { + pfn = pageblock_end_pfn(pfn) - 1; continue; } __init_single_page(pfn_to_page(pfn), pfn, zone, node); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index eb3a68ca92ad..5819cb9c62f3 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -37,8 +37,8 @@ static struct page *has_unmovable_pages(unsigned long start_pfn, unsigned long e struct zone *zone = page_zone(page); unsigned long pfn; - VM_BUG_ON(ALIGN_DOWN(start_pfn, pageblock_nr_pages) != - ALIGN_DOWN(end_pfn - 1, pageblock_nr_pages)); + VM_BUG_ON(pageblock_start_pfn(start_pfn) != + pageblock_start_pfn(end_pfn - 1)); if (is_migrate_cma_page(page)) { /* @@ -172,7 +172,7 @@ static int set_migratetype_isolate(struct page *page, int migratetype, int isol_ * to avoid redundant checks. */ check_unmovable_start = max(page_to_pfn(page), start_pfn); - check_unmovable_end = min(ALIGN(page_to_pfn(page) + 1, pageblock_nr_pages), + check_unmovable_end = min(pageblock_end_pfn(page_to_pfn(page)), end_pfn); unmovable = has_unmovable_pages(check_unmovable_start, check_unmovable_end, @@ -532,7 +532,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; struct page *page; /* isolation is done at page block granularity */ - unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); + unsigned long isolate_start = pageblock_start_pfn(start_pfn); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); int ret; bool skip_isolation = false; @@ -579,10 +579,9 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, { unsigned long pfn; struct page *page; - unsigned long isolate_start = ALIGN_DOWN(start_pfn, pageblock_nr_pages); + unsigned long isolate_start = pageblock_start_pfn(start_pfn); unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); - for (pfn = isolate_start; pfn < isolate_end; pfn += pageblock_nr_pages) { diff --git a/mm/page_owner.c b/mm/page_owner.c index 54f3e039fb48..2d27f532df4c 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -297,7 +297,7 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, continue; } - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); pageblock_mt = get_pageblock_migratetype(page); @@ -635,7 +635,7 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) continue; } - block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); + block_end_pfn = pageblock_end_pfn(pfn); block_end_pfn = min(block_end_pfn, end_pfn); for (; pfn < block_end_pfn; pfn++) { From 5f7fa13fa858c17580ed513bd5e0a4b36d68fdd6 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:43 +0800 Subject: [PATCH 177/350] mm: add pageblock_align() macro Add pageblock_align() macro and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Reviewed-by: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + mm/memblock.c | 4 ++-- mm/page_isolation.c | 4 ++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index a09b7fe6bbf8..293c76630fa8 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -53,6 +53,7 @@ extern unsigned int pageblock_order; #endif /* CONFIG_HUGETLB_PAGE */ #define pageblock_nr_pages (1UL << pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) diff --git a/mm/memblock.c b/mm/memblock.c index 46fe7575f03c..511d4783dcf1 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -2014,12 +2014,12 @@ static void __init free_unused_memmap(void) * presume that there are no holes in the memory map inside * a pageblock */ - prev_end = ALIGN(end, pageblock_nr_pages); + prev_end = pageblock_align(end); } #ifdef CONFIG_SPARSEMEM if (!IS_ALIGNED(prev_end, PAGES_PER_SECTION)) { - prev_end = ALIGN(end, pageblock_nr_pages); + prev_end = pageblock_align(end); free_memmap(prev_end, ALIGN(prev_end, PAGES_PER_SECTION)); } #endif diff --git a/mm/page_isolation.c b/mm/page_isolation.c index 5819cb9c62f3..fa82faa07daf 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -533,7 +533,7 @@ int start_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, struct page *page; /* isolation is done at page block granularity */ unsigned long isolate_start = pageblock_start_pfn(start_pfn); - unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); + unsigned long isolate_end = pageblock_align(end_pfn); int ret; bool skip_isolation = false; @@ -580,7 +580,7 @@ void undo_isolate_page_range(unsigned long start_pfn, unsigned long end_pfn, unsigned long pfn; struct page *page; unsigned long isolate_start = pageblock_start_pfn(start_pfn); - unsigned long isolate_end = ALIGN(end_pfn, pageblock_nr_pages); + unsigned long isolate_end = pageblock_align(end_pfn); for (pfn = isolate_start; pfn < isolate_end; From ee0913c4719610204315a0d8a35122c6233249e0 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 14:08:44 +0800 Subject: [PATCH 178/350] mm: add pageblock_aligned() macro Add pageblock_aligned() and use it to simplify code. Link: https://lkml.kernel.org/r/20220907060844.126891-3-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Acked-by: Mike Rapoport Cc: David Hildenbrand Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/pageblock-flags.h | 1 + mm/compaction.c | 8 ++++---- mm/memory_hotplug.c | 6 ++---- mm/page_alloc.c | 17 +++++++---------- mm/page_isolation.c | 2 +- 5 files changed, 15 insertions(+), 19 deletions(-) diff --git a/include/linux/pageblock-flags.h b/include/linux/pageblock-flags.h index 293c76630fa8..5f1ae07d724b 100644 --- a/include/linux/pageblock-flags.h +++ b/include/linux/pageblock-flags.h @@ -54,6 +54,7 @@ extern unsigned int pageblock_order; #define pageblock_nr_pages (1UL << pageblock_order) #define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_aligned(pfn) IS_ALIGNED((pfn), pageblock_nr_pages) #define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) #define pageblock_end_pfn(pfn) ALIGN((pfn) + 1, pageblock_nr_pages) diff --git a/mm/compaction.c b/mm/compaction.c index 9cbe8562b63a..e2a9615f5fde 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -402,7 +402,7 @@ static bool test_and_set_skip(struct compact_control *cc, struct page *page, if (cc->ignore_skip_hint) return false; - if (!IS_ALIGNED(pfn, pageblock_nr_pages)) + if (!pageblock_aligned(pfn)) return false; skip = get_pageblock_skip(page); @@ -884,7 +884,7 @@ isolate_migratepages_block(struct compact_control *cc, unsigned long low_pfn, * COMPACT_CLUSTER_MAX at a time so the second call must * not falsely conclude that the block should be skipped. */ - if (!valid_page && IS_ALIGNED(low_pfn, pageblock_nr_pages)) { + if (!valid_page && pageblock_aligned(low_pfn)) { if (!isolation_suitable(cc, page)) { low_pfn = end_pfn; page = NULL; @@ -1937,7 +1937,7 @@ static isolate_migrate_t isolate_migratepages(struct compact_control *cc) * before making it "skip" so other compaction instances do * not scan the same block. */ - if (IS_ALIGNED(low_pfn, pageblock_nr_pages) && + if (pageblock_aligned(low_pfn) && !fast_find_block && !isolation_suitable(cc, page)) continue; @@ -2123,7 +2123,7 @@ static enum compact_result __compact_finished(struct compact_control *cc) * migration source is unmovable/reclaimable but it's not worth * special casing. */ - if (!IS_ALIGNED(cc->migrate_pfn, pageblock_nr_pages)) + if (!pageblock_aligned(cc->migrate_pfn)) return COMPACT_CONTINUE; /* Direct compactor: Is a suitable page free? */ diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9ae1f98548b1..fd40f7e9f176 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1085,8 +1085,7 @@ int __ref online_pages(unsigned long pfn, unsigned long nr_pages, * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ - if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(pfn, pageblock_nr_pages) || + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(pfn) || !IS_ALIGNED(pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; @@ -1806,8 +1805,7 @@ int __ref offline_pages(unsigned long start_pfn, unsigned long nr_pages, * of the physical memory space for vmemmaps. That space is pageblock * aligned. */ - if (WARN_ON_ONCE(!nr_pages || - !IS_ALIGNED(start_pfn, pageblock_nr_pages) || + if (WARN_ON_ONCE(!nr_pages || !pageblock_aligned(start_pfn) || !IS_ALIGNED(start_pfn + nr_pages, PAGES_PER_SECTION))) return -EINVAL; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1637db90472e..0002ded4ab0e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1892,15 +1892,14 @@ static void __init deferred_free_range(unsigned long pfn, page = pfn_to_page(pfn); /* Free a large naturally-aligned chunk if possible */ - if (nr_pages == pageblock_nr_pages && - (pfn & (pageblock_nr_pages - 1)) == 0) { + if (nr_pages == pageblock_nr_pages && pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, pageblock_order); return; } for (i = 0; i < nr_pages; i++, page++, pfn++) { - if ((pfn & (pageblock_nr_pages - 1)) == 0) + if (pageblock_aligned(pfn)) set_pageblock_migratetype(page, MIGRATE_MOVABLE); __free_pages_core(page, 0); } @@ -1928,7 +1927,7 @@ static inline void __init pgdat_init_report_one_done(void) */ static inline bool __init deferred_pfn_valid(unsigned long pfn) { - if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + if (pageblock_aligned(pfn) && !pfn_valid(pfn)) return false; return true; } @@ -1940,14 +1939,13 @@ static inline bool __init deferred_pfn_valid(unsigned long pfn) static void __init deferred_free_pages(unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; unsigned long nr_free = 0; for (; pfn < end_pfn; pfn++) { if (!deferred_pfn_valid(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 0; - } else if (!(pfn & nr_pgmask)) { + } else if (pageblock_aligned(pfn)) { deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; } else { @@ -1967,7 +1965,6 @@ static unsigned long __init deferred_init_pages(struct zone *zone, unsigned long pfn, unsigned long end_pfn) { - unsigned long nr_pgmask = pageblock_nr_pages - 1; int nid = zone_to_nid(zone); unsigned long nr_pages = 0; int zid = zone_idx(zone); @@ -1977,7 +1974,7 @@ static unsigned long __init deferred_init_pages(struct zone *zone, if (!deferred_pfn_valid(pfn)) { page = NULL; continue; - } else if (!page || !(pfn & nr_pgmask)) { + } else if (!page || pageblock_aligned(pfn)) { page = pfn_to_page(pfn); } else { page++; @@ -6759,7 +6756,7 @@ void __meminit memmap_init_range(unsigned long size, int nid, unsigned long zone * such that unmovable allocations won't be scattered all * over the place during system boot. */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, migratetype); cond_resched(); } @@ -6802,7 +6799,7 @@ static void __ref __init_zone_device_page(struct page *page, unsigned long pfn, * Please note that MEMINIT_HOTPLUG path doesn't clear memmap * because this is done early in section_activate() */ - if (IS_ALIGNED(pfn, pageblock_nr_pages)) { + if (pageblock_aligned(pfn)) { set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } diff --git a/mm/page_isolation.c b/mm/page_isolation.c index fa82faa07daf..04141a9bea70 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -312,7 +312,7 @@ static int isolate_single_pageblock(unsigned long boundary_pfn, int flags, struct zone *zone; int ret; - VM_BUG_ON(!IS_ALIGNED(boundary_pfn, pageblock_nr_pages)); + VM_BUG_ON(!pageblock_aligned(boundary_pfn)); if (isolate_before) isolate_pageblock = boundary_pfn - pageblock_nr_pages; From fc5dfebc8055426299739dd1a7828af9638c94fb Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Wed, 7 Sep 2022 16:26:43 +0800 Subject: [PATCH 179/350] memblock tests: add new pageblock related macro Add new pageblock_start_pfn() and pageblock_align() macro which are needed by memblock tests. Link: https://lkml.kernel.org/r/20220907082643.186979-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Cc: David Hildenbrand Cc: Mike Rapoport Cc: Oscar Salvador Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/testing/memblock/linux/mmzone.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/testing/memblock/linux/mmzone.h b/tools/testing/memblock/linux/mmzone.h index 7c2eb5c9bb54..e65f89b12f1c 100644 --- a/tools/testing/memblock/linux/mmzone.h +++ b/tools/testing/memblock/linux/mmzone.h @@ -22,6 +22,8 @@ enum zone_type { #define pageblock_order (MAX_ORDER - 1) #define pageblock_nr_pages BIT(pageblock_order) +#define pageblock_align(pfn) ALIGN((pfn), pageblock_nr_pages) +#define pageblock_start_pfn(pfn) ALIGN_DOWN((pfn), pageblock_nr_pages) struct zone { atomic_long_t managed_pages; From 410f8e82689e1e66044fea51ef852054a09502b7 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:35 +0000 Subject: [PATCH 180/350] memcg: extract memcg_vmstats from struct mem_cgroup Patch series "memcg: reduce memory overhead of memory cgroups". Currently a lot of memory is wasted to maintain the vmevents for memory cgroups as we have multiple arrays of size NR_VM_EVENT_ITEMS which can be as large as 110. However memcg code uses small portion of those entries. This patch series eliminate this overhead by removing the unneeded vmevent entries from memory cgroup data structures. This patch (of 3): This is a preparatory patch to reduce the memory overhead of memory cgroup. The struct memcg_vmstats is the largest object embedded into the struct mem_cgroup. This patch extracts struct memcg_vmstats from struct mem_cgroup to ease the following patches in reducing the size of struct memcg_vmstats. Link: https://lkml.kernel.org/r/20220907043537.3457014-1-shakeelb@google.com Link: https://lkml.kernel.org/r/20220907043537.3457014-2-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 37 +++---------------------- mm/memcontrol.c | 57 ++++++++++++++++++++++++++++++++------ 2 files changed, 52 insertions(+), 42 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ca0df42662ad..dc7d40e575d5 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -80,29 +80,8 @@ enum mem_cgroup_events_target { MEM_CGROUP_NTARGETS, }; -struct memcg_vmstats_percpu { - /* Local (CPU and cgroup) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Delta calculation for lockless upward propagation */ - long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; - - /* Cgroup1: threshold notifications & softlimit tree updates */ - unsigned long nr_page_events; - unsigned long targets[MEM_CGROUP_NTARGETS]; -}; - -struct memcg_vmstats { - /* Aggregated (CPU and subtree) page state & events */ - long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; - - /* Pending child counts during tree propagation */ - long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; -}; +struct memcg_vmstats_percpu; +struct memcg_vmstats; struct mem_cgroup_reclaim_iter { struct mem_cgroup *position; @@ -298,7 +277,7 @@ struct mem_cgroup { CACHELINE_PADDING(_pad1_); /* memory.stat */ - struct memcg_vmstats vmstats; + struct memcg_vmstats *vmstats; /* memory.events */ atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; @@ -1001,15 +980,7 @@ static inline void mod_memcg_page_state(struct page *page, rcu_read_unlock(); } -static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) -{ - long x = READ_ONCE(memcg->vmstats.state[idx]); -#ifdef CONFIG_SMP - if (x < 0) - x = 0; -#endif - return x; -} +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx); static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 632402001bca..0a44a733bb03 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -669,6 +669,40 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +struct memcg_vmstats_percpu { + /* Local (CPU and cgroup) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Delta calculation for lockless upward propagation */ + long state_prev[MEMCG_NR_STAT]; + unsigned long events_prev[NR_VM_EVENT_ITEMS]; + + /* Cgroup1: threshold notifications & softlimit tree updates */ + unsigned long nr_page_events; + unsigned long targets[MEM_CGROUP_NTARGETS]; +}; + +struct memcg_vmstats { + /* Aggregated (CPU and subtree) page state & events */ + long state[MEMCG_NR_STAT]; + unsigned long events[NR_VM_EVENT_ITEMS]; + + /* Pending child counts during tree propagation */ + long state_pending[MEMCG_NR_STAT]; + unsigned long events_pending[NR_VM_EVENT_ITEMS]; +}; + +unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) +{ + long x = READ_ONCE(memcg->vmstats->state[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; +} + /** * __mod_memcg_state - update cgroup memory statistics * @memcg: the memory cgroup @@ -827,7 +861,7 @@ void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats.events[event]); + return READ_ONCE(memcg->vmstats->events[event]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) @@ -5170,6 +5204,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); + kfree(memcg->vmstats); free_percpu(memcg->vmstats_percpu); kfree(memcg); } @@ -5199,6 +5234,10 @@ static struct mem_cgroup *mem_cgroup_alloc(void) goto fail; } + memcg->vmstats = kzalloc(sizeof(struct memcg_vmstats), GFP_KERNEL); + if (!memcg->vmstats) + goto fail; + memcg->vmstats_percpu = alloc_percpu_gfp(struct memcg_vmstats_percpu, GFP_KERNEL_ACCOUNT); if (!memcg->vmstats_percpu) @@ -5418,9 +5457,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) * below us. We're in a per-cpu loop here and this is * a global counter, so the first cycle will get them. */ - delta = memcg->vmstats.state_pending[i]; + delta = memcg->vmstats->state_pending[i]; if (delta) - memcg->vmstats.state_pending[i] = 0; + memcg->vmstats->state_pending[i] = 0; /* Add CPU changes on this level since the last flush */ v = READ_ONCE(statc->state[i]); @@ -5433,15 +5472,15 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) continue; /* Aggregate counts on this level and propagate upwards */ - memcg->vmstats.state[i] += delta; + memcg->vmstats->state[i] += delta; if (parent) - parent->vmstats.state_pending[i] += delta; + parent->vmstats->state_pending[i] += delta; } for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { - delta = memcg->vmstats.events_pending[i]; + delta = memcg->vmstats->events_pending[i]; if (delta) - memcg->vmstats.events_pending[i] = 0; + memcg->vmstats->events_pending[i] = 0; v = READ_ONCE(statc->events[i]); if (v != statc->events_prev[i]) { @@ -5452,9 +5491,9 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) if (!delta) continue; - memcg->vmstats.events[i] += delta; + memcg->vmstats->events[i] += delta; if (parent) - parent->vmstats.events_pending[i] += delta; + parent->vmstats->events_pending[i] += delta; } for_each_node_state(nid, N_MEMORY) { From d396def5d86dbeb4ceb4a9dca92611ce206dc66a Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:36 +0000 Subject: [PATCH 181/350] memcg: rearrange code This is a preparatory patch for easing the review of the follow up patch which will reduce the memory overhead of memory cgroups. Link: https://lkml.kernel.org/r/20220907043537.3457014-3-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 46 +++++++++++++++++++++++----------------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0a44a733bb03..78fd7cfb4f92 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -669,6 +669,29 @@ static void flush_memcg_stats_dwork(struct work_struct *w) queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME); } +/* Subset of vm_event_item to report for memcg event stats */ +static const unsigned int memcg_vm_event_stat[] = { + PGSCAN_KSWAPD, + PGSCAN_DIRECT, + PGSTEAL_KSWAPD, + PGSTEAL_DIRECT, + PGFAULT, + PGMAJFAULT, + PGREFILL, + PGACTIVATE, + PGDEACTIVATE, + PGLAZYFREE, + PGLAZYFREED, +#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) + ZSWPIN, + ZSWPOUT, +#endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + THP_FAULT_ALLOC, + THP_COLLAPSE_ALLOC, +#endif +}; + struct memcg_vmstats_percpu { /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; @@ -1501,29 +1524,6 @@ static inline unsigned long memcg_page_state_output(struct mem_cgroup *memcg, return memcg_page_state(memcg, item) * memcg_page_state_unit(item); } -/* Subset of vm_event_item to report for memcg event stats */ -static const unsigned int memcg_vm_event_stat[] = { - PGSCAN_KSWAPD, - PGSCAN_DIRECT, - PGSTEAL_KSWAPD, - PGSTEAL_DIRECT, - PGFAULT, - PGMAJFAULT, - PGREFILL, - PGACTIVATE, - PGDEACTIVATE, - PGLAZYFREE, - PGLAZYFREED, -#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) - ZSWPIN, - ZSWPOUT, -#endif -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - THP_FAULT_ALLOC, - THP_COLLAPSE_ALLOC, -#endif -}; - static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) { struct seq_buf s; From 8278f1c7b4920105f2f30a8df9b8212b378101d2 Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Wed, 7 Sep 2022 04:35:37 +0000 Subject: [PATCH 182/350] memcg: reduce size of memcg vmstats structures The struct memcg_vmstats and struct memcg_vmstats_percpu contains two arrays each for events of size NR_VM_EVENT_ITEMS which can be as large as 110. However the memcg v1 only uses 4 of those while memcg v2 uses 15. The union of both is 17. On a 64 bit system, we are wasting approximately ((110 - 17) * 8 * 2) * (nr_cpus + 1) bytes which is significant on large machines. This patch reduces the size of the given structures by adding one indirection and only stores array of events which are actually used by the memcg code. With this patch, the size of memcg_vmstats has reduced from 2544 bytes to 1056 bytes while the size of memcg_vmstats_percpu has reduced from 2568 bytes to 1080 bytes. [akpm@linux-foundation.org: fix memcg_events_local() array index, per Shakeel] Link: https://lkml.kernel.org/r/CALvZod70Mvxr+Nzb6k0yiU2RFYjTD=0NFhKK-Eyp+5ejd1PSFw@mail.gmail.com Link: https://lkml.kernel.org/r/20220907043537.3457014-4-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Roman Gushchin Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Signed-off-by: Andrew Morton --- mm/memcontrol.c | 54 ++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 44 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 78fd7cfb4f92..1f204a262054 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -671,6 +671,8 @@ static void flush_memcg_stats_dwork(struct work_struct *w) /* Subset of vm_event_item to report for memcg event stats */ static const unsigned int memcg_vm_event_stat[] = { + PGPGIN, + PGPGOUT, PGSCAN_KSWAPD, PGSCAN_DIRECT, PGSTEAL_KSWAPD, @@ -692,14 +694,30 @@ static const unsigned int memcg_vm_event_stat[] = { #endif }; +#define NR_MEMCG_EVENTS ARRAY_SIZE(memcg_vm_event_stat) +static int mem_cgroup_events_index[NR_VM_EVENT_ITEMS] __read_mostly; + +static void init_memcg_events(void) +{ + int i; + + for (i = 0; i < NR_MEMCG_EVENTS; ++i) + mem_cgroup_events_index[memcg_vm_event_stat[i]] = i + 1; +} + +static inline int memcg_events_index(enum vm_event_item idx) +{ + return mem_cgroup_events_index[idx] - 1; +} + struct memcg_vmstats_percpu { /* Local (CPU and cgroup) page state & events */ long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long events[NR_MEMCG_EVENTS]; /* Delta calculation for lockless upward propagation */ long state_prev[MEMCG_NR_STAT]; - unsigned long events_prev[NR_VM_EVENT_ITEMS]; + unsigned long events_prev[NR_MEMCG_EVENTS]; /* Cgroup1: threshold notifications & softlimit tree updates */ unsigned long nr_page_events; @@ -709,11 +727,11 @@ struct memcg_vmstats_percpu { struct memcg_vmstats { /* Aggregated (CPU and subtree) page state & events */ long state[MEMCG_NR_STAT]; - unsigned long events[NR_VM_EVENT_ITEMS]; + unsigned long events[NR_MEMCG_EVENTS]; /* Pending child counts during tree propagation */ long state_pending[MEMCG_NR_STAT]; - unsigned long events_pending[NR_VM_EVENT_ITEMS]; + unsigned long events_pending[NR_MEMCG_EVENTS]; }; unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) @@ -873,27 +891,37 @@ void __mod_lruvec_kmem_state(void *p, enum node_stat_item idx, int val) void __count_memcg_events(struct mem_cgroup *memcg, enum vm_event_item idx, unsigned long count) { - if (mem_cgroup_disabled()) + int index = memcg_events_index(idx); + + if (mem_cgroup_disabled() || index < 0) return; memcg_stats_lock(); - __this_cpu_add(memcg->vmstats_percpu->events[idx], count); + __this_cpu_add(memcg->vmstats_percpu->events[index], count); memcg_rstat_updated(memcg, count); memcg_stats_unlock(); } static unsigned long memcg_events(struct mem_cgroup *memcg, int event) { - return READ_ONCE(memcg->vmstats->events[event]); + int index = memcg_events_index(event); + + if (index < 0) + return 0; + return READ_ONCE(memcg->vmstats->events[index]); } static unsigned long memcg_events_local(struct mem_cgroup *memcg, int event) { long x = 0; int cpu; + int index = memcg_events_index(event); + + if (index < 0) + return 0; for_each_possible_cpu(cpu) - x += per_cpu(memcg->vmstats_percpu->events[event], cpu); + x += per_cpu(memcg->vmstats_percpu->events[index], cpu); return x; } @@ -1564,10 +1592,15 @@ static void memory_stat_format(struct mem_cgroup *memcg, char *buf, int bufsize) memcg_events(memcg, PGSTEAL_KSWAPD) + memcg_events(memcg, PGSTEAL_DIRECT)); - for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) + for (i = 0; i < ARRAY_SIZE(memcg_vm_event_stat); i++) { + if (memcg_vm_event_stat[i] == PGPGIN || + memcg_vm_event_stat[i] == PGPGOUT) + continue; + seq_buf_printf(&s, "%s %lu\n", vm_event_name(memcg_vm_event_stat[i]), memcg_events(memcg, memcg_vm_event_stat[i])); + } /* The above should easily fit into one page */ WARN_ON_ONCE(seq_buf_has_overflowed(&s)); @@ -5309,6 +5342,7 @@ mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) page_counter_init(&memcg->kmem, &parent->kmem); page_counter_init(&memcg->tcpmem, &parent->tcpmem); } else { + init_memcg_events(); page_counter_init(&memcg->memory, NULL); page_counter_init(&memcg->swap, NULL); page_counter_init(&memcg->kmem, NULL); @@ -5477,7 +5511,7 @@ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu) parent->vmstats->state_pending[i] += delta; } - for (i = 0; i < NR_VM_EVENT_ITEMS; i++) { + for (i = 0; i < NR_MEMCG_EVENTS; i++) { delta = memcg->vmstats->events_pending[i]; if (delta) memcg->vmstats->events_pending[i] = 0; From 4e07acdda7fc23f5c4666e54961ef972a1195ffd Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 17:35:30 +0800 Subject: [PATCH 183/350] mm/hwpoison: add __init/__exit annotations to module init/exit funcs Add missing __init/__exit annotations to module init/exit funcs. Link: https://lkml.kernel.org/r/20220906093530.243262-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Reviewed-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/hwpoison-inject.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hwpoison-inject.c b/mm/hwpoison-inject.c index 65e242b5a432..d0548e382b6b 100644 --- a/mm/hwpoison-inject.c +++ b/mm/hwpoison-inject.c @@ -63,13 +63,13 @@ static int hwpoison_unpoison(void *data, u64 val) DEFINE_DEBUGFS_ATTRIBUTE(hwpoison_fops, NULL, hwpoison_inject, "%lli\n"); DEFINE_DEBUGFS_ATTRIBUTE(unpoison_fops, NULL, hwpoison_unpoison, "%lli\n"); -static void pfn_inject_exit(void) +static void __exit pfn_inject_exit(void) { hwpoison_filter_enable = 0; debugfs_remove_recursive(hwpoison_dir); } -static int pfn_inject_init(void) +static int __init pfn_inject_init(void) { hwpoison_dir = debugfs_create_dir("hwpoison", NULL); From 679d7f69d60bbd124542e620b745c17643cdf680 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 6 Sep 2022 15:53:12 +0800 Subject: [PATCH 184/350] mm/rodata_test: use PAGE_ALIGNED() helper Use PAGE_ALIGNED() helper instead of open-coding operation, no functional changes here. Link: https://lkml.kernel.org/r/20220906075312.166595-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Andrew Morton --- mm/rodata_test.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/mm/rodata_test.c b/mm/rodata_test.c index 2613371945b7..6d783436951f 100644 --- a/mm/rodata_test.c +++ b/mm/rodata_test.c @@ -9,13 +9,13 @@ #include #include +#include #include static const int rodata_test_data = 0xC3; void rodata_test(void) { - unsigned long start, end; int zero = 0; /* test 1: read the value */ @@ -39,13 +39,11 @@ void rodata_test(void) } /* test 4: check if the rodata section is PAGE_SIZE aligned */ - start = (unsigned long)__start_rodata; - end = (unsigned long)__end_rodata; - if (start & (PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(__start_rodata)) { pr_err("start of .rodata is not page size aligned\n"); return; } - if (end & (PAGE_SIZE - 1)) { + if (!PAGE_ALIGNED(__end_rodata)) { pr_err("end of .rodata is not page size aligned\n"); return; } From f5a79d7c0c87c8d88bb5e3f3c898258fdf1b3b05 Mon Sep 17 00:00:00 2001 From: Yajun Deng Date: Thu, 8 Sep 2022 19:14:43 +0000 Subject: [PATCH 185/350] mm/damon: introduce struct damos_access_pattern damon_new_scheme() has too many parameters, so introduce struct damos_access_pattern to simplify it. In additon, we can't use a bpf trace kprobe that has more than 5 parameters. Link: https://lkml.kernel.org/r/20220908191443.129534-1-sj@kernel.org Signed-off-by: Yajun Deng Signed-off-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 37 ++++++++++++++++++---------------- mm/damon/core.c | 31 ++++++++++++++--------------- mm/damon/dbgfs.c | 27 +++++++++++++++---------- mm/damon/lru_sort.c | 46 ++++++++++++++++++++++++++----------------- mm/damon/reclaim.c | 23 +++++++++++++--------- mm/damon/sysfs.c | 17 +++++++++++----- 6 files changed, 106 insertions(+), 75 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index d54acec048d6..90f20675da22 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -216,13 +216,26 @@ struct damos_stat { }; /** - * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * struct damos_access_pattern - Target access pattern of the given scheme. * @min_sz_region: Minimum size of target regions. * @max_sz_region: Maximum size of target regions. * @min_nr_accesses: Minimum ``->nr_accesses`` of target regions. * @max_nr_accesses: Maximum ``->nr_accesses`` of target regions. * @min_age_region: Minimum age of target regions. * @max_age_region: Maximum age of target regions. + */ +struct damos_access_pattern { + unsigned long min_sz_region; + unsigned long max_sz_region; + unsigned int min_nr_accesses; + unsigned int max_nr_accesses; + unsigned int min_age_region; + unsigned int max_age_region; +}; + +/** + * struct damos - Represents a Data Access Monitoring-based Operation Scheme. + * @pattern: Access pattern of target regions. * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. @@ -230,10 +243,8 @@ struct damos_stat { * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the - * condition (&min_sz_region, &max_sz_region, &min_nr_accesses, - * &max_nr_accesses, &min_age_region, &max_age_region) and applies &action to - * those. To avoid consuming too much CPU time or IO resources for the - * &action, "a is used. + * &pattern and applies &action to those. To avoid consuming too much + * CPU time or IO resources for the &action, "a is used. * * To do the work only when needed, schemes can be activated for specific * system situations using &wmarks. If all schemes that registered to the @@ -248,12 +259,7 @@ struct damos_stat { * &action is applied. */ struct damos { - unsigned long min_sz_region; - unsigned long max_sz_region; - unsigned int min_nr_accesses; - unsigned int max_nr_accesses; - unsigned int min_age_region; - unsigned int max_age_region; + struct damos_access_pattern pattern; enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; @@ -509,12 +515,9 @@ void damon_destroy_region(struct damon_region *r, struct damon_target *t); int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, unsigned int nr_ranges); -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks); +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks); void damon_add_scheme(struct damon_ctx *ctx, struct damos *s); void damon_destroy_scheme(struct damos *s); diff --git a/mm/damon/core.c b/mm/damon/core.c index 5e00c04ceef0..bae41990f422 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -231,24 +231,21 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } -struct damos *damon_new_scheme( - unsigned long min_sz_region, unsigned long max_sz_region, - unsigned int min_nr_accesses, unsigned int max_nr_accesses, - unsigned int min_age_region, unsigned int max_age_region, - enum damos_action action, struct damos_quota *quota, - struct damos_watermarks *wmarks) +struct damos *damon_new_scheme(struct damos_access_pattern *pattern, + enum damos_action action, struct damos_quota *quota, + struct damos_watermarks *wmarks) { struct damos *scheme; scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->min_sz_region = min_sz_region; - scheme->max_sz_region = max_sz_region; - scheme->min_nr_accesses = min_nr_accesses; - scheme->max_nr_accesses = max_nr_accesses; - scheme->min_age_region = min_age_region; - scheme->max_age_region = max_age_region; + scheme->pattern.min_sz_region = pattern->min_sz_region; + scheme->pattern.max_sz_region = pattern->max_sz_region; + scheme->pattern.min_nr_accesses = pattern->min_nr_accesses; + scheme->pattern.max_nr_accesses = pattern->max_nr_accesses; + scheme->pattern.min_age_region = pattern->min_age_region; + scheme->pattern.max_age_region = pattern->max_age_region; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); @@ -667,10 +664,12 @@ static bool __damos_valid_target(struct damon_region *r, struct damos *s) unsigned long sz; sz = r->ar.end - r->ar.start; - return s->min_sz_region <= sz && sz <= s->max_sz_region && - s->min_nr_accesses <= r->nr_accesses && - r->nr_accesses <= s->max_nr_accesses && - s->min_age_region <= r->age && r->age <= s->max_age_region; + return s->pattern.min_sz_region <= sz && + sz <= s->pattern.max_sz_region && + s->pattern.min_nr_accesses <= r->nr_accesses && + r->nr_accesses <= s->pattern.max_nr_accesses && + s->pattern.min_age_region <= r->age && + r->age <= s->pattern.max_age_region; } static bool damos_valid_target(struct damon_ctx *c, struct damon_target *t, diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 652a94deafe3..1422037cedd2 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -131,9 +131,12 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", - s->min_sz_region, s->max_sz_region, - s->min_nr_accesses, s->max_nr_accesses, - s->min_age_region, s->max_age_region, + s->pattern.min_sz_region, + s->pattern.max_sz_region, + s->pattern.min_nr_accesses, + s->pattern.max_nr_accesses, + s->pattern.min_age_region, + s->pattern.max_age_region, damos_action_to_dbgfs_scheme_action(s->action), s->quota.ms, s->quota.sz, s->quota.reset_interval, @@ -221,8 +224,6 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, struct damos *scheme, **schemes; const int max_nr_schemes = 256; int pos = 0, parsed, ret; - unsigned long min_sz, max_sz; - unsigned int min_nr_a, max_nr_a, min_age, max_age; unsigned int action_input; enum damos_action action; @@ -233,13 +234,18 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, *nr_schemes = 0; while (pos < len && *nr_schemes < max_nr_schemes) { + struct damos_access_pattern pattern = {}; struct damos_quota quota = {}; struct damos_watermarks wmarks; ret = sscanf(&str[pos], "%lu %lu %u %u %u %u %u %lu %lu %lu %u %u %u %u %lu %lu %lu %lu%n", - &min_sz, &max_sz, &min_nr_a, &max_nr_a, - &min_age, &max_age, &action_input, "a.ms, + &pattern.min_sz_region, &pattern.max_sz_region, + &pattern.min_nr_accesses, + &pattern.max_nr_accesses, + &pattern.min_age_region, + &pattern.max_age_region, + &action_input, "a.ms, "a.sz, "a.reset_interval, "a.weight_sz, "a.weight_nr_accesses, "a.weight_age, &wmarks.metric, @@ -251,7 +257,9 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if ((int)action < 0) goto fail; - if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + if (pattern.min_sz_region > pattern.max_sz_region || + pattern.min_nr_accesses > pattern.max_nr_accesses || + pattern.min_age_region > pattern.max_age_region) goto fail; if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || @@ -259,8 +267,7 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, goto fail; pos += parsed; - scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, - min_age, max_age, action, "a, &wmarks); + scheme = damon_new_scheme(&pattern, action, "a, &wmarks); if (!scheme) goto fail; diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 9de6f00a71c5..0184ed4828b7 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -293,6 +293,17 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end) /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and accessed for more than the threshold */ + .min_nr_accesses = hot_thres, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -313,26 +324,31 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .weight_nr_accesses = 1, .weight_age = 0, }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and accessed for more than the threshold */ - hot_thres, UINT_MAX, - /* no matter its age */ - 0, UINT_MAX, + + return damon_new_scheme( + &pattern, /* prioritize those on LRU lists, as soon as found */ DAMOS_LRU_PRIO, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ &wmarks); - - return scheme; } /* Create a DAMON-based operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = cold_thres, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -354,21 +370,15 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .weight_nr_accesses = 0, .weight_age = 1, }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for cold_thres or more micro-seconds, and */ - cold_thres, UINT_MAX, + + return damon_new_scheme( + &pattern, /* mark those as not accessed, as soon as found */ DAMOS_LRU_DEPRIO, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ &wmarks); - - return scheme; } static int damon_lru_sort_apply_parameters(void) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index a7faf51b4bd4..5aeca0b9e88e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -264,6 +264,17 @@ static bool get_monitoring_region(unsigned long *start, unsigned long *end) static struct damos *damon_reclaim_new_scheme(void) { + struct damos_access_pattern pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* and not accessed at all */ + .min_nr_accesses = 0, + .max_nr_accesses = 0, + /* for min_age or more micro-seconds */ + .min_age_region = min_age / aggr_interval, + .max_age_region = UINT_MAX, + }; struct damos_watermarks wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = wmarks_interval, @@ -284,21 +295,15 @@ static struct damos *damon_reclaim_new_scheme(void) .weight_nr_accesses = 0, .weight_age = 1 }; - struct damos *scheme = damon_new_scheme( - /* Find regions having PAGE_SIZE or larger size */ - PAGE_SIZE, ULONG_MAX, - /* and not accessed at all */ - 0, 0, - /* for min_age or more micro-seconds, and */ - min_age / aggr_interval, UINT_MAX, + + return damon_new_scheme( + &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ &wmarks); - - return scheme; } static int damon_reclaim_apply_parameters(void) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1719bb3531e3..9fcf7bae41eb 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2259,11 +2259,20 @@ static int damon_sysfs_set_targets(struct damon_ctx *ctx, static struct damos *damon_sysfs_mk_scheme( struct damon_sysfs_scheme *sysfs_scheme) { - struct damon_sysfs_access_pattern *pattern = + struct damon_sysfs_access_pattern *access_pattern = sysfs_scheme->access_pattern; struct damon_sysfs_quotas *sysfs_quotas = sysfs_scheme->quotas; struct damon_sysfs_weights *sysfs_weights = sysfs_quotas->weights; struct damon_sysfs_watermarks *sysfs_wmarks = sysfs_scheme->watermarks; + + struct damos_access_pattern pattern = { + .min_sz_region = access_pattern->sz->min, + .max_sz_region = access_pattern->sz->max, + .min_nr_accesses = access_pattern->nr_accesses->min, + .max_nr_accesses = access_pattern->nr_accesses->max, + .min_age_region = access_pattern->age->min, + .max_age_region = access_pattern->age->max, + }; struct damos_quota quota = { .ms = sysfs_quotas->ms, .sz = sysfs_quotas->sz, @@ -2280,10 +2289,8 @@ static struct damos *damon_sysfs_mk_scheme( .low = sysfs_wmarks->low, }; - return damon_new_scheme(pattern->sz->min, pattern->sz->max, - pattern->nr_accesses->min, pattern->nr_accesses->max, - pattern->age->min, pattern->age->max, - sysfs_scheme->action, "a, &wmarks); + return damon_new_scheme(&pattern, sysfs_scheme->action, "a, + &wmarks); } static int damon_sysfs_set_schemes(struct damon_ctx *ctx, From 5934ec1362b235c4341807c28f79b6a596ce1b40 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 8 Sep 2022 11:13:17 +0800 Subject: [PATCH 186/350] mm/damon/vaddr: add a comment for 'default' case in damon_va_apply_scheme() The switch case 'DAMOS_STAT' and switch case 'default' have same return value in damon_va_apply_scheme(), and the 'default' case is for DAMOS actions that not supported by 'vaddr'. It might make sense to add a comment here. [akpm@linux-foundation.org: fx comment grammar] Link: https://lkml.kernel.org/r/1662606797-23534-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 349b44d699e2..c2c08c1b316b 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -658,6 +658,9 @@ static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, case DAMOS_STAT: return 0; default: + /* + * DAMOS actions that are not yet supported by 'vaddr'. + */ return 0; } From 36f05cab0a2c97bda288c3b6a557ec5fb8d9bba6 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Fri, 9 Sep 2022 09:00:31 -0400 Subject: [PATCH 187/350] tmpfs: add support for an i_version counter NFSv4 mandates a change attribute to avoid problems with timestamp granularity, which Linux implements using the i_version counter. This is particularly important when the underlying filesystem is fast. Give tmpfs an i_version counter. Since it doesn't have to be persistent, we can just turn on SB_I_VERSION and sprinkle some inode_inc_iversion calls in the right places. Also, while there is no formal spec for xattrs, most implementations update the ctime on setxattr. Fix shmem_xattr_handler_set to update the ctime and bump the i_version appropriately. Link: https://lkml.kernel.org/r/20220909130031.15477-1-jlayton@kernel.org Signed-off-by: Jeff Layton Cc: Chuck Lever Cc: Alexander Viro Cc: Hugh Dickins Signed-off-by: Andrew Morton --- fs/posix_acl.c | 3 +++ mm/shmem.c | 31 ++++++++++++++++++++++++++++--- 2 files changed, 31 insertions(+), 3 deletions(-) diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 5af33800743e..efb88a5e59f9 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -24,6 +24,7 @@ #include #include #include +#include static struct posix_acl **acl_by_type(struct inode *inode, int type) { @@ -1073,6 +1074,8 @@ int simple_set_acl(struct user_namespace *mnt_userns, struct inode *inode, } inode->i_ctime = current_time(inode); + if (IS_I_VERSION(inode)) + inode_inc_iversion(inode); set_cached_acl(inode, type, acl); return 0; } diff --git a/mm/shmem.c b/mm/shmem.c index 3d0b729fcc5e..275899bacbea 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -38,6 +38,7 @@ #include #include #include +#include #include "swap.h" static struct vfsmount *shm_mnt; @@ -1030,6 +1031,7 @@ void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend) { shmem_undo_range(inode, lstart, lend, false); inode->i_ctime = inode->i_mtime = current_time(inode); + inode_inc_iversion(inode); } EXPORT_SYMBOL_GPL(shmem_truncate_range); @@ -1074,6 +1076,8 @@ static int shmem_setattr(struct user_namespace *mnt_userns, struct inode *inode = d_inode(dentry); struct shmem_inode_info *info = SHMEM_I(inode); int error; + bool update_mtime = false; + bool update_ctime = true; error = setattr_prepare(&init_user_ns, dentry, attr); if (error) @@ -1094,7 +1098,9 @@ static int shmem_setattr(struct user_namespace *mnt_userns, if (error) return error; i_size_write(inode, newsize); - inode->i_ctime = inode->i_mtime = current_time(inode); + update_mtime = true; + } else { + update_ctime = false; } if (newsize <= oldsize) { loff_t holebegin = round_up(newsize, PAGE_SIZE); @@ -1114,6 +1120,12 @@ static int shmem_setattr(struct user_namespace *mnt_userns, setattr_copy(&init_user_ns, inode, attr); if (attr->ia_valid & ATTR_MODE) error = posix_acl_chmod(&init_user_ns, inode, inode->i_mode); + if (!error && update_ctime) { + inode->i_ctime = current_time(inode); + if (update_mtime) + inode->i_mtime = inode->i_ctime; + inode_inc_iversion(inode); + } return error; } @@ -2890,6 +2902,7 @@ shmem_mknod(struct user_namespace *mnt_userns, struct inode *dir, error = 0; dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); /* Extra count - pin the dentry in core */ } @@ -2965,6 +2978,7 @@ static int shmem_link(struct dentry *old_dentry, struct inode *dir, struct dentr dir->i_size += BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); inc_nlink(inode); ihold(inode); /* New dentry reference */ dget(dentry); /* Extra pinning count for the created dentry */ @@ -2982,6 +2996,7 @@ static int shmem_unlink(struct inode *dir, struct dentry *dentry) dir->i_size -= BOGO_DIRENT_SIZE; inode->i_ctime = dir->i_ctime = dir->i_mtime = current_time(inode); + inode_inc_iversion(dir); drop_nlink(inode); dput(dentry); /* Undo the count from "create" - this does all the work */ return 0; @@ -3071,6 +3086,8 @@ static int shmem_rename2(struct user_namespace *mnt_userns, old_dir->i_ctime = old_dir->i_mtime = new_dir->i_ctime = new_dir->i_mtime = inode->i_ctime = current_time(old_dir); + inode_inc_iversion(old_dir); + inode_inc_iversion(new_dir); return 0; } @@ -3123,6 +3140,7 @@ static int shmem_symlink(struct user_namespace *mnt_userns, struct inode *dir, } dir->i_size += BOGO_DIRENT_SIZE; dir->i_ctime = dir->i_mtime = current_time(dir); + inode_inc_iversion(dir); d_instantiate(dentry, inode); dget(dentry); return 0; @@ -3194,6 +3212,7 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, shmem_set_inode_flags(inode, info->fsflags); inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); return 0; } @@ -3257,9 +3276,15 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler, size_t size, int flags) { struct shmem_inode_info *info = SHMEM_I(inode); + int err; name = xattr_full_name(handler, name); - return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + err = simple_xattr_set(&info->xattrs, name, value, size, flags, NULL); + if (!err) { + inode->i_ctime = current_time(inode); + inode_inc_iversion(inode); + } + return err; } static const struct xattr_handler shmem_security_xattr_handler = { @@ -3722,7 +3747,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOUSER; } sb->s_export_op = &shmem_export_ops; - sb->s_flags |= SB_NOSEC; + sb->s_flags |= SB_NOSEC | SB_I_VERSION; #else sb->s_flags |= SB_NOUSER; #endif From ade38b8ca5ceeeb72e8d01357f3dcde7c87570cc Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:55 +0000 Subject: [PATCH 188/350] selftest/damon: add a test for duplicate context dirs creation Patch series "mm/damon: minor fixes and cleanups". This patchset contains minor fixes and cleanups for DAMON including - selftest for a bug we found before (Patch 1), - fix of region holes in vaddr corner case and a kunit test for it (Patches 2 and 3), and - documents/Kconfig updates for title wordsmithing (Patch 4) and more aggressive DAMON debugfs interface deprecation announcement (Patches 5-7). This patch (of 7): Commit d26f60703606 ("mm/damon/dbgfs: avoid duplicate context directory creation") fixes a bug which could result in memory leak and DAMON disablement. This commit adds a selftest for verifying the fix and avoid regression. Link: https://lkml.kernel.org/r/20220909202901.57977-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220909202901.57977-2-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- tools/testing/selftests/damon/Makefile | 1 + .../debugfs_duplicate_context_creation.sh | 27 +++++++++++++++++++ 2 files changed, 28 insertions(+) create mode 100644 tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh diff --git a/tools/testing/selftests/damon/Makefile b/tools/testing/selftests/damon/Makefile index 0470c5f3e690..a1fa2eff8192 100644 --- a/tools/testing/selftests/damon/Makefile +++ b/tools/testing/selftests/damon/Makefile @@ -6,6 +6,7 @@ TEST_GEN_FILES += huge_count_read_write TEST_FILES = _chk_dependency.sh _debugfs_common.sh TEST_PROGS = debugfs_attrs.sh debugfs_schemes.sh debugfs_target_ids.sh TEST_PROGS += debugfs_empty_targets.sh debugfs_huge_count_read_write.sh +TEST_PROGS += debugfs_duplicate_context_creation.sh TEST_PROGS += sysfs.sh include ../lib.mk diff --git a/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh new file mode 100644 index 000000000000..4a76e37ef16b --- /dev/null +++ b/tools/testing/selftests/damon/debugfs_duplicate_context_creation.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +source _debugfs_common.sh + +# Test duplicated context creation +# ================================ + +if ! echo foo > "$DBGFS/mk_contexts" +then + echo "context creation failed" + exit 1 +fi + +if echo foo > "$DBGFS/mk_contexts" +then + echo "duplicate context creation success" + exit 1 +fi + +if ! echo foo > "$DBGFS/rm_contexts" +then + echo "context deletion failed" + exit 1 +fi + +exit 0 From 9c950c22833cfd9887da7679534e5c6deb44b008 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:56 +0000 Subject: [PATCH 189/350] mm/damon/core: avoid holes in newly set monitoring target ranges When there are two or more non-contiguous regions intersecting with given new ranges, 'damon_set_regions()' does not fill the holes. This commit makes the function to fill the holes with newly created regions. [sj@kernel.org: handle error from 'damon_fill_regions_holes()'] Link: https://lkml.kernel.org/r/20220913215420.57761-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220909202901.57977-3-sj@kernel.org Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: SeongJae Park Reported-by: Yun Levi Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Signed-off-by: Andrew Morton --- mm/damon/core.c | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/mm/damon/core.c b/mm/damon/core.c index bae41990f422..5ad31d2feae4 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -168,6 +168,30 @@ static bool damon_intersect(struct damon_region *r, return !(r->ar.end <= re->start || re->end <= r->ar.start); } +/* + * Fill holes in regions with new regions. + */ +static int damon_fill_regions_holes(struct damon_region *first, + struct damon_region *last, struct damon_target *t) +{ + struct damon_region *r = first; + + damon_for_each_region_from(r, t) { + struct damon_region *next, *newr; + + if (r == last) + break; + next = damon_next_region(r); + if (r->ar.end != next->ar.start) { + newr = damon_new_region(r->ar.end, next->ar.start); + if (!newr) + return -ENOMEM; + damon_insert_region(newr, r, next, t); + } + } + return 0; +} + /* * damon_set_regions() - Set regions of a target for given address ranges. * @t: the given target. @@ -184,6 +208,7 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, { struct damon_region *r, *next; unsigned int i; + int err; /* Remove regions which are not in the new ranges */ damon_for_each_region_safe(r, next, t) { @@ -226,6 +251,11 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, first->ar.start = ALIGN_DOWN(range->start, DAMON_MIN_REGION); last->ar.end = ALIGN(range->end, DAMON_MIN_REGION); + + /* fill possible holes in the range */ + err = damon_fill_regions_holes(first, last, t); + if (err) + return err; } } return 0; From 62f409560eb235ad9c2c9dbe1a3a57801431da5a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:57 +0000 Subject: [PATCH 190/350] mm/damon/core-test: test damon_set_regions Preceding commit fixes a bug in 'damon_set_regions()', which allows holes in the new monitoring target ranges. This commit adds a kunit test case for the problem to avoid any regression. Link: https://lkml.kernel.org/r/20220909202901.57977-4-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- mm/damon/core-test.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/mm/damon/core-test.h b/mm/damon/core-test.h index 45db79d28fdc..3db9b7368756 100644 --- a/mm/damon/core-test.h +++ b/mm/damon/core-test.h @@ -267,6 +267,28 @@ static void damon_test_ops_registration(struct kunit *test) KUNIT_EXPECT_EQ(test, damon_register_ops(&ops), -EINVAL); } +static void damon_test_set_regions(struct kunit *test) +{ + struct damon_target *t = damon_new_target(); + struct damon_region *r1 = damon_new_region(4, 16); + struct damon_region *r2 = damon_new_region(24, 32); + struct damon_addr_range range = {.start = 8, .end = 28}; + unsigned long expects[] = {8, 16, 16, 24, 24, 28}; + int expect_idx = 0; + struct damon_region *r; + + damon_add_region(r1, t); + damon_add_region(r2, t); + damon_set_regions(t, &range, 1); + + KUNIT_EXPECT_EQ(test, damon_nr_regions(t), 3); + damon_for_each_region(r, t) { + KUNIT_EXPECT_EQ(test, r->ar.start, expects[expect_idx++]); + KUNIT_EXPECT_EQ(test, r->ar.end, expects[expect_idx++]); + } + damon_destroy_target(t); +} + static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_target), KUNIT_CASE(damon_test_regions), @@ -276,6 +298,7 @@ static struct kunit_case damon_test_cases[] = { KUNIT_CASE(damon_test_merge_regions_of), KUNIT_CASE(damon_test_split_regions_of), KUNIT_CASE(damon_test_ops_registration), + KUNIT_CASE(damon_test_set_regions), {}, }; From 0ff11f103f5d9daf14dddf05de9b12611eaf3fc1 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:58 +0000 Subject: [PATCH 191/350] Docs/admin-guide/mm/damon: rename the title of the document The title of the DAMON document for admin-guide, 'Monitoring Data Accesses', could confuse readers in some ways. First of all, DAMON is not the only single way for data access monitoring. And the document is for not only the data access monitoring but also data access pattern based memory management optimizations (DAMOS). This commit updates the title to 'DAMON: Data Access MONitor', which more explicitly explains what the document describes. Link: https://lkml.kernel.org/r/20220909202901.57977-5-sj@kernel.org Fixes: c4ba6014aec3 ("Documentation: add documents for DAMON") Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/index.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/index.rst b/Documentation/admin-guide/mm/damon/index.rst index 05500042f777..33d37bb2fb4e 100644 --- a/Documentation/admin-guide/mm/damon/index.rst +++ b/Documentation/admin-guide/mm/damon/index.rst @@ -1,8 +1,8 @@ .. SPDX-License-Identifier: GPL-2.0 -======================== -Monitoring Data Accesses -======================== +========================== +DAMON: Data Access MONitor +========================== :doc:`DAMON ` allows light-weight data access monitoring. Using DAMON, users can analyze the memory access patterns of their systems and From e8600ce2d2e6ad1df4d0717beb362ee4cd39aaa3 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:28:59 +0000 Subject: [PATCH 192/350] mm/damon/Kconfig: notify debugfs deprecation plan Commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") announced the DAMON debugfs interface deprecation plan, but it is not so aggressively announced. As the deprecation time is coming, this commit makes the announce more easy to be found by adding the note to the config menu of DAMON debugfs interface. Link: https://lkml.kernel.org/r/20220909202901.57977-6-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- mm/damon/Kconfig | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig index 66265e3a9c65..7821fcb3f258 100644 --- a/mm/damon/Kconfig +++ b/mm/damon/Kconfig @@ -68,6 +68,9 @@ config DAMON_DBGFS If unsure, say N. + This will be removed after >5.15.y LTS kernel is released, so users + should move to the sysfs interface (DAMON_SYSFS). + config DAMON_DBGFS_KUNIT_TEST bool "Test for damon debugfs interface" if !KUNIT_ALL_TESTS depends on DAMON_DBGFS && KUNIT=y From 04cc7e4bf7c4bdff24b62432d2beafdde60cb72b Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:29:00 +0000 Subject: [PATCH 193/350] Docs/admin-guide/mm/damon/start: mention the dependency as sysfs instead of debugfs 'Getting Started' document of DAMON says DAMON user-space tool, damo[1], is using DAMON debugfs interface, and therefore it needs to ensure debugfs is mounted. However, the latest version of the tool is using DAMON sysfs interface. Moreover, DAMON debugfs interface is going to be deprecated as announced by commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface"). This commit therefore update the document to tell readers about DAMON sysfs interface dependency instead and never mention about debugfs interface, which will be deprecated. [1] https://github.com/awslabs/damo Link: https://lkml.kernel.org/r/20220909202901.57977-7-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/start.rst | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/Documentation/admin-guide/mm/damon/start.rst b/Documentation/admin-guide/mm/damon/start.rst index 4d5ca2c46288..9f88afc734da 100644 --- a/Documentation/admin-guide/mm/damon/start.rst +++ b/Documentation/admin-guide/mm/damon/start.rst @@ -29,16 +29,9 @@ called DAMON Operator (DAMO). It is available at https://github.com/awslabs/damo. The examples below assume that ``damo`` is on your ``$PATH``. It's not mandatory, though. -Because DAMO is using the debugfs interface (refer to :doc:`usage` for the -detail) of DAMON, you should ensure debugfs is mounted. Mount it manually as -below:: - - # mount -t debugfs none /sys/kernel/debug/ - -or append the following line to your ``/etc/fstab`` file so that your system -can automatically mount debugfs upon booting:: - - debugfs /sys/kernel/debug debugfs defaults 0 0 +Because DAMO is using the sysfs interface (refer to :doc:`usage` for the +detail) of DAMON, you should ensure :doc:`sysfs ` is +mounted. Recording Data Access Patterns From f1f3afd59d78db163f6655394980290c1bdf9eab Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Fri, 9 Sep 2022 20:29:01 +0000 Subject: [PATCH 194/350] Docs/admin-guide/mm/damon/usage: note DAMON debugfs interface deprecation plan Commit b18402726bd1 ("Docs/admin-guide/mm/damon/usage: document DAMON sysfs interface") announced the DAMON debugfs interface deprecation plan, but it is not so aggressively announced. As the deprecation time is coming, this commit makes the announce more easy to be found by adding the note at the beginning of the DAMON debugfs interface usage document. Link: https://lkml.kernel.org/r/20220909202901.57977-8-sj@kernel.org Signed-off-by: SeongJae Park Cc: Brendan Higgins Cc: Jonathan Corbet Cc: Shuah Khan Cc: Yun Levi Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/damon/usage.rst | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index d52f572a9029..c050b882ddc1 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -393,6 +393,11 @@ the files as above. Above is only for an example. debugfs Interface ================= +.. note:: + + DAMON debugfs interface will be removed after next LTS kernel is released, so + users should move to the :ref:`sysfs interface `. + DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, ``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and ``rm_contexts`` under its debugfs directory, ``/damon/``. From 85a34107eba913a2cb7c7c47c49f50073bfb67dd Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 16:39:47 +0800 Subject: [PATCH 195/350] mm/shuffle: convert module_param_call to module_param_cb module_param_call is now completely consistent with module_param_cb, so there is no need to keep two macros. Convert module_param_call to module_param_cb since former is obsolete and latter is more kernel-ish. Link: https://lkml.kernel.org/r/20220909083947.3595610-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: David Hildenbrand Cc: Dan Williams Cc: Kefeng Wang Cc: Liu Shixin Cc: Paul Russel Signed-off-by: Andrew Morton --- mm/shuffle.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/mm/shuffle.c b/mm/shuffle.c index c13c33b247e8..fb1393b8b3a9 100644 --- a/mm/shuffle.c +++ b/mm/shuffle.c @@ -12,23 +12,22 @@ DEFINE_STATIC_KEY_FALSE(page_alloc_shuffle_key); static bool shuffle_param; -static int shuffle_show(char *buffer, const struct kernel_param *kp) -{ - return sprintf(buffer, "%c\n", shuffle_param ? 'Y' : 'N'); -} -static __meminit int shuffle_store(const char *val, +static __meminit int shuffle_param_set(const char *val, const struct kernel_param *kp) { - int rc = param_set_bool(val, kp); - - if (rc < 0) - return rc; - if (shuffle_param) + if (param_set_bool(val, kp)) + return -EINVAL; + if (*(bool *)kp->arg) static_branch_enable(&page_alloc_shuffle_key); return 0; } -module_param_call(shuffle, shuffle_store, shuffle_show, &shuffle_param, 0400); + +static const struct kernel_param_ops shuffle_param_ops = { + .set = shuffle_param_set, + .get = param_get_bool, +}; +module_param_cb(shuffle, &shuffle_param_ops, &shuffle_param, 0400); /* * For two pages to be swapped in the shuffle, they must be free (on a From 671f2fa8a2b2d15940d80be4a2baf22758724647 Mon Sep 17 00:00:00 2001 From: Alexey Romanov Date: Fri, 9 Sep 2022 11:37:22 +0300 Subject: [PATCH 196/350] zsmalloc: use correct types in _first_obj_offset functions Since commit ffedd09fa9b0 ("zsmalloc: Stop using slab fields in struct page") we are using page->page_type (unsigned int) field instead of page->units (int) as first object offset in a subpage of zspage. So get_first_obj_offset() and set_first_obj_offset() functions should work with unsigned int type. Link: https://lkml.kernel.org/r/20220909083722.85024-1-avromanov@sberdevices.ru Fixes: ffedd09fa9b0 ("zsmalloc: Stop using slab fields in struct page") Signed-off-by: Alexey Romanov Reviewed-by: Sergey Senozhatsky Cc: Alexey Romanov Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- mm/zsmalloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 12eb11e70939..525758713a55 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -472,12 +472,12 @@ static inline struct page *get_first_page(struct zspage *zspage) return first_page; } -static inline int get_first_obj_offset(struct page *page) +static inline unsigned int get_first_obj_offset(struct page *page) { return page->page_type; } -static inline void set_first_obj_offset(struct page *page, int offset) +static inline void set_first_obj_offset(struct page *page, unsigned int offset) { page->page_type = offset; } @@ -1592,7 +1592,7 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, static unsigned long find_alloced_obj(struct size_class *class, struct page *page, int *obj_idx) { - int offset = 0; + unsigned int offset; int index = *obj_idx; unsigned long handle = 0; void *addr = kmap_atomic(page); @@ -1846,7 +1846,7 @@ static int zs_page_migrate(struct page *newpage, struct page *page, struct zspage *zspage; struct page *dummy; void *s_addr, *d_addr, *addr; - int offset; + unsigned int offset; unsigned long handle; unsigned long old_obj, new_obj; unsigned int obj_idx; From 6b1964e685544b8f8ba6780c10a6b38c2b1282a5 Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 16:31:40 +0800 Subject: [PATCH 197/350] mm: kfence: convert to DEFINE_SEQ_ATTRIBUTE Use DEFINE_SEQ_ATTRIBUTE helper macro to simplify the code. Link: https://lkml.kernel.org/r/20220909083140.3592919-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Reviewed-by: Marco Elver Tested-by: Marco Elver Cc: Alexander Potapenko Cc: Dmitry Vyukov Cc: Kefeng Wang Signed-off-by: Andrew Morton --- mm/kfence/core.c | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/mm/kfence/core.c b/mm/kfence/core.c index 8c08ae2101d7..26de62a51665 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -719,24 +719,13 @@ static int show_object(struct seq_file *seq, void *v) return 0; } -static const struct seq_operations object_seqops = { +static const struct seq_operations objects_sops = { .start = start_object, .next = next_object, .stop = stop_object, .show = show_object, }; - -static int open_objects(struct inode *inode, struct file *file) -{ - return seq_open(file, &object_seqops); -} - -static const struct file_operations objects_fops = { - .open = open_objects, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; +DEFINE_SEQ_ATTRIBUTE(objects); static int __init kfence_debugfs_init(void) { From 0d83b2d89dbfad17b62d4e7fb8f0b0525ba1a204 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Fri, 9 Sep 2022 21:36:06 +0000 Subject: [PATCH 198/350] mm/damon: remove duplicate get_monitoring_region() definitions In lru_sort.c and reclaim.c, they are all defining get_monitoring_region() function, there is no need to define it separately. As 'get_monitoring_region()' is not a 'static' function anymore, we try to use a prefix to distinguish with other functions, so there rename it to 'damon_find_biggest_system_ram'. Link: https://lkml.kernel.org/r/20220909213606.136221-1-sj@kernel.org Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 ++ mm/damon/core.c | 40 ++++++++++++++++++++++++++++++++++++++++ mm/damon/lru_sort.c | 37 ++----------------------------------- mm/damon/reclaim.c | 37 ++----------------------------------- 4 files changed, 46 insertions(+), 70 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 90f20675da22..016b6c9c03d6 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -549,6 +549,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); +bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); + #endif /* CONFIG_DAMON */ #endif /* _DAMON_H */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 5ad31d2feae4..2437c61b0bc0 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1245,4 +1245,44 @@ static int kdamond_fn(void *data) return 0; } +/* + * struct damon_system_ram_region - System RAM resource address region of + * [@start, @end). + * @start: Start address of the region (inclusive). + * @end: End address of the region (exclusive). + */ +struct damon_system_ram_region { + unsigned long start; + unsigned long end; +}; + +static int walk_system_ram(struct resource *res, void *arg) +{ + struct damon_system_ram_region *a = arg; + + if (a->end - a->start < resource_size(res)) { + a->start = res->start; + a->end = res->end; + } + return 0; +} + +/* + * Find biggest 'System RAM' resource and store its start and end address in + * @start and @end, respectively. If no System RAM is found, returns false. + */ +bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) + +{ + struct damon_system_ram_region arg = {}; + + walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); + if (arg.end <= arg.start) + return false; + + *start = arg.start; + *end = arg.end; + return true; +} + #include "core-test.h" diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 0184ed4828b7..8415e18fcf0e 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -257,39 +257,6 @@ module_param(nr_cold_quota_exceeds, ulong, 0400); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_lru_sort_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_lru_sort_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. - */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_lru_sort_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { @@ -414,8 +381,8 @@ static int damon_lru_sort_apply_parameters(void) if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) + !damon_find_biggest_system_ram(&monitor_region_start, + &monitor_region_end)) return -EINVAL; addr_range.start = monitor_region_start; addr_range.end = monitor_region_end; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 5aeca0b9e88e..fe7bc0c55ecb 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -229,39 +229,6 @@ module_param(nr_quota_exceeds, ulong, 0400); static struct damon_ctx *ctx; static struct damon_target *target; -struct damon_reclaim_ram_walk_arg { - unsigned long start; - unsigned long end; -}; - -static int walk_system_ram(struct resource *res, void *arg) -{ - struct damon_reclaim_ram_walk_arg *a = arg; - - if (a->end - a->start < resource_size(res)) { - a->start = res->start; - a->end = res->end; - } - return 0; -} - -/* - * Find biggest 'System RAM' resource and store its start and end address in - * @start and @end, respectively. If no System RAM is found, returns false. - */ -static bool get_monitoring_region(unsigned long *start, unsigned long *end) -{ - struct damon_reclaim_ram_walk_arg arg = {}; - - walk_system_ram_res(0, ULONG_MAX, &arg, walk_system_ram); - if (arg.end <= arg.start) - return false; - - *start = arg.start; - *end = arg.end; - return true; -} - static struct damos *damon_reclaim_new_scheme(void) { struct damos_access_pattern pattern = { @@ -328,8 +295,8 @@ static int damon_reclaim_apply_parameters(void) if (monitor_region_start > monitor_region_end) return -EINVAL; if (!monitor_region_start && !monitor_region_end && - !get_monitoring_region(&monitor_region_start, - &monitor_region_end)) + !damon_find_biggest_system_ram(&monitor_region_start, + &monitor_region_end)) return -EINVAL; addr_range.start = monitor_region_start; addr_range.end = monitor_region_end; From 14455eabd8404a503dc8e80cd8ce185e96a94b22 Mon Sep 17 00:00:00 2001 From: Cheng Li Date: Fri, 9 Sep 2022 07:31:09 +0000 Subject: [PATCH 199/350] mm: use nth_page instead of mem_map_offset mem_map_next To handle the discontiguous case, mem_map_next() has a parameter named `offset`. As a function caller, one would be confused why "get next entry" needs a parameter named "offset". The other drawback of mem_map_next() is that the callers must take care of the map between parameter "iter" and "offset", otherwise we may get an hole or duplication during iteration. So we use nth_page instead of mem_map_next. And replace mem_map_offset with nth_page() per Matthew's comments. Link: https://lkml.kernel.org/r/1662708669-9395-1-git-send-email-lic121@chinatelecom.cn Signed-off-by: Cheng Li Fixes: 69d177c2fc70 ("hugetlbfs: handle pages higher order than MAX_ORDER") Reviewed-by: Matthew Wilcox (Oracle) Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 29 +++++++++++++++++------------ mm/internal.h | 28 ---------------------------- mm/memory.c | 21 ++++++++++----------- 3 files changed, 27 insertions(+), 51 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 008955d8f411..6af123374e98 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1306,12 +1306,13 @@ static void __destroy_compound_gigantic_page(struct page *page, { int i; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; atomic_set(compound_mapcount_ptr(page), 0); atomic_set(compound_pincount_ptr(page), 0); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + for (i = 1; i < nr_pages; i++) { + p = nth_page(page, i); p->mapping = NULL; clear_compound_head(p); if (!demote) @@ -1532,7 +1533,7 @@ static void add_hugetlb_page(struct hstate *h, struct page *page, static void __update_and_free_page(struct hstate *h, struct page *page) { int i; - struct page *subpage = page; + struct page *subpage; if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported()) return; @@ -1563,8 +1564,8 @@ static void __update_and_free_page(struct hstate *h, struct page *page) if (unlikely(PageHWPoison(page))) hugetlb_clear_page_hwpoison(page); - for (i = 0; i < pages_per_huge_page(h); - i++, subpage = mem_map_next(subpage, page, i)) { + for (i = 0; i < pages_per_huge_page(h); i++) { + subpage = nth_page(page, i); subpage->flags &= ~(1 << PG_locked | 1 << PG_error | 1 << PG_referenced | 1 << PG_dirty | 1 << PG_active | 1 << PG_private | @@ -1771,13 +1772,15 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, { int i, j; int nr_pages = 1 << order; - struct page *p = page + 1; + struct page *p; /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); __ClearPageReserved(page); __SetPageHead(page); - for (i = 1; i < nr_pages; i++, p = mem_map_next(p, page, i)) { + for (i = 1; i < nr_pages; i++) { + p = nth_page(page, i); + /* * For gigantic hugepages allocated through bootmem at * boot, it's safer to be consistent with the not-gigantic @@ -1824,14 +1827,16 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, out_error: /* undo tail page modifications made above */ - p = page + 1; - for (j = 1; j < i; j++, p = mem_map_next(p, page, j)) { + for (j = 1; j < i; j++) { + p = nth_page(page, j); clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ - for (; j < nr_pages; j++, p = mem_map_next(p, page, j)) + for (; j < nr_pages; j++) { + p = nth_page(page, j); __ClearPageReserved(p); + } set_compound_order(page, 0); #ifdef CONFIG_64BIT page[1].compound_nr = 0; @@ -6128,7 +6133,7 @@ static void record_subpages_vmas(struct page *page, struct vm_area_struct *vma, for (nr = 0; nr < refs; nr++) { if (likely(pages)) - pages[nr] = mem_map_offset(page, nr); + pages[nr] = nth_page(page, nr); if (vmas) vmas[nr] = vma; } @@ -6292,7 +6297,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma, (vma->vm_end - ALIGN_DOWN(vaddr, PAGE_SIZE)) >> PAGE_SHIFT); if (pages || vmas) - record_subpages_vmas(mem_map_offset(page, pfn_offset), + record_subpages_vmas(nth_page(page, pfn_offset), vma, refs, likely(pages) ? pages + i : NULL, vmas ? vmas + i : NULL); diff --git a/mm/internal.h b/mm/internal.h index 0f106a3982e7..e497ab14c984 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -638,34 +638,6 @@ static inline void vunmap_range_noflush(unsigned long start, unsigned long end) } #endif /* !CONFIG_MMU */ -/* - * Return the mem_map entry representing the 'offset' subpage within - * the maximally aligned gigantic page 'base'. Handle any discontiguity - * in the mem_map at MAX_ORDER_NR_PAGES boundaries. - */ -static inline struct page *mem_map_offset(struct page *base, int offset) -{ - if (unlikely(offset >= MAX_ORDER_NR_PAGES)) - return nth_page(base, offset); - return base + offset; -} - -/* - * Iterator over all subpages within the maximally aligned gigantic - * page 'base'. Handle any discontiguity in the mem_map. - */ -static inline struct page *mem_map_next(struct page *iter, - struct page *base, int offset) -{ - if (unlikely((offset & (MAX_ORDER_NR_PAGES - 1)) == 0)) { - unsigned long pfn = page_to_pfn(base) + offset; - if (!pfn_valid(pfn)) - return NULL; - return pfn_to_page(pfn); - } - return iter + 1; -} - /* Memory initialisation debug and verification */ enum mminit_level { MMINIT_WARNING, diff --git a/mm/memory.c b/mm/memory.c index d671ad367d67..c01c12500169 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -5690,11 +5690,11 @@ static void clear_gigantic_page(struct page *page, unsigned int pages_per_huge_page) { int i; - struct page *p = page; + struct page *p; might_sleep(); - for (i = 0; i < pages_per_huge_page; - i++, p = mem_map_next(p, page, i)) { + for (i = 0; i < pages_per_huge_page; i++) { + p = nth_page(page, i); cond_resched(); clear_user_highpage(p, addr + i * PAGE_SIZE); } @@ -5730,13 +5730,12 @@ static void copy_user_gigantic_page(struct page *dst, struct page *src, struct page *dst_base = dst; struct page *src_base = src; - for (i = 0; i < pages_per_huge_page; ) { + for (i = 0; i < pages_per_huge_page; i++) { + dst = nth_page(dst_base, i); + src = nth_page(src_base, i); + cond_resched(); copy_user_highpage(dst, src, addr + i*PAGE_SIZE, vma); - - i++; - dst = mem_map_next(dst, dst_base, i); - src = mem_map_next(src, src_base, i); } } @@ -5783,10 +5782,10 @@ long copy_huge_page_from_user(struct page *dst_page, void *page_kaddr; unsigned long i, rc = 0; unsigned long ret_val = pages_per_huge_page * PAGE_SIZE; - struct page *subpage = dst_page; + struct page *subpage; - for (i = 0; i < pages_per_huge_page; - i++, subpage = mem_map_next(subpage, dst_page, i)) { + for (i = 0; i < pages_per_huge_page; i++) { + subpage = nth_page(dst_page, i); if (allow_pagefault) page_kaddr = kmap(subpage); else From 13cc378403a83e70430ae9bad53fd65199f21fe1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 9 Sep 2022 10:57:11 +0800 Subject: [PATCH 200/350] writeback: remove unused macro DIRTY_FULL_SCOPE It's introduced but never used. Remove it. Link: https://lkml.kernel.org/r/20220909025711.32012-1-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: Jan Kara Acked-by: Jens Axboe Cc: Bart Van Assche Cc: David Howells Cc: Matthew Wilcox Cc: NeilBrown Cc: Vlastimil Babka Cc: zhanglianjie Signed-off-by: Andrew Morton --- include/linux/writeback.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 3f045f6d6c4f..06f9291b6fd5 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -17,20 +17,12 @@ struct bio; DECLARE_PER_CPU(int, dirty_throttle_leaks); /* - * The 1/4 region under the global dirty thresh is for smooth dirty throttling: - * - * (thresh - thresh/DIRTY_FULL_SCOPE, thresh) - * - * Further beyond, all dirtier tasks will enter a loop waiting (possibly long - * time) for the dirty pages to drop, unless written enough pages. - * * The global dirty threshold is normally equal to the global dirty limit, * except when the system suddenly allocates a lot of anonymous memory and * knocks down the global dirty threshold quickly, in which case the global * dirty limit will follow down slowly to prevent livelocking all dirtier tasks. */ #define DIRTY_SCOPE 8 -#define DIRTY_FULL_SCOPE (DIRTY_SCOPE / 2) struct backing_dev_info; From f4981502088f8ea704beeedf3470e1d53bc2e46c Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 9 Sep 2022 10:16:53 +0800 Subject: [PATCH 201/350] mm/huge_memory: prevent THP_ZERO_PAGE_ALLOC increased twice A user who reads THP_ZERO_PAGE_ALLOC may be more concerned about the huge zero pages that are really allocated for thp. It is misleading to increase THP_ZERO_PAGE_ALLOC twice if two threads call get_huge_zero_page concurrently. Don't increase the value if the huge page is not really used. Update Documentation/admin-guide/mm/transhuge.rst to suit. Link: https://lkml.kernel.org/r/20220909021653.3371879-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Cc: Alexander Potapenko Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Kefeng Wang Cc: "Kirill A. Shutemov" Cc: Mike Kravetz Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 7 +++---- mm/huge_memory.c | 2 +- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index c9c37f16eef8..8e3418ec4503 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -366,10 +366,9 @@ thp_split_pmd page table entry. thp_zero_page_alloc - is incremented every time a huge zero page is - successfully allocated. It includes allocations which where - dropped due race with other allocation. Note, it doesn't count - every map of the huge zero page, only its allocation. + is incremented every time a huge zero page used for thp is + successfully allocated. Note, it doesn't count every map of + the huge zero page, only its allocation. thp_zero_page_alloc_failed is incremented if kernel fails to allocate diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 36ef79b85195..4938defe4e73 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -163,7 +163,6 @@ retry: count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED); return false; } - count_vm_event(THP_ZERO_PAGE_ALLOC); preempt_disable(); if (cmpxchg(&huge_zero_page, NULL, zero_page)) { preempt_enable(); @@ -175,6 +174,7 @@ retry: /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); preempt_enable(); + count_vm_event(THP_ZERO_PAGE_ALLOC); return true; } From a17a8b3b3e6b08a9cd3b2134789843323d998bed Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 8 Sep 2022 16:19:32 +0800 Subject: [PATCH 202/350] mm/damon/sysfs: change few functions execute order There's no need to run container_of() as early as we do. The compiler figures this out, but the resulting code is more readable. Link: https://lkml.kernel.org/r/20220908081932.77370-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 9fcf7bae41eb..d27dad5affec 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -1031,8 +1031,7 @@ static ssize_t nr_schemes_show(struct kobject *kobj, static ssize_t nr_schemes_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_schemes *schemes = container_of(kobj, - struct damon_sysfs_schemes, kobj); + struct damon_sysfs_schemes *schemes; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1040,6 +1039,8 @@ static ssize_t nr_schemes_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + schemes = container_of(kobj, struct damon_sysfs_schemes, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_schemes_add_dirs(schemes, nr); @@ -1237,8 +1238,7 @@ static ssize_t nr_regions_show(struct kobject *kobj, static ssize_t nr_regions_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_regions *regions = container_of(kobj, - struct damon_sysfs_regions, kobj); + struct damon_sysfs_regions *regions; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1246,6 +1246,8 @@ static ssize_t nr_regions_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + regions = container_of(kobj, struct damon_sysfs_regions, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_regions_add_dirs(regions, nr); @@ -1440,8 +1442,7 @@ static ssize_t nr_targets_show(struct kobject *kobj, static ssize_t nr_targets_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_targets *targets = container_of(kobj, - struct damon_sysfs_targets, kobj); + struct damon_sysfs_targets *targets; int nr, err = kstrtoint(buf, 0, &nr); if (err) @@ -1449,6 +1450,8 @@ static ssize_t nr_targets_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + targets = container_of(kobj, struct damon_sysfs_targets, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_targets_add_dirs(targets, nr); @@ -1962,8 +1965,7 @@ static ssize_t nr_contexts_show(struct kobject *kobj, static ssize_t nr_contexts_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_contexts *contexts = container_of(kobj, - struct damon_sysfs_contexts, kobj); + struct damon_sysfs_contexts *contexts; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -1973,6 +1975,7 @@ static ssize_t nr_contexts_store(struct kobject *kobj, if (nr < 0 || 1 < nr) return -EINVAL; + contexts = container_of(kobj, struct damon_sysfs_contexts, kobj); if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_contexts_add_dirs(contexts, nr); @@ -2737,8 +2740,7 @@ static ssize_t nr_kdamonds_show(struct kobject *kobj, static ssize_t nr_kdamonds_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, size_t count) { - struct damon_sysfs_kdamonds *kdamonds = container_of(kobj, - struct damon_sysfs_kdamonds, kobj); + struct damon_sysfs_kdamonds *kdamonds; int nr, err; err = kstrtoint(buf, 0, &nr); @@ -2747,6 +2749,8 @@ static ssize_t nr_kdamonds_store(struct kobject *kobj, if (nr < 0) return -EINVAL; + kdamonds = container_of(kobj, struct damon_sysfs_kdamonds, kobj); + if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; err = damon_sysfs_kdamonds_add_dirs(kdamonds, nr); From e7fcac4cd2674fe6849c6ac8a51a7fc878a5e436 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Mon, 12 Sep 2022 23:11:53 +0800 Subject: [PATCH 203/350] mm/damon/sysfs: use the wrapper directly to check if the kdamond is running We can use the 'damon_sysfs_kdamond_running()' wrapper directly to check if the kdamond is running in 'damon_sysfs_turn_damon_on()'. Link: https://lkml.kernel.org/r/1662995513-24489-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index d27dad5affec..da01befae8bd 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2465,8 +2465,7 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond) struct damon_ctx *ctx; int err; - if (kdamond->damon_ctx && - damon_sysfs_ctx_running(kdamond->damon_ctx)) + if (damon_sysfs_kdamond_running(kdamond)) return -EBUSY; if (damon_sysfs_cmd_request.kdamond == kdamond) return -EBUSY; From a18709442869e55c42969142d5abf6beb776dbba Mon Sep 17 00:00:00 2001 From: Dawei Li Date: Mon, 12 Sep 2022 22:39:03 +0800 Subject: [PATCH 204/350] mm/damon: improve damon_new_region strategy Kdamond is implemented as a periodical split-merge pattern, which will create and destroy regions possibly at high frequency (hundreds or even thousands of per sec), depending on the number of regions and aggregation period. In that case, kmalloc and kfree could bring speed and space overheads, which can be improved by using a private kmem cache. [set_pte_at@outlook.com: creating kmem cache for damon regions by KMEM_CACHE()] Link: https://lkml.kernel.org/r/Message-ID: Link: https://lkml.kernel.org/r/TYCP286MB2323DA1894FA55BB9CF90978CA449@TYCP286MB2323.JPNP286.PROD.OUTLOOK.COM Signed-off-by: Dawei Li Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 2437c61b0bc0..c9ec2de845b3 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -29,6 +29,8 @@ static bool running_exclusive_ctxs; static DEFINE_MUTEX(damon_ops_lock); static struct damon_operations damon_registered_ops[NR_DAMON_OPS]; +static struct kmem_cache *damon_region_cache __ro_after_init; + /* Should be called under damon_ops_lock with id smaller than NR_DAMON_OPS */ static bool __damon_is_registered_ops(enum damon_ops_id id) { @@ -119,7 +121,7 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) { struct damon_region *region; - region = kmalloc(sizeof(*region), GFP_KERNEL); + region = kmem_cache_alloc(damon_region_cache, GFP_KERNEL); if (!region) return NULL; @@ -148,7 +150,7 @@ static void damon_del_region(struct damon_region *r, struct damon_target *t) static void damon_free_region(struct damon_region *r) { - kfree(r); + kmem_cache_free(damon_region_cache, r); } void damon_destroy_region(struct damon_region *r, struct damon_target *t) @@ -1285,4 +1287,17 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) return true; } +static int __init damon_init(void) +{ + damon_region_cache = KMEM_CACHE(damon_region, 0); + if (unlikely(!damon_region_cache)) { + pr_err("creating damon_region_cache fails\n"); + return -ENOMEM; + } + + return 0; +} + +subsys_initcall(damon_init); + #include "core-test.h" From f635725c3905e755a8c3e2dc8cab7fcd0d38977f Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Tue, 13 Sep 2022 00:27:44 +0900 Subject: [PATCH 205/350] zram: do not waste zram_table_entry flags bits zram_table_entry::flags stores object size in the lower bits and zram pageflags in the upper bits. However, for some reason, we use 24 lower bits, while maximum zram object size is PAGE_SIZE, which requires PAGE_SHIFT bits (up to 16 on arm64). This wastes 24 - PAGE_SHIFT bits that we can use for additional zram pageflags instead. Also add a BUILD_BUG_ON() to alert us should we run out of bits in zram_table_entry::flags. Link: https://lkml.kernel.org/r/20220912152744.527438-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Reviewed-by: Brian Geffon Acked-by: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 2 ++ drivers/block/zram/zram_drv.h | 15 +++++++-------- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 607f4634c27d..eb021db21ddf 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -2131,6 +2131,8 @@ static int __init zram_init(void) { int ret; + BUILD_BUG_ON(__NR_ZRAM_PAGEFLAGS > BITS_PER_LONG); + ret = cpuhp_setup_state_multi(CPUHP_ZCOMP_PREPARE, "block/zram:prepare", zcomp_cpu_up_prepare, zcomp_cpu_dead); if (ret < 0) diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 80c3b43b4828..a2bda53020fd 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -30,16 +30,15 @@ /* - * The lower ZRAM_FLAG_SHIFT bits of table.flags is for - * object size (excluding header), the higher bits is for - * zram_pageflags. + * ZRAM is mainly used for memory efficiency so we want to keep memory + * footprint small and thus squeeze size and zram pageflags into a flags + * member. The lower ZRAM_FLAG_SHIFT bits is for object size (excluding + * header), which cannot be larger than PAGE_SIZE (requiring PAGE_SHIFT + * bits), the higher bits are for zram_pageflags. * - * zram is mainly used for memory efficiency so we want to keep memory - * footprint small so we can squeeze size and flags into a field. - * The lower ZRAM_FLAG_SHIFT bits is for object size (excluding header), - * the higher bits is for zram_pageflags. + * We use BUILD_BUG_ON() to make sure that zram pageflags don't overflow. */ -#define ZRAM_FLAG_SHIFT 24 +#define ZRAM_FLAG_SHIFT (PAGE_SHIFT + 1) /* Flags for zram pages (table[page_no].flags) */ enum zram_pageflags { From f9bceb2f4114fe9a9725c922f9f1500d173d4763 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 14 Sep 2022 14:20:33 +0900 Subject: [PATCH 206/350] zram: keep comments within 80-columns limit Several trivial fixups (that I should have spotted during review). Link: https://lkml.kernel.org/r/20220914052033.838050-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index eb021db21ddf..43eeef2b9fbe 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -329,8 +329,8 @@ static ssize_t idle_store(struct device *dev, if (!sysfs_streq(buf, "all")) { /* - * If it did not parse as 'all' try to treat it as an integer when - * we have memory tracking enabled. + * If it did not parse as 'all' try to treat it as an integer + * when we have memory tracking enabled. */ u64 age_sec; @@ -345,7 +345,10 @@ static ssize_t idle_store(struct device *dev, if (!init_done(zram)) goto out_unlock; - /* A cutoff_time of 0 marks everything as idle, this is the "all" behavior */ + /* + * A cutoff_time of 0 marks everything as idle, this is the + * "all" behavior. + */ mark_idle(zram, cutoff_time); rv = len; @@ -1416,11 +1419,11 @@ compress_again: if (comp_len != PAGE_SIZE) goto compress_again; /* - * If the page is not compressible, you need to acquire the lock and - * execute the code below. The zcomp_stream_get() call is needed to - * disable the cpu hotplug and grab the zstrm buffer back. - * It is necessary that the dereferencing of the zstrm variable below - * occurs correctly. + * If the page is not compressible, you need to acquire the + * lock and execute the code below. The zcomp_stream_get() + * call is needed to disable the cpu hotplug and grab the + * zstrm buffer back. It is necessary that the dereferencing + * of the zstrm variable below occurs correctly. */ zstrm = zcomp_stream_get(zram->comp); } From 3791bc7bf1034dcce89541e54630d0307cc199fb Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Sun, 11 Sep 2022 08:59:17 +0800 Subject: [PATCH 207/350] mm/damon: simplify scheme create in damon_lru_sort_apply_parameters In damon_lru_sort_apply_parameters(), we can use damon_set_schemes() to replace the way of creating the first 'scheme' in original code, this makes the code look cleaner. Link: https://lkml.kernel.org/r/20220911005917.835-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8415e18fcf0e..307ba71adcfa 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -350,7 +350,7 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { - struct damos *scheme, *next_scheme; + struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; @@ -360,17 +360,15 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - /* free previously set schemes */ - damon_for_each_scheme_safe(scheme, next_scheme, ctx) - damon_destroy_scheme(scheme); - /* aggr_interval / sample_interval is the maximum nr_accesses */ hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - damon_add_scheme(ctx, scheme); + err = damon_set_schemes(ctx, &scheme, 1); + if (err) + return err; cold_thres = cold_min_age / aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); From f82e70e26b505cd8a1d5c670dc5038a938708d4a Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:28 +0000 Subject: [PATCH 208/350] mm/damon/paddr: make supported DAMOS actions of paddr clear Patch series "mm/damon: cleanup code". DAMON code was not so clean from the beginning, but it has been too much nowadays, especially due to the duplicates in DAMON_RECLAIM and DAMON_LRU_SORT. This patchset cleans some of the mess. This patch (of 22): The 'switch-case' statement in 'damon_va_apply_scheme()' function provides a 'case' for every supported DAMOS action while all not-yet-supported DAMOS actions fall through the 'default' case, and comment it so that people can easily know which actions are supported. Its counterpart in 'paddr', 'damon_pa_apply_scheme()', however, doesn't. This commit makes the 'paddr' side function follows the pattern of 'vaddr' for better readability and consistency. Link: https://lkml.kernel.org/r/20220913174449.50645-1-sj@kernel.org Link: https://lkml.kernel.org/r/20220913174449.50645-2-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 6b0d9e6aa677..219127cb49e2 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -275,7 +275,10 @@ static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, return damon_pa_mark_accessed(r); case DAMOS_LRU_DEPRIO: return damon_pa_deactivate_pages(r); + case DAMOS_STAT: + break; default: + /* DAMOS actions that not yet supported by 'paddr'. */ break; } return 0; From 8193321ac90d525b33815c77faae7d2d12042c03 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:29 +0000 Subject: [PATCH 209/350] mm/damon/paddr: deduplicate damon_pa_{mark_accessed,deactivate_pages}() The bodies of damon_pa_{mark_accessed,deactivate_pages}() contains duplicates. This commit factors out the common part to a separate function and removes the duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-3-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 219127cb49e2..1ada62db68b1 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -232,7 +232,8 @@ static unsigned long damon_pa_pageout(struct damon_region *r) return applied * PAGE_SIZE; } -static unsigned long damon_pa_mark_accessed(struct damon_region *r) +static inline unsigned long damon_pa_mark_accessed_or_deactivate( + struct damon_region *r, bool mark_accessed) { unsigned long addr, applied = 0; @@ -241,27 +242,24 @@ static unsigned long damon_pa_mark_accessed(struct damon_region *r) if (!page) continue; - mark_page_accessed(page); + if (mark_accessed) + mark_page_accessed(page); + else + deactivate_page(page); put_page(page); applied++; } return applied * PAGE_SIZE; } +static unsigned long damon_pa_mark_accessed(struct damon_region *r) +{ + return damon_pa_mark_accessed_or_deactivate(r, true); +} + static unsigned long damon_pa_deactivate_pages(struct damon_region *r) { - unsigned long addr, applied = 0; - - for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { - struct page *page = damon_get_page(PHYS_PFN(addr)); - - if (!page) - continue; - deactivate_page(page); - put_page(page); - applied++; - } - return applied * PAGE_SIZE; + return damon_pa_mark_accessed_or_deactivate(r, false); } static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, From 02f17037fc6e38ca1c00ac87a112372a3867ba45 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:30 +0000 Subject: [PATCH 210/350] mm/damon/core: copy struct-to-struct instead of field-to-field in damon_new_scheme() The function for new 'struct damos' creation, 'damon_new_scheme()', copies each field of the struct one by one, though it could simply copied via struct to struct. This commit replaces the unnecessarily verbose field-to-field copies with struct-to-struct copies to make code simple and short. Link: https://lkml.kernel.org/r/20220913174449.50645-4-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index c9ec2de845b3..a564f83e9efe 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -272,22 +272,13 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme = kmalloc(sizeof(*scheme), GFP_KERNEL); if (!scheme) return NULL; - scheme->pattern.min_sz_region = pattern->min_sz_region; - scheme->pattern.max_sz_region = pattern->max_sz_region; - scheme->pattern.min_nr_accesses = pattern->min_nr_accesses; - scheme->pattern.max_nr_accesses = pattern->max_nr_accesses; - scheme->pattern.min_age_region = pattern->min_age_region; - scheme->pattern.max_age_region = pattern->max_age_region; + scheme->pattern = *pattern; scheme->action = action; scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota.ms = quota->ms; - scheme->quota.sz = quota->sz; - scheme->quota.reset_interval = quota->reset_interval; - scheme->quota.weight_sz = quota->weight_sz; - scheme->quota.weight_nr_accesses = quota->weight_nr_accesses; - scheme->quota.weight_age = quota->weight_age; + scheme->quota = *quota; + /* caller might not zero-initialized the private fileds */ scheme->quota.total_charged_sz = 0; scheme->quota.total_charged_ns = 0; scheme->quota.esz = 0; @@ -296,11 +287,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->quota.charge_target_from = NULL; scheme->quota.charge_addr_from = 0; - scheme->wmarks.metric = wmarks->metric; - scheme->wmarks.interval = wmarks->interval; - scheme->wmarks.high = wmarks->high; - scheme->wmarks.mid = wmarks->mid; - scheme->wmarks.low = wmarks->low; + scheme->wmarks = *wmarks; scheme->wmarks.activated = true; return scheme; From 70e0c1d1bf945328915f52f7132b2d6ee8f25d46 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:31 +0000 Subject: [PATCH 211/350] mm/damon/core: factor out 'damos_quota' private fileds initialization The 'struct damos' creation function, 'damon_new_scheme()', does initialization of private fileds of 'struct damos_quota' in it. As its verbose and makes the function unnecessarily long, this commit factors it out to separate function. Link: https://lkml.kernel.org/r/20220913174449.50645-5-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index a564f83e9efe..6d9f4c2dee35 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -263,6 +263,19 @@ int damon_set_regions(struct damon_target *t, struct damon_addr_range *ranges, return 0; } +/* initialize private fields of damos_quota and return the pointer */ +static struct damos_quota *damos_quota_init_priv(struct damos_quota *quota) +{ + quota->total_charged_sz = 0; + quota->total_charged_ns = 0; + quota->esz = 0; + quota->charged_sz = 0; + quota->charged_from = 0; + quota->charge_target_from = NULL; + quota->charge_addr_from = 0; + return quota; +} + struct damos *damon_new_scheme(struct damos_access_pattern *pattern, enum damos_action action, struct damos_quota *quota, struct damos_watermarks *wmarks) @@ -277,15 +290,7 @@ struct damos *damon_new_scheme(struct damos_access_pattern *pattern, scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); - scheme->quota = *quota; - /* caller might not zero-initialized the private fileds */ - scheme->quota.total_charged_sz = 0; - scheme->quota.total_charged_ns = 0; - scheme->quota.esz = 0; - scheme->quota.charged_sz = 0; - scheme->quota.charged_from = 0; - scheme->quota.charge_target_from = NULL; - scheme->quota.charge_addr_from = 0; + scheme->quota = *(damos_quota_init_priv(quota)); scheme->wmarks = *wmarks; scheme->wmarks.activated = true; From cbeaa77b044938cfe91818821ece6b0b1511e967 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:32 +0000 Subject: [PATCH 212/350] mm/damon/core: use a dedicated struct for monitoring attributes DAMON monitoring attributes are directly defined as fields of 'struct damon_ctx'. This makes 'struct damon_ctx' a little long and complicated. This commit defines and uses a struct, 'struct damon_attrs', which is dedicated for only the monitoring attributes to make the purpose of the five values clearer and simplify 'struct damon_ctx'. Link: https://lkml.kernel.org/r/20220913174449.50645-6-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 30 ++++++++++++++++++++---------- mm/damon/core.c | 34 +++++++++++++++++----------------- mm/damon/dbgfs.c | 6 +++--- mm/damon/ops-common.c | 4 ++-- mm/damon/vaddr.c | 4 ++-- 5 files changed, 44 insertions(+), 34 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 016b6c9c03d6..2ceee8b07726 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -389,13 +389,15 @@ struct damon_callback { }; /** - * struct damon_ctx - Represents a context for each monitoring. This is the - * main interface that allows users to set the attributes and get the results - * of the monitoring. + * struct damon_attrs - Monitoring attributes for accuracy/overhead control. * * @sample_interval: The time between access samplings. * @aggr_interval: The time between monitor results aggregations. * @ops_update_interval: The time between monitoring operations updates. + * @min_nr_regions: The minimum number of adaptive monitoring + * regions. + * @max_nr_regions: The maximum number of adaptive monitoring + * regions. * * For each @sample_interval, DAMON checks whether each region is accessed or * not. It aggregates and keeps the access information (number of accesses to @@ -405,7 +407,21 @@ struct damon_callback { * @ops_update_interval. All time intervals are in micro-seconds. * Please refer to &struct damon_operations and &struct damon_callback for more * detail. + */ +struct damon_attrs { + unsigned long sample_interval; + unsigned long aggr_interval; + unsigned long ops_update_interval; + unsigned long min_nr_regions; + unsigned long max_nr_regions; +}; + +/** + * struct damon_ctx - Represents a context for each monitoring. This is the + * main interface that allows users to set the attributes and get the results + * of the monitoring. * + * @attrs: Monitoring attributes for accuracy/overhead control. * @kdamond: Kernel thread who does the monitoring. * @kdamond_lock: Mutex for the synchronizations with @kdamond. * @@ -427,15 +443,11 @@ struct damon_callback { * @ops: Set of monitoring operations for given use cases. * @callback: Set of callbacks for monitoring events notifications. * - * @min_nr_regions: The minimum number of adaptive monitoring regions. - * @max_nr_regions: The maximum number of adaptive monitoring regions. * @adaptive_targets: Head of monitoring targets (&damon_target) list. * @schemes: Head of schemes (&damos) list. */ struct damon_ctx { - unsigned long sample_interval; - unsigned long aggr_interval; - unsigned long ops_update_interval; + struct damon_attrs attrs; /* private: internal use only */ struct timespec64 last_aggregation; @@ -448,8 +460,6 @@ struct damon_ctx { struct damon_operations ops; struct damon_callback callback; - unsigned long min_nr_regions; - unsigned long max_nr_regions; struct list_head adaptive_targets; struct list_head schemes; }; diff --git a/mm/damon/core.c b/mm/damon/core.c index 6d9f4c2dee35..bbd4c2d991dd 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -382,17 +382,17 @@ struct damon_ctx *damon_new_ctx(void) if (!ctx) return NULL; - ctx->sample_interval = 5 * 1000; - ctx->aggr_interval = 100 * 1000; - ctx->ops_update_interval = 60 * 1000 * 1000; + ctx->attrs.sample_interval = 5 * 1000; + ctx->attrs.aggr_interval = 100 * 1000; + ctx->attrs.ops_update_interval = 60 * 1000 * 1000; ktime_get_coarse_ts64(&ctx->last_aggregation); ctx->last_ops_update = ctx->last_aggregation; mutex_init(&ctx->kdamond_lock); - ctx->min_nr_regions = 10; - ctx->max_nr_regions = 1000; + ctx->attrs.min_nr_regions = 10; + ctx->attrs.max_nr_regions = 1000; INIT_LIST_HEAD(&ctx->adaptive_targets); INIT_LIST_HEAD(&ctx->schemes); @@ -448,11 +448,11 @@ int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, if (min_nr_reg > max_nr_reg) return -EINVAL; - ctx->sample_interval = sample_int; - ctx->aggr_interval = aggr_int; - ctx->ops_update_interval = ops_upd_int; - ctx->min_nr_regions = min_nr_reg; - ctx->max_nr_regions = max_nr_reg; + ctx->attrs.sample_interval = sample_int; + ctx->attrs.aggr_interval = aggr_int; + ctx->attrs.ops_update_interval = ops_upd_int; + ctx->attrs.min_nr_regions = min_nr_reg; + ctx->attrs.max_nr_regions = max_nr_reg; return 0; } @@ -507,8 +507,8 @@ static unsigned long damon_region_sz_limit(struct damon_ctx *ctx) sz += r->ar.end - r->ar.start; } - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; @@ -657,7 +657,7 @@ static bool damon_check_reset_time_interval(struct timespec64 *baseline, static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_aggregation, - ctx->aggr_interval); + ctx->attrs.aggr_interval); } /* @@ -1016,12 +1016,12 @@ static void kdamond_split_regions(struct damon_ctx *ctx) damon_for_each_target(t, ctx) nr_regions += damon_nr_regions(t); - if (nr_regions > ctx->max_nr_regions / 2) + if (nr_regions > ctx->attrs.max_nr_regions / 2) return; /* Maybe the middle of the region has different access frequency */ if (last_nr_regions == nr_regions && - nr_regions < ctx->max_nr_regions / 3) + nr_regions < ctx->attrs.max_nr_regions / 3) nr_subregions = 3; damon_for_each_target(t, ctx) @@ -1039,7 +1039,7 @@ static void kdamond_split_regions(struct damon_ctx *ctx) static bool kdamond_need_update_operations(struct damon_ctx *ctx) { return damon_check_reset_time_interval(&ctx->last_ops_update, - ctx->ops_update_interval); + ctx->attrs.ops_update_interval); } /* @@ -1188,7 +1188,7 @@ static int kdamond_fn(void *data) continue; } - kdamond_usleep(ctx->sample_interval); + kdamond_usleep(ctx->attrs.sample_interval); if (ctx->ops.check_accesses) max_nr_accesses = ctx->ops.check_accesses(ctx); diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 1422037cedd2..74e7542af6d3 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -55,9 +55,9 @@ static ssize_t dbgfs_attrs_read(struct file *file, mutex_lock(&ctx->kdamond_lock); ret = scnprintf(kbuf, ARRAY_SIZE(kbuf), "%lu %lu %lu %lu %lu\n", - ctx->sample_interval, ctx->aggr_interval, - ctx->ops_update_interval, ctx->min_nr_regions, - ctx->max_nr_regions); + ctx->attrs.sample_interval, ctx->attrs.aggr_interval, + ctx->attrs.ops_update_interval, + ctx->attrs.min_nr_regions, ctx->attrs.max_nr_regions); mutex_unlock(&ctx->kdamond_lock); return simple_read_from_buffer(buf, count, ppos, kbuf, ret); diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index f599838b5f64..9310df72e1c5 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -99,10 +99,10 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, unsigned int age_weight = s->quota.weight_age; int hotness; - max_nr_accesses = c->aggr_interval / c->sample_interval; + max_nr_accesses = c->attrs.aggr_interval / c->attrs.sample_interval; freq_subscore = r->nr_accesses * DAMON_MAX_SUBSCORE / max_nr_accesses; - age_in_sec = (unsigned long)r->age * c->aggr_interval / 1000000; + age_in_sec = (unsigned long)r->age * c->attrs.aggr_interval / 1000000; for (age_in_log = 0; age_in_log < DAMON_MAX_AGE_IN_LOG && age_in_sec; age_in_log++, age_in_sec >>= 1) ; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index c2c08c1b316b..0eae47bd9ccb 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -251,8 +251,8 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, for (i = 0; i < 3; i++) sz += regions[i].end - regions[i].start; - if (ctx->min_nr_regions) - sz /= ctx->min_nr_regions; + if (ctx->attrs.min_nr_regions) + sz /= ctx->attrs.min_nr_regions; if (sz < DAMON_MIN_REGION) sz = DAMON_MIN_REGION; From bead3b00088eb8016b32cafa7e0701b3283e68a4 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:33 +0000 Subject: [PATCH 213/350] mm/damon/core: reduce parameters for damon_set_attrs() Number of parameters for 'damon_set_attrs()' is six. As it could be confusing and verbose, this commit reduces the number by receiving single pointer to a 'struct damon_attrs'. Link: https://lkml.kernel.org/r/20220913174449.50645-7-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 4 +--- mm/damon/core.c | 21 +++++---------------- mm/damon/dbgfs.c | 9 ++++++--- mm/damon/lru_sort.c | 10 ++++++++-- mm/damon/reclaim.c | 10 ++++++++-- mm/damon/sysfs.c | 12 ++++++++---- 6 files changed, 36 insertions(+), 30 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 2ceee8b07726..c5dc0c77c772 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -540,9 +540,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg); +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); diff --git a/mm/damon/core.c b/mm/damon/core.c index bbd4c2d991dd..29635a82cb69 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -428,32 +428,21 @@ void damon_destroy_ctx(struct damon_ctx *ctx) /** * damon_set_attrs() - Set attributes for the monitoring. * @ctx: monitoring context - * @sample_int: time interval between samplings - * @aggr_int: time interval between aggregations - * @ops_upd_int: time interval between monitoring operations updates - * @min_nr_reg: minimal number of regions - * @max_nr_reg: maximum number of regions + * @attrs: monitoring attributes * * This function should not be called while the kdamond is running. * Every time interval is in micro-seconds. * * Return: 0 on success, negative error code otherwise. */ -int damon_set_attrs(struct damon_ctx *ctx, unsigned long sample_int, - unsigned long aggr_int, unsigned long ops_upd_int, - unsigned long min_nr_reg, unsigned long max_nr_reg) +int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) { - if (min_nr_reg < 3) + if (attrs->min_nr_regions < 3) return -EINVAL; - if (min_nr_reg > max_nr_reg) + if (attrs->min_nr_regions > attrs->max_nr_regions) return -EINVAL; - ctx->attrs.sample_interval = sample_int; - ctx->attrs.aggr_interval = aggr_int; - ctx->attrs.ops_update_interval = ops_upd_int; - ctx->attrs.min_nr_regions = min_nr_reg; - ctx->attrs.max_nr_regions = max_nr_reg; - + ctx->attrs = *attrs; return 0; } diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index 74e7542af6d3..c00eba4448d8 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -67,7 +67,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { struct damon_ctx *ctx = file->private_data; - unsigned long s, a, r, minr, maxr; + struct damon_attrs attrs; char *kbuf; ssize_t ret; @@ -76,7 +76,10 @@ static ssize_t dbgfs_attrs_write(struct file *file, return PTR_ERR(kbuf); if (sscanf(kbuf, "%lu %lu %lu %lu %lu", - &s, &a, &r, &minr, &maxr) != 5) { + &attrs.sample_interval, &attrs.aggr_interval, + &attrs.ops_update_interval, + &attrs.min_nr_regions, + &attrs.max_nr_regions) != 5) { ret = -EINVAL; goto out; } @@ -87,7 +90,7 @@ static ssize_t dbgfs_attrs_write(struct file *file, goto unlock_out; } - ret = damon_set_attrs(ctx, s, a, r, minr, maxr); + ret = damon_set_attrs(ctx, &attrs); if (!ret) ret = count; unlock_out: diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 307ba71adcfa..6d5f83965276 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -350,13 +350,19 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { + struct damon_attrs attrs = { + .sample_interval = sample_interval, + .aggr_interval = aggr_interval, + .ops_update_interval = 0, + .min_nr_regions = min_nr_regions, + .max_nr_regions = max_nr_regions, + }; struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &attrs); if (err) return err; diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index fe7bc0c55ecb..bc841efbab45 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -275,12 +275,18 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { + struct damon_attrs attrs = { + .sample_interval = sample_interval, + .aggr_interval = aggr_interval, + .ops_update_interval = 0, + .min_nr_regions = min_nr_regions, + .max_nr_regions = max_nr_regions, + }; struct damos *scheme; struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, sample_interval, aggr_interval, 0, - min_nr_regions, max_nr_regions); + err = damon_set_attrs(ctx, &attrs); if (err) return err; diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index da01befae8bd..3dbf3804ec88 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2130,10 +2130,14 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, struct damon_sysfs_intervals *sys_intervals = sys_attrs->intervals; struct damon_sysfs_ul_range *sys_nr_regions = sys_attrs->nr_regions_range; - - return damon_set_attrs(ctx, sys_intervals->sample_us, - sys_intervals->aggr_us, sys_intervals->update_us, - sys_nr_regions->min, sys_nr_regions->max); + struct damon_attrs attrs = { + .sample_interval = sys_intervals->sample_us, + .aggr_interval = sys_intervals->aggr_us, + .ops_update_interval = sys_intervals->update_us, + .min_nr_regions = sys_nr_regions->min, + .max_nr_regions = sys_nr_regions->max, + }; + return damon_set_attrs(ctx, &attrs); } static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) From 8c341ae3341188a0bcef02f05aca7345501ce697 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:34 +0000 Subject: [PATCH 214/350] mm/damon/reclaim: use 'struct damon_attrs' for storing parameters for it DAMON_RECLAIM receives monitoring attributes by parameters one by one to separate variables, and then combine those into 'struct damon_attrs'. This commit makes the module directly stores the parameter values to a static 'struct damon_attrs' variable and use it to simplify the code. Link: https://lkml.kernel.org/r/20220913174449.50645-8-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 36 +++++++++++++++++++----------------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index bc841efbab45..d35a00d8dde2 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -129,14 +129,22 @@ module_param(wmarks_mid, ulong, 0600); static unsigned long wmarks_low __read_mostly = 200; module_param(wmarks_low, ulong, 0600); +static struct damon_attrs damon_reclaim_mon_attrs = { + .sample_interval = 5000, + .aggr_interval = 100000, + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; + /* * Sampling interval for the monitoring in microseconds. * * The sampling interval of DAMON for the cold memory monitoring. Please refer * to the DAMON documentation for more detail. 5 ms by default. */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); +module_param_named(sample_interval, damon_reclaim_mon_attrs.sample_interval, + ulong, 0600); /* * Aggregation interval for the monitoring in microseconds. @@ -144,8 +152,8 @@ module_param(sample_interval, ulong, 0600); * The aggregation interval of DAMON for the cold memory monitoring. Please * refer to the DAMON documentation for more detail. 100 ms by default. */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); +module_param_named(aggr_interval, damon_reclaim_mon_attrs.aggr_interval, ulong, + 0600); /* * Minimum number of monitoring regions. @@ -155,8 +163,8 @@ module_param(aggr_interval, ulong, 0600); * But, setting this too high could result in increased monitoring overhead. * Please refer to the DAMON documentation for more detail. 10 by default. */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); +module_param_named(min_nr_regions, damon_reclaim_mon_attrs.min_nr_regions, + ulong, 0600); /* * Maximum number of monitoring regions. @@ -166,8 +174,8 @@ module_param(min_nr_regions, ulong, 0600); * However, setting this too low could result in bad monitoring quality. * Please refer to the DAMON documentation for more detail. 1000 by default. */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +module_param_named(max_nr_regions, damon_reclaim_mon_attrs.max_nr_regions, + ulong, 0600); /* * Start of the target memory region in physical address. @@ -239,7 +247,8 @@ static struct damos *damon_reclaim_new_scheme(void) .min_nr_accesses = 0, .max_nr_accesses = 0, /* for min_age or more micro-seconds */ - .min_age_region = min_age / aggr_interval, + .min_age_region = min_age / + damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; struct damos_watermarks wmarks = { @@ -275,18 +284,11 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { - struct damon_attrs attrs = { - .sample_interval = sample_interval, - .aggr_interval = aggr_interval, - .ops_update_interval = 0, - .min_nr_regions = min_nr_regions, - .max_nr_regions = max_nr_regions, - }; struct damos *scheme; struct damon_addr_range addr_range; int err = 0; - err = damon_set_attrs(ctx, &attrs); + err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); if (err) return err; From 135e128f8e48f30ea65e0ffad34dca37d2c8d171 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:35 +0000 Subject: [PATCH 215/350] mm/damon/lru_sort: use 'struct damon_attrs' for storing parameters for it DAMON_LRU_SORT receives monitoring attributes by parameters one by one to separate variables, and then combines those into 'struct damon_attrs'. This commit makes the module directly stores the parameter values to a static 'struct damon_attrs' variable and use it to simplify the code. Link: https://lkml.kernel.org/r/20220913174449.50645-9-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 6d5f83965276..ade985b83652 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -127,14 +127,22 @@ module_param(wmarks_mid, ulong, 0600); static unsigned long wmarks_low __read_mostly = 50; module_param(wmarks_low, ulong, 0600); +static struct damon_attrs damon_lru_sort_mon_attrs = { + .sample_interval = 5000, + .aggr_interval = 100000, + .ops_update_interval = 0, + .min_nr_regions = 10, + .max_nr_regions = 1000, +}; + /* * Sampling interval for the monitoring in microseconds. * * The sampling interval of DAMON for the hot/cold memory monitoring. Please * refer to the DAMON documentation for more detail. 5 ms by default. */ -static unsigned long sample_interval __read_mostly = 5000; -module_param(sample_interval, ulong, 0600); +module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, + ulong, 0600); /* * Aggregation interval for the monitoring in microseconds. @@ -142,8 +150,8 @@ module_param(sample_interval, ulong, 0600); * The aggregation interval of DAMON for the hot/cold memory monitoring. * Please refer to the DAMON documentation for more detail. 100 ms by default. */ -static unsigned long aggr_interval __read_mostly = 100000; -module_param(aggr_interval, ulong, 0600); +module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, + 0600); /* * Minimum number of monitoring regions. @@ -153,8 +161,8 @@ module_param(aggr_interval, ulong, 0600); * But, setting this too high could result in increased monitoring overhead. * Please refer to the DAMON documentation for more detail. 10 by default. */ -static unsigned long min_nr_regions __read_mostly = 10; -module_param(min_nr_regions, ulong, 0600); +module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, + ulong, 0600); /* * Maximum number of monitoring regions. @@ -164,8 +172,8 @@ module_param(min_nr_regions, ulong, 0600); * However, setting this too low could result in bad monitoring quality. * Please refer to the DAMON documentation for more detail. 1000 by default. */ -static unsigned long max_nr_regions __read_mostly = 1000; -module_param(max_nr_regions, ulong, 0600); +module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, + ulong, 0600); /* * Start of the target memory region in physical address. @@ -350,25 +358,19 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { - struct damon_attrs attrs = { - .sample_interval = sample_interval, - .aggr_interval = aggr_interval, - .ops_update_interval = 0, - .min_nr_regions = min_nr_regions, - .max_nr_regions = max_nr_regions, - }; struct damos *scheme; struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; - err = damon_set_attrs(ctx, &attrs); + err = damon_set_attrs(ctx, &damon_lru_sort_mon_attrs); if (err) return err; /* aggr_interval / sample_interval is the maximum nr_accesses */ - hot_thres = aggr_interval / sample_interval * hot_thres_access_freq / - 1000; + hot_thres = damon_lru_sort_mon_attrs.aggr_interval / + damon_lru_sort_mon_attrs.sample_interval * + hot_thres_access_freq / 1000; scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; @@ -376,7 +378,7 @@ static int damon_lru_sort_apply_parameters(void) if (err) return err; - cold_thres = cold_min_age / aggr_interval; + cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); if (!scheme) return -ENOMEM; From b3c28d886329d8df66679f72f3f3c81c0dd21e88 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:36 +0000 Subject: [PATCH 216/350] mm/damon: implement a monitoring attributes module parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for monitoring attributes that having same names. This commot implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-10-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 mm/damon/modules-common.h diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h new file mode 100644 index 000000000000..0abd0636bc64 --- /dev/null +++ b/mm/damon/modules-common.h @@ -0,0 +1,18 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Common Primitives for DAMON Modules + * + * Author: SeongJae Park + */ + +#include + +#define DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(attrs) \ + module_param_named(sample_interval, attrs.sample_interval, \ + ulong, 0600); \ + module_param_named(aggr_interval, attrs.aggr_interval, ulong, \ + 0600); \ + module_param_named(min_nr_regions, attrs.min_nr_regions, ulong, \ + 0600); \ + module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ + 0600); From 95f7c05d73fc6d9cfe43fb18b2f16b21eb55b5bf Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:37 +0000 Subject: [PATCH 217/350] mm/damon/lru_sort: use monitoring attributes parameters generaotr macro This commit makes DAMON_LRU_SORT to generate the module parameters for DAMON monitoring attributes using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-11-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 47 +++++---------------------------------------- 1 file changed, 5 insertions(+), 42 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index ade985b83652..e95626acee6f 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -13,6 +13,8 @@ #include #include +#include "modules-common.h" + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -128,52 +130,13 @@ static unsigned long wmarks_low __read_mostly = 50; module_param(wmarks_low, ulong, 0600); static struct damon_attrs damon_lru_sort_mon_attrs = { - .sample_interval = 5000, - .aggr_interval = 100000, + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ .ops_update_interval = 0, .min_nr_regions = 10, .max_nr_regions = 1000, }; - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the hot/cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 5 ms by default. - */ -module_param_named(sample_interval, damon_lru_sort_mon_attrs.sample_interval, - ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the hot/cold memory monitoring. - * Please refer to the DAMON documentation for more detail. 100 ms by default. - */ -module_param_named(aggr_interval, damon_lru_sort_mon_attrs.aggr_interval, ulong, - 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. - */ -module_param_named(min_nr_regions, damon_lru_sort_mon_attrs.min_nr_regions, - ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the hot/cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -module_param_named(max_nr_regions, damon_lru_sort_mon_attrs.max_nr_regions, - ulong, 0600); +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_lru_sort_mon_attrs); /* * Start of the target memory region in physical address. From fdfc119c17cfbc0aa26be6b070f49aa1584a7e08 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:38 +0000 Subject: [PATCH 218/350] mm/damon/reclaim: use monitoring attributes parameters generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMON monitoring attributes using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-12-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 47 +++++----------------------------------------- 1 file changed, 5 insertions(+), 42 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index d35a00d8dde2..48326bef20f5 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -13,6 +13,8 @@ #include #include +#include "modules-common.h" + #ifdef MODULE_PARAM_PREFIX #undef MODULE_PARAM_PREFIX #endif @@ -130,52 +132,13 @@ static unsigned long wmarks_low __read_mostly = 200; module_param(wmarks_low, ulong, 0600); static struct damon_attrs damon_reclaim_mon_attrs = { - .sample_interval = 5000, - .aggr_interval = 100000, + .sample_interval = 5000, /* 5 ms */ + .aggr_interval = 100000, /* 100 ms */ .ops_update_interval = 0, .min_nr_regions = 10, .max_nr_regions = 1000, }; - -/* - * Sampling interval for the monitoring in microseconds. - * - * The sampling interval of DAMON for the cold memory monitoring. Please refer - * to the DAMON documentation for more detail. 5 ms by default. - */ -module_param_named(sample_interval, damon_reclaim_mon_attrs.sample_interval, - ulong, 0600); - -/* - * Aggregation interval for the monitoring in microseconds. - * - * The aggregation interval of DAMON for the cold memory monitoring. Please - * refer to the DAMON documentation for more detail. 100 ms by default. - */ -module_param_named(aggr_interval, damon_reclaim_mon_attrs.aggr_interval, ulong, - 0600); - -/* - * Minimum number of monitoring regions. - * - * The minimal number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set lower-bound of the monitoring quality. - * But, setting this too high could result in increased monitoring overhead. - * Please refer to the DAMON documentation for more detail. 10 by default. - */ -module_param_named(min_nr_regions, damon_reclaim_mon_attrs.min_nr_regions, - ulong, 0600); - -/* - * Maximum number of monitoring regions. - * - * The maximum number of monitoring regions of DAMON for the cold memory - * monitoring. This can be used to set upper-bound of the monitoring overhead. - * However, setting this too low could result in bad monitoring quality. - * Please refer to the DAMON documentation for more detail. 1000 by default. - */ -module_param_named(max_nr_regions, damon_reclaim_mon_attrs.max_nr_regions, - ulong, 0600); +DEFINE_DAMON_MODULES_MON_ATTRS_PARAMS(damon_reclaim_mon_attrs); /* * Start of the target memory region in physical address. From b324ee36e9685689a55c1faee669cd7a1a42bae0 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:39 +0000 Subject: [PATCH 219/350] mm/damon/modules-common: implement a watermarks module parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for watermarks that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-13-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 0abd0636bc64..1370590a37d1 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -16,3 +16,10 @@ 0600); \ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); + +#define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ + module_param_named(wmarks_interval, wmarks->interval, ulong, \ + 0600); \ + module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ + module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ + module_param_named(wmarks_low, wmarks.lowulong, 0600); From 6517d2d97709e01c6758dcccc7a51e3731c8706f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:40 +0000 Subject: [PATCH 220/350] mm/damon/lru_sort: use watermarks parameters generator macro This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-14-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 64 ++++++--------------------------------- mm/damon/modules-common.h | 4 +-- 2 files changed, 12 insertions(+), 56 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index e95626acee6f..20760b39b50a 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -90,44 +90,14 @@ module_param(quota_ms, ulong, 0600); static unsigned long quota_reset_interval_ms __read_mostly = 1000; module_param(quota_reset_interval_ms, ulong, 0600); -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_LRU_SORT is - * enabled but inactive due to its watermarks rule. 5 seconds by default. - */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_LRU_SORT becomes inactive, so it does nothing but periodically - * checks the watermarks. 200 (20%) by default. - */ -static unsigned long wmarks_high __read_mostly = 200; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_LRU_SORT becomes active, so starts the monitoring - * and the LRU-lists sorting. 150 (15%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 150; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_LRU_SORT becomes inactive, so it does nothing but periodically checks - * the watermarks. 50 (5%) by default. - */ -static unsigned long wmarks_low __read_mostly = 50; -module_param(wmarks_low, ulong, 0600); +struct damos_watermarks damon_lru_sort_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 200, /* 20 percent */ + .mid = 150, /* 15 percent */ + .low = 50, /* 5 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_lru_sort_wmarks); static struct damon_attrs damon_lru_sort_mon_attrs = { .sample_interval = 5000, /* 5 ms */ @@ -242,13 +212,6 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try LRU-lists sorting of hot pages for more than half @@ -270,7 +233,7 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_lru_sort_wmarks); } /* Create a DAMON-based operation scheme for cold memory regions */ @@ -287,13 +250,6 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try LRU-lists sorting of cold pages for more than @@ -316,7 +272,7 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_lru_sort_wmarks); } static int damon_lru_sort_apply_parameters(void) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 1370590a37d1..4c2ce84869d5 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -18,8 +18,8 @@ 0600); #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ - module_param_named(wmarks_interval, wmarks->interval, ulong, \ + module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ - module_param_named(wmarks_low, wmarks.lowulong, 0600); + module_param_named(wmarks_low, wmarks.low, ulong, 0600); From 34f47ea688bb6d1c6d04f8d72546a623bd8d59de Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:41 +0000 Subject: [PATCH 221/350] mm/damon/reclaim: use watermarks parameters generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-15-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 56 ++++++++-------------------------------------- 1 file changed, 9 insertions(+), 47 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 48326bef20f5..7f845f617dc5 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -91,45 +91,14 @@ module_param(quota_sz, ulong, 0600); static unsigned long quota_reset_interval_ms __read_mostly = 1000; module_param(quota_reset_interval_ms, ulong, 0600); -/* - * The watermarks check time interval in microseconds. - * - * Minimal time to wait before checking the watermarks, when DAMON_RECLAIM is - * enabled but inactive due to its watermarks rule. 5 seconds by default. - */ -static unsigned long wmarks_interval __read_mostly = 5000000; -module_param(wmarks_interval, ulong, 0600); - -/* - * Free memory rate (per thousand) for the high watermark. - * - * If free memory of the system in bytes per thousand bytes is higher than - * this, DAMON_RECLAIM becomes inactive, so it does nothing but periodically - * checks the watermarks. 500 (50%) by default. - */ -static unsigned long wmarks_high __read_mostly = 500; -module_param(wmarks_high, ulong, 0600); - -/* - * Free memory rate (per thousand) for the middle watermark. - * - * If free memory of the system in bytes per thousand bytes is between this and - * the low watermark, DAMON_RECLAIM becomes active, so starts the monitoring - * and the reclaiming. 400 (40%) by default. - */ -static unsigned long wmarks_mid __read_mostly = 400; -module_param(wmarks_mid, ulong, 0600); - -/* - * Free memory rate (per thousand) for the low watermark. - * - * If free memory of the system in bytes per thousand bytes is lower than this, - * DAMON_RECLAIM becomes inactive, so it does nothing but periodically checks - * the watermarks. In the case, the system falls back to the LRU-based page - * granularity reclamation logic. 200 (20%) by default. - */ -static unsigned long wmarks_low __read_mostly = 200; -module_param(wmarks_low, ulong, 0600); +struct damos_watermarks damon_reclaim_wmarks = { + .metric = DAMOS_WMARK_FREE_MEM_RATE, + .interval = 5000000, /* 5 seconds */ + .high = 500, /* 50 percent */ + .mid = 400, /* 40 percent */ + .low = 200, /* 20 percent */ +}; +DEFINE_DAMON_MODULES_WMARKS_PARAMS(damon_reclaim_wmarks); static struct damon_attrs damon_reclaim_mon_attrs = { .sample_interval = 5000, /* 5 ms */ @@ -214,13 +183,6 @@ static struct damos *damon_reclaim_new_scheme(void) damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; - struct damos_watermarks wmarks = { - .metric = DAMOS_WMARK_FREE_MEM_RATE, - .interval = wmarks_interval, - .high = wmarks_high, - .mid = wmarks_mid, - .low = wmarks_low, - }; struct damos_quota quota = { /* * Do not try reclamation for more than quota_ms milliseconds @@ -242,7 +204,7 @@ static struct damos *damon_reclaim_new_scheme(void) /* under the quota. */ "a, /* (De)activate this according to the watermarks. */ - &wmarks); + &damon_reclaim_wmarks); } static int damon_reclaim_apply_parameters(void) From 528ef2d996408d4b9cccf4b23a9976ab5e75cf39 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:42 +0000 Subject: [PATCH 222/350] mm/damon/modules-common: implement a stats parameters generator macro DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for DAMOS statistics that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-16-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 4c2ce84869d5..ed973e0770ae 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -23,3 +23,15 @@ module_param_named(wmarks_high, wmarks.high, ulong, 0600); \ module_param_named(wmarks_mid, wmarks.mid, ulong, 0600); \ module_param_named(wmarks_low, wmarks.low, ulong, 0600); + +#define DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(stat, try_name, \ + succ_name, qt_exceed_name) \ + module_param_named(nr_##try_name, stat.nr_tried, ulong, 0400); \ + module_param_named(bytes_##try_name, stat.sz_tried, ulong, \ + 0400); \ + module_param_named(nr_##succ_name, stat.nr_applied, ulong, \ + 0400); \ + module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ + 0400); \ + module_param_named(qt_exceed_name, stat.qt_exceeds, ulong, \ + 0400); From b71f3ea83242890900bb0668201568df81244547 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:43 +0000 Subject: [PATCH 223/350] mm/damon/reclaim: use stat parameters generator This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS statistics using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-17-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 41 +++++------------------------------------ 1 file changed, 5 insertions(+), 36 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 7f845f617dc5..1ef8353ac15a 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -136,35 +136,9 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of memory regions that tried to be reclaimed. - */ -static unsigned long nr_reclaim_tried_regions __read_mostly; -module_param(nr_reclaim_tried_regions, ulong, 0400); - -/* - * Total bytes of memory regions that tried to be reclaimed. - */ -static unsigned long bytes_reclaim_tried_regions __read_mostly; -module_param(bytes_reclaim_tried_regions, ulong, 0400); - -/* - * Number of memory regions that successfully be reclaimed. - */ -static unsigned long nr_reclaimed_regions __read_mostly; -module_param(nr_reclaimed_regions, ulong, 0400); - -/* - * Total bytes of memory regions that successfully be reclaimed. - */ -static unsigned long bytes_reclaimed_regions __read_mostly; -module_param(bytes_reclaimed_regions, ulong, 0400); - -/* - * Number of times that the time/space quota limits have exceeded - */ -static unsigned long nr_quota_exceeds __read_mostly; -module_param(nr_quota_exceeds, ulong, 0400); +static struct damos_stat damon_reclaim_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_reclaim_stat, + reclaim_tried_regions, reclaimed_regions, quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; @@ -318,13 +292,8 @@ static int damon_reclaim_after_aggregation(struct damon_ctx *c) struct damos *s; /* update the stats parameter */ - damon_for_each_scheme(s, c) { - nr_reclaim_tried_regions = s->stat.nr_tried; - bytes_reclaim_tried_regions = s->stat.sz_tried; - nr_reclaimed_regions = s->stat.nr_applied; - bytes_reclaimed_regions = s->stat.sz_applied; - nr_quota_exceeds = s->stat.qt_exceeds; - } + damon_for_each_scheme(s, c) + damon_reclaim_stat = s->stat; return damon_reclaim_handle_commit_inputs(); } From dd172fbf8f1d3befd0a22357a251d8d516354d5f Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:44 +0000 Subject: [PATCH 224/350] mm/damon/lru_sort: use stat generator This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS statistics using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-18-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 83 +++++++-------------------------------------- 1 file changed, 12 insertions(+), 71 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 20760b39b50a..13a752aed272 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -135,65 +135,15 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); -/* - * Number of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_hot_regions __read_mostly; -module_param(nr_lru_sort_tried_hot_regions, ulong, 0400); +static struct damos_stat damon_lru_sort_hot_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_hot_stat, + lru_sort_tried_hot_regions, lru_sorted_hot_regions, + hot_quota_exceeds); -/* - * Total bytes of hot memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_hot_regions __read_mostly; -module_param(bytes_lru_sort_tried_hot_regions, ulong, 0400); - -/* - * Number of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_hot_regions __read_mostly; -module_param(nr_lru_sorted_hot_regions, ulong, 0400); - -/* - * Total bytes of hot memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_hot_regions __read_mostly; -module_param(bytes_lru_sorted_hot_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for hot regions have exceeded - */ -static unsigned long nr_hot_quota_exceeds __read_mostly; -module_param(nr_hot_quota_exceeds, ulong, 0400); - -/* - * Number of cold memory regions that tried to be LRU-sorted. - */ -static unsigned long nr_lru_sort_tried_cold_regions __read_mostly; -module_param(nr_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that tried to be LRU-sorted. - */ -static unsigned long bytes_lru_sort_tried_cold_regions __read_mostly; -module_param(bytes_lru_sort_tried_cold_regions, ulong, 0400); - -/* - * Number of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long nr_lru_sorted_cold_regions __read_mostly; -module_param(nr_lru_sorted_cold_regions, ulong, 0400); - -/* - * Total bytes of cold memory regions that successfully be LRU-sorted. - */ -static unsigned long bytes_lru_sorted_cold_regions __read_mostly; -module_param(bytes_lru_sorted_cold_regions, ulong, 0400); - -/* - * Number of times that the time quota limit for cold regions have exceeded - */ -static unsigned long nr_cold_quota_exceeds __read_mostly; -module_param(nr_cold_quota_exceeds, ulong, 0400); +static struct damos_stat damon_lru_sort_cold_stat; +DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, + lru_sort_tried_cold_regions, lru_sorted_cold_regions, + cold_quota_exceeds); static struct damon_ctx *ctx; static struct damon_target *target; @@ -397,19 +347,10 @@ static int damon_lru_sort_after_aggregation(struct damon_ctx *c) /* update the stats parameter */ damon_for_each_scheme(s, c) { - if (s->action == DAMOS_LRU_PRIO) { - nr_lru_sort_tried_hot_regions = s->stat.nr_tried; - bytes_lru_sort_tried_hot_regions = s->stat.sz_tried; - nr_lru_sorted_hot_regions = s->stat.nr_applied; - bytes_lru_sorted_hot_regions = s->stat.sz_applied; - nr_hot_quota_exceeds = s->stat.qt_exceeds; - } else if (s->action == DAMOS_LRU_DEPRIO) { - nr_lru_sort_tried_cold_regions = s->stat.nr_tried; - bytes_lru_sort_tried_cold_regions = s->stat.sz_tried; - nr_lru_sorted_cold_regions = s->stat.nr_applied; - bytes_lru_sorted_cold_regions = s->stat.sz_applied; - nr_cold_quota_exceeds = s->stat.qt_exceeds; - } + if (s->action == DAMOS_LRU_PRIO) + damon_lru_sort_hot_stat = s->stat; + else if (s->action == DAMOS_LRU_DEPRIO) + damon_lru_sort_cold_stat = s->stat; } return damon_lru_sort_handle_commit_inputs(); From 63e0f90bac0c772c14aecfe36783ab60795d05db Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:45 +0000 Subject: [PATCH 225/350] mm/damon/modules-common: implement a damos quota params generator DAMON_RECLAIM and DAMON_LRU_SORT have module parameters for DAMOS quotas that having same names. This commit implements a macro for generating such module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-19-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index ed973e0770ae..3e99810b4689 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -17,6 +17,12 @@ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + module_param_named(quota_ms, quota.ms, ulong, 0600); \ + module_param_named(quota_sz, quota.sz, ulong, 0600); \ + module_param_named(quota_reset_interval_ms, \ + quota.reset_interval, ulong, 0600); + #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ @@ -33,5 +39,5 @@ 0400); \ module_param_named(bytes_##succ_name, stat.sz_applied, ulong, \ 0400); \ - module_param_named(qt_exceed_name, stat.qt_exceeds, ulong, \ + module_param_named(nr_##qt_exceed_name, stat.qt_exceeds, ulong, \ 0400); From 1f55402685d10aa336cf9b25e83b416e4fc0c153 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:46 +0000 Subject: [PATCH 226/350] mm/damon/modules-common: implement damos time quota params generator DAMON_LRU_SORT have module parameters for DAMOS time quota only but size quota. This commit implements a macro for generating the module parameters so that we can reuse later. Link: https://lkml.kernel.org/r/20220913174449.50645-20-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/modules-common.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/damon/modules-common.h b/mm/damon/modules-common.h index 3e99810b4689..5a4921851d32 100644 --- a/mm/damon/modules-common.h +++ b/mm/damon/modules-common.h @@ -17,12 +17,15 @@ module_param_named(max_nr_regions, attrs.max_nr_regions, ulong, \ 0600); -#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ +#define DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ module_param_named(quota_ms, quota.ms, ulong, 0600); \ - module_param_named(quota_sz, quota.sz, ulong, 0600); \ module_param_named(quota_reset_interval_ms, \ quota.reset_interval, ulong, 0600); +#define DEFINE_DAMON_MODULES_DAMOS_QUOTAS(quota) \ + DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(quota) \ + module_param_named(quota_sz, quota.sz, ulong, 0600); + #define DEFINE_DAMON_MODULES_WMARKS_PARAMS(wmarks) \ module_param_named(wmarks_interval, wmarks.interval, ulong, \ 0600); \ From a9d57c7369532cdcd3a834c3f0cc5ad6b2f0f1ff Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:47 +0000 Subject: [PATCH 227/350] mm/damon/reclaim: use the quota params generator macro This commit makes DAMON_RECLAIM to generate the module parameters for DAMOS quotas using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-21-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 64 +++++++++------------------------------------- 1 file changed, 12 insertions(+), 52 deletions(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1ef8353ac15a..1acf808e1624 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -52,44 +52,17 @@ module_param(commit_inputs, bool, 0600); static unsigned long min_age __read_mostly = 120000000; module_param(min_age, ulong, 0600); -/* - * Limit of time for trying the reclamation in milliseconds. - * - * DAMON_RECLAIM tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying reclamation of cold pages. This can be - * used for limiting CPU consumption of DAMON_RECLAIM. If the value is zero, - * the limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * Limit of size of memory for the reclamation in bytes. - * - * DAMON_RECLAIM charges amount of memory which it tried to reclaim within a - * time window (quota_reset_interval_ms) and makes no more than this limit is - * tried. This can be used for limiting consumption of CPU and IO. If this - * value is zero, the limit is disabled. - * - * 128 MiB by default. - */ -static unsigned long quota_sz __read_mostly = 128 * 1024 * 1024; -module_param(quota_sz, ulong, 0600); - -/* - * The time/size quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms) and size - * (quota_sz). That is, DAMON_RECLAIM does not try reclamation for more than - * quota_ms milliseconds or quota_sz bytes within quota_reset_interval_ms - * milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); +static struct damos_quota damon_reclaim_quota = { + /* use up to 10 ms time, reclaim up to 128 MiB per 1 sec by default */ + .ms = 10, + .sz = 128 * 1024 * 1024, + .reset_interval = 1000, + /* Within the quota, page out older regions first. */ + .weight_sz = 0, + .weight_nr_accesses = 0, + .weight_age = 1 +}; +DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, @@ -157,26 +130,13 @@ static struct damos *damon_reclaim_new_scheme(void) damon_reclaim_mon_attrs.aggr_interval, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try reclamation for more than quota_ms milliseconds - * or quota_sz bytes within quota_reset_interval_ms. - */ - .ms = quota_ms, - .sz = quota_sz, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, page out older regions first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1 - }; return damon_new_scheme( &pattern, /* page out those, as soon as found */ DAMOS_PAGEOUT, /* under the quota. */ - "a, + &damon_reclaim_quota, /* (De)activate this according to the watermarks. */ &damon_reclaim_wmarks); } From 45b8212fc555d07ed78b9270283d61afbdee1df6 Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:48 +0000 Subject: [PATCH 228/350] mm/damon/lru_sort: use quotas param generator This commit makes DAMON_LRU_SORT to generate the module parameters for DAMOS watermarks using the generator macro to simplify the code and reduce duplicates. Link: https://lkml.kernel.org/r/20220913174449.50645-22-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 70 ++++++++++++--------------------------------- 1 file changed, 19 insertions(+), 51 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 13a752aed272..8d9c3d1fd6be 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -65,30 +65,17 @@ module_param(hot_thres_access_freq, ulong, 0600); static unsigned long cold_min_age __read_mostly = 120000000; module_param(cold_min_age, ulong, 0600); -/* - * Limit of time for trying the LRU lists sorting in milliseconds. - * - * DAMON_LRU_SORT tries to use only up to this time within a time window - * (quota_reset_interval_ms) for trying LRU lists sorting. This can be used - * for limiting CPU consumption of DAMON_LRU_SORT. If the value is zero, the - * limit is disabled. - * - * 10 ms by default. - */ -static unsigned long quota_ms __read_mostly = 10; -module_param(quota_ms, ulong, 0600); - -/* - * The time quota charge reset interval in milliseconds. - * - * The charge reset interval for the quota of time (quota_ms). That is, - * DAMON_LRU_SORT does not try LRU-lists sorting for more than quota_ms - * milliseconds or quota_sz bytes within quota_reset_interval_ms milliseconds. - * - * 1 second by default. - */ -static unsigned long quota_reset_interval_ms __read_mostly = 1000; -module_param(quota_reset_interval_ms, ulong, 0600); +static struct damos_quota damon_lru_sort_quota = { + /* Use up to 10 ms per 1 sec, by default */ + .ms = 10, + .sz = 0, + .reset_interval = 1000, + /* Within the quota, mark hotter regions accessed first. */ + .weight_sz = 0, + .weight_nr_accesses = 1, + .weight_age = 0, +}; +DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); struct damos_watermarks damon_lru_sort_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, @@ -162,19 +149,10 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of hot pages for more than half - * of quota_ms milliseconds within quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark hotter regions accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 1, - .weight_age = 0, - }; + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot pages sorting */ + quota.ms = quota.ms / 2; return damon_new_scheme( &pattern, @@ -200,20 +178,10 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_quota quota = { - /* - * Do not try LRU-lists sorting of cold pages for more than - * half of quota_ms milliseconds within - * quota_reset_interval_ms. - */ - .ms = quota_ms / 2, - .sz = 0, - .reset_interval = quota_reset_interval_ms, - /* Within the quota, mark colder regions not accessed first. */ - .weight_sz = 0, - .weight_nr_accesses = 0, - .weight_age = 1, - }; + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for cold pages sorting */ + quota.ms = quota.ms / 2; return damon_new_scheme( &pattern, From a62518ab1da4eb8bf0335c0e254b3e82e9ce222e Mon Sep 17 00:00:00 2001 From: SeongJae Park Date: Tue, 13 Sep 2022 17:44:49 +0000 Subject: [PATCH 229/350] mm/damon/lru_sort: deduplicate hot/cold schemes generators damon_lru_sort_new_{hot,cold}_scheme() have quite a lot of duplicates. This commit factors out the duplicate to a separate function and use it for reducing the duplicate. Link: https://lkml.kernel.org/r/20220913174449.50645-23-sj@kernel.org Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 45 +++++++++++++++++++++------------------------ 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 8d9c3d1fd6be..07a0908963fd 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -135,6 +135,25 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, static struct damon_ctx *ctx; static struct damon_target *target; +static struct damos *damon_lru_sort_new_scheme( + struct damos_access_pattern *pattern, enum damos_action action) +{ + struct damos_quota quota = damon_lru_sort_quota; + + /* Use half of total quota for hot/cold pages sorting */ + quota.ms = quota.ms / 2; + + return damon_new_scheme( + /* find the pattern, and */ + pattern, + /* (de)prioritize on LRU-lists */ + action, + /* under the quota. */ + "a, + /* (De)activate this according to the watermarks. */ + &damon_lru_sort_wmarks); +} + /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { @@ -149,19 +168,8 @@ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) .min_age_region = 0, .max_age_region = UINT_MAX, }; - struct damos_quota quota = damon_lru_sort_quota; - /* Use half of total quota for hot pages sorting */ - quota.ms = quota.ms / 2; - - return damon_new_scheme( - &pattern, - /* prioritize those on LRU lists, as soon as found */ - DAMOS_LRU_PRIO, - /* under the quota. */ - "a, - /* (De)activate this according to the watermarks. */ - &damon_lru_sort_wmarks); + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based operation scheme for cold memory regions */ @@ -178,19 +186,8 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) .min_age_region = cold_thres, .max_age_region = UINT_MAX, }; - struct damos_quota quota = damon_lru_sort_quota; - /* Use half of total quota for cold pages sorting */ - quota.ms = quota.ms / 2; - - return damon_new_scheme( - &pattern, - /* mark those as not accessed, as soon as found */ - DAMOS_LRU_DEPRIO, - /* under the quota. */ - "a, - /* (De)activate this according to the watermarks. */ - &damon_lru_sort_wmarks); + return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } static int damon_lru_sort_apply_parameters(void) From 8ef4d5caa66d62b3b87a14d01562fb487651df2e Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:24 +0800 Subject: [PATCH 230/350] mm/damon: simplify the parameter passing for 'prepare_access_checks' Patch series "mm/damon: code simplifications and cleanups". This patchset contains some code simplifications and cleanups for DAMON. This patch (of 4): The parameter 'struct damon_ctx *ctx' isn't used in the functions __damon_{p,v}a_prepare_access_check(), so we can remove it and simplify the parameter passing. Link: https://lkml.kernel.org/r/1663060287-30201-1-git-send-email-kaixuxia@tencent.com Link: https://lkml.kernel.org/r/1663060287-30201-2-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/paddr.c | 5 ++--- mm/damon/vaddr.c | 6 +++--- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index 1ada62db68b1..dfeebffe82f4 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -63,8 +63,7 @@ out: folio_put(folio); } -static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, - struct damon_region *r) +static void __damon_pa_prepare_access_check(struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -78,7 +77,7 @@ static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) damon_for_each_target(t, ctx) { damon_for_each_region(r, t) - __damon_pa_prepare_access_check(ctx, r); + __damon_pa_prepare_access_check(r); } } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 0eae47bd9ccb..3f84584f9982 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -397,8 +397,8 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void __damon_va_prepare_access_check(struct damon_ctx *ctx, - struct mm_struct *mm, struct damon_region *r) +static void __damon_va_prepare_access_check(struct mm_struct *mm, + struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -416,7 +416,7 @@ static void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - __damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(mm, r); mmput(mm); } } From f1c71c2825218dc8b35c04ab439fdf3d32778c7c Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:25 +0800 Subject: [PATCH 231/350] mm/damon/sysfs: simplify the variable 'pid' assignment operation We can initialize the variable 'pid' with '-1' in pid_show() to simplify the variable assignment operation and make the code more readable. Link: https://lkml.kernel.org/r/1663060287-30201-3-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 3dbf3804ec88..1fa0023f136e 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2592,19 +2592,16 @@ static ssize_t pid_show(struct kobject *kobj, struct damon_sysfs_kdamond *kdamond = container_of(kobj, struct damon_sysfs_kdamond, kobj); struct damon_ctx *ctx; - int pid; + int pid = -1; if (!mutex_trylock(&damon_sysfs_lock)) return -EBUSY; ctx = kdamond->damon_ctx; - if (!ctx) { - pid = -1; + if (!ctx) goto out; - } + mutex_lock(&ctx->kdamond_lock); - if (!ctx->kdamond) - pid = -1; - else + if (ctx->kdamond) pid = ctx->kdamond->pid; mutex_unlock(&ctx->kdamond_lock); out: From 29454cf6ab3c49bc5d3f443e1d1417feca3d0ce5 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 13 Sep 2022 17:11:26 +0800 Subject: [PATCH 232/350] mm/damon/core: simplify the kdamond stop mechanism by removing 'done' When the 'kdamond_wait_activation()' function or 'after_sampling()' or 'after_aggregation()' DAMON callbacks return an error, it is unnecessary to use bool 'done' to check if kdamond should be finished. This commit simplifies the kdamond stop mechanism by removing 'done' and break the while loop directly in the cases. Link: https://lkml.kernel.org/r/1663060287-30201-4-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/core.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-) diff --git a/mm/damon/core.c b/mm/damon/core.c index 29635a82cb69..a843673c11cf 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1152,30 +1152,25 @@ static int kdamond_fn(void *data) struct damon_region *r, *next; unsigned int max_nr_accesses = 0; unsigned long sz_limit = 0; - bool done = false; pr_debug("kdamond (%d) starts\n", current->pid); if (ctx->ops.init) ctx->ops.init(ctx); if (ctx->callback.before_start && ctx->callback.before_start(ctx)) - done = true; + goto done; sz_limit = damon_region_sz_limit(ctx); - while (!kdamond_need_stop(ctx) && !done) { - if (kdamond_wait_activation(ctx)) { - done = true; - continue; - } + while (!kdamond_need_stop(ctx)) { + if (kdamond_wait_activation(ctx)) + break; if (ctx->ops.prepare_access_checks) ctx->ops.prepare_access_checks(ctx); if (ctx->callback.after_sampling && - ctx->callback.after_sampling(ctx)) { - done = true; - continue; - } + ctx->callback.after_sampling(ctx)) + break; kdamond_usleep(ctx->attrs.sample_interval); @@ -1187,10 +1182,8 @@ static int kdamond_fn(void *data) max_nr_accesses / 10, sz_limit); if (ctx->callback.after_aggregation && - ctx->callback.after_aggregation(ctx)) { - done = true; - continue; - } + ctx->callback.after_aggregation(ctx)) + break; kdamond_apply_schemes(ctx); kdamond_reset_aggregated(ctx); kdamond_split_regions(ctx); @@ -1204,6 +1197,7 @@ static int kdamond_fn(void *data) sz_limit = damon_region_sz_limit(ctx); } } +done: damon_for_each_target(t, ctx) { damon_for_each_region_safe(r, next, t) damon_destroy_region(r, t); From 4988fe69527c6e02066aeb454c2db4d6d51d317b Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Tue, 13 Sep 2022 15:13:58 +0800 Subject: [PATCH 233/350] mm/memcontrol: use kstrtobool for swapaccount param parsing Use kstrtobool which is more powerful to handle all kinds of parameters like 'Yy1Nn0' or [oO][NnFf] for "on" and "off". Link: https://lkml.kernel.org/r/20220913071358.1812206-1-liushixin2@huawei.com Signed-off-by: Liu Shixin Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Jonathan Corbet Cc: Kefeng Wang Cc: Muchun Song Cc: Roman Gushchin Cc: Shakeel Butt Signed-off-by: Andrew Morton --- mm/memcontrol.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1f204a262054..ac6440daf208 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7507,10 +7507,10 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - if (!strcmp(s, "1")) - cgroup_memory_noswap = false; - else if (!strcmp(s, "0")) - cgroup_memory_noswap = true; + bool res; + + if (!kstrtobool(s, &res)) + cgroup_memory_noswap = !res; return 1; } __setup("swapaccount=", setup_swap_account); From a8368cd8e22531b3b248a2c869d71b668aeeb789 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 15:20:48 -0700 Subject: [PATCH 234/350] mm/page_alloc.c: rename check_free_page() to free_page_is_bad() The name "check_free_page()" provides no information regarding its return value when the page is indeed found to be bad. Renaming it to "free_page_is_bad()" makes it clear that a `true' return value means the page was bad. And make it return a bool, not an int. [akpm@linux-foundation.org: don't use bool as int] Cc: Catalin Marinas Cc: ke.wang Cc: Matthew Wilcox Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0002ded4ab0e..c48357c124eb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1285,20 +1285,20 @@ static const char *page_bad_reason(struct page *page, unsigned long flags) return bad_reason; } -static void check_free_page_bad(struct page *page) +static void free_page_is_bad_report(struct page *page) { bad_page(page, page_bad_reason(page, PAGE_FLAGS_CHECK_AT_FREE)); } -static inline int check_free_page(struct page *page) +static inline bool free_page_is_bad(struct page *page) { if (likely(page_expected_state(page, PAGE_FLAGS_CHECK_AT_FREE))) - return 0; + return false; /* Something has gone sideways, find it */ - check_free_page_bad(page); - return 1; + free_page_is_bad_report(page); + return true; } static int free_tail_pages_check(struct page *head_page, struct page *page) @@ -1430,7 +1430,7 @@ static __always_inline bool free_pages_prepare(struct page *page, for (i = 1; i < (1 << order); i++) { if (compound) bad += free_tail_pages_check(page, page + i); - if (unlikely(check_free_page(page + i))) { + if (unlikely(free_page_is_bad(page + i))) { bad++; continue; } @@ -1441,8 +1441,8 @@ static __always_inline bool free_pages_prepare(struct page *page, page->mapping = NULL; if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); - if (check_free) - bad += check_free_page(page); + if (check_free && free_page_is_bad(page)) + bad++; if (bad) return false; @@ -1504,7 +1504,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) - return check_free_page(page); + return free_page_is_bad(page); else return false; } @@ -1525,7 +1525,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) static bool bulkfree_pcp_prepare(struct page *page) { - return check_free_page(page); + return free_page_is_bad(page); } #endif /* CONFIG_DEBUG_VM */ From d452289fcd68f13f4067f0ddd78a5d948cb7d9ea Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Tue, 13 Sep 2022 15:30:38 -0700 Subject: [PATCH 235/350] mm/page_alloc.c: document bulkfree_pcp_prepare() return value Cc: Catalin Marinas Cc: ke.wang Cc: Matthew Wilcox Cc: Zhaoyang Huang Signed-off-by: Andrew Morton --- mm/page_alloc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c48357c124eb..4e8ea824e765 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1501,6 +1501,7 @@ static bool free_pcp_prepare(struct page *page, unsigned int order) return free_pages_prepare(page, order, true, FPI_NONE); } +/* return true if this page has an inappropriate state */ static bool bulkfree_pcp_prepare(struct page *page) { if (debug_pagealloc_enabled_static()) From aaa31e058dd82453c89302c9331945894ff555a6 Mon Sep 17 00:00:00 2001 From: ze zuo Date: Tue, 13 Sep 2022 01:55:05 +0000 Subject: [PATCH 236/350] mm/mempolicy: use PAGE_ALIGN instead of open-coding it Replace the simple calculation with PAGE_ALIGN. Link: https://lkml.kernel.org/r/20220913015505.1998958-1-zuoze1@huawei.com Signed-off-by: ze zuo Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- mm/mempolicy.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 143e2eaaa6ec..a937eaec5b68 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1270,7 +1270,7 @@ static long do_mbind(unsigned long start, unsigned long len, if (mode == MPOL_DEFAULT) flags &= ~MPOL_MF_STRICT; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; + len = PAGE_ALIGN(len); end = start + len; if (end < start) @@ -1507,7 +1507,7 @@ SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, le if (home_node >= MAX_NUMNODES || !node_online(home_node)) return -EINVAL; - len = (len + PAGE_SIZE - 1) & PAGE_MASK; + len = PAGE_ALIGN(len); end = start + len; if (end < start) From b958d4d08fbfe938af24ea06ebbf839b48fa18a9 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:02 +0800 Subject: [PATCH 237/350] mm: hugetlb: simplify per-node sysfs creation and removal Patch series "simplify handling of per-node sysfs creation and removal", v4. This patch (of 2): The following commit offload per-node sysfs creation and removal to a kworker and did not say why it is needed. And it also said "I don't know that this is absolutely required". It seems like the author was not sure as well. Since it only complicates the code, this patch will revert the changes to simplify the code. 39da08cb074c ("hugetlb: offload per node attribute registrations") We could use memory hotplug notifier to do per-node sysfs creation and removal instead of inserting those operations to node registration and unregistration. Then, it can reduce the code coupling between node.c and hugetlb.c. Also, it can simplify the code. Link: https://lkml.kernel.org/r/20220914072603.60293-1-songmuchun@bytedance.com Link: https://lkml.kernel.org/r/20220914072603.60293-2-songmuchun@bytedance.com Signed-off-by: Muchun Song Acked-by: Mike Kravetz Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Muchun Song Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- drivers/base/node.c | 139 +------------------------------------------ include/linux/node.h | 24 ++------ mm/hugetlb.c | 35 +++++++---- 3 files changed, 30 insertions(+), 168 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index eb0f43784c2b..ed391cb09999 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -587,64 +587,9 @@ static const struct attribute_group *node_dev_groups[] = { NULL }; -#ifdef CONFIG_HUGETLBFS -/* - * hugetlbfs per node attributes registration interface: - * When/if hugetlb[fs] subsystem initializes [sometime after this module], - * it will register its per node attributes for all online nodes with - * memory. It will also call register_hugetlbfs_with_node(), below, to - * register its attribute registration functions with this node driver. - * Once these hooks have been initialized, the node driver will call into - * the hugetlb module to [un]register attributes for hot-plugged nodes. - */ -static node_registration_func_t __hugetlb_register_node; -static node_registration_func_t __hugetlb_unregister_node; - -static inline bool hugetlb_register_node(struct node *node) -{ - if (__hugetlb_register_node && - node_state(node->dev.id, N_MEMORY)) { - __hugetlb_register_node(node); - return true; - } - return false; -} - -static inline void hugetlb_unregister_node(struct node *node) -{ - if (__hugetlb_unregister_node) - __hugetlb_unregister_node(node); -} - -void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister) -{ - __hugetlb_register_node = doregister; - __hugetlb_unregister_node = unregister; -} -#else -static inline void hugetlb_register_node(struct node *node) {} - -static inline void hugetlb_unregister_node(struct node *node) {} -#endif - static void node_device_release(struct device *dev) { - struct node *node = to_node(dev); - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - /* - * We schedule the work only when a memory section is - * onlined/offlined on this node. When we come here, - * all the memory on this node has been offlined, - * so we won't enqueue new work to this work. - * - * The work is using node->node_work, so we should - * flush work before freeing the memory. - */ - flush_work(&node->node_work); -#endif - kfree(node); + kfree(to_node(dev)); } /* @@ -665,11 +610,9 @@ static int register_node(struct node *node, int num) if (error) put_device(&node->dev); - else { - hugetlb_register_node(node); - + else compaction_register_node(node); - } + return error; } @@ -683,7 +626,6 @@ static int register_node(struct node *node, int num) void unregister_node(struct node *node) { compaction_unregister_node(node); - hugetlb_unregister_node(node); /* no-op, if memoryless node */ node_remove_accesses(node); node_remove_caches(node); device_unregister(&node->dev); @@ -905,74 +847,8 @@ void register_memory_blocks_under_node(int nid, unsigned long start_pfn, (void *)&nid, func); return; } - -#ifdef CONFIG_HUGETLBFS -/* - * Handle per node hstate attribute [un]registration on transistions - * to/from memoryless state. - */ -static void node_hugetlb_work(struct work_struct *work) -{ - struct node *node = container_of(work, struct node, node_work); - - /* - * We only get here when a node transitions to/from memoryless state. - * We can detect which transition occurred by examining whether the - * node has memory now. hugetlb_register_node() already check this - * so we try to register the attributes. If that fails, then the - * node has transitioned to memoryless, try to unregister the - * attributes. - */ - if (!hugetlb_register_node(node)) - hugetlb_unregister_node(node); -} - -static void init_node_hugetlb_work(int nid) -{ - INIT_WORK(&node_devices[nid]->node_work, node_hugetlb_work); -} - -static int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - switch (action) { - case MEM_ONLINE: - case MEM_OFFLINE: - /* - * offload per node hstate [un]registration to a work thread - * when transitioning to/from memoryless state. - */ - if (nid != NUMA_NO_NODE) - schedule_work(&node_devices[nid]->node_work); - break; - - case MEM_GOING_ONLINE: - case MEM_GOING_OFFLINE: - case MEM_CANCEL_ONLINE: - case MEM_CANCEL_OFFLINE: - default: - break; - } - - return NOTIFY_OK; -} -#endif /* CONFIG_HUGETLBFS */ #endif /* CONFIG_MEMORY_HOTPLUG */ -#if !defined(CONFIG_MEMORY_HOTPLUG) || !defined(CONFIG_HUGETLBFS) -static inline int node_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - return NOTIFY_OK; -} - -static void init_node_hugetlb_work(int nid) { } - -#endif - int __register_one_node(int nid) { int error; @@ -991,8 +867,6 @@ int __register_one_node(int nid) } INIT_LIST_HEAD(&node_devices[nid]->access_list); - /* initialize work queue for memory hot plug */ - init_node_hugetlb_work(nid); node_init_caches(nid); return error; @@ -1063,13 +937,8 @@ static const struct attribute_group *cpu_root_attr_groups[] = { NULL, }; -#define NODE_CALLBACK_PRI 2 /* lower than SLAB */ void __init node_dev_init(void) { - static struct notifier_block node_memory_callback_nb = { - .notifier_call = node_memory_callback, - .priority = NODE_CALLBACK_PRI, - }; int ret, i; BUILD_BUG_ON(ARRAY_SIZE(node_state_attr) != NR_NODE_STATES); @@ -1079,8 +948,6 @@ void __init node_dev_init(void) if (ret) panic("%s() failed to register subsystem: %d\n", __func__, ret); - register_hotmemory_notifier(&node_memory_callback_nb); - /* * Create all node devices, which will properly link the node * to applicable memory block devices and already created cpu devices. diff --git a/include/linux/node.h b/include/linux/node.h index 9ec680dd607f..427a5975cf40 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -2,15 +2,15 @@ /* * include/linux/node.h - generic node definition * - * This is mainly for topological representation. We define the - * basic 'struct node' here, which can be embedded in per-arch + * This is mainly for topological representation. We define the + * basic 'struct node' here, which can be embedded in per-arch * definitions of processors. * * Basic handling of the devices is done in drivers/base/node.c - * and system devices are handled in drivers/base/sys.c. + * and system devices are handled in drivers/base/sys.c. * * Nodes are exported via driverfs in the class/node/devices/ - * directory. + * directory. */ #ifndef _LINUX_NODE_H_ #define _LINUX_NODE_H_ @@ -18,7 +18,6 @@ #include #include #include -#include /** * struct node_hmem_attrs - heterogeneous memory performance attributes @@ -84,10 +83,6 @@ static inline void node_set_perf_attrs(unsigned int nid, struct node { struct device dev; struct list_head access_list; - -#if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_HUGETLBFS) - struct work_struct node_work; -#endif #ifdef CONFIG_HMEM_REPORTING struct list_head cache_attrs; struct device *cache_dev; @@ -96,7 +91,6 @@ struct node { struct memory_block; extern struct node *node_devices[]; -typedef void (*node_registration_func_t)(struct node *); #if defined(CONFIG_MEMORY_HOTPLUG) && defined(CONFIG_NUMA) void register_memory_blocks_under_node(int nid, unsigned long start_pfn, @@ -144,11 +138,6 @@ extern void unregister_memory_block_under_nodes(struct memory_block *mem_blk); extern int register_memory_node_under_compute_node(unsigned int mem_nid, unsigned int cpu_nid, unsigned access); - -#ifdef CONFIG_HUGETLBFS -extern void register_hugetlbfs_with_node(node_registration_func_t doregister, - node_registration_func_t unregister); -#endif #else static inline void node_dev_init(void) { @@ -176,11 +165,6 @@ static inline int unregister_cpu_under_node(unsigned int cpu, unsigned int nid) static inline void unregister_memory_block_under_nodes(struct memory_block *mem_blk) { } - -static inline void register_hugetlbfs_with_node(node_registration_func_t reg, - node_registration_func_t unreg) -{ -} #endif #define to_node(device) container_of(device, struct node, dev) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6af123374e98..397f2988c37f 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include @@ -4000,6 +4001,23 @@ static void hugetlb_register_node(struct node *node) } } +static int __meminit hugetlb_memory_callback(struct notifier_block *self, + unsigned long action, void *arg) +{ + struct memory_notify *mnb = arg; + int nid = mnb->status_change_nid; + + if (nid == NUMA_NO_NODE) + return NOTIFY_DONE; + + if (action == MEM_GOING_ONLINE) + hugetlb_register_node(node_devices[nid]); + else if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) + hugetlb_unregister_node(node_devices[nid]); + + return NOTIFY_OK; +} + /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have @@ -4009,18 +4027,11 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - for_each_node_state(nid, N_MEMORY) { - struct node *node = node_devices[nid]; - if (node->dev.id == nid) - hugetlb_register_node(node); - } - - /* - * Let the node device driver know we're here so it can - * [un]register hstate attributes on node hotplug. - */ - register_hugetlbfs_with_node(hugetlb_register_node, - hugetlb_unregister_node); + get_online_mems(); + hotplug_memory_notifier(hugetlb_memory_callback, 0); + for_each_node_state(nid, N_MEMORY) + hugetlb_register_node(node_devices[nid]); + put_online_mems(); } #else /* !CONFIG_NUMA */ From a4a00b451ef5e1deb959088e25e248f4ee399792 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Wed, 14 Sep 2022 15:26:03 +0800 Subject: [PATCH 238/350] mm: hugetlb: eliminate memory-less nodes handling The memory-notify-based approach aims to handle meory-less nodes, however, it just adds the complexity of code as pointed by David in thread [1]. The handling of memory-less nodes is introduced by commit 4faf8d950ec4 ("hugetlb: handle memory hot-plug events"). >From its commit message, we cannot find any necessity of handling this case. So, we can simply register/unregister sysfs entries in register_node/unregister_node to simlify the code. BTW, hotplug callback added because in hugetlb_register_all_nodes() we register sysfs nodes only for N_MEMORY nodes, seeing commit 9b5e5d0fdc91, which said it was a preparation for handling memory-less nodes via memory hotplug. Since we want to remove memory hotplug, so make sure we only register per-node sysfs for online (N_ONLINE) nodes in hugetlb_register_all_nodes(). https://lore.kernel.org/linux-mm/60933ffc-b850-976c-78a0-0ee6e0ea9ef0@redhat.com/ [1] Link: https://lkml.kernel.org/r/20220914072603.60293-3-songmuchun@bytedance.com Suggested-by: David Hildenbrand Signed-off-by: Muchun Song Acked-by: David Hildenbrand Cc: Andi Kleen Cc: Greg Kroah-Hartman Cc: Mike Kravetz Cc: Oscar Salvador Cc: Rafael J. Wysocki Signed-off-by: Andrew Morton --- drivers/base/node.c | 8 +++-- include/linux/hugetlb.h | 14 +++++++++ mm/hugetlb.c | 70 +++++++++++++++++------------------------ 3 files changed, 49 insertions(+), 43 deletions(-) diff --git a/drivers/base/node.c b/drivers/base/node.c index ed391cb09999..80b1e91b9608 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -20,6 +20,7 @@ #include #include #include +#include static struct bus_type node_subsys = { .name = "node", @@ -608,10 +609,12 @@ static int register_node(struct node *node, int num) node->dev.groups = node_dev_groups; error = device_register(&node->dev); - if (error) + if (error) { put_device(&node->dev); - else + } else { + hugetlb_register_node(node); compaction_register_node(node); + } return error; } @@ -625,6 +628,7 @@ static int register_node(struct node *node, int num) */ void unregister_node(struct node *node) { + hugetlb_unregister_node(node); compaction_unregister_node(node); node_remove_accesses(node); node_remove_caches(node); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 57e72954a482..6d7f39754060 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -16,6 +16,7 @@ struct ctl_table; struct user_struct; struct mmu_gather; +struct node; #ifndef is_hugepd typedef struct { unsigned long pd; } hugepd_t; @@ -935,6 +936,11 @@ static inline void huge_ptep_modify_prot_commit(struct vm_area_struct *vma, } #endif +#ifdef CONFIG_NUMA +void hugetlb_register_node(struct node *node); +void hugetlb_unregister_node(struct node *node); +#endif + #else /* CONFIG_HUGETLB_PAGE */ struct hstate {}; @@ -1109,6 +1115,14 @@ static inline void set_huge_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { } + +static inline void hugetlb_register_node(struct node *node) +{ +} + +static inline void hugetlb_unregister_node(struct node *node) +{ +} #endif /* CONFIG_HUGETLB_PAGE */ static inline spinlock_t *huge_pte_lock(struct hstate *h, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 397f2988c37f..0b1ab5af939e 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3871,24 +3871,8 @@ static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent, return 0; } -static void __init hugetlb_sysfs_init(void) -{ - struct hstate *h; - int err; - - hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); - if (!hugepages_kobj) - return; - - for_each_hstate(h) { - err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, - hstate_kobjs, &hstate_attr_group); - if (err) - pr_err("HugeTLB: Unable to add hstate %s", h->name); - } -} - #ifdef CONFIG_NUMA +static bool hugetlb_sysfs_initialized __ro_after_init; /* * node_hstate/s - associate per node hstate attributes, via their kobjects, @@ -3944,7 +3928,7 @@ static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp) * Unregister hstate attributes from a single node device. * No-op if no hstate attributes attached. */ -static void hugetlb_unregister_node(struct node *node) +void hugetlb_unregister_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; @@ -3974,12 +3958,15 @@ static void hugetlb_unregister_node(struct node *node) * Register hstate attributes for a single node device. * No-op if attributes already registered. */ -static void hugetlb_register_node(struct node *node) +void hugetlb_register_node(struct node *node) { struct hstate *h; struct node_hstate *nhs = &node_hstates[node->dev.id]; int err; + if (!hugetlb_sysfs_initialized) + return; + if (nhs->hugepages_kobj) return; /* already allocated */ @@ -4001,23 +3988,6 @@ static void hugetlb_register_node(struct node *node) } } -static int __meminit hugetlb_memory_callback(struct notifier_block *self, - unsigned long action, void *arg) -{ - struct memory_notify *mnb = arg; - int nid = mnb->status_change_nid; - - if (nid == NUMA_NO_NODE) - return NOTIFY_DONE; - - if (action == MEM_GOING_ONLINE) - hugetlb_register_node(node_devices[nid]); - else if (action == MEM_CANCEL_ONLINE || action == MEM_OFFLINE) - hugetlb_unregister_node(node_devices[nid]); - - return NOTIFY_OK; -} - /* * hugetlb init time: register hstate attributes for all registered node * devices of nodes that have memory. All on-line nodes should have @@ -4027,11 +3997,8 @@ static void __init hugetlb_register_all_nodes(void) { int nid; - get_online_mems(); - hotplug_memory_notifier(hugetlb_memory_callback, 0); - for_each_node_state(nid, N_MEMORY) + for_each_online_node(nid) hugetlb_register_node(node_devices[nid]); - put_online_mems(); } #else /* !CONFIG_NUMA */ @@ -4055,6 +4022,28 @@ static inline __init void hugetlb_cma_check(void) } #endif +static void __init hugetlb_sysfs_init(void) +{ + struct hstate *h; + int err; + + hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj); + if (!hugepages_kobj) + return; + + for_each_hstate(h) { + err = hugetlb_sysfs_add_hstate(h, hugepages_kobj, + hstate_kobjs, &hstate_attr_group); + if (err) + pr_err("HugeTLB: Unable to add hstate %s", h->name); + } + +#ifdef CONFIG_NUMA + hugetlb_sysfs_initialized = true; +#endif + hugetlb_register_all_nodes(); +} + static int __init hugetlb_init(void) { int i; @@ -4109,7 +4098,6 @@ static int __init hugetlb_init(void) report_hugepages(); hugetlb_sysfs_init(); - hugetlb_register_all_nodes(); hugetlb_cgroup_file_init(); #ifdef CONFIG_SMP From c195c3215741746b1eb7ab7980b926ddc37a4be3 Mon Sep 17 00:00:00 2001 From: Ke Sun Date: Wed, 14 Sep 2022 10:17:38 +0800 Subject: [PATCH 239/350] mm/filemap: make folio_put_wait_locked static It's only used in mm/filemap.c, since commit ("mm/migrate.c: rework migration_entry_wait() to not take a pageref"). Make it static. Link: https://lkml.kernel.org/r/20220914021738.3228011-1-sunke@kylinos.cn Signed-off-by: Ke Sun Reported-by: k2ci Signed-off-by: Andrew Morton --- include/linux/pagemap.h | 1 - mm/filemap.c | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 32846b6306db..23125ab87ded 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1039,7 +1039,6 @@ static inline int wait_on_page_locked_killable(struct page *page) return folio_wait_locked_killable(page_folio(page)); } -int folio_put_wait_locked(struct folio *folio, int state); void wait_on_page_writeback(struct page *page); void folio_wait_writeback(struct folio *folio); int folio_wait_writeback_killable(struct folio *folio); diff --git a/mm/filemap.c b/mm/filemap.c index aab125d423b8..f27c93a581ab 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1460,7 +1460,7 @@ EXPORT_SYMBOL(folio_wait_bit_killable); * * Return: 0 if the folio was unlocked or -EINTR if interrupted by a signal. */ -int folio_put_wait_locked(struct folio *folio, int state) +static int folio_put_wait_locked(struct folio *folio, int state) { return folio_wait_bit_common(folio, PG_locked, state, DROP); } From 3259914f8cab1bab3fe691a90ac3c47411cb0aba Mon Sep 17 00:00:00 2001 From: XU pengfei Date: Wed, 14 Sep 2022 09:21:14 +0800 Subject: [PATCH 240/350] mm/hugetlb: remove unnecessary 'NULL' values from pointer Pointer variables allocate memory first, and then judge. There is no need to initialize the assignment. Link: https://lkml.kernel.org/r/20220914012113.6271-1-xupengfei@nfschina.com Signed-off-by: XU pengfei Reviewed-by: Muchun Song Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0b1ab5af939e..d4347ae337fb 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -258,7 +258,7 @@ static inline struct hugepage_subpool *subpool_vma(struct vm_area_struct *vma) static struct file_region * get_file_region_entry_from_cache(struct resv_map *resv, long from, long to) { - struct file_region *nrg = NULL; + struct file_region *nrg; VM_BUG_ON(resv->region_cache_count <= 0); @@ -340,7 +340,7 @@ static bool has_same_uncharge_info(struct file_region *rg, static void coalesce_file_region(struct resv_map *resv, struct file_region *rg) { - struct file_region *nrg = NULL, *prg = NULL; + struct file_region *nrg, *prg; prg = list_prev_entry(rg, link); if (&prg->link != &resv->regions && prg->to == rg->from && From 188a39725ad7ded2d13e752a1a620152b0750175 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:02 -0700 Subject: [PATCH 241/350] hugetlbfs: revert use i_mmap_rwsem to address page fault/truncate race Patch series "hugetlb: Use new vma lock for huge pmd sharing synchronization", v2. hugetlb fault scalability regressions have recently been reported [1]. This is not the first such report, as regressions were also noted when commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") was added [2] in v5.7. At that time, a proposal to address the regression was suggested [3] but went nowhere. The regression and benefit of this patch series is not evident when using the vm_scalability benchmark reported in [2] on a recent kernel. Results from running, "./usemem -n 48 --prealloc --prefault -O -U 3448054972" 48 sample Avg next-20220913 next-20220913 next-20220913 unmodified revert i_mmap_sema locking vma sema locking, this series ----------------------------------------------------------------------------- 498150 KB/s 501934 KB/s 504793 KB/s The recent regression report [1] notes page fault and fork latency of shared hugetlb mappings. To measure this, I created two simple programs: 1) map a shared hugetlb area, write fault all pages, unmap area Do this in a continuous loop to measure faults per second 2) map a shared hugetlb area, write fault a few pages, fork and exit Do this in a continuous loop to measure forks per second These programs were run on a 48 CPU VM with 320GB memory. The shared mapping size was 250GB. For comparison, a single instance of the program was run. Then, multiple instances were run in parallel to introduce lock contention. Changing the locking scheme results in a significant performance benefit. test instances unmodified revert vma -------------------------------------------------------------------------- faults per sec 1 393043 395680 389932 faults per sec 24 71405 81191 79048 forks per sec 1 2802 2747 2725 forks per sec 24 439 536 500 Combined faults 24 1621 68070 53662 Combined forks 24 358 67 142 Combined test is when running both faulting program and forking program simultaneously. Patches 1 and 2 of this series revert c0d0381ade79 and 87bf91d39bb5 which depends on c0d0381ade79. Acquisition of i_mmap_rwsem is still required in the fault path to establish pmd sharing, so this is moved back to huge_pmd_share. With c0d0381ade79 reverted, this race is exposed: Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... ptl = huge_pte_lock(ptep) get/update pte set_pte_at(pte, ptep) Reverting 87bf91d39bb5 exposes races in page fault/file truncation. When the new vma lock is put to use in patch 8, this will handle the fault/file truncation races. This is explained in patch 9 where code associated with these races is cleaned up. Patches 3 - 5 restructure existing code in preparation for using the new vma lock (rw semaphore) for pmd sharing synchronization. The idea is that this semaphore will be held in read mode for the duration of fault processing, and held in write mode for unmap operations which may call huge_pmd_unshare. Acquiring i_mmap_rwsem is also still required to synchronize huge pmd sharing. However it is only required in the fault path when setting up sharing, and will be acquired in huge_pmd_share(). Patch 6 adds the new vma lock and all supporting routines, but does not actually change code to use the new lock. Patch 7 refactors code in preparation for using the new lock. And, patch 8 finally adds code to make use of this new vma lock. Unfortunately, the fault code and truncate/hole punch code would naturally take locks in the opposite order which could lead to deadlock. Since the performance of page faults is more important, the truncation/hole punch code is modified to back out and take locks in the correct order if necessary. [1] https://lore.kernel.org/linux-mm/43faf292-245b-5db5-cce9-369d8fb6bd21@infradead.org/ [2] https://lore.kernel.org/lkml/20200622005551.GK5535@shao2-debian/ [3] https://lore.kernel.org/linux-mm/20200706202615.32111-1-mike.kravetz@oracle.com/ This patch (of 9): Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") added code to take i_mmap_rwsem in read mode for the duration of fault processing. The use of i_mmap_rwsem to prevent fault/truncate races depends on this. However, this has been shown to cause performance/scaling issues. As a result, that code will be reverted. Since the use i_mmap_rwsem to address page fault/truncate races depends on this, it must also be reverted. In a subsequent patch, code will be added to detect the fault/truncate race and back out operations as required. Link: https://lkml.kernel.org/r/20220914221810.95771-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220914221810.95771-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 30 +++++++++--------------------- mm/hugetlb.c | 22 +++++++++++----------- 2 files changed, 20 insertions(+), 32 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f7a5b5124d8a..a32031e751d1 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -419,9 +419,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() holds i_mmap_rwsem and prevents - * page faults in the truncated range by checking i_size. i_size is - * modified while holding i_mmap_rwsem. + * in this routine. hugetlb_no_page() prevents page faults in the + * truncated range. It checks i_size before allocation, and again after + * with the page table lock for the page held. The same lock must be + * acquired to unmap a page. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map @@ -451,16 +452,8 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, u32 hash = 0; index = folio->index; - if (!truncate_op) { - /* - * Only need to hold the fault mutex in the - * hole punch case. This prevents races with - * page faults. Races are not possible in the - * case of truncation. - */ - hash = hugetlb_fault_mutex_hash(mapping, index); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - } + hash = hugetlb_fault_mutex_hash(mapping, index); + mutex_lock(&hugetlb_fault_mutex_table[hash]); /* * If folio is mapped, it was faulted in after being @@ -504,8 +497,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, } folio_unlock(folio); - if (!truncate_op) - mutex_unlock(&hugetlb_fault_mutex_table[hash]); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); cond_resched(); @@ -543,8 +535,8 @@ static void hugetlb_vmtruncate(struct inode *inode, loff_t offset) BUG_ON(offset & ~huge_page_mask(h)); pgoff = offset >> PAGE_SHIFT; - i_mmap_lock_write(mapping); i_size_write(inode, offset); + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, pgoff, 0, ZAP_FLAG_DROP_MARKER); @@ -703,11 +695,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, /* addr is the offset within the file (zero based) */ addr = index * hpage_size; - /* - * fault mutex taken here, protects against fault path - * and hole punch. inode_lock previously taken protects - * against truncation. - */ + /* mutex taken here, fault path and hole punch */ hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d4347ae337fb..14afb5b67dd4 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5560,17 +5560,15 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } /* - * We can not race with truncation due to holding i_mmap_rwsem. - * i_size is modified when holding i_mmap_rwsem, so check here - * once for faults beyond end of file. + * Use page lock to guard against racing truncation + * before we get page_table_lock. */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto out; - new_page = false; page = find_lock_page(mapping, idx); if (!page) { + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto out; /* Check for page in userfault range */ if (userfaultfd_missing(vma)) { ret = hugetlb_handle_userfault(vma, mapping, idx, @@ -5666,6 +5664,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } ptl = huge_pte_lock(h, mm, ptep); + size = i_size_read(mapping->host) >> huge_page_shift(h); + if (idx >= size) + goto backout; + ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(ptep), old_pte)) @@ -5774,10 +5776,8 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This serves two purposes: - * 1) It prevents huge_pmd_unshare from being called elsewhere - * and making the ptep no longer valid. - * 2) It synchronizes us with i_size modifications during truncation. + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. * * ptep could have already be assigned via huge_pte_offset. That * is OK, as huge_pte_alloc will return the same value unless From 3a47c54f09c4c89128d8f67d49296b1c25b317d0 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:03 -0700 Subject: [PATCH 242/350] hugetlbfs: revert use i_mmap_rwsem for more pmd sharing synchronization Commit c0d0381ade79 ("hugetlbfs: use i_mmap_rwsem for more pmd sharing synchronization") added code to take i_mmap_rwsem in read mode for the duration of fault processing. However, this has been shown to cause performance/scaling issues. Revert the code and go back to only taking the semaphore in huge_pmd_share during the fault path. Keep the code that takes i_mmap_rwsem in write mode before calling try_to_unmap as this is required if huge_pmd_unshare is called. NOTE: Reverting this code does expose the following race condition. Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... ptl = huge_pte_lock(ptep) get/update pte set_pte_at(pte, ptep) It is unknown if the above race was ever experienced by a user. It was discovered via code inspection when initially addressed. In subsequent patches, a new synchronization mechanism will be added to coordinate pmd sharing and eliminate this race. Link: https://lkml.kernel.org/r/20220914221810.95771-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 2 -- mm/hugetlb.c | 77 +++++++------------------------------------- mm/rmap.c | 8 +---- mm/userfaultfd.c | 11 ++----- 4 files changed, 15 insertions(+), 83 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index a32031e751d1..dfb735a91bbb 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -467,9 +467,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, if (unlikely(folio_mapped(folio))) { BUG_ON(truncate_op); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); i_mmap_lock_write(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); hugetlb_vmdelete_list(&mapping->i_mmap, index * pages_per_huge_page(h), (index + 1) * pages_per_huge_page(h), diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 14afb5b67dd4..8283706bd81d 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4770,7 +4770,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, struct hstate *h = hstate_vma(src_vma); unsigned long sz = huge_page_size(h); unsigned long npages = pages_per_huge_page(h); - struct address_space *mapping = src_vma->vm_file->f_mapping; struct mmu_notifier_range range; unsigned long last_addr_mask; int ret = 0; @@ -4782,14 +4781,6 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_invalidate_range_start(&range); mmap_assert_write_locked(src); raw_write_seqcount_begin(&src->write_protect_seq); - } else { - /* - * For shared mappings i_mmap_rwsem must be held to call - * huge_pte_alloc, otherwise the returned ptep could go - * away if part of a shared pmd and another thread calls - * huge_pmd_unshare. - */ - i_mmap_lock_read(mapping); } last_addr_mask = hugetlb_mask_last_page(h); @@ -4936,8 +4927,6 @@ again: if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); - } else { - i_mmap_unlock_read(mapping); } return ret; @@ -5346,29 +5335,8 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { - struct address_space *mapping = vma->vm_file->f_mapping; - pgoff_t idx; - u32 hash; - put_page(old_page); - /* - * Drop hugetlb_fault_mutex and i_mmap_rwsem before - * unmapping. unmapping needs to hold i_mmap_rwsem - * in write mode. Dropping i_mmap_rwsem in read mode - * here is OK as COW mappings do not interact with - * PMD sharing. - * - * Reacquire both after unmap operation. - */ - idx = vma_hugecache_offset(h, vma, haddr); - hash = hugetlb_fault_mutex_hash(mapping, idx); - mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); - unmap_ref_private(mm, vma, old_page, haddr); - - i_mmap_lock_read(mapping); - mutex_lock(&hugetlb_fault_mutex_table[hash]); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && @@ -5523,9 +5491,7 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, */ hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); ret = handle_userfault(&vmf, reason); - i_mmap_lock_read(mapping); mutex_lock(&hugetlb_fault_mutex_table[hash]); return ret; @@ -5760,11 +5726,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { - /* - * Since we hold no locks, ptep could be stale. That is - * OK as we are only making decisions based on content and - * not actually modifying content here. - */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, ptep); @@ -5772,31 +5733,20 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); + } else { + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); + if (!ptep) + return VM_FAULT_OOM; } - /* - * Acquire i_mmap_rwsem before calling huge_pte_alloc and hold - * until finished with ptep. This prevents huge_pmd_unshare from - * being called elsewhere and making the ptep no longer valid. - * - * ptep could have already be assigned via huge_pte_offset. That - * is OK, as huge_pte_alloc will return the same value unless - * something has changed. - */ mapping = vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) { - i_mmap_unlock_read(mapping); - return VM_FAULT_OOM; - } + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -5861,7 +5811,6 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, put_page(pagecache_page); } mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); return handle_userfault(&vmf, VM_UFFD_WP); } @@ -5905,7 +5854,6 @@ out_ptl: } out_mutex: mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); /* * Generally it's safe to hold refcount during waiting page lock. But * here we just wait to defer the next page fault to avoid busy loop and @@ -6745,12 +6693,10 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the * !shared pmd case because we can allocate the pmd later as well, it makes the - * code much cleaner. - * - * This routine must be called with i_mmap_rwsem held in at least read mode if - * sharing is possible. For hugetlbfs, this prevents removal of any page - * table entries associated with the address space. This is important as we - * are setting up sharing based on existing page table entries (mappings). + * code much cleaner. pmd allocation is essential for the shared case because + * pud has to be populated inside the same i_mmap_rwsem section - otherwise + * racing tasks could either miss the sharing (see huge_pte_offset) or select a + * bad pmd for sharing. */ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) @@ -6764,7 +6710,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, pte_t *pte; spinlock_t *ptl; - i_mmap_assert_locked(mapping); + i_mmap_lock_read(mapping); vma_interval_tree_foreach(svma, &mapping->i_mmap, idx, idx) { if (svma == vma) continue; @@ -6794,6 +6740,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, spin_unlock(ptl); out: pte = (pte_t *)pmd_alloc(mm, pud, addr); + i_mmap_unlock_read(mapping); return pte; } @@ -6804,7 +6751,7 @@ out: * indicated by page_count > 1, unmap is achieved by clearing pud and * decrementing the ref count. If count == 1, the pte page is not shared. * - * Called with page table lock held and i_mmap_rwsem held in write mode. + * Called with page table lock held. * * returns: 1 successfully unmapped a shared pte page * 0 the underlying pte page is not shared, or it is the last user diff --git a/mm/rmap.c b/mm/rmap.c index 0b9264e58d25..2a08647a61fc 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -23,10 +23,9 @@ * inode->i_rwsem (while writing or truncating, not reading or faulting) * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) - * page->flags PG_locked (lock_page) * (see hugetlbfs below) + * page->flags PG_locked (lock_page) * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) * anon_vma->rwsem * mm->page_table_lock or pte_lock * swap_lock (in swap_duplicate, swap_info_get) @@ -45,11 +44,6 @@ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock - * - * * hugetlbfs PageHuge() pages take locks in this order: - * mapping->i_mmap_rwsem - * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) - * page->flags PG_locked (lock_page) */ #include diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 9c035be2148b..0fdbd2c05587 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -379,14 +379,10 @@ retry: BUG_ON(dst_addr >= dst_start + len); /* - * Serialize via i_mmap_rwsem and hugetlb_fault_mutex. - * i_mmap_rwsem ensures the dst_pte remains valid even - * in the case of shared pmds. fault mutex prevents - * races with other faulting threads. + * Serialize via hugetlb_fault_mutex. */ - mapping = dst_vma->vm_file->f_mapping; - i_mmap_lock_read(mapping); idx = linear_page_index(dst_vma, dst_addr); + mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); @@ -394,7 +390,6 @@ retry: dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -402,7 +397,6 @@ retry: !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); goto out_unlock; } @@ -411,7 +405,6 @@ retry: wp_copy); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - i_mmap_unlock_read(mapping); cond_resched(); From 7e1813d48dd30e6c6f235f6661d1bc108fcab528 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:04 -0700 Subject: [PATCH 243/350] hugetlb: rename remove_huge_page to hugetlb_delete_from_page_cache remove_huge_page removes a hugetlb page from the page cache. Change to hugetlb_delete_from_page_cache as it is a more descriptive name. huge_add_to_page_cache is global in scope, but only deals with hugetlb pages. For consistency and clarity, rename to hugetlb_add_to_page_cache. Link: https://lkml.kernel.org/r/20220914221810.95771-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 21 ++++++++++----------- include/linux/hugetlb.h | 2 +- mm/hugetlb.c | 8 ++++---- 3 files changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index dfb735a91bbb..edd69cc43ca5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -364,7 +364,7 @@ static int hugetlbfs_write_end(struct file *file, struct address_space *mapping, return -EINVAL; } -static void remove_huge_page(struct page *page) +static void hugetlb_delete_from_page_cache(struct page *page) { ClearPageDirty(page); ClearPageUptodate(page); @@ -478,15 +478,14 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, folio_lock(folio); /* * We must free the huge page and remove from page - * cache (remove_huge_page) BEFORE removing the - * region/reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the - * region/reserve map could fail. Correspondingly, - * the subpool and global reserve usage count can need - * to be adjusted. + * cache BEFORE removing the region/reserve map + * (hugetlb_unreserve_pages). In rare out of memory + * conditions, removal of the region/reserve map could + * fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. */ VM_BUG_ON(HPageRestoreReserve(&folio->page)); - remove_huge_page(&folio->page); + hugetlb_delete_from_page_cache(&folio->page); freed++; if (!truncate_op) { if (unlikely(hugetlb_unreserve_pages(inode, @@ -723,7 +722,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, } clear_huge_page(page, addr, pages_per_huge_page(h)); __SetPageUptodate(page); - error = huge_add_to_page_cache(page, mapping, index); + error = hugetlb_add_to_page_cache(page, mapping, index); if (unlikely(error)) { restore_reserve_on_error(h, &pseudo_vma, addr, page); put_page(page); @@ -735,7 +734,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, SetHPageMigratable(page); /* - * unlock_page because locked by huge_add_to_page_cache() + * unlock_page because locked by hugetlb_add_to_page_cache() * put_page() due to reference from alloc_huge_page() */ unlock_page(page); @@ -980,7 +979,7 @@ static int hugetlbfs_error_remove_page(struct address_space *mapping, struct inode *inode = mapping->host; pgoff_t index = page->index; - remove_huge_page(page); + hugetlb_delete_from_page_cache(page); if (unlikely(hugetlb_unreserve_pages(inode, index, index + 1, 1))) hugetlb_fix_reserve_counts(inode); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6d7f39754060..4893d6d07099 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -666,7 +666,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask); struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address); -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, unsigned long address, struct page *page); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8283706bd81d..accb166791c7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5430,7 +5430,7 @@ static bool hugetlbfs_pagecache_present(struct hstate *h, return page != NULL; } -int huge_add_to_page_cache(struct page *page, struct address_space *mapping, +int hugetlb_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx) { struct folio *folio = page_folio(page); @@ -5569,7 +5569,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, new_page = true; if (vma->vm_flags & VM_MAYSHARE) { - int err = huge_add_to_page_cache(page, mapping, idx); + int err = hugetlb_add_to_page_cache(page, mapping, idx); if (err) { /* * err can't be -EEXIST which implies someone @@ -5981,11 +5981,11 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, /* * Serialization between remove_inode_hugepages() and - * huge_add_to_page_cache() below happens through the + * hugetlb_add_to_page_cache() below happens through the * hugetlb_fault_mutex_table that here must be hold by * the caller. */ - ret = huge_add_to_page_cache(page, mapping, idx); + ret = hugetlb_add_to_page_cache(page, mapping, idx); if (ret) goto out_release_nounlock; page_in_pagecache = true; From c86272287bc65cb3d698a95c19651265e9f287cd Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:05 -0700 Subject: [PATCH 244/350] hugetlb: create remove_inode_single_folio to remove single file folio Create the new routine remove_inode_single_folio that will remove a single folio from a file. This is refactored code from remove_inode_hugepages. It checks for the uncommon case in which the folio is still mapped and unmaps. No functional change. This refactoring will be put to use and expanded upon in a subsequent patches. Link: https://lkml.kernel.org/r/20220914221810.95771-5-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 103 ++++++++++++++++++++++++++----------------- 1 file changed, 62 insertions(+), 41 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index edd69cc43ca5..7112a9a9f54d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -411,6 +411,60 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, } } +/* + * Called with hugetlb fault mutex held. + * Returns true if page was actually removed, false otherwise. + */ +static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, + struct address_space *mapping, + struct folio *folio, pgoff_t index, + bool truncate_op) +{ + bool ret = false; + + /* + * If folio is mapped, it was faulted in after being + * unmapped in caller. Unmap (again) while holding + * the fault mutex. The mutex will prevent faults + * until we finish removing the folio. + */ + if (unlikely(folio_mapped(folio))) { + i_mmap_lock_write(mapping); + hugetlb_vmdelete_list(&mapping->i_mmap, + index * pages_per_huge_page(h), + (index + 1) * pages_per_huge_page(h), + ZAP_FLAG_DROP_MARKER); + i_mmap_unlock_write(mapping); + } + + folio_lock(folio); + /* + * After locking page, make sure mapping is the same. + * We could have raced with page fault populate and + * backout code. + */ + if (folio_mapping(folio) == mapping) { + /* + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. + */ + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + hugetlb_delete_from_page_cache(&folio->page); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); + } + } + + folio_unlock(folio); + return ret; +} + /* * remove_inode_hugepages handles two distinct cases: truncation and hole * punch. There are subtle differences in operation for each case. @@ -418,11 +472,10 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * truncation is indicated by end of range being LLONG_MAX * In this case, we first scan the range and release found pages. * After releasing pages, hugetlb_unreserve_pages cleans up region/reserve - * maps and global counts. Page faults can not race with truncation - * in this routine. hugetlb_no_page() prevents page faults in the - * truncated range. It checks i_size before allocation, and again after - * with the page table lock for the page held. The same lock must be - * acquired to unmap a page. + * maps and global counts. Page faults can race with truncation. + * During faults, hugetlb_no_page() checks i_size before page allocation, + * and again after obtaining page table lock. It will 'back out' + * allocations in the truncated range. * hole punch is indicated if end is not LLONG_MAX * In the hole punch case we scan the range and release found pages. * Only when releasing a page is the associated region/reserve map @@ -456,44 +509,12 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, mutex_lock(&hugetlb_fault_mutex_table[hash]); /* - * If folio is mapped, it was faulted in after being - * unmapped in caller. Unmap (again) now after taking - * the fault mutex. The mutex will prevent faults - * until we finish removing the folio. - * - * This race can only happen in the hole punch case. - * Getting here in a truncate operation is a bug. + * Remove folio that was part of folio_batch. */ - if (unlikely(folio_mapped(folio))) { - BUG_ON(truncate_op); + if (remove_inode_single_folio(h, inode, mapping, folio, + index, truncate_op)) + freed++; - i_mmap_lock_write(mapping); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } - - folio_lock(folio); - /* - * We must free the huge page and remove from page - * cache BEFORE removing the region/reserve map - * (hugetlb_unreserve_pages). In rare out of memory - * conditions, removal of the region/reserve map could - * fail. Correspondingly, the subpool and global - * reserve usage count can need to be adjusted. - */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); - freed++; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, - index, index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } - - folio_unlock(folio); mutex_unlock(&hugetlb_fault_mutex_table[hash]); } folio_batch_release(&fbatch); From 12710fd696343a0d6c318bdad22fa7809af7859b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:06 -0700 Subject: [PATCH 245/350] hugetlb: rename vma_shareable() and refactor code Rename the routine vma_shareable to vma_addr_pmd_shareable as it is checking a specific address within the vma. Refactor code to check if an aligned range is shareable as this will be needed in a subsequent patch. Link: https://lkml.kernel.org/r/20220914221810.95771-6-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- mm/hugetlb.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index accb166791c7..482f7f357f75 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6640,26 +6640,33 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static bool vma_shareable(struct vm_area_struct *vma, unsigned long addr) +static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, + unsigned long start, unsigned long end) { - unsigned long base = addr & PUD_MASK; - unsigned long end = base + PUD_SIZE; - /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, base, end)) + if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end)) return true; return false; } +static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, + unsigned long addr) +{ + unsigned long start = addr & PUD_MASK; + unsigned long end = start + PUD_SIZE; + + return __vma_aligned_range_pmd_shareable(vma, start, end); +} + bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; #endif - return vma_shareable(vma, addr); + return vma_addr_pmd_shareable(vma, addr); } /* From 8d9bfb2608145cf3e408428c224099e1585471af Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:07 -0700 Subject: [PATCH 246/350] hugetlb: add vma based lock for pmd sharing Allocate a new hugetlb_vma_lock structure and hang off vm_private_data for synchronization use by vmas that could be involved in pmd sharing. This data structure contains a rw semaphore that is the primary tool used for synchronization. This new structure is ref counted, so that it can exist when NOT attached to a vma. This is only helpful in resolving lock ordering issues where code may need to obtain the vma_lock while there are no guarantees the vma may go away. By obtaining a ref on the structure, it can be guaranteed that at least the rw semaphore will not go away. Only add infrastructure for the new lock here. Actual use will be added in subsequent patches. [mike.kravetz@oracle.com: fix build issue for missing hugetlb_vma_lock_release] Link: https://lkml.kernel.org/r/YyNUtA1vRASOE4+M@monkey Link: https://lkml.kernel.org/r/20220914221810.95771-7-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- include/linux/hugetlb.h | 43 ++++++++- kernel/fork.c | 6 +- mm/hugetlb.c | 207 ++++++++++++++++++++++++++++++++++++---- mm/rmap.c | 8 +- 4 files changed, 240 insertions(+), 24 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 4893d6d07099..7b70aa931729 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -115,6 +115,12 @@ struct file_region { #endif }; +struct hugetlb_vma_lock { + struct kref refs; + struct rw_semaphore rw_sema; + struct vm_area_struct *vma; +}; + extern struct resv_map *resv_map_alloc(void); void resv_map_release(struct kref *ref); @@ -127,7 +133,7 @@ struct hugepage_subpool *hugepage_new_subpool(struct hstate *h, long max_hpages, long min_hpages); void hugepage_put_subpool(struct hugepage_subpool *spool); -void reset_vma_resv_huge_pages(struct vm_area_struct *vma); +void hugetlb_dup_vma_private(struct vm_area_struct *vma); void clear_vma_resv_huge_pages(struct vm_area_struct *vma); int hugetlb_sysctl_handler(struct ctl_table *, int, void *, size_t *, loff_t *); int hugetlb_overcommit_handler(struct ctl_table *, int, void *, size_t *, @@ -215,6 +221,14 @@ struct page *follow_huge_pud(struct mm_struct *mm, unsigned long address, struct page *follow_huge_pgd(struct mm_struct *mm, unsigned long address, pgd_t *pgd, int flags); +void hugetlb_vma_lock_read(struct vm_area_struct *vma); +void hugetlb_vma_unlock_read(struct vm_area_struct *vma); +void hugetlb_vma_lock_write(struct vm_area_struct *vma); +void hugetlb_vma_unlock_write(struct vm_area_struct *vma); +int hugetlb_vma_trylock_write(struct vm_area_struct *vma); +void hugetlb_vma_assert_locked(struct vm_area_struct *vma); +void hugetlb_vma_lock_release(struct kref *kref); + int pmd_huge(pmd_t pmd); int pud_huge(pud_t pud); unsigned long hugetlb_change_protection(struct vm_area_struct *vma, @@ -226,7 +240,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma); #else /* !CONFIG_HUGETLB_PAGE */ -static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +static inline void hugetlb_dup_vma_private(struct vm_area_struct *vma) { } @@ -337,6 +351,31 @@ static inline int prepare_hugepage_range(struct file *file, return -EINVAL; } +static inline void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +static inline void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +static inline int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +static inline void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + static inline int pmd_huge(pmd_t pmd) { return 0; diff --git a/kernel/fork.c b/kernel/fork.c index 50460330306a..3d788f759e5f 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -674,12 +674,10 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm, } /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only + * Copy/update hugetlb private vma information. */ if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); + hugetlb_dup_vma_private(tmp); /* Link the vma into the MT */ mas.index = tmp->vm_start; diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 482f7f357f75..f44b79998ac2 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -91,6 +91,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; /* Forward declaration */ static int hugetlb_acct_memory(struct hstate *h, long delta); +static void hugetlb_vma_lock_free(struct vm_area_struct *vma); +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -859,7 +861,7 @@ __weak unsigned long vma_mmu_pagesize(struct vm_area_struct *vma) * faults in a MAP_PRIVATE mapping. Only the process that called mmap() * is guaranteed to have their future faults succeed. * - * With the exception of reset_vma_resv_huge_pages() which is called at fork(), + * With the exception of hugetlb_dup_vma_private() which is called at fork(), * the reserve counters are updated with the hugetlb_lock held. It is safe * to reset the VMA at fork() time as it is not in use yet and there is no * chance of the global counters getting corrupted as a result of the values. @@ -1006,12 +1008,20 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag) return (get_vma_private_data(vma) & flag) != 0; } -/* Reset counters to 0 and clear all HPAGE_RESV_* flags */ -void reset_vma_resv_huge_pages(struct vm_area_struct *vma) +void hugetlb_dup_vma_private(struct vm_area_struct *vma) { VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma); + /* + * Clear vm_private_data + * - For MAP_PRIVATE mappings, this is the reserve map which does + * not apply to children. Faults generated by the children are + * not guaranteed to succeed, even if read-only. + * - For shared mappings this is a per-vma semaphore that may be + * allocated in a subsequent call to hugetlb_vm_op_open. + */ + vma->vm_private_data = (void *)0; if (!(vma->vm_flags & VM_MAYSHARE)) - vma->vm_private_data = (void *)0; + return; } /* @@ -1042,7 +1052,7 @@ void clear_vma_resv_huge_pages(struct vm_area_struct *vma) kref_put(&reservations->refs, resv_map_release); } - reset_vma_resv_huge_pages(vma); + hugetlb_dup_vma_private(vma); } /* Returns true if the VMA has associated reserve pages */ @@ -4623,16 +4633,21 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) resv_map_dup_hugetlb_cgroup_uncharge_info(resv); kref_get(&resv->refs); } + + hugetlb_vma_lock_alloc(vma); } static void hugetlb_vm_op_close(struct vm_area_struct *vma) { struct hstate *h = hstate_vma(vma); - struct resv_map *resv = vma_resv_map(vma); + struct resv_map *resv; struct hugepage_subpool *spool = subpool_vma(vma); unsigned long reserve, start, end; long gbl_reserve; + hugetlb_vma_lock_free(vma); + + resv = vma_resv_map(vma); if (!resv || !is_vma_resv_set(vma, HPAGE_RESV_OWNER)) return; @@ -6439,6 +6454,11 @@ bool hugetlb_reserve_pages(struct inode *inode, return false; } + /* + * vma specific semaphore used for pmd sharing synchronization + */ + hugetlb_vma_lock_alloc(vma); + /* * Only apply hugepage reservation if asked. At fault time, an * attempt will be made for VM_NORESERVE to allocate a page @@ -6462,12 +6482,11 @@ bool hugetlb_reserve_pages(struct inode *inode, resv_map = inode_resv_map(inode); chg = region_chg(resv_map, from, to, ®ions_needed); - } else { /* Private mapping. */ resv_map = resv_map_alloc(); if (!resv_map) - return false; + goto out_err; chg = to - from; @@ -6562,6 +6581,7 @@ out_uncharge_cgroup: hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h), chg * pages_per_huge_page(h), h_cg); out_err: + hugetlb_vma_lock_free(vma); if (!vma || vma->vm_flags & VM_MAYSHARE) /* Only call region_abort if the region_chg succeeded but the * region_add failed or didn't run. @@ -6641,14 +6661,34 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, } static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, - unsigned long start, unsigned long end) + unsigned long start, unsigned long end, + bool check_vma_lock) { +#ifdef CONFIG_USERFAULTFD + if (uffd_disable_huge_pmd_share(vma)) + return false; +#endif /* * check on proper vm_flags and page table alignment */ - if (vma->vm_flags & VM_MAYSHARE && range_in_vma(vma, start, end)) - return true; - return false; + if (!(vma->vm_flags & VM_MAYSHARE)) + return false; + if (check_vma_lock && !vma->vm_private_data) + return false; + if (!range_in_vma(vma, start, end)) + return false; + return true; +} + +static bool vma_pmd_shareable(struct vm_area_struct *vma) +{ + unsigned long start = ALIGN(vma->vm_start, PUD_SIZE), + end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); + + if (start >= end) + return false; + + return __vma_aligned_range_pmd_shareable(vma, start, end, false); } static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, @@ -6657,15 +6697,11 @@ static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, unsigned long start = addr & PUD_MASK; unsigned long end = start + PUD_SIZE; - return __vma_aligned_range_pmd_shareable(vma, start, end); + return __vma_aligned_range_pmd_shareable(vma, start, end, true); } bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { -#ifdef CONFIG_USERFAULTFD - if (uffd_disable_huge_pmd_share(vma)) - return false; -#endif return vma_addr_pmd_shareable(vma, addr); } @@ -6696,6 +6732,130 @@ void adjust_range_if_pmd_sharing_possible(struct vm_area_struct *vma, *end = ALIGN(*end, PUD_SIZE); } +static bool __vma_shareable_flags_pmd(struct vm_area_struct *vma) +{ + return vma->vm_flags & (VM_MAYSHARE | VM_SHARED) && + vma->vm_private_data; +} + +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_read(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + down_write(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + up_write(&vma_lock->rw_sema); + } +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + if (!__vma_shareable_flags_pmd(vma)) + return 1; + + return down_write_trylock(&vma_lock->rw_sema); +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + lockdep_assert_held(&vma_lock->rw_sema); + } +} + +void hugetlb_vma_lock_release(struct kref *kref) +{ + struct hugetlb_vma_lock *vma_lock = container_of(kref, + struct hugetlb_vma_lock, refs); + + kfree(vma_lock); +} + +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ + /* + * Only present in sharable vmas. See comment in + * __unmap_hugepage_range_final about how VM_SHARED could + * be set without VM_MAYSHARE. As a result, we need to + * check if either is set in the free path. + */ + if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED))) + return; + + if (vma->vm_private_data) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + /* + * vma_lock structure may or not be released, but it + * certainly will no longer be attached to vma so clear + * pointer. + */ + vma_lock->vma = NULL; + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + vma->vm_private_data = NULL; + } +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ + struct hugetlb_vma_lock *vma_lock; + + /* Only establish in (flags) sharable vmas */ + if (!vma || !(vma->vm_flags & VM_MAYSHARE)) + return; + + /* Should never get here with non-NULL vm_private_data */ + if (vma->vm_private_data) + return; + + /* Check size/alignment for pmd sharing possible */ + if (!vma_pmd_shareable(vma)) + return; + + vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); + if (!vma_lock) + /* + * If we can not allocate structure, then vma can not + * participate in pmd sharing. + */ + return; + + kref_init(&vma_lock->refs); + init_rwsem(&vma_lock->rw_sema); + vma_lock->vma = vma; + vma->vm_private_data = vma_lock; +} + /* * Search for a shareable pmd page for hugetlb. In any case calls pmd_alloc() * and returns the corresponding pte. While this is not necessary for the @@ -6782,6 +6942,19 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, } #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ + +void hugetlb_vma_lock_release(struct kref *kref) +{ +} + +static void hugetlb_vma_lock_free(struct vm_area_struct *vma) +{ +} + +static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) +{ +} + pte_t *huge_pmd_share(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pud_t *pud) { diff --git a/mm/rmap.c b/mm/rmap.c index 2a08647a61fc..0e179c823e0a 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -24,7 +24,7 @@ * mm->mmap_lock * mapping->invalidate_lock (in filemap_fault) * page->flags PG_locked (lock_page) - * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share) + * hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share, see hugetlbfs below) * mapping->i_mmap_rwsem * anon_vma->rwsem * mm->page_table_lock or pte_lock @@ -44,6 +44,12 @@ * anon_vma->rwsem,mapping->i_mmap_rwsem (memory_failure, collect_procs_anon) * ->tasklist_lock * pte map lock + * + * hugetlbfs PageHuge() take locks in this order: + * hugetlb_fault_mutex (hugetlbfs specific page fault mutex) + * vma_lock (hugetlb specific lock for pmd_sharing) + * mapping->i_mmap_rwsem (also used for hugetlb pmd sharing) + * page->flags PG_locked (lock_page) */ #include From 378397ccb8e5a695a42e819df545ccd28641b683 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:08 -0700 Subject: [PATCH 247/350] hugetlb: create hugetlb_unmap_file_folio to unmap single file folio Create the new routine hugetlb_unmap_file_folio that will unmap a single file folio. This is refactored code from hugetlb_vmdelete_list. It is modified to do locking within the routine itself and check whether the page is mapped within a specific vma before unmapping. This refactoring will be put to use and expanded upon in a subsequent patch adding vma specific locking. Link: https://lkml.kernel.org/r/20220914221810.95771-8-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Miaohe Lin Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 123 +++++++++++++++++++++++++++++++++---------- 1 file changed, 94 insertions(+), 29 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 7112a9a9f54d..3bb1772fce2f 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -371,6 +371,94 @@ static void hugetlb_delete_from_page_cache(struct page *page) delete_from_page_cache(page); } +/* + * Called with i_mmap_rwsem held for inode based vma maps. This makes + * sure vma (and vm_mm) will not go away. We also hold the hugetlb fault + * mutex for the page in the mapping. So, we can not race with page being + * faulted into the vma. + */ +static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, + unsigned long addr, struct page *page) +{ + pte_t *ptep, pte; + + ptep = huge_pte_offset(vma->vm_mm, addr, + huge_page_size(hstate_vma(vma))); + + if (!ptep) + return false; + + pte = huge_ptep_get(ptep); + if (huge_pte_none(pte) || !pte_present(pte)) + return false; + + if (pte_page(pte) == page) + return true; + + return false; +} + +/* + * Can vma_offset_start/vma_offset_end overflow on 32-bit arches? + * No, because the interval tree returns us only those vmas + * which overlap the truncated area starting at pgoff, + * and no vma on a 32-bit arch can span beyond the 4GB. + */ +static unsigned long vma_offset_start(struct vm_area_struct *vma, pgoff_t start) +{ + if (vma->vm_pgoff < start) + return (start - vma->vm_pgoff) << PAGE_SHIFT; + else + return 0; +} + +static unsigned long vma_offset_end(struct vm_area_struct *vma, pgoff_t end) +{ + unsigned long t_end; + + if (!end) + return vma->vm_end; + + t_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) + vma->vm_start; + if (t_end > vma->vm_end) + t_end = vma->vm_end; + return t_end; +} + +/* + * Called with hugetlb fault mutex held. Therefore, no more mappings to + * this folio can be created while executing the routine. + */ +static void hugetlb_unmap_file_folio(struct hstate *h, + struct address_space *mapping, + struct folio *folio, pgoff_t index) +{ + struct rb_root_cached *root = &mapping->i_mmap; + struct page *page = &folio->page; + struct vm_area_struct *vma; + unsigned long v_start; + unsigned long v_end; + pgoff_t start, end; + + start = index * pages_per_huge_page(h); + end = (index + 1) * pages_per_huge_page(h); + + i_mmap_lock_write(mapping); + + vma_interval_tree_foreach(vma, root, start, end - 1) { + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + + if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + continue; + + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, + NULL, ZAP_FLAG_DROP_MARKER); + } + + i_mmap_unlock_write(mapping); +} + static void hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, zap_flags_t zap_flags) @@ -383,30 +471,13 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, * an inclusive "last". */ vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { - unsigned long v_offset; + unsigned long v_start; unsigned long v_end; - /* - * Can the expression below overflow on 32-bit arches? - * No, because the interval tree returns us only those vmas - * which overlap the truncated area starting at pgoff, - * and no vma on a 32-bit arch can span beyond the 4GB. - */ - if (vma->vm_pgoff < start) - v_offset = (start - vma->vm_pgoff) << PAGE_SHIFT; - else - v_offset = 0; + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); - if (!end) - v_end = vma->vm_end; - else { - v_end = ((end - vma->vm_pgoff) << PAGE_SHIFT) - + vma->vm_start; - if (v_end > vma->vm_end) - v_end = vma->vm_end; - } - - unmap_hugepage_range(vma, vma->vm_start + v_offset, v_end, + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, zap_flags); } } @@ -428,14 +499,8 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, * the fault mutex. The mutex will prevent faults * until we finish removing the folio. */ - if (unlikely(folio_mapped(folio))) { - i_mmap_lock_write(mapping); - hugetlb_vmdelete_list(&mapping->i_mmap, - index * pages_per_huge_page(h), - (index + 1) * pages_per_huge_page(h), - ZAP_FLAG_DROP_MARKER); - i_mmap_unlock_write(mapping); - } + if (unlikely(folio_mapped(folio))) + hugetlb_unmap_file_folio(h, mapping, folio, index); folio_lock(folio); /* From 40549ba8f8e0ed1f8b235979563f619e9aa34fdf Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:09 -0700 Subject: [PATCH 248/350] hugetlb: use new vma_lock for pmd sharing synchronization The new hugetlb vma lock is used to address this race: Faulting thread Unsharing thread ... ... ptep = huge_pte_offset() or ptep = huge_pte_alloc() ... i_mmap_lock_write lock page table ptep invalid <------------------------ huge_pmd_unshare() Could be in a previously unlock_page_table sharing process or worse i_mmap_unlock_write ... The vma_lock is used as follows: - During fault processing. The lock is acquired in read mode before doing a page table lock and allocation (huge_pte_alloc). The lock is held until code is finished with the page table entry (ptep). - The lock must be held in write mode whenever huge_pmd_unshare is called. Lock ordering issues come into play when unmapping a page from all vmas mapping the page. The i_mmap_rwsem must be held to search for the vmas, and the vma lock must be held before calling unmap which will call huge_pmd_unshare. This is done today in: - try_to_migrate_one and try_to_unmap_ for page migration and memory error handling. In these routines we 'try' to obtain the vma lock and fail to unmap if unsuccessful. Calling routines already deal with the failure of unmapping. - hugetlb_vmdelete_list for truncation and hole punch. This routine also tries to acquire the vma lock. If it fails, it skips the unmapping. However, we can not have file truncation or hole punch fail because of contention. After hugetlb_vmdelete_list, truncation and hole punch call remove_inode_hugepages. remove_inode_hugepages checks for mapped pages and call hugetlb_unmap_file_page to unmap them. hugetlb_unmap_file_page is designed to drop locks and reacquire in the correct order to guarantee unmap success. Link: https://lkml.kernel.org/r/20220914221810.95771-9-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 66 +++++++++++++++++++++++++++- mm/hugetlb.c | 102 +++++++++++++++++++++++++++++++++++++++---- mm/memory.c | 2 + mm/rmap.c | 98 ++++++++++++++++++++++++++--------------- mm/userfaultfd.c | 9 +++- 5 files changed, 232 insertions(+), 45 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 3bb1772fce2f..009ae539b9b2 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -434,6 +434,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h, struct folio *folio, pgoff_t index) { struct rb_root_cached *root = &mapping->i_mmap; + struct hugetlb_vma_lock *vma_lock; struct page *page = &folio->page; struct vm_area_struct *vma; unsigned long v_start; @@ -444,7 +445,8 @@ static void hugetlb_unmap_file_folio(struct hstate *h, end = (index + 1) * pages_per_huge_page(h); i_mmap_lock_write(mapping); - +retry: + vma_lock = NULL; vma_interval_tree_foreach(vma, root, start, end - 1) { v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); @@ -452,11 +454,63 @@ static void hugetlb_unmap_file_folio(struct hstate *h, if (!hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) continue; + if (!hugetlb_vma_trylock_write(vma)) { + vma_lock = vma->vm_private_data; + /* + * If we can not get vma lock, we need to drop + * immap_sema and take locks in order. First, + * take a ref on the vma_lock structure so that + * we can be guaranteed it will not go away when + * dropping immap_sema. + */ + kref_get(&vma_lock->refs); + break; + } + unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); + hugetlb_vma_unlock_write(vma); } i_mmap_unlock_write(mapping); + + if (vma_lock) { + /* + * Wait on vma_lock. We know it is still valid as we have + * a reference. We must 'open code' vma locking as we do + * not know if vma_lock is still attached to vma. + */ + down_write(&vma_lock->rw_sema); + i_mmap_lock_write(mapping); + + vma = vma_lock->vma; + if (!vma) { + /* + * If lock is no longer attached to vma, then just + * unlock, drop our reference and retry looking for + * other vmas. + */ + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + goto retry; + } + + /* + * vma_lock is still attached to vma. Check to see if vma + * still maps page and if so, unmap. + */ + v_start = vma_offset_start(vma, start); + v_end = vma_offset_end(vma, end); + if (hugetlb_vma_maps_page(vma, vma->vm_start + v_start, page)) + unmap_hugepage_range(vma, vma->vm_start + v_start, + v_end, NULL, + ZAP_FLAG_DROP_MARKER); + + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); + hugetlb_vma_unlock_write(vma); + + goto retry; + } } static void @@ -474,11 +528,21 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end, unsigned long v_start; unsigned long v_end; + if (!hugetlb_vma_trylock_write(vma)) + continue; + v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); unmap_hugepage_range(vma, vma->vm_start + v_start, v_end, NULL, zap_flags); + + /* + * Note that vma lock only exists for shared/non-private + * vmas. Therefore, lock is not held when calling + * unmap_hugepage_range for private vmas. + */ + hugetlb_vma_unlock_write(vma); } } diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f44b79998ac2..d78504959df7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4796,6 +4796,14 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src, mmu_notifier_invalidate_range_start(&range); mmap_assert_write_locked(src); raw_write_seqcount_begin(&src->write_protect_seq); + } else { + /* + * For shared mappings the vma lock must be held before + * calling huge_pte_offset in the src vma. Otherwise, the + * returned ptep could go away if part of a shared pmd and + * another thread calls huge_pmd_unshare. + */ + hugetlb_vma_lock_read(src_vma); } last_addr_mask = hugetlb_mask_last_page(h); @@ -4942,6 +4950,8 @@ again: if (cow) { raw_write_seqcount_end(&src->write_protect_seq); mmu_notifier_invalidate_range_end(&range); + } else { + hugetlb_vma_unlock_read(src_vma); } return ret; @@ -5000,6 +5010,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, mmu_notifier_invalidate_range_start(&range); last_addr_mask = hugetlb_mask_last_page(h); /* Prevent race with file truncation */ + hugetlb_vma_lock_write(vma); i_mmap_lock_write(mapping); for (; old_addr < old_end; old_addr += sz, new_addr += sz) { src_pte = huge_pte_offset(mm, old_addr, sz); @@ -5031,6 +5042,7 @@ int move_hugetlb_page_tables(struct vm_area_struct *vma, flush_tlb_range(vma, old_end - len, old_end); mmu_notifier_invalidate_range_end(&range); i_mmap_unlock_write(mapping); + hugetlb_vma_unlock_write(vma); return len + old_addr - old_end; } @@ -5350,8 +5362,29 @@ retry_avoidcopy: * may get SIGKILLed if it later faults. */ if (outside_reserve) { + struct address_space *mapping = vma->vm_file->f_mapping; + pgoff_t idx; + u32 hash; + put_page(old_page); + /* + * Drop hugetlb_fault_mutex and vma_lock before + * unmapping. unmapping needs to hold vma_lock + * in write mode. Dropping vma_lock in read mode + * here is OK as COW mappings do not interact with + * PMD sharing. + * + * Reacquire both after unmap operation. + */ + idx = vma_hugecache_offset(h, vma, haddr); + hash = hugetlb_fault_mutex_hash(mapping, idx); + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + unmap_ref_private(mm, vma, old_page, haddr); + + mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); spin_lock(ptl); ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (likely(ptep && @@ -5500,14 +5533,16 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, }; /* - * hugetlb_fault_mutex and i_mmap_rwsem must be + * vma_lock and hugetlb_fault_mutex must be * dropped before handling userfault. Reacquire * after handling fault to make calling code simpler. */ + hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, reason); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(vma); return ret; } @@ -5741,6 +5776,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { + /* + * Since we hold no locks, ptep could be stale. That is + * OK as we are only making decisions based on content and + * not actually modifying content here. + */ entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { migration_entry_wait_huge(vma, ptep); @@ -5748,23 +5788,35 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, } else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); - } else { - ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); - if (!ptep) - return VM_FAULT_OOM; } - mapping = vma->vm_file->f_mapping; - idx = vma_hugecache_offset(h, vma, haddr); - /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ + mapping = vma->vm_file->f_mapping; + idx = vma_hugecache_offset(h, vma, haddr); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); + /* + * Acquire vma lock before calling huge_pte_alloc and hold + * until finished with ptep. This prevents huge_pmd_unshare from + * being called elsewhere and making the ptep no longer valid. + * + * ptep could have already be assigned via huge_pte_offset. That + * is OK, as huge_pte_alloc will return the same value unless + * something has changed. + */ + hugetlb_vma_lock_read(vma); + ptep = huge_pte_alloc(mm, vma, haddr, huge_page_size(h)); + if (!ptep) { + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); + return VM_FAULT_OOM; + } + entry = huge_ptep_get(ptep); /* PTE markers should be handled the same way as none pte */ if (huge_pte_none_mostly(entry)) { @@ -5825,6 +5877,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, unlock_page(pagecache_page); put_page(pagecache_page); } + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); return handle_userfault(&vmf, VM_UFFD_WP); } @@ -5868,6 +5921,7 @@ out_ptl: put_page(pagecache_page); } out_mutex: + hugetlb_vma_unlock_read(vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); /* * Generally it's safe to hold refcount during waiting page lock. But @@ -6330,8 +6384,9 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, flush_cache_range(vma, range.start, range.end); mmu_notifier_invalidate_range_start(&range); - last_addr_mask = hugetlb_mask_last_page(h); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); + last_addr_mask = hugetlb_mask_last_page(h); for (; address < end; address += psize) { spinlock_t *ptl; ptep = huge_pte_offset(mm, address, psize); @@ -6430,6 +6485,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, * See Documentation/mm/mmu_notifier.rst */ i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); mmu_notifier_invalidate_range_end(&range); return pages << h->order; @@ -6931,6 +6987,7 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, pud_t *pud = pud_offset(p4d, addr); i_mmap_assert_write_locked(vma->vm_file->f_mapping); + hugetlb_vma_assert_locked(vma); BUG_ON(page_count(virt_to_page(ptep)) == 0); if (page_count(virt_to_page(ptep)) == 1) return 0; @@ -6943,6 +7000,31 @@ int huge_pmd_unshare(struct mm_struct *mm, struct vm_area_struct *vma, #else /* !CONFIG_ARCH_WANT_HUGE_PMD_SHARE */ +void hugetlb_vma_lock_read(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_unlock_read(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_lock_write(struct vm_area_struct *vma) +{ +} + +void hugetlb_vma_unlock_write(struct vm_area_struct *vma) +{ +} + +int hugetlb_vma_trylock_write(struct vm_area_struct *vma) +{ + return 1; +} + +void hugetlb_vma_assert_locked(struct vm_area_struct *vma) +{ +} + void hugetlb_vma_lock_release(struct kref *kref) { } @@ -7325,6 +7407,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, mm, start, end); mmu_notifier_invalidate_range_start(&range); + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); for (address = start; address < end; address += PUD_SIZE) { ptep = huge_pte_offset(mm, address, sz); @@ -7336,6 +7419,7 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) } flush_hugetlb_tlb_range(vma, start, end); i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); /* * No need to call mmu_notifier_invalidate_range(), see * Documentation/mm/mmu_notifier.rst. diff --git a/mm/memory.c b/mm/memory.c index c01c12500169..b3ed17219d77 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1684,10 +1684,12 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; + hugetlb_vma_lock_write(vma); i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL, zap_flags); i_mmap_unlock_write(vma->vm_file->f_mapping); + hugetlb_vma_unlock_write(vma); } } else unmap_page_range(tlb, vma, start, end, details); diff --git a/mm/rmap.c b/mm/rmap.c index 0e179c823e0a..b6743c2b8b5f 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -1551,24 +1551,39 @@ static bool try_to_unmap_one(struct folio *folio, struct vm_area_struct *vma, * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and fail + * if unsuccessful. */ - VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); - - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); } pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { @@ -1926,26 +1941,41 @@ static bool try_to_migrate_one(struct folio *folio, struct vm_area_struct *vma, * To call huge_pmd_unshare, i_mmap_rwsem must be * held in write mode. Caller needs to explicitly * do this outside rmap routines. + * + * We also must hold hugetlb vma_lock in write mode. + * Lock order dictates acquiring vma_lock BEFORE + * i_mmap_rwsem. We can only try lock here and + * fail if unsuccessful. */ - VM_BUG_ON(!anon && !(flags & TTU_RMAP_LOCKED)); - if (!anon && huge_pmd_unshare(mm, vma, address, pvmw.pte)) { - flush_tlb_range(vma, range.start, range.end); - mmu_notifier_invalidate_range(mm, range.start, - range.end); + if (!anon) { + VM_BUG_ON(!(flags & TTU_RMAP_LOCKED)); + if (!hugetlb_vma_trylock_write(vma)) { + page_vma_mapped_walk_done(&pvmw); + ret = false; + break; + } + if (huge_pmd_unshare(mm, vma, address, pvmw.pte)) { + hugetlb_vma_unlock_write(vma); + flush_tlb_range(vma, + range.start, range.end); + mmu_notifier_invalidate_range(mm, + range.start, range.end); - /* - * The ref count of the PMD page was dropped - * which is part of the way map counting - * is done for shared PMDs. Return 'true' - * here. When there is no other sharing, - * huge_pmd_unshare returns false and we will - * unmap the actual page and drop map count - * to zero. - */ - page_vma_mapped_walk_done(&pvmw); - break; + /* + * The ref count of the PMD page was + * dropped which is part of the way map + * counting is done for shared PMDs. + * Return 'true' here. When there is + * no other sharing, huge_pmd_unshare + * returns false and we will unmap the + * actual page and drop map count + * to zero. + */ + page_vma_mapped_walk_done(&pvmw); + break; + } + hugetlb_vma_unlock_write(vma); } - /* Nuke the hugetlb page table entry */ pteval = huge_ptep_clear_flush(vma, address, pvmw.pte); } else { diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 0fdbd2c05587..e24e8a47ce8a 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -379,16 +379,21 @@ retry: BUG_ON(dst_addr >= dst_start + len); /* - * Serialize via hugetlb_fault_mutex. + * Serialize via vma_lock and hugetlb_fault_mutex. + * vma_lock ensures the dst_pte remains valid even + * in the case of shared pmds. fault mutex prevents + * races with other faulting threads. */ idx = linear_page_index(dst_vma, dst_addr); mapping = dst_vma->vm_file->f_mapping; hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_lock(&hugetlb_fault_mutex_table[hash]); + hugetlb_vma_lock_read(dst_vma); err = -ENOMEM; dst_pte = huge_pte_alloc(dst_mm, dst_vma, dst_addr, vma_hpagesize); if (!dst_pte) { + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } @@ -396,6 +401,7 @@ retry: if (mode != MCOPY_ATOMIC_CONTINUE && !huge_pte_none_mostly(huge_ptep_get(dst_pte))) { err = -EEXIST; + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); goto out_unlock; } @@ -404,6 +410,7 @@ retry: dst_addr, src_addr, mode, &page, wp_copy); + hugetlb_vma_unlock_read(dst_vma); mutex_unlock(&hugetlb_fault_mutex_table[hash]); cond_resched(); From fa27759af4a6d7494c986c44695b13bcd6eaf46b Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Wed, 14 Sep 2022 15:18:10 -0700 Subject: [PATCH 249/350] hugetlb: clean up code checking for fault/truncation races With the new hugetlb vma lock in place, it can also be used to handle page fault races with file truncation. The lock is taken at the beginning of the code fault path in read mode. During truncation, it is taken in write mode for each vma which has the file mapped. The file's size (i_size) is modified before taking the vma lock to unmap. How are races handled? The page fault code checks i_size early in processing after taking the vma lock. If the fault is beyond i_size, the fault is aborted. If the fault is not beyond i_size the fault will continue and a new page will be added to the file. It could be that truncation code modifies i_size after the check in fault code. That is OK, as truncation code will soon remove the page. The truncation code will wait until the fault is finished, as it must obtain the vma lock in write mode. This patch cleans up/removes late checks in the fault paths that try to back out pages racing with truncation. As noted above, we just let the truncation code remove the pages. [mike.kravetz@oracle.com: fix reserve_alloc set but not used compiler warning] Link: https://lkml.kernel.org/r/Yyj7HsJWfHDoU24U@monkey Link: https://lkml.kernel.org/r/20220914221810.95771-10-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- fs/hugetlbfs/inode.c | 31 ++++++++++++------------------- mm/hugetlb.c | 24 +++--------------------- 2 files changed, 15 insertions(+), 40 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 009ae539b9b2..ed57a029eab0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -568,26 +568,19 @@ static bool remove_inode_single_folio(struct hstate *h, struct inode *inode, folio_lock(folio); /* - * After locking page, make sure mapping is the same. - * We could have raced with page fault populate and - * backout code. + * We must remove the folio from page cache before removing + * the region/ reserve map (hugetlb_unreserve_pages). In + * rare out of memory conditions, removal of the region/reserve + * map could fail. Correspondingly, the subpool and global + * reserve usage count can need to be adjusted. */ - if (folio_mapping(folio) == mapping) { - /* - * We must remove the folio from page cache before removing - * the region/ reserve map (hugetlb_unreserve_pages). In - * rare out of memory conditions, removal of the region/reserve - * map could fail. Correspondingly, the subpool and global - * reserve usage count can need to be adjusted. - */ - VM_BUG_ON(HPageRestoreReserve(&folio->page)); - hugetlb_delete_from_page_cache(&folio->page); - ret = true; - if (!truncate_op) { - if (unlikely(hugetlb_unreserve_pages(inode, index, - index + 1, 1))) - hugetlb_fix_reserve_counts(inode); - } + VM_BUG_ON(HPageRestoreReserve(&folio->page)); + hugetlb_delete_from_page_cache(&folio->page); + ret = true; + if (!truncate_op) { + if (unlikely(hugetlb_unreserve_pages(inode, index, + index + 1, 1))) + hugetlb_fix_reserve_counts(inode); } folio_unlock(folio); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index d78504959df7..b0e39045a7a8 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5680,10 +5680,6 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, } ptl = huge_pte_lock(h, mm, ptep); - size = i_size_read(mapping->host) >> huge_page_shift(h); - if (idx >= size) - goto backout; - ret = 0; /* If pte changed from under us, retry */ if (!pte_same(huge_ptep_get(ptep), old_pte)) @@ -5727,10 +5723,10 @@ out: backout: spin_unlock(ptl); backout_unlocked: - unlock_page(page); - /* restore reserve for newly allocated pages not in page cache */ if (new_page && !new_pagecache_page) restore_reserve_on_error(h, vma, haddr, page); + + unlock_page(page); put_page(page); goto out; } @@ -6062,26 +6058,12 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, ptl = huge_pte_lock(h, dst_mm, dst_pte); - /* - * Recheck the i_size after holding PT lock to make sure not - * to leave any page mapped (as page_mapped()) beyond the end - * of the i_size (remove_inode_hugepages() is strict about - * enforcing that). If we bail out here, we'll also leave a - * page in the radix tree in the vm_shared case beyond the end - * of the i_size, but remove_inode_hugepages() will take care - * of it as soon as we drop the hugetlb_fault_mutex_table. - */ - size = i_size_read(mapping->host) >> huge_page_shift(h); - ret = -EFAULT; - if (idx >= size) - goto out_release_unlock; - - ret = -EEXIST; /* * We allow to overwrite a pte marker: consider when both MISSING|WP * registered, we firstly wr-protect a none pte which has no page cache * page backing it, then access the page. */ + ret = -EEXIST; if (!huge_pte_none_mostly(huge_ptep_get(dst_pte))) goto out_release_unlock; From e41e614f6a3e3d0d21874a785d3a67d353e282da Mon Sep 17 00:00:00 2001 From: Dmitry Vyukov Date: Thu, 15 Sep 2022 17:03:35 +0200 Subject: [PATCH 250/350] x86: add missing include to sparsemem.h Patch series "Add KernelMemorySanitizer infrastructure", v7. KernelMemorySanitizer (KMSAN) is a detector of errors related to uses of uninitialized memory. It relies on compile-time Clang instrumentation (similar to MSan in the userspace [1]) and tracks the state of every bit of kernel memory, being able to report an error if uninitialized value is used in a condition, dereferenced, or escapes to userspace, USB or DMA. KMSAN has reported more than 300 bugs in the past few years (recently fixed bugs: [2]), most of them with the help of syzkaller. Such bugs keep getting introduced into the kernel despite new compiler warnings and other analyses (the 6.0 cycle already resulted in several KMSAN-reported bugs, e.g. [3]). Mitigations like total stack and heap initialization are unfortunately very far from being deployable. The proposed patchset contains KMSAN runtime implementation together with small changes to other subsystems needed to make KMSAN work. The latter changes fall into several categories: 1. Changes and refactorings of existing code required to add KMSAN: - [01/43] x86: add missing include to sparsemem.h - [02/43] stackdepot: reserve 5 extra bits in depot_stack_handle_t - [03/43] instrumented.h: allow instrumenting both sides of copy_from_user() - [04/43] x86: asm: instrument usercopy in get_user() and __put_user_size() - [05/43] asm-generic: instrument usercopy in cacheflush.h - [10/43] libnvdimm/pfn_dev: increase MAX_STRUCT_PAGE_SIZE 2. KMSAN-related declarations in generic code, KMSAN runtime library, docs and configs: - [06/43] kmsan: add ReST documentation - [07/43] kmsan: introduce __no_sanitize_memory and __no_kmsan_checks - [09/43] x86: kmsan: pgtable: reduce vmalloc space - [11/43] kmsan: add KMSAN runtime core - [13/43] MAINTAINERS: add entry for KMSAN - [24/43] kmsan: add tests for KMSAN - [31/43] objtool: kmsan: list KMSAN API functions as uaccess-safe - [35/43] x86: kmsan: use __msan_ string functions where possible - [43/43] x86: kmsan: enable KMSAN builds for x86 3. Adding hooks from different subsystems to notify KMSAN about memory state changes: - [14/43] mm: kmsan: maintain KMSAN metadata for page - [15/43] mm: kmsan: call KMSAN hooks from SLUB code - [16/43] kmsan: handle task creation and exiting - [17/43] init: kmsan: call KMSAN initialization routines - [18/43] instrumented.h: add KMSAN support - [19/43] kmsan: add iomap support - [20/43] Input: libps2: mark data received in __ps2_command() as initialized - [21/43] dma: kmsan: unpoison DMA mappings - [34/43] x86: kmsan: handle open-coded assembly in lib/iomem.c - [36/43] x86: kmsan: sync metadata pages on page fault 4. Changes that prevent false reports by explicitly initializing memory, disabling optimized code that may trick KMSAN, selectively skipping instrumentation: - [08/43] kmsan: mark noinstr as __no_sanitize_memory - [12/43] kmsan: disable instrumentation of unsupported common kernel code - [22/43] virtio: kmsan: check/unpoison scatterlist in vring_map_one_sg() - [23/43] kmsan: handle memory sent to/from USB - [25/43] kmsan: disable strscpy() optimization under KMSAN - [26/43] crypto: kmsan: disable accelerated configs under KMSAN - [27/43] kmsan: disable physical page merging in biovec - [28/43] block: kmsan: skip bio block merging logic for KMSAN - [29/43] kcov: kmsan: unpoison area->list in kcov_remote_area_put() - [30/43] security: kmsan: fix interoperability with auto-initialization - [32/43] x86: kmsan: disable instrumentation of unsupported code - [33/43] x86: kmsan: skip shadow checks in __switch_to() - [37/43] x86: kasan: kmsan: support CONFIG_GENERIC_CSUM on x86, enable it for KASAN/KMSAN - [38/43] x86: fs: kmsan: disable CONFIG_DCACHE_WORD_ACCESS - [39/43] x86: kmsan: don't instrument stack walking functions - [40/43] entry: kmsan: introduce kmsan_unpoison_entry_regs() 5. Fixes for bugs detected with CONFIG_KMSAN_CHECK_PARAM_RETVAL: - [41/43] bpf: kmsan: initialize BPF registers with zeroes - [42/43] mm: fs: initialize fsdata passed to write_begin/write_end interface This patchset allows one to boot and run a defconfig+KMSAN kernel on a QEMU without known false positives. It however doesn't guarantee there are no false positives in drivers of certain devices or less tested subsystems, although KMSAN is actively tested on syzbot with a large config. By default, KMSAN enforces conservative checks of most kernel function parameters passed by value (via CONFIG_KMSAN_CHECK_PARAM_RETVAL, which maps to the -fsanitize-memory-param-retval compiler flag). As discussed in [4] and [5], passing uninitialized values as function parameters is considered undefined behavior, therefore KMSAN now reports such cases as errors. Several newly added patches fix known manifestations of these errors. This patch (of 43): Including sparsemem.h from other files (e.g. transitively via asm/pgtable_64_types.h) results in compilation errors due to unknown types: sparsemem.h:34:32: error: unknown type name 'phys_addr_t' extern int phys_to_target_node(phys_addr_t start); ^ sparsemem.h:36:39: error: unknown type name 'u64' extern int memory_add_physaddr_to_nid(u64 start); ^ Fix these errors by including linux/types.h from sparsemem.h This is required for the upcoming KMSAN patches. Link: https://lkml.kernel.org/r/20220915150417.722975-1-glider@google.com Link: https://lkml.kernel.org/r/20220915150417.722975-2-glider@google.com Signed-off-by: Dmitry Vyukov Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Cc: Andrey Konovalov Cc: Eric Biggers Signed-off-by: Andrew Morton --- arch/x86/include/asm/sparsemem.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index 6a9ccc1b2be5..64df897c0ee3 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -2,6 +2,8 @@ #ifndef _ASM_X86_SPARSEMEM_H #define _ASM_X86_SPARSEMEM_H +#include + #ifdef CONFIG_SPARSEMEM /* * generic non-linear memory support: From 83a4f1ef45a90d740bc6edf6a2533b14a3e5d183 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:36 +0200 Subject: [PATCH 251/350] stackdepot: reserve 5 extra bits in depot_stack_handle_t Some users (currently only KMSAN) may want to use spare bits in depot_stack_handle_t. Let them do so by adding @extra_bits to __stack_depot_save() to store arbitrary flags, and providing stack_depot_get_extra_bits() to retrieve those flags. Also adapt KASAN to the new prototype by passing extra_bits=0, as KASAN does not intend to store additional information in the stack handle. Link: https://lkml.kernel.org/r/20220915150417.722975-3-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/stackdepot.h | 8 ++++++++ lib/stackdepot.c | 29 ++++++++++++++++++++++++----- mm/kasan/common.c | 2 +- 3 files changed, 33 insertions(+), 6 deletions(-) diff --git a/include/linux/stackdepot.h b/include/linux/stackdepot.h index bc2797955de9..9ca7798d7a31 100644 --- a/include/linux/stackdepot.h +++ b/include/linux/stackdepot.h @@ -14,9 +14,15 @@ #include typedef u32 depot_stack_handle_t; +/* + * Number of bits in the handle that stack depot doesn't use. Users may store + * information in them. + */ +#define STACK_DEPOT_EXTRA_BITS 5 depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t gfp_flags, bool can_alloc); /* @@ -59,6 +65,8 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int stack_depot_fetch(depot_stack_handle_t handle, unsigned long **entries); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle); + int stack_depot_snprint(depot_stack_handle_t handle, char *buf, size_t size, int spaces); diff --git a/lib/stackdepot.c b/lib/stackdepot.c index e73fda23388d..79e894cf8406 100644 --- a/lib/stackdepot.c +++ b/lib/stackdepot.c @@ -43,7 +43,8 @@ #define STACK_ALLOC_OFFSET_BITS (STACK_ALLOC_ORDER + PAGE_SHIFT - \ STACK_ALLOC_ALIGN) #define STACK_ALLOC_INDEX_BITS (DEPOT_STACK_BITS - \ - STACK_ALLOC_NULL_PROTECTION_BITS - STACK_ALLOC_OFFSET_BITS) + STACK_ALLOC_NULL_PROTECTION_BITS - \ + STACK_ALLOC_OFFSET_BITS - STACK_DEPOT_EXTRA_BITS) #define STACK_ALLOC_SLABS_CAP 8192 #define STACK_ALLOC_MAX_SLABS \ (((1LL << (STACK_ALLOC_INDEX_BITS)) < STACK_ALLOC_SLABS_CAP) ? \ @@ -56,6 +57,7 @@ union handle_parts { u32 slabindex : STACK_ALLOC_INDEX_BITS; u32 offset : STACK_ALLOC_OFFSET_BITS; u32 valid : STACK_ALLOC_NULL_PROTECTION_BITS; + u32 extra : STACK_DEPOT_EXTRA_BITS; }; }; @@ -77,6 +79,14 @@ static int next_slab_inited; static size_t depot_offset; static DEFINE_RAW_SPINLOCK(depot_lock); +unsigned int stack_depot_get_extra_bits(depot_stack_handle_t handle) +{ + union handle_parts parts = { .handle = handle }; + + return parts.extra; +} +EXPORT_SYMBOL(stack_depot_get_extra_bits); + static bool init_stack_slab(void **prealloc) { if (!*prealloc) @@ -140,6 +150,7 @@ depot_alloc_stack(unsigned long *entries, int size, u32 hash, void **prealloc) stack->handle.slabindex = depot_index; stack->handle.offset = depot_offset >> STACK_ALLOC_ALIGN; stack->handle.valid = 1; + stack->handle.extra = 0; memcpy(stack->entries, entries, flex_array_size(stack, entries, size)); depot_offset += required_size; @@ -382,6 +393,7 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * * @entries: Pointer to storage array * @nr_entries: Size of the storage array + * @extra_bits: Flags to store in unused bits of depot_stack_handle_t * @alloc_flags: Allocation gfp flags * @can_alloc: Allocate stack slabs (increased chance of failure if false) * @@ -393,6 +405,10 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); * If the stack trace in @entries is from an interrupt, only the portion up to * interrupt entry is saved. * + * Additional opaque flags can be passed in @extra_bits, stored in the unused + * bits of the stack handle, and retrieved using stack_depot_get_extra_bits() + * without calling stack_depot_fetch(). + * * Context: Any context, but setting @can_alloc to %false is required if * alloc_pages() cannot be used from the current context. Currently * this is the case from contexts where neither %GFP_ATOMIC nor @@ -402,10 +418,11 @@ EXPORT_SYMBOL_GPL(stack_depot_fetch); */ depot_stack_handle_t __stack_depot_save(unsigned long *entries, unsigned int nr_entries, + unsigned int extra_bits, gfp_t alloc_flags, bool can_alloc) { struct stack_record *found = NULL, **bucket; - depot_stack_handle_t retval = 0; + union handle_parts retval = { .handle = 0 }; struct page *page = NULL; void *prealloc = NULL; unsigned long flags; @@ -489,9 +506,11 @@ exit: free_pages((unsigned long)prealloc, STACK_ALLOC_ORDER); } if (found) - retval = found->handle.handle; + retval.handle = found->handle.handle; fast_exit: - return retval; + retval.extra = extra_bits; + + return retval.handle; } EXPORT_SYMBOL_GPL(__stack_depot_save); @@ -511,6 +530,6 @@ depot_stack_handle_t stack_depot_save(unsigned long *entries, unsigned int nr_entries, gfp_t alloc_flags) { - return __stack_depot_save(entries, nr_entries, alloc_flags, true); + return __stack_depot_save(entries, nr_entries, 0, alloc_flags, true); } EXPORT_SYMBOL_GPL(stack_depot_save); diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 50f4338b477f..833bf2cfd2a3 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -43,7 +43,7 @@ depot_stack_handle_t kasan_save_stack(gfp_t flags, bool can_alloc) unsigned int nr_entries; nr_entries = stack_trace_save(entries, ARRAY_SIZE(entries), 0); - return __stack_depot_save(entries, nr_entries, flags, can_alloc); + return __stack_depot_save(entries, nr_entries, 0, flags, can_alloc); } void kasan_set_track(struct kasan_track *track, gfp_t flags) From 33b75c1d884e81ec97525e0a6fdcb187adf273f4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:37 +0200 Subject: [PATCH 252/350] instrumented.h: allow instrumenting both sides of copy_from_user() Introduce instrument_copy_from_user_before() and instrument_copy_from_user_after() hooks to be invoked before and after the call to copy_from_user(). KASAN and KCSAN will be only using instrument_copy_from_user_before(), but for KMSAN we'll need to insert code after copy_from_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-4-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/s390/lib/uaccess.c | 3 ++- include/linux/instrumented.h | 21 +++++++++++++++++++-- include/linux/uaccess.h | 19 ++++++++++++++----- lib/iov_iter.c | 9 ++++++--- lib/usercopy.c | 3 ++- 5 files changed, 43 insertions(+), 12 deletions(-) diff --git a/arch/s390/lib/uaccess.c b/arch/s390/lib/uaccess.c index d7b3b193d108..58033dfcb6d4 100644 --- a/arch/s390/lib/uaccess.c +++ b/arch/s390/lib/uaccess.c @@ -81,8 +81,9 @@ unsigned long _copy_from_user_key(void *to, const void __user *from, might_fault(); if (!should_fail_usercopy()) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user_key(to, from, n, key); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 42faebbaa202..ee8f7d17d34f 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -120,7 +120,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) } /** - * instrument_copy_from_user - instrument writes of copy_from_user + * instrument_copy_from_user_before - add instrumentation before copy_from_user * * Instrument writes to kernel memory, that are due to copy_from_user (and * variants). The instrumentation should be inserted before the accesses. @@ -130,10 +130,27 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) * @n number of bytes to copy */ static __always_inline void -instrument_copy_from_user(const void *to, const void __user *from, unsigned long n) +instrument_copy_from_user_before(const void *to, const void __user *from, unsigned long n) { kasan_check_write(to, n); kcsan_check_write(to, n); } +/** + * instrument_copy_from_user_after - add instrumentation after copy_from_user + * + * Instrument writes to kernel memory, that are due to copy_from_user (and + * variants). The instrumentation should be inserted after the accesses. + * + * @to destination address + * @from source address + * @n number of bytes to copy + * @left number of bytes not copied (as returned by copy_from_user) + */ +static __always_inline void +instrument_copy_from_user_after(const void *to, const void __user *from, + unsigned long n, unsigned long left) +{ +} + #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 47e5d374c7eb..afb18f198843 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -58,20 +58,28 @@ static __always_inline __must_check unsigned long __copy_from_user_inatomic(void *to, const void __user *from, unsigned long n) { - instrument_copy_from_user(to, from, n); + unsigned long res; + + instrument_copy_from_user_before(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } static __always_inline __must_check unsigned long __copy_from_user(void *to, const void __user *from, unsigned long n) { + unsigned long res; + might_fault(); + instrument_copy_from_user_before(to, from, n); if (should_fail_usercopy()) return n; - instrument_copy_from_user(to, from, n); check_object_size(to, n, false); - return raw_copy_from_user(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); + return res; } /** @@ -115,8 +123,9 @@ _copy_from_user(void *to, const void __user *from, unsigned long n) unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 4b7fce72e3e5..c3ca28ca68a6 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -174,13 +174,16 @@ static int copyout(void __user *to, const void *from, size_t n) static int copyin(void *to, const void __user *from, size_t n) { + size_t res = n; + if (should_fail_usercopy()) return n; if (access_ok(from, n)) { - instrument_copy_from_user(to, from, n); - n = raw_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); + res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } - return n; + return res; } static inline struct pipe_buffer *pipe_buf(const struct pipe_inode_info *pipe, diff --git a/lib/usercopy.c b/lib/usercopy.c index 7413dd300516..1505a52f23a0 100644 --- a/lib/usercopy.c +++ b/lib/usercopy.c @@ -12,8 +12,9 @@ unsigned long _copy_from_user(void *to, const void __user *from, unsigned long n unsigned long res = n; might_fault(); if (!should_fail_usercopy() && likely(access_ok(from, n))) { - instrument_copy_from_user(to, from, n); + instrument_copy_from_user_before(to, from, n); res = raw_copy_from_user(to, from, n); + instrument_copy_from_user_after(to, from, n, res); } if (unlikely(res)) memset(to + (n - res), 0, res); From 888f84a6da4d17e453058169fa7b235fff34f5bf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:38 +0200 Subject: [PATCH 253/350] x86: asm: instrument usercopy in get_user() and put_user() Use hooks from instrumented.h to notify bug detection tools about usercopy events in variations of get_user() and put_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-5-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/uaccess.h | 22 +++++++++++++++------- include/linux/instrumented.h | 28 ++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+), 7 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 913e593a3b45..c1b8982899ec 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -5,6 +5,7 @@ * User space memory access functions */ #include +#include #include #include #include @@ -103,6 +104,7 @@ extern int __get_user_bad(void); : "=a" (__ret_gu), "=r" (__val_gu), \ ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ + instrument_get_user(__val_gu); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ }) @@ -192,9 +194,11 @@ extern void __put_user_nocheck_8(void); int __ret_pu; \ void __user *__ptr_pu; \ register __typeof__(*(ptr)) __val_pu asm("%"_ASM_AX); \ - __chk_user_ptr(ptr); \ - __ptr_pu = (ptr); \ - __val_pu = (x); \ + __typeof__(*(ptr)) __x = (x); /* eval x once */ \ + __typeof__(ptr) __ptr = (ptr); /* eval ptr once */ \ + __chk_user_ptr(__ptr); \ + __ptr_pu = __ptr; \ + __val_pu = __x; \ asm volatile("call __" #fn "_%P[size]" \ : "=c" (__ret_pu), \ ASM_CALL_CONSTRAINT \ @@ -202,6 +206,7 @@ extern void __put_user_nocheck_8(void); "r" (__val_pu), \ [size] "i" (sizeof(*(ptr))) \ :"ebx"); \ + instrument_put_user(__x, __ptr, sizeof(*(ptr))); \ __builtin_expect(__ret_pu, 0); \ }) @@ -248,23 +253,25 @@ extern void __put_user_nocheck_8(void); #define __put_user_size(x, ptr, size, label) \ do { \ + __typeof__(*(ptr)) __x = (x); /* eval x once */ \ __chk_user_ptr(ptr); \ switch (size) { \ case 1: \ - __put_user_goto(x, ptr, "b", "iq", label); \ + __put_user_goto(__x, ptr, "b", "iq", label); \ break; \ case 2: \ - __put_user_goto(x, ptr, "w", "ir", label); \ + __put_user_goto(__x, ptr, "w", "ir", label); \ break; \ case 4: \ - __put_user_goto(x, ptr, "l", "ir", label); \ + __put_user_goto(__x, ptr, "l", "ir", label); \ break; \ case 8: \ - __put_user_goto_u64(x, ptr, label); \ + __put_user_goto_u64(__x, ptr, label); \ break; \ default: \ __put_user_bad(); \ } \ + instrument_put_user(__x, ptr, size); \ } while (0) #ifdef CONFIG_CC_HAS_ASM_GOTO_OUTPUT @@ -305,6 +312,7 @@ do { \ default: \ (x) = __get_user_bad(); \ } \ + instrument_get_user(x); \ } while (0) #define __get_user_asm(x, addr, itype, ltype, label) \ diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index ee8f7d17d34f..9f1dba8f717b 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -153,4 +153,32 @@ instrument_copy_from_user_after(const void *to, const void __user *from, { } +/** + * instrument_get_user() - add instrumentation to get_user()-like macros + * + * get_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied from + * the userspace. + * + * @to destination variable, may not be address-taken + */ +#define instrument_get_user(to) \ +({ \ +}) + +/** + * instrument_put_user() - add instrumentation to put_user()-like macros + * + * put_user() and friends are fragile, so it may depend on the implementation + * whether the instrumentation happens before or after the data is copied from + * the userspace. + * + * @from source address + * @ptr userspace pointer to copy to + * @size number of bytes to copy + */ +#define instrument_put_user(from, ptr, size) \ +({ \ +}) + #endif /* _LINUX_INSTRUMENTED_H */ From 2b420aaf80408fd45d86ce983819813d43ac210f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:39 +0200 Subject: [PATCH 254/350] asm-generic: instrument usercopy in cacheflush.h Notify memory tools about usercopy events in copy_to_user_page() and copy_from_user_page(). Link: https://lkml.kernel.org/r/20220915150417.722975-6-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/asm-generic/cacheflush.h | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index 4f07afacbc23..f46258d1a080 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -2,6 +2,8 @@ #ifndef _ASM_GENERIC_CACHEFLUSH_H #define _ASM_GENERIC_CACHEFLUSH_H +#include + struct mm_struct; struct vm_area_struct; struct page; @@ -105,14 +107,22 @@ static inline void flush_cache_vunmap(unsigned long start, unsigned long end) #ifndef copy_to_user_page #define copy_to_user_page(vma, page, vaddr, dst, src, len) \ do { \ + instrument_copy_to_user((void __user *)dst, src, len); \ memcpy(dst, src, len); \ flush_icache_user_page(vma, page, vaddr, len); \ } while (0) #endif + #ifndef copy_from_user_page -#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ - memcpy(dst, src, len) +#define copy_from_user_page(vma, page, vaddr, dst, src, len) \ + do { \ + instrument_copy_from_user_before(dst, (void __user *)src, \ + len); \ + memcpy(dst, src, len); \ + instrument_copy_from_user_after(dst, (void __user *)src, len, \ + 0); \ + } while (0) #endif #endif /* _ASM_GENERIC_CACHEFLUSH_H */ From 93858ae70cf4fb2ec75ae2f1e495b85b26614883 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:40 +0200 Subject: [PATCH 255/350] kmsan: add ReST documentation Add Documentation/dev-tools/kmsan.rst and reference it in the dev-tools index. Link: https://lkml.kernel.org/r/20220915150417.722975-7-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/dev-tools/index.rst | 1 + Documentation/dev-tools/kmsan.rst | 427 ++++++++++++++++++++++++++++++ 2 files changed, 428 insertions(+) create mode 100644 Documentation/dev-tools/kmsan.rst diff --git a/Documentation/dev-tools/index.rst b/Documentation/dev-tools/index.rst index 4621eac290f4..6b0663075dc0 100644 --- a/Documentation/dev-tools/index.rst +++ b/Documentation/dev-tools/index.rst @@ -24,6 +24,7 @@ Documentation/dev-tools/testing-overview.rst kcov gcov kasan + kmsan ubsan kmemleak kcsan diff --git a/Documentation/dev-tools/kmsan.rst b/Documentation/dev-tools/kmsan.rst new file mode 100644 index 000000000000..2a53a801198c --- /dev/null +++ b/Documentation/dev-tools/kmsan.rst @@ -0,0 +1,427 @@ +.. SPDX-License-Identifier: GPL-2.0 +.. Copyright (C) 2022, Google LLC. + +=================================== +The Kernel Memory Sanitizer (KMSAN) +=================================== + +KMSAN is a dynamic error detector aimed at finding uses of uninitialized +values. It is based on compiler instrumentation, and is quite similar to the +userspace `MemorySanitizer tool`_. + +An important note is that KMSAN is not intended for production use, because it +drastically increases kernel memory footprint and slows the whole system down. + +Usage +===== + +Building the kernel +------------------- + +In order to build a kernel with KMSAN you will need a fresh Clang (14.0.6+). +Please refer to `LLVM documentation`_ for the instructions on how to build Clang. + +Now configure and build the kernel with CONFIG_KMSAN enabled. + +Example report +-------------- + +Here is an example of a KMSAN report:: + + ===================================================== + BUG: KMSAN: uninit-value in test_uninit_kmsan_check_memory+0x1be/0x380 [kmsan_test] + test_uninit_kmsan_check_memory+0x1be/0x380 mm/kmsan/kmsan_test.c:273 + kunit_run_case_internal lib/kunit/test.c:333 + kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374 + kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28 + kthread+0x721/0x850 kernel/kthread.c:327 + ret_from_fork+0x1f/0x30 ??:? + + Uninit was stored to memory at: + do_uninit_local_array+0xfa/0x110 mm/kmsan/kmsan_test.c:260 + test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271 + kunit_run_case_internal lib/kunit/test.c:333 + kunit_try_run_case+0x206/0x420 lib/kunit/test.c:374 + kunit_generic_run_threadfn_adapter+0x6d/0xc0 lib/kunit/try-catch.c:28 + kthread+0x721/0x850 kernel/kthread.c:327 + ret_from_fork+0x1f/0x30 ??:? + + Local variable uninit created at: + do_uninit_local_array+0x4a/0x110 mm/kmsan/kmsan_test.c:256 + test_uninit_kmsan_check_memory+0x1a2/0x380 mm/kmsan/kmsan_test.c:271 + + Bytes 4-7 of 8 are uninitialized + Memory access of size 8 starts at ffff888083fe3da0 + + CPU: 0 PID: 6731 Comm: kunit_try_catch Tainted: G B E 5.16.0-rc3+ #104 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.14.0-2 04/01/2014 + ===================================================== + +The report says that the local variable ``uninit`` was created uninitialized in +``do_uninit_local_array()``. The third stack trace corresponds to the place +where this variable was created. + +The first stack trace shows where the uninit value was used (in +``test_uninit_kmsan_check_memory()``). The tool shows the bytes which were left +uninitialized in the local variable, as well as the stack where the value was +copied to another memory location before use. + +A use of uninitialized value ``v`` is reported by KMSAN in the following cases: + - in a condition, e.g. ``if (v) { ... }``; + - in an indexing or pointer dereferencing, e.g. ``array[v]`` or ``*v``; + - when it is copied to userspace or hardware, e.g. ``copy_to_user(..., &v, ...)``; + - when it is passed as an argument to a function, and + ``CONFIG_KMSAN_CHECK_PARAM_RETVAL`` is enabled (see below). + +The mentioned cases (apart from copying data to userspace or hardware, which is +a security issue) are considered undefined behavior from the C11 Standard point +of view. + +Disabling the instrumentation +----------------------------- + +A function can be marked with ``__no_kmsan_checks``. Doing so makes KMSAN +ignore uninitialized values in that function and mark its output as initialized. +As a result, the user will not get KMSAN reports related to that function. + +Another function attribute supported by KMSAN is ``__no_sanitize_memory``. +Applying this attribute to a function will result in KMSAN not instrumenting +it, which can be helpful if we do not want the compiler to interfere with some +low-level code (e.g. that marked with ``noinstr`` which implicitly adds +``__no_sanitize_memory``). + +This however comes at a cost: stack allocations from such functions will have +incorrect shadow/origin values, likely leading to false positives. Functions +called from non-instrumented code may also receive incorrect metadata for their +parameters. + +As a rule of thumb, avoid using ``__no_sanitize_memory`` explicitly. + +It is also possible to disable KMSAN for a single file (e.g. main.o):: + + KMSAN_SANITIZE_main.o := n + +or for the whole directory:: + + KMSAN_SANITIZE := n + +in the Makefile. Think of this as applying ``__no_sanitize_memory`` to every +function in the file or directory. Most users won't need KMSAN_SANITIZE, unless +their code gets broken by KMSAN (e.g. runs at early boot time). + +Support +======= + +In order for KMSAN to work the kernel must be built with Clang, which so far is +the only compiler that has KMSAN support. The kernel instrumentation pass is +based on the userspace `MemorySanitizer tool`_. + +The runtime library only supports x86_64 at the moment. + +How KMSAN works +=============== + +KMSAN shadow memory +------------------- + +KMSAN associates a metadata byte (also called shadow byte) with every byte of +kernel memory. A bit in the shadow byte is set iff the corresponding bit of the +kernel memory byte is uninitialized. Marking the memory uninitialized (i.e. +setting its shadow bytes to ``0xff``) is called poisoning, marking it +initialized (setting the shadow bytes to ``0x00``) is called unpoisoning. + +When a new variable is allocated on the stack, it is poisoned by default by +instrumentation code inserted by the compiler (unless it is a stack variable +that is immediately initialized). Any new heap allocation done without +``__GFP_ZERO`` is also poisoned. + +Compiler instrumentation also tracks the shadow values as they are used along +the code. When needed, instrumentation code invokes the runtime library in +``mm/kmsan/`` to persist shadow values. + +The shadow value of a basic or compound type is an array of bytes of the same +length. When a constant value is written into memory, that memory is unpoisoned. +When a value is read from memory, its shadow memory is also obtained and +propagated into all the operations which use that value. For every instruction +that takes one or more values the compiler generates code that calculates the +shadow of the result depending on those values and their shadows. + +Example:: + + int a = 0xff; // i.e. 0x000000ff + int b; + int c = a | b; + +In this case the shadow of ``a`` is ``0``, shadow of ``b`` is ``0xffffffff``, +shadow of ``c`` is ``0xffffff00``. This means that the upper three bytes of +``c`` are uninitialized, while the lower byte is initialized. + +Origin tracking +--------------- + +Every four bytes of kernel memory also have a so-called origin mapped to them. +This origin describes the point in program execution at which the uninitialized +value was created. Every origin is associated with either the full allocation +stack (for heap-allocated memory), or the function containing the uninitialized +variable (for locals). + +When an uninitialized variable is allocated on stack or heap, a new origin +value is created, and that variable's origin is filled with that value. When a +value is read from memory, its origin is also read and kept together with the +shadow. For every instruction that takes one or more values, the origin of the +result is one of the origins corresponding to any of the uninitialized inputs. +If a poisoned value is written into memory, its origin is written to the +corresponding storage as well. + +Example 1:: + + int a = 42; + int b; + int c = a + b; + +In this case the origin of ``b`` is generated upon function entry, and is +stored to the origin of ``c`` right before the addition result is written into +memory. + +Several variables may share the same origin address, if they are stored in the +same four-byte chunk. In this case every write to either variable updates the +origin for all of them. We have to sacrifice precision in this case, because +storing origins for individual bits (and even bytes) would be too costly. + +Example 2:: + + int combine(short a, short b) { + union ret_t { + int i; + short s[2]; + } ret; + ret.s[0] = a; + ret.s[1] = b; + return ret.i; + } + +If ``a`` is initialized and ``b`` is not, the shadow of the result would be +0xffff0000, and the origin of the result would be the origin of ``b``. +``ret.s[0]`` would have the same origin, but it will never be used, because +that variable is initialized. + +If both function arguments are uninitialized, only the origin of the second +argument is preserved. + +Origin chaining +~~~~~~~~~~~~~~~ + +To ease debugging, KMSAN creates a new origin for every store of an +uninitialized value to memory. The new origin references both its creation stack +and the previous origin the value had. This may cause increased memory +consumption, so we limit the length of origin chains in the runtime. + +Clang instrumentation API +------------------------- + +Clang instrumentation pass inserts calls to functions defined in +``mm/kmsan/nstrumentation.c`` into the kernel code. + +Shadow manipulation +~~~~~~~~~~~~~~~~~~~ + +For every memory access the compiler emits a call to a function that returns a +pair of pointers to the shadow and origin addresses of the given memory:: + + typedef struct { + void *shadow, *origin; + } shadow_origin_ptr_t + + shadow_origin_ptr_t __msan_metadata_ptr_for_load_{1,2,4,8}(void *addr) + shadow_origin_ptr_t __msan_metadata_ptr_for_store_{1,2,4,8}(void *addr) + shadow_origin_ptr_t __msan_metadata_ptr_for_load_n(void *addr, uintptr_t size) + shadow_origin_ptr_t __msan_metadata_ptr_for_store_n(void *addr, uintptr_t size) + +The function name depends on the memory access size. + +The compiler makes sure that for every loaded value its shadow and origin +values are read from memory. When a value is stored to memory, its shadow and +origin are also stored using the metadata pointers. + +Handling locals +~~~~~~~~~~~~~~~ + +A special function is used to create a new origin value for a local variable and +set the origin of that variable to that value:: + + void __msan_poison_alloca(void *addr, uintptr_t size, char *descr) + +Access to per-task data +~~~~~~~~~~~~~~~~~~~~~~~ + +At the beginning of every instrumented function KMSAN inserts a call to +``__msan_get_context_state()``:: + + kmsan_context_state *__msan_get_context_state(void) + +``kmsan_context_state`` is declared in ``include/linux/kmsan.h``:: + + struct kmsan_context_state { + char param_tls[KMSAN_PARAM_SIZE]; + char retval_tls[KMSAN_RETVAL_SIZE]; + char va_arg_tls[KMSAN_PARAM_SIZE]; + char va_arg_origin_tls[KMSAN_PARAM_SIZE]; + u64 va_arg_overflow_size_tls; + char param_origin_tls[KMSAN_PARAM_SIZE]; + depot_stack_handle_t retval_origin_tls; + }; + +This structure is used by KMSAN to pass parameter shadows and origins between +instrumented functions (unless the parameters are checked immediately by +``CONFIG_KMSAN_CHECK_PARAM_RETVAL``). + +Passing uninitialized values to functions +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Clang's MemorySanitizer instrumentation has an option, +``-fsanitize-memory-param-retval``, which makes the compiler check function +parameters passed by value, as well as function return values. + +The option is controlled by ``CONFIG_KMSAN_CHECK_PARAM_RETVAL``, which is +enabled by default to let KMSAN report uninitialized values earlier. +Please refer to the `LKML discussion`_ for more details. + +Because of the way the checks are implemented in LLVM (they are only applied to +parameters marked as ``noundef``), not all parameters are guaranteed to be +checked, so we cannot give up the metadata storage in ``kmsan_context_state``. + +String functions +~~~~~~~~~~~~~~~~ + +The compiler replaces calls to ``memcpy()``/``memmove()``/``memset()`` with the +following functions. These functions are also called when data structures are +initialized or copied, making sure shadow and origin values are copied alongside +with the data:: + + void *__msan_memcpy(void *dst, void *src, uintptr_t n) + void *__msan_memmove(void *dst, void *src, uintptr_t n) + void *__msan_memset(void *dst, int c, uintptr_t n) + +Error reporting +~~~~~~~~~~~~~~~ + +For each use of a value the compiler emits a shadow check that calls +``__msan_warning()`` in the case that value is poisoned:: + + void __msan_warning(u32 origin) + +``__msan_warning()`` causes KMSAN runtime to print an error report. + +Inline assembly instrumentation +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +KMSAN instruments every inline assembly output with a call to:: + + void __msan_instrument_asm_store(void *addr, uintptr_t size) + +, which unpoisons the memory region. + +This approach may mask certain errors, but it also helps to avoid a lot of +false positives in bitwise operations, atomics etc. + +Sometimes the pointers passed into inline assembly do not point to valid memory. +In such cases they are ignored at runtime. + + +Runtime library +--------------- + +The code is located in ``mm/kmsan/``. + +Per-task KMSAN state +~~~~~~~~~~~~~~~~~~~~ + +Every task_struct has an associated KMSAN task state that holds the KMSAN +context (see above) and a per-task flag disallowing KMSAN reports:: + + struct kmsan_context { + ... + bool allow_reporting; + struct kmsan_context_state cstate; + ... + } + + struct task_struct { + ... + struct kmsan_context kmsan; + ... + } + +KMSAN contexts +~~~~~~~~~~~~~~ + +When running in a kernel task context, KMSAN uses ``current->kmsan.cstate`` to +hold the metadata for function parameters and return values. + +But in the case the kernel is running in the interrupt, softirq or NMI context, +where ``current`` is unavailable, KMSAN switches to per-cpu interrupt state:: + + DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +Metadata allocation +~~~~~~~~~~~~~~~~~~~ + +There are several places in the kernel for which the metadata is stored. + +1. Each ``struct page`` instance contains two pointers to its shadow and +origin pages:: + + struct page { + ... + struct page *shadow, *origin; + ... + }; + +At boot-time, the kernel allocates shadow and origin pages for every available +kernel page. This is done quite late, when the kernel address space is already +fragmented, so normal data pages may arbitrarily interleave with the metadata +pages. + +This means that in general for two contiguous memory pages their shadow/origin +pages may not be contiguous. Consequently, if a memory access crosses the +boundary of a memory block, accesses to shadow/origin memory may potentially +corrupt other pages or read incorrect values from them. + +In practice, contiguous memory pages returned by the same ``alloc_pages()`` +call will have contiguous metadata, whereas if these pages belong to two +different allocations their metadata pages can be fragmented. + +For the kernel data (``.data``, ``.bss`` etc.) and percpu memory regions +there also are no guarantees on metadata contiguity. + +In the case ``__msan_metadata_ptr_for_XXX_YYY()`` hits the border between two +pages with non-contiguous metadata, it returns pointers to fake shadow/origin regions:: + + char dummy_load_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); + char dummy_store_page[PAGE_SIZE] __attribute__((aligned(PAGE_SIZE))); + +``dummy_load_page`` is zero-initialized, so reads from it always yield zeroes. +All stores to ``dummy_store_page`` are ignored. + +2. For vmalloc memory and modules, there is a direct mapping between the memory +range, its shadow and origin. KMSAN reduces the vmalloc area by 3/4, making only +the first quarter available to ``vmalloc()``. The second quarter of the vmalloc +area contains shadow memory for the first quarter, the third one holds the +origins. A small part of the fourth quarter contains shadow and origins for the +kernel modules. Please refer to ``arch/x86/include/asm/pgtable_64_types.h`` for +more details. + +When an array of pages is mapped into a contiguous virtual memory space, their +shadow and origin pages are similarly mapped into contiguous regions. + +References +========== + +E. Stepanov, K. Serebryany. `MemorySanitizer: fast detector of uninitialized +memory use in C++ +`_. +In Proceedings of CGO 2015. + +.. _MemorySanitizer tool: https://clang.llvm.org/docs/MemorySanitizer.html +.. _LLVM documentation: https://llvm.org/docs/GettingStarted.html +.. _LKML discussion: https://lore.kernel.org/all/20220614144853.3693273-1-glider@google.com/ From 9b448bc25b776daab3215393c3ce6953dd3bb8ad Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:41 +0200 Subject: [PATCH 256/350] kmsan: introduce __no_sanitize_memory and __no_kmsan_checks __no_sanitize_memory is a function attribute that instructs KMSAN to skip a function during instrumentation. This is needed to e.g. implement the noinstr functions. __no_kmsan_checks is a function attribute that makes KMSAN ignore the uninitialized values coming from the function's inputs, and initialize the function's outputs. Functions marked with this attribute can't be inlined into functions not marked with it, and vice versa. This behavior is overridden by __always_inline. __SANITIZE_MEMORY__ is a macro that's defined iff the file is instrumented with KMSAN. This is not the same as CONFIG_KMSAN, which is defined for every file. Link: https://lkml.kernel.org/r/20220915150417.722975-8-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler-clang.h | 23 +++++++++++++++++++++++ include/linux/compiler-gcc.h | 6 ++++++ 2 files changed, 29 insertions(+) diff --git a/include/linux/compiler-clang.h b/include/linux/compiler-clang.h index c84fec767445..4fa0cc4cbd2c 100644 --- a/include/linux/compiler-clang.h +++ b/include/linux/compiler-clang.h @@ -51,6 +51,29 @@ #define __no_sanitize_undefined #endif +#if __has_feature(memory_sanitizer) +#define __SANITIZE_MEMORY__ +/* + * Unlike other sanitizers, KMSAN still inserts code into functions marked with + * no_sanitize("kernel-memory"). Using disable_sanitizer_instrumentation + * provides the behavior consistent with other __no_sanitize_ attributes, + * guaranteeing that __no_sanitize_memory functions remain uninstrumented. + */ +#define __no_sanitize_memory __disable_sanitizer_instrumentation + +/* + * The __no_kmsan_checks attribute ensures that a function does not produce + * false positive reports by: + * - initializing all local variables and memory stores in this function; + * - skipping all shadow checks; + * - passing initialized arguments to this function's callees. + */ +#define __no_kmsan_checks __attribute__((no_sanitize("kernel-memory"))) +#else +#define __no_sanitize_memory +#define __no_kmsan_checks +#endif + /* * Support for __has_feature(coverage_sanitizer) was added in Clang 13 together * with no_sanitize("coverage"). Prior versions of Clang support coverage diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h index 9b157b71036f..f55a37efdb97 100644 --- a/include/linux/compiler-gcc.h +++ b/include/linux/compiler-gcc.h @@ -114,6 +114,12 @@ #define __SANITIZE_ADDRESS__ #endif +/* + * GCC does not support KMSAN. + */ +#define __no_sanitize_memory +#define __no_kmsan_checks + /* * Turn individual warnings and errors on and off locally, depending * on version. From 5de0ce85f5a4d2883eae6f48eb015bc5dfbd91e9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:42 +0200 Subject: [PATCH 257/350] kmsan: mark noinstr as __no_sanitize_memory noinstr functions should never be instrumented, so make KMSAN skip them by applying the __no_sanitize_memory attribute. Link: https://lkml.kernel.org/r/20220915150417.722975-9-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/compiler_types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/compiler_types.h b/include/linux/compiler_types.h index 4f2a819fd60a..015207a6e2bf 100644 --- a/include/linux/compiler_types.h +++ b/include/linux/compiler_types.h @@ -229,7 +229,8 @@ struct ftrace_likely_data { /* Section for code which can't be instrumented at all */ #define noinstr \ noinline notrace __attribute((__section__(".noinstr.text"))) \ - __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage + __no_kcsan __no_sanitize_address __no_profile __no_sanitize_coverage \ + __no_sanitize_memory #endif /* __KERNEL__ */ From 1a167ddd3c561b21a76187a81530a167e3522261 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:43 +0200 Subject: [PATCH 258/350] x86: kmsan: pgtable: reduce vmalloc space KMSAN is going to use 3/4 of existing vmalloc space to hold the metadata, therefore we lower VMALLOC_END to make sure vmalloc() doesn't allocate past the first 1/4. Link: https://lkml.kernel.org/r/20220915150417.722975-10-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/pgtable_64_types.h | 47 ++++++++++++++++++++++++- arch/x86/mm/init_64.c | 2 +- 2 files changed, 47 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index 70e360a2e5fb..04f36063ad54 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -139,7 +139,52 @@ extern unsigned int ptrs_per_p4d; # define VMEMMAP_START __VMEMMAP_BASE_L4 #endif /* CONFIG_DYNAMIC_MEMORY_LAYOUT */ -#define VMALLOC_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) +/* + * End of the region for which vmalloc page tables are pre-allocated. + * For non-KMSAN builds, this is the same as VMALLOC_END. + * For KMSAN builds, VMALLOC_START..VMEMORY_END is 4 times bigger than + * VMALLOC_START..VMALLOC_END (see below). + */ +#define VMEMORY_END (VMALLOC_START + (VMALLOC_SIZE_TB << 40) - 1) + +#ifndef CONFIG_KMSAN +#define VMALLOC_END VMEMORY_END +#else +/* + * In KMSAN builds vmalloc area is four times smaller, and the remaining 3/4 + * are used to keep the metadata for virtual pages. The memory formerly + * belonging to vmalloc area is now laid out as follows: + * + * 1st quarter: VMALLOC_START to VMALLOC_END - new vmalloc area + * 2nd quarter: KMSAN_VMALLOC_SHADOW_START to + * VMALLOC_END+KMSAN_VMALLOC_SHADOW_OFFSET - vmalloc area shadow + * 3rd quarter: KMSAN_VMALLOC_ORIGIN_START to + * VMALLOC_END+KMSAN_VMALLOC_ORIGIN_OFFSET - vmalloc area origins + * 4th quarter: KMSAN_MODULES_SHADOW_START to KMSAN_MODULES_ORIGIN_START + * - shadow for modules, + * KMSAN_MODULES_ORIGIN_START to + * KMSAN_MODULES_ORIGIN_START + MODULES_LEN - origins for modules. + */ +#define VMALLOC_QUARTER_SIZE ((VMALLOC_SIZE_TB << 40) >> 2) +#define VMALLOC_END (VMALLOC_START + VMALLOC_QUARTER_SIZE - 1) + +/* + * vmalloc metadata addresses are calculated by adding shadow/origin offsets + * to vmalloc address. + */ +#define KMSAN_VMALLOC_SHADOW_OFFSET VMALLOC_QUARTER_SIZE +#define KMSAN_VMALLOC_ORIGIN_OFFSET (VMALLOC_QUARTER_SIZE << 1) + +#define KMSAN_VMALLOC_SHADOW_START (VMALLOC_START + KMSAN_VMALLOC_SHADOW_OFFSET) +#define KMSAN_VMALLOC_ORIGIN_START (VMALLOC_START + KMSAN_VMALLOC_ORIGIN_OFFSET) + +/* + * The shadow/origin for modules are placed one by one in the last 1/4 of + * vmalloc space. + */ +#define KMSAN_MODULES_SHADOW_START (VMALLOC_END + KMSAN_VMALLOC_ORIGIN_OFFSET + 1) +#define KMSAN_MODULES_ORIGIN_START (KMSAN_MODULES_SHADOW_START + MODULES_LEN) +#endif /* CONFIG_KMSAN */ #define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 0fe690ebc269..39b6bfcaa0ed 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -1287,7 +1287,7 @@ static void __init preallocate_vmalloc_pages(void) unsigned long addr; const char *lvl; - for (addr = VMALLOC_START; addr <= VMALLOC_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { + for (addr = VMALLOC_START; addr <= VMEMORY_END; addr = ALIGN(addr + 1, PGDIR_SIZE)) { pgd_t *pgd = pgd_offset_k(addr); p4d_t *p4d; pud_t *pud; From 6e9f05dc66f951e8812c84a3ef148b601e3f8f45 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:44 +0200 Subject: [PATCH 259/350] libnvdimm/pfn_dev: increase MAX_STRUCT_PAGE_SIZE KMSAN adds extra metadata fields to struct page, so it does not fit into 64 bytes anymore. This change leads to increased memory consumption of the nvdimm driver, regardless of whether the kernel is built with KMSAN or not. Link: https://lkml.kernel.org/r/20220915150417.722975-11-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/nvdimm/nd.h | 2 +- drivers/nvdimm/pfn_devs.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvdimm/nd.h b/drivers/nvdimm/nd.h index ec5219680092..85ca5b4da3cf 100644 --- a/drivers/nvdimm/nd.h +++ b/drivers/nvdimm/nd.h @@ -652,7 +652,7 @@ void devm_namespace_disable(struct device *dev, struct nd_namespace_common *ndns); #if IS_ENABLED(CONFIG_ND_CLAIM) /* max struct page size independent of kernel config */ -#define MAX_STRUCT_PAGE_SIZE 64 +#define MAX_STRUCT_PAGE_SIZE 128 int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, struct dev_pagemap *pgmap); #else static inline int nvdimm_setup_pfn(struct nd_pfn *nd_pfn, diff --git a/drivers/nvdimm/pfn_devs.c b/drivers/nvdimm/pfn_devs.c index 0e92ab4b3283..61af072ac98f 100644 --- a/drivers/nvdimm/pfn_devs.c +++ b/drivers/nvdimm/pfn_devs.c @@ -787,7 +787,7 @@ static int nd_pfn_init(struct nd_pfn *nd_pfn) * when populating the vmemmap. This *should* be equal to * PMD_SIZE for most architectures. * - * Also make sure size of struct page is less than 64. We + * Also make sure size of struct page is less than 128. We * want to make sure we use large enough size here so that * we don't have a dynamic reserve space depending on * struct page size. But we also want to make sure we notice From f80be4571b19b9fd8dd1528cd2a2f123aff51f70 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:45 +0200 Subject: [PATCH 260/350] kmsan: add KMSAN runtime core MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For each memory location KernelMemorySanitizer maintains two types of metadata: 1. The so-called shadow of that location - а byte:byte mapping describing whether or not individual bits of memory are initialized (shadow is 0) or not (shadow is 1). 2. The origins of that location - а 4-byte:4-byte mapping containing 4-byte IDs of the stack traces where uninitialized values were created. Each struct page now contains pointers to two struct pages holding KMSAN metadata (shadow and origins) for the original struct page. Utility routines in mm/kmsan/core.c and mm/kmsan/shadow.c handle the metadata creation, addressing, copying and checking. mm/kmsan/report.c performs error reporting in the cases an uninitialized value is used in a way that leads to undefined behavior. KMSAN compiler instrumentation is responsible for tracking the metadata along with the kernel memory. mm/kmsan/instrumentation.c provides the implementation for instrumentation hooks that are called from files compiled with -fsanitize=kernel-memory. To aid parameter passing (also done at instrumentation level), each task_struct now contains a struct kmsan_task_state used to track the metadata of function parameters and return values for that task. Finally, this patch provides CONFIG_KMSAN that enables KMSAN, and declares CFLAGS_KMSAN, which are applied to files compiled with KMSAN. The KMSAN_SANITIZE:=n Makefile directive can be used to completely disable KMSAN instrumentation for certain files. Similarly, KMSAN_ENABLE_CHECKS:=n disables KMSAN checks and makes newly created stack memory initialized. Users can also use functions from include/linux/kmsan-checks.h to mark certain memory regions as uninitialized or initialized (this is called "poisoning" and "unpoisoning") or check that a particular region is initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-12-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Makefile | 1 + include/linux/kmsan-checks.h | 64 +++++ include/linux/kmsan_types.h | 35 +++ include/linux/mm_types.h | 12 + include/linux/sched.h | 5 + lib/Kconfig.debug | 1 + lib/Kconfig.kmsan | 50 ++++ mm/Makefile | 1 + mm/kmsan/Makefile | 23 ++ mm/kmsan/core.c | 440 +++++++++++++++++++++++++++++++++++ mm/kmsan/hooks.c | 66 ++++++ mm/kmsan/instrumentation.c | 307 ++++++++++++++++++++++++ mm/kmsan/kmsan.h | 204 ++++++++++++++++ mm/kmsan/report.c | 219 +++++++++++++++++ mm/kmsan/shadow.c | 147 ++++++++++++ scripts/Makefile.kmsan | 8 + scripts/Makefile.lib | 9 + 17 files changed, 1592 insertions(+) create mode 100644 include/linux/kmsan-checks.h create mode 100644 include/linux/kmsan_types.h create mode 100644 lib/Kconfig.kmsan create mode 100644 mm/kmsan/Makefile create mode 100644 mm/kmsan/core.c create mode 100644 mm/kmsan/hooks.c create mode 100644 mm/kmsan/instrumentation.c create mode 100644 mm/kmsan/kmsan.h create mode 100644 mm/kmsan/report.c create mode 100644 mm/kmsan/shadow.c create mode 100644 scripts/Makefile.kmsan diff --git a/Makefile b/Makefile index 952d354069a4..c9f37d25ea63 100644 --- a/Makefile +++ b/Makefile @@ -1015,6 +1015,7 @@ include-y := scripts/Makefile.extrawarn include-$(CONFIG_DEBUG_INFO) += scripts/Makefile.debug include-$(CONFIG_KASAN) += scripts/Makefile.kasan include-$(CONFIG_KCSAN) += scripts/Makefile.kcsan +include-$(CONFIG_KMSAN) += scripts/Makefile.kmsan include-$(CONFIG_UBSAN) += scripts/Makefile.ubsan include-$(CONFIG_KCOV) += scripts/Makefile.kcov include-$(CONFIG_RANDSTRUCT) += scripts/Makefile.randstruct diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h new file mode 100644 index 000000000000..a6522a0c28df --- /dev/null +++ b/include/linux/kmsan-checks.h @@ -0,0 +1,64 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN checks to be used for one-off annotations in subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef _LINUX_KMSAN_CHECKS_H +#define _LINUX_KMSAN_CHECKS_H + +#include + +#ifdef CONFIG_KMSAN + +/** + * kmsan_poison_memory() - Mark the memory range as uninitialized. + * @address: address to start with. + * @size: size of buffer to poison. + * @flags: GFP flags for allocations done by this function. + * + * Until other data is written to this range, KMSAN will treat it as + * uninitialized. Error reports for this memory will reference the call site of + * kmsan_poison_memory() as origin. + */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags); + +/** + * kmsan_unpoison_memory() - Mark the memory range as initialized. + * @address: address to start with. + * @size: size of buffer to unpoison. + * + * Until other data is written to this range, KMSAN will treat it as + * initialized. + */ +void kmsan_unpoison_memory(const void *address, size_t size); + +/** + * kmsan_check_memory() - Check the memory range for being initialized. + * @address: address to start with. + * @size: size of buffer to check. + * + * If any piece of the given range is marked as uninitialized, KMSAN will report + * an error. + */ +void kmsan_check_memory(const void *address, size_t size); + +#else + +static inline void kmsan_poison_memory(const void *address, size_t size, + gfp_t flags) +{ +} +static inline void kmsan_unpoison_memory(const void *address, size_t size) +{ +} +static inline void kmsan_check_memory(const void *address, size_t size) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_CHECKS_H */ diff --git a/include/linux/kmsan_types.h b/include/linux/kmsan_types.h new file mode 100644 index 000000000000..8bfa6c98176d --- /dev/null +++ b/include/linux/kmsan_types.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * A minimal header declaring types added by KMSAN to existing kernel structs. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_TYPES_H +#define _LINUX_KMSAN_TYPES_H + +/* These constants are defined in the MSan LLVM instrumentation pass. */ +#define KMSAN_RETVAL_SIZE 800 +#define KMSAN_PARAM_SIZE 800 + +struct kmsan_context_state { + char param_tls[KMSAN_PARAM_SIZE]; + char retval_tls[KMSAN_RETVAL_SIZE]; + char va_arg_tls[KMSAN_PARAM_SIZE]; + char va_arg_origin_tls[KMSAN_PARAM_SIZE]; + u64 va_arg_overflow_size_tls; + char param_origin_tls[KMSAN_PARAM_SIZE]; + u32 retval_origin_tls; +}; + +#undef KMSAN_PARAM_SIZE +#undef KMSAN_RETVAL_SIZE + +struct kmsan_ctx { + struct kmsan_context_state cstate; + int kmsan_in_runtime; + bool allow_reporting; +}; + +#endif /* _LINUX_KMSAN_TYPES_H */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5c87d0f292a2..500e536796ca 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -224,6 +224,18 @@ struct page { not kmapped, ie. highmem) */ #endif /* WANT_PAGE_VIRTUAL */ +#ifdef CONFIG_KMSAN + /* + * KMSAN metadata for this page: + * - shadow page: every bit indicates whether the corresponding + * bit of the original page is initialized (0) or not (1); + * - origin page: every 4 bytes contain an id of the stack trace + * where the uninitialized value was created. + */ + struct page *kmsan_shadow; + struct page *kmsan_origin; +#endif + #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif diff --git a/include/linux/sched.h b/include/linux/sched.h index fbac3c19fe35..88a043f7235e 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1362,6 +1363,10 @@ struct task_struct { #endif #endif +#ifdef CONFIG_KMSAN + struct kmsan_ctx kmsan_ctx; +#endif + #if IS_ENABLED(CONFIG_KUNIT) struct kunit *kunit_test; #endif diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 6d1544d9201e..0129bee7de01 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -970,6 +970,7 @@ config DEBUG_STACKOVERFLOW source "lib/Kconfig.kasan" source "lib/Kconfig.kfence" +source "lib/Kconfig.kmsan" endmenu # "Memory Debugging" diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan new file mode 100644 index 000000000000..5b19dbd34d76 --- /dev/null +++ b/lib/Kconfig.kmsan @@ -0,0 +1,50 @@ +# SPDX-License-Identifier: GPL-2.0-only +config HAVE_ARCH_KMSAN + bool + +config HAVE_KMSAN_COMPILER + # Clang versions <14.0.0 also support -fsanitize=kernel-memory, but not + # all the features necessary to build the kernel with KMSAN. + depends on CC_IS_CLANG && CLANG_VERSION >= 140000 + def_bool $(cc-option,-fsanitize=kernel-memory -mllvm -msan-disable-checks=1) + +config KMSAN + bool "KMSAN: detector of uninitialized values use" + depends on HAVE_ARCH_KMSAN && HAVE_KMSAN_COMPILER + depends on SLUB && DEBUG_KERNEL && !KASAN && !KCSAN + select STACKDEPOT + select STACKDEPOT_ALWAYS_INIT + help + KernelMemorySanitizer (KMSAN) is a dynamic detector of uses of + uninitialized values in the kernel. It is based on compiler + instrumentation provided by Clang and thus requires Clang to build. + + An important note is that KMSAN is not intended for production use, + because it drastically increases kernel memory footprint and slows + the whole system down. + + See for more details. + +if KMSAN + +config HAVE_KMSAN_PARAM_RETVAL + # -fsanitize-memory-param-retval is supported only by Clang >= 14. + depends on HAVE_KMSAN_COMPILER + def_bool $(cc-option,-fsanitize=kernel-memory -fsanitize-memory-param-retval) + +config KMSAN_CHECK_PARAM_RETVAL + bool "Check for uninitialized values passed to and returned from functions" + default y + depends on HAVE_KMSAN_PARAM_RETVAL + help + If the compiler supports -fsanitize-memory-param-retval, KMSAN will + eagerly check every function parameter passed by value and every + function return value. + + Disabling KMSAN_CHECK_PARAM_RETVAL will result in tracking shadow for + function parameters and return values across function borders. This + is a more relaxed mode, but it generates more instrumentation code and + may potentially report errors in corner cases when non-instrumented + functions call instrumented ones. + +endif diff --git a/mm/Makefile b/mm/Makefile index a731d1decbb1..cc23b0052584 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_SLAB) += slab.o obj-$(CONFIG_SLUB) += slub.o obj-$(CONFIG_KASAN) += kasan/ obj-$(CONFIG_KFENCE) += kfence/ +obj-$(CONFIG_KMSAN) += kmsan/ obj-$(CONFIG_FAILSLAB) += failslab.o obj-$(CONFIG_MEMTEST) += memtest.o obj-$(CONFIG_MIGRATION) += migrate.o diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile new file mode 100644 index 000000000000..550ad8625e4f --- /dev/null +++ b/mm/kmsan/Makefile @@ -0,0 +1,23 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for KernelMemorySanitizer (KMSAN). +# +# +obj-y := core.o instrumentation.o hooks.o report.o shadow.o + +KMSAN_SANITIZE := n +KCOV_INSTRUMENT := n +UBSAN_SANITIZE := n + +# Disable instrumentation of KMSAN runtime with other tools. +CC_FLAGS_KMSAN_RUNTIME := -fno-stack-protector +CC_FLAGS_KMSAN_RUNTIME += $(call cc-option,-fno-conserve-stack) +CC_FLAGS_KMSAN_RUNTIME += -DDISABLE_BRANCH_PROFILING + +CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) + +CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c new file mode 100644 index 000000000000..5330138fda5b --- /dev/null +++ b/mm/kmsan/core.c @@ -0,0 +1,440 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN runtime library. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../slab.h" +#include "kmsan.h" + +bool kmsan_enabled __read_mostly; + +/* + * Per-CPU KMSAN context to be used in interrupts, where current->kmsan is + * unavaliable. + */ +DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags) +{ + u32 extra_bits = + kmsan_extra_bits(/*depth*/ 0, poison_flags & KMSAN_POISON_FREE); + bool checked = poison_flags & KMSAN_POISON_CHECK; + depot_stack_handle_t handle; + + handle = kmsan_save_stack_with_flags(flags, extra_bits); + kmsan_internal_set_shadow_origin(address, size, -1, handle, checked); +} + +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked) +{ + kmsan_internal_set_shadow_origin(address, size, 0, 0, checked); +} + +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra) +{ + unsigned long entries[KMSAN_STACK_DEPTH]; + unsigned int nr_entries; + + nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0); + + /* Don't sleep (see might_sleep_if() in __alloc_pages_nodemask()). */ + flags &= ~__GFP_DIRECT_RECLAIM; + + return __stack_depot_save(entries, nr_entries, extra, flags, true); +} + +/* Copy the metadata following the memmove() behavior. */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n) +{ + depot_stack_handle_t old_origin = 0, new_origin = 0; + int src_slots, dst_slots, i, iter, step, skip_bits; + depot_stack_handle_t *origin_src, *origin_dst; + void *shadow_src, *shadow_dst; + u32 *align_shadow_src, shadow; + bool backwards; + + shadow_dst = kmsan_get_metadata(dst, KMSAN_META_SHADOW); + if (!shadow_dst) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(dst, n)); + + shadow_src = kmsan_get_metadata(src, KMSAN_META_SHADOW); + if (!shadow_src) { + /* + * @src is untracked: zero out destination shadow, ignore the + * origins, we're done. + */ + __memset(shadow_dst, 0, n); + return; + } + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(src, n)); + + __memmove(shadow_dst, shadow_src, n); + + origin_dst = kmsan_get_metadata(dst, KMSAN_META_ORIGIN); + origin_src = kmsan_get_metadata(src, KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin_dst || !origin_src); + src_slots = (ALIGN((u64)src + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)src, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + dst_slots = (ALIGN((u64)dst + n, KMSAN_ORIGIN_SIZE) - + ALIGN_DOWN((u64)dst, KMSAN_ORIGIN_SIZE)) / + KMSAN_ORIGIN_SIZE; + KMSAN_WARN_ON((src_slots < 1) || (dst_slots < 1)); + KMSAN_WARN_ON((src_slots - dst_slots > 1) || + (dst_slots - src_slots < -1)); + + backwards = dst > src; + i = backwards ? min(src_slots, dst_slots) - 1 : 0; + iter = backwards ? -1 : 1; + + align_shadow_src = + (u32 *)ALIGN_DOWN((u64)shadow_src, KMSAN_ORIGIN_SIZE); + for (step = 0; step < min(src_slots, dst_slots); step++, i += iter) { + KMSAN_WARN_ON(i < 0); + shadow = align_shadow_src[i]; + if (i == 0) { + /* + * If @src isn't aligned on KMSAN_ORIGIN_SIZE, don't + * look at the first @src % KMSAN_ORIGIN_SIZE bytes + * of the first shadow slot. + */ + skip_bits = ((u64)src % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + } + if (i == src_slots - 1) { + /* + * If @src + n isn't aligned on + * KMSAN_ORIGIN_SIZE, don't look at the last + * (@src + n) % KMSAN_ORIGIN_SIZE bytes of the + * last shadow slot. + */ + skip_bits = (((u64)src + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + } + /* + * Overwrite the origin only if the corresponding + * shadow is nonempty. + */ + if (origin_src[i] && (origin_src[i] != old_origin) && shadow) { + old_origin = origin_src[i]; + new_origin = kmsan_internal_chain_origin(old_origin); + /* + * kmsan_internal_chain_origin() may return + * NULL, but we don't want to lose the previous + * origin value. + */ + if (!new_origin) + new_origin = old_origin; + } + if (shadow) + origin_dst[i] = new_origin; + else + origin_dst[i] = 0; + } + /* + * If dst_slots is greater than src_slots (i.e. + * dst_slots == src_slots + 1), there is an extra origin slot at the + * beginning or end of the destination buffer, for which we take the + * origin from the previous slot. + * This is only done if the part of the source shadow corresponding to + * slot is non-zero. + * + * E.g. if we copy 8 aligned bytes that are marked as uninitialized + * and have origins o111 and o222, to an unaligned buffer with offset 1, + * these two origins are copied to three origin slots, so one of then + * needs to be duplicated, depending on the copy direction (@backwards) + * + * src shadow: |uuuu|uuuu|....| + * src origin: |o111|o222|....| + * + * backwards = 0: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |....|o111|o222| - fill the empty slot with o111 + * backwards = 1: + * dst shadow: |.uuu|uuuu|u...| + * dst origin: |o111|o222|....| - fill the empty slot with o222 + */ + if (src_slots < dst_slots) { + if (backwards) { + shadow = align_shadow_src[src_slots - 1]; + skip_bits = (((u64)dst + n) % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow << skip_bits) >> skip_bits; + if (shadow) + /* src_slots > 0, therefore dst_slots is at least 2 */ + origin_dst[dst_slots - 1] = + origin_dst[dst_slots - 2]; + } else { + shadow = align_shadow_src[0]; + skip_bits = ((u64)dst % KMSAN_ORIGIN_SIZE) * 8; + shadow = (shadow >> skip_bits) << skip_bits; + if (shadow) + origin_dst[0] = origin_dst[1]; + } + } +} + +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id) +{ + unsigned long entries[3]; + u32 extra_bits; + int depth; + bool uaf; + + if (!id) + return id; + /* + * Make sure we have enough spare bits in @id to hold the UAF bit and + * the chain depth. + */ + BUILD_BUG_ON( + (1 << STACK_DEPOT_EXTRA_BITS) <= (KMSAN_MAX_ORIGIN_DEPTH << 1)); + + extra_bits = stack_depot_get_extra_bits(id); + depth = kmsan_depth_from_eb(extra_bits); + uaf = kmsan_uaf_from_eb(extra_bits); + + /* + * Stop chaining origins once the depth reached KMSAN_MAX_ORIGIN_DEPTH. + * This mostly happens in the case structures with uninitialized padding + * are copied around many times. Origin chains for such structures are + * usually periodic, and it does not make sense to fully store them. + */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + return id; + + depth++; + extra_bits = kmsan_extra_bits(depth, uaf); + + entries[0] = KMSAN_CHAIN_MAGIC_ORIGIN; + entries[1] = kmsan_save_stack_with_flags(GFP_ATOMIC, 0); + entries[2] = id; + /* + * @entries is a local var in non-instrumented code, so KMSAN does not + * know it is initialized. Explicitly unpoison it to avoid false + * positives when __stack_depot_save() passes it to instrumented code. + */ + kmsan_internal_unpoison_memory(entries, sizeof(entries), false); + return __stack_depot_save(entries, ARRAY_SIZE(entries), extra_bits, + GFP_ATOMIC, true); +} + +void kmsan_internal_set_shadow_origin(void *addr, size_t size, int b, + u32 origin, bool checked) +{ + u64 address = (u64)addr; + void *shadow_start; + u32 *origin_start; + size_t pad = 0; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + shadow_start = kmsan_get_metadata(addr, KMSAN_META_SHADOW); + if (!shadow_start) { + /* + * kmsan_metadata_is_contiguous() is true, so either all shadow + * and origin pages are NULL, or all are non-NULL. + */ + if (checked) { + pr_err("%s: not memsetting %ld bytes starting at %px, because the shadow is NULL\n", + __func__, size, addr); + KMSAN_WARN_ON(true); + } + return; + } + __memset(shadow_start, b, size); + + if (!IS_ALIGNED(address, KMSAN_ORIGIN_SIZE)) { + pad = address % KMSAN_ORIGIN_SIZE; + address -= pad; + size += pad; + } + size = ALIGN(size, KMSAN_ORIGIN_SIZE); + origin_start = + (u32 *)kmsan_get_metadata((void *)address, KMSAN_META_ORIGIN); + + for (int i = 0; i < size / KMSAN_ORIGIN_SIZE; i++) + origin_start[i] = origin; +} + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr) +{ + struct page *page; + + if (!kmsan_internal_is_vmalloc_addr(vaddr) && + !kmsan_internal_is_module_addr(vaddr)) + return NULL; + page = vmalloc_to_page(vaddr); + if (pfn_valid(page_to_pfn(page))) + return page; + else + return NULL; +} + +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason) +{ + depot_stack_handle_t cur_origin = 0, new_origin = 0; + unsigned long addr64 = (unsigned long)addr; + depot_stack_handle_t *origin = NULL; + unsigned char *shadow = NULL; + int cur_off_start = -1; + int chunk_size; + size_t pos = 0; + + if (!size) + return; + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(addr, size)); + while (pos < size) { + chunk_size = min(size - pos, + PAGE_SIZE - ((addr64 + pos) % PAGE_SIZE)); + shadow = kmsan_get_metadata((void *)(addr64 + pos), + KMSAN_META_SHADOW); + if (!shadow) { + /* + * This page is untracked. If there were uninitialized + * bytes before, report them. + */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos - 1, user_addr, + reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + pos += chunk_size; + continue; + } + for (int i = 0; i < chunk_size; i++) { + if (!shadow[i]) { + /* + * This byte is unpoisoned. If there were + * poisoned bytes before, report them. + */ + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = 0; + cur_off_start = -1; + continue; + } + origin = kmsan_get_metadata((void *)(addr64 + pos + i), + KMSAN_META_ORIGIN); + KMSAN_WARN_ON(!origin); + new_origin = *origin; + /* + * Encountered new origin - report the previous + * uninitialized range. + */ + if (cur_origin != new_origin) { + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, + cur_off_start, pos + i - 1, + user_addr, reason); + kmsan_leave_runtime(); + } + cur_origin = new_origin; + cur_off_start = pos + i; + } + } + pos += chunk_size; + } + KMSAN_WARN_ON(pos != size); + if (cur_origin) { + kmsan_enter_runtime(); + kmsan_report(cur_origin, addr, size, cur_off_start, pos - 1, + user_addr, reason); + kmsan_leave_runtime(); + } +} + +bool kmsan_metadata_is_contiguous(void *addr, size_t size) +{ + char *cur_shadow = NULL, *next_shadow = NULL, *cur_origin = NULL, + *next_origin = NULL; + u64 cur_addr = (u64)addr, next_addr = cur_addr + PAGE_SIZE; + depot_stack_handle_t *origin_p; + bool all_untracked = false; + + if (!size) + return true; + + /* The whole range belongs to the same page. */ + if (ALIGN_DOWN(cur_addr + size - 1, PAGE_SIZE) == + ALIGN_DOWN(cur_addr, PAGE_SIZE)) + return true; + + cur_shadow = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ false); + if (!cur_shadow) + all_untracked = true; + cur_origin = kmsan_get_metadata((void *)cur_addr, /*is_origin*/ true); + if (all_untracked && cur_origin) + goto report; + + for (; next_addr < (u64)addr + size; + cur_addr = next_addr, cur_shadow = next_shadow, + cur_origin = next_origin, next_addr += PAGE_SIZE) { + next_shadow = kmsan_get_metadata((void *)next_addr, false); + next_origin = kmsan_get_metadata((void *)next_addr, true); + if (all_untracked) { + if (next_shadow || next_origin) + goto report; + if (!next_shadow && !next_origin) + continue; + } + if (((u64)cur_shadow == ((u64)next_shadow - PAGE_SIZE)) && + ((u64)cur_origin == ((u64)next_origin - PAGE_SIZE))) + continue; + goto report; + } + return true; + +report: + pr_err("%s: attempting to access two shadow page ranges.\n", __func__); + pr_err("Access of size %ld at %px.\n", size, addr); + pr_err("Addresses belonging to different ranges: %px and %px\n", + (void *)cur_addr, (void *)next_addr); + pr_err("page[0].shadow: %px, page[1].shadow: %px\n", cur_shadow, + next_shadow); + pr_err("page[0].origin: %px, page[1].origin: %px\n", cur_origin, + next_origin); + origin_p = kmsan_get_metadata(addr, KMSAN_META_ORIGIN); + if (origin_p) { + pr_err("Origin: %08x\n", *origin_p); + kmsan_print_origin(*origin_p); + } else { + pr_err("Origin: unavailable\n"); + } + return false; +} diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c new file mode 100644 index 000000000000..4ac62fa67a02 --- /dev/null +++ b/mm/kmsan/hooks.c @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN hooks for kernel subsystems. + * + * These functions handle creation of KMSAN metadata for memory allocations. + * + * Copyright (C) 2018-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "../slab.h" +#include "kmsan.h" + +/* + * Instrumented functions shouldn't be called under + * kmsan_enter_runtime()/kmsan_leave_runtime(), because this will lead to + * skipping effects of functions like memset() inside instrumented code. + */ + +/* Functions from kmsan-checks.h follow. */ +void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_poison_memory((void *)address, size, flags, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(kmsan_poison_memory); + +void kmsan_unpoison_memory(const void *address, size_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + kmsan_enter_runtime(); + /* The users may want to poison/unpoison random memory. */ + kmsan_internal_unpoison_memory((void *)address, size, + KMSAN_POISON_NOCHECK); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_unpoison_memory); + +void kmsan_check_memory(const void *addr, size_t size) +{ + if (!kmsan_enabled) + return; + return kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); +} +EXPORT_SYMBOL(kmsan_check_memory); diff --git a/mm/kmsan/instrumentation.c b/mm/kmsan/instrumentation.c new file mode 100644 index 000000000000..280d15413268 --- /dev/null +++ b/mm/kmsan/instrumentation.c @@ -0,0 +1,307 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN compiler API. + * + * This file implements __msan_XXX hooks that Clang inserts into the code + * compiled with -fsanitize=kernel-memory. + * See Documentation/dev-tools/kmsan.rst for more information on how KMSAN + * instrumentation works. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include "kmsan.h" +#include +#include +#include + +static inline bool is_bad_asm_addr(void *addr, uintptr_t size, bool is_store) +{ + if ((u64)addr < TASK_SIZE) + return true; + if (!kmsan_get_metadata(addr, KMSAN_META_SHADOW)) + return true; + return false; +} + +static inline struct shadow_origin_ptr +get_shadow_origin_ptr(void *addr, u64 size, bool store) +{ + unsigned long ua_flags = user_access_save(); + struct shadow_origin_ptr ret; + + ret = kmsan_get_shadow_origin_ptr(addr, size, store); + user_access_restore(ua_flags); + return ret; +} + +/* Get shadow and origin pointers for a memory load with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_load_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ false); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_load_n); + +/* Get shadow and origin pointers for a memory store with non-standard size. */ +struct shadow_origin_ptr __msan_metadata_ptr_for_store_n(void *addr, + uintptr_t size) +{ + return get_shadow_origin_ptr(addr, size, /*store*/ true); +} +EXPORT_SYMBOL(__msan_metadata_ptr_for_store_n); + +/* + * Declare functions that obtain shadow/origin pointers for loads and stores + * with fixed size. + */ +#define DECLARE_METADATA_PTR_GETTER(size) \ + struct shadow_origin_ptr __msan_metadata_ptr_for_load_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ false); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_load_##size); \ + struct shadow_origin_ptr __msan_metadata_ptr_for_store_##size( \ + void *addr) \ + { \ + return get_shadow_origin_ptr(addr, size, /*store*/ true); \ + } \ + EXPORT_SYMBOL(__msan_metadata_ptr_for_store_##size) + +DECLARE_METADATA_PTR_GETTER(1); +DECLARE_METADATA_PTR_GETTER(2); +DECLARE_METADATA_PTR_GETTER(4); +DECLARE_METADATA_PTR_GETTER(8); + +/* + * Handle a memory store performed by inline assembly. KMSAN conservatively + * attempts to unpoison the outputs of asm() directives to prevent false + * positives caused by missed stores. + */ +void __msan_instrument_asm_store(void *addr, uintptr_t size) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + /* + * Most of the accesses are below 32 bytes. The two exceptions so far + * are clwb() (64 bytes) and FPU state (512 bytes). + * It's unlikely that the assembly will touch more than 512 bytes. + */ + if (size > 512) { + WARN_ONCE(1, "assembly store size too big: %ld\n", size); + size = 8; + } + if (is_bad_asm_addr(addr, size, /*is_store*/ true)) { + user_access_restore(ua_flags); + return; + } + kmsan_enter_runtime(); + /* Unpoisoning the memory on best effort. */ + kmsan_internal_unpoison_memory(addr, size, /*checked*/ false); + kmsan_leave_runtime(); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_instrument_asm_store); + +/* + * KMSAN instrumentation pass replaces LLVM memcpy, memmove and memset + * intrinsics with calls to respective __msan_ functions. We use + * get_param0_metadata() and set_retval_metadata() to store the shadow/origin + * values for the destination argument of these functions and use them for the + * functions' return values. + */ +static inline void get_param0_metadata(u64 *shadow, + depot_stack_handle_t *origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *shadow = *(u64 *)(ctx->cstate.param_tls); + *origin = ctx->cstate.param_origin_tls[0]; +} + +static inline void set_retval_metadata(u64 shadow, depot_stack_handle_t origin) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + *(u64 *)(ctx->cstate.retval_tls) = shadow; + ctx->cstate.retval_origin_tls = origin; +} + +/* Handle llvm.memmove intrinsic. */ +void *__msan_memmove(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memmove(dst, src, n); + if (!n) + /* Some people call memmove() with zero length. */ + return result; + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memmove); + +/* Handle llvm.memcpy intrinsic. */ +void *__msan_memcpy(void *dst, const void *src, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memcpy(dst, src, n); + if (!n) + /* Some people call memcpy() with zero length. */ + return result; + + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* Using memmove instead of memcpy doesn't affect correctness. */ + kmsan_internal_memmove_metadata(dst, (void *)src, n); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memcpy); + +/* Handle llvm.memset intrinsic. */ +void *__msan_memset(void *dst, int c, uintptr_t n) +{ + depot_stack_handle_t origin; + void *result; + u64 shadow; + + get_param0_metadata(&shadow, &origin); + result = __memset(dst, c, n); + if (!kmsan_enabled || kmsan_in_runtime()) + return result; + + kmsan_enter_runtime(); + /* + * Clang doesn't pass parameter metadata here, so it is impossible to + * use shadow of @c to set up the shadow for @dst. + */ + kmsan_internal_unpoison_memory(dst, n, /*checked*/ false); + kmsan_leave_runtime(); + + set_retval_metadata(shadow, origin); + return result; +} +EXPORT_SYMBOL(__msan_memset); + +/* + * Create a new origin from an old one. This is done when storing an + * uninitialized value to memory. When reporting an error, KMSAN unrolls and + * prints the whole chain of stores that preceded the use of this value. + */ +depot_stack_handle_t __msan_chain_origin(depot_stack_handle_t origin) +{ + depot_stack_handle_t ret = 0; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return ret; + + ua_flags = user_access_save(); + + /* Creating new origins may allocate memory. */ + kmsan_enter_runtime(); + ret = kmsan_internal_chain_origin(origin); + kmsan_leave_runtime(); + user_access_restore(ua_flags); + return ret; +} +EXPORT_SYMBOL(__msan_chain_origin); + +/* Poison a local variable when entering a function. */ +void __msan_poison_alloca(void *address, uintptr_t size, char *descr) +{ + depot_stack_handle_t handle; + unsigned long entries[4]; + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ua_flags = user_access_save(); + entries[0] = KMSAN_ALLOCA_MAGIC_ORIGIN; + entries[1] = (u64)descr; + entries[2] = (u64)__builtin_return_address(0); + /* + * With frame pointers enabled, it is possible to quickly fetch the + * second frame of the caller stack without calling the unwinder. + * Without them, simply do not bother. + */ + if (IS_ENABLED(CONFIG_UNWINDER_FRAME_POINTER)) + entries[3] = (u64)__builtin_return_address(1); + else + entries[3] = 0; + + /* stack_depot_save() may allocate memory. */ + kmsan_enter_runtime(); + handle = stack_depot_save(entries, ARRAY_SIZE(entries), GFP_ATOMIC); + kmsan_leave_runtime(); + + kmsan_internal_set_shadow_origin(address, size, -1, handle, + /*checked*/ true); + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(__msan_poison_alloca); + +/* Unpoison a local variable. */ +void __msan_unpoison_alloca(void *address, uintptr_t size) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + kmsan_enter_runtime(); + kmsan_internal_unpoison_memory(address, size, /*checked*/ true); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_unpoison_alloca); + +/* + * Report that an uninitialized value with the given origin was used in a way + * that constituted undefined behavior. + */ +void __msan_warning(u32 origin) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_report(origin, /*address*/ 0, /*size*/ 0, + /*off_first*/ 0, /*off_last*/ 0, /*user_addr*/ 0, + REASON_ANY); + kmsan_leave_runtime(); +} +EXPORT_SYMBOL(__msan_warning); + +/* + * At the beginning of an instrumented function, obtain the pointer to + * `struct kmsan_context_state` holding the metadata for function parameters. + */ +struct kmsan_context_state *__msan_get_context_state(void) +{ + return &kmsan_get_context()->cstate; +} +EXPORT_SYMBOL(__msan_get_context_state); diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h new file mode 100644 index 000000000000..97d48b45dba5 --- /dev/null +++ b/mm/kmsan/kmsan.h @@ -0,0 +1,204 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Functions used by the KMSAN runtime. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#ifndef __MM_KMSAN_KMSAN_H +#define __MM_KMSAN_KMSAN_H + +#include +#include +#include +#include +#include +#include +#include +#include + +#define KMSAN_ALLOCA_MAGIC_ORIGIN 0xabcd0100 +#define KMSAN_CHAIN_MAGIC_ORIGIN 0xabcd0200 + +#define KMSAN_POISON_NOCHECK 0x0 +#define KMSAN_POISON_CHECK 0x1 +#define KMSAN_POISON_FREE 0x2 + +#define KMSAN_ORIGIN_SIZE 4 +#define KMSAN_MAX_ORIGIN_DEPTH 7 + +#define KMSAN_STACK_DEPTH 64 + +#define KMSAN_META_SHADOW (false) +#define KMSAN_META_ORIGIN (true) + +extern bool kmsan_enabled; +extern int panic_on_kmsan; + +/* + * KMSAN performs a lot of consistency checks that are currently enabled by + * default. BUG_ON is normally discouraged in the kernel, unless used for + * debugging, but KMSAN itself is a debugging tool, so it makes little sense to + * recover if something goes wrong. + */ +#define KMSAN_WARN_ON(cond) \ + ({ \ + const bool __cond = WARN_ON(cond); \ + if (unlikely(__cond)) { \ + WRITE_ONCE(kmsan_enabled, false); \ + if (panic_on_kmsan) { \ + /* Can't call panic() here because */ \ + /* of uaccess checks. */ \ + BUG(); \ + } \ + } \ + __cond; \ + }) + +/* + * A pair of metadata pointers to be returned by the instrumentation functions. + */ +struct shadow_origin_ptr { + void *shadow, *origin; +}; + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size, + bool store); +void *kmsan_get_metadata(void *addr, bool is_origin); + +enum kmsan_bug_reason { + REASON_ANY, + REASON_COPY_TO_USER, + REASON_SUBMIT_URB, +}; + +void kmsan_print_origin(depot_stack_handle_t origin); + +/** + * kmsan_report() - Report a use of uninitialized value. + * @origin: Stack ID of the uninitialized value. + * @address: Address at which the memory access happens. + * @size: Memory access size. + * @off_first: Offset (from @address) of the first byte to be reported. + * @off_last: Offset (from @address) of the last byte to be reported. + * @user_addr: When non-NULL, denotes the userspace address to which the kernel + * is leaking data. + * @reason: Error type from enum kmsan_bug_reason. + * + * kmsan_report() prints an error message for a consequent group of bytes + * sharing the same origin. If an uninitialized value is used in a comparison, + * this function is called once without specifying the addresses. When checking + * a memory range, KMSAN may call kmsan_report() multiple times with the same + * @address, @size, @user_addr and @reason, but different @off_first and + * @off_last corresponding to different @origin values. + */ +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason); + +DECLARE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); + +static __always_inline struct kmsan_ctx *kmsan_get_context(void) +{ + return in_task() ? ¤t->kmsan_ctx : raw_cpu_ptr(&kmsan_percpu_ctx); +} + +/* + * When a compiler hook or KMSAN runtime function is invoked, it may make a + * call to instrumented code and eventually call itself recursively. To avoid + * that, we guard the runtime entry regions with + * kmsan_enter_runtime()/kmsan_leave_runtime() and exit the hook if + * kmsan_in_runtime() is true. + * + * Non-runtime code may occasionally get executed in nested IRQs from the + * runtime code (e.g. when called via smp_call_function_single()). Because some + * KMSAN routines may take locks (e.g. for memory allocation), we conservatively + * bail out instead of calling them. To minimize the effect of this (potentially + * missing initialization events) kmsan_in_runtime() is not checked in + * non-blocking runtime functions. + */ +static __always_inline bool kmsan_in_runtime(void) +{ + if ((hardirq_count() >> HARDIRQ_SHIFT) > 1) + return true; + return kmsan_get_context()->kmsan_in_runtime; +} + +static __always_inline void kmsan_enter_runtime(void) +{ + struct kmsan_ctx *ctx; + + ctx = kmsan_get_context(); + KMSAN_WARN_ON(ctx->kmsan_in_runtime++); +} + +static __always_inline void kmsan_leave_runtime(void) +{ + struct kmsan_ctx *ctx = kmsan_get_context(); + + KMSAN_WARN_ON(--ctx->kmsan_in_runtime); +} + +depot_stack_handle_t kmsan_save_stack(void); +depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags, + unsigned int extra_bits); + +/* + * Pack and unpack the origin chain depth and UAF flag to/from the extra bits + * provided by the stack depot. + * The UAF flag is stored in the lowest bit, followed by the depth in the upper + * bits. + * set_dsh_extra_bits() is responsible for clamping the value. + */ +static __always_inline unsigned int kmsan_extra_bits(unsigned int depth, + bool uaf) +{ + return (depth << 1) | uaf; +} + +static __always_inline bool kmsan_uaf_from_eb(unsigned int extra_bits) +{ + return extra_bits & 1; +} + +static __always_inline unsigned int kmsan_depth_from_eb(unsigned int extra_bits) +{ + return extra_bits >> 1; +} + +/* + * kmsan_internal_ functions are supposed to be very simple and not require the + * kmsan_in_runtime() checks. + */ +void kmsan_internal_memmove_metadata(void *dst, void *src, size_t n); +void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, + unsigned int poison_flags); +void kmsan_internal_unpoison_memory(void *address, size_t size, bool checked); +void kmsan_internal_set_shadow_origin(void *address, size_t size, int b, + u32 origin, bool checked); +depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id); + +bool kmsan_metadata_is_contiguous(void *addr, size_t size); +void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, + int reason); + +struct page *kmsan_vmalloc_to_page_or_null(void *vaddr); + +/* + * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are + * non-instrumented versions of is_module_address() and is_vmalloc_addr() that + * are safe to call from KMSAN runtime without recursion. + */ +static inline bool kmsan_internal_is_module_addr(void *vaddr) +{ + return ((u64)vaddr >= MODULES_VADDR) && ((u64)vaddr < MODULES_END); +} + +static inline bool kmsan_internal_is_vmalloc_addr(void *addr) +{ + return ((u64)addr >= VMALLOC_START) && ((u64)addr < VMALLOC_END); +} + +#endif /* __MM_KMSAN_KMSAN_H */ diff --git a/mm/kmsan/report.c b/mm/kmsan/report.c new file mode 100644 index 000000000000..02736ec757f2 --- /dev/null +++ b/mm/kmsan/report.c @@ -0,0 +1,219 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN error reporting routines. + * + * Copyright (C) 2019-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include + +#include "kmsan.h" + +static DEFINE_RAW_SPINLOCK(kmsan_report_lock); +#define DESCR_SIZE 128 +/* Protected by kmsan_report_lock */ +static char report_local_descr[DESCR_SIZE]; +int panic_on_kmsan __read_mostly; + +#ifdef MODULE_PARAM_PREFIX +#undef MODULE_PARAM_PREFIX +#endif +#define MODULE_PARAM_PREFIX "kmsan." +module_param_named(panic, panic_on_kmsan, int, 0); + +/* + * Skip internal KMSAN frames. + */ +static int get_stack_skipnr(const unsigned long stack_entries[], + int num_entries) +{ + int len, skip; + char buf[64]; + + for (skip = 0; skip < num_entries; ++skip) { + len = scnprintf(buf, sizeof(buf), "%ps", + (void *)stack_entries[skip]); + + /* Never show __msan_* or kmsan_* functions. */ + if ((strnstr(buf, "__msan_", len) == buf) || + (strnstr(buf, "kmsan_", len) == buf)) + continue; + + /* + * No match for runtime functions -- @skip entries to skip to + * get to first frame of interest. + */ + break; + } + + return skip; +} + +/* + * Currently the descriptions of locals generated by Clang look as follows: + * ----local_name@function_name + * We want to print only the name of the local, as other information in that + * description can be confusing. + * The meaningful part of the description is copied to a global buffer to avoid + * allocating memory. + */ +static char *pretty_descr(char *descr) +{ + int pos = 0, len = strlen(descr); + + for (int i = 0; i < len; i++) { + if (descr[i] == '@') + break; + if (descr[i] == '-') + continue; + report_local_descr[pos] = descr[i]; + if (pos + 1 == DESCR_SIZE) + break; + pos++; + } + report_local_descr[pos] = 0; + return report_local_descr; +} + +void kmsan_print_origin(depot_stack_handle_t origin) +{ + unsigned long *entries = NULL, *chained_entries = NULL; + unsigned int nr_entries, chained_nr_entries, skipnr; + void *pc1 = NULL, *pc2 = NULL; + depot_stack_handle_t head; + unsigned long magic; + char *descr = NULL; + unsigned int depth; + + if (!origin) + return; + + while (true) { + nr_entries = stack_depot_fetch(origin, &entries); + depth = kmsan_depth_from_eb(stack_depot_get_extra_bits(origin)); + magic = nr_entries ? entries[0] : 0; + if ((nr_entries == 4) && (magic == KMSAN_ALLOCA_MAGIC_ORIGIN)) { + descr = (char *)entries[1]; + pc1 = (void *)entries[2]; + pc2 = (void *)entries[3]; + pr_err("Local variable %s created at:\n", + pretty_descr(descr)); + if (pc1) + pr_err(" %pSb\n", pc1); + if (pc2) + pr_err(" %pSb\n", pc2); + break; + } + if ((nr_entries == 3) && (magic == KMSAN_CHAIN_MAGIC_ORIGIN)) { + /* + * Origin chains deeper than KMSAN_MAX_ORIGIN_DEPTH are + * not stored, so the output may be incomplete. + */ + if (depth == KMSAN_MAX_ORIGIN_DEPTH) + pr_err("\n\n"); + head = entries[1]; + origin = entries[2]; + pr_err("Uninit was stored to memory at:\n"); + chained_nr_entries = + stack_depot_fetch(head, &chained_entries); + kmsan_internal_unpoison_memory( + chained_entries, + chained_nr_entries * sizeof(*chained_entries), + /*checked*/ false); + skipnr = get_stack_skipnr(chained_entries, + chained_nr_entries); + stack_trace_print(chained_entries + skipnr, + chained_nr_entries - skipnr, 0); + pr_err("\n"); + continue; + } + pr_err("Uninit was created at:\n"); + if (nr_entries) { + skipnr = get_stack_skipnr(entries, nr_entries); + stack_trace_print(entries + skipnr, nr_entries - skipnr, + 0); + } else { + pr_err("(stack is not available)\n"); + } + break; + } +} + +void kmsan_report(depot_stack_handle_t origin, void *address, int size, + int off_first, int off_last, const void *user_addr, + enum kmsan_bug_reason reason) +{ + unsigned long stack_entries[KMSAN_STACK_DEPTH]; + int num_stack_entries, skipnr; + char *bug_type = NULL; + unsigned long ua_flags; + bool is_uaf; + + if (!kmsan_enabled) + return; + if (!current->kmsan_ctx.allow_reporting) + return; + if (!origin) + return; + + current->kmsan_ctx.allow_reporting = false; + ua_flags = user_access_save(); + raw_spin_lock(&kmsan_report_lock); + pr_err("=====================================================\n"); + is_uaf = kmsan_uaf_from_eb(stack_depot_get_extra_bits(origin)); + switch (reason) { + case REASON_ANY: + bug_type = is_uaf ? "use-after-free" : "uninit-value"; + break; + case REASON_COPY_TO_USER: + bug_type = is_uaf ? "kernel-infoleak-after-free" : + "kernel-infoleak"; + break; + case REASON_SUBMIT_URB: + bug_type = is_uaf ? "kernel-usb-infoleak-after-free" : + "kernel-usb-infoleak"; + break; + } + + num_stack_entries = + stack_trace_save(stack_entries, KMSAN_STACK_DEPTH, 1); + skipnr = get_stack_skipnr(stack_entries, num_stack_entries); + + pr_err("BUG: KMSAN: %s in %pSb\n", bug_type, + (void *)stack_entries[skipnr]); + stack_trace_print(stack_entries + skipnr, num_stack_entries - skipnr, + 0); + pr_err("\n"); + + kmsan_print_origin(origin); + + if (size) { + pr_err("\n"); + if (off_first == off_last) + pr_err("Byte %d of %d is uninitialized\n", off_first, + size); + else + pr_err("Bytes %d-%d of %d are uninitialized\n", + off_first, off_last, size); + } + if (address) + pr_err("Memory access of size %d starts at %px\n", size, + address); + if (user_addr && reason == REASON_COPY_TO_USER) + pr_err("Data copied to user address %px\n", user_addr); + pr_err("\n"); + dump_stack_print_info(KERN_ERR); + pr_err("=====================================================\n"); + add_taint(TAINT_BAD_PAGE, LOCKDEP_NOW_UNRELIABLE); + raw_spin_unlock(&kmsan_report_lock); + if (panic_on_kmsan) + panic("kmsan.panic set ...\n"); + user_access_restore(ua_flags); + current->kmsan_ctx.allow_reporting = true; +} diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c new file mode 100644 index 000000000000..acc5279acc3b --- /dev/null +++ b/mm/kmsan/shadow.c @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN shadow implementation. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "../internal.h" +#include "kmsan.h" + +#define shadow_page_for(page) ((page)->kmsan_shadow) + +#define origin_page_for(page) ((page)->kmsan_origin) + +static void *shadow_ptr_for(struct page *page) +{ + return page_address(shadow_page_for(page)); +} + +static void *origin_ptr_for(struct page *page) +{ + return page_address(origin_page_for(page)); +} + +static bool page_has_metadata(struct page *page) +{ + return shadow_page_for(page) && origin_page_for(page); +} + +static void set_no_shadow_origin_page(struct page *page) +{ + shadow_page_for(page) = NULL; + origin_page_for(page) = NULL; +} + +/* + * Dummy load and store pages to be used when the real metadata is unavailable. + * There are separate pages for loads and stores, so that every load returns a + * zero, and every store doesn't affect other loads. + */ +static char dummy_load_page[PAGE_SIZE] __aligned(PAGE_SIZE); +static char dummy_store_page[PAGE_SIZE] __aligned(PAGE_SIZE); + +static unsigned long vmalloc_meta(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr, off; + + KMSAN_WARN_ON(is_origin && !IS_ALIGNED(addr64, KMSAN_ORIGIN_SIZE)); + if (kmsan_internal_is_vmalloc_addr(addr)) { + off = addr64 - VMALLOC_START; + return off + (is_origin ? KMSAN_VMALLOC_ORIGIN_START : + KMSAN_VMALLOC_SHADOW_START); + } + if (kmsan_internal_is_module_addr(addr)) { + off = addr64 - MODULES_VADDR; + return off + (is_origin ? KMSAN_MODULES_ORIGIN_START : + KMSAN_MODULES_SHADOW_START); + } + return 0; +} + +static struct page *virt_to_page_or_null(void *vaddr) +{ + if (kmsan_virt_addr_valid(vaddr)) + return virt_to_page(vaddr); + else + return NULL; +} + +struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *address, u64 size, + bool store) +{ + struct shadow_origin_ptr ret; + void *shadow; + + /* + * Even if we redirect this memory access to the dummy page, it will + * go out of bounds. + */ + KMSAN_WARN_ON(size > PAGE_SIZE); + + if (!kmsan_enabled) + goto return_dummy; + + KMSAN_WARN_ON(!kmsan_metadata_is_contiguous(address, size)); + shadow = kmsan_get_metadata(address, KMSAN_META_SHADOW); + if (!shadow) + goto return_dummy; + + ret.shadow = shadow; + ret.origin = kmsan_get_metadata(address, KMSAN_META_ORIGIN); + return ret; + +return_dummy: + if (store) { + /* Ignore this store. */ + ret.shadow = dummy_store_page; + ret.origin = dummy_store_page; + } else { + /* This load will return zero. */ + ret.shadow = dummy_load_page; + ret.origin = dummy_load_page; + } + return ret; +} + +/* + * Obtain the shadow or origin pointer for the given address, or NULL if there's + * none. The caller must check the return value for being non-NULL if needed. + * The return value of this function should not depend on whether we're in the + * runtime or not. + */ +void *kmsan_get_metadata(void *address, bool is_origin) +{ + u64 addr = (u64)address, pad, off; + struct page *page; + + if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) { + pad = addr % KMSAN_ORIGIN_SIZE; + addr -= pad; + } + address = (void *)addr; + if (kmsan_internal_is_vmalloc_addr(address) || + kmsan_internal_is_module_addr(address)) + return (void *)vmalloc_meta(address, is_origin); + + page = virt_to_page_or_null(address); + if (!page) + return NULL; + if (!page_has_metadata(page)) + return NULL; + off = addr % PAGE_SIZE; + + return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; +} diff --git a/scripts/Makefile.kmsan b/scripts/Makefile.kmsan new file mode 100644 index 000000000000..b5b0aa61322e --- /dev/null +++ b/scripts/Makefile.kmsan @@ -0,0 +1,8 @@ +# SPDX-License-Identifier: GPL-2.0 +kmsan-cflags := -fsanitize=kernel-memory + +ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL +kmsan-cflags += -fsanitize-memory-param-retval +endif + +export CFLAGS_KMSAN := $(kmsan-cflags) diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 3fb6a99e78c4..ac32429e93b7 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -157,6 +157,15 @@ _c_flags += $(if $(patsubst n%,, \ endif endif +ifeq ($(CONFIG_KMSAN),y) +_c_flags += $(if $(patsubst n%,, \ + $(KMSAN_SANITIZE_$(basetarget).o)$(KMSAN_SANITIZE)y), \ + $(CFLAGS_KMSAN)) +_c_flags += $(if $(patsubst n%,, \ + $(KMSAN_ENABLE_CHECKS_$(basetarget).o)$(KMSAN_ENABLE_CHECKS)y), \ + , -mllvm -msan-disable-checks=1) +endif + ifeq ($(CONFIG_UBSAN),y) _c_flags += $(if $(patsubst n%,, \ $(UBSAN_SANITIZE_$(basetarget).o)$(UBSAN_SANITIZE)$(CONFIG_UBSAN_SANITIZE_ALL)), \ From 79dbd006a6d6f51777ba4948046561b6d9270504 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:46 +0200 Subject: [PATCH 261/350] kmsan: disable instrumentation of unsupported common kernel code EFI stub cannot be linked with KMSAN runtime, so we disable instrumentation for it. Instrumenting kcov, stackdepot or lockdep leads to infinite recursion caused by instrumentation hooks calling instrumented code again. Link: https://lkml.kernel.org/r/20220915150417.722975-13-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/firmware/efi/libstub/Makefile | 1 + kernel/Makefile | 1 + kernel/locking/Makefile | 3 ++- lib/Makefile | 3 +++ 4 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/firmware/efi/libstub/Makefile b/drivers/firmware/efi/libstub/Makefile index d0537573501e..81432d0c904b 100644 --- a/drivers/firmware/efi/libstub/Makefile +++ b/drivers/firmware/efi/libstub/Makefile @@ -46,6 +46,7 @@ GCOV_PROFILE := n # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n UBSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y diff --git a/kernel/Makefile b/kernel/Makefile index 318789c728d3..d754e0be1176 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -38,6 +38,7 @@ KCOV_INSTRUMENT_kcov.o := n KASAN_SANITIZE_kcov.o := n KCSAN_SANITIZE_kcov.o := n UBSAN_SANITIZE_kcov.o := n +KMSAN_SANITIZE_kcov.o := n CFLAGS_kcov.o := $(call cc-option, -fno-conserve-stack) -fno-stack-protector # Don't instrument error handlers diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index d51cabf28f38..ea925731fa40 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -5,8 +5,9 @@ KCOV_INSTRUMENT := n obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o -# Avoid recursion lockdep -> KCSAN -> ... -> lockdep. +# Avoid recursion lockdep -> sanitizer -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n +KMSAN_SANITIZE_lockdep.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) diff --git a/lib/Makefile b/lib/Makefile index d7d94102991b..42e185cdecd0 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -270,6 +270,9 @@ obj-$(CONFIG_POLYNOMIAL) += polynomial.o CFLAGS_stackdepot.o += -fno-builtin obj-$(CONFIG_STACKDEPOT) += stackdepot.o KASAN_SANITIZE_stackdepot.o := n +# In particular, instrumenting stackdepot.c with KMSAN will result in infinite +# recursion. +KMSAN_SANITIZE_stackdepot.o := n KCOV_INSTRUMENT_stackdepot.o := n obj-$(CONFIG_REF_TRACKER) += ref_tracker.o From d596b04f5967c75c196eb582fefba49488c57289 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:47 +0200 Subject: [PATCH 262/350] MAINTAINERS: add entry for KMSAN Add entry for KMSAN maintainers/reviewers. Link: https://lkml.kernel.org/r/20220915150417.722975-14-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index 6f1033f3c1ed..3c7dfe9bb712 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11371,6 +11371,19 @@ F: kernel/kmod.c F: lib/test_kmod.c F: tools/testing/selftests/kmod/ +KMSAN +M: Alexander Potapenko +R: Marco Elver +R: Dmitry Vyukov +L: kasan-dev@googlegroups.com +S: Maintained +F: Documentation/dev-tools/kmsan.rst +F: arch/*/include/asm/kmsan.h +F: include/linux/kmsan*.h +F: lib/Kconfig.kmsan +F: mm/kmsan/ +F: scripts/Makefile.kmsan + KPROBES M: Naveen N. Rao M: Anil S Keshavamurthy From b073d7f8aee4ebf05d10e3380df377b73120cf16 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:48 +0200 Subject: [PATCH 263/350] mm: kmsan: maintain KMSAN metadata for page operations Insert KMSAN hooks that make the necessary bookkeeping changes: - poison page shadow and origins in alloc_pages()/free_page(); - clear page shadow and origins in clear_page(), copy_user_highpage(); - copy page metadata in copy_highpage(), wp_page_copy(); - handle vmap()/vunmap()/iounmap(); Link: https://lkml.kernel.org/r/20220915150417.722975-15-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/page_64.h | 7 ++ arch/x86/mm/ioremap.c | 3 + include/linux/highmem.h | 3 + include/linux/kmsan.h | 145 +++++++++++++++++++++++++++++++++ mm/internal.h | 6 ++ mm/kmsan/hooks.c | 86 +++++++++++++++++++ mm/kmsan/shadow.c | 113 +++++++++++++++++++++++++ mm/memory.c | 2 + mm/page_alloc.c | 11 +++ mm/vmalloc.c | 20 ++++- 10 files changed, 394 insertions(+), 2 deletions(-) create mode 100644 include/linux/kmsan.h diff --git a/arch/x86/include/asm/page_64.h b/arch/x86/include/asm/page_64.h index baa70451b8df..198e03e59ca1 100644 --- a/arch/x86/include/asm/page_64.h +++ b/arch/x86/include/asm/page_64.h @@ -8,6 +8,8 @@ #include #include +#include + /* duplicated to the one in bootmem.h */ extern unsigned long max_pfn; extern unsigned long phys_base; @@ -47,6 +49,11 @@ void clear_page_erms(void *page); static inline void clear_page(void *page) { + /* + * Clean up KMSAN metadata for the page being cleared. The assembly call + * below clobbers @page, so we perform unpoisoning before it. + */ + kmsan_unpoison_memory(page, PAGE_SIZE); alternative_call_2(clear_page_orig, clear_page_rep, X86_FEATURE_REP_GOOD, clear_page_erms, X86_FEATURE_ERMS, diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c index 1ad0228f8ceb..78c5bc654cff 100644 --- a/arch/x86/mm/ioremap.c +++ b/arch/x86/mm/ioremap.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include @@ -479,6 +480,8 @@ void iounmap(volatile void __iomem *addr) return; } + kmsan_iounmap_page_range((unsigned long)addr, + (unsigned long)addr + get_vm_area_size(p)); memtype_free(p->phys_addr, p->phys_addr + get_vm_area_size(p)); /* Finally remove it */ diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 25679035ca28..e9912da5441b 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -6,6 +6,7 @@ #include #include #include +#include #include #include #include @@ -311,6 +312,7 @@ static inline void copy_user_highpage(struct page *to, struct page *from, vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_user_page(vto, vfrom, vaddr, to); + kmsan_unpoison_memory(page_address(to), PAGE_SIZE); kunmap_local(vto); kunmap_local(vfrom); } @@ -326,6 +328,7 @@ static inline void copy_highpage(struct page *to, struct page *from) vfrom = kmap_local_page(from); vto = kmap_local_page(to); copy_page(vto, vfrom); + kmsan_copy_page_meta(to, from); kunmap_local(vto); kunmap_local(vfrom); } diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h new file mode 100644 index 000000000000..b36bf3db835e --- /dev/null +++ b/include/linux/kmsan.h @@ -0,0 +1,145 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * KMSAN API for subsystems. + * + * Copyright (C) 2017-2022 Google LLC + * Author: Alexander Potapenko + * + */ +#ifndef _LINUX_KMSAN_H +#define _LINUX_KMSAN_H + +#include +#include +#include + +struct page; + +#ifdef CONFIG_KMSAN + +/** + * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. + * @page: struct page pointer returned by alloc_pages(). + * @order: order of allocated struct page. + * @flags: GFP flags used by alloc_pages() + * + * KMSAN marks 1<<@order pages starting at @page as uninitialized, unless + * @flags contain __GFP_ZERO. + */ +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags); + +/** + * kmsan_free_page() - Notify KMSAN about a free_pages() call. + * @page: struct page pointer passed to free_pages(). + * @order: order of deallocated struct page. + * + * KMSAN marks freed memory as uninitialized. + */ +void kmsan_free_page(struct page *page, unsigned int order); + +/** + * kmsan_copy_page_meta() - Copy KMSAN metadata between two pages. + * @dst: destination page. + * @src: source page. + * + * KMSAN copies the contents of metadata pages for @src into the metadata pages + * for @dst. If @dst has no associated metadata pages, nothing happens. + * If @src has no associated metadata pages, @dst metadata pages are unpoisoned. + */ +void kmsan_copy_page_meta(struct page *dst, struct page *src); + +/** + * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. + * @start: start of vmapped range. + * @end: end of vmapped range. + * @prot: page protection flags used for vmap. + * @pages: array of pages. + * @page_shift: page_shift passed to vmap_range_noflush(). + * + * KMSAN maps shadow and origin pages of @pages into contiguous ranges in + * vmalloc metadata address range. + */ +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + +/** + * kmsan_vunmap_kernel_range_noflush() - Notify KMSAN about a vunmap. + * @start: start of vunmapped range. + * @end: end of vunmapped range. + * + * KMSAN unmaps the contiguous metadata ranges created by + * kmsan_map_kernel_range_noflush(). + */ +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end); + +/** + * kmsan_ioremap_page_range() - Notify KMSAN about a ioremap_page_range() call. + * @addr: range start. + * @end: range end. + * @phys_addr: physical range start. + * @prot: page protection flags used for ioremap_page_range(). + * @page_shift: page_shift argument passed to vmap_range_noflush(). + * + * KMSAN creates new metadata pages for the physical pages mapped into the + * virtual memory. + */ +void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift); + +/** + * kmsan_iounmap_page_range() - Notify KMSAN about a iounmap_page_range() call. + * @start: range start. + * @end: range end. + * + * KMSAN unmaps the metadata pages for the given range and, unlike for + * vunmap_page_range(), also deallocates them. + */ +void kmsan_iounmap_page_range(unsigned long start, unsigned long end); + +#else + +static inline int kmsan_alloc_page(struct page *page, unsigned int order, + gfp_t flags) +{ + return 0; +} + +static inline void kmsan_free_page(struct page *page, unsigned int order) +{ +} + +static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ +} + +static inline void kmsan_vmap_pages_range_noflush(unsigned long start, + unsigned long end, + pgprot_t prot, + struct page **pages, + unsigned int page_shift) +{ +} + +static inline void kmsan_vunmap_range_noflush(unsigned long start, + unsigned long end) +{ +} + +static inline void kmsan_ioremap_page_range(unsigned long start, + unsigned long end, + phys_addr_t phys_addr, + pgprot_t prot, + unsigned int page_shift) +{ +} + +static inline void kmsan_iounmap_page_range(unsigned long start, + unsigned long end) +{ +} + +#endif + +#endif /* _LINUX_KMSAN_H */ diff --git a/mm/internal.h b/mm/internal.h index e497ab14c984..fea3cba15484 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -818,8 +818,14 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, } #endif +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift); + void vunmap_range_noflush(unsigned long start, unsigned long end); +void __vunmap_range_noflush(unsigned long start, unsigned long end); + int numa_migrate_prep(struct page *page, struct vm_area_struct *vma, unsigned long addr, int page_nid, int *flags); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 4ac62fa67a02..040111bb9f6a 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -26,6 +27,91 @@ * skipping effects of functions like memset() inside instrumented code. */ +static unsigned long vmalloc_shadow(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_SHADOW); +} + +static unsigned long vmalloc_origin(unsigned long addr) +{ + return (unsigned long)kmsan_get_metadata((void *)addr, + KMSAN_META_ORIGIN); +} + +void kmsan_vunmap_range_noflush(unsigned long start, unsigned long end) +{ + __vunmap_range_noflush(vmalloc_shadow(start), vmalloc_shadow(end)); + __vunmap_range_noflush(vmalloc_origin(start), vmalloc_origin(end)); + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); +} + +/* + * This function creates new shadow/origin pages for the physical pages mapped + * into the virtual memory. If those physical pages already had shadow/origin, + * those are ignored. + */ +void kmsan_ioremap_page_range(unsigned long start, unsigned long end, + phys_addr_t phys_addr, pgprot_t prot, + unsigned int page_shift) +{ + gfp_t gfp_mask = GFP_KERNEL | __GFP_ZERO; + struct page *shadow, *origin; + unsigned long off = 0; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + for (int i = 0; i < nr; i++, off += PAGE_SIZE) { + shadow = alloc_pages(gfp_mask, 1); + origin = alloc_pages(gfp_mask, 1); + __vmap_pages_range_noflush( + vmalloc_shadow(start + off), + vmalloc_shadow(start + off + PAGE_SIZE), prot, &shadow, + PAGE_SHIFT); + __vmap_pages_range_noflush( + vmalloc_origin(start + off), + vmalloc_origin(start + off + PAGE_SIZE), prot, &origin, + PAGE_SHIFT); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + +void kmsan_iounmap_page_range(unsigned long start, unsigned long end) +{ + unsigned long v_shadow, v_origin; + struct page *shadow, *origin; + int nr; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + nr = (end - start) / PAGE_SIZE; + kmsan_enter_runtime(); + v_shadow = (unsigned long)vmalloc_shadow(start); + v_origin = (unsigned long)vmalloc_origin(start); + for (int i = 0; i < nr; + i++, v_shadow += PAGE_SIZE, v_origin += PAGE_SIZE) { + shadow = kmsan_vmalloc_to_page_or_null((void *)v_shadow); + origin = kmsan_vmalloc_to_page_or_null((void *)v_origin); + __vunmap_range_noflush(v_shadow, vmalloc_shadow(end)); + __vunmap_range_noflush(v_origin, vmalloc_origin(end)); + if (shadow) + __free_pages(shadow, 1); + if (origin) + __free_pages(origin, 1); + } + flush_cache_vmap(vmalloc_shadow(start), vmalloc_shadow(end)); + flush_cache_vmap(vmalloc_origin(start), vmalloc_origin(end)); + kmsan_leave_runtime(); +} + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index acc5279acc3b..8c81a059beea 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -145,3 +145,116 @@ void *kmsan_get_metadata(void *address, bool is_origin) return (is_origin ? origin_ptr_for(page) : shadow_ptr_for(page)) + off; } + +void kmsan_copy_page_meta(struct page *dst, struct page *src) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + if (!dst || !page_has_metadata(dst)) + return; + if (!src || !page_has_metadata(src)) { + kmsan_internal_unpoison_memory(page_address(dst), PAGE_SIZE, + /*checked*/ false); + return; + } + + kmsan_enter_runtime(); + __memcpy(shadow_ptr_for(dst), shadow_ptr_for(src), PAGE_SIZE); + __memcpy(origin_ptr_for(dst), origin_ptr_for(src), PAGE_SIZE); + kmsan_leave_runtime(); +} + +void kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) +{ + bool initialized = (flags & __GFP_ZERO) || !kmsan_enabled; + struct page *shadow, *origin; + depot_stack_handle_t handle; + int pages = 1 << order; + + if (!page) + return; + + shadow = shadow_page_for(page); + origin = origin_page_for(page); + + if (initialized) { + __memset(page_address(shadow), 0, PAGE_SIZE * pages); + __memset(page_address(origin), 0, PAGE_SIZE * pages); + return; + } + + /* Zero pages allocated by the runtime should also be initialized. */ + if (kmsan_in_runtime()) + return; + + __memset(page_address(shadow), -1, PAGE_SIZE * pages); + kmsan_enter_runtime(); + handle = kmsan_save_stack_with_flags(flags, /*extra_bits*/ 0); + kmsan_leave_runtime(); + /* + * Addresses are page-aligned, pages are contiguous, so it's ok + * to just fill the origin pages with @handle. + */ + for (int i = 0; i < PAGE_SIZE * pages / sizeof(handle); i++) + ((depot_stack_handle_t *)page_address(origin))[i] = handle; +} + +void kmsan_free_page(struct page *page, unsigned int order) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(page_address(page), + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, + pgprot_t prot, struct page **pages, + unsigned int page_shift) +{ + unsigned long shadow_start, origin_start, shadow_end, origin_end; + struct page **s_pages, **o_pages; + int nr, mapped; + + if (!kmsan_enabled) + return; + + shadow_start = vmalloc_meta((void *)start, KMSAN_META_SHADOW); + shadow_end = vmalloc_meta((void *)end, KMSAN_META_SHADOW); + if (!shadow_start) + return; + + nr = (end - start) / PAGE_SIZE; + s_pages = kcalloc(nr, sizeof(*s_pages), GFP_KERNEL); + o_pages = kcalloc(nr, sizeof(*o_pages), GFP_KERNEL); + if (!s_pages || !o_pages) + goto ret; + for (int i = 0; i < nr; i++) { + s_pages[i] = shadow_page_for(pages[i]); + o_pages[i] = origin_page_for(pages[i]); + } + prot = __pgprot(pgprot_val(prot) | _PAGE_NX); + prot = PAGE_KERNEL; + + origin_start = vmalloc_meta((void *)start, KMSAN_META_ORIGIN); + origin_end = vmalloc_meta((void *)end, KMSAN_META_ORIGIN); + kmsan_enter_runtime(); + mapped = __vmap_pages_range_noflush(shadow_start, shadow_end, prot, + s_pages, page_shift); + KMSAN_WARN_ON(mapped); + mapped = __vmap_pages_range_noflush(origin_start, origin_end, prot, + o_pages, page_shift); + KMSAN_WARN_ON(mapped); + kmsan_leave_runtime(); + flush_tlb_kernel_range(shadow_start, shadow_end); + flush_tlb_kernel_range(origin_start, origin_end); + flush_cache_vmap(shadow_start, shadow_end); + flush_cache_vmap(origin_start, origin_end); + +ret: + kfree(s_pages); + kfree(o_pages); +} diff --git a/mm/memory.c b/mm/memory.c index b3ed17219d77..118e5f023597 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include @@ -3136,6 +3137,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) delayacct_wpcopy_end(); return 0; } + kmsan_copy_page_meta(new_page, old_page); } if (mem_cgroup_charge(page_folio(new_page), mm, GFP_KERNEL)) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4e8ea824e765..1db1ac74ef14 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -1400,6 +1401,7 @@ static __always_inline bool free_pages_prepare(struct page *page, VM_BUG_ON_PAGE(PageTail(page), page); trace_mm_page_free(page, order); + kmsan_free_page(page, order); if (unlikely(PageHWPoison(page)) && !order) { /* @@ -3808,6 +3810,14 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, /* * Allocate a page from the given zone. Use pcplists for order-0 allocations. */ + +/* + * Do not instrument rmqueue() with KMSAN. This function may call + * __msan_poison_alloca() through a call to set_pfnblock_flags_mask(). + * If __msan_poison_alloca() attempts to allocate pages for the stack depot, it + * may call rmqueue() again, which will result in a deadlock. + */ +__no_sanitize_memory static inline struct page *rmqueue(struct zone *preferred_zone, struct zone *zone, unsigned int order, @@ -5560,6 +5570,7 @@ out: } trace_mm_page_alloc(page, order, alloc_gfp, ac.migratetype); + kmsan_alloc_page(page, order, alloc_gfp); return page; } diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a991b909866f..ccaa461998f3 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -320,6 +320,9 @@ int ioremap_page_range(unsigned long addr, unsigned long end, err = vmap_range_noflush(addr, end, phys_addr, pgprot_nx(prot), ioremap_max_page_shift); flush_cache_vmap(addr, end); + if (!err) + kmsan_ioremap_page_range(addr, end, phys_addr, prot, + ioremap_max_page_shift); return err; } @@ -416,7 +419,7 @@ static void vunmap_p4d_range(pgd_t *pgd, unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -void vunmap_range_noflush(unsigned long start, unsigned long end) +void __vunmap_range_noflush(unsigned long start, unsigned long end) { unsigned long next; pgd_t *pgd; @@ -438,6 +441,12 @@ void vunmap_range_noflush(unsigned long start, unsigned long end) arch_sync_kernel_mappings(start, end); } +void vunmap_range_noflush(unsigned long start, unsigned long end) +{ + kmsan_vunmap_range_noflush(start, end); + __vunmap_range_noflush(start, end); +} + /** * vunmap_range - unmap kernel virtual addresses * @addr: start of the VM area to unmap @@ -575,7 +584,7 @@ static int vmap_small_pages_range_noflush(unsigned long addr, unsigned long end, * * This is an internal function only. Do not use outside mm/. */ -int vmap_pages_range_noflush(unsigned long addr, unsigned long end, +int __vmap_pages_range_noflush(unsigned long addr, unsigned long end, pgprot_t prot, struct page **pages, unsigned int page_shift) { unsigned int i, nr = (end - addr) >> PAGE_SHIFT; @@ -601,6 +610,13 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, return 0; } +int vmap_pages_range_noflush(unsigned long addr, unsigned long end, + pgprot_t prot, struct page **pages, unsigned int page_shift) +{ + kmsan_vmap_pages_range_noflush(addr, end, prot, pages, page_shift); + return __vmap_pages_range_noflush(addr, end, prot, pages, page_shift); +} + /** * vmap_pages_range - map pages to a kernel virtual address * @addr: start of the VM area to map From 68ef169a1dd20df5cfa5a161b7304ad9fdd14c36 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:49 +0200 Subject: [PATCH 264/350] mm: kmsan: call KMSAN hooks from SLUB code In order to report uninitialized memory coming from heap allocations KMSAN has to poison them unless they're created with __GFP_ZERO. It's handy that we need KMSAN hooks in the places where init_on_alloc/init_on_free initialization is performed. In addition, we apply __no_kmsan_checks to get_freepointer_safe() to suppress reports when accessing freelist pointers that reside in freed objects. Link: https://lkml.kernel.org/r/20220915150417.722975-16-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 57 ++++++++++++++++++++++++++++++++ mm/kmsan/hooks.c | 76 +++++++++++++++++++++++++++++++++++++++++++ mm/slab.h | 1 + mm/slub.c | 17 ++++++++++ 4 files changed, 151 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index b36bf3db835e..5c4e0079054e 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -14,6 +14,7 @@ #include struct page; +struct kmem_cache; #ifdef CONFIG_KMSAN @@ -48,6 +49,44 @@ void kmsan_free_page(struct page *page, unsigned int order); */ void kmsan_copy_page_meta(struct page *dst, struct page *src); +/** + * kmsan_slab_alloc() - Notify KMSAN about a slab allocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * @flags: GFP flags passed to the allocator. + * + * Depending on cache flags and GFP flags, KMSAN sets up the metadata of the + * newly created object, marking it as initialized or uninitialized. + */ +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags); + +/** + * kmsan_slab_free() - Notify KMSAN about a slab deallocation. + * @s: slab cache the object belongs to. + * @object: object pointer. + * + * KMSAN marks the freed object as uninitialized. + */ +void kmsan_slab_free(struct kmem_cache *s, void *object); + +/** + * kmsan_kmalloc_large() - Notify KMSAN about a large slab allocation. + * @ptr: object pointer. + * @size: object size. + * @flags: GFP flags passed to the allocator. + * + * Similar to kmsan_slab_alloc(), but for large allocations. + */ +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags); + +/** + * kmsan_kfree_large() - Notify KMSAN about a large slab deallocation. + * @ptr: object pointer. + * + * Similar to kmsan_slab_free(), but for large allocations. + */ +void kmsan_kfree_large(const void *ptr); + /** * kmsan_map_kernel_range_noflush() - Notify KMSAN about a vmap. * @start: start of vmapped range. @@ -114,6 +153,24 @@ static inline void kmsan_copy_page_meta(struct page *dst, struct page *src) { } +static inline void kmsan_slab_alloc(struct kmem_cache *s, void *object, + gfp_t flags) +{ +} + +static inline void kmsan_slab_free(struct kmem_cache *s, void *object) +{ +} + +static inline void kmsan_kmalloc_large(const void *ptr, size_t size, + gfp_t flags) +{ +} + +static inline void kmsan_kfree_large(const void *ptr) +{ +} + static inline void kmsan_vmap_pages_range_noflush(unsigned long start, unsigned long end, pgprot_t prot, diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 040111bb9f6a..000703c563a4 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -27,6 +27,82 @@ * skipping effects of functions like memset() inside instrumented code. */ +void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) +{ + if (unlikely(object == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * There's a ctor or this is an RCU cache - do nothing. The memory + * status hasn't changed since last use. + */ + if (s->ctor || (s->flags & SLAB_TYPESAFE_BY_RCU)) + return; + + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory(object, s->object_size, + KMSAN_POISON_CHECK); + else + kmsan_internal_poison_memory(object, s->object_size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_slab_free(struct kmem_cache *s, void *object) +{ + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + /* RCU slabs could be legally used after free within the RCU period */ + if (unlikely(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON))) + return; + /* + * If there's a constructor, freed memory must remain in the same state + * until the next allocation. We cannot save its state to detect + * use-after-free bugs, instead we just keep it unpoisoned. + */ + if (s->ctor) + return; + kmsan_enter_runtime(); + kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + +void kmsan_kmalloc_large(const void *ptr, size_t size, gfp_t flags) +{ + if (unlikely(ptr == NULL)) + return; + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + if (flags & __GFP_ZERO) + kmsan_internal_unpoison_memory((void *)ptr, size, + /*checked*/ true); + else + kmsan_internal_poison_memory((void *)ptr, size, flags, + KMSAN_POISON_CHECK); + kmsan_leave_runtime(); +} + +void kmsan_kfree_large(const void *ptr) +{ + struct page *page; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + kmsan_enter_runtime(); + page = virt_to_head_page((void *)ptr); + KMSAN_WARN_ON(ptr != page_address(page)); + kmsan_internal_poison_memory((void *)ptr, + PAGE_SIZE << compound_order(page), + GFP_KERNEL, + KMSAN_POISON_CHECK | KMSAN_POISON_FREE); + kmsan_leave_runtime(); +} + static unsigned long vmalloc_shadow(unsigned long addr) { return (unsigned long)kmsan_get_metadata((void *)addr, diff --git a/mm/slab.h b/mm/slab.h index 4ec82bec15ec..9d0afd2985df 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -729,6 +729,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, memset(p[i], 0, s->object_size); kmemleak_alloc_recursive(p[i], s->object_size, 1, s->flags, flags); + kmsan_slab_alloc(s, p[i], flags); } memcg_slab_post_alloc_hook(s, objcg, flags, size, p); diff --git a/mm/slub.c b/mm/slub.c index 6953c3367bc2..ce8310e131b3 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -359,6 +360,17 @@ static void prefetch_freepointer(const struct kmem_cache *s, void *object) prefetchw(object + s->offset); } +/* + * When running under KMSAN, get_freepointer_safe() may return an uninitialized + * pointer value in the case the current thread loses the race for the next + * memory chunk in the freelist. In that case this_cpu_cmpxchg_double() in + * slab_alloc_node() will fail, so the uninitialized value won't be used, but + * KMSAN will still check all arguments of cmpxchg because of imperfect + * handling of inline assembly. + * To work around this problem, we apply __no_kmsan_checks to ensure that + * get_freepointer_safe() returns initialized memory. + */ +__no_kmsan_checks static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) { unsigned long freepointer_addr; @@ -1709,6 +1721,7 @@ static inline void *kmalloc_large_node_hook(void *ptr, size_t size, gfp_t flags) ptr = kasan_kmalloc_large(ptr, size, flags); /* As ptr might get tagged, call kmemleak hook after KASAN. */ kmemleak_alloc(ptr, size, 1, flags); + kmsan_kmalloc_large(ptr, size, flags); return ptr; } @@ -1716,12 +1729,14 @@ static __always_inline void kfree_hook(void *x) { kmemleak_free(x); kasan_kfree_large(x); + kmsan_kfree_large(x); } static __always_inline bool slab_free_hook(struct kmem_cache *s, void *x, bool init) { kmemleak_free_recursive(x, s->flags); + kmsan_slab_free(s, x); debug_check_no_locks_freed(x, s->object_size); @@ -5941,6 +5956,7 @@ static char *create_unique_id(struct kmem_cache *s) p += sprintf(p, "%07u", s->size); BUG_ON(p > name + ID_STR_LENGTH - 1); + kmsan_unpoison_memory(name, p - name); return name; } @@ -6042,6 +6058,7 @@ static int sysfs_slab_alias(struct kmem_cache *s, const char *name) al->name = name; al->next = alias_list; alias_list = al; + kmsan_unpoison_memory(al, sizeof(*al)); return 0; } From 50b5e49ca694a60f84a2a12d62b6cb6ec8e3649f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:50 +0200 Subject: [PATCH 265/350] kmsan: handle task creation and exiting Tell KMSAN that a new task is created, so the tool creates a backing metadata structure for that task. Link: https://lkml.kernel.org/r/20220915150417.722975-17-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 21 +++++++++++++++++++++ kernel/exit.c | 2 ++ kernel/fork.c | 2 ++ mm/kmsan/core.c | 10 ++++++++++ mm/kmsan/hooks.c | 17 +++++++++++++++++ mm/kmsan/kmsan.h | 2 ++ 6 files changed, 54 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 5c4e0079054e..354aee6f7b1a 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -15,9 +15,22 @@ struct page; struct kmem_cache; +struct task_struct; #ifdef CONFIG_KMSAN +/** + * kmsan_task_create() - Initialize KMSAN state for the task. + * @task: task to initialize. + */ +void kmsan_task_create(struct task_struct *task); + +/** + * kmsan_task_exit() - Notify KMSAN that a task has exited. + * @task: task about to finish. + */ +void kmsan_task_exit(struct task_struct *task); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). @@ -139,6 +152,14 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_task_create(struct task_struct *task) +{ +} + +static inline void kmsan_task_exit(struct task_struct *task) +{ +} + static inline int kmsan_alloc_page(struct page *page, unsigned int order, gfp_t flags) { diff --git a/kernel/exit.c b/kernel/exit.c index 98a33bd7c25c..1899d73bdfb7 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -60,6 +60,7 @@ #include #include #include +#include #include #include #include @@ -742,6 +743,7 @@ void __noreturn do_exit(long code) WARN_ON(tsk->plug); kcov_task_exit(tsk); + kmsan_task_exit(tsk); coredump_task_exit(tsk); ptrace_event(PTRACE_EVENT_EXIT, code); diff --git a/kernel/fork.c b/kernel/fork.c index 3d788f759e5f..3c2f8601b2b8 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include #include @@ -1023,6 +1024,7 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->worker_private = NULL; kcov_task_init(tsk); + kmsan_task_create(tsk); kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c index 5330138fda5b..112dce135c7f 100644 --- a/mm/kmsan/core.c +++ b/mm/kmsan/core.c @@ -37,6 +37,16 @@ bool kmsan_enabled __read_mostly; */ DEFINE_PER_CPU(struct kmsan_ctx, kmsan_percpu_ctx); +void kmsan_internal_task_create(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + struct thread_info *info = current_thread_info(); + + __memset(ctx, 0, sizeof(*ctx)); + ctx->allow_reporting = true; + kmsan_internal_unpoison_memory(info, sizeof(*info), false); +} + void kmsan_internal_poison_memory(void *address, size_t size, gfp_t flags, unsigned int poison_flags) { diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 000703c563a4..6f3e64b0b61f 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -27,6 +27,23 @@ * skipping effects of functions like memset() inside instrumented code. */ +void kmsan_task_create(struct task_struct *task) +{ + kmsan_enter_runtime(); + kmsan_internal_task_create(task); + kmsan_leave_runtime(); +} + +void kmsan_task_exit(struct task_struct *task) +{ + struct kmsan_ctx *ctx = &task->kmsan_ctx; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + + ctx->allow_reporting = false; +} + void kmsan_slab_alloc(struct kmem_cache *s, void *object, gfp_t flags) { if (unlikely(object == NULL)) diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 97d48b45dba5..77ee068c04ae 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -180,6 +180,8 @@ void kmsan_internal_set_shadow_origin(void *address, size_t size, int b, u32 origin, bool checked); depot_stack_handle_t kmsan_internal_chain_origin(depot_stack_handle_t id); +void kmsan_internal_task_create(struct task_struct *task); + bool kmsan_metadata_is_contiguous(void *addr, size_t size); void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, int reason); From 3c206509826094e85ead0b056f484db96829248d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:51 +0200 Subject: [PATCH 266/350] init: kmsan: call KMSAN initialization routines kmsan_init_shadow() scans the mappings created at boot time and creates metadata pages for those mappings. When the memblock allocator returns pages to pagealloc, we reserve 2/3 of those pages and use them as metadata for the remaining 1/3. Once KMSAN starts, every page allocated by pagealloc has its associated shadow and origin pages. kmsan_initialize() initializes the bookkeeping for init_task and enables KMSAN. Link: https://lkml.kernel.org/r/20220915150417.722975-18-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 36 +++++++ init/main.c | 3 + mm/kmsan/Makefile | 3 +- mm/kmsan/init.c | 235 ++++++++++++++++++++++++++++++++++++++++++ mm/kmsan/kmsan.h | 3 + mm/kmsan/shadow.c | 34 ++++++ mm/page_alloc.c | 4 + 7 files changed, 317 insertions(+), 1 deletion(-) create mode 100644 mm/kmsan/init.c diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index 354aee6f7b1a..e00de976ee43 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -31,6 +31,28 @@ void kmsan_task_create(struct task_struct *task); */ void kmsan_task_exit(struct task_struct *task); +/** + * kmsan_init_shadow() - Initialize KMSAN shadow at boot time. + * + * Allocate and initialize KMSAN metadata for early allocations. + */ +void __init kmsan_init_shadow(void); + +/** + * kmsan_init_runtime() - Initialize KMSAN state and enable KMSAN. + */ +void __init kmsan_init_runtime(void); + +/** + * kmsan_memblock_free_pages() - handle freeing of memblock pages. + * @page: struct page to free. + * @order: order of @page. + * + * Freed pages are either returned to buddy allocator or held back to be used + * as metadata pages. + */ +bool __init kmsan_memblock_free_pages(struct page *page, unsigned int order); + /** * kmsan_alloc_page() - Notify KMSAN about an alloc_pages() call. * @page: struct page pointer returned by alloc_pages(). @@ -152,6 +174,20 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); #else +static inline void kmsan_init_shadow(void) +{ +} + +static inline void kmsan_init_runtime(void) +{ +} + +static inline bool kmsan_memblock_free_pages(struct page *page, + unsigned int order) +{ + return true; +} + static inline void kmsan_task_create(struct task_struct *task) { } diff --git a/init/main.c b/init/main.c index eebe0cad4e37..93b000f2de8d 100644 --- a/init/main.c +++ b/init/main.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include #include @@ -837,6 +838,7 @@ static void __init mm_init(void) init_mem_debugging_and_hardening(); kfence_alloc_pool(); report_meminit(); + kmsan_init_shadow(); stack_depot_early_init(); mem_init(); mem_init_print_info(); @@ -857,6 +859,7 @@ static void __init mm_init(void) init_espfix_bsp(); /* Should be run after espfix64 is set up. */ pti_init(); + kmsan_init_runtime(); } #ifdef CONFIG_RANDOMIZE_KSTACK_OFFSET diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile index 550ad8625e4f..401acb1a491c 100644 --- a/mm/kmsan/Makefile +++ b/mm/kmsan/Makefile @@ -3,7 +3,7 @@ # Makefile for KernelMemorySanitizer (KMSAN). # # -obj-y := core.o instrumentation.o hooks.o report.o shadow.o +obj-y := core.o instrumentation.o init.o hooks.o report.o shadow.o KMSAN_SANITIZE := n KCOV_INSTRUMENT := n @@ -18,6 +18,7 @@ CFLAGS_REMOVE.o = $(CC_FLAGS_FTRACE) CFLAGS_core.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_hooks.o := $(CC_FLAGS_KMSAN_RUNTIME) +CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) diff --git a/mm/kmsan/init.c b/mm/kmsan/init.c new file mode 100644 index 000000000000..7fb794242fad --- /dev/null +++ b/mm/kmsan/init.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * KMSAN initialization routines. + * + * Copyright (C) 2017-2021 Google LLC + * Author: Alexander Potapenko + * + */ + +#include "kmsan.h" + +#include +#include +#include + +#include "../internal.h" + +#define NUM_FUTURE_RANGES 128 +struct start_end_pair { + u64 start, end; +}; + +static struct start_end_pair start_end_pairs[NUM_FUTURE_RANGES] __initdata; +static int future_index __initdata; + +/* + * Record a range of memory for which the metadata pages will be created once + * the page allocator becomes available. + */ +static void __init kmsan_record_future_shadow_range(void *start, void *end) +{ + u64 nstart = (u64)start, nend = (u64)end, cstart, cend; + bool merged = false; + + KMSAN_WARN_ON(future_index == NUM_FUTURE_RANGES); + KMSAN_WARN_ON((nstart >= nend) || !nstart || !nend); + nstart = ALIGN_DOWN(nstart, PAGE_SIZE); + nend = ALIGN(nend, PAGE_SIZE); + + /* + * Scan the existing ranges to see if any of them overlaps with + * [start, end). In that case, merge the two ranges instead of + * creating a new one. + * The number of ranges is less than 20, so there is no need to organize + * them into a more intelligent data structure. + */ + for (int i = 0; i < future_index; i++) { + cstart = start_end_pairs[i].start; + cend = start_end_pairs[i].end; + if ((cstart < nstart && cend < nstart) || + (cstart > nend && cend > nend)) + /* ranges are disjoint - do not merge */ + continue; + start_end_pairs[i].start = min(nstart, cstart); + start_end_pairs[i].end = max(nend, cend); + merged = true; + break; + } + if (merged) + return; + start_end_pairs[future_index].start = nstart; + start_end_pairs[future_index].end = nend; + future_index++; +} + +/* + * Initialize the shadow for existing mappings during kernel initialization. + * These include kernel text/data sections, NODE_DATA and future ranges + * registered while creating other data (e.g. percpu). + * + * Allocations via memblock can be only done before slab is initialized. + */ +void __init kmsan_init_shadow(void) +{ + const size_t nd_size = roundup(sizeof(pg_data_t), PAGE_SIZE); + phys_addr_t p_start, p_end; + u64 loop; + int nid; + + for_each_reserved_mem_range(loop, &p_start, &p_end) + kmsan_record_future_shadow_range(phys_to_virt(p_start), + phys_to_virt(p_end)); + /* Allocate shadow for .data */ + kmsan_record_future_shadow_range(_sdata, _edata); + + for_each_online_node(nid) + kmsan_record_future_shadow_range( + NODE_DATA(nid), (char *)NODE_DATA(nid) + nd_size); + + for (int i = 0; i < future_index; i++) + kmsan_init_alloc_meta_for_range( + (void *)start_end_pairs[i].start, + (void *)start_end_pairs[i].end); +} + +struct metadata_page_pair { + struct page *shadow, *origin; +}; +static struct metadata_page_pair held_back[MAX_ORDER] __initdata; + +/* + * Eager metadata allocation. When the memblock allocator is freeing pages to + * pagealloc, we use 2/3 of them as metadata for the remaining 1/3. + * We store the pointers to the returned blocks of pages in held_back[] grouped + * by their order: when kmsan_memblock_free_pages() is called for the first + * time with a certain order, it is reserved as a shadow block, for the second + * time - as an origin block. On the third time the incoming block receives its + * shadow and origin ranges from the previously saved shadow and origin blocks, + * after which held_back[order] can be used again. + * + * At the very end there may be leftover blocks in held_back[]. They are + * collected later by kmsan_memblock_discard(). + */ +bool kmsan_memblock_free_pages(struct page *page, unsigned int order) +{ + struct page *shadow, *origin; + + if (!held_back[order].shadow) { + held_back[order].shadow = page; + return false; + } + if (!held_back[order].origin) { + held_back[order].origin = page; + return false; + } + shadow = held_back[order].shadow; + origin = held_back[order].origin; + kmsan_setup_meta(page, shadow, origin, order); + + held_back[order].shadow = NULL; + held_back[order].origin = NULL; + return true; +} + +#define MAX_BLOCKS 8 +struct smallstack { + struct page *items[MAX_BLOCKS]; + int index; + int order; +}; + +static struct smallstack collect = { + .index = 0, + .order = MAX_ORDER, +}; + +static void smallstack_push(struct smallstack *stack, struct page *pages) +{ + KMSAN_WARN_ON(stack->index == MAX_BLOCKS); + stack->items[stack->index] = pages; + stack->index++; +} +#undef MAX_BLOCKS + +static struct page *smallstack_pop(struct smallstack *stack) +{ + struct page *ret; + + KMSAN_WARN_ON(stack->index == 0); + stack->index--; + ret = stack->items[stack->index]; + stack->items[stack->index] = NULL; + return ret; +} + +static void do_collection(void) +{ + struct page *page, *shadow, *origin; + + while (collect.index >= 3) { + page = smallstack_pop(&collect); + shadow = smallstack_pop(&collect); + origin = smallstack_pop(&collect); + kmsan_setup_meta(page, shadow, origin, collect.order); + __free_pages_core(page, collect.order); + } +} + +static void collect_split(void) +{ + struct smallstack tmp = { + .order = collect.order - 1, + .index = 0, + }; + struct page *page; + + if (!collect.order) + return; + while (collect.index) { + page = smallstack_pop(&collect); + smallstack_push(&tmp, &page[0]); + smallstack_push(&tmp, &page[1 << tmp.order]); + } + __memcpy(&collect, &tmp, sizeof(tmp)); +} + +/* + * Memblock is about to go away. Split the page blocks left over in held_back[] + * and return 1/3 of that memory to the system. + */ +static void kmsan_memblock_discard(void) +{ + /* + * For each order=N: + * - push held_back[N].shadow and .origin to @collect; + * - while there are >= 3 elements in @collect, do garbage collection: + * - pop 3 ranges from @collect; + * - use two of them as shadow and origin for the third one; + * - repeat; + * - split each remaining element from @collect into 2 ranges of + * order=N-1, + * - repeat. + */ + collect.order = MAX_ORDER - 1; + for (int i = MAX_ORDER - 1; i >= 0; i--) { + if (held_back[i].shadow) + smallstack_push(&collect, held_back[i].shadow); + if (held_back[i].origin) + smallstack_push(&collect, held_back[i].origin); + held_back[i].shadow = NULL; + held_back[i].origin = NULL; + do_collection(); + collect_split(); + } +} + +void __init kmsan_init_runtime(void) +{ + /* Assuming current is init_task */ + kmsan_internal_task_create(current); + kmsan_memblock_discard(); + pr_info("Starting KernelMemorySanitizer\n"); + pr_info("ATTENTION: KMSAN is a debugging tool! Do not use it on production machines!\n"); + kmsan_enabled = true; +} diff --git a/mm/kmsan/kmsan.h b/mm/kmsan/kmsan.h index 77ee068c04ae..7019c46d33a7 100644 --- a/mm/kmsan/kmsan.h +++ b/mm/kmsan/kmsan.h @@ -67,6 +67,7 @@ struct shadow_origin_ptr { struct shadow_origin_ptr kmsan_get_shadow_origin_ptr(void *addr, u64 size, bool store); void *kmsan_get_metadata(void *addr, bool is_origin); +void __init kmsan_init_alloc_meta_for_range(void *start, void *end); enum kmsan_bug_reason { REASON_ANY, @@ -187,6 +188,8 @@ void kmsan_internal_check_memory(void *addr, size_t size, const void *user_addr, int reason); struct page *kmsan_vmalloc_to_page_or_null(void *vaddr); +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order); /* * kmsan_internal_is_module_addr() and kmsan_internal_is_vmalloc_addr() are diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 8c81a059beea..6e90a806a704 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -258,3 +258,37 @@ ret: kfree(s_pages); kfree(o_pages); } + +/* Allocate metadata for pages allocated at boot time. */ +void __init kmsan_init_alloc_meta_for_range(void *start, void *end) +{ + struct page *shadow_p, *origin_p; + void *shadow, *origin; + struct page *page; + u64 size; + + start = (void *)ALIGN_DOWN((u64)start, PAGE_SIZE); + size = ALIGN((u64)end - (u64)start, PAGE_SIZE); + shadow = memblock_alloc(size, PAGE_SIZE); + origin = memblock_alloc(size, PAGE_SIZE); + for (u64 addr = 0; addr < size; addr += PAGE_SIZE) { + page = virt_to_page_or_null((char *)start + addr); + shadow_p = virt_to_page_or_null((char *)shadow + addr); + set_no_shadow_origin_page(shadow_p); + shadow_page_for(page) = shadow_p; + origin_p = virt_to_page_or_null((char *)origin + addr); + set_no_shadow_origin_page(origin_p); + origin_page_for(page) = origin_p; + } +} + +void kmsan_setup_meta(struct page *page, struct page *shadow, + struct page *origin, int order) +{ + for (int i = 0; i < (1 << order); i++) { + set_no_shadow_origin_page(&shadow[i]); + set_no_shadow_origin_page(&origin[i]); + shadow_page_for(&page[i]) = &shadow[i]; + origin_page_for(&page[i]) = &origin[i]; + } +} diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 1db1ac74ef14..118462ae6800 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1809,6 +1809,10 @@ void __init memblock_free_pages(struct page *page, unsigned long pfn, { if (early_page_uninitialised(pfn)) return; + if (!kmsan_memblock_free_pages(page, order)) { + /* KMSAN will take care of these pages. */ + return; + } __free_pages_core(page, order); } From 75cf0290271bf6dae9dee982aef15242dadf97e4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:52 +0200 Subject: [PATCH 267/350] instrumented.h: add KMSAN support To avoid false positives, KMSAN needs to unpoison the data copied from the userspace. To detect infoleaks - check the memory buffer passed to copy_to_user(). Link: https://lkml.kernel.org/r/20220915150417.722975-19-glider@google.com Signed-off-by: Alexander Potapenko Reviewed-by: Marco Elver Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/instrumented.h | 18 ++++++++++++----- include/linux/kmsan-checks.h | 19 ++++++++++++++++++ mm/kmsan/hooks.c | 38 ++++++++++++++++++++++++++++++++++++ 3 files changed, 70 insertions(+), 5 deletions(-) diff --git a/include/linux/instrumented.h b/include/linux/instrumented.h index 9f1dba8f717b..501fa8486749 100644 --- a/include/linux/instrumented.h +++ b/include/linux/instrumented.h @@ -2,7 +2,7 @@ /* * This header provides generic wrappers for memory access instrumentation that - * the compiler cannot emit for: KASAN, KCSAN. + * the compiler cannot emit for: KASAN, KCSAN, KMSAN. */ #ifndef _LINUX_INSTRUMENTED_H #define _LINUX_INSTRUMENTED_H @@ -10,6 +10,7 @@ #include #include #include +#include #include /** @@ -117,6 +118,7 @@ instrument_copy_to_user(void __user *to, const void *from, unsigned long n) { kasan_check_read(from, n); kcsan_check_read(from, n); + kmsan_copy_to_user(to, from, n, 0); } /** @@ -151,6 +153,7 @@ static __always_inline void instrument_copy_from_user_after(const void *to, const void __user *from, unsigned long n, unsigned long left) { + kmsan_unpoison_memory(to, n - left); } /** @@ -162,10 +165,14 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * * @to destination variable, may not be address-taken */ -#define instrument_get_user(to) \ -({ \ +#define instrument_get_user(to) \ +({ \ + u64 __tmp = (u64)(to); \ + kmsan_unpoison_memory(&__tmp, sizeof(__tmp)); \ + to = __tmp; \ }) + /** * instrument_put_user() - add instrumentation to put_user()-like macros * @@ -177,8 +184,9 @@ instrument_copy_from_user_after(const void *to, const void __user *from, * @ptr userspace pointer to copy to * @size number of bytes to copy */ -#define instrument_put_user(from, ptr, size) \ -({ \ +#define instrument_put_user(from, ptr, size) \ +({ \ + kmsan_copy_to_user(ptr, &from, sizeof(from), 0); \ }) #endif /* _LINUX_INSTRUMENTED_H */ diff --git a/include/linux/kmsan-checks.h b/include/linux/kmsan-checks.h index a6522a0c28df..c4cae333deec 100644 --- a/include/linux/kmsan-checks.h +++ b/include/linux/kmsan-checks.h @@ -46,6 +46,21 @@ void kmsan_unpoison_memory(const void *address, size_t size); */ void kmsan_check_memory(const void *address, size_t size); +/** + * kmsan_copy_to_user() - Notify KMSAN about a data transfer to userspace. + * @to: destination address in the userspace. + * @from: source address in the kernel. + * @to_copy: number of bytes to copy. + * @left: number of bytes not copied. + * + * If this is a real userspace data transfer, KMSAN checks the bytes that were + * actually copied to ensure there was no information leak. If @to belongs to + * the kernel space (which is possible for compat syscalls), KMSAN just copies + * the metadata. + */ +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left); + #else static inline void kmsan_poison_memory(const void *address, size_t size, @@ -58,6 +73,10 @@ static inline void kmsan_unpoison_memory(const void *address, size_t size) static inline void kmsan_check_memory(const void *address, size_t size) { } +static inline void kmsan_copy_to_user(void __user *to, const void *from, + size_t to_copy, size_t left) +{ +} #endif diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 6f3e64b0b61f..5c0eb25d984d 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -205,6 +205,44 @@ void kmsan_iounmap_page_range(unsigned long start, unsigned long end) kmsan_leave_runtime(); } +void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, + size_t left) +{ + unsigned long ua_flags; + + if (!kmsan_enabled || kmsan_in_runtime()) + return; + /* + * At this point we've copied the memory already. It's hard to check it + * before copying, as the size of actually copied buffer is unknown. + */ + + /* copy_to_user() may copy zero bytes. No need to check. */ + if (!to_copy) + return; + /* Or maybe copy_to_user() failed to copy anything. */ + if (to_copy <= left) + return; + + ua_flags = user_access_save(); + if ((u64)to < TASK_SIZE) { + /* This is a user memory access, check it. */ + kmsan_internal_check_memory((void *)from, to_copy - left, to, + REASON_COPY_TO_USER); + } else { + /* Otherwise this is a kernel memory access. This happens when a + * compat syscall passes an argument allocated on the kernel + * stack to a real syscall. + * Don't check anything, just copy the shadow of the copied + * bytes. + */ + kmsan_internal_memmove_metadata((void *)to, (void *)from, + to_copy - left); + } + user_access_restore(ua_flags); +} +EXPORT_SYMBOL(kmsan_copy_to_user); + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { From a28a4d4723c11fe5fd3e725f5eb1b3472e80fe12 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:53 +0200 Subject: [PATCH 268/350] kmsan: add iomap support Functions from lib/iomap.c interact with hardware, so KMSAN must ensure that: - every read function returns an initialized value - every write function checks values before sending them to hardware. Link: https://lkml.kernel.org/r/20220915150417.722975-20-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/iomap.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/lib/iomap.c b/lib/iomap.c index fbaa3e8f19d6..4f8b31baa575 100644 --- a/lib/iomap.c +++ b/lib/iomap.c @@ -6,6 +6,7 @@ */ #include #include +#include #include @@ -70,26 +71,35 @@ static void bad_io_access(unsigned long port, const char *access) #define mmio_read64be(addr) swab64(readq(addr)) #endif +/* + * Here and below, we apply __no_kmsan_checks to functions reading data from + * hardware, to ensure that KMSAN marks their return values as initialized. + */ +__no_kmsan_checks unsigned int ioread8(const void __iomem *addr) { IO_COND(addr, return inb(port), return readb(addr)); return 0xff; } +__no_kmsan_checks unsigned int ioread16(const void __iomem *addr) { IO_COND(addr, return inw(port), return readw(addr)); return 0xffff; } +__no_kmsan_checks unsigned int ioread16be(const void __iomem *addr) { IO_COND(addr, return pio_read16be(port), return mmio_read16be(addr)); return 0xffff; } +__no_kmsan_checks unsigned int ioread32(const void __iomem *addr) { IO_COND(addr, return inl(port), return readl(addr)); return 0xffffffff; } +__no_kmsan_checks unsigned int ioread32be(const void __iomem *addr) { IO_COND(addr, return pio_read32be(port), return mmio_read32be(addr)); @@ -142,18 +152,21 @@ static u64 pio_read64be_hi_lo(unsigned long port) return lo | (hi << 32); } +__no_kmsan_checks u64 ioread64_lo_hi(const void __iomem *addr) { IO_COND(addr, return pio_read64_lo_hi(port), return readq(addr)); return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64_hi_lo(const void __iomem *addr) { IO_COND(addr, return pio_read64_hi_lo(port), return readq(addr)); return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64be_lo_hi(const void __iomem *addr) { IO_COND(addr, return pio_read64be_lo_hi(port), @@ -161,6 +174,7 @@ u64 ioread64be_lo_hi(const void __iomem *addr) return 0xffffffffffffffffULL; } +__no_kmsan_checks u64 ioread64be_hi_lo(const void __iomem *addr) { IO_COND(addr, return pio_read64be_hi_lo(port), @@ -188,22 +202,32 @@ EXPORT_SYMBOL(ioread64be_hi_lo); void iowrite8(u8 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outb(val,port), writeb(val, addr)); } void iowrite16(u16 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outw(val,port), writew(val, addr)); } void iowrite16be(u16 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write16be(val,port), mmio_write16be(val, addr)); } void iowrite32(u32 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, outl(val,port), writel(val, addr)); } void iowrite32be(u32 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write32be(val,port), mmio_write32be(val, addr)); } EXPORT_SYMBOL(iowrite8); @@ -239,24 +263,32 @@ static void pio_write64be_hi_lo(u64 val, unsigned long port) void iowrite64_lo_hi(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64_lo_hi(val, port), writeq(val, addr)); } void iowrite64_hi_lo(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64_hi_lo(val, port), writeq(val, addr)); } void iowrite64be_lo_hi(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64be_lo_hi(val, port), mmio_write64be(val, addr)); } void iowrite64be_hi_lo(u64 val, void __iomem *addr) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(&val, sizeof(val)); IO_COND(addr, pio_write64be_hi_lo(val, port), mmio_write64be(val, addr)); } @@ -328,14 +360,20 @@ static inline void mmio_outsl(void __iomem *addr, const u32 *src, int count) void ioread8_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insb(port,dst,count), mmio_insb(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count); } void ioread16_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insw(port,dst,count), mmio_insw(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count * 2); } void ioread32_rep(const void __iomem *addr, void *dst, unsigned long count) { IO_COND(addr, insl(port,dst,count), mmio_insl(addr, dst, count)); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(dst, count * 4); } EXPORT_SYMBOL(ioread8_rep); EXPORT_SYMBOL(ioread16_rep); @@ -343,14 +381,20 @@ EXPORT_SYMBOL(ioread32_rep); void iowrite8_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(src, count); IO_COND(addr, outsb(port, src, count), mmio_outsb(addr, src, count)); } void iowrite16_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(src, count * 2); IO_COND(addr, outsw(port, src, count), mmio_outsw(addr, src, count)); } void iowrite32_rep(void __iomem *addr, const void *src, unsigned long count) { + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(src, count * 4); IO_COND(addr, outsl(port, src,count), mmio_outsl(addr, src, count)); } EXPORT_SYMBOL(iowrite8_rep); From 38317724f6a85572af373229a27e214d5282ddf8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:54 +0200 Subject: [PATCH 269/350] input: libps2: mark data received in __ps2_command() as initialized KMSAN does not know that the device initializes certain bytes in ps2dev->cmdbuf. Call kmsan_unpoison_memory() to explicitly mark them as initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-21-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/input/serio/libps2.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/input/serio/libps2.c b/drivers/input/serio/libps2.c index 250e213cc80c..3e19344eda93 100644 --- a/drivers/input/serio/libps2.c +++ b/drivers/input/serio/libps2.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -294,9 +295,11 @@ int __ps2_command(struct ps2dev *ps2dev, u8 *param, unsigned int command) serio_pause_rx(ps2dev->serio); - if (param) + if (param) { for (i = 0; i < receive; i++) param[i] = ps2dev->cmdbuf[(receive - 1) - i]; + kmsan_unpoison_memory(param, receive); + } if (ps2dev->cmdcnt && (command != PS2_CMD_RESET_BAT || ps2dev->cmdcnt != 1)) { From 7ade4f10779cb46f5c29ced9b7a41f68501cf0ed Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:55 +0200 Subject: [PATCH 270/350] dma: kmsan: unpoison DMA mappings KMSAN doesn't know about DMA memory writes performed by devices. We unpoison such memory when it's mapped to avoid false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-22-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 41 ++++++++++++++++++++++++++++++ kernel/dma/mapping.c | 10 +++++--- mm/kmsan/hooks.c | 59 +++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 107 insertions(+), 3 deletions(-) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index e00de976ee43..dac296da45c5 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -9,6 +9,7 @@ #ifndef _LINUX_KMSAN_H #define _LINUX_KMSAN_H +#include #include #include #include @@ -16,6 +17,7 @@ struct page; struct kmem_cache; struct task_struct; +struct scatterlist; #ifdef CONFIG_KMSAN @@ -172,6 +174,35 @@ void kmsan_ioremap_page_range(unsigned long addr, unsigned long end, */ void kmsan_iounmap_page_range(unsigned long start, unsigned long end); +/** + * kmsan_handle_dma() - Handle a DMA data transfer. + * @page: first page of the buffer. + * @offset: offset of the buffer within the first page. + * @size: buffer size. + * @dir: one of possible dma_data_direction values. + * + * Depending on @direction, KMSAN: + * * checks the buffer, if it is copied to device; + * * initializes the buffer, if it is copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. + */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir); + +/** + * kmsan_handle_dma_sg() - Handle a DMA transfer using scatterlist. + * @sg: scatterlist holding DMA buffers. + * @nents: number of scatterlist entries. + * @dir: one of possible dma_data_direction values. + * + * Depending on @direction, KMSAN: + * * checks the buffers in the scatterlist, if they are copied to device; + * * initializes the buffers, if they are copied from device; + * * does both, if this is a DMA_BIDIRECTIONAL transfer. + */ +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir); + #else static inline void kmsan_init_shadow(void) @@ -254,6 +285,16 @@ static inline void kmsan_iounmap_page_range(unsigned long start, { } +static inline void kmsan_handle_dma(struct page *page, size_t offset, + size_t size, enum dma_data_direction dir) +{ +} + +static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/kernel/dma/mapping.c b/kernel/dma/mapping.c index 49cbf3e33de7..a8400aa9bcd4 100644 --- a/kernel/dma/mapping.c +++ b/kernel/dma/mapping.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -156,6 +157,7 @@ dma_addr_t dma_map_page_attrs(struct device *dev, struct page *page, addr = dma_direct_map_page(dev, page, offset, size, dir, attrs); else addr = ops->map_page(dev, page, offset, size, dir, attrs); + kmsan_handle_dma(page, offset, size, dir); debug_dma_map_page(dev, page, offset, size, dir, addr, attrs); return addr; @@ -194,11 +196,13 @@ static int __dma_map_sg_attrs(struct device *dev, struct scatterlist *sg, else ents = ops->map_sg(dev, sg, nents, dir, attrs); - if (ents > 0) + if (ents > 0) { + kmsan_handle_dma_sg(sg, nents, dir); debug_dma_map_sg(dev, sg, nents, ents, dir, attrs); - else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && - ents != -EIO && ents != -EREMOTEIO)) + } else if (WARN_ON_ONCE(ents != -EINVAL && ents != -ENOMEM && + ents != -EIO && ents != -EREMOTEIO)) { return -EIO; + } return ents; } diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 5c0eb25d984d..563c09443a37 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -10,10 +10,12 @@ */ #include +#include #include #include #include #include +#include #include #include @@ -243,6 +245,63 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +static void kmsan_handle_dma_page(const void *addr, size_t size, + enum dma_data_direction dir) +{ + switch (dir) { + case DMA_BIDIRECTIONAL: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_TO_DEVICE: + kmsan_internal_check_memory((void *)addr, size, /*user_addr*/ 0, + REASON_ANY); + break; + case DMA_FROM_DEVICE: + kmsan_internal_unpoison_memory((void *)addr, size, + /*checked*/ false); + break; + case DMA_NONE: + break; + } +} + +/* Helper function to handle DMA data transfers. */ +void kmsan_handle_dma(struct page *page, size_t offset, size_t size, + enum dma_data_direction dir) +{ + u64 page_offset, to_go, addr; + + if (PageHighMem(page)) + return; + addr = (u64)page_address(page) + offset; + /* + * The kernel may occasionally give us adjacent DMA pages not belonging + * to the same allocation. Process them separately to avoid triggering + * internal KMSAN checks. + */ + while (size > 0) { + page_offset = addr % PAGE_SIZE; + to_go = min(PAGE_SIZE - page_offset, (u64)size); + kmsan_handle_dma_page((void *)addr, to_go, dir); + addr += to_go; + size -= to_go; + } +} + +void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, + enum dma_data_direction dir) +{ + struct scatterlist *item; + int i; + + for_each_sg(sg, item, nents, i) + kmsan_handle_dma(sg_page(item), item->offset, item->length, + dir); +} + /* Functions from kmsan-checks.h follow. */ void kmsan_poison_memory(const void *address, size_t size, gfp_t flags) { From 88938359e2dfe1f5f5840268b98935948db8fbd9 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:56 +0200 Subject: [PATCH 271/350] virtio: kmsan: check/unpoison scatterlist in vring_map_one_sg() If vring doesn't use the DMA API, KMSAN is unable to tell whether the memory is initialized by hardware. Explicitly call kmsan_handle_dma() from vring_map_one_sg() in this case to prevent false positives. Link: https://lkml.kernel.org/r/20220915150417.722975-23-glider@google.com Signed-off-by: Alexander Potapenko Acked-by: Michael S. Tsirkin Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/virtio/virtio_ring.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index 4620e9d79dde..8974c34b40fd 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include @@ -352,8 +353,15 @@ static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq, struct scatterlist *sg, enum dma_data_direction direction) { - if (!vq->use_dma_api) + if (!vq->use_dma_api) { + /* + * If DMA is not used, KMSAN doesn't know that the scatterlist + * is initialized by the hardware. Explicitly check/unpoison it + * depending on the direction. + */ + kmsan_handle_dma(sg_page(sg), sg->offset, sg->length, direction); return (dma_addr_t)sg_phys(sg); + } /* * We can't use dma_map_sg, because we don't use scatterlists in From 553a80188a5d7164d2b0688b06bf3fe297023bfe Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:57 +0200 Subject: [PATCH 272/350] kmsan: handle memory sent to/from USB Depending on the value of is_out kmsan_handle_urb() KMSAN either marks the data copied to the kernel from a USB device as initialized, or checks the data sent to the device for being initialized. Link: https://lkml.kernel.org/r/20220915150417.722975-24-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- drivers/usb/core/urb.c | 2 ++ include/linux/kmsan.h | 15 +++++++++++++++ mm/kmsan/hooks.c | 16 ++++++++++++++++ 3 files changed, 33 insertions(+) diff --git a/drivers/usb/core/urb.c b/drivers/usb/core/urb.c index 33d62d7e3929..9f3c54032556 100644 --- a/drivers/usb/core/urb.c +++ b/drivers/usb/core/urb.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -426,6 +427,7 @@ int usb_submit_urb(struct urb *urb, gfp_t mem_flags) URB_SETUP_MAP_SINGLE | URB_SETUP_MAP_LOCAL | URB_DMA_SG_COMBINED); urb->transfer_flags |= (is_out ? URB_DIR_OUT : URB_DIR_IN); + kmsan_handle_urb(urb, is_out); if (xfertype != USB_ENDPOINT_XFER_CONTROL && dev->state < USB_STATE_CONFIGURED) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index dac296da45c5..c473e0e21683 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -18,6 +18,7 @@ struct page; struct kmem_cache; struct task_struct; struct scatterlist; +struct urb; #ifdef CONFIG_KMSAN @@ -203,6 +204,16 @@ void kmsan_handle_dma(struct page *page, size_t offset, size_t size, void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, enum dma_data_direction dir); +/** + * kmsan_handle_urb() - Handle a USB data transfer. + * @urb: struct urb pointer. + * @is_out: data transfer direction (true means output to hardware). + * + * If @is_out is true, KMSAN checks the transfer buffer of @urb. Otherwise, + * KMSAN initializes the transfer buffer. + */ +void kmsan_handle_urb(const struct urb *urb, bool is_out); + #else static inline void kmsan_init_shadow(void) @@ -295,6 +306,10 @@ static inline void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, { } +static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 563c09443a37..79d7e73e2cfd 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "../internal.h" #include "../slab.h" @@ -245,6 +246,21 @@ void kmsan_copy_to_user(void __user *to, const void *from, size_t to_copy, } EXPORT_SYMBOL(kmsan_copy_to_user); +/* Helper function to check an URB. */ +void kmsan_handle_urb(const struct urb *urb, bool is_out) +{ + if (!urb) + return; + if (is_out) + kmsan_internal_check_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*user_addr*/ 0, REASON_SUBMIT_URB); + else + kmsan_internal_unpoison_memory(urb->transfer_buffer, + urb->transfer_buffer_length, + /*checked*/ false); +} + static void kmsan_handle_dma_page(const void *addr, size_t size, enum dma_data_direction dir) { From 8ed691b02ade8f755f34aa1fa8beff8ce4f81f6d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:58 +0200 Subject: [PATCH 273/350] kmsan: add tests for KMSAN The testing module triggers KMSAN warnings in different cases and checks that the errors are properly reported, using console probes to capture the tool's output. Link: https://lkml.kernel.org/r/20220915150417.722975-25-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/Kconfig.kmsan | 12 + mm/kmsan/Makefile | 4 + mm/kmsan/kmsan_test.c | 581 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 597 insertions(+) create mode 100644 mm/kmsan/kmsan_test.c diff --git a/lib/Kconfig.kmsan b/lib/Kconfig.kmsan index 5b19dbd34d76..b2489dd6503f 100644 --- a/lib/Kconfig.kmsan +++ b/lib/Kconfig.kmsan @@ -47,4 +47,16 @@ config KMSAN_CHECK_PARAM_RETVAL may potentially report errors in corner cases when non-instrumented functions call instrumented ones. +config KMSAN_KUNIT_TEST + tristate "KMSAN integration test suite" if !KUNIT_ALL_TESTS + default KUNIT_ALL_TESTS + depends on TRACEPOINTS && KUNIT + help + Test suite for KMSAN, testing various error detection scenarios, + and checking that reports are correctly output to console. + + Say Y here if you want the test to be built into the kernel and run + during boot; say M if you want the test to build as a module; say N + if you are unsure. + endif diff --git a/mm/kmsan/Makefile b/mm/kmsan/Makefile index 401acb1a491c..98eab2856626 100644 --- a/mm/kmsan/Makefile +++ b/mm/kmsan/Makefile @@ -22,3 +22,7 @@ CFLAGS_init.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_instrumentation.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_report.o := $(CC_FLAGS_KMSAN_RUNTIME) CFLAGS_shadow.o := $(CC_FLAGS_KMSAN_RUNTIME) + +obj-$(CONFIG_KMSAN_KUNIT_TEST) += kmsan_test.o +KMSAN_SANITIZE_kmsan_test.o := y +CFLAGS_kmsan_test.o += $(call cc-disable-warning, uninitialized) diff --git a/mm/kmsan/kmsan_test.c b/mm/kmsan/kmsan_test.c new file mode 100644 index 000000000000..9a29ea2dbfb9 --- /dev/null +++ b/mm/kmsan/kmsan_test.c @@ -0,0 +1,581 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Test cases for KMSAN. + * For each test case checks the presence (or absence) of generated reports. + * Relies on 'console' tracepoint to capture reports as they appear in the + * kernel log. + * + * Copyright (C) 2021-2022, Google LLC. + * Author: Alexander Potapenko + * + */ + +#include +#include "kmsan.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static DEFINE_PER_CPU(int, per_cpu_var); + +/* Report as observed from console. */ +static struct { + spinlock_t lock; + bool available; + bool ignore; /* Stop console output collection. */ + char header[256]; +} observed = { + .lock = __SPIN_LOCK_UNLOCKED(observed.lock), +}; + +/* Probe for console output: obtains observed lines of interest. */ +static void probe_console(void *ignore, const char *buf, size_t len) +{ + unsigned long flags; + + if (observed.ignore) + return; + spin_lock_irqsave(&observed.lock, flags); + + if (strnstr(buf, "BUG: KMSAN: ", len)) { + /* + * KMSAN report and related to the test. + * + * The provided @buf is not NUL-terminated; copy no more than + * @len bytes and let strscpy() add the missing NUL-terminator. + */ + strscpy(observed.header, buf, + min(len + 1, sizeof(observed.header))); + WRITE_ONCE(observed.available, true); + observed.ignore = true; + } + spin_unlock_irqrestore(&observed.lock, flags); +} + +/* Check if a report related to the test exists. */ +static bool report_available(void) +{ + return READ_ONCE(observed.available); +} + +/* Information we expect in a report. */ +struct expect_report { + const char *error_type; /* Error type. */ + /* + * Kernel symbol from the error header, or NULL if no report is + * expected. + */ + const char *symbol; +}; + +/* Check observed report matches information in @r. */ +static bool report_matches(const struct expect_report *r) +{ + typeof(observed.header) expected_header; + unsigned long flags; + bool ret = false; + const char *end; + char *cur; + + /* Doubled-checked locking. */ + if (!report_available() || !r->symbol) + return (!report_available() && !r->symbol); + + /* Generate expected report contents. */ + + /* Title */ + cur = expected_header; + end = &expected_header[sizeof(expected_header) - 1]; + + cur += scnprintf(cur, end - cur, "BUG: KMSAN: %s", r->error_type); + + scnprintf(cur, end - cur, " in %s", r->symbol); + /* The exact offset won't match, remove it; also strip module name. */ + cur = strchr(expected_header, '+'); + if (cur) + *cur = '\0'; + + spin_lock_irqsave(&observed.lock, flags); + if (!report_available()) + goto out; /* A new report is being captured. */ + + /* Finally match expected output to what we actually observed. */ + ret = strstr(observed.header, expected_header); +out: + spin_unlock_irqrestore(&observed.lock, flags); + + return ret; +} + +/* ===== Test cases ===== */ + +/* Prevent replacing branch with select in LLVM. */ +static noinline void check_true(char *arg) +{ + pr_info("%s is true\n", arg); +} + +static noinline void check_false(char *arg) +{ + pr_info("%s is false\n", arg); +} + +#define USE(x) \ + do { \ + if (x) \ + check_true(#x); \ + else \ + check_false(#x); \ + } while (0) + +#define EXPECTATION_ETYPE_FN(e, reason, fn) \ + struct expect_report e = { \ + .error_type = reason, \ + .symbol = fn, \ + } + +#define EXPECTATION_NO_REPORT(e) EXPECTATION_ETYPE_FN(e, NULL, NULL) +#define EXPECTATION_UNINIT_VALUE_FN(e, fn) \ + EXPECTATION_ETYPE_FN(e, "uninit-value", fn) +#define EXPECTATION_UNINIT_VALUE(e) EXPECTATION_UNINIT_VALUE_FN(e, __func__) +#define EXPECTATION_USE_AFTER_FREE(e) \ + EXPECTATION_ETYPE_FN(e, "use-after-free", __func__) + +/* Test case: ensure that kmalloc() returns uninitialized memory. */ +static void test_uninit_kmalloc(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + int *ptr; + + kunit_info(test, "uninitialized kmalloc test (UMR report)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that kmalloc'ed memory becomes initialized after memset(). + */ +static void test_init_kmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kmalloc test (no reports)\n"); + ptr = kmalloc(sizeof(*ptr), GFP_KERNEL); + memset(ptr, 0, sizeof(*ptr)); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that kzalloc() returns initialized memory. */ +static void test_init_kzalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int *ptr; + + kunit_info(test, "initialized kzalloc test (no reports)\n"); + ptr = kzalloc(sizeof(*ptr), GFP_KERNEL); + USE(*ptr); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables are uninitialized by default. */ +static void test_uninit_stack_var(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int cond; + + kunit_info(test, "uninitialized stack variable (UMR report)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that local variables with initializers are initialized. */ +static void test_init_stack_var(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + volatile int cond = 1; + + kunit_info(test, "initialized stack variable (no reports)\n"); + USE(cond); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void two_param_fn_2(int arg1, int arg2) +{ + USE(arg1); + USE(arg2); +} + +static noinline void one_param_fn(int arg) +{ + two_param_fn_2(arg, arg); + USE(arg); +} + +static noinline void two_param_fn(int arg1, int arg2) +{ + int init = 0; + + one_param_fn(init); + USE(arg1); + USE(arg2); +} + +static void test_params(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to two_param_fn(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_params"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "two_param_fn"); +#endif + volatile int uninit, init = 1; + + kunit_info(test, + "uninit passed through a function parameter (UMR report)\n"); + two_param_fn(uninit, init); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static int signed_sum3(int a, int b, int c) +{ + return a + b + c; +} + +/* + * Test case: ensure that uninitialized values are tracked through function + * arguments. + */ +static void test_uninit_multiple_params(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile char b = 3, c; + volatile int a; + + kunit_info(test, "uninitialized local passed to fn (UMR report)\n"); + USE(signed_sum3(a, b, c)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Helper function to make an array uninitialized. */ +static noinline void do_uninit_local_array(char *array, int start, int stop) +{ + volatile char uninit; + + for (int i = start; i < stop; i++) + array[i] = uninit; +} + +/* + * Test case: ensure kmsan_check_memory() reports an error when checking + * uninitialized memory. + */ +static void test_uninit_kmsan_check_memory(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_uninit_kmsan_check_memory"); + volatile char local_array[8]; + + kunit_info( + test, + "kmsan_check_memory() called on uninit local (UMR report)\n"); + do_uninit_local_array((char *)local_array, 5, 7); + + kmsan_check_memory((char *)local_array, 8); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: check that a virtual memory range created with vmap() from + * initialized pages is still considered as initialized. + */ +static void test_init_kmsan_vmap_vunmap(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + const int npages = 2; + struct page **pages; + void *vbuf; + + kunit_info(test, "pages initialized via vmap (no reports)\n"); + + pages = kmalloc_array(npages, sizeof(*pages), GFP_KERNEL); + for (int i = 0; i < npages; i++) + pages[i] = alloc_page(GFP_KERNEL); + vbuf = vmap(pages, npages, VM_MAP, PAGE_KERNEL); + memset(vbuf, 0xfe, npages * PAGE_SIZE); + for (int i = 0; i < npages; i++) + kmsan_check_memory(page_address(pages[i]), PAGE_SIZE); + + if (vbuf) + vunmap(vbuf); + for (int i = 0; i < npages; i++) { + if (pages[i]) + __free_page(pages[i]); + } + kfree(pages); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memset() can initialize a buffer allocated via + * vmalloc(). + */ +static void test_init_vmalloc(struct kunit *test) +{ + EXPECTATION_NO_REPORT(expect); + int npages = 8; + char *buf; + + kunit_info(test, "vmalloc buffer can be initialized (no reports)\n"); + buf = vmalloc(PAGE_SIZE * npages); + buf[0] = 1; + memset(buf, 0xfe, PAGE_SIZE * npages); + USE(buf[0]); + for (int i = 0; i < npages; i++) + kmsan_check_memory(&buf[PAGE_SIZE * i], PAGE_SIZE); + vfree(buf); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* Test case: ensure that use-after-free reporting works. */ +static void test_uaf(struct kunit *test) +{ + EXPECTATION_USE_AFTER_FREE(expect); + volatile int value; + volatile int *var; + + kunit_info(test, "use-after-free in kmalloc-ed buffer (UMR report)\n"); + var = kmalloc(80, GFP_KERNEL); + var[3] = 0xfeedface; + kfree((int *)var); + /* Copy the invalid value before checking it. */ + value = var[3]; + USE(value); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that uninitialized values are propagated through per-CPU + * memory. + */ +static void test_percpu_propagate(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE(expect); + volatile int uninit, check; + + kunit_info(test, + "uninit local stored to per_cpu memory (UMR report)\n"); + + this_cpu_write(per_cpu_var, uninit); + check = this_cpu_read(per_cpu_var); + USE(check); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that passing uninitialized values to printk() leads to an + * error report. + */ +static void test_printk(struct kunit *test) +{ +#ifdef CONFIG_KMSAN_CHECK_PARAM_RETVAL + /* + * With eager param/retval checking enabled, KMSAN will report an error + * before the call to pr_info(). + */ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_printk"); +#else + EXPECTATION_UNINIT_VALUE_FN(expect, "number"); +#endif + volatile int uninit; + + kunit_info(test, "uninit local passed to pr_info() (UMR report)\n"); + pr_info("%px contains %d\n", &uninit, uninit); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and `dst`. + */ +static void test_memcpy_aligned_to_aligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_aligned"); + volatile int uninit_src; + volatile int dst = 0; + + kunit_info( + test, + "memcpy()ing aligned uninit src to aligned dst (UMR report)\n"); + memcpy((void *)&dst, (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst, sizeof(dst)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the first of the two values. + */ +static void test_memcpy_aligned_to_unaligned(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, "test_memcpy_aligned_to_unaligned"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst (UMR report)\n"); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)dst, 4); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +/* + * Test case: ensure that memcpy() correctly copies uninitialized values between + * aligned `src` and unaligned `dst`. + * + * Copying aligned 4-byte value to an unaligned one leads to touching two + * aligned 4-byte values. This test case checks that KMSAN correctly reports an + * error on the second of the two values. + */ +static void test_memcpy_aligned_to_unaligned2(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_memcpy_aligned_to_unaligned2"); + volatile int uninit_src; + volatile char dst[8] = { 0 }; + + kunit_info( + test, + "memcpy()ing aligned uninit src to unaligned dst - part 2 (UMR report)\n"); + memcpy((void *)&dst[1], (void *)&uninit_src, sizeof(uninit_src)); + kmsan_check_memory((void *)&dst[4], sizeof(uninit_src)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static noinline void fibonacci(int *array, int size, int start) { + if (start < 2 || (start == size)) + return; + array[start] = array[start - 1] + array[start - 2]; + fibonacci(array, size, start + 1); +} + +static void test_long_origin_chain(struct kunit *test) +{ + EXPECTATION_UNINIT_VALUE_FN(expect, + "test_long_origin_chain"); + /* (KMSAN_MAX_ORIGIN_DEPTH * 2) recursive calls to fibonacci(). */ + volatile int accum[KMSAN_MAX_ORIGIN_DEPTH * 2 + 2]; + int last = ARRAY_SIZE(accum) - 1; + + kunit_info( + test, + "origin chain exceeding KMSAN_MAX_ORIGIN_DEPTH (UMR report)\n"); + /* + * We do not set accum[1] to 0, so the uninitializedness will be carried + * over to accum[2..last]. + */ + accum[0] = 1; + fibonacci((int *)accum, ARRAY_SIZE(accum), 2); + kmsan_check_memory((void *)&accum[last], sizeof(int)); + KUNIT_EXPECT_TRUE(test, report_matches(&expect)); +} + +static struct kunit_case kmsan_test_cases[] = { + KUNIT_CASE(test_uninit_kmalloc), + KUNIT_CASE(test_init_kmalloc), + KUNIT_CASE(test_init_kzalloc), + KUNIT_CASE(test_uninit_stack_var), + KUNIT_CASE(test_init_stack_var), + KUNIT_CASE(test_params), + KUNIT_CASE(test_uninit_multiple_params), + KUNIT_CASE(test_uninit_kmsan_check_memory), + KUNIT_CASE(test_init_kmsan_vmap_vunmap), + KUNIT_CASE(test_init_vmalloc), + KUNIT_CASE(test_uaf), + KUNIT_CASE(test_percpu_propagate), + KUNIT_CASE(test_printk), + KUNIT_CASE(test_memcpy_aligned_to_aligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned), + KUNIT_CASE(test_memcpy_aligned_to_unaligned2), + KUNIT_CASE(test_long_origin_chain), + {}, +}; + +/* ===== End test cases ===== */ + +static int test_init(struct kunit *test) +{ + unsigned long flags; + + spin_lock_irqsave(&observed.lock, flags); + observed.header[0] = '\0'; + observed.ignore = false; + observed.available = false; + spin_unlock_irqrestore(&observed.lock, flags); + + return 0; +} + +static void test_exit(struct kunit *test) +{ +} + +static void register_tracepoints(struct tracepoint *tp, void *ignore) +{ + check_trace_callback_type_console(probe_console); + if (!strcmp(tp->name, "console")) + WARN_ON(tracepoint_probe_register(tp, probe_console, NULL)); +} + +static void unregister_tracepoints(struct tracepoint *tp, void *ignore) +{ + if (!strcmp(tp->name, "console")) + tracepoint_probe_unregister(tp, probe_console, NULL); +} + +static int kmsan_suite_init(struct kunit_suite *suite) +{ + /* + * Because we want to be able to build the test as a module, we need to + * iterate through all known tracepoints, since the static registration + * won't work here. + */ + for_each_kernel_tracepoint(register_tracepoints, NULL); + return 0; +} + +static void kmsan_suite_exit(struct kunit_suite *suite) +{ + for_each_kernel_tracepoint(unregister_tracepoints, NULL); + tracepoint_synchronize_unregister(); +} + +static struct kunit_suite kmsan_test_suite = { + .name = "kmsan", + .test_cases = kmsan_test_cases, + .init = test_init, + .exit = test_exit, + .suite_init = kmsan_suite_init, + .suite_exit = kmsan_suite_exit, +}; +kunit_test_suites(&kmsan_test_suite); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alexander Potapenko "); From 2de6f3bf75058e35eff04e6fab7ca41533bdb027 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:03:59 +0200 Subject: [PATCH 274/350] kmsan: disable strscpy() optimization under KMSAN Disable the efficient 8-byte reading under KMSAN to avoid false positives. Link: https://lkml.kernel.org/r/20220915150417.722975-26-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- lib/string.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/lib/string.c b/lib/string.c index 6f334420f687..3371d26a0e39 100644 --- a/lib/string.c +++ b/lib/string.c @@ -197,6 +197,14 @@ ssize_t strscpy(char *dest, const char *src, size_t count) max = 0; #endif + /* + * read_word_at_a_time() below may read uninitialized bytes after the + * trailing zero and use them in comparisons. Disable this optimization + * under KMSAN to prevent false positive reports. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + max = 0; + while (max >= sizeof(unsigned long)) { unsigned long c, data; From 440fed95ebc30420d1f7802c6578f95e18523140 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:00 +0200 Subject: [PATCH 275/350] crypto: kmsan: disable accelerated configs under KMSAN KMSAN is unable to understand when initialized values come from assembly. Disable accelerated configs in KMSAN builds to prevent false positive reports. Link: https://lkml.kernel.org/r/20220915150417.722975-27-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- crypto/Kconfig | 30 ++++++++++++++++++++++++++++++ drivers/net/Kconfig | 1 + 2 files changed, 31 insertions(+) diff --git a/crypto/Kconfig b/crypto/Kconfig index bb427a835e44..182fb817ebb5 100644 --- a/crypto/Kconfig +++ b/crypto/Kconfig @@ -319,6 +319,7 @@ config CRYPTO_CURVE25519 config CRYPTO_CURVE25519_X86 tristate "x86_64 accelerated Curve25519 scalar multiplication library" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_CURVE25519_GENERIC select CRYPTO_ARCH_HAVE_LIB_CURVE25519 @@ -367,11 +368,13 @@ config CRYPTO_AEGIS128 config CRYPTO_AEGIS128_SIMD bool "Support SIMD acceleration for AEGIS-128" depends on CRYPTO_AEGIS128 && ((ARM || ARM64) && KERNEL_MODE_NEON) + depends on !KMSAN # avoid false positives from assembly default y config CRYPTO_AEGIS128_AESNI_SSE2 tristate "AEGIS-128 AEAD algorithm (x86_64 AESNI+SSE2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_AEAD select CRYPTO_SIMD help @@ -517,6 +520,7 @@ config CRYPTO_NHPOLY1305 config CRYPTO_NHPOLY1305_SSE2 tristate "NHPoly1305 hash function (x86_64 SSE2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_NHPOLY1305 help SSE2 optimized implementation of the hash function used by the @@ -525,6 +529,7 @@ config CRYPTO_NHPOLY1305_SSE2 config CRYPTO_NHPOLY1305_AVX2 tristate "NHPoly1305 hash function (x86_64 AVX2 implementation)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_NHPOLY1305 help AVX2 optimized implementation of the hash function used by the @@ -649,6 +654,7 @@ config CRYPTO_CRC32C config CRYPTO_CRC32C_INTEL tristate "CRC32c INTEL hardware acceleration" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH help In Intel processor with SSE4.2 supported, the processor will @@ -689,6 +695,7 @@ config CRYPTO_CRC32 config CRYPTO_CRC32_PCLMUL tristate "CRC32 PCLMULQDQ hardware acceleration" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH select CRC32 help @@ -748,6 +755,7 @@ config CRYPTO_BLAKE2B config CRYPTO_BLAKE2S_X86 bool "BLAKE2s digest algorithm (x86 accelerated version)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_BLAKE2S_GENERIC select CRYPTO_ARCH_HAVE_LIB_BLAKE2S @@ -762,6 +770,7 @@ config CRYPTO_CRCT10DIF config CRYPTO_CRCT10DIF_PCLMUL tristate "CRCT10DIF PCLMULQDQ hardware acceleration" depends on X86 && 64BIT && CRC_T10DIF + depends on !KMSAN # avoid false positives from assembly select CRYPTO_HASH help For x86_64 processors with SSE4.2 and PCLMULQDQ supported, @@ -831,6 +840,7 @@ config CRYPTO_POLY1305 config CRYPTO_POLY1305_X86_64 tristate "Poly1305 authenticator algorithm (x86_64/SSE2/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_LIB_POLY1305_GENERIC select CRYPTO_ARCH_HAVE_LIB_POLY1305 help @@ -920,6 +930,7 @@ config CRYPTO_SHA1 config CRYPTO_SHA1_SSSE3 tristate "SHA1 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA1 select CRYPTO_HASH help @@ -931,6 +942,7 @@ config CRYPTO_SHA1_SSSE3 config CRYPTO_SHA256_SSSE3 tristate "SHA256 digest algorithm (SSSE3/AVX/AVX2/SHA-NI)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA256 select CRYPTO_HASH help @@ -943,6 +955,7 @@ config CRYPTO_SHA256_SSSE3 config CRYPTO_SHA512_SSSE3 tristate "SHA512 digest algorithm (SSSE3/AVX/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SHA512 select CRYPTO_HASH help @@ -1168,6 +1181,7 @@ config CRYPTO_WP512 config CRYPTO_GHASH_CLMUL_NI_INTEL tristate "GHASH hash function (CLMUL-NI accelerated)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_CRYPTD help This is the x86_64 CLMUL-NI accelerated implementation of @@ -1228,6 +1242,7 @@ config CRYPTO_AES_TI config CRYPTO_AES_NI_INTEL tristate "AES cipher algorithms (AES-NI)" depends on X86 + depends on !KMSAN # avoid false positives from assembly select CRYPTO_AEAD select CRYPTO_LIB_AES select CRYPTO_ALGAPI @@ -1369,6 +1384,7 @@ config CRYPTO_BLOWFISH_COMMON config CRYPTO_BLOWFISH_X86_64 tristate "Blowfish cipher algorithm (x86_64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_BLOWFISH_COMMON imply CRYPTO_CTR @@ -1399,6 +1415,7 @@ config CRYPTO_CAMELLIA config CRYPTO_CAMELLIA_X86_64 tristate "Camellia cipher algorithm (x86_64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER imply CRYPTO_CTR help @@ -1415,6 +1432,7 @@ config CRYPTO_CAMELLIA_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAMELLIA_X86_64 select CRYPTO_SIMD @@ -1433,6 +1451,7 @@ config CRYPTO_CAMELLIA_AESNI_AVX_X86_64 config CRYPTO_CAMELLIA_AESNI_AVX2_X86_64 tristate "Camellia cipher algorithm (x86_64/AES-NI/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_CAMELLIA_AESNI_AVX_X86_64 help Camellia cipher algorithm module (x86_64/AES-NI/AVX2). @@ -1478,6 +1497,7 @@ config CRYPTO_CAST5 config CRYPTO_CAST5_AVX_X86_64 tristate "CAST5 (CAST-128) cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAST5 select CRYPTO_CAST_COMMON @@ -1501,6 +1521,7 @@ config CRYPTO_CAST6 config CRYPTO_CAST6_AVX_X86_64 tristate "CAST6 (CAST-256) cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_CAST6 select CRYPTO_CAST_COMMON @@ -1534,6 +1555,7 @@ config CRYPTO_DES_SPARC64 config CRYPTO_DES3_EDE_X86_64 tristate "Triple DES EDE cipher algorithm (x86-64)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_LIB_DES imply CRYPTO_CTR @@ -1604,6 +1626,7 @@ config CRYPTO_CHACHA20 config CRYPTO_CHACHA20_X86_64 tristate "ChaCha stream cipher algorithms (x86_64/SSSE3/AVX2/AVX-512VL)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_LIB_CHACHA_GENERIC select CRYPTO_ARCH_HAVE_LIB_CHACHA @@ -1674,6 +1697,7 @@ config CRYPTO_SERPENT config CRYPTO_SERPENT_SSE2_X86_64 tristate "Serpent cipher algorithm (x86_64/SSE2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1693,6 +1717,7 @@ config CRYPTO_SERPENT_SSE2_X86_64 config CRYPTO_SERPENT_SSE2_586 tristate "Serpent cipher algorithm (i586/SSE2)" depends on X86 && !64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1712,6 +1737,7 @@ config CRYPTO_SERPENT_SSE2_586 config CRYPTO_SERPENT_AVX_X86_64 tristate "Serpent cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SERPENT select CRYPTO_SIMD @@ -1732,6 +1758,7 @@ config CRYPTO_SERPENT_AVX_X86_64 config CRYPTO_SERPENT_AVX2_X86_64 tristate "Serpent cipher algorithm (x86_64/AVX2)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SERPENT_AVX_X86_64 help Serpent cipher algorithm, by Anderson, Biham & Knudsen. @@ -1876,6 +1903,7 @@ config CRYPTO_TWOFISH_586 config CRYPTO_TWOFISH_X86_64 tristate "Twofish cipher algorithm (x86_64)" depends on (X86 || UML_X86) && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_ALGAPI select CRYPTO_TWOFISH_COMMON imply CRYPTO_CTR @@ -1893,6 +1921,7 @@ config CRYPTO_TWOFISH_X86_64 config CRYPTO_TWOFISH_X86_64_3WAY tristate "Twofish cipher algorithm (x86_64, 3-way parallel)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_TWOFISH_COMMON select CRYPTO_TWOFISH_X86_64 @@ -1913,6 +1942,7 @@ config CRYPTO_TWOFISH_X86_64_3WAY config CRYPTO_TWOFISH_AVX_X86_64 tristate "Twofish cipher algorithm (x86_64/AVX)" depends on X86 && 64BIT + depends on !KMSAN # avoid false positives from assembly select CRYPTO_SKCIPHER select CRYPTO_SIMD select CRYPTO_TWOFISH_COMMON diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 94c889802566..2aaf02bfe6f7 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -76,6 +76,7 @@ config WIREGUARD tristate "WireGuard secure network tunnel" depends on NET && INET depends on IPV6 || !IPV6 + depends on !KMSAN # KMSAN doesn't support the crypto configs below select NET_UDP_TUNNEL select DST_CACHE select CRYPTO From f630a5d0ca59a6e73b61e3f82c371dc230da99ff Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:01 +0200 Subject: [PATCH 276/350] kmsan: disable physical page merging in biovec KMSAN metadata for adjacent physical pages may not be adjacent, therefore accessing such pages together may lead to metadata corruption. We disable merging pages in biovec to prevent such corruptions. Link: https://lkml.kernel.org/r/20220915150417.722975-28-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- block/blk.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/block/blk.h b/block/blk.h index d7142c4d2fef..af02b93c1dba 100644 --- a/block/blk.h +++ b/block/blk.h @@ -88,6 +88,13 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + /* + * Merging adjacent physical pages may not work correctly under KMSAN + * if their metadata pages aren't adjacent. Just disable merging. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) From 11b331f857b5fc3ff76bfec36c44a137a6b37de1 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:02 +0200 Subject: [PATCH 277/350] block: kmsan: skip bio block merging logic for KMSAN KMSAN doesn't allow treating adjacent memory pages as such, if they were allocated by different alloc_pages() calls. The block layer however does so: adjacent pages end up being used together. To prevent this, make page_is_mergeable() return false under KMSAN. Link: https://lkml.kernel.org/r/20220915150417.722975-29-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Eric Biggers Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- block/bio.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/block/bio.c b/block/bio.c index 3d3a2678fea2..106ef14f28c2 100644 --- a/block/bio.c +++ b/block/bio.c @@ -869,6 +869,8 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); if (*same_page) return true; + else if (IS_ENABLED(CONFIG_KMSAN)) + return false; return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); } From 74d899098854b4e56cf9dc9d0245d4d40f5efcd4 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:03 +0200 Subject: [PATCH 278/350] kcov: kmsan: unpoison area->list in kcov_remote_area_put() KMSAN does not instrument kernel/kcov.c for performance reasons (with CONFIG_KCOV=y virtually every place in the kernel invokes kcov instrumentation). Therefore the tool may miss writes from kcov.c that initialize memory. When CONFIG_DEBUG_LIST is enabled, list pointers from kernel/kcov.c are passed to instrumented helpers in lib/list_debug.c, resulting in false positives. To work around these reports, we unpoison the contents of area->list after initializing it. Link: https://lkml.kernel.org/r/20220915150417.722975-30-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/kcov.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/kcov.c b/kernel/kcov.c index e19c84b02452..e5cd09fd8a05 100644 --- a/kernel/kcov.c +++ b/kernel/kcov.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -152,6 +153,12 @@ static void kcov_remote_area_put(struct kcov_remote_area *area, INIT_LIST_HEAD(&area->list); area->size = size; list_add(&area->list, &kcov_remote_areas); + /* + * KMSAN doesn't instrument this file, so it may not know area->list + * is initialized. Unpoison it explicitly to avoid reports in + * kcov_remote_area_get(). + */ + kmsan_unpoison_memory(&area->list, sizeof(area->list)); } static notrace bool check_kcov_mode(enum kcov_mode needed_mode, struct task_struct *t) From 42eaa27d9e7aafb4049fc3a5b02005a917013e65 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:04 +0200 Subject: [PATCH 279/350] security: kmsan: fix interoperability with auto-initialization Heap and stack initialization is great, but not when we are trying uses of uninitialized memory. When the kernel is built with KMSAN, having kernel memory initialization enabled may introduce false negatives. We disable CONFIG_INIT_STACK_ALL_PATTERN and CONFIG_INIT_STACK_ALL_ZERO under CONFIG_KMSAN, making it impossible to auto-initialize stack variables in KMSAN builds. We also disable CONFIG_INIT_ON_ALLOC_DEFAULT_ON and CONFIG_INIT_ON_FREE_DEFAULT_ON to prevent accidental use of heap auto-initialization. We however still let the users enable heap auto-initialization at boot-time (by setting init_on_alloc=1 or init_on_free=1), in which case a warning is printed. Link: https://lkml.kernel.org/r/20220915150417.722975-31-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/page_alloc.c | 4 ++++ security/Kconfig.hardening | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 118462ae6800..c7e9451c69fc 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -938,6 +938,10 @@ void init_mem_debugging_and_hardening(void) else static_branch_disable(&init_on_free); + if (IS_ENABLED(CONFIG_KMSAN) && + (_init_on_alloc_enabled_early || _init_on_free_enabled_early)) + pr_info("mem auto-init: please make sure init_on_alloc and init_on_free are disabled when running KMSAN\n"); + #ifdef CONFIG_DEBUG_PAGEALLOC if (!debug_pagealloc_enabled()) return; diff --git a/security/Kconfig.hardening b/security/Kconfig.hardening index bd2aabb2c60f..2739a6776454 100644 --- a/security/Kconfig.hardening +++ b/security/Kconfig.hardening @@ -106,6 +106,7 @@ choice config INIT_STACK_ALL_PATTERN bool "pattern-init everything (strongest)" depends on CC_HAS_AUTO_VAR_INIT_PATTERN + depends on !KMSAN help Initializes everything on the stack (including padding) with a specific debug value. This is intended to eliminate @@ -124,6 +125,7 @@ choice config INIT_STACK_ALL_ZERO bool "zero-init everything (strongest and safest)" depends on CC_HAS_AUTO_VAR_INIT_ZERO + depends on !KMSAN help Initializes everything on the stack (including padding) with a zero value. This is intended to eliminate all @@ -218,6 +220,7 @@ config STACKLEAK_RUNTIME_DISABLE config INIT_ON_ALLOC_DEFAULT_ON bool "Enable heap memory zeroing on allocation by default" + depends on !KMSAN help This has the effect of setting "init_on_alloc=1" on the kernel command line. This can be disabled with "init_on_alloc=0". @@ -230,6 +233,7 @@ config INIT_ON_ALLOC_DEFAULT_ON config INIT_ON_FREE_DEFAULT_ON bool "Enable heap memory zeroing on free by default" + depends on !KMSAN help This has the effect of setting "init_on_free=1" on the kernel command line. This can be disabled with "init_on_free=0". From 40b22c9df2c51c6ce459953f57c720b129332fbf Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:05 +0200 Subject: [PATCH 280/350] objtool: kmsan: list KMSAN API functions as uaccess-safe KMSAN inserts API function calls in a lot of places (function entries and exits, local variables, memory accesses), so they may get called from the uaccess regions as well. KMSAN API functions are used to update the metadata (shadow/origin pages) for kernel memory accesses. The metadata pages for kernel pointers are also located in the kernel memory, so touching them is not a problem. For userspace pointers, no metadata is allocated. If an API function is supposed to read or modify the metadata, it does so for kernel pointers and ignores userspace pointers. If an API function is supposed to return a pair of metadata pointers for the instrumentation to use (like all __msan_metadata_ptr_for_TYPE_SIZE() functions do), it returns the allocated metadata for kernel pointers and special dummy buffers residing in the kernel memory for userspace pointers. As a result, none of KMSAN API functions perform userspace accesses, but since they might be called from UACCESS regions they use user_access_save/restore(). Link: https://lkml.kernel.org/r/20220915150417.722975-32-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- tools/objtool/check.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index e55fdf952a3a..7c048c11ce7d 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1062,6 +1062,26 @@ static const char *uaccess_safe_builtin[] = { "__sanitizer_cov_trace_cmp4", "__sanitizer_cov_trace_cmp8", "__sanitizer_cov_trace_switch", + /* KMSAN */ + "kmsan_copy_to_user", + "kmsan_report", + "kmsan_unpoison_entry_regs", + "kmsan_unpoison_memory", + "__msan_chain_origin", + "__msan_get_context_state", + "__msan_instrument_asm_store", + "__msan_metadata_ptr_for_load_1", + "__msan_metadata_ptr_for_load_2", + "__msan_metadata_ptr_for_load_4", + "__msan_metadata_ptr_for_load_8", + "__msan_metadata_ptr_for_load_n", + "__msan_metadata_ptr_for_store_1", + "__msan_metadata_ptr_for_store_2", + "__msan_metadata_ptr_for_store_4", + "__msan_metadata_ptr_for_store_8", + "__msan_metadata_ptr_for_store_n", + "__msan_poison_alloca", + "__msan_warning", /* UBSAN */ "ubsan_type_mismatch_common", "__ubsan_handle_type_mismatch", From 93324e6842148cfdb44d1437fb586b957ace1f8c Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:06 +0200 Subject: [PATCH 281/350] x86: kmsan: disable instrumentation of unsupported code Instrumenting some files with KMSAN will result in kernel being unable to link, boot or crashing at runtime for various reasons (e.g. infinite recursion caused by instrumentation hooks calling instrumented code again). Completely omit KMSAN instrumentation in the following places: - arch/x86/boot and arch/x86/realmode/rm, as KMSAN doesn't work for i386; - arch/x86/entry/vdso, which isn't linked with KMSAN runtime; - three files in arch/x86/kernel - boot problems; - arch/x86/mm/cpu_entry_area.c - recursion. Link: https://lkml.kernel.org/r/20220915150417.722975-33-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/boot/Makefile | 1 + arch/x86/boot/compressed/Makefile | 1 + arch/x86/entry/vdso/Makefile | 3 +++ arch/x86/kernel/Makefile | 2 ++ arch/x86/kernel/cpu/Makefile | 1 + arch/x86/mm/Makefile | 2 ++ arch/x86/realmode/rm/Makefile | 1 + 7 files changed, 11 insertions(+) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index ffec8bb01ba8..9860ca5979f8 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -12,6 +12,7 @@ # Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Kernel does not boot with kcov instrumentation here. diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 35ce1a64068b..3a261abb6d15 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -20,6 +20,7 @@ # Sanitizer runtimes are unavailable and cannot be linked for early boot code. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. diff --git a/arch/x86/entry/vdso/Makefile b/arch/x86/entry/vdso/Makefile index 12f6c4d714cd..ce4eb7e44e5b 100644 --- a/arch/x86/entry/vdso/Makefile +++ b/arch/x86/entry/vdso/Makefile @@ -11,6 +11,9 @@ include $(srctree)/lib/vdso/Makefile # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n +KMSAN_SANITIZE_vclock_gettime.o := n +KMSAN_SANITIZE_vgetcpu.o := n + UBSAN_SANITIZE := n KCSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index a20a5ebfacd7..ac564c5d7b1f 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -33,6 +33,8 @@ KASAN_SANITIZE_sev.o := n # With some compiler versions the generated code results in boot hangs, caused # by several compilation units. To be safe, disable all instrumentation. KCSAN_SANITIZE := n +KMSAN_SANITIZE_head$(BITS).o := n +KMSAN_SANITIZE_nmi.o := n # If instrumentation of this dir is enabled, boot hangs during first second. # Probably could be more selective here, but note that files related to irqs, diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 9661e3e802be..f10a921ee756 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -12,6 +12,7 @@ endif # If these files are instrumented, boot hangs during the first second. KCOV_INSTRUMENT_common.o := n KCOV_INSTRUMENT_perf_event.o := n +KMSAN_SANITIZE_common.o := n # As above, instrumenting secondary CPU boot code causes boot hangs. KCSAN_SANITIZE_common.o := n diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index 829c1409ffbd..afb6f7187dad 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -14,6 +14,8 @@ KASAN_SANITIZE_pgprot.o := n # Disable KCSAN entirely, because otherwise we get warnings that some functions # reference __initdata sections. KCSAN_SANITIZE := n +# Avoid recursion by not calling KMSAN hooks for CEA code. +KMSAN_SANITIZE_cpu_entry_area.o := n ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_mem_encrypt.o = -pg diff --git a/arch/x86/realmode/rm/Makefile b/arch/x86/realmode/rm/Makefile index 83f1b6a56449..f614009d3e4e 100644 --- a/arch/x86/realmode/rm/Makefile +++ b/arch/x86/realmode/rm/Makefile @@ -10,6 +10,7 @@ # Sanitizer runtimes are unavailable and cannot be linked here. KASAN_SANITIZE := n KCSAN_SANITIZE := n +KMSAN_SANITIZE := n OBJECT_FILES_NON_STANDARD := y # Prevents link failures: __sanitizer_cov_trace_pc() is not linked in. From b11671b37f8f4761ff5a3d344553d65238309954 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:07 +0200 Subject: [PATCH 282/350] x86: kmsan: skip shadow checks in __switch_to() When instrumenting functions, KMSAN obtains the per-task state (mostly pointers to metadata for function arguments and return values) once per function at its beginning, using the `current` pointer. Every time the instrumented function calls another function, this state (`struct kmsan_context_state`) is updated with shadow/origin data of the passed and returned values. When `current` changes in the low-level arch code, instrumented code can not notice that, and will still refer to the old state, possibly corrupting it or using stale data. This may result in false positive reports. To deal with that, we need to apply __no_kmsan_checks to the functions performing context switching - this will result in skipping all KMSAN shadow checks and marking newly created values as initialized, preventing all false positive reports in those functions. False negatives are still possible, but we expect them to be rare and impersistent. Link: https://lkml.kernel.org/r/20220915150417.722975-34-glider@google.com Suggested-by: Marco Elver Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/kernel/process_64.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 1962008fe743..6b3418bff326 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -553,6 +553,7 @@ void compat_start_thread(struct pt_regs *regs, u32 new_ip, u32 new_sp, bool x32) * Kprobes not supported here. Set the probe on schedule instead. * Function graph tracer not supported too. */ +__no_kmsan_checks __visible __notrace_funcgraph struct task_struct * __switch_to(struct task_struct *prev_p, struct task_struct *next_p) { From 9245ec01ce848eb5147e2e5030cf33ffbf1befff Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:08 +0200 Subject: [PATCH 283/350] x86: kmsan: handle open-coded assembly in lib/iomem.c KMSAN cannot intercept memory accesses within asm() statements. That's why we add kmsan_unpoison_memory() and kmsan_check_memory() to hint it how to handle memory copied from/to I/O memory. Link: https://lkml.kernel.org/r/20220915150417.722975-35-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/lib/iomem.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/lib/iomem.c b/arch/x86/lib/iomem.c index 3e2f33fc33de..e0411a3774d4 100644 --- a/arch/x86/lib/iomem.c +++ b/arch/x86/lib/iomem.c @@ -1,6 +1,7 @@ #include #include #include +#include #define movs(type,to,from) \ asm volatile("movs" type:"=&D" (to), "=&S" (from):"0" (to), "1" (from):"memory") @@ -37,6 +38,8 @@ static void string_memcpy_fromio(void *to, const volatile void __iomem *from, si n-=2; } rep_movs(to, (const void *)from, n); + /* KMSAN must treat values read from devices as initialized. */ + kmsan_unpoison_memory(to, n); } static void string_memcpy_toio(volatile void __iomem *to, const void *from, size_t n) @@ -44,6 +47,8 @@ static void string_memcpy_toio(volatile void __iomem *to, const void *from, size if (unlikely(!n)) return; + /* Make sure uninitialized memory isn't copied to devices. */ + kmsan_check_memory(from, n); /* Align any unaligned destination IO */ if (unlikely(1 & (unsigned long)to)) { movs("b", to, from); From ff901d80fff6d65ada6f2a60a1f7d180ee2e0416 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:09 +0200 Subject: [PATCH 284/350] x86: kmsan: use __msan_ string functions where possible. Unless stated otherwise (by explicitly calling __memcpy(), __memset() or __memmove()) we want all string functions to call their __msan_ versions (e.g. __msan_memcpy() instead of memcpy()), so that shadow and origin values are updated accordingly. Bootloader must still use the default string functions to avoid crashes. Link: https://lkml.kernel.org/r/20220915150417.722975-36-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/include/asm/string_64.h | 23 +++++++++++++++++++++-- include/linux/fortify-string.h | 2 ++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/string_64.h b/arch/x86/include/asm/string_64.h index 6e450827f677..3b87d889b6e1 100644 --- a/arch/x86/include/asm/string_64.h +++ b/arch/x86/include/asm/string_64.h @@ -11,11 +11,23 @@ function. */ #define __HAVE_ARCH_MEMCPY 1 +#if defined(__SANITIZE_MEMORY__) +#undef memcpy +void *__msan_memcpy(void *dst, const void *src, size_t size); +#define memcpy __msan_memcpy +#else extern void *memcpy(void *to, const void *from, size_t len); +#endif extern void *__memcpy(void *to, const void *from, size_t len); #define __HAVE_ARCH_MEMSET +#if defined(__SANITIZE_MEMORY__) +extern void *__msan_memset(void *s, int c, size_t n); +#undef memset +#define memset __msan_memset +#else void *memset(void *s, int c, size_t n); +#endif void *__memset(void *s, int c, size_t n); #define __HAVE_ARCH_MEMSET16 @@ -55,7 +67,13 @@ static inline void *memset64(uint64_t *s, uint64_t v, size_t n) } #define __HAVE_ARCH_MEMMOVE +#if defined(__SANITIZE_MEMORY__) +#undef memmove +void *__msan_memmove(void *dest, const void *src, size_t len); +#define memmove __msan_memmove +#else void *memmove(void *dest, const void *src, size_t count); +#endif void *__memmove(void *dest, const void *src, size_t count); int memcmp(const void *cs, const void *ct, size_t count); @@ -64,8 +82,7 @@ char *strcpy(char *dest, const char *src); char *strcat(char *dest, const char *src); int strcmp(const char *cs, const char *ct); -#if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) - +#if (defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__)) /* * For files that not instrumented (e.g. mm/slub.c) we * should use not instrumented version of mem* functions. @@ -73,7 +90,9 @@ int strcmp(const char *cs, const char *ct); #undef memcpy #define memcpy(dst, src, len) __memcpy(dst, src, len) +#undef memmove #define memmove(dst, src, len) __memmove(dst, src, len) +#undef memset #define memset(s, c, n) __memset(s, c, n) #ifndef __NO_FORTIFY diff --git a/include/linux/fortify-string.h b/include/linux/fortify-string.h index 3b401fa0f374..6c8a1a29d0b6 100644 --- a/include/linux/fortify-string.h +++ b/include/linux/fortify-string.h @@ -285,8 +285,10 @@ __FORTIFY_INLINE void fortify_memset_chk(__kernel_size_t size, * __builtin_object_size() must be captured here to avoid evaluating argument * side-effects further into the macro layers. */ +#ifndef CONFIG_KMSAN #define memset(p, c, s) __fortify_memset_chk(p, c, s, \ __builtin_object_size(p, 0), __builtin_object_size(p, 1)) +#endif /* * To make sure the compiler can enforce protection against buffer overflows, From 3f1e2c7a9099c1ed32c67f12cdf432ba782cf51f Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:10 +0200 Subject: [PATCH 285/350] x86: kmsan: sync metadata pages on page fault KMSAN assumes shadow and origin pages for every allocated page are accessible. For pages between [VMALLOC_START, VMALLOC_END] those metadata pages start at KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START, therefore we must sync a bigger memory region. Link: https://lkml.kernel.org/r/20220915150417.722975-37-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/mm/fault.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index fa71a5d12e87..d728791be8ac 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -260,7 +260,7 @@ static noinline int vmalloc_fault(unsigned long address) } NOKPROBE_SYMBOL(vmalloc_fault); -void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +static void __arch_sync_kernel_mappings(unsigned long start, unsigned long end) { unsigned long addr; @@ -284,6 +284,27 @@ void arch_sync_kernel_mappings(unsigned long start, unsigned long end) } } +void arch_sync_kernel_mappings(unsigned long start, unsigned long end) +{ + __arch_sync_kernel_mappings(start, end); +#ifdef CONFIG_KMSAN + /* + * KMSAN maintains two additional metadata page mappings for the + * [VMALLOC_START, VMALLOC_END) range. These mappings start at + * KMSAN_VMALLOC_SHADOW_START and KMSAN_VMALLOC_ORIGIN_START and + * have to be synced together with the vmalloc memory mapping. + */ + if (start >= VMALLOC_START && end < VMALLOC_END) { + __arch_sync_kernel_mappings( + start - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START, + end - VMALLOC_START + KMSAN_VMALLOC_SHADOW_START); + __arch_sync_kernel_mappings( + start - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START, + end - VMALLOC_START + KMSAN_VMALLOC_ORIGIN_START); + } +#endif +} + static bool low_pfn(unsigned long pfn) { return pfn < max_low_pfn; From d911c67e10b47eb1ace08dcf95ce98fe4d408c88 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:11 +0200 Subject: [PATCH 286/350] x86: kasan: kmsan: support CONFIG_GENERIC_CSUM on x86, enable it for KASAN/KMSAN This is needed to allow memory tools like KASAN and KMSAN see the memory accesses from the checksum code. Without CONFIG_GENERIC_CSUM the tools can't see memory accesses originating from handwritten assembly code. For KASAN it's a question of detecting more bugs, for KMSAN using the C implementation also helps avoid false positives originating from seemingly uninitialized checksum values. Link: https://lkml.kernel.org/r/20220915150417.722975-38-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 4 ++++ arch/x86/include/asm/checksum.h | 16 ++++++++++------ arch/x86/lib/Makefile | 2 ++ 3 files changed, 16 insertions(+), 6 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 674d694a665e..eb158eb8a985 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -325,6 +325,10 @@ config GENERIC_ISA_DMA def_bool y depends on ISA_DMA_API +config GENERIC_CSUM + bool + default y if KMSAN || KASAN + config GENERIC_BUG def_bool y depends on BUG diff --git a/arch/x86/include/asm/checksum.h b/arch/x86/include/asm/checksum.h index bca625a60186..6df6ece8a28e 100644 --- a/arch/x86/include/asm/checksum.h +++ b/arch/x86/include/asm/checksum.h @@ -1,9 +1,13 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 -#define HAVE_CSUM_COPY_USER -#define _HAVE_ARCH_CSUM_AND_COPY -#ifdef CONFIG_X86_32 -# include +#ifdef CONFIG_GENERIC_CSUM +# include #else -# include +# define _HAVE_ARCH_COPY_AND_CSUM_FROM_USER 1 +# define HAVE_CSUM_COPY_USER +# define _HAVE_ARCH_CSUM_AND_COPY +# ifdef CONFIG_X86_32 +# include +# else +# include +# endif #endif diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index f76747862bd2..7ba5f61d7273 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -65,7 +65,9 @@ ifneq ($(CONFIG_X86_CMPXCHG64),y) endif else obj-y += iomap_copy_64.o +ifneq ($(CONFIG_GENERIC_CSUM),y) lib-y += csum-partial_64.o csum-copy_64.o csum-wrappers_64.o +endif lib-y += clear_page_64.o copy_page_64.o lib-y += memmove_64.o memset_64.o lib-y += copy_user_64.o From 7cf8f44a5a1c0cb10a594996797e5a988cf0589d Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:12 +0200 Subject: [PATCH 287/350] x86: fs: kmsan: disable CONFIG_DCACHE_WORD_ACCESS dentry_string_cmp() calls read_word_at_a_time(), which might read uninitialized bytes to optimize string comparisons. Disabling CONFIG_DCACHE_WORD_ACCESS should prohibit this optimization, as well as (probably) similar ones. Link: https://lkml.kernel.org/r/20220915150417.722975-39-glider@google.com Signed-off-by: Alexander Potapenko Suggested-by: Andrey Konovalov Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index eb158eb8a985..edf7f12935d7 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -129,7 +129,9 @@ config X86 select CLKEVT_I8253 select CLOCKSOURCE_VALIDATE_LAST_CYCLE select CLOCKSOURCE_WATCHDOG - select DCACHE_WORD_ACCESS + # Word-size accesses may read uninitialized data past the trailing \0 + # in strings and cause false KMSAN reports. + select DCACHE_WORD_ACCESS if !KMSAN select DYNAMIC_SIGFRAME select EDAC_ATOMIC_SCRUB select EDAC_SUPPORT From 37ad4ee8364255c73026a3c343403b5977fa7e79 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:13 +0200 Subject: [PATCH 288/350] x86: kmsan: don't instrument stack walking functions Upon function exit, KMSAN marks local variables as uninitialized. Further function calls may result in the compiler creating the stack frame where these local variables resided. This results in frame pointers being marked as uninitialized data, which is normally correct, because they are not stack-allocated. However stack unwinding functions are supposed to read and dereference the frame pointers, in which case KMSAN might be reporting uses of uninitialized values. To work around that, we mark update_stack_state(), unwind_next_frame() and show_trace_log_lvl() with __no_kmsan_checks, preventing all KMSAN reports inside those functions and making them return initialized values. Link: https://lkml.kernel.org/r/20220915150417.722975-40-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/kernel/dumpstack.c | 6 ++++++ arch/x86/kernel/unwind_frame.c | 11 +++++++++++ 2 files changed, 17 insertions(+) diff --git a/arch/x86/kernel/dumpstack.c b/arch/x86/kernel/dumpstack.c index afae4dd77495..476eb504084e 100644 --- a/arch/x86/kernel/dumpstack.c +++ b/arch/x86/kernel/dumpstack.c @@ -177,6 +177,12 @@ static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, } } +/* + * This function reads pointers from the stack and dereferences them. The + * pointers may not have their KMSAN shadow set up properly, which may result + * in false positive reports. Disable instrumentation to avoid those. + */ +__no_kmsan_checks static void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, const char *log_lvl) { diff --git a/arch/x86/kernel/unwind_frame.c b/arch/x86/kernel/unwind_frame.c index 8e1c50c86e5d..d8ba93778ae3 100644 --- a/arch/x86/kernel/unwind_frame.c +++ b/arch/x86/kernel/unwind_frame.c @@ -183,6 +183,16 @@ static struct pt_regs *decode_frame_pointer(unsigned long *bp) } #endif +/* + * While walking the stack, KMSAN may stomp on stale locals from other + * functions that were marked as uninitialized upon function exit, and + * now hold the call frame information for the current function (e.g. the frame + * pointer). Because KMSAN does not specifically mark call frames as + * initialized, false positive reports are possible. To prevent such reports, + * we mark the functions scanning the stack (here and below) with + * __no_kmsan_checks. + */ +__no_kmsan_checks static bool update_stack_state(struct unwind_state *state, unsigned long *next_bp) { @@ -250,6 +260,7 @@ static bool update_stack_state(struct unwind_state *state, return true; } +__no_kmsan_checks bool unwind_next_frame(struct unwind_state *state) { struct pt_regs *regs; From 6cae637fa26df867449c6bc20ea8bc693abe49b0 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:14 +0200 Subject: [PATCH 289/350] entry: kmsan: introduce kmsan_unpoison_entry_regs() struct pt_regs passed into IRQ entry code is set up by uninstrumented asm functions, therefore KMSAN may not notice the registers are initialized. kmsan_unpoison_entry_regs() unpoisons the contents of struct pt_regs, preventing potential false positives. Unlike kmsan_unpoison_memory(), it can be called under kmsan_in_runtime(), which is often the case in IRQ entry code. Link: https://lkml.kernel.org/r/20220915150417.722975-41-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/kmsan.h | 15 +++++++++++++++ kernel/entry/common.c | 5 +++++ mm/kmsan/hooks.c | 26 ++++++++++++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/include/linux/kmsan.h b/include/linux/kmsan.h index c473e0e21683..e38ae3c34618 100644 --- a/include/linux/kmsan.h +++ b/include/linux/kmsan.h @@ -214,6 +214,17 @@ void kmsan_handle_dma_sg(struct scatterlist *sg, int nents, */ void kmsan_handle_urb(const struct urb *urb, bool is_out); +/** + * kmsan_unpoison_entry_regs() - Handle pt_regs in low-level entry code. + * @regs: struct pt_regs pointer received from assembly code. + * + * KMSAN unpoisons the contents of the passed pt_regs, preventing potential + * false positive reports. Unlike kmsan_unpoison_memory(), + * kmsan_unpoison_entry_regs() can be called from the regions where + * kmsan_in_runtime() returns true, which is the case in early entry code. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs); + #else static inline void kmsan_init_shadow(void) @@ -310,6 +321,10 @@ static inline void kmsan_handle_urb(const struct urb *urb, bool is_out) { } +static inline void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ +} + #endif #endif /* _LINUX_KMSAN_H */ diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 063068a9ea9b..846add8394c4 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -24,6 +25,7 @@ static __always_inline void __enter_from_user_mode(struct pt_regs *regs) user_exit_irqoff(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); } @@ -352,6 +354,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) lockdep_hardirqs_off(CALLER_ADDR0); ct_irq_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); instrumentation_end(); @@ -367,6 +370,7 @@ noinstr irqentry_state_t irqentry_enter(struct pt_regs *regs) */ lockdep_hardirqs_off(CALLER_ADDR0); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); rcu_irq_enter_check_tick(); trace_hardirqs_off_finish(); instrumentation_end(); @@ -452,6 +456,7 @@ irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs) ct_nmi_enter(); instrumentation_begin(); + kmsan_unpoison_entry_regs(regs); trace_hardirqs_off_finish(); ftrace_nmi_enter(); instrumentation_end(); diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c index 79d7e73e2cfd..35f6b6e6a908 100644 --- a/mm/kmsan/hooks.c +++ b/mm/kmsan/hooks.c @@ -348,6 +348,32 @@ void kmsan_unpoison_memory(const void *address, size_t size) } EXPORT_SYMBOL(kmsan_unpoison_memory); +/* + * Version of kmsan_unpoison_memory() that can be called from within the KMSAN + * runtime. + * + * Non-instrumented IRQ entry functions receive struct pt_regs from assembly + * code. Those regs need to be unpoisoned, otherwise using them will result in + * false positives. + * Using kmsan_unpoison_memory() is not an option in entry code, because the + * return value of in_task() is inconsistent - as a result, certain calls to + * kmsan_unpoison_memory() are ignored. kmsan_unpoison_entry_regs() ensures that + * the registers are unpoisoned even if kmsan_in_runtime() is true in the early + * entry code. + */ +void kmsan_unpoison_entry_regs(const struct pt_regs *regs) +{ + unsigned long ua_flags; + + if (!kmsan_enabled) + return; + + ua_flags = user_access_save(); + kmsan_internal_unpoison_memory((void *)regs, sizeof(*regs), + KMSAN_POISON_NOCHECK); + user_access_restore(ua_flags); +} + void kmsan_check_memory(const void *addr, size_t size) { if (!kmsan_enabled) From a6a7aaba7f39ee439f3d42e4b5bfc6e7f762d126 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:15 +0200 Subject: [PATCH 290/350] bpf: kmsan: initialize BPF registers with zeroes When executing BPF programs, certain registers may get passed uninitialized to helper functions. E.g. when performing a JMP_CALL, registers BPF_R1-BPF_R5 are always passed to the helper, no matter how many of them are actually used. Passing uninitialized values as function parameters is technically undefined behavior, so we work around it by always initializing the registers. Link: https://lkml.kernel.org/r/20220915150417.722975-42-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- kernel/bpf/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index c1e10d088dbb..547d139ab98a 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -2002,7 +2002,7 @@ out: static unsigned int PROG_NAME(stack_size)(const void *ctx, const struct bpf_insn *insn) \ { \ u64 stack[stack_size / sizeof(u64)]; \ - u64 regs[MAX_BPF_EXT_REG]; \ + u64 regs[MAX_BPF_EXT_REG] = {}; \ \ FP = (u64) (unsigned long) &stack[ARRAY_SIZE(stack)]; \ ARG1 = (u64) (unsigned long) ctx; \ From 1468c6f4558b1bcd92aa0400f2920f9dc7588402 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:16 +0200 Subject: [PATCH 291/350] mm: fs: initialize fsdata passed to write_begin/write_end interface Functions implementing the a_ops->write_end() interface accept the `void *fsdata` parameter that is supposed to be initialized by the corresponding a_ops->write_begin() (which accepts `void **fsdata`). However not all a_ops->write_begin() implementations initialize `fsdata` unconditionally, so it may get passed uninitialized to a_ops->write_end(), resulting in undefined behavior. Fix this by initializing fsdata with NULL before the call to write_begin(), rather than doing so in all possible a_ops implementations. This patch covers only the following cases found by running x86 KMSAN under syzkaller: - generic_perform_write() - cont_expand_zero() and generic_cont_expand_simple() - page_symlink() Other cases of passing uninitialized fsdata may persist in the codebase. Link: https://lkml.kernel.org/r/20220915150417.722975-43-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- fs/buffer.c | 4 ++-- fs/namei.c | 2 +- mm/filemap.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/buffer.c b/fs/buffer.c index b4c9fff3ab6c..b55252078e7b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2341,7 +2341,7 @@ int generic_cont_expand_simple(struct inode *inode, loff_t size) struct address_space *mapping = inode->i_mapping; const struct address_space_operations *aops = mapping->a_ops; struct page *page; - void *fsdata; + void *fsdata = NULL; int err; err = inode_newsize_ok(inode, size); @@ -2367,7 +2367,7 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, const struct address_space_operations *aops = mapping->a_ops; unsigned int blocksize = i_blocksize(inode); struct page *page; - void *fsdata; + void *fsdata = NULL; pgoff_t index, curidx; loff_t curpos; unsigned zerofrom, offset, len; diff --git a/fs/namei.c b/fs/namei.c index 53b4bc094db2..076ae96ca0b1 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -5088,7 +5088,7 @@ int page_symlink(struct inode *inode, const char *symname, int len) const struct address_space_operations *aops = mapping->a_ops; bool nofs = !mapping_gfp_constraint(mapping, __GFP_FS); struct page *page; - void *fsdata; + void *fsdata = NULL; int err; unsigned int flags; diff --git a/mm/filemap.c b/mm/filemap.c index f27c93a581ab..ec17bd1a3bb7 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -3720,7 +3720,7 @@ ssize_t generic_perform_write(struct kiocb *iocb, struct iov_iter *i) unsigned long offset; /* Offset into pagecache page */ unsigned long bytes; /* Bytes to write to page */ size_t copied; /* Bytes copied from user */ - void *fsdata; + void *fsdata = NULL; offset = (pos & (PAGE_SIZE - 1)); bytes = min_t(unsigned long, PAGE_SIZE - offset, From 4ca8cc8d1bbe582bfc7a4d80bd72cfd8d3d0e2e8 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Thu, 15 Sep 2022 17:04:17 +0200 Subject: [PATCH 292/350] x86: kmsan: enable KMSAN builds for x86 Make KMSAN usable by adding the necessary Kconfig bits. Also declare x86-specific functions checking address validity in arch/x86/include/asm/kmsan.h. Link: https://lkml.kernel.org/r/20220915150417.722975-44-glider@google.com Signed-off-by: Alexander Potapenko Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- arch/x86/Kconfig | 1 + arch/x86/include/asm/kmsan.h | 55 ++++++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+) create mode 100644 arch/x86/include/asm/kmsan.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index edf7f12935d7..dce94e84199d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -169,6 +169,7 @@ config X86 select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KASAN_VMALLOC if X86_64 select HAVE_ARCH_KFENCE + select HAVE_ARCH_KMSAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_MMAP_RND_BITS if MMU select HAVE_ARCH_MMAP_RND_COMPAT_BITS if MMU && COMPAT diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h new file mode 100644 index 000000000000..a790b865d0a6 --- /dev/null +++ b/arch/x86/include/asm/kmsan.h @@ -0,0 +1,55 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * x86 KMSAN support. + * + * Copyright (C) 2022, Google LLC + * Author: Alexander Potapenko + */ + +#ifndef _ASM_X86_KMSAN_H +#define _ASM_X86_KMSAN_H + +#ifndef MODULE + +#include +#include + +/* + * Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version. + */ +static inline bool kmsan_phys_addr_valid(unsigned long addr) +{ + if (IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT)) + return !(addr >> boot_cpu_data.x86_phys_bits); + else + return true; +} + +/* + * Taken from arch/x86/mm/physaddr.c to avoid using an instrumented version. + */ +static inline bool kmsan_virt_addr_valid(void *addr) +{ + unsigned long x = (unsigned long)addr; + unsigned long y = x - __START_KERNEL_map; + + /* use the carry flag to determine if x was < __START_KERNEL_map */ + if (unlikely(x > y)) { + x = y + phys_base; + + if (y >= KERNEL_IMAGE_SIZE) + return false; + } else { + x = y + (__START_KERNEL_map - PAGE_OFFSET); + + /* carry flag will be set if starting x was >= PAGE_OFFSET */ + if ((x > y) || !kmsan_phys_addr_valid(x)) + return false; + } + + return pfn_valid(x >> PAGE_SHIFT); +} + +#endif /* !MODULE */ + +#endif /* _ASM_X86_KMSAN_H */ From ce732a7520b093091c345cba1b84542d1abd83ed Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 28 Sep 2022 14:32:19 +0200 Subject: [PATCH 293/350] x86: kmsan: handle CPU entry area Among other data, CPU entry area holds exception stacks, so addresses from this area can be passed to kmsan_get_metadata(). This previously led to kmsan_get_metadata() returning NULL, which in turn resulted in a warning that triggered further attempts to call kmsan_get_metadata() in the exception context, which quickly exhausted the exception stack. This patch allocates shadow and origin for the CPU entry area on x86 and introduces arch_kmsan_get_meta_or_null(), which performs arch-specific metadata mapping. Link: https://lkml.kernel.org/r/20220928123219.1101883-1-glider@google.com Signed-off-by: Alexander Potapenko Fixes: 21d723a7c1409 ("kmsan: add KMSAN runtime core") Cc: Alexander Viro Cc: Alexei Starovoitov Cc: Andrey Konovalov Cc: Andrey Konovalov Cc: Andy Lutomirski Cc: Arnd Bergmann Cc: Borislav Petkov Cc: Christoph Hellwig Cc: Christoph Lameter Cc: David Rientjes Cc: Dmitry Vyukov Cc: Eric Biggers Cc: Eric Biggers Cc: Eric Dumazet Cc: Greg Kroah-Hartman Cc: Herbert Xu Cc: Ilya Leoshkevich Cc: Ingo Molnar Cc: Jens Axboe Cc: Joonsoo Kim Cc: Kees Cook Cc: Marco Elver Cc: Mark Rutland Cc: Matthew Wilcox Cc: Michael S. Tsirkin Cc: Pekka Enberg Cc: Peter Zijlstra Cc: Petr Mladek Cc: Stephen Rothwell Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Vasily Gorbik Cc: Vegard Nossum Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- MAINTAINERS | 1 + arch/x86/include/asm/kmsan.h | 32 ++++++++++++++++++++++++++++++++ arch/x86/mm/Makefile | 3 +++ arch/x86/mm/kmsan_shadow.c | 20 ++++++++++++++++++++ mm/kmsan/shadow.c | 6 +++++- 5 files changed, 61 insertions(+), 1 deletion(-) create mode 100644 arch/x86/mm/kmsan_shadow.c diff --git a/MAINTAINERS b/MAINTAINERS index 3c7dfe9bb712..456b07f02803 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -11379,6 +11379,7 @@ L: kasan-dev@googlegroups.com S: Maintained F: Documentation/dev-tools/kmsan.rst F: arch/*/include/asm/kmsan.h +F: arch/*/mm/kmsan_* F: include/linux/kmsan*.h F: lib/Kconfig.kmsan F: mm/kmsan/ diff --git a/arch/x86/include/asm/kmsan.h b/arch/x86/include/asm/kmsan.h index a790b865d0a6..8fa6ac0e2d76 100644 --- a/arch/x86/include/asm/kmsan.h +++ b/arch/x86/include/asm/kmsan.h @@ -11,9 +11,41 @@ #ifndef MODULE +#include #include #include +DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow); +DECLARE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin); + +/* + * Functions below are declared in the header to make sure they are inlined. + * They all are called from kmsan_get_metadata() for every memory access in + * the kernel, so speed is important here. + */ + +/* + * Compute metadata addresses for the CPU entry area on x86. + */ +static inline void *arch_kmsan_get_meta_or_null(void *addr, bool is_origin) +{ + unsigned long addr64 = (unsigned long)addr; + char *metadata_array; + unsigned long off; + int cpu; + + if ((addr64 < CPU_ENTRY_AREA_BASE) || + (addr64 >= (CPU_ENTRY_AREA_BASE + CPU_ENTRY_AREA_MAP_SIZE))) + return NULL; + cpu = (addr64 - CPU_ENTRY_AREA_BASE) / CPU_ENTRY_AREA_SIZE; + off = addr64 - (unsigned long)get_cpu_entry_area(cpu); + if ((off < 0) || (off >= CPU_ENTRY_AREA_SIZE)) + return NULL; + metadata_array = is_origin ? cpu_entry_area_origin : + cpu_entry_area_shadow; + return &per_cpu(metadata_array[off], cpu); +} + /* * Taken from arch/x86/mm/physaddr.h to avoid using an instrumented version. */ diff --git a/arch/x86/mm/Makefile b/arch/x86/mm/Makefile index afb6f7187dad..c80febc44cd2 100644 --- a/arch/x86/mm/Makefile +++ b/arch/x86/mm/Makefile @@ -46,6 +46,9 @@ obj-$(CONFIG_HIGHMEM) += highmem_32.o KASAN_SANITIZE_kasan_init_$(BITS).o := n obj-$(CONFIG_KASAN) += kasan_init_$(BITS).o +KMSAN_SANITIZE_kmsan_shadow.o := n +obj-$(CONFIG_KMSAN) += kmsan_shadow.o + obj-$(CONFIG_MMIOTRACE) += mmiotrace.o mmiotrace-y := kmmio.o pf_in.o mmio-mod.o obj-$(CONFIG_MMIOTRACE_TEST) += testmmiotrace.o diff --git a/arch/x86/mm/kmsan_shadow.c b/arch/x86/mm/kmsan_shadow.c new file mode 100644 index 000000000000..bee2ec4a3bfa --- /dev/null +++ b/arch/x86/mm/kmsan_shadow.c @@ -0,0 +1,20 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * x86-specific bits of KMSAN shadow implementation. + * + * Copyright (C) 2022 Google LLC + * Author: Alexander Potapenko + */ + +#include +#include + +/* + * Addresses within the CPU entry area (including e.g. exception stacks) do not + * have struct page entries corresponding to them, so they need separate + * handling. + * arch_kmsan_get_meta_or_null() (declared in the header) maps the addresses in + * CPU entry area to addresses in cpu_entry_area_shadow/cpu_entry_area_origin. + */ +DEFINE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_shadow); +DEFINE_PER_CPU(char[CPU_ENTRY_AREA_SIZE], cpu_entry_area_origin); diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c index 6e90a806a704..21e3e196ec3c 100644 --- a/mm/kmsan/shadow.c +++ b/mm/kmsan/shadow.c @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -126,6 +125,7 @@ void *kmsan_get_metadata(void *address, bool is_origin) { u64 addr = (u64)address, pad, off; struct page *page; + void *ret; if (is_origin && !IS_ALIGNED(addr, KMSAN_ORIGIN_SIZE)) { pad = addr % KMSAN_ORIGIN_SIZE; @@ -136,6 +136,10 @@ void *kmsan_get_metadata(void *address, bool is_origin) kmsan_internal_is_module_addr(address)) return (void *)vmalloc_meta(address, is_origin); + ret = arch_kmsan_get_meta_or_null(address, is_origin); + if (ret) + return ret; + page = virt_to_page_or_null(address); if (!page) return NULL; From 871f697b494b04f8d78cc090e49b416062d23a10 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 15 Sep 2022 22:22:36 +0800 Subject: [PATCH 294/350] mm/damon/sysfs: avoid call damon_target_has_pid() repeatedly In damon_sysfs_destroy_targets(), we call damon_target_has_pid() to check whether the 'ctx' include a valid pid, but there no need to call damon_target_has_pid() to check repeatedly, just need call it once. [xhao@linux.alibaba.com: more simplified code calls damon_target_has_pid()] Link: https://lkml.kernel.org/r/20220916133535.7428-1-xhao@linux.alibaba.com Link: https://lkml.kernel.org/r/20220915142237.92529-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 1fa0023f136e..0cca1909bf67 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -2143,9 +2143,10 @@ static int damon_sysfs_set_attrs(struct damon_ctx *ctx, static void damon_sysfs_destroy_targets(struct damon_ctx *ctx) { struct damon_target *t, *next; + bool has_pid = damon_target_has_pid(ctx); damon_for_each_target_safe(t, next, ctx) { - if (damon_target_has_pid(ctx)) + if (has_pid) put_pid(t->pid); damon_destroy_target(t); } From a07b8eafa43fdbe1df33256b06d625c80829e557 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 15 Sep 2022 13:30:41 +0000 Subject: [PATCH 295/350] mm/damon: simplify scheme create in lru_sort.c In damon_lru_sort_new_hot_scheme() and damon_lru_sort_new_cold_scheme(), they have so much in common, so we can combine them into a single function, and we just need to distinguish their differences. [yangyingliang@huawei.com: change damon_lru_sort_stub_pattern to static] Link: https://lkml.kernel.org/r/20220917121228.1889699-1-yangyingliang@huawei.com Link: https://lkml.kernel.org/r/20220915133041.71819-1-sj@kernel.org Signed-off-by: Xin Hao Signed-off-by: SeongJae Park Signed-off-by: Yang Yingliang Suggested-by: SeongJae Park Reviewed-by: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 07a0908963fd..a91c1e364fc7 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -132,6 +132,18 @@ DEFINE_DAMON_MODULES_DAMOS_STATS_PARAMS(damon_lru_sort_cold_stat, lru_sort_tried_cold_regions, lru_sorted_cold_regions, cold_quota_exceeds); +static struct damos_access_pattern damon_lru_sort_stub_pattern = { + /* Find regions having PAGE_SIZE or larger size */ + .min_sz_region = PAGE_SIZE, + .max_sz_region = ULONG_MAX, + /* no matter its access frequency */ + .min_nr_accesses = 0, + .max_nr_accesses = UINT_MAX, + /* no matter its age */ + .min_age_region = 0, + .max_age_region = UINT_MAX, +}; + static struct damon_ctx *ctx; static struct damon_target *target; @@ -157,36 +169,19 @@ static struct damos *damon_lru_sort_new_scheme( /* Create a DAMON-based operation scheme for hot memory regions */ static struct damos *damon_lru_sort_new_hot_scheme(unsigned int hot_thres) { - struct damos_access_pattern pattern = { - /* Find regions having PAGE_SIZE or larger size */ - .min_sz_region = PAGE_SIZE, - .max_sz_region = ULONG_MAX, - /* and accessed for more than the threshold */ - .min_nr_accesses = hot_thres, - .max_nr_accesses = UINT_MAX, - /* no matter its age */ - .min_age_region = 0, - .max_age_region = UINT_MAX, - }; + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + pattern.min_nr_accesses = hot_thres; return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_PRIO); } /* Create a DAMON-based operation scheme for cold memory regions */ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) { - struct damos_access_pattern pattern = { - /* Find regions having PAGE_SIZE or larger size */ - .min_sz_region = PAGE_SIZE, - .max_sz_region = ULONG_MAX, - /* and not accessed at all */ - .min_nr_accesses = 0, - .max_nr_accesses = 0, - /* for min_age or more micro-seconds */ - .min_age_region = cold_thres, - .max_age_region = UINT_MAX, - }; + struct damos_access_pattern pattern = damon_lru_sort_stub_pattern; + pattern.max_nr_accesses = 0; + pattern.min_age_region = cold_thres; return damon_lru_sort_new_scheme(&pattern, DAMOS_LRU_DEPRIO); } From 16bc1b0f0269b6110f6d25880b52947d354e2980 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Thu, 15 Sep 2022 19:33:41 +0800 Subject: [PATCH 296/350] mm/damon: use 'struct damon_target *' instead of 'void *' in target_valid() We could use 'struct damon_target *' directly instead of 'void *' in target_valid() operation to make code simple. Link: https://lkml.kernel.org/r/1663241621-13293-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/vaddr.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index c5dc0c77c772..1dda8d0068e5 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -346,7 +346,7 @@ struct damon_operations { unsigned long (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - bool (*target_valid)(void *target); + bool (*target_valid)(struct damon_target *t); void (*cleanup)(struct damon_ctx *context); }; diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3f84584f9982..f53c2ff2bcc8 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -593,9 +593,8 @@ static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) * Functions for the target validity check and cleanup */ -static bool damon_va_target_valid(void *target) +static bool damon_va_target_valid(struct damon_target *t) { - struct damon_target *t = target; struct task_struct *task; task = damon_get_task_struct(t); From 81f8f57f853ed6fb8ae9bbc96e3aead10d6e248a Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 15 Sep 2022 10:10:23 +0800 Subject: [PATCH 297/350] mm/damon/reclaim: change damon_reclaim_wmarks to static damon_reclaim_wmarks is only used in reclaim.c now, change it to static. Link: https://lkml.kernel.org/r/20220915021024.4177940-1-yangyingliang@huawei.com Fixes: 89dd02d8abd1 ("mm/damon/reclaim: use watermarks parameters generator macro") Signed-off-by: Yang Yingliang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/reclaim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 1acf808e1624..039fa55e0ae9 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -64,7 +64,7 @@ static struct damos_quota damon_reclaim_quota = { }; DEFINE_DAMON_MODULES_DAMOS_QUOTAS(damon_reclaim_quota); -struct damos_watermarks damon_reclaim_wmarks = { +static struct damos_watermarks damon_reclaim_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ .high = 500, /* 50 percent */ From e47b082579f307d0367b1fb7ca3698fd9c73a88b Mon Sep 17 00:00:00 2001 From: Yang Yingliang Date: Thu, 15 Sep 2022 10:10:24 +0800 Subject: [PATCH 298/350] mm/damon/lru_sort: change damon_lru_sort_wmarks to static damon_lru_sort_wmarks is only used in lru_sort.c now, change it to static. Link: https://lkml.kernel.org/r/20220915021024.4177940-2-yangyingliang@huawei.com Fixes: 189aa3d58206 ("mm/damon/lru_sort: use watermarks parameters generator macro") Signed-off-by: Yang Yingliang Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/lru_sort.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index a91c1e364fc7..4a40054ba03b 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -77,7 +77,7 @@ static struct damos_quota damon_lru_sort_quota = { }; DEFINE_DAMON_MODULES_DAMOS_TIME_QUOTA(damon_lru_sort_quota); -struct damos_watermarks damon_lru_sort_wmarks = { +static struct damos_watermarks damon_lru_sort_wmarks = { .metric = DAMOS_WMARK_FREE_MEM_RATE, .interval = 5000000, /* 5 seconds */ .high = 200, /* 20 percent */ From 1ea41595f606e21ba422c59dcdc637f9a9513f2e Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Thu, 15 Sep 2022 09:16:02 +0800 Subject: [PATCH 299/350] mm/secretmem: add __init annotation to secretmem_init() It's a fs_initcall entry, add __init annotation to it. Link: https://lkml.kernel.org/r/20220915011602.176967-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Cc: Mike Rapoport Signed-off-by: Andrew Morton --- mm/secretmem.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 3f7154099795..6a44efb673b2 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -276,7 +276,7 @@ static struct file_system_type secretmem_fs = { .kill_sb = kill_anon_super, }; -static int secretmem_init(void) +static int __init secretmem_init(void) { int ret = 0; From cc713520bdc1b84fc5394f6ac8649b93ad2c5dde Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Fri, 16 Sep 2022 23:20:35 +0800 Subject: [PATCH 300/350] mm/damon: return void from damon_set_schemes() There is no point in returning an int from damon_set_schemes(). It always returns 0 which is meaningless for the caller, so change it to return void directly. Link: https://lkml.kernel.org/r/1663341635-12675-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Signed-off-by: Andrew Morton --- include/linux/damon.h | 2 +- mm/damon/core.c | 5 +---- mm/damon/dbgfs.c | 8 +++----- mm/damon/lru_sort.c | 4 +--- mm/damon/reclaim.c | 4 +--- 5 files changed, 7 insertions(+), 16 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index 1dda8d0068e5..e7808a84675f 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -541,7 +541,7 @@ unsigned int damon_nr_regions(struct damon_target *t); struct damon_ctx *damon_new_ctx(void); void damon_destroy_ctx(struct damon_ctx *ctx); int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs); -int damon_set_schemes(struct damon_ctx *ctx, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes); int damon_nr_running_ctxs(void); bool damon_is_registered_ops(enum damon_ops_id id); diff --git a/mm/damon/core.c b/mm/damon/core.c index a843673c11cf..9c80c6eb00c2 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -454,10 +454,8 @@ int damon_set_attrs(struct damon_ctx *ctx, struct damon_attrs *attrs) * * This function should not be called while the kdamond of the context is * running. - * - * Return: 0 if success, or negative error code otherwise. */ -int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, +void damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, ssize_t nr_schemes) { struct damos *s, *next; @@ -467,7 +465,6 @@ int damon_set_schemes(struct damon_ctx *ctx, struct damos **schemes, damon_destroy_scheme(s); for (i = 0; i < nr_schemes; i++) damon_add_scheme(ctx, schemes[i]); - return 0; } /** diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index c00eba4448d8..6f0ae7d3ae39 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -307,11 +307,9 @@ static ssize_t dbgfs_schemes_write(struct file *file, const char __user *buf, goto unlock_out; } - ret = damon_set_schemes(ctx, schemes, nr_schemes); - if (!ret) { - ret = count; - nr_schemes = 0; - } + damon_set_schemes(ctx, schemes, nr_schemes); + ret = count; + nr_schemes = 0; unlock_out: mutex_unlock(&ctx->kdamond_lock); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index 4a40054ba03b..d7eb72b41cb6 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -203,9 +203,7 @@ static int damon_lru_sort_apply_parameters(void) scheme = damon_lru_sort_new_hot_scheme(hot_thres); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); cold_thres = cold_min_age / damon_lru_sort_mon_attrs.aggr_interval; scheme = damon_lru_sort_new_cold_scheme(cold_thres); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 039fa55e0ae9..3d59ab11b7b3 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -155,9 +155,7 @@ static int damon_reclaim_apply_parameters(void) scheme = damon_reclaim_new_scheme(); if (!scheme) return -ENOMEM; - err = damon_set_schemes(ctx, &scheme, 1); - if (err) - return err; + damon_set_schemes(ctx, &scheme, 1); if (monitor_region_start > monitor_region_end) return -EINVAL; From 3ae6d3e30a52a7af222f284d0bf5d424b4f2f365 Mon Sep 17 00:00:00 2001 From: Chih-En Lin Date: Fri, 16 Sep 2022 17:04:34 +0800 Subject: [PATCH 301/350] mm/page_table_check: fix typos Link: https://lkml.kernel.org/r/20220916090434.701194-1-shiyn.lin@gmail.com Signed-off-by: Chih-En Lin Acked-by: Pasha Tatashin Signed-off-by: Andrew Morton --- mm/page_table_check.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index 903db62794d3..433dbce13fe1 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -53,7 +53,7 @@ static struct page_table_check *get_page_table_check(struct page_ext *page_ext) } /* - * An enty is removed from the page table, decrement the counters for that page + * An entry is removed from the page table, decrement the counters for that page * verify that it is of correct type and counters do not become negative. */ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, @@ -87,7 +87,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, } /* - * A new enty is added to the page table, increment the counters for that page + * A new entry is added to the page table, increment the counters for that page * verify that it is of correct type and is not being mapped with a different * type to a different process. */ From ce96fa6223ee851cb83118678f6e75f260852a80 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:42 +0800 Subject: [PATCH 302/350] mm/page_alloc: ensure kswapd doesn't accidentally go to sleep Patch series "A few cleanup patches for mm", v2. This series contains a few cleanup patches to remove the obsolete comments and functions, use helper macro to improve readability and so on. More details can be found in the respective changelogs. This patch (of 16): If ALLOC_KSWAPD is set, wake_all_kswapds() will be called to ensure kswapd doesn't accidentally go to sleep. But when reserve_flags is set, alloc_flags will be overwritten and ALLOC_KSWAPD is thus lost. Preserve the ALLOC_KSWAPD flag in alloc_flags to ensure kswapd won't go to sleep accidentally. Link: https://lkml.kernel.org/r/20220916072257.9639-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220916072257.9639-2-linmiaohe@huawei.com Fixes: 0a79cdad5eb2 ("mm: use alloc_flags to record if kswapd can wake") Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c7e9451c69fc..24caa11db8ae 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5156,7 +5156,8 @@ retry: reserve_flags = __gfp_pfmemalloc_flags(gfp_mask); if (reserve_flags) - alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags); + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, reserve_flags) | + (alloc_flags & ALLOC_KSWAPD); /* * Reset the nodemask and zonelist iterators if memory policies can be From b89f1735169b8ab54b6a03bf4823657ee4e30073 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:43 +0800 Subject: [PATCH 303/350] mm/page_alloc: make zone_pcp_update() static Since commit b92ca18e8ca5 ("mm/page_alloc: disassociate the pcp->high from pcp->batch"), zone_pcp_update() is only used in mm/page_alloc.c. Move zone_pcp_update() up to avoid forward declaration and then make it static. No functional change intended. Link: https://lkml.kernel.org/r/20220916072257.9639-3-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/internal.h | 1 - mm/page_alloc.c | 22 +++++++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index fea3cba15484..f46cd8a6694a 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -367,7 +367,6 @@ extern int user_min_free_kbytes; extern void free_unref_page(struct page *page, unsigned int order); extern void free_unref_page_list(struct list_head *list); -extern void zone_pcp_update(struct zone *zone, int cpu_online); extern void zone_pcp_reset(struct zone *zone); extern void zone_pcp_disable(struct zone *zone); extern void zone_pcp_enable(struct zone *zone); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 24caa11db8ae..49efaf9963d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7242,6 +7242,17 @@ void __meminit setup_zone_pageset(struct zone *zone) zone_set_pageset_high_and_batch(zone, 0); } +/* + * The zone indicated has a new number of managed_pages; batch sizes and percpu + * page high values need to be recalculated. + */ +static void zone_pcp_update(struct zone *zone, int cpu_online) +{ + mutex_lock(&pcp_batch_high_lock); + zone_set_pageset_high_and_batch(zone, cpu_online); + mutex_unlock(&pcp_batch_high_lock); +} + /* * Allocate per cpu pagesets and initialize them. * Before this call only boot pagesets were available. @@ -9473,17 +9484,6 @@ void free_contig_range(unsigned long pfn, unsigned long nr_pages) } EXPORT_SYMBOL(free_contig_range); -/* - * The zone indicated has a new number of managed_pages; batch sizes and percpu - * page high values need to be recalculated. - */ -void zone_pcp_update(struct zone *zone, int cpu_online) -{ - mutex_lock(&pcp_batch_high_lock); - zone_set_pageset_high_and_batch(zone, cpu_online); - mutex_unlock(&pcp_batch_high_lock); -} - /* * Effectively disable pcplists for the zone by setting the high limit to 0 * and draining all cpus. A concurrent page freeing on another CPU that's about From 638a9ae97ab596f1f7b7522dad709e69cb5b4e9d Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:44 +0800 Subject: [PATCH 304/350] mm: remove obsolete macro NR_PCP_ORDER_MASK and NR_PCP_ORDER_WIDTH Since commit 8b10b465d0e1 ("mm/page_alloc: free pages in a single pass during bulk free"), they're not used anymore. Remove them. Link: https://lkml.kernel.org/r/20220916072257.9639-4-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 7 ------- mm/page_alloc.c | 1 - 2 files changed, 8 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index c69c08156822..3ff1e757d5aa 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -564,13 +564,6 @@ enum zone_watermarks { #define NR_LOWORDER_PCP_LISTS (MIGRATE_PCPTYPES * (PAGE_ALLOC_COSTLY_ORDER + 1)) #define NR_PCP_LISTS (NR_LOWORDER_PCP_LISTS + NR_PCP_THP) -/* - * Shift to encode migratetype and order in the same integer, with order - * in the least significant bits. - */ -#define NR_PCP_ORDER_WIDTH 8 -#define NR_PCP_ORDER_MASK ((1<_watermark[WMARK_MIN] + z->watermark_boost) #define low_wmark_pages(z) (z->_watermark[WMARK_LOW] + z->watermark_boost) #define high_wmark_pages(z) (z->_watermark[WMARK_HIGH] + z->watermark_boost) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 49efaf9963d1..fa3dd2e1d566 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1584,7 +1584,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, order = pindex_to_order(pindex); nr_pages = 1 << order; - BUILD_BUG_ON(MAX_ORDER >= (1< Date: Fri, 16 Sep 2022 15:22:45 +0800 Subject: [PATCH 305/350] mm/page_alloc: remove obsolete comment in zone_statistics() Since commit 43c95bcc51e4 ("mm/page_alloc: reduce duration that IRQs are disabled for VM counters"), zone_statistics() is not called with interrupts disabled. Update the corresponding comment. Link: https://lkml.kernel.org/r/20220916072257.9639-5-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fa3dd2e1d566..b94d3a42cb8b 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3671,8 +3671,6 @@ void __putback_isolated_page(struct page *page, unsigned int order, int mt) /* * Update NUMA hit/miss statistics - * - * Must be called with interrupts disabled. */ static inline void zone_statistics(struct zone *preferred_zone, struct zone *z, long nr_account) From 5749fcc5f04cef4091dea0c2ba6b5c5f5e05a549 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:46 +0800 Subject: [PATCH 306/350] mm/page_alloc: add __init annotations to init_mem_debugging_and_hardening() It's only called by mm_init(). Add __init annotations to it. Link: https://lkml.kernel.org/r/20220916072257.9639-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mm.h | 2 +- mm/page_alloc.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index a37c8a29c49b..8bbcccbc5565 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -3092,7 +3092,7 @@ extern int apply_to_existing_page_range(struct mm_struct *mm, unsigned long address, unsigned long size, pte_fn_t fn, void *data); -extern void init_mem_debugging_and_hardening(void); +extern void __init init_mem_debugging_and_hardening(void); #ifdef CONFIG_PAGE_POISONING extern void __kernel_poison_pages(struct page *page, int numpages); extern void __kernel_unpoison_pages(struct page *page, int numpages); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b94d3a42cb8b..21261f55dab1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -903,7 +903,7 @@ static inline void clear_page_guard(struct zone *zone, struct page *page, * order of appearance. So we need to first gather the full picture of what was * enabled, and then make decisions. */ -void init_mem_debugging_and_hardening(void) +void __init init_mem_debugging_and_hardening(void) { bool page_poisoning_requested = false; From 022e7fa0f73d7c90cf3d6bea3d4e4cc5df1e1087 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:47 +0800 Subject: [PATCH 307/350] mm/page_alloc: fix freeing static percpu memory The size of struct per_cpu_zonestat can be 0 on !SMP && !NUMA. In that case, zone->per_cpu_zonestats will always equal to boot_zonestats. But in zone_pcp_reset(), zone->per_cpu_zonestats is freed via free_percpu() directly without checking against boot_zonestats first. boot_zonestats will be released by free_percpu() unexpectedly. Link: https://lkml.kernel.org/r/20220916072257.9639-7-linmiaohe@huawei.com Fixes: 28f836b6777b ("mm/page_alloc: split per cpu page lists and zone stats") Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 21261f55dab1..43114e172592 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -9513,9 +9513,11 @@ void zone_pcp_reset(struct zone *zone) drain_zonestat(zone, pzstats); } free_percpu(zone->per_cpu_pageset); - free_percpu(zone->per_cpu_zonestats); zone->per_cpu_pageset = &boot_pageset; - zone->per_cpu_zonestats = &boot_zonestats; + if (zone->per_cpu_zonestats != &boot_zonestats) { + free_percpu(zone->per_cpu_zonestats); + zone->per_cpu_zonestats = &boot_zonestats; + } } } From 30e3b5d7c82f78c63c53197b5d8b99636bb60d56 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:48 +0800 Subject: [PATCH 308/350] mm: remove obsolete pgdat_is_empty() There's no caller. Remove it. Link: https://lkml.kernel.org/r/20220916072257.9639-8-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/mmzone.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 3ff1e757d5aa..4c8510f26b02 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1241,11 +1241,6 @@ static inline unsigned long pgdat_end_pfn(pg_data_t *pgdat) return pgdat->node_start_pfn + pgdat->node_spanned_pages; } -static inline bool pgdat_is_empty(pg_data_t *pgdat) -{ - return !pgdat->node_start_pfn && !pgdat->node_spanned_pages; -} - #include void build_all_zonelists(pg_data_t *pgdat); From b36184553d41c59e6712f9d4699aca24577fbd4a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:49 +0800 Subject: [PATCH 309/350] mm/page_alloc: add missing is_migrate_isolate() check in set_page_guard() In MIGRATE_ISOLATE case, zone freepage state shouldn't be modified as caller will take care of it. Add missing is_migrate_isolate() here to avoid possible unbalanced freepage state. This would happen if someone isolates the block, and then we face an MCE failure/soft-offline on a page within that block. __mod_zone_freepage_state() will be triggered via below call trace which already had been triggered back when block was isolated: take_page_off_buddy break_down_buddy_pages set_page_guard Link: https://lkml.kernel.org/r/20220916072257.9639-9-linmiaohe@huawei.com Fixes: 06be6ff3d2ec ("mm,hwpoison: rework soft offline for free pages") Signed-off-by: Miaohe Lin Reviewed-by: Oscar Salvador Reviewed-by: David Hildenbrand Cc: Anshuman Khandual Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 43114e172592..c055b4cd37f0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -873,7 +873,8 @@ static inline bool set_page_guard(struct zone *zone, struct page *page, INIT_LIST_HEAD(&page->buddy_list); set_page_private(page, order); /* Guard pages are not available for any usage */ - __mod_zone_freepage_state(zone, -(1 << order), migratetype); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -(1 << order), migratetype); return true; } From c035290424a9b7b64477752058b460d0ecc21987 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:50 +0800 Subject: [PATCH 310/350] mm/page_alloc: use local variable zone_idx directly Use local variable zone_idx directly since it holds the exact value of zone_idx(). No functional change intended. Link: https://lkml.kernel.org/r/20220916072257.9639-10-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index c055b4cd37f0..ec865cfd0c3a 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6881,7 +6881,7 @@ void __ref memmap_init_zone_device(struct zone *zone, unsigned long start = jiffies; int nid = pgdat->node_id; - if (WARN_ON_ONCE(!pgmap || zone_idx(zone) != ZONE_DEVICE)) + if (WARN_ON_ONCE(!pgmap || zone_idx != ZONE_DEVICE)) return; /* From f774a6a6fd39e1b5677bdf71f6813b382faddeeb Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:51 +0800 Subject: [PATCH 311/350] mm, memory_hotplug: remove obsolete generic_free_nodedata() Commit 390511e1476e ("mm, memory_hotplug: drop arch_free_nodedata") drops the last caller of generic_free_nodedata(). Remove it too. Link: https://lkml.kernel.org/r/20220916072257.9639-11-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- include/linux/memory_hotplug.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 51052969dbfe..9fcbf5706595 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -43,11 +43,6 @@ extern void arch_refresh_nodedata(int nid, pg_data_t *pgdat); ({ \ memblock_alloc(sizeof(*pgdat), SMP_CACHE_BYTES); \ }) -/* - * This definition is just for error path in node hotadd. - * For node hotremove, we have to replace this. - */ -#define generic_free_nodedata(pgdat) kfree(pgdat) extern pg_data_t *node_data[]; static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) @@ -63,9 +58,6 @@ static inline pg_data_t *generic_alloc_nodedata(int nid) BUG(); return NULL; } -static inline void generic_free_nodedata(pg_data_t *pgdat) -{ -} static inline void arch_refresh_nodedata(int nid, pg_data_t *pgdat) { } From 6dc2c87a5a8878b657d08e34ca0e757d31273e12 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:52 +0800 Subject: [PATCH 312/350] mm/page_alloc: make boot_nodestats static It's only used in mm/page_alloc.c now. Make it static. Link: https://lkml.kernel.org/r/20220916072257.9639-12-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/internal.h | 2 -- mm/page_alloc.c | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/internal.h b/mm/internal.h index f46cd8a6694a..6b7ef495b56d 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -836,8 +836,6 @@ int migrate_device_coherent_page(struct page *page); */ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags); -DECLARE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); - extern bool mirrored_kernelcore; static inline bool vma_soft_dirty_enabled(struct vm_area_struct *vma) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ec865cfd0c3a..0f856b4ce3b0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6579,7 +6579,7 @@ static void per_cpu_pages_init(struct per_cpu_pages *pcp, struct per_cpu_zonesta #define BOOT_PAGESET_BATCH 1 static DEFINE_PER_CPU(struct per_cpu_pages, boot_pageset); static DEFINE_PER_CPU(struct per_cpu_zonestat, boot_zonestats); -DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); +static DEFINE_PER_CPU(struct per_cpu_nodestat, boot_nodestats); static void __build_all_zonelists(void *data) { From c940e0207a1c307fdab92b32d0522271036fc3ef Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:53 +0800 Subject: [PATCH 313/350] mm/page_alloc: use helper macro SZ_1{K,M} Use helper macro SZ_1K and SZ_1M to do the size conversion. Minor readability improvement. Link: https://lkml.kernel.org/r/20220916072257.9639-13-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Cc: Anshuman Khandual Cc: David Hildenbrand Cc: Matthew Wilcox Cc: Oscar Salvador Signed-off-by: Andrew Morton --- mm/page_alloc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 0f856b4ce3b0..3216477d9ba6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7056,7 +7056,7 @@ static int zone_batchsize(struct zone *zone) * size is striking a balance between allocation latency * and zone lock contention. */ - batch = min(zone_managed_pages(zone) >> 10, (1024 * 1024) / PAGE_SIZE); + batch = min(zone_managed_pages(zone) >> 10, SZ_1M / PAGE_SIZE); batch /= 4; /* We effectively *= 4 below */ if (batch < 1) batch = 1; @@ -8531,8 +8531,8 @@ void __init mem_init_print_info(void) #endif ")\n", K(nr_free_pages()), K(physpages), - codesize >> 10, datasize >> 10, rosize >> 10, - (init_data_size + init_code_size) >> 10, bss_size >> 10, + codesize / SZ_1K, datasize / SZ_1K, rosize / SZ_1K, + (init_data_size + init_code_size) / SZ_1K, bss_size / SZ_1K, K(physpages - totalram_pages() - totalcma_pages), K(totalcma_pages) #ifdef CONFIG_HIGHMEM @@ -9057,8 +9057,8 @@ void *__init alloc_large_system_hash(const char *tablename, numentries -= arch_reserved_kernel_pages(); /* It isn't necessary when PAGE_SIZE >= 1MB */ - if (PAGE_SHIFT < 20) - numentries = round_up(numentries, (1<<20)/PAGE_SIZE); + if (PAGE_SIZE < SZ_1M) + numentries = round_up(numentries, SZ_1M / PAGE_SIZE); #if __BITS_PER_LONG > 32 if (!high_limit) { From dae37a5dccd104fc465241c42d9e17756ddebbc1 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:54 +0800 Subject: [PATCH 314/350] mm/page_alloc: init local variable buddy_pfn The local variable buddy_pfn could be passed to buddy_merge_likely() without initialization if the passed in order is MAX_ORDER - 1. This looks buggy but buddy_pfn won't be used in this case as there's a order >= MAX_ORDER - 2 check. Init buddy_pfn to 0 anyway to avoid possible future misuse. Link: https://lkml.kernel.org/r/20220916072257.9639-14-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 3216477d9ba6..4dc2fe575fc8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1113,7 +1113,7 @@ static inline void __free_one_page(struct page *page, int migratetype, fpi_t fpi_flags) { struct capture_control *capc = task_capc(zone); - unsigned long buddy_pfn; + unsigned long buddy_pfn = 0; unsigned long combined_pfn; struct page *buddy; bool to_tail; From 896c4d52538df231c3847491acc4f2c23891fe6a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:55 +0800 Subject: [PATCH 315/350] mm/page_alloc: use costly_order in WARN_ON_ONCE_GFP() There's no need to check whether order > PAGE_ALLOC_COSTLY_ORDER again. Minor readability improvement. Link: https://lkml.kernel.org/r/20220916072257.9639-15-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4dc2fe575fc8..23f839e1d89e 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -5280,7 +5280,7 @@ nopage: * so that we can identify them and convert them to something * else. */ - WARN_ON_ONCE_GFP(order > PAGE_ALLOC_COSTLY_ORDER, gfp_mask); + WARN_ON_ONCE_GFP(costly_order, gfp_mask); /* * Help non-failing allocations by giving them access to memory From def76fd549c513bb90278a8d6d0fe3ef3faa20a7 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:56 +0800 Subject: [PATCH 316/350] mm/page_alloc: remove obsolete gfpflags_normal_context() Since commit dacb5d8875cc ("tcp: fix page frag corruption on page fault"), there's no caller of gfpflags_normal_context(). Remove it as this helper is strictly tied to the sk page frag usage and there won't be other user in the future. [linmiaohe@huawei.com: fix htmldocs] Link: https://lkml.kernel.org/r/1bc55727-9b66-0e9e-c306-f10c4716ea89@huawei.com Link: https://lkml.kernel.org/r/20220916072257.9639-16-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- Documentation/core-api/mm-api.rst | 3 --- include/linux/gfp.h | 23 ----------------------- 2 files changed, 26 deletions(-) diff --git a/Documentation/core-api/mm-api.rst b/Documentation/core-api/mm-api.rst index 1ebcc6c3fafe..f5dde5bceaea 100644 --- a/Documentation/core-api/mm-api.rst +++ b/Documentation/core-api/mm-api.rst @@ -19,9 +19,6 @@ User Space Memory Access Memory Allocation Controls ========================== -.. kernel-doc:: include/linux/gfp.h - :internal: - .. kernel-doc:: include/linux/gfp_types.h :doc: Page mobility and placement hints diff --git a/include/linux/gfp.h b/include/linux/gfp.h index ea6cb9399152..ef4aea3b356e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -36,29 +36,6 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) return !!(gfp_flags & __GFP_DIRECT_RECLAIM); } -/** - * gfpflags_normal_context - is gfp_flags a normal sleepable context? - * @gfp_flags: gfp_flags to test - * - * Test whether @gfp_flags indicates that the allocation is from the - * %current context and allowed to sleep. - * - * An allocation being allowed to block doesn't mean it owns the %current - * context. When direct reclaim path tries to allocate memory, the - * allocation context is nested inside whatever %current was doing at the - * time of the original allocation. The nested allocation may be allowed - * to block but modifying anything %current owns can corrupt the outer - * context's expectations. - * - * %true result from this function indicates that the allocation context - * can sleep and use anything that's associated with %current. - */ -static inline bool gfpflags_normal_context(const gfp_t gfp_flags) -{ - return (gfp_flags & (__GFP_DIRECT_RECLAIM | __GFP_MEMALLOC)) == - __GFP_DIRECT_RECLAIM; -} - #ifdef CONFIG_HIGHMEM #define OPT_ZONE_HIGHMEM ZONE_HIGHMEM #else From c9b3637f8a5a4c869f78c26773c559669796212f Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Fri, 16 Sep 2022 15:22:57 +0800 Subject: [PATCH 317/350] mm/page_alloc: fix obsolete comment in deferred_pfn_valid() There are no architectures that can have holes in the memory map within a pageblock since commit 859a85ddf90e ("mm: remove pfn_valid_within() and CONFIG_HOLES_IN_ZONE"). Update the corresponding comment. Link: https://lkml.kernel.org/r/20220916072257.9639-17-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Reviewed-by: Oscar Salvador Cc: Matthew Wilcox Signed-off-by: Andrew Morton --- mm/page_alloc.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 23f839e1d89e..66f7778732fb 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1929,11 +1929,7 @@ static inline void __init pgdat_init_report_one_done(void) /* * Returns true if page needs to be initialized or freed to buddy allocator. * - * First we check if pfn is valid on architectures where it is possible to have - * holes within pageblock_nr_pages. On systems where it is not possible, this - * function is optimized out. - * - * Then, we check if a current large page is valid by only checking the validity + * We check if a current large page is valid by only checking the validity * of the head pfn. */ static inline bool __init deferred_pfn_valid(unsigned long pfn) From 2b21624fc23277553ef254b3ad02c37afa1c484d Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Fri, 16 Sep 2022 14:46:38 -0700 Subject: [PATCH 318/350] hugetlb: freeze allocated pages before creating hugetlb pages When creating hugetlb pages, the hugetlb code must first allocate contiguous pages from a low level allocator such as buddy, cma or memblock. The pages returned from these low level allocators are ref counted. This creates potential issues with other code taking speculative references on these pages before they can be transformed to a hugetlb page. This issue has been addressed with methods and code such as that provided in [1]. Recent discussions about vmemmap freeing [2] have indicated that it would be beneficial to freeze all sub pages, including the head page of pages returned from low level allocators before converting to a hugetlb page. This helps avoid races if we want to replace the page containing vmemmap for the head page. There have been proposals to change at least the buddy allocator to return frozen pages as described at [3]. If such a change is made, it can be employed by the hugetlb code. However, as mentioned above hugetlb uses several low level allocators so each would need to be modified to return frozen pages. For now, we can manually freeze the returned pages. This is done in two places: 1) alloc_buddy_huge_page, only the returned head page is ref counted. We freeze the head page, retrying once in the VERY rare case where there may be an inflated ref count. 2) prep_compound_gigantic_page, for gigantic pages the current code freezes all pages except the head page. New code will simply freeze the head page as well. In a few other places, code checks for inflated ref counts on newly allocated hugetlb pages. With the modifications to freeze after allocating, this code can be removed. After hugetlb pages are freshly allocated, they are often added to the hugetlb free lists. Since these pages were previously ref counted, this was done via put_page() which would end up calling the hugetlb destructor: free_huge_page. With changes to freeze pages, we simply call free_huge_page directly to add the pages to the free list. In a few other places, freshly allocated hugetlb pages were immediately put into use, and the expectation was they were already ref counted. In these cases, we must manually ref count the page. [1] https://lore.kernel.org/linux-mm/20210622021423.154662-3-mike.kravetz@oracle.com/ [2] https://lore.kernel.org/linux-mm/20220802180309.19340-1-joao.m.martins@oracle.com/ [3] https://lore.kernel.org/linux-mm/20220809171854.3725722-1-willy@infradead.org/ [mike.kravetz@oracle.com: fix NULL pointer dereference] Link: https://lkml.kernel.org/r/20220921202702.106069-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20220916214638.155744-1-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Oscar Salvador Reviewed-by: Muchun Song Reviewed-by: Miaohe Lin Cc: Joao Martins Cc: Matthew Wilcox Cc: Michal Hocko Cc: Peter Xu Signed-off-by: Andrew Morton --- mm/hugetlb.c | 102 +++++++++++++++++++-------------------------------- 1 file changed, 38 insertions(+), 64 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b0e39045a7a8..2182134216f0 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1787,9 +1787,8 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, /* we rely on prep_new_huge_page to set the destructor */ set_compound_order(page, order); - __ClearPageReserved(page); __SetPageHead(page); - for (i = 1; i < nr_pages; i++) { + for (i = 0; i < nr_pages; i++) { p = nth_page(page, i); /* @@ -1830,17 +1829,19 @@ static bool __prep_compound_gigantic_page(struct page *page, unsigned int order, } else { VM_BUG_ON_PAGE(page_count(p), p); } - set_compound_head(p, page); + if (i != 0) + set_compound_head(p, page); } atomic_set(compound_mapcount_ptr(page), -1); atomic_set(compound_pincount_ptr(page), 0); return true; out_error: - /* undo tail page modifications made above */ - for (j = 1; j < i; j++) { + /* undo page modifications made above */ + for (j = 0; j < i; j++) { p = nth_page(page, j); - clear_compound_head(p); + if (j != 0) + clear_compound_head(p); set_page_refcounted(p); } /* need to clear PG_reserved on remaining tail pages */ @@ -1936,6 +1937,7 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, int order = huge_page_order(h); struct page *page; bool alloc_try_hard = true; + bool retry = true; /* * By default we always try hard to allocate the page with @@ -1951,7 +1953,21 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_mask |= __GFP_RETRY_MAYFAIL; if (nid == NUMA_NO_NODE) nid = numa_mem_id(); +retry: page = __alloc_pages(gfp_mask, order, nid, nmask); + + /* Freeze head page */ + if (page && !page_ref_freeze(page, 1)) { + __free_pages(page, order); + if (retry) { /* retry once */ + retry = false; + goto retry; + } + /* WOW! twice in a row. */ + pr_warn("HugeTLB head page unexpected inflated ref count\n"); + page = NULL; + } + if (page) __count_vm_event(HTLB_BUDDY_PGALLOC); else @@ -1979,6 +1995,9 @@ static struct page *alloc_buddy_huge_page(struct hstate *h, /* * Common helper to allocate a fresh hugetlb page. All specific allocators * should use this function to get new hugetlb pages + * + * Note that returned page is 'frozen': ref count of head page and all tail + * pages is zero. */ static struct page *alloc_fresh_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask, @@ -2036,7 +2055,7 @@ static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed, if (!page) return 0; - put_page(page); /* free it into the hugepage allocator */ + free_huge_page(page); /* free it into the hugepage allocator */ return 1; } @@ -2193,10 +2212,9 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) * Allocates a fresh surplus page from the page allocator. */ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, - int nid, nodemask_t *nmask, bool zero_ref) + int nid, nodemask_t *nmask) { struct page *page = NULL; - bool retry = false; if (hstate_is_gigantic(h)) return NULL; @@ -2206,7 +2224,6 @@ static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock_irq(&hugetlb_lock); -retry: page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask, NULL); if (!page) return NULL; @@ -2222,34 +2239,10 @@ retry: if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { SetHPageTemporary(page); spin_unlock_irq(&hugetlb_lock); - put_page(page); + free_huge_page(page); return NULL; } - if (zero_ref) { - /* - * Caller requires a page with zero ref count. - * We will drop ref count here. If someone else is holding - * a ref, the page will be freed when they drop it. Abuse - * temporary page flag to accomplish this. - */ - SetHPageTemporary(page); - if (!put_page_testzero(page)) { - /* - * Unexpected inflated ref count on freshly allocated - * huge. Retry once. - */ - pr_info("HugeTLB unexpected inflated ref count on freshly allocated page\n"); - spin_unlock_irq(&hugetlb_lock); - if (retry) - return NULL; - - retry = true; - goto retry; - } - ClearHPageTemporary(page); - } - h->surplus_huge_pages++; h->surplus_huge_pages_node[page_to_nid(page)]++; @@ -2271,6 +2264,9 @@ static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, if (!page) return NULL; + /* fresh huge pages are frozen */ + set_page_refcounted(page); + /* * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference @@ -2298,14 +2294,14 @@ struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, gfp_t gfp = gfp_mask | __GFP_NOWARN; gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); - page = alloc_surplus_huge_page(h, gfp, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp, nid, nodemask); /* Fallback to all nodes if page==NULL */ nodemask = NULL; } if (!page) - page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask, false); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } @@ -2375,7 +2371,7 @@ retry: spin_unlock_irq(&hugetlb_lock); for (i = 0; i < needed; i++) { page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), - NUMA_NO_NODE, NULL, true); + NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; break; @@ -2737,7 +2733,6 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, { gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; int nid = page_to_nid(old_page); - bool alloc_retry = false; struct page *new_page; int ret = 0; @@ -2748,30 +2743,9 @@ static int alloc_and_dissolve_huge_page(struct hstate *h, struct page *old_page, * the pool. This simplifies and let us do most of the processing * under the lock. */ -alloc_retry: new_page = alloc_buddy_huge_page(h, gfp_mask, nid, NULL, NULL); if (!new_page) return -ENOMEM; - /* - * If all goes well, this page will be directly added to the free - * list in the pool. For this the ref count needs to be zero. - * Attempt to drop now, and retry once if needed. It is VERY - * unlikely there is another ref on the page. - * - * If someone else has a reference to the page, it will be freed - * when they drop their ref. Abuse temporary page flag to accomplish - * this. Retry once if there is an inflated ref count. - */ - SetHPageTemporary(new_page); - if (!put_page_testzero(new_page)) { - if (alloc_retry) - return -EBUSY; - - alloc_retry = true; - goto alloc_retry; - } - ClearHPageTemporary(new_page); - __prep_new_huge_page(h, new_page); retry: @@ -2951,6 +2925,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, } spin_lock_irq(&hugetlb_lock); list_add(&page->lru, &h->hugepage_activelist); + set_page_refcounted(page); /* Fall through */ } hugetlb_cgroup_commit_charge(idx, pages_per_huge_page(h), h_cg, page); @@ -3055,7 +3030,7 @@ static void __init gather_bootmem_prealloc(void) if (prep_compound_gigantic_page(page, huge_page_order(h))) { WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); - put_page(page); /* add to the hugepage allocator */ + free_huge_page(page); /* add to the hugepage allocator */ } else { /* VERY unlikely inflated ref count on a tail page */ free_gigantic_page(page, huge_page_order(h)); @@ -3087,7 +3062,7 @@ static void __init hugetlb_hstate_alloc_pages_onenode(struct hstate *h, int nid) &node_states[N_MEMORY], NULL); if (!page) break; - put_page(page); /* free it into the hugepage allocator */ + free_huge_page(page); /* free it into the hugepage allocator */ } cond_resched(); } @@ -3478,9 +3453,8 @@ static int demote_free_huge_page(struct hstate *h, struct page *page) else prep_compound_page(subpage, target_hstate->order); set_page_private(subpage, 0); - set_page_refcounted(subpage); prep_new_huge_page(target_hstate, subpage, nid); - put_page(subpage); + free_huge_page(subpage); } mutex_unlock(&target_hstate->resize_lock); From e3e486e634bfd652036292c3d66f9d388614ffe6 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Sat, 17 Sep 2022 21:56:54 +0800 Subject: [PATCH 319/350] mm/damon: rename damon_pageout_score() to damon_cold_score() In the beginning there is only one damos_action 'DAMOS_PAGEOUT' that need to get the coldness score of a region for a scheme, which using damon_pageout_score() to do that. But now there are also other damos_action actions need the coldness score, so rename it to damon_cold_score() to make more sense. Link: https://lkml.kernel.org/r/1663423014-28907-1-git-send-email-kaixuxia@tencent.com Signed-off-by: Kaixu Xia Reviewed-by: SeongJae Park Signed-off-by: Andrew Morton --- mm/damon/ops-common.c | 2 +- mm/damon/ops-common.h | 2 +- mm/damon/paddr.c | 4 ++-- mm/damon/vaddr.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/mm/damon/ops-common.c b/mm/damon/ops-common.c index 9310df72e1c5..75409601f934 100644 --- a/mm/damon/ops-common.c +++ b/mm/damon/ops-common.c @@ -130,7 +130,7 @@ int damon_hot_score(struct damon_ctx *c, struct damon_region *r, return hotness; } -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s) { int hotness = damon_hot_score(c, r, s); diff --git a/mm/damon/ops-common.h b/mm/damon/ops-common.h index 52329ff361cd..8d82d3722204 100644 --- a/mm/damon/ops-common.h +++ b/mm/damon/ops-common.h @@ -12,7 +12,7 @@ struct page *damon_get_page(unsigned long pfn); void damon_ptep_mkold(pte_t *pte, struct mm_struct *mm, unsigned long addr); void damon_pmdp_mkold(pmd_t *pmd, struct mm_struct *mm, unsigned long addr); -int damon_pageout_score(struct damon_ctx *c, struct damon_region *r, +int damon_cold_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); int damon_hot_score(struct damon_ctx *c, struct damon_region *r, struct damos *s); diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index dfeebffe82f4..e1a4315c4be6 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -287,11 +287,11 @@ static int damon_pa_scheme_score(struct damon_ctx *context, { switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); case DAMOS_LRU_PRIO: return damon_hot_score(context, r, scheme); case DAMOS_LRU_DEPRIO: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index f53c2ff2bcc8..ea94e0b2c311 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -673,7 +673,7 @@ static int damon_va_scheme_score(struct damon_ctx *context, switch (scheme->action) { case DAMOS_PAGEOUT: - return damon_pageout_score(context, r, scheme); + return damon_cold_score(context, r, scheme); default: break; } From a57ae9ef9e1a20b68ae841a8cab7aff3f000ed9d Mon Sep 17 00:00:00 2001 From: Ran Xiaokai Date: Sun, 18 Sep 2022 02:56:40 +0000 Subject: [PATCH 320/350] mm/page_alloc: update comments for rmqueue() Since commit 44042b449872 ("mm/page_alloc: allow high-order pages to be stored on the per-cpu lists"), the per-cpu page allocators (PCP) is not only for order-0 pages. Update the comments. Link: https://lkml.kernel.org/r/20220918025640.208586-1-ran.xiaokai@zte.com.cn Signed-off-by: Ran Xiaokai Cc: Mel Gorman Signed-off-by: Andrew Morton --- mm/page_alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 66f7778732fb..12b6184cbbed 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3810,7 +3810,8 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, } /* - * Allocate a page from the given zone. Use pcplists for order-0 allocations. + * Allocate a page from the given zone. + * Use pcplists for THP or "cheap" high-order allocations. */ /* From 30b6242c49cd2a98def3bb2feee68d82a0e9686b Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Tue, 20 Sep 2022 16:35:30 +0000 Subject: [PATCH 321/350] mm/damon/sysfs: return 'err' value when call kstrtoul() failed We had better return the 'err' value when calling kstrtoul() failed, so the user will know why it really fails, there do little change, let it return the 'err' value when failed. Link: https://lkml.kernel.org/r/6329ebe0.050a0220.ec4bd.297cSMTPIN_ADDED_BROKEN@mx.google.com Suggested-by: SeongJae Park Signed-off-by: Xin Hao Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Reviewed-by: Xin Hao Signed-off-by: Andrew Morton --- mm/damon/sysfs.c | 46 ++++++++++++++-------------------------------- 1 file changed, 14 insertions(+), 32 deletions(-) diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c index 0cca1909bf67..455215a5c059 100644 --- a/mm/damon/sysfs.c +++ b/mm/damon/sysfs.c @@ -58,7 +58,7 @@ static ssize_t min_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 0, &min); if (err) - return -EINVAL; + return err; range->min = min; return count; @@ -83,7 +83,7 @@ static ssize_t max_store(struct kobject *kobj, struct kobj_attribute *attr, err = kstrtoul(buf, 0, &max); if (err) - return -EINVAL; + return err; range->max = max; return count; @@ -291,9 +291,7 @@ static ssize_t interval_us_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->interval_us); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t high_show(struct kobject *kobj, @@ -312,9 +310,7 @@ static ssize_t high_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->high); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t mid_show(struct kobject *kobj, @@ -333,9 +329,7 @@ static ssize_t mid_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->mid); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t low_show(struct kobject *kobj, @@ -354,9 +348,7 @@ static ssize_t low_store(struct kobject *kobj, struct damon_sysfs_watermarks, kobj); int err = kstrtoul(buf, 0, &watermarks->low); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_watermarks_release(struct kobject *kobj) @@ -437,9 +429,7 @@ static ssize_t sz_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->sz); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t nr_accesses_permil_show(struct kobject *kobj, @@ -458,9 +448,7 @@ static ssize_t nr_accesses_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->nr_accesses); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t age_permil_show(struct kobject *kobj, @@ -479,9 +467,7 @@ static ssize_t age_permil_store(struct kobject *kobj, struct damon_sysfs_weights, kobj); int err = kstrtouint(buf, 0, &weights->age); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_weights_release(struct kobject *kobj) @@ -1111,9 +1097,7 @@ static ssize_t start_store(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region, kobj); int err = kstrtoul(buf, 0, ®ion->start); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static ssize_t end_show(struct kobject *kobj, struct kobj_attribute *attr, @@ -1132,9 +1116,7 @@ static ssize_t end_store(struct kobject *kobj, struct kobj_attribute *attr, struct damon_sysfs_region, kobj); int err = kstrtoul(buf, 0, ®ion->end); - if (err) - return -EINVAL; - return count; + return err ? err : count; } static void damon_sysfs_region_release(struct kobject *kobj) @@ -1528,7 +1510,7 @@ static ssize_t sample_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->sample_us = us; return count; @@ -1552,7 +1534,7 @@ static ssize_t aggr_us_store(struct kobject *kobj, struct kobj_attribute *attr, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->aggr_us = us; return count; @@ -1576,7 +1558,7 @@ static ssize_t update_us_store(struct kobject *kobj, int err = kstrtoul(buf, 0, &us); if (err) - return -EINVAL; + return err; intervals->update_us = us; return count; From 233f0b31bd9503ce2be7be0bde69c67287c8a741 Mon Sep 17 00:00:00 2001 From: Kaixu Xia Date: Tue, 20 Sep 2022 16:53:22 +0000 Subject: [PATCH 322/350] mm/damon: deduplicate damon_{reclaim,lru_sort}_apply_parameters() The bodies of damon_{reclaim,lru_sort}_apply_parameters() contain duplicates. This commit adds a common function damon_set_region_biggest_system_ram_default() to remove the duplicates. Link: https://lkml.kernel.org/r/6329f00d.a70a0220.9bb29.3678SMTPIN_ADDED_BROKEN@mx.google.com Signed-off-by: Kaixu Xia Suggested-by: SeongJae Park Reviewed-by: SeongJae Park Signed-off-by: SeongJae Park Signed-off-by: Andrew Morton --- include/linux/damon.h | 3 ++- mm/damon/core.c | 35 ++++++++++++++++++++++++++++++++++- mm/damon/lru_sort.c | 13 +++---------- mm/damon/reclaim.c | 13 +++---------- 4 files changed, 42 insertions(+), 22 deletions(-) diff --git a/include/linux/damon.h b/include/linux/damon.h index e7808a84675f..ed5470f50bab 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -557,7 +557,8 @@ static inline bool damon_target_has_pid(const struct damon_ctx *ctx) int damon_start(struct damon_ctx **ctxs, int nr_ctxs, bool exclusive); int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end); +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end); #endif /* CONFIG_DAMON */ diff --git a/mm/damon/core.c b/mm/damon/core.c index 9c80c6eb00c2..4de8c7c52979 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -1245,7 +1245,8 @@ static int walk_system_ram(struct resource *res, void *arg) * Find biggest 'System RAM' resource and store its start and end address in * @start and @end, respectively. If no System RAM is found, returns false. */ -bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) +static bool damon_find_biggest_system_ram(unsigned long *start, + unsigned long *end) { struct damon_system_ram_region arg = {}; @@ -1259,6 +1260,38 @@ bool damon_find_biggest_system_ram(unsigned long *start, unsigned long *end) return true; } +/** + * damon_set_region_biggest_system_ram_default() - Set the region of the given + * monitoring target as requested, or biggest 'System RAM'. + * @t: The monitoring target to set the region. + * @start: The pointer to the start address of the region. + * @end: The pointer to the end address of the region. + * + * This function sets the region of @t as requested by @start and @end. If the + * values of @start and @end are zero, however, this function finds the biggest + * 'System RAM' resource and sets the region to cover the resource. In the + * latter case, this function saves the start and end addresses of the resource + * in @start and @end, respectively. + * + * Return: 0 on success, negative error code otherwise. + */ +int damon_set_region_biggest_system_ram_default(struct damon_target *t, + unsigned long *start, unsigned long *end) +{ + struct damon_addr_range addr_range; + + if (*start > *end) + return -EINVAL; + + if (!*start && !*end && + !damon_find_biggest_system_ram(start, end)) + return -EINVAL; + + addr_range.start = *start; + addr_range.end = *end; + return damon_set_regions(t, &addr_range, 1); +} + static int __init damon_init(void) { damon_region_cache = KMEM_CACHE(damon_region, 0); diff --git a/mm/damon/lru_sort.c b/mm/damon/lru_sort.c index d7eb72b41cb6..efbc2bda8b9c 100644 --- a/mm/damon/lru_sort.c +++ b/mm/damon/lru_sort.c @@ -188,7 +188,6 @@ static struct damos *damon_lru_sort_new_cold_scheme(unsigned int cold_thres) static int damon_lru_sort_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; unsigned int hot_thres, cold_thres; int err = 0; @@ -211,15 +210,9 @@ static int damon_lru_sort_apply_parameters(void) return -ENOMEM; damon_add_scheme(ctx, scheme); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !damon_find_biggest_system_ram(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_lru_sort_turn(bool on) diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index 3d59ab11b7b3..162c9b1ca00f 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -144,7 +144,6 @@ static struct damos *damon_reclaim_new_scheme(void) static int damon_reclaim_apply_parameters(void) { struct damos *scheme; - struct damon_addr_range addr_range; int err = 0; err = damon_set_attrs(ctx, &damon_reclaim_mon_attrs); @@ -157,15 +156,9 @@ static int damon_reclaim_apply_parameters(void) return -ENOMEM; damon_set_schemes(ctx, &scheme, 1); - if (monitor_region_start > monitor_region_end) - return -EINVAL; - if (!monitor_region_start && !monitor_region_end && - !damon_find_biggest_system_ram(&monitor_region_start, - &monitor_region_end)) - return -EINVAL; - addr_range.start = monitor_region_start; - addr_range.end = monitor_region_end; - return damon_set_regions(target, &addr_range, 1); + return damon_set_region_biggest_system_ram_default(target, + &monitor_region_start, + &monitor_region_end); } static int damon_reclaim_turn(bool on) From 2eb989195d9a361d13d66ffb8738847649e080ad Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:33 +0800 Subject: [PATCH 323/350] mm: memcontrol: use memcg_kmem_enabled in count_objcg_event Patch series "mm: memcontrol: cleanup and optimize for two accounting params", v2. This patch (of 2): There are currently two helpers for checking if cgroup kmem accounting is enabled: - mem_cgroup_kmem_disabled - memcg_kmem_enabled mem_cgroup_kmem_disabled is a simple helper that returns true if cgroup.memory=nokmem is specified, otherwise returns false. memcg_kmem_enabled is a bit different, it returns true if cgroup.memory=nokmem is not specified and there was at least one non-root memory control enabled cgroup ever created. This help improve performance when kmem accounting was not actually activated. And it's optimized with static branch. The usage of mem_cgroup_kmem_disabled is for sub-systems that need to preallocate data for kmem accounting since they could be initialized before kmem accounting is activated. But count_objcg_event doesn't need that, so using memcg_kmem_enabled is better here. Link: https://lkml.kernel.org/r/20220919180634.45958-1-ryncsn@gmail.com Link: https://lkml.kernel.org/r/20220919180634.45958-2-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton --- include/linux/memcontrol.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index dc7d40e575d5..ef479e554253 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1778,7 +1778,7 @@ static inline void count_objcg_event(struct obj_cgroup *objcg, { struct mem_cgroup *memcg; - if (mem_cgroup_kmem_disabled()) + if (!memcg_kmem_enabled()) return; rcu_read_lock(); From c1b8fdae62e59904ecdfe4f50410575ea02fec18 Mon Sep 17 00:00:00 2001 From: Kairui Song Date: Tue, 20 Sep 2022 02:06:34 +0800 Subject: [PATCH 324/350] mm: memcontrol: make cgroup_memory_noswap a static key cgroup_memory_noswap is used in many hot path, so make it a static key to lower the kernel overhead. Using 8G of ZRAM as SWAP, benchmark using `perf stat -d -d -d --repeat 100` with the following code snip in a non-root cgroup: #include #include #include #include #define MB 1024UL * 1024UL int main(int argc, char **argv){ void *p = mmap(NULL, 8000 * MB, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); memset(p, 0xff, 8000 * MB); madvise(p, 8000 * MB, MADV_PAGEOUT); memset(p, 0xff, 8000 * MB); return 0; } Before: 7,021.43 msec task-clock # 0.967 CPUs utilized ( +- 0.03% ) 4,010 context-switches # 573.853 /sec ( +- 0.01% ) 0 cpu-migrations # 0.000 /sec 2,052,057 page-faults # 293.661 K/sec ( +- 0.00% ) 12,616,546,027 cycles # 1.805 GHz ( +- 0.06% ) (39.92%) 156,823,666 stalled-cycles-frontend # 1.25% frontend cycles idle ( +- 0.10% ) (40.25%) 310,130,812 stalled-cycles-backend # 2.47% backend cycles idle ( +- 4.39% ) (40.73%) 18,692,516,591 instructions # 1.49 insn per cycle # 0.01 stalled cycles per insn ( +- 0.04% ) (40.75%) 4,907,447,976 branches # 702.283 M/sec ( +- 0.05% ) (40.30%) 13,002,578 branch-misses # 0.26% of all branches ( +- 0.08% ) (40.48%) 7,069,786,296 L1-dcache-loads # 1.012 G/sec ( +- 0.03% ) (40.32%) 649,385,847 L1-dcache-load-misses # 9.13% of all L1-dcache accesses ( +- 0.07% ) (40.10%) 1,485,448,688 L1-icache-loads # 212.576 M/sec ( +- 0.15% ) (39.49%) 31,628,457 L1-icache-load-misses # 2.13% of all L1-icache accesses ( +- 0.40% ) (39.57%) 6,667,311 dTLB-loads # 954.129 K/sec ( +- 0.21% ) (39.50%) 5,668,555 dTLB-load-misses # 86.40% of all dTLB cache accesses ( +- 0.12% ) (39.03%) 765 iTLB-loads # 109.476 /sec ( +- 21.81% ) (39.44%) 4,370,351 iTLB-load-misses # 214320.09% of all iTLB cache accesses ( +- 1.44% ) (39.86%) 149,207,254 L1-dcache-prefetches # 21.352 M/sec ( +- 0.13% ) (40.27%) 7.25869 +- 0.00203 seconds time elapsed ( +- 0.03% ) After: 6,576.16 msec task-clock # 0.953 CPUs utilized ( +- 0.10% ) 4,020 context-switches # 605.595 /sec ( +- 0.01% ) 0 cpu-migrations # 0.000 /sec 2,052,056 page-faults # 309.133 K/sec ( +- 0.00% ) 11,967,619,180 cycles # 1.803 GHz ( +- 0.36% ) (38.76%) 161,259,240 stalled-cycles-frontend # 1.38% frontend cycles idle ( +- 0.27% ) (36.58%) 253,605,302 stalled-cycles-backend # 2.16% backend cycles idle ( +- 4.45% ) (34.78%) 19,328,171,892 instructions # 1.65 insn per cycle # 0.01 stalled cycles per insn ( +- 0.10% ) (31.46%) 5,213,967,902 branches # 785.461 M/sec ( +- 0.18% ) (30.68%) 12,385,170 branch-misses # 0.24% of all branches ( +- 0.26% ) (34.13%) 7,271,687,822 L1-dcache-loads # 1.095 G/sec ( +- 0.12% ) (35.29%) 649,873,045 L1-dcache-load-misses # 8.93% of all L1-dcache accesses ( +- 0.11% ) (41.41%) 1,950,037,608 L1-icache-loads # 293.764 M/sec ( +- 0.33% ) (43.11%) 31,365,566 L1-icache-load-misses # 1.62% of all L1-icache accesses ( +- 0.39% ) (45.89%) 6,767,809 dTLB-loads # 1.020 M/sec ( +- 0.47% ) (48.42%) 6,339,590 dTLB-load-misses # 95.43% of all dTLB cache accesses ( +- 0.50% ) (46.60%) 736 iTLB-loads # 110.875 /sec ( +- 1.79% ) (48.60%) 4,314,836 iTLB-load-misses # 518653.73% of all iTLB cache accesses ( +- 0.63% ) (42.91%) 144,950,156 L1-dcache-prefetches # 21.836 M/sec ( +- 0.37% ) (41.39%) 6.89935 +- 0.00703 seconds time elapsed ( +- 0.10% ) The performance is clearly better. There is no significant hotspot improvement according to perf report, as there are quite a few callers of memcg_swap_enabled and do_memsw_account (which calls memcg_swap_enabled). Many pieces of minor optimizations resulted in lower overhead for the branch predictor, and bettter performance. Link: https://lkml.kernel.org/r/20220919180634.45958-3-ryncsn@gmail.com Signed-off-by: Kairui Song Acked-by: Michal Hocko Acked-by: Shakeel Butt Acked-by: Roman Gushchin Acked-by: Muchun Song Cc: Johannes Weiner Signed-off-by: Andrew Morton --- mm/memcontrol.c | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ac6440daf208..6b74bbdc2659 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -90,9 +90,18 @@ static bool cgroup_memory_nokmem __ro_after_init; /* Whether the swap controller is active */ #ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __ro_after_init; +static bool cgroup_memory_noswap __initdata; + +static DEFINE_STATIC_KEY_FALSE(memcg_swap_enabled_key); +static inline bool memcg_swap_enabled(void) +{ + return static_branch_likely(&memcg_swap_enabled_key); +} #else -#define cgroup_memory_noswap 1 +static inline bool memcg_swap_enabled(void) +{ + return false; +} #endif #ifdef CONFIG_CGROUP_WRITEBACK @@ -102,7 +111,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap; + return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg_swap_enabled(); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -7370,7 +7379,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (!cgroup_memory_noswap && memcg != swap_memcg) { + if (memcg_swap_enabled() && memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7422,7 +7431,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg) && + if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7454,7 +7463,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (!cgroup_memory_noswap && !mem_cgroup_is_root(memcg)) { + if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7470,7 +7479,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7487,7 +7496,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (vm_swap_full()) return true; - if (cgroup_memory_noswap || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) return false; memcg = folio_memcg(folio); @@ -7795,6 +7804,8 @@ static int __init mem_cgroup_swap_init(void) if (cgroup_memory_noswap) return 0; + static_branch_enable(&memcg_swap_enabled_key); + WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) From 958f32ce832ba781ac20e11bb2d12a9352ea28fc Mon Sep 17 00:00:00 2001 From: Liu Shixin Date: Fri, 23 Sep 2022 12:21:13 +0800 Subject: [PATCH 325/350] mm: hugetlb: fix UAF in hugetlb_handle_userfault The vma_lock and hugetlb_fault_mutex are dropped before handling userfault and reacquire them again after handle_userfault(), but reacquire the vma_lock could lead to UAF[1,2] due to the following race, hugetlb_fault hugetlb_no_page /*unlock vma_lock */ hugetlb_handle_userfault handle_userfault /* unlock mm->mmap_lock*/ vm_mmap_pgoff do_mmap mmap_region munmap_vma_range /* clean old vma */ /* lock vma_lock again <--- UAF */ /* unlock vma_lock */ Since the vma_lock will unlock immediately after hugetlb_handle_userfault(), let's drop the unneeded lock and unlock in hugetlb_handle_userfault() to fix the issue. [1] https://lore.kernel.org/linux-mm/000000000000d5e00a05e834962e@google.com/ [2] https://lore.kernel.org/linux-mm/20220921014457.1668-1-liuzixian4@huawei.com/ Link: https://lkml.kernel.org/r/20220923042113.137273-1-liushixin2@huawei.com Fixes: 1a1aad8a9b7b ("userfaultfd: hugetlbfs: add userfaultfd hugetlb hook") Signed-off-by: Liu Shixin Signed-off-by: Kefeng Wang Reported-by: syzbot+193f9cee8638750b23cf@syzkaller.appspotmail.com Reported-by: Liu Zixian Reviewed-by: Mike Kravetz Cc: David Hildenbrand Cc: John Hubbard Cc: Muchun Song Cc: Sidhartha Kumar Cc: [4.14+] Signed-off-by: Andrew Morton --- mm/hugetlb.c | 37 +++++++++++++++++-------------------- 1 file changed, 17 insertions(+), 20 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 2182134216f0..3c1316ad54b5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5489,7 +5489,6 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, unsigned long addr, unsigned long reason) { - vm_fault_t ret; u32 hash; struct vm_fault vmf = { .vma = vma, @@ -5507,18 +5506,14 @@ static inline vm_fault_t hugetlb_handle_userfault(struct vm_area_struct *vma, }; /* - * vma_lock and hugetlb_fault_mutex must be - * dropped before handling userfault. Reacquire - * after handling fault to make calling code simpler. + * vma_lock and hugetlb_fault_mutex must be dropped before handling + * userfault. Also mmap_lock could be dropped due to handling + * userfault, any vma operation should be careful from here. */ hugetlb_vma_unlock_read(vma); hash = hugetlb_fault_mutex_hash(mapping, idx); mutex_unlock(&hugetlb_fault_mutex_table[hash]); - ret = handle_userfault(&vmf, reason); - mutex_lock(&hugetlb_fault_mutex_table[hash]); - hugetlb_vma_lock_read(vma); - - return ret; + return handle_userfault(&vmf, reason); } static vm_fault_t hugetlb_no_page(struct mm_struct *mm, @@ -5536,6 +5531,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, spinlock_t *ptl; unsigned long haddr = address & huge_page_mask(h); bool new_page, new_pagecache_page = false; + u32 hash = hugetlb_fault_mutex_hash(mapping, idx); /* * Currently, we are forced to kill the process in the event the @@ -5546,7 +5542,7 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (is_vma_resv_set(vma, HPAGE_RESV_UNMAPPED)) { pr_warn_ratelimited("PID %d killed due to inadequate hugepage pool\n", current->pid); - return ret; + goto out; } /* @@ -5560,12 +5556,10 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (idx >= size) goto out; /* Check for page in userfault range */ - if (userfaultfd_missing(vma)) { - ret = hugetlb_handle_userfault(vma, mapping, idx, + if (userfaultfd_missing(vma)) + return hugetlb_handle_userfault(vma, mapping, idx, flags, haddr, address, VM_UFFD_MISSING); - goto out; - } page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { @@ -5631,10 +5625,9 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, if (userfaultfd_minor(vma)) { unlock_page(page); put_page(page); - ret = hugetlb_handle_userfault(vma, mapping, idx, + return hugetlb_handle_userfault(vma, mapping, idx, flags, haddr, address, VM_UFFD_MINOR); - goto out; } } @@ -5692,6 +5685,8 @@ static vm_fault_t hugetlb_no_page(struct mm_struct *mm, unlock_page(page); out: + hugetlb_vma_unlock_read(vma); + mutex_unlock(&hugetlb_fault_mutex_table[hash]); return ret; backout: @@ -5789,11 +5784,13 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, entry = huge_ptep_get(ptep); /* PTE markers should be handled the same way as none pte */ - if (huge_pte_none_mostly(entry)) { - ret = hugetlb_no_page(mm, vma, mapping, idx, address, ptep, + if (huge_pte_none_mostly(entry)) + /* + * hugetlb_no_page will drop vma lock and hugetlb fault + * mutex internally, which make us return immediately. + */ + return hugetlb_no_page(mm, vma, mapping, idx, address, ptep, entry, flags); - goto out_mutex; - } ret = 0; From 780a4b6fb865534fcb3aa9150942f3a719d11ce9 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:27:31 -0700 Subject: [PATCH 326/350] mm/khugepaged: check compound_order() in collapse_pte_mapped_thp() By the time we lock a page in collapse_pte_mapped_thp(), the page mapped by the address pushed onto the slot's .pte_mapped_thp[] array might have changed arbitrarily since we last looked at it. We revalidate that the page is still the head of a compound page, but we don't revalidate if the compound page is of order HPAGE_PMD_ORDER before applying rmap and page table updates. Since the kernel now supports large folios of arbitrary order, and since replacing page's pte mappings by a pmd mapping only makes sense for compound pages of order HPAGE_PMD_ORDER, revalidate that the compound order is indeed of order HPAGE_PMD_ORDER before proceeding. Link: https://lore.kernel.org/linux-mm/CAHbLzkon+2ky8v9ywGcsTUgXM_B35jt5NThYqQKXW2YV_GUacw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220922222731.1124481-1-zokeefe@google.com Signed-off-by: Zach O'Keefe Suggested-by: Yang Shi Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- mm/khugepaged.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 57af2c841b41..40fd9f7b3ed3 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1399,6 +1399,9 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) if (!PageHead(hpage)) goto drop_hpage; + if (compound_order(hpage) != HPAGE_PMD_ORDER) + goto drop_hpage; + if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) goto drop_hpage; From 0f3e2a2c4243695c5ac3fbccce18dc74c0250df6 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 11:46:50 -0700 Subject: [PATCH 327/350] mm/madvise: MADV_COLLAPSE return EAGAIN when page cannot be isolated MADV_COLLAPSE is a best-effort request that attempts to set an actionable errno value if the request cannot be fulfilled at the time. EAGAIN should be used to communicate that a resource was temporarily unavailable, but that the user may try again immediately. SCAN_DEL_PAGE_LRU is an internal result code used when a page cannot be isolated from it's LRU list. Since this, like SCAN_PAGE_LRU, is likely a transitory state, make MADV_COLLAPSE return EAGAIN so that users know they may reattempt the operation. Another important scenario to consider is race with khugepaged. khugepaged might isolate a page while MADV_COLLAPSE is interested in it. Even though racing with khugepaged might mean that the memory has already been collapsed, signalling an errno that is non-intrinsic to that memory or arguments provided to madvise(2) lets the user know that future attempts might (and in this case likely would) succeed, and avoids false-negative assumptions by the user. Link: https://lkml.kernel.org/r/20220922184651.1016461-1-zokeefe@google.com Fixes: 7d8faaf15545 ("mm/madvise: introduce MADV_COLLAPSE sync hugepage collapse") Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- mm/khugepaged.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 40fd9f7b3ed3..b3ebe90a66d9 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2372,6 +2372,7 @@ static int madvise_collapse_errno(enum scan_result r) /* Resource temporary unavailable - trying again might succeed */ case SCAN_PAGE_LOCK: case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: return -EAGAIN; /* * Other: Trying again likely not to succeed / error intrinsic to @@ -2454,6 +2455,7 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, case SCAN_PAGE_LOCK: case SCAN_PAGE_COMPOUND: case SCAN_PAGE_LRU: + case SCAN_DEL_PAGE_LRU: last_fail = result; break; default: From 3505c8e62acfb62908ffd7d2d6c5971657596d1d Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 11:46:51 -0700 Subject: [PATCH 328/350] selftests/vm: retry on EAGAIN for MADV_COLLAPSE selftest MADV_COLLAPSE is a best-effort request that will set errno to an actionable value if the request cannot be performed. For example, if pages are not found on the LRU, or if they are currently locked by something else, MADV_COLLAPSE will fail and set errno to EAGAIN to inform callers that they may try again. Since the khugepaged selftest is the first public use of MADV_COLLAPSE, set a best practice of checking errno and retrying on EAGAIN. Link: https://lkml.kernel.org/r/20220922184651.1016461-2-zokeefe@google.com Fixes: 9330694de59f ("selftests/vm: add MADV_COLLAPSE collapse context to selftests") Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index b77b1e28cdb3..b55dc331af13 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1,4 +1,5 @@ #define _GNU_SOURCE +#include #include #include #include @@ -477,6 +478,26 @@ static void fill_memory(int *p, unsigned long start, unsigned long end) p[i * page_size / sizeof(*p)] = i + 0xdead0000; } +/* + * MADV_COLLAPSE is a best-effort request and may fail if an internal + * resource is temporarily unavailable, in which case it will set errno to + * EAGAIN. In such a case, immediately reattempt the operation one more + * time. + */ +static int madvise_collapse_retry(void *p, unsigned long size) +{ + bool retry = true; + int ret; + +retry: + ret = madvise(p, size, MADV_COLLAPSE); + if (ret && errno == EAGAIN && retry) { + retry = false; + goto retry; + } + return ret; +} + /* * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with * validate_memory()'able contents. @@ -531,7 +552,7 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, /* Clear VM_NOHUGEPAGE */ madvise(p, nr_hpages * hpage_pmd_size, MADV_HUGEPAGE); - ret = madvise(p, nr_hpages * hpage_pmd_size, MADV_COLLAPSE); + ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); if (((bool)ret) == expect) fail("Fail: Bad return value"); else if (check_huge(p, nr_hpages) != expect) From 7c6c6cc4d3a213e7303ef06ff40f6193df01839c Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:37 -0700 Subject: [PATCH 329/350] mm/shmem: add flag to enforce shmem THP in hugepage_vma_check() Patch series "mm: add file/shmem support to MADV_COLLAPSE", v4. This series builds on top of the previous "mm: userspace hugepage collapse" series which introduced the MADV_COLLAPSE madvise mode and added support for private, anonymous mappings[2], by adding support for file and shmem backed memory to CONFIG_READ_ONLY_THP_FOR_FS=y kernels. File and shmem support have been added with effort to align with existing MADV_COLLAPSE semantics and policy decisions[3]. Collapse of shmem-backed memory ignores kernel-guiding directives and heuristics including all sysfs settings (transparent_hugepage/shmem_enabled), and tmpfs huge= mount options (shmem always supports large folios). Like anonymous mappings, on successful return of MADV_COLLAPSE on file/shmem memory, the contents of memory mapped by the addresses provided will be synchronously pmd-mapped THPs. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. khugepaged has received a small improvement by association and can now detect and collapse pte-mapped THPs. However, there is still work to be done along the file collapse path. Compound pages of arbitrary order still needs to be supported and THP collapse needs to be converted to using folios in general. Eventually, we'd like to move away from the read-only and executable-mapped constraints currently imposed on eligible files and support any inode claiming huge folio support. That said, I think the series as-is covers enough to claim that MADV_COLLAPSE supports file/shmem memory. Patches 1-3 Implement the guts of the series. Patch 4 Is a tracepoint for debugging. Patches 5-9 Refactor existing khugepaged selftests to work with new memory types + new collapse tests. Patch 10 Adds a userfaultfd selftest mode to mimic a functional test of UFFDIO_REGISTER_MODE_MINOR+MADV_COLLAPSE live migration. (v4 note: "userfaultfd shmem" selftest is failing as of Sep 22 mm-unstable) [1] https://lore.kernel.org/linux-mm/YyiK8YvVcrtZo0z3@google.com/ [2] https://lore.kernel.org/linux-mm/20220706235936.2197195-1-zokeefe@google.com/ [3] https://lore.kernel.org/linux-mm/YtBmhaiPHUTkJml8@google.com/ [4] https://lore.kernel.org/linux-mm/20220922222731.1124481-1-zokeefe@google.com/ [5] https://lore.kernel.org/linux-mm/20220922184651.1016461-1-zokeefe@google.com/ This patch (of 10): Extend 'mm/thp: add flag to enforce sysfs THP in hugepage_vma_check()' to shmem, allowing callers to ignore /sys/kernel/transparent_hugepage/shmem_enabled and tmpfs huge= mount. This is intended to be used by MADV_COLLAPSE, and the rationale is analogous to the anon/file case: MADV_COLLAPSE is not coupled to directives that advise the kernel's decisions on when THPs should be considered eligible. shmem/tmpfs always claims large folio support, regardless of sysfs or mount options. [shy828301@gmail.com: test shmem_huge_force explicitly] Link: https://lore.kernel.org/linux-mm/CAHbLzko3A5-TpS0BgBeKkx5cuOkWgLvWXQH=TdgW-baO4rPtdg@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220922224046.1143204-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220907144521.3115321-2-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-2-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 10 ++++++---- mm/huge_memory.c | 2 +- mm/shmem.c | 18 ++++++++++-------- 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index f24071e3c826..d500ea967dc7 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -92,11 +92,13 @@ extern struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, extern void shmem_truncate_range(struct inode *inode, loff_t start, loff_t end); int shmem_unuse(unsigned int type); -extern bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index); -static inline bool shmem_huge_enabled(struct vm_area_struct *vma) +extern bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force); +static inline bool shmem_huge_enabled(struct vm_area_struct *vma, + bool shmem_huge_force) { - return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff); + return shmem_is_huge(vma, file_inode(vma->vm_file), vma->vm_pgoff, + shmem_huge_force); } extern unsigned long shmem_swap_usage(struct vm_area_struct *vma); extern unsigned long shmem_partial_swap_usage(struct address_space *mapping, diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 4938defe4e73..1cc4a5f4791e 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -119,7 +119,7 @@ bool hugepage_vma_check(struct vm_area_struct *vma, unsigned long vm_flags, * own flags. */ if (!in_pf && shmem_file(vma->vm_file)) - return shmem_huge_enabled(vma); + return shmem_huge_enabled(vma, !enforce_sysfs); /* Enforce sysfs THP requirements as necessary */ if (enforce_sysfs && diff --git a/mm/shmem.c b/mm/shmem.c index 275899bacbea..cabe48d55a64 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -462,20 +462,22 @@ static bool shmem_confirm_swap(struct address_space *mapping, static int shmem_huge __read_mostly = SHMEM_HUGE_NEVER; -bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { loff_t i_size; if (!S_ISREG(inode->i_mode)) return false; - if (shmem_huge == SHMEM_HUGE_DENY) - return false; if (vma && ((vma->vm_flags & VM_NOHUGEPAGE) || test_bit(MMF_DISABLE_THP, &vma->vm_mm->flags))) return false; + if (shmem_huge_force) + return true; if (shmem_huge == SHMEM_HUGE_FORCE) return true; + if (shmem_huge == SHMEM_HUGE_DENY) + return false; switch (SHMEM_SB(inode->i_sb)->huge) { case SHMEM_HUGE_ALWAYS: @@ -670,8 +672,8 @@ static long shmem_unused_huge_count(struct super_block *sb, #define shmem_huge SHMEM_HUGE_DENY -bool shmem_is_huge(struct vm_area_struct *vma, - struct inode *inode, pgoff_t index) +bool shmem_is_huge(struct vm_area_struct *vma, struct inode *inode, + pgoff_t index, bool shmem_huge_force) { return false; } @@ -1058,7 +1060,7 @@ static int shmem_getattr(struct user_namespace *mnt_userns, STATX_ATTR_NODUMP); generic_fillattr(&init_user_ns, inode, stat); - if (shmem_is_huge(NULL, inode, 0)) + if (shmem_is_huge(NULL, inode, 0, false)) stat->blksize = HPAGE_PMD_SIZE; if (request_mask & STATX_BTIME) { @@ -1900,7 +1902,7 @@ repeat: return 0; } - if (!shmem_is_huge(vma, inode, index)) + if (!shmem_is_huge(vma, inode, index, false)) goto alloc_nohuge; huge_gfp = vma_thp_gfp_mask(vma); From 58ac9a8993a13ebcbb0682ede0e3a158b4a41b28 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:38 -0700 Subject: [PATCH 330/350] mm/khugepaged: attempt to map file/shmem-backed pte-mapped THPs by pmds The main benefit of THPs are that they can be mapped at the pmd level, increasing the likelihood of TLB hit and spending less cycles in page table walks. pte-mapped hugepages - that is - hugepage-aligned compound pages of order HPAGE_PMD_ORDER mapped by ptes - although being contiguous in physical memory, don't have this advantage. In fact, one could argue they are detrimental to system performance overall since they occupy a precious hugepage-aligned/sized region of physical memory that could otherwise be used more effectively. Additionally, pte-mapped hugepages can be the cheapest memory to collapse for khugepaged since no new hugepage allocation or copying of memory contents is necessary - we only need to update the mapping page tables. In the anonymous collapse path, we are able to collapse pte-mapped hugepages (albeit, perhaps suboptimally), but the file/shmem path makes no effort when compound pages (of any order) are encountered. Identify pte-mapped hugepages in the file/shmem collapse path. The final step of which makes a racy check of the value of the pmd to ensure it maps a pte table. This should be fine, since races that result in false-positive (i.e. attempt collapse even though we shouldn't) will fail later in collapse_pte_mapped_thp() once we actually lock mmap_lock and reinspect the pmd value. Races that result in false-negatives (i.e. where we decide to not attempt collapse, but should have) shouldn't be an issue, since in the worst case, we do nothing - which is what we've done up to this point. We make a similar check in retract_page_tables(). If we do think we've found a pte-mapped hugepgae in khugepaged context, attempt to update page tables mapping this hugepage. Note that these collapses still count towards the /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed counter, and if the pte-mapped hugepage was also mapped into multiple process' address spaces, could be incremented for each page table update. Since we increment the counter when a pte-mapped hugepage is successfully added to the list of to-collapse pte-mapped THPs, it's possible that we never actually update the page table either. This is different from how file/shmem pages_collapsed accounting works today where only a successful page cache update is counted (it's also possible here that no page tables are actually changed). Though it incurs some slop, this is preferred to either not accounting for the event at all, or plumbing through data in struct mm_slot on whether to account for the collapse or not. Also note that work still needs to be done to support arbitrary compound pages, and that this should all be converted to using folios. [shy828301@gmail.com: Spelling mistake, update comment, and add Documentation] Link: https://lore.kernel.org/linux-mm/CAHbLzkpHwZxFzjfX9nxVoRhzup8WMjMfyL6Xiq8mZ9M-N3ombw@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-3-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-3-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- Documentation/admin-guide/mm/transhuge.rst | 9 ++- include/trace/events/huge_memory.h | 1 + mm/khugepaged.c | 69 +++++++++++++++++++--- 3 files changed, 71 insertions(+), 8 deletions(-) diff --git a/Documentation/admin-guide/mm/transhuge.rst b/Documentation/admin-guide/mm/transhuge.rst index 8e3418ec4503..8ee78ec232eb 100644 --- a/Documentation/admin-guide/mm/transhuge.rst +++ b/Documentation/admin-guide/mm/transhuge.rst @@ -191,7 +191,14 @@ allocation failure to throttle the next allocation attempt:: /sys/kernel/mm/transparent_hugepage/khugepaged/alloc_sleep_millisecs -The khugepaged progress can be seen in the number of pages collapsed:: +The khugepaged progress can be seen in the number of pages collapsed (note +that this counter may not be an exact count of the number of pages +collapsed, since "collapsed" could mean multiple things: (1) A PTE mapping +being replaced by a PMD mapping, or (2) All 4K physical pages replaced by +one 2M hugepage. Each may happen independently, or together, depending on +the type of memory and the failures that occur. As such, this value should +be interpreted roughly as a sign of progress, and counters in /proc/vmstat +consulted for more accurate accounting):: /sys/kernel/mm/transparent_hugepage/khugepaged/pages_collapsed diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 55392bf30a03..fbbb25494d60 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -17,6 +17,7 @@ EM( SCAN_EXCEED_SHARED_PTE, "exceed_shared_pte") \ EM( SCAN_PTE_NON_PRESENT, "pte_non_present") \ EM( SCAN_PTE_UFFD_WP, "pte_uffd_wp") \ + EM( SCAN_PTE_MAPPED_HUGEPAGE, "pte_mapped_hugepage") \ EM( SCAN_PAGE_RO, "no_writable_page") \ EM( SCAN_LACK_REFERENCED_PAGE, "lack_referenced_page") \ EM( SCAN_PAGE_NULL, "page_null") \ diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b3ebe90a66d9..b1e3f83c4eb2 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -35,6 +35,7 @@ enum scan_result { SCAN_EXCEED_SHARED_PTE, SCAN_PTE_NON_PRESENT, SCAN_PTE_UFFD_WP, + SCAN_PTE_MAPPED_HUGEPAGE, SCAN_PAGE_RO, SCAN_LACK_REFERENCED_PAGE, SCAN_PAGE_NULL, @@ -1320,20 +1321,24 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. */ -static void khugepaged_add_pte_mapped_thp(struct mm_struct *mm, +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) { struct khugepaged_mm_slot *mm_slot; struct mm_slot *slot; + bool ret = false; VM_BUG_ON(addr & ~HPAGE_PMD_MASK); spin_lock(&khugepaged_mm_lock); slot = mm_slot_lookup(mm_slots_hash, mm); mm_slot = mm_slot_entry(slot, struct khugepaged_mm_slot, slot); - if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) + if (likely(mm_slot && mm_slot->nr_pte_mapped_thp < MAX_PTE_MAPPED_THP)) { mm_slot->pte_mapped_thp[mm_slot->nr_pte_mapped_thp++] = addr; + ret = true; + } spin_unlock(&khugepaged_mm_lock); + return ret; } static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, @@ -1370,9 +1375,16 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) pte_t *start_pte, *pte; pmd_t *pmd; spinlock_t *ptl; - int count = 0; + int count = 0, result = SCAN_FAIL; int i; + mmap_assert_write_locked(mm); + + /* Fast check before locking page if not PMD mapping PTE table */ + result = find_pmd_or_thp_or_none(mm, haddr, &pmd); + if (result != SCAN_SUCCEED) + return; + if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) return; @@ -1726,9 +1738,16 @@ static int collapse_file(struct mm_struct *mm, struct file *file, /* * If file was truncated then extended, or hole-punched, before * we locked the first page, then a THP might be there already. + * This will be discovered on the first iteration. */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; goto out_unlock; } @@ -1962,11 +1981,23 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, } /* - * XXX: khugepaged should compact smaller compound pages + * TODO: khugepaged should compact smaller compound pages * into a PMD sized page */ if (PageTransCompound(page)) { - result = SCAN_PAGE_COMPOUND; + struct page *head = compound_head(page); + + result = compound_order(head) == HPAGE_PMD_ORDER && + head->index == start + /* Maybe PMD-mapped */ + ? SCAN_PTE_MAPPED_HUGEPAGE + : SCAN_PAGE_COMPOUND; + /* + * For SCAN_PTE_MAPPED_HUGEPAGE, further processing + * by the caller won't touch the page cache, and so + * it's safe to skip LRU and refcount checks before + * returning. + */ break; } @@ -2026,6 +2057,12 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_slot) { } + +static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr) +{ + return false; +} #endif static unsigned int khugepaged_scan_mm_slot(unsigned int pages, int *result, @@ -2118,8 +2155,26 @@ skip: &mmap_locked, cc); } - if (*result == SCAN_SUCCEED) + switch (*result) { + case SCAN_PTE_MAPPED_HUGEPAGE: { + pmd_t *pmd; + + *result = find_pmd_or_thp_or_none(mm, + khugepaged_scan.address, + &pmd); + if (*result != SCAN_SUCCEED) + break; + if (!khugepaged_add_pte_mapped_thp(mm, + khugepaged_scan.address)) + break; + } fallthrough; + case SCAN_SUCCEED: ++khugepaged_pages_collapsed; + break; + default: + break; + } + /* move to next address */ khugepaged_scan.address += HPAGE_PMD_SIZE; progress += HPAGE_PMD_NR; From 34488399fa08faaf664743fa54b271eb6f9e1321 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:39 -0700 Subject: [PATCH 331/350] mm/madvise: add file and shmem support to MADV_COLLAPSE Add support for MADV_COLLAPSE to collapse shmem-backed and file-backed memory into THPs (requires CONFIG_READ_ONLY_THP_FOR_FS=y). On success, the backing memory will be a hugepage. For the memory range and process provided, the page tables will synchronously have a huge pmd installed, mapping the THP. Other mappings of the file extent mapped by the memory range may be added to a set of entries that khugepaged will later process and attempt update their page tables to map the THP by a pmd. This functionality unlocks two important uses: (1) Immediately back executable text by THPs. Current support provided by CONFIG_READ_ONLY_THP_FOR_FS may take a long time on a large system which might impair services from serving at their full rated load after (re)starting. Tricks like mremap(2)'ing text onto anonymous memory to immediately realize iTLB performance prevents page sharing and demand paging, both of which increase steady state memory footprint. Now, we can have the best of both worlds: Peak upfront performance and lower RAM footprints. (2) userfaultfd-based live migration of virtual machines satisfy UFFD faults by fetching native-sized pages over the network (to avoid latency of transferring an entire hugepage). However, after guest memory has been fully copied to the new host, MADV_COLLAPSE can be used to immediately increase guest performance. Since khugepaged is single threaded, this change now introduces possibility of collapse contexts racing in file collapse path. There a important few places to consider: (1) hpage_collapse_scan_file(), when we xas_pause() and drop RCU. We could have the memory collapsed out from under us, but the next xas_for_each() iteration will correctly pick up the hugepage. The hugepage might not be up to date (insofar as copying of small page contents might not have completed - the page still may be locked), but regardless what small page index we were iterating over, we'll find the hugepage and identify it as a suitably aligned compound page of order HPAGE_PMD_ORDER. In khugepaged path, we locklessly check the value of the pmd, and only add it to deferred collapse array if we find pmd mapping pte table. This is fine, since other values that could have raced in right afterwards denote failure, or that the memory was successfully collapsed, so we don't need further processing. In madvise path, we'll take mmap_lock() in write to serialize against page table updates and will know what to do based on the true value of the pmd: recheck all ptes if we point to a pte table, directly install the pmd, if the pmd has been cleared, but memory not yet faulted, or nothing at all if we find a huge pmd. It's worth putting emphasis here on how we treat the none pmd here. If khugepaged has processed this mm's page tables already, it will have left the pmd cleared (ready for refault by the process). Depending on the VMA flags and sysfs settings, amount of RAM on the machine, and the current load, could be a relatively common occurrence - and as such is one we'd like to handle successfully in MADV_COLLAPSE. When we see the none pmd in collapse_pte_mapped_thp(), we've locked mmap_lock in write and checked (a) huepaged_vma_check() to see if the backing memory is appropriate still, along with VMA sizing and appropriate hugepage alignment within the file, and (b) we've found a hugepage head of order HPAGE_PMD_ORDER at the offset in the file mapped by our hugepage-aligned virtual address. Even though the common-case is likely race with khugepaged, given these checks (regardless how we got here - we could be operating on a completely different file than originally checked in hpage_collapse_scan_file() for all we know) it should be safe to directly make the pmd a huge pmd pointing to this hugepage. (2) collapse_file() is mostly serialized on the same file extent by lock sequence: | lock hupepage | lock mapping->i_pages | lock 1st page | unlock mapping->i_pages | | lock mapping->i_pages | page_ref_freeze(3) | xas_store(hugepage) | unlock mapping->i_pages | page_ref_unfreeze(1) | unlock 1st page V unlock hugepage Once a context (who already has their fresh hugepage locked) locks mapping->i_pages exclusively, it will hold said lock until it locks the first page, and it will hold that lock until the after the hugepage has been added to the page cache (and will unlock the hugepage after page table update, though that isn't important here). A racing context that loses the race for mapping->i_pages will then lose the race to locking the first page. Here - depending on how far the other racing context has gotten - we might find the new hugepage (in which case we'll exit cleanly when we check PageTransCompound()), or we'll find the "old" 1st small page (in which we'll exit cleanly when we discover unexpected refcount of 2 after isolate_lru_page()). This is assuming we are able to successfully lock the page we find - in shmem path, we could just fail the trylock and exit cleanly anyways. Failure path in collapse_file() is similar: once we hold lock on 1st small page, we are serialized against other collapse contexts. Before the 1st small page is unlocked, we add it back to the pagecache and unfreeze the refcount appropriately. Contexts who lost the race to the 1st small page will then find the same 1st small page with the correct refcount and will be able to proceed. [zokeefe@google.com: don't check pmd value twice in collapse_pte_mapped_thp()] Link: https://lkml.kernel.org/r/20220927033854.477018-1-zokeefe@google.com [shy828301@gmail.com: Delete hugepage_vma_revalidate_anon(), remove check for multi-add in khugepaged_add_pte_mapped_thp()] Link: https://lore.kernel.org/linux-mm/CAHbLzkrtpM=ic7cYAHcqkubah5VTR8N5=k5RT8MTvv5rN1Y91w@mail.gmail.com/ Link: https://lkml.kernel.org/r/20220907144521.3115321-4-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-4-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- include/linux/khugepaged.h | 13 +- include/trace/events/huge_memory.h | 1 + kernel/events/uprobes.c | 2 +- mm/khugepaged.c | 247 ++++++++++++++++++++++------- 4 files changed, 199 insertions(+), 64 deletions(-) diff --git a/include/linux/khugepaged.h b/include/linux/khugepaged.h index 384f034ae947..70162d707caf 100644 --- a/include/linux/khugepaged.h +++ b/include/linux/khugepaged.h @@ -16,11 +16,13 @@ extern void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags); extern void khugepaged_min_free_kbytes_update(void); #ifdef CONFIG_SHMEM -extern void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr); +extern int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd); #else -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } #endif @@ -46,9 +48,10 @@ static inline void khugepaged_enter_vma(struct vm_area_struct *vma, unsigned long vm_flags) { } -static inline void collapse_pte_mapped_thp(struct mm_struct *mm, - unsigned long addr) +static inline int collapse_pte_mapped_thp(struct mm_struct *mm, + unsigned long addr, bool install_pmd) { + return 0; } static inline void khugepaged_min_free_kbytes_update(void) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index fbbb25494d60..df33453b70fc 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -11,6 +11,7 @@ EM( SCAN_FAIL, "failed") \ EM( SCAN_SUCCEED, "succeeded") \ EM( SCAN_PMD_NULL, "pmd_null") \ + EM( SCAN_PMD_NONE, "pmd_none") \ EM( SCAN_PMD_MAPPED, "page_pmd_mapped") \ EM( SCAN_EXCEED_NONE_PTE, "exceed_none_pte") \ EM( SCAN_EXCEED_SWAP_PTE, "exceed_swap_pte") \ diff --git a/kernel/events/uprobes.c b/kernel/events/uprobes.c index e0a9b945e7bc..d9e357b7e17c 100644 --- a/kernel/events/uprobes.c +++ b/kernel/events/uprobes.c @@ -555,7 +555,7 @@ put_old: /* try collapse pmd for compound page */ if (!ret && orig_page_huge) - collapse_pte_mapped_thp(mm, vaddr); + collapse_pte_mapped_thp(mm, vaddr, false); return ret; } diff --git a/mm/khugepaged.c b/mm/khugepaged.c index b1e3f83c4eb2..3bd6e2a74163 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -29,6 +29,7 @@ enum scan_result { SCAN_FAIL, SCAN_SUCCEED, SCAN_PMD_NULL, + SCAN_PMD_NONE, SCAN_PMD_MAPPED, SCAN_EXCEED_NONE_PTE, SCAN_EXCEED_SWAP_PTE, @@ -821,6 +822,7 @@ static bool hpage_collapse_alloc_page(struct page **hpage, gfp_t gfp, int node) */ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, + bool expect_anon, struct vm_area_struct **vmap, struct collapse_control *cc) { @@ -845,8 +847,8 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, * hugepage_vma_check may return true for qualified file * vmas. */ - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return SCAN_VMA_CHECK; + if (expect_anon && (!(*vmap)->anon_vma || !vma_is_anonymous(*vmap))) + return SCAN_PAGE_ANON; return SCAN_SUCCEED; } @@ -866,8 +868,8 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ barrier(); #endif - if (!pmd_present(pmde)) - return SCAN_PMD_NULL; + if (pmd_none(pmde)) + return SCAN_PMD_NONE; if (pmd_trans_huge(pmde)) return SCAN_PMD_MAPPED; if (pmd_bad(pmde)) @@ -995,7 +997,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, goto out_nolock; mmap_read_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma, cc); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) { mmap_read_unlock(mm); goto out_nolock; @@ -1026,7 +1028,7 @@ static int collapse_huge_page(struct mm_struct *mm, unsigned long address, * handled by the anon_vma lock + PG_lock. */ mmap_write_lock(mm); - result = hugepage_vma_revalidate(mm, address, &vma, cc); + result = hugepage_vma_revalidate(mm, address, true, &vma, cc); if (result != SCAN_SUCCEED) goto out_up_write; /* check if the pmd is still valid */ @@ -1320,6 +1322,26 @@ static void collect_mm_slot(struct khugepaged_mm_slot *mm_slot) /* * Notify khugepaged that given addr of the mm is pte-mapped THP. Then * khugepaged should try to collapse the page table. + * + * Note that following race exists: + * (1) khugepaged calls khugepaged_collapse_pte_mapped_thps() for mm_struct A, + * emptying the A's ->pte_mapped_thp[] array. + * (2) MADV_COLLAPSE collapses some file extent with target mm_struct B, and + * retract_page_tables() finds a VMA in mm_struct A mapping the same extent + * (at virtual address X) and adds an entry (for X) into mm_struct A's + * ->pte-mapped_thp[] array. + * (3) khugepaged calls khugepaged_collapse_scan_file() for mm_struct A at X, + * sees a pte-mapped THP (SCAN_PTE_MAPPED_HUGEPAGE) and adds an entry + * (for X) into mm_struct A's ->pte-mapped_thp[] array. + * Thus, it's possible the same address is added multiple times for the same + * mm_struct. Should this happen, we'll simply attempt + * collapse_pte_mapped_thp() multiple times for the same address, under the same + * exclusive mmap_lock, and assuming the first call is successful, subsequent + * attempts will return quickly (without grabbing any additional locks) when + * a huge pmd is found in find_pmd_or_thp_or_none(). Since this is a cheap + * check, and since this is a rare occurrence, the cost of preventing this + * "multiple-add" is thought to be more expensive than just handling it, should + * it occur. */ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) @@ -1341,6 +1363,27 @@ static bool khugepaged_add_pte_mapped_thp(struct mm_struct *mm, return ret; } +/* hpage must be locked, and mmap_lock must be held in write */ +static int set_huge_pmd(struct vm_area_struct *vma, unsigned long addr, + pmd_t *pmdp, struct page *hpage) +{ + struct vm_fault vmf = { + .vma = vma, + .address = addr, + .flags = 0, + .pmd = pmdp, + }; + + VM_BUG_ON(!PageTransHuge(hpage)); + mmap_assert_write_locked(vma->vm_mm); + + if (do_set_pmd(&vmf, hpage)) + return SCAN_FAIL; + + get_page(hpage); + return SCAN_SUCCEED; +} + static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { @@ -1362,12 +1405,14 @@ static void collapse_and_free_pmd(struct mm_struct *mm, struct vm_area_struct *v * * @mm: process address space where collapse happens * @addr: THP collapse address + * @install_pmd: If a huge PMD should be installed * * This function checks whether all the PTEs in the PMD are pointing to the * right THP. If so, retract the page table so the THP can refault in with - * as pmd-mapped. + * as pmd-mapped. Possibly install a huge PMD mapping the THP. */ -void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) +int collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr, + bool install_pmd) { unsigned long haddr = addr & HPAGE_PMD_MASK; struct vm_area_struct *vma = vma_lookup(mm, haddr); @@ -1380,14 +1425,14 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) mmap_assert_write_locked(mm); - /* Fast check before locking page if not PMD mapping PTE table */ + /* Fast check before locking page if already PMD-mapped */ result = find_pmd_or_thp_or_none(mm, haddr, &pmd); - if (result != SCAN_SUCCEED) - return; + if (result == SCAN_PMD_MAPPED) + return result; if (!vma || !vma->vm_file || !range_in_vma(vma, haddr, haddr + HPAGE_PMD_SIZE)) - return; + return SCAN_VMA_CHECK; /* * If we are here, we've succeeded in replacing all the native pages @@ -1397,27 +1442,43 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) * analogously elide sysfs THP settings here. */ if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) - return; + return SCAN_VMA_CHECK; /* Keep pmd pgtable for uffd-wp; see comment in retract_page_tables() */ if (userfaultfd_wp(vma)) - return; + return SCAN_PTE_UFFD_WP; hpage = find_lock_page(vma->vm_file->f_mapping, linear_page_index(vma, haddr)); if (!hpage) - return; + return SCAN_PAGE_NULL; - if (!PageHead(hpage)) + if (!PageHead(hpage)) { + result = SCAN_FAIL; goto drop_hpage; + } - if (compound_order(hpage) != HPAGE_PMD_ORDER) + if (compound_order(hpage) != HPAGE_PMD_ORDER) { + result = SCAN_PAGE_COMPOUND; goto drop_hpage; + } - if (find_pmd_or_thp_or_none(mm, haddr, &pmd) != SCAN_SUCCEED) + switch (result) { + case SCAN_SUCCEED: + break; + case SCAN_PMD_NONE: + /* + * In MADV_COLLAPSE path, possible race with khugepaged where + * all pte entries have been removed and pmd cleared. If so, + * skip all the pte checks and just update the pmd mapping. + */ + goto maybe_install_pmd; + default: goto drop_hpage; + } start_pte = pte_offset_map_lock(mm, pmd, haddr, &ptl); + result = SCAN_FAIL; /* step 1: check all mapped PTEs are to the right huge page */ for (i = 0, addr = haddr, pte = start_pte; @@ -1429,8 +1490,10 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) continue; /* page swapped out, abort */ - if (!pte_present(*pte)) + if (!pte_present(*pte)) { + result = SCAN_PTE_NON_PRESENT; goto abort; + } page = vm_normal_page(vma, addr, *pte); if (WARN_ON_ONCE(page && is_zone_device_page(page))) @@ -1465,12 +1528,19 @@ void collapse_pte_mapped_thp(struct mm_struct *mm, unsigned long addr) add_mm_counter(vma->vm_mm, mm_counter_file(hpage), -count); } - /* step 4: collapse pmd */ + /* step 4: remove pte entries */ collapse_and_free_pmd(mm, vma, haddr, pmd); + +maybe_install_pmd: + /* step 5: install pmd entry */ + result = install_pmd + ? set_huge_pmd(vma, haddr, pmd, hpage) + : SCAN_SUCCEED; + drop_hpage: unlock_page(hpage); put_page(hpage); - return; + return result; abort: pte_unmap_unlock(start_pte, ptl); @@ -1493,22 +1563,29 @@ static void khugepaged_collapse_pte_mapped_thps(struct khugepaged_mm_slot *mm_sl goto out; for (i = 0; i < mm_slot->nr_pte_mapped_thp; i++) - collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i]); + collapse_pte_mapped_thp(mm, mm_slot->pte_mapped_thp[i], false); out: mm_slot->nr_pte_mapped_thp = 0; mmap_write_unlock(mm); } -static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) +static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, + struct mm_struct *target_mm, + unsigned long target_addr, struct page *hpage, + struct collapse_control *cc) { struct vm_area_struct *vma; - struct mm_struct *mm; - unsigned long addr; - pmd_t *pmd; + int target_result = SCAN_FAIL; i_mmap_lock_write(mapping); vma_interval_tree_foreach(vma, &mapping->i_mmap, pgoff, pgoff) { + int result = SCAN_FAIL; + struct mm_struct *mm = NULL; + unsigned long addr = 0; + pmd_t *pmd; + bool is_target = false; + /* * Check vma->anon_vma to exclude MAP_PRIVATE mappings that * got written to. These VMAs are likely not worth investing @@ -1525,24 +1602,34 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * ptl. It has higher chance to recover THP for the VMA, but * has higher cost too. */ - if (vma->anon_vma) - continue; + if (vma->anon_vma) { + result = SCAN_PAGE_ANON; + goto next; + } addr = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); - if (addr & ~HPAGE_PMD_MASK) - continue; - if (vma->vm_end < addr + HPAGE_PMD_SIZE) - continue; + if (addr & ~HPAGE_PMD_MASK || + vma->vm_end < addr + HPAGE_PMD_SIZE) { + result = SCAN_VMA_CHECK; + goto next; + } mm = vma->vm_mm; - if (find_pmd_or_thp_or_none(mm, addr, &pmd) != SCAN_SUCCEED) - continue; + is_target = mm == target_mm && addr == target_addr; + result = find_pmd_or_thp_or_none(mm, addr, &pmd); + if (result != SCAN_SUCCEED) + goto next; /* * We need exclusive mmap_lock to retract page table. * * We use trylock due to lock inversion: we need to acquire * mmap_lock while holding page lock. Fault path does it in * reverse order. Trylock is a way to avoid deadlock. + * + * Also, it's not MADV_COLLAPSE's job to collapse other + * mappings - let khugepaged take care of them later. */ - if (mmap_write_trylock(mm)) { + result = SCAN_PTE_MAPPED_HUGEPAGE; + if ((cc->is_khugepaged || is_target) && + mmap_write_trylock(mm)) { /* * When a vma is registered with uffd-wp, we can't * recycle the pmd pgtable because there can be pte @@ -1551,22 +1638,45 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * it'll always mapped in small page size for uffd-wp * registered ranges. */ - if (!hpage_collapse_test_exit(mm) && - !userfaultfd_wp(vma)) - collapse_and_free_pmd(mm, vma, addr, pmd); + if (hpage_collapse_test_exit(mm)) { + result = SCAN_ANY_PROCESS; + goto unlock_next; + } + if (userfaultfd_wp(vma)) { + result = SCAN_PTE_UFFD_WP; + goto unlock_next; + } + collapse_and_free_pmd(mm, vma, addr, pmd); + if (!cc->is_khugepaged && is_target) + result = set_huge_pmd(vma, addr, pmd, hpage); + else + result = SCAN_SUCCEED; + +unlock_next: mmap_write_unlock(mm); - } else { - /* Try again later */ - khugepaged_add_pte_mapped_thp(mm, addr); + goto next; } + /* + * Calling context will handle target mm/addr. Otherwise, let + * khugepaged try again later. + */ + if (!is_target) { + khugepaged_add_pte_mapped_thp(mm, addr); + continue; + } +next: + if (is_target) + target_result = result; } i_mmap_unlock_write(mapping); + return target_result; } /** * collapse_file - collapse filemap/tmpfs/shmem pages into huge one. * * @mm: process address space where collapse happens + * @addr: virtual collapse start address * @file: file that collapse on * @start: collapse start address * @cc: collapse context and scratchpad @@ -1586,8 +1696,9 @@ static void retract_page_tables(struct address_space *mapping, pgoff_t pgoff) * + restore gaps in the page cache; * + unlock and free huge page; */ -static int collapse_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int collapse_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct address_space *mapping = file->f_mapping; struct page *hpage; @@ -1895,7 +2006,8 @@ xa_unlocked: /* * Remove pte page tables, so we can re-fault the page as huge. */ - retract_page_tables(mapping, start); + result = retract_page_tables(mapping, start, mm, addr, hpage, + cc); unlock_page(hpage); hpage = NULL; } else { @@ -1951,8 +2063,9 @@ out: return result; } -static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { struct page *page = NULL; struct address_space *mapping = file->f_mapping; @@ -2040,7 +2153,7 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, result = SCAN_EXCEED_NONE_PTE; count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { - result = collapse_file(mm, file, start, cc); + result = collapse_file(mm, addr, file, start, cc); } } @@ -2048,8 +2161,9 @@ static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, return result; } #else -static int khugepaged_scan_file(struct mm_struct *mm, struct file *file, - pgoff_t start, struct collapse_control *cc) +static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, + struct file *file, pgoff_t start, + struct collapse_control *cc) { BUILD_BUG(); } @@ -2145,8 +2259,9 @@ skip: khugepaged_scan.address); mmap_read_unlock(mm); - *result = khugepaged_scan_file(mm, file, pgoff, - cc); + *result = hpage_collapse_scan_file(mm, + khugepaged_scan.address, + file, pgoff, cc); mmap_locked = false; fput(file); } else { @@ -2453,10 +2568,6 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, *prev = vma; - /* TODO: Support file/shmem */ - if (!vma->anon_vma || !vma_is_anonymous(vma)) - return -EINVAL; - if (!hugepage_vma_check(vma, vma->vm_flags, false, false, false)) return -EINVAL; @@ -2479,7 +2590,8 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, cond_resched(); mmap_read_lock(mm); mmap_locked = true; - result = hugepage_vma_revalidate(mm, addr, &vma, cc); + result = hugepage_vma_revalidate(mm, addr, false, &vma, + cc); if (result != SCAN_SUCCEED) { last_fail = result; goto out_nolock; @@ -2489,16 +2601,35 @@ int madvise_collapse(struct vm_area_struct *vma, struct vm_area_struct **prev, } mmap_assert_locked(mm); memset(cc->node_load, 0, sizeof(cc->node_load)); - result = hpage_collapse_scan_pmd(mm, vma, addr, &mmap_locked, - cc); + if (IS_ENABLED(CONFIG_SHMEM) && vma->vm_file) { + struct file *file = get_file(vma->vm_file); + pgoff_t pgoff = linear_page_index(vma, addr); + + mmap_read_unlock(mm); + mmap_locked = false; + result = hpage_collapse_scan_file(mm, addr, file, pgoff, + cc); + fput(file); + } else { + result = hpage_collapse_scan_pmd(mm, vma, addr, + &mmap_locked, cc); + } if (!mmap_locked) *prev = NULL; /* Tell caller we dropped mmap_lock */ +handle_result: switch (result) { case SCAN_SUCCEED: case SCAN_PMD_MAPPED: ++thps; break; + case SCAN_PTE_MAPPED_HUGEPAGE: + BUG_ON(mmap_locked); + BUG_ON(*prev); + mmap_write_lock(mm); + result = collapse_pte_mapped_thp(mm, addr, true); + mmap_write_unlock(mm); + goto handle_result; /* Whitelisted set of results where continuing OK */ case SCAN_PMD_NULL: case SCAN_PTE_NON_PRESENT: From d41fd2016ed07a630da2817b76c98eeab7931e1e Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:40 -0700 Subject: [PATCH 332/350] mm/khugepaged: add tracepoint to hpage_collapse_scan_file() Add huge_memory:trace_mm_khugepaged_scan_file tracepoint to hpage_collapse_scan_file() analogously to hpage_collapse_scan_pmd(). While this change is targeted at debugging MADV_COLLAPSE pathway, the "mm_khugepaged" prefix is retained for symmetry with huge_memory:trace_mm_khugepaged_scan_pmd, which retains it's legacy name to prevent changing kernel ABI as much as possible. Link: https://lkml.kernel.org/r/20220907144521.3115321-5-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-5-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Yang Shi Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Signed-off-by: Andrew Morton --- include/trace/events/huge_memory.h | 34 ++++++++++++++++++++++++++++++ mm/khugepaged.c | 3 ++- 2 files changed, 36 insertions(+), 1 deletion(-) diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index df33453b70fc..935af4947917 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -169,5 +169,39 @@ TRACE_EVENT(mm_collapse_huge_page_swapin, __entry->ret) ); +TRACE_EVENT(mm_khugepaged_scan_file, + + TP_PROTO(struct mm_struct *mm, struct page *page, const char *filename, + int present, int swap, int result), + + TP_ARGS(mm, page, filename, present, swap, result), + + TP_STRUCT__entry( + __field(struct mm_struct *, mm) + __field(unsigned long, pfn) + __string(filename, filename) + __field(int, present) + __field(int, swap) + __field(int, result) + ), + + TP_fast_assign( + __entry->mm = mm; + __entry->pfn = page ? page_to_pfn(page) : -1; + __assign_str(filename, filename); + __entry->present = present; + __entry->swap = swap; + __entry->result = result; + ), + + TP_printk("mm=%p, scan_pfn=0x%lx, filename=%s, present=%d, swap=%d, result=%s", + __entry->mm, + __entry->pfn, + __get_str(filename), + __entry->present, + __entry->swap, + __print_symbolic(__entry->result, SCAN_STATUS)) +); + #endif /* __HUGE_MEMORY_H */ #include diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 3bd6e2a74163..c7699fabf302 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -2157,7 +2157,8 @@ static int hpage_collapse_scan_file(struct mm_struct *mm, unsigned long addr, } } - /* TODO: tracepoints */ + trace_mm_khugepaged_scan_file(mm, page, file->f_path.dentry->d_iname, + present, swap, result); return result; } #else From c07c343cda8ef02985ac6583a2e5af892726f734 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:41 -0700 Subject: [PATCH 333/350] selftests/vm: dedup THP helpers These files: tools/testing/selftests/vm/vm_util.c tools/testing/selftests/vm/khugepaged.c Both contain logic to: 1) Determine hugepage size on current system 2) Read /proc/self/smaps to determine number of THPs at an address Refactor selftests/vm/khugepaged.c to use the vm_util common helpers and add it as a build dependency. Since selftests/vm/khugepaged.c is the largest user of check_huge(), change the signature of check_huge() to match selftests/vm/khugepaged.c's useage: take an expected number of hugepages, and return a bool indicating if the correct number of hugepages were found. Add a wrapper, check_huge_anon(), in anticipation of checking smaps for file and shmem hugepages. Update existing callsites to use the new pattern / function. Likewise, check_for_pattern() was duplicated, and it's a general enough helper to include in vm_util helpers as well. Link: https://lkml.kernel.org/r/20220907144521.3115321-6-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-6-zokeefe@google.com Signed-off-by: Zach O'Keefe Reviewed-by: Zi Yan Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/khugepaged.c | 64 ++----------------- tools/testing/selftests/vm/soft-dirty.c | 2 +- .../selftests/vm/split_huge_page_test.c | 12 ++-- tools/testing/selftests/vm/vm_util.c | 26 +++++--- tools/testing/selftests/vm/vm_util.h | 3 +- 6 files changed, 32 insertions(+), 76 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 4ae879f70f4c..c9c0996c122b 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -95,6 +95,7 @@ TEST_FILES += va_128TBswitch.sh include ../lib.mk +$(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index b55dc331af13..235a64b4458c 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -12,6 +12,8 @@ #include #include +#include "vm_util.h" + #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif @@ -352,64 +354,12 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -#define MAX_LINE_LENGTH 500 - -static bool check_for_pattern(FILE *fp, char *pattern, char *buf) -{ - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { - if (!strncmp(buf, pattern, strlen(pattern))) - return true; - } - return false; -} - static bool check_huge(void *addr, int nr_hpages) { - bool thp = false; - int ret; - FILE *fp; - char buffer[MAX_LINE_LENGTH]; - char addr_pattern[MAX_LINE_LENGTH]; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "%08lx-", - (unsigned long) addr); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - - - fp = fopen(PID_SMAPS, "r"); - if (!fp) { - printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); - exit(EXIT_FAILURE); - } - if (!check_for_pattern(fp, addr_pattern, buffer)) - goto err_out; - - ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "AnonHugePages:%10ld kB", - nr_hpages * (hpage_pmd_size >> 10)); - if (ret >= MAX_LINE_LENGTH) { - printf("%s: Pattern is too long\n", __func__); - exit(EXIT_FAILURE); - } - /* - * Fetch the AnonHugePages: in the same block and check whether it got - * the expected number of hugeepages next. - */ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) - goto err_out; - - if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) - goto err_out; - - thp = true; -err_out: - fclose(fp); - return thp; + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); } - +#define MAX_LINE_LENGTH 500 static bool check_swap(void *addr, unsigned long size) { bool swap = false; @@ -431,7 +381,7 @@ static bool check_swap(void *addr, unsigned long size) printf("%s: Failed to open file %s\n", __func__, PID_SMAPS); exit(EXIT_FAILURE); } - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; ret = snprintf(addr_pattern, MAX_LINE_LENGTH, "Swap:%19ld kB", @@ -444,7 +394,7 @@ static bool check_swap(void *addr, unsigned long size) * Fetch the Swap: in the same block and check whether it got * the expected number of hugeepages next. */ - if (!check_for_pattern(fp, "Swap:", buffer)) + if (!check_for_pattern(fp, "Swap:", buffer, sizeof(buffer))) goto err_out; if (strncmp(buffer, addr_pattern, strlen(addr_pattern))) @@ -1066,7 +1016,7 @@ int main(int argc, const char **argv) setbuf(stdout, NULL); page_size = getpagesize(); - hpage_pmd_size = read_num("hpage_pmd_size"); + hpage_pmd_size = read_pmd_pagesize(); hpage_pmd_nr = hpage_pmd_size / page_size; default_settings.khugepaged.max_ptes_none = hpage_pmd_nr - 1; diff --git a/tools/testing/selftests/vm/soft-dirty.c b/tools/testing/selftests/vm/soft-dirty.c index e3a43f5d4fa2..21d8830c5f24 100644 --- a/tools/testing/selftests/vm/soft-dirty.c +++ b/tools/testing/selftests/vm/soft-dirty.c @@ -91,7 +91,7 @@ static void test_hugepage(int pagemap_fd, int pagesize) for (i = 0; i < hpage_len; i++) map[i] = (char)i; - if (check_huge(map)) { + if (check_huge_anon(map, 1, hpage_len)) { ksft_test_result_pass("Test %s huge page allocation\n", __func__); clear_softdirty(); diff --git a/tools/testing/selftests/vm/split_huge_page_test.c b/tools/testing/selftests/vm/split_huge_page_test.c index 6aa2b8253aed..76e1c36dd9e5 100644 --- a/tools/testing/selftests/vm/split_huge_page_test.c +++ b/tools/testing/selftests/vm/split_huge_page_test.c @@ -92,7 +92,6 @@ void split_pmd_thp(void) { char *one_page; size_t len = 4 * pmd_pagesize; - uint64_t thp_size; size_t i; one_page = memalign(pmd_pagesize, len); @@ -107,8 +106,7 @@ void split_pmd_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } @@ -124,9 +122,8 @@ void split_pmd_thp(void) } - thp_size = check_huge(one_page); - if (thp_size) { - printf("Still %ld kB AnonHugePages not split\n", thp_size); + if (check_huge_anon(one_page, 0, pmd_pagesize)) { + printf("Still AnonHugePages not split\n"); exit(EXIT_FAILURE); } @@ -172,8 +169,7 @@ void split_pte_mapped_thp(void) for (i = 0; i < len; i++) one_page[i] = (char)i; - thp_size = check_huge(one_page); - if (!thp_size) { + if (!check_huge_anon(one_page, 1, pmd_pagesize)) { printf("No THP is allocated\n"); exit(EXIT_FAILURE); } diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index b58ab11a7a30..9dae51b8219f 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -42,9 +42,9 @@ void clear_softdirty(void) ksft_exit_fail_msg("writing clear_refs failed\n"); } -static bool check_for_pattern(FILE *fp, const char *pattern, char *buf) +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len) { - while (fgets(buf, MAX_LINE_LENGTH, fp) != NULL) { + while (fgets(buf, len, fp)) { if (!strncmp(buf, pattern, strlen(pattern))) return true; } @@ -72,9 +72,10 @@ uint64_t read_pmd_pagesize(void) return strtoul(buf, NULL, 10); } -uint64_t check_huge(void *addr) +bool __check_huge(void *addr, char *pattern, int nr_hpages, + uint64_t hpage_size) { - uint64_t thp = 0; + uint64_t thp = -1; int ret; FILE *fp; char buffer[MAX_LINE_LENGTH]; @@ -89,20 +90,27 @@ uint64_t check_huge(void *addr) if (!fp) ksft_exit_fail_msg("%s: Failed to open file %s\n", __func__, SMAP_FILE_PATH); - if (!check_for_pattern(fp, addr_pattern, buffer)) + if (!check_for_pattern(fp, addr_pattern, buffer, sizeof(buffer))) goto err_out; /* - * Fetch the AnonHugePages: in the same block and check the number of + * Fetch the pattern in the same block and check the number of * hugepages. */ - if (!check_for_pattern(fp, "AnonHugePages:", buffer)) + if (!check_for_pattern(fp, pattern, buffer, sizeof(buffer))) goto err_out; - if (sscanf(buffer, "AnonHugePages:%10ld kB", &thp) != 1) + snprintf(addr_pattern, MAX_LINE_LENGTH, "%s%%9ld kB", pattern); + + if (sscanf(buffer, addr_pattern, &thp) != 1) ksft_exit_fail_msg("Reading smap error\n"); err_out: fclose(fp); - return thp; + return thp == (nr_hpages * (hpage_size >> 10)); +} + +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); } diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 2e512bd57ae1..8434ea0c95cd 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -5,5 +5,6 @@ uint64_t pagemap_get_entry(int fd, char *start); bool pagemap_is_softdirty(int fd, char *start); void clear_softdirty(void); +bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); -uint64_t check_huge(void *addr); +bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); From 8e638707a3f1a82dccbdc9285980329644946d4f Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:42 -0700 Subject: [PATCH 334/350] selftests/vm: modularize thp collapse memory operations Modularize operations to setup, cleanup, fault, and check for huge pages, for a given memory type. This allows reusing existing tests with additional memory types by defining new memory operations. Following patches will add file and shmem memory types. Link: https://lkml.kernel.org/r/20220907144521.3115321-7-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-7-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 377 +++++++++++++----------- 1 file changed, 209 insertions(+), 168 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 235a64b4458c..06ea6f18980e 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -29,8 +29,16 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +struct mem_ops { + void *(*setup_area)(int nr_hpages); + void (*cleanup_area)(void *p, unsigned long size); + void (*fault)(void *p, unsigned long start, unsigned long end); + bool (*check_huge)(void *addr, int nr_hpages); +}; + struct collapse_context { - void (*collapse)(const char *msg, char *p, int nr_hpages, bool expect); + void (*collapse)(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect); bool enforce_pte_scan_limits; }; @@ -354,11 +362,6 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -static bool check_huge(void *addr, int nr_hpages) -{ - return check_huge_anon(addr, nr_hpages, hpage_pmd_size); -} - #define MAX_LINE_LENGTH 500 static bool check_swap(void *addr, unsigned long size) { @@ -452,18 +455,33 @@ retry: * Returns pmd-mapped hugepage in VMA marked VM_HUGEPAGE, filled with * validate_memory()'able contents. */ -static void *alloc_hpage(void) +static void *alloc_hpage(struct mem_ops *ops) { - void *p; + void *p = ops->setup_area(1); - p = alloc_mapping(1); + ops->fault(p, 0, hpage_pmd_size); + + /* + * VMA should be neither VM_HUGEPAGE nor VM_NOHUGEPAGE. + * The latter is ineligible for collapse by MADV_COLLAPSE + * while the former might cause MADV_COLLAPSE to race with + * khugepaged on low-load system (like a test machine), which + * would cause MADV_COLLAPSE to fail with EAGAIN. + */ printf("Allocate huge page..."); - madvise(p, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p, 1)) - success("OK"); - else - fail("Fail"); + if (madvise_collapse_retry(p, hpage_pmd_size)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (!ops->check_huge(p, 1)) { + perror("madvise(MADV_COLLAPSE)"); + exit(EXIT_FAILURE); + } + if (madvise(p, hpage_pmd_size, MADV_HUGEPAGE)) { + perror("madvise(MADV_HUGEPAGE)"); + exit(EXIT_FAILURE); + } + success("OK"); return p; } @@ -480,18 +498,40 @@ static void validate_memory(int *p, unsigned long start, unsigned long end) } } -static void madvise_collapse(const char *msg, char *p, int nr_hpages, - bool expect) +static void *anon_setup_area(int nr_hpages) +{ + return alloc_mapping(nr_hpages); +} + +static void anon_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); +} + +static void anon_fault(void *p, unsigned long start, unsigned long end) +{ + fill_memory(p, start, end); +} + +static bool anon_check_huge(void *addr, int nr_hpages) +{ + return check_huge_anon(addr, nr_hpages, hpage_pmd_size); +} + +static struct mem_ops anon_ops = { + .setup_area = &anon_setup_area, + .cleanup_area = &anon_cleanup_area, + .fault = &anon_fault, + .check_huge = &anon_check_huge, +}; + +static void __madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) { int ret; struct settings settings = *current_settings(); printf("%s...", msg); - /* Sanity check */ - if (!check_huge(p, 0)) { - printf("Unexpected huge page\n"); - exit(EXIT_FAILURE); - } /* * Prevent khugepaged interference and tests that MADV_COLLAPSE @@ -505,7 +545,7 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, ret = madvise_collapse_retry(p, nr_hpages * hpage_pmd_size); if (((bool)ret) == expect) fail("Fail: Bad return value"); - else if (check_huge(p, nr_hpages) != expect) + else if (!ops->check_huge(p, expect ? nr_hpages : 0)) fail("Fail: check_huge()"); else success("OK"); @@ -513,14 +553,26 @@ static void madvise_collapse(const char *msg, char *p, int nr_hpages, pop_settings(); } +static void madvise_collapse(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops, bool expect) +{ + /* Sanity check */ + if (!ops->check_huge(p, 0)) { + printf("Unexpected huge page\n"); + exit(EXIT_FAILURE); + } + __madvise_collapse(msg, p, nr_hpages, ops, expect); +} + #define TICK 500000 -static bool wait_for_scan(const char *msg, char *p, int nr_hpages) +static bool wait_for_scan(const char *msg, char *p, int nr_hpages, + struct mem_ops *ops) { int full_scans; int timeout = 6; /* 3 seconds */ /* Sanity check */ - if (!check_huge(p, 0)) { + if (!ops->check_huge(p, 0)) { printf("Unexpected huge page\n"); exit(EXIT_FAILURE); } @@ -532,7 +584,7 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages) printf("%s...", msg); while (timeout--) { - if (check_huge(p, nr_hpages)) + if (ops->check_huge(p, nr_hpages)) break; if (read_num("khugepaged/full_scans") >= full_scans) break; @@ -546,19 +598,20 @@ static bool wait_for_scan(const char *msg, char *p, int nr_hpages) } static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, - bool expect) + struct mem_ops *ops, bool expect) { - if (wait_for_scan(msg, p, nr_hpages)) { + if (wait_for_scan(msg, p, nr_hpages, ops)) { if (expect) fail("Timeout"); else success("OK"); return; - } else if (check_huge(p, nr_hpages) == expect) { - success("OK"); - } else { - fail("Fail"); } + + if (ops->check_huge(p, expect ? nr_hpages : 0)) + success("OK"); + else + fail("Fail"); } static void alloc_at_fault(void) @@ -572,7 +625,7 @@ static void alloc_at_fault(void) p = alloc_mapping(1); *p = 1; printf("Allocate huge page on fault..."); - if (check_huge(p, 1)) + if (check_huge_anon(p, 1, hpage_pmd_size)) success("OK"); else fail("Fail"); @@ -581,49 +634,48 @@ static void alloc_at_fault(void) madvise(p, page_size, MADV_DONTNEED); printf("Split huge PMD on MADV_DONTNEED..."); - if (check_huge(p, 0)) + if (check_huge_anon(p, 0, hpage_pmd_size)) success("OK"); else fail("Fail"); munmap(p, hpage_pmd_size); } -static void collapse_full(struct collapse_context *c) +static void collapse_full(struct collapse_context *c, struct mem_ops *ops) { void *p; int nr_hpages = 4; unsigned long size = nr_hpages * hpage_pmd_size; - p = alloc_mapping(nr_hpages); - fill_memory(p, 0, size); + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); c->collapse("Collapse multiple fully populated PTE table", p, nr_hpages, - true); + ops, true); validate_memory(p, 0, size); - munmap(p, size); + ops->cleanup_area(p, size); } -static void collapse_empty(struct collapse_context *c) +static void collapse_empty(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - c->collapse("Do not collapse empty PTE table", p, 1, false); - munmap(p, hpage_pmd_size); + p = ops->setup_area(1); + c->collapse("Do not collapse empty PTE table", p, 1, ops, false); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry(struct collapse_context *c) +static void collapse_single_pte_entry(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - fill_memory(p, 0, page_size); + p = ops->setup_area(1); + ops->fault(p, 0, page_size); c->collapse("Collapse PTE table with single PTE entry present", p, - 1, true); - validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + 1, ops, true); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_none(struct collapse_context *c) +static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_none = hpage_pmd_nr / 2; struct settings settings = *current_settings(); @@ -632,30 +684,30 @@ static void collapse_max_ptes_none(struct collapse_context *c) settings.khugepaged.max_ptes_none = max_ptes_none; push_settings(&settings); - p = alloc_mapping(1); + p = ops->setup_area(1); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, - !c->enforce_pte_scan_limits); + ops, !c->enforce_pte_scan_limits); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); if (c->enforce_pte_scan_limits) { - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); - c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); + c->collapse("Collapse with max_ptes_none PTEs empty", p, 1, ops, true); validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); } - - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); pop_settings(); } -static void collapse_swapin_single_pte(struct collapse_context *c) +static void collapse_swapin_single_pte(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_mapping(1); - fill_memory(p, 0, hpage_pmd_size); + + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); printf("Swapout one page..."); if (madvise(p, page_size, MADV_PAGEOUT)) { @@ -669,20 +721,21 @@ static void collapse_swapin_single_pte(struct collapse_context *c) goto out; } - c->collapse("Collapse with swapping in single PTE entry", p, 1, true); + c->collapse("Collapse with swapping in single PTE entry", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_swap(struct collapse_context *c) +static void collapse_max_ptes_swap(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_swap = read_num("khugepaged/max_ptes_swap"); void *p; - p = alloc_mapping(1); + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap + 1, hpage_pmd_nr); if (madvise(p, (max_ptes_swap + 1) * page_size, MADV_PAGEOUT)) { perror("madvise(MADV_PAGEOUT)"); @@ -695,12 +748,12 @@ static void collapse_max_ptes_swap(struct collapse_context *c) goto out; } - c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, + c->collapse("Maybe collapse with max_ptes_swap exceeded", p, 1, ops, !c->enforce_pte_scan_limits); validate_memory(p, 0, hpage_pmd_size); if (c->enforce_pte_scan_limits) { - fill_memory(p, 0, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); printf("Swapout %d of %d pages...", max_ptes_swap, hpage_pmd_nr); if (madvise(p, max_ptes_swap * page_size, MADV_PAGEOUT)) { @@ -715,63 +768,65 @@ static void collapse_max_ptes_swap(struct collapse_context *c) } c->collapse("Collapse with max_ptes_swap pages swapped out", p, - 1, true); + 1, ops, true); validate_memory(p, 0, hpage_pmd_size); } out: - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_single_pte_entry_compound(struct collapse_context *c) +static void collapse_single_pte_entry_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Collapse PTE table with single PTE mapping compound page", - p, 1, true); + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_full_of_compound(struct collapse_context *c) +static void collapse_full_of_compound(struct collapse_context *c, struct mem_ops *ops) { void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Split huge page leaving single PTE page table full of compound pages..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - c->collapse("Collapse PTE table full of compound pages", p, 1, true); + c->collapse("Collapse PTE table full of compound pages", p, 1, ops, + true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_compound_extreme(struct collapse_context *c) +static void collapse_compound_extreme(struct collapse_context *c, struct mem_ops *ops) { void *p; int i; - p = alloc_mapping(1); + p = ops->setup_area(1); for (i = 0; i < hpage_pmd_nr; i++) { printf("\rConstruct PTE page table full of different PTE-mapped compound pages %3d/%d...", i + 1, hpage_pmd_nr); madvise(BASE_ADDR, hpage_pmd_size, MADV_HUGEPAGE); - fill_memory(BASE_ADDR, 0, hpage_pmd_size); - if (!check_huge(BASE_ADDR, 1)) { + ops->fault(BASE_ADDR, 0, hpage_pmd_size); + if (!ops->check_huge(BASE_ADDR, 1)) { printf("Failed to allocate huge page\n"); exit(EXIT_FAILURE); } @@ -798,30 +853,30 @@ static void collapse_compound_extreme(struct collapse_context *c) } } - munmap(BASE_ADDR, hpage_pmd_size); - fill_memory(p, 0, hpage_pmd_size); - if (check_huge(p, 0)) + ops->cleanup_area(BASE_ADDR, hpage_pmd_size); + ops->fault(p, 0, hpage_pmd_size); + if (!ops->check_huge(p, 1)) success("OK"); else fail("Fail"); c->collapse("Collapse PTE table full of different compound pages", p, 1, - true); + ops, true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork(struct collapse_context *c) +static void collapse_fork(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_mapping(1); + p = ops->setup_area(1); printf("Allocate small page..."); - fill_memory(p, 0, page_size); - if (check_huge(p, 0)) + ops->fault(p, 0, page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); @@ -832,17 +887,17 @@ static void collapse_fork(struct collapse_context *c) skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, page_size, 2 * page_size); + ops->fault(p, page_size, 2 * page_size); c->collapse("Collapse PTE table with single page shared with parent process", - p, 1, true); + p, 1, ops, true); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -850,27 +905,27 @@ static void collapse_fork(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has small page..."); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); validate_memory(p, 0, page_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_fork_compound(struct collapse_context *c) +static void collapse_fork_compound(struct collapse_context *c, struct mem_ops *ops) { int wstatus; void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); @@ -878,20 +933,20 @@ static void collapse_fork_compound(struct collapse_context *c) printf("Split huge page PMD in child process..."); madvise(p, page_size, MADV_NOHUGEPAGE); madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); - fill_memory(p, 0, page_size); + ops->fault(p, 0, page_size); write_num("khugepaged/max_ptes_shared", hpage_pmd_nr - 1); c->collapse("Collapse PTE table full of compound pages in child", - p, 1, true); + p, 1, ops, true); write_num("khugepaged/max_ptes_shared", current_settings()->khugepaged.max_ptes_shared); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -899,59 +954,59 @@ static void collapse_fork_compound(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void collapse_max_ptes_shared(struct collapse_context *c) +static void collapse_max_ptes_shared(struct collapse_context *c, struct mem_ops *ops) { int max_ptes_shared = read_num("khugepaged/max_ptes_shared"); int wstatus; void *p; - p = alloc_hpage(); + p = alloc_hpage(ops); printf("Share huge page over fork()..."); if (!fork()) { /* Do not touch settings on child exit */ skip_settings_restore = true; exit_status = 0; - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared - 1, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); - if (check_huge(p, 0)) + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared - 1) * page_size); + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Maybe collapse with max_ptes_shared exceeded", p, - 1, !c->enforce_pte_scan_limits); + 1, ops, !c->enforce_pte_scan_limits); if (c->enforce_pte_scan_limits) { printf("Trigger CoW on page %d of %d...", hpage_pmd_nr - max_ptes_shared, hpage_pmd_nr); - fill_memory(p, 0, (hpage_pmd_nr - max_ptes_shared) * + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_shared) * page_size); - if (check_huge(p, 0)) + if (ops->check_huge(p, 0)) success("OK"); else fail("Fail"); c->collapse("Collapse with max_ptes_shared PTEs shared", - p, 1, true); + p, 1, ops, true); } validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); exit(exit_status); } @@ -959,42 +1014,28 @@ static void collapse_max_ptes_shared(struct collapse_context *c) exit_status += WEXITSTATUS(wstatus); printf("Check if parent still has huge page..."); - if (check_huge(p, 1)) + if (ops->check_huge(p, 1)) success("OK"); else fail("Fail"); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } -static void madvise_collapse_existing_thps(void) +static void madvise_collapse_existing_thps(struct collapse_context *c, + struct mem_ops *ops) { void *p; - int err; - p = alloc_mapping(1); - fill_memory(p, 0, hpage_pmd_size); - - printf("Collapse fully populated PTE table..."); - /* - * Note that we don't set MADV_HUGEPAGE here, which - * also tests that VM_HUGEPAGE isn't required for - * MADV_COLLAPSE in "madvise" mode. - */ - err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); - if (err == 0 && check_huge(p, 1)) { - success("OK"); - printf("Re-collapse PMD-mapped hugepage"); - err = madvise(p, hpage_pmd_size, MADV_COLLAPSE); - if (err == 0 && check_huge(p, 1)) - success("OK"); - else - fail("Fail"); - } else { - fail("Fail"); - } + p = ops->setup_area(1); + ops->fault(p, 0, hpage_pmd_size); + c->collapse("Collapse fully populated PTE table...", p, 1, ops, true); validate_memory(p, 0, hpage_pmd_size); - munmap(p, hpage_pmd_size); + + /* c->collapse() will find a hugepage and complain - call directly. */ + __madvise_collapse("Re-collapse PMD-mapped hugepage", p, 1, ops, true); + validate_memory(p, 0, hpage_pmd_size); + ops->cleanup_area(p, hpage_pmd_size); } int main(int argc, const char **argv) @@ -1034,37 +1075,37 @@ int main(int argc, const char **argv) c.collapse = &khugepaged_collapse; c.enforce_pte_scan_limits = true; - collapse_full(&c); - collapse_empty(&c); - collapse_single_pte_entry(&c); - collapse_max_ptes_none(&c); - collapse_swapin_single_pte(&c); - collapse_max_ptes_swap(&c); - collapse_single_pte_entry_compound(&c); - collapse_full_of_compound(&c); - collapse_compound_extreme(&c); - collapse_fork(&c); - collapse_fork_compound(&c); - collapse_max_ptes_shared(&c); + collapse_full(&c, &anon_ops); + collapse_empty(&c, &anon_ops); + collapse_single_pte_entry(&c, &anon_ops); + collapse_max_ptes_none(&c, &anon_ops); + collapse_swapin_single_pte(&c, &anon_ops); + collapse_max_ptes_swap(&c, &anon_ops); + collapse_single_pte_entry_compound(&c, &anon_ops); + collapse_full_of_compound(&c, &anon_ops); + collapse_compound_extreme(&c, &anon_ops); + collapse_fork(&c, &anon_ops); + collapse_fork_compound(&c, &anon_ops); + collapse_max_ptes_shared(&c, &anon_ops); } if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { printf("\n*** Testing context: madvise ***\n"); c.collapse = &madvise_collapse; c.enforce_pte_scan_limits = false; - collapse_full(&c); - collapse_empty(&c); - collapse_single_pte_entry(&c); - collapse_max_ptes_none(&c); - collapse_swapin_single_pte(&c); - collapse_max_ptes_swap(&c); - collapse_single_pte_entry_compound(&c); - collapse_full_of_compound(&c); - collapse_compound_extreme(&c); - collapse_fork(&c); - collapse_fork_compound(&c); - collapse_max_ptes_shared(&c); - madvise_collapse_existing_thps(); + collapse_full(&c, &anon_ops); + collapse_empty(&c, &anon_ops); + collapse_single_pte_entry(&c, &anon_ops); + collapse_max_ptes_none(&c, &anon_ops); + collapse_swapin_single_pte(&c, &anon_ops); + collapse_max_ptes_swap(&c, &anon_ops); + collapse_single_pte_entry_compound(&c, &anon_ops); + collapse_full_of_compound(&c, &anon_ops); + collapse_compound_extreme(&c, &anon_ops); + collapse_fork(&c, &anon_ops); + collapse_fork_compound(&c, &anon_ops); + collapse_max_ptes_shared(&c, &anon_ops); + madvise_collapse_existing_thps(&c, &anon_ops); } restore_settings(0); From 1b03d0d558a281f12f68e5917dfa781c3b94e074 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:43 -0700 Subject: [PATCH 335/350] selftests/vm: add thp collapse file and tmpfs testing Add memory operations for file-backed and tmpfs memory. Call existing tests with these new memory operations to test collapse functionality of khugepaged and MADV_COLLAPSE on file-backed and tmpfs memory. Not all tests are reusable; for example, collapse_swapin_single_pte() which checks swap usage. Refactor test arguments. Usage is now: Usage: ./khugepaged [dir] : : : [all|khugepaged|madvise] : [all|anon|file] "file,all" mem_type requires [dir] argument "file,all" mem_type requires kernel built with CONFIG_READ_ONLY_THP_FOR_FS=y if [dir] is a (sub)directory of a tmpfs mount, tmpfs must be mounted with huge=madvise option for khugepaged tests to work Refactor calling tests to make it clear what collapse context / memory operations they support, but only invoke tests requested by user. Also log what test is being ran, and with what context / memory, to make test logs more human readable. A new test file is created and deleted for every test to ensure no pages remain in the page cache between tests (tests also may attempt to collapse different amount of memory). For file-backed memory where the file is stored on a block device, disable /sys/block//queue/read_ahead_kb so that pages don't find their way into the page cache without the tests faulting them in. Add file and shmem wrappers to vm_utils check for file and shmem hugepages in smaps. [zokeefe@google.com: fix "add thp collapse file and tmpfs testing" for tmpfs] Link: https://lkml.kernel.org/r/20220913212517.3163701-1-zokeefe@google.com Link: https://lkml.kernel.org/r/20220907144521.3115321-8-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-8-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 471 +++++++++++++++++++++--- tools/testing/selftests/vm/vm_util.c | 10 + tools/testing/selftests/vm/vm_util.h | 2 + 3 files changed, 429 insertions(+), 54 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 06ea6f18980e..08de6141c2af 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1,7 +1,9 @@ #define _GNU_SOURCE +#include #include #include #include +#include #include #include #include @@ -11,12 +13,21 @@ #include #include +#include +#include +#include +#include + +#include "linux/magic.h" #include "vm_util.h" #ifndef MADV_PAGEOUT #define MADV_PAGEOUT 21 #endif +#ifndef MADV_POPULATE_READ +#define MADV_POPULATE_READ 22 +#endif #ifndef MADV_COLLAPSE #define MADV_COLLAPSE 25 #endif @@ -28,20 +39,47 @@ static int hpage_pmd_nr; #define THP_SYSFS "/sys/kernel/mm/transparent_hugepage/" #define PID_SMAPS "/proc/self/smaps" +#define TEST_FILE "collapse_test_file" + +#define MAX_LINE_LENGTH 500 + +enum vma_type { + VMA_ANON, + VMA_FILE, + VMA_SHMEM, +}; struct mem_ops { void *(*setup_area)(int nr_hpages); void (*cleanup_area)(void *p, unsigned long size); void (*fault)(void *p, unsigned long start, unsigned long end); bool (*check_huge)(void *addr, int nr_hpages); + const char *name; }; +static struct mem_ops *file_ops; +static struct mem_ops *anon_ops; + struct collapse_context { void (*collapse)(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect); bool enforce_pte_scan_limits; + const char *name; }; +static struct collapse_context *khugepaged_context; +static struct collapse_context *madvise_context; + +struct file_info { + const char *dir; + char path[PATH_MAX]; + enum vma_type type; + int fd; + char dev_queue_read_ahead_path[PATH_MAX]; +}; + +static struct file_info finfo; + enum thp_enabled { THP_ALWAYS, THP_MADVISE, @@ -107,6 +145,7 @@ struct settings { enum shmem_enabled shmem_enabled; bool use_zero_page; struct khugepaged_settings khugepaged; + unsigned long read_ahead_kb; }; static struct settings saved_settings; @@ -125,6 +164,11 @@ static void fail(const char *msg) exit_status++; } +static void skip(const char *msg) +{ + printf(" \e[33m%s\e[0m\n", msg); +} + static int read_file(const char *path, char *buf, size_t buflen) { int fd; @@ -152,13 +196,19 @@ static int write_file(const char *path, const char *buf, size_t buflen) ssize_t numwritten; fd = open(path, O_WRONLY); - if (fd == -1) + if (fd == -1) { + printf("open(%s)\n", path); + exit(EXIT_FAILURE); return 0; + } numwritten = write(fd, buf, buflen - 1); close(fd); - if (numwritten < 1) + if (numwritten < 1) { + printf("write(%s)\n", buf); + exit(EXIT_FAILURE); return 0; + } return (unsigned int) numwritten; } @@ -225,20 +275,11 @@ static void write_string(const char *name, const char *val) } } -static const unsigned long read_num(const char *name) +static const unsigned long _read_num(const char *path) { - char path[PATH_MAX]; char buf[21]; - int ret; - ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); - if (ret >= PATH_MAX) { - printf("%s: Pathname is too long\n", __func__); - exit(EXIT_FAILURE); - } - - ret = read_file(path, buf, sizeof(buf)); - if (ret < 0) { + if (read_file(path, buf, sizeof(buf)) < 0) { perror("read_file(read_num)"); exit(EXIT_FAILURE); } @@ -246,10 +287,9 @@ static const unsigned long read_num(const char *name) return strtoul(buf, NULL, 10); } -static void write_num(const char *name, unsigned long num) +static const unsigned long read_num(const char *name) { char path[PATH_MAX]; - char buf[21]; int ret; ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); @@ -257,6 +297,12 @@ static void write_num(const char *name, unsigned long num) printf("%s: Pathname is too long\n", __func__); exit(EXIT_FAILURE); } + return _read_num(path); +} + +static void _write_num(const char *path, unsigned long num) +{ + char buf[21]; sprintf(buf, "%ld", num); if (!write_file(path, buf, strlen(buf) + 1)) { @@ -265,6 +311,19 @@ static void write_num(const char *name, unsigned long num) } } +static void write_num(const char *name, unsigned long num) +{ + char path[PATH_MAX]; + int ret; + + ret = snprintf(path, PATH_MAX, THP_SYSFS "%s", name); + if (ret >= PATH_MAX) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + _write_num(path, num); +} + static void write_settings(struct settings *settings) { struct khugepaged_settings *khugepaged = &settings->khugepaged; @@ -284,6 +343,10 @@ static void write_settings(struct settings *settings) write_num("khugepaged/max_ptes_swap", khugepaged->max_ptes_swap); write_num("khugepaged/max_ptes_shared", khugepaged->max_ptes_shared); write_num("khugepaged/pages_to_scan", khugepaged->pages_to_scan); + + if (file_ops && finfo.type == VMA_FILE) + _write_num(finfo.dev_queue_read_ahead_path, + settings->read_ahead_kb); } #define MAX_SETTINGS_DEPTH 4 @@ -354,6 +417,10 @@ static void save_settings(void) .max_ptes_shared = read_num("khugepaged/max_ptes_shared"), .pages_to_scan = read_num("khugepaged/pages_to_scan"), }; + if (file_ops && finfo.type == VMA_FILE) + saved_settings.read_ahead_kb = + _read_num(finfo.dev_queue_read_ahead_path); + success("OK"); signal(SIGTERM, restore_settings); @@ -362,7 +429,90 @@ static void save_settings(void) signal(SIGQUIT, restore_settings); } -#define MAX_LINE_LENGTH 500 +static void get_finfo(const char *dir) +{ + struct stat path_stat; + struct statfs fs; + char buf[1 << 10]; + char path[PATH_MAX]; + char *str, *end; + + finfo.dir = dir; + stat(finfo.dir, &path_stat); + if (!S_ISDIR(path_stat.st_mode)) { + printf("%s: Not a directory (%s)\n", __func__, finfo.dir); + exit(EXIT_FAILURE); + } + if (snprintf(finfo.path, sizeof(finfo.path), "%s/" TEST_FILE, + finfo.dir) >= sizeof(finfo.path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (statfs(finfo.dir, &fs)) { + perror("statfs()"); + exit(EXIT_FAILURE); + } + finfo.type = fs.f_type == TMPFS_MAGIC ? VMA_SHMEM : VMA_FILE; + if (finfo.type == VMA_SHMEM) + return; + + /* Find owning device's queue/read_ahead_kb control */ + if (snprintf(path, sizeof(path), "/sys/dev/block/%d:%d/uevent", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + if (read_file(path, buf, sizeof(buf)) < 0) { + perror("read_file(read_num)"); + exit(EXIT_FAILURE); + } + if (strstr(buf, "DEVTYPE=disk")) { + /* Found it */ + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/dev/block/%d:%d/queue/read_ahead_kb", + major(path_stat.st_dev), minor(path_stat.st_dev)) + >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + if (!strstr(buf, "DEVTYPE=partition")) { + printf("%s: Unknown device type: %s\n", __func__, path); + exit(EXIT_FAILURE); + } + /* + * Partition of block device - need to find actual device. + * Using naming convention that devnameN is partition of + * device devname. + */ + str = strstr(buf, "DEVNAME="); + if (!str) { + printf("%s: Could not read: %s", __func__, path); + exit(EXIT_FAILURE); + } + str += 8; + end = str; + while (*end) { + if (isdigit(*end)) { + *end = '\0'; + if (snprintf(finfo.dev_queue_read_ahead_path, + sizeof(finfo.dev_queue_read_ahead_path), + "/sys/block/%s/queue/read_ahead_kb", + str) >= sizeof(finfo.dev_queue_read_ahead_path)) { + printf("%s: Pathname is too long\n", __func__); + exit(EXIT_FAILURE); + } + return; + } + ++end; + } + printf("%s: Could not read: %s\n", __func__, path); + exit(EXIT_FAILURE); +} + static bool check_swap(void *addr, unsigned long size) { bool swap = false; @@ -518,11 +668,91 @@ static bool anon_check_huge(void *addr, int nr_hpages) return check_huge_anon(addr, nr_hpages, hpage_pmd_size); } -static struct mem_ops anon_ops = { +static void *file_setup_area(int nr_hpages) +{ + int fd; + void *p; + unsigned long size; + + unlink(finfo.path); /* Cleanup from previous failed tests */ + printf("Creating %s for collapse%s...", finfo.path, + finfo.type == VMA_SHMEM ? " (tmpfs)" : ""); + fd = open(finfo.path, O_DSYNC | O_CREAT | O_RDWR | O_TRUNC | O_EXCL, + 777); + if (fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + + size = nr_hpages * hpage_pmd_size; + p = alloc_mapping(nr_hpages); + fill_memory(p, 0, size); + write(fd, p, size); + close(fd); + munmap(p, size); + success("OK"); + + printf("Opening %s read only for collapse...", finfo.path); + finfo.fd = open(finfo.path, O_RDONLY, 777); + if (finfo.fd < 0) { + perror("open()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_EXEC, + MAP_PRIVATE, finfo.fd, 0); + if (p == MAP_FAILED || p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + + /* Drop page cache */ + write_file("/proc/sys/vm/drop_caches", "3", 2); + success("OK"); + return p; +} + +static void file_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); + unlink(finfo.path); +} + +static void file_fault(void *p, unsigned long start, unsigned long end) +{ + if (madvise(((char *)p) + start, end - start, MADV_POPULATE_READ)) { + perror("madvise(MADV_POPULATE_READ"); + exit(EXIT_FAILURE); + } +} + +static bool file_check_huge(void *addr, int nr_hpages) +{ + switch (finfo.type) { + case VMA_FILE: + return check_huge_file(addr, nr_hpages, hpage_pmd_size); + case VMA_SHMEM: + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); + default: + exit(EXIT_FAILURE); + return false; + } +} + +static struct mem_ops __anon_ops = { .setup_area = &anon_setup_area, .cleanup_area = &anon_cleanup_area, .fault = &anon_fault, .check_huge = &anon_check_huge, + .name = "anon", +}; + +static struct mem_ops __file_ops = { + .setup_area = &file_setup_area, + .cleanup_area = &file_cleanup_area, + .fault = &file_fault, + .check_huge = &file_check_huge, + .name = "file", }; static void __madvise_collapse(const char *msg, char *p, int nr_hpages, @@ -538,6 +768,7 @@ static void __madvise_collapse(const char *msg, char *p, int nr_hpages, * ignores /sys/kernel/mm/transparent_hugepage/enabled */ settings.thp_enabled = THP_NEVER; + settings.shmem_enabled = SHMEM_NEVER; push_settings(&settings); /* Clear VM_NOHUGEPAGE */ @@ -608,12 +839,37 @@ static void khugepaged_collapse(const char *msg, char *p, int nr_hpages, return; } + /* + * For file and shmem memory, khugepaged only retracts pte entries after + * putting the new hugepage in the page cache. The hugepage must be + * subsequently refaulted to install the pmd mapping for the mm. + */ + if (ops != &__anon_ops) + ops->fault(p, 0, nr_hpages * hpage_pmd_size); + if (ops->check_huge(p, expect ? nr_hpages : 0)) success("OK"); else fail("Fail"); } +static struct collapse_context __khugepaged_context = { + .collapse = &khugepaged_collapse, + .enforce_pte_scan_limits = true, + .name = "khugepaged", +}; + +static struct collapse_context __madvise_context = { + .collapse = &madvise_collapse, + .enforce_pte_scan_limits = false, + .name = "madvise", +}; + +static bool is_tmpfs(struct mem_ops *ops) +{ + return ops == &__file_ops && finfo.type == VMA_SHMEM; +} + static void alloc_at_fault(void) { struct settings settings = *current_settings(); @@ -686,6 +942,13 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o p = ops->setup_area(1); + if (is_tmpfs(ops)) { + /* shmem pages always in the page cache */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + ops->fault(p, 0, (hpage_pmd_nr - max_ptes_none - 1) * page_size); c->collapse("Maybe collapse with max_ptes_none exceeded", p, 1, ops, !c->enforce_pte_scan_limits); @@ -698,6 +961,7 @@ static void collapse_max_ptes_none(struct collapse_context *c, struct mem_ops *o validate_memory(p, 0, (hpage_pmd_nr - max_ptes_none) * page_size); } +skip: ops->cleanup_area(p, hpage_pmd_size); pop_settings(); } @@ -781,6 +1045,13 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc p = alloc_hpage(ops); + if (is_tmpfs(ops)) { + /* MADV_DONTNEED won't evict tmpfs pages */ + printf("tmpfs..."); + skip("Skip"); + goto skip; + } + madvise(p, hpage_pmd_size, MADV_NOHUGEPAGE); printf("Split huge page leaving single PTE mapping compound page..."); madvise(p + page_size, hpage_pmd_size - page_size, MADV_DONTNEED); @@ -792,6 +1063,7 @@ static void collapse_single_pte_entry_compound(struct collapse_context *c, struc c->collapse("Collapse PTE table with single PTE mapping compound page", p, 1, ops, true); validate_memory(p, 0, page_size); +skip: ops->cleanup_area(p, hpage_pmd_size); } @@ -1038,9 +1310,70 @@ static void madvise_collapse_existing_thps(struct collapse_context *c, ops->cleanup_area(p, hpage_pmd_size); } +static void usage(void) +{ + fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); + fprintf(stderr, "\t\t: :\n"); + fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); + fprintf(stderr, "\t\t: [all|anon|file]\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); + fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); + fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); + fprintf(stderr, "\n\tif [dir] is a (sub)directory of a tmpfs mount, tmpfs must be\n"); + fprintf(stderr, "\tmounted with huge=madvise option for khugepaged tests to work\n"); + exit(1); +} + +static void parse_test_type(int argc, const char **argv) +{ + char *buf; + const char *token; + + if (argc == 1) { + /* Backwards compatibility */ + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + anon_ops = &__anon_ops; + return; + } + + buf = strdup(argv[1]); + token = strsep(&buf, ":"); + + if (!strcmp(token, "all")) { + khugepaged_context = &__khugepaged_context; + madvise_context = &__madvise_context; + } else if (!strcmp(token, "khugepaged")) { + khugepaged_context = &__khugepaged_context; + } else if (!strcmp(token, "madvise")) { + madvise_context = &__madvise_context; + } else { + usage(); + } + + if (!buf) + usage(); + + if (!strcmp(buf, "all")) { + file_ops = &__file_ops; + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "anon")) { + anon_ops = &__anon_ops; + } else if (!strcmp(buf, "file")) { + file_ops = &__file_ops; + } else { + usage(); + } + + if (!file_ops) + return; + + if (argc != 3) + usage(); +} + int main(int argc, const char **argv) { - struct collapse_context c; struct settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, @@ -1051,8 +1384,20 @@ int main(int argc, const char **argv) .alloc_sleep_millisecs = 10, .scan_sleep_millisecs = 10, }, + /* + * When testing file-backed memory, the collapse path + * looks at how many pages are found in the page cache, not + * what pages are mapped. Disable read ahead optimization so + * pages don't find their way into the page cache unless + * we mem_ops->fault() them in. + */ + .read_ahead_kb = 0, }; - const char *tests = argc == 1 ? "all" : argv[1]; + + parse_test_type(argc, argv); + + if (file_ops) + get_finfo(argv[2]); setbuf(stdout, NULL); @@ -1070,43 +1415,61 @@ int main(int argc, const char **argv) alloc_at_fault(); - if (!strcmp(tests, "khugepaged") || !strcmp(tests, "all")) { - printf("\n*** Testing context: khugepaged ***\n"); - c.collapse = &khugepaged_collapse; - c.enforce_pte_scan_limits = true; +#define TEST(t, c, o) do { \ + if (c && o) { \ + printf("\nRun test: " #t " (%s:%s)\n", c->name, o->name); \ + t(c, o); \ + } \ + } while (0) - collapse_full(&c, &anon_ops); - collapse_empty(&c, &anon_ops); - collapse_single_pte_entry(&c, &anon_ops); - collapse_max_ptes_none(&c, &anon_ops); - collapse_swapin_single_pte(&c, &anon_ops); - collapse_max_ptes_swap(&c, &anon_ops); - collapse_single_pte_entry_compound(&c, &anon_ops); - collapse_full_of_compound(&c, &anon_ops); - collapse_compound_extreme(&c, &anon_ops); - collapse_fork(&c, &anon_ops); - collapse_fork_compound(&c, &anon_ops); - collapse_max_ptes_shared(&c, &anon_ops); - } - if (!strcmp(tests, "madvise") || !strcmp(tests, "all")) { - printf("\n*** Testing context: madvise ***\n"); - c.collapse = &madvise_collapse; - c.enforce_pte_scan_limits = false; + TEST(collapse_full, khugepaged_context, anon_ops); + TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, madvise_context, anon_ops); + TEST(collapse_full, madvise_context, file_ops); - collapse_full(&c, &anon_ops); - collapse_empty(&c, &anon_ops); - collapse_single_pte_entry(&c, &anon_ops); - collapse_max_ptes_none(&c, &anon_ops); - collapse_swapin_single_pte(&c, &anon_ops); - collapse_max_ptes_swap(&c, &anon_ops); - collapse_single_pte_entry_compound(&c, &anon_ops); - collapse_full_of_compound(&c, &anon_ops); - collapse_compound_extreme(&c, &anon_ops); - collapse_fork(&c, &anon_ops); - collapse_fork_compound(&c, &anon_ops); - collapse_max_ptes_shared(&c, &anon_ops); - madvise_collapse_existing_thps(&c, &anon_ops); - } + TEST(collapse_empty, khugepaged_context, anon_ops); + TEST(collapse_empty, madvise_context, anon_ops); + + TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, anon_ops); + TEST(collapse_single_pte_entry, madvise_context, file_ops); + + TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_none, khugepaged_context, file_ops); + TEST(collapse_max_ptes_none, madvise_context, anon_ops); + TEST(collapse_max_ptes_none, madvise_context, file_ops); + + TEST(collapse_single_pte_entry_compound, khugepaged_context, anon_ops); + TEST(collapse_single_pte_entry_compound, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, anon_ops); + TEST(collapse_single_pte_entry_compound, madvise_context, file_ops); + + TEST(collapse_full_of_compound, khugepaged_context, anon_ops); + TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, anon_ops); + TEST(collapse_full_of_compound, madvise_context, file_ops); + + TEST(collapse_compound_extreme, khugepaged_context, anon_ops); + TEST(collapse_compound_extreme, madvise_context, anon_ops); + + TEST(collapse_swapin_single_pte, khugepaged_context, anon_ops); + TEST(collapse_swapin_single_pte, madvise_context, anon_ops); + + TEST(collapse_max_ptes_swap, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_swap, madvise_context, anon_ops); + + TEST(collapse_fork, khugepaged_context, anon_ops); + TEST(collapse_fork, madvise_context, anon_ops); + + TEST(collapse_fork_compound, khugepaged_context, anon_ops); + TEST(collapse_fork_compound, madvise_context, anon_ops); + + TEST(collapse_max_ptes_shared, khugepaged_context, anon_ops); + TEST(collapse_max_ptes_shared, madvise_context, anon_ops); + + TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); + TEST(madvise_collapse_existing_thps, madvise_context, file_ops); restore_settings(0); } diff --git a/tools/testing/selftests/vm/vm_util.c b/tools/testing/selftests/vm/vm_util.c index 9dae51b8219f..f11f8adda521 100644 --- a/tools/testing/selftests/vm/vm_util.c +++ b/tools/testing/selftests/vm/vm_util.c @@ -114,3 +114,13 @@ bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size) { return __check_huge(addr, "AnonHugePages: ", nr_hpages, hpage_size); } + +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "FilePmdMapped:", nr_hpages, hpage_size); +} + +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size) +{ + return __check_huge(addr, "ShmemPmdMapped:", nr_hpages, hpage_size); +} diff --git a/tools/testing/selftests/vm/vm_util.h b/tools/testing/selftests/vm/vm_util.h index 8434ea0c95cd..5c35de454e08 100644 --- a/tools/testing/selftests/vm/vm_util.h +++ b/tools/testing/selftests/vm/vm_util.h @@ -8,3 +8,5 @@ void clear_softdirty(void); bool check_for_pattern(FILE *fp, const char *pattern, char *buf, size_t len); uint64_t read_pmd_pagesize(void); bool check_huge_anon(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_file(void *addr, int nr_hpages, uint64_t hpage_size); +bool check_huge_shmem(void *addr, int nr_hpages, uint64_t hpage_size); From d0d35b6010a8bcc12b986f51d29cf3a8635cdbb4 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:44 -0700 Subject: [PATCH 336/350] selftests/vm: add thp collapse shmem testing Add memory operations for shmem (memfd) memory, and reuse existing tests with the new memory operations. Shmem tests can be called with "shmem" mem_type, and shmem tests are ran with "all" mem_type as well. Link: https://lkml.kernel.org/r/20220907144521.3115321-9-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-9-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 57 ++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index 08de6141c2af..eabbd2710096 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -59,6 +59,7 @@ struct mem_ops { static struct mem_ops *file_ops; static struct mem_ops *anon_ops; +static struct mem_ops *shmem_ops; struct collapse_context { void (*collapse)(const char *msg, char *p, int nr_hpages, @@ -739,6 +740,40 @@ static bool file_check_huge(void *addr, int nr_hpages) } } +static void *shmem_setup_area(int nr_hpages) +{ + void *p; + unsigned long size = nr_hpages * hpage_pmd_size; + + finfo.fd = memfd_create("khugepaged-selftest-collapse-shmem", 0); + if (finfo.fd < 0) { + perror("memfd_create()"); + exit(EXIT_FAILURE); + } + if (ftruncate(finfo.fd, size)) { + perror("ftruncate()"); + exit(EXIT_FAILURE); + } + p = mmap(BASE_ADDR, size, PROT_READ | PROT_WRITE, MAP_SHARED, finfo.fd, + 0); + if (p != BASE_ADDR) { + perror("mmap()"); + exit(EXIT_FAILURE); + } + return p; +} + +static void shmem_cleanup_area(void *p, unsigned long size) +{ + munmap(p, size); + close(finfo.fd); +} + +static bool shmem_check_huge(void *addr, int nr_hpages) +{ + return check_huge_shmem(addr, nr_hpages, hpage_pmd_size); +} + static struct mem_ops __anon_ops = { .setup_area = &anon_setup_area, .cleanup_area = &anon_cleanup_area, @@ -755,6 +790,14 @@ static struct mem_ops __file_ops = { .name = "file", }; +static struct mem_ops __shmem_ops = { + .setup_area = &shmem_setup_area, + .cleanup_area = &shmem_cleanup_area, + .fault = &anon_fault, + .check_huge = &shmem_check_huge, + .name = "shmem", +}; + static void __madvise_collapse(const char *msg, char *p, int nr_hpages, struct mem_ops *ops, bool expect) { @@ -1315,7 +1358,7 @@ static void usage(void) fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); fprintf(stderr, "\t\t: :\n"); fprintf(stderr, "\t\t: [all|khugepaged|madvise]\n"); - fprintf(stderr, "\t\t: [all|anon|file]\n"); + fprintf(stderr, "\t\t: [all|anon|file|shmem]\n"); fprintf(stderr, "\n\t\"file,all\" mem_type requires [dir] argument\n"); fprintf(stderr, "\n\t\"file,all\" mem_type requires kernel built with\n"); fprintf(stderr, "\tCONFIG_READ_ONLY_THP_FOR_FS=y\n"); @@ -1357,10 +1400,13 @@ static void parse_test_type(int argc, const char **argv) if (!strcmp(buf, "all")) { file_ops = &__file_ops; anon_ops = &__anon_ops; + shmem_ops = &__shmem_ops; } else if (!strcmp(buf, "anon")) { anon_ops = &__anon_ops; } else if (!strcmp(buf, "file")) { file_ops = &__file_ops; + } else if (!strcmp(buf, "shmem")) { + shmem_ops = &__shmem_ops; } else { usage(); } @@ -1377,7 +1423,7 @@ int main(int argc, const char **argv) struct settings default_settings = { .thp_enabled = THP_MADVISE, .thp_defrag = THP_DEFRAG_ALWAYS, - .shmem_enabled = SHMEM_NEVER, + .shmem_enabled = SHMEM_ADVISE, .use_zero_page = 0, .khugepaged = { .defrag = 1, @@ -1424,16 +1470,20 @@ int main(int argc, const char **argv) TEST(collapse_full, khugepaged_context, anon_ops); TEST(collapse_full, khugepaged_context, file_ops); + TEST(collapse_full, khugepaged_context, shmem_ops); TEST(collapse_full, madvise_context, anon_ops); TEST(collapse_full, madvise_context, file_ops); + TEST(collapse_full, madvise_context, shmem_ops); TEST(collapse_empty, khugepaged_context, anon_ops); TEST(collapse_empty, madvise_context, anon_ops); TEST(collapse_single_pte_entry, khugepaged_context, anon_ops); TEST(collapse_single_pte_entry, khugepaged_context, file_ops); + TEST(collapse_single_pte_entry, khugepaged_context, shmem_ops); TEST(collapse_single_pte_entry, madvise_context, anon_ops); TEST(collapse_single_pte_entry, madvise_context, file_ops); + TEST(collapse_single_pte_entry, madvise_context, shmem_ops); TEST(collapse_max_ptes_none, khugepaged_context, anon_ops); TEST(collapse_max_ptes_none, khugepaged_context, file_ops); @@ -1447,8 +1497,10 @@ int main(int argc, const char **argv) TEST(collapse_full_of_compound, khugepaged_context, anon_ops); TEST(collapse_full_of_compound, khugepaged_context, file_ops); + TEST(collapse_full_of_compound, khugepaged_context, shmem_ops); TEST(collapse_full_of_compound, madvise_context, anon_ops); TEST(collapse_full_of_compound, madvise_context, file_ops); + TEST(collapse_full_of_compound, madvise_context, shmem_ops); TEST(collapse_compound_extreme, khugepaged_context, anon_ops); TEST(collapse_compound_extreme, madvise_context, anon_ops); @@ -1470,6 +1522,7 @@ int main(int argc, const char **argv) TEST(madvise_collapse_existing_thps, madvise_context, anon_ops); TEST(madvise_collapse_existing_thps, madvise_context, file_ops); + TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); restore_settings(0); } From 69d9428ce97f28eb1ba8acee552cf46014663d2b Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:45 -0700 Subject: [PATCH 337/350] selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd This test tests that MADV_COLLAPSE acting on file/shmem memory for which (1) the file extent mapping by the memory is already a huge page in the page cache, and (2) the pmd mapping this memory in the target process is none. In practice, (1)+(2) is the state left over after khugepaged has successfully collapsed file/shmem memory for a target VMA, but the memory has not yet been refaulted. So, this test in-effect tests MADV_COLLAPSE racing with khugepaged to collapse the memory first. Link: https://lkml.kernel.org/r/20220907144521.3115321-10-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-10-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/khugepaged.c | 30 +++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/tools/testing/selftests/vm/khugepaged.c b/tools/testing/selftests/vm/khugepaged.c index eabbd2710096..64126c8cd561 100644 --- a/tools/testing/selftests/vm/khugepaged.c +++ b/tools/testing/selftests/vm/khugepaged.c @@ -1353,6 +1353,33 @@ static void madvise_collapse_existing_thps(struct collapse_context *c, ops->cleanup_area(p, hpage_pmd_size); } +/* + * Test race with khugepaged where page tables have been retracted and + * pmd cleared. + */ +static void madvise_retracted_page_tables(struct collapse_context *c, + struct mem_ops *ops) +{ + void *p; + int nr_hpages = 1; + unsigned long size = nr_hpages * hpage_pmd_size; + + p = ops->setup_area(nr_hpages); + ops->fault(p, 0, size); + + /* Let khugepaged collapse and leave pmd cleared */ + if (wait_for_scan("Collapse and leave PMD cleared", p, nr_hpages, + ops)) { + fail("Timeout"); + return; + } + success("OK"); + c->collapse("Install huge PMD from page cache", p, nr_hpages, ops, + true); + validate_memory(p, 0, size); + ops->cleanup_area(p, size); +} + static void usage(void) { fprintf(stderr, "\nUsage: ./khugepaged [dir]\n\n"); @@ -1524,5 +1551,8 @@ int main(int argc, const char **argv) TEST(madvise_collapse_existing_thps, madvise_context, file_ops); TEST(madvise_collapse_existing_thps, madvise_context, shmem_ops); + TEST(madvise_retracted_page_tables, madvise_context, file_ops); + TEST(madvise_retracted_page_tables, madvise_context, shmem_ops); + restore_settings(0); } From 0f633baac0f1716200bbccc6430b6006d103d7b9 Mon Sep 17 00:00:00 2001 From: Zach O'Keefe Date: Thu, 22 Sep 2022 15:40:46 -0700 Subject: [PATCH 338/350] selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory Add :collapse mod to userfaultfd selftest. Currently this mod is only valid for "shmem" test type, but could be used for other test types. When provided, memory allocated by ->allocate_area() will be hugepage-aligned enforced to be hugepage-sized. userfaultf_minor_test, after the UFFD-registered mapping has been populated by UUFD minor fault handler, attempt to MADV_COLLAPSE the UFFD-registered mapping to collapse the memory into a pmd-mapped THP. This test is meant to be a functional test of what occurs during UFFD-driven live migration of VMs backed by huge tmpfs where, after a hugepage-sized region has been successfully migrated (in native page-sized chunks, to avoid latency of fetched a hugepage over the network), we want to reclaim previous VM performance by remapping it at the PMD level. Link: https://lkml.kernel.org/r/20220907144521.3115321-11-zokeefe@google.com Link: https://lkml.kernel.org/r/20220922224046.1143204-11-zokeefe@google.com Signed-off-by: Zach O'Keefe Cc: Axel Rasmussen Cc: Chris Kennelly Cc: David Hildenbrand Cc: David Rientjes Cc: Hugh Dickins Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Minchan Kim Cc: Pasha Tatashin Cc: Peter Xu Cc: Rongwei Wang Cc: SeongJae Park Cc: Song Liu Cc: Vlastimil Babka Cc: Yang Shi Signed-off-by: Andrew Morton --- tools/testing/selftests/vm/Makefile | 1 + tools/testing/selftests/vm/userfaultfd.c | 173 ++++++++++++++++++----- 2 files changed, 135 insertions(+), 39 deletions(-) diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index c9c0996c122b..c687533374e6 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -99,6 +99,7 @@ $(OUTPUT)/khugepaged: vm_util.c $(OUTPUT)/madv_populate: vm_util.c $(OUTPUT)/soft-dirty: vm_util.c $(OUTPUT)/split_huge_page_test: vm_util.c +$(OUTPUT)/userfaultfd: vm_util.c ifeq ($(MACHINE),x86_64) BINARIES_32 := $(patsubst %,$(OUTPUT)/%,$(BINARIES_32)) diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 7be709d9eed0..74babdbc02e5 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -61,10 +61,11 @@ #include #include "../kselftest.h" +#include "vm_util.h" #ifdef __NR_userfaultfd -static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size; +static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size, hpage_size; #define BOUNCE_RANDOM (1<<0) #define BOUNCE_RACINGFAULTS (1<<1) @@ -79,6 +80,8 @@ static int test_type; #define UFFD_FLAGS (O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY) +#define BASE_PMD_ADDR ((void *)(1UL << 30)) + /* test using /dev/userfaultfd, instead of userfaultfd(2) */ static bool test_dev_userfaultfd; @@ -97,9 +100,10 @@ static int huge_fd; static unsigned long long *count_verify; static int uffd = -1; static int uffd_flags, finished, *pipefd; -static char *area_src, *area_src_alias, *area_dst, *area_dst_alias; +static char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap; static char *zeropage; pthread_attr_t attr; +static bool test_collapse; /* Userfaultfd test statistics */ struct uffd_stats { @@ -127,6 +131,8 @@ struct uffd_stats { #define swap(a, b) \ do { typeof(a) __tmp = (a); (a) = (b); (b) = __tmp; } while (0) +#define factor_of_2(x) ((x) ^ ((x) & ((x) - 1))) + const char *examples = "# Run anonymous memory test on 100MiB region with 99999 bounces:\n" "./userfaultfd anon 100 99999\n\n" @@ -152,6 +158,8 @@ static void usage(void) "Supported mods:\n"); fprintf(stderr, "\tsyscall - Use userfaultfd(2) (default)\n"); fprintf(stderr, "\tdev - Use /dev/userfaultfd instead of userfaultfd(2)\n"); + fprintf(stderr, "\tcollapse - Test MADV_COLLAPSE of UFFDIO_REGISTER_MODE_MINOR\n" + "memory\n"); fprintf(stderr, "\nExample test mod usage:\n"); fprintf(stderr, "# Run anonymous memory test with /dev/userfaultfd:\n"); fprintf(stderr, "./userfaultfd anon:dev 100 99999\n\n"); @@ -229,12 +237,10 @@ static void anon_release_pages(char *rel_area) err("madvise(MADV_DONTNEED) failed"); } -static void anon_allocate_area(void **alloc_area) +static void anon_allocate_area(void **alloc_area, bool is_src) { *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); - if (*alloc_area == MAP_FAILED) - err("mmap of anonymous memory failed"); } static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) @@ -252,7 +258,7 @@ static void hugetlb_release_pages(char *rel_area) } } -static void hugetlb_allocate_area(void **alloc_area) +static void hugetlb_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; char **alloc_area_alias; @@ -262,7 +268,7 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | MAP_HUGETLB | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), -1, 0); else @@ -270,9 +276,9 @@ static void hugetlb_allocate_area(void **alloc_area) nr_pages * page_size, PROT_READ | PROT_WRITE, MAP_SHARED | - (*alloc_area == area_src ? 0 : MAP_NORESERVE), + (is_src ? 0 : MAP_NORESERVE), huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); @@ -282,12 +288,12 @@ static void hugetlb_allocate_area(void **alloc_area) PROT_READ | PROT_WRITE, MAP_SHARED, huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + is_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) err("mmap of hugetlb file alias failed"); } - if (*alloc_area == area_src) { + if (is_src) { alloc_area_alias = &area_src_alias; } else { alloc_area_alias = &area_dst_alias; @@ -310,21 +316,36 @@ static void shmem_release_pages(char *rel_area) err("madvise(MADV_REMOVE) failed"); } -static void shmem_allocate_area(void **alloc_area) +static void shmem_allocate_area(void **alloc_area, bool is_src) { void *area_alias = NULL; - bool is_src = alloc_area == (void **)&area_src; - unsigned long offset = is_src ? 0 : nr_pages * page_size; + size_t bytes = nr_pages * page_size; + unsigned long offset = is_src ? 0 : bytes; + char *p = NULL, *p_alias = NULL; - *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + if (test_collapse) { + p = BASE_PMD_ADDR; + if (!is_src) + /* src map + alias + interleaved hpages */ + p += 2 * (bytes + hpage_size); + p_alias = p; + p_alias += bytes; + p_alias += hpage_size; /* Prevent src/dst VMA merge */ + } + + *alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (*alloc_area == MAP_FAILED) err("mmap of memfd failed"); + if (test_collapse && *alloc_area != p) + err("mmap of memfd failed at %p", p); - area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, shm_fd, offset); + area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED, + shm_fd, offset); if (area_alias == MAP_FAILED) err("mmap of memfd alias failed"); + if (test_collapse && area_alias != p_alias) + err("mmap of anonymous memory failed at %p", p_alias); if (is_src) area_src_alias = area_alias; @@ -337,28 +358,39 @@ static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset) *start = (unsigned long)area_dst_alias + offset; } +static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages) +{ + if (!check_huge_shmem(area_dst_alias, expect_nr_hpages, hpage_size)) + err("Did not find expected %d number of hugepages", + expect_nr_hpages); +} + struct uffd_test_ops { - void (*allocate_area)(void **alloc_area); + void (*allocate_area)(void **alloc_area, bool is_src); void (*release_pages)(char *rel_area); void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset); + void (*check_pmd_mapping)(void *p, int expect_nr_hpages); }; static struct uffd_test_ops anon_uffd_test_ops = { .allocate_area = anon_allocate_area, .release_pages = anon_release_pages, .alias_mapping = noop_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops shmem_uffd_test_ops = { .allocate_area = shmem_allocate_area, .release_pages = shmem_release_pages, .alias_mapping = shmem_alias_mapping, + .check_pmd_mapping = shmem_check_pmd_mapping, }; static struct uffd_test_ops hugetlb_uffd_test_ops = { .allocate_area = hugetlb_allocate_area, .release_pages = hugetlb_release_pages, .alias_mapping = hugetlb_alias_mapping, + .check_pmd_mapping = NULL, }; static struct uffd_test_ops *uffd_test_ops; @@ -478,6 +510,7 @@ static void uffd_test_ctx_clear(void) munmap_area((void **)&area_src_alias); munmap_area((void **)&area_dst); munmap_area((void **)&area_dst_alias); + munmap_area((void **)&area_remap); } static void uffd_test_ctx_init(uint64_t features) @@ -486,8 +519,8 @@ static void uffd_test_ctx_init(uint64_t features) uffd_test_ctx_clear(); - uffd_test_ops->allocate_area((void **)&area_src); - uffd_test_ops->allocate_area((void **)&area_dst); + uffd_test_ops->allocate_area((void **)&area_src, true); + uffd_test_ops->allocate_area((void **)&area_dst, false); userfaultfd_open(&features); @@ -804,6 +837,7 @@ static void *uffd_poll_thread(void *arg) err("remove failure"); break; case UFFD_EVENT_REMAP: + area_remap = area_dst; /* save for later unmap */ area_dst = (char *)(unsigned long)msg.arg.remap.to; break; } @@ -1256,13 +1290,30 @@ static int userfaultfd_sig_test(void) return userfaults != 0; } -static int userfaultfd_minor_test(void) +void check_memory_contents(char *p) { - struct uffdio_register uffdio_register; - unsigned long p; - pthread_t uffd_mon; + unsigned long i; uint8_t expected_byte; void *expected_page; + + if (posix_memalign(&expected_page, page_size, page_size)) + err("out of memory"); + + for (i = 0; i < nr_pages; ++i) { + expected_byte = ~((uint8_t)(i % ((uint8_t)-1))); + memset(expected_page, expected_byte, page_size); + if (my_bcmp(expected_page, p + (i * page_size), page_size)) + err("unexpected page contents after minor fault"); + } + + free(expected_page); +} + +static int userfaultfd_minor_test(void) +{ + unsigned long p; + struct uffdio_register uffdio_register; + pthread_t uffd_mon; char c; struct uffd_stats stats = { 0 }; @@ -1301,17 +1352,7 @@ static int userfaultfd_minor_test(void) * fault. uffd_poll_thread will resolve the fault by bit-flipping the * page's contents, and then issuing a CONTINUE ioctl. */ - - if (posix_memalign(&expected_page, page_size, page_size)) - err("out of memory"); - - for (p = 0; p < nr_pages; ++p) { - expected_byte = ~((uint8_t)(p % ((uint8_t)-1))); - memset(expected_page, expected_byte, page_size); - if (my_bcmp(expected_page, area_dst_alias + (p * page_size), - page_size)) - err("unexpected page contents after minor fault"); - } + check_memory_contents(area_dst_alias); if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) err("pipe write"); @@ -1320,6 +1361,23 @@ static int userfaultfd_minor_test(void) uffd_stats_report(&stats, 1); + if (test_collapse) { + printf("testing collapse of uffd memory into PMD-mapped THPs:"); + if (madvise(area_dst_alias, nr_pages * page_size, + MADV_COLLAPSE)) + err("madvise(MADV_COLLAPSE)"); + + uffd_test_ops->check_pmd_mapping(area_dst, + nr_pages * page_size / + hpage_size); + /* + * This won't cause uffd-fault - it purely just makes sure there + * was no corruption. + */ + check_memory_contents(area_dst_alias); + printf(" done.\n"); + } + return stats.missing_faults != 0 || stats.minor_faults != nr_pages; } @@ -1656,6 +1714,8 @@ static void parse_test_type_arg(const char *raw_type) test_dev_userfaultfd = true; else if (!strcmp(token, "syscall")) test_dev_userfaultfd = false; + else if (!strcmp(token, "collapse")) + test_collapse = true; else err("unrecognized test mod '%s'", token); } @@ -1663,8 +1723,11 @@ static void parse_test_type_arg(const char *raw_type) if (!test_type) err("failed to parse test type argument: '%s'", raw_type); + if (test_collapse && test_type != TEST_SHMEM) + err("Unsupported test: %s", raw_type); + if (test_type == TEST_HUGETLB) - page_size = default_huge_page_size(); + page_size = hpage_size; else page_size = sysconf(_SC_PAGE_SIZE); @@ -1702,6 +1765,8 @@ static void sigalrm(int sig) int main(int argc, char **argv) { + size_t bytes; + if (argc < 4) usage(); @@ -1709,11 +1774,41 @@ int main(int argc, char **argv) err("failed to arm SIGALRM"); alarm(ALARM_INTERVAL_SECS); + hpage_size = default_huge_page_size(); parse_test_type_arg(argv[1]); + bytes = atol(argv[2]) * 1024 * 1024; + + if (test_collapse && bytes & (hpage_size - 1)) + err("MiB must be multiple of %lu if :collapse mod set", + hpage_size >> 20); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size / - nr_cpus; + + if (test_collapse) { + /* nr_cpus must divide (bytes / page_size), otherwise, + * area allocations of (nr_pages * paze_size) won't be a + * multiple of hpage_size, even if bytes is a multiple of + * hpage_size. + * + * This means that nr_cpus must divide (N * (2 << (H-P)) + * where: + * bytes = hpage_size * N + * hpage_size = 2 << H + * page_size = 2 << P + * + * And we want to chose nr_cpus to be the largest value + * satisfying this constraint, not larger than the number + * of online CPUs. Unfortunately, prime factorization of + * N and nr_cpus may be arbitrary, so have to search for it. + * Instead, just use the highest power of 2 dividing both + * nr_cpus and (bytes / page_size). + */ + int x = factor_of_2(nr_cpus); + int y = factor_of_2(bytes / page_size); + + nr_cpus = x < y ? x : y; + } + nr_pages_per_cpu = bytes / page_size / nr_cpus; if (!nr_pages_per_cpu) { _err("invalid MiB"); usage(); From 6b91e5dfb3c7ef485587e7ab494dcb47bcdadce3 Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Thu, 22 Sep 2022 19:09:35 +0800 Subject: [PATCH 339/350] mm: remove unused inline functions from include/linux/mm_inline.h Remove the following unused inline functions from mm_inline.h: 1. All uses of add_page_to_lru_list_tail() have been removed since commit 7a3dbfe8a52b ("mm/swap: convert lru_deactivate_file to a folio_batch"), and it can be replaced by lruvec_add_folio_tail(). 2. All uses of __clear_page_lru_flags() have been removed since commit 188e8caee968 ("mm/swap: convert __page_cache_release() to use a folio"), and it can be replaced by __folio_clear_lru_flags(). They are useless, so remove them. Link: https://lkml.kernel.org/r/20220922110935.1495099-1-cuigaosheng1@huawei.com Signed-off-by: Gaosheng Cui Signed-off-by: Andrew Morton --- include/linux/mm_inline.h | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/include/linux/mm_inline.h b/include/linux/mm_inline.h index 4949eda9a9a2..e8ed225d8f7c 100644 --- a/include/linux/mm_inline.h +++ b/include/linux/mm_inline.h @@ -76,11 +76,6 @@ static __always_inline void __folio_clear_lru_flags(struct folio *folio) __folio_clear_unevictable(folio); } -static __always_inline void __clear_page_lru_flags(struct page *page) -{ - __folio_clear_lru_flags(page_folio(page)); -} - /** * folio_lru_list - Which LRU list should a folio be on? * @folio: The folio to test. @@ -348,12 +343,6 @@ void lruvec_add_folio_tail(struct lruvec *lruvec, struct folio *folio) list_add_tail(&folio->lru, &lruvec->lists[lru]); } -static __always_inline void add_page_to_lru_list_tail(struct page *page, - struct lruvec *lruvec) -{ - lruvec_add_folio_tail(lruvec, page_folio(page)); -} - static __always_inline void lruvec_del_folio(struct lruvec *lruvec, struct folio *folio) { From 8346d69d8bcb6c526a0d8bd126241dff41a60723 Mon Sep 17 00:00:00 2001 From: Xin Hao Date: Thu, 22 Sep 2022 10:19:29 +0800 Subject: [PATCH 340/350] mm/hugetlb: add available_huge_pages() func In hugetlb.c there are several places which compare the values of 'h->free_huge_pages' and 'h->resv_huge_pages', it looks a bit messy, so add a new available_huge_pages() function to do these. Link: https://lkml.kernel.org/r/20220922021929.98961-1-xhao@linux.alibaba.com Signed-off-by: Xin Hao Reviewed-by: Mike Kravetz Reviewed-by: Muchun Song Reviewed-by: David Hildenbrand Reviewed-by: Oscar Salvador Signed-off-by: Andrew Morton --- mm/hugetlb.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3c1316ad54b5..8de5a6b5a172 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1191,6 +1191,11 @@ retry_cpuset: return NULL; } +static unsigned long available_huge_pages(struct hstate *h) +{ + return h->free_huge_pages - h->resv_huge_pages; +} + static struct page *dequeue_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, unsigned long address, int avoid_reserve, @@ -1207,12 +1212,11 @@ static struct page *dequeue_huge_page_vma(struct hstate *h, * have no page reserves. This check ensures that reservations are * not "stolen". The child may still get SIGKILLed */ - if (!vma_has_reserves(vma, chg) && - h->free_huge_pages - h->resv_huge_pages == 0) + if (!vma_has_reserves(vma, chg) && !available_huge_pages(h)) goto err; /* If reserves cannot be used, ensure enough pages are in the pool */ - if (avoid_reserve && h->free_huge_pages - h->resv_huge_pages == 0) + if (avoid_reserve && !available_huge_pages(h)) goto err; gfp_mask = htlb_alloc_mask(h); @@ -2124,7 +2128,7 @@ retry: if (!page_count(page)) { struct page *head = compound_head(page); struct hstate *h = page_hstate(head); - if (h->free_huge_pages - h->resv_huge_pages == 0) + if (!available_huge_pages(h)) goto out; /* @@ -2311,7 +2315,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask, gfp_t gfp_mask) { spin_lock_irq(&hugetlb_lock); - if (h->free_huge_pages - h->resv_huge_pages > 0) { + if (available_huge_pages(h)) { struct page *page; page = dequeue_huge_page_nodemask(h, gfp_mask, preferred_nid, nmask); From f7c5b1aab5ef18b0eb4136a33fc2c78b54e3e777 Mon Sep 17 00:00:00 2001 From: Xiu Jianfeng Date: Tue, 20 Sep 2022 09:22:05 +0800 Subject: [PATCH 341/350] mm/secretmem: remove reduntant return value The return value @ret is always 0, so remove it and return 0 directly. Link: https://lkml.kernel.org/r/20220920012205.246217-1-xiujianfeng@huawei.com Signed-off-by: Xiu Jianfeng Signed-off-by: Andrew Morton --- mm/secretmem.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/mm/secretmem.c b/mm/secretmem.c index 6a44efb673b2..04c3ac9448a1 100644 --- a/mm/secretmem.c +++ b/mm/secretmem.c @@ -278,10 +278,8 @@ static struct file_system_type secretmem_fs = { static int __init secretmem_init(void) { - int ret = 0; - if (!secretmem_enable) - return ret; + return 0; secretmem_mnt = kern_mount(&secretmem_fs); if (IS_ERR(secretmem_mnt)) @@ -290,6 +288,6 @@ static int __init secretmem_init(void) /* prevent secretmem mappings from ever getting PROT_EXEC */ secretmem_mnt->mnt_flags |= MNT_NOEXEC; - return ret; + return 0; } fs_initcall(secretmem_init); From c91bdc9358992856721ff77887202a7e80b7ab22 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:01 -0400 Subject: [PATCH 342/350] mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled Patch series "memcg swap fix & cleanups". This patch (of 4): Since commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), the cgroup swap arrays are used to track memory ownership at the time of swap readahead and swapoff, even if swap space *accounting* has been turned off by the user via swapaccount=0 (which sets cgroup_memory_noswap). However, the patch was overzealous: by simply dropping the cgroup_memory_noswap conditionals in the swapon, swapoff and uncharge path, it caused the cgroup arrays being allocated even when the memory controller as a whole is disabled. This is a waste of that memory. Restore mem_cgroup_disabled() checks, implied previously by cgroup_memory_noswap, in the swapon, swapoff, and swap_entry_free callbacks. Link: https://lkml.kernel.org/r/20220926135704.400818-1-hannes@cmpxchg.org Link: https://lkml.kernel.org/r/20220926135704.400818-2-hannes@cmpxchg.org Fixes: 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") Signed-off-by: Johannes Weiner Reported-by: Hugh Dickins Reviewed-by: Shakeel Butt Acked-by: Hugh Dickins Acked-by: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 3 +++ mm/swap_cgroup.c | 6 ++++++ 2 files changed, 9 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6b74bbdc2659..9e3c010ca676 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7459,6 +7459,9 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) struct mem_cgroup *memcg; unsigned short id; + if (mem_cgroup_disabled()) + return; + id = swap_cgroup_record(entry, 0, nr_pages); rcu_read_lock(); memcg = mem_cgroup_from_id(id); diff --git a/mm/swap_cgroup.c b/mm/swap_cgroup.c index 5a9442979a18..db6c4a26cf59 100644 --- a/mm/swap_cgroup.c +++ b/mm/swap_cgroup.c @@ -170,6 +170,9 @@ int swap_cgroup_swapon(int type, unsigned long max_pages) unsigned long length; struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) + return 0; + length = DIV_ROUND_UP(max_pages, SC_PER_PAGE); array = vcalloc(length, sizeof(void *)); @@ -204,6 +207,9 @@ void swap_cgroup_swapoff(int type) unsigned long i, length; struct swap_cgroup_ctrl *ctrl; + if (mem_cgroup_disabled()) + return; + mutex_lock(&swap_cgroup_mutex); ctrl = &swap_cgroup_ctrl[type]; map = ctrl->map; From b25806dcd3d5248833f7d2544ee29a701735159f Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:02 -0400 Subject: [PATCH 343/350] mm: memcontrol: deprecate swapaccounting=0 mode The swapaccounting= commandline option already does very little today. To close a trivial containment failure case, the swap ownership tracking part of the swap controller has recently become mandatory (see commit 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control") for details), which makes up the majority of the work during swapout, swapin, and the swap slot map. The only thing left under this flag is the page_counter operations and the visibility of the swap control files in the first place, which are rather meager savings. There also aren't many scenarios, if any, where controlling the memory of a cgroup while allowing it unlimited access to a global swap space is a workable resource isolation strategy. On the other hand, there have been several bugs and confusion around the many possible swap controller states (cgroup1 vs cgroup2 behavior, memory accounting without swap accounting, memcg runtime disabled). This puts the maintenance overhead of retaining the toggle above its practical benefits. Deprecate it. Link: https://lkml.kernel.org/r/20220926135704.400818-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Suggested-by: Shakeel Butt Reviewed-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- .../admin-guide/kernel-parameters.txt | 6 --- mm/memcontrol.c | 50 ++++--------------- 2 files changed, 10 insertions(+), 46 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 3b95f65bafe2..99a13f2be2ef 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6036,12 +6036,6 @@ This parameter controls use of the Protected Execution Facility on pSeries. - swapaccount= [KNL] - Format: [0|1] - Enable accounting of swap in memory resource - controller if no parameter or 1 is given or disable - it if 0 is given (See Documentation/admin-guide/cgroup-v1/memory.rst) - swiotlb= [ARM,IA-64,PPC,MIPS,X86] Format: { [,] | force | noforce } -- Number of I/O TLB slabs diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9e3c010ca676..4be1b48b9659 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -88,22 +88,6 @@ static bool cgroup_memory_nosocket __ro_after_init; /* Kernel memory accounting disabled? */ static bool cgroup_memory_nokmem __ro_after_init; -/* Whether the swap controller is active */ -#ifdef CONFIG_MEMCG_SWAP -static bool cgroup_memory_noswap __initdata; - -static DEFINE_STATIC_KEY_FALSE(memcg_swap_enabled_key); -static inline bool memcg_swap_enabled(void) -{ - return static_branch_likely(&memcg_swap_enabled_key); -} -#else -static inline bool memcg_swap_enabled(void) -{ - return false; -} -#endif - #ifdef CONFIG_CGROUP_WRITEBACK static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); #endif @@ -111,7 +95,7 @@ static DECLARE_WAIT_QUEUE_HEAD(memcg_cgwb_frn_waitq); /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { - return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg_swap_enabled(); + return !cgroup_subsys_on_dfl(memory_cgrp_subsys); } #define THRESHOLDS_EVENTS_TARGET 128 @@ -7379,7 +7363,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (!mem_cgroup_is_root(memcg)) page_counter_uncharge(&memcg->memory, nr_entries); - if (memcg_swap_enabled() && memcg != swap_memcg) { + if (memcg != swap_memcg) { if (!mem_cgroup_is_root(swap_memcg)) page_counter_charge(&swap_memcg->memsw, nr_entries); page_counter_uncharge(&memcg->memsw, nr_entries); @@ -7431,7 +7415,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) memcg = mem_cgroup_id_get_online(memcg); - if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg) && + if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { memcg_memory_event(memcg, MEMCG_SWAP_MAX); memcg_memory_event(memcg, MEMCG_SWAP_FAIL); @@ -7466,7 +7450,7 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) rcu_read_lock(); memcg = mem_cgroup_from_id(id); if (memcg) { - if (memcg_swap_enabled() && !mem_cgroup_is_root(memcg)) { + if (!mem_cgroup_is_root(memcg)) { if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) page_counter_uncharge(&memcg->swap, nr_pages); else @@ -7482,7 +7466,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) { long nr_swap_pages = get_nr_swap_pages(); - if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (mem_cgroup_disabled() || do_memsw_account()) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, @@ -7499,7 +7483,7 @@ bool mem_cgroup_swap_full(struct folio *folio) if (vm_swap_full()) return true; - if (!memcg_swap_enabled() || !cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return false; memcg = folio_memcg(folio); @@ -7519,10 +7503,9 @@ bool mem_cgroup_swap_full(struct folio *folio) static int __init setup_swap_account(char *s) { - bool res; - - if (!kstrtobool(s, &res)) - cgroup_memory_noswap = !res; + pr_warn_once("The swapaccount= commandline option is deprecated. " + "Please report your usecase to linux-mm@kvack.org if you " + "depend on this functionality.\n"); return 1; } __setup("swapaccount=", setup_swap_account); @@ -7791,24 +7774,11 @@ static struct cftype zswap_files[] = { }; #endif /* CONFIG_MEMCG_KMEM && CONFIG_ZSWAP */ -/* - * If mem_cgroup_swap_init() is implemented as a subsys_initcall() - * instead of a core_initcall(), this could mean cgroup_memory_noswap still - * remains set to false even when memcg is disabled via "cgroup_disable=memory" - * boot parameter. This may result in premature OOPS inside - * mem_cgroup_get_nr_swap_pages() function in corner cases. - */ static int __init mem_cgroup_swap_init(void) { - /* No memory control -> no swap control */ if (mem_cgroup_disabled()) - cgroup_memory_noswap = true; - - if (cgroup_memory_noswap) return 0; - static_branch_enable(&memcg_swap_enabled_key); - WARN_ON(cgroup_add_dfl_cftypes(&memory_cgrp_subsys, swap_files)); WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, memsw_files)); #if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_ZSWAP) @@ -7816,6 +7786,6 @@ static int __init mem_cgroup_swap_init(void) #endif return 0; } -core_initcall(mem_cgroup_swap_init); +subsys_initcall(mem_cgroup_swap_init); #endif /* CONFIG_MEMCG_SWAP */ From b94c4e949c36e0e363515822ade0d8305e9a6ef2 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:03 -0400 Subject: [PATCH 344/350] mm: memcontrol: use do_memsw_account() in a few more places It's slightly more descriptive and consistent with other places that distinguish cgroup1's combined memory+swap accounting scheme from cgroup2's dedicated swap accounting. Link: https://lkml.kernel.org/r/20220926135704.400818-4-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- mm/memcontrol.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4be1b48b9659..76bb0a18a2f3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1667,17 +1667,17 @@ unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { unsigned long max = READ_ONCE(memcg->memory.max); - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) { - if (mem_cgroup_swappiness(memcg)) - max += min(READ_ONCE(memcg->swap.max), - (unsigned long)total_swap_pages); - } else { /* v1 */ + if (do_memsw_account()) { if (mem_cgroup_swappiness(memcg)) { /* Calculate swap excess capacity from memsw limit */ unsigned long swap = READ_ONCE(memcg->memsw.max) - max; max += min(swap, (unsigned long)total_swap_pages); } + } else { + if (mem_cgroup_swappiness(memcg)) + max += min(READ_ONCE(memcg->swap.max), + (unsigned long)total_swap_pages); } return max; } @@ -7334,7 +7334,7 @@ void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry) if (mem_cgroup_disabled()) return; - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (!do_memsw_account()) return; memcg = folio_memcg(folio); @@ -7399,7 +7399,7 @@ int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry) struct mem_cgroup *memcg; unsigned short oldid; - if (!cgroup_subsys_on_dfl(memory_cgrp_subsys)) + if (do_memsw_account()) return 0; memcg = folio_memcg(folio); @@ -7451,10 +7451,10 @@ void __mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) memcg = mem_cgroup_from_id(id); if (memcg) { if (!mem_cgroup_is_root(memcg)) { - if (cgroup_subsys_on_dfl(memory_cgrp_subsys)) - page_counter_uncharge(&memcg->swap, nr_pages); - else + if (do_memsw_account()) page_counter_uncharge(&memcg->memsw, nr_pages); + else + page_counter_uncharge(&memcg->swap, nr_pages); } mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); From e55b9f96860f6c6026cff97966a740576285e07b Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Mon, 26 Sep 2022 09:57:04 -0400 Subject: [PATCH 345/350] mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol Since 2d1c498072de ("mm: memcontrol: make swap tracking an integral part of memory control"), CONFIG_MEMCG_SWAP hasn't been a user-visible config option anymore, it just means CONFIG_MEMCG && CONFIG_SWAP. Update the sites accordingly and drop the symbol. [ While touching the docs, remove two references to CONFIG_MEMCG_KMEM, which hasn't been a user-visible symbol for over half a decade. ] Link: https://lkml.kernel.org/r/20220926135704.400818-5-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Shakeel Butt Cc: Hugh Dickins Cc: Michal Hocko Cc: Roman Gushchin Signed-off-by: Andrew Morton --- Documentation/admin-guide/cgroup-v1/memory.rst | 4 +--- arch/mips/configs/db1xxx_defconfig | 1 - arch/mips/configs/generic_defconfig | 1 - arch/powerpc/configs/powernv_defconfig | 1 - arch/powerpc/configs/pseries_defconfig | 1 - arch/sh/configs/sdk7786_defconfig | 1 - arch/sh/configs/urquell_defconfig | 1 - include/linux/swap.h | 2 +- include/linux/swap_cgroup.h | 4 ++-- init/Kconfig | 5 ----- mm/Makefile | 4 +++- mm/memcontrol.c | 6 +++--- tools/testing/selftests/cgroup/config | 1 - 13 files changed, 10 insertions(+), 22 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v1/memory.rst b/Documentation/admin-guide/cgroup-v1/memory.rst index 2cc502a75ef6..5b86245450bd 100644 --- a/Documentation/admin-guide/cgroup-v1/memory.rst +++ b/Documentation/admin-guide/cgroup-v1/memory.rst @@ -299,7 +299,7 @@ Per-node-per-memcgroup LRU (cgroup's private LRU) is guarded by lruvec->lru_lock; PG_lru bit of page->flags is cleared before isolating a page from its LRU under lruvec->lru_lock. -2.7 Kernel Memory Extension (CONFIG_MEMCG_KMEM) +2.7 Kernel Memory Extension ----------------------------------------------- With the Kernel memory extension, the Memory Controller is able to limit @@ -386,8 +386,6 @@ U != 0, K >= U: a. Enable CONFIG_CGROUPS b. Enable CONFIG_MEMCG -c. Enable CONFIG_MEMCG_SWAP (to use swap extension) -d. Enable CONFIG_MEMCG_KMEM (to use kmem extension) 3.1. Prepare the cgroups (see cgroups.txt, Why are cgroups needed?) ------------------------------------------------------------------- diff --git a/arch/mips/configs/db1xxx_defconfig b/arch/mips/configs/db1xxx_defconfig index b8bd66300996..83cbdecb27e6 100644 --- a/arch/mips/configs/db1xxx_defconfig +++ b/arch/mips/configs/db1xxx_defconfig @@ -9,7 +9,6 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_LOG_BUF_SHIFT=16 CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CGROUP_SCHED=y CONFIG_CFS_BANDWIDTH=y diff --git a/arch/mips/configs/generic_defconfig b/arch/mips/configs/generic_defconfig index 714169e411cf..48e4e251779b 100644 --- a/arch/mips/configs/generic_defconfig +++ b/arch/mips/configs/generic_defconfig @@ -3,7 +3,6 @@ CONFIG_NO_HZ_IDLE=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_BLK_CGROUP=y CONFIG_CFS_BANDWIDTH=y CONFIG_RT_GROUP_SCHED=y diff --git a/arch/powerpc/configs/powernv_defconfig b/arch/powerpc/configs/powernv_defconfig index 49f49c263935..4acca5263404 100644 --- a/arch/powerpc/configs/powernv_defconfig +++ b/arch/powerpc/configs/powernv_defconfig @@ -17,7 +17,6 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 CONFIG_NUMA_BALANCING=y CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y diff --git a/arch/powerpc/configs/pseries_defconfig b/arch/powerpc/configs/pseries_defconfig index b571d084c148..fead14ebb1fc 100644 --- a/arch/powerpc/configs/pseries_defconfig +++ b/arch/powerpc/configs/pseries_defconfig @@ -16,7 +16,6 @@ CONFIG_LOG_CPU_MAX_BUF_SHIFT=13 CONFIG_NUMA_BALANCING=y CONFIG_CGROUPS=y CONFIG_MEMCG=y -CONFIG_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_CGROUP_FREEZER=y CONFIG_CPUSETS=y diff --git a/arch/sh/configs/sdk7786_defconfig b/arch/sh/configs/sdk7786_defconfig index a8662b6927ec..97b7356639ed 100644 --- a/arch/sh/configs/sdk7786_defconfig +++ b/arch/sh/configs/sdk7786_defconfig @@ -16,7 +16,6 @@ CONFIG_CPUSETS=y # CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_MEMCG=y -CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_CGROUP=y diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index cb2f56468fe0..be478f3148f2 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -14,7 +14,6 @@ CONFIG_CPUSETS=y # CONFIG_PROC_PID_CPUSET is not set CONFIG_CGROUP_CPUACCT=y CONFIG_CGROUP_MEMCG=y -CONFIG_CGROUP_MEMCG_SWAP=y CONFIG_CGROUP_SCHED=y CONFIG_RT_GROUP_SCHED=y CONFIG_BLK_DEV_INITRD=y diff --git a/include/linux/swap.h b/include/linux/swap.h index fc8d98660326..a18cf4b7c724 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -666,7 +666,7 @@ static inline void folio_throttle_swaprate(struct folio *folio, gfp_t gfp) cgroup_throttle_swaprate(&folio->page, gfp); } -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) void mem_cgroup_swapout(struct folio *folio, swp_entry_t entry); int __mem_cgroup_try_charge_swap(struct folio *folio, swp_entry_t entry); static inline int mem_cgroup_try_charge_swap(struct folio *folio, diff --git a/include/linux/swap_cgroup.h b/include/linux/swap_cgroup.h index a12dd1c3966c..ae73a87775b3 100644 --- a/include/linux/swap_cgroup.h +++ b/include/linux/swap_cgroup.h @@ -4,7 +4,7 @@ #include -#ifdef CONFIG_MEMCG_SWAP +#if defined(CONFIG_MEMCG) && defined(CONFIG_SWAP) extern unsigned short swap_cgroup_cmpxchg(swp_entry_t ent, unsigned short old, unsigned short new); @@ -40,6 +40,6 @@ static inline void swap_cgroup_swapoff(int type) return; } -#endif /* CONFIG_MEMCG_SWAP */ +#endif #endif /* __LINUX_SWAP_CGROUP_H */ diff --git a/init/Kconfig b/init/Kconfig index 532362fcfe31..7d86cf6b3012 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -958,11 +958,6 @@ config MEMCG help Provides control over the memory footprint of tasks in a cgroup. -config MEMCG_SWAP - bool - depends on MEMCG && SWAP - default y - config MEMCG_KMEM bool depends on MEMCG && !SLOB diff --git a/mm/Makefile b/mm/Makefile index cc23b0052584..8e105e5b3e29 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -98,7 +98,9 @@ obj-$(CONFIG_DEVICE_MIGRATION) += migrate_device.o obj-$(CONFIG_TRANSPARENT_HUGEPAGE) += huge_memory.o khugepaged.o obj-$(CONFIG_PAGE_COUNTER) += page_counter.o obj-$(CONFIG_MEMCG) += memcontrol.o vmpressure.o -obj-$(CONFIG_MEMCG_SWAP) += swap_cgroup.o +ifdef CONFIG_SWAP +obj-$(CONFIG_MEMCG) += swap_cgroup.o +endif obj-$(CONFIG_CGROUP_HUGETLB) += hugetlb_cgroup.o obj-$(CONFIG_GUP_TEST) += gup_test.o obj-$(CONFIG_MEMORY_FAILURE) += memory-failure.o diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 76bb0a18a2f3..61e05fc281fb 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3423,7 +3423,7 @@ void split_page_memcg(struct page *head, unsigned int nr) css_get_many(&memcg->css, nr - 1); } -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -7296,7 +7296,7 @@ static int __init mem_cgroup_init(void) } subsys_initcall(mem_cgroup_init); -#ifdef CONFIG_MEMCG_SWAP +#ifdef CONFIG_SWAP static struct mem_cgroup *mem_cgroup_id_get_online(struct mem_cgroup *memcg) { while (!refcount_inc_not_zero(&memcg->id.ref)) { @@ -7788,4 +7788,4 @@ static int __init mem_cgroup_swap_init(void) } subsys_initcall(mem_cgroup_swap_init); -#endif /* CONFIG_MEMCG_SWAP */ +#endif /* CONFIG_SWAP */ diff --git a/tools/testing/selftests/cgroup/config b/tools/testing/selftests/cgroup/config index 84fe884fad86..97d549ee894f 100644 --- a/tools/testing/selftests/cgroup/config +++ b/tools/testing/selftests/cgroup/config @@ -4,5 +4,4 @@ CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_SCHED=y CONFIG_MEMCG=y CONFIG_MEMCG_KMEM=y -CONFIG_MEMCG_SWAP=y CONFIG_PAGE_COUNTER=y From 14aa8b2d5c2ebead01b542f62d68029023054774 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 28 Sep 2022 13:36:58 -0600 Subject: [PATCH 346/350] mm/mglru: don't sync disk for each aging cycle wakeup_flusher_threads() was added under the assumption that if a system runs out of clean cold pages, it might want to write back dirty pages more aggressively so that they can become clean and be dropped. However, doing so can breach the rate limit a system wants to impose on writeback, resulting in early SSD wearout. Link: https://lkml.kernel.org/r/YzSiWq9UEER5LKup@google.com Fixes: bd74fdaea146 ("mm: multi-gen LRU: support page table walks") Signed-off-by: Yu Zhao Reported-by: Axel Rasmussen Signed-off-by: Andrew Morton --- mm/vmscan.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index c5a4bff11da6..3240d5dd7784 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -4413,8 +4413,6 @@ done: if (wq_has_sleeper(&lruvec->mm_state.wait)) wake_up_all(&lruvec->mm_state.wait); - wakeup_flusher_threads(WB_REASON_VMSCAN); - return true; } From e4fea72b143848d8bbbeae6d39a890212bcf848e Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 28 Sep 2022 12:46:20 -0600 Subject: [PATCH 347/350] mglru: mm/vmscan.c: fix imprecise comments Link: https://lkml.kernel.org/r/YzSWfFI+MOeb1ils@google.com Signed-off-by: Yu Zhao Signed-off-by: Andrew Morton --- mm/vmscan.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 3240d5dd7784..04d8b88e5216 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -5076,7 +5076,7 @@ static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, DEFINE_MAX_SEQ(lruvec); if (!current_is_kswapd()) { - /* age each memcg once to ensure fairness */ + /* age each memcg at most once to ensure fairness */ if (max_seq - seq > 1) return true; @@ -5101,10 +5101,9 @@ static bool should_abort_scan(struct lruvec *lruvec, unsigned long seq, /* * A minimum amount of work was done under global memory pressure. For - * kswapd, it may be overshooting. For direct reclaim, the target isn't - * met, and yet the allocation may still succeed, since kswapd may have - * caught up. In either case, it's better to stop now, and restart if - * necessary. + * kswapd, it may be overshooting. For direct reclaim, the allocation + * may succeed if all suitable zones are somewhat safe. In either case, + * it's better to stop now, and restart later if necessary. */ for (i = 0; i <= sc->reclaim_idx; i++) { unsigned long wmark; From 131a79b474e973f023c5c75e2323a940332103be Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 Oct 2022 18:17:05 -0700 Subject: [PATCH 348/350] hugetlb: fix vma lock handling during split vma and range unmapping Patch series "hugetlb: fixes for new vma lock series". In review of the series "hugetlb: Use new vma lock for huge pmd sharing synchronization", Miaohe Lin pointed out two key issues: 1) There is a race in the routine hugetlb_unmap_file_folio when locks are dropped and reacquired in the correct order [1]. 2) With the switch to using vma lock for fault/truncate synchronization, we need to make sure lock exists for all VM_MAYSHARE vmas, not just vmas capable of pmd sharing. These two issues are addressed here. In addition, having a vma lock present in all VM_MAYSHARE vmas, uncovered some issues around vma splitting. Those are also addressed. [1] https://lore.kernel.org/linux-mm/01f10195-7088-4462-6def-909549c75ef4@huawei.com/ This patch (of 3): The hugetlb vma lock hangs off the vm_private_data field and is specific to the vma. When vm_area_dup() is called as part of vma splitting, the vma lock pointer is copied to the new vma. This will result in issues such as double freeing of the structure. Update the hugetlb open vm_ops to allocate a new vma lock for the new vma. The routine __unmap_hugepage_range_final unconditionally unset VM_MAYSHARE to prevent subsequent pmd sharing. hugetlb_vma_lock_free attempted to anticipate this by checking both VM_MAYSHARE and VM_SHARED. However, if only VM_MAYSHARE was set we would miss the free. With the introduction of the vma lock, a vma can not participate in pmd sharing if vm_private_data is NULL. Instead of clearing VM_MAYSHARE in __unmap_hugepage_range_final, free the vma lock to prevent sharing. Also, update the sharing code to make sure vma lock is indeed a condition for pmd sharing. hugetlb_vma_lock_free can then key off VM_MAYSHARE and not miss any vmas. Link: https://lkml.kernel.org/r/20221005011707.514612-1-mike.kravetz@oracle.com Link: https://lkml.kernel.org/r/20221005011707.514612-2-mike.kravetz@oracle.com Fixes: "hugetlb: add vma based lock for pmd sharing" Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- mm/hugetlb.c | 43 +++++++++++++++++++++++++++---------------- mm/memory.c | 4 ---- 2 files changed, 27 insertions(+), 20 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 8de5a6b5a172..45e305d182f6 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4612,7 +4612,14 @@ static void hugetlb_vm_op_open(struct vm_area_struct *vma) kref_get(&resv->refs); } - hugetlb_vma_lock_alloc(vma); + /* + * vma_lock structure for sharable mappings is vma specific. + * Clear old pointer (if copied via vm_area_dup) and create new. + */ + if (vma->vm_flags & VM_MAYSHARE) { + vma->vm_private_data = NULL; + hugetlb_vma_lock_alloc(vma); + } } static void hugetlb_vm_op_close(struct vm_area_struct *vma) @@ -5168,19 +5175,23 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, unsigned long end, struct page *ref_page, zap_flags_t zap_flags) { + hugetlb_vma_lock_write(vma); + i_mmap_lock_write(vma->vm_file->f_mapping); + __unmap_hugepage_range(tlb, vma, start, end, ref_page, zap_flags); /* - * Clear this flag so that x86's huge_pmd_share page_table_shareable - * test will fail on a vma being torn down, and not grab a page table - * on its way out. We're lucky that the flag has such an appropriate - * name, and can in fact be safely cleared here. We could clear it - * before the __unmap_hugepage_range above, but all that's necessary - * is to clear it before releasing the i_mmap_rwsem. This works - * because in the context this is called, the VMA is about to be - * destroyed and the i_mmap_rwsem is held. + * Unlock and free the vma lock before releasing i_mmap_rwsem. When + * the vma_lock is freed, this makes the vma ineligible for pmd + * sharing. And, i_mmap_rwsem is required to set up pmd sharing. + * This is important as page tables for this unmapped range will + * be asynchrously deleted. If the page tables are shared, there + * will be issues when accessed by someone else. */ - vma->vm_flags &= ~VM_MAYSHARE; + hugetlb_vma_unlock_write(vma); + hugetlb_vma_lock_free(vma); + + i_mmap_unlock_write(vma->vm_file->f_mapping); } void unmap_hugepage_range(struct vm_area_struct *vma, unsigned long start, @@ -6664,10 +6675,13 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, /* * match the virtual addresses, permission and the alignment of the * page table page. + * + * Also, vma_lock (vm_private_data) is required for sharing. */ if (pmd_index(addr) != pmd_index(saddr) || vm_flags != svm_flags || - !range_in_vma(svma, sbase, s_end)) + !range_in_vma(svma, sbase, s_end) || + !svma->vm_private_data) return 0; return saddr; @@ -6817,12 +6831,9 @@ void hugetlb_vma_lock_release(struct kref *kref) static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* - * Only present in sharable vmas. See comment in - * __unmap_hugepage_range_final about how VM_SHARED could - * be set without VM_MAYSHARE. As a result, we need to - * check if either is set in the free path. + * Only present in sharable vmas. */ - if (!vma || !(vma->vm_flags & (VM_MAYSHARE | VM_SHARED))) + if (!vma || !__vma_shareable_flags_pmd(vma)) return; if (vma->vm_private_data) { diff --git a/mm/memory.c b/mm/memory.c index 118e5f023597..df678fa30cdb 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1685,12 +1685,8 @@ static void unmap_single_vma(struct mmu_gather *tlb, if (vma->vm_file) { zap_flags_t zap_flags = details ? details->zap_flags : 0; - hugetlb_vma_lock_write(vma); - i_mmap_lock_write(vma->vm_file->f_mapping); __unmap_hugepage_range_final(tlb, vma, start, end, NULL, zap_flags); - i_mmap_unlock_write(vma->vm_file->f_mapping); - hugetlb_vma_unlock_write(vma); } } else unmap_page_range(tlb, vma, start, end, details); From ecfbd733878da48ed03a5b8a9c301366a03e3cca Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 Oct 2022 18:17:06 -0700 Subject: [PATCH 349/350] hugetlb: take hugetlb vma_lock when clearing vma_lock->vma pointer hugetlb file truncation/hole punch code may need to back out and take locks in order in the routine hugetlb_unmap_file_folio(). This code could race with vma freeing as pointed out in [1] and result in accessing a stale vma pointer. To address this, take the vma_lock when clearing the vma_lock->vma pointer. [1] https://lore.kernel.org/linux-mm/01f10195-7088-4462-6def-909549c75ef4@huawei.com/ [mike.kravetz@oracle.com: address build issues] Link: https://lkml.kernel.org/r/Yz5L1uxQYR1VqFtJ@monkey Link: https://lkml.kernel.org/r/20221005011707.514612-3-mike.kravetz@oracle.com Fixes: "hugetlb: use new vma_lock for pmd sharing synchronization" Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- mm/hugetlb.c | 42 ++++++++++++++++++++++++++++++++---------- 1 file changed, 32 insertions(+), 10 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 45e305d182f6..01f3e36caa6c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -93,6 +93,7 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; static int hugetlb_acct_memory(struct hstate *h, long delta); static void hugetlb_vma_lock_free(struct vm_area_struct *vma); static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma); +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma); static inline bool subpool_is_free(struct hugepage_subpool *spool) { @@ -5188,8 +5189,7 @@ void __unmap_hugepage_range_final(struct mmu_gather *tlb, * be asynchrously deleted. If the page tables are shared, there * will be issues when accessed by someone else. */ - hugetlb_vma_unlock_write(vma); - hugetlb_vma_lock_free(vma); + __hugetlb_vma_unlock_write_free(vma); i_mmap_unlock_write(vma->vm_file->f_mapping); } @@ -6828,6 +6828,30 @@ void hugetlb_vma_lock_release(struct kref *kref) kfree(vma_lock); } +void __hugetlb_vma_unlock_write_put(struct hugetlb_vma_lock *vma_lock) +{ + struct vm_area_struct *vma = vma_lock->vma; + + /* + * vma_lock structure may or not be released as a result of put, + * it certainly will no longer be attached to vma so clear pointer. + * Semaphore synchronizes access to vma_lock->vma field. + */ + vma_lock->vma = NULL; + vma->vm_private_data = NULL; + up_write(&vma_lock->rw_sema); + kref_put(&vma_lock->refs, hugetlb_vma_lock_release); +} + +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) +{ + if (__vma_shareable_flags_pmd(vma)) { + struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; + + __hugetlb_vma_unlock_write_put(vma_lock); + } +} + static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { /* @@ -6839,14 +6863,8 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma) if (vma->vm_private_data) { struct hugetlb_vma_lock *vma_lock = vma->vm_private_data; - /* - * vma_lock structure may or not be released, but it - * certainly will no longer be attached to vma so clear - * pointer. - */ - vma_lock->vma = NULL; - kref_put(&vma_lock->refs, hugetlb_vma_lock_release); - vma->vm_private_data = NULL; + down_write(&vma_lock->rw_sema); + __hugetlb_vma_unlock_write_put(vma_lock); } } @@ -6997,6 +7015,10 @@ void hugetlb_vma_lock_release(struct kref *kref) { } +static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma) +{ +} + static void hugetlb_vma_lock_free(struct vm_area_struct *vma) { } From bbff39cc6cbcb86ccfacb2dcafc79912a9f9df69 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Tue, 4 Oct 2022 18:17:07 -0700 Subject: [PATCH 350/350] hugetlb: allocate vma lock for all sharable vmas The hugetlb vma lock was originally designed to synchronize pmd sharing. As such, it was only necessary to allocate the lock for vmas that were capable of pmd sharing. Later in the development cycle, it was discovered that it could also be used to simplify fault/truncation races as described in [1]. However, a subsequent change to allocate the lock for all vmas that use the page cache was never made. A fault/truncation race could leave pages in a file past i_size until the file is removed. Remove the previous restriction and allocate lock for all VM_MAYSHARE vmas. Warn in the unlikely event of allocation failure. [1] https://lore.kernel.org/lkml/Yxiv0SkMkZ0JWGGp@monkey/#t Link: https://lkml.kernel.org/r/20221005011707.514612-4-mike.kravetz@oracle.com Fixes: "hugetlb: clean up code checking for fault/truncation races" Signed-off-by: Mike Kravetz Cc: Andrea Arcangeli Cc: "Aneesh Kumar K.V" Cc: Axel Rasmussen Cc: David Hildenbrand Cc: Davidlohr Bueso Cc: James Houghton Cc: "Kirill A. Shutemov" Cc: Miaohe Lin Cc: Michal Hocko Cc: Mina Almasry Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pasha Tatashin Cc: Peter Xu Cc: Prakash Sangappa Cc: Sven Schnelle Signed-off-by: Andrew Morton --- mm/hugetlb.c | 50 +++++++++++++++----------------------------------- 1 file changed, 15 insertions(+), 35 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 01f3e36caa6c..0ad53ad98e74 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -6687,10 +6687,11 @@ static unsigned long page_table_shareable(struct vm_area_struct *svma, return saddr; } -static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, - unsigned long start, unsigned long end, - bool check_vma_lock) +bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) { + unsigned long start = addr & PUD_MASK; + unsigned long end = start + PUD_SIZE; + #ifdef CONFIG_USERFAULTFD if (uffd_disable_huge_pmd_share(vma)) return false; @@ -6700,38 +6701,13 @@ static bool __vma_aligned_range_pmd_shareable(struct vm_area_struct *vma, */ if (!(vma->vm_flags & VM_MAYSHARE)) return false; - if (check_vma_lock && !vma->vm_private_data) + if (!vma->vm_private_data) /* vma lock required for sharing */ return false; if (!range_in_vma(vma, start, end)) return false; return true; } -static bool vma_pmd_shareable(struct vm_area_struct *vma) -{ - unsigned long start = ALIGN(vma->vm_start, PUD_SIZE), - end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); - - if (start >= end) - return false; - - return __vma_aligned_range_pmd_shareable(vma, start, end, false); -} - -static bool vma_addr_pmd_shareable(struct vm_area_struct *vma, - unsigned long addr) -{ - unsigned long start = addr & PUD_MASK; - unsigned long end = start + PUD_SIZE; - - return __vma_aligned_range_pmd_shareable(vma, start, end, true); -} - -bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr) -{ - return vma_addr_pmd_shareable(vma, addr); -} - /* * Determine if start,end range within vma could be mapped by shared pmd. * If yes, adjust start and end to cover range associated with possible @@ -6880,17 +6856,21 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma) if (vma->vm_private_data) return; - /* Check size/alignment for pmd sharing possible */ - if (!vma_pmd_shareable(vma)) - return; - vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL); - if (!vma_lock) + if (!vma_lock) { /* * If we can not allocate structure, then vma can not - * participate in pmd sharing. + * participate in pmd sharing. This is only a possible + * performance enhancement and memory saving issue. + * However, the lock is also used to synchronize page + * faults with truncation. If the lock is not present, + * unlikely races could leave pages in a file past i_size + * until the file is removed. Warn in the unlikely case of + * allocation failure. */ + pr_warn_once("HugeTLB: unable to allocate vma specific lock\n"); return; + } kref_init(&vma_lock->refs); init_rwsem(&vma_lock->rw_sema);