From fb70c4878d6b3001ef40fa39432a38d8cabdcbf7 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Mon, 15 Aug 2022 19:10:17 +0800 Subject: [PATCH 01/25] mm: kill find_min_pfn_with_active_regions() find_min_pfn_with_active_regions() is only called from free_area_init(). Open-code the PHYS_PFN(memblock_start_of_DRAM()) into free_area_init(), and kill find_min_pfn_with_active_regions(). Link: https://lkml.kernel.org/r/20220815111017.39341-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/page_alloc.c | 13 +------------ 2 files changed, 1 insertion(+), 13 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 27839b158ca4..e98ef2cb1176 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2520,7 +2520,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); -extern unsigned long find_min_pfn_with_active_regions(void); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index f13d3ead95f2..48c65bf3cb29 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7908,17 +7908,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/** - * find_min_pfn_with_active_regions - Find the minimum PFN registered - * - * Return: the minimum PFN based on information provided via - * memblock_set_node(). - */ -unsigned long __init find_min_pfn_with_active_regions(void) -{ - return PHYS_PFN(memblock_start_of_DRAM()); -} - /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. @@ -8211,7 +8200,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - start_pfn = find_min_pfn_with_active_regions(); + start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { From b8dd3ee9cacc2398700e5227931988931871e020 Mon Sep 17 00:00:00 2001 From: xupanda Date: Mon, 15 Aug 2022 06:51:04 +0000 Subject: [PATCH 02/25] mm: memcontrol: fix a typo in comment Fix a spelling mistake in comment. Link: https://lkml.kernel.org/r/20220815065102.74347-1-xu.panda@zte.com.cn Reported-by: Zeal Robot Signed-off-by: xupanda Signed-off-by: CGEL ZTE Reviewed-by: Mukesh Ojha Signed-off-by: Andrew Morton --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index b69979c9ced5..4dddd8be320a 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1143,7 +1143,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } while ((memcg = parent_mem_cgroup(memcg))); /* - * When cgruop1 non-hierarchy mode is used, + * When cgroup1 non-hierarchy mode is used, * parent_mem_cgroup() does not walk all the way up to the * cgroup root (root_mem_cgroup). So we have to handle * dead_memcg from cgroup root separately. From 08262ac50a7e4d70ee92b34746ea54a0ba51739a Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 18 Aug 2022 22:07:41 +0100 Subject: [PATCH 03/25] mm/vmalloc.c: support HIGHMEM pages in vmap_pages_range_noflush() If the pages being mapped are in HIGHMEM, page_address() returns NULL. This probably wasn't noticed before because there aren't currently any architectures with HAVE_ARCH_HUGE_VMALLOC and HIGHMEM, but it's simpler to call page_to_phys() and futureproofs us against such configurations existing. Link: https://lkml.kernel.org/r/Yv6qHc6e+m7TMWhi@casper.infradead.org Fixes: 121e6f3258fe ("mm/vmalloc: hugepage vmalloc mappings") Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Reviewed-by: Christoph Hellwig Cc: Uladzislau Rezki Cc: Nicholas Piggin Signed-off-by: Andrew Morton --- mm/vmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dd6cdb201195..e68c0081e861 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -590,7 +590,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), - __pa(page_address(pages[i])), prot, + page_to_phys(pages[i]), prot, page_shift); if (err) return err; From b1d5488a252dc9c0d9574100d0b8d807bf154603 Mon Sep 17 00:00:00 2001 From: Charan Teja Kalla Date: Thu, 18 Aug 2022 19:20:00 +0530 Subject: [PATCH 04/25] mm: fix use-after free of page_ext after race with memory-offline The below is one path where race between page_ext and offline of the respective memory blocks will cause use-after-free on the access of page_ext structure. process1 process2 --------- --------- a)doing /proc/page_owner doing memory offline through offline_pages. b) PageBuddy check is failed thus proceed to get the page_owner information through page_ext access. page_ext = lookup_page_ext(page); migrate_pages(); ................. Since all pages are successfully migrated as part of the offline operation,send MEM_OFFLINE notification where for page_ext it calls: offline_page_ext()--> __free_page_ext()--> free_page_ext()--> vfree(ms->page_ext) mem_section->page_ext = NULL c) Check for the PAGE_EXT flags in the page_ext->flags access results into the use-after-free (leading to the translation faults). As mentioned above, there is really no synchronization between page_ext access and its freeing in the memory_offline. The memory offline steps(roughly) on a memory block is as below: 1) Isolate all the pages 2) while(1) try free the pages to buddy.(->free_list[MIGRATE_ISOLATE]) 3) delete the pages from this buddy list. 4) Then free page_ext.(Note: The struct page is still alive as it is freed only during hot remove of the memory which frees the memmap, which steps the user might not perform). This design leads to the state where struct page is alive but the struct page_ext is freed, where the later is ideally part of the former which just representing the page_flags (check [3] for why this design is chosen). The abovementioned race is just one example __but the problem persists in the other paths too involving page_ext->flags access(eg: page_is_idle())__. Fix all the paths where offline races with page_ext access by maintaining synchronization with rcu lock and is achieved in 3 steps: 1) Invalidate all the page_ext's of the sections of a memory block by storing a flag in the LSB of mem_section->page_ext. 2) Wait until all the existing readers to finish working with the ->page_ext's with synchronize_rcu(). Any parallel process that starts after this call will not get page_ext, through lookup_page_ext(), for the block parallel offline operation is being performed. 3) Now safely free all sections ->page_ext's of the block on which offline operation is being performed. Note: If synchronize_rcu() takes time then optimizations can be done in this path through call_rcu()[2]. Thanks to David Hildenbrand for his views/suggestions on the initial discussion[1] and Pavan kondeti for various inputs on this patch. [1] https://lore.kernel.org/linux-mm/59edde13-4167-8550-86f0-11fc67882107@quicinc.com/ [2] https://lore.kernel.org/all/a26ce299-aed1-b8ad-711e-a49e82bdd180@quicinc.com/T/#u [3] https://lore.kernel.org/all/6fa6b7aa-731e-891c-3efb-a03d6a700efa@redhat.com/ [quic_charante@quicinc.com: rename label `loop' to `ext_put_continue' per David] Link: https://lkml.kernel.org/r/1661496993-11473-1-git-send-email-quic_charante@quicinc.com Link: https://lkml.kernel.org/r/1660830600-9068-1-git-send-email-quic_charante@quicinc.com Signed-off-by: Charan Teja Kalla Suggested-by: David Hildenbrand Suggested-by: Michal Hocko Acked-by: Michal Hocko Acked-by: David Hildenbrand Cc: Fernand Sieber Cc: Minchan Kim Cc: Pasha Tatashin Cc: Pavan Kondeti Cc: SeongJae Park Cc: Shakeel Butt Cc: Vlastimil Babka Cc: William Kucharski Signed-off-by: Andrew Morton --- include/linux/page_ext.h | 17 ++++--- include/linux/page_idle.h | 34 +++++++++---- mm/page_ext.c | 103 +++++++++++++++++++++++++++++++++++--- mm/page_owner.c | 73 +++++++++++++++++++-------- mm/page_table_check.c | 10 ++-- 5 files changed, 192 insertions(+), 45 deletions(-) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index fabb2e1e087f..ed27198cdaf4 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -55,7 +55,8 @@ static inline void page_ext_init(void) } #endif -struct page_ext *lookup_page_ext(const struct page *page); +extern struct page_ext *page_ext_get(struct page *page); +extern void page_ext_put(struct page_ext *page_ext); static inline struct page_ext *page_ext_next(struct page_ext *curr) { @@ -71,11 +72,6 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -static inline struct page_ext *lookup_page_ext(const struct page *page) -{ - return NULL; -} - static inline void page_ext_init(void) { } @@ -87,5 +83,14 @@ static inline void page_ext_init_flatmem_late(void) static inline void page_ext_init_flatmem(void) { } + +static inline struct page_ext *page_ext_get(struct page *page) +{ + return NULL; +} + +static inline void page_ext_put(struct page_ext *page_ext) +{ +} #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */ diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 4663dfed1293..5cb7bd2078ec 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,65 +13,79 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ - static inline bool folio_test_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline void folio_set_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); } static inline bool folio_test_clear_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline bool folio_test_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_idle; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); + + return page_idle; } static inline void folio_set_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } static inline void folio_clear_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } #endif /* !CONFIG_64BIT */ diff --git a/mm/page_ext.c b/mm/page_ext.c index e22a928dd66a..b236bdd59fa8 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * struct page extension @@ -59,6 +60,10 @@ * can utilize this callback to initialize the state of it correctly. */ +#ifdef CONFIG_SPARSEMEM +#define PAGE_EXT_INVALID (0x1) +#endif + #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { @@ -84,6 +89,7 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { unsigned long page_ext_size = sizeof(struct page_ext); static unsigned long total_usage; +static struct page_ext *lookup_page_ext(const struct page *page); static bool __init invoke_need_callbacks(void) { @@ -125,6 +131,48 @@ static inline struct page_ext *get_entry(void *base, unsigned long index) return base + page_ext_size * index; } +/** + * page_ext_get() - Get the extended information for a page. + * @page: The page we're interested in. + * + * Ensures that the page_ext will remain valid until page_ext_put() + * is called. + * + * Return: NULL if no page_ext exists for this page. + * Context: Any context. Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_get(struct page *page) +{ + struct page_ext *page_ext; + + rcu_read_lock(); + page_ext = lookup_page_ext(page); + if (!page_ext) { + rcu_read_unlock(); + return NULL; + } + + return page_ext; +} + +/** + * page_ext_put() - Working with page extended information is done. + * @page_ext - Page extended information received from page_ext_get(). + * + * The page extended information of the page may not be valid after this + * function is called. + * + * Return: None. + * Context: Any context with corresponding page_ext_get() is called. + */ +void page_ext_put(struct page_ext *page_ext) +{ + if (unlikely(!page_ext)) + return; + + rcu_read_unlock(); +} #ifndef CONFIG_SPARSEMEM @@ -133,12 +181,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) pgdat->node_page_ext = NULL; } -struct page_ext *lookup_page_ext(const struct page *page) +static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; + WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a @@ -206,20 +255,27 @@ fail: } #else /* CONFIG_SPARSEMEM */ +static bool page_ext_invalid(struct page_ext *page_ext) +{ + return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); +} -struct page_ext *lookup_page_ext(const struct page *page) +static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); + struct page_ext *page_ext = READ_ONCE(section->page_ext); + + WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ - if (!section->page_ext) + if (page_ext_invalid(page_ext)) return NULL; - return get_entry(section->page_ext, pfn); + return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) @@ -298,9 +354,30 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = get_entry(ms->page_ext, pfn); + + base = READ_ONCE(ms->page_ext); + /* + * page_ext here can be valid while doing the roll back + * operation in online_page_ext(). + */ + if (page_ext_invalid(base)) + base = (void *)base - PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, NULL); + + base = get_entry(base, pfn); free_page_ext(base); - ms->page_ext = NULL; +} + +static void __invalidate_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + void *val; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + val = (void *)ms->page_ext + PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, @@ -343,6 +420,20 @@ static int __meminit offline_page_ext(unsigned long start_pfn, start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); + /* + * Freeing of page_ext is done in 3 steps to avoid + * use-after-free of it: + * 1) Traverse all the sections and mark their page_ext + * as invalid. + * 2) Wait for all the existing users of page_ext who + * started before invalidation to finish. + * 3) Free the page_ext. + */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __invalidate_page_ext(pfn); + + synchronize_rcu(); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return 0; diff --git a/mm/page_owner.c b/mm/page_owner.c index e4c6f3f1695b..72839a606e22 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -141,7 +141,7 @@ void __reset_page_owner(struct page *page, unsigned short order) struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; @@ -153,6 +153,7 @@ void __reset_page_owner(struct page *page, unsigned short order) page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } static inline void __set_page_owner_handle(struct page_ext *page_ext, @@ -183,19 +184,21 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; depot_stack_handle_t handle; + handle = save_stack(gfp_mask); + + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - - handle = save_stack(gfp_mask); __set_page_owner_handle(page_ext, handle, order, gfp_mask); + page_ext_put(page_ext); } void __set_page_owner_migrate_reason(struct page *page, int reason) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -203,12 +206,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; + page_ext_put(page_ext); } void __split_page_owner(struct page *page, unsigned int nr) { int i; - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -219,17 +223,25 @@ void __split_page_owner(struct page *page, unsigned int nr) page_owner->order = 0; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __folio_copy_owner(struct folio *newfolio, struct folio *old) { - struct page_ext *old_ext = lookup_page_ext(&old->page); - struct page_ext *new_ext = lookup_page_ext(&newfolio->page); + struct page_ext *old_ext; + struct page_ext *new_ext; struct page_owner *old_page_owner, *new_page_owner; - if (unlikely(!old_ext || !new_ext)) + old_ext = page_ext_get(&old->page); + if (unlikely(!old_ext)) return; + new_ext = page_ext_get(&newfolio->page); + if (unlikely(!new_ext)) { + page_ext_put(old_ext); + return; + } + old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); new_page_owner->order = old_page_owner->order; @@ -254,6 +266,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + page_ext_put(new_ext); + page_ext_put(old_ext); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -307,12 +321,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); @@ -323,9 +337,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, count[pageblock_mt]++; pfn = block_end_pfn; + page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; +ext_put_continue: + page_ext_put(page_ext); } } @@ -435,7 +452,7 @@ err: void __dump_page_owner(const struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get((void *)page); struct page_owner *page_owner; depot_stack_handle_t handle; gfp_t gfp_mask; @@ -452,6 +469,7 @@ void __dump_page_owner(const struct page *page) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); + page_ext_put(page_ext); return; } @@ -482,6 +500,7 @@ void __dump_page_owner(const struct page *page) if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); + page_ext_put(page_ext); } static ssize_t @@ -507,6 +526,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { + /* + * This temporary page_owner is required so + * that we can avoid the context switches while holding + * the rcu lock and copying the page owner information to + * user through copy_to_user() or GFP_KERNEL allocations. + */ + struct page_owner page_owner_tmp; + /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not @@ -525,7 +552,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; @@ -534,14 +561,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); @@ -550,7 +577,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) - continue; + goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should @@ -558,13 +585,17 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) */ handle = READ_ONCE(page_owner->handle); if (!handle) - continue; + goto ext_put_continue; /* Record the next PFN to read in the file offset */ *ppos = (pfn - min_low_pfn) + 1; + page_owner_tmp = *page_owner; + page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, - page_owner, handle); + &page_owner_tmp, handle); +ext_put_continue: + page_ext_put(page_ext); } return 0; @@ -617,18 +648,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* Found early allocated page */ __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; +ext_put_continue: + page_ext_put(page_ext); } cond_resched(); } diff --git a/mm/page_table_check.c b/mm/page_table_check.c index e2062748791a..903db62794d3 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -68,7 +68,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, return; page = pfn_to_page(pfn); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { @@ -83,6 +83,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, } page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } /* @@ -103,7 +104,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, return; page = pfn_to_page(pfn); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { @@ -118,6 +119,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, } page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } /* @@ -126,9 +128,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, */ void __page_table_check_zero(struct page *page, unsigned int order) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; unsigned long i; + page_ext = page_ext_get(page); BUG_ON(!page_ext); for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); @@ -137,6 +140,7 @@ void __page_table_check_zero(struct page *page, unsigned int order) BUG_ON(atomic_read(&ptc->file_map_count)); page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, From f36a5543a74883c21a59b8082b403a13c7654769 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 18 Aug 2022 21:00:11 +0800 Subject: [PATCH 05/25] mm, hwpoison: fix page refcnt leaking in try_memory_failure_hugetlb() Patch series "A few fixup patches for memory-failure", v2. This series contains a few fixup patches to fix incorrect update of page refcnt, fix possible use-after-free issue and so on. More details can be found in the respective changelogs. This patch (of 6): When hwpoison_filter() refuses to hwpoison a hugetlb page, the refcnt of the page would have been incremented if res == 1. Using put_page() to fix the refcnt leaking in this case. Link: https://lkml.kernel.org/r/20220823032346.4260-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220818130016.45313-1-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220818130016.45313-2-linmiaohe@huawei.com Fixes: 405ce051236c ("mm/hwpoison: fix race between hugetlb free/demotion and memory_failure_hugetlb()") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 0dfed9d7b273..3f98fa2ac6cf 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1860,8 +1860,10 @@ retry: if (hwpoison_filter(p)) { hugetlb_clear_page_hwpoison(head); - res = -EOPNOTSUPP; - goto out; + unlock_page(head); + if (res == 1) + put_page(head); + return -EOPNOTSUPP; } /* From 6bbabd041dfd4823c752940286656d404621bf38 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 18 Aug 2022 21:00:12 +0800 Subject: [PATCH 06/25] mm, hwpoison: fix page refcnt leaking in unpoison_memory() When free_raw_hwp_pages() fails its work, the refcnt of the hugetlb page would have been incremented if ret > 0. Using put_page() to fix refcnt leaking in this case. Link: https://lkml.kernel.org/r/20220818130016.45313-3-linmiaohe@huawei.com Fixes: debb6b9c3fdd ("mm, hwpoison: make unpoison aware of raw error info in hwpoisoned hugepage") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 1 + 1 file changed, 1 insertion(+) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3f98fa2ac6cf..f391818c9eac 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2378,6 +2378,7 @@ int unpoison_memory(unsigned long pfn) count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; + put_page(page); goto unlock_mutex; } } From 12f1dbcf8f144c0b8dde7a62fea766f88cb79fc8 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 18 Aug 2022 21:00:13 +0800 Subject: [PATCH 07/25] mm, hwpoison: fix extra put_page() in soft_offline_page() When hwpoison_filter() refuses to soft offline a page, the page refcnt incremented previously by MF_COUNT_INCREASED would have been consumed via get_hwpoison_page() if ret <= 0. So the put_ref_page() here will put the extra one. Remove it to fix the issue. Link: https://lkml.kernel.org/r/20220818130016.45313-4-linmiaohe@huawei.com Fixes: 9113eaf331bf ("mm/memory-failure.c: add hwpoison_filter for soft offline") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index f391818c9eac..799176d3f5f7 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2591,8 +2591,6 @@ retry: if (hwpoison_filter(page)) { if (ret > 0) put_page(page); - else - put_ref_page(ref_page); mutex_unlock(&mf_mutex); return -EOPNOTSUPP; From 54f9555d4031d4aeb10651aa9dcb5335f6a05865 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Tue, 23 Aug 2022 11:23:44 +0800 Subject: [PATCH 08/25] mm, hwpoison: fix possible use-after-free in mf_dax_kill_procs() After kill_procs(), tk will be freed without being removed from the to_kill list. In the next iteration, the freed list entry in the to_kill list will be accessed, thus leading to use-after-free issue. Adding list_del() in kill_procs() to fix the issue. Link: https://lkml.kernel.org/r/20220823032346.4260-5-linmiaohe@huawei.com Fixes: c36e20249571 ("mm: introduce mf_dax_kill_procs() for fsdax case") Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 799176d3f5f7..208a0f85ef54 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -413,7 +413,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, { struct to_kill *tk, *next; - list_for_each_entry_safe (tk, next, to_kill, nd) { + list_for_each_entry_safe(tk, next, to_kill, nd) { if (forcekill) { /* * In case something went wrong with munmapping @@ -437,6 +437,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } + list_del(&tk->nd); put_task_struct(tk->tsk); kfree(tk); } From 0792a4a6195a6d67a4ead2554da393cbd8dcdf5a Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 18 Aug 2022 21:00:15 +0800 Subject: [PATCH 09/25] mm, hwpoison: kill procs if unmap fails If try_to_unmap() fails, the hwpoisoned page still resides in the address space of some processes. We should kill these processes or the hwpoisoned page might be consumed later. collect_procs() is always called to collect relevant processes now so they can be killed later if unmap fails. [linmiaohe@huawei.com: v2] Link: https://lkml.kernel.org/r/20220823032346.4260-6-linmiaohe@huawei.com Link: https://lkml.kernel.org/r/20220818130016.45313-6-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 208a0f85ef54..3b8e7937ce75 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -1397,7 +1397,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; - int kill = 1, forcekill; + int forcekill; bool mlocked = PageMlocked(hpage); /* @@ -1438,7 +1438,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (page_mkclean(hpage)) { SetPageDirty(hpage); } else { - kill = 0; ttu |= TTU_IGNORE_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", pfn); @@ -1449,12 +1448,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. - * - * Error handling: We ignore errors here because - * there's nothing that can be done. */ - if (kill) - collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); if (PageHuge(hpage) && !PageAnon(hpage)) { /* @@ -1496,7 +1491,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + !unmap_success; kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); return unmap_success; From e9ff3ba7ff10490a92792faf1d3573a24fc6e5c9 Mon Sep 17 00:00:00 2001 From: Miaohe Lin Date: Thu, 18 Aug 2022 21:00:16 +0800 Subject: [PATCH 10/25] mm, hwpoison: avoid trying to unpoison reserved page For reserved pages, HWPoison flag will be set without increasing the page refcnt. So we shouldn't even try to unpoison these pages and thus decrease the page refcnt unexpectly. Add a PageReserved() check to filter this case out and remove the below unneeded zero page (zero page is reserved) check. Link: https://lkml.kernel.org/r/20220818130016.45313-7-linmiaohe@huawei.com Signed-off-by: Miaohe Lin Acked-by: Naoya Horiguchi Cc: "Aneesh Kumar K.V" Signed-off-by: Andrew Morton --- mm/memory-failure.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3b8e7937ce75..8156ef0983a3 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2351,7 +2351,7 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (PageSlab(page) || PageTable(page)) + if (PageSlab(page) || PageTable(page) || PageReserved(page)) goto unlock_mutex; ret = get_hwpoison_page(p, MF_UNPOISON); @@ -2382,7 +2382,7 @@ int unpoison_memory(unsigned long pfn) freeit = !!TestClearPageHWPoison(p); put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + if (freeit) { put_page(page); ret = 0; } From 33febb519d673b9f574903764f13f47e65328ca5 Mon Sep 17 00:00:00 2001 From: Muchun Song Date: Fri, 19 Aug 2022 11:55:32 +0800 Subject: [PATCH 11/25] mm: hugetlb_vmemmap: simplify reset_struct_pages() We can choose to copy three contiguous tail pages' content to the first three pages instead of copying one by one to simplify the code and reduce code size from 229 bytes to 63 bytes. The BUILD_BUG_ON() aims to avoid out-of-bounds accesses. Link: https://lkml.kernel.org/r/20220819035532.6189-1-songmuchun@bytedance.com Signed-off-by: Muchun Song Reviewed-by: Miaohe Lin Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/hugetlb_vmemmap.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 76b2d03a0d8d..ba2a2596fb4e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -265,11 +265,10 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, static inline void reset_struct_pages(struct page *start) { - int i; struct page *from = start + NR_RESET_STRUCT_PAGE; - for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) - memcpy(start + i, from, sizeof(*from)); + BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); + memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, From 7adb45887c8af88985c335b53d253654e9d2dd16 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 19 Aug 2022 11:34:01 +0800 Subject: [PATCH 12/25] mm: memory-failure: kill soft_offline_free_page() Open-code the page_handle_poison() into soft_offline_page() and kill unneeded soft_offline_free_page(). Link: https://lkml.kernel.org/r/20220819033402.156519-1-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Miaohe Lin Acked-by: Naoya Horiguchi Signed-off-by: Andrew Morton --- mm/memory-failure.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 8156ef0983a3..774b5234a715 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2511,16 +2511,6 @@ static int soft_offline_in_use_page(struct page *page) return __soft_offline_page(page); } -static int soft_offline_free_page(struct page *page) -{ - int rc = 0; - - if (!page_handle_poison(page, true, false)) - rc = -EBUSY; - - return rc; -} - static void put_ref_page(struct page *page) { if (page) @@ -2596,7 +2586,7 @@ retry: if (ret > 0) { ret = soft_offline_in_use_page(page); } else if (ret == 0) { - if (soft_offline_free_page(page) && try_again) { + if (!page_handle_poison(page, true, false) && try_again) { try_again = false; flags &= ~MF_COUNT_INCREASED; goto retry; From 48309e1f6f7b15b041b39b7f15e7adc1c7e2de95 Mon Sep 17 00:00:00 2001 From: Kefeng Wang Date: Fri, 19 Aug 2022 11:34:02 +0800 Subject: [PATCH 13/25] mm: memory-failure: kill __soft_offline_page() Squash the __soft_offline_page() into soft_offline_in_use_page() and kill __soft_offline_page(). [wangkefeng.wang@huawei.com: update hpage when try_to_split_thp_page() succeeds] Link: https://lkml.kernel.org/r/20220830104654.28234-1-wangkefeng.wang@huawei.com Link: https://lkml.kernel.org/r/20220819033402.156519-2-wangkefeng.wang@huawei.com Signed-off-by: Kefeng Wang Reviewed-by: Miaohe Lin Acked-by: Naoya Horiguchi Reviewed-by: David Hildenbrand Reviewed-by: Anshuman Khandual Signed-off-by: Andrew Morton --- mm/memory-failure.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 774b5234a715..6dd74f535f77 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -2432,11 +2432,11 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) } /* - * __soft_offline_page handles hugetlb-pages and non-hugetlb pages. + * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. * If the page is a non-dirty unmapped page-cache page, it simply invalidates. * If the page is mapped, it migrates the contents over. */ -static int __soft_offline_page(struct page *page) +static int soft_offline_in_use_page(struct page *page) { long ret = 0; unsigned long pfn = page_to_pfn(page); @@ -2449,6 +2449,14 @@ static int __soft_offline_page(struct page *page) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + if (!huge && PageTransHuge(hpage)) { + if (try_to_split_thp_page(page)) { + pr_info("soft offline: %#lx: thp split failed\n", pfn); + return -EBUSY; + } + hpage = page; + } + lock_page(page); if (!PageHuge(page)) wait_on_page_writeback(page); @@ -2498,19 +2506,6 @@ static int __soft_offline_page(struct page *page) return ret; } -static int soft_offline_in_use_page(struct page *page) -{ - struct page *hpage = compound_head(page); - - if (!PageHuge(page) && PageTransHuge(hpage)) - if (try_to_split_thp_page(page) < 0) { - pr_info("soft offline: %#lx: thp split failed\n", - page_to_pfn(page)); - return -EBUSY; - } - return __soft_offline_page(page); -} - static void put_ref_page(struct page *page) { if (page) From c8bb41631bc2ecc6434e93232c031c7a218ebe81 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Thu, 18 Aug 2022 16:27:48 +0800 Subject: [PATCH 14/25] mm: thp: remove redundant pgtable check in set_huge_zero_page() When the pgtable is NULL in the set_huge_zero_page(), we should not increment the count of PTE page table pages by calling mm_inc_nr_ptes(). Otherwise we may receive the following warning when the mm exits: BUG: non-zero pgtables_bytes on freeing mm Now we can't observe the above warning since only do_huge_pmd_anonymous_page() invokes set_huge_zero_page() and the pgtable can not be NULL. Therefore, instead of moving mm_inc_nr_ptes() to the non-NULL branch of pgtable, it is better to remove the redundant pgtable check directly. Link: https://lkml.kernel.org/r/20220818082748.40021-1-zhengqi.arch@bytedance.com Signed-off-by: Qi Zheng Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/huge_memory.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 37105d9aa4d2..286a70c3d5de 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -770,8 +770,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - if (pgtable) - pgtable_trans_huge_deposit(mm, pmd, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); } From 6a3edd29395631a010d6760792824a44986e57fe Mon Sep 17 00:00:00 2001 From: Yin Fengwei Date: Wed, 10 Aug 2022 14:49:07 +0800 Subject: [PATCH 15/25] mm: release private data before split THP If there is private data attached to a THP, the refcount of THP will be increased and will prevent the THP from being split. Attempt to release any private data attached to the THP before attempting the split to increase the chance of splitting successfully. There was a memory failure issue hit during HW error injection testing with 5.18 kernel + xfs as rootfs. The test was killed and a system reboot was required to re-run the test. The issue was tracked down to a THP split failure caused by the memory failure not being handled. The page dump showed: [ 1785.433075] page:0000000025f9530b refcount:18 mapcount:0 mapping:000000008162eea7 index:0xa10 pfn:0x2f0200 [ 1785.443954] head:0000000025f9530b order:4 compound_mapcount:0 compound_pincount:0 [ 1785.452408] memcg:ff4247f2d28e9000 [ 1785.456304] aops:xfs_address_space_operations ino:8555182 dentry name:"baseos-filenames.solvx" [ 1785.466612] flags: 0x1000000000012036(referenced|uptodate|lru|active|private|head|node=0|zone=2) [ 1785.476514] raw: 1000000000012036 ffb9460f8bc07c08 ffb9460f8bc08408 ff4247f22e6299f8 [ 1785.485268] raw: 0000000000000a10 ff4247f194ade900 00000012ffffffff ff4247f2d28e9000 It was like the error was injected to a large folio for xfs with private data attached. With private data released before splitting the THP, the test case could be run successfully many times without rebooting the system. Link: https://lkml.kernel.org/r/20220810064907.582899-1-fengwei.yin@intel.com Co-developed-by: Qiuxu Zhuo Signed-off-by: Qiuxu Zhuo Signed-off-by: Yin Fengwei Suggested-by: Matthew Wilcox Reviewed-by: Yang Shi Reviewed-by: Miaohe Lin Reviewed-by: Aaron Lu Signed-off-by: Andrew Morton --- mm/huge_memory.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 286a70c3d5de..0405d7375bce 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2643,6 +2643,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) mapping = NULL; anon_vma_lock_write(anon_vma); } else { + gfp_t gfp; + mapping = head->mapping; /* Truncated ? */ @@ -2651,8 +2653,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - xas_split_alloc(&xas, head, compound_order(head), - mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + gfp = current_gfp_context(mapping_gfp_mask(mapping) & + GFP_RECLAIM_MASK); + + if (folio_test_private(folio) && + !filemap_release_folio(folio, gfp)) { + ret = -EBUSY; + goto out; + } + + xas_split_alloc(&xas, head, compound_order(head), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; From c8b9aff419303e4d4219b5ff64b1c7e062dee48e Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 18 Aug 2022 15:37:43 +0800 Subject: [PATCH 16/25] mm/damon: validate if the pmd entry is present before accessing pmd_huge() is used to validate if the pmd entry is mapped by a huge page, also including the case of non-present (migration or hwpoisoned) pmd entry on arm64 or x86 architectures. This means that pmd_pfn() can not get the correct pfn number for a non-present pmd entry, which will cause damon_get_page() to get an incorrect page struct (also may be NULL by pfn_to_online_page()), making the access statistics incorrect. This means that the DAMON may make incorrect decision according to the incorrect statistics, for example, DAMON may can not reclaim cold page in time due to this cold page was regarded as accessed mistakenly if DAMOS_PAGEOUT operation is specified. Moreover it does not make sense that we still waste time to get the page of the non-present entry. Just treat it as not-accessed and skip it, which maintains consistency with non-present pte level entries. So add pmd entry present validation to fix the above issues. Link: https://lkml.kernel.org/r/58b1d1f5fbda7db49ca886d9ef6783e3dcbbbc98.1660805030.git.baolin.wang@linux.alibaba.com Fixes: 3f49584b262c ("mm/damon: implement primitives for the virtual memory address spaces") Signed-off-by: Baolin Wang Reviewed-by: SeongJae Park Reviewed-by: Muchun Song Cc: Mike Kravetz Cc: Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3c7b9d6dca95..1d16c6c79638 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -304,6 +304,11 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, if (pmd_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + if (pmd_huge(*pmd)) { damon_pmdp_mkold(pmd, walk->mm, addr); spin_unlock(ptl); @@ -431,6 +436,11 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, #ifdef CONFIG_TRANSPARENT_HUGEPAGE if (pmd_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + if (!pmd_huge(*pmd)) { spin_unlock(ptl); goto regular_page; From 72c33ef4c02e32e2884e7688ec878a486913fe9c Mon Sep 17 00:00:00 2001 From: Baolin Wang Date: Thu, 18 Aug 2022 15:37:44 +0800 Subject: [PATCH 17/25] mm/damon: replace pmd_huge() with pmd_trans_huge() for THP pmd_huge() is usually used to indicate a pmd level hugetlb. However a pmd mapped huge page can only be THP in damon_mkold_pmd_entry() or damon_young_pmd_entry(), so replace pmd_huge() with pmd_trans_huge() in this case to make the code more readable according to the discussion [1]. [1] https://lore.kernel.org/all/098c1480-416d-bca9-cedb-ca495df69b64@linux.alibaba.com/ Link: https://lkml.kernel.org/r/a9e010ca5d299e18d740c7c52290ecb6a014dde6.1660805030.git.baolin.wang@linux.alibaba.com Signed-off-by: Baolin Wang Reviewed-by: Muchun Song Reviewed-by: SeongJae Park Cc: Mike Kravetz Signed-off-by: Andrew Morton --- mm/damon/vaddr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 1d16c6c79638..cc04d467ba23 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -302,14 +302,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, pte_t *pte; spinlock_t *ptl; - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); if (!pmd_present(*pmd)) { spin_unlock(ptl); return 0; } - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { damon_pmdp_mkold(pmd, walk->mm, addr); spin_unlock(ptl); return 0; @@ -434,14 +434,14 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); if (!pmd_present(*pmd)) { spin_unlock(ptl); return 0; } - if (!pmd_huge(*pmd)) { + if (!pmd_trans_huge(*pmd)) { spin_unlock(ptl); goto regular_page; } From 8f0efa81dfbc6abf86bf410549e61a2636753c86 Mon Sep 17 00:00:00 2001 From: Kassey Li Date: Thu, 18 Aug 2022 10:24:25 +0800 Subject: [PATCH 18/25] mm/page_owner.c: add llseek for page_owner It is too slow to dump all the pages, in some usage we just want to dump a given start pfn, for example: a CMA range or a single page. To speed up and save time, this change allows specifying of a start pfn by adding llseek for page_owner. Link: https://lkml.kernel.org/r/20220818022425.31056-1-quic_yingangl@quicinc.com Signed-off-by: Kassey Li Suggested-by: Vlastimil Babka Acked-by: Vlastimil Babka Cc: Joonsoo Kim Cc: Minchan Kim Signed-off-by: Andrew Morton --- Documentation/mm/page_owner.rst | 5 +++++ mm/page_owner.c | 24 +++++++++++++++++++++--- 2 files changed, 26 insertions(+), 3 deletions(-) diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index f5c954afe97c..f18fd8907049 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -94,6 +94,11 @@ Usage Page allocated via order XXX, ... PFN XXX ... // Detailed stack + By default, it will do full pfn dump, to start with a given pfn, + page_owner supports fseek. + + FILE *fp = fopen("/sys/kernel/debug/page_owner", "r"); + fseek(fp, pfn_start, SEEK_SET); The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times diff --git a/mm/page_owner.c b/mm/page_owner.c index 72839a606e22..90023f938c19 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -516,8 +516,10 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) return -EINVAL; page = NULL; - pfn = min_low_pfn + *ppos; - + if (*ppos == 0) + pfn = min_low_pfn; + else + pfn = *ppos; /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; @@ -588,7 +590,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) goto ext_put_continue; /* Record the next PFN to read in the file offset */ - *ppos = (pfn - min_low_pfn) + 1; + *ppos = pfn + 1; page_owner_tmp = *page_owner; page_ext_put(page_ext); @@ -601,6 +603,21 @@ ext_put_continue: return 0; } +static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case SEEK_SET: + file->f_pos = offset; + break; + case SEEK_CUR: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + return file->f_pos; +} + static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; @@ -693,6 +710,7 @@ static void init_early_allocated_pages(void) static const struct file_operations proc_page_owner_operations = { .read = read_page_owner, + .llseek = lseek_page_owner, }; static int __init pageowner_init(void) From e09b0b61fbbf28e0f528ca17af5c3e445109f147 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Tue, 16 Aug 2022 11:58:01 -0700 Subject: [PATCH 19/25] mm: memcg: export workingset refault stats for cgroup v1 Workingset refault stats are important and useful metrics to measure how well reclaimer and swapping work and how healthy the services are, but they are just available for cgroup v2. There are still plenty users with cgroup v1, export the stats for cgroup v1. Link: https://lkml.kernel.org/r/20220816185801.651091-1-shy828301@gmail.com Signed-off-by: Yang Shi Acked-by: Shakeel Butt Cc: Johannes Weiner Cc: Michal Hocko Cc: Muchun Song Cc: Roman Gushchin Cc: Yosry Ahmed Signed-off-by: Andrew Morton --- mm/memcontrol.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 4dddd8be320a..403af5f7a2b9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3975,6 +3975,8 @@ static const unsigned int memcg1_stats[] = { NR_FILE_MAPPED, NR_FILE_DIRTY, NR_WRITEBACK, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, MEMCG_SWAP, }; @@ -3988,6 +3990,8 @@ static const char *const memcg1_stat_names[] = { "mapped_file", "dirty", "writeback", + "workingset_refault_anon", + "workingset_refault_file", "swap", }; @@ -4016,7 +4020,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4047,7 +4052,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)nr * PAGE_SIZE); + (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) From e2f8f44b7686bf65747f485ac7c9c43de8b8de09 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Mon, 22 Aug 2022 15:01:32 +0200 Subject: [PATCH 20/25] mm: pagewalk: fix documentation of PTE hole handling Empty PTEs are passed to the pte_entry callback, not to pte_hole. Link: https://lkml.kernel.org/r/3695521.kQq0lBPeGt@devpool047 Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton --- include/linux/pagewalk.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index ac7b38ad5903..f3fafb731ffd 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -15,12 +15,12 @@ struct mm_walk; * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (lowest-level) - * entry + * @pte_entry: if set, called for each PTE (lowest-level) entry, + * including empty ones * @pte_hole: if set, called for each hole at all levels, - * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD - * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal - * to 1) are skipped. + * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. + * Any folded depths (where PTRS_PER_P?D is equal to 1) + * are skipped. * @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means From 8bd3873d1bff1d7b761949deaccc3b527bffa1e3 Mon Sep 17 00:00:00 2001 From: Rolf Eike Beer Date: Mon, 22 Aug 2022 15:02:36 +0200 Subject: [PATCH 21/25] mm: pagewalk: add api documentation for walk_page_range_novma() Link: https://lkml.kernel.org/r/8991525.CDJkKcVGEf@devpool047 Signed-off-by: Rolf Eike Beer Signed-off-by: Andrew Morton --- mm/pagewalk.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/pagewalk.c b/mm/pagewalk.c index 9b3db11a4d1d..908ec1577f40 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -479,7 +479,15 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, return err; } -/* +/** + * walk_page_range_novma - walk a range of pagetables not backed by a vma + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @pgd: pgd to walk if different from mm->pgd + * @private: private data for callbacks' usage + * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for From 32d772708009eb90f8eeed6ec8f76e06f07e41e9 Mon Sep 17 00:00:00 2001 From: Bui Quang Minh Date: Sun, 21 Aug 2022 22:40:55 +0700 Subject: [PATCH 22/25] mm: skip retry when new limit is not below old one in page_counter_set_max In page_counter_set_max, we want to make sure the new limit is not below the concurrently-changing counter value. We read the counter and check that the limit is not below the counter before the swap. After the swap, we read the counter again and retry in case the counter is incremented as this may violate the requirement. Even though the page_counter_try_charge can see the old limit, it is guaranteed that the counter is not above the old limit after the increment. So in case the new limit is not below the old limit, the counter is guaranteed to be not above the new limit too. We can skip the retry in this case to optimize a little bit. Link: https://lkml.kernel.org/r/20220821154055.109635-1-minhquangbui99@gmail.com Signed-off-by: Bui Quang Minh Signed-off-by: Andrew Morton --- mm/page_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_counter.c b/mm/page_counter.c index eb156ff5d603..8a0cc24b60dd 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -193,7 +193,7 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) old = xchg(&counter->max, nr_pages); - if (page_counter_read(counter) <= usage) + if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; From f6d299ec39d8ebb62fc047022d4904575c99a28c Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 24 Aug 2022 15:09:51 +1000 Subject: [PATCH 23/25] mm/gup.c: don't pass gup_flags to check_and_migrate_movable_pages() gup_flags is passed to check_and_migrate_movable_pages() so that it can call either put_page() or unpin_user_page() to drop the page reference. However check_and_migrate_movable_pages() is only called for FOLL_LONGTERM, which implies FOLL_PIN so there is no need to pass gup_flags. Link: https://lkml.kernel.org/r/d611c65a9008ff55887307df457c6c2220ad6163.1661317396.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: David Hildenbrand Reviewed-by: John Hubbard Cc: Alex Sierra Cc: Dan Williams Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Logan Gunthorpe Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Muchun Song Cc: Ralph Campbell Cc: Shigeru Yoshida Signed-off-by: Andrew Morton --- mm/gup.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index b05810e7e9bb..1800af4806e3 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1933,8 +1933,7 @@ struct page *get_dump_page(unsigned long addr) * migration failure. */ static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages, - unsigned int gup_flags) + struct page **pages) { unsigned long i; struct folio *prev_folio = NULL; @@ -1967,10 +1966,8 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, * Migration will fail if the page is pinned, so convert * the pin on the source page to a normal reference. */ - if (gup_flags & FOLL_PIN) { - get_page(&folio->page); - unpin_user_page(&folio->page); - } + get_page(&folio->page); + unpin_user_page(&folio->page); if (migrate_device_coherent_page(&folio->page)) { ret = -EBUSY; @@ -2023,10 +2020,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, if (!pages[i]) continue; - if (gup_flags & FOLL_PIN) - unpin_user_page(pages[i]); - else - put_page(pages[i]); + unpin_user_page(pages[i]); } if (!list_empty(&movable_page_list)) { @@ -2049,8 +2043,7 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages, - unsigned int gup_flags) + struct page **pages) { return 0; } @@ -2073,6 +2066,9 @@ static long __gup_longterm_locked(struct mm_struct *mm, if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); + /* check_and_migrate_movable_pages() assumes pages have been pinned. */ + if (WARN_ON(!(gup_flags & FOLL_PIN))) + return -EINVAL; flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, @@ -2082,8 +2078,7 @@ static long __gup_longterm_locked(struct mm_struct *mm, rc = nr_pinned_pages; break; } - rc = check_and_migrate_movable_pages(nr_pinned_pages, pages, - gup_flags); + rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); From 67e139b02d99496c03bf3b51abad685ab3b49138 Mon Sep 17 00:00:00 2001 From: Alistair Popple Date: Wed, 24 Aug 2022 15:09:52 +1000 Subject: [PATCH 24/25] mm/gup.c: refactor check_and_migrate_movable_pages() When pinning pages with FOLL_LONGTERM check_and_migrate_movable_pages() is called to migrate pages out of zones which should not contain any longterm pinned pages. When migration succeeds all pages will have been unpinned so pinning needs to be retried. Migration can also fail, in which case the pages will also have been unpinned but the operation should not be retried. If all pages are in the correct zone nothing will be unpinned and no retry is required. The logic in check_and_migrate_movable_pages() tracks unnecessary state and the return codes for each case are difficult to follow. Refactor the code to clean this up. No behaviour change is intended. [akpm@linux-foundation.org: fix unused var warning] Link: https://lkml.kernel.org/r/19583d1df07fdcb99cfa05c265588a3fa58d1902.1661317396.git-series.apopple@nvidia.com Signed-off-by: Alistair Popple Reviewed-by: John Hubbard Cc: Alex Sierra Cc: Dan Williams Cc: David Hildenbrand Cc: Felix Kuehling Cc: Jason Gunthorpe Cc: Logan Gunthorpe Cc: Matthew Wilcox Cc: Miaohe Lin Cc: Muchun Song Cc: Ralph Campbell Cc: Shigeru Yoshida Signed-off-by: Andrew Morton --- mm/gup.c | 185 ++++++++++++++++++++++++++++++++++--------------------- 1 file changed, 114 insertions(+), 71 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 1800af4806e3..ce8ff9f51e05 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1927,19 +1927,16 @@ struct page *get_dump_page(unsigned long addr) #ifdef CONFIG_MIGRATION /* - * Check whether all pages are pinnable. If some pages are not pinnable migrate - * them and unpin all the pages. Returns -EAGAIN if pages were unpinned or zero - * if all pages are pinnable and in the right zone. Other errors indicate - * migration failure. + * Returns the number of collected pages. Return value is always >= 0. */ -static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages) +static unsigned long collect_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) { - unsigned long i; + unsigned long i, collected = 0; struct folio *prev_folio = NULL; - LIST_HEAD(movable_page_list); - bool drain_allow = true, coherent_pages = false; - int ret = 0; + bool drain_allow = true; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); @@ -1948,43 +1945,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_folio = folio; - /* - * Device coherent pages are managed by a driver and should not - * be pinned indefinitely as it prevents the driver moving the - * page. So when trying to pin with FOLL_LONGTERM instead try - * to migrate the page out of device memory. - */ - if (folio_is_device_coherent(folio)) { - /* - * We always want a new GUP lookup with device coherent - * pages. - */ - pages[i] = 0; - coherent_pages = true; - - /* - * Migration will fail if the page is pinned, so convert - * the pin on the source page to a normal reference. - */ - get_page(&folio->page); - unpin_user_page(&folio->page); - - if (migrate_device_coherent_page(&folio->page)) { - ret = -EBUSY; - break; - } - continue; - } - if (folio_is_longterm_pinnable(folio)) continue; - /* - * Try to move out any movable page before pinning the range. - */ + + collected++; + + if (folio_is_device_coherent(folio)) + continue; + if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(&folio->page, - &movable_page_list)) - ret = -EBUSY; + isolate_hugetlb(&folio->page, movable_page_list); continue; } @@ -1993,53 +1963,118 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, drain_allow = false; } - if (folio_isolate_lru(folio)) { - ret = -EBUSY; + if (!folio_isolate_lru(folio)) continue; - } - list_add_tail(&folio->lru, &movable_page_list); + + list_add_tail(&folio->lru, movable_page_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } - /* - * If list is empty, and no isolation errors, means that all pages are - * in the correct zone. If there were device coherent pages some pages - * have been unpinned. - */ - if (list_empty(&movable_page_list) && !ret && !coherent_pages) - return 0; + return collected; +} + +/* + * Unpins all pages and migrates device coherent pages and movable_page_list. + * Returns -EAGAIN if all pages were successfully migrated or -errno for failure + * (or partial success). + */ +static int migrate_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) +{ + int ret; + unsigned long i; - /* - * Unpin all pages. If device coherent pages were found - * migrate_device_coherent_page() will have dropped the pin and set - * pages[i] == NULL. - */ for (i = 0; i < nr_pages; i++) { - if (!pages[i]) - continue; + struct folio *folio = page_folio(pages[i]); + if (folio_is_device_coherent(folio)) { + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + pages[i] = NULL; + folio_get(folio); + gup_put_folio(folio, 1, FOLL_PIN); + + if (migrate_device_coherent_page(&folio->page)) { + ret = -EBUSY; + goto err; + } + + continue; + } + + /* + * We can't migrate pages with unexpected references, so drop + * the reference obtained by __get_user_pages_locked(). + * Migrating pages have been added to movable_page_list after + * calling folio_isolate_lru() which takes a reference so the + * page won't be freed if it's migrating. + */ unpin_user_page(pages[i]); + pages[i] = NULL; } - if (!list_empty(&movable_page_list)) { + if (!list_empty(movable_page_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; - ret = migrate_pages(&movable_page_list, alloc_migration_target, - NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN, NULL); - if (ret > 0) /* number of pages not migrated */ + if (migrate_pages(movable_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; + goto err; + } } - if (ret && !list_empty(&movable_page_list)) - putback_movable_pages(&movable_page_list); + putback_movable_pages(movable_page_list); - return ret ? ret : -EAGAIN; + return -EAGAIN; + +err: + for (i = 0; i < nr_pages; i++) + if (pages[i]) + unpin_user_page(pages[i]); + putback_movable_pages(movable_page_list); + + return ret; +} + +/* + * Check whether all pages are *allowed* to be pinned. Rather confusingly, all + * pages in the range are required to be pinned via FOLL_PIN, before calling + * this routine. + * + * If any pages in the range are not allowed to be pinned, then this routine + * will migrate those pages away, unpin all the pages in the range and return + * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then + * call this routine again. + * + * If an error other than -EAGAIN occurs, this indicates a migration failure. + * The caller should give up, and propagate the error back up the call stack. + * + * If everything is OK and all pages in the range are allowed to be pinned, then + * this routine leaves all pages pinned and returns zero for success. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages) +{ + unsigned long collected; + LIST_HEAD(movable_page_list); + + collected = collect_longterm_unpinnable_pages(&movable_page_list, + nr_pages, pages); + if (!collected) + return 0; + + return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, + pages); } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, @@ -2066,7 +2101,15 @@ static long __gup_longterm_locked(struct mm_struct *mm, if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); - /* check_and_migrate_movable_pages() assumes pages have been pinned. */ + + /* + * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM + * implies FOLL_PIN (although the reverse is not true). Therefore it is + * correct to unconditionally call check_and_migrate_movable_pages() + * which assumes pages have been pinned via FOLL_PIN. + * + * Enforce the above reasoning by asserting that FOLL_PIN is set. + */ if (WARN_ON(!(gup_flags & FOLL_PIN))) return -EINVAL; flags = memalloc_pin_save(); From 6d2453c3dbc5f70eafc1c866289a90a1fc57ce18 Mon Sep 17 00:00:00 2001 From: Sergey Senozhatsky Date: Wed, 24 Aug 2022 12:51:00 +0900 Subject: [PATCH 25/25] drivers/block/zram/zram_drv.c: do not keep dangling zcomp pointer after zram reset We do all reset operations under write lock, so we don't need to save ->disksize and ->comp to stack variables. Another thing is that ->comp is freed during zram reset, but comp pointer is not NULL-ed, so zram keeps the freed pointer value. Link: https://lkml.kernel.org/r/20220824035100.971816-1-senozhatsky@chromium.org Signed-off-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 226ea76cc819..add43f6c8bc0 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1710,9 +1710,6 @@ out: static void zram_reset_device(struct zram *zram) { - struct zcomp *comp; - u64 disksize; - down_write(&zram->init_lock); zram->limit_pages = 0; @@ -1722,17 +1719,15 @@ static void zram_reset_device(struct zram *zram) return; } - comp = zram->comp; - disksize = zram->disksize; - zram->disksize = 0; - set_capacity_and_notify(zram->disk, 0); part_stat_set_all(zram->disk->part0, 0); /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(zram, disksize); + zram_meta_free(zram, zram->disksize); + zram->disksize = 0; memset(&zram->stats, 0, sizeof(zram->stats)); - zcomp_destroy(comp); + zcomp_destroy(zram->comp); + zram->comp = NULL; reset_bdev(zram); up_write(&zram->init_lock);