diff --git a/Documentation/mm/page_owner.rst b/Documentation/mm/page_owner.rst index f5c954afe97c..f18fd8907049 100644 --- a/Documentation/mm/page_owner.rst +++ b/Documentation/mm/page_owner.rst @@ -94,6 +94,11 @@ Usage Page allocated via order XXX, ... PFN XXX ... // Detailed stack + By default, it will do full pfn dump, to start with a given pfn, + page_owner supports fseek. + + FILE *fp = fopen("/sys/kernel/debug/page_owner", "r"); + fseek(fp, pfn_start, SEEK_SET); The ``page_owner_sort`` tool ignores ``PFN`` rows, puts the remaining rows in buf, uses regexp to extract the page order value, counts the times diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index e551433cd107..04077a2bb8a2 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1710,9 +1710,6 @@ out: static void zram_reset_device(struct zram *zram) { - struct zcomp *comp; - u64 disksize; - down_write(&zram->init_lock); zram->limit_pages = 0; @@ -1722,17 +1719,15 @@ static void zram_reset_device(struct zram *zram) return; } - comp = zram->comp; - disksize = zram->disksize; - zram->disksize = 0; - set_capacity_and_notify(zram->disk, 0); part_stat_set_all(zram->disk->part0, 0); /* I/O operation under all of CPU are done so let's free */ - zram_meta_free(zram, disksize); + zram_meta_free(zram, zram->disksize); + zram->disksize = 0; memset(&zram->stats, 0, sizeof(zram->stats)); - zcomp_destroy(comp); + zcomp_destroy(zram->comp); + zram->comp = NULL; reset_bdev(zram); up_write(&zram->init_lock); diff --git a/include/linux/mm.h b/include/linux/mm.h index 27839b158ca4..e98ef2cb1176 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2520,7 +2520,6 @@ extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); extern void get_pfn_range_for_nid(unsigned int nid, unsigned long *start_pfn, unsigned long *end_pfn); -extern unsigned long find_min_pfn_with_active_regions(void); #ifndef CONFIG_NUMA static inline int early_pfn_to_nid(unsigned long pfn) diff --git a/include/linux/page_ext.h b/include/linux/page_ext.h index fabb2e1e087f..ed27198cdaf4 100644 --- a/include/linux/page_ext.h +++ b/include/linux/page_ext.h @@ -55,7 +55,8 @@ static inline void page_ext_init(void) } #endif -struct page_ext *lookup_page_ext(const struct page *page); +extern struct page_ext *page_ext_get(struct page *page); +extern void page_ext_put(struct page_ext *page_ext); static inline struct page_ext *page_ext_next(struct page_ext *curr) { @@ -71,11 +72,6 @@ static inline void pgdat_page_ext_init(struct pglist_data *pgdat) { } -static inline struct page_ext *lookup_page_ext(const struct page *page) -{ - return NULL; -} - static inline void page_ext_init(void) { } @@ -87,5 +83,14 @@ static inline void page_ext_init_flatmem_late(void) static inline void page_ext_init_flatmem(void) { } + +static inline struct page_ext *page_ext_get(struct page *page) +{ + return NULL; +} + +static inline void page_ext_put(struct page_ext *page_ext) +{ +} #endif /* CONFIG_PAGE_EXTENSION */ #endif /* __LINUX_PAGE_EXT_H */ diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 4663dfed1293..5cb7bd2078ec 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,65 +13,79 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ - static inline bool folio_test_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline void folio_set_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); } static inline bool folio_test_clear_young(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_young; if (unlikely(!page_ext)) return false; - return test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_young = test_and_clear_bit(PAGE_EXT_YOUNG, &page_ext->flags); + page_ext_put(page_ext); + + return page_young; } static inline bool folio_test_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); + bool page_idle; if (unlikely(!page_ext)) return false; - return test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_idle = test_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); + + return page_idle; } static inline void folio_set_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; set_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } static inline void folio_clear_idle(struct folio *folio) { - struct page_ext *page_ext = lookup_page_ext(&folio->page); + struct page_ext *page_ext = page_ext_get(&folio->page); if (unlikely(!page_ext)) return; clear_bit(PAGE_EXT_IDLE, &page_ext->flags); + page_ext_put(page_ext); } #endif /* !CONFIG_64BIT */ diff --git a/include/linux/pagewalk.h b/include/linux/pagewalk.h index ac7b38ad5903..f3fafb731ffd 100644 --- a/include/linux/pagewalk.h +++ b/include/linux/pagewalk.h @@ -15,12 +15,12 @@ struct mm_walk; * this handler is required to be able to handle * pmd_trans_huge() pmds. They may simply choose to * split_huge_page() instead of handling it explicitly. - * @pte_entry: if set, called for each non-empty PTE (lowest-level) - * entry + * @pte_entry: if set, called for each PTE (lowest-level) entry, + * including empty ones * @pte_hole: if set, called for each hole at all levels, - * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD - * 4:PTE. Any folded depths (where PTRS_PER_P?D is equal - * to 1) are skipped. + * depth is -1 if not known, 0:PGD, 1:P4D, 2:PUD, 3:PMD. + * Any folded depths (where PTRS_PER_P?D is equal to 1) + * are skipped. * @hugetlb_entry: if set, called for each hugetlb entry * @test_walk: caller specific callback function to determine whether * we walk over the current vma or not. Returning 0 means diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 3c7b9d6dca95..cc04d467ba23 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -302,9 +302,14 @@ static int damon_mkold_pmd_entry(pmd_t *pmd, unsigned long addr, pte_t *pte; spinlock_t *ptl; - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); - if (pmd_huge(*pmd)) { + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (pmd_trans_huge(*pmd)) { damon_pmdp_mkold(pmd, walk->mm, addr); spin_unlock(ptl); return 0; @@ -429,9 +434,14 @@ static int damon_young_pmd_entry(pmd_t *pmd, unsigned long addr, struct damon_young_walk_private *priv = walk->private; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_huge(*pmd)) { + if (pmd_trans_huge(*pmd)) { ptl = pmd_lock(walk->mm, pmd); - if (!pmd_huge(*pmd)) { + if (!pmd_present(*pmd)) { + spin_unlock(ptl); + return 0; + } + + if (!pmd_trans_huge(*pmd)) { spin_unlock(ptl); goto regular_page; } diff --git a/mm/gup.c b/mm/gup.c index a88c86664026..9e9b17c42f3a 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1927,20 +1927,16 @@ struct page *get_dump_page(unsigned long addr) #ifdef CONFIG_MIGRATION /* - * Check whether all pages are pinnable. If some pages are not pinnable migrate - * them and unpin all the pages. Returns -EAGAIN if pages were unpinned or zero - * if all pages are pinnable and in the right zone. Other errors indicate - * migration failure. + * Returns the number of collected pages. Return value is always >= 0. */ -static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages, - unsigned int gup_flags) +static unsigned long collect_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) { - unsigned long i; + unsigned long i, collected = 0; struct folio *prev_folio = NULL; - LIST_HEAD(movable_page_list); - bool drain_allow = true, coherent_pages = false; - int ret = 0; + bool drain_allow = true; for (i = 0; i < nr_pages; i++) { struct folio *folio = page_folio(pages[i]); @@ -1949,45 +1945,16 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, continue; prev_folio = folio; - /* - * Device coherent pages are managed by a driver and should not - * be pinned indefinitely as it prevents the driver moving the - * page. So when trying to pin with FOLL_LONGTERM instead try - * to migrate the page out of device memory. - */ - if (folio_is_device_coherent(folio)) { - /* - * We always want a new GUP lookup with device coherent - * pages. - */ - pages[i] = 0; - coherent_pages = true; - - /* - * Migration will fail if the page is pinned, so convert - * the pin on the source page to a normal reference. - */ - if (gup_flags & FOLL_PIN) { - get_page(&folio->page); - unpin_user_page(&folio->page); - } - - if (migrate_device_coherent_page(&folio->page)) { - ret = -EBUSY; - break; - } - continue; - } - if (folio_is_longterm_pinnable(folio)) continue; - /* - * Try to move out any movable page before pinning the range. - */ + + collected++; + + if (folio_is_device_coherent(folio)) + continue; + if (folio_test_hugetlb(folio)) { - if (isolate_hugetlb(&folio->page, - &movable_page_list)) - ret = -EBUSY; + isolate_hugetlb(&folio->page, movable_page_list); continue; } @@ -1996,61 +1963,122 @@ static long check_and_migrate_movable_pages(unsigned long nr_pages, drain_allow = false; } - if (folio_isolate_lru(folio)) { - ret = -EBUSY; + if (!folio_isolate_lru(folio)) continue; - } - list_add_tail(&folio->lru, &movable_page_list); + + list_add_tail(&folio->lru, movable_page_list); node_stat_mod_folio(folio, NR_ISOLATED_ANON + folio_is_file_lru(folio), folio_nr_pages(folio)); } - /* - * If list is empty, and no isolation errors, means that all pages are - * in the correct zone. If there were device coherent pages some pages - * have been unpinned. - */ - if (list_empty(&movable_page_list) && !ret && !coherent_pages) - return 0; + return collected; +} + +/* + * Unpins all pages and migrates device coherent pages and movable_page_list. + * Returns -EAGAIN if all pages were successfully migrated or -errno for failure + * (or partial success). + */ +static int migrate_longterm_unpinnable_pages( + struct list_head *movable_page_list, + unsigned long nr_pages, + struct page **pages) +{ + int ret; + unsigned long i; - /* - * Unpin all pages. If device coherent pages were found - * migrate_device_coherent_page() will have dropped the pin and set - * pages[i] == NULL. - */ for (i = 0; i < nr_pages; i++) { - if (!pages[i]) - continue; + struct folio *folio = page_folio(pages[i]); - if (gup_flags & FOLL_PIN) - unpin_user_page(pages[i]); - else - put_page(pages[i]); + if (folio_is_device_coherent(folio)) { + /* + * Migration will fail if the page is pinned, so convert + * the pin on the source page to a normal reference. + */ + pages[i] = NULL; + folio_get(folio); + gup_put_folio(folio, 1, FOLL_PIN); + + if (migrate_device_coherent_page(&folio->page)) { + ret = -EBUSY; + goto err; + } + + continue; + } + + /* + * We can't migrate pages with unexpected references, so drop + * the reference obtained by __get_user_pages_locked(). + * Migrating pages have been added to movable_page_list after + * calling folio_isolate_lru() which takes a reference so the + * page won't be freed if it's migrating. + */ + unpin_user_page(pages[i]); + pages[i] = NULL; } - if (!list_empty(&movable_page_list)) { + if (!list_empty(movable_page_list)) { struct migration_target_control mtc = { .nid = NUMA_NO_NODE, .gfp_mask = GFP_USER | __GFP_NOWARN, }; - ret = migrate_pages(&movable_page_list, alloc_migration_target, - NULL, (unsigned long)&mtc, MIGRATE_SYNC, - MR_LONGTERM_PIN, NULL); - if (ret > 0) /* number of pages not migrated */ + if (migrate_pages(movable_page_list, alloc_migration_target, + NULL, (unsigned long)&mtc, MIGRATE_SYNC, + MR_LONGTERM_PIN, NULL)) { ret = -ENOMEM; + goto err; + } } - if (ret && !list_empty(&movable_page_list)) - putback_movable_pages(&movable_page_list); + putback_movable_pages(movable_page_list); - return ret ? ret : -EAGAIN; + return -EAGAIN; + +err: + for (i = 0; i < nr_pages; i++) + if (pages[i]) + unpin_user_page(pages[i]); + putback_movable_pages(movable_page_list); + + return ret; +} + +/* + * Check whether all pages are *allowed* to be pinned. Rather confusingly, all + * pages in the range are required to be pinned via FOLL_PIN, before calling + * this routine. + * + * If any pages in the range are not allowed to be pinned, then this routine + * will migrate those pages away, unpin all the pages in the range and return + * -EAGAIN. The caller should re-pin the entire range with FOLL_PIN and then + * call this routine again. + * + * If an error other than -EAGAIN occurs, this indicates a migration failure. + * The caller should give up, and propagate the error back up the call stack. + * + * If everything is OK and all pages in the range are allowed to be pinned, then + * this routine leaves all pages pinned and returns zero for success. + */ +static long check_and_migrate_movable_pages(unsigned long nr_pages, + struct page **pages) +{ + unsigned long collected; + LIST_HEAD(movable_page_list); + + collected = collect_longterm_unpinnable_pages(&movable_page_list, + nr_pages, pages); + if (!collected) + return 0; + + return migrate_longterm_unpinnable_pages(&movable_page_list, nr_pages, + pages); } #else static long check_and_migrate_movable_pages(unsigned long nr_pages, - struct page **pages, - unsigned int gup_flags) + struct page **pages) { return 0; } @@ -2073,6 +2101,17 @@ static long __gup_longterm_locked(struct mm_struct *mm, if (!(gup_flags & FOLL_LONGTERM)) return __get_user_pages_locked(mm, start, nr_pages, pages, vmas, NULL, gup_flags); + + /* + * If we get to this point then FOLL_LONGTERM is set, and FOLL_LONGTERM + * implies FOLL_PIN (although the reverse is not true). Therefore it is + * correct to unconditionally call check_and_migrate_movable_pages() + * which assumes pages have been pinned via FOLL_PIN. + * + * Enforce the above reasoning by asserting that FOLL_PIN is set. + */ + if (WARN_ON(!(gup_flags & FOLL_PIN))) + return -EINVAL; flags = memalloc_pin_save(); do { nr_pinned_pages = __get_user_pages_locked(mm, start, nr_pages, @@ -2082,8 +2121,7 @@ static long __gup_longterm_locked(struct mm_struct *mm, rc = nr_pinned_pages; break; } - rc = check_and_migrate_movable_pages(nr_pinned_pages, pages, - gup_flags); + rc = check_and_migrate_movable_pages(nr_pinned_pages, pages); } while (rc == -EAGAIN); memalloc_pin_restore(flags); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index b6b9b0b8f847..ce025f95aa8c 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -770,8 +770,7 @@ static void set_huge_zero_page(pgtable_t pgtable, struct mm_struct *mm, return; entry = mk_pmd(zero_page, vma->vm_page_prot); entry = pmd_mkhuge(entry); - if (pgtable) - pgtable_trans_huge_deposit(mm, pmd, pgtable); + pgtable_trans_huge_deposit(mm, pmd, pgtable); set_pmd_at(mm, haddr, pmd, entry); mm_inc_nr_ptes(mm); } @@ -2644,6 +2643,8 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) mapping = NULL; anon_vma_lock_write(anon_vma); } else { + gfp_t gfp; + mapping = head->mapping; /* Truncated ? */ @@ -2652,8 +2653,16 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) goto out; } - xas_split_alloc(&xas, head, compound_order(head), - mapping_gfp_mask(mapping) & GFP_RECLAIM_MASK); + gfp = current_gfp_context(mapping_gfp_mask(mapping) & + GFP_RECLAIM_MASK); + + if (folio_test_private(folio) && + !filemap_release_folio(folio, gfp)) { + ret = -EBUSY; + goto out; + } + + xas_split_alloc(&xas, head, compound_order(head), gfp); if (xas_error(&xas)) { ret = xas_error(&xas); goto out; diff --git a/mm/hugetlb_vmemmap.c b/mm/hugetlb_vmemmap.c index 76b2d03a0d8d..ba2a2596fb4e 100644 --- a/mm/hugetlb_vmemmap.c +++ b/mm/hugetlb_vmemmap.c @@ -265,11 +265,10 @@ static void vmemmap_remap_pte(pte_t *pte, unsigned long addr, static inline void reset_struct_pages(struct page *start) { - int i; struct page *from = start + NR_RESET_STRUCT_PAGE; - for (i = 0; i < NR_RESET_STRUCT_PAGE; i++) - memcpy(start + i, from, sizeof(*from)); + BUILD_BUG_ON(NR_RESET_STRUCT_PAGE * 2 > PAGE_SIZE / sizeof(struct page)); + memcpy(start, from, sizeof(*from) * NR_RESET_STRUCT_PAGE); } static void vmemmap_restore_pte(pte_t *pte, unsigned long addr, diff --git a/mm/memcontrol.c b/mm/memcontrol.c index bac2de4b9c42..29aeaf3d1915 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1136,7 +1136,7 @@ static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) } while ((memcg = parent_mem_cgroup(memcg))); /* - * When cgruop1 non-hierarchy mode is used, + * When cgroup1 non-hierarchy mode is used, * parent_mem_cgroup() does not walk all the way up to the * cgroup root (root_mem_cgroup). So we have to handle * dead_memcg from cgroup root separately. @@ -3969,6 +3969,8 @@ static const unsigned int memcg1_stats[] = { NR_FILE_MAPPED, NR_FILE_DIRTY, NR_WRITEBACK, + WORKINGSET_REFAULT_ANON, + WORKINGSET_REFAULT_FILE, MEMCG_SWAP, }; @@ -3982,6 +3984,8 @@ static const char *const memcg1_stat_names[] = { "mapped_file", "dirty", "writeback", + "workingset_refault_anon", + "workingset_refault_file", "swap", }; @@ -4010,7 +4014,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account()) continue; nr = memcg_page_state_local(memcg, memcg1_stats[i]); - seq_printf(m, "%s %lu\n", memcg1_stat_names[i], nr * PAGE_SIZE); + seq_printf(m, "%s %lu\n", memcg1_stat_names[i], + nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) @@ -4041,7 +4046,7 @@ static int memcg_stat_show(struct seq_file *m, void *v) continue; nr = memcg_page_state(memcg, memcg1_stats[i]); seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i], - (u64)nr * PAGE_SIZE); + (u64)nr * memcg_page_state_unit(memcg1_stats[i])); } for (i = 0; i < ARRAY_SIZE(memcg1_events); i++) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index e6713a979e42..265378237c22 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -413,7 +413,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, { struct to_kill *tk, *next; - list_for_each_entry_safe (tk, next, to_kill, nd) { + list_for_each_entry_safe(tk, next, to_kill, nd) { if (forcekill) { /* * In case something went wrong with munmapping @@ -437,6 +437,7 @@ static void kill_procs(struct list_head *to_kill, int forcekill, bool fail, pr_err("%#lx: Cannot send advisory machine check signal to %s:%d\n", pfn, tk->tsk->comm, tk->tsk->pid); } + list_del(&tk->nd); put_task_struct(tk->tsk); kfree(tk); } @@ -1401,7 +1402,7 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, struct address_space *mapping; LIST_HEAD(tokill); bool unmap_success; - int kill = 1, forcekill; + int forcekill; bool mlocked = PageMlocked(hpage); /* @@ -1442,7 +1443,6 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, if (page_mkclean(hpage)) { SetPageDirty(hpage); } else { - kill = 0; ttu |= TTU_IGNORE_HWPOISON; pr_info("%#lx: corrupted page was clean: dropped without side effects\n", pfn); @@ -1453,12 +1453,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * First collect all the processes that have the page * mapped in dirty form. This has to be done before try_to_unmap, * because ttu takes the rmap data structures down. - * - * Error handling: We ignore errors here because - * there's nothing that can be done. */ - if (kill) - collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); + collect_procs(hpage, &tokill, flags & MF_ACTION_REQUIRED); if (PageHuge(hpage) && !PageAnon(hpage)) { /* @@ -1500,7 +1496,8 @@ static bool hwpoison_user_mappings(struct page *p, unsigned long pfn, * use a more force-full uncatchable kill to prevent * any accesses to the poisoned memory. */ - forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL); + forcekill = PageDirty(hpage) || (flags & MF_MUST_KILL) || + !unmap_success; kill_procs(&tokill, forcekill, !unmap_success, pfn, flags); return unmap_success; @@ -1865,8 +1862,10 @@ retry: if (hwpoison_filter(p)) { hugetlb_clear_page_hwpoison(head); - res = -EOPNOTSUPP; - goto out; + unlock_page(head); + if (res == 1) + put_page(head); + return -EOPNOTSUPP; } /* @@ -2357,7 +2356,7 @@ int unpoison_memory(unsigned long pfn) goto unlock_mutex; } - if (PageSlab(page) || PageTable(page)) + if (PageSlab(page) || PageTable(page) || PageReserved(page)) goto unlock_mutex; ret = get_hwpoison_page(p, MF_UNPOISON); @@ -2381,13 +2380,14 @@ int unpoison_memory(unsigned long pfn) count = free_raw_hwp_pages(page, false); if (count == 0) { ret = -EBUSY; + put_page(page); goto unlock_mutex; } } freeit = !!TestClearPageHWPoison(p); put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + if (freeit) { put_page(page); ret = 0; } @@ -2437,11 +2437,11 @@ static bool isolate_page(struct page *page, struct list_head *pagelist) } /* - * __soft_offline_page handles hugetlb-pages and non-hugetlb pages. + * soft_offline_in_use_page handles hugetlb-pages and non-hugetlb pages. * If the page is a non-dirty unmapped page-cache page, it simply invalidates. * If the page is mapped, it migrates the contents over. */ -static int __soft_offline_page(struct page *page) +static int soft_offline_in_use_page(struct page *page) { long ret = 0; unsigned long pfn = page_to_pfn(page); @@ -2454,6 +2454,14 @@ static int __soft_offline_page(struct page *page) .gfp_mask = GFP_USER | __GFP_MOVABLE | __GFP_RETRY_MAYFAIL, }; + if (!huge && PageTransHuge(hpage)) { + if (try_to_split_thp_page(page)) { + pr_info("soft offline: %#lx: thp split failed\n", pfn); + return -EBUSY; + } + hpage = page; + } + lock_page(page); if (!PageHuge(page)) wait_on_page_writeback(page); @@ -2503,29 +2511,6 @@ static int __soft_offline_page(struct page *page) return ret; } -static int soft_offline_in_use_page(struct page *page) -{ - struct page *hpage = compound_head(page); - - if (!PageHuge(page) && PageTransHuge(hpage)) - if (try_to_split_thp_page(page) < 0) { - pr_info("soft offline: %#lx: thp split failed\n", - page_to_pfn(page)); - return -EBUSY; - } - return __soft_offline_page(page); -} - -static int soft_offline_free_page(struct page *page) -{ - int rc = 0; - - if (!page_handle_poison(page, true, false)) - rc = -EBUSY; - - return rc; -} - static void put_ref_page(struct page *page) { if (page) @@ -2593,8 +2578,6 @@ retry: if (hwpoison_filter(page)) { if (ret > 0) put_page(page); - else - put_ref_page(ref_page); mutex_unlock(&mf_mutex); return -EOPNOTSUPP; @@ -2603,7 +2586,7 @@ retry: if (ret > 0) { ret = soft_offline_in_use_page(page); } else if (ret == 0) { - if (soft_offline_free_page(page) && try_again) { + if (!page_handle_poison(page, true, false) && try_again) { try_again = false; flags &= ~MF_COUNT_INCREASED; goto retry; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 7e16873b4b6f..ba0b438d57ab 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7957,17 +7957,6 @@ unsigned long __init node_map_pfn_alignment(void) return ~accl_mask + 1; } -/** - * find_min_pfn_with_active_regions - Find the minimum PFN registered - * - * Return: the minimum PFN based on information provided via - * memblock_set_node(). - */ -unsigned long __init find_min_pfn_with_active_regions(void) -{ - return PHYS_PFN(memblock_start_of_DRAM()); -} - /* * early_calculate_totalpages() * Sum pages in active regions for movable zone. @@ -8260,7 +8249,7 @@ void __init free_area_init(unsigned long *max_zone_pfn) memset(arch_zone_highest_possible_pfn, 0, sizeof(arch_zone_highest_possible_pfn)); - start_pfn = find_min_pfn_with_active_regions(); + start_pfn = PHYS_PFN(memblock_start_of_DRAM()); descending = arch_has_descending_max_zone_pfns(); for (i = 0; i < MAX_NR_ZONES; i++) { diff --git a/mm/page_counter.c b/mm/page_counter.c index eb156ff5d603..8a0cc24b60dd 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -193,7 +193,7 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) old = xchg(&counter->max, nr_pages); - if (page_counter_read(counter) <= usage) + if (page_counter_read(counter) <= usage || nr_pages >= old) return 0; counter->max = old; diff --git a/mm/page_ext.c b/mm/page_ext.c index e22a928dd66a..b236bdd59fa8 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -9,6 +9,7 @@ #include #include #include +#include /* * struct page extension @@ -59,6 +60,10 @@ * can utilize this callback to initialize the state of it correctly. */ +#ifdef CONFIG_SPARSEMEM +#define PAGE_EXT_INVALID (0x1) +#endif + #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) static bool need_page_idle(void) { @@ -84,6 +89,7 @@ static struct page_ext_operations *page_ext_ops[] __initdata = { unsigned long page_ext_size = sizeof(struct page_ext); static unsigned long total_usage; +static struct page_ext *lookup_page_ext(const struct page *page); static bool __init invoke_need_callbacks(void) { @@ -125,6 +131,48 @@ static inline struct page_ext *get_entry(void *base, unsigned long index) return base + page_ext_size * index; } +/** + * page_ext_get() - Get the extended information for a page. + * @page: The page we're interested in. + * + * Ensures that the page_ext will remain valid until page_ext_put() + * is called. + * + * Return: NULL if no page_ext exists for this page. + * Context: Any context. Caller may not sleep until they have called + * page_ext_put(). + */ +struct page_ext *page_ext_get(struct page *page) +{ + struct page_ext *page_ext; + + rcu_read_lock(); + page_ext = lookup_page_ext(page); + if (!page_ext) { + rcu_read_unlock(); + return NULL; + } + + return page_ext; +} + +/** + * page_ext_put() - Working with page extended information is done. + * @page_ext - Page extended information received from page_ext_get(). + * + * The page extended information of the page may not be valid after this + * function is called. + * + * Return: None. + * Context: Any context with corresponding page_ext_get() is called. + */ +void page_ext_put(struct page_ext *page_ext) +{ + if (unlikely(!page_ext)) + return; + + rcu_read_unlock(); +} #ifndef CONFIG_SPARSEMEM @@ -133,12 +181,13 @@ void __meminit pgdat_page_ext_init(struct pglist_data *pgdat) pgdat->node_page_ext = NULL; } -struct page_ext *lookup_page_ext(const struct page *page) +static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); unsigned long index; struct page_ext *base; + WARN_ON_ONCE(!rcu_read_lock_held()); base = NODE_DATA(page_to_nid(page))->node_page_ext; /* * The sanity checks the page allocator does upon freeing a @@ -206,20 +255,27 @@ fail: } #else /* CONFIG_SPARSEMEM */ +static bool page_ext_invalid(struct page_ext *page_ext) +{ + return !page_ext || (((unsigned long)page_ext & PAGE_EXT_INVALID) == PAGE_EXT_INVALID); +} -struct page_ext *lookup_page_ext(const struct page *page) +static struct page_ext *lookup_page_ext(const struct page *page) { unsigned long pfn = page_to_pfn(page); struct mem_section *section = __pfn_to_section(pfn); + struct page_ext *page_ext = READ_ONCE(section->page_ext); + + WARN_ON_ONCE(!rcu_read_lock_held()); /* * The sanity checks the page allocator does upon freeing a * page can reach here before the page_ext arrays are * allocated when feeding a range of pages to the allocator * for the first time during bootup or memory hotplug. */ - if (!section->page_ext) + if (page_ext_invalid(page_ext)) return NULL; - return get_entry(section->page_ext, pfn); + return get_entry(page_ext, pfn); } static void *__meminit alloc_page_ext(size_t size, int nid) @@ -298,9 +354,30 @@ static void __free_page_ext(unsigned long pfn) ms = __pfn_to_section(pfn); if (!ms || !ms->page_ext) return; - base = get_entry(ms->page_ext, pfn); + + base = READ_ONCE(ms->page_ext); + /* + * page_ext here can be valid while doing the roll back + * operation in online_page_ext(). + */ + if (page_ext_invalid(base)) + base = (void *)base - PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, NULL); + + base = get_entry(base, pfn); free_page_ext(base); - ms->page_ext = NULL; +} + +static void __invalidate_page_ext(unsigned long pfn) +{ + struct mem_section *ms; + void *val; + + ms = __pfn_to_section(pfn); + if (!ms || !ms->page_ext) + return; + val = (void *)ms->page_ext + PAGE_EXT_INVALID; + WRITE_ONCE(ms->page_ext, val); } static int __meminit online_page_ext(unsigned long start_pfn, @@ -343,6 +420,20 @@ static int __meminit offline_page_ext(unsigned long start_pfn, start = SECTION_ALIGN_DOWN(start_pfn); end = SECTION_ALIGN_UP(start_pfn + nr_pages); + /* + * Freeing of page_ext is done in 3 steps to avoid + * use-after-free of it: + * 1) Traverse all the sections and mark their page_ext + * as invalid. + * 2) Wait for all the existing users of page_ext who + * started before invalidation to finish. + * 3) Free the page_ext. + */ + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) + __invalidate_page_ext(pfn); + + synchronize_rcu(); + for (pfn = start; pfn < end; pfn += PAGES_PER_SECTION) __free_page_ext(pfn); return 0; diff --git a/mm/page_owner.c b/mm/page_owner.c index e4c6f3f1695b..90023f938c19 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -141,7 +141,7 @@ void __reset_page_owner(struct page *page, unsigned short order) struct page_owner *page_owner; u64 free_ts_nsec = local_clock(); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; @@ -153,6 +153,7 @@ void __reset_page_owner(struct page *page, unsigned short order) page_owner->free_ts_nsec = free_ts_nsec; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } static inline void __set_page_owner_handle(struct page_ext *page_ext, @@ -183,19 +184,21 @@ static inline void __set_page_owner_handle(struct page_ext *page_ext, noinline void __set_page_owner(struct page *page, unsigned short order, gfp_t gfp_mask) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; depot_stack_handle_t handle; + handle = save_stack(gfp_mask); + + page_ext = page_ext_get(page); if (unlikely(!page_ext)) return; - - handle = save_stack(gfp_mask); __set_page_owner_handle(page_ext, handle, order, gfp_mask); + page_ext_put(page_ext); } void __set_page_owner_migrate_reason(struct page *page, int reason) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -203,12 +206,13 @@ void __set_page_owner_migrate_reason(struct page *page, int reason) page_owner = get_page_owner(page_ext); page_owner->last_migrate_reason = reason; + page_ext_put(page_ext); } void __split_page_owner(struct page *page, unsigned int nr) { int i; - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get(page); struct page_owner *page_owner; if (unlikely(!page_ext)) @@ -219,17 +223,25 @@ void __split_page_owner(struct page *page, unsigned int nr) page_owner->order = 0; page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __folio_copy_owner(struct folio *newfolio, struct folio *old) { - struct page_ext *old_ext = lookup_page_ext(&old->page); - struct page_ext *new_ext = lookup_page_ext(&newfolio->page); + struct page_ext *old_ext; + struct page_ext *new_ext; struct page_owner *old_page_owner, *new_page_owner; - if (unlikely(!old_ext || !new_ext)) + old_ext = page_ext_get(&old->page); + if (unlikely(!old_ext)) return; + new_ext = page_ext_get(&newfolio->page); + if (unlikely(!new_ext)) { + page_ext_put(old_ext); + return; + } + old_page_owner = get_page_owner(old_ext); new_page_owner = get_page_owner(new_ext); new_page_owner->order = old_page_owner->order; @@ -254,6 +266,8 @@ void __folio_copy_owner(struct folio *newfolio, struct folio *old) */ __set_bit(PAGE_EXT_OWNER, &new_ext->flags); __set_bit(PAGE_EXT_OWNER_ALLOCATED, &new_ext->flags); + page_ext_put(new_ext); + page_ext_put(old_ext); } void pagetypeinfo_showmixedcount_print(struct seq_file *m, @@ -307,12 +321,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); page_mt = gfp_migratetype(page_owner->gfp_mask); @@ -323,9 +337,12 @@ void pagetypeinfo_showmixedcount_print(struct seq_file *m, count[pageblock_mt]++; pfn = block_end_pfn; + page_ext_put(page_ext); break; } pfn += (1UL << page_owner->order) - 1; +ext_put_continue: + page_ext_put(page_ext); } } @@ -435,7 +452,7 @@ err: void __dump_page_owner(const struct page *page) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext = page_ext_get((void *)page); struct page_owner *page_owner; depot_stack_handle_t handle; gfp_t gfp_mask; @@ -452,6 +469,7 @@ void __dump_page_owner(const struct page *page) if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) { pr_alert("page_owner info is not present (never set?)\n"); + page_ext_put(page_ext); return; } @@ -482,6 +500,7 @@ void __dump_page_owner(const struct page *page) if (page_owner->last_migrate_reason != -1) pr_alert("page has been migrated, last migrate reason: %s\n", migrate_reason_names[page_owner->last_migrate_reason]); + page_ext_put(page_ext); } static ssize_t @@ -497,8 +516,10 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) return -EINVAL; page = NULL; - pfn = min_low_pfn + *ppos; - + if (*ppos == 0) + pfn = min_low_pfn; + else + pfn = *ppos; /* Find a valid PFN or the start of a MAX_ORDER_NR_PAGES area */ while (!pfn_valid(pfn) && (pfn & (MAX_ORDER_NR_PAGES - 1)) != 0) pfn++; @@ -507,6 +528,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) /* Find an allocated page */ for (; pfn < max_pfn; pfn++) { + /* + * This temporary page_owner is required so + * that we can avoid the context switches while holding + * the rcu lock and copying the page owner information to + * user through copy_to_user() or GFP_KERNEL allocations. + */ + struct page_owner page_owner_tmp; + /* * If the new page is in a new MAX_ORDER_NR_PAGES area, * validate the area as existing, skip it if not @@ -525,7 +554,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) continue; } - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; @@ -534,14 +563,14 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * because we don't hold the zone lock. */ if (!test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* * Although we do have the info about past allocation of free * pages, it's not relevant for current memory usage. */ if (!test_bit(PAGE_EXT_OWNER_ALLOCATED, &page_ext->flags)) - continue; + goto ext_put_continue; page_owner = get_page_owner(page_ext); @@ -550,7 +579,7 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) * would inflate the stats. */ if (!IS_ALIGNED(pfn, 1 << page_owner->order)) - continue; + goto ext_put_continue; /* * Access to page_ext->handle isn't synchronous so we should @@ -558,18 +587,37 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) */ handle = READ_ONCE(page_owner->handle); if (!handle) - continue; + goto ext_put_continue; /* Record the next PFN to read in the file offset */ - *ppos = (pfn - min_low_pfn) + 1; + *ppos = pfn + 1; + page_owner_tmp = *page_owner; + page_ext_put(page_ext); return print_page_owner(buf, count, pfn, page, - page_owner, handle); + &page_owner_tmp, handle); +ext_put_continue: + page_ext_put(page_ext); } return 0; } +static loff_t lseek_page_owner(struct file *file, loff_t offset, int orig) +{ + switch (orig) { + case SEEK_SET: + file->f_pos = offset; + break; + case SEEK_CUR: + file->f_pos += offset; + break; + default: + return -EINVAL; + } + return file->f_pos; +} + static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { unsigned long pfn = zone->zone_start_pfn; @@ -617,18 +665,20 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) if (PageReserved(page)) continue; - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); if (unlikely(!page_ext)) continue; /* Maybe overlapping zone */ if (test_bit(PAGE_EXT_OWNER, &page_ext->flags)) - continue; + goto ext_put_continue; /* Found early allocated page */ __set_page_owner_handle(page_ext, early_handle, 0, 0); count++; +ext_put_continue: + page_ext_put(page_ext); } cond_resched(); } @@ -660,6 +710,7 @@ static void init_early_allocated_pages(void) static const struct file_operations proc_page_owner_operations = { .read = read_page_owner, + .llseek = lseek_page_owner, }; static int __init pageowner_init(void) diff --git a/mm/page_table_check.c b/mm/page_table_check.c index e2062748791a..903db62794d3 100644 --- a/mm/page_table_check.c +++ b/mm/page_table_check.c @@ -68,7 +68,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, return; page = pfn_to_page(pfn); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { @@ -83,6 +83,7 @@ static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, } page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } /* @@ -103,7 +104,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, return; page = pfn_to_page(pfn); - page_ext = lookup_page_ext(page); + page_ext = page_ext_get(page); anon = PageAnon(page); for (i = 0; i < pgcnt; i++) { @@ -118,6 +119,7 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, } page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } /* @@ -126,9 +128,10 @@ static void page_table_check_set(struct mm_struct *mm, unsigned long addr, */ void __page_table_check_zero(struct page *page, unsigned int order) { - struct page_ext *page_ext = lookup_page_ext(page); + struct page_ext *page_ext; unsigned long i; + page_ext = page_ext_get(page); BUG_ON(!page_ext); for (i = 0; i < (1ul << order); i++) { struct page_table_check *ptc = get_page_table_check(page_ext); @@ -137,6 +140,7 @@ void __page_table_check_zero(struct page *page, unsigned int order) BUG_ON(atomic_read(&ptc->file_map_count)); page_ext = page_ext_next(page_ext); } + page_ext_put(page_ext); } void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, diff --git a/mm/pagewalk.c b/mm/pagewalk.c index fa7a3d21a751..1cb8de5889c4 100644 --- a/mm/pagewalk.c +++ b/mm/pagewalk.c @@ -482,7 +482,15 @@ int walk_page_range(struct mm_struct *mm, unsigned long start, return err; } -/* +/** + * walk_page_range_novma - walk a range of pagetables not backed by a vma + * @mm: mm_struct representing the target process of page table walk + * @start: start address of the virtual address range + * @end: end address of the virtual address range + * @ops: operation to call during the walk + * @pgd: pgd to walk if different from mm->pgd + * @private: private data for callbacks' usage + * * Similar to walk_page_range() but can walk any page tables even if they are * not backed by VMAs. Because 'unusual' entries may be walked this function * will also not lock the PTEs for the pte_entry() callback. This is useful for diff --git a/mm/vmalloc.c b/mm/vmalloc.c index dd6cdb201195..e68c0081e861 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -590,7 +590,7 @@ int vmap_pages_range_noflush(unsigned long addr, unsigned long end, int err; err = vmap_range_noflush(addr, addr + (1UL << page_shift), - __pa(page_address(pages[i])), prot, + page_to_phys(pages[i]), prot, page_shift); if (err) return err;