From 3e9d80a891df3b1a5d77db47fa7fdf33ba71e5cb Mon Sep 17 00:00:00 2001 From: "Matthew Wilcox (Oracle)" Date: Fri, 14 Jan 2022 14:05:04 -0800 Subject: [PATCH 1/6] mm,fs: split dump_mapping() out from dump_page() dump_mapping() is a big chunk of dump_page(), and it'd be handy to be able to call it when we don't have a struct page. Split it out and move it to fs/inode.c. Take the opportunity to simplify some of the debug messages a little. Link: https://lkml.kernel.org/r/20211121121056.2870061-1-willy@infradead.org Signed-off-by: Matthew Wilcox (Oracle) Reviewed-by: William Kucharski Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/inode.c | 49 +++++++++++++++++++++++++++++++++++++++++++ include/linux/fs.h | 1 + mm/debug.c | 52 ++-------------------------------------------- 3 files changed, 52 insertions(+), 50 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 6b80a51129d5..980e7b7a5460 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -526,6 +526,55 @@ void __remove_inode_hash(struct inode *inode) } EXPORT_SYMBOL(__remove_inode_hash); +void dump_mapping(const struct address_space *mapping) +{ + struct inode *host; + const struct address_space_operations *a_ops; + struct hlist_node *dentry_first; + struct dentry *dentry_ptr; + struct dentry dentry; + unsigned long ino; + + /* + * If mapping is an invalid pointer, we don't want to crash + * accessing it, so probe everything depending on it carefully. + */ + if (get_kernel_nofault(host, &mapping->host) || + get_kernel_nofault(a_ops, &mapping->a_ops)) { + pr_warn("invalid mapping:%px\n", mapping); + return; + } + + if (!host) { + pr_warn("aops:%ps\n", a_ops); + return; + } + + if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || + get_kernel_nofault(ino, &host->i_ino)) { + pr_warn("aops:%ps invalid inode:%px\n", a_ops, host); + return; + } + + if (!dentry_first) { + pr_warn("aops:%ps ino:%lx\n", a_ops, ino); + return; + } + + dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); + if (get_kernel_nofault(dentry, dentry_ptr)) { + pr_warn("aops:%ps ino:%lx invalid dentry:%px\n", + a_ops, ino, dentry_ptr); + return; + } + + /* + * if dentry is corrupted, the %pd handler may still crash, + * but it's unlikely that we reach here with a corrupt mapping + */ + pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", a_ops, ino, &dentry); +} + void clear_inode(struct inode *inode) { /* diff --git a/include/linux/fs.h b/include/linux/fs.h index bbf812ce89a8..5315fa68f751 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3152,6 +3152,7 @@ extern void unlock_new_inode(struct inode *); extern void discard_new_inode(struct inode *); extern unsigned int get_next_ino(void); extern void evict_inodes(struct super_block *sb); +void dump_mapping(const struct address_space *); /* * Userspace may rely on the the inode number being non-zero. For example, glibc diff --git a/mm/debug.c b/mm/debug.c index a05a39ff8fe4..bc9ac87f0e08 100644 --- a/mm/debug.c +++ b/mm/debug.c @@ -112,56 +112,8 @@ static void __dump_page(struct page *page) type = "ksm "; else if (PageAnon(page)) type = "anon "; - else if (mapping) { - struct inode *host; - const struct address_space_operations *a_ops; - struct hlist_node *dentry_first; - struct dentry *dentry_ptr; - struct dentry dentry; - unsigned long ino; - - /* - * mapping can be invalid pointer and we don't want to crash - * accessing it, so probe everything depending on it carefully - */ - if (get_kernel_nofault(host, &mapping->host) || - get_kernel_nofault(a_ops, &mapping->a_ops)) { - pr_warn("failed to read mapping contents, not a valid kernel address?\n"); - goto out_mapping; - } - - if (!host) { - pr_warn("aops:%ps\n", a_ops); - goto out_mapping; - } - - if (get_kernel_nofault(dentry_first, &host->i_dentry.first) || - get_kernel_nofault(ino, &host->i_ino)) { - pr_warn("aops:%ps with invalid host inode %px\n", - a_ops, host); - goto out_mapping; - } - - if (!dentry_first) { - pr_warn("aops:%ps ino:%lx\n", a_ops, ino); - goto out_mapping; - } - - dentry_ptr = container_of(dentry_first, struct dentry, d_u.d_alias); - if (get_kernel_nofault(dentry, dentry_ptr)) { - pr_warn("aops:%ps ino:%lx with invalid dentry %px\n", - a_ops, ino, dentry_ptr); - } else { - /* - * if dentry is corrupted, the %pd handler may still - * crash, but it's unlikely that we reach here with a - * corrupted struct page - */ - pr_warn("aops:%ps ino:%lx dentry name:\"%pd\"\n", - a_ops, ino, &dentry); - } - } -out_mapping: + else if (mapping) + dump_mapping(mapping); BUILD_BUG_ON(ARRAY_SIZE(pageflag_names) != __NR_PAGEFLAGS + 1); pr_warn("%sflags: %pGp%s\n", type, &head->flags, From 236476180c0f5d308fb313d5570d0b067307884c Mon Sep 17 00:00:00 2001 From: Anshuman Khandual Date: Fri, 14 Jan 2022 14:05:07 -0800 Subject: [PATCH 2/6] mm/debug_vm_pgtable: update comments regarding migration swap entries Commit 4dd845b5a3e5 ("mm/swapops: rework swap entry manipulation code") had changed migtation entry related helpers. Just update debug_vm_pgatble() synced documentation to reflect those changes. Link: https://lkml.kernel.org/r/1641880417-24848-1-git-send-email-anshuman.khandual@arm.com Signed-off-by: Anshuman Khandual Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/arch_pgtable_helpers.rst | 14 +++++++------- mm/debug_vm_pgtable.c | 4 ++-- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst index 552567d863b8..b3166c33db39 100644 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ b/Documentation/vm/arch_pgtable_helpers.rst @@ -247,12 +247,12 @@ SWAP Page Table Helpers | __swp_to_pmd_entry | Creates a mapped PMD from a swapped entry (arch) | +---------------------------+--------------------------------------------------+ | is_migration_entry | Tests a migration (read or write) swapped entry | -+---------------------------+--------------------------------------------------+ -| is_write_migration_entry | Tests a write migration swapped entry | -+---------------------------+--------------------------------------------------+ -| make_migration_entry_read | Converts into read migration swapped entry | -+---------------------------+--------------------------------------------------+ -| make_migration_entry | Creates a migration swapped entry (read or write)| -+---------------------------+--------------------------------------------------+ ++-------------------------------+----------------------------------------------+ +| is_writable_migration_entry | Tests a write migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_readable_migration_entry | Creates a read migration swapped entry | ++-------------------------------+----------------------------------------------+ +| make_writable_migration_entry | Creates a write migration swapped entry | ++-------------------------------+----------------------------------------------+ [1] https://lore.kernel.org/linux-mm/20181017020930.GN30832@redhat.com/ diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 228e3954b90c..2a2b24e87877 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -888,8 +888,8 @@ static void __init swap_migration_tests(struct pgtable_debug_args *args) pr_debug("Validating swap migration\n"); /* - * make_migration_entry() expects given page to be - * locked, otherwise it stumbles upon a BUG_ON(). + * make_[readable|writable]_migration_entry() expects given page to + * be locked, otherwise it stumbles upon a BUG_ON(). */ __SetPageLocked(page); swp = make_writable_migration_entry(page_to_pfn(page)); From 43b93121056c524e2af77d561900ea856d32029c Mon Sep 17 00:00:00 2001 From: chiminghao Date: Fri, 14 Jan 2022 14:05:10 -0800 Subject: [PATCH 3/6] mm/truncate.c: remove unneeded variable Return value directly instead of taking this in another redundant variable. Link: https://lkml.kernel.org/r/20211207083222.401594-1-chi.minghao@zte.com.cn Signed-off-by: chiminghao Reported-by: Zeal Robot Reviewed-by: David Hildenbrand Reviewed-by: Pankaj Gupta Reviewed-by: Muchun Song Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/truncate.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/mm/truncate.c b/mm/truncate.c index cc83a3f7c1ad..41b8249b3b4a 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -205,7 +205,6 @@ static void truncate_cleanup_page(struct page *page) static int invalidate_complete_page(struct address_space *mapping, struct page *page) { - int ret; if (page->mapping != mapping) return 0; @@ -213,9 +212,7 @@ invalidate_complete_page(struct address_space *mapping, struct page *page) if (page_has_private(page) && !try_to_release_page(page, 0)) return 0; - ret = remove_mapping(mapping, page); - - return ret; + return remove_mapping(mapping, page); } int truncate_inode_page(struct address_space *mapping, struct page *page) From 677b2a8c1f25db5b09c1ef5bf72faa39ea81d9cf Mon Sep 17 00:00:00 2001 From: Christophe Leroy Date: Fri, 14 Jan 2022 14:05:13 -0800 Subject: [PATCH 4/6] gup: avoid multiple user access locking/unlocking in fault_in_{read/write}able fault_in_readable() and fault_in_writeable() perform __get_user() and __put_user() in a loop, implying multiple user access locking/unlocking. To avoid that, use user access blocks. Link: https://lkml.kernel.org/r/720dcf79314acca1a78fae56d478cc851952149d.1637084492.git.christophe.leroy@csgroup.eu Signed-off-by: Christophe Leroy Reviewed-by: Andreas Gruenbacher Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 2c51e9748a6a..be2a41feec7d 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1672,21 +1672,22 @@ size_t fault_in_writeable(char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_write_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__put_user(0, uaddr) != 0)) - return size; + unsafe_put_user(0, uaddr, out); uaddr = (char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__put_user(0, uaddr) != 0)) - goto out; + unsafe_put_user(0, uaddr, out); uaddr += PAGE_SIZE; } out: + user_write_access_end(); if (size > uaddr - start) return size - (uaddr - start); return 0; @@ -1771,21 +1772,22 @@ size_t fault_in_readable(const char __user *uaddr, size_t size) if (unlikely(size == 0)) return 0; + if (!user_read_access_begin(uaddr, size)) + return size; if (!PAGE_ALIGNED(uaddr)) { - if (unlikely(__get_user(c, uaddr) != 0)) - return size; + unsafe_get_user(c, uaddr, out); uaddr = (const char __user *)PAGE_ALIGN((unsigned long)uaddr); } end = (const char __user *)PAGE_ALIGN((unsigned long)start + size); if (unlikely(end < start)) end = NULL; while (uaddr != end) { - if (unlikely(__get_user(c, uaddr) != 0)) - goto out; + unsafe_get_user(c, uaddr, out); uaddr += PAGE_SIZE; } out: + user_read_access_end(); (void)c; if (size > uaddr - start) return size - (uaddr - start); From 28b0ee3fb35047bd2bac57cc5a051b26bbd9b194 Mon Sep 17 00:00:00 2001 From: Li Xinhai Date: Fri, 14 Jan 2022 14:05:16 -0800 Subject: [PATCH 5/6] mm/gup.c: stricter check on THP migration entry during follow_pmd_mask When BUG_ON check for THP migration entry, the existing code only check thp_migration_supported case, but not for !thp_migration_supported case. If !thp_migration_supported() and !pmd_present(), the original code may dead loop in theory. To make the BUG_ON check consistent, we need catch both cases. Move the BUG_ON check one step earlier, because if the bug happen we should know it instead of depend on FOLL_MIGRATION been used by caller. Because pmdval instead of *pmd is read by the is_pmd_migration_entry() check, the existing code don't help to avoid useless locking within pmd_migration_entry_wait(), so remove that check. Link: https://lkml.kernel.org/r/20211217062559.737063-1-lixinhai.lxh@gmail.com Signed-off-by: Li Xinhai Reviewed-by: "Huang, Ying" Reviewed-by: Miaohe Lin Cc: Zi Yan Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index be2a41feec7d..f0af462ac1e2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -642,12 +642,17 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, } retry: if (!pmd_present(pmdval)) { + /* + * Should never reach here, if thp migration is not supported; + * Otherwise, it must be a thp migration entry. + */ + VM_BUG_ON(!thp_migration_supported() || + !is_pmd_migration_entry(pmdval)); + if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); - VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(pmdval)); - if (is_pmd_migration_entry(pmdval)) - pmd_migration_entry_wait(mm, pmd); + + pmd_migration_entry_wait(mm, pmd); pmdval = READ_ONCE(*pmd); /* * MADV_DONTNEED may convert the pmd to null because From a7605426666196c5a460dd3de6f8dac1d3c21f00 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Fri, 14 Jan 2022 14:05:19 -0800 Subject: [PATCH 6/6] mm: shmem: don't truncate page if memory failure happens The current behavior of memory failure is to truncate the page cache regardless of dirty or clean. If the page is dirty the later access will get the obsolete data from disk without any notification to the users. This may cause silent data loss. It is even worse for shmem since shmem is in-memory filesystem, truncating page cache means discarding data blocks. The later read would return all zero. The right approach is to keep the corrupted page in page cache, any later access would return error for syscalls or SIGBUS for page fault, until the file is truncated, hole punched or removed. The regular storage backed filesystems would be more complicated so this patch is focused on shmem. This also unblock the support for soft offlining shmem THP. [akpm@linux-foundation.org: coding style fixes] [arnd@arndb.de: fix uninitialized variable use in me_pagecache_clean()] Link: https://lkml.kernel.org/r/20211022064748.4173718-1-arnd@kernel.org [Fix invalid pointer dereference in shmem_read_mapping_page_gfp() with a slight different implementation from what Ajay Garg and Muchun Song proposed and reworked the error handling of shmem_write_begin() suggested by Linus] Link: https://lore.kernel.org/linux-mm/20211111084617.6746-1-ajaygargnsit@gmail.com/ Link: https://lkml.kernel.org/r/20211020210755.23964-6-shy828301@gmail.com Link: https://lkml.kernel.org/r/20211116193247.21102-1-shy828301@gmail.com Signed-off-by: Yang Shi Signed-off-by: Arnd Bergmann Cc: Hugh Dickins Cc: Kirill A. Shutemov Cc: Matthew Wilcox Cc: Naoya Horiguchi Cc: Oscar Salvador Cc: Peter Xu Cc: Ajay Garg Cc: Muchun Song Cc: Andy Lavr Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory-failure.c | 14 ++++++++++--- mm/shmem.c | 51 +++++++++++++++++++++++++++++++++++++++------ mm/userfaultfd.c | 5 +++++ 3 files changed, 61 insertions(+), 9 deletions(-) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 3a274468f193..5f8ad5527506 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -58,6 +58,7 @@ #include #include #include +#include #include "internal.h" #include "ras/ras_event.h" @@ -867,6 +868,7 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) { int ret; struct address_space *mapping; + bool extra_pins; delete_from_lru_cache(p); @@ -895,18 +897,24 @@ static int me_pagecache_clean(struct page_state *ps, struct page *p) goto out; } + /* + * The shmem page is kept in page cache instead of truncating + * so is expected to have an extra refcount after error-handling. + */ + extra_pins = shmem_mapping(mapping); + /* * Truncation is a bit tricky. Enable it per file system for now. * * Open: to take i_rwsem or not for this? Right now we don't. */ ret = truncate_error_page(p, page_to_pfn(p), mapping); + if (has_extra_refcount(ps, p, extra_pins)) + ret = MF_FAILED; + out: unlock_page(p); - if (has_extra_refcount(ps, p, false)) - ret = MF_FAILED; - return ret; } diff --git a/mm/shmem.c b/mm/shmem.c index 18f93c2d68f1..fb5152369926 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2457,6 +2457,7 @@ shmem_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct shmem_inode_info *info = SHMEM_I(inode); pgoff_t index = pos >> PAGE_SHIFT; + int ret = 0; /* i_rwsem is held by caller */ if (unlikely(info->seals & (F_SEAL_GROW | @@ -2467,7 +2468,19 @@ shmem_write_begin(struct file *file, struct address_space *mapping, return -EPERM; } - return shmem_getpage(inode, index, pagep, SGP_WRITE); + ret = shmem_getpage(inode, index, pagep, SGP_WRITE); + + if (ret) + return ret; + + if (PageHWPoison(*pagep)) { + unlock_page(*pagep); + put_page(*pagep); + *pagep = NULL; + return -EIO; + } + + return 0; } static int @@ -2554,6 +2567,12 @@ static ssize_t shmem_file_read_iter(struct kiocb *iocb, struct iov_iter *to) if (sgp == SGP_CACHE) set_page_dirty(page); unlock_page(page); + + if (PageHWPoison(page)) { + put_page(page); + error = -EIO; + break; + } } /* @@ -3093,7 +3112,8 @@ static const char *shmem_get_link(struct dentry *dentry, page = find_get_page(inode->i_mapping, 0); if (!page) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { + if (PageHWPoison(page) || + !PageUptodate(page)) { put_page(page); return ERR_PTR(-ECHILD); } @@ -3101,6 +3121,13 @@ static const char *shmem_get_link(struct dentry *dentry, error = shmem_getpage(inode, 0, &page, SGP_READ); if (error) return ERR_PTR(error); + if (!page) + return ERR_PTR(-ECHILD); + if (PageHWPoison(page)) { + unlock_page(page); + put_page(page); + return ERR_PTR(-ECHILD); + } unlock_page(page); } set_delayed_call(done, shmem_put_link, page); @@ -3751,6 +3778,13 @@ static void shmem_destroy_inodecache(void) kmem_cache_destroy(shmem_inode_cachep); } +/* Keep the page in page cache instead of truncating it */ +static int shmem_error_remove_page(struct address_space *mapping, + struct page *page) +{ + return 0; +} + const struct address_space_operations shmem_aops = { .writepage = shmem_writepage, .set_page_dirty = __set_page_dirty_no_writeback, @@ -3761,7 +3795,7 @@ const struct address_space_operations shmem_aops = { #ifdef CONFIG_MIGRATION .migratepage = migrate_page, #endif - .error_remove_page = generic_error_remove_page, + .error_remove_page = shmem_error_remove_page, }; EXPORT_SYMBOL(shmem_aops); @@ -4169,9 +4203,14 @@ struct page *shmem_read_mapping_page_gfp(struct address_space *mapping, error = shmem_getpage_gfp(inode, index, &page, SGP_CACHE, gfp, NULL, NULL, NULL); if (error) - page = ERR_PTR(error); - else - unlock_page(page); + return ERR_PTR(error); + + unlock_page(page); + if (PageHWPoison(page)) { + put_page(page); + return ERR_PTR(-EIO); + } + return page; #else /* diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index ac6f036298cd..0780c2a57ff1 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -232,6 +232,11 @@ static int mcontinue_atomic_pte(struct mm_struct *dst_mm, goto out; } + if (PageHWPoison(page)) { + ret = -EIO; + goto out_release; + } + ret = mfill_atomic_install_pte(dst_mm, dst_pmd, dst_vma, dst_addr, page, false, wp_copy); if (ret)