From 2067231a9e2cbbcae0a4aca6ac36ff2dd6a7b701 Mon Sep 17 00:00:00 2001 From: Sun Ke Date: Fri, 12 Aug 2022 09:14:40 +0800 Subject: [PATCH 01/39] NFS: Fix missing unlock in nfs_unlink() Add the missing unlock before goto. Fixes: 3c59366c207e ("NFS: don't unhash dentry during unlink/rename") Signed-off-by: Sun Ke Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index dbab3caa15ed..1b879584d4fe 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2484,8 +2484,10 @@ int nfs_unlink(struct inode *dir, struct dentry *dentry) */ error = -ETXTBSY; if (WARN_ON(dentry->d_flags & DCACHE_NFSFS_RENAMED) || - WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) + WARN_ON(dentry->d_fsdata == NFS_FSDATA_BLOCKED)) { + spin_unlock(&dentry->d_lock); goto out; + } if (dentry->d_fsdata) /* old devname */ kfree(dentry->d_fsdata); From 67f4b5dc49913abcdb5cc736e73674e2f352f81d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:22:25 -0400 Subject: [PATCH 02/39] NFS: Fix another fsync() issue after a server reboot Currently, when the writeback code detects a server reboot, it redirties any pages that were not committed to disk, and it sets the flag NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor that dirtied the file. While this allows the file descriptor in question to redrive its own writes, it violates the fsync() requirement that we should be synchronising all writes to disk. While the problem is infrequent, we do see corner cases where an untimely server reboot causes the fsync() call to abandon its attempt to sync data to disk and causing data corruption issues due to missed error conditions or similar. In order to tighted up the client's ability to deal with this situation without introducing livelocks, add a counter that records the number of times pages are redirtied due to a server reboot-like condition, and use that in fsync() to redrive the sync to disk. Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted") Cc: stable@vger.kernel.org Signed-off-by: Trond Myklebust --- fs/nfs/file.c | 15 ++++++--------- fs/nfs/inode.c | 1 + fs/nfs/write.c | 6 ++++-- include/linux/nfs_fs.h | 1 + 4 files changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 54237a231687..749e5487df50 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -221,8 +221,10 @@ nfs_file_fsync_commit(struct file *file, int datasync) int nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - struct nfs_open_context *ctx = nfs_file_open_context(file); struct inode *inode = file_inode(file); + struct nfs_inode *nfsi = NFS_I(inode); + long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages); + long nredirtied; int ret; trace_nfs_fsync_enter(inode); @@ -237,15 +239,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = pnfs_sync_inode(inode, !!datasync); if (ret != 0) break; - if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags)) + nredirtied = atomic_long_read(&nfsi->redirtied_pages); + if (nredirtied == save_nredirtied) break; - /* - * If nfs_file_fsync_commit detected a server reboot, then - * resend all dirty pages that might have been covered by - * the NFS_CONTEXT_RESEND_WRITES flag - */ - start = 0; - end = LLONG_MAX; + save_nredirtied = nredirtied; } trace_nfs_fsync_exit(inode, ret); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index b4e46b0ffa2d..bea7c005119c 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -426,6 +426,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh) static void nfs_inode_init_regular(struct nfs_inode *nfsi) { atomic_long_set(&nfsi->nrequests, 0); + atomic_long_set(&nfsi->redirtied_pages, 0); INIT_LIST_HEAD(&nfsi->commit_info.list); atomic_long_set(&nfsi->commit_info.ncommit, 0); atomic_set(&nfsi->commit_info.rpcs_out, 0); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 4a3796811b4b..989c734cf91d 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -1420,10 +1420,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr, */ static void nfs_redirty_request(struct nfs_page *req) { + struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host); + /* Bump the transmission count */ req->wb_nio++; nfs_mark_request_dirty(req); - set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags); + atomic_long_inc(&nfsi->redirtied_pages); nfs_end_page_writeback(req); nfs_release_request(req); } @@ -1904,7 +1906,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a mismatch. Write the page again */ dprintk_cont(" mismatch\n"); nfs_mark_request_dirty(req); - set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags); + atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: nfs_unlock_and_release_request(req); /* Latency breaker */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b32ed68e7dc4..f08e581f0161 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -182,6 +182,7 @@ struct nfs_inode { /* Regular file */ struct { atomic_long_t nrequests; + atomic_long_t redirtied_pages; struct nfs_mds_commit_info commit_info; struct mutex commit_mutex; }; From edf79efcc9d0cf38fcc1efe688fd697b9bb0ddc4 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:46:35 -0400 Subject: [PATCH 03/39] NFS: Remove a bogus flag setting in pnfs_write_done_resend_to_mds Since pnfs_write_done_resend_to_mds() does not actually call end_page_writeback() on the pages that are being redirected to the metadata server, callers of fsync() do not see the I/O as complete until the writeback to the MDS finishes. We therefore do not need to set NFS_CONTEXT_RESEND_WRITES, since there is nothing to redrive. Signed-off-by: Trond Myklebust --- fs/nfs/pnfs.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 41a9b6b58fb9..2613b7e36eb9 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -2817,7 +2817,6 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *hdr) /* Resend all requests through the MDS */ nfs_pageio_init_write(&pgio, hdr->inode, FLUSH_STABLE, true, hdr->completion_ops); - set_bit(NFS_CONTEXT_RESEND_WRITES, &hdr->args.context->flags); return nfs_pageio_resend(&pgio, hdr); } EXPORT_SYMBOL_GPL(pnfs_write_done_resend_to_mds); From 5f6277a0c15e1ea54b6fd3d78c9fff7bfe42556c Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Sat, 13 Aug 2022 08:51:45 -0400 Subject: [PATCH 04/39] NFS: Cleanup to remove unused flag NFS_CONTEXT_RESEND_WRITES Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f08e581f0161..7931fa472561 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -83,7 +83,6 @@ struct nfs_open_context { fmode_t mode; unsigned long flags; -#define NFS_CONTEXT_RESEND_WRITES (1) #define NFS_CONTEXT_BAD (2) #define NFS_CONTEXT_UNLOCK (3) #define NFS_CONTEXT_FILE_OPEN (4) From d52788b3da2750ebc617891cf260b8f8912f522b Mon Sep 17 00:00:00 2001 From: David Gow Date: Fri, 15 Jul 2022 12:03:54 +0800 Subject: [PATCH 05/39] mmc: sdhci-of-aspeed: test: Fix dependencies when KUNIT=m While the sdhci-of-aspeed KUnit tests do work when builtin, and do work when KUnit itself is being built as a module, the two together break. This is because the KUnit tests (understandably) depend on KUnit, so a built-in test cannot build if KUnit is a module. Fix this by adding a dependency on (MMC_SDHCI_OF_ASPEED=m || KUNIT=y), which only excludes this one problematic configuration. This was reported on a nasty openrisc-randconfig run by the kernel test robot, though for some reason (compiler optimisations removing the test code?) I wasn't able to reproduce it locally on x86: https://lore.kernel.org/linux-mm/202207140122.fzhlf60k-lkp@intel.com/T/ Fixes: 291cd54e5b05 ("mmc: sdhci-of-aspeed: test: Use kunit_test_suite() macro") Reported-by: kernel test robot Signed-off-by: David Gow Acked-by: Andrew Jeffery Acked-by: Ulf Hansson Acked-by: Brendan Higgins Signed-off-by: Shuah Khan --- drivers/mmc/host/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 10c563999d3d..e63608834411 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -171,6 +171,7 @@ config MMC_SDHCI_OF_ASPEED config MMC_SDHCI_OF_ASPEED_TEST bool "Tests for the ASPEED SDHCI driver" if !KUNIT_ALL_TESTS depends on MMC_SDHCI_OF_ASPEED && KUNIT + depends on (MMC_SDHCI_OF_ASPEED=m || KUNIT=y) default KUNIT_ALL_TESTS help Enable KUnit tests for the ASPEED SDHCI driver. Select this From 41a55567b9e31cb852670684404654ec4fd0d8d6 Mon Sep 17 00:00:00 2001 From: David Gow Date: Wed, 13 Jul 2022 08:52:20 +0800 Subject: [PATCH 06/39] module: kunit: Load .kunit_test_suites section when CONFIG_KUNIT=m The new KUnit module handling has KUnit test suites listed in a .kunit_test_suites section of each module. This should be loaded when the module is, but at the moment this only happens if KUnit is built-in. Also load this when KUnit is enabled as a module: it'll not be usable unless KUnit is loaded, but such modules are likely to depend on KUnit anyway, so it's unlikely to ever be loaded needlessly. Fixes: 3d6e44623841 ("kunit: unify module and builtin suite definitions") Signed-off-by: David Gow Reviewed-by: Brendan Higgins Tested-by: Geert Uytterhoeven Signed-off-by: Shuah Khan --- kernel/module/main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/module/main.c b/kernel/module/main.c index 6a477c622544..a4e4d84b6f4e 100644 --- a/kernel/module/main.c +++ b/kernel/module/main.c @@ -2099,7 +2099,7 @@ static int find_module_sections(struct module *mod, struct load_info *info) sizeof(*mod->static_call_sites), &mod->num_static_call_sites); #endif -#ifdef CONFIG_KUNIT +#if IS_ENABLED(CONFIG_KUNIT) mod->kunit_suites = section_objs(info, ".kunit_test_suites", sizeof(*mod->kunit_suites), &mod->num_kunit_suites); From 5f4d1fd5b5d3506759b5d9cf20bb5fb5b8bdcab1 Mon Sep 17 00:00:00 2001 From: Kristen Carlson Accardi Date: Fri, 12 Aug 2022 11:07:13 -0700 Subject: [PATCH 07/39] selftests/sgx: Ignore OpenSSL 3.0 deprecated functions warning OpenSSL 3.0 deprecates some of the functions used in the SGX selftests, causing build errors on new distros. For now ignore the warnings until support for the functions is no longer available and mark FIXME so that it can be clear this should be removed at some point. Signed-off-by: Kristen Carlson Accardi Reviewed-by: Jarkko Sakkinen Signed-off-by: Shuah Khan --- tools/testing/selftests/sgx/sigstruct.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tools/testing/selftests/sgx/sigstruct.c b/tools/testing/selftests/sgx/sigstruct.c index 50c5ab1aa6fa..a07896a46364 100644 --- a/tools/testing/selftests/sgx/sigstruct.c +++ b/tools/testing/selftests/sgx/sigstruct.c @@ -17,6 +17,12 @@ #include "defines.h" #include "main.h" +/* + * FIXME: OpenSSL 3.0 has deprecated some functions. For now just ignore + * the warnings. + */ +#pragma GCC diagnostic ignored "-Wdeprecated-declarations" + struct q1q2_ctx { BN_CTX *bn_ctx; BIGNUM *m; From abfcf55d8b07a990589301bc64d82a5d26680956 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 16 Aug 2022 13:35:13 +0200 Subject: [PATCH 08/39] acl: handle idmapped mounts for idmapped filesystems Ensure that POSIX ACLs checking, getting, and setting works correctly for filesystems mountable with a filesystem idmapping ("fs_idmapping") that want to support idmapped mounts ("mnt_idmapping"). Note that no filesystems mountable with an fs_idmapping do yet support idmapped mounts. This is required infrastructure work to unblock this. As we explained in detail in [1] the fs_idmapping is irrelevant for getxattr() and setxattr() when mapping the ACL_{GROUP,USER} {g,u}ids stored in the uapi struct posix_acl_xattr_entry in posix_acl_fix_xattr_{from,to}_user(). But for acl_permission_check() and posix_acl_{g,s}etxattr_idmapped_mnt() the fs_idmapping matters. acl_permission_check(): During lookup POSIX ACLs are retrieved directly via i_op->get_acl() and are returned via the kernel internal struct posix_acl which contains e_{g,u}id members of type k{g,u}id_t that already take the fs_idmapping into acccount. For example, a POSIX ACL stored with u4 on the backing store is mapped to k10000004 in the fs_idmapping. The mnt_idmapping remaps the POSIX ACL to k20000004. In order to do that the fs_idmapping needs to be taken into account but that doesn't happen yet (Again, this is a counterfactual currently as fuse doesn't support idmapped mounts currently. It's just used as a convenient example.): fs_idmapping: u0:k10000000:r65536 mnt_idmapping: u0:v20000000:r65536 ACL_USER: k10000004 acl_permission_check() -> check_acl() -> get_acl() -> i_op->get_acl() == fuse_get_acl() -> posix_acl_from_xattr(u0:k10000000:r65536 /* fs_idmapping */, ...) { k10000004 = make_kuid(u0:k10000000:r65536 /* fs_idmapping */, u4 /* ACL_USER */); } -> posix_acl_permission() { -1 = make_vfsuid(u0:v20000000:r65536 /* mnt_idmapping */, &init_user_ns, k10000004); vfsuid_eq_kuid(-1, k10000004 /* caller_fsuid */) } In order to correctly map from the fs_idmapping into mnt_idmapping we require the relevant fs_idmaping to be passed: acl_permission_check() -> check_acl() -> get_acl() -> i_op->get_acl() == fuse_get_acl() -> posix_acl_from_xattr(u0:k10000000:r65536 /* fs_idmapping */, ...) { k10000004 = make_kuid(u0:k10000000:r65536 /* fs_idmapping */, u4 /* ACL_USER */); } -> posix_acl_permission() { v20000004 = make_vfsuid(u0:v20000000:r65536 /* mnt_idmapping */, u0:k10000000:r65536 /* fs_idmapping */, k10000004); vfsuid_eq_kuid(v20000004, k10000004 /* caller_fsuid */) } The initial_idmapping is only correct for the current situation because all filesystems that currently support idmapped mounts do not support being mounted with an fs_idmapping. Note that ovl_get_acl() is used to retrieve the POSIX ACLs from the relevant lower layer and the lower layer's mnt_idmapping needs to be taken into account and so does the fs_idmapping. See 0c5fd887d2bb ("acl: move idmapped mount fixup into vfs_{g,s}etxattr()") for more details. For posix_acl_{g,s}etxattr_idmapped_mnt() it is not as obvious why the fs_idmapping matters as it is for acl_permission_check(). Especially because it doesn't matter for posix_acl_fix_xattr_{from,to}_user() (See [1] for more context.). Because posix_acl_{g,s}etxattr_idmapped_mnt() operate on the uapi struct posix_acl_xattr_entry which contains {g,u}id_t values and thus give the impression that the fs_idmapping is irrelevant as at this point appropriate {g,u}id_t values have seemlingly been generated. As we've stated multiple times this assumption is wrong and in fact the uapi struct posix_acl_xattr_entry is taking idmappings into account depending at what place it is operated on. posix_acl_getxattr_idmapped_mnt() When posix_acl_getxattr_idmapped_mnt() is called the values stored in the uapi struct posix_acl_xattr_entry are mapped according to the fs_idmapping. This happened when they were read from the backing store and then translated from struct posix_acl into the uapi struct posix_acl_xattr_entry during posix_acl_to_xattr(). In other words, the fs_idmapping matters as the values stored as {g,u}id_t in the uapi struct posix_acl_xattr_entry have been generated by it. So we need to take the fs_idmapping into account during make_vfsuid() in posix_acl_getxattr_idmapped_mnt(). posix_acl_setxattr_idmapped_mnt() When posix_acl_setxattr_idmapped_mnt() is called the values stored as {g,u}id_t in uapi struct posix_acl_xattr_entry are intended to be the values that ultimately get turned back into a k{g,u}id_t in posix_acl_from_xattr() (which turns the uapi struct posix_acl_xattr_entry into the kernel internal struct posix_acl). In other words, the fs_idmapping matters as the values stored as {g,u}id_t in the uapi struct posix_acl_xattr_entry are intended to be the values that will be undone in the fs_idmapping when writing to the backing store. So we need to take the fs_idmapping into account during from_vfsuid() in posix_acl_setxattr_idmapped_mnt(). Link: https://lore.kernel.org/all/20220801145520.1532837-1-brauner@kernel.org [1] Fixes: 0c5fd887d2bb ("acl: move idmapped mount fixup into vfs_{g,s}etxattr()") Cc: Seth Forshee Signed-off-by: Christian Brauner (Microsoft) Reviewed-by: Seth Forshee Link: https://lore.kernel.org/r/20220816113514.43304-1-brauner@kernel.org --- fs/overlayfs/inode.c | 11 +++++++---- fs/posix_acl.c | 15 +++++++++------ 2 files changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index b45fea69fff3..0fbcb590af84 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -460,9 +460,12 @@ ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) * of the POSIX ACLs retrieved from the lower layer to this function to not * alter the POSIX ACLs for the underlying filesystem. */ -static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, +static void ovl_idmap_posix_acl(struct inode *realinode, + struct user_namespace *mnt_userns, struct posix_acl *acl) { + struct user_namespace *fs_userns = i_user_ns(realinode); + for (unsigned int i = 0; i < acl->a_count; i++) { vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -470,11 +473,11 @@ static void ovl_idmap_posix_acl(struct user_namespace *mnt_userns, struct posix_acl_entry *e = &acl->a_entries[i]; switch (e->e_tag) { case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, e->e_uid); + vfsuid = make_vfsuid(mnt_userns, fs_userns, e->e_uid); e->e_uid = vfsuid_into_kuid(vfsuid); break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, e->e_gid); + vfsgid = make_vfsgid(mnt_userns, fs_userns, e->e_gid); e->e_gid = vfsgid_into_kgid(vfsgid); break; } @@ -536,7 +539,7 @@ struct posix_acl *ovl_get_acl(struct inode *inode, int type, bool rcu) if (!clone) clone = ERR_PTR(-ENOMEM); else - ovl_idmap_posix_acl(mnt_user_ns(realpath.mnt), clone); + ovl_idmap_posix_acl(realinode, mnt_user_ns(realpath.mnt), clone); /* * Since we're not in RCU path walk we always need to release the * original ACLs. diff --git a/fs/posix_acl.c b/fs/posix_acl.c index 1d17d7b13dcd..5af33800743e 100644 --- a/fs/posix_acl.c +++ b/fs/posix_acl.c @@ -361,6 +361,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, const struct posix_acl *acl, int want) { const struct posix_acl_entry *pa, *pe, *mask_obj; + struct user_namespace *fs_userns = i_user_ns(inode); int found = 0; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -376,7 +377,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, goto check_perm; break; case ACL_USER: - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, + vfsuid = make_vfsuid(mnt_userns, fs_userns, pa->e_uid); if (vfsuid_eq_kuid(vfsuid, current_fsuid())) goto mask; @@ -390,7 +391,7 @@ posix_acl_permission(struct user_namespace *mnt_userns, struct inode *inode, } break; case ACL_GROUP: - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, + vfsgid = make_vfsgid(mnt_userns, fs_userns, pa->e_gid); if (vfsgid_in_group_p(vfsgid)) { found = 1; @@ -736,6 +737,7 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + struct user_namespace *fs_userns = i_user_ns(inode); int count; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -753,13 +755,13 @@ void posix_acl_getxattr_idmapped_mnt(struct user_namespace *mnt_userns, switch (le16_to_cpu(entry->e_tag)) { case ACL_USER: uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsuid = make_vfsuid(mnt_userns, &init_user_ns, uid); + vfsuid = make_vfsuid(mnt_userns, fs_userns, uid); entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, vfsuid_into_kuid(vfsuid))); break; case ACL_GROUP: gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); - vfsgid = make_vfsgid(mnt_userns, &init_user_ns, gid); + vfsgid = make_vfsgid(mnt_userns, fs_userns, gid); entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, vfsgid_into_kgid(vfsgid))); break; @@ -775,6 +777,7 @@ void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, { struct posix_acl_xattr_header *header = value; struct posix_acl_xattr_entry *entry = (void *)(header + 1), *end; + struct user_namespace *fs_userns = i_user_ns(inode); int count; vfsuid_t vfsuid; vfsgid_t vfsgid; @@ -793,13 +796,13 @@ void posix_acl_setxattr_idmapped_mnt(struct user_namespace *mnt_userns, case ACL_USER: uid = make_kuid(&init_user_ns, le32_to_cpu(entry->e_id)); vfsuid = VFSUIDT_INIT(uid); - uid = from_vfsuid(mnt_userns, &init_user_ns, vfsuid); + uid = from_vfsuid(mnt_userns, fs_userns, vfsuid); entry->e_id = cpu_to_le32(from_kuid(&init_user_ns, uid)); break; case ACL_GROUP: gid = make_kgid(&init_user_ns, le32_to_cpu(entry->e_id)); vfsgid = VFSGIDT_INIT(gid); - gid = from_vfsgid(mnt_userns, &init_user_ns, vfsgid); + gid = from_vfsgid(mnt_userns, fs_userns, vfsgid); entry->e_id = cpu_to_le32(from_kgid(&init_user_ns, gid)); break; default: From ddc84c90538e1fdb0721cd41c313944c38449a34 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Tue, 16 Aug 2022 13:35:14 +0200 Subject: [PATCH 09/39] MAINTAINERS: update idmapping tree Since Seth joined as a maintainer in ba40a57ff08b ("Add Seth Forshee as co-maintainer for idmapped mounts") it was best to get a shared git tree instead of using our personal repositories. So we requested and Konstantin suggested and gave us a new "idmapping" repository under the pre-existing but mainly unused vfs namespace. Just makes it easier for Seth to send fixes in case I'm out or someone else ever takes over. Cc: Seth Forshee Signed-off-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220816113514.43304-2-brauner@kernel.org --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 8a5012ba6ff9..a558794dddf9 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -9780,7 +9780,7 @@ M: Christian Brauner M: Seth Forshee L: linux-fsdevel@vger.kernel.org S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git +T: git://git.kernel.org/pub/scm/linux/kernel/git/vfs/idmapping.git F: Documentation/filesystems/idmappings.rst F: tools/testing/selftests/mount_setattr/ F: include/linux/mnt_idmapping.h From bf1ac16edf6770a92bc75cf2373f1f9feea398a4 Mon Sep 17 00:00:00 2001 From: Seth Forshee Date: Tue, 16 Aug 2022 11:47:52 -0500 Subject: [PATCH 10/39] fs: require CAP_SYS_ADMIN in target namespace for idmapped mounts Idmapped mounts should not allow a user to map file ownsership into a range of ids which is not under the control of that user. However, we currently don't check whether the mounter is privileged wrt to the target user namespace. Currently no FS_USERNS_MOUNT filesystems support idmapped mounts, thus this is not a problem as only CAP_SYS_ADMIN in init_user_ns is allowed to set up idmapped mounts. But this could change in the future, so add a check to refuse to create idmapped mounts when the mounter does not have CAP_SYS_ADMIN in the target user namespace. Fixes: bd303368b776 ("fs: support mapped mounts of mapped filesystems") Signed-off-by: Seth Forshee Reviewed-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220816164752.2595240-1-sforshee@digitalocean.com Signed-off-by: Christian Brauner (Microsoft) --- fs/namespace.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/namespace.c b/fs/namespace.c index 68789f896f08..df137ba19d37 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -4238,6 +4238,13 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, err = -EPERM; goto out_fput; } + + /* We're not controlling the target namespace. */ + if (!ns_capable(mnt_userns, CAP_SYS_ADMIN)) { + err = -EPERM; + goto out_fput; + } + kattr->mnt_userns = get_user_ns(mnt_userns); out_fput: From 932c29a10d5d0bba63b9f505a8ec1e3ce8c02542 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 17 Aug 2022 19:41:27 +0100 Subject: [PATCH 11/39] locks: Fix dropped call to ->fl_release_private() Prior to commit 4149be7bda7e, sys_flock() would allocate the file_lock struct it was going to use to pass parameters, call ->flock() and then call locks_free_lock() to get rid of it - which had the side effect of calling locks_release_private() and thus ->fl_release_private(). With commit 4149be7bda7e, however, this is no longer the case: the struct is now allocated on the stack, and locks_free_lock() is no longer called - and thus any remaining private data doesn't get cleaned up either. This causes afs flock to cause oops. Kasan catches this as a UAF by the list_del_init() in afs_fl_release_private() for the file_lock record produced by afs_fl_copy_lock() as the original record didn't get delisted. It can be reproduced using the generic/504 xfstest. Fix this by reinstating the locks_release_private() call in sys_flock(). I'm not sure if this would affect any other filesystems. If not, then the release could be done in afs_flock() instead. Changes ======= ver #2) - Don't need to call ->fl_release_private() after calling the security hook, only after calling ->flock(). Fixes: 4149be7bda7e ("fs/lock: Don't allocate file_lock in flock_make_lock().") cc: Chuck Lever cc: Jeff Layton cc: Marc Dionne cc: linux-afs@lists.infradead.org cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/166075758809.3532462.13307935588777587536.stgit@warthog.procyon.org.uk/ # v1 Acked-by: Kuniyuki Iwashima Signed-off-by: David Howells Signed-off-by: Jeff Layton --- fs/locks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/locks.c b/fs/locks.c index c266cfdc3291..607f94a0e789 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2129,6 +2129,7 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) else error = locks_lock_file_wait(f.file, &fl); + locks_release_private(&fl); out_putf: fdput(f); From 41191cf6bf565f4139046d7be68ec30c290af92d Mon Sep 17 00:00:00 2001 From: Stefan Roesch Date: Tue, 16 Aug 2022 08:31:58 -0700 Subject: [PATCH 12/39] fs: __file_remove_privs(): restore call to inode_has_no_xattr() This restores the call to inode_has_no_xattr() in the function __file_remove_privs(). In case the dentry_meeds_remove_privs() returned 0, the function inode_has_no_xattr() was not called. Signed-off-by: Stefan Roesch Fixes: faf99b563558 ("fs: add __remove_file_privs() with flags parameter") Reviewed-by: Christian Brauner (Microsoft) Link: https://lore.kernel.org/r/20220816153158.1925040-1-shr@fb.com Signed-off-by: Christian Brauner (Microsoft) --- fs/inode.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/inode.c b/fs/inode.c index 6462276dfdf0..ba1de23c13c1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -2018,23 +2018,25 @@ static int __file_remove_privs(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); - int error; + int error = 0; int kill; if (IS_NOSEC(inode) || !S_ISREG(inode->i_mode)) return 0; kill = dentry_needs_remove_privs(dentry); - if (kill <= 0) + if (kill < 0) return kill; - if (flags & IOCB_NOWAIT) - return -EAGAIN; + if (kill) { + if (flags & IOCB_NOWAIT) + return -EAGAIN; + + error = __remove_privs(file_mnt_user_ns(file), dentry, kill); + } - error = __remove_privs(file_mnt_user_ns(file), dentry, kill); if (!error) inode_has_no_xattr(inode); - return error; } From bdbf0617bbc3641af158d1aeffeebb1505f76263 Mon Sep 17 00:00:00 2001 From: Axel Rasmussen Date: Fri, 19 Aug 2022 12:19:28 -0700 Subject: [PATCH 13/39] selftests/vm: fix inability to build any vm tests When we stopped using KSFT_KHDR_INSTALL, a side effect is we also changed the value of `top_srcdir`. This can be seen by looking at the code removed by commit 49de12ba06ef ("selftests: drop KSFT_KHDR_INSTALL make target"). (Note though that this commit didn't break this, technically the one before it did since that's the one that stopped KSFT_KHDR_INSTALL from being used, even though the code was still there.) Previously lib.mk reconfigured `top_srcdir` when KSFT_KHDR_INSTALL was being used. Now, that's no longer the case. As a result, the path to gup_test.h in vm/Makefile was wrong, and since it's a dependency of all of the vm binaries none of them could be built. Instead, we'd get an "error" like: make[1]: *** No rule to make target '/[...]/tools/testing/selftests/vm/compaction_test', needed by 'all'. Stop. So, modify lib.mk so it once again sets top_srcdir to the root of the kernel tree. Fixes: f2745dc0ba3d ("selftests: stop using KSFT_KHDR_INSTALL") Signed-off-by: Axel Rasmussen Signed-off-by: Shuah Khan --- tools/testing/selftests/lib.mk | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/testing/selftests/lib.mk b/tools/testing/selftests/lib.mk index 947fc72413e9..d44c72b3abe3 100644 --- a/tools/testing/selftests/lib.mk +++ b/tools/testing/selftests/lib.mk @@ -40,6 +40,7 @@ ifeq (0,$(MAKELEVEL)) endif endif selfdir = $(realpath $(dir $(filter %/lib.mk,$(MAKEFILE_LIST)))) +top_srcdir = $(selfdir)/../../.. # The following are built by lib.mk common compile rules. # TEST_CUSTOM_PROGS should be used by tests that require From f16857e62bac60786104c020ad7c86e2163b2c5b Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 19 Aug 2022 09:55:59 +1000 Subject: [PATCH 14/39] NFS: unlink/rmdir shouldn't call d_delete() twice on ENOENT nfs_unlink() calls d_delete() twice if it receives ENOENT from the server - once in nfs_dentry_handle_enoent() from nfs_safe_remove and once in nfs_dentry_remove_handle_error(). nfs_rmddir() also calls it twice - the nfs_dentry_handle_enoent() call is direct and inside a region locked with ->rmdir_sem It is safe to call d_delete() twice if the refcount > 1 as the dentry is simply unhashed. If the refcount is 1, the first call sets d_inode to NULL and the second call crashes. This patch guards the d_delete() call from nfs_dentry_handle_enoent() leaving the one under ->remdir_sem in case that is important. In mainline it would be safe to remove the d_delete() call. However in older kernels to which this might be backported, that would change the behaviour of nfs_unlink(). nfs_unlink() used to unhash the dentry which resulted in nfs_dentry_handle_enoent() not calling d_delete(). So in older kernels we need the d_delete() in nfs_dentry_remove_handle_error() when called from nfs_unlink() but not when called from nfs_rmdir(). To make the code work correctly for old and new kernels, and from both nfs_unlink() and nfs_rmdir(), we protect the d_delete() call with simple_positive(). This ensures it is never called in a circumstance where it could crash. Fixes: 3c59366c207e ("NFS: don't unhash dentry during unlink/rename") Fixes: 9019fb391de0 ("NFS: Label the dentry with a verifier in nfs_rmdir() and nfs_unlink()") Signed-off-by: NeilBrown Tested-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1b879584d4fe..5d6c2ddc7ea6 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -2382,7 +2382,8 @@ static void nfs_dentry_remove_handle_error(struct inode *dir, { switch (error) { case -ENOENT: - d_delete(dentry); + if (d_really_is_positive(dentry)) + d_delete(dentry); nfs_set_verifier(dentry, nfs_save_change_attribute(dir)); break; case 0: From fcfc8be1e9cf2f12b50dce8b579b3ae54443a014 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Thu, 18 Aug 2022 15:07:05 -0400 Subject: [PATCH 15/39] NFSv4.2 fix problems with __nfs42_ssc_open A destination server while doing a COPY shouldn't accept using the passed in filehandle if its not a regular filehandle. If alloc_file_pseudo() has failed, we need to decrement a reference on the newly created inode, otherwise it leaks. Reported-by: Al Viro Fixes: ec4b092508982 ("NFS: inter ssc open") Signed-off-by: Olga Kornievskaia Signed-off-by: Trond Myklebust --- fs/nfs/nfs4file.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index e88f6b18445e..9eb181287879 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -340,6 +340,11 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, goto out; } + if (!S_ISREG(fattr->mode)) { + res = ERR_PTR(-EBADF); + goto out; + } + res = ERR_PTR(-ENOMEM); len = strlen(SSC_READ_NAME_BODY) + 16; read_name = kzalloc(len, GFP_KERNEL); @@ -357,6 +362,7 @@ static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt, r_ino->i_fop); if (IS_ERR(filep)) { res = ERR_CAST(filep); + iput(r_ino); goto out_free_name; } From ed06fce0b034b2e25bd93430f5c4cbb28036cc1a Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Wed, 3 Aug 2022 14:55:03 -0400 Subject: [PATCH 16/39] SUNRPC: RPC level errors should set task->tk_rpc_status Fix up a case in call_encode() where we're failing to set task->tk_rpc_status when an RPC level error occurred. Fixes: 9c5948c24869 ("SUNRPC: task should be exit if encode return EKEYEXPIRED more times") Signed-off-by: Trond Myklebust --- net/sunrpc/clnt.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index b098e707ad41..7d268a291486 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -1902,7 +1902,7 @@ call_encode(struct rpc_task *task) break; case -EKEYEXPIRED: if (!task->tk_cred_retry) { - rpc_exit(task, task->tk_status); + rpc_call_rpcerror(task, task->tk_status); } else { task->tk_action = call_refresh; task->tk_cred_retry--; From d10a72de54c2d2990721923e025ea505fa4f5b02 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 10 Aug 2022 12:18:08 +0300 Subject: [PATCH 17/39] get_maintainer: add Alan to .get_maintainer.ignore Alan asked to be added to the .get_maintainer.ignore list. Link: https://lkml.kernel.org/r/YvN30KhO9aD5Sza9@kili Signed-off-by: Dan Carpenter Cc: Alan Cox Cc: Arnd Bergmann Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton --- .get_maintainer.ignore | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.get_maintainer.ignore b/.get_maintainer.ignore index a64d21913745..c298bab3d320 100644 --- a/.get_maintainer.ignore +++ b/.get_maintainer.ignore @@ -1,2 +1,4 @@ +Alan Cox +Alan Cox Christoph Hellwig Marc Gonzalez From 37887783b3fef877bf34b8992c9199864da4afcb Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 10 Aug 2022 09:06:09 +0200 Subject: [PATCH 18/39] Revert "zram: remove double compression logic" This reverts commit e7be8d1dd983156b ("zram: remove double compression logic") as it causes zram failures. It does not revert cleanly, PTR_ERR handling was introduced in the meantime. This is handled by appropriate IS_ERR. When under memory pressure, zs_malloc() can fail. Before the above commit, the allocation was retried with direct reclaim enabled (GFP_NOIO). After the commit, it is not -- only __GFP_KSWAPD_RECLAIM is tried. So when the failure occurs under memory pressure, the overlaying filesystem such as ext2 (mounted by ext4 module in this case) can emit failures, making the (file)system unusable: EXT4-fs warning (device zram0): ext4_end_bio:343: I/O error 10 writing to inode 16386 starting block 159744) Buffer I/O error on device zram0, logical block 159744 With direct reclaim, memory is really reclaimed and allocation succeeds, eventually. In the worst case, the oom killer is invoked, which is proper outcome if user sets up zram too large (in comparison to available RAM). This very diff doesn't apply to 5.19 (stable) cleanly (see PTR_ERR note above). Use revert of e7be8d1dd983 directly. Link: https://bugzilla.suse.com/show_bug.cgi?id=1202203 Link: https://lkml.kernel.org/r/20220810070609.14402-1-jslaby@suse.cz Fixes: e7be8d1dd983 ("zram: remove double compression logic") Signed-off-by: Jiri Slaby Reviewed-by: Sergey Senozhatsky Cc: Minchan Kim Cc: Nitin Gupta Cc: Alexey Romanov Cc: Dmitry Rokosov Cc: Lukas Czerner Cc: [5.19] Signed-off-by: Andrew Morton --- drivers/block/zram/zram_drv.c | 42 ++++++++++++++++++++++++++--------- drivers/block/zram/zram_drv.h | 1 + 2 files changed, 33 insertions(+), 10 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 92cb929a45b7..226ea76cc819 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1146,14 +1146,15 @@ static ssize_t bd_stat_show(struct device *dev, static ssize_t debug_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - int version = 2; + int version = 1; struct zram *zram = dev_to_zram(dev); ssize_t ret; down_read(&zram->init_lock); ret = scnprintf(buf, PAGE_SIZE, - "version: %d\n%8llu\n", + "version: %d\n%8llu %8llu\n", version, + (u64)atomic64_read(&zram->stats.writestall), (u64)atomic64_read(&zram->stats.miss_free)); up_read(&zram->init_lock); @@ -1351,7 +1352,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, { int ret = 0; unsigned long alloced_pages; - unsigned long handle = 0; + unsigned long handle = -ENOMEM; unsigned int comp_len = 0; void *src, *dst, *mem; struct zcomp_strm *zstrm; @@ -1369,6 +1370,7 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, } kunmap_atomic(mem); +compress_again: zstrm = zcomp_stream_get(zram->comp); src = kmap_atomic(page); ret = zcomp_compress(zstrm, src, &comp_len); @@ -1377,20 +1379,39 @@ static int __zram_bvec_write(struct zram *zram, struct bio_vec *bvec, if (unlikely(ret)) { zcomp_stream_put(zram->comp); pr_err("Compression failed! err=%d\n", ret); + zs_free(zram->mem_pool, handle); return ret; } if (comp_len >= huge_class_size) comp_len = PAGE_SIZE; - - handle = zs_malloc(zram->mem_pool, comp_len, - __GFP_KSWAPD_RECLAIM | - __GFP_NOWARN | - __GFP_HIGHMEM | - __GFP_MOVABLE); - + /* + * handle allocation has 2 paths: + * a) fast path is executed with preemption disabled (for + * per-cpu streams) and has __GFP_DIRECT_RECLAIM bit clear, + * since we can't sleep; + * b) slow path enables preemption and attempts to allocate + * the page with __GFP_DIRECT_RECLAIM bit set. we have to + * put per-cpu compression stream and, thus, to re-do + * the compression once handle is allocated. + * + * if we have a 'non-null' handle here then we are coming + * from the slow path and handle has already been allocated. + */ + if (IS_ERR((void *)handle)) + handle = zs_malloc(zram->mem_pool, comp_len, + __GFP_KSWAPD_RECLAIM | + __GFP_NOWARN | + __GFP_HIGHMEM | + __GFP_MOVABLE); if (IS_ERR((void *)handle)) { zcomp_stream_put(zram->comp); + atomic64_inc(&zram->stats.writestall); + handle = zs_malloc(zram->mem_pool, comp_len, + GFP_NOIO | __GFP_HIGHMEM | + __GFP_MOVABLE); + if (!IS_ERR((void *)handle)) + goto compress_again; return PTR_ERR((void *)handle); } @@ -1948,6 +1969,7 @@ static int zram_add(void) if (ZRAM_LOGICAL_BLOCK_SIZE == PAGE_SIZE) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 158c91e54850..80c3b43b4828 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -81,6 +81,7 @@ struct zram_stats { atomic64_t huge_pages_since; /* no. of huge pages since zram set up */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ + atomic64_t writestall; /* no. of write slow paths */ atomic64_t miss_free; /* no. of missed free */ #ifdef CONFIG_ZRAM_WRITEBACK atomic64_t bd_count; /* no. of pages in backing device */ From 5535be3099717646781ce1540cf725965d680e7b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Tue, 9 Aug 2022 22:56:40 +0200 Subject: [PATCH 19/39] mm/gup: fix FOLL_FORCE COW security issue and remove FOLL_COW Ever since the Dirty COW (CVE-2016-5195) security issue happened, we know that FOLL_FORCE can be possibly dangerous, especially if there are races that can be exploited by user space. Right now, it would be sufficient to have some code that sets a PTE of a R/O-mapped shared page dirty, in order for it to erroneously become writable by FOLL_FORCE. The implications of setting a write-protected PTE dirty might not be immediately obvious to everyone. And in fact ever since commit 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte"), we can use UFFDIO_CONTINUE to map a shmem page R/O while marking the pte dirty. This can be used by unprivileged user space to modify tmpfs/shmem file content even if the user does not have write permissions to the file, and to bypass memfd write sealing -- Dirty COW restricted to tmpfs/shmem (CVE-2022-2590). To fix such security issues for good, the insight is that we really only need that fancy retry logic (FOLL_COW) for COW mappings that are not writable (!VM_WRITE). And in a COW mapping, we really only broke COW if we have an exclusive anonymous page mapped. If we have something else mapped, or the mapped anonymous page might be shared (!PageAnonExclusive), we have to trigger a write fault to break COW. If we don't find an exclusive anonymous page when we retry, we have to trigger COW breaking once again because something intervened. Let's move away from this mandatory-retry + dirty handling and rely on our PageAnonExclusive() flag for making a similar decision, to use the same COW logic as in other kernel parts here as well. In case we stumble over a PTE in a COW mapping that does not map an exclusive anonymous page, COW was not properly broken and we have to trigger a fake write-fault to break COW. Just like we do in can_change_pte_writable() added via commit 64fe24a3e05e ("mm/mprotect: try avoiding write faults for exclusive anonymous pages when changing protection") and commit 76aefad628aa ("mm/mprotect: fix soft-dirty check in can_change_pte_writable()"), take care of softdirty and uffd-wp manually. For example, a write() via /proc/self/mem to a uffd-wp-protected range has to fail instead of silently granting write access and bypassing the userspace fault handler. Note that FOLL_FORCE is not only used for debug access, but also triggered by applications without debug intentions, for example, when pinning pages via RDMA. This fixes CVE-2022-2590. Note that only x86_64 and aarch64 are affected, because only those support CONFIG_HAVE_ARCH_USERFAULTFD_MINOR. Fortunately, FOLL_COW is no longer required to handle FOLL_FORCE. So let's just get rid of it. Thanks to Nadav Amit for pointing out that the pte_dirty() check in FOLL_FORCE code is problematic and might be exploitable. Note 1: We don't check for the PTE being dirty because it doesn't matter for making a "was COWed" decision anymore, and whoever modifies the page has to set the page dirty either way. Note 2: Kernels before extended uffd-wp support and before PageAnonExclusive (< 5.19) can simply revert the problematic commit instead and be safe regarding UFFDIO_CONTINUE. A backport to v5.19 requires minor adjustments due to lack of vma_soft_dirty_enabled(). Link: https://lkml.kernel.org/r/20220809205640.70916-1-david@redhat.com Fixes: 9ae0f87d009c ("mm/shmem: unconditionally set pte dirty in mfill_atomic_install_pte") Signed-off-by: David Hildenbrand Cc: Greg Kroah-Hartman Cc: Axel Rasmussen Cc: Nadav Amit Cc: Peter Xu Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Matthew Wilcox Cc: Vlastimil Babka Cc: John Hubbard Cc: Jason Gunthorpe Cc: David Laight Cc: [5.16] Signed-off-by: Andrew Morton --- include/linux/mm.h | 1 - mm/gup.c | 68 +++++++++++++++++++++++++++++++--------------- mm/huge_memory.c | 64 +++++++++++++++++++++++++++++-------------- 3 files changed, 89 insertions(+), 44 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 3bedc449c14d..982f2607180b 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2885,7 +2885,6 @@ struct page *follow_page(struct vm_area_struct *vma, unsigned long address, #define FOLL_MIGRATION 0x400 /* wait for page to replace migration entry */ #define FOLL_TRIED 0x800 /* a retry, previous pass started an IO */ #define FOLL_REMOTE 0x2000 /* we are working on non-current tsk/mm */ -#define FOLL_COW 0x4000 /* internal GUP flag */ #define FOLL_ANON 0x8000 /* don't do file mappings */ #define FOLL_LONGTERM 0x10000 /* mapping lifetime is indefinite: see below */ #define FOLL_SPLIT_PMD 0x20000 /* split huge pmd before returning */ diff --git a/mm/gup.c b/mm/gup.c index 732825157430..5abdaf487460 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -478,14 +478,42 @@ static int follow_pfn_pte(struct vm_area_struct *vma, unsigned long address, return -EEXIST; } -/* - * FOLL_FORCE can write to even unwritable pte's, but only - * after we've gone through a COW cycle and they are dirty. - */ -static inline bool can_follow_write_pte(pte_t pte, unsigned int flags) +/* FOLL_FORCE can write to even unwritable PTEs in COW mappings. */ +static inline bool can_follow_write_pte(pte_t pte, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) { - return pte_write(pte) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pte_dirty(pte)); + /* If the pte is writable, we can write to the page. */ + if (pte_write(pte)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. */ + if (vma_soft_dirty_enabled(vma) && !pte_soft_dirty(pte)) + return false; + return !userfaultfd_pte_wp(vma, pte); } static struct page *follow_page_pte(struct vm_area_struct *vma, @@ -528,12 +556,19 @@ retry: } if ((flags & FOLL_NUMA) && pte_protnone(pte)) goto no_page; - if ((flags & FOLL_WRITE) && !can_follow_write_pte(pte, flags)) { - pte_unmap_unlock(ptep, ptl); - return NULL; - } page = vm_normal_page(vma, address, pte); + + /* + * We only care about anon pages in can_follow_write_pte() and don't + * have to worry about pte_devmap() because they are never anon. + */ + if ((flags & FOLL_WRITE) && + !can_follow_write_pte(pte, page, vma, flags)) { + page = NULL; + goto out; + } + if (!page && pte_devmap(pte) && (flags & (FOLL_GET | FOLL_PIN))) { /* * Only return device mapping pages in the FOLL_GET or FOLL_PIN @@ -986,17 +1021,6 @@ static int faultin_page(struct vm_area_struct *vma, return -EBUSY; } - /* - * The VM_FAULT_WRITE bit tells us that do_wp_page has broken COW when - * necessary, even if maybe_mkwrite decided not to set pte_write. We - * can thus safely do subsequent page lookups as if they were reads. - * But only do so when looping for pte_write is futile: in some cases - * userspace may also be wanting to write to the gotten user page, - * which a read fault here might prevent (a readonly page might get - * reCOWed by userspace write). - */ - if ((ret & VM_FAULT_WRITE) && !(vma->vm_flags & VM_WRITE)) - *flags |= FOLL_COW; return 0; } diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8a7c1b344abe..e9414ee57c5b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1040,12 +1040,6 @@ struct page *follow_devmap_pmd(struct vm_area_struct *vma, unsigned long addr, assert_spin_locked(pmd_lockptr(mm, pmd)); - /* - * When we COW a devmap PMD entry, we split it into PTEs, so we should - * not be in this function with `flags & FOLL_COW` set. - */ - WARN_ONCE(flags & FOLL_COW, "mm: In follow_devmap_pmd with FOLL_COW set"); - /* FOLL_GET and FOLL_PIN are mutually exclusive. */ if (WARN_ON_ONCE((flags & (FOLL_PIN | FOLL_GET)) == (FOLL_PIN | FOLL_GET))) @@ -1395,14 +1389,42 @@ fallback: return VM_FAULT_FALLBACK; } -/* - * FOLL_FORCE can write to even unwritable pmd's, but only - * after we've gone through a COW cycle and they are dirty. - */ -static inline bool can_follow_write_pmd(pmd_t pmd, unsigned int flags) +/* FOLL_FORCE can write to even unwritable PMDs in COW mappings. */ +static inline bool can_follow_write_pmd(pmd_t pmd, struct page *page, + struct vm_area_struct *vma, + unsigned int flags) { - return pmd_write(pmd) || - ((flags & FOLL_FORCE) && (flags & FOLL_COW) && pmd_dirty(pmd)); + /* If the pmd is writable, we can write to the page. */ + if (pmd_write(pmd)) + return true; + + /* Maybe FOLL_FORCE is set to override it? */ + if (!(flags & FOLL_FORCE)) + return false; + + /* But FOLL_FORCE has no effect on shared mappings */ + if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED)) + return false; + + /* ... or read-only private ones */ + if (!(vma->vm_flags & VM_MAYWRITE)) + return false; + + /* ... or already writable ones that just need to take a write fault */ + if (vma->vm_flags & VM_WRITE) + return false; + + /* + * See can_change_pte_writable(): we broke COW and could map the page + * writable if we have an exclusive anonymous page ... + */ + if (!page || !PageAnon(page) || !PageAnonExclusive(page)) + return false; + + /* ... and a write-fault isn't required for other reasons. */ + if (vma_soft_dirty_enabled(vma) && !pmd_soft_dirty(pmd)) + return false; + return !userfaultfd_huge_pmd_wp(vma, pmd); } struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, @@ -1411,12 +1433,16 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, unsigned int flags) { struct mm_struct *mm = vma->vm_mm; - struct page *page = NULL; + struct page *page; assert_spin_locked(pmd_lockptr(mm, pmd)); - if (flags & FOLL_WRITE && !can_follow_write_pmd(*pmd, flags)) - goto out; + page = pmd_page(*pmd); + VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + + if ((flags & FOLL_WRITE) && + !can_follow_write_pmd(*pmd, page, vma, flags)) + return NULL; /* Avoid dumping huge zero page */ if ((flags & FOLL_DUMP) && is_huge_zero_pmd(*pmd)) @@ -1424,10 +1450,7 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, /* Full NUMA hinting faults to serialise migration in fault paths */ if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) - goto out; - - page = pmd_page(*pmd); - VM_BUG_ON_PAGE(!PageHead(page) && !is_zone_device_page(page), page); + return NULL; if (!pmd_write(*pmd) && gup_must_unshare(flags, page)) return ERR_PTR(-EMLINK); @@ -1444,7 +1467,6 @@ struct page *follow_trans_huge_pmd(struct vm_area_struct *vma, page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT; VM_BUG_ON_PAGE(!PageCompound(page) && !is_zone_device_page(page), page); -out: return page; } From a8faed3a02eeb75857a3b5d660fa80fe79db77a3 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Sun, 7 Aug 2022 15:09:34 -0700 Subject: [PATCH 20/39] kernel/sys_ni: add compat entry for fadvise64_64 When CONFIG_ADVISE_SYSCALLS is not set/enabled and CONFIG_COMPAT is set/enabled, the riscv compat_syscall_table references 'compat_sys_fadvise64_64', which is not defined: riscv64-linux-ld: arch/riscv/kernel/compat_syscall_table.o:(.rodata+0x6f8): undefined reference to `compat_sys_fadvise64_64' Add 'fadvise64_64' to kernel/sys_ni.c as a conditional COMPAT function so that when CONFIG_ADVISE_SYSCALLS is not set, there is a fallback function available. Link: https://lkml.kernel.org/r/20220807220934.5689-1-rdunlap@infradead.org Fixes: d3ac21cacc24 ("mm: Support compiling out madvise and fadvise") Signed-off-by: Randy Dunlap Suggested-by: Arnd Bergmann Reviewed-by: Arnd Bergmann Cc: Josh Triplett Cc: Paul Walmsley Cc: Palmer Dabbelt Cc: Albert Ou Cc: Signed-off-by: Andrew Morton --- kernel/sys_ni.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index a492f159624f..860b2dcf3ac4 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -277,6 +277,7 @@ COND_SYSCALL(landlock_restrict_self); /* mm/fadvise.c */ COND_SYSCALL(fadvise64_64); +COND_SYSCALL_COMPAT(fadvise64_64); /* mm/, CONFIG_MMU only */ COND_SYSCALL(swapon); From a39c5d3ce03dd890ab6a9be44b21177cec32da55 Mon Sep 17 00:00:00 2001 From: Hao Lee Date: Sun, 7 Aug 2022 15:44:42 +0000 Subject: [PATCH 21/39] mm: add DEVICE_ZONE to FOR_ALL_ZONES FOR_ALL_ZONES should be consistent with enum zone_type. Otherwise, __count_zid_vm_events have the potential to add count to wrong item when zid is ZONE_DEVICE. Link: https://lkml.kernel.org/r/20220807154442.GA18167@haolee.io Signed-off-by: Hao Lee Cc: David Hildenbrand Cc: Johannes Weiner Signed-off-by: Andrew Morton --- include/linux/vm_event_item.h | 15 +++++++++++---- mm/vmstat.c | 9 ++++++++- 2 files changed, 19 insertions(+), 5 deletions(-) diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index 404024486fa5..f3fc36cd2276 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -20,12 +20,19 @@ #define HIGHMEM_ZONE(xx) #endif -#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, HIGHMEM_ZONE(xx) xx##_MOVABLE +#ifdef CONFIG_ZONE_DEVICE +#define DEVICE_ZONE(xx) xx##_DEVICE, +#else +#define DEVICE_ZONE(xx) +#endif + +#define FOR_ALL_ZONES(xx) DMA_ZONE(xx) DMA32_ZONE(xx) xx##_NORMAL, \ + HIGHMEM_ZONE(xx) xx##_MOVABLE, DEVICE_ZONE(xx) enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, - FOR_ALL_ZONES(PGALLOC), - FOR_ALL_ZONES(ALLOCSTALL), - FOR_ALL_ZONES(PGSCAN_SKIP), + FOR_ALL_ZONES(PGALLOC) + FOR_ALL_ZONES(ALLOCSTALL) + FOR_ALL_ZONES(PGSCAN_SKIP) PGFREE, PGACTIVATE, PGDEACTIVATE, PGLAZYFREE, PGFAULT, PGMAJFAULT, PGLAZYFREED, diff --git a/mm/vmstat.c b/mm/vmstat.c index 373d2730fcf2..90af9a8572f5 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1168,8 +1168,15 @@ int fragmentation_index(struct zone *zone, unsigned int order) #define TEXT_FOR_HIGHMEM(xx) #endif +#ifdef CONFIG_ZONE_DEVICE +#define TEXT_FOR_DEVICE(xx) xx "_device", +#else +#define TEXT_FOR_DEVICE(xx) +#endif + #define TEXTS_FOR_ZONES(xx) TEXT_FOR_DMA(xx) TEXT_FOR_DMA32(xx) xx "_normal", \ - TEXT_FOR_HIGHMEM(xx) xx "_movable", + TEXT_FOR_HIGHMEM(xx) xx "_movable", \ + TEXT_FOR_DEVICE(xx) const char * const vmstat_text[] = { /* enum zone_stat_item counters */ From efd4149342db2df41b1bbe68972ead853b30e444 Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Fri, 5 Aug 2022 12:00:03 -0400 Subject: [PATCH 22/39] mm/smaps: don't access young/dirty bit if pte unpresent These bits should only be valid when the ptes are present. Introducing two booleans for it and set it to false when !pte_present() for both pte and pmd accountings. The bug is found during code reading and no real world issue reported, but logically such an error can cause incorrect readings for either smaps or smaps_rollup output on quite a few fields. For example, it could cause over-estimate on values like Shared_Dirty, Private_Dirty, Referenced. Or it could also cause under-estimate on values like LazyFree, Shared_Clean, Private_Clean. Link: https://lkml.kernel.org/r/20220805160003.58929-1-peterx@redhat.com Fixes: b1d4d9e0cbd0 ("proc/smaps: carefully handle migration entries") Fixes: c94b6923fa0a ("/proc/PID/smaps: Add PMD migration entry parsing") Signed-off-by: Peter Xu Reviewed-by: Vlastimil Babka Reviewed-by: David Hildenbrand Reviewed-by: Yang Shi Cc: Konstantin Khlebnikov Cc: Huang Ying Signed-off-by: Andrew Morton --- fs/proc/task_mmu.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index a3398d0f1927..4e0023643f8b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -527,10 +527,12 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, struct vm_area_struct *vma = walk->vma; bool locked = !!(vma->vm_flags & VM_LOCKED); struct page *page = NULL; - bool migration = false; + bool migration = false, young = false, dirty = false; if (pte_present(*pte)) { page = vm_normal_page(vma, addr, *pte); + young = pte_young(*pte); + dirty = pte_dirty(*pte); } else if (is_swap_pte(*pte)) { swp_entry_t swpent = pte_to_swp_entry(*pte); @@ -560,8 +562,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, if (!page) return; - smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte), - locked, migration); + smaps_account(mss, page, false, young, dirty, locked, migration); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE From f369b07c861435bd812a9d14493f71b34132ed6f Mon Sep 17 00:00:00 2001 From: Peter Xu Date: Thu, 11 Aug 2022 16:13:40 -0400 Subject: [PATCH 23/39] mm/uffd: reset write protection when unregister with wp-mode The motivation of this patch comes from a recent report and patchfix from David Hildenbrand on hugetlb shared handling of wr-protected page [1]. With the reproducer provided in commit message of [1], one can leverage the uffd-wp lazy-reset of ptes to trigger a hugetlb issue which can affect not only the attacker process, but also the whole system. The lazy-reset mechanism of uffd-wp was used to make unregister faster, meanwhile it has an assumption that any leftover pgtable entries should only affect the process on its own, so not only the user should be aware of anything it does, but also it should not affect outside of the process. But it seems that this is not true, and it can also be utilized to make some exploit easier. So far there's no clue showing that the lazy-reset is important to any userfaultfd users because normally the unregister will only happen once for a specific range of memory of the lifecycle of the process. Considering all above, what this patch proposes is to do explicit pte resets when unregister an uffd region with wr-protect mode enabled. It should be the same as calling ioctl(UFFDIO_WRITEPROTECT, wp=false) right before ioctl(UFFDIO_UNREGISTER) for the user. So potentially it'll make the unregister slower. From that pov it's a very slight abi change, but hopefully nothing should break with this change either. Regarding to the change itself - core of uffd write [un]protect operation is moved into a separate function (uffd_wp_range()) and it is reused in the unregister code path. Note that the new function will not check for anything, e.g. ranges or memory types, because they should have been checked during the previous UFFDIO_REGISTER or it should have failed already. It also doesn't check mmap_changing because we're with mmap write lock held anyway. I added a Fixes upon introducing of uffd-wp shmem+hugetlbfs because that's the only issue reported so far and that's the commit David's reproducer will start working (v5.19+). But the whole idea actually applies to not only file memories but also anonymous. It's just that we don't need to fix anonymous prior to v5.19- because there's no known way to exploit. IOW, this patch can also fix the issue reported in [1] as the patch 2 does. [1] https://lore.kernel.org/all/20220811103435.188481-3-david@redhat.com/ Link: https://lkml.kernel.org/r/20220811201340.39342-1-peterx@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: Peter Xu Cc: David Hildenbrand Cc: Mike Rapoport Cc: Mike Kravetz Cc: Andrea Arcangeli Cc: Nadav Amit Cc: Axel Rasmussen Cc: Signed-off-by: Andrew Morton --- fs/userfaultfd.c | 4 ++++ include/linux/userfaultfd_k.h | 2 ++ mm/userfaultfd.c | 29 ++++++++++++++++++----------- 3 files changed, 24 insertions(+), 11 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 1c44bf75f916..175de70e3adf 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -1601,6 +1601,10 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, wake_userfault(vma->vm_userfaultfd_ctx.ctx, &range); } + /* Reset ptes for the whole vma range if wr-protected */ + if (userfaultfd_wp(vma)) + uffd_wp_range(mm, vma, start, vma_end - start, false); + new_flags = vma->vm_flags & ~__VM_UFFD_FLAGS; prev = vma_merge(mm, prev, start, vma_end, new_flags, vma->anon_vma, vma->vm_file, vma->vm_pgoff, diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index 732b522bacb7..e1b8a915e9e9 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -73,6 +73,8 @@ extern ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long dst_start, extern int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing); +extern void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *vma, + unsigned long start, unsigned long len, bool enable_wp); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 07d3befc80e4..7327b2573f7c 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -703,14 +703,29 @@ ssize_t mcopy_continue(struct mm_struct *dst_mm, unsigned long start, mmap_changing, 0); } +void uffd_wp_range(struct mm_struct *dst_mm, struct vm_area_struct *dst_vma, + unsigned long start, unsigned long len, bool enable_wp) +{ + struct mmu_gather tlb; + pgprot_t newprot; + + if (enable_wp) + newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); + else + newprot = vm_get_page_prot(dst_vma->vm_flags); + + tlb_gather_mmu(&tlb, dst_mm); + change_protection(&tlb, dst_vma, start, start + len, newprot, + enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); + tlb_finish_mmu(&tlb); +} + int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, unsigned long len, bool enable_wp, atomic_t *mmap_changing) { struct vm_area_struct *dst_vma; unsigned long page_mask; - struct mmu_gather tlb; - pgprot_t newprot; int err; /* @@ -750,15 +765,7 @@ int mwriteprotect_range(struct mm_struct *dst_mm, unsigned long start, goto out_unlock; } - if (enable_wp) - newprot = vm_get_page_prot(dst_vma->vm_flags & ~(VM_WRITE)); - else - newprot = vm_get_page_prot(dst_vma->vm_flags); - - tlb_gather_mmu(&tlb, dst_mm); - change_protection(&tlb, dst_vma, start, start + len, newprot, - enable_wp ? MM_CP_UFFD_WP : MM_CP_UFFD_WP_RESOLVE); - tlb_finish_mmu(&tlb); + uffd_wp_range(dst_mm, dst_vma, start, len, enable_wp); err = 0; out_unlock: From f96f7a40874d7c746680c0b9f57cef2262ae551f Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 11 Aug 2022 12:34:34 +0200 Subject: [PATCH 24/39] mm/hugetlb: fix hugetlb not supporting softdirty tracking Patch series "mm/hugetlb: fix write-fault handling for shared mappings", v2. I observed that hugetlb does not support/expect write-faults in shared mappings that would have to map the R/O-mapped page writable -- and I found two case where we could currently get such faults and would erroneously map an anon page into a shared mapping. Reproducers part of the patches. I propose to backport both fixes to stable trees. The first fix needs a small adjustment. This patch (of 2): Staring at hugetlb_wp(), one might wonder where all the logic for shared mappings is when stumbling over a write-protected page in a shared mapping. In fact, there is none, and so far we thought we could get away with that because e.g., mprotect() should always do the right thing and map all pages directly writable. Looks like we were wrong: -------------------------------------------------------------------------- #include #include #include #include #include #include #include #define HUGETLB_SIZE (2 * 1024 * 1024u) static void clear_softdirty(void) { int fd = open("/proc/self/clear_refs", O_WRONLY); const char *ctrl = "4"; int ret; if (fd < 0) { fprintf(stderr, "open(clear_refs) failed\n"); exit(1); } ret = write(fd, ctrl, strlen(ctrl)); if (ret != strlen(ctrl)) { fprintf(stderr, "write(clear_refs) failed\n"); exit(1); } close(fd); } int main(int argc, char **argv) { char *map; int fd; fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT); if (!fd) { fprintf(stderr, "open() failed\n"); return -errno; } if (ftruncate(fd, HUGETLB_SIZE)) { fprintf(stderr, "ftruncate() failed\n"); return -errno; } map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { fprintf(stderr, "mmap() failed\n"); return -errno; } *map = 0; if (mprotect(map, HUGETLB_SIZE, PROT_READ)) { fprintf(stderr, "mmprotect() failed\n"); return -errno; } clear_softdirty(); if (mprotect(map, HUGETLB_SIZE, PROT_READ|PROT_WRITE)) { fprintf(stderr, "mmprotect() failed\n"); return -errno; } *map = 0; return 0; } -------------------------------------------------------------------------- Above test fails with SIGBUS when there is only a single free hugetlb page. # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test Bus error (core dumped) And worse, with sufficient free hugetlb pages it will map an anonymous page into a shared mapping, for example, messing up accounting during unmap and breaking MAP_SHARED semantics: # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test # cat /proc/meminfo | grep HugePages_ HugePages_Total: 2 HugePages_Free: 1 HugePages_Rsvd: 18446744073709551615 HugePages_Surp: 0 Reason in this particular case is that vma_wants_writenotify() will return "true", removing VM_SHARED in vma_set_page_prot() to map pages write-protected. Let's teach vma_wants_writenotify() that hugetlb does not support softdirty tracking. Link: https://lkml.kernel.org/r/20220811103435.188481-1-david@redhat.com Link: https://lkml.kernel.org/r/20220811103435.188481-2-david@redhat.com Fixes: 64e455079e1b ("mm: softdirty: enable write notifications on VMAs after VM_SOFTDIRTY cleared") Signed-off-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Peter Feiner Cc: Kirill A. Shutemov Cc: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Jamie Liu Cc: Hugh Dickins Cc: Naoya Horiguchi Cc: Bjorn Helgaas Cc: Muchun Song Cc: Peter Xu Cc: [3.18+] Signed-off-by: Andrew Morton --- mm/mmap.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index c035020d0c89..9d780f415be3 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1646,8 +1646,11 @@ int vma_wants_writenotify(struct vm_area_struct *vma, pgprot_t vm_page_prot) pgprot_val(vm_pgprot_modify(vm_page_prot, vm_flags))) return 0; - /* Do we need to track softdirty? */ - if (vma_soft_dirty_enabled(vma)) + /* + * Do we need to track softdirty? hugetlb does not support softdirty + * tracking yet. + */ + if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma)) return 1; /* Specialty mapping? */ From 1d8d14641fd94a01b20a4abbf2749fd8eddcf57b Mon Sep 17 00:00:00 2001 From: David Hildenbrand Date: Thu, 11 Aug 2022 12:34:35 +0200 Subject: [PATCH 25/39] mm/hugetlb: support write-faults in shared mappings If we ever get a write-fault on a write-protected page in a shared mapping, we'd be in trouble (again). Instead, we can simply map the page writable. And in fact, there is even a way right now to trigger that code via uffd-wp ever since we stared to support it for shmem in 5.19: -------------------------------------------------------------------------- #include #include #include #include #include #include #include #include #include #include #define HUGETLB_SIZE (2 * 1024 * 1024u) static char *map; int uffd; static int temp_setup_uffd(void) { struct uffdio_api uffdio_api; struct uffdio_register uffdio_register; struct uffdio_writeprotect uffd_writeprotect; struct uffdio_range uffd_range; uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK | UFFD_USER_MODE_ONLY); if (uffd < 0) { fprintf(stderr, "syscall() failed: %d\n", errno); return -errno; } uffdio_api.api = UFFD_API; uffdio_api.features = UFFD_FEATURE_PAGEFAULT_FLAG_WP; if (ioctl(uffd, UFFDIO_API, &uffdio_api) < 0) { fprintf(stderr, "UFFDIO_API failed: %d\n", errno); return -errno; } if (!(uffdio_api.features & UFFD_FEATURE_PAGEFAULT_FLAG_WP)) { fprintf(stderr, "UFFD_FEATURE_WRITEPROTECT missing\n"); return -ENOSYS; } /* Register UFFD-WP */ uffdio_register.range.start = (unsigned long) map; uffdio_register.range.len = HUGETLB_SIZE; uffdio_register.mode = UFFDIO_REGISTER_MODE_WP; if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register) < 0) { fprintf(stderr, "UFFDIO_REGISTER failed: %d\n", errno); return -errno; } /* Writeprotect a single page. */ uffd_writeprotect.range.start = (unsigned long) map; uffd_writeprotect.range.len = HUGETLB_SIZE; uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_WP; if (ioctl(uffd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) { fprintf(stderr, "UFFDIO_WRITEPROTECT failed: %d\n", errno); return -errno; } /* Unregister UFFD-WP without prior writeunprotection. */ uffd_range.start = (unsigned long) map; uffd_range.len = HUGETLB_SIZE; if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_range)) { fprintf(stderr, "UFFDIO_UNREGISTER failed: %d\n", errno); return -errno; } return 0; } int main(int argc, char **argv) { int fd; fd = open("/dev/hugepages/tmp", O_RDWR | O_CREAT); if (!fd) { fprintf(stderr, "open() failed\n"); return -errno; } if (ftruncate(fd, HUGETLB_SIZE)) { fprintf(stderr, "ftruncate() failed\n"); return -errno; } map = mmap(NULL, HUGETLB_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0); if (map == MAP_FAILED) { fprintf(stderr, "mmap() failed\n"); return -errno; } *map = 0; if (temp_setup_uffd()) return 1; *map = 0; return 0; } -------------------------------------------------------------------------- Above test fails with SIGBUS when there is only a single free hugetlb page. # echo 1 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test Bus error (core dumped) And worse, with sufficient free hugetlb pages it will map an anonymous page into a shared mapping, for example, messing up accounting during unmap and breaking MAP_SHARED semantics: # echo 2 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages # ./test # cat /proc/meminfo | grep HugePages_ HugePages_Total: 2 HugePages_Free: 1 HugePages_Rsvd: 18446744073709551615 HugePages_Surp: 0 Reason is that uffd-wp doesn't clear the uffd-wp PTE bit when unregistering and consequently keeps the PTE writeprotected. Reason for this is to avoid the additional overhead when unregistering. Note that this is the case also for !hugetlb and that we will end up with writable PTEs that still have the uffd-wp PTE bit set once we return from hugetlb_wp(). I'm not touching the uffd-wp PTE bit for now, because it seems to be a generic thing -- wp_page_reuse() also doesn't clear it. VM_MAYSHARE handling in hugetlb_fault() for FAULT_FLAG_WRITE indicates that MAP_SHARED handling was at least envisioned, but could never have worked as expected. While at it, make sure that we never end up in hugetlb_wp() on write faults without VM_WRITE, because we don't support maybe_mkwrite() semantics as commonly used in the !hugetlb case -- for example, in wp_page_reuse(). Note that there is no need to do any kind of reservation in hugetlb_fault() in this case ... because we already have a hugetlb page mapped R/O that we will simply map writable and we are not dealing with COW/unsharing. Link: https://lkml.kernel.org/r/20220811103435.188481-3-david@redhat.com Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs") Signed-off-by: David Hildenbrand Reviewed-by: Mike Kravetz Cc: Bjorn Helgaas Cc: Cyrill Gorcunov Cc: Hugh Dickins Cc: Jamie Liu Cc: Kirill A. Shutemov Cc: Muchun Song Cc: Naoya Horiguchi Cc: Pavel Emelyanov Cc: Peter Feiner Cc: Peter Xu Cc: [5.19] Signed-off-by: Andrew Morton --- mm/hugetlb.c | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 0aee2f3ae15c..2480ba627aa5 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -5241,6 +5241,21 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma, VM_BUG_ON(unshare && (flags & FOLL_WRITE)); VM_BUG_ON(!unshare && !(flags & FOLL_WRITE)); + /* + * hugetlb does not support FOLL_FORCE-style write faults that keep the + * PTE mapped R/O such as maybe_mkwrite() would do. + */ + if (WARN_ON_ONCE(!unshare && !(vma->vm_flags & VM_WRITE))) + return VM_FAULT_SIGSEGV; + + /* Let's take out MAP_SHARED mappings first. */ + if (vma->vm_flags & VM_MAYSHARE) { + if (unlikely(unshare)) + return 0; + set_huge_ptep_writable(vma, haddr, ptep); + return 0; + } + pte = huge_ptep_get(ptep); old_page = pte_page(pte); @@ -5781,12 +5796,11 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * If we are going to COW/unshare the mapping later, we examine the * pending reservations for this page now. This will ensure that any * allocations necessary to record that reservation occur outside the - * spinlock. For private mappings, we also lookup the pagecache - * page now as it is used to determine if a reservation has been - * consumed. + * spinlock. Also lookup the pagecache page now as it is used to + * determine if a reservation has been consumed. */ if ((flags & (FAULT_FLAG_WRITE|FAULT_FLAG_UNSHARE)) && - !huge_pte_write(entry)) { + !(vma->vm_flags & VM_MAYSHARE) && !huge_pte_write(entry)) { if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; @@ -5794,9 +5808,7 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, /* Just decrements count, does not deallocate */ vma_end_reservation(h, vma, haddr); - if (!(vma->vm_flags & VM_MAYSHARE)) - pagecache_page = hugetlbfs_pagecache_page(h, - vma, haddr); + pagecache_page = hugetlbfs_pagecache_page(h, vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); From cb241339b9d020c758a6647c69f8e42538c5cf88 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 21:51:09 -0700 Subject: [PATCH 26/39] mm/shmem: fix chattr fsflags support in tmpfs ext[234] have always allowed unimplemented chattr flags to be set, but other filesystems have tended to be stricter. Follow the stricter approach for tmpfs: I don't want to have to explain why csu attributes don't actually work, and we won't need to update the chattr(1) manpage; and it's never wrong to start off strict, relaxing later if persuaded. Allow only a (append only) i (immutable) A (no atime) and d (no dump). Although lsattr showed 'A' inherited, the NOATIME behavior was not being inherited: because nothing sync'ed FS_NOATIME_FL to S_NOATIME. Add shmem_set_inode_flags() to sync the flags, using inode_set_flags() to avoid that instant of lost immutablility during fileattr_set(). But that change switched generic/079 from passing to failing: because FS_IMMUTABLE_FL and FS_APPEND_FL had been unconventionally included in the INHERITED fsflags: remove them and generic/079 is back to passing. Link: https://lkml.kernel.org/r/2961dcb0-ddf3-b9f0-3268-12a4ff996856@google.com Fixes: e408e695f5f1 ("mm/shmem: support FS_IOC_[SG]ETFLAGS in tmpfs") Signed-off-by: Hugh Dickins Cc: "Theodore Ts'o" Cc: Radoslaw Burny Cc: "Darrick J. Wong" Cc: Matthew Wilcox (Oracle) Signed-off-by: Andrew Morton --- include/linux/shmem_fs.h | 13 +++------- mm/shmem.c | 54 +++++++++++++++++++++++----------------- 2 files changed, 35 insertions(+), 32 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 1b6c4013f691..ff0b990de83d 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -29,15 +29,10 @@ struct shmem_inode_info { struct inode vfs_inode; }; -#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE -#define SHMEM_FL_USER_MODIFIABLE FS_FL_USER_MODIFIABLE -#define SHMEM_FL_INHERITED FS_FL_USER_MODIFIABLE - -/* Flags that are appropriate for regular files (all but dir-specific ones). */ -#define SHMEM_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) - -/* Flags that are appropriate for non-directories/regular files. */ -#define SHMEM_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define SHMEM_FL_USER_VISIBLE FS_FL_USER_VISIBLE +#define SHMEM_FL_USER_MODIFIABLE \ + (FS_IMMUTABLE_FL | FS_APPEND_FL | FS_NODUMP_FL | FS_NOATIME_FL) +#define SHMEM_FL_INHERITED (FS_NODUMP_FL | FS_NOATIME_FL) struct shmem_sb_info { unsigned long max_blocks; /* How many blocks are allowed */ diff --git a/mm/shmem.c b/mm/shmem.c index 5783f11351bb..170b4078420f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2281,16 +2281,34 @@ static int shmem_mmap(struct file *file, struct vm_area_struct *vma) return 0; } -/* Mask out flags that are inappropriate for the given type of inode. */ -static unsigned shmem_mask_flags(umode_t mode, __u32 flags) +#ifdef CONFIG_TMPFS_XATTR +static int shmem_initxattrs(struct inode *, const struct xattr *, void *); + +/* + * chattr's fsflags are unrelated to extended attributes, + * but tmpfs has chosen to enable them under the same config option. + */ +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & SHMEM_REG_FLMASK; - else - return flags & SHMEM_OTHER_FLMASK; + unsigned int i_flags = 0; + + if (fsflags & FS_NOATIME_FL) + i_flags |= S_NOATIME; + if (fsflags & FS_APPEND_FL) + i_flags |= S_APPEND; + if (fsflags & FS_IMMUTABLE_FL) + i_flags |= S_IMMUTABLE; + /* + * But FS_NODUMP_FL does not require any action in i_flags. + */ + inode_set_flags(inode, i_flags, S_NOATIME | S_APPEND | S_IMMUTABLE); } +#else +static void shmem_set_inode_flags(struct inode *inode, unsigned int fsflags) +{ +} +#define shmem_initxattrs NULL +#endif static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, umode_t mode, dev_t dev, unsigned long flags) @@ -2319,7 +2337,8 @@ static struct inode *shmem_get_inode(struct super_block *sb, struct inode *dir, info->i_crtime = inode->i_mtime; info->fsflags = (dir == NULL) ? 0 : SHMEM_I(dir)->fsflags & SHMEM_FL_INHERITED; - info->fsflags = shmem_mask_flags(mode, info->fsflags); + if (info->fsflags) + shmem_set_inode_flags(inode, info->fsflags); INIT_LIST_HEAD(&info->shrinklist); INIT_LIST_HEAD(&info->swaplist); simple_xattrs_init(&info->xattrs); @@ -2468,12 +2487,6 @@ out_unacct_blocks: static const struct inode_operations shmem_symlink_inode_operations; static const struct inode_operations shmem_short_symlink_operations; -#ifdef CONFIG_TMPFS_XATTR -static int shmem_initxattrs(struct inode *, const struct xattr *, void *); -#else -#define shmem_initxattrs NULL -#endif - static int shmem_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, @@ -3179,18 +3192,13 @@ static int shmem_fileattr_set(struct user_namespace *mnt_userns, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; + if (fa->flags & ~SHMEM_FL_USER_MODIFIABLE) + return -EOPNOTSUPP; info->fsflags = (info->fsflags & ~SHMEM_FL_USER_MODIFIABLE) | (fa->flags & SHMEM_FL_USER_MODIFIABLE); - inode->i_flags &= ~(S_APPEND | S_IMMUTABLE | S_NOATIME); - if (info->fsflags & FS_APPEND_FL) - inode->i_flags |= S_APPEND; - if (info->fsflags & FS_IMMUTABLE_FL) - inode->i_flags |= S_IMMUTABLE; - if (info->fsflags & FS_NOATIME_FL) - inode->i_flags |= S_NOATIME; - + shmem_set_inode_flags(inode, info->fsflags); inode->i_ctime = current_time(inode); return 0; } From 15f242bb65b89d5f1ff990668a586fdf1307b2c8 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 21:55:36 -0700 Subject: [PATCH 27/39] mm/shmem: tmpfs fallocate use file_modified() 5.18 fixed the btrfs and ext4 fallocates to use file_modified(), as xfs was already doing, to drop privileges: and fstests generic/{683,684,688} expect this. There's no need to argue over keep-size allocation (which could just update ctime): fix shmem_fallocate() to behave the same way. Link: https://lkml.kernel.org/r/39c5e62-4896-7795-c0a0-f79c50d4909@google.com Signed-off-by: Hugh Dickins Acked-by: Christian Brauner (Microsoft) Cc: "Darrick J. Wong" Cc: Matthew Wilcox (Oracle) Cc: Radoslaw Burny Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton --- mm/shmem.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 170b4078420f..ce2090744c5e 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2839,12 +2839,13 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) i_size_write(inode, offset + len); - inode->i_ctime = current_time(inode); undone: spin_lock(&inode->i_lock); inode->i_private = NULL; spin_unlock(&inode->i_lock); out: + if (!error) + file_modified(file); inode_unlock(inode); return error; } From 76d36dea02691a8ffa8cd7368eecbf727b8a1c0c Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Wed, 10 Aug 2022 22:06:33 -0700 Subject: [PATCH 28/39] mm/shmem: shmem_replace_page() remember NR_SHMEM Elsewhere, NR_SHMEM is updated at the same time as shmem NR_FILE_PAGES; but shmem_replace_page() was forgetting to do that - so NR_SHMEM stats could grow too big or too small, in those unusual cases when it's used. Link: https://lkml.kernel.org/r/cec7c09d-5874-e160-ada6-6e10ee48784@google.com Signed-off-by: Hugh Dickins Reviewed-by: Matthew Wilcox (Oracle) Cc: "Darrick J. Wong" Cc: Radoslaw Burny Cc: "Theodore Ts'o" Signed-off-by: Andrew Morton --- mm/shmem.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index ce2090744c5e..d075dd2dcc48 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1659,7 +1659,9 @@ static int shmem_replace_page(struct page **pagep, gfp_t gfp, new = page_folio(newpage); mem_cgroup_migrate(old, new); __inc_lruvec_page_state(newpage, NR_FILE_PAGES); + __inc_lruvec_page_state(newpage, NR_SHMEM); __dec_lruvec_page_state(oldpage, NR_FILE_PAGES); + __dec_lruvec_page_state(oldpage, NR_SHMEM); } xa_unlock_irq(&swap_mapping->i_pages); From 9c80e79906b4ca440d09e7f116609262bb747909 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Fri, 12 Aug 2022 19:05:09 -0700 Subject: [PATCH 29/39] kprobes: don't call disarm_kprobe() for disabled kprobes The assumption in __disable_kprobe() is wrong, and it could try to disarm an already disarmed kprobe and fire the WARN_ONCE() below. [0] We can easily reproduce this issue. 1. Write 0 to /sys/kernel/debug/kprobes/enabled. # echo 0 > /sys/kernel/debug/kprobes/enabled 2. Run execsnoop. At this time, one kprobe is disabled. # /usr/share/bcc/tools/execsnoop & [1] 2460 PCOMM PID PPID RET ARGS # cat /sys/kernel/debug/kprobes/list ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE] ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE] 3. Write 1 to /sys/kernel/debug/kprobes/enabled, which changes kprobes_all_disarmed to false but does not arm the disabled kprobe. # echo 1 > /sys/kernel/debug/kprobes/enabled # cat /sys/kernel/debug/kprobes/list ffffffff91345650 r __x64_sys_execve+0x0 [FTRACE] ffffffff91345650 k __x64_sys_execve+0x0 [DISABLED][FTRACE] 4. Kill execsnoop, when __disable_kprobe() calls disarm_kprobe() for the disabled kprobe and hits the WARN_ONCE() in __disarm_kprobe_ftrace(). # fg /usr/share/bcc/tools/execsnoop ^C Actually, WARN_ONCE() is fired twice, and __unregister_kprobe_top() misses some cleanups and leaves the aggregated kprobe in the hash table. Then, __unregister_trace_kprobe() initialises tk->rp.kp.list and creates an infinite loop like this. aggregated kprobe.list -> kprobe.list -. ^ | '.__.' In this situation, these commands fall into the infinite loop and result in RCU stall or soft lockup. cat /sys/kernel/debug/kprobes/list : show_kprobe_addr() enters into the infinite loop with RCU. /usr/share/bcc/tools/execsnoop : warn_kprobe_rereg() holds kprobe_mutex, and __get_valid_kprobe() is stuck in the loop. To avoid the issue, make sure we don't call disarm_kprobe() for disabled kprobes. [0] Failed to disarm kprobe-ftrace at __x64_sys_execve+0x0/0x40 (error -2) WARNING: CPU: 6 PID: 2460 at kernel/kprobes.c:1130 __disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129) Modules linked in: ena CPU: 6 PID: 2460 Comm: execsnoop Not tainted 5.19.0+ #28 Hardware name: Amazon EC2 c5.2xlarge/, BIOS 1.0 10/16/2017 RIP: 0010:__disarm_kprobe_ftrace.isra.19 (kernel/kprobes.c:1129) Code: 24 8b 02 eb c1 80 3d c4 83 f2 01 00 75 d4 48 8b 75 00 89 c2 48 c7 c7 90 fa 0f 92 89 04 24 c6 05 ab 83 01 e8 e4 94 f0 ff <0f> 0b 8b 04 24 eb b1 89 c6 48 c7 c7 60 fa 0f 92 89 04 24 e8 cc 94 RSP: 0018:ffff9e6ec154bd98 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffff930f7b00 RCX: 0000000000000001 RDX: 0000000080000001 RSI: ffffffff921461c5 RDI: 00000000ffffffff RBP: ffff89c504286da8 R08: 0000000000000000 R09: c0000000fffeffff R10: 0000000000000000 R11: ffff9e6ec154bc28 R12: ffff89c502394e40 R13: ffff89c502394c00 R14: ffff9e6ec154bc00 R15: 0000000000000000 FS: 00007fe800398740(0000) GS:ffff89c812d80000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 000000c00057f010 CR3: 0000000103b54006 CR4: 00000000007706e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: __disable_kprobe (kernel/kprobes.c:1716) disable_kprobe (kernel/kprobes.c:2392) __disable_trace_kprobe (kernel/trace/trace_kprobe.c:340) disable_trace_kprobe (kernel/trace/trace_kprobe.c:429) perf_trace_event_unreg.isra.2 (./include/linux/tracepoint.h:93 kernel/trace/trace_event_perf.c:168) perf_kprobe_destroy (kernel/trace/trace_event_perf.c:295) _free_event (kernel/events/core.c:4971) perf_event_release_kernel (kernel/events/core.c:5176) perf_release (kernel/events/core.c:5186) __fput (fs/file_table.c:321) task_work_run (./include/linux/sched.h:2056 (discriminator 1) kernel/task_work.c:179 (discriminator 1)) exit_to_user_mode_prepare (./include/linux/resume_user_mode.h:49 kernel/entry/common.c:169 kernel/entry/common.c:201) syscall_exit_to_user_mode (./arch/x86/include/asm/jump_label.h:55 ./arch/x86/include/asm/nospec-branch.h:384 ./arch/x86/include/asm/entry-common.h:94 kernel/entry/common.c:133 kernel/entry/common.c:296) do_syscall_64 (arch/x86/entry/common.c:87) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) RIP: 0033:0x7fe7ff210654 Code: 15 79 89 20 00 f7 d8 64 89 02 48 c7 c0 ff ff ff ff eb be 0f 1f 00 8b 05 9a cd 20 00 48 63 ff 85 c0 75 11 b8 03 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 3a f3 c3 48 83 ec 18 48 89 7c 24 08 e8 34 fc RSP: 002b:00007ffdbd1d3538 EFLAGS: 00000246 ORIG_RAX: 0000000000000003 RAX: 0000000000000000 RBX: 0000000000000008 RCX: 00007fe7ff210654 RDX: 0000000000000000 RSI: 0000000000002401 RDI: 0000000000000008 RBP: 0000000000000000 R08: 94ae31d6fda838a4 R0900007fe8001c9d30 R10: 00007ffdbd1d34b0 R11: 0000000000000246 R12: 00007ffdbd1d3600 R13: 0000000000000000 R14: fffffffffffffffc R15: 00007ffdbd1d3560 Link: https://lkml.kernel.org/r/20220813020509.90805-1-kuniyu@amazon.com Fixes: 69d54b916d83 ("kprobes: makes kprobes/enabled works correctly for optimized kprobes.") Signed-off-by: Kuniyuki Iwashima Reported-by: Ayushman Dutta Cc: "Naveen N. Rao" Cc: Anil S Keshavamurthy Cc: "David S. Miller" Cc: Masami Hiramatsu Cc: Wang Nan Cc: Kuniyuki Iwashima Cc: Kuniyuki Iwashima Cc: Ayushman Dutta Cc: Signed-off-by: Andrew Morton --- kernel/kprobes.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 80697e5e03e4..08350e35aba2 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -1707,11 +1707,12 @@ static struct kprobe *__disable_kprobe(struct kprobe *p) /* Try to disarm and disable this/parent probe */ if (p == orig_p || aggr_kprobe_disabled(orig_p)) { /* - * If 'kprobes_all_disarmed' is set, 'orig_p' - * should have already been disarmed, so - * skip unneed disarming process. + * Don't be lazy here. Even if 'kprobes_all_disarmed' + * is false, 'orig_p' might not have been armed yet. + * Note arm_all_kprobes() __tries__ to arm all kprobes + * on the best effort basis. */ - if (!kprobes_all_disarmed) { + if (!kprobes_all_disarmed && !kprobe_disabled(orig_p)) { ret = disarm_kprobe(orig_p, true); if (ret) { p->flags &= ~KPROBE_FLAG_DISABLED; From 7ae1f5508d9a33fd58ed3059bd2d569961e3b8bd Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sat, 20 Aug 2022 17:59:17 +0200 Subject: [PATCH 30/39] parisc: Fix exception handler for fldw and fstw instructions The exception handler is broken for unaligned memory acceses with fldw and fstw instructions, because it trashes or uses randomly some other floating point register than the one specified in the instruction word on loads and stores. The instruction "fldw 0(addr),%fr22L" (and the other fldw/fstw instructions) encode the target register (%fr22) in the rightmost 5 bits of the instruction word. The 7th rightmost bit of the instruction word defines if the left or right half of %fr22 should be used. While processing unaligned address accesses, the FR3() define is used to extract the offset into the local floating-point register set. But the calculation in FR3() was buggy, so that for example instead of %fr22, register %fr12 [((22 * 2) & 0x1f) = 12] was used. This bug has been since forever in the parisc kernel and I wonder why it wasn't detected earlier. Interestingly I noticed this bug just because the libime debian package failed to build on *native* hardware, while it successfully built in qemu. This patch corrects the bitshift and masking calculation in FR3(). Signed-off-by: Helge Deller Cc: --- arch/parisc/kernel/unaligned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/parisc/kernel/unaligned.c b/arch/parisc/kernel/unaligned.c index bac581b5ecfc..e8a4d77cff53 100644 --- a/arch/parisc/kernel/unaligned.c +++ b/arch/parisc/kernel/unaligned.c @@ -93,7 +93,7 @@ #define R1(i) (((i)>>21)&0x1f) #define R2(i) (((i)>>16)&0x1f) #define R3(i) ((i)&0x1f) -#define FR3(i) ((((i)<<1)&0x1f)|(((i)>>6)&1)) +#define FR3(i) ((((i)&0x1f)<<1)|(((i)>>6)&1)) #define IM(i,n) (((i)>>1&((1<<(n-1))-1))|((i)&1?((0-1L)<<(n-1)):0)) #define IM5_2(i) IM((i)>>16,5) #define IM5_3(i) IM((i),5) From 3dcfb729b5f4a0c9b50742865cd5e6c4dbcc80dc Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Fri, 19 Aug 2022 19:30:50 +0200 Subject: [PATCH 31/39] parisc: Make CONFIG_64BIT available for ARCH=parisc64 only With this patch the ARCH= parameter decides if the CONFIG_64BIT option will be set or not. This means, the ARCH= parameter will give: ARCH=parisc -> 32-bit kernel ARCH=parisc64 -> 64-bit kernel This simplifies the usage of the other config options like randconfig, allmodconfig and allyesconfig a lot and produces the output which is expected for parisc64 (64-bit) vs. parisc (32-bit). Suggested-by: Masahiro Yamada Signed-off-by: Helge Deller Tested-by: Randy Dunlap Reviewed-by: Randy Dunlap Cc: # 5.15+ --- arch/parisc/Kconfig | 21 ++++++--------------- 1 file changed, 6 insertions(+), 15 deletions(-) diff --git a/arch/parisc/Kconfig b/arch/parisc/Kconfig index 7f059cd1196a..9aede2447011 100644 --- a/arch/parisc/Kconfig +++ b/arch/parisc/Kconfig @@ -146,10 +146,10 @@ menu "Processor type and features" choice prompt "Processor type" - default PA7000 + default PA7000 if "$(ARCH)" = "parisc" config PA7000 - bool "PA7000/PA7100" + bool "PA7000/PA7100" if "$(ARCH)" = "parisc" help This is the processor type of your CPU. This information is used for optimizing purposes. In order to compile a kernel @@ -160,21 +160,21 @@ config PA7000 which is required on some machines. config PA7100LC - bool "PA7100LC" + bool "PA7100LC" if "$(ARCH)" = "parisc" help Select this option for the PCX-L processor, as used in the 712, 715/64, 715/80, 715/100, 715/100XC, 725/100, 743, 748, D200, D210, D300, D310 and E-class config PA7200 - bool "PA7200" + bool "PA7200" if "$(ARCH)" = "parisc" help Select this option for the PCX-T' processor, as used in the C100, C110, J100, J110, J210XC, D250, D260, D350, D360, K100, K200, K210, K220, K400, K410 and K420 config PA7300LC - bool "PA7300LC" + bool "PA7300LC" if "$(ARCH)" = "parisc" help Select this option for the PCX-L2 processor, as used in the 744, A180, B132L, B160L, B180L, C132L, C160L, C180L, @@ -224,17 +224,8 @@ config MLONGCALLS Enabling this option will probably slow down your kernel. config 64BIT - bool "64-bit kernel" + def_bool "$(ARCH)" = "parisc64" depends on PA8X00 - help - Enable this if you want to support 64bit kernel on PA-RISC platform. - - At the moment, only people willing to use more than 2GB of RAM, - or having a 64bit-only capable PA-RISC machine should say Y here. - - Since there is no 64bit userland on PA-RISC, there is no point to - enable this option otherwise. The 64bit kernel is significantly bigger - and slower than the 32bit one. choice prompt "Kernel page size" From b4b18f47f4f9682fbf5827682645da7c8dde8f80 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 21 Aug 2022 08:12:19 +0200 Subject: [PATCH 32/39] Revert "parisc: Show error if wrong 32/64-bit compiler is being used" This reverts commit b160628e9ebcdc85d0db9d7f423c26b3c7c179d0. There is no need any longer to have this sanity check, because the previous commit ("parisc: Make CONFIG_64BIT available for ARCH=parisc64 only") prevents that CONFIG_64BIT is set if ARCH==parisc. Signed-off-by: Helge Deller --- arch/parisc/include/asm/bitops.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/parisc/include/asm/bitops.h b/arch/parisc/include/asm/bitops.h index 56ffd260c669..0ec9cfc5131f 100644 --- a/arch/parisc/include/asm/bitops.h +++ b/arch/parisc/include/asm/bitops.h @@ -12,14 +12,6 @@ #include #include -/* compiler build environment sanity checks: */ -#if !defined(CONFIG_64BIT) && defined(__LP64__) -#error "Please use 'ARCH=parisc' to build the 32-bit kernel." -#endif -#if defined(CONFIG_64BIT) && !defined(__LP64__) -#error "Please use 'ARCH=parisc64' to build the 64-bit kernel." -#endif - /* See http://marc.theaimsgroup.com/?t=108826637900003 for discussion * on use of volatile and __*_bit() (set/clear/change): * *_bit() want use of volatile. From db4538ad4db280a2114e9d507d47c9f1a9f2cfd8 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Thu, 11 Aug 2022 21:59:53 +0800 Subject: [PATCH 33/39] parisc: ccio-dma: Fix typo in comment The double `was' is duplicated in the comment, remove one. Signed-off-by: Jason Wang Signed-off-by: Helge Deller --- drivers/parisc/ccio-dma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 9be007c9420f..e863eb648379 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -268,7 +268,7 @@ static int ioc_count; * Each bit can represent a number of pages. * LSbs represent lower addresses (IOVA's). * -* This was was copied from sba_iommu.c. Don't try to unify +* This was copied from sba_iommu.c. Don't try to unify * the two resource managers unless a way to have different * allocation policies is also adjusted. We'd like to avoid * I/O TLB thrashing by having resource allocation policy From 4cb2643667c26fc976a4941b92f3770da9ec06a0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:00:55 +0200 Subject: [PATCH 34/39] parisc: led: Move from strlcpy with unused retval to strscpy Follow the advice of the below link and prefer 'strscpy' in this subsystem. Conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Helge Deller --- drivers/parisc/led.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parisc/led.c b/drivers/parisc/led.c index 1e4a5663d011..d4be9d2ee74d 100644 --- a/drivers/parisc/led.c +++ b/drivers/parisc/led.c @@ -646,7 +646,7 @@ int lcd_print( const char *str ) cancel_delayed_work_sync(&led_task); /* copy display string to buffer for procfs */ - strlcpy(lcd_text, str, sizeof(lcd_text)); + strscpy(lcd_text, str, sizeof(lcd_text)); /* Set LCD Cursor to 1st character */ gsc_writeb(lcd_info.reset_cmd1, LCD_CMD_REG); From d46c742f827fa2326ab1f4faa1cccadb56912341 Mon Sep 17 00:00:00 2001 From: Li Qiong Date: Fri, 19 Aug 2022 12:15:10 +0800 Subject: [PATCH 35/39] parisc: ccio-dma: Handle kmalloc failure in ccio_init_resources() As the possible failure of the kmalloc(), it should be better to fix this error path, check and return '-ENOMEM' error code. Signed-off-by: Li Qiong Signed-off-by: Helge Deller --- drivers/parisc/ccio-dma.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index e863eb648379..f223afe47d10 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -1380,15 +1380,17 @@ ccio_init_resource(struct resource *res, char *name, void __iomem *ioaddr) } } -static void __init ccio_init_resources(struct ioc *ioc) +static int __init ccio_init_resources(struct ioc *ioc) { struct resource *res = ioc->mmio_region; char *name = kmalloc(14, GFP_KERNEL); - + if (unlikely(!name)) + return -ENOMEM; snprintf(name, 14, "GSC Bus [%d/]", ioc->hw_path); ccio_init_resource(res, name, &ioc->ioc_regs->io_io_low); ccio_init_resource(res + 1, name, &ioc->ioc_regs->io_io_low_hv); + return 0; } static int new_ioc_area(struct resource *res, unsigned long size, @@ -1543,7 +1545,10 @@ static int __init ccio_probe(struct parisc_device *dev) return -ENOMEM; } ccio_ioc_init(ioc); - ccio_init_resources(ioc); + if (ccio_init_resources(ioc)) { + kfree(ioc); + return -ENOMEM; + } hppa_dma_ops = &ccio_ops; hba = kzalloc(sizeof(*hba), GFP_KERNEL); From 591d2108f3abc4db9f9073cae37cf3591fd250d6 Mon Sep 17 00:00:00 2001 From: Helge Deller Date: Sun, 21 Aug 2022 14:49:58 +0200 Subject: [PATCH 36/39] parisc: Add runtime check to prevent PA2.0 kernels on PA1.x machines If a 32-bit kernel was compiled for PA2.0 CPUs, it won't be able to run on machines with PA1.x CPUs. Add a check and bail out early if a PA1.x machine is detected. Signed-off-by: Helge Deller --- arch/parisc/kernel/head.S | 43 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 42 insertions(+), 1 deletion(-) diff --git a/arch/parisc/kernel/head.S b/arch/parisc/kernel/head.S index e0a9e9657622..fd15fd4bbb61 100644 --- a/arch/parisc/kernel/head.S +++ b/arch/parisc/kernel/head.S @@ -22,7 +22,7 @@ #include #include - .level PA_ASM_LEVEL + .level 1.1 __INITDATA ENTRY(boot_args) @@ -70,6 +70,47 @@ $bss_loop: stw,ma %arg2,4(%r1) stw,ma %arg3,4(%r1) +#if !defined(CONFIG_64BIT) && defined(CONFIG_PA20) + /* This 32-bit kernel was compiled for PA2.0 CPUs. Check current CPU + * and halt kernel if we detect a PA1.x CPU. */ + ldi 32,%r10 + mtctl %r10,%cr11 + .level 2.0 + mfctl,w %cr11,%r10 + .level 1.1 + comib,<>,n 0,%r10,$cpu_ok + + load32 PA(msg1),%arg0 + ldi msg1_end-msg1,%arg1 +$iodc_panic: + copy %arg0, %r10 + copy %arg1, %r11 + load32 PA(init_stack),%sp +#define MEM_CONS 0x3A0 + ldw MEM_CONS+32(%r0),%arg0 // HPA + ldi ENTRY_IO_COUT,%arg1 + ldw MEM_CONS+36(%r0),%arg2 // SPA + ldw MEM_CONS+8(%r0),%arg3 // layers + load32 PA(__bss_start),%r1 + stw %r1,-52(%sp) // arg4 + stw %r0,-56(%sp) // arg5 + stw %r10,-60(%sp) // arg6 = ptr to text + stw %r11,-64(%sp) // arg7 = len + stw %r0,-68(%sp) // arg8 + load32 PA(.iodc_panic_ret), %rp + ldw MEM_CONS+40(%r0),%r1 // ENTRY_IODC + bv,n (%r1) +.iodc_panic_ret: + b . /* wait endless with ... */ + or %r10,%r10,%r10 /* qemu idle sleep */ +msg1: .ascii "Can't boot kernel which was built for PA8x00 CPUs on this machine.\r\n" +msg1_end: + +$cpu_ok: +#endif + + .level PA_ASM_LEVEL + /* Initialize startup VM. Just map first 16/32 MB of memory */ load32 PA(swapper_pg_dir),%r4 mtctl %r4,%cr24 /* Initialize kernel root pointer */ From 0c3bc7899e6dfb52df1c46118a5a670ae619645f Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 20 Jul 2022 14:32:52 +0200 Subject: [PATCH 37/39] ntfs: fix acl handling While looking at our current POSIX ACL handling in the context of some overlayfs work I went through a range of other filesystems checking how they handle them currently and encountered ntfs3. The posic_acl_{from,to}_xattr() helpers always need to operate on the filesystem idmapping. Since ntfs3 can only be mounted in the initial user namespace the relevant idmapping is init_user_ns. The posix_acl_{from,to}_xattr() helpers are concerned with translating between the kernel internal struct posix_acl{_entry} and the uapi struct posix_acl_xattr_{header,entry} and the kernel internal data structure is cached filesystem wide. Additional idmappings such as the caller's idmapping or the mount's idmapping are handled higher up in the VFS. Individual filesystems usually do not need to concern themselves with these. The posix_acl_valid() helper is concerned with checking whether the values in the kernel internal struct posix_acl can be represented in the filesystem's idmapping. IOW, if they can be written to disk. So this helper too needs to take the filesystem's idmapping. Fixes: be71b5cba2e6 ("fs/ntfs3: Add attrib operations") Cc: Konstantin Komarov Cc: ntfs3@lists.linux.dev Signed-off-by: Christian Brauner (Microsoft) --- fs/ntfs3/xattr.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/ntfs3/xattr.c b/fs/ntfs3/xattr.c index 5e0e0280e70d..3e9118705174 100644 --- a/fs/ntfs3/xattr.c +++ b/fs/ntfs3/xattr.c @@ -478,8 +478,7 @@ out: } #ifdef CONFIG_NTFS3_FS_POSIX_ACL -static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, - struct inode *inode, int type, +static struct posix_acl *ntfs_get_acl_ex(struct inode *inode, int type, int locked) { struct ntfs_inode *ni = ntfs_i(inode); @@ -514,7 +513,7 @@ static struct posix_acl *ntfs_get_acl_ex(struct user_namespace *mnt_userns, /* Translate extended attribute to acl. */ if (err >= 0) { - acl = posix_acl_from_xattr(mnt_userns, buf, err); + acl = posix_acl_from_xattr(&init_user_ns, buf, err); } else if (err == -ENODATA) { acl = NULL; } else { @@ -537,8 +536,7 @@ struct posix_acl *ntfs_get_acl(struct inode *inode, int type, bool rcu) if (rcu) return ERR_PTR(-ECHILD); - /* TODO: init_user_ns? */ - return ntfs_get_acl_ex(&init_user_ns, inode, type, 0); + return ntfs_get_acl_ex(inode, type, 0); } static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, @@ -595,7 +593,7 @@ static noinline int ntfs_set_acl_ex(struct user_namespace *mnt_userns, value = kmalloc(size, GFP_NOFS); if (!value) return -ENOMEM; - err = posix_acl_to_xattr(mnt_userns, acl, value, size); + err = posix_acl_to_xattr(&init_user_ns, acl, value, size); if (err < 0) goto out; flags = 0; @@ -641,7 +639,7 @@ static int ntfs_xattr_get_acl(struct user_namespace *mnt_userns, if (!acl) return -ENODATA; - err = posix_acl_to_xattr(mnt_userns, acl, buffer, size); + err = posix_acl_to_xattr(&init_user_ns, acl, buffer, size); posix_acl_release(acl); return err; @@ -665,12 +663,12 @@ static int ntfs_xattr_set_acl(struct user_namespace *mnt_userns, if (!value) { acl = NULL; } else { - acl = posix_acl_from_xattr(mnt_userns, value, size); + acl = posix_acl_from_xattr(&init_user_ns, value, size); if (IS_ERR(acl)) return PTR_ERR(acl); if (acl) { - err = posix_acl_valid(mnt_userns, acl); + err = posix_acl_valid(&init_user_ns, acl); if (err) goto release_and_out; } From cfd2b5c1106fa20254d9f24970232cdf24860005 Mon Sep 17 00:00:00 2001 From: Yang Jihong Date: Mon, 22 Aug 2022 17:25:57 +0800 Subject: [PATCH 38/39] perf tools: Fix compile error for x86 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit a0a12c3ed057 ("asm goto: eradicate CC_HAS_ASM_GOTO") eradicates CC_HAS_ASM_GOTO, and in the process also causes the perf tool on x86 to use asm_volatile_goto when compiling __GEN_RMWcc. However, asm_volatile_goto is not declared in the perf tool headers, which causes a compilation error: In file included from tools/arch/x86/include/asm/atomic.h:7, from tools/include/asm/atomic.h:6, from tools/include/linux/atomic.h:5, from tools/include/linux/refcount.h:41, from tools/lib/perf/include/internal/cpumap.h:5, from tools/perf/util/cpumap.h:7, from tools/perf/util/env.h:7, from tools/perf/util/header.h:12, from pmu-events/pmu-events.c:9: tools/arch/x86/include/asm/atomic.h: In function ‘atomic_dec_and_test’: tools/arch/x86/include/asm/rmwcc.h:7:2: error: implicit declaration of function ‘asm_volatile_goto’ [-Werror=implicit-function-declaration] asm_volatile_goto (fullop "; j" cc " %l[cc_label]" \ ^~~~~~~~~~~~~~~~~ Define asm_volatile_goto in compiler_types.h if not declared, like the main kernel header files do. Fixes: a0a12c3ed057 ("asm goto: eradicate CC_HAS_ASM_GOTO") Signed-off-by: Yang Jihong Tested-by: Arnaldo Carvalho de Melo Tested-by: Ingo Molnar Signed-off-by: Linus Torvalds --- tools/include/linux/compiler_types.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/include/linux/compiler_types.h b/tools/include/linux/compiler_types.h index 24ae3054f304..1bdd834bdd57 100644 --- a/tools/include/linux/compiler_types.h +++ b/tools/include/linux/compiler_types.h @@ -36,4 +36,8 @@ #include #endif +#ifndef asm_volatile_goto +#define asm_volatile_goto(x...) asm goto(x) +#endif + #endif /* __LINUX_COMPILER_TYPES_H */ From ad982c3be4e60c7d39c03f782733503cbd88fd2a Mon Sep 17 00:00:00 2001 From: Gaosheng Cui Date: Mon, 22 Aug 2022 10:29:05 +0800 Subject: [PATCH 39/39] audit: fix potential double free on error path from fsnotify_add_inode_mark Audit_alloc_mark() assign pathname to audit_mark->path, on error path from fsnotify_add_inode_mark(), fsnotify_put_mark will free memory of audit_mark->path, but the caller of audit_alloc_mark will free the pathname again, so there will be double free problem. Fix this by resetting audit_mark->path to NULL pointer on error path from fsnotify_add_inode_mark(). Cc: stable@vger.kernel.org Fixes: 7b1293234084d ("fsnotify: Add group pointer in fsnotify_init_mark()") Signed-off-by: Gaosheng Cui Reviewed-by: Jan Kara Signed-off-by: Paul Moore --- kernel/audit_fsnotify.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 6432a37ac1c9..c565fbf66ac8 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -102,6 +102,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa ret = fsnotify_add_inode_mark(&audit_mark->mark, inode, 0); if (ret < 0) { + audit_mark->path = NULL; fsnotify_put_mark(&audit_mark->mark); audit_mark = ERR_PTR(ret); }