From 7df140e84a75c89962feef659d686303d3ce75e5 Mon Sep 17 00:00:00 2001 From: Christian Marangi Date: Fri, 21 Oct 2022 18:53:04 +0200 Subject: [PATCH 01/14] mtd: rawnand: qcom: handle ret from parse with codeword_fixup With use_codeword_fixup enabled, any return from mtd_device_parse_register gets overwritten. Aside from the clear bug, this is also problematic as a parser can EPROBE_DEFER and because this is not correctly handled, the nand is never rescanned later in the bootup process. An example of this problem is when smem requires additional time to be probed and nandc use qcomsmempart as parser. Parser will return EPROBE_DEFER but in the current code this ret gets overwritten by qcom_nand_host_parse_boot_partitions and qcom_nand_host_init_and_register return 0. Correctly handle the return code from mtd_device_parse_register so that any error from this function is not ignored. Fixes: 862bdedd7f4b ("mtd: nand: raw: qcom_nandc: add support for unprotected spare data pages") Cc: stable@vger.kernel.org # v6.0+ Signed-off-by: Christian Marangi Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20221021165304.19991-1-ansuelsmth@gmail.com --- drivers/mtd/nand/raw/qcom_nandc.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/mtd/nand/raw/qcom_nandc.c b/drivers/mtd/nand/raw/qcom_nandc.c index 8f80019a9f01..198a44794d2d 100644 --- a/drivers/mtd/nand/raw/qcom_nandc.c +++ b/drivers/mtd/nand/raw/qcom_nandc.c @@ -3167,16 +3167,18 @@ static int qcom_nand_host_init_and_register(struct qcom_nand_controller *nandc, ret = mtd_device_parse_register(mtd, probes, NULL, NULL, 0); if (ret) - nand_cleanup(chip); + goto err; if (nandc->props->use_codeword_fixup) { ret = qcom_nand_host_parse_boot_partitions(nandc, host, dn); - if (ret) { - nand_cleanup(chip); - return ret; - } + if (ret) + goto err; } + return 0; + +err: + nand_cleanup(chip); return ret; } From 83f0869e9bf3333d778d62f055b0f8e1de1cc812 Mon Sep 17 00:00:00 2001 From: Adam Borowski Date: Mon, 24 Oct 2022 11:20:26 +0200 Subject: [PATCH 02/14] mtd: rawnand: placate "$VARIABLE is used uninitialized" warnings The compiler is not smart enough to notice that it's impossible for them to be actually used uninitialized. Which exact variables trip here varies depending on random surrounding code; none triggered in 6.1-rc1 but 6.1-rc2 fails on three of these five, despite variables declared in the very same line having identical flow. Signed-off-by: Adam Borowski Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20221024092026.42123-1-kilobyte@angband.pl --- drivers/mtd/nand/raw/nand_base.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mtd/nand/raw/nand_base.c b/drivers/mtd/nand/raw/nand_base.c index 33f2c98a030e..c3cc66039925 100644 --- a/drivers/mtd/nand/raw/nand_base.c +++ b/drivers/mtd/nand/raw/nand_base.c @@ -5834,7 +5834,7 @@ nand_match_ecc_req(struct nand_chip *chip, int req_step = requirements->step_size; int req_strength = requirements->strength; int req_corr, step_size, strength, nsteps, ecc_bytes, ecc_bytes_total; - int best_step, best_strength, best_ecc_bytes; + int best_step = 0, best_strength = 0, best_ecc_bytes = 0; int best_ecc_bytes_total = INT_MAX; int i, j; @@ -5915,7 +5915,7 @@ nand_maximize_ecc(struct nand_chip *chip, int step_size, strength, nsteps, ecc_bytes, corr; int best_corr = 0; int best_step = 0; - int best_strength, best_ecc_bytes; + int best_strength = 0, best_ecc_bytes = 0; int i, j; for (i = 0; i < caps->nstepinfos; i++) { From c717b9b7d6de9e024e47f7cd5bbff49f581d3db9 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 7 Nov 2022 10:15:20 +0100 Subject: [PATCH 03/14] mtd: onenand: omap2: add dependency on GPMC OMAP2 OneNAND driver uses gpmc_omap_onenand_set_timings() provided by OMAP_GPMC driver, so the latter cannot be module if OneNAND driver is built-in: /usr/bin/arm-linux-gnueabi-ld: drivers/mtd/nand/onenand/onenand_omap2.o: in function `omap2_onenand_probe': onenand_omap2.c:(.text+0x520): undefined reference to `gpmc_omap_onenand_set_timings' The OMAP_GPMC is also a runtime dependency. Reported-by: kernel test robot Fixes: 854fd9209b20 ("memory: omap-gpmc: Allow building as a module") Signed-off-by: Krzysztof Kozlowski Reviewed-by: Roger Quadros Signed-off-by: Miquel Raynal Link: https://lore.kernel.org/linux-mtd/20221107091520.127053-1-krzysztof.kozlowski@linaro.org --- drivers/mtd/nand/onenand/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mtd/nand/onenand/Kconfig b/drivers/mtd/nand/onenand/Kconfig index 34d9a7a82ad4..c94bf483541e 100644 --- a/drivers/mtd/nand/onenand/Kconfig +++ b/drivers/mtd/nand/onenand/Kconfig @@ -26,6 +26,7 @@ config MTD_ONENAND_OMAP2 tristate "OneNAND on OMAP2/OMAP3 support" depends on ARCH_OMAP2 || ARCH_OMAP3 || (COMPILE_TEST && ARM) depends on OF || COMPILE_TEST + depends on OMAP_GPMC help Support for a OneNAND flash device connected to an OMAP2/OMAP3 SoC via the GPMC memory controller. From e5126de138caef0eedb3d6431059c0c5581a1a5d Mon Sep 17 00:00:00 2001 From: Yue Hu Date: Fri, 21 Oct 2022 16:53:25 +0800 Subject: [PATCH 04/14] erofs: fix general protection fault when reading fragment As syzbot reported [1], the fragment feature sb flag is not set, so packed_inode != NULL needs to be checked in z_erofs_read_fragment(). [1] https://lore.kernel.org/all/0000000000002e7a8905eb841ddd@google.com/ Reported-by: syzbot+3faecbfd845a895c04cb@syzkaller.appspotmail.com Fixes: b15b2e307c3a ("erofs: support on-disk compressed fragments data") Signed-off-by: Yue Hu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221021085325.25788-1-zbestahu@gmail.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index c7f24fc7efd5..d6caf275be77 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -660,6 +660,9 @@ static int z_erofs_read_fragment(struct inode *inode, erofs_off_t pos, u8 *src, *dst; unsigned int i, cnt; + if (!packed_inode) + return -EFSCORRUPTED; + pos += EROFS_I(inode)->z_fragmentoff; for (i = 0; i < len; i += cnt) { cnt = min_t(unsigned int, len - i, From 75e43355cbe4d5948a79bd592f2ffecb9f75f75d Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Fri, 4 Nov 2022 13:40:27 +0800 Subject: [PATCH 05/14] erofs: put metabuf in error path in fscache mode For tail packing layout, put metabuf when error is encountered. Fixes: 1ae9470c3e14 ("erofs: clean up .read_folio() and .readahead() in fscache mode") Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Jia Zhu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221104054028.52208-2-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index fe05bc51f9f2..83559008bfa8 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -287,8 +287,10 @@ static int erofs_fscache_data_read(struct address_space *mapping, return PTR_ERR(src); iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, PAGE_SIZE); - if (copy_to_iter(src + offset, size, &iter) != size) + if (copy_to_iter(src + offset, size, &iter) != size) { + erofs_put_metabuf(&buf); return -EFAULT; + } iov_iter_zero(PAGE_SIZE - size, &iter); erofs_put_metabuf(&buf); return PAGE_SIZE; From e6d9f9ba111b56154f1b1120252aff269cebd49c Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Fri, 4 Nov 2022 13:40:28 +0800 Subject: [PATCH 06/14] erofs: get correct count for unmapped range in fscache mode For unmapped range, the returned map.m_llen is zero, and thus the calculated count is unexpected zero. Prior to the refactoring introduced by commit 1ae9470c3e14 ("erofs: clean up .read_folio() and .readahead() in fscache mode"), only the readahead routine suffers from this. With the refactoring of making .read_folio() and .readahead() calling one common routine, both read_folio and readahead have this issue now. Fix this by calculating count separately in unmapped condition. Fixes: c665b394b9e8 ("erofs: implement fscache-based data readahead") Fixes: 1ae9470c3e14 ("erofs: clean up .read_folio() and .readahead() in fscache mode") Signed-off-by: Jingbo Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221104054028.52208-3-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 83559008bfa8..260fa4737fc0 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -296,15 +296,16 @@ static int erofs_fscache_data_read(struct address_space *mapping, return PAGE_SIZE; } - count = min_t(size_t, map.m_llen - (pos - map.m_la), len); - DBG_BUGON(!count || count % PAGE_SIZE); - if (!(map.m_flags & EROFS_MAP_MAPPED)) { + count = len; iov_iter_xarray(&iter, READ, &mapping->i_pages, pos, count); iov_iter_zero(count, &iter); return count; } + count = min_t(size_t, map.m_llen - (pos - map.m_la), len); + DBG_BUGON(!count || count % PAGE_SIZE); + mdev = (struct erofs_map_dev) { .m_deviceid = map.m_deviceid, .m_pa = map.m_pa, From 39bfcb8138f6dc3375f23b1e62ccfc7c0d83295d Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Fri, 21 Oct 2022 10:31:53 +0800 Subject: [PATCH 07/14] erofs: fix use-after-free of fsid and domain_id string When erofs instance is remounted with fsid or domain_id mount option specified, the original fsid and domain_id string pointer in sbi->opt is directly overridden with the fsid and domain_id string in the new fs_context, without freeing the original fsid and domain_id string. What's worse, when the new fsid and domain_id string is transferred to sbi, they are not reset to NULL in fs_context, and thus they are freed when remount finishes, while sbi is still referring to these strings. Reconfiguration for fsid and domain_id seems unusual. Thus clarify this restriction explicitly and dump a warning when users are attempting to do this. Besides, to fix the use-after-free issue, move fsid and domain_id from erofs_mount_opts to outside. Fixes: c6be2bd0a5dd ("erofs: register fscache volume") Fixes: 8b7adf1dff3d ("erofs: introduce fscache-based domain") Signed-off-by: Jingbo Xu Reviewed-by: Jia Zhu Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20221021023153.1330-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 14 +++++++------- fs/erofs/internal.h | 6 ++++-- fs/erofs/super.c | 39 ++++++++++++++++++++++----------------- fs/erofs/sysfs.c | 8 ++++---- 4 files changed, 37 insertions(+), 30 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 260fa4737fc0..6eaf4a4ab95c 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -406,13 +406,13 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain) static int erofs_fscache_register_volume(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); - char *domain_id = sbi->opt.domain_id; + char *domain_id = sbi->domain_id; struct fscache_volume *volume; char *name; int ret = 0; name = kasprintf(GFP_KERNEL, "erofs,%s", - domain_id ? domain_id : sbi->opt.fsid); + domain_id ? domain_id : sbi->fsid); if (!name) return -ENOMEM; @@ -438,7 +438,7 @@ static int erofs_fscache_init_domain(struct super_block *sb) if (!domain) return -ENOMEM; - domain->domain_id = kstrdup(sbi->opt.domain_id, GFP_KERNEL); + domain->domain_id = kstrdup(sbi->domain_id, GFP_KERNEL); if (!domain->domain_id) { kfree(domain); return -ENOMEM; @@ -475,7 +475,7 @@ static int erofs_fscache_register_domain(struct super_block *sb) mutex_lock(&erofs_domain_list_lock); list_for_each_entry(domain, &erofs_domain_list, list) { - if (!strcmp(domain->domain_id, sbi->opt.domain_id)) { + if (!strcmp(domain->domain_id, sbi->domain_id)) { sbi->domain = domain; sbi->volume = domain->volume; refcount_inc(&domain->ref); @@ -612,7 +612,7 @@ struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, char *name, bool need_inode) { - if (EROFS_SB(sb)->opt.domain_id) + if (EROFS_SB(sb)->domain_id) return erofs_domain_register_cookie(sb, name, need_inode); return erofs_fscache_acquire_cookie(sb, name, need_inode); } @@ -644,7 +644,7 @@ int erofs_fscache_register_fs(struct super_block *sb) struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; - if (sbi->opt.domain_id) + if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); else ret = erofs_fscache_register_volume(sb); @@ -652,7 +652,7 @@ int erofs_fscache_register_fs(struct super_block *sb) return ret; /* acquired domain/volume will be relinquished in kill_sb() on error */ - fscache = erofs_fscache_register_cookie(sb, sbi->opt.fsid, true); + fscache = erofs_fscache_register_cookie(sb, sbi->fsid, true); if (IS_ERR(fscache)) return PTR_ERR(fscache); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 1701df48c446..05dc68627722 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -75,8 +75,6 @@ struct erofs_mount_opts { unsigned int max_sync_decompress_pages; #endif unsigned int mount_opt; - char *fsid; - char *domain_id; }; struct erofs_dev_context { @@ -89,6 +87,8 @@ struct erofs_dev_context { struct erofs_fs_context { struct erofs_mount_opts opt; struct erofs_dev_context *devs; + char *fsid; + char *domain_id; }; /* all filesystem-wide lz4 configurations */ @@ -170,6 +170,8 @@ struct erofs_sb_info { struct fscache_volume *volume; struct erofs_fscache *s_fscache; struct erofs_domain *domain; + char *fsid; + char *domain_id; }; #define EROFS_SB(sb) ((struct erofs_sb_info *)(sb)->s_fs_info) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 2cf96ce1c32e..1c7dcca702b3 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -579,9 +579,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, break; case Opt_fsid: #ifdef CONFIG_EROFS_FS_ONDEMAND - kfree(ctx->opt.fsid); - ctx->opt.fsid = kstrdup(param->string, GFP_KERNEL); - if (!ctx->opt.fsid) + kfree(ctx->fsid); + ctx->fsid = kstrdup(param->string, GFP_KERNEL); + if (!ctx->fsid) return -ENOMEM; #else errorfc(fc, "fsid option not supported"); @@ -589,9 +589,9 @@ static int erofs_fc_parse_param(struct fs_context *fc, break; case Opt_domain_id: #ifdef CONFIG_EROFS_FS_ONDEMAND - kfree(ctx->opt.domain_id); - ctx->opt.domain_id = kstrdup(param->string, GFP_KERNEL); - if (!ctx->opt.domain_id) + kfree(ctx->domain_id); + ctx->domain_id = kstrdup(param->string, GFP_KERNEL); + if (!ctx->domain_id) return -ENOMEM; #else errorfc(fc, "domain_id option not supported"); @@ -728,10 +728,12 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_fs_info = sbi; sbi->opt = ctx->opt; - ctx->opt.fsid = NULL; - ctx->opt.domain_id = NULL; sbi->devs = ctx->devs; ctx->devs = NULL; + sbi->fsid = ctx->fsid; + ctx->fsid = NULL; + sbi->domain_id = ctx->domain_id; + ctx->domain_id = NULL; if (erofs_is_fscache_mode(sb)) { sb->s_blocksize = EROFS_BLKSIZ; @@ -820,7 +822,7 @@ static int erofs_fc_get_tree(struct fs_context *fc) { struct erofs_fs_context *ctx = fc->fs_private; - if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->opt.fsid) + if (IS_ENABLED(CONFIG_EROFS_FS_ONDEMAND) && ctx->fsid) return get_tree_nodev(fc, erofs_fc_fill_super); return get_tree_bdev(fc, erofs_fc_fill_super); @@ -834,6 +836,9 @@ static int erofs_fc_reconfigure(struct fs_context *fc) DBG_BUGON(!sb_rdonly(sb)); + if (ctx->fsid || ctx->domain_id) + erofs_info(sb, "ignoring reconfiguration for fsid|domain_id."); + if (test_opt(&ctx->opt, POSIX_ACL)) fc->sb_flags |= SB_POSIXACL; else @@ -873,8 +878,8 @@ static void erofs_fc_free(struct fs_context *fc) struct erofs_fs_context *ctx = fc->fs_private; erofs_free_dev_context(ctx->devs); - kfree(ctx->opt.fsid); - kfree(ctx->opt.domain_id); + kfree(ctx->fsid); + kfree(ctx->domain_id); kfree(ctx); } @@ -944,8 +949,8 @@ static void erofs_kill_sb(struct super_block *sb) erofs_free_dev_context(sbi->devs); fs_put_dax(sbi->dax_dev, NULL); erofs_fscache_unregister_fs(sb); - kfree(sbi->opt.fsid); - kfree(sbi->opt.domain_id); + kfree(sbi->fsid); + kfree(sbi->domain_id); kfree(sbi); sb->s_fs_info = NULL; } @@ -1098,10 +1103,10 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(opt, DAX_NEVER)) seq_puts(seq, ",dax=never"); #ifdef CONFIG_EROFS_FS_ONDEMAND - if (opt->fsid) - seq_printf(seq, ",fsid=%s", opt->fsid); - if (opt->domain_id) - seq_printf(seq, ",domain_id=%s", opt->domain_id); + if (sbi->fsid) + seq_printf(seq, ",fsid=%s", sbi->fsid); + if (sbi->domain_id) + seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif return 0; } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 783bb7b21b51..fd476961f742 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -210,14 +210,14 @@ int erofs_register_sysfs(struct super_block *sb) int err; if (erofs_is_fscache_mode(sb)) { - if (sbi->opt.domain_id) { - str = kasprintf(GFP_KERNEL, "%s,%s", sbi->opt.domain_id, - sbi->opt.fsid); + if (sbi->domain_id) { + str = kasprintf(GFP_KERNEL, "%s,%s", sbi->domain_id, + sbi->fsid); if (!str) return -ENOMEM; name = str; } else { - name = sbi->opt.fsid; + name = sbi->fsid; } } else { name = sb->s_id; From 7fdba0011157861892c470995ff586a1871e603f Mon Sep 17 00:00:00 2001 From: Anthony DeRossi Date: Wed, 9 Nov 2022 17:40:25 -0800 Subject: [PATCH 08/14] vfio: Fix container device registration life cycle In vfio_device_open(), vfio_device_container_register() is always called when open_count == 1. On error, vfio_device_container_unregister() is only called when open_count == 1 and close_device is set. This leaks a registration for devices without a close_device implementation. In vfio_device_fops_release(), vfio_device_container_unregister() is called unconditionally. This can cause a device to be unregistered multiple times. Treating container device registration/unregistration uniformly (always when open_count == 1) fixes both issues. Fixes: ce4b4657ff18 ("vfio: Replace the DMA unmapping notifier with a callback") Signed-off-by: Anthony DeRossi Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20221110014027.28780-2-ajderossi@gmail.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 2d168793d4e1..9a4af880e941 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -801,8 +801,9 @@ static struct file *vfio_device_open(struct vfio_device *device) err_close_device: mutex_lock(&device->dev_set->lock); mutex_lock(&device->group->group_lock); - if (device->open_count == 1 && device->ops->close_device) { - device->ops->close_device(device); + if (device->open_count == 1) { + if (device->ops->close_device) + device->ops->close_device(device); vfio_device_container_unregister(device); } @@ -1017,10 +1018,12 @@ static int vfio_device_fops_release(struct inode *inode, struct file *filep) mutex_lock(&device->dev_set->lock); vfio_assert_device_open(device); mutex_lock(&device->group->group_lock); - if (device->open_count == 1 && device->ops->close_device) - device->ops->close_device(device); + if (device->open_count == 1) { + if (device->ops->close_device) + device->ops->close_device(device); - vfio_device_container_unregister(device); + vfio_device_container_unregister(device); + } mutex_unlock(&device->group->group_lock); device->open_count--; if (device->open_count == 0) From 5cd189e410debedda416fecfc12f4716b5829845 Mon Sep 17 00:00:00 2001 From: Anthony DeRossi Date: Wed, 9 Nov 2022 17:40:26 -0800 Subject: [PATCH 09/14] vfio: Export the device set open count The open count of a device set is the sum of the open counts of all devices in the set. Drivers can use this value to determine whether shared resources are in use without tracking them manually or accessing the private open_count in vfio_device. Signed-off-by: Anthony DeRossi Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Reviewed-by: Yi Liu Link: https://lore.kernel.org/r/20221110014027.28780-3-ajderossi@gmail.com Signed-off-by: Alex Williamson --- drivers/vfio/vfio_main.c | 13 +++++++++++++ include/linux/vfio.h | 1 + 2 files changed, 14 insertions(+) diff --git a/drivers/vfio/vfio_main.c b/drivers/vfio/vfio_main.c index 9a4af880e941..6e8804fe0095 100644 --- a/drivers/vfio/vfio_main.c +++ b/drivers/vfio/vfio_main.c @@ -125,6 +125,19 @@ static void vfio_release_device_set(struct vfio_device *device) xa_unlock(&vfio_device_set_xa); } +unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set) +{ + struct vfio_device *cur; + unsigned int open_count = 0; + + lockdep_assert_held(&dev_set->lock); + + list_for_each_entry(cur, &dev_set->device_list, dev_set_list) + open_count += cur->open_count; + return open_count; +} +EXPORT_SYMBOL_GPL(vfio_device_set_open_count); + /* * Group objects - create, release, get, put, search */ diff --git a/include/linux/vfio.h b/include/linux/vfio.h index e7cebeb875dd..fdd393f70b19 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -189,6 +189,7 @@ int vfio_register_emulated_iommu_dev(struct vfio_device *device); void vfio_unregister_group_dev(struct vfio_device *device); int vfio_assign_device_set(struct vfio_device *device, void *set_id); +unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set); int vfio_mig_get_next_state(struct vfio_device *device, enum vfio_device_mig_state cur_fsm, From e806e223621e4f5105170df69d7311dc3fb4bbb4 Mon Sep 17 00:00:00 2001 From: Anthony DeRossi Date: Wed, 9 Nov 2022 17:40:27 -0800 Subject: [PATCH 10/14] vfio/pci: Check the device set open count on reset vfio_pci_dev_set_needs_reset() inspects the open_count of every device in the set to determine whether a reset is allowed. The current device always has open_count == 1 within vfio_pci_core_disable(), effectively disabling the reset logic. This field is also documented as private in vfio_device, so it should not be used to determine whether other devices in the set are open. Checking for vfio_device_set_open_count() > 1 on the device set fixes both issues. After commit 2cd8b14aaa66 ("vfio/pci: Move to the device set infrastructure"), failure to create a new file for a device would cause the reset to be skipped due to open_count being decremented after calling close_device() in the error path. After commit eadd86f835c6 ("vfio: Remove calls to vfio_group_add_container_user()"), releasing a device would always skip the reset due to an ordering change in vfio_device_fops_release(). Failing to reset the device leaves it in an unknown state, potentially causing errors when it is accessed later or bound to a different driver. This issue was observed with a Radeon RX Vega 56 [1002:687f] (rev c3) assigned to a Windows guest. After shutting down the guest, unbinding the device from vfio-pci, and binding the device to amdgpu: [ 548.007102] [drm:psp_hw_start [amdgpu]] *ERROR* PSP create ring failed! [ 548.027174] [drm:psp_hw_init [amdgpu]] *ERROR* PSP firmware loading failed [ 548.027242] [drm:amdgpu_device_fw_loading [amdgpu]] *ERROR* hw_init of IP block failed -22 [ 548.027306] amdgpu 0000:0a:00.0: amdgpu: amdgpu_device_ip_init failed [ 548.027308] amdgpu 0000:0a:00.0: amdgpu: Fatal error during GPU init Fixes: 2cd8b14aaa66 ("vfio/pci: Move to the device set infrastructure") Fixes: eadd86f835c6 ("vfio: Remove calls to vfio_group_add_container_user()") Signed-off-by: Anthony DeRossi Reviewed-by: Jason Gunthorpe Reviewed-by: Kevin Tian Link: https://lore.kernel.org/r/20221110014027.28780-4-ajderossi@gmail.com Signed-off-by: Alex Williamson --- drivers/vfio/pci/vfio_pci_core.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/vfio/pci/vfio_pci_core.c b/drivers/vfio/pci/vfio_pci_core.c index badc9d828cac..e030c2120183 100644 --- a/drivers/vfio/pci/vfio_pci_core.c +++ b/drivers/vfio/pci/vfio_pci_core.c @@ -2488,12 +2488,12 @@ static bool vfio_pci_dev_set_needs_reset(struct vfio_device_set *dev_set) struct vfio_pci_core_device *cur; bool needs_reset = false; - list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) { - /* No VFIO device in the set can have an open device FD */ - if (cur->vdev.open_count) - return false; + /* No other VFIO device in the set can be open. */ + if (vfio_device_set_open_count(dev_set) > 1) + return false; + + list_for_each_entry(cur, &dev_set->device_list, vdev.dev_set_list) needs_reset |= cur->needs_reset; - } return needs_reset; } From 37020bbb71d911431e16c2c940b97cf86ae4f2f6 Mon Sep 17 00:00:00 2001 From: Jingbo Xu Date: Mon, 14 Nov 2022 20:19:43 +0800 Subject: [PATCH 11/14] erofs: fix missing xas_retry() in fscache mode The xarray iteration only holds the RCU read lock and thus may encounter XA_RETRY_ENTRY if there's process modifying the xarray concurrently. This will cause oops when referring to the invalid entry. Fix this by adding the missing xas_retry(), which will make the iteration wind back to the root node if XA_RETRY_ENTRY is encountered. Fixes: d435d53228dd ("erofs: change to use asynchronous io for fscache readpage/readahead") Suggested-by: David Howells Reviewed-by: Gao Xiang Reviewed-by: Jia Zhu Signed-off-by: Jingbo Xu Link: https://lore.kernel.org/r/20221114121943.29987-1-jefflexu@linux.alibaba.com Signed-off-by: Gao Xiang --- fs/erofs/fscache.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 6eaf4a4ab95c..af5ed6b9c54d 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -75,11 +75,15 @@ static void erofs_fscache_rreq_unlock_folios(struct netfs_io_request *rreq) rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos = - (folio_index(folio) - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + folio_size(folio); + unsigned int pgpos, pgend; bool pg_failed = false; + if (xas_retry(&xas, folio)) + continue; + + pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; + pgend = pgpos + folio_size(folio); + for (;;) { if (!subreq) { pg_failed = true; From 7e043a80b5dae5c2d2cf84031501de7827fd6c00 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 3 Nov 2022 16:08:14 +0000 Subject: [PATCH 12/14] netfs: Fix missing xas_retry() calls in xarray iteration netfslib has a number of places in which it performs iteration of an xarray whilst being under the RCU read lock. It *should* call xas_retry() as the first thing inside of the loop and do "continue" if it returns true in case the xarray walker passed out a special value indicating that the walk needs to be redone from the root[*]. Fix this by adding the missing retry checks. [*] I wonder if this should be done inside xas_find(), xas_next_node() and suchlike, but I'm told that's not an simple change to effect. This can cause an oops like that below. Note the faulting address - this is an internal value (|0x2) returned from xarray. BUG: kernel NULL pointer dereference, address: 0000000000000402 ... RIP: 0010:netfs_rreq_unlock+0xef/0x380 [netfs] ... Call Trace: netfs_rreq_assess+0xa6/0x240 [netfs] netfs_readpage+0x173/0x3b0 [netfs] ? init_wait_var_entry+0x50/0x50 filemap_read_page+0x33/0xf0 filemap_get_pages+0x2f2/0x3f0 filemap_read+0xaa/0x320 ? do_filp_open+0xb2/0x150 ? rmqueue+0x3be/0xe10 ceph_read_iter+0x1fe/0x680 [ceph] ? new_sync_read+0x115/0x1a0 new_sync_read+0x115/0x1a0 vfs_read+0xf3/0x180 ksys_read+0x5f/0xe0 do_syscall_64+0x38/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xae Changes: ======== ver #2) - Changed an unsigned int to a size_t to reduce the likelihood of an overflow as per Willy's suggestion. - Added an additional patch to fix the maths. Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers") Reported-by: George Law Signed-off-by: David Howells Reviewed-by: Jeff Layton Reviewed-by: Jingbo Xu cc: Matthew Wilcox cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/166749229733.107206.17482609105741691452.stgit@warthog.procyon.org.uk/ # v1 Link: https://lore.kernel.org/r/166757987929.950645.12595273010425381286.stgit@warthog.procyon.org.uk/ # v2 --- fs/netfs/buffered_read.c | 9 +++++++-- fs/netfs/io.c | 3 +++ 2 files changed, 10 insertions(+), 2 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 0ce535852151..baf668fb4315 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -46,10 +46,15 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + folio_size(folio); + unsigned int pgpos, pgend; bool pg_failed = false; + if (xas_retry(&xas, folio)) + continue; + + pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; + pgend = pgpos + folio_size(folio); + for (;;) { if (!subreq) { pg_failed = true; diff --git a/fs/netfs/io.c b/fs/netfs/io.c index 428925899282..e374767d1b68 100644 --- a/fs/netfs/io.c +++ b/fs/netfs/io.c @@ -121,6 +121,9 @@ static void netfs_rreq_unmark_after_write(struct netfs_io_request *rreq, XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + if (xas_retry(&xas, folio)) + continue; + /* We might have multiple writes from the same huge * folio, but we mustn't unlock a folio more than once. */ From 5e51c627c5acbcf82bb552e17533a79d2a6a2600 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 4 Nov 2022 15:36:49 +0000 Subject: [PATCH 13/14] netfs: Fix dodgy maths Fix the dodgy maths in netfs_rreq_unlock_folios(). start_page could be inside the folio, in which case the calculation of pgpos will be come up with a negative number (though for the moment rreq->start is rounded down earlier and folios would have to get merged whilst locked) Alter how this works to just frame the tracking in terms of absolute file positions, rather than offsets from the start of the I/O request. This simplifies the maths and makes it easier to follow. Fix the issue by using folio_pos() and folio_size() to calculate the end position of the page. Fixes: 3d3c95046742 ("netfs: Provide readahead and readpage netfs helpers") Reported-by: Matthew Wilcox Signed-off-by: David Howells Reviewed-by: Jeff Layton Reviewed-by: Jingbo Xu cc: linux-cachefs@redhat.com cc: linux-fsdevel@vger.kernel.org Link: https://lore.kernel.org/r/Y2SJw7w1IsIik3nb@casper.infradead.org/ Link: https://lore.kernel.org/r/166757988611.950645.7626959069846893164.stgit@warthog.procyon.org.uk/ # v2 --- fs/netfs/buffered_read.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index baf668fb4315..7679a68e8193 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -17,9 +17,9 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct folio *folio; - unsigned int iopos, account = 0; pgoff_t start_page = rreq->start / PAGE_SIZE; pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; + size_t account = 0; bool subreq_failed = false; XA_STATE(xas, &rreq->mapping->i_pages, start_page); @@ -39,23 +39,23 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) */ subreq = list_first_entry(&rreq->subrequests, struct netfs_io_subrequest, rreq_link); - iopos = 0; subreq_failed = (subreq->error < 0); trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); rcu_read_lock(); xas_for_each(&xas, folio, last_page) { - unsigned int pgpos, pgend; + loff_t pg_end; bool pg_failed = false; if (xas_retry(&xas, folio)) continue; - pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; - pgend = pgpos + folio_size(folio); + pg_end = folio_pos(folio) + folio_size(folio) - 1; for (;;) { + loff_t sreq_end; + if (!subreq) { pg_failed = true; break; @@ -63,11 +63,11 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) if (test_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags)) folio_start_fscache(folio); pg_failed |= subreq_failed; - if (pgend < iopos + subreq->len) + sreq_end = subreq->start + subreq->len - 1; + if (pg_end < sreq_end) break; account += subreq->transferred; - iopos += subreq->len; if (!list_is_last(&subreq->rreq_link, &rreq->subrequests)) { subreq = list_next_entry(subreq, rreq_link); subreq_failed = (subreq->error < 0); @@ -75,7 +75,8 @@ void netfs_rreq_unlock_folios(struct netfs_io_request *rreq) subreq = NULL; subreq_failed = false; } - if (pgend == iopos) + + if (pg_end == sreq_end) break; } From 2632daebafd04746b4b96c2f26a6021bc38f6209 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 14 Nov 2022 12:44:01 +0100 Subject: [PATCH 14/14] x86/cpu: Restore AMD's DE_CFG MSR after resume DE_CFG contains the LFENCE serializing bit, restore it on resume too. This is relevant to older families due to the way how they do S3. Unify and correct naming while at it. Fixes: e4d0e84e4907 ("x86/cpu/AMD: Make LFENCE a serializing instruction") Reported-by: Andrew Cooper Reported-by: Pawan Gupta Signed-off-by: Borislav Petkov Cc: Signed-off-by: Linus Torvalds --- arch/x86/include/asm/msr-index.h | 8 +++++--- arch/x86/kernel/cpu/amd.c | 6 ++---- arch/x86/kernel/cpu/hygon.c | 4 ++-- arch/x86/kvm/svm/svm.c | 10 +++++----- arch/x86/kvm/x86.c | 2 +- arch/x86/power/cpu.c | 1 + tools/arch/x86/include/asm/msr-index.h | 8 +++++--- 7 files changed, 21 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index 10ac52705892..4a2af82553e4 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -535,6 +535,11 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 + +#define MSR_AMD64_DE_CFG 0xc0011029 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) + #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 @@ -640,9 +645,6 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c -#define MSR_F10H_DECFG 0xc0011029 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 860b60273df3..c75d75b9f11a 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -770,8 +770,6 @@ static void init_amd_gh(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_TLB_MMATCH); } -#define MSR_AMD64_DE_CFG 0xC0011029 - static void init_amd_ln(struct cpuinfo_x86 *c) { /* @@ -965,8 +963,8 @@ static void init_amd(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); diff --git a/arch/x86/kernel/cpu/hygon.c b/arch/x86/kernel/cpu/hygon.c index 21fd425088fe..c393b8773ace 100644 --- a/arch/x86/kernel/cpu/hygon.c +++ b/arch/x86/kernel/cpu/hygon.c @@ -326,8 +326,8 @@ static void init_hygon(struct cpuinfo_x86 *c) * msr_set_bit() uses the safe accessors, too, even if the MSR * is not present. */ - msr_set_bit(MSR_F10H_DECFG, - MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + msr_set_bit(MSR_AMD64_DE_CFG, + MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT); /* A serializing LFENCE stops RDTSC speculation */ set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 9f88c8e6766e..4b6d2b050e57 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -2709,9 +2709,9 @@ static int svm_get_msr_feature(struct kvm_msr_entry *msr) msr->data = 0; switch (msr->index) { - case MSR_F10H_DECFG: - if (boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) - msr->data |= MSR_F10H_DECFG_LFENCE_SERIALIZE; + case MSR_AMD64_DE_CFG: + if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) + msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; break; case MSR_IA32_PERF_CAPABILITIES: return 0; @@ -2812,7 +2812,7 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) msr_info->data = 0x1E; } break; - case MSR_F10H_DECFG: + case MSR_AMD64_DE_CFG: msr_info->data = svm->msr_decfg; break; default: @@ -3041,7 +3041,7 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) case MSR_VM_IGNNE: vcpu_unimpl(vcpu, "unimplemented wrmsr: 0x%x data 0x%llx\n", ecx, data); break; - case MSR_F10H_DECFG: { + case MSR_AMD64_DE_CFG: { struct kvm_msr_entry msr_entry; msr_entry.index = msr->index; diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ecea83f0da49..490ec23c8450 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1557,7 +1557,7 @@ static const u32 msr_based_features_all[] = { MSR_IA32_VMX_EPT_VPID_CAP, MSR_IA32_VMX_VMFUNC, - MSR_F10H_DECFG, + MSR_AMD64_DE_CFG, MSR_IA32_UCODE_REV, MSR_IA32_ARCH_CAPABILITIES, MSR_IA32_PERF_CAPABILITIES, diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index bb176c72891c..4cd39f304e20 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -519,6 +519,7 @@ static void pm_save_spec_msr(void) MSR_TSX_FORCE_ABORT, MSR_IA32_MCU_OPT_CTRL, MSR_AMD64_LS_CFG, + MSR_AMD64_DE_CFG, }; msr_build_context(spec_msr_id, ARRAY_SIZE(spec_msr_id)); diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h index 10ac52705892..f17ade084720 100644 --- a/tools/arch/x86/include/asm/msr-index.h +++ b/tools/arch/x86/include/asm/msr-index.h @@ -535,6 +535,11 @@ #define MSR_AMD64_CPUID_FN_1 0xc0011004 #define MSR_AMD64_LS_CFG 0xc0011020 #define MSR_AMD64_DC_CFG 0xc0011022 + +#define MSR_AMD64_DE_CFG 0xc0011029 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_AMD64_DE_CFG_LFENCE_SERIALIZE BIT_ULL(MSR_AMD64_DE_CFG_LFENCE_SERIALIZE_BIT) + #define MSR_AMD64_BU_CFG2 0xc001102a #define MSR_AMD64_IBSFETCHCTL 0xc0011030 #define MSR_AMD64_IBSFETCHLINAD 0xc0011031 @@ -640,9 +645,6 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c -#define MSR_F10H_DECFG 0xc0011029 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 -#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a