From eaa090538e8d21801c6d5f94590c3799e6a528b5 Mon Sep 17 00:00:00 2001 From: Evan Quan Date: Thu, 30 Dec 2021 17:53:54 +0800 Subject: [PATCH 01/13] drm/amd/pm: keep the BACO feature enabled for suspend To pair with the workaround which always reset the ASIC in suspend. Otherwise, the reset which relies on BACO will fail. Fixes: daf8de0874ab5b ("drm/amdgpu: always reset the asic in suspend (v2)") Signed-off-by: Evan Quan Reviewed-by: Alex Deucher Reviewed-by: Guchun Chen Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c index 8a817932acdf..9d7d64fdf410 100644 --- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c @@ -1400,8 +1400,14 @@ static int smu_disable_dpms(struct smu_context *smu) { struct amdgpu_device *adev = smu->adev; int ret = 0; + /* + * TODO: (adev->in_suspend && !adev->in_s0ix) is added to pair + * the workaround which always reset the asic in suspend. + * It's likely that workaround will be dropped in the future. + * Then the change here should be dropped together. + */ bool use_baco = !smu->is_apu && - ((amdgpu_in_reset(adev) && + (((amdgpu_in_reset(adev) || (adev->in_suspend && !adev->in_s0ix)) && (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO)) || ((adev->in_runpm || adev->in_s4) && amdgpu_asic_supports_baco(adev))); From 9a45ac2320d0a6ae01880a30d4b86025fce4061b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 22 Dec 2021 22:41:18 -0500 Subject: [PATCH 02/13] fbdev: fbmem: add a helper to determine if an aperture is used by a fw fb Add a function for drivers to check if the a firmware initialized fb is corresponds to their aperture. This allows drivers to check if the device corresponds to what the firmware set up as the display device. Bug: https://bugzilla.kernel.org/show_bug.cgi?id=215203 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1840 Signed-off-by: Alex Deucher --- drivers/video/fbdev/core/fbmem.c | 47 ++++++++++++++++++++++++++++++++ include/linux/fb.h | 1 + 2 files changed, 48 insertions(+) diff --git a/drivers/video/fbdev/core/fbmem.c b/drivers/video/fbdev/core/fbmem.c index 826175ad88a2..0fa7ede94fa6 100644 --- a/drivers/video/fbdev/core/fbmem.c +++ b/drivers/video/fbdev/core/fbmem.c @@ -1762,6 +1762,53 @@ int remove_conflicting_framebuffers(struct apertures_struct *a, } EXPORT_SYMBOL(remove_conflicting_framebuffers); +/** + * is_firmware_framebuffer - detect if firmware-configured framebuffer matches + * @a: memory range, users of which are to be checked + * + * This function checks framebuffer devices (initialized by firmware/bootloader) + * which use memory range described by @a. If @a matchesm the function returns + * true, otherwise false. + */ +bool is_firmware_framebuffer(struct apertures_struct *a) +{ + bool do_free = false; + bool found = false; + int i; + + if (!a) { + a = alloc_apertures(1); + if (!a) + return false; + + a->ranges[0].base = 0; + a->ranges[0].size = ~0; + do_free = true; + } + + mutex_lock(®istration_lock); + /* check all firmware fbs and kick off if the base addr overlaps */ + for_each_registered_fb(i) { + struct apertures_struct *gen_aper; + + if (!(registered_fb[i]->flags & FBINFO_MISC_FIRMWARE)) + continue; + + gen_aper = registered_fb[i]->apertures; + if (fb_do_apertures_overlap(gen_aper, a)) { + found = true; + break; + } + } + mutex_unlock(®istration_lock); + + if (do_free) + kfree(a); + + return found; +} +EXPORT_SYMBOL(is_firmware_framebuffer); + /** * remove_conflicting_pci_framebuffers - remove firmware-configured framebuffers for PCI devices * @pdev: PCI device diff --git a/include/linux/fb.h b/include/linux/fb.h index 6f3db99ab990..3da95842b207 100644 --- a/include/linux/fb.h +++ b/include/linux/fb.h @@ -610,6 +610,7 @@ extern int remove_conflicting_pci_framebuffers(struct pci_dev *pdev, const char *name); extern int remove_conflicting_framebuffers(struct apertures_struct *a, const char *name, bool primary); +extern bool is_firmware_framebuffer(struct apertures_struct *a); extern int fb_prepare_logo(struct fb_info *fb_info, int rotate); extern int fb_show_logo(struct fb_info *fb_info, int rotate); extern char* fb_get_buffer_offset(struct fb_info *info, struct fb_pixmap *buf, u32 size); From b95dc06af3e683d6b7ddbbae178b2b2a21ee8b2b Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Wed, 22 Dec 2021 22:57:16 -0500 Subject: [PATCH 03/13] drm/amdgpu: disable runpm if we are the primary adapter If we are the primary adapter (i.e., the one used by the firwmare framebuffer), disable runtime pm. This fixes a regression caused by commit 55285e21f045 which results in the displays waking up shortly after they go to sleep due to the device coming out of runtime suspend and sending a hotplug uevent. v2: squash in reworked fix from Evan Fixes: 55285e21f045 ("fbdev/efifb: Release PCI device's runtime PM ref during FB destroy") Bug: https://bugzilla.kernel.org/show_bug.cgi?id=215203 Bug: https://gitlab.freedesktop.org/drm/amd/-/issues/1840 Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 28 +++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 6 ++++++ 3 files changed, 35 insertions(+) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index b85b67a88a3d..7d67aec6f4a2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -1077,6 +1077,7 @@ struct amdgpu_device { bool runpm; bool in_runpm; bool has_pr3; + bool is_fw_fb; bool pm_sysfs_en; bool ucode_sysfs_en; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c index 86ca80da9eea..99370bdd8c5b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c @@ -39,6 +39,7 @@ #include #include #include +#include #include "amdgpu.h" #include "amdgpu_irq.h" @@ -1890,6 +1891,26 @@ MODULE_DEVICE_TABLE(pci, pciidlist); static const struct drm_driver amdgpu_kms_driver; +static bool amdgpu_is_fw_framebuffer(resource_size_t base, + resource_size_t size) +{ + bool found = false; +#if IS_REACHABLE(CONFIG_FB) + struct apertures_struct *a; + + a = alloc_apertures(1); + if (!a) + return false; + + a->ranges[0].base = base; + a->ranges[0].size = size; + + found = is_firmware_framebuffer(a); + kfree(a); +#endif + return found; +} + static int amdgpu_pci_probe(struct pci_dev *pdev, const struct pci_device_id *ent) { @@ -1898,6 +1919,8 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, unsigned long flags = ent->driver_data; int ret, retry = 0, i; bool supports_atomic = false; + bool is_fw_fb; + resource_size_t base, size; /* skip devices which are owned by radeon */ for (i = 0; i < ARRAY_SIZE(amdgpu_unsupported_pciidlist); i++) { @@ -1966,6 +1989,10 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, } #endif + base = pci_resource_start(pdev, 0); + size = pci_resource_len(pdev, 0); + is_fw_fb = amdgpu_is_fw_framebuffer(base, size); + /* Get rid of things like offb */ ret = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &amdgpu_kms_driver); if (ret) @@ -1978,6 +2005,7 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, adev->dev = &pdev->dev; adev->pdev = pdev; ddev = adev_to_drm(adev); + adev->is_fw_fb = is_fw_fb; if (!supports_atomic) ddev->driver_features &= ~DRIVER_ATOMIC; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c index 651c7abfde03..09ad17944eb2 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c @@ -206,6 +206,12 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags) adev->runpm = true; break; } + /* XXX: disable runtime pm if we are the primary adapter + * to avoid displays being re-enabled after DPMS. + * This needs to be sorted out and fixed properly. + */ + if (adev->is_fw_fb) + adev->runpm = false; if (adev->runpm) dev_info(adev->dev, "Using BACO for runtime pm\n"); } From 46669e8616c649c71c4cfcd712fd3d107e771380 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Mon, 3 Jan 2022 13:49:36 -0800 Subject: [PATCH 04/13] md/raid1: fix missing bitmap update w/o WriteMostly devices commit [1] causes missing bitmap updates when there isn't any WriteMostly devices. Detailed steps to reproduce by Norbert (which somehow didn't make to lore): # setup md10 (raid1) with two drives (1 GByte sparse files) dd if=/dev/zero of=disk1 bs=1024k seek=1024 count=0 dd if=/dev/zero of=disk2 bs=1024k seek=1024 count=0 losetup /dev/loop11 disk1 losetup /dev/loop12 disk2 mdadm --create /dev/md10 --level=1 --raid-devices=2 /dev/loop11 /dev/loop12 # add bitmap (aka write-intent log) mdadm /dev/md10 --grow --bitmap=internal echo check > /sys/block/md10/md/sync_action root:# cat /sys/block/md10/md/mismatch_cnt 0 root:# # remove member drive disk2 (loop12) mdadm /dev/md10 -f loop12 ; mdadm /dev/md10 -r loop12 # modify degraded md device dd if=/dev/urandom of=/dev/md10 bs=512 count=1 # no blocks recorded as out of sync on the remaining member disk1/loop11 root:# mdadm -X /dev/loop11 | grep Bitmap Bitmap : 16 bits (chunks), 0 dirty (0.0%) root:# # re-add disk2, nothing synced because of empty bitmap mdadm /dev/md10 --re-add /dev/loop12 # check integrity again echo check > /sys/block/md10/md/sync_action # disk1 and disk2 are no longer in sync, reads return differend data root:# cat /sys/block/md10/md/mismatch_cnt 128 root:# # clean up mdadm -S /dev/md10 losetup -d /dev/loop11 losetup -d /dev/loop12 rm disk1 disk2 Fix this by moving the WriteMostly check to the if condition for alloc_behind_master_bio(). [1] commit fd3b6975e9c1 ("md/raid1: only allocate write behind bio for WriteMostly device") Fixes: fd3b6975e9c1 ("md/raid1: only allocate write behind bio for WriteMostly device") Cc: stable@vger.kernel.org # v5.12+ Cc: Guoqing Jiang Cc: Jens Axboe Reported-by: Norbert Warmuth Suggested-by: Linus Torvalds Signed-off-by: Song Liu --- drivers/md/raid1.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 7dc8026cf6ee..85505424f7a4 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1496,12 +1496,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (!r1_bio->bios[i]) continue; - if (first_clone && test_bit(WriteMostly, &rdev->flags)) { + if (first_clone) { /* do behind I/O ? * Not if there are too many, or cannot * allocate memory, or a reader on WriteMostly * is waiting for behind writes to flush */ if (bitmap && + test_bit(WriteMostly, &rdev->flags) && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && !waitqueue_active(&bitmap->behind_wait)) { From c370baa328022cbd46c59c821d1b467a97f047be Mon Sep 17 00:00:00 2001 From: Qiuxu Zhuo Date: Fri, 24 Dec 2021 04:11:26 -0500 Subject: [PATCH 05/13] EDAC/i10nm: Release mdev/mbase when failing to detect HBM On systems without HBM (High Bandwidth Memory) mdev/mbase are not released/unmapped. Add the code to release mdev/mbase when failing to detect HBM. [Tony: re-word commit message] Cc: Fixes: c945088384d0 ("EDAC/i10nm: Add support for high bandwidth memory") Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Qiuxu Zhuo Signed-off-by: Tony Luck Link: https://lore.kernel.org/r/20211224091126.1246-1-qiuxu.zhuo@intel.com --- drivers/edac/i10nm_base.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/edac/i10nm_base.c b/drivers/edac/i10nm_base.c index 83345bfac246..6cf50ee0b77c 100644 --- a/drivers/edac/i10nm_base.c +++ b/drivers/edac/i10nm_base.c @@ -358,6 +358,9 @@ static int i10nm_get_hbm_munits(void) mbase = ioremap(base + off, I10NM_HBM_IMC_MMIO_SIZE); if (!mbase) { + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "Failed to ioremap for hbm mc 0x%llx\n", base + off); return -ENOMEM; @@ -368,6 +371,12 @@ static int i10nm_get_hbm_munits(void) mcmtr = I10NM_GET_MCMTR(&d->imc[lmc], 0); if (!I10NM_IS_HBM_IMC(mcmtr)) { + iounmap(d->imc[lmc].mbase); + d->imc[lmc].mbase = NULL; + d->imc[lmc].hbm_mc = false; + pci_dev_put(d->imc[lmc].mdev); + d->imc[lmc].mdev = NULL; + i10nm_printk(KERN_ERR, "This isn't an hbm mc!\n"); return -ENODEV; } From 1756d7994ad85c2479af6ae5a9750b92324685af Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:28 -1000 Subject: [PATCH 06/13] cgroup: Use open-time credentials for process migraton perm checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup process migration permission checks are performed at write time as whether a given operation is allowed or not is dependent on the content of the write - the PID. This currently uses current's credentials which is a potential security weakness as it may allow scenarios where a less privileged process tricks a more privileged one into writing into a fd that it created. This patch makes both cgroup2 and cgroup1 process migration interfaces to use the credentials saved at the time of open (file->f_cred) instead of current's. Reported-by: "Eric W. Biederman" Suggested-by: Linus Torvalds Fixes: 187fe84067bd ("cgroup: require write perm on common ancestor when moving processes on the default hierarchy") Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-v1.c | 7 ++++--- kernel/cgroup/cgroup.c | 9 ++++++++- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 81c9e0685948..0e7369103ba6 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -504,10 +504,11 @@ static ssize_t __cgroup1_procs_write(struct kernfs_open_file *of, goto out_unlock; /* - * Even if we're attaching all tasks in the thread group, we only - * need to check permissions on one of them. + * Even if we're attaching all tasks in the thread group, we only need + * to check permissions on one of them. Check permissions using the + * credentials from file open to protect against inherited fd attacks. */ - cred = current_cred(); + cred = of->file->f_cred; tcred = get_task_cred(task); if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && !uid_eq(cred->euid, tcred->uid) && diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..2632e46da1d4 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -4892,6 +4892,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, { struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; + const struct cred *saved_cred; ssize_t ret; bool locked; @@ -4909,9 +4910,15 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root); spin_unlock_irq(&css_set_lock); - /* process and thread migrations follow same delegation rule */ + /* + * Process and thread migrations follow same delegation rule. Check + * permissions using the credentials from file open to protect against + * inherited fd attacks. + */ + saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, of->file->f_path.dentry->d_sb, threadgroup); + revert_creds(saved_cred); if (ret) goto out_finish; From 0d2b5955b36250a9428c832664f2079cbf723bec Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: [PATCH 07/13] cgroup: Allocate cgroup_file_ctx for kernfs_open_file->priv MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit of->priv is currently used by each interface file implementation to store private information. This patch collects the current two private data usages into struct cgroup_file_ctx which is allocated and freed by the common path. This allows generic private data which applies to multiple files, which will be used to in the following patch. Note that cgroup_procs iterator is now embedded as procs.iter in the new cgroup_file_ctx so that it doesn't need to be allocated and freed separately. v2: union dropped from cgroup_file_ctx and the procs iterator is embedded in cgroup_file_ctx as suggested by Linus. v3: Michal pointed out that cgroup1's procs pidlist uses of->priv too. Converted. Didn't change to embedded allocation as cgroup1 pidlists get stored for caching. Signed-off-by: Tejun Heo Cc: Linus Torvalds Reviewed-by: Michal Koutný --- kernel/cgroup/cgroup-internal.h | 17 +++++++++++ kernel/cgroup/cgroup-v1.c | 26 ++++++++-------- kernel/cgroup/cgroup.c | 53 +++++++++++++++++++++------------ 3 files changed, 65 insertions(+), 31 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index bfbeabc17a9d..cf637bc4ab45 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -65,6 +65,23 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) return container_of(kfc, struct cgroup_fs_context, kfc); } +struct cgroup_pidlist; + +struct cgroup_file_ctx { + struct { + void *trigger; + } psi; + + struct { + bool started; + struct css_task_iter iter; + } procs; + + struct { + struct cgroup_pidlist *pidlist; + } procs1; +}; + /* * A cgroup can be associated with multiple css_sets as different tasks may * belong to different cgroups on different hierarchies. In the other diff --git a/kernel/cgroup/cgroup-v1.c b/kernel/cgroup/cgroup-v1.c index 0e7369103ba6..41e0837a5a0b 100644 --- a/kernel/cgroup/cgroup-v1.c +++ b/kernel/cgroup/cgroup-v1.c @@ -394,6 +394,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) * next pid to display, if any */ struct kernfs_open_file *of = s->private; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = seq_css(s)->cgroup; struct cgroup_pidlist *l; enum cgroup_filetype type = seq_cft(s)->private; @@ -403,25 +404,24 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) mutex_lock(&cgrp->pidlist_mutex); /* - * !NULL @of->priv indicates that this isn't the first start() - * after open. If the matching pidlist is around, we can use that. - * Look for it. Note that @of->priv can't be used directly. It - * could already have been destroyed. + * !NULL @ctx->procs1.pidlist indicates that this isn't the first + * start() after open. If the matching pidlist is around, we can use + * that. Look for it. Note that @ctx->procs1.pidlist can't be used + * directly. It could already have been destroyed. */ - if (of->priv) - of->priv = cgroup_pidlist_find(cgrp, type); + if (ctx->procs1.pidlist) + ctx->procs1.pidlist = cgroup_pidlist_find(cgrp, type); /* * Either this is the first start() after open or the matching * pidlist has been destroyed inbetween. Create a new one. */ - if (!of->priv) { - ret = pidlist_array_load(cgrp, type, - (struct cgroup_pidlist **)&of->priv); + if (!ctx->procs1.pidlist) { + ret = pidlist_array_load(cgrp, type, &ctx->procs1.pidlist); if (ret) return ERR_PTR(ret); } - l = of->priv; + l = ctx->procs1.pidlist; if (pid) { int end = l->length; @@ -449,7 +449,8 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) static void cgroup_pidlist_stop(struct seq_file *s, void *v) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; if (l) mod_delayed_work(cgroup_pidlist_destroy_wq, &l->destroy_dwork, @@ -460,7 +461,8 @@ static void cgroup_pidlist_stop(struct seq_file *s, void *v) static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct cgroup_pidlist *l = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct cgroup_pidlist *l = ctx->procs1.pidlist; pid_t *p = v; pid_t *end = l->list + l->length; /* diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 2632e46da1d4..a84631d08d98 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3630,6 +3630,7 @@ static int cgroup_cpu_pressure_show(struct seq_file *seq, void *v) static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, size_t nbytes, enum psi_res res) { + struct cgroup_file_ctx *ctx = of->priv; struct psi_trigger *new; struct cgroup *cgrp; struct psi_group *psi; @@ -3648,7 +3649,7 @@ static ssize_t cgroup_pressure_write(struct kernfs_open_file *of, char *buf, return PTR_ERR(new); } - psi_trigger_replace(&of->priv, new); + psi_trigger_replace(&ctx->psi.trigger, new); cgroup_put(cgrp); @@ -3679,12 +3680,16 @@ static ssize_t cgroup_cpu_pressure_write(struct kernfs_open_file *of, static __poll_t cgroup_pressure_poll(struct kernfs_open_file *of, poll_table *pt) { - return psi_trigger_poll(&of->priv, of->file, pt); + struct cgroup_file_ctx *ctx = of->priv; + + return psi_trigger_poll(&ctx->psi.trigger, of->file, pt); } static void cgroup_pressure_release(struct kernfs_open_file *of) { - psi_trigger_replace(&of->priv, NULL); + struct cgroup_file_ctx *ctx = of->priv; + + psi_trigger_replace(&ctx->psi.trigger, NULL); } bool cgroup_psi_enabled(void) @@ -3811,18 +3816,31 @@ static ssize_t cgroup_kill_write(struct kernfs_open_file *of, char *buf, static int cgroup_file_open(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx; + int ret; - if (cft->open) - return cft->open(of); - return 0; + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + of->priv = ctx; + + if (!cft->open) + return 0; + + ret = cft->open(of); + if (ret) + kfree(ctx); + return ret; } static void cgroup_file_release(struct kernfs_open_file *of) { struct cftype *cft = of_cft(of); + struct cgroup_file_ctx *ctx = of->priv; if (cft->release) cft->release(of); + kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, @@ -4751,21 +4769,21 @@ void css_task_iter_end(struct css_task_iter *it) static void cgroup_procs_release(struct kernfs_open_file *of) { - if (of->priv) { - css_task_iter_end(of->priv); - kfree(of->priv); - } + struct cgroup_file_ctx *ctx = of->priv; + + if (ctx->procs.started) + css_task_iter_end(&ctx->procs.iter); } static void *cgroup_procs_next(struct seq_file *s, void *v, loff_t *pos) { struct kernfs_open_file *of = s->private; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; if (pos) (*pos)++; - return css_task_iter_next(it); + return css_task_iter_next(&ctx->procs.iter); } static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, @@ -4773,21 +4791,18 @@ static void *__cgroup_procs_start(struct seq_file *s, loff_t *pos, { struct kernfs_open_file *of = s->private; struct cgroup *cgrp = seq_css(s)->cgroup; - struct css_task_iter *it = of->priv; + struct cgroup_file_ctx *ctx = of->priv; + struct css_task_iter *it = &ctx->procs.iter; /* * When a seq_file is seeked, it's always traversed sequentially * from position 0, so we can simply keep iterating on !0 *pos. */ - if (!it) { + if (!ctx->procs.started) { if (WARN_ON_ONCE((*pos))) return ERR_PTR(-EINVAL); - - it = kzalloc(sizeof(*it), GFP_KERNEL); - if (!it) - return ERR_PTR(-ENOMEM); - of->priv = it; css_task_iter_start(&cgrp->self, iter_flags, it); + ctx->procs.started = true; } else if (!(*pos)) { css_task_iter_end(it); css_task_iter_start(&cgrp->self, iter_flags, it); From e57457641613fef0d147ede8bd6a3047df588b95 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: [PATCH 08/13] cgroup: Use open-time cgroup namespace for process migration perm checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit cgroup process migration permission checks are performed at write time as whether a given operation is allowed or not is dependent on the content of the write - the PID. This currently uses current's cgroup namespace which is a potential security weakness as it may allow scenarios where a less privileged process tricks a more privileged one into writing into a fd that it created. This patch makes cgroup remember the cgroup namespace at the time of open and uses it for migration permission checks instad of current's. Note that this only applies to cgroup2 as cgroup1 doesn't have namespace support. This also fixes a use-after-free bug on cgroupns reported in https://lore.kernel.org/r/00000000000048c15c05d0083397@google.com Note that backporting this fix also requires the preceding patch. Reported-by: "Eric W. Biederman" Suggested-by: Linus Torvalds Cc: Michal Koutný Cc: Oleg Nesterov Reviewed-by: Michal Koutný Reported-by: syzbot+50f5cf33a284ce738b62@syzkaller.appspotmail.com Link: https://lore.kernel.org/r/00000000000048c15c05d0083397@google.com Fixes: 5136f6365ce3 ("cgroup: implement "nsdelegate" mount option") Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup-internal.h | 2 ++ kernel/cgroup/cgroup.c | 28 +++++++++++++++++++--------- 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/kernel/cgroup/cgroup-internal.h b/kernel/cgroup/cgroup-internal.h index cf637bc4ab45..6e36e854b512 100644 --- a/kernel/cgroup/cgroup-internal.h +++ b/kernel/cgroup/cgroup-internal.h @@ -68,6 +68,8 @@ static inline struct cgroup_fs_context *cgroup_fc2context(struct fs_context *fc) struct cgroup_pidlist; struct cgroup_file_ctx { + struct cgroup_namespace *ns; + struct { void *trigger; } psi; diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index a84631d08d98..cafb8c114a21 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -3822,14 +3822,19 @@ static int cgroup_file_open(struct kernfs_open_file *of) ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return -ENOMEM; + + ctx->ns = current->nsproxy->cgroup_ns; + get_cgroup_ns(ctx->ns); of->priv = ctx; if (!cft->open) return 0; ret = cft->open(of); - if (ret) + if (ret) { + put_cgroup_ns(ctx->ns); kfree(ctx); + } return ret; } @@ -3840,13 +3845,14 @@ static void cgroup_file_release(struct kernfs_open_file *of) if (cft->release) cft->release(of); + put_cgroup_ns(ctx->ns); kfree(ctx); } static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *cgrp = of->kn->parent->priv; struct cftype *cft = of_cft(of); struct cgroup_subsys_state *css; @@ -3863,7 +3869,7 @@ static ssize_t cgroup_file_write(struct kernfs_open_file *of, char *buf, */ if ((cgrp->root->flags & CGRP_ROOT_NS_DELEGATE) && !(cft->flags & CFTYPE_NS_DELEGATABLE) && - ns != &init_cgroup_ns && ns->root_cset->dfl_cgrp == cgrp) + ctx->ns != &init_cgroup_ns && ctx->ns->root_cset->dfl_cgrp == cgrp) return -EPERM; if (cft->write) @@ -4853,9 +4859,9 @@ static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb) static int cgroup_procs_write_permission(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb) + struct super_block *sb, + struct cgroup_namespace *ns) { - struct cgroup_namespace *ns = current->nsproxy->cgroup_ns; struct cgroup *com_cgrp = src_cgrp; int ret; @@ -4884,11 +4890,12 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp, static int cgroup_attach_permissions(struct cgroup *src_cgrp, struct cgroup *dst_cgrp, - struct super_block *sb, bool threadgroup) + struct super_block *sb, bool threadgroup, + struct cgroup_namespace *ns) { int ret = 0; - ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb); + ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb, ns); if (ret) return ret; @@ -4905,6 +4912,7 @@ static int cgroup_attach_permissions(struct cgroup *src_cgrp, static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, bool threadgroup) { + struct cgroup_file_ctx *ctx = of->priv; struct cgroup *src_cgrp, *dst_cgrp; struct task_struct *task; const struct cred *saved_cred; @@ -4932,7 +4940,8 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf, */ saved_cred = override_creds(of->file->f_cred); ret = cgroup_attach_permissions(src_cgrp, dst_cgrp, - of->file->f_path.dentry->d_sb, threadgroup); + of->file->f_path.dentry->d_sb, + threadgroup, ctx->ns); revert_creds(saved_cred); if (ret) goto out_finish; @@ -6152,7 +6161,8 @@ static int cgroup_css_set_fork(struct kernel_clone_args *kargs) goto err; ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb, - !(kargs->flags & CLONE_THREAD)); + !(kargs->flags & CLONE_THREAD), + current->nsproxy->cgroup_ns); if (ret) goto err; From b09c2baa56347ae65795350dfcc633dedb1c2970 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: [PATCH 09/13] selftests: cgroup: Make cg_create() use 0755 for permission instead of 0644 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 0644 is an odd perm to create a cgroup which is a directory. Use the regular 0755 instead. This is necessary for euid switching test case. Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/cgroup_util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/cgroup/cgroup_util.c b/tools/testing/selftests/cgroup/cgroup_util.c index 623cec04ad42..0cf7e90c0052 100644 --- a/tools/testing/selftests/cgroup/cgroup_util.c +++ b/tools/testing/selftests/cgroup/cgroup_util.c @@ -221,7 +221,7 @@ int cg_find_unified_root(char *root, size_t len) int cg_create(const char *cgroup) { - return mkdir(cgroup, 0644); + return mkdir(cgroup, 0755); } int cg_wait_for_proc_count(const char *cgroup, int count) From 613e040e4dc285367bff0f8f75ea59839bc10947 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: [PATCH 10/13] selftests: cgroup: Test open-time credential usage for migration checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a task is writing to an fd opened by a different task, the perm check should use the credentials of the latter task. Add a test for it. Tested-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 68 ++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 3df648c37876..01b766506973 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -674,6 +674,73 @@ cleanup: return ret; } +/* + * cgroup migration permission check should be performed based on the + * credentials at the time of open instead of write. + */ +static int test_cgcore_lesser_euid_open(const char *root) +{ + const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ + int ret = KSFT_FAIL; + char *cg_test_a = NULL, *cg_test_b = NULL; + char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL; + int cg_test_b_procs_fd = -1; + uid_t saved_uid; + + cg_test_a = cg_name(root, "cg_test_a"); + cg_test_b = cg_name(root, "cg_test_b"); + + if (!cg_test_a || !cg_test_b) + goto cleanup; + + cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs"); + cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs"); + + if (!cg_test_a_procs || !cg_test_b_procs) + goto cleanup; + + if (cg_create(cg_test_a) || cg_create(cg_test_b)) + goto cleanup; + + if (cg_enter_current(cg_test_a)) + goto cleanup; + + if (chown(cg_test_a_procs, test_euid, -1) || + chown(cg_test_b_procs, test_euid, -1)) + goto cleanup; + + saved_uid = geteuid(); + if (seteuid(test_euid)) + goto cleanup; + + cg_test_b_procs_fd = open(cg_test_b_procs, O_RDWR); + + if (seteuid(saved_uid)) + goto cleanup; + + if (cg_test_b_procs_fd < 0) + goto cleanup; + + if (write(cg_test_b_procs_fd, "0", 1) >= 0 || errno != EACCES) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + if (cg_test_b_procs_fd >= 0) + close(cg_test_b_procs_fd); + if (cg_test_b) + cg_destroy(cg_test_b); + if (cg_test_a) + cg_destroy(cg_test_a); + free(cg_test_b_procs); + free(cg_test_a_procs); + free(cg_test_b); + free(cg_test_a); + return ret; +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -689,6 +756,7 @@ struct corecg_test { T(test_cgcore_proc_migration), T(test_cgcore_thread_migration), T(test_cgcore_destroy), + T(test_cgcore_lesser_euid_open), }; #undef T From bf35a7879f1dfb0d050fe779168bcf25c7de66f5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 6 Jan 2022 11:02:29 -1000 Subject: [PATCH 11/13] selftests: cgroup: Test open-time cgroup namespace usage for migration checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a task is writing to an fd opened by a different task, the perm check should use the cgroup namespace of the latter task. Add a test for it. Tested-by: Michal Koutný Signed-off-by: Tejun Heo --- tools/testing/selftests/cgroup/test_core.c | 97 ++++++++++++++++++++++ 1 file changed, 97 insertions(+) diff --git a/tools/testing/selftests/cgroup/test_core.c b/tools/testing/selftests/cgroup/test_core.c index 01b766506973..600123503063 100644 --- a/tools/testing/selftests/cgroup/test_core.c +++ b/tools/testing/selftests/cgroup/test_core.c @@ -1,11 +1,14 @@ /* SPDX-License-Identifier: GPL-2.0 */ +#define _GNU_SOURCE #include +#include #include #include #include #include #include +#include #include #include #include @@ -741,6 +744,99 @@ cleanup: return ret; } +struct lesser_ns_open_thread_arg { + const char *path; + int fd; + int err; +}; + +static int lesser_ns_open_thread_fn(void *arg) +{ + struct lesser_ns_open_thread_arg *targ = arg; + + targ->fd = open(targ->path, O_RDWR); + targ->err = errno; + return 0; +} + +/* + * cgroup migration permission check should be performed based on the cgroup + * namespace at the time of open instead of write. + */ +static int test_cgcore_lesser_ns_open(const char *root) +{ + static char stack[65536]; + const uid_t test_euid = 65534; /* usually nobody, any !root is fine */ + int ret = KSFT_FAIL; + char *cg_test_a = NULL, *cg_test_b = NULL; + char *cg_test_a_procs = NULL, *cg_test_b_procs = NULL; + int cg_test_b_procs_fd = -1; + struct lesser_ns_open_thread_arg targ = { .fd = -1 }; + pid_t pid; + int status; + + cg_test_a = cg_name(root, "cg_test_a"); + cg_test_b = cg_name(root, "cg_test_b"); + + if (!cg_test_a || !cg_test_b) + goto cleanup; + + cg_test_a_procs = cg_name(cg_test_a, "cgroup.procs"); + cg_test_b_procs = cg_name(cg_test_b, "cgroup.procs"); + + if (!cg_test_a_procs || !cg_test_b_procs) + goto cleanup; + + if (cg_create(cg_test_a) || cg_create(cg_test_b)) + goto cleanup; + + if (cg_enter_current(cg_test_b)) + goto cleanup; + + if (chown(cg_test_a_procs, test_euid, -1) || + chown(cg_test_b_procs, test_euid, -1)) + goto cleanup; + + targ.path = cg_test_b_procs; + pid = clone(lesser_ns_open_thread_fn, stack + sizeof(stack), + CLONE_NEWCGROUP | CLONE_FILES | CLONE_VM | SIGCHLD, + &targ); + if (pid < 0) + goto cleanup; + + if (waitpid(pid, &status, 0) < 0) + goto cleanup; + + if (!WIFEXITED(status)) + goto cleanup; + + cg_test_b_procs_fd = targ.fd; + if (cg_test_b_procs_fd < 0) + goto cleanup; + + if (cg_enter_current(cg_test_a)) + goto cleanup; + + if ((status = write(cg_test_b_procs_fd, "0", 1)) >= 0 || errno != ENOENT) + goto cleanup; + + ret = KSFT_PASS; + +cleanup: + cg_enter_current(root); + if (cg_test_b_procs_fd >= 0) + close(cg_test_b_procs_fd); + if (cg_test_b) + cg_destroy(cg_test_b); + if (cg_test_a) + cg_destroy(cg_test_a); + free(cg_test_b_procs); + free(cg_test_a_procs); + free(cg_test_b); + free(cg_test_a); + return ret; +} + #define T(x) { x, #x } struct corecg_test { int (*fn)(const char *root); @@ -757,6 +853,7 @@ struct corecg_test { T(test_cgcore_thread_migration), T(test_cgcore_destroy), T(test_cgcore_lesser_euid_open), + T(test_cgcore_lesser_ns_open), }; #undef T From 597cb7968cb6243e915ba9599195656be14773e5 Mon Sep 17 00:00:00 2001 From: Wanpeng Li Date: Tue, 4 Jan 2022 22:41:03 -0800 Subject: [PATCH 12/13] KVM: SEV: Mark nested locking of kvm->lock Both source and dest vms' kvm->locks are held in sev_lock_two_vms. Mark one with a different subtype to avoid false positives from lockdep. Fixes: c9d61dcb0bc26 (KVM: SEV: accept signals in sev_lock_two_vms) Reported-by: Yiru Xu Tested-by: Jinrong Liang Signed-off-by: Wanpeng Li Message-Id: <1641364863-26331-1-git-send-email-wanpengli@tencent.com> Signed-off-by: Paolo Bonzini --- arch/x86/kvm/svm/sev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7656a2c5662a..be2883141220 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -1565,7 +1565,7 @@ static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm) r = -EINTR; if (mutex_lock_killable(&dst_kvm->lock)) goto release_src; - if (mutex_lock_killable(&src_kvm->lock)) + if (mutex_lock_killable_nested(&src_kvm->lock, SINGLE_DEPTH_NESTING)) goto unlock_dst; return 0; From fffb5323780786c81ba005f8b8603d4a558aad28 Mon Sep 17 00:00:00 2001 From: Nikunj A Dadhania Date: Wed, 5 Jan 2022 09:33:37 +0530 Subject: [PATCH 13/13] KVM: x86: Check for rmaps allocation With TDP MMU being the default now, access to mmu_rmaps_stat debugfs file causes following oops: BUG: kernel NULL pointer dereference, address: 0000000000000000 PGD 0 P4D 0 Oops: 0000 [#1] PREEMPT SMP NOPTI CPU: 7 PID: 3185 Comm: cat Not tainted 5.16.0-rc4+ #204 RIP: 0010:pte_list_count+0x6/0x40 Call Trace: ? kvm_mmu_rmaps_stat_show+0x15e/0x320 seq_read_iter+0x126/0x4b0 ? aa_file_perm+0x124/0x490 seq_read+0xf5/0x140 full_proxy_read+0x5c/0x80 vfs_read+0x9f/0x1a0 ksys_read+0x67/0xe0 __x64_sys_read+0x19/0x20 do_syscall_64+0x3b/0xc0 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x7fca6fc13912 Return early when rmaps are not present. Reported-by: Vasant Hegde Tested-by: Vasant Hegde Signed-off-by: Nikunj A Dadhania Reviewed-by: Peter Xu Reviewed-by: Sean Christopherson Message-Id: <20220105040337.4234-1-nikunj@amd.com> Cc: stable@vger.kernel.org Fixes: 3bcd0662d66f ("KVM: X86: Introduce mmu_rmaps_stat per-vm debugfs file") Signed-off-by: Paolo Bonzini --- arch/x86/kvm/debugfs.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 54a83a744538..f33c804a922a 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -95,6 +95,9 @@ static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) unsigned int *log[KVM_NR_PAGE_SIZES], *cur; int i, j, k, l, ret; + if (!kvm_memslots_have_rmaps(kvm)) + return 0; + ret = -ENOMEM; memset(log, 0, sizeof(log)); for (i = 0; i < KVM_NR_PAGE_SIZES; i++) {