From cb354450527a905607a86d44dc42498136fd037b Mon Sep 17 00:00:00 2001 From: Jing Zhang Date: Wed, 4 Dec 2024 12:23:55 -0800 Subject: [PATCH 1/4] KVM: arm64: vgic-its: Add a data length check in vgic_its_save_* commit 7fe28d7e68f92cc3d0668b8f2fbdf5c303ac3022 upstream. In all the vgic_its_save_*() functinos, they do not check whether the data length is 8 bytes before calling vgic_write_guest_lock. This patch adds the check. To prevent the kernel from being blown up when the fault occurs, KVM_BUG_ON() is used. And the other BUG_ON()s are replaced together. Cc: stable@vger.kernel.org Signed-off-by: Kunkun Jiang [Jing: Update with the new entry read/write helpers] Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20241107214137.428439-4-jingzhangos@google.com Signed-off-by: Oliver Upton Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/vgic/vgic-its.c | 20 ++++++++------------ arch/arm64/kvm/vgic/vgic.h | 24 ++++++++++++++++++++++++ 2 files changed, 32 insertions(+), 12 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 092327665a6e..84499545bd8d 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -2207,7 +2207,6 @@ static int scan_its_table(struct vgic_its *its, gpa_t base, int size, u32 esz, static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, struct its_ite *ite, gpa_t gpa, int ite_esz) { - struct kvm *kvm = its->dev->kvm; u32 next_offset; u64 val; @@ -2216,7 +2215,8 @@ static int vgic_its_save_ite(struct vgic_its *its, struct its_device *dev, ((u64)ite->irq->intid << KVM_ITS_ITE_PINTID_SHIFT) | ite->collection->collection_id; val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, gpa, &val, ite_esz); + + return vgic_its_write_entry_lock(its, gpa, val, ite_esz); } /** @@ -2357,7 +2357,6 @@ static int vgic_its_restore_itt(struct vgic_its *its, struct its_device *dev) static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, gpa_t ptr, int dte_esz) { - struct kvm *kvm = its->dev->kvm; u64 val, itt_addr_field; u32 next_offset; @@ -2368,7 +2367,8 @@ static int vgic_its_save_dte(struct vgic_its *its, struct its_device *dev, (itt_addr_field << KVM_ITS_DTE_ITTADDR_SHIFT) | (dev->num_eventid_bits - 1)); val = cpu_to_le64(val); - return kvm_write_guest_lock(kvm, ptr, &val, dte_esz); + + return vgic_its_write_entry_lock(its, ptr, val, dte_esz); } /** @@ -2555,7 +2555,8 @@ static int vgic_its_save_cte(struct vgic_its *its, ((u64)collection->target_addr << KVM_ITS_CTE_RDBASE_SHIFT) | collection->collection_id); val = cpu_to_le64(val); - return kvm_write_guest_lock(its->dev->kvm, gpa, &val, esz); + + return vgic_its_write_entry_lock(its, gpa, val, esz); } /* @@ -2571,8 +2572,7 @@ static int vgic_its_restore_cte(struct vgic_its *its, gpa_t gpa, int esz) u64 val; int ret; - BUG_ON(esz > sizeof(val)); - ret = kvm_read_guest_lock(kvm, gpa, &val, esz); + ret = vgic_its_read_entry_lock(its, gpa, &val, esz); if (ret) return ret; val = le64_to_cpu(val); @@ -2610,7 +2610,6 @@ static int vgic_its_save_collection_table(struct vgic_its *its) u64 baser = its->baser_coll_table; gpa_t gpa = GITS_BASER_ADDR_48_to_52(baser); struct its_collection *collection; - u64 val; size_t max_size, filled = 0; int ret, cte_esz = abi->cte_esz; @@ -2634,10 +2633,7 @@ static int vgic_its_save_collection_table(struct vgic_its *its) * table is not fully filled, add a last dummy element * with valid bit unset */ - val = 0; - BUG_ON(cte_esz > sizeof(val)); - ret = kvm_write_guest_lock(its->dev->kvm, gpa, &val, cte_esz); - return ret; + return vgic_its_write_entry_lock(its, gpa, 0, cte_esz); } /** diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index bc898229167b..056fcee46165 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -6,6 +6,7 @@ #define __KVM_ARM_VGIC_NEW_H__ #include +#include #define PRODUCT_ID_KVM 0x4b /* ASCII code K */ #define IMPLEMENTER_ARM 0x43b @@ -131,6 +132,29 @@ static inline bool vgic_irq_is_multi_sgi(struct vgic_irq *irq) return vgic_irq_get_lr_count(irq) > 1; } +static inline int vgic_its_read_entry_lock(struct vgic_its *its, gpa_t eaddr, + u64 *eval, unsigned long esize) +{ + struct kvm *kvm = its->dev->kvm; + + if (KVM_BUG_ON(esize != sizeof(*eval), kvm)) + return -EINVAL; + + return kvm_read_guest_lock(kvm, eaddr, eval, esize); + +} + +static inline int vgic_its_write_entry_lock(struct vgic_its *its, gpa_t eaddr, + u64 eval, unsigned long esize) +{ + struct kvm *kvm = its->dev->kvm; + + if (KVM_BUG_ON(esize != sizeof(eval), kvm)) + return -EINVAL; + + return kvm_write_guest_lock(kvm, eaddr, &eval, esize); +} + /* * This struct provides an intermediate representation of the fields contained * in the GICH_VMCR and ICH_VMCR registers, such that code exporting the GIC From fe695bc1572c030b17a68d8ec7d3d60a486131fd Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Wed, 4 Dec 2024 12:23:56 -0800 Subject: [PATCH 2/4] KVM: arm64: vgic-its: Clear DTE when MAPD unmaps a device commit e9649129d33dca561305fc590a7c4ba8c3e5675a upstream. vgic_its_save_device_tables will traverse its->device_list to save DTE for each device. vgic_its_restore_device_tables will traverse each entry of device table and check if it is valid. Restore if valid. But when MAPD unmaps a device, it does not invalidate the corresponding DTE. In the scenario of continuous saves and restores, there may be a situation where a device's DTE is not saved but is restored. This is unreasonable and may cause restore to fail. This patch clears the corresponding DTE when MAPD unmaps a device. Cc: stable@vger.kernel.org Fixes: 57a9a117154c ("KVM: arm64: vgic-its: Device table save/restore") Co-developed-by: Shusen Li Signed-off-by: Shusen Li Signed-off-by: Kunkun Jiang [Jing: Update with entry write helper] Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20241107214137.428439-5-jingzhangos@google.com Signed-off-by: Oliver Upton Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/vgic/vgic-its.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 84499545bd8d..7a1569b9d10e 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -1211,9 +1211,11 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, bool valid = its_cmd_get_validbit(its_cmd); u8 num_eventid_bits = its_cmd_get_size(its_cmd); gpa_t itt_addr = its_cmd_get_ittaddr(its_cmd); + int dte_esz = vgic_its_get_abi(its)->dte_esz; struct its_device *device; + gpa_t gpa; - if (!vgic_its_check_id(its, its->baser_device_table, device_id, NULL)) + if (!vgic_its_check_id(its, its->baser_device_table, device_id, &gpa)) return E_ITS_MAPD_DEVICE_OOR; if (valid && num_eventid_bits > VITS_TYPER_IDBITS) @@ -1234,7 +1236,7 @@ static int vgic_its_cmd_handle_mapd(struct kvm *kvm, struct vgic_its *its, * is an error, so we are done in any case. */ if (!valid) - return 0; + return vgic_its_write_entry_lock(its, gpa, 0, dte_esz); device = vgic_its_alloc_device(its, device_id, itt_addr, num_eventid_bits); From 21bc72eef08a42e793aee3c5267fd1961a21bc54 Mon Sep 17 00:00:00 2001 From: Kunkun Jiang Date: Wed, 4 Dec 2024 12:23:57 -0800 Subject: [PATCH 3/4] KVM: arm64: vgic-its: Clear ITE when DISCARD frees an ITE commit 7602ffd1d5e8927fadd5187cb4aed2fdc9c47143 upstream. When DISCARD frees an ITE, it does not invalidate the corresponding ITE. In the scenario of continuous saves and restores, there may be a situation where an ITE is not saved but is restored. This is unreasonable and may cause restore to fail. This patch clears the corresponding ITE when DISCARD frees an ITE. Cc: stable@vger.kernel.org Fixes: eff484e0298d ("KVM: arm64: vgic-its: ITT save and restore") Signed-off-by: Kunkun Jiang [Jing: Update with entry write helper] Signed-off-by: Jing Zhang Link: https://lore.kernel.org/r/20241107214137.428439-6-jingzhangos@google.com Signed-off-by: Oliver Upton Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kvm/vgic/vgic-its.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/arm64/kvm/vgic/vgic-its.c b/arch/arm64/kvm/vgic/vgic-its.c index 7a1569b9d10e..5a8a181cf219 100644 --- a/arch/arm64/kvm/vgic/vgic-its.c +++ b/arch/arm64/kvm/vgic/vgic-its.c @@ -855,6 +855,9 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, ite = find_ite(its, device_id, event_id); if (ite && its_is_collection_mapped(ite->collection)) { + struct its_device *device = find_its_device(its, device_id); + int ite_esz = vgic_its_get_abi(its)->ite_esz; + gpa_t gpa = device->itt_addr + ite->event_id * ite_esz; /* * Though the spec talks about removing the pending state, we * don't bother here since we clear the ITTE anyway and the @@ -863,7 +866,8 @@ static int vgic_its_cmd_handle_discard(struct kvm *kvm, struct vgic_its *its, vgic_its_invalidate_cache(kvm); its_free_ite(kvm, ite); - return 0; + + return vgic_its_write_entry_lock(its, gpa, 0, ite_esz); } return E_ITS_DISCARD_UNMAPPED_INTERRUPT; From 4118bd1834a9cf6b4c328fe25064129ee4d17701 Mon Sep 17 00:00:00 2001 From: Nikolay Kuratov Date: Sun, 8 Dec 2024 11:37:43 +0300 Subject: [PATCH 4/4] KVM: x86/mmu: Ensure that kvm_release_pfn_clean() takes exact pfn from kvm_faultin_pfn() Since 5.16 and prior to 6.13 KVM can't be used with FSDAX guest memory (PMD pages). To reproduce the issue you need to reserve guest memory with `memmap=` cmdline, create and mount FS in DAX mode (tested both XFS and ext4), see doc link below. ndctl command for test: ndctl create-namespace -v -e namespace1.0 --map=dev --mode=fsdax -a 2M Then pass memory object to qemu like: -m 8G -object memory-backend-file,id=ram0,size=8G,\ mem-path=/mnt/pmem/guestmem,share=on,prealloc=on,dump=off,align=2097152 \ -numa node,memdev=ram0,cpus=0-1 QEMU fails to run guest with error: kvm run failed Bad address and there are two warnings in dmesg: WARN_ON_ONCE(!page_count(page)) in kvm_is_zone_device_page() and WARN_ON_ONCE(folio_ref_count(folio) <= 0) in try_grab_folio() (v6.6.63) It looks like in the past assumption was made that pfn won't change from faultin_pfn() to release_pfn_clean(), e.g. see commit 4cd071d13c5c ("KVM: x86/mmu: Move calls to thp_adjust() down a level") But kvm_page_fault structure made pfn part of mutable state, so now release_pfn_clean() can take hugepage-adjusted pfn. And it works for all cases (/dev/shm, hugetlb, devdax) except fsdax. Apparently in fsdax mode faultin-pfn and adjusted-pfn may refer to different folios, so we're getting get_page/put_page imbalance. To solve this preserve faultin pfn in separate local variable and pass it in kvm_release_pfn_clean(). Patch tested for all mentioned guest memory backends with tdp_mmu={0,1}. No bug in upstream as it was solved fundamentally by commit 8dd861cc07e2 ("KVM: x86/mmu: Put refcounted pages instead of blindly releasing pfns") and related patch series. Link: https://nvdimm.docs.kernel.org/2mib_fs_dax.html Fixes: 2f6305dd5676 ("KVM: MMU: change kvm_tdp_mmu_map() arguments to kvm_page_fault") Co-developed-by: Sean Christopherson Signed-off-by: Sean Christopherson Reviewed-by: Sean Christopherson Signed-off-by: Nikolay Kuratov Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/mmu/mmu.c | 5 ++++- arch/x86/kvm/mmu/paging_tmpl.h | 5 ++++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 13134954e24d..d392022dcb89 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -4245,6 +4245,7 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault bool is_tdp_mmu_fault = is_tdp_mmu(vcpu->arch.mmu); unsigned long mmu_seq; + kvm_pfn_t orig_pfn; int r; fault->gfn = fault->addr >> PAGE_SHIFT; @@ -4272,6 +4273,8 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault if (r != RET_PF_CONTINUE) return r; + orig_pfn = fault->pfn; + r = RET_PF_RETRY; if (is_tdp_mmu_fault) @@ -4296,7 +4299,7 @@ out_unlock: read_unlock(&vcpu->kvm->mmu_lock); else write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + kvm_release_pfn_clean(orig_pfn); return r; } diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index 1f4f5e703f13..685560a45bf6 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -790,6 +790,7 @@ FNAME(is_self_change_mapping)(struct kvm_vcpu *vcpu, static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) { struct guest_walker walker; + kvm_pfn_t orig_pfn; int r; unsigned long mmu_seq; bool is_self_change_mapping; @@ -868,6 +869,8 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault walker.pte_access &= ~ACC_EXEC_MASK; } + orig_pfn = fault->pfn; + r = RET_PF_RETRY; write_lock(&vcpu->kvm->mmu_lock); @@ -881,7 +884,7 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault out_unlock: write_unlock(&vcpu->kvm->mmu_lock); - kvm_release_pfn_clean(fault->pfn); + kvm_release_pfn_clean(orig_pfn); return r; }