diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h
index 44dc817143e0..96fecb072570 100644
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -79,7 +79,6 @@ enum __kvm_host_smccc_func {
 	/* Hypercalls available after pKVM finalisation */
 	__KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
 	__KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
-	__KVM_HOST_SMCCC_FUNC___pkvm_host_reclaim_page,
 	__KVM_HOST_SMCCC_FUNC___pkvm_host_map_guest,
 	__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
 	__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
@@ -88,7 +87,9 @@ enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_vmcr_aprs,
 	__KVM_HOST_SMCCC_FUNC___pkvm_init_vm,
 	__KVM_HOST_SMCCC_FUNC___pkvm_init_vcpu,
-	__KVM_HOST_SMCCC_FUNC___pkvm_teardown_vm,
+	__KVM_HOST_SMCCC_FUNC___pkvm_start_teardown_vm,
+	__KVM_HOST_SMCCC_FUNC___pkvm_finalize_teardown_vm,
+	__KVM_HOST_SMCCC_FUNC___pkvm_reclaim_dying_guest_page,
 	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_load,
 	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_put,
 	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_sync_state,
diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
index 94fafecd0e00..b4050138ada9 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h
@@ -70,7 +70,7 @@ extern unsigned long hyp_nr_cpus;
 int __pkvm_prot_finalize(void);
 int __pkvm_host_share_hyp(u64 pfn);
 int __pkvm_host_unshare_hyp(u64 pfn);
-int __pkvm_host_reclaim_page(u64 pfn);
+int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa);
 int __pkvm_host_donate_hyp(u64 pfn, u64 nr_pages);
 int __pkvm_hyp_donate_host(u64 pfn, u64 nr_pages);
 int __pkvm_host_share_guest(u64 pfn, u64 gfn, struct pkvm_hyp_vcpu *vcpu);
@@ -98,13 +98,15 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt);
 int hyp_register_host_perm_fault_handler(int (*cb)(struct kvm_cpu_context *ctxt, u64 esr, u64 addr));
 int hyp_pin_shared_mem(void *from, void *to);
 void hyp_unpin_shared_mem(void *from, void *to);
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
 int host_stage2_get_leaf(phys_addr_t phys, kvm_pte_t *ptep, u32 *level);
 int refill_memcache(struct kvm_hyp_memcache *mc, unsigned long min_pages,
 		    struct kvm_hyp_memcache *host_mc);
 
 int module_change_host_page_prot(u64 pfn, enum kvm_pgtable_prot prot);
 
+void destroy_hyp_vm_pgt(struct pkvm_hyp_vm *vm);
+void drain_hyp_pool(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc);
+
 void psci_mem_protect_inc(u64 n);
 void psci_mem_protect_dec(u64 n);
 
diff --git a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
index 644dce2f9256..c373844cea79 100644
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -64,6 +64,13 @@ struct pkvm_hyp_vm {
 	 */
 	unsigned int nr_vcpus;
 
+	/*
+	 * True when the guest is being torn down. When in this state, the
+	 * guest's vCPUs can't be loaded anymore, but its pages can be
+	 * reclaimed by the host.
+	 */
+	bool is_dying;
+
 	/* Array of the hyp vCPU structures for this VM. */
 	struct pkvm_hyp_vcpu *vcpus[];
 };
@@ -96,7 +103,9 @@ int __pkvm_init_vm(struct kvm *host_kvm, unsigned long vm_hva,
 		   unsigned long pgd_hva, unsigned long last_ran_hva);
 int __pkvm_init_vcpu(pkvm_handle_t handle, struct kvm_vcpu *host_vcpu,
 		     unsigned long vcpu_hva);
-int __pkvm_teardown_vm(pkvm_handle_t handle);
+int __pkvm_start_teardown_vm(pkvm_handle_t handle);
+int __pkvm_finalize_teardown_vm(pkvm_handle_t handle);
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa);
 
 struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
 					 unsigned int vcpu_idx);
diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
index 407b14563d00..8d8f7e255dec 100644
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -1061,11 +1061,13 @@ static void handle___pkvm_host_unshare_hyp(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) = __pkvm_host_unshare_hyp(pfn);
 }
 
-static void handle___pkvm_host_reclaim_page(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_reclaim_dying_guest_page(struct kvm_cpu_context *host_ctxt)
 {
-	DECLARE_REG(u64, pfn, host_ctxt, 1);
+	DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+	DECLARE_REG(u64, pfn, host_ctxt, 2);
+	DECLARE_REG(u64, ipa, host_ctxt, 3);
 
-	cpu_reg(host_ctxt, 1) = __pkvm_host_reclaim_page(pfn);
+	cpu_reg(host_ctxt, 1) = __pkvm_reclaim_dying_guest_page(handle, pfn, ipa);
 }
 
 static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt)
@@ -1120,13 +1122,20 @@ static void handle___pkvm_init_vcpu(struct kvm_cpu_context *host_ctxt)
 	cpu_reg(host_ctxt, 1) = __pkvm_init_vcpu(handle, host_vcpu, vcpu_hva);
 }
 
-static void handle___pkvm_teardown_vm(struct kvm_cpu_context *host_ctxt)
+static void handle___pkvm_start_teardown_vm(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
 
-	cpu_reg(host_ctxt, 1) = __pkvm_teardown_vm(handle);
+	cpu_reg(host_ctxt, 1) = __pkvm_start_teardown_vm(handle);
 }
 
+static void handle___pkvm_finalize_teardown_vm(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(pkvm_handle_t, handle, host_ctxt, 1);
+
+	cpu_reg(host_ctxt, 1) = __pkvm_finalize_teardown_vm(handle);
+}
+
 static void handle___pkvm_iommu_driver_init(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(struct pkvm_iommu_driver*, drv, host_ctxt, 1);
@@ -1275,7 +1284,6 @@ static const hcall_t host_hcall[] = {
 
 	HANDLE_FUNC(__pkvm_host_share_hyp),
 	HANDLE_FUNC(__pkvm_host_unshare_hyp),
-	HANDLE_FUNC(__pkvm_host_reclaim_page),
 	HANDLE_FUNC(__pkvm_host_map_guest),
 	HANDLE_FUNC(__kvm_adjust_pc),
 	HANDLE_FUNC(__kvm_vcpu_run),
@@ -1284,7 +1292,9 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__vgic_v3_restore_vmcr_aprs),
 	HANDLE_FUNC(__pkvm_init_vm),
 	HANDLE_FUNC(__pkvm_init_vcpu),
-	HANDLE_FUNC(__pkvm_teardown_vm),
+	HANDLE_FUNC(__pkvm_start_teardown_vm),
+	HANDLE_FUNC(__pkvm_finalize_teardown_vm),
+	HANDLE_FUNC(__pkvm_reclaim_dying_guest_page),
 	HANDLE_FUNC(__pkvm_vcpu_load),
 	HANDLE_FUNC(__pkvm_vcpu_put),
 	HANDLE_FUNC(__pkvm_vcpu_sync_state),
diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
index 58d05791ec25..735bd4c0c718 100644
--- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c
+++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c
@@ -284,61 +284,6 @@ int kvm_guest_prepare_stage2(struct pkvm_hyp_vm *vm, void *pgd)
 	return 0;
 }
 
-static int reclaim_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep,
-			  enum kvm_pgtable_walk_flags flag, void * const arg)
-{
-	kvm_pte_t pte = *ptep;
-	struct hyp_page *page;
-
-	if (!kvm_pte_valid(pte))
-		return 0;
-
-	page = hyp_phys_to_page(kvm_pte_to_phys(pte));
-	switch (pkvm_getstate(kvm_pgtable_stage2_pte_prot(pte))) {
-	case PKVM_PAGE_OWNED:
-		page->flags |= HOST_PAGE_NEED_POISONING;
-		fallthrough;
-	case PKVM_PAGE_SHARED_BORROWED:
-	case PKVM_PAGE_SHARED_OWNED:
-		page->flags |= HOST_PAGE_PENDING_RECLAIM;
-		break;
-	default:
-		return -EPERM;
-	}
-
-	return 0;
-}
-
-void reclaim_guest_pages(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
-{
-
-	struct kvm_pgtable_walker walker = {
-		.cb	= reclaim_walker,
-		.flags	= KVM_PGTABLE_WALK_LEAF
-	};
-	void *addr;
-
-	host_lock_component();
-	guest_lock_component(vm);
-
-	/* Reclaim all guest pages and dump all pgtable pages in the hyp_pool */
-	BUG_ON(kvm_pgtable_walk(&vm->pgt, 0, BIT(vm->pgt.ia_bits), &walker));
-	kvm_pgtable_stage2_destroy(&vm->pgt);
-	vm->kvm.arch.mmu.pgd_phys = 0ULL;
-
-	guest_unlock_component(vm);
-	host_unlock_component();
-
-	/* Drain the hyp_pool into the memcache */
-	addr = hyp_alloc_pages(&vm->pool, 0);
-	while (addr) {
-		memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
-		push_hyp_memcache(mc, addr, hyp_virt_to_phys);
-		WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
-		addr = hyp_alloc_pages(&vm->pool, 0);
-	}
-}
-
 struct relinquish_data {
 	enum pkvm_page_state expected_state;
 	u64 pa;
@@ -2123,40 +2068,77 @@ void hyp_poison_page(phys_addr_t phys)
 	hyp_fixmap_unmap();
 }
 
-int __pkvm_host_reclaim_page(u64 pfn)
+/* Destroy the guest's stage-2 page-table while holding the guest lock. */
+void destroy_hyp_vm_pgt(struct pkvm_hyp_vm *vm)
 {
-	u64 addr = hyp_pfn_to_phys(pfn);
-	struct hyp_page *page;
+	guest_lock_component(vm);
+	kvm_pgtable_stage2_destroy(&vm->pgt);
+	guest_unlock_component(vm);
+}
+
+/* Push every page left in the VM's pool onto @mc and donate it back to the host. */
+void drain_hyp_pool(struct pkvm_hyp_vm *vm, struct kvm_hyp_memcache *mc)
+{
+	void *addr = hyp_alloc_pages(&vm->pool, 0);
+
+	while (addr) {
+		memset(hyp_virt_to_page(addr), 0, sizeof(struct hyp_page));
+		push_hyp_memcache(mc, addr, hyp_virt_to_phys);
+		WARN_ON(__pkvm_hyp_donate_host(hyp_virt_to_pfn(addr), 1));
+		addr = hyp_alloc_pages(&vm->pool, 0);
+	}
+}
+
+/*
+ * Give the page at @pfn, mapped at @ipa in the stage-2 of @vm, back to the
+ * host. Nothing is changed and an error is returned when @ipa has no valid
+ * mapping (-EINVAL) or maps a different page (-EPERM). A page that was owned
+ * by the guest is poisoned before the host regains ownership.
+ */
+int __pkvm_host_reclaim_page(struct pkvm_hyp_vm *vm, u64 pfn, u64 ipa)
+{
+	phys_addr_t phys = hyp_pfn_to_phys(pfn);
 	kvm_pte_t pte;
 	int ret;
 
 	host_lock_component();
+	guest_lock_component(vm);
 
-	ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, NULL);
+	ret = kvm_pgtable_get_leaf(&vm->pgt, ipa, &pte, NULL);
 	if (ret)
 		goto unlock;
 
-	if (host_get_page_state(pte, addr) == PKVM_PAGE_OWNED)
+	if (!kvm_pte_valid(pte)) {
+		ret = -EINVAL;
 		goto unlock;
-
-	page = hyp_phys_to_page(addr);
-	if (!(page->flags & HOST_PAGE_PENDING_RECLAIM)) {
+	} else if (phys != kvm_pte_to_phys(pte)) {
 		ret = -EPERM;
 		goto unlock;
 	}
 
-	if (page->flags & HOST_PAGE_NEED_POISONING) {
-		hyp_poison_page(addr);
-		page->flags &= ~HOST_PAGE_NEED_POISONING;
+	/* We could avoid TLB inval, it is done per VMID on the finalize path */
+	WARN_ON(kvm_pgtable_stage2_unmap(&vm->pgt, ipa, PAGE_SIZE));
+
+	switch (guest_get_page_state(pte, ipa)) {
+	case PKVM_PAGE_OWNED:
+		WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_NOPAGE));
+		hyp_poison_page(phys);
 		psci_mem_protect_dec(1);
+		break;
+	case PKVM_PAGE_SHARED_BORROWED:
+		WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_OWNED));
+		break;
+	case PKVM_PAGE_SHARED_OWNED:
+		WARN_ON(__host_check_page_state_range(phys, PAGE_SIZE, PKVM_PAGE_SHARED_BORROWED));
+		break;
+	default:
+		BUG_ON(1);
 	}
 
-	ret = host_stage2_set_owner_locked(addr, PAGE_SIZE, PKVM_ID_HOST);
-	if (ret)
-		goto unlock;
-	page->flags &= ~HOST_PAGE_PENDING_RECLAIM;
+	WARN_ON(host_stage2_set_owner_locked(phys, PAGE_SIZE, PKVM_ID_HOST));
 
 unlock:
+	guest_unlock_component(vm);
 	host_unlock_component();
 
 	return ret;
diff --git a/arch/arm64/kvm/hyp/nvhe/pkvm.c b/arch/arm64/kvm/hyp/nvhe/pkvm.c
index 12d5728d2b04..f160bc4306a5 100644
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -268,6 +268,31 @@ static struct pkvm_hyp_vm *get_vm_by_handle(pkvm_handle_t handle)
 	return vm_table[idx];
 }
 
+/*
+ * Reclaim one page from a VM being torn down. Returns -EINVAL unless @handle
+ * refers to a VM that has been marked as dying.
+ */
+int __pkvm_reclaim_dying_guest_page(pkvm_handle_t handle, u64 pfn, u64 ipa)
+{
+	struct pkvm_hyp_vm *hyp_vm;
+	int ret = -EINVAL;
+
+	hyp_spin_lock(&vm_table_lock);
+	hyp_vm = get_vm_by_handle(handle);
+	if (!hyp_vm || !hyp_vm->is_dying)
+		goto unlock;
+
+	ret = __pkvm_host_reclaim_page(hyp_vm, pfn, ipa);
+	if (ret)
+		goto unlock;
+
+	drain_hyp_pool(hyp_vm, &hyp_vm->host_kvm->arch.pkvm.teardown_stage2_mc);
+unlock:
+	hyp_spin_unlock(&vm_table_lock);
+
+	return ret;
+}
+
 struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
 					 unsigned int vcpu_idx)
 {
@@ -280,7 +305,7 @@ struct pkvm_hyp_vcpu *pkvm_load_hyp_vcpu(pkvm_handle_t handle,
 
 	hyp_spin_lock(&vm_table_lock);
 	hyp_vm = get_vm_by_handle(handle);
-	if (!hyp_vm || hyp_vm->nr_vcpus <= vcpu_idx)
+	if (!hyp_vm || hyp_vm->is_dying || hyp_vm->nr_vcpus <= vcpu_idx)
 		goto unlock;
 
 	hyp_vcpu = hyp_vm->vcpus[vcpu_idx];
@@ -796,7 +821,41 @@ teardown_donated_memory(struct kvm_hyp_memcache *mc, void *addr, size_t size)
 	unmap_donated_memory_noclear(addr, size);
 }
 
-int __pkvm_teardown_vm(pkvm_handle_t handle)
+/*
+ * Mark the VM as dying. Returns -ENOENT for an unknown @handle, -EBUSY while
+ * hyp_page_count() of the VM is non-zero and -EINVAL if it is already dying.
+ */
+int __pkvm_start_teardown_vm(pkvm_handle_t handle)
+{
+	struct pkvm_hyp_vm *hyp_vm;
+	int ret = 0;
+
+	hyp_spin_lock(&vm_table_lock);
+	hyp_vm = get_vm_by_handle(handle);
+	if (!hyp_vm) {
+		ret = -ENOENT;
+		goto unlock;
+	} else if (WARN_ON(hyp_page_count(hyp_vm))) {
+		ret = -EBUSY;
+		goto unlock;
+	} else if (hyp_vm->is_dying) {
+		ret = -EINVAL;
+		goto unlock;
+	}
+
+	hyp_vm->is_dying = true;
+
+unlock:
+	hyp_spin_unlock(&vm_table_lock);
+
+	return ret;
+}
+
+/*
+ * Tear down a VM that __pkvm_start_teardown_vm() marked as dying; returns
+ * -EBUSY if it has not been.
+ */
+int __pkvm_finalize_teardown_vm(pkvm_handle_t handle)
 {
 	struct kvm_hyp_memcache *mc, *stage2_mc;
 	size_t vm_size, last_ran_size;
@@ -811,9 +870,7 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
 	if (!hyp_vm) {
 		err = -ENOENT;
 		goto err_unlock;
-	}
-
-	if (WARN_ON(hyp_page_count(hyp_vm))) {
+	} else if (!hyp_vm->is_dying) {
 		err = -EBUSY;
 		goto err_unlock;
 	}
@@ -828,8 +885,8 @@ int __pkvm_teardown_vm(pkvm_handle_t handle)
 	mc = &host_kvm->arch.pkvm.teardown_mc;
 	stage2_mc = &host_kvm->arch.pkvm.teardown_stage2_mc;
 
-	/* Reclaim guest pages (including page-table pages) */
-	reclaim_guest_pages(hyp_vm, stage2_mc);
+	destroy_hyp_vm_pgt(hyp_vm);
+	drain_hyp_pool(hyp_vm, stage2_mc);
 	unpin_host_vcpus(hyp_vm->vcpus, hyp_vm->nr_vcpus);
 
 	/* Push the metadata pages to the teardown memcache */
diff --git a/arch/arm64/kvm/pkvm.c b/arch/arm64/kvm/pkvm.c
index e1381f564107..4ff98340fcb2 100644
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -315,21 +315,18 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 	struct mm_struct *mm = current->mm;
 	struct rb_node *node;
 
-	if (host_kvm->arch.pkvm.handle) {
-		WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
-					  host_kvm->arch.pkvm.handle));
-	}
+	if (!host_kvm->arch.pkvm.handle)
+		goto out_free;
 
-	host_kvm->arch.pkvm.handle = 0;
-	free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc, host_kvm);
-	free_hyp_stage2_memcache(&host_kvm->arch.pkvm.teardown_stage2_mc,
-				 host_kvm);
+	WARN_ON(kvm_call_hyp_nvhe(__pkvm_start_teardown_vm, host_kvm->arch.pkvm.handle));
 
 	node = rb_first(&host_kvm->arch.pkvm.pinned_pages);
 	while (node) {
 		ppage = rb_entry(node, struct kvm_pinned_page, node);
-		WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_reclaim_page,
-					  page_to_pfn(ppage->page)));
+		WARN_ON(kvm_call_hyp_nvhe(__pkvm_reclaim_dying_guest_page,
+					  host_kvm->arch.pkvm.handle,
+					  page_to_pfn(ppage->page),
+					  ppage->ipa));
 		cond_resched();
 
 		account_locked_vm(mm, 1, false);
@@ -338,6 +335,14 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
 		rb_erase(&ppage->node, &host_kvm->arch.pkvm.pinned_pages);
 		kfree(ppage);
 	}
+
+	WARN_ON(kvm_call_hyp_nvhe(__pkvm_finalize_teardown_vm, host_kvm->arch.pkvm.handle));
+
+out_free:
+	host_kvm->arch.pkvm.handle = 0;
+	free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc, host_kvm);
+	free_hyp_stage2_memcache(&host_kvm->arch.pkvm.teardown_stage2_mc,
+				 host_kvm);
 }
 
 int pkvm_init_host_vm(struct kvm *host_kvm, unsigned long type)