mirror of
https://github.com/hardkernel/linux.git
synced 2026-06-07 19:30:30 +09:00
ANDROID: BACKPORT: KVM: arm64: Handle guest stage-2 page-tables entirely at EL2
Now that EL2 is able to manage guest stage-2 page-tables, avoid allocating a separate MMU structure in the host and instead introduce a new fault handler which responds to guest stage-2 faults by sharing GUP-pinned pages with the guest via a hypercall. These pages are recovered (and unpinned) on guest teardown via the page reclaim hypercall.

Signed-off-by: Will Deacon <will@kernel.org>
[willdeacon@: Dropped 'bkt' arg from kvm_for_each_memslot()]
Signed-off-by: Will Deacon <willdeacon@google.com>
Bug: 233587962
Change-Id: Ibbddc97cee322bf2db258b4f0848733e2efb1126
This commit is contained in:
@@ -65,6 +65,7 @@ enum __kvm_host_smccc_func {
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_host_unshare_hyp,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_host_reclaim_page,
|
||||
__KVM_HOST_SMCCC_FUNC___pkvm_host_map_guest,
|
||||
__KVM_HOST_SMCCC_FUNC___kvm_adjust_pc,
|
||||
__KVM_HOST_SMCCC_FUNC___kvm_vcpu_run,
|
||||
__KVM_HOST_SMCCC_FUNC___kvm_flush_vm_context,
|
||||
|
||||
@@ -172,11 +172,17 @@ struct kvm_smccc_features {
|
||||
unsigned long vendor_hyp_bmap;
|
||||
};
|
||||
|
||||
struct kvm_pinned_page {
|
||||
struct list_head link;
|
||||
struct page *page;
|
||||
};
|
||||
|
||||
typedef unsigned int pkvm_handle_t;
|
||||
|
||||
struct kvm_protected_vm {
|
||||
pkvm_handle_t handle;
|
||||
struct kvm_hyp_memcache teardown_mc;
|
||||
struct list_head pinned_pages;
|
||||
};
|
||||
|
||||
struct kvm_arch {
|
||||
|
||||
@@ -377,7 +377,11 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
|
||||
if (vcpu_has_run_once(vcpu) && unlikely(!irqchip_in_kernel(vcpu->kvm)))
|
||||
static_branch_dec(&userspace_irqchip_in_use);
|
||||
|
||||
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
|
||||
if (is_protected_kvm_enabled())
|
||||
free_hyp_memcache(&vcpu->arch.pkvm_memcache);
|
||||
else
|
||||
kvm_mmu_free_memory_cache(&vcpu->arch.mmu_page_cache);
|
||||
|
||||
kvm_timer_vcpu_terminate(vcpu);
|
||||
kvm_pmu_vcpu_destroy(vcpu);
|
||||
|
||||
@@ -399,6 +403,9 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
struct kvm_s2_mmu *mmu;
|
||||
int *last_ran;
|
||||
|
||||
if (is_protected_kvm_enabled())
|
||||
goto nommu;
|
||||
|
||||
mmu = vcpu->arch.hw_mmu;
|
||||
last_ran = this_cpu_ptr(mmu->last_vcpu_ran);
|
||||
|
||||
@@ -416,6 +423,7 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
|
||||
*last_ran = vcpu->vcpu_id;
|
||||
}
|
||||
|
||||
nommu:
|
||||
vcpu->cpu = cpu;
|
||||
|
||||
kvm_vgic_load(vcpu);
|
||||
|
||||
@@ -31,8 +31,6 @@ static void flush_hyp_vcpu(struct pkvm_hyp_vcpu *hyp_vcpu)
|
||||
hyp_vcpu->vcpu.arch.sve_state = kern_hyp_va(host_vcpu->arch.sve_state);
|
||||
hyp_vcpu->vcpu.arch.sve_max_vl = host_vcpu->arch.sve_max_vl;
|
||||
|
||||
hyp_vcpu->vcpu.arch.hw_mmu = host_vcpu->arch.hw_mmu;
|
||||
|
||||
hyp_vcpu->vcpu.arch.hcr_el2 = host_vcpu->arch.hcr_el2;
|
||||
hyp_vcpu->vcpu.arch.mdcr_el2 = host_vcpu->arch.mdcr_el2;
|
||||
hyp_vcpu->vcpu.arch.cptr_el2 = host_vcpu->arch.cptr_el2;
|
||||
@@ -104,6 +102,47 @@ out:
|
||||
cpu_reg(host_ctxt, 1) = ret;
|
||||
}
|
||||
|
||||
static int pkvm_refill_memcache(struct pkvm_hyp_vcpu *hyp_vcpu)
|
||||
{
|
||||
struct pkvm_hyp_vm *hyp_vm = pkvm_hyp_vcpu_to_hyp_vm(hyp_vcpu);
|
||||
u64 nr_pages = VTCR_EL2_LVLS(hyp_vm->kvm.arch.vtcr) - 1;
|
||||
struct kvm_vcpu *host_vcpu = hyp_vcpu->host_vcpu;
|
||||
|
||||
return refill_memcache(&hyp_vcpu->vcpu.arch.pkvm_memcache, nr_pages,
|
||||
&host_vcpu->arch.pkvm_memcache);
|
||||
}
|
||||
|
||||
static void handle___pkvm_host_map_guest(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
DECLARE_REG(u64, pfn, host_ctxt, 1);
|
||||
DECLARE_REG(u64, gfn, host_ctxt, 2);
|
||||
DECLARE_REG(struct kvm_vcpu *, host_vcpu, host_ctxt, 3);
|
||||
struct pkvm_hyp_vcpu *hyp_vcpu;
|
||||
struct kvm *host_kvm;
|
||||
int ret = -EINVAL;
|
||||
|
||||
if (!is_protected_kvm_enabled())
|
||||
goto out;
|
||||
|
||||
host_vcpu = kern_hyp_va(host_vcpu);
|
||||
host_kvm = kern_hyp_va(host_vcpu->kvm);
|
||||
hyp_vcpu = pkvm_load_hyp_vcpu(host_kvm->arch.pkvm.handle,
|
||||
host_vcpu->vcpu_idx);
|
||||
if (!hyp_vcpu)
|
||||
goto out;
|
||||
|
||||
/* Top-up our per-vcpu memcache from the host's */
|
||||
ret = pkvm_refill_memcache(hyp_vcpu);
|
||||
if (ret)
|
||||
goto out_put_vcpu;
|
||||
|
||||
ret = __pkvm_host_share_guest(pfn, gfn, hyp_vcpu);
|
||||
out_put_vcpu:
|
||||
pkvm_put_hyp_vcpu(hyp_vcpu);
|
||||
out:
|
||||
cpu_reg(host_ctxt, 1) = ret;
|
||||
}
|
||||
|
||||
static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
|
||||
{
|
||||
DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
|
||||
@@ -319,6 +358,7 @@ static const hcall_t host_hcall[] = {
|
||||
HANDLE_FUNC(__pkvm_host_share_hyp),
|
||||
HANDLE_FUNC(__pkvm_host_unshare_hyp),
|
||||
HANDLE_FUNC(__pkvm_host_reclaim_page),
|
||||
HANDLE_FUNC(__pkvm_host_map_guest),
|
||||
HANDLE_FUNC(__kvm_adjust_pc),
|
||||
HANDLE_FUNC(__kvm_vcpu_run),
|
||||
HANDLE_FUNC(__kvm_flush_vm_context),
|
||||
|
||||
@@ -197,6 +197,22 @@ static void unmap_stage2_range(struct kvm_s2_mmu *mmu, phys_addr_t start, u64 si
|
||||
__unmap_stage2_range(mmu, start, size, true);
|
||||
}
|
||||
|
||||
static void pkvm_stage2_flush(struct kvm *kvm)
|
||||
{
|
||||
struct kvm_pinned_page *ppage;
|
||||
|
||||
/*
|
||||
* Contrary to stage2_apply_range(), we don't need to check
|
||||
* whether the VM is being torn down, as this is always called
|
||||
* from a vcpu thread, and the list is only ever freed on VM
|
||||
* destroy (which only occurs when all vcpu are gone).
|
||||
*/
|
||||
list_for_each_entry(ppage, &kvm->arch.pkvm.pinned_pages, link) {
|
||||
__clean_dcache_guest_page(page_address(ppage->page), PAGE_SIZE);
|
||||
cond_resched_rwlock_write(&kvm->mmu_lock);
|
||||
}
|
||||
}
|
||||
|
||||
static void stage2_flush_memslot(struct kvm *kvm,
|
||||
struct kvm_memory_slot *memslot)
|
||||
{
|
||||
@@ -222,9 +238,13 @@ static void stage2_flush_vm(struct kvm *kvm)
|
||||
idx = srcu_read_lock(&kvm->srcu);
|
||||
write_lock(&kvm->mmu_lock);
|
||||
|
||||
slots = kvm_memslots(kvm);
|
||||
kvm_for_each_memslot(memslot, slots)
|
||||
stage2_flush_memslot(kvm, memslot);
|
||||
if (!is_protected_kvm_enabled()) {
|
||||
slots = kvm_memslots(kvm);
|
||||
kvm_for_each_memslot(memslot, slots)
|
||||
stage2_flush_memslot(kvm, memslot);
|
||||
} else if (!kvm_vm_is_protected(kvm)) {
|
||||
pkvm_stage2_flush(kvm);
|
||||
}
|
||||
|
||||
write_unlock(&kvm->mmu_lock);
|
||||
srcu_read_unlock(&kvm->srcu, idx);
|
||||
@@ -683,6 +703,11 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu, unsigned long t
|
||||
mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1);
|
||||
mmfr1 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR1_EL1);
|
||||
kvm->arch.vtcr = kvm_get_vtcr(mmfr0, mmfr1, phys_shift);
|
||||
INIT_LIST_HEAD(&kvm->arch.pkvm.pinned_pages);
|
||||
mmu->arch = &kvm->arch;
|
||||
|
||||
if (is_protected_kvm_enabled())
|
||||
return 0;
|
||||
|
||||
if (mmu->pgt != NULL) {
|
||||
kvm_err("kvm_arch already initialized?\n");
|
||||
@@ -791,6 +816,9 @@ void kvm_free_stage2_pgd(struct kvm_s2_mmu *mmu)
|
||||
struct kvm *kvm = kvm_s2_mmu_to_kvm(mmu);
|
||||
struct kvm_pgtable *pgt = NULL;
|
||||
|
||||
if (is_protected_kvm_enabled())
|
||||
return;
|
||||
|
||||
write_lock(&kvm->mmu_lock);
|
||||
pgt = mmu->pgt;
|
||||
if (pgt) {
|
||||
@@ -1144,6 +1172,99 @@ static int sanitise_mte_tags(struct kvm *kvm, kvm_pfn_t pfn,
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
 * Issue the __pkvm_host_map_guest hypercall to map @pfn at @gfn for @vcpu.
 *
 * Returns the hypercall result, except that -EPERM is reported to the
 * caller as -EAGAIN.
 */
static int pkvm_host_map_guest(u64 pfn, u64 gfn, struct kvm_vcpu *vcpu)
{
	int err = kvm_call_hyp_nvhe(__pkvm_host_map_guest, pfn, gfn, vcpu);

	/*
	 * -EPERM here means the pfn has already been mapped. That should
	 * only happen when two vCPUs faulted on the same page and this one
	 * lost the race to do the mapping.
	 */
	if (err == -EPERM)
		return -EAGAIN;

	return err;
}
|
||||
|
||||
/*
 * Handle a guest stage-2 fault when protected KVM is enabled.
 *
 * Pins the host page backing @hva, asks the hypervisor to map it into the
 * guest at @fault_ipa, and records the pin on kvm->arch.pkvm.pinned_pages
 * so that it can be reclaimed and unpinned on VM teardown.
 *
 * @vcpu:      faulting vCPU
 * @fault_ipa: faulting guest IPA
 * @hva:       host userspace address backing @fault_ipa
 *
 * Returns 0 when the page was mapped, when another vCPU already mapped it,
 * or when the page is hardware-poisoned (a signal is sent in that case).
 * Otherwise returns -ENOMEM if the memcache top-up or the tracking
 * allocation fails, the error from account_locked_vm(), -EFAULT if the
 * page cannot be pinned, -EIO if the page is not swap-backed, or the error
 * returned by the map hypercall.
 */
static int pkvm_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
			  unsigned long hva)
{
	struct kvm_hyp_memcache *hyp_memcache = &vcpu->arch.pkvm_memcache;
	struct mm_struct *mm = current->mm;
	unsigned int flags = FOLL_HWPOISON | FOLL_LONGTERM | FOLL_WRITE;
	struct kvm_pinned_page *ppage;
	struct kvm *kvm = vcpu->kvm;
	struct page *page;
	u64 pfn;
	int ret;

	ret = topup_hyp_memcache(hyp_memcache, kvm_mmu_cache_min_pages(kvm));
	if (ret)
		return -ENOMEM;

	ppage = kmalloc(sizeof(*ppage), GFP_KERNEL_ACCOUNT);
	if (!ppage)
		return -ENOMEM;

	/* Charge the page against the locked-memory accounting of @mm. */
	ret = account_locked_vm(mm, 1, true);
	if (ret)
		goto free_ppage;

	mmap_read_lock(mm);
	ret = pin_user_pages(hva, 1, flags, &page, NULL);
	mmap_read_unlock(mm);

	if (ret == -EHWPOISON) {
		/* Nothing was pinned: signal and let the caller carry on. */
		kvm_send_hwpoison_signal(hva, PAGE_SHIFT);
		ret = 0;
		goto dec_account;
	} else if (ret != 1) {
		/* Nothing was pinned. */
		ret = -EFAULT;
		goto dec_account;
	} else if (!PageSwapBacked(page)) {
		/*
		 * We really can't deal with page-cache pages returned by GUP
		 * because (a) we may trigger writeback of a page for which we
		 * no longer have access and (b) page_mkclean() won't find the
		 * stage-2 mapping in the rmap so we can get out-of-whack with
		 * the filesystem when marking the page dirty during unpinning
		 * (see cc5095747edf ("ext4: don't BUG if someone dirty pages
		 * without asking ext4 first")).
		 *
		 * Ideally we'd just restrict ourselves to anonymous pages, but
		 * we also want to allow memfd (i.e. shmem) pages, so check for
		 * pages backed by swap in the knowledge that the GUP pin will
		 * prevent try_to_unmap() from succeeding.
		 */
		ret = -EIO;
		/* The page IS pinned at this point, so the pin must be dropped. */
		goto unpin;
	}

	write_lock(&kvm->mmu_lock);
	pfn = page_to_pfn(page);
	ret = pkvm_host_map_guest(pfn, fault_ipa >> PAGE_SHIFT, vcpu);
	if (ret) {
		/* -EAGAIN: another vCPU mapped the page first; not an error. */
		if (ret == -EAGAIN)
			ret = 0;
		goto unlock;
	}

	/* Track the pin so it can be released on VM teardown. */
	ppage->page = page;
	INIT_LIST_HEAD(&ppage->link);
	list_add(&ppage->link, &kvm->arch.pkvm.pinned_pages);
	write_unlock(&kvm->mmu_lock);

	return 0;

unlock:
	write_unlock(&kvm->mmu_lock);
unpin:
	unpin_user_pages(&page, 1);
dec_account:
	account_locked_vm(mm, 1, false);
free_ppage:
	kfree(ppage);

	return ret;
}
|
||||
|
||||
static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa,
|
||||
struct kvm_memory_slot *memslot, unsigned long hva,
|
||||
unsigned long fault_status)
|
||||
@@ -1427,7 +1548,8 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
|
||||
}
|
||||
|
||||
/* Falls between the IPA range and the PARange? */
|
||||
if (fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
|
||||
if (!is_protected_kvm_enabled() &&
|
||||
fault_ipa >= BIT_ULL(vcpu->arch.hw_mmu->pgt->ia_bits)) {
|
||||
fault_ipa |= kvm_vcpu_get_hfar(vcpu) & GENMASK(11, 0);
|
||||
|
||||
if (is_iabt)
|
||||
@@ -1523,7 +1645,11 @@ int kvm_handle_guest_abort(struct kvm_vcpu *vcpu)
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
|
||||
if (is_protected_kvm_enabled())
|
||||
ret = pkvm_mem_abort(vcpu, fault_ipa, hva);
|
||||
else
|
||||
ret = user_mem_abort(vcpu, fault_ipa, memslot, hva, fault_status);
|
||||
|
||||
if (ret == 0)
|
||||
ret = 1;
|
||||
out:
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
|
||||
#include <linux/kvm_host.h>
|
||||
#include <linux/memblock.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/sort.h>
|
||||
|
||||
@@ -199,6 +200,10 @@ int pkvm_create_hyp_vm(struct kvm *host_kvm)
|
||||
|
||||
void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
|
||||
{
|
||||
struct kvm_pinned_page *ppage, *tmp;
|
||||
struct mm_struct *mm = current->mm;
|
||||
struct list_head *ppages;
|
||||
|
||||
if (host_kvm->arch.pkvm.handle) {
|
||||
WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_vm,
|
||||
host_kvm->arch.pkvm.handle));
|
||||
@@ -206,6 +211,18 @@ void pkvm_destroy_hyp_vm(struct kvm *host_kvm)
|
||||
|
||||
host_kvm->arch.pkvm.handle = 0;
|
||||
free_hyp_memcache(&host_kvm->arch.pkvm.teardown_mc);
|
||||
|
||||
ppages = &host_kvm->arch.pkvm.pinned_pages;
|
||||
list_for_each_entry_safe(ppage, tmp, ppages, link) {
|
||||
WARN_ON(kvm_call_hyp_nvhe(__pkvm_host_reclaim_page,
|
||||
page_to_pfn(ppage->page)));
|
||||
cond_resched();
|
||||
|
||||
account_locked_vm(mm, 1, false);
|
||||
unpin_user_pages_dirty_lock(&ppage->page, 1, true);
|
||||
list_del(&ppage->link);
|
||||
kfree(ppage);
|
||||
}
|
||||
}
|
||||
|
||||
int pkvm_init_host_vm(struct kvm *host_kvm)
|
||||
|
||||
Reference in New Issue
Block a user