From 9e7e5db52c3ae70f419d97428d10608752896af9 Mon Sep 17 00:00:00 2001 From: Sebastian Ene Date: Thu, 8 Dec 2022 11:19:17 +0000 Subject: [PATCH] ANDROID: KVM: arm64: Coalesce host stage2 entries on ownership reclaim This optimization allows us to re-create higher-order block mappings in the host stage2 pagetables after we tear down a guest VM. The coalescing code is triggered on the host_stage2_set_owner_locked path when we annotate the entries in the host stage2 page-tables with an invalid entry that has the owner set to PKVM_ID_HOST. This can also be triggered from page_relinquish when we do page insertion in the ballooning code. When the host reclaims ownership during guest teardown, the page table walker drops the refcount of the counted entries and clears out unreferenced entries (refcount == 1). Clearing out the entry installs a zero PTE. When the host stage2 receives a data abort because there is no mapping associated, it will try to create the largest possible block mapping from the leaf entry that was found. With the current patch, we increase the chances of finding a leaf entry that has level < 3 if the requested region comes from memory reclaimed from a torn-down VM. This has the advantage of reducing the TLB pressure at host stage2. To be able to do coalescing, we modify the way we do refcounting so that only the following descriptor types are counted at host stage 2 (leaf mappings with default attributes are no longer counted): - non-zero invalid PTEs - any descriptor that has at least one of the reserved-high bits (58-55) toggled - non-default attribute mappings - page table descriptors The algorithm works as presented below: Is refcount(child(pte_table)) == 1 ? Yes -> (because we left only default mappings) Zap the table by setting 0 in the pte_table and put the page that holds the level 3 entries back into the memcache level 2 +---------+ | | | ... | | pte_table---+ level 3 -> we can now re-create a 2Mb mapping | ... | +---> +---------+ | | | | | | | | | | |def entry| +---------+ | | |def entry| | | | ... 
| +---------+ This (v3) is a re-work of the previous version which fixes some issues on the stage2_unmap path: When we register a pKVM IOMMU we unmap the MMIO region from the host stage2. While we treat most of the MMIO regions as default mappings in the coalescing change, we end up decrementing the page table page refcount for a default mapping which breaks the refcounting. Fix this by adding a check which verifies if we have a default mapping before decrementing the reference. Bug: 222044487 Test: dump the host stage2 pagetables and view the mapping Change-Id: I518fcbd7f022e77965eef54dd59dac07425db3a5 Signed-off-by: Sebastian Ene Signed-off-by: Will Deacon --- arch/arm64/include/asm/kvm_pgtable.h | 14 +++++ arch/arm64/kvm/hyp/nvhe/mem_protect.c | 27 ++++++--- arch/arm64/kvm/hyp/pgtable.c | 83 ++++++++++++++++++++------- 3 files changed, 96 insertions(+), 28 deletions(-) diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index 054612a2a7fc..04f9050c482d 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -210,6 +210,20 @@ enum kvm_pgtable_prot { #define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX #define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW +#define KVM_HOST_S2_DEFAULT_MASK (KVM_PTE_LEAF_ATTR_HI | \ + KVM_PTE_LEAF_ATTR_LO) + +#define KVM_HOST_S2_DEFAULT_MEM_PTE \ + (PTE_S2_MEMATTR(MT_S2_NORMAL) | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R | \ + KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ + KVM_PTE_LEAF_ATTR_LO_S2_AF | \ + FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, KVM_PTE_LEAF_ATTR_LO_S2_SH_IS)) + +#define KVM_HOST_S2_DEFAULT_MMIO_PTE \ + (KVM_HOST_S2_DEFAULT_MEM_PTE | \ + KVM_PTE_LEAF_ATTR_HI_S2_XN) + #define PAGE_HYP KVM_PGTABLE_PROT_RW #define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) #define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index 735bd4c0c718..74ef0d062719 100644 --- 
a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -182,7 +182,12 @@ static bool guest_stage2_force_pte_cb(u64 addr, u64 end, static bool guest_stage2_pte_is_counted(kvm_pte_t pte, u32 level) { - return host_stage2_pte_is_counted(pte, level); + /* + * The refcount tracks valid entries as well as invalid entries if they + * encode ownership of a page to another entity than the page-table + * owner, whose id is 0. + */ + return !!pte; } static void *guest_s2_zalloc_pages_exact(size_t size) @@ -617,12 +622,20 @@ static bool host_stage2_force_pte(u64 addr, u64 end, enum kvm_pgtable_prot prot) static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level) { - /* - * The refcount tracks valid entries as well as invalid entries if they - * encode ownership of a page to another entity than the page-table - * owner, whose id is 0. - */ - return !!pte; + u64 phys; + + if (!kvm_pte_valid(pte)) + return !!pte; + + if (kvm_pte_table(pte, level)) + return true; + + phys = kvm_pte_to_phys(pte); + if (addr_is_memory(phys)) + return (pte & KVM_HOST_S2_DEFAULT_MASK) != + KVM_HOST_S2_DEFAULT_MEM_PTE; + + return (pte & KVM_HOST_S2_DEFAULT_MASK) != KVM_HOST_S2_DEFAULT_MMIO_PTE; } static int host_stage2_idmap(u64 addr) diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index e48b66b744d5..0add34ea9ff2 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -690,25 +690,26 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, else new = data->annotation; - if (pte_ops->pte_is_counted_cb(old, level)) { - /* - * Skip updating the PTE if we are trying to recreate the exact - * same mapping or only change the access permissions. Instead, - * the vCPU will exit one more time from guest if still needed - * and then go through the path of relaxing permissions. 
- */ - if (!stage2_pte_needs_update(old, new)) - return -EAGAIN; + /* + * Skip updating the PTE if we are trying to recreate the exact + * same mapping or only change the access permissions. Instead, + * the vCPU will exit one more time from guest if still needed + * and then go through the path of relaxing permissions. + */ + if (!stage2_pte_needs_update(old, new)) + return -EAGAIN; - /* - * If we're only changing software bits, then we don't need to - * do anything else/ - */ - if (!((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) - goto out_set_pte; + if (pte_ops->pte_is_counted_cb(old, level)) + mm_ops->put_page(ptep); - stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); - } + /* + * If we're only changing software bits, then we don't need to + * do anything else. + */ + if (!((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW)) + goto out_set_pte; + + stage2_clear_pte(ptep, data->mmu, addr, level); /* Perform CMOs before installation of the guest stage-2 PTE */ if (mm_ops->dcache_clean_inval_poc && stage2_pte_cacheable(pgt, new)) @@ -717,10 +718,10 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (mm_ops->icache_inval_pou && stage2_pte_executable(new)) mm_ops->icache_inval_pou(kvm_pte_follow(new, mm_ops), granule); +out_set_pte: if (pte_ops->pte_is_counted_cb(new, level)) mm_ops->get_page(ptep); -out_set_pte: smp_store_release(ptep, new); if (kvm_phys_is_valid(phys)) data->phys += granule; @@ -785,8 +786,15 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * a table. Accesses beyond 'end' that fall within the new table * will be mapped lazily. */ - if (pte_ops->pte_is_counted_cb(pte, level)) + if (pte_ops->pte_is_counted_cb(pte, level)) { stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); + } else { + /* + * On non-refcounted PTEs we just clear them out without + * dropping the refcount. 
+ */ + stage2_clear_pte(ptep, data->mmu, addr, level); + } kvm_set_table_pte(ptep, childp, mm_ops); mm_ops->get_page(ptep); @@ -794,6 +802,35 @@ static int stage2_map_walk_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, return 0; } +static void stage2_coalesce_walk_table_post(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + struct stage2_map_data *data) +{ + struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; + kvm_pte_t *childp = kvm_pte_follow(*ptep, mm_ops); + + /* + * Decrement the refcount only on the set ownership path to avoid a + * loop situation when the following happens: + * 1. We take a host stage2 fault and we create a small mapping which + * has default attributes (is not refcounted). + * 2. On the way back we execute the post handler and we zap the + * table that holds our mapping. + */ + if (kvm_phys_is_valid(data->phys) || + !kvm_level_supports_block_mapping(level)) + return; + + /* + * Free a page that is not referenced anymore and drop the reference + * of the page table page. + */ + if (mm_ops->page_count(childp) == 1) { + stage2_put_pte(ptep, data->mmu, addr, level, mm_ops); + mm_ops->put_page(childp); + } +} + static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -802,8 +839,11 @@ static int stage2_map_walk_table_post(u64 addr, u64 end, u32 level, kvm_pte_t *childp; int ret = 0; - if (!data->anchor) + if (!data->anchor) { + stage2_coalesce_walk_table_post(addr, end, level, ptep, + data); return 0; + } if (data->anchor == ptep) { childp = data->childp; @@ -951,7 +991,8 @@ static int stage2_unmap_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, * block entry and rely on the remaining portions being faulted * back lazily. */ - stage2_put_pte(ptep, mmu, addr, level, mm_ops); + if (pte_ops->pte_is_counted_cb(pte, level)) + stage2_put_pte(ptep, mmu, addr, level, mm_ops); if (need_flush && mm_ops->dcache_clean_inval_poc) mm_ops->dcache_clean_inval_poc(kvm_pte_follow(pte, mm_ops),