ANDROID: arm64: Implement hypervisor workaround for SoCs with DMA beyond the PoC

SoCs featuring peripherals that can issue non-coherent DMA traffic
beyond the point of coherency (PoC) present multiple challenges for the
DMA-API implementation in Linux. Many of these challenges can be
overcome by suitable configuration of the interconnect, however the
presence of a cacheable alias for non-cacheable buffers can still lead
to coherence issues arising when stale clean lines are back-snooped from
the cache hierarchy to satisfy a non-cacheable transaction at the PoC.

Removing all cacheable aliases on a case-by-case basis is both
error-prone and expensive. Instead, leverage the stage-2 identity
mapping installed by pKVM to enforce consistent cacheability for all
stage-1 aliases.

Bug: 240786634
Change-Id: I78b0aa51fe3e23811bbd25481173086aa957c4bf
Signed-off-by: Will Deacon <willdeacon@google.com>
This commit is contained in:
Will Deacon
2023-03-23 14:08:58 +00:00
parent 148ab83891
commit 18c78ecd82
14 changed files with 395 additions and 19 deletions

View File

@@ -2420,6 +2420,19 @@
for all guests.
Default is 1 (enabled) if in 64-bit or 32-bit PAE mode.
kvm-arm.force_nc
[KVM,ARM,ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC]
Enable hypercalls to remap host pages as normal
non-cacheable at stage-2 and issue these hypercalls
when installing non-cacheable ptes at stage-1. This
is useful to work around coherency issues on systems
with DMA peripherals integrated beyond the Point of
Coherency (PoC).
This option only applies when booting with
kvm-arm.mode=protected.
kvm-arm.mode=
[KVM,ARM] Select one of KVM/arm64's modes of operation.

View File

@@ -1031,6 +1031,23 @@ config SOCIONEXT_SYNQUACER_PREITS
If unsure, say Y.
config ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC
bool "Remove cacheable aliases of non-cacheable DMA buffers at stage-2"
default y
depends on KVM
help
Some SoCs integrate non-coherent DMA-capable peripherals beyond
the Point of Coherency (PoC), resulting in loss of coherency
with non-cacheable mappings on the CPU in the presence of a
cacheable alias.
This workaround provides a mechanism (controlled by the kernel
command-line) to remap pages as non-cacheable in pKVM's stage-2
mapping for the host, thereby removing any cacheable aliases
that may be present in the stage-1 mapping.
If unsure, say Y.
endmenu

View File

@@ -0,0 +1,64 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2023 - Google LLC
* Author: Will Deacon <willdeacon@google.com>
*/
#ifndef _ASM_ARM64_ANDROID_ERRATUM_PGTABLE_H
#define _ASM_ARM64_ANDROID_ERRATUM_PGTABLE_H
#ifndef __ASM_PGTABLE_H
#error "Please don't include this header directly."
#endif
#ifdef CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC
extern void pkvm_host_set_stage2_memattr(phys_addr_t addr, bool force_nc);
extern __init int pkvm_register_early_nc_mappings(void);
DECLARE_STATIC_KEY_FALSE(pkvm_force_nc);
/*
 * Report whether a mapping created with @prot requires the host stage-2
 * attributes to be updated.
 *
 * Always false while the pkvm_force_nc static key is disabled; otherwise
 * true only when the memory-attribute index of @prot selects MT_NORMAL_NC.
 */
static inline bool prot_needs_stage2_update(pgprot_t prot)
{
	if (!static_branch_unlikely(&pkvm_force_nc))
		return false;

	return (pgprot_val(prot) & PTE_ATTRINDX_MASK) ==
	       PTE_ATTRINDX(MT_NORMAL_NC);
}
/*
 * Keep the host stage-2 attributes in step with a stage-1 PTE update.
 *
 * @ptep: stage-1 entry about to be written.
 * @pte:  value about to be written to @ptep.
 *
 * Nothing happens unless the pkvm_force_nc static key is enabled and the
 * update flips the entry between valid and invalid. For such a transition,
 * if the entry being installed (or, on unmap, the entry being removed) uses
 * the MT_NORMAL_NC attribute index, pkvm_host_set_stage2_memattr() is called
 * for its physical address with force_nc=true on map and false on unmap.
 */
static inline void arm64_update_cacheable_aliases(pte_t *ptep, pte_t pte)
{
	pte_t old_pte;
	bool force_nc;

	/*
	 * Test the static key before touching *ptep so that the common
	 * (workaround disabled) path does not pay for a volatile load.
	 */
	if (!static_branch_unlikely(&pkvm_force_nc))
		return;

	old_pte = READ_ONCE(*ptep);

	/* Only valid <-> invalid transitions are of interest. */
	if (pte_valid(old_pte) == pte_valid(pte))
		return;

	if (!pte_valid(pte)) {
		/* Unmap: attributes and address come from the old entry. */
		force_nc = false;
		pte = old_pte;
	} else {
		force_nc = true;
	}

	if ((pte_val(pte) & PTE_ATTRINDX_MASK) == PTE_ATTRINDX(MT_NORMAL_NC))
		pkvm_host_set_stage2_memattr(__pte_to_phys(pte), force_nc);
}
/*
 * PMD-level setter used when the workaround is configured in: warn if the
 * entry being installed carries the MT_NORMAL_NC attribute index while the
 * pkvm_force_nc key is enabled, then install it through set_pte_at().
 * NOTE(review): presumably block mappings are unexpected here because the
 * stage-2 update is issued one page at a time -- confirm.
 */
#define set_pmd_at(mm, addr, pmdp, pmd) do { \
WARN_ON(prot_needs_stage2_update(__pgprot(pmd_val(pmd)))); \
set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd)); \
} while (0)
/*
 * PUD-level counterpart of set_pmd_at(): same warning, then set_pte_at().
 */
#define set_pud_at(mm, addr, pudp, pud) do { \
WARN_ON(prot_needs_stage2_update(__pgprot(pud_val(pud)))); \
set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud)); \
} while (0)
#else
/*
 * Workaround compiled out: PTE updates need no stage-2 bookkeeping and no
 * protection value ever requires a stage-2 update.
 */
static inline void arm64_update_cacheable_aliases(pte_t *ptep, pte_t pte) { }
static inline bool prot_needs_stage2_update(pgprot_t prot) { return false; }
#endif /* CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC */
#endif /* _ASM_ARM64_ANDROID_ERRATUM_PGTABLE_H */

View File

@@ -103,6 +103,9 @@ enum __kvm_host_smccc_func {
__KVM_HOST_SMCCC_FUNC___pkvm_rb_swap_reader_page,
__KVM_HOST_SMCCC_FUNC___pkvm_rb_update_footers,
__KVM_HOST_SMCCC_FUNC___pkvm_enable_event,
#ifdef CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC
__KVM_HOST_SMCCC_FUNC___pkvm_host_set_stage2_memattr,
#endif
/*
* Start of the dynamically registered hypercalls. Start a bit

View File

@@ -312,6 +312,8 @@ static inline void __check_racy_pte_update(struct mm_struct *mm, pte_t *ptep,
__func__, pte_val(old_pte), pte_val(pte));
}
#include <asm/android_erratum_pgtable.h>
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
pte_t *ptep, pte_t pte)
{
@@ -340,6 +342,7 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
__check_racy_pte_update(mm, ptep, pte);
arm64_update_cacheable_aliases(ptep, pte);
set_pte(ptep, pte);
}
@@ -485,8 +488,13 @@ static inline pmd_t pmd_mkdevmap(pmd_t pmd)
#define pud_pfn(pud) ((__pud_to_phys(pud) & PUD_MASK) >> PAGE_SHIFT)
#define pfn_pud(pfn,prot) __pud(__phys_to_pud_val((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))
#ifndef set_pmd_at
#define set_pmd_at(mm, addr, pmdp, pmd) set_pte_at(mm, addr, (pte_t *)pmdp, pmd_pte(pmd))
#endif
#ifndef set_pud_at
#define set_pud_at(mm, addr, pudp, pud) set_pte_at(mm, addr, (pte_t *)pudp, pud_pte(pud))
#endif
#define __p4d_to_phys(p4d) __pte_to_phys(p4d_pte(p4d))
#define __phys_to_p4d_val(phys) __phys_to_pte_val(phys)
@@ -869,6 +877,7 @@ static inline int pmdp_test_and_clear_young(struct vm_area_struct *vma,
/*
 * Atomically replace *ptep with 0 and return the previous entry.
 *
 * arm64_update_cacheable_aliases() is invoked first, with an all-zero
 * (invalid) new value, so that it can inspect the entry that is about to be
 * removed before the exchange overwrites it.
 */
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
unsigned long address, pte_t *ptep)
{
arm64_update_cacheable_aliases(ptep, __pte(0));
return __pte(xchg_relaxed(&pte_val(*ptep), 0));
}

View File

@@ -9,6 +9,9 @@
#define arch_vmap_pud_supported arch_vmap_pud_supported
static inline bool arch_vmap_pud_supported(pgprot_t prot)
{
if (prot_needs_stage2_update(prot))
return false;
/*
* Only 4k granule supports level 1 block mappings.
* SW table walks can't handle removal of intermediate entries.
@@ -20,6 +23,9 @@ static inline bool arch_vmap_pud_supported(pgprot_t prot)
#define arch_vmap_pmd_supported arch_vmap_pmd_supported
/*
 * PMD-level vmap block mappings are refused when @prot would need a
 * stage-2 attribute update, and otherwise allowed only when
 * CONFIG_PTDUMP_DEBUGFS is disabled (see arch_vmap_pud_supported()).
 */
static inline bool arch_vmap_pmd_supported(pgprot_t prot)
{
	return !prot_needs_stage2_update(prot) &&
	       !IS_ENABLED(CONFIG_PTDUMP_DEBUGFS);
}

View File

@@ -134,6 +134,10 @@ KVM_NVHE_ALIAS(__hyp_event_ids_end);
/* pKVM static key */
KVM_NVHE_ALIAS(kvm_protected_mode_initialized);
#ifdef CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC
KVM_NVHE_ALIAS(pkvm_force_nc);
#endif
#endif /* CONFIG_KVM */
#endif /* __ARM64_KERNEL_IMAGE_VARS_H */

View File

@@ -25,6 +25,7 @@ kvm-y := $(KVM)/kvm_main.o $(KVM)/coalesced_mmio.o $(KVM)/eventfd.o \
vgic/vgic-its.o vgic/vgic-debug.o
kvm-$(CONFIG_HW_PERF_EVENTS) += pmu-emul.o pmu.o
kvm-$(CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC) += android_erratum_pgtable.o
kvm-$(CONFIG_TRACING) += hyp_events.o hyp_trace.o

View File

@@ -0,0 +1,108 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2023 - Google LLC
* Author: Will Deacon <willdeacon@google.com>
*/
#include <asm/kvm_host.h>
#include <asm/pgtable.h>
#include <linux/init.h>
#include <linux/jump_label.h>
#include <linux/memblock.h>
DEFINE_STATIC_KEY_FALSE(pkvm_force_nc);
/*
 * Early handler for the "kvm-arm.force_nc" command-line parameter: enables
 * the pkvm_force_nc static key.
 *
 * @arg is not examined, so the presence of the parameter alone turns the
 * workaround on, whatever value (if any) follows it. Always returns 0.
 */
static int __init early_pkvm_force_nc_cfg(char *arg)
{
static_branch_enable(&pkvm_force_nc);
return 0;
}
early_param("kvm-arm.force_nc", early_pkvm_force_nc_cfg);
/*
 * pkvm_host_set_stage2_memattr() (defined further down) updates the stage-2
 * memory attributes (cacheability) for a page, usually in response to mapping
 * or unmapping a normal non-cacheable region at stage-1.
 *
 * If 'force_nc' is set, the stage-2 entry is immediately made non-cacheable
 * (and cleaned+invalidated to the PoC); otherwise the entry is unmapped and
 * the cacheability is determined based on the stage-1 attribute of the next
 * access (with no cache maintenance being performed).
 */
/*
 * A physically contiguous range [start, end) of pages that were mapped
 * non-cacheable at stage-1 before pKVM finished initialising. An entry with
 * start == end is treated as empty.
 */
struct pkvm_host_nc_region {
phys_addr_t start;
phys_addr_t end;
};
/* Capacity of the early table; mappings beyond it trigger a WARN and are not recorded. */
#define PKVM_HOST_MAX_EARLY_NC_REGIONS 8
/* Zero-initialised table of early regions, filled in order from index 0. */
static struct pkvm_host_nc_region
pkvm_host_early_nc_regions[PKVM_HOST_MAX_EARLY_NC_REGIONS];
/*
 * Record the page at @addr in the table of early non-cacheable regions.
 *
 * The page is appended to the region currently being built when it directly
 * follows that region's end; otherwise a new region is started. If a new
 * region is needed but the table is already on its last slot, a warning is
 * raised and the page is not recorded.
 */
static void pkvm_host_track_early_nc_mapping(phys_addr_t addr)
{
	/* Index of the region currently being grown (starts at zero). */
	static int cur;
	struct pkvm_host_nc_region *region = &pkvm_host_early_nc_regions[cur];

	if (region->start != region->end) {
		/* Contiguous with the current region: just extend it. */
		if (region->end == addr) {
			region->end = addr + PAGE_SIZE;
			return;
		}

		/* Discontiguous: move on to the next slot, if there is one. */
		if (WARN_ON(cur == PKVM_HOST_MAX_EARLY_NC_REGIONS - 1))
			return;

		region = &pkvm_host_early_nc_regions[++cur];
	}

	/* Start a fresh single-page region. */
	region->start = addr;
	region->end = addr + PAGE_SIZE;
}
/*
 * Ask the hypervisor to change the stage-2 cacheability of the page at @addr.
 *
 * @addr:     physical address of the page.
 * @force_nc: passed through to the __pkvm_host_set_stage2_memattr hypercall.
 *
 * Does nothing unless KVM runs in protected mode and @addr is mapped
 * memblock memory. Before pKVM is initialised no hypercall is made: requests
 * with @force_nc set are recorded for later registration, and requests
 * without it trigger a one-time warning and are dropped.
 */
void pkvm_host_set_stage2_memattr(phys_addr_t addr, bool force_nc)
{
	int ret;

	/*
	 * Non-memory regions or carveouts marked as "no-map" are handled
	 * entirely by their corresponding driver, which should avoid the
	 * creation of a cacheable alias in the first place.
	 */
	if (kvm_get_mode() != KVM_MODE_PROTECTED ||
	    !memblock_is_map_memory(addr))
		return;

	if (is_pkvm_initialized()) {
		ret = kvm_call_hyp_nvhe(__pkvm_host_set_stage2_memattr, addr,
					force_nc);
		/* -EAGAIN is tolerated; any other failure is unexpected. */
		WARN_ON(ret && ret != -EAGAIN);
		return;
	}

	/* Too early for hypercalls: only "make NC" requests can be deferred. */
	if (WARN_ON_ONCE(!force_nc))
		return;

	pkvm_host_track_early_nc_mapping(addr);
}
EXPORT_SYMBOL_GPL(pkvm_host_set_stage2_memattr);
/*
 * Replay the non-cacheable mappings recorded before pKVM was initialised by
 * issuing the __pkvm_host_set_stage2_memattr hypercall (force_nc = true) for
 * every page of every recorded region.
 *
 * Each region's start is advanced as its pages are processed. Walking stops
 * at the first empty region.
 *
 * Returns 0 on success, or if pKVM is not initialised; otherwise the first
 * non-zero hypercall result.
 */
int __init pkvm_register_early_nc_mappings(void)
{
	struct pkvm_host_nc_region *reg;
	int ret;

	if (!is_pkvm_initialized())
		return 0;

	for (reg = pkvm_host_early_nc_regions;
	     reg < pkvm_host_early_nc_regions + PKVM_HOST_MAX_EARLY_NC_REGIONS;
	     reg++) {
		/* Regions are filled in order, so an empty one ends the list. */
		if (reg->start == reg->end)
			break;

		for (; reg->start != reg->end; reg->start += PAGE_SIZE) {
			ret = kvm_call_hyp_nvhe(__pkvm_host_set_stage2_memattr,
						reg->start, true);
			if (ret)
				return ret;
		}
	}

	return 0;
}

View File

@@ -16,6 +16,7 @@ extern const struct pkvm_module_ops module_ops;
int hyp_create_pcpu_fixmap(void);
void *hyp_fixmap_map(phys_addr_t phys);
void *hyp_fixmap_map_nc(phys_addr_t phys);
void hyp_fixmap_unmap(void);
void hyp_poison_page(phys_addr_t phys);

View File

@@ -1263,6 +1263,17 @@ static void handle___pkvm_enable_event(struct kvm_cpu_context *host_ctxt)
cpu_reg(host_ctxt, 1) = __pkvm_enable_event(id, enable);
}
#ifdef CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC
extern int __pkvm_host_set_stage2_memattr(phys_addr_t phys, bool force_nc);
/*
 * Host hypercall handler: takes the physical address from host register 1
 * and the force_nc flag from host register 2, calls
 * __pkvm_host_set_stage2_memattr() and stores its return value back in host
 * register 1.
 */
static void handle___pkvm_host_set_stage2_memattr(struct kvm_cpu_context *host_ctxt)
{
DECLARE_REG(phys_addr_t, phys, host_ctxt, 1);
DECLARE_REG(bool, force_nc, host_ctxt, 2);
cpu_reg(host_ctxt, 1) = __pkvm_host_set_stage2_memattr(phys, force_nc);
}
#endif
typedef void (*hcall_t)(struct kvm_cpu_context *);
#define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -1315,6 +1326,9 @@ static const hcall_t host_hcall[] = {
HANDLE_FUNC(__pkvm_rb_swap_reader_page),
HANDLE_FUNC(__pkvm_rb_update_footers),
HANDLE_FUNC(__pkvm_enable_event),
#ifdef CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC
HANDLE_FUNC(__pkvm_host_set_stage2_memattr),
#endif
};
unsigned long pkvm_priv_hcall_limit __ro_after_init = __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize;

View File

@@ -539,23 +539,10 @@ static inline bool range_included(struct kvm_mem_range *child,
return parent->start <= child->start && child->end <= parent->end;
}
static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range)
static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range,
u32 level)
{
struct kvm_mem_range cur;
kvm_pte_t pte;
u32 level;
int ret;
hyp_assert_lock_held(&host_mmu.lock);
ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level);
if (ret)
return ret;
if (kvm_pte_valid(pte))
return -EAGAIN;
if (pte)
return -EPERM;
do {
u64 granule = kvm_granule_size(level);
@@ -641,15 +628,141 @@ static bool host_stage2_pte_is_counted(kvm_pte_t pte, u32 level)
return (pte & KVM_HOST_S2_DEFAULT_MASK) != KVM_HOST_S2_DEFAULT_MMIO_PTE;
}
static int host_stage2_idmap(u64 addr)
#define DEFERRED_MEMATTR_NOTE (1ULL << 24)
#ifdef CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC
static enum pkvm_page_state host_get_page_state(kvm_pte_t pte, u64 addr);
/*
 * Change the host stage-2 cacheability of the page containing @phys.
 *
 * @phys:     physical address; rounded down to a page boundary.
 * @force_nc: if true, identity-map the page with KVM_PGTABLE_PROT_NC added
 *            to PKVM_HOST_MEM_PROT and flush it to the PoC through a
 *            non-cacheable fixmap mapping; if false, replace the stage-2
 *            entry with the DEFERRED_MEMATTR_NOTE annotation.
 *
 * The request is only honoured when the existing leaf is either the
 * DEFERRED_MEMATTR_NOTE annotation or describes a page whose state is
 * PKVM_PAGE_OWNED.
 *
 * Returns 0 on success, -ENOENT if the pkvm_force_nc key is disabled, -EIO
 * if @phys is not memory, -EPERM if the page may not be changed, or the
 * error from the page-table helpers. Runs under host_mmu.lock.
 */
int __pkvm_host_set_stage2_memattr(phys_addr_t phys, bool force_nc)
{
	kvm_pte_t leaf;
	bool allowed;
	int err;

	if (!static_branch_unlikely(&pkvm_force_nc))
		return -ENOENT;

	phys = ALIGN_DOWN(phys, PAGE_SIZE);

	hyp_spin_lock(&host_mmu.lock);

	err = kvm_pgtable_get_leaf(&host_mmu.pgt, phys, &leaf, NULL);
	if (err)
		goto out_unlock;

	if (!addr_is_memory(phys)) {
		err = -EIO;
		goto out_unlock;
	}

	/*
	 * A non-zero invalid entry is an annotation: only our own deferred
	 * note may be overwritten. Anything else must be owned by the host.
	 */
	if (!kvm_pte_valid(leaf) && leaf)
		allowed = leaf == DEFERRED_MEMATTR_NOTE;
	else
		allowed = host_get_page_state(leaf, phys) == PKVM_PAGE_OWNED;

	if (!allowed) {
		err = -EPERM;
		goto out_unlock;
	}

	if (!force_nc) {
		err = kvm_pgtable_stage2_annotate(&host_mmu.pgt, phys,
						  PAGE_SIZE, &host_s2_pool,
						  DEFERRED_MEMATTR_NOTE);
		goto out_unlock;
	}

	err = host_stage2_idmap_locked(phys, PAGE_SIZE,
				       PKVM_HOST_MEM_PROT |
				       KVM_PGTABLE_PROT_NC,
				       false);
	if (err)
		goto out_unlock;

	/* Flush the page to the PoC via a non-cacheable hyp mapping. */
	kvm_flush_dcache_to_poc(hyp_fixmap_map_nc(phys), PAGE_SIZE);
	hyp_fixmap_unmap();

out_unlock:
	hyp_spin_unlock(&host_mmu.lock);
	return err;
}
/*
 * Decide the stage-2 cacheability for a host fault on a page annotated with
 * DEFERRED_MEMATTR_NOTE, based on the stage-1 attributes of the faulting
 * access.
 *
 * @fault: fault information; esr_el2 and far_el2 are consulted.
 * @addr:  faulting address, used to derive the single-page range.
 * @prot:  in/out; KVM_PGTABLE_PROT_NC is OR-ed in for a Normal-NC access.
 * @range: in/out; narrowed to the page containing @addr for a Normal-NC
 *         access, otherwise left untouched.
 *
 * Returns 0 on success (with or without modifying @prot/@range) and -EAGAIN
 * if the stage-1 translation of the faulting address fails.
 */
static int handle_memattr_annotation(struct kvm_vcpu_fault_info *fault,
u64 addr, enum kvm_pgtable_prot *prot,
struct kvm_mem_range *range)
{
u64 par, oldpar;
/* If the S1 MMU is disabled, treat the access as cacheable */
if (unlikely(!(read_sysreg(sctlr_el1) & SCTLR_ELx_M)))
return 0;
/* If we took a fault on a PTW, then treat it as cacheable */
if (fault->esr_el2 & ESR_ELx_S1PTW)
return 0;
/*
 * Translate the faulting VA with a stage-1 EL1 read AT instruction. PAR_EL1
 * is saved beforehand and written back afterwards so its previous contents
 * survive; a failed AT is reported as a faulting PAR value.
 */
oldpar = read_sysreg_par();
if (!__kvm_at("s1e1r", fault->far_el2))
par = read_sysreg_par();
else
par = SYS_PAR_EL1_F;
write_sysreg(oldpar, par_el1);
if (unlikely(par & SYS_PAR_EL1_F))
return -EAGAIN;
/* Bits [63:56] of the PAR value are compared against the Normal-NC encoding. */
if ((par >> 56) == MAIR_ATTR_NORMAL_NC) {
range->start = ALIGN_DOWN(addr, PAGE_SIZE);
range->end = range->start + PAGE_SIZE;
*prot |= KVM_PGTABLE_PROT_NC;
}
return 0;
}
#else
/*
 * Workaround compiled out: unconditionally returns -EPERM without touching
 * @prot or @range.
 */
static int handle_memattr_annotation(struct kvm_vcpu_fault_info *fault,
u64 addr, enum kvm_pgtable_prot *prot,
struct kvm_mem_range *range)
{
return -EPERM;
}
#endif
static int host_stage2_idmap(struct kvm_vcpu_fault_info *fault, u64 addr)
{
struct kvm_mem_range range;
bool is_memory = !!find_mem_range(addr, &range);
enum kvm_pgtable_prot prot = default_host_prot(is_memory);
kvm_pte_t pte;
u32 level;
int ret;
hyp_assert_lock_held(&host_mmu.lock);
ret = kvm_pgtable_get_leaf(&host_mmu.pgt, addr, &pte, &level);
if (ret)
return ret;
if (kvm_pte_valid(pte))
return -EAGAIN;
if (pte) {
if (!is_memory)
return -EPERM;
switch (pte) {
case DEFERRED_MEMATTR_NOTE:
ret = handle_memattr_annotation(fault, addr, &prot,
&range);
if (ret)
return ret;
break;
default:
return -EPERM;
}
}
/*
* Adjust against IOMMU devices first. host_stage2_adjust_range() should
* be called last for proper alignment.
@@ -661,7 +774,7 @@ static int host_stage2_idmap(u64 addr)
return ret;
}
ret = host_stage2_adjust_range(addr, &range);
ret = host_stage2_adjust_range(addr, &range, level);
if (ret)
return ret;
@@ -750,6 +863,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
esr = read_sysreg_el2(SYS_ESR);
BUG_ON(!__get_fault_info(esr, &fault));
fault.esr_el2 = esr;
addr = (fault.hpfar_el2 & HPFAR_MASK) << 8;
addr |= fault.far_el2 & FAR_MASK;
@@ -763,7 +877,7 @@ void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt)
/* If not handled, attempt to map the page. */
if (ret == -EPERM)
ret = host_stage2_idmap(addr);
ret = host_stage2_idmap(&fault, addr);
host_unlock_component();

View File

@@ -309,12 +309,29 @@ void *hyp_fixmap_map(phys_addr_t phys)
return (void *)slot->addr + offset_in_page(phys);
}
#define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2)
/*
 * Map the page containing @phys into this CPU's hyp fixmap slot using the
 * MT_NORMAL_NC (Normal Non-Cacheable) attribute index.
 *
 * The attribute index is OR-ed into the slot's PTE, so this relies on the
 * index field being zero (MT_NORMAL) on entry, which is how the slot is left
 * when it is cleared.
 *
 * Returns the hyp virtual address corresponding to @phys, i.e. the slot
 * address plus the offset of @phys within its page, matching
 * hyp_fixmap_map(). For a page-aligned @phys this is the slot address.
 */
void *hyp_fixmap_map_nc(phys_addr_t phys)
{
	struct hyp_fixmap_slot *slot = this_cpu_ptr(&fixmap_slots);
	kvm_pte_t pte, *ptep = slot->ptep;

	pte = *ptep;
	/* Drop the old output address before installing the new one. */
	pte &= ~kvm_phys_to_pte(KVM_PHYS_INVALID);
	pte |= kvm_phys_to_pte(phys) | KVM_PTE_VALID |
	       FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX, MT_NORMAL_NC);
	WRITE_ONCE(*ptep, pte);
	/* Make the PTE update visible before the mapping is used. */
	dsb(ishst);

	return (void *)slot->addr + offset_in_page(phys);
}
static void fixmap_clear_slot(struct hyp_fixmap_slot *slot)
{
kvm_pte_t *ptep = slot->ptep;
u64 addr = slot->addr;
WRITE_ONCE(*ptep, *ptep & ~KVM_PTE_VALID);
/* Zap the memory type too. MT_NORMAL is 0 so the fixmap is cacheable by default */
WRITE_ONCE(*ptep, *ptep & ~(KVM_PTE_VALID | KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX));
/*
* Irritatingly, the architecture requires that we use inner-shareable

View File

@@ -514,6 +514,11 @@ static int __init finalize_pkvm(void)
pkvm_firmware_rmem_clear();
}
#ifdef CONFIG_ANDROID_ARM64_WORKAROUND_DMA_BEYOND_POC
if (!ret)
ret = pkvm_register_early_nc_mappings();
#endif
return ret;
}
device_initcall_sync(finalize_pkvm);