ANDROID: KVM: arm64: Shadow table for KVM EL2 state

Create and populate a shadow table that contains the state hyp needs for running protected VMs, i.e., struct kvm and struct kvm_vcpu at EL2. The memory for this is donated by the host and then unmapped from the host at stage 1 and at stage 2 (by hyp). This state is not used yet. Signed-off-by: Fuad Tabba <tabba@google.com> Bug: 209580772 Change-Id: Ie2d948f2a5f22a06d615d909de7a60d46944e6d8 Signed-off-by: Will Deacon <willdeacon@google.com>
2026-06-07 03:15:31 +09:00 · 2021-02-03 17:17:01 +00:00
parent 03c9459126
commit bd5993889e
10 changed files with 582 additions and 3 deletions
--- a/arch/arm64/include/asm/kvm_asm.h
+++ b/arch/arm64/include/asm/kvm_asm.h
@@ -75,6 +75,8 @@ enum __kvm_host_smccc_func {
 	__KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs,
 	__KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs,
 	__KVM_HOST_SMCCC_FUNC___pkvm_vcpu_init_traps,
+	__KVM_HOST_SMCCC_FUNC___pkvm_init_shadow,
+	__KVM_HOST_SMCCC_FUNC___pkvm_teardown_shadow,
 };

 #define DECLARE_KVM_VHE_SYM(sym)	extern char sym[]
--- a/arch/arm64/include/asm/kvm_host.h
+++ b/arch/arm64/include/asm/kvm_host.h
@@ -102,6 +102,11 @@ struct kvm_s2_mmu {
 struct kvm_arch_memory_slot {
 };

+struct kvm_protected_vm {
+	bool enabled;
+	int shadow_handle;
+};
+
 struct kvm_arch {
 	struct kvm_s2_mmu mmu;

@@ -137,6 +142,22 @@ struct kvm_arch {

 	/* Memory Tagging Extension enabled for the guest */
 	bool mte_enabled;
+
+	struct kvm_protected_vm pkvm;
+};
+
+struct kvm_protected_vcpu {
+	/* A unique id to the shadow structs in the hyp shadow area. */
+	int shadow_handle;
+
+	/* A pointer to the host's vcpu. */
+	struct kvm_vcpu *host_vcpu;
+
+	/* A pointer to the shadow vm. */
+	struct kvm_shadow_vm *shadow_vm;
+
+	/* Tracks exit code for the protected guest. */
+	int exit_code;
 };

 struct kvm_vcpu_fault_info {
@@ -390,6 +411,8 @@ struct kvm_vcpu_arch {
 		u64 last_steal;
 		gpa_t base;
 	} steal;
+
+	struct kvm_protected_vcpu pkvm;
 };

 /* Pointer to the vcpu's SVE FFR for sve_{save,load}_state() */
--- a/arch/arm64/include/asm/kvm_pkvm.h
+++ b/arch/arm64/include/asm/kvm_pkvm.h
@@ -11,6 +11,9 @@
 #include <asm/kvm_pgtable.h>
 #include <asm/sysreg.h>

+/* Maximum number of protected VMs that can be created. */
+#define KVM_MAX_PVMS 255
+
 #define HYP_MEMBLOCK_REGIONS 128

 /*
@@ -196,7 +199,6 @@
 	ARM64_FEATURE_MASK(ID_AA64ISAR1_I8MM) \
 	)

-
 extern struct memblock_region kvm_nvhe_sym(hyp_memory)[];
 extern unsigned int kvm_nvhe_sym(hyp_memblock_nr);

@@ -226,6 +228,11 @@ static inline unsigned long hyp_vmemmap_pages(size_t vmemmap_entry_size)
 	return res >> PAGE_SHIFT;
 }

+static inline unsigned long hyp_shadow_table_pages(size_t shadow_entry_size)
+{
+	return PAGE_ALIGN(KVM_MAX_PVMS * shadow_entry_size) >> PAGE_SHIFT;
+}
+
 static inline unsigned long __hyp_pgtable_max_pages(unsigned long nr_pages)
 {
 	unsigned long total = 0, i;
--- a/arch/arm64/kvm/arm.c
+++ b/arch/arm64/kvm/arm.c
@@ -167,6 +167,14 @@ vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf)
 	return VM_FAULT_SIGBUS;
 }

+static void kvm_shadow_destroy(struct kvm *kvm)
+{
+	if (!kvm_vm_is_protected(kvm))
+		return;
+
+	if (kvm->arch.pkvm.shadow_handle)
+		WARN_ON(kvm_call_hyp_nvhe(__pkvm_teardown_shadow, kvm));
+}

 /**
 * kvm_arch_destroy_vm - destroy the VM data structure
@@ -179,6 +187,7 @@ void kvm_arch_destroy_vm(struct kvm *kvm)
 	bitmap_free(kvm->arch.pmu_filter);

 	kvm_vgic_destroy(kvm);
+	kvm_shadow_destroy(kvm);

 	for (i = 0; i < KVM_MAX_VCPUS; ++i) {
 		if (kvm->vcpus[i]) {
--- a/arch/arm64/kvm/hyp/hyp-constants.c
+++ b/arch/arm64/kvm/hyp/hyp-constants.c
@@ -2,9 +2,12 @@

 #include <linux/kbuild.h>
 #include <nvhe/memory.h>
+#include <nvhe/pkvm.h>

 int main(void)
 {
 	DEFINE(STRUCT_HYP_PAGE_SIZE,	sizeof(struct hyp_page));
+	DEFINE(KVM_SHADOW_VM_SIZE,	sizeof(struct kvm_shadow_vm));
+	DEFINE(SHADOW_VCPU_STATE_SIZE,	sizeof(struct shadow_vcpu_state));
 	return 0;
 }
--- a/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
+++ b/arch/arm64/kvm/hyp/include/nvhe/pkvm.h
@@ -9,6 +9,46 @@

 #include <asm/kvm_pkvm.h>

+/*
+ * A container for the vcpu state that hyp needs to maintain for protected VMs.
+ */
+struct shadow_vcpu_state {
+	struct kvm_shadow_vm *vm;
+	struct kvm_vcpu vcpu;
+};
+
+/*
+ * Holds the relevant data for running a protected vm.
+ */
+struct kvm_shadow_vm {
+	/* A unique id to the shadow structs in the hyp shadow area. */
+	int shadow_handle;
+
+	/* A pointer to the s2 mmu for the protected vm. */
+	struct kvm_s2_mmu *mmu;
+
+	/* Number of vcpus for the vm. */
+	int created_vcpus;
+
+	/* Pointers to the shadow vcpus of the shadow vm. */
+	struct kvm_vcpu *vcpus[KVM_MAX_VCPUS];
+
+	/* The host's kvm structure. */
+	struct kvm *host_kvm;
+
+	/* The total size of the donated shadow area. */
+	size_t shadow_area_size;
+
+	/* Shadow state per vcpu, laid out directly after this struct. */
+	struct shadow_vcpu_state shadow_vcpus[];
+};
+
+extern struct kvm_shadow_vm **shadow_table;
+
+int __pkvm_init_shadow(struct kvm *kvm, void *shadow_va, size_t size);
+int __pkvm_teardown_shadow(struct kvm *kvm);
+struct kvm_vcpu *hyp_get_shadow_vcpu(const struct kvm_vcpu *host_vcpu);
+
 u64 pvm_read_id_reg(const struct kvm_vcpu *vcpu, u32 id);
 bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code);
 bool kvm_handle_pvm_restricted(struct kvm_vcpu *vcpu, u64 *exit_code);
--- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c
+++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c
@@ -15,6 +15,7 @@

 #include <nvhe/mem_protect.h>
 #include <nvhe/mm.h>
+#include <nvhe/pkvm.h>
 #include <nvhe/trap_handler.h>

 DEFINE_PER_CPU(struct kvm_nvhe_init_params, kvm_init_params);
@@ -23,18 +24,54 @@ struct kvm_iommu_ops kvm_iommu_ops;

 void __kvm_hyp_host_forward_smc(struct kvm_cpu_context *host_ctxt);

+static struct kvm_vcpu *get_shadow_vcpu(struct kvm_vcpu *host_vcpu)
+{
+	struct kvm_vcpu *shadow_vcpu;
+
+	host_vcpu = kern_hyp_va(host_vcpu);
+	shadow_vcpu = hyp_get_shadow_vcpu(host_vcpu);
+
+	if (shadow_vcpu) {
+		shadow_vcpu->arch.pkvm.exit_code = 0;
+		return shadow_vcpu;
+	}
+
+	return host_vcpu;
+}
+
+static void put_shadow_vcpu(struct kvm_vcpu *shadow_vcpu, int exit_code)
+{
+	shadow_vcpu->arch.pkvm.exit_code = exit_code;
+}
+
 static void handle___kvm_vcpu_run(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);
+	struct kvm_vcpu *shadow_vcpu;
+	int ret;

-	cpu_reg(host_ctxt, 1) =  __kvm_vcpu_run(kern_hyp_va(vcpu));
+	shadow_vcpu = get_shadow_vcpu(vcpu);
+	ret = __kvm_vcpu_run(shadow_vcpu);
+
+	if (shadow_vcpu != kern_hyp_va(vcpu))
+		put_shadow_vcpu(shadow_vcpu, ret);
+
+	cpu_reg(host_ctxt, 1) =  ret;
 }

 static void handle___kvm_adjust_pc(struct kvm_cpu_context *host_ctxt)
 {
 	DECLARE_REG(struct kvm_vcpu *, vcpu, host_ctxt, 1);

-	__kvm_adjust_pc(kern_hyp_va(vcpu));
+	/*
+	 * This get_shadow_vcpu() shouldn't exist, as we would never
+	 * commit a pending update before returning to userspace, and
+	 * this is an actual attack vector (it leaves EL1 in full
+	 * control of PC).
+	 */
+	vcpu = get_shadow_vcpu(vcpu);
+
+	__kvm_adjust_pc(vcpu);
 }

 static void handle___kvm_flush_vm_context(struct kvm_cpu_context *host_ctxt)
@@ -178,6 +215,23 @@ static void handle___pkvm_vcpu_init_traps(struct kvm_cpu_context *host_ctxt)
 	__pkvm_vcpu_init_traps(kern_hyp_va(vcpu));
 }

+static void handle___pkvm_init_shadow(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
+	DECLARE_REG(void *, host_shadow_va, host_ctxt, 2);
+	DECLARE_REG(size_t, shadow_size, host_ctxt, 3);
+
+	cpu_reg(host_ctxt, 1) = __pkvm_init_shadow(host_kvm, host_shadow_va,
+						       shadow_size);
+}
+
+static void handle___pkvm_teardown_shadow(struct kvm_cpu_context *host_ctxt)
+{
+	DECLARE_REG(struct kvm *, host_kvm, host_ctxt, 1);
+
+	cpu_reg(host_ctxt, 1) = __pkvm_teardown_shadow(host_kvm);
+}
+
 typedef void (*hcall_t)(struct kvm_cpu_context *);

 #define HANDLE_FUNC(x)	[__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x
@@ -207,6 +261,8 @@ static const hcall_t host_hcall[] = {
 	HANDLE_FUNC(__vgic_v3_save_aprs),
 	HANDLE_FUNC(__vgic_v3_restore_aprs),
 	HANDLE_FUNC(__pkvm_vcpu_init_traps),
+	HANDLE_FUNC(__pkvm_init_shadow),
+	HANDLE_FUNC(__pkvm_teardown_shadow),
 };

 static void handle_host_hcall(struct kvm_cpu_context *host_ctxt)
--- a/arch/arm64/kvm/hyp/nvhe/pkvm.c
+++ b/arch/arm64/kvm/hyp/nvhe/pkvm.c
@@ -4,8 +4,16 @@
 * Author: Fuad Tabba <tabba@google.com>
 */

+#include <asm/kvm_asm.h>
+#include <asm/kvm_emulate.h>
+#include <asm/kvm_mmu.h>
+#include <asm/memory.h>
+
 #include <linux/kvm_host.h>
 #include <linux/mm.h>
+
+#include <nvhe/mem_protect.h>
+#include <nvhe/mm.h>
 #include <nvhe/pkvm.h>
 #include <nvhe/trap_handler.h>

@@ -191,3 +199,349 @@ void __pkvm_vcpu_init_traps(struct kvm_vcpu *vcpu)
 	pvm_init_traps_aa64mmfr0(vcpu);
 	pvm_init_traps_aa64mmfr1(vcpu);
 }
+
+/*
+ * Start the shadow table handle at the offset defined instead of at 0.
+ * Mainly for sanity checking and debugging.
+ */
+#define HANDLE_OFFSET 0x1000
+
+static int shadow_handle_to_index(int shadow_handle)
+{
+	return shadow_handle - HANDLE_OFFSET;
+}
+
+static int index_to_shadow_handle(int index)
+{
+	return index + HANDLE_OFFSET;
+}
+
+extern unsigned long hyp_nr_cpus;
+
+/*
+ * Spinlock for protecting the shadow table related state.
+ * Protects writes to shadow_table, num_shadow_entries, and next_shadow_alloc,
+ * as well as reads and writes to last_shadow_vcpu_lookup.
+ */
+DEFINE_HYP_SPINLOCK(shadow_lock);
+
+/*
+ * The table of shadow entries for protected VMs in hyp.
+ * Allocated at hyp initialization and setup.
+ */
+struct kvm_shadow_vm **shadow_table;
+
+/* Current number of vms in the shadow table. */
+int num_shadow_entries;
+
+/* The next entry index to try to allocate from. */
+int next_shadow_alloc;
+
+/*
+ * Return the shadow vm for the handle, or NULL if the table or entry is absent.
+ */
+static struct kvm_shadow_vm *find_shadow_by_handle(int shadow_handle)
+{
+	int shadow_index = shadow_handle_to_index(shadow_handle);
+
+	if (unlikely(!shadow_table || shadow_index < 0 || shadow_index >= KVM_MAX_PVMS))
+		return NULL;
+
+	return shadow_table[shadow_index];
+}
+
+/*
+ * Returns the hyp shadow vcpu for the corresponding host vcpu,
+ * or NULL if it fails.
+ */
+struct kvm_vcpu *hyp_get_shadow_vcpu(const struct kvm_vcpu *vcpu)
+{
+	struct shadow_vcpu_state *shadow_vcpu_state;
+	struct kvm_shadow_vm *vm;
+	int vcpu_idx;
+	int shadow_handle;
+
+	shadow_handle = vcpu->arch.pkvm.shadow_handle;
+	vm = find_shadow_by_handle(shadow_handle);
+	vcpu_idx = vcpu->vcpu_idx;
+
+	if (unlikely(!vm || vcpu_idx < 0 || vcpu_idx >= vm->created_vcpus))
+		return NULL;
+
+	shadow_vcpu_state = &vm->shadow_vcpus[vcpu_idx];
+
+	return &shadow_vcpu_state->vcpu;
+}
+
+/* Copy the supported features for the vcpu from the host. */
+static void copy_features(struct kvm_vcpu *shadow_vcpu, struct kvm_vcpu *host_vcpu)
+{
+	DECLARE_BITMAP(allowed_features, KVM_VCPU_MAX_FEATURES);
+
+	bitmap_zero(allowed_features, KVM_VCPU_MAX_FEATURES);
+
+	/*
+	 * Always allowed:
+	 * - CPU starting in poweroff state
+	 * - PSCI v0.2
+	 */
+	set_bit(KVM_ARM_VCPU_POWER_OFF, allowed_features);
+	set_bit(KVM_ARM_VCPU_PSCI_0_2, allowed_features);
+
+	/*
+	 * Check if remaining features are allowed:
+	 * - Performance Monitoring
+	 * - Scalable Vectors
+	 * - Pointer Authentication
+	 */
+	if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64DFR0_PMUVER), PVM_ID_AA64DFR0_ALLOW))
+	        set_bit(KVM_ARM_VCPU_PMU_V3, allowed_features);
+
+	if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64PFR0_SVE), PVM_ID_AA64PFR0_ALLOW))
+	        set_bit(KVM_ARM_VCPU_SVE, allowed_features);
+
+	if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_API), PVM_ID_AA64ISAR1_ALLOW) &&
+	    FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA), PVM_ID_AA64ISAR1_ALLOW))
+	        set_bit(KVM_ARM_VCPU_PTRAUTH_ADDRESS, allowed_features);
+
+	if (FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI), PVM_ID_AA64ISAR1_ALLOW) &&
+	    FIELD_GET(ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA), PVM_ID_AA64ISAR1_ALLOW))
+	        set_bit(KVM_ARM_VCPU_PTRAUTH_GENERIC, allowed_features);
+
+	bitmap_and(shadow_vcpu->arch.features, host_vcpu->arch.features,
+		allowed_features, KVM_VCPU_MAX_FEATURES);
+}
+
+static void init_shadow_structs(struct kvm *kvm, struct kvm_shadow_vm *vm, int nr_vcpus)
+{
+	int i;
+
+	/* TODO: initialize the protected MMU. For now, use the host's. */
+	vm->mmu = &kvm->arch.mmu;
+	vm->host_kvm = kvm;
+	vm->created_vcpus = 0;
+
+	for (i = 0; i < nr_vcpus; i++) {
+		struct kvm_vcpu *host_vcpu = kern_hyp_va(kvm->vcpus[i]);
+		struct shadow_vcpu_state *shadow_state = &vm->shadow_vcpus[i];
+		struct kvm_vcpu *shadow_vcpu = &shadow_state->vcpu;
+
+		shadow_vcpu->kvm = kvm;
+		shadow_vcpu->vcpu_id = host_vcpu->vcpu_id;
+		shadow_vcpu->vcpu_idx = i;
+
+		vcpu_gp_regs(shadow_vcpu)->pstate = VCPU_RESET_PSTATE_EL1;
+		*vcpu_pc(shadow_vcpu) = *vcpu_pc(host_vcpu);
+		vcpu_set_reg(shadow_vcpu, 0, vcpu_get_reg(host_vcpu, 0));
+
+		kvm_reset_pvm_sys_regs(shadow_vcpu);
+
+		copy_features(shadow_vcpu, host_vcpu);
+
+		vm->vcpus[i] = shadow_vcpu;
+		shadow_state->vm = vm;
+
+		/* TODO - use &vm->arch.mmu when setup properly */
+		shadow_vcpu->arch.hw_mmu = host_vcpu->arch.hw_mmu;
+		shadow_vcpu->arch.pkvm.shadow_handle = vm->shadow_handle;
+		shadow_vcpu->arch.pkvm.host_vcpu = host_vcpu;
+		shadow_vcpu->arch.pkvm.shadow_vm = vm;
+
+		vm->created_vcpus++;
+	}
+}
+
+static bool exists_shadow(struct kvm *host_kvm)
+{
+	int i;
+	int num_checked = 0;
+
+	for (i = 0; i < KVM_MAX_PVMS && num_checked < num_shadow_entries; i++) {
+		if (!shadow_table[i])
+			continue;
+
+		if (unlikely(shadow_table[i]->host_kvm == host_kvm))
+			return true;
+
+		num_checked++;
+	}
+
+	return false;
+}
+
+/*
+ * Allocate a shadow table entry and insert a pointer to the shadow vm.
+ *
+ * Return a unique handle to the protected VM on success,
+ * negative error code on failure.
+ */
+static int __insert_shadow_table(struct kvm *kvm, struct kvm_shadow_vm *vm,
+			         size_t shadow_size)
+{
+	int shadow_handle;
+
+	if (unlikely(num_shadow_entries >= KVM_MAX_PVMS))
+		return -ENOMEM;
+
+	/*
+	 * Initializing protected state might have failed, yet a malicious host
+	 * could trigger this function. Thus, ensure that shadow_table exists.
+	 */
+	if (unlikely(!shadow_table))
+		return -EINVAL;
+
+	/* Check that a shadow hasn't been created before for this host KVM. */
+	if (unlikely(exists_shadow(kvm)))
+		return -EEXIST;
+
+	/* Find the next free entry in the shadow table. */
+	while (shadow_table[next_shadow_alloc])
+		next_shadow_alloc = (next_shadow_alloc + 1) % KVM_MAX_PVMS;
+	shadow_handle = index_to_shadow_handle(next_shadow_alloc);
+
+	vm->shadow_handle = shadow_handle;
+	vm->shadow_area_size = shadow_size;
+
+	shadow_table[next_shadow_alloc] = vm;
+	next_shadow_alloc = (next_shadow_alloc + 1) % KVM_MAX_PVMS;
+	num_shadow_entries++;
+
+	return shadow_handle;
+}
+
+static int insert_shadow_table(struct kvm *kvm, struct kvm_shadow_vm *vm,
+			       size_t shadow_size)
+{
+	int ret;
+
+	hyp_spin_lock(&shadow_lock);
+	ret = __insert_shadow_table(kvm, vm, shadow_size);
+	hyp_spin_unlock(&shadow_lock);
+
+	return ret;
+}
+
+/*
+ * Deallocate and remove the shadow table entry corresponding to the handle.
+ */
+static void __remove_shadow_table(int shadow_handle)
+{
+	shadow_table[shadow_handle_to_index(shadow_handle)] = NULL;
+	num_shadow_entries--;
+}
+
+static void remove_shadow_table(int shadow_handle)
+{
+	hyp_spin_lock(&shadow_lock);
+	__remove_shadow_table(shadow_handle);
+	hyp_spin_unlock(&shadow_lock);
+}
+
+static size_t pkvm_get_shadow_size(int num_vcpus)
+{
+	/* Shadow space for the vm struct and all of its vcpu states. */
+	return sizeof(struct kvm_shadow_vm) +
+	       sizeof(struct shadow_vcpu_state) * num_vcpus;
+}
+
+/*
+ * Check whether the size of the area donated by the host is sufficient for
+ * the shadow structures required for nr_vcpus as well as the shadow vm.
+ */
+static int check_shadow_size(int nr_vcpus, size_t shadow_size)
+{
+	if (nr_vcpus < 1 || nr_vcpus > KVM_MAX_VCPUS)
+		return -EINVAL;
+
+	/*
+	 * Shadow size is rounded up when allocated and donated by the host,
+	 * so it's likely to be larger than the sum of the struct sizes.
+	 */
+	if (shadow_size < pkvm_get_shadow_size(nr_vcpus))
+		return -EINVAL;
+
+	return 0;
+}
+
+/*
+ * Initialize the shadow copy of the protected VM state using the memory
+ * donated by the host, which must be page-aligned in both address and size.
+ *
+ * Unmaps the donated memory from the host at stage 2.
+ *
+ * Return a unique handle to the protected VM on success,
+ * negative error code on failure.
+ */
+int __pkvm_init_shadow(struct kvm *kvm, void *shadow_va, size_t shadow_size)
+{
+	struct kvm_shadow_vm *vm = kern_hyp_va(shadow_va);
+	phys_addr_t shadow_pa = hyp_virt_to_phys(vm);
+	u64 pfn = hyp_phys_to_pfn(shadow_pa);
+	u64 nr_pages = shadow_size >> PAGE_SHIFT;
+	int nr_vcpus;
+	int ret;
+
+	/* Only whole pages are donated, yet shadow_size bytes are written. */
+	if (!PAGE_ALIGNED(shadow_va) || !PAGE_ALIGNED(shadow_size))
+		return -EINVAL;
+
+	kvm = kern_hyp_va(kvm);
+
+	/* Ensure the host has donated enough memory for the shadow structs. */
+	nr_vcpus = kvm->created_vcpus;
+	ret = check_shadow_size(nr_vcpus, shadow_size);
+	if (ret)
+		return ret;
+
+	ret = __pkvm_host_donate_hyp(pfn, nr_pages);
+	if (ret)
+		return ret;
+
+	/* Ensure we're working with a clean slate. */
+	memset(vm, 0, shadow_size);
+
+	/* Add the entry to the shadow table. */
+	ret = insert_shadow_table(kvm, vm, shadow_size);
+	if (ret < 0)
+		goto err_clear_shadow;
+
+	init_shadow_structs(kvm, vm, nr_vcpus);
+
+	return vm->shadow_handle;
+
+err_clear_shadow:
+	/* Clear the donated shadow memory on failure to avoid data leaks. */
+	memset(vm, 0, shadow_size);
+	WARN_ON(__pkvm_hyp_donate_host(pfn, nr_pages));
+	return ret;
+}
+
+int __pkvm_teardown_shadow(struct kvm *kvm)
+{
+	struct kvm_shadow_vm *vm;
+	size_t shadow_size;
+	int shadow_handle;
+	u64 pfn;
+	u64 nr_pages;
+
+	kvm = kern_hyp_va(kvm);
+
+	shadow_handle = kvm->arch.pkvm.shadow_handle;
+
+	/* Lookup then remove entry from the shadow table. */
+	vm = find_shadow_by_handle(shadow_handle);
+	if (!vm)
+		return -ENOENT;
+
+	shadow_size = vm->shadow_area_size;
+
+	remove_shadow_table(shadow_handle);
+
+	/* Clear the shadow memory since hyp is releasing it back to host. */
+	memset(vm, 0, shadow_size);
+
+	pfn = hyp_phys_to_pfn(__hyp_pa(vm));
+	nr_pages = shadow_size >> PAGE_SHIFT;
+	WARN_ON(__pkvm_hyp_donate_host(pfn, nr_pages));
+	return 0;
+}
--- a/arch/arm64/kvm/hyp/nvhe/setup.c
+++ b/arch/arm64/kvm/hyp/nvhe/setup.c
@@ -40,6 +40,11 @@ static int divide_memory_pool(void *virt, unsigned long size)
 	if (!vmemmap_base)
 		return -ENOMEM;

+	nr_pages = hyp_shadow_table_pages(sizeof(struct kvm_shadow_vm));
+	shadow_table = hyp_early_alloc_contig(nr_pages);
+	if (!shadow_table)
+		return -ENOMEM;
+
 	nr_pages = hyp_s1_pgtable_pages();
 	hyp_pgt_base = hyp_early_alloc_contig(nr_pages);
 	if (!hyp_pgt_base)
--- a/arch/arm64/kvm/pkvm.c
+++ b/arch/arm64/kvm/pkvm.c
@@ -6,6 +6,8 @@

 #include <linux/kvm_host.h>
 #include <linux/memblock.h>
+#include <linux/mm.h>
+#include <linux/mutex.h>
 #include <linux/sort.h>

 #include <asm/kvm_pkvm.h>
@@ -71,6 +73,7 @@ void __init kvm_hyp_reserve(void)

 	hyp_mem_pages += hyp_s1_pgtable_pages();
 	hyp_mem_pages += host_s2_pgtable_pages();
+	hyp_mem_pages += hyp_shadow_table_pages(KVM_SHADOW_VM_SIZE);
 	hyp_mem_pages += hyp_vmemmap_pages(STRUCT_HYP_PAGE_SIZE);

 	/*
@@ -96,3 +99,80 @@ void __init kvm_hyp_reserve(void)
 	kvm_info("Reserved %lld MiB at 0x%llx\n", hyp_mem_size >> 20,
 		 hyp_mem_base);
 }
+
+/*
+ * Updates the state of the host's version of the vcpu state.
+ */
+static void update_vcpu_state(struct kvm_vcpu *vcpu, int shadow_handle)
+{
+	vcpu->arch.pkvm.shadow_handle = shadow_handle;
+}
+
+/*
+ * Allocates and donates memory for EL2 shadow structs.
+ *
+ * Allocates space for the shadow state, which includes the shadow vm as well as
+ * the shadow vcpu states.
+ *
+ * Stores an opaque handle in the kvm struct for future reference.
+ *
+ * Return 0 on success, negative error code on failure.
+ */
+static int create_el2_shadow(struct kvm *kvm)
+{
+	int shadow_handle;
+	void *shadow_addr;
+	size_t shadow_sz;
+	int ret, i;
+
+	if (kvm->arch.pkvm.shadow_handle)
+		return -EEXIST;
+
+	if (kvm->created_vcpus < 1)
+		return -EINVAL;
+
+	/* Allocate memory to donate to hyp for the kvm and vcpu state. */
+	shadow_sz = PAGE_ALIGN(KVM_SHADOW_VM_SIZE +
+			       SHADOW_VCPU_STATE_SIZE * kvm->created_vcpus);
+	shadow_addr = alloc_pages_exact(shadow_sz, GFP_KERNEL_ACCOUNT);
+	if (!shadow_addr)
+		return -ENOMEM;
+
+	/* Donate the shadow memory to hyp and let hyp initialize it. */
+	ret = kvm_call_hyp_nvhe(__pkvm_init_shadow, kvm, shadow_addr, shadow_sz);
+	if (ret < 0)
+		goto err;
+
+	shadow_handle = ret;
+
+	/* Store the shadow handle given by hyp for future call reference. */
+	kvm->arch.pkvm.shadow_handle = shadow_handle;
+
+	/* Adjust host's vcpu state as it doesn't control it anymore. */
+	for (i = 0; i < kvm->created_vcpus; i++)
+		update_vcpu_state(kvm->vcpus[i], shadow_handle);
+
+	return 0;
+
+err:
+	free_pages_exact(shadow_addr, shadow_sz);
+	return ret;
+}
+
+int pkvm_init_el2_context(struct kvm *kvm)
+{
+	int ret = 0;
+
+	mutex_lock(&kvm->lock);
+	ret = create_el2_shadow(kvm);
+	mutex_unlock(&kvm->lock);
+
+	if (ret < 0) {
+		kvm_err("Creating shadow structures for protected VM failed: %d\n",
+			ret);
+		return ret;
+	}
+
+	kvm_pr_unimpl("Stage-2 protection is a work-in-progress: civilization phase III\n");
+	return 0;
+}