Merge 58e1100fdc ("MAINTAINERS: co-maintain random.c") into android-mainline

Steps on the way to 5.16-rc4

Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
Change-Id: I57f8e77758b2ebb23b9ec8fb4e7053287282c1af
This commit is contained in:
Greg Kroah-Hartman
2021-12-03 11:07:08 +01:00
33 changed files with 711 additions and 414 deletions

View File

@@ -1,7 +1,7 @@
.. SPDX-License-Identifier: GPL-2.0
=================================
NETWORK FILESYSTEM HELPER LIBRARY
Network Filesystem Helper Library
=================================
.. Contents:
@@ -37,22 +37,22 @@ into a common call framework.
The following services are provided:
* Handles transparent huge pages (THPs).
* Handle folios that span multiple pages.
* Insulates the netfs from VM interface changes.
* Insulate the netfs from VM interface changes.
* Allows the netfs to arbitrarily split reads up into pieces, even ones that
don't match page sizes or page alignments and that may cross pages.
* Allow the netfs to arbitrarily split reads up into pieces, even ones that
don't match folio sizes or folio alignments and that may cross folios.
* Allows the netfs to expand a readahead request in both directions to meet
its needs.
* Allow the netfs to expand a readahead request in both directions to meet its
needs.
* Allows the netfs to partially fulfil a read, which will then be resubmitted.
* Allow the netfs to partially fulfil a read, which will then be resubmitted.
* Handles local caching, allowing cached data and server-read data to be
* Handle local caching, allowing cached data and server-read data to be
interleaved for a single request.
* Handles clearing of bufferage that aren't on the server.
* Handle clearing of bufferage that aren't on the server.
* Handle retrying of reads that failed, switching reads from the cache to the
server as necessary.
@@ -70,22 +70,22 @@ Read Helper Functions
Three read helpers are provided::
* void netfs_readahead(struct readahead_control *ractl,
const struct netfs_read_request_ops *ops,
void *netfs_priv);``
* int netfs_readpage(struct file *file,
struct page *page,
const struct netfs_read_request_ops *ops,
void *netfs_priv);
* int netfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos,
unsigned int len,
unsigned int flags,
struct page **_page,
void **_fsdata,
const struct netfs_read_request_ops *ops,
void *netfs_priv);
void netfs_readahead(struct readahead_control *ractl,
const struct netfs_read_request_ops *ops,
void *netfs_priv);
int netfs_readpage(struct file *file,
struct folio *folio,
const struct netfs_read_request_ops *ops,
void *netfs_priv);
int netfs_write_begin(struct file *file,
struct address_space *mapping,
loff_t pos,
unsigned int len,
unsigned int flags,
struct folio **_folio,
void **_fsdata,
const struct netfs_read_request_ops *ops,
void *netfs_priv);
Each corresponds to a VM operation, with the addition of a couple of parameters
for the use of the read helpers:
@@ -103,8 +103,8 @@ Both of these values will be stored into the read request structure.
For ->readahead() and ->readpage(), the network filesystem should just jump
into the corresponding read helper; whereas for ->write_begin(), it may be a
little more complicated as the network filesystem might want to flush
conflicting writes or track dirty data and needs to put the acquired page if an
error occurs after calling the helper.
conflicting writes or track dirty data and needs to put the acquired folio if
an error occurs after calling the helper.
The helpers manage the read request, calling back into the network filesystem
through the suppplied table of operations. Waits will be performed as
@@ -253,7 +253,7 @@ through which it can issue requests and negotiate::
void (*issue_op)(struct netfs_read_subrequest *subreq);
bool (*is_still_valid)(struct netfs_read_request *rreq);
int (*check_write_begin)(struct file *file, loff_t pos, unsigned len,
struct page *page, void **_fsdata);
struct folio *folio, void **_fsdata);
void (*done)(struct netfs_read_request *rreq);
void (*cleanup)(struct address_space *mapping, void *netfs_priv);
};
@@ -313,13 +313,14 @@ The operations are as follows:
There is no return value; the netfs_subreq_terminated() function should be
called to indicate whether or not the operation succeeded and how much data
it transferred. The filesystem also should not deal with setting pages
it transferred. The filesystem also should not deal with setting folios
uptodate, unlocking them or dropping their refs - the helpers need to deal
with this as they have to coordinate with copying to the local cache.
Note that the helpers have the pages locked, but not pinned. It is possible
to use the ITER_XARRAY iov iterator to refer to the range of the inode that
is being operated upon without the need to allocate large bvec tables.
Note that the helpers have the folios locked, but not pinned. It is
possible to use the ITER_XARRAY iov iterator to refer to the range of the
inode that is being operated upon without the need to allocate large bvec
tables.
* ``is_still_valid()``
@@ -330,15 +331,15 @@ The operations are as follows:
* ``check_write_begin()``
[Optional] This is called from the netfs_write_begin() helper once it has
allocated/grabbed the page to be modified to allow the filesystem to flush
allocated/grabbed the folio to be modified to allow the filesystem to flush
conflicting state before allowing it to be modified.
It should return 0 if everything is now fine, -EAGAIN if the page should be
It should return 0 if everything is now fine, -EAGAIN if the folio should be
regrabbed and any other error code to abort the operation.
* ``done``
[Optional] This is called after the pages in the request have all been
[Optional] This is called after the folios in the request have all been
unlocked (and marked uptodate if applicable).
* ``cleanup``
@@ -390,7 +391,7 @@ The read helpers work by the following general procedure:
* If NETFS_SREQ_CLEAR_TAIL was set, a short read will be cleared to the
end of the slice instead of reissuing.
* Once the data is read, the pages that have been fully read/cleared:
* Once the data is read, the folios that have been fully read/cleared:
* Will be marked uptodate.
@@ -398,11 +399,11 @@ The read helpers work by the following general procedure:
* Unlocked
* Any pages that need writing to the cache will then have DIO writes issued.
* Any folios that need writing to the cache will then have DIO writes issued.
* Synchronous operations will wait for reading to be complete.
* Writes to the cache will proceed asynchronously and the pages will have the
* Writes to the cache will proceed asynchronously and the folios will have the
PG_fscache mark removed when that completes.
* The request structures will be cleaned up when everything has completed.
@@ -452,6 +453,9 @@ operation table looks like the following::
netfs_io_terminated_t term_func,
void *term_func_priv);
int (*prepare_write)(struct netfs_cache_resources *cres,
loff_t *_start, size_t *_len, loff_t i_size);
int (*write)(struct netfs_cache_resources *cres,
loff_t start_pos,
struct iov_iter *iter,
@@ -509,6 +513,14 @@ The methods defined in the table are:
indicating whether the termination is definitely happening in the caller's
context.
* ``prepare_write()``
[Required] Called to adjust a write to the cache and check that there is
sufficient space in the cache. The start and length values indicate the
size of the write that netfslib is proposing, and this can be adjusted by
the cache to respect DIO boundaries. The file size is passed for
information.
* ``write()``
[Required] Called to write to the cache. The start file offset is given
@@ -525,4 +537,9 @@ not the read request structure as they could be used in other situations where
there isn't a read request structure as well, such as writing dirty data to the
cache.
API Function Reference
======================
.. kernel-doc:: include/linux/netfs.h
.. kernel-doc:: fs/netfs/read_helper.c

View File

@@ -15986,6 +15986,7 @@ F: arch/mips/generic/board-ranchu.c
RANDOM NUMBER DRIVER
M: "Theodore Ts'o" <tytso@mit.edu>
M: Jason A. Donenfeld <Jason@zx2c4.com>
S: Maintained
F: drivers/char/random.c

View File

@@ -91,7 +91,7 @@
#define HCR_HOST_VHE_FLAGS (HCR_RW | HCR_TGE | HCR_E2H)
/* TCR_EL2 Registers bits */
#define TCR_EL2_RES1 ((1 << 31) | (1 << 23))
#define TCR_EL2_RES1 ((1U << 31) | (1 << 23))
#define TCR_EL2_TBI (1 << 20)
#define TCR_EL2_PS_SHIFT 16
#define TCR_EL2_PS_MASK (7 << TCR_EL2_PS_SHIFT)
@@ -276,7 +276,7 @@
#define CPTR_EL2_TFP_SHIFT 10
/* Hyp Coprocessor Trap Register */
#define CPTR_EL2_TCPAC (1 << 31)
#define CPTR_EL2_TCPAC (1U << 31)
#define CPTR_EL2_TAM (1 << 30)
#define CPTR_EL2_TTA (1 << 20)
#define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT)

View File

@@ -403,6 +403,8 @@ typedef bool (*exit_handler_fn)(struct kvm_vcpu *, u64 *);
static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu);
static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code);
/*
* Allow the hypervisor to handle the exit with an exit handler if it has one.
*
@@ -429,6 +431,18 @@ static inline bool kvm_hyp_handle_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
*/
static inline bool fixup_guest_exit(struct kvm_vcpu *vcpu, u64 *exit_code)
{
/*
* Save PSTATE early so that we can evaluate the vcpu mode
* early on.
*/
vcpu->arch.ctxt.regs.pstate = read_sysreg_el2(SYS_SPSR);
/*
* Check whether we want to repaint the state one way or
* another.
*/
early_exit_filter(vcpu, exit_code);
if (ARM_EXCEPTION_CODE(*exit_code) != ARM_EXCEPTION_IRQ)
vcpu->arch.fault.esr_el2 = read_sysreg_el2(SYS_ESR);

View File

@@ -70,7 +70,12 @@ static inline void __sysreg_save_el1_state(struct kvm_cpu_context *ctxt)
static inline void __sysreg_save_el2_return_state(struct kvm_cpu_context *ctxt)
{
ctxt->regs.pc = read_sysreg_el2(SYS_ELR);
ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
/*
* Guest PSTATE gets saved at guest fixup time in all
* cases. We still need to handle the nVHE host side here.
*/
if (!has_vhe() && ctxt->__hyp_running_vcpu)
ctxt->regs.pstate = read_sysreg_el2(SYS_SPSR);
if (cpus_have_final_cap(ARM64_HAS_RAS_EXTN))
ctxt_sys_reg(ctxt, DISR_EL1) = read_sysreg_s(SYS_VDISR_EL2);

View File

@@ -233,7 +233,7 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
* Returns false if the guest ran in AArch32 when it shouldn't have, and
* thus should exit to the host, or true if a the guest run loop can continue.
*/
static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code)
static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
{
struct kvm *kvm = kern_hyp_va(vcpu->kvm);
@@ -248,10 +248,7 @@ static bool handle_aarch32_guest(struct kvm_vcpu *vcpu, u64 *exit_code)
vcpu->arch.target = -1;
*exit_code &= BIT(ARM_EXIT_WITH_SERROR_BIT);
*exit_code |= ARM_EXCEPTION_IL;
return false;
}
return true;
}
/* Switch to the guest for legacy non-VHE systems */
@@ -316,9 +313,6 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu)
/* Jump in the fire! */
exit_code = __guest_enter(vcpu);
if (unlikely(!handle_aarch32_guest(vcpu, &exit_code)))
break;
/* And we're baaack! */
} while (fixup_guest_exit(vcpu, &exit_code));

View File

@@ -112,6 +112,10 @@ static const exit_handler_fn *kvm_get_exit_handler_array(struct kvm_vcpu *vcpu)
return hyp_exit_handlers;
}
static void early_exit_filter(struct kvm_vcpu *vcpu, u64 *exit_code)
{
}
/* Switch to the guest for VHE systems running in EL2 */
static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu)
{

View File

@@ -12,14 +12,12 @@
#include <linux/types.h>
#include <linux/kvm.h>
#include <linux/kvm_types.h>
#include <asm/csr.h>
#include <asm/kvm_vcpu_fp.h>
#include <asm/kvm_vcpu_timer.h>
#ifdef CONFIG_64BIT
#define KVM_MAX_VCPUS (1U << 16)
#else
#define KVM_MAX_VCPUS (1U << 9)
#endif
#define KVM_MAX_VCPUS \
((HGATP_VMID_MASK >> HGATP_VMID_SHIFT) + 1)
#define KVM_HALT_POLL_NS_DEFAULT 500000

View File

@@ -453,6 +453,12 @@ void kvm_arch_flush_shadow_all(struct kvm *kvm)
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
struct kvm_memory_slot *slot)
{
gpa_t gpa = slot->base_gfn << PAGE_SHIFT;
phys_addr_t size = slot->npages << PAGE_SHIFT;
spin_lock(&kvm->mmu_lock);
stage2_unmap_range(kvm, gpa, size, false);
spin_unlock(&kvm->mmu_lock);
}
void kvm_arch_commit_memory_region(struct kvm *kvm,

View File

@@ -81,7 +81,6 @@ struct kvm_ioapic {
unsigned long irq_states[IOAPIC_NUM_PINS];
struct kvm_io_device dev;
struct kvm *kvm;
void (*ack_notifier)(void *opaque, int irq);
spinlock_t lock;
struct rtc_status rtc_status;
struct delayed_work eoi_inject;

View File

@@ -56,7 +56,6 @@ struct kvm_pic {
struct kvm_io_device dev_master;
struct kvm_io_device dev_slave;
struct kvm_io_device dev_elcr;
void (*ack_notifier)(void *opaque, int irq);
unsigned long irq_states[PIC_NUM_PINS];
};

View File

@@ -707,7 +707,7 @@ static void pv_eoi_clr_pending(struct kvm_vcpu *vcpu)
static int apic_has_interrupt_for_ppr(struct kvm_lapic *apic, u32 ppr)
{
int highest_irr;
if (apic->vcpu->arch.apicv_active)
if (kvm_x86_ops.sync_pir_to_irr)
highest_irr = static_call(kvm_x86_sync_pir_to_irr)(apic->vcpu);
else
highest_irr = apic_find_highest_irr(apic);

View File

@@ -1582,7 +1582,7 @@ bool kvm_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range)
flush = kvm_handle_gfn_range(kvm, range, kvm_unmap_rmapp);
if (is_tdp_mmu_enabled(kvm))
flush |= kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
return flush;
}
@@ -2173,10 +2173,10 @@ static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterato
iterator->shadow_addr = root;
iterator->level = vcpu->arch.mmu->shadow_root_level;
if (iterator->level == PT64_ROOT_4LEVEL &&
if (iterator->level >= PT64_ROOT_4LEVEL &&
vcpu->arch.mmu->root_level < PT64_ROOT_4LEVEL &&
!vcpu->arch.mmu->direct_map)
--iterator->level;
iterator->level = PT32E_ROOT_LEVEL;
if (iterator->level == PT32E_ROOT_LEVEL) {
/*
@@ -4855,7 +4855,7 @@ void kvm_init_shadow_npt_mmu(struct kvm_vcpu *vcpu, unsigned long cr0,
struct kvm_mmu *context = &vcpu->arch.guest_mmu;
struct kvm_mmu_role_regs regs = {
.cr0 = cr0,
.cr4 = cr4,
.cr4 = cr4 & ~X86_CR4_PKE,
.efer = efer,
};
union kvm_mmu_role new_role;
@@ -4919,7 +4919,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
context->direct_map = false;
update_permission_bitmask(context, true);
update_pkru_bitmask(context);
context->pkru_mask = 0;
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
}
@@ -5025,6 +5025,14 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
/*
* Invalidate all MMU roles to force them to reinitialize as CPUID
* information is factored into reserved bit calculations.
*
* Correctly handling multiple vCPU models with respect to paging and
* physical address properties) in a single VM would require tracking
* all relevant CPUID information in kvm_mmu_page_role. That is very
* undesirable as it would increase the memory requirements for
* gfn_track (see struct kvm_mmu_page_role comments). For now that
* problem is swept under the rug; KVM's CPUID API is horrific and
* it's all but impossible to solve it without introducing a new API.
*/
vcpu->arch.root_mmu.mmu_role.ext.valid = 0;
vcpu->arch.guest_mmu.mmu_role.ext.valid = 0;
@@ -5032,24 +5040,10 @@ void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
kvm_mmu_reset_context(vcpu);
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. Alert userspace, but otherwise
* sweep the problem under the rug.
*
* KVM's horrific CPUID ABI makes the problem all but impossible to
* solve, as correctly handling multiple vCPU models (with respect to
* paging and physical address properties) in a single VM would require
* tracking all relevant CPUID information in kvm_mmu_page_role. That
* is very undesirable as it would double the memory requirements for
* gfn_track (see struct kvm_mmu_page_role comments), and in practice
* no sane VMM mucks with the core vCPU model on the fly.
* Changing guest CPUID after KVM_RUN is forbidden, see the comment in
* kvm_arch_vcpu_ioctl().
*/
if (vcpu->arch.last_vmentry_cpu != -1) {
pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} after KVM_RUN may cause guest instability\n");
pr_warn_ratelimited("KVM: KVM_SET_CPUID{,2} will fail after KVM_RUN starting with Linux 5.16\n");
}
KVM_BUG_ON(vcpu->arch.last_vmentry_cpu != -1, vcpu->kvm);
}
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
@@ -5369,7 +5363,7 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
{
kvm_mmu_invalidate_gva(vcpu, vcpu->arch.mmu, gva, INVALID_PAGE);
kvm_mmu_invalidate_gva(vcpu, vcpu->arch.walk_mmu, gva, INVALID_PAGE);
++vcpu->stat.invlpg;
}
EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
@@ -5854,8 +5848,6 @@ restart:
void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
bool flush = false;
if (kvm_memslots_have_rmaps(kvm)) {
write_lock(&kvm->mmu_lock);
/*
@@ -5863,17 +5855,14 @@ void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
* logging at a 4k granularity and never creates collapsible
* 2m SPTEs during dirty logging.
*/
flush = slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true);
if (flush)
if (slot_handle_level_4k(kvm, slot, kvm_mmu_zap_collapsible_spte, true))
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
write_unlock(&kvm->mmu_lock);
}
if (is_tdp_mmu_enabled(kvm)) {
read_lock(&kvm->mmu_lock);
flush = kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot, flush);
if (flush)
kvm_arch_flush_remote_tlbs_memslot(kvm, slot);
kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
read_unlock(&kvm->mmu_lock);
}
}
@@ -6182,23 +6171,46 @@ void kvm_mmu_module_exit(void)
mmu_audit_disable();
}
/*
* Calculate the effective recovery period, accounting for '0' meaning "let KVM
* select a halving time of 1 hour". Returns true if recovery is enabled.
*/
static bool calc_nx_huge_pages_recovery_period(uint *period)
{
/*
* Use READ_ONCE to get the params, this may be called outside of the
* param setters, e.g. by the kthread to compute its next timeout.
*/
bool enabled = READ_ONCE(nx_huge_pages);
uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
if (!enabled || !ratio)
return false;
*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
if (!*period) {
/* Make sure the period is not less than one second. */
ratio = min(ratio, 3600u);
*period = 60 * 60 * 1000 / ratio;
}
return true;
}
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp)
{
bool was_recovery_enabled, is_recovery_enabled;
uint old_period, new_period;
int err;
was_recovery_enabled = nx_huge_pages_recovery_ratio;
old_period = nx_huge_pages_recovery_period_ms;
was_recovery_enabled = calc_nx_huge_pages_recovery_period(&old_period);
err = param_set_uint(val, kp);
if (err)
return err;
is_recovery_enabled = nx_huge_pages_recovery_ratio;
new_period = nx_huge_pages_recovery_period_ms;
is_recovery_enabled = calc_nx_huge_pages_recovery_period(&new_period);
if (READ_ONCE(nx_huge_pages) && is_recovery_enabled &&
if (is_recovery_enabled &&
(!was_recovery_enabled || old_period > new_period)) {
struct kvm *kvm;
@@ -6262,18 +6274,13 @@ static void kvm_recover_nx_lpages(struct kvm *kvm)
static long get_nx_lpage_recovery_timeout(u64 start_time)
{
uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
uint period = READ_ONCE(nx_huge_pages_recovery_period_ms);
bool enabled;
uint period;
if (!period && ratio) {
/* Make sure the period is not less than one second. */
ratio = min(ratio, 3600u);
period = 60 * 60 * 1000 / ratio;
}
enabled = calc_nx_huge_pages_recovery_period(&period);
return READ_ONCE(nx_huge_pages) && ratio
? start_time + msecs_to_jiffies(period) - get_jiffies_64()
: MAX_SCHEDULE_TIMEOUT;
return enabled ? start_time + msecs_to_jiffies(period) - get_jiffies_64()
: MAX_SCHEDULE_TIMEOUT;
}
static int kvm_nx_lpage_recovery_worker(struct kvm *kvm, uintptr_t data)

View File

@@ -317,9 +317,6 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(pt));
int level = sp->role.level;
gfn_t base_gfn = sp->gfn;
u64 old_child_spte;
u64 *sptep;
gfn_t gfn;
int i;
trace_kvm_mmu_prepare_zap_page(sp);
@@ -327,8 +324,9 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
tdp_mmu_unlink_page(kvm, sp, shared);
for (i = 0; i < PT64_ENT_PER_PAGE; i++) {
sptep = rcu_dereference(pt) + i;
gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
u64 *sptep = rcu_dereference(pt) + i;
gfn_t gfn = base_gfn + i * KVM_PAGES_PER_HPAGE(level);
u64 old_child_spte;
if (shared) {
/*
@@ -374,7 +372,7 @@ static void handle_removed_tdp_mmu_page(struct kvm *kvm, tdp_ptep_t pt,
shared);
}
kvm_flush_remote_tlbs_with_address(kvm, gfn,
kvm_flush_remote_tlbs_with_address(kvm, base_gfn,
KVM_PAGES_PER_HPAGE(level + 1));
call_rcu(&sp->rcu_head, tdp_mmu_free_sp_rcu_callback);
@@ -1033,9 +1031,9 @@ bool kvm_tdp_mmu_unmap_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range,
{
struct kvm_mmu_page *root;
for_each_tdp_mmu_root(kvm, root, range->slot->as_id)
flush |= zap_gfn_range(kvm, root, range->start, range->end,
range->may_block, flush, false);
for_each_tdp_mmu_root_yield_safe(kvm, root, range->slot->as_id, false)
flush = zap_gfn_range(kvm, root, range->start, range->end,
range->may_block, flush, false);
return flush;
}
@@ -1364,10 +1362,9 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
* Clear leaf entries which could be replaced by large mappings, for
* GFNs within the slot.
*/
static bool zap_collapsible_spte_range(struct kvm *kvm,
static void zap_collapsible_spte_range(struct kvm *kvm,
struct kvm_mmu_page *root,
const struct kvm_memory_slot *slot,
bool flush)
const struct kvm_memory_slot *slot)
{
gfn_t start = slot->base_gfn;
gfn_t end = start + slot->npages;
@@ -1378,10 +1375,8 @@ static bool zap_collapsible_spte_range(struct kvm *kvm,
tdp_root_for_each_pte(iter, root, start, end) {
retry:
if (tdp_mmu_iter_cond_resched(kvm, &iter, flush, true)) {
flush = false;
if (tdp_mmu_iter_cond_resched(kvm, &iter, false, true))
continue;
}
if (!is_shadow_present_pte(iter.old_spte) ||
!is_last_spte(iter.old_spte, iter.level))
@@ -1393,6 +1388,7 @@ retry:
pfn, PG_LEVEL_NUM))
continue;
/* Note, a successful atomic zap also does a remote TLB flush. */
if (!tdp_mmu_zap_spte_atomic(kvm, &iter)) {
/*
* The iter must explicitly re-read the SPTE because
@@ -1401,30 +1397,24 @@ retry:
iter.old_spte = READ_ONCE(*rcu_dereference(iter.sptep));
goto retry;
}
flush = true;
}
rcu_read_unlock();
return flush;
}
/*
* Clear non-leaf entries (and free associated page tables) which could
* be replaced by large mappings, for GFNs within the slot.
*/
bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot,
bool flush)
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot)
{
struct kvm_mmu_page *root;
lockdep_assert_held_read(&kvm->mmu_lock);
for_each_tdp_mmu_root_yield_safe(kvm, root, slot->as_id, true)
flush = zap_collapsible_spte_range(kvm, root, slot, flush);
return flush;
zap_collapsible_spte_range(kvm, root, slot);
}
/*

View File

@@ -64,9 +64,8 @@ void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm,
struct kvm_memory_slot *slot,
gfn_t gfn, unsigned long mask,
bool wrprot);
bool kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot,
bool flush);
void kvm_tdp_mmu_zap_collapsible_sptes(struct kvm *kvm,
const struct kvm_memory_slot *slot);
bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm,
struct kvm_memory_slot *slot, gfn_t gfn,

View File

@@ -989,16 +989,18 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu)
static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run)
{
struct vcpu_svm *svm = to_svm(vcpu);
int cpu = get_cpu();
WARN_ON(cpu != vcpu->cpu);
svm->avic_is_running = is_run;
if (!kvm_vcpu_apicv_active(vcpu))
return;
if (is_run)
avic_vcpu_load(vcpu, vcpu->cpu);
else
avic_vcpu_put(vcpu);
if (kvm_vcpu_apicv_active(vcpu)) {
if (is_run)
avic_vcpu_load(vcpu, cpu);
else
avic_vcpu_put(vcpu);
}
put_cpu();
}
void svm_vcpu_blocking(struct kvm_vcpu *vcpu)

View File

@@ -1543,28 +1543,50 @@ static bool is_cmd_allowed_from_mirror(u32 cmd_id)
return false;
}
static int sev_lock_for_migration(struct kvm *kvm)
static int sev_lock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
int r = -EBUSY;
if (dst_kvm == src_kvm)
return -EINVAL;
/*
* Bail if this VM is already involved in a migration to avoid deadlock
* between two VMs trying to migrate to/from each other.
* Bail if these VMs are already involved in a migration to avoid
* deadlock between two VMs trying to migrate to/from each other.
*/
if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1))
if (atomic_cmpxchg_acquire(&dst_sev->migration_in_progress, 0, 1))
return -EBUSY;
mutex_lock(&kvm->lock);
if (atomic_cmpxchg_acquire(&src_sev->migration_in_progress, 0, 1))
goto release_dst;
r = -EINTR;
if (mutex_lock_killable(&dst_kvm->lock))
goto release_src;
if (mutex_lock_killable(&src_kvm->lock))
goto unlock_dst;
return 0;
unlock_dst:
mutex_unlock(&dst_kvm->lock);
release_src:
atomic_set_release(&src_sev->migration_in_progress, 0);
release_dst:
atomic_set_release(&dst_sev->migration_in_progress, 0);
return r;
}
static void sev_unlock_after_migration(struct kvm *kvm)
static void sev_unlock_two_vms(struct kvm *dst_kvm, struct kvm *src_kvm)
{
struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info;
struct kvm_sev_info *dst_sev = &to_kvm_svm(dst_kvm)->sev_info;
struct kvm_sev_info *src_sev = &to_kvm_svm(src_kvm)->sev_info;
mutex_unlock(&kvm->lock);
atomic_set_release(&sev->migration_in_progress, 0);
mutex_unlock(&dst_kvm->lock);
mutex_unlock(&src_kvm->lock);
atomic_set_release(&dst_sev->migration_in_progress, 0);
atomic_set_release(&src_sev->migration_in_progress, 0);
}
@@ -1607,14 +1629,15 @@ static void sev_migrate_from(struct kvm_sev_info *dst,
dst->asid = src->asid;
dst->handle = src->handle;
dst->pages_locked = src->pages_locked;
dst->enc_context_owner = src->enc_context_owner;
src->asid = 0;
src->active = false;
src->handle = 0;
src->pages_locked = 0;
src->enc_context_owner = NULL;
INIT_LIST_HEAD(&dst->regions_list);
list_replace_init(&src->regions_list, &dst->regions_list);
list_cut_before(&dst->regions_list, &src->regions_list, &src->regions_list);
}
static int sev_es_migrate_from(struct kvm *dst, struct kvm *src)
@@ -1666,15 +1689,6 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
bool charged = false;
int ret;
ret = sev_lock_for_migration(kvm);
if (ret)
return ret;
if (sev_guest(kvm)) {
ret = -EINVAL;
goto out_unlock;
}
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
@@ -1682,16 +1696,26 @@ int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd)
}
source_kvm = source_kvm_file->private_data;
ret = sev_lock_for_migration(source_kvm);
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto out_fput;
if (!sev_guest(source_kvm)) {
if (sev_guest(kvm) || !sev_guest(source_kvm)) {
ret = -EINVAL;
goto out_source;
goto out_unlock;
}
src_sev = &to_kvm_svm(source_kvm)->sev_info;
/*
* VMs mirroring src's encryption context rely on it to keep the
* ASID allocated, but below we are clearing src_sev->asid.
*/
if (src_sev->num_mirrored_vms) {
ret = -EBUSY;
goto out_unlock;
}
dst_sev->misc_cg = get_current_misc_cg();
cg_cleanup_sev = dst_sev;
if (dst_sev->misc_cg != src_sev->misc_cg) {
@@ -1728,13 +1752,11 @@ out_dst_cgroup:
sev_misc_cg_uncharge(cg_cleanup_sev);
put_misc_cg(cg_cleanup_sev->misc_cg);
cg_cleanup_sev->misc_cg = NULL;
out_source:
sev_unlock_after_migration(source_kvm);
out_unlock:
sev_unlock_two_vms(kvm, source_kvm);
out_fput:
if (source_kvm_file)
fput(source_kvm_file);
out_unlock:
sev_unlock_after_migration(kvm);
return ret;
}
@@ -1953,76 +1975,60 @@ int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd)
{
struct file *source_kvm_file;
struct kvm *source_kvm;
struct kvm_sev_info source_sev, *mirror_sev;
struct kvm_sev_info *source_sev, *mirror_sev;
int ret;
source_kvm_file = fget(source_fd);
if (!file_is_kvm(source_kvm_file)) {
ret = -EBADF;
goto e_source_put;
goto e_source_fput;
}
source_kvm = source_kvm_file->private_data;
mutex_lock(&source_kvm->lock);
ret = sev_lock_two_vms(kvm, source_kvm);
if (ret)
goto e_source_fput;
if (!sev_guest(source_kvm)) {
/*
* Mirrors of mirrors should work, but let's not get silly. Also
* disallow out-of-band SEV/SEV-ES init if the target is already an
* SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
* created after SEV/SEV-ES initialization, e.g. to init intercepts.
*/
if (sev_guest(kvm) || !sev_guest(source_kvm) ||
is_mirroring_enc_context(source_kvm) || kvm->created_vcpus) {
ret = -EINVAL;
goto e_source_unlock;
goto e_unlock;
}
/* Mirrors of mirrors should work, but let's not get silly */
if (is_mirroring_enc_context(source_kvm) || source_kvm == kvm) {
ret = -EINVAL;
goto e_source_unlock;
}
memcpy(&source_sev, &to_kvm_svm(source_kvm)->sev_info,
sizeof(source_sev));
/*
* The mirror kvm holds an enc_context_owner ref so its asid can't
* disappear until we're done with it
*/
source_sev = &to_kvm_svm(source_kvm)->sev_info;
kvm_get_kvm(source_kvm);
fput(source_kvm_file);
mutex_unlock(&source_kvm->lock);
mutex_lock(&kvm->lock);
/*
* Disallow out-of-band SEV/SEV-ES init if the target is already an
* SEV guest, or if vCPUs have been created. KVM relies on vCPUs being
* created after SEV/SEV-ES initialization, e.g. to init intercepts.
*/
if (sev_guest(kvm) || kvm->created_vcpus) {
ret = -EINVAL;
goto e_mirror_unlock;
}
source_sev->num_mirrored_vms++;
/* Set enc_context_owner and copy its encryption context over */
mirror_sev = &to_kvm_svm(kvm)->sev_info;
mirror_sev->enc_context_owner = source_kvm;
mirror_sev->active = true;
mirror_sev->asid = source_sev.asid;
mirror_sev->fd = source_sev.fd;
mirror_sev->es_active = source_sev.es_active;
mirror_sev->handle = source_sev.handle;
mirror_sev->asid = source_sev->asid;
mirror_sev->fd = source_sev->fd;
mirror_sev->es_active = source_sev->es_active;
mirror_sev->handle = source_sev->handle;
INIT_LIST_HEAD(&mirror_sev->regions_list);
ret = 0;
/*
* Do not copy ap_jump_table. Since the mirror does not share the same
* KVM contexts as the original, and they may have different
* memory-views.
*/
mutex_unlock(&kvm->lock);
return 0;
e_mirror_unlock:
mutex_unlock(&kvm->lock);
kvm_put_kvm(source_kvm);
return ret;
e_source_unlock:
mutex_unlock(&source_kvm->lock);
e_source_put:
e_unlock:
sev_unlock_two_vms(kvm, source_kvm);
e_source_fput:
if (source_kvm_file)
fput(source_kvm_file);
return ret;
@@ -2034,17 +2040,24 @@ void sev_vm_destroy(struct kvm *kvm)
struct list_head *head = &sev->regions_list;
struct list_head *pos, *q;
WARN_ON(sev->num_mirrored_vms);
if (!sev_guest(kvm))
return;
/* If this is a mirror_kvm release the enc_context_owner and skip sev cleanup */
if (is_mirroring_enc_context(kvm)) {
kvm_put_kvm(sev->enc_context_owner);
struct kvm *owner_kvm = sev->enc_context_owner;
struct kvm_sev_info *owner_sev = &to_kvm_svm(owner_kvm)->sev_info;
mutex_lock(&owner_kvm->lock);
if (!WARN_ON(!owner_sev->num_mirrored_vms))
owner_sev->num_mirrored_vms--;
mutex_unlock(&owner_kvm->lock);
kvm_put_kvm(owner_kvm);
return;
}
mutex_lock(&kvm->lock);
/*
* Ensure that all guest tagged cache entries are flushed before
* releasing the pages back to the system for use. CLFLUSH will
@@ -2064,8 +2077,6 @@ void sev_vm_destroy(struct kvm *kvm)
}
}
mutex_unlock(&kvm->lock);
sev_unbind_asid(kvm, sev->handle);
sev_asid_free(sev);
}

View File

@@ -4651,7 +4651,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = {
.load_eoi_exitmap = svm_load_eoi_exitmap,
.hwapic_irr_update = svm_hwapic_irr_update,
.hwapic_isr_update = svm_hwapic_isr_update,
.sync_pir_to_irr = kvm_lapic_find_highest_irr,
.apicv_post_state_restore = avic_post_state_restore,
.set_tss_addr = svm_set_tss_addr,

View File

@@ -79,6 +79,7 @@ struct kvm_sev_info {
struct list_head regions_list; /* List of registered regions */
u64 ap_jump_table; /* SEV-ES AP Jump Table address */
struct kvm *enc_context_owner; /* Owner of copied encryption context */
unsigned long num_mirrored_vms; /* Number of VMs sharing this ASID */
struct misc_cg *misc_cg; /* For misc cgroup accounting */
atomic_t migration_in_progress;
};

View File

@@ -1162,29 +1162,26 @@ static void nested_vmx_transition_tlb_flush(struct kvm_vcpu *vcpu,
WARN_ON(!enable_vpid);
/*
* If VPID is enabled and used by vmc12, but L2 does not have a unique
* TLB tag (ASID), i.e. EPT is disabled and KVM was unable to allocate
* a VPID for L2, flush the current context as the effective ASID is
* common to both L1 and L2.
*
* Defer the flush so that it runs after vmcs02.EPTP has been set by
* KVM_REQ_LOAD_MMU_PGD (if nested EPT is enabled) and to avoid
* redundant flushes further down the nested pipeline.
*
* If a TLB flush isn't required due to any of the above, and vpid12 is
* changing then the new "virtual" VPID (vpid12) will reuse the same
* "real" VPID (vpid02), and so needs to be flushed. There's no direct
* mapping between vpid02 and vpid12, vpid02 is per-vCPU and reused for
* all nested vCPUs. Remember, a flush on VM-Enter does not invalidate
* guest-physical mappings, so there is no need to sync the nEPT MMU.
* VPID is enabled and in use by vmcs12. If vpid12 is changing, then
* emulate a guest TLB flush as KVM does not track vpid12 history nor
* is the VPID incorporated into the MMU context. I.e. KVM must assume
* that the new vpid12 has never been used and thus represents a new
* guest ASID that cannot have entries in the TLB.
*/
if (!nested_has_guest_tlb_tag(vcpu)) {
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
} else if (is_vmenter &&
vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
if (is_vmenter && vmcs12->virtual_processor_id != vmx->nested.last_vpid) {
vmx->nested.last_vpid = vmcs12->virtual_processor_id;
vpid_sync_context(nested_get_vpid02(vcpu));
kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
return;
}
/*
* If VPID is enabled, used by vmc12, and vpid12 is not changing but
* does not have a unique TLB tag (ASID), i.e. EPT is disabled and
* KVM was unable to allocate a VPID for L2, flush the current context
* as the effective ASID is common to both L1 and L2.
*/
if (!nested_has_guest_tlb_tag(vcpu))
kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
static bool is_bitwise_subset(u64 superset, u64 subset, u64 mask)
@@ -3344,8 +3341,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu,
};
u32 failed_index;
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
kvm_service_local_tlb_flush_requests(vcpu);
evaluate_pending_interrupts = exec_controls_get(vmx) &
(CPU_BASED_INTR_WINDOW_EXITING | CPU_BASED_NMI_WINDOW_EXITING);
@@ -4502,9 +4498,8 @@ void nested_vmx_vmexit(struct kvm_vcpu *vcpu, u32 vm_exit_reason,
(void)nested_get_evmcs_page(vcpu);
}
/* Service the TLB flush request for L2 before switching to L1. */
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
/* Service pending TLB flush requests for L2 before switching to L1. */
kvm_service_local_tlb_flush_requests(vcpu);
/*
* VCPU_EXREG_PDPTR will be clobbered in arch/x86/kvm/vmx/vmx.h between
@@ -4857,6 +4852,7 @@ static int enter_vmx_operation(struct kvm_vcpu *vcpu)
if (!vmx->nested.cached_vmcs12)
goto out_cached_vmcs12;
vmx->nested.shadow_vmcs12_cache.gpa = INVALID_GPA;
vmx->nested.cached_shadow_vmcs12 = kzalloc(VMCS12_SIZE, GFP_KERNEL_ACCOUNT);
if (!vmx->nested.cached_shadow_vmcs12)
goto out_cached_shadow_vmcs12;
@@ -5289,8 +5285,7 @@ static int handle_vmptrld(struct kvm_vcpu *vcpu)
struct gfn_to_hva_cache *ghc = &vmx->nested.vmcs12_cache;
struct vmcs_hdr hdr;
if (ghc->gpa != vmptr &&
kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, vmptr, VMCS12_SIZE)) {
/*
* Reads from an unbacked page return all 1s,
* which means that the 32 bits located at the

View File

@@ -5,6 +5,7 @@
#include <asm/cpu.h>
#include "lapic.h"
#include "irq.h"
#include "posted_intr.h"
#include "trace.h"
#include "vmx.h"
@@ -77,13 +78,18 @@ after_clear_sn:
pi_set_on(pi_desc);
}
static bool vmx_can_use_vtd_pi(struct kvm *kvm)
{
return irqchip_in_kernel(kvm) && enable_apicv &&
kvm_arch_has_assigned_device(kvm) &&
irq_remapping_cap(IRQ_POSTING_CAP);
}
void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu)
{
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
!kvm_vcpu_apicv_active(vcpu))
if (!vmx_can_use_vtd_pi(vcpu->kvm))
return;
/* Set SN when the vCPU is preempted */
@@ -141,9 +147,7 @@ int pi_pre_block(struct kvm_vcpu *vcpu)
struct pi_desc old, new;
struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
if (!kvm_arch_has_assigned_device(vcpu->kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
!kvm_vcpu_apicv_active(vcpu))
if (!vmx_can_use_vtd_pi(vcpu->kvm))
return 0;
WARN_ON(irqs_disabled());
@@ -270,9 +274,7 @@ int pi_update_irte(struct kvm *kvm, unsigned int host_irq, uint32_t guest_irq,
struct vcpu_data vcpu_info;
int idx, ret = 0;
if (!kvm_arch_has_assigned_device(kvm) ||
!irq_remapping_cap(IRQ_POSTING_CAP) ||
!kvm_vcpu_apicv_active(kvm->vcpus[0]))
if (!vmx_can_use_vtd_pi(kvm))
return 0;
idx = srcu_read_lock(&kvm->irq_srcu);

View File

@@ -2918,6 +2918,13 @@ static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
}
}
static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
{
if (is_guest_mode(vcpu))
return nested_get_vpid02(vcpu);
return to_vmx(vcpu)->vpid;
}
static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
{
struct kvm_mmu *mmu = vcpu->arch.mmu;
@@ -2930,31 +2937,29 @@ static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
if (enable_ept)
ept_sync_context(construct_eptp(vcpu, root_hpa,
mmu->shadow_root_level));
else if (!is_guest_mode(vcpu))
vpid_sync_context(to_vmx(vcpu)->vpid);
else
vpid_sync_context(nested_get_vpid02(vcpu));
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
{
/*
* vpid_sync_vcpu_addr() is a nop if vmx->vpid==0, see the comment in
* vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
* vmx_flush_tlb_guest() for an explanation of why this is ok.
*/
vpid_sync_vcpu_addr(to_vmx(vcpu)->vpid, addr);
vpid_sync_vcpu_addr(vmx_get_current_vpid(vcpu), addr);
}
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
/*
* vpid_sync_context() is a nop if vmx->vpid==0, e.g. if enable_vpid==0
* or a vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit
* are required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
* vpid couldn't be allocated for this vCPU. VM-Enter and VM-Exit are
* required to flush GVA->{G,H}PA mappings from the TLB if vpid is
* disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
* i.e. no explicit INVVPID is necessary.
*/
vpid_sync_context(to_vmx(vcpu)->vpid);
vpid_sync_context(vmx_get_current_vpid(vcpu));
}
void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
@@ -6262,9 +6267,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
{
struct vcpu_vmx *vmx = to_vmx(vcpu);
int max_irr;
bool max_irr_updated;
bool got_posted_interrupt;
if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm))
if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
return -EIO;
if (pi_test_on(&vmx->pi_desc)) {
@@ -6274,22 +6279,33 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
* But on x86 this is just a compiler barrier anyway.
*/
smp_mb__after_atomic();
max_irr_updated =
got_posted_interrupt =
kvm_apic_update_irr(vcpu, vmx->pi_desc.pir, &max_irr);
/*
* If we are running L2 and L1 has a new pending interrupt
* which can be injected, this may cause a vmexit or it may
* be injected into L2. Either way, this interrupt will be
* processed via KVM_REQ_EVENT, not RVI, because we do not use
* virtual interrupt delivery to inject L1 interrupts into L2.
*/
if (is_guest_mode(vcpu) && max_irr_updated)
kvm_make_request(KVM_REQ_EVENT, vcpu);
} else {
max_irr = kvm_lapic_find_highest_irr(vcpu);
got_posted_interrupt = false;
}
vmx_hwapic_irr_update(vcpu, max_irr);
/*
* Newly recognized interrupts are injected via either virtual interrupt
* delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
* disabled in two cases:
*
* 1) If L2 is running and the vCPU has a new pending interrupt. If L1
* wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
* VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
* into L2, but KVM doesn't use virtual interrupt delivery to inject
* interrupts into L2, and so KVM_REQ_EVENT is again needed.
*
* 2) If APICv is disabled for this vCPU, assigned devices may still
* attempt to post interrupts. The posted interrupt vector will cause
* a VM-Exit and the subsequent entry will call sync_pir_to_irr.
*/
if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
vmx_set_rvi(max_irr);
else if (got_posted_interrupt)
kvm_make_request(KVM_REQ_EVENT, vcpu);
return max_irr;
}
@@ -7761,10 +7777,10 @@ static __init int hardware_setup(void)
ple_window_shrink = 0;
}
if (!cpu_has_vmx_apicv()) {
if (!cpu_has_vmx_apicv())
enable_apicv = 0;
if (!enable_apicv)
vmx_x86_ops.sync_pir_to_irr = NULL;
}
if (cpu_has_vmx_tsc_scaling()) {
kvm_has_tsc_control = true;

View File

@@ -3258,6 +3258,29 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu)
static_call(kvm_x86_tlb_flush_guest)(vcpu);
}
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
static_call(kvm_x86_tlb_flush_current)(vcpu);
}
/*
* Service "local" TLB flush requests, which are specific to the current MMU
* context. In addition to the generic event handling in vcpu_enter_guest(),
* TLB flushes that are targeted at an MMU context also need to be serviced
* prior before nested VM-Enter/VM-Exit.
*/
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu)
{
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
kvm_vcpu_flush_tlb_guest(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests);
static void record_steal_time(struct kvm_vcpu *vcpu)
{
struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache;
@@ -4133,6 +4156,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_SGX_ATTRIBUTE:
#endif
case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM:
case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM:
case KVM_CAP_SREGS2:
case KVM_CAP_EXIT_ON_EMULATION_FAILURE:
case KVM_CAP_VCPU_ATTRIBUTES:
@@ -4448,8 +4472,7 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu,
struct kvm_lapic_state *s)
{
if (vcpu->arch.apicv_active)
static_call(kvm_x86_sync_pir_to_irr)(vcpu);
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
return kvm_apic_get_state(vcpu, s);
}
@@ -5124,6 +5147,17 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid __user *cpuid_arg = argp;
struct kvm_cpuid cpuid;
/*
* KVM does not correctly handle changing guest CPUID after KVM_RUN, as
* MAXPHYADDR, GBPAGES support, AMD reserved bit behavior, etc.. aren't
* tracked in kvm_mmu_page_role. As a result, KVM may miss guest page
* faults due to reusing SPs/SPTEs. In practice no sane VMM mucks with
* the core vCPU model on the fly, so fail.
*/
r = -EINVAL;
if (vcpu->arch.last_vmentry_cpu != -1)
goto out;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -5134,6 +5168,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp,
struct kvm_cpuid2 __user *cpuid_arg = argp;
struct kvm_cpuid2 cpuid;
/*
* KVM_SET_CPUID{,2} after KVM_RUN is forbidded, see the comment in
* KVM_SET_CPUID case above.
*/
r = -EINVAL;
if (vcpu->arch.last_vmentry_cpu != -1)
goto out;
r = -EFAULT;
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
goto out;
@@ -9528,8 +9570,7 @@ static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu)
if (irqchip_split(vcpu->kvm))
kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
else {
if (vcpu->arch.apicv_active)
static_call(kvm_x86_sync_pir_to_irr)(vcpu);
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (ioapic_in_kernel(vcpu->kvm))
kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
}
@@ -9648,10 +9689,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/* Flushing all ASIDs flushes the current ASID... */
kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu))
kvm_vcpu_flush_tlb_current(vcpu);
if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu))
kvm_vcpu_flush_tlb_guest(vcpu);
kvm_service_local_tlb_flush_requests(vcpu);
if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) {
vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS;
@@ -9802,10 +9840,12 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
/*
* This handles the case where a posted interrupt was
* notified with kvm_vcpu_kick.
* notified with kvm_vcpu_kick. Assigned devices can
* use the POSTED_INTR_VECTOR even if APICv is disabled,
* so do it even if APICv is disabled on this vCPU.
*/
if (kvm_lapic_enabled(vcpu) && vcpu->arch.apicv_active)
static_call(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_lapic_enabled(vcpu))
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_vcpu_exit_request(vcpu)) {
vcpu->mode = OUTSIDE_GUEST_MODE;
@@ -9849,8 +9889,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu)
if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST))
break;
if (vcpu->arch.apicv_active)
static_call(kvm_x86_sync_pir_to_irr)(vcpu);
if (kvm_lapic_enabled(vcpu))
static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
if (unlikely(kvm_vcpu_exit_request(vcpu))) {
exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED;

View File

@@ -103,6 +103,7 @@ static inline unsigned int __shrink_ple_window(unsigned int val,
#define MSR_IA32_CR_PAT_DEFAULT 0x0007040600070406ULL
void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu);
int kvm_check_nested_events(struct kvm_vcpu *vcpu);
static inline void kvm_clear_exception_queue(struct kvm_vcpu *vcpu)
@@ -185,12 +186,6 @@ static inline bool mmu_is_nested(struct kvm_vcpu *vcpu)
return vcpu->arch.walk_mmu == &vcpu->arch.nested_mmu;
}
static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu)
{
++vcpu->stat.tlb_flush;
static_call(kvm_x86_tlb_flush_current)(vcpu);
}
static inline int is_pae(struct kvm_vcpu *vcpu)
{
return kvm_read_cr4_bits(vcpu, X86_CR4_PAE);

View File

@@ -1008,8 +1008,8 @@ out:
}
EXPORT_SYMBOL(netfs_readpage);
/**
* netfs_skip_folio_read - prep a folio for writing without reading first
/*
* Prepare a folio for writing without reading first
* @folio: The folio being prepared
* @pos: starting position for the write
* @len: length of write

View File

@@ -7,6 +7,7 @@
#include <assert.h>
#include <linux/build_bug.h>
#include <linux/compiler.h>
#include <linux/math.h>
#include <endian.h>
#include <byteswap.h>
@@ -14,8 +15,6 @@
#define UINT_MAX (~0U)
#endif
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
#define PERF_ALIGN(x, a) __PERF_ALIGN_MASK(x, (typeof(x))(a)-1)
#define __PERF_ALIGN_MASK(x, mask) (((x)+(mask))&~(mask))
@@ -52,15 +51,6 @@
_min1 < _min2 ? _min1 : _min2; })
#endif
#ifndef roundup
#define roundup(x, y) ( \
{ \
const typeof(y) __y = y; \
(((x) + (__y - 1)) / __y) * __y; \
} \
)
#endif
#ifndef BUG_ON
#ifdef NDEBUG
#define BUG_ON(cond) do { if (cond) {} } while (0)
@@ -104,16 +94,6 @@ int scnprintf_pad(char * buf, size_t size, const char * fmt, ...);
#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0]) + __must_be_array(arr))
/*
* This looks more complex than it should be. But we need to
* get the type for the ~ right in round_down (it needs to be
* as wide as the result!), and we want to evaluate the macro
* arguments just once each.
*/
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
#define round_down(x, y) ((x) & ~__round_mask(x, y))
#define current_gfp_context(k) 0
#define synchronize_rcu()

View File

@@ -0,0 +1,25 @@
#ifndef _TOOLS_MATH_H
#define _TOOLS_MATH_H
/*
* This looks more complex than it should be. But we need to
* get the type for the ~ right in round_down (it needs to be
* as wide as the result!), and we want to evaluate the macro
* arguments just once each.
*/
#define __round_mask(x, y) ((__typeof__(x))((y)-1))
#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1)
#define round_down(x, y) ((x) & ~__round_mask(x, y))
#define DIV_ROUND_UP(n,d) (((n) + (d) - 1) / (d))
#ifndef roundup
#define roundup(x, y) ( \
{ \
const typeof(y) __y = y; \
(((x) + (__y - 1)) / __y) * __y; \
} \
)
#endif
#endif

View File

@@ -1,5 +1,8 @@
#ifndef _LINUX_LOCKDEP_H
#define _LINUX_LOCKDEP_H
#include <linux/spinlock.h>
struct lock_class_key {
unsigned int a;
};

View File

@@ -12,6 +12,7 @@
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/resource.h>
#include "test_util.h"
@@ -40,10 +41,39 @@ int main(int argc, char *argv[])
{
int kvm_max_vcpu_id = kvm_check_cap(KVM_CAP_MAX_VCPU_ID);
int kvm_max_vcpus = kvm_check_cap(KVM_CAP_MAX_VCPUS);
/*
* Number of file descriptors reqired, KVM_CAP_MAX_VCPUS for vCPU fds +
* an arbitrary number for everything else.
*/
int nr_fds_wanted = kvm_max_vcpus + 100;
struct rlimit rl;
pr_info("KVM_CAP_MAX_VCPU_ID: %d\n", kvm_max_vcpu_id);
pr_info("KVM_CAP_MAX_VCPUS: %d\n", kvm_max_vcpus);
/*
* Check that we're allowed to open nr_fds_wanted file descriptors and
* try raising the limits if needed.
*/
TEST_ASSERT(!getrlimit(RLIMIT_NOFILE, &rl), "getrlimit() failed!");
if (rl.rlim_cur < nr_fds_wanted) {
rl.rlim_cur = nr_fds_wanted;
if (rl.rlim_max < nr_fds_wanted) {
int old_rlim_max = rl.rlim_max;
rl.rlim_max = nr_fds_wanted;
int r = setrlimit(RLIMIT_NOFILE, &rl);
if (r < 0) {
printf("RLIMIT_NOFILE hard limit is too low (%d, wanted %d)\n",
old_rlim_max, nr_fds_wanted);
exit(KSFT_SKIP);
}
} else {
TEST_ASSERT(!setrlimit(RLIMIT_NOFILE, &rl), "setrlimit() failed!");
}
}
/*
* Upstream KVM prior to 4.8 does not support KVM_CAP_MAX_VCPU_ID.
* Userspace is supposed to use KVM_CAP_MAX_VCPUS as the maximum ID

View File

@@ -280,7 +280,7 @@ static struct kvm_vm *pre_init_before_test(enum vm_guest_mode mode, void *arg)
#ifdef __s390x__
alignment = max(0x100000, alignment);
#endif
guest_test_phys_mem = align_down(guest_test_virt_mem, alignment);
guest_test_phys_mem = align_down(guest_test_phys_mem, alignment);
/* Set up the shared data structure test_args */
test_args.vm = vm;

View File

@@ -165,10 +165,10 @@ static void hv_set_cpuid(struct kvm_vm *vm, struct kvm_cpuid2 *cpuid,
vcpu_set_cpuid(vm, VCPU_ID, cpuid);
}
static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
struct kvm_cpuid2 *best)
static void guest_test_msrs_access(void)
{
struct kvm_run *run;
struct kvm_vm *vm;
struct ucall uc;
int stage = 0, r;
struct kvm_cpuid_entry2 feat = {
@@ -180,11 +180,34 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
struct kvm_cpuid_entry2 dbg = {
.function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
};
struct kvm_enable_cap cap = {0};
run = vcpu_state(vm, VCPU_ID);
struct kvm_cpuid2 *best;
vm_vaddr_t msr_gva;
struct kvm_enable_cap cap = {
.cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
.args = {1}
};
struct msr_data *msr;
while (true) {
vm = vm_create_default(VCPU_ID, 0, guest_msr);
msr_gva = vm_vaddr_alloc_page(vm);
memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
msr = addr_gva2hva(vm, msr_gva);
vcpu_args_set(vm, VCPU_ID, 1, msr_gva);
vcpu_enable_cap(vm, VCPU_ID, &cap);
vcpu_set_hv_cpuid(vm, VCPU_ID);
best = kvm_get_supported_hv_cpuid();
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
run = vcpu_state(vm, VCPU_ID);
switch (stage) {
case 0:
/*
@@ -315,6 +338,7 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
* capability enabled and guest visible CPUID bit unset.
*/
cap.cap = KVM_CAP_HYPERV_SYNIC2;
cap.args[0] = 0;
vcpu_enable_cap(vm, VCPU_ID, &cap);
break;
case 22:
@@ -461,9 +485,9 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_SYNC:
TEST_ASSERT(uc.args[1] == stage,
"Unexpected stage: %ld (%d expected)\n",
uc.args[1], stage);
TEST_ASSERT(uc.args[1] == 0,
"Unexpected stage: %ld (0 expected)\n",
uc.args[1]);
break;
case UCALL_ABORT:
TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
@@ -474,13 +498,14 @@ static void guest_test_msrs_access(struct kvm_vm *vm, struct msr_data *msr,
}
stage++;
kvm_vm_free(vm);
}
}
static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall,
void *input, void *output, struct kvm_cpuid2 *best)
static void guest_test_hcalls_access(void)
{
struct kvm_run *run;
struct kvm_vm *vm;
struct ucall uc;
int stage = 0, r;
struct kvm_cpuid_entry2 feat = {
@@ -493,10 +518,38 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
struct kvm_cpuid_entry2 dbg = {
.function = HYPERV_CPUID_SYNDBG_PLATFORM_CAPABILITIES
};
run = vcpu_state(vm, VCPU_ID);
struct kvm_enable_cap cap = {
.cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
.args = {1}
};
vm_vaddr_t hcall_page, hcall_params;
struct hcall_data *hcall;
struct kvm_cpuid2 *best;
while (true) {
vm = vm_create_default(VCPU_ID, 0, guest_hcall);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
/* Hypercall input/output */
hcall_page = vm_vaddr_alloc_pages(vm, 2);
hcall = addr_gva2hva(vm, hcall_page);
memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
hcall_params = vm_vaddr_alloc_page(vm);
memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
vcpu_enable_cap(vm, VCPU_ID, &cap);
vcpu_set_hv_cpuid(vm, VCPU_ID);
best = kvm_get_supported_hv_cpuid();
run = vcpu_state(vm, VCPU_ID);
switch (stage) {
case 0:
hcall->control = 0xdeadbeef;
@@ -606,9 +659,9 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
switch (get_ucall(vm, VCPU_ID, &uc)) {
case UCALL_SYNC:
TEST_ASSERT(uc.args[1] == stage,
"Unexpected stage: %ld (%d expected)\n",
uc.args[1], stage);
TEST_ASSERT(uc.args[1] == 0,
"Unexpected stage: %ld (0 expected)\n",
uc.args[1]);
break;
case UCALL_ABORT:
TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0],
@@ -619,66 +672,15 @@ static void guest_test_hcalls_access(struct kvm_vm *vm, struct hcall_data *hcall
}
stage++;
kvm_vm_free(vm);
}
}
int main(void)
{
struct kvm_cpuid2 *best;
struct kvm_vm *vm;
vm_vaddr_t msr_gva, hcall_page, hcall_params;
struct kvm_enable_cap cap = {
.cap = KVM_CAP_HYPERV_ENFORCE_CPUID,
.args = {1}
};
/* Test MSRs */
vm = vm_create_default(VCPU_ID, 0, guest_msr);
msr_gva = vm_vaddr_alloc_page(vm);
memset(addr_gva2hva(vm, msr_gva), 0x0, getpagesize());
vcpu_args_set(vm, VCPU_ID, 1, msr_gva);
vcpu_enable_cap(vm, VCPU_ID, &cap);
vcpu_set_hv_cpuid(vm, VCPU_ID);
best = kvm_get_supported_hv_cpuid();
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_install_exception_handler(vm, GP_VECTOR, guest_gp_handler);
pr_info("Testing access to Hyper-V specific MSRs\n");
guest_test_msrs_access(vm, addr_gva2hva(vm, msr_gva),
best);
kvm_vm_free(vm);
/* Test hypercalls */
vm = vm_create_default(VCPU_ID, 0, guest_hcall);
vm_init_descriptor_tables(vm);
vcpu_init_descriptor_tables(vm, VCPU_ID);
vm_install_exception_handler(vm, UD_VECTOR, guest_ud_handler);
/* Hypercall input/output */
hcall_page = vm_vaddr_alloc_pages(vm, 2);
memset(addr_gva2hva(vm, hcall_page), 0x0, 2 * getpagesize());
hcall_params = vm_vaddr_alloc_page(vm);
memset(addr_gva2hva(vm, hcall_params), 0x0, getpagesize());
vcpu_args_set(vm, VCPU_ID, 2, addr_gva2gpa(vm, hcall_page), hcall_params);
vcpu_enable_cap(vm, VCPU_ID, &cap);
vcpu_set_hv_cpuid(vm, VCPU_ID);
best = kvm_get_supported_hv_cpuid();
guest_test_msrs_access();
pr_info("Testing access to Hyper-V hypercalls\n");
guest_test_hcalls_access(vm, addr_gva2hva(vm, hcall_params),
addr_gva2hva(vm, hcall_page),
addr_gva2hva(vm, hcall_page) + getpagesize(),
best);
kvm_vm_free(vm);
guest_test_hcalls_access();
}

View File

@@ -54,12 +54,15 @@ static struct kvm_vm *sev_vm_create(bool es)
return vm;
}
static struct kvm_vm *__vm_create(void)
static struct kvm_vm *aux_vm_create(bool with_vcpus)
{
struct kvm_vm *vm;
int i;
vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
if (!with_vcpus)
return vm;
for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
vm_vcpu_add(vm, i);
@@ -89,11 +92,11 @@ static void test_sev_migrate_from(bool es)
{
struct kvm_vm *src_vm;
struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS];
int i;
int i, ret;
src_vm = sev_vm_create(es);
for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
dst_vms[i] = __vm_create();
dst_vms[i] = aux_vm_create(true);
/* Initial migration from the src to the first dst. */
sev_migrate_from(dst_vms[0]->fd, src_vm->fd);
@@ -102,7 +105,10 @@ static void test_sev_migrate_from(bool es)
sev_migrate_from(dst_vms[i]->fd, dst_vms[i - 1]->fd);
/* Migrate the guest back to the original VM. */
sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd);
ret = __sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd);
TEST_ASSERT(ret == -1 && errno == EIO,
"VM that was migrated from should be dead. ret %d, errno: %d\n", ret,
errno);
kvm_vm_free(src_vm);
for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i)
@@ -146,6 +152,8 @@ static void test_sev_migrate_locking(void)
for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
pthread_join(pt[i], NULL);
for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i)
kvm_vm_free(input[i].vm);
}
static void test_sev_migrate_parameters(void)
@@ -157,12 +165,11 @@ static void test_sev_migrate_parameters(void)
sev_vm = sev_vm_create(/* es= */ false);
sev_es_vm = sev_vm_create(/* es= */ true);
vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
vm_no_sev = __vm_create();
vm_no_sev = aux_vm_create(true);
sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR);
sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL);
vm_vcpu_add(sev_es_vm_no_vmsa, 1);
ret = __sev_migrate_from(sev_vm->fd, sev_es_vm->fd);
TEST_ASSERT(
ret == -1 && errno == EINVAL,
@@ -191,13 +198,151 @@ static void test_sev_migrate_parameters(void)
TEST_ASSERT(ret == -1 && errno == EINVAL,
"Migrations require SEV enabled. ret %d, errno: %d\n", ret,
errno);
kvm_vm_free(sev_vm);
kvm_vm_free(sev_es_vm);
kvm_vm_free(sev_es_vm_no_vmsa);
kvm_vm_free(vm_no_vcpu);
kvm_vm_free(vm_no_sev);
}
static int __sev_mirror_create(int dst_fd, int src_fd)
{
struct kvm_enable_cap cap = {
.cap = KVM_CAP_VM_COPY_ENC_CONTEXT_FROM,
.args = { src_fd }
};
return ioctl(dst_fd, KVM_ENABLE_CAP, &cap);
}
static void sev_mirror_create(int dst_fd, int src_fd)
{
int ret;
ret = __sev_mirror_create(dst_fd, src_fd);
TEST_ASSERT(!ret, "Copying context failed, ret: %d, errno: %d\n", ret, errno);
}
static void test_sev_mirror(bool es)
{
struct kvm_vm *src_vm, *dst_vm;
struct kvm_sev_launch_start start = {
.policy = es ? SEV_POLICY_ES : 0
};
int i;
src_vm = sev_vm_create(es);
dst_vm = aux_vm_create(false);
sev_mirror_create(dst_vm->fd, src_vm->fd);
/* Check that we can complete creation of the mirror VM. */
for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i)
vm_vcpu_add(dst_vm, i);
sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_START, &start);
if (es)
sev_ioctl(dst_vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL);
kvm_vm_free(src_vm);
kvm_vm_free(dst_vm);
}
static void test_sev_mirror_parameters(void)
{
struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_with_vcpu;
int ret;
sev_vm = sev_vm_create(/* es= */ false);
sev_es_vm = sev_vm_create(/* es= */ true);
vm_with_vcpu = aux_vm_create(true);
vm_no_vcpu = aux_vm_create(false);
ret = __sev_mirror_create(sev_vm->fd, sev_vm->fd);
TEST_ASSERT(
ret == -1 && errno == EINVAL,
"Should not be able copy context to self. ret: %d, errno: %d\n",
ret, errno);
ret = __sev_mirror_create(sev_vm->fd, sev_es_vm->fd);
TEST_ASSERT(
ret == -1 && errno == EINVAL,
"Should not be able copy context to SEV enabled VM. ret: %d, errno: %d\n",
ret, errno);
ret = __sev_mirror_create(sev_es_vm->fd, sev_vm->fd);
TEST_ASSERT(
ret == -1 && errno == EINVAL,
"Should not be able copy context to SEV-ES enabled VM. ret: %d, errno: %d\n",
ret, errno);
ret = __sev_mirror_create(vm_no_vcpu->fd, vm_with_vcpu->fd);
TEST_ASSERT(ret == -1 && errno == EINVAL,
"Copy context requires SEV enabled. ret %d, errno: %d\n", ret,
errno);
ret = __sev_mirror_create(vm_with_vcpu->fd, sev_vm->fd);
TEST_ASSERT(
ret == -1 && errno == EINVAL,
"SEV copy context requires no vCPUS on the destination. ret: %d, errno: %d\n",
ret, errno);
kvm_vm_free(sev_vm);
kvm_vm_free(sev_es_vm);
kvm_vm_free(vm_with_vcpu);
kvm_vm_free(vm_no_vcpu);
}
static void test_sev_move_copy(void)
{
struct kvm_vm *dst_vm, *sev_vm, *mirror_vm, *dst_mirror_vm;
int ret;
sev_vm = sev_vm_create(/* es= */ false);
dst_vm = aux_vm_create(true);
mirror_vm = aux_vm_create(false);
dst_mirror_vm = aux_vm_create(false);
sev_mirror_create(mirror_vm->fd, sev_vm->fd);
ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
TEST_ASSERT(ret == -1 && errno == EBUSY,
"Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
errno);
/* The mirror itself can be migrated. */
sev_migrate_from(dst_mirror_vm->fd, mirror_vm->fd);
ret = __sev_migrate_from(dst_vm->fd, sev_vm->fd);
TEST_ASSERT(ret == -1 && errno == EBUSY,
"Cannot migrate VM that has mirrors. ret %d, errno: %d\n", ret,
errno);
/*
* mirror_vm is not a mirror anymore, dst_mirror_vm is. Thus,
* the owner can be copied as soon as dst_mirror_vm is gone.
*/
kvm_vm_free(dst_mirror_vm);
sev_migrate_from(dst_vm->fd, sev_vm->fd);
kvm_vm_free(mirror_vm);
kvm_vm_free(dst_vm);
kvm_vm_free(sev_vm);
}
int main(int argc, char *argv[])
{
test_sev_migrate_from(/* es= */ false);
test_sev_migrate_from(/* es= */ true);
test_sev_migrate_locking();
test_sev_migrate_parameters();
if (kvm_check_cap(KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM)) {
test_sev_migrate_from(/* es= */ false);
test_sev_migrate_from(/* es= */ true);
test_sev_migrate_locking();
test_sev_migrate_parameters();
if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM))
test_sev_move_copy();
}
if (kvm_check_cap(KVM_CAP_VM_COPY_ENC_CONTEXT_FROM)) {
test_sev_mirror(/* es= */ false);
test_sev_mirror(/* es= */ true);
test_sev_mirror_parameters();
}
return 0;
}

View File

@@ -1531,11 +1531,10 @@ static struct kvm_memslots *kvm_dup_memslots(struct kvm_memslots *old,
static int kvm_set_memslot(struct kvm *kvm,
const struct kvm_userspace_memory_region *mem,
struct kvm_memory_slot *old,
struct kvm_memory_slot *new, int as_id,
enum kvm_mr_change change)
{
struct kvm_memory_slot *slot;
struct kvm_memory_slot *slot, old;
struct kvm_memslots *slots;
int r;
@@ -1566,7 +1565,7 @@ static int kvm_set_memslot(struct kvm *kvm,
* Note, the INVALID flag needs to be in the appropriate entry
* in the freshly allocated memslots, not in @old or @new.
*/
slot = id_to_memslot(slots, old->id);
slot = id_to_memslot(slots, new->id);
slot->flags |= KVM_MEMSLOT_INVALID;
/*
@@ -1597,6 +1596,26 @@ static int kvm_set_memslot(struct kvm *kvm,
kvm_copy_memslots(slots, __kvm_memslots(kvm, as_id));
}
/*
* Make a full copy of the old memslot, the pointer will become stale
* when the memslots are re-sorted by update_memslots(), and the old
* memslot needs to be referenced after calling update_memslots(), e.g.
* to free its resources and for arch specific behavior. This needs to
* happen *after* (re)acquiring slots_arch_lock.
*/
slot = id_to_memslot(slots, new->id);
if (slot) {
old = *slot;
} else {
WARN_ON_ONCE(change != KVM_MR_CREATE);
memset(&old, 0, sizeof(old));
old.id = new->id;
old.as_id = as_id;
}
/* Copy the arch-specific data, again after (re)acquiring slots_arch_lock. */
memcpy(&new->arch, &old.arch, sizeof(old.arch));
r = kvm_arch_prepare_memory_region(kvm, new, mem, change);
if (r)
goto out_slots;
@@ -1604,14 +1623,18 @@ static int kvm_set_memslot(struct kvm *kvm,
update_memslots(slots, new, change);
slots = install_new_memslots(kvm, as_id, slots);
kvm_arch_commit_memory_region(kvm, mem, old, new, change);
kvm_arch_commit_memory_region(kvm, mem, &old, new, change);
/* Free the old memslot's metadata. Note, this is the full copy!!! */
if (change == KVM_MR_DELETE)
kvm_free_memslot(kvm, &old);
kvfree(slots);
return 0;
out_slots:
if (change == KVM_MR_DELETE || change == KVM_MR_MOVE) {
slot = id_to_memslot(slots, old->id);
slot = id_to_memslot(slots, new->id);
slot->flags &= ~KVM_MEMSLOT_INVALID;
slots = install_new_memslots(kvm, as_id, slots);
} else {
@@ -1626,7 +1649,6 @@ static int kvm_delete_memslot(struct kvm *kvm,
struct kvm_memory_slot *old, int as_id)
{
struct kvm_memory_slot new;
int r;
if (!old->npages)
return -EINVAL;
@@ -1639,12 +1661,7 @@ static int kvm_delete_memslot(struct kvm *kvm,
*/
new.as_id = as_id;
r = kvm_set_memslot(kvm, mem, old, &new, as_id, KVM_MR_DELETE);
if (r)
return r;
kvm_free_memslot(kvm, old);
return 0;
return kvm_set_memslot(kvm, mem, &new, as_id, KVM_MR_DELETE);
}
/*
@@ -1672,7 +1689,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
id = (u16)mem->slot;
/* General sanity checks */
if (mem->memory_size & (PAGE_SIZE - 1))
if ((mem->memory_size & (PAGE_SIZE - 1)) ||
(mem->memory_size != (unsigned long)mem->memory_size))
return -EINVAL;
if (mem->guest_phys_addr & (PAGE_SIZE - 1))
return -EINVAL;
@@ -1718,7 +1736,6 @@ int __kvm_set_memory_region(struct kvm *kvm,
if (!old.npages) {
change = KVM_MR_CREATE;
new.dirty_bitmap = NULL;
memset(&new.arch, 0, sizeof(new.arch));
} else { /* Modify an existing slot. */
if ((new.userspace_addr != old.userspace_addr) ||
(new.npages != old.npages) ||
@@ -1732,9 +1749,8 @@ int __kvm_set_memory_region(struct kvm *kvm,
else /* Nothing to change. */
return 0;
/* Copy dirty_bitmap and arch from the current memslot. */
/* Copy dirty_bitmap from the current memslot. */
new.dirty_bitmap = old.dirty_bitmap;
memcpy(&new.arch, &old.arch, sizeof(new.arch));
}
if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
@@ -1760,7 +1776,7 @@ int __kvm_set_memory_region(struct kvm *kvm,
bitmap_set(new.dirty_bitmap, 0, new.npages);
}
r = kvm_set_memslot(kvm, mem, &old, &new, as_id, change);
r = kvm_set_memslot(kvm, mem, &new, as_id, change);
if (r)
goto out_bitmap;
@@ -2915,7 +2931,8 @@ int kvm_write_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
int r;
gpa_t gpa = ghc->gpa + offset;
BUG_ON(len + offset > ghc->len);
if (WARN_ON_ONCE(len + offset > ghc->len))
return -EINVAL;
if (slots->generation != ghc->generation) {
if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))
@@ -2952,7 +2969,8 @@ int kvm_read_guest_offset_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
int r;
gpa_t gpa = ghc->gpa + offset;
BUG_ON(len + offset > ghc->len);
if (WARN_ON_ONCE(len + offset > ghc->len))
return -EINVAL;
if (slots->generation != ghc->generation) {
if (__kvm_gfn_to_hva_cache_init(slots, ghc, ghc->gpa, ghc->len))