diff --git a/Makefile b/Makefile index 9521e267c536..a308739a61b4 100644 --- a/Makefile +++ b/Makefile @@ -2,7 +2,7 @@ VERSION = 6 PATCHLEVEL = 0 SUBLEVEL = 0 -EXTRAVERSION = -rc3 +EXTRAVERSION = -rc4 NAME = Hurr durr I'ma ninja sloth # *DOCUMENTATION* diff --git a/arch/powerpc/include/asm/firmware.h b/arch/powerpc/include/asm/firmware.h index 398e0b5e485f..ed6db13a1d7c 100644 --- a/arch/powerpc/include/asm/firmware.h +++ b/arch/powerpc/include/asm/firmware.h @@ -83,6 +83,8 @@ enum { FW_FEATURE_POWERNV_ALWAYS = 0, FW_FEATURE_PS3_POSSIBLE = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, FW_FEATURE_PS3_ALWAYS = FW_FEATURE_LPAR | FW_FEATURE_PS3_LV1, + FW_FEATURE_NATIVE_POSSIBLE = 0, + FW_FEATURE_NATIVE_ALWAYS = 0, FW_FEATURE_POSSIBLE = #ifdef CONFIG_PPC_PSERIES FW_FEATURE_PSERIES_POSSIBLE | @@ -92,6 +94,9 @@ enum { #endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_POSSIBLE | +#endif +#ifdef CONFIG_PPC_HASH_MMU_NATIVE + FW_FEATURE_NATIVE_ALWAYS | #endif 0, FW_FEATURE_ALWAYS = @@ -103,6 +108,9 @@ enum { #endif #ifdef CONFIG_PPC_PS3 FW_FEATURE_PS3_ALWAYS & +#endif +#ifdef CONFIG_PPC_HASH_MMU_NATIVE + FW_FEATURE_NATIVE_ALWAYS & #endif FW_FEATURE_POSSIBLE, diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index 26ede09c521d..983551859891 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -113,7 +113,14 @@ static inline void __hard_RI_enable(void) static inline notrace unsigned long irq_soft_mask_return(void) { - return READ_ONCE(local_paca->irq_soft_mask); + unsigned long flags; + + asm volatile( + "lbz %0,%1(13)" + : "=r" (flags) + : "i" (offsetof(struct paca_struct, irq_soft_mask))); + + return flags; } /* @@ -140,24 +147,46 @@ static inline notrace void irq_soft_mask_set(unsigned long mask) if (IS_ENABLED(CONFIG_PPC_IRQ_SOFT_MASK_DEBUG)) WARN_ON(mask && !(mask & IRQS_DISABLED)); - WRITE_ONCE(local_paca->irq_soft_mask, mask); - barrier(); + asm volatile( + "stb %0,%1(13)" + : + : "r" (mask), + "i" (offsetof(struct paca_struct, irq_soft_mask)) + : "memory"); } static inline notrace unsigned long irq_soft_mask_set_return(unsigned long mask) { - unsigned long flags = irq_soft_mask_return(); + unsigned long flags; - irq_soft_mask_set(mask); +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON(mask && !(mask & IRQS_DISABLED)); +#endif + + asm volatile( + "lbz %0,%1(13); stb %2,%1(13)" + : "=&r" (flags) + : "i" (offsetof(struct paca_struct, irq_soft_mask)), + "r" (mask) + : "memory"); return flags; } static inline notrace unsigned long irq_soft_mask_or_return(unsigned long mask) { - unsigned long flags = irq_soft_mask_return(); + unsigned long flags, tmp; - irq_soft_mask_set(flags | mask); + asm volatile( + "lbz %0,%2(13); or %1,%0,%3; stb %1,%2(13)" + : "=&r" (flags), "=r" (tmp) + : "i" (offsetof(struct paca_struct, irq_soft_mask)), + "r" (mask) + : "memory"); + +#ifdef CONFIG_PPC_IRQ_SOFT_MASK_DEBUG + WARN_ON((mask | flags) && !((mask | flags) & IRQS_DISABLED)); +#endif return flags; } @@ -282,7 +311,8 @@ static inline bool pmi_irq_pending(void) flags = irq_soft_mask_set_return(IRQS_ALL_DISABLED); \ local_paca->irq_happened |= PACA_IRQ_HARD_DIS; \ if (!arch_irqs_disabled_flags(flags)) { \ - WRITE_ONCE(local_paca->saved_r1, current_stack_pointer);\ + asm volatile("std%X0 %1,%0" : "=m" (local_paca->saved_r1) \ + : "r" (current_stack_pointer)); \ trace_hardirqs_off(); \ } \ } while(0) diff --git a/arch/powerpc/kernel/pci_32.c b/arch/powerpc/kernel/pci_32.c index 433965bf37b4..855b59892c5c 100644 --- a/arch/powerpc/kernel/pci_32.c +++ b/arch/powerpc/kernel/pci_32.c @@ -245,6 +245,15 @@ static int __init pcibios_init(void) printk(KERN_INFO "PCI: Probing PCI hardware\n"); +#ifdef CONFIG_PPC_PCI_BUS_NUM_DOMAIN_DEPENDENT + /* + * Enable PCI domains in /proc when PCI bus numbers are not unique + * across all PCI domains to prevent conflicts. And keep PCI domain 0 + * backward compatible in /proc for video cards. + */ + pci_add_flags(PCI_ENABLE_PROC_DOMAINS | PCI_COMPAT_DOMAIN_0); +#endif + if (pci_has_flag(PCI_REASSIGN_ALL_BUS)) pci_assign_all_buses = 1; diff --git a/arch/powerpc/kernel/rtas_entry.S b/arch/powerpc/kernel/rtas_entry.S index 9a434d42e660..6ce95ddadbcd 100644 --- a/arch/powerpc/kernel/rtas_entry.S +++ b/arch/powerpc/kernel/rtas_entry.S @@ -109,8 +109,12 @@ __enter_rtas: * its critical regions (as specified in PAPR+ section 7.2.1). MSR[S] * is not impacted by RFI_TO_KERNEL (only urfid can unset it). So if * MSR[S] is set, it will remain when entering RTAS. + * If we're in HV mode, RTAS must also run in HV mode, so extract MSR_HV + * from the saved MSR value and insert into the value RTAS will use. */ + extrdi r0, r6, 1, 63 - MSR_HV_LG LOAD_REG_IMMEDIATE(r6, MSR_ME | MSR_RI) + insrdi r6, r0, 1, 63 - MSR_HV_LG li r0,0 mtmsrd r0,1 /* disable RI before using SRR0/1 */ diff --git a/arch/powerpc/kernel/systbl.S b/arch/powerpc/kernel/systbl.S index cb3358886203..6c1db3b6de2d 100644 --- a/arch/powerpc/kernel/systbl.S +++ b/arch/powerpc/kernel/systbl.S @@ -18,6 +18,7 @@ .p2align 3 #define __SYSCALL(nr, entry) .8byte entry #else + .p2align 2 #define __SYSCALL(nr, entry) .long entry #endif diff --git a/arch/powerpc/platforms/pseries/papr_scm.c b/arch/powerpc/platforms/pseries/papr_scm.c index 20f6ed813bff..2f8385523a13 100644 --- a/arch/powerpc/platforms/pseries/papr_scm.c +++ b/arch/powerpc/platforms/pseries/papr_scm.c @@ -124,9 +124,6 @@ struct papr_scm_priv { /* The bits which needs to be overridden */ u64 health_bitmap_inject_mask; - - /* array to have event_code and stat_id mappings */ - u8 *nvdimm_events_map; }; static int papr_scm_pmem_flush(struct nd_region *nd_region, @@ -350,6 +347,25 @@ static ssize_t drc_pmem_query_stats(struct papr_scm_priv *p, #ifdef CONFIG_PERF_EVENTS #define to_nvdimm_pmu(_pmu) container_of(_pmu, struct nvdimm_pmu, pmu) +static const char * const nvdimm_events_map[] = { + [1] = "CtlResCt", + [2] = "CtlResTm", + [3] = "PonSecs ", + [4] = "MemLife ", + [5] = "CritRscU", + [6] = "HostLCnt", + [7] = "HostSCnt", + [8] = "HostSDur", + [9] = "HostLDur", + [10] = "MedRCnt ", + [11] = "MedWCnt ", + [12] = "MedRDur ", + [13] = "MedWDur ", + [14] = "CchRHCnt", + [15] = "CchWHCnt", + [16] = "FastWCnt", +}; + static int papr_scm_pmu_get_value(struct perf_event *event, struct device *dev, u64 *count) { struct papr_scm_perf_stat *stat; @@ -357,11 +373,15 @@ static int papr_scm_pmu_get_value(struct perf_event *event, struct device *dev, struct papr_scm_priv *p = dev_get_drvdata(dev); int rc, size; + /* Invalid eventcode */ + if (event->attr.config == 0 || event->attr.config >= ARRAY_SIZE(nvdimm_events_map)) + return -EINVAL; + /* Allocate request buffer enough to hold single performance stat */ size = sizeof(struct papr_scm_perf_stats) + sizeof(struct papr_scm_perf_stat); - if (!p || !p->nvdimm_events_map) + if (!p) return -EINVAL; stats = kzalloc(size, GFP_KERNEL); @@ -370,7 +390,7 @@ static int papr_scm_pmu_get_value(struct perf_event *event, struct device *dev, stat = &stats->scm_statistic[0]; memcpy(&stat->stat_id, - &p->nvdimm_events_map[event->attr.config * sizeof(stat->stat_id)], + nvdimm_events_map[event->attr.config], sizeof(stat->stat_id)); stat->stat_val = 0; @@ -458,56 +478,6 @@ static void papr_scm_pmu_del(struct perf_event *event, int flags) papr_scm_pmu_read(event); } -static int papr_scm_pmu_check_events(struct papr_scm_priv *p, struct nvdimm_pmu *nd_pmu) -{ - struct papr_scm_perf_stat *stat; - struct papr_scm_perf_stats *stats; - u32 available_events; - int index, rc = 0; - - if (!p->stat_buffer_len) - return -ENOENT; - - available_events = (p->stat_buffer_len - sizeof(struct papr_scm_perf_stats)) - / sizeof(struct papr_scm_perf_stat); - if (available_events == 0) - return -EOPNOTSUPP; - - /* Allocate the buffer for phyp where stats are written */ - stats = kzalloc(p->stat_buffer_len, GFP_KERNEL); - if (!stats) { - rc = -ENOMEM; - return rc; - } - - /* Called to get list of events supported */ - rc = drc_pmem_query_stats(p, stats, 0); - if (rc) - goto out; - - /* - * Allocate memory and populate nvdimm_event_map. - * Allocate an extra element for NULL entry - */ - p->nvdimm_events_map = kcalloc(available_events + 1, - sizeof(stat->stat_id), - GFP_KERNEL); - if (!p->nvdimm_events_map) { - rc = -ENOMEM; - goto out; - } - - /* Copy all stat_ids to event map */ - for (index = 0, stat = stats->scm_statistic; - index < available_events; index++, ++stat) { - memcpy(&p->nvdimm_events_map[index * sizeof(stat->stat_id)], - &stat->stat_id, sizeof(stat->stat_id)); - } -out: - kfree(stats); - return rc; -} - static void papr_scm_pmu_register(struct papr_scm_priv *p) { struct nvdimm_pmu *nd_pmu; @@ -519,9 +489,10 @@ static void papr_scm_pmu_register(struct papr_scm_priv *p) goto pmu_err_print; } - rc = papr_scm_pmu_check_events(p, nd_pmu); - if (rc) + if (!p->stat_buffer_len) { + rc = -ENOENT; goto pmu_check_events_err; + } nd_pmu->pmu.task_ctx_nr = perf_invalid_context; nd_pmu->pmu.name = nvdimm_name(p->nvdimm); @@ -539,7 +510,7 @@ static void papr_scm_pmu_register(struct papr_scm_priv *p) rc = register_nvdimm_pmu(nd_pmu, p->pdev); if (rc) - goto pmu_register_err; + goto pmu_check_events_err; /* * Set archdata.priv value to nvdimm_pmu structure, to handle the @@ -548,8 +519,6 @@ static void papr_scm_pmu_register(struct papr_scm_priv *p) p->pdev->archdata.priv = nd_pmu; return; -pmu_register_err: - kfree(p->nvdimm_events_map); pmu_check_events_err: kfree(nd_pmu); pmu_err_print: @@ -1560,7 +1529,6 @@ static int papr_scm_remove(struct platform_device *pdev) unregister_nvdimm_pmu(pdev->archdata.priv); pdev->archdata.priv = NULL; - kfree(p->nvdimm_events_map); kfree(p->bus_desc.provider_name); kfree(p); diff --git a/arch/riscv/include/asm/kvm_vcpu_sbi.h b/arch/riscv/include/asm/kvm_vcpu_sbi.h index 83d6d4d2b1df..26a446a34057 100644 --- a/arch/riscv/include/asm/kvm_vcpu_sbi.h +++ b/arch/riscv/include/asm/kvm_vcpu_sbi.h @@ -33,4 +33,16 @@ void kvm_riscv_vcpu_sbi_system_reset(struct kvm_vcpu *vcpu, u32 type, u64 flags); const struct kvm_vcpu_sbi_extension *kvm_vcpu_sbi_find_ext(unsigned long extid); +#ifdef CONFIG_RISCV_SBI_V01 +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01; +#endif +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_ipi; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental; +extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor; + #endif /* __RISCV_KVM_VCPU_SBI_H__ */ diff --git a/arch/riscv/kvm/vcpu_sbi.c b/arch/riscv/kvm/vcpu_sbi.c index d45e7da3f0d3..f96991d230bf 100644 --- a/arch/riscv/kvm/vcpu_sbi.c +++ b/arch/riscv/kvm/vcpu_sbi.c @@ -32,23 +32,13 @@ static int kvm_linux_err_map_sbi(int err) }; } -#ifdef CONFIG_RISCV_SBI_V01 -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01; -#else +#ifndef CONFIG_RISCV_SBI_V01 static const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_v01 = { .extid_start = -1UL, .extid_end = -1UL, .handler = NULL, }; #endif -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_base; -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_time; -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_ipi; -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_rfence; -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_srst; -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_hsm; -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_experimental; -extern const struct kvm_vcpu_sbi_extension vcpu_sbi_ext_vendor; static const struct kvm_vcpu_sbi_extension *sbi_ext[] = { &vcpu_sbi_ext_v01, diff --git a/arch/riscv/kvm/vcpu_timer.c b/arch/riscv/kvm/vcpu_timer.c index 16f50c46ba39..185f2386a747 100644 --- a/arch/riscv/kvm/vcpu_timer.c +++ b/arch/riscv/kvm/vcpu_timer.c @@ -299,7 +299,6 @@ static void kvm_riscv_vcpu_update_timedelta(struct kvm_vcpu *vcpu) void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_csr *csr; struct kvm_vcpu_timer *t = &vcpu->arch.timer; kvm_riscv_vcpu_update_timedelta(vcpu); @@ -307,7 +306,6 @@ void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) if (!t->sstc_enabled) return; - csr = &vcpu->arch.guest_csr; #if defined(CONFIG_32BIT) csr_write(CSR_VSTIMECMP, (u32)t->next_cycles); csr_write(CSR_VSTIMECMPH, (u32)(t->next_cycles >> 32)); @@ -324,13 +322,11 @@ void kvm_riscv_vcpu_timer_restore(struct kvm_vcpu *vcpu) void kvm_riscv_vcpu_timer_save(struct kvm_vcpu *vcpu) { - struct kvm_vcpu_csr *csr; struct kvm_vcpu_timer *t = &vcpu->arch.timer; if (!t->sstc_enabled) return; - csr = &vcpu->arch.guest_csr; t = &vcpu->arch.timer; #if defined(CONFIG_32BIT) t->next_cycles = csr_read(CSR_VSTIMECMP); diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index f39092e0ceaa..b1e98a9ed152 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -1038,16 +1038,11 @@ static inline void kvm_arch_vcpu_unblocking(struct kvm_vcpu *vcpu) {} #define __KVM_HAVE_ARCH_VM_FREE void kvm_arch_free_vm(struct kvm *kvm); -#ifdef CONFIG_VFIO_PCI_ZDEV_KVM -int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm); -void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev); -#else -static inline int kvm_s390_pci_register_kvm(struct zpci_dev *dev, - struct kvm *kvm) -{ - return -EPERM; -} -static inline void kvm_s390_pci_unregister_kvm(struct zpci_dev *dev) {} -#endif +struct zpci_kvm_hook { + int (*kvm_register)(void *opaque, struct kvm *kvm); + void (*kvm_unregister)(void *opaque); +}; + +extern struct zpci_kvm_hook zpci_kvm_hook; #endif diff --git a/arch/s390/kvm/pci.c b/arch/s390/kvm/pci.c index 4946fb7757d6..bb8c335d17b9 100644 --- a/arch/s390/kvm/pci.c +++ b/arch/s390/kvm/pci.c @@ -431,8 +431,9 @@ static void kvm_s390_pci_dev_release(struct zpci_dev *zdev) * available, enable them and let userspace indicate whether or not they will * be used (specify SHM bit to disable). */ -int kvm_s390_pci_register_kvm(struct zpci_dev *zdev, struct kvm *kvm) +static int kvm_s390_pci_register_kvm(void *opaque, struct kvm *kvm) { + struct zpci_dev *zdev = opaque; int rc; if (!zdev) @@ -510,10 +511,10 @@ err: kvm_put_kvm(kvm); return rc; } -EXPORT_SYMBOL_GPL(kvm_s390_pci_register_kvm); -void kvm_s390_pci_unregister_kvm(struct zpci_dev *zdev) +static void kvm_s390_pci_unregister_kvm(void *opaque) { + struct zpci_dev *zdev = opaque; struct kvm *kvm; if (!zdev) @@ -566,7 +567,6 @@ out: kvm_put_kvm(kvm); } -EXPORT_SYMBOL_GPL(kvm_s390_pci_unregister_kvm); void kvm_s390_pci_init_list(struct kvm *kvm) { @@ -678,6 +678,8 @@ int kvm_s390_pci_init(void) spin_lock_init(&aift->gait_lock); mutex_init(&aift->aift_lock); + zpci_kvm_hook.kvm_register = kvm_s390_pci_register_kvm; + zpci_kvm_hook.kvm_unregister = kvm_s390_pci_unregister_kvm; return 0; } @@ -685,6 +687,8 @@ int kvm_s390_pci_init(void) void kvm_s390_pci_exit(void) { mutex_destroy(&aift->aift_lock); + zpci_kvm_hook.kvm_register = NULL; + zpci_kvm_hook.kvm_unregister = NULL; kfree(aift); } diff --git a/arch/s390/pci/Makefile b/arch/s390/pci/Makefile index bf557a1b789c..5ae31ca9dd44 100644 --- a/arch/s390/pci/Makefile +++ b/arch/s390/pci/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_PCI) += pci.o pci_irq.o pci_dma.o pci_clp.o pci_sysfs.o \ pci_event.o pci_debug.o pci_insn.o pci_mmio.o \ - pci_bus.o + pci_bus.o pci_kvm_hook.o obj-$(CONFIG_PCI_IOV) += pci_iov.o diff --git a/arch/s390/pci/pci_kvm_hook.c b/arch/s390/pci/pci_kvm_hook.c new file mode 100644 index 000000000000..ff34baf50a3e --- /dev/null +++ b/arch/s390/pci/pci_kvm_hook.c @@ -0,0 +1,11 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VFIO ZPCI devices support + * + * Copyright (C) IBM Corp. 2022. All rights reserved. + * Author(s): Pierre Morel + */ +#include + +struct zpci_kvm_hook zpci_kvm_hook; +EXPORT_SYMBOL_GPL(zpci_kvm_hook); diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index cb98a05ee743..c601939a74b1 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -4052,8 +4052,9 @@ static struct perf_guest_switch_msr *intel_guest_get_msrs(int *nr, void *data) /* Disable guest PEBS if host PEBS is enabled. */ arr[pebs_enable].guest = 0; } else { - /* Disable guest PEBS for cross-mapped PEBS counters. */ + /* Disable guest PEBS thoroughly for cross-mapped PEBS counters. */ arr[pebs_enable].guest &= ~kvm_pmu->host_cross_mapped_mask; + arr[global_ctrl].guest &= ~kvm_pmu->host_cross_mapped_mask; /* Set hw GLOBAL_CTRL bits for PEBS counter when it runs for guest */ arr[global_ctrl].guest |= arr[pebs_enable].guest; } diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 126fa9aec64c..e418ef3ecfcb 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -5361,19 +5361,6 @@ void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu) __kvm_mmu_free_obsolete_roots(vcpu->kvm, &vcpu->arch.guest_mmu); } -static bool need_remote_flush(u64 old, u64 new) -{ - if (!is_shadow_present_pte(old)) - return false; - if (!is_shadow_present_pte(new)) - return true; - if ((old ^ new) & SPTE_BASE_ADDR_MASK) - return true; - old ^= shadow_nx_mask; - new ^= shadow_nx_mask; - return (old & ~new & SPTE_PERM_MASK) != 0; -} - static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu *vcpu, gpa_t *gpa, int *bytes) { @@ -5519,7 +5506,7 @@ static void kvm_mmu_pte_write(struct kvm_vcpu *vcpu, gpa_t gpa, mmu_page_zap_pte(vcpu->kvm, sp, spte, NULL); if (gentry && sp->role.level != PG_LEVEL_4K) ++vcpu->kvm->stat.mmu_pde_zapped; - if (need_remote_flush(entry, *spte)) + if (is_shadow_present_pte(entry)) flush = true; ++spte; } @@ -6085,47 +6072,18 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, const struct kvm_memory_slot *memslot, int start_level) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); - flush = slot_handle_level(kvm, memslot, slot_rmap_write_protect, - start_level, KVM_MAX_HUGEPAGE_LEVEL, - false); + slot_handle_level(kvm, memslot, slot_rmap_write_protect, + start_level, KVM_MAX_HUGEPAGE_LEVEL, false); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush |= kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); + kvm_tdp_mmu_wrprot_slot(kvm, memslot, start_level); read_unlock(&kvm->mmu_lock); } - - /* - * Flush TLBs if any SPTEs had to be write-protected to ensure that - * guest writes are reflected in the dirty bitmap before the memslot - * update completes, i.e. before enabling dirty logging is visible to - * userspace. - * - * Perform the TLB flush outside the mmu_lock to reduce the amount of - * time the lock is held. However, this does mean that another CPU can - * now grab mmu_lock and encounter a write-protected SPTE while CPUs - * still have a writable mapping for the associated GFN in their TLB. - * - * This is safe but requires KVM to be careful when making decisions - * based on the write-protection status of an SPTE. Specifically, KVM - * also write-protects SPTEs to monitor changes to guest page tables - * during shadow paging, and must guarantee no CPUs can write to those - * page before the lock is dropped. As mentioned in the previous - * paragraph, a write-protected SPTE is no guarantee that CPU cannot - * perform writes. So to determine if a TLB flush is truly required, KVM - * will clear a separate software-only bit (MMU-writable) and skip the - * flush if-and-only-if this bit was already clear. - * - * See is_writable_pte() for more details. - */ - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } static inline bool need_topup(struct kvm_mmu_memory_cache *cache, int min) @@ -6493,32 +6451,30 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, const struct kvm_memory_slot *memslot) { - bool flush = false; - if (kvm_memslots_have_rmaps(kvm)) { write_lock(&kvm->mmu_lock); /* * Clear dirty bits only on 4k SPTEs since the legacy MMU only * support dirty logging at a 4k granularity. */ - flush = slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); + slot_handle_level_4k(kvm, memslot, __rmap_clear_dirty, false); write_unlock(&kvm->mmu_lock); } if (is_tdp_mmu_enabled(kvm)) { read_lock(&kvm->mmu_lock); - flush |= kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); + kvm_tdp_mmu_clear_dirty_slot(kvm, memslot); read_unlock(&kvm->mmu_lock); } /* + * The caller will flush the TLBs after this function returns. + * * It's also safe to flush TLBs out of mmu lock here as currently this * function is only used for dirty logging, in which case flushing TLB * out of mmu lock also guarantees no dirty pages will be lost in * dirty_bitmap. */ - if (flush) - kvm_arch_flush_remote_tlbs_memslot(kvm, memslot); } void kvm_mmu_zap_all(struct kvm *kvm) diff --git a/arch/x86/kvm/mmu/spte.h b/arch/x86/kvm/mmu/spte.h index f3744eea45f5..7670c13ce251 100644 --- a/arch/x86/kvm/mmu/spte.h +++ b/arch/x86/kvm/mmu/spte.h @@ -343,7 +343,7 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, } /* - * An shadow-present leaf SPTE may be non-writable for 3 possible reasons: + * A shadow-present leaf SPTE may be non-writable for 4 possible reasons: * * 1. To intercept writes for dirty logging. KVM write-protects huge pages * so that they can be split be split down into the dirty logging @@ -361,8 +361,13 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * read-only memslot or guest memory backed by a read-only VMA. Writes to * such pages are disallowed entirely. * - * To keep track of why a given SPTE is write-protected, KVM uses 2 - * software-only bits in the SPTE: + * 4. To emulate the Accessed bit for SPTEs without A/D bits. Note, in this + * case, the SPTE is access-protected, not just write-protected! + * + * For cases #1 and #4, KVM can safely make such SPTEs writable without taking + * mmu_lock as capturing the Accessed/Dirty state doesn't require taking it. + * To differentiate #1 and #4 from #2 and #3, KVM uses two software-only bits + * in the SPTE: * * shadow_mmu_writable_mask, aka MMU-writable - * Cleared on SPTEs that KVM is currently write-protecting for shadow paging @@ -391,7 +396,8 @@ static __always_inline bool is_rsvd_spte(struct rsvd_bits_validate *rsvd_check, * shadow page tables between vCPUs. Write-protecting an SPTE for dirty logging * (which does not clear the MMU-writable bit), does not flush TLBs before * dropping the lock, as it only needs to synchronize guest writes with the - * dirty bitmap. + * dirty bitmap. Similarly, making the SPTE inaccessible (and non-writable) for + * access-tracking via the clear_young() MMU notifier also does not flush TLBs. * * So, there is the problem: clearing the MMU-writable bit can encounter a * write-protected SPTE while CPUs still have writable mappings for that SPTE diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d7f8331d6f7e..c9b49a09e6b5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -843,8 +843,7 @@ static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, - MSR_IA32_SPEC_CTRL); + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, msr); } unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 205ebdc2b11b..43a6a7efc6ec 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -1557,12 +1557,32 @@ static const u32 msr_based_features_all[] = { static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all)]; static unsigned int num_msr_based_features; +/* + * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM + * does not yet virtualize. These include: + * 10 - MISC_PACKAGE_CTRLS + * 11 - ENERGY_FILTERING_CTL + * 12 - DOITM + * 18 - FB_CLEAR_CTRL + * 21 - XAPIC_DISABLE_STATUS + * 23 - OVERCLOCKING_STATUS + */ + +#define KVM_SUPPORTED_ARCH_CAP \ + (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ + ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ + ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ + ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ + ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO) + static u64 kvm_get_arch_capabilities(void) { u64 data = 0; - if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) + if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) { rdmsrl(MSR_IA32_ARCH_CAPABILITIES, data); + data &= KVM_SUPPORTED_ARCH_CAP; + } /* * If nx_huge_pages is enabled, KVM's shadow paging will ensure that @@ -1610,9 +1630,6 @@ static u64 kvm_get_arch_capabilities(void) */ } - /* Guests don't need to know "Fill buffer clear control" exists */ - data &= ~ARCH_CAP_FB_CLEAR_CTRL; - return data; } @@ -10652,7 +10669,8 @@ static inline int vcpu_block(struct kvm_vcpu *vcpu) case KVM_MP_STATE_INIT_RECEIVED: break; default: - return -EINTR; + WARN_ON_ONCE(1); + break; } return 1; } @@ -11093,9 +11111,22 @@ int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, vcpu_load(vcpu); - if (!lapic_in_kernel(vcpu) && - mp_state->mp_state != KVM_MP_STATE_RUNNABLE) + switch (mp_state->mp_state) { + case KVM_MP_STATE_UNINITIALIZED: + case KVM_MP_STATE_HALTED: + case KVM_MP_STATE_AP_RESET_HOLD: + case KVM_MP_STATE_INIT_RECEIVED: + case KVM_MP_STATE_SIPI_RECEIVED: + if (!lapic_in_kernel(vcpu)) + goto out; + break; + + case KVM_MP_STATE_RUNNABLE: + break; + + default: goto out; + } /* * KVM_MP_STATE_INIT_RECEIVED means the processor is in @@ -11563,7 +11594,7 @@ int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64), GFP_KERNEL_ACCOUNT); if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) - goto fail_free_pio_data; + goto fail_free_mce_banks; vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask, @@ -11617,7 +11648,6 @@ free_wbinvd_dirty_mask: fail_free_mce_banks: kfree(vcpu->arch.mce_banks); kfree(vcpu->arch.mci_ctl2_banks); -fail_free_pio_data: free_page((unsigned long)vcpu->arch.pio_data); fail_free_lapic: kvm_free_lapic(vcpu); @@ -12473,6 +12503,50 @@ static void kvm_mmu_slot_apply_flags(struct kvm *kvm, } else { kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K); } + + /* + * Unconditionally flush the TLBs after enabling dirty logging. + * A flush is almost always going to be necessary (see below), + * and unconditionally flushing allows the helpers to omit + * the subtly complex checks when removing write access. + * + * Do the flush outside of mmu_lock to reduce the amount of + * time mmu_lock is held. Flushing after dropping mmu_lock is + * safe as KVM only needs to guarantee the slot is fully + * write-protected before returning to userspace, i.e. before + * userspace can consume the dirty status. + * + * Flushing outside of mmu_lock requires KVM to be careful when + * making decisions based on writable status of an SPTE, e.g. a + * !writable SPTE doesn't guarantee a CPU can't perform writes. + * + * Specifically, KVM also write-protects guest page tables to + * monitor changes when using shadow paging, and must guarantee + * no CPUs can write to those page before mmu_lock is dropped. + * Because CPUs may have stale TLB entries at this point, a + * !writable SPTE doesn't guarantee CPUs can't perform writes. + * + * KVM also allows making SPTES writable outside of mmu_lock, + * e.g. to allow dirty logging without taking mmu_lock. + * + * To handle these scenarios, KVM uses a separate software-only + * bit (MMU-writable) to track if a SPTE is !writable due to + * a guest page table being write-protected (KVM clears the + * MMU-writable flag when write-protecting for shadow paging). + * + * The use of MMU-writable is also the primary motivation for + * the unconditional flush. Because KVM must guarantee that a + * CPU doesn't contain stale, writable TLB entries for a + * !MMU-writable SPTE, KVM must flush if it encounters any + * MMU-writable SPTE regardless of whether the actual hardware + * writable bit was set. I.e. KVM is almost guaranteed to need + * to flush, while unconditionally flushing allows the "remove + * write access" helpers to ignore MMU-writable entirely. + * + * See is_writable_pte() for more details (the case involving + * access-tracked SPTEs is particularly relevant). + */ + kvm_arch_flush_remote_tlbs_memslot(kvm, new); } } diff --git a/drivers/vfio/pci/vfio_pci_zdev.c b/drivers/vfio/pci/vfio_pci_zdev.c index e163aa9f6144..0cbdcd14f1c8 100644 --- a/drivers/vfio/pci/vfio_pci_zdev.c +++ b/drivers/vfio/pci/vfio_pci_zdev.c @@ -151,7 +151,10 @@ int vfio_pci_zdev_open_device(struct vfio_pci_core_device *vdev) if (!vdev->vdev.kvm) return 0; - return kvm_s390_pci_register_kvm(zdev, vdev->vdev.kvm); + if (zpci_kvm_hook.kvm_register) + return zpci_kvm_hook.kvm_register(zdev, vdev->vdev.kvm); + + return -ENOENT; } void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) @@ -161,5 +164,6 @@ void vfio_pci_zdev_close_device(struct vfio_pci_core_device *vdev) if (!zdev || !vdev->vdev.kvm) return; - kvm_s390_pci_unregister_kvm(zdev); + if (zpci_kvm_hook.kvm_unregister) + zpci_kvm_hook.kvm_unregister(zdev); } diff --git a/scripts/Makefile.extrawarn b/scripts/Makefile.extrawarn index 0621c39a3955..6ae482158bc4 100644 --- a/scripts/Makefile.extrawarn +++ b/scripts/Makefile.extrawarn @@ -47,7 +47,19 @@ else ifdef CONFIG_CC_IS_CLANG KBUILD_CFLAGS += -Wno-initializer-overrides +# Clang before clang-16 would warn on default argument promotions. +ifeq ($(shell [ $(CONFIG_CLANG_VERSION) -lt 160000 ] && echo y),y) +# Disable -Wformat KBUILD_CFLAGS += -Wno-format +# Then re-enable flags that were part of the -Wformat group that aren't +# problematic. +KBUILD_CFLAGS += -Wformat-extra-args -Wformat-invalid-specifier +KBUILD_CFLAGS += -Wformat-zero-length -Wnonnull +# Requires clang-12+. +ifeq ($(shell [ $(CONFIG_CLANG_VERSION) -ge 120000 ] && echo y),y) +KBUILD_CFLAGS += -Wformat-insufficient-args +endif +endif KBUILD_CFLAGS += -Wno-sign-compare KBUILD_CFLAGS += $(call cc-disable-warning, pointer-to-enum-cast) KBUILD_CFLAGS += -Wno-tautological-constant-out-of-range-compare diff --git a/tools/testing/selftests/kvm/include/x86_64/processor.h b/tools/testing/selftests/kvm/include/x86_64/processor.h index 45edf45821d0..0cbc71b7af50 100644 --- a/tools/testing/selftests/kvm/include/x86_64/processor.h +++ b/tools/testing/selftests/kvm/include/x86_64/processor.h @@ -754,7 +754,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, void (*handler)(struct ex_regs *)); /* If a toddler were to say "abracadabra". */ -#define KVM_EXCEPTION_MAGIC 0xabacadabaull +#define KVM_EXCEPTION_MAGIC 0xabacadabaULL /* * KVM selftest exception fixup uses registers to coordinate with the exception @@ -786,7 +786,7 @@ void vm_install_exception_handler(struct kvm_vm *vm, int vector, "lea 1f(%%rip), %%r10\n\t" \ "lea 2f(%%rip), %%r11\n\t" \ "1: " insn "\n\t" \ - "mov $0, %[vector]\n\t" \ + "movb $0, %[vector]\n\t" \ "jmp 3f\n\t" \ "2:\n\t" \ "mov %%r9b, %[vector]\n\t" \