diff --git a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml index c597ccced623..0c112f3264a9 100644 --- a/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml +++ b/Documentation/devicetree/bindings/remoteproc/qcom,adsp.yaml @@ -28,6 +28,7 @@ properties: - qcom,sc8180x-adsp-pas - qcom,sc8180x-cdsp-pas - qcom,sc8180x-mpss-pas + - qcom,sdm660-adsp-pas - qcom,sdm845-adsp-pas - qcom,sdm845-cdsp-pas - qcom,sdx55-mpss-pas diff --git a/Documentation/devicetree/bindings/watchdog/maxim,max63xx.yaml b/Documentation/devicetree/bindings/watchdog/maxim,max63xx.yaml new file mode 100644 index 000000000000..f2105eedac2c --- /dev/null +++ b/Documentation/devicetree/bindings/watchdog/maxim,max63xx.yaml @@ -0,0 +1,44 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/watchdog/maxim,max63xx.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Maxim 63xx Watchdog Timers + +allOf: + - $ref: "watchdog.yaml#" + +maintainers: + - Marc Zyngier + - Linus Walleij + +properties: + compatible: + oneOf: + - const: maxim,max6369 + - const: maxim,max6370 + - const: maxim,max6371 + - const: maxim,max6372 + - const: maxim,max6373 + - const: maxim,max6374 + + reg: + description: This is a 1-byte memory-mapped address + maxItems: 1 + +required: + - compatible + - reg + +unevaluatedProperties: false + +examples: + - | + wdt: watchdog@50000000 { + compatible = "maxim,max6369"; + reg = <0x50000000 0x1>; + timeout-sec = <10>; + }; + +... diff --git a/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt b/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt index 416d716403f6..a4e31ce96e0e 100644 --- a/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt +++ b/Documentation/devicetree/bindings/watchdog/mtk-wdt.txt @@ -13,6 +13,7 @@ Required properties: "mediatek,mt7622-wdt", "mediatek,mt6589-wdt": for MT7622 "mediatek,mt7623-wdt", "mediatek,mt6589-wdt": for MT7623 "mediatek,mt7629-wdt", "mediatek,mt6589-wdt": for MT7629 + "mediatek,mt7986-wdt", "mediatek,mt6589-wdt": for MT7986 "mediatek,mt8183-wdt": for MT8183 "mediatek,mt8516-wdt", "mediatek,mt6589-wdt": for MT8516 "mediatek,mt8192-wdt": for MT8192 diff --git a/Documentation/networking/nf_conntrack-sysctl.rst b/Documentation/networking/nf_conntrack-sysctl.rst index 34ca762ea56f..311128abb768 100644 --- a/Documentation/networking/nf_conntrack-sysctl.rst +++ b/Documentation/networking/nf_conntrack-sysctl.rst @@ -17,9 +17,8 @@ nf_conntrack_acct - BOOLEAN nf_conntrack_buckets - INTEGER Size of hash table. If not specified as parameter during module loading, the default size is calculated by dividing total memory - by 16384 to determine the number of buckets but the hash table will - never have fewer than 32 and limited to 16384 buckets. For systems - with more than 4GB of memory it will be 65536 buckets. + by 16384 to determine the number of buckets. The hash table will + never have fewer than 1024 and never more than 262144 buckets. This sysctl is only writeable in the initial net namespace. nf_conntrack_checksum - BOOLEAN @@ -100,8 +99,12 @@ nf_conntrack_log_invalid - INTEGER Log invalid packets of a type specified by value. nf_conntrack_max - INTEGER - Size of connection tracking table. Default value is - nf_conntrack_buckets value * 4. + Maximum number of allowed connection tracking entries. This value is set + to nf_conntrack_buckets by default. + Note that connection tracking entries are added to the table twice -- once + for the original direction and once for the reply direction (i.e., with + the reversed address). This means that with default settings a maxed-out + table will have a average hash chain length of 2, not 1. nf_conntrack_tcp_be_liberal - BOOLEAN - 0 - disabled (default) diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index c6212c2d5fe3..a6729c8cf063 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -3357,6 +3357,7 @@ flags which can include the following: - KVM_GUESTDBG_INJECT_DB: inject DB type exception [x86] - KVM_GUESTDBG_INJECT_BP: inject BP type exception [x86] - KVM_GUESTDBG_EXIT_PENDING: trigger an immediate guest exit [s390] + - KVM_GUESTDBG_BLOCKIRQ: avoid injecting interrupts/NMI/SMI [x86] For example KVM_GUESTDBG_USE_SW_BP indicates that software breakpoints are enabled in memory so we need to ensure breakpoint exceptions are @@ -5208,6 +5209,9 @@ by a string of size ``name_size``. #define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT) + #define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST #define KVM_STATS_UNIT_SHIFT 4 #define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) @@ -5215,18 +5219,20 @@ by a string of size ``name_size``. #define KVM_STATS_UNIT_BYTES (0x1 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_SECONDS (0x2 << KVM_STATS_UNIT_SHIFT) #define KVM_STATS_UNIT_CYCLES (0x3 << KVM_STATS_UNIT_SHIFT) + #define KVM_STATS_UNIT_MAX KVM_STATS_UNIT_CYCLES #define KVM_STATS_BASE_SHIFT 8 #define KVM_STATS_BASE_MASK (0xF << KVM_STATS_BASE_SHIFT) #define KVM_STATS_BASE_POW10 (0x0 << KVM_STATS_BASE_SHIFT) #define KVM_STATS_BASE_POW2 (0x1 << KVM_STATS_BASE_SHIFT) + #define KVM_STATS_BASE_MAX KVM_STATS_BASE_POW2 struct kvm_stats_desc { __u32 flags; __s16 exponent; __u16 size; __u32 offset; - __u32 unused; + __u32 bucket_size; char name[]; }; @@ -5237,21 +5243,35 @@ The following flags are supported: Bits 0-3 of ``flags`` encode the type: * ``KVM_STATS_TYPE_CUMULATIVE`` - The statistics data is cumulative. The value of data can only be increased. + The statistics reports a cumulative count. The value of data can only be increased. Most of the counters used in KVM are of this type. The corresponding ``size`` field for this type is always 1. All cumulative statistics data are read/write. * ``KVM_STATS_TYPE_INSTANT`` - The statistics data is instantaneous. Its value can be increased or + The statistics reports an instantaneous value. Its value can be increased or decreased. This type is usually used as a measurement of some resources, like the number of dirty pages, the number of large pages, etc. All instant statistics are read only. The corresponding ``size`` field for this type is always 1. * ``KVM_STATS_TYPE_PEAK`` - The statistics data is peak. The value of data can only be increased, and - represents a peak value for a measurement, for example the maximum number + The statistics data reports a peak value, for example the maximum number of items in a hash table bucket, the longest time waited and so on. + The value of data can only be increased. The corresponding ``size`` field for this type is always 1. + * ``KVM_STATS_TYPE_LINEAR_HIST`` + The statistic is reported as a linear histogram. The number of + buckets is specified by the ``size`` field. The size of buckets is specified + by the ``hist_param`` field. The range of the Nth bucket (1 <= N < ``size``) + is [``hist_param``*(N-1), ``hist_param``*N), while the range of the last + bucket is [``hist_param``*(``size``-1), +INF). (+INF means positive infinity + value.) The bucket value indicates how many samples fell in the bucket's range. + * ``KVM_STATS_TYPE_LOG_HIST`` + The statistic is reported as a logarithmic histogram. The number of + buckets is specified by the ``size`` field. The range of the first bucket is + [0, 1), while the range of the last bucket is [pow(2, ``size``-2), +INF). + Otherwise, The Nth bucket (1 < N < ``size``) covers + [pow(2, N-2), pow(2, N-1)). The bucket value indicates how many samples fell + in the bucket's range. Bits 4-7 of ``flags`` encode the unit: @@ -5286,9 +5306,9 @@ unsigned 64bit data. The ``offset`` field is the offset from the start of Data Block to the start of the corresponding statistics data. -The ``unused`` field is reserved for future support for other types of -statistics data, like log/linear histogram. Its value is always 0 for the types -defined above. +The ``bucket_size`` field is used as a parameter for histogram statistics data. +It is only used by linear histogram statistics data, specifying the size of a +bucket. The ``name`` field is the name string of the statistics data. The name string starts at the end of ``struct kvm_stats_desc``. The maximum length including diff --git a/Documentation/virt/kvm/locking.rst b/Documentation/virt/kvm/locking.rst index 88fa495abbac..5d27da356836 100644 --- a/Documentation/virt/kvm/locking.rst +++ b/Documentation/virt/kvm/locking.rst @@ -21,6 +21,12 @@ The acquisition orders for mutexes are as follows: can be taken inside a kvm->srcu read-side critical section, while kvm->slots_lock cannot. +- kvm->mn_active_invalidate_count ensures that pairs of + invalidate_range_start() and invalidate_range_end() callbacks + use the same memslots array. kvm->slots_lock and kvm->slots_arch_lock + are taken on the waiting side in install_new_memslots, so MMU notifiers + must not take either kvm->slots_lock or kvm->slots_arch_lock. + On x86: - vcpu->mutex is taken outside kvm->arch.hyperv.hv_lock diff --git a/MAINTAINERS b/MAINTAINERS index fc7e024d3d5a..fa30f4098d29 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -19774,18 +19774,11 @@ L: kvm@vger.kernel.org L: virtualization@lists.linux-foundation.org L: netdev@vger.kernel.org S: Maintained -F: drivers/net/vsockmon.c F: drivers/vhost/vsock.c F: include/linux/virtio_vsock.h F: include/uapi/linux/virtio_vsock.h -F: include/uapi/linux/vm_sockets_diag.h -F: include/uapi/linux/vsockmon.h -F: net/vmw_vsock/af_vsock_tap.c -F: net/vmw_vsock/diag.c F: net/vmw_vsock/virtio_transport.c F: net/vmw_vsock/virtio_transport_common.c -F: net/vmw_vsock/vsock_loopback.c -F: tools/testing/vsock/ VIRTIO BLOCK AND SCSI DRIVERS M: "Michael S. Tsirkin" @@ -19990,6 +19983,19 @@ F: drivers/staging/vme/ F: drivers/vme/ F: include/linux/vme* +VM SOCKETS (AF_VSOCK) +M: Stefano Garzarella +L: virtualization@lists.linux-foundation.org +L: netdev@vger.kernel.org +S: Maintained +F: drivers/net/vsockmon.c +F: include/net/af_vsock.h +F: include/uapi/linux/vm_sockets.h +F: include/uapi/linux/vm_sockets_diag.h +F: include/uapi/linux/vsockmon.h +F: net/vmw_vsock/ +F: tools/testing/vsock/ + VMWARE BALLOON DRIVER M: Nadav Amit M: "VMware, Inc." diff --git a/arch/arm64/include/asm/cpufeature.h b/arch/arm64/include/asm/cpufeature.h index cdfa2a242e9f..ef6be92b1921 100644 --- a/arch/arm64/include/asm/cpufeature.h +++ b/arch/arm64/include/asm/cpufeature.h @@ -602,14 +602,14 @@ static inline bool id_aa64pfr0_32bit_el1(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL1_SHIFT); - return val == ID_AA64PFR0_EL1_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_32bit_el0(u64 pfr0) { u32 val = cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_EL0_SHIFT); - return val == ID_AA64PFR0_EL0_32BIT_64BIT; + return val == ID_AA64PFR0_ELx_32BIT_64BIT; } static inline bool id_aa64pfr0_sve(u64 pfr0) @@ -784,13 +784,13 @@ extern int do_emulate_mrs(struct pt_regs *regs, u32 sys_reg, u32 rt); static inline u32 id_aa64mmfr0_parange_to_phys_shift(int parange) { switch (parange) { - case 0: return 32; - case 1: return 36; - case 2: return 40; - case 3: return 42; - case 4: return 44; - case 5: return 48; - case 6: return 52; + case ID_AA64MMFR0_PARANGE_32: return 32; + case ID_AA64MMFR0_PARANGE_36: return 36; + case ID_AA64MMFR0_PARANGE_40: return 40; + case ID_AA64MMFR0_PARANGE_42: return 42; + case ID_AA64MMFR0_PARANGE_44: return 44; + case ID_AA64MMFR0_PARANGE_48: return 48; + case ID_AA64MMFR0_PARANGE_52: return 52; /* * A future PE could use a value unknown to the kernel. * However, by the "D10.1.4 Principles of the ID scheme diff --git a/arch/arm64/include/asm/kvm_arm.h b/arch/arm64/include/asm/kvm_arm.h index d436831dd706..327120c0089f 100644 --- a/arch/arm64/include/asm/kvm_arm.h +++ b/arch/arm64/include/asm/kvm_arm.h @@ -12,8 +12,13 @@ #include /* Hyp Configuration Register (HCR) bits */ + +#define HCR_TID5 (UL(1) << 58) +#define HCR_DCT (UL(1) << 57) #define HCR_ATA_SHIFT 56 #define HCR_ATA (UL(1) << HCR_ATA_SHIFT) +#define HCR_AMVOFFEN (UL(1) << 51) +#define HCR_FIEN (UL(1) << 47) #define HCR_FWB (UL(1) << 46) #define HCR_API (UL(1) << 41) #define HCR_APK (UL(1) << 40) @@ -32,9 +37,9 @@ #define HCR_TVM (UL(1) << 26) #define HCR_TTLB (UL(1) << 25) #define HCR_TPU (UL(1) << 24) -#define HCR_TPC (UL(1) << 23) +#define HCR_TPC (UL(1) << 23) /* HCR_TPCP if FEAT_DPB */ #define HCR_TSW (UL(1) << 22) -#define HCR_TAC (UL(1) << 21) +#define HCR_TACR (UL(1) << 21) #define HCR_TIDCP (UL(1) << 20) #define HCR_TSC (UL(1) << 19) #define HCR_TID3 (UL(1) << 18) @@ -56,12 +61,13 @@ #define HCR_PTW (UL(1) << 2) #define HCR_SWIO (UL(1) << 1) #define HCR_VM (UL(1) << 0) +#define HCR_RES0 ((UL(1) << 48) | (UL(1) << 39)) /* * The bits we set in HCR: * TLOR: Trap LORegion register accesses * RW: 64bit by default, can be overridden for 32bit VMs - * TAC: Trap ACTLR + * TACR: Trap ACTLR * TSC: Trap SMC * TSW: Trap cache operations by set/way * TWE: Trap WFE @@ -76,7 +82,7 @@ * PTW: Take a stage2 fault if a stage1 walk steps in device memory */ #define HCR_GUEST_FLAGS (HCR_TSC | HCR_TSW | HCR_TWE | HCR_TWI | HCR_VM | \ - HCR_BSU_IS | HCR_FB | HCR_TAC | \ + HCR_BSU_IS | HCR_FB | HCR_TACR | \ HCR_AMO | HCR_SWIO | HCR_TIDCP | HCR_RW | HCR_TLOR | \ HCR_FMO | HCR_IMO | HCR_PTW ) #define HCR_VIRT_EXCP_MASK (HCR_VSE | HCR_VI | HCR_VF) @@ -275,24 +281,40 @@ #define CPTR_EL2_TTA (1 << 20) #define CPTR_EL2_TFP (1 << CPTR_EL2_TFP_SHIFT) #define CPTR_EL2_TZ (1 << 8) -#define CPTR_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 */ -#define CPTR_EL2_DEFAULT CPTR_EL2_RES1 +#define CPTR_NVHE_EL2_RES1 0x000032ff /* known RES1 bits in CPTR_EL2 (nVHE) */ +#define CPTR_EL2_DEFAULT CPTR_NVHE_EL2_RES1 +#define CPTR_NVHE_EL2_RES0 (GENMASK(63, 32) | \ + GENMASK(29, 21) | \ + GENMASK(19, 14) | \ + BIT(11)) /* Hyp Debug Configuration Register bits */ #define MDCR_EL2_E2TB_MASK (UL(0x3)) #define MDCR_EL2_E2TB_SHIFT (UL(24)) -#define MDCR_EL2_TTRF (1 << 19) -#define MDCR_EL2_TPMS (1 << 14) +#define MDCR_EL2_HPMFZS (UL(1) << 36) +#define MDCR_EL2_HPMFZO (UL(1) << 29) +#define MDCR_EL2_MTPME (UL(1) << 28) +#define MDCR_EL2_TDCC (UL(1) << 27) +#define MDCR_EL2_HCCD (UL(1) << 23) +#define MDCR_EL2_TTRF (UL(1) << 19) +#define MDCR_EL2_HPMD (UL(1) << 17) +#define MDCR_EL2_TPMS (UL(1) << 14) #define MDCR_EL2_E2PB_MASK (UL(0x3)) #define MDCR_EL2_E2PB_SHIFT (UL(12)) -#define MDCR_EL2_TDRA (1 << 11) -#define MDCR_EL2_TDOSA (1 << 10) -#define MDCR_EL2_TDA (1 << 9) -#define MDCR_EL2_TDE (1 << 8) -#define MDCR_EL2_HPME (1 << 7) -#define MDCR_EL2_TPM (1 << 6) -#define MDCR_EL2_TPMCR (1 << 5) -#define MDCR_EL2_HPMN_MASK (0x1F) +#define MDCR_EL2_TDRA (UL(1) << 11) +#define MDCR_EL2_TDOSA (UL(1) << 10) +#define MDCR_EL2_TDA (UL(1) << 9) +#define MDCR_EL2_TDE (UL(1) << 8) +#define MDCR_EL2_HPME (UL(1) << 7) +#define MDCR_EL2_TPM (UL(1) << 6) +#define MDCR_EL2_TPMCR (UL(1) << 5) +#define MDCR_EL2_HPMN_MASK (UL(0x1F)) +#define MDCR_EL2_RES0 (GENMASK(63, 37) | \ + GENMASK(35, 30) | \ + GENMASK(25, 24) | \ + GENMASK(22, 20) | \ + BIT(18) | \ + GENMASK(16, 15)) /* For compatibility with fault code shared with 32-bit */ #define FSC_FAULT ESR_ELx_FSC_FAULT diff --git a/arch/arm64/include/asm/kvm_asm.h b/arch/arm64/include/asm/kvm_asm.h index 9f0bf2109be7..e86045ac43ba 100644 --- a/arch/arm64/include/asm/kvm_asm.h +++ b/arch/arm64/include/asm/kvm_asm.h @@ -59,12 +59,11 @@ #define __KVM_HOST_SMCCC_FUNC___vgic_v3_save_aprs 13 #define __KVM_HOST_SMCCC_FUNC___vgic_v3_restore_aprs 14 #define __KVM_HOST_SMCCC_FUNC___pkvm_init 15 -#define __KVM_HOST_SMCCC_FUNC___pkvm_create_mappings 16 +#define __KVM_HOST_SMCCC_FUNC___pkvm_host_share_hyp 16 #define __KVM_HOST_SMCCC_FUNC___pkvm_create_private_mapping 17 #define __KVM_HOST_SMCCC_FUNC___pkvm_cpu_set_vector 18 #define __KVM_HOST_SMCCC_FUNC___pkvm_prot_finalize 19 -#define __KVM_HOST_SMCCC_FUNC___pkvm_mark_hyp 20 -#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 21 +#define __KVM_HOST_SMCCC_FUNC___kvm_adjust_pc 20 #ifndef __ASSEMBLY__ @@ -210,7 +209,7 @@ extern u64 __vgic_v3_read_vmcr(void); extern void __vgic_v3_write_vmcr(u32 vmcr); extern void __vgic_v3_init_lrs(void); -extern u32 __kvm_get_mdcr_el2(void); +extern u64 __kvm_get_mdcr_el2(void); #define __KVM_EXTABLE(from, to) \ " .pushsection __kvm_ex_table, \"a\"\n" \ diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 41911585ae0c..f8be56d5342b 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -66,7 +66,7 @@ DECLARE_STATIC_KEY_FALSE(userspace_irqchip_in_use); extern unsigned int kvm_sve_max_vl; int kvm_arm_init_sve(void); -int __attribute_const__ kvm_target_cpu(void); +u32 __attribute_const__ kvm_target_cpu(void); int kvm_reset_vcpu(struct kvm_vcpu *vcpu); void kvm_arm_vcpu_destroy(struct kvm_vcpu *vcpu); @@ -185,7 +185,6 @@ enum vcpu_sysreg { PMCNTENSET_EL0, /* Count Enable Set Register */ PMINTENSET_EL1, /* Interrupt Enable Set Register */ PMOVSSET_EL0, /* Overflow Flag Status Set Register */ - PMSWINC_EL0, /* Software Increment Register */ PMUSERENR_EL0, /* User Enable Register */ /* Pointer Authentication Registers in a strict increasing order. */ @@ -287,9 +286,13 @@ struct kvm_vcpu_arch { /* Stage 2 paging state used by the hardware on next switch */ struct kvm_s2_mmu *hw_mmu; - /* HYP configuration */ + /* Values of trap registers for the guest. */ u64 hcr_el2; - u32 mdcr_el2; + u64 mdcr_el2; + u64 cptr_el2; + + /* Values of trap registers for the host before guest entry. */ + u64 mdcr_el2_host; /* Exception Information */ struct kvm_vcpu_fault_info fault; @@ -576,6 +579,7 @@ struct kvm_vcpu_stat { u64 wfi_exit_stat; u64 mmio_exit_user; u64 mmio_exit_kernel; + u64 signal_exits; u64 exits; }; @@ -771,6 +775,11 @@ void kvm_arch_free_vm(struct kvm *kvm); int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type); +static inline bool kvm_vm_is_protected(struct kvm *kvm) +{ + return false; +} + int kvm_arm_vcpu_finalize(struct kvm_vcpu *vcpu, int feature); bool kvm_arm_vcpu_is_finalized(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_hyp.h b/arch/arm64/include/asm/kvm_hyp.h index 9d60b3006efc..657d0c94cf82 100644 --- a/arch/arm64/include/asm/kvm_hyp.h +++ b/arch/arm64/include/asm/kvm_hyp.h @@ -95,7 +95,7 @@ void __sve_restore_state(void *sve_pffr, u32 *fpsr); #ifndef __KVM_NVHE_HYPERVISOR__ void activate_traps_vhe_load(struct kvm_vcpu *vcpu); -void deactivate_traps_vhe_put(void); +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu); #endif u64 __guest_enter(struct kvm_vcpu *vcpu); diff --git a/arch/arm64/include/asm/kvm_mmu.h b/arch/arm64/include/asm/kvm_mmu.h index b52c5c4b9a3d..02d378887743 100644 --- a/arch/arm64/include/asm/kvm_mmu.h +++ b/arch/arm64/include/asm/kvm_mmu.h @@ -252,6 +252,11 @@ static inline int kvm_write_guest_lock(struct kvm *kvm, gpa_t gpa, #define kvm_phys_to_vttbr(addr) phys_to_ttbr(addr) +/* + * When this is (directly or indirectly) used on the TLB invalidation + * path, we rely on a previously issued DSB so that page table updates + * and VMID reads are correctly ordered. + */ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) { struct kvm_vmid *vmid = &mmu->vmid; @@ -259,7 +264,7 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) u64 cnp = system_supports_cnp() ? VTTBR_CNP_BIT : 0; baddr = mmu->pgd_phys; - vmid_field = (u64)vmid->vmid << VTTBR_VMID_SHIFT; + vmid_field = (u64)READ_ONCE(vmid->vmid) << VTTBR_VMID_SHIFT; return kvm_phys_to_vttbr(baddr) | vmid_field | cnp; } @@ -267,9 +272,10 @@ static __always_inline u64 kvm_get_vttbr(struct kvm_s2_mmu *mmu) * Must be called from hyp code running at EL2 with an updated VTTBR * and interrupts disabled. */ -static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long vtcr) +static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, + struct kvm_arch *arch) { - write_sysreg(vtcr, vtcr_el2); + write_sysreg(arch->vtcr, vtcr_el2); write_sysreg(kvm_get_vttbr(mmu), vttbr_el2); /* @@ -280,11 +286,6 @@ static __always_inline void __load_stage2(struct kvm_s2_mmu *mmu, unsigned long asm(ALTERNATIVE("nop", "isb", ARM64_WORKAROUND_SPECULATIVE_AT)); } -static __always_inline void __load_guest_stage2(struct kvm_s2_mmu *mmu) -{ - __load_stage2(mmu, kern_hyp_va(mmu->arch)->vtcr); -} - static inline struct kvm *kvm_s2_mmu_to_kvm(struct kvm_s2_mmu *mmu) { return container_of(mmu->arch, struct kvm, arch); diff --git a/arch/arm64/include/asm/kvm_pgtable.h b/arch/arm64/include/asm/kvm_pgtable.h index f004c0115d89..027783829584 100644 --- a/arch/arm64/include/asm/kvm_pgtable.h +++ b/arch/arm64/include/asm/kvm_pgtable.h @@ -25,6 +25,46 @@ static inline u64 kvm_get_parange(u64 mmfr0) typedef u64 kvm_pte_t; +#define KVM_PTE_VALID BIT(0) + +#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) +#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) + +static inline bool kvm_pte_valid(kvm_pte_t pte) +{ + return pte & KVM_PTE_VALID; +} + +static inline u64 kvm_pte_to_phys(kvm_pte_t pte) +{ + u64 pa = pte & KVM_PTE_ADDR_MASK; + + if (PAGE_SHIFT == 16) + pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; + + return pa; +} + +static inline u64 kvm_granule_shift(u32 level) +{ + /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ + return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); +} + +static inline u64 kvm_granule_size(u32 level) +{ + return BIT(kvm_granule_shift(level)); +} + +static inline bool kvm_level_supports_block_mapping(u32 level) +{ + /* + * Reject invalid block mappings and don't bother with 4TB mappings for + * 52-bit PAs. + */ + return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); +} + /** * struct kvm_pgtable_mm_ops - Memory management callbacks. * @zalloc_page: Allocate a single zeroed memory page. @@ -75,6 +115,44 @@ enum kvm_pgtable_stage2_flags { KVM_PGTABLE_S2_IDMAP = BIT(1), }; +/** + * enum kvm_pgtable_prot - Page-table permissions and attributes. + * @KVM_PGTABLE_PROT_X: Execute permission. + * @KVM_PGTABLE_PROT_W: Write permission. + * @KVM_PGTABLE_PROT_R: Read permission. + * @KVM_PGTABLE_PROT_DEVICE: Device attributes. + * @KVM_PGTABLE_PROT_SW0: Software bit 0. + * @KVM_PGTABLE_PROT_SW1: Software bit 1. + * @KVM_PGTABLE_PROT_SW2: Software bit 2. + * @KVM_PGTABLE_PROT_SW3: Software bit 3. + */ +enum kvm_pgtable_prot { + KVM_PGTABLE_PROT_X = BIT(0), + KVM_PGTABLE_PROT_W = BIT(1), + KVM_PGTABLE_PROT_R = BIT(2), + + KVM_PGTABLE_PROT_DEVICE = BIT(3), + + KVM_PGTABLE_PROT_SW0 = BIT(55), + KVM_PGTABLE_PROT_SW1 = BIT(56), + KVM_PGTABLE_PROT_SW2 = BIT(57), + KVM_PGTABLE_PROT_SW3 = BIT(58), +}; + +#define KVM_PGTABLE_PROT_RW (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) +#define KVM_PGTABLE_PROT_RWX (KVM_PGTABLE_PROT_RW | KVM_PGTABLE_PROT_X) + +#define PKVM_HOST_MEM_PROT KVM_PGTABLE_PROT_RWX +#define PKVM_HOST_MMIO_PROT KVM_PGTABLE_PROT_RW + +#define PAGE_HYP KVM_PGTABLE_PROT_RW +#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) +#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) +#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) + +typedef bool (*kvm_pgtable_force_pte_cb_t)(u64 addr, u64 end, + enum kvm_pgtable_prot prot); + /** * struct kvm_pgtable - KVM page-table. * @ia_bits: Maximum input address size, in bits. @@ -82,6 +160,9 @@ enum kvm_pgtable_stage2_flags { * @pgd: Pointer to the first top-level entry of the page-table. * @mm_ops: Memory management callbacks. * @mmu: Stage-2 KVM MMU struct. Unused for stage-1 page-tables. + * @flags: Stage-2 page-table flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. */ struct kvm_pgtable { u32 ia_bits; @@ -92,36 +173,7 @@ struct kvm_pgtable { /* Stage-2 only */ struct kvm_s2_mmu *mmu; enum kvm_pgtable_stage2_flags flags; -}; - -/** - * enum kvm_pgtable_prot - Page-table permissions and attributes. - * @KVM_PGTABLE_PROT_X: Execute permission. - * @KVM_PGTABLE_PROT_W: Write permission. - * @KVM_PGTABLE_PROT_R: Read permission. - * @KVM_PGTABLE_PROT_DEVICE: Device attributes. - */ -enum kvm_pgtable_prot { - KVM_PGTABLE_PROT_X = BIT(0), - KVM_PGTABLE_PROT_W = BIT(1), - KVM_PGTABLE_PROT_R = BIT(2), - - KVM_PGTABLE_PROT_DEVICE = BIT(3), -}; - -#define PAGE_HYP (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W) -#define PAGE_HYP_EXEC (KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_X) -#define PAGE_HYP_RO (KVM_PGTABLE_PROT_R) -#define PAGE_HYP_DEVICE (PAGE_HYP | KVM_PGTABLE_PROT_DEVICE) - -/** - * struct kvm_mem_range - Range of Intermediate Physical Addresses - * @start: Start of the range. - * @end: End of the range. - */ -struct kvm_mem_range { - u64 start; - u64 end; + kvm_pgtable_force_pte_cb_t force_pte_cb; }; /** @@ -216,21 +268,24 @@ int kvm_pgtable_hyp_map(struct kvm_pgtable *pgt, u64 addr, u64 size, u64 phys, u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift); /** - * kvm_pgtable_stage2_init_flags() - Initialise a guest stage-2 page-table. + * __kvm_pgtable_stage2_init() - Initialise a guest stage-2 page-table. * @pgt: Uninitialised page-table structure to initialise. * @arch: Arch-specific KVM structure representing the guest virtual * machine. * @mm_ops: Memory management callbacks. * @flags: Stage-2 configuration flags. + * @force_pte_cb: Function that returns true if page level mappings must + * be used instead of block mappings. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, - struct kvm_pgtable_mm_ops *mm_ops, - enum kvm_pgtable_stage2_flags flags); +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb); #define kvm_pgtable_stage2_init(pgt, arch, mm_ops) \ - kvm_pgtable_stage2_init_flags(pgt, arch, mm_ops, 0) + __kvm_pgtable_stage2_init(pgt, arch, mm_ops, 0, NULL) /** * kvm_pgtable_stage2_destroy() - Destroy an unused guest stage-2 page-table. @@ -374,7 +429,8 @@ kvm_pte_t kvm_pgtable_stage2_mkold(struct kvm_pgtable *pgt, u64 addr); * If there is a valid, leaf page-table entry used to translate @addr, then * relax the permissions in that entry according to the read, write and * execute permissions specified by @prot. No permissions are removed, and - * TLB invalidation is performed after updating the entry. + * TLB invalidation is performed after updating the entry. Software bits cannot + * be set or cleared using kvm_pgtable_stage2_relax_perms(). * * Return: 0 on success, negative error code on failure. */ @@ -433,22 +489,42 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, struct kvm_pgtable_walker *walker); /** - * kvm_pgtable_stage2_find_range() - Find a range of Intermediate Physical - * Addresses with compatible permission - * attributes. - * @pgt: Page-table structure initialised by kvm_pgtable_stage2_init*(). - * @addr: Address that must be covered by the range. - * @prot: Protection attributes that the range must be compatible with. - * @range: Range structure used to limit the search space at call time and - * that will hold the result. + * kvm_pgtable_get_leaf() - Walk a page-table and retrieve the leaf entry + * with its level. + * @pgt: Page-table structure initialised by kvm_pgtable_*_init() + * or a similar initialiser. + * @addr: Input address for the start of the walk. + * @ptep: Pointer to storage for the retrieved PTE. + * @level: Pointer to storage for the level of the retrieved PTE. * - * The offset of @addr within a page is ignored. An IPA is compatible with @prot - * iff its corresponding stage-2 page-table entry has default ownership and, if - * valid, is mapped with protection attributes identical to @prot. + * The offset of @addr within a page is ignored. + * + * The walker will walk the page-table entries corresponding to the input + * address specified, retrieving the leaf corresponding to this address. + * Invalid entries are treated as leaf entries. * * Return: 0 on success, negative error code on failure. */ -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot, - struct kvm_mem_range *range); +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, u32 *level); + +/** + * kvm_pgtable_stage2_pte_prot() - Retrieve the protection attributes of a + * stage-2 Page-Table Entry. + * @pte: Page-table entry + * + * Return: protection attributes of the page-table entry in the enum + * kvm_pgtable_prot format. + */ +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte); + +/** + * kvm_pgtable_hyp_pte_prot() - Retrieve the protection attributes of a stage-1 + * Page-Table Entry. + * @pte: Page-table entry + * + * Return: protection attributes of the page-table entry in the enum + * kvm_pgtable_prot format. + */ +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte); #endif /* __ARM64_KVM_PGTABLE_H__ */ diff --git a/arch/arm64/include/asm/sysreg.h b/arch/arm64/include/asm/sysreg.h index f2e06e7c0a31..b268082d67ed 100644 --- a/arch/arm64/include/asm/sysreg.h +++ b/arch/arm64/include/asm/sysreg.h @@ -784,14 +784,13 @@ #define ID_AA64PFR0_AMU 0x1 #define ID_AA64PFR0_SVE 0x1 #define ID_AA64PFR0_RAS_V1 0x1 +#define ID_AA64PFR0_RAS_V1P1 0x2 #define ID_AA64PFR0_FP_NI 0xf #define ID_AA64PFR0_FP_SUPPORTED 0x0 #define ID_AA64PFR0_ASIMD_NI 0xf #define ID_AA64PFR0_ASIMD_SUPPORTED 0x0 -#define ID_AA64PFR0_EL1_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL1_32BIT_64BIT 0x2 -#define ID_AA64PFR0_EL0_64BIT_ONLY 0x1 -#define ID_AA64PFR0_EL0_32BIT_64BIT 0x2 +#define ID_AA64PFR0_ELx_64BIT_ONLY 0x1 +#define ID_AA64PFR0_ELx_32BIT_64BIT 0x2 /* id_aa64pfr1 */ #define ID_AA64PFR1_MPAMFRAC_SHIFT 16 @@ -847,6 +846,9 @@ #define ID_AA64MMFR0_ASID_SHIFT 4 #define ID_AA64MMFR0_PARANGE_SHIFT 0 +#define ID_AA64MMFR0_ASID_8 0x0 +#define ID_AA64MMFR0_ASID_16 0x2 + #define ID_AA64MMFR0_TGRAN4_NI 0xf #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN 0x0 #define ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX 0x7 @@ -857,9 +859,16 @@ #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN 0x1 #define ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX 0xf +#define ID_AA64MMFR0_PARANGE_32 0x0 +#define ID_AA64MMFR0_PARANGE_36 0x1 +#define ID_AA64MMFR0_PARANGE_40 0x2 +#define ID_AA64MMFR0_PARANGE_42 0x3 +#define ID_AA64MMFR0_PARANGE_44 0x4 #define ID_AA64MMFR0_PARANGE_48 0x5 #define ID_AA64MMFR0_PARANGE_52 0x6 +#define ARM64_MIN_PARANGE_BITS 32 + #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_DEFAULT 0x0 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE 0x1 #define ID_AA64MMFR0_TGRAN_2_SUPPORTED_MIN 0x2 @@ -904,6 +913,7 @@ #define ID_AA64MMFR2_CNP_SHIFT 0 /* id_aa64dfr0 */ +#define ID_AA64DFR0_MTPMU_SHIFT 48 #define ID_AA64DFR0_TRBE_SHIFT 44 #define ID_AA64DFR0_TRACE_FILT_SHIFT 40 #define ID_AA64DFR0_DOUBLELOCK_SHIFT 36 @@ -1034,14 +1044,17 @@ #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN4_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN4_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN4_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN4_2_SHIFT #elif defined(CONFIG_ARM64_16K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN16_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN16_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN16_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN16_2_SHIFT #elif defined(CONFIG_ARM64_64K_PAGES) #define ID_AA64MMFR0_TGRAN_SHIFT ID_AA64MMFR0_TGRAN64_SHIFT #define ID_AA64MMFR0_TGRAN_SUPPORTED_MIN ID_AA64MMFR0_TGRAN64_SUPPORTED_MIN #define ID_AA64MMFR0_TGRAN_SUPPORTED_MAX ID_AA64MMFR0_TGRAN64_SUPPORTED_MAX +#define ID_AA64MMFR0_TGRAN_2_SHIFT ID_AA64MMFR0_TGRAN64_2_SHIFT #endif #define MVFR2_FPMISC_SHIFT 4 @@ -1172,6 +1185,11 @@ #define ICH_VTR_A3V_SHIFT 21 #define ICH_VTR_A3V_MASK (1 << ICH_VTR_A3V_SHIFT) +#define ARM64_FEATURE_FIELD_BITS 4 + +/* Create a mask for the feature bits of the specified feature. */ +#define ARM64_FEATURE_MASK(x) (GENMASK_ULL(x##_SHIFT + ARM64_FEATURE_FIELD_BITS - 1, x##_SHIFT)) + #ifdef __ASSEMBLY__ .irp num,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30 diff --git a/arch/arm64/kernel/cpufeature.c b/arch/arm64/kernel/cpufeature.c index b2770d753ba3..f8a3067d10c6 100644 --- a/arch/arm64/kernel/cpufeature.c +++ b/arch/arm64/kernel/cpufeature.c @@ -240,8 +240,8 @@ static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL3_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL2_SHIFT, 4, 0), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_EL1_64BIT_ONLY), - ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_EL0_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL1_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_EL0_SHIFT, 4, ID_AA64PFR0_ELx_64BIT_ONLY), ARM64_FTR_END, }; @@ -1983,7 +1983,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL0_SHIFT, - .min_field_value = ID_AA64PFR0_EL0_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, #ifdef CONFIG_KVM { @@ -1994,7 +1994,7 @@ static const struct arm64_cpu_capabilities arm64_features[] = { .sys_reg = SYS_ID_AA64PFR0_EL1, .sign = FTR_UNSIGNED, .field_pos = ID_AA64PFR0_EL1_SHIFT, - .min_field_value = ID_AA64PFR0_EL1_32BIT_64BIT, + .min_field_value = ID_AA64PFR0_ELx_32BIT_64BIT, }, { .desc = "Protected KVM", diff --git a/arch/arm64/kernel/vmlinux.lds.S b/arch/arm64/kernel/vmlinux.lds.S index 709d2c433c5e..f6b1a88245db 100644 --- a/arch/arm64/kernel/vmlinux.lds.S +++ b/arch/arm64/kernel/vmlinux.lds.S @@ -181,6 +181,8 @@ SECTIONS /* everything from this point to __init_begin will be marked RO NX */ RO_DATA(PAGE_SIZE) + HYPERVISOR_DATA_SECTIONS + idmap_pg_dir = .; . += IDMAP_DIR_SIZE; idmap_pg_end = .; @@ -260,8 +262,6 @@ SECTIONS _sdata = .; RW_DATA(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) - HYPERVISOR_DATA_SECTIONS - /* * Data written with the MMU off but read with the MMU on requires * cache lines to be invalidated, discarding up to a Cache Writeback diff --git a/arch/arm64/kvm/Kconfig b/arch/arm64/kvm/Kconfig index a4eba0908bfa..d7eec0b43744 100644 --- a/arch/arm64/kvm/Kconfig +++ b/arch/arm64/kvm/Kconfig @@ -26,6 +26,7 @@ menuconfig KVM select HAVE_KVM_ARCH_TLB_FLUSH_ALL select KVM_MMIO select KVM_GENERIC_DIRTYLOG_READ_PROTECT + select KVM_XFER_TO_GUEST_WORK select SRCU select KVM_VFIO select HAVE_KVM_EVENTFD @@ -46,6 +47,15 @@ if KVM source "virt/kvm/Kconfig" +config NVHE_EL2_DEBUG + bool "Debug mode for non-VHE EL2 object" + help + Say Y here to enable the debug mode for the non-VHE KVM EL2 object. + Failure reports will BUG() in the hypervisor. This is intended for + local EL2 hypervisor development. + + If unsure, say N. + endif # KVM endif # VIRTUALIZATION diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 0ca72f5cda41..fe102cd2e518 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -6,6 +6,7 @@ #include #include +#include #include #include #include @@ -15,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -42,10 +44,6 @@ #include #include -#ifdef REQUIRES_VIRT -__asm__(".arch_extension virt"); -#endif - static enum kvm_mode kvm_mode = KVM_MODE_DEFAULT; DEFINE_STATIC_KEY_FALSE(kvm_protected_mode_initialized); @@ -575,7 +573,7 @@ static void update_vmid(struct kvm_vmid *vmid) kvm_call_hyp(__kvm_flush_vm_context); } - vmid->vmid = kvm_next_vmid; + WRITE_ONCE(vmid->vmid, kvm_next_vmid); kvm_next_vmid++; kvm_next_vmid &= (1 << kvm_get_vmid_bits()) - 1; @@ -718,6 +716,45 @@ static bool vcpu_mode_is_bad_32bit(struct kvm_vcpu *vcpu) static_branch_unlikely(&arm64_mismatched_32bit_el0); } +/** + * kvm_vcpu_exit_request - returns true if the VCPU should *not* enter the guest + * @vcpu: The VCPU pointer + * @ret: Pointer to write optional return code + * + * Returns: true if the VCPU needs to return to a preemptible + interruptible + * and skip guest entry. + * + * This function disambiguates between two different types of exits: exits to a + * preemptible + interruptible kernel context and exits to userspace. For an + * exit to userspace, this function will write the return code to ret and return + * true. For an exit to preemptible + interruptible kernel context (i.e. check + * for pending work and re-enter), return true without writing to ret. + */ +static bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu, int *ret) +{ + struct kvm_run *run = vcpu->run; + + /* + * If we're using a userspace irqchip, then check if we need + * to tell a userspace irqchip about timer or PMU level + * changes and if so, exit to userspace (the actual level + * state gets updated in kvm_timer_update_run and + * kvm_pmu_update_run below). + */ + if (static_branch_unlikely(&userspace_irqchip_in_use)) { + if (kvm_timer_should_notify_user(vcpu) || + kvm_pmu_should_notify_user(vcpu)) { + *ret = -EINTR; + run->exit_reason = KVM_EXIT_INTR; + return true; + } + } + + return kvm_request_pending(vcpu) || + need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || + xfer_to_guest_mode_work_pending(); +} + /** * kvm_arch_vcpu_ioctl_run - the main VCPU run function to execute guest code * @vcpu: The VCPU pointer @@ -761,7 +798,9 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* * Check conditions before entering the guest */ - cond_resched(); + ret = xfer_to_guest_mode_handle_work(vcpu); + if (!ret) + ret = 1; update_vmid(&vcpu->arch.hw_mmu->vmid); @@ -780,30 +819,6 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_vgic_flush_hwstate(vcpu); - /* - * Exit if we have a signal pending so that we can deliver the - * signal to user space. - */ - if (signal_pending(current)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - - /* - * If we're using a userspace irqchip, then check if we need - * to tell a userspace irqchip about timer or PMU level - * changes and if so, exit to userspace (the actual level - * state gets updated in kvm_timer_update_run and - * kvm_pmu_update_run below). - */ - if (static_branch_unlikely(&userspace_irqchip_in_use)) { - if (kvm_timer_should_notify_user(vcpu) || - kvm_pmu_should_notify_user(vcpu)) { - ret = -EINTR; - run->exit_reason = KVM_EXIT_INTR; - } - } - /* * Ensure we set mode to IN_GUEST_MODE after we disable * interrupts and before the final VCPU requests check. @@ -812,8 +827,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) */ smp_store_mb(vcpu->mode, IN_GUEST_MODE); - if (ret <= 0 || need_new_vmid_gen(&vcpu->arch.hw_mmu->vmid) || - kvm_request_pending(vcpu)) { + if (ret <= 0 || kvm_vcpu_exit_request(vcpu, &ret)) { vcpu->mode = OUTSIDE_GUEST_MODE; isb(); /* Ensure work in x_flush_hwstate is committed */ kvm_pmu_sync_hwstate(vcpu); @@ -1039,7 +1053,7 @@ static int kvm_vcpu_set_target(struct kvm_vcpu *vcpu, const struct kvm_vcpu_init *init) { unsigned int i, ret; - int phys_target = kvm_target_cpu(); + u32 phys_target = kvm_target_cpu(); if (init->target != phys_target) return -EINVAL; @@ -1108,6 +1122,7 @@ static int kvm_arch_vcpu_ioctl_vcpu_init(struct kvm_vcpu *vcpu, } vcpu_reset_hcr(vcpu); + vcpu->arch.cptr_el2 = CPTR_EL2_DEFAULT; /* * Handle the "start in power-off" case. @@ -1219,6 +1234,14 @@ long kvm_arch_vcpu_ioctl(struct file *filp, if (copy_from_user(®, argp, sizeof(reg))) break; + /* + * We could owe a reset due to PSCI. Handle the pending reset + * here to ensure userspace register accesses are ordered after + * the reset. + */ + if (kvm_check_request(KVM_REQ_VCPU_RESET, vcpu)) + kvm_reset_vcpu(vcpu); + if (ioctl == KVM_SET_ONE_REG) r = kvm_arm_set_reg(vcpu, ®); else @@ -1700,11 +1723,6 @@ static bool init_psci_relay(void) return true; } -static int init_common_resources(void) -{ - return kvm_set_ipa_limit(); -} - static int init_subsystems(void) { int err = 0; @@ -1958,56 +1976,17 @@ static void _kvm_host_prot_finalize(void *discard) WARN_ON(kvm_call_hyp_nvhe(__pkvm_prot_finalize)); } -static inline int pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) -{ - return kvm_call_hyp_nvhe(__pkvm_mark_hyp, start, end); -} - -#define pkvm_mark_hyp_section(__section) \ - pkvm_mark_hyp(__pa_symbol(__section##_start), \ - __pa_symbol(__section##_end)) - static int finalize_hyp_mode(void) { - int cpu, ret; - if (!is_protected_kvm_enabled()) return 0; - ret = pkvm_mark_hyp_section(__hyp_idmap_text); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_text); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_rodata); - if (ret) - return ret; - - ret = pkvm_mark_hyp_section(__hyp_bss); - if (ret) - return ret; - - ret = pkvm_mark_hyp(hyp_mem_base, hyp_mem_base + hyp_mem_size); - if (ret) - return ret; - - for_each_possible_cpu(cpu) { - phys_addr_t start = virt_to_phys((void *)kvm_arm_hyp_percpu_base[cpu]); - phys_addr_t end = start + (PAGE_SIZE << nvhe_percpu_order()); - - ret = pkvm_mark_hyp(start, end); - if (ret) - return ret; - - start = virt_to_phys((void *)per_cpu(kvm_arm_hyp_stack_page, cpu)); - end = start + PAGE_SIZE; - ret = pkvm_mark_hyp(start, end); - if (ret) - return ret; - } + /* + * Exclude HYP BSS from kmemleak so that it doesn't get peeked + * at, which would end badly once the section is inaccessible. + * None of other sections should ever be introspected. + */ + kmemleak_free_part(__hyp_bss_start, __hyp_bss_end - __hyp_bss_start); /* * Flip the static key upfront as that may no longer be possible @@ -2019,11 +1998,6 @@ static int finalize_hyp_mode(void) return 0; } -static void check_kvm_target_cpu(void *ret) -{ - *(int *)ret = kvm_target_cpu(); -} - struct kvm_vcpu *kvm_mpidr_to_vcpu(struct kvm *kvm, unsigned long mpidr) { struct kvm_vcpu *vcpu; @@ -2083,7 +2057,6 @@ void kvm_arch_irq_bypass_start(struct irq_bypass_consumer *cons) int kvm_arch_init(void *opaque) { int err; - int ret, cpu; bool in_hyp_mode; if (!is_hyp_mode_available()) { @@ -2098,15 +2071,7 @@ int kvm_arch_init(void *opaque) kvm_info("Guests without required CPU erratum workarounds can deadlock system!\n" \ "Only trusted guests should be used on this system.\n"); - for_each_online_cpu(cpu) { - smp_call_function_single(cpu, check_kvm_target_cpu, &ret, 1); - if (ret < 0) { - kvm_err("Error, CPU %d not supported!\n", cpu); - return -ENODEV; - } - } - - err = init_common_resources(); + err = kvm_set_ipa_limit(); if (err) return err; diff --git a/arch/arm64/kvm/debug.c b/arch/arm64/kvm/debug.c index d5e79d7ee6e9..db9361338b2a 100644 --- a/arch/arm64/kvm/debug.c +++ b/arch/arm64/kvm/debug.c @@ -21,7 +21,7 @@ DBG_MDSCR_KDE | \ DBG_MDSCR_MDE) -static DEFINE_PER_CPU(u32, mdcr_el2); +static DEFINE_PER_CPU(u64, mdcr_el2); /** * save/restore_guest_debug_regs diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 1dfb83578277..5ce26bedf23c 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -31,8 +31,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -50,10 +48,9 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, wfi_exit_stat), STATS_DESC_COUNTER(VCPU, mmio_exit_user), STATS_DESC_COUNTER(VCPU, mmio_exit_kernel), + STATS_DESC_COUNTER(VCPU, signal_exits), STATS_DESC_COUNTER(VCPU, exits) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -842,7 +839,7 @@ int __kvm_arm_vcpu_set_events(struct kvm_vcpu *vcpu, return 0; } -int __attribute_const__ kvm_target_cpu(void) +u32 __attribute_const__ kvm_target_cpu(void) { unsigned long implementor = read_cpuid_implementor(); unsigned long part_number = read_cpuid_part_number(); @@ -874,7 +871,7 @@ int __attribute_const__ kvm_target_cpu(void) int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) { - int target = kvm_target_cpu(); + u32 target = kvm_target_cpu(); if (target < 0) return -ENODEV; diff --git a/arch/arm64/kvm/handle_exit.c b/arch/arm64/kvm/handle_exit.c index 6f48336b1d86..275a27368a04 100644 --- a/arch/arm64/kvm/handle_exit.c +++ b/arch/arm64/kvm/handle_exit.c @@ -113,34 +113,20 @@ static int kvm_handle_wfx(struct kvm_vcpu *vcpu) * guest and host are using the same debug facilities it will be up to * userspace to re-inject the correct exception for guest delivery. * - * @return: 0 (while setting vcpu->run->exit_reason), -1 for error + * @return: 0 (while setting vcpu->run->exit_reason) */ static int kvm_handle_guest_debug(struct kvm_vcpu *vcpu) { struct kvm_run *run = vcpu->run; u32 esr = kvm_vcpu_get_esr(vcpu); - int ret = 0; run->exit_reason = KVM_EXIT_DEBUG; run->debug.arch.hsr = esr; - switch (ESR_ELx_EC(esr)) { - case ESR_ELx_EC_WATCHPT_LOW: + if (ESR_ELx_EC(esr) == ESR_ELx_EC_WATCHPT_LOW) run->debug.arch.far = vcpu->arch.fault.far_el2; - fallthrough; - case ESR_ELx_EC_SOFTSTP_LOW: - case ESR_ELx_EC_BREAKPT_LOW: - case ESR_ELx_EC_BKPT32: - case ESR_ELx_EC_BRK64: - break; - default: - kvm_err("%s: un-handled case esr: %#08x\n", - __func__, (unsigned int) esr); - ret = -1; - break; - } - return ret; + return 0; } static int kvm_handle_unknown_ec(struct kvm_vcpu *vcpu) @@ -292,11 +278,12 @@ void handle_exit_early(struct kvm_vcpu *vcpu, int exception_index) kvm_handle_guest_serror(vcpu, kvm_vcpu_get_esr(vcpu)); } -void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, +void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, + u64 elr_virt, u64 elr_phys, u64 par, uintptr_t vcpu, u64 far, u64 hpfar) { - u64 elr_in_kimg = __phys_to_kimg(__hyp_pa(elr)); - u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr; + u64 elr_in_kimg = __phys_to_kimg(elr_phys); + u64 hyp_offset = elr_in_kimg - kaslr_offset() - elr_virt; u64 mode = spsr & PSR_MODE_MASK; /* @@ -309,20 +296,24 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, kvm_err("Invalid host exception to nVHE hyp!\n"); } else if (ESR_ELx_EC(esr) == ESR_ELx_EC_BRK64 && (esr & ESR_ELx_BRK64_ISS_COMMENT_MASK) == BUG_BRK_IMM) { - struct bug_entry *bug = find_bug(elr_in_kimg); const char *file = NULL; unsigned int line = 0; /* All hyp bugs, including warnings, are treated as fatal. */ - if (bug) - bug_get_file_line(bug, &file, &line); + if (!is_protected_kvm_enabled() || + IS_ENABLED(CONFIG_NVHE_EL2_DEBUG)) { + struct bug_entry *bug = find_bug(elr_in_kimg); + + if (bug) + bug_get_file_line(bug, &file, &line); + } if (file) kvm_err("nVHE hyp BUG at: %s:%u!\n", file, line); else - kvm_err("nVHE hyp BUG at: %016llx!\n", elr + hyp_offset); + kvm_err("nVHE hyp BUG at: %016llx!\n", elr_virt + hyp_offset); } else { - kvm_err("nVHE hyp panic at: %016llx!\n", elr + hyp_offset); + kvm_err("nVHE hyp panic at: %016llx!\n", elr_virt + hyp_offset); } /* @@ -334,5 +325,5 @@ void __noreturn __cold nvhe_hyp_panic_handler(u64 esr, u64 spsr, u64 elr, kvm_err("Hyp Offset: 0x%llx\n", hyp_offset); panic("HYP panic:\nPS:%08llx PC:%016llx ESR:%08llx\nFAR:%016llx HPFAR:%016llx PAR:%016llx\nVCPU:%016lx\n", - spsr, elr, esr, far, hpfar, par, vcpu); + spsr, elr_virt, esr, far, hpfar, par, vcpu); } diff --git a/arch/arm64/kvm/hyp/include/hyp/switch.h b/arch/arm64/kvm/hyp/include/hyp/switch.h index e4a2f295a394..a0e78a6027be 100644 --- a/arch/arm64/kvm/hyp/include/hyp/switch.h +++ b/arch/arm64/kvm/hyp/include/hyp/switch.h @@ -92,11 +92,15 @@ static inline void __activate_traps_common(struct kvm_vcpu *vcpu) write_sysreg(0, pmselr_el0); write_sysreg(ARMV8_PMU_USERENR_MASK, pmuserenr_el0); } + + vcpu->arch.mdcr_el2_host = read_sysreg(mdcr_el2); write_sysreg(vcpu->arch.mdcr_el2, mdcr_el2); } -static inline void __deactivate_traps_common(void) +static inline void __deactivate_traps_common(struct kvm_vcpu *vcpu) { + write_sysreg(vcpu->arch.mdcr_el2_host, mdcr_el2); + write_sysreg(0, hstr_el2); if (kvm_arm_support_pmu_v3()) write_sysreg(0, pmuserenr_el0); diff --git a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h index 9c227d87c36d..b58c910babaf 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mem_protect.h @@ -12,6 +12,32 @@ #include #include +/* + * SW bits 0-1 are reserved to track the memory ownership state of each page: + * 00: The page is owned exclusively by the page-table owner. + * 01: The page is owned by the page-table owner, but is shared + * with another entity. + * 10: The page is shared with, but not owned by the page-table owner. + * 11: Reserved for future use (lending). + */ +enum pkvm_page_state { + PKVM_PAGE_OWNED = 0ULL, + PKVM_PAGE_SHARED_OWNED = KVM_PGTABLE_PROT_SW0, + PKVM_PAGE_SHARED_BORROWED = KVM_PGTABLE_PROT_SW1, +}; + +#define PKVM_PAGE_STATE_PROT_MASK (KVM_PGTABLE_PROT_SW0 | KVM_PGTABLE_PROT_SW1) +static inline enum kvm_pgtable_prot pkvm_mkstate(enum kvm_pgtable_prot prot, + enum pkvm_page_state state) +{ + return (prot & ~PKVM_PAGE_STATE_PROT_MASK) | state; +} + +static inline enum pkvm_page_state pkvm_getstate(enum kvm_pgtable_prot prot) +{ + return prot & PKVM_PAGE_STATE_PROT_MASK; +} + struct host_kvm { struct kvm_arch arch; struct kvm_pgtable pgt; @@ -20,16 +46,21 @@ struct host_kvm { }; extern struct host_kvm host_kvm; -int __pkvm_prot_finalize(void); -int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end); +extern const u8 pkvm_hyp_id; +int __pkvm_prot_finalize(void); +int __pkvm_host_share_hyp(u64 pfn); + +bool addr_is_memory(phys_addr_t phys); +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, enum kvm_pgtable_prot prot); +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id); int kvm_host_prepare_stage2(void *pgt_pool_base); void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt); static __always_inline void __load_host_stage2(void) { if (static_branch_likely(&kvm_protected_mode_initialized)) - __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); else write_sysreg(0, vttbr_el2); } diff --git a/arch/arm64/kvm/hyp/include/nvhe/mm.h b/arch/arm64/kvm/hyp/include/nvhe/mm.h index 8ec3a5a7744b..c9a8f535212e 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/mm.h +++ b/arch/arm64/kvm/hyp/include/nvhe/mm.h @@ -23,8 +23,7 @@ int hyp_map_vectors(void); int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back); int pkvm_cpu_set_vector(enum arm64_hyp_spectre_vector slot); int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot); -int __pkvm_create_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot); +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot); unsigned long __pkvm_create_private_mapping(phys_addr_t phys, size_t size, enum kvm_pgtable_prot prot); diff --git a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h index 76b537f8d1c6..4652fd04bdbe 100644 --- a/arch/arm64/kvm/hyp/include/nvhe/spinlock.h +++ b/arch/arm64/kvm/hyp/include/nvhe/spinlock.h @@ -15,6 +15,7 @@ #include #include +#include typedef union hyp_spinlock { u32 __val; @@ -89,4 +90,28 @@ static inline void hyp_spin_unlock(hyp_spinlock_t *lock) : "memory"); } +static inline bool hyp_spin_is_locked(hyp_spinlock_t *lock) +{ + hyp_spinlock_t lockval = READ_ONCE(*lock); + + return lockval.owner != lockval.next; +} + +#ifdef CONFIG_NVHE_EL2_DEBUG +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) +{ + /* + * The __pkvm_init() path accesses protected data-structures without + * holding locks as the other CPUs are guaranteed to not enter EL2 + * concurrently at this point in time. The point by which EL2 is + * initialized on all CPUs is reflected in the pkvm static key, so + * wait until it is set before checking the lock state. + */ + if (static_branch_likely(&kvm_protected_mode_initialized)) + BUG_ON(!hyp_spin_is_locked(lock)); +} +#else +static inline void hyp_assert_lock_held(hyp_spinlock_t *lock) { } +#endif + #endif /* __ARM64_KVM_NVHE_SPINLOCK_H__ */ diff --git a/arch/arm64/kvm/hyp/nvhe/debug-sr.c b/arch/arm64/kvm/hyp/nvhe/debug-sr.c index 7d3f25868cae..df361d839902 100644 --- a/arch/arm64/kvm/hyp/nvhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/nvhe/debug-sr.c @@ -109,7 +109,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) __debug_switch_to_host_common(vcpu); } -u32 __kvm_get_mdcr_el2(void) +u64 __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 2b23400e0fb3..4b652ffb591d 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -7,6 +7,7 @@ #include #include +#include #include #include @@ -85,12 +86,24 @@ SYM_FUNC_START(__hyp_do_panic) mov x29, x0 +#ifdef CONFIG_NVHE_EL2_DEBUG + /* Ensure host stage-2 is disabled */ + mrs x0, hcr_el2 + bic x0, x0, #HCR_VM + msr hcr_el2, x0 + isb + tlbi vmalls12e1 + dsb nsh +#endif + /* Load the panic arguments into x0-7 */ mrs x0, esr_el2 - get_vcpu_ptr x4, x5 - mrs x5, far_el2 - mrs x6, hpfar_el2 - mov x7, xzr // Unused argument + mov x4, x3 + mov x3, x2 + hyp_pa x3, x6 + get_vcpu_ptr x5, x6 + mrs x6, far_el2 + mrs x7, hpfar_el2 /* Enter the host, conditionally restoring the host context. */ cbz x29, __host_enter_without_restoring diff --git a/arch/arm64/kvm/hyp/nvhe/hyp-main.c b/arch/arm64/kvm/hyp/nvhe/hyp-main.c index 1632f001f4ed..2da6aa8da868 100644 --- a/arch/arm64/kvm/hyp/nvhe/hyp-main.c +++ b/arch/arm64/kvm/hyp/nvhe/hyp-main.c @@ -140,14 +140,11 @@ static void handle___pkvm_cpu_set_vector(struct kvm_cpu_context *host_ctxt) cpu_reg(host_ctxt, 1) = pkvm_cpu_set_vector(slot); } -static void handle___pkvm_create_mappings(struct kvm_cpu_context *host_ctxt) +static void handle___pkvm_host_share_hyp(struct kvm_cpu_context *host_ctxt) { - DECLARE_REG(unsigned long, start, host_ctxt, 1); - DECLARE_REG(unsigned long, size, host_ctxt, 2); - DECLARE_REG(unsigned long, phys, host_ctxt, 3); - DECLARE_REG(enum kvm_pgtable_prot, prot, host_ctxt, 4); + DECLARE_REG(u64, pfn, host_ctxt, 1); - cpu_reg(host_ctxt, 1) = __pkvm_create_mappings(start, size, phys, prot); + cpu_reg(host_ctxt, 1) = __pkvm_host_share_hyp(pfn); } static void handle___pkvm_create_private_mapping(struct kvm_cpu_context *host_ctxt) @@ -163,14 +160,6 @@ static void handle___pkvm_prot_finalize(struct kvm_cpu_context *host_ctxt) { cpu_reg(host_ctxt, 1) = __pkvm_prot_finalize(); } - -static void handle___pkvm_mark_hyp(struct kvm_cpu_context *host_ctxt) -{ - DECLARE_REG(phys_addr_t, start, host_ctxt, 1); - DECLARE_REG(phys_addr_t, end, host_ctxt, 2); - - cpu_reg(host_ctxt, 1) = __pkvm_mark_hyp(start, end); -} typedef void (*hcall_t)(struct kvm_cpu_context *); #define HANDLE_FUNC(x) [__KVM_HOST_SMCCC_FUNC_##x] = (hcall_t)handle_##x @@ -193,10 +182,9 @@ static const hcall_t host_hcall[] = { HANDLE_FUNC(__vgic_v3_restore_aprs), HANDLE_FUNC(__pkvm_init), HANDLE_FUNC(__pkvm_cpu_set_vector), - HANDLE_FUNC(__pkvm_create_mappings), + HANDLE_FUNC(__pkvm_host_share_hyp), HANDLE_FUNC(__pkvm_create_private_mapping), HANDLE_FUNC(__pkvm_prot_finalize), - HANDLE_FUNC(__pkvm_mark_hyp), }; static void handle_host_hcall(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mem_protect.c b/arch/arm64/kvm/hyp/nvhe/mem_protect.c index a6ce991b1467..bacd493a4eac 100644 --- a/arch/arm64/kvm/hyp/nvhe/mem_protect.c +++ b/arch/arm64/kvm/hyp/nvhe/mem_protect.c @@ -31,7 +31,7 @@ static struct hyp_pool host_s2_pool; u64 id_aa64mmfr0_el1_sys_val; u64 id_aa64mmfr1_el1_sys_val; -static const u8 pkvm_hyp_id = 1; +const u8 pkvm_hyp_id = 1; static void *host_s2_zalloc_pages_exact(size_t size) { @@ -89,6 +89,8 @@ static void prepare_host_vtcr(void) id_aa64mmfr1_el1_sys_val, phys_shift); } +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot); + int kvm_host_prepare_stage2(void *pgt_pool_base) { struct kvm_s2_mmu *mmu = &host_kvm.arch.mmu; @@ -101,16 +103,17 @@ int kvm_host_prepare_stage2(void *pgt_pool_base) if (ret) return ret; - ret = kvm_pgtable_stage2_init_flags(&host_kvm.pgt, &host_kvm.arch, - &host_kvm.mm_ops, KVM_HOST_S2_FLAGS); + ret = __kvm_pgtable_stage2_init(&host_kvm.pgt, &host_kvm.arch, + &host_kvm.mm_ops, KVM_HOST_S2_FLAGS, + host_stage2_force_pte_cb); if (ret) return ret; mmu->pgd_phys = __hyp_pa(host_kvm.pgt.pgd); mmu->arch = &host_kvm.arch; mmu->pgt = &host_kvm.pgt; - mmu->vmid.vmid_gen = 0; - mmu->vmid.vmid = 0; + WRITE_ONCE(mmu->vmid.vmid_gen, 0); + WRITE_ONCE(mmu->vmid.vmid, 0); return 0; } @@ -126,7 +129,7 @@ int __pkvm_prot_finalize(void) kvm_flush_dcache_to_poc(params, sizeof(*params)); write_sysreg(params->hcr_el2, hcr_el2); - __load_stage2(&host_kvm.arch.mmu, host_kvm.arch.vtcr); + __load_stage2(&host_kvm.arch.mmu, &host_kvm.arch); /* * Make sure to have an ISB before the TLB maintenance below but only @@ -159,6 +162,11 @@ static int host_stage2_unmap_dev_all(void) return kvm_pgtable_stage2_unmap(pgt, addr, BIT(pgt->ia_bits) - addr); } +struct kvm_mem_range { + u64 start; + u64 end; +}; + static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) { int cur, left = 0, right = hyp_memblock_nr; @@ -189,16 +197,26 @@ static bool find_mem_range(phys_addr_t addr, struct kvm_mem_range *range) return false; } +bool addr_is_memory(phys_addr_t phys) +{ + struct kvm_mem_range range; + + return find_mem_range(phys, &range); +} + +static bool is_in_mem_range(u64 addr, struct kvm_mem_range *range) +{ + return range->start <= addr && addr < range->end; +} + static bool range_is_memory(u64 start, u64 end) { - struct kvm_mem_range r1, r2; + struct kvm_mem_range r; - if (!find_mem_range(start, &r1) || !find_mem_range(end - 1, &r2)) - return false; - if (r1.start != r2.start) + if (!find_mem_range(start, &r)) return false; - return true; + return is_in_mem_range(end - 1, &r); } static inline int __host_stage2_idmap(u64 start, u64 end, @@ -208,60 +226,208 @@ static inline int __host_stage2_idmap(u64 start, u64 end, prot, &host_s2_pool); } -static int host_stage2_idmap(u64 addr) +/* + * The pool has been provided with enough pages to cover all of memory with + * page granularity, but it is difficult to know how much of the MMIO range + * we will need to cover upfront, so we may need to 'recycle' the pages if we + * run out. + */ +#define host_stage2_try(fn, ...) \ + ({ \ + int __ret; \ + hyp_assert_lock_held(&host_kvm.lock); \ + __ret = fn(__VA_ARGS__); \ + if (__ret == -ENOMEM) { \ + __ret = host_stage2_unmap_dev_all(); \ + if (!__ret) \ + __ret = fn(__VA_ARGS__); \ + } \ + __ret; \ + }) + +static inline bool range_included(struct kvm_mem_range *child, + struct kvm_mem_range *parent) { - enum kvm_pgtable_prot prot = KVM_PGTABLE_PROT_R | KVM_PGTABLE_PROT_W; - struct kvm_mem_range range; - bool is_memory = find_mem_range(addr, &range); + return parent->start <= child->start && child->end <= parent->end; +} + +static int host_stage2_adjust_range(u64 addr, struct kvm_mem_range *range) +{ + struct kvm_mem_range cur; + kvm_pte_t pte; + u32 level; int ret; - if (is_memory) - prot |= KVM_PGTABLE_PROT_X; + hyp_assert_lock_held(&host_kvm.lock); + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, &level); + if (ret) + return ret; + + if (kvm_pte_valid(pte)) + return -EAGAIN; + + if (pte) + return -EPERM; + + do { + u64 granule = kvm_granule_size(level); + cur.start = ALIGN_DOWN(addr, granule); + cur.end = cur.start + granule; + level++; + } while ((level < KVM_PGTABLE_MAX_LEVELS) && + !(kvm_level_supports_block_mapping(level) && + range_included(&cur, range))); + + *range = cur; + + return 0; +} + +int host_stage2_idmap_locked(phys_addr_t addr, u64 size, + enum kvm_pgtable_prot prot) +{ + hyp_assert_lock_held(&host_kvm.lock); + + return host_stage2_try(__host_stage2_idmap, addr, addr + size, prot); +} + +int host_stage2_set_owner_locked(phys_addr_t addr, u64 size, u8 owner_id) +{ + hyp_assert_lock_held(&host_kvm.lock); + + return host_stage2_try(kvm_pgtable_stage2_set_owner, &host_kvm.pgt, + addr, size, &host_s2_pool, owner_id); +} + +static bool host_stage2_force_pte_cb(u64 addr, u64 end, enum kvm_pgtable_prot prot) +{ + /* + * Block mappings must be used with care in the host stage-2 as a + * kvm_pgtable_stage2_map() operation targeting a page in the range of + * an existing block will delete the block under the assumption that + * mappings in the rest of the block range can always be rebuilt lazily. + * That assumption is correct for the host stage-2 with RWX mappings + * targeting memory or RW mappings targeting MMIO ranges (see + * host_stage2_idmap() below which implements some of the host memory + * abort logic). However, this is not safe for any other mappings where + * the host stage-2 page-table is in fact the only place where this + * state is stored. In all those cases, it is safer to use page-level + * mappings, hence avoiding to lose the state because of side-effects in + * kvm_pgtable_stage2_map(). + */ + if (range_is_memory(addr, end)) + return prot != PKVM_HOST_MEM_PROT; + else + return prot != PKVM_HOST_MMIO_PROT; +} + +static int host_stage2_idmap(u64 addr) +{ + struct kvm_mem_range range; + bool is_memory = find_mem_range(addr, &range); + enum kvm_pgtable_prot prot; + int ret; + + prot = is_memory ? PKVM_HOST_MEM_PROT : PKVM_HOST_MMIO_PROT; hyp_spin_lock(&host_kvm.lock); - ret = kvm_pgtable_stage2_find_range(&host_kvm.pgt, addr, prot, &range); + ret = host_stage2_adjust_range(addr, &range); if (ret) goto unlock; - ret = __host_stage2_idmap(range.start, range.end, prot); - if (ret != -ENOMEM) - goto unlock; - - /* - * The pool has been provided with enough pages to cover all of memory - * with page granularity, but it is difficult to know how much of the - * MMIO range we will need to cover upfront, so we may need to 'recycle' - * the pages if we run out. - */ - ret = host_stage2_unmap_dev_all(); - if (ret) - goto unlock; - - ret = __host_stage2_idmap(range.start, range.end, prot); - + ret = host_stage2_idmap_locked(range.start, range.end - range.start, prot); unlock: hyp_spin_unlock(&host_kvm.lock); return ret; } -int __pkvm_mark_hyp(phys_addr_t start, phys_addr_t end) +static inline bool check_prot(enum kvm_pgtable_prot prot, + enum kvm_pgtable_prot required, + enum kvm_pgtable_prot denied) { + return (prot & (required | denied)) == required; +} + +int __pkvm_host_share_hyp(u64 pfn) +{ + phys_addr_t addr = hyp_pfn_to_phys(pfn); + enum kvm_pgtable_prot prot, cur; + void *virt = __hyp_va(addr); + enum pkvm_page_state state; + kvm_pte_t pte; int ret; - /* - * host_stage2_unmap_dev_all() currently relies on MMIO mappings being - * non-persistent, so don't allow changing page ownership in MMIO range. - */ - if (!range_is_memory(start, end)) + if (!addr_is_memory(addr)) return -EINVAL; hyp_spin_lock(&host_kvm.lock); - ret = kvm_pgtable_stage2_set_owner(&host_kvm.pgt, start, end - start, - &host_s2_pool, pkvm_hyp_id); + hyp_spin_lock(&pkvm_pgd_lock); + + ret = kvm_pgtable_get_leaf(&host_kvm.pgt, addr, &pte, NULL); + if (ret) + goto unlock; + if (!pte) + goto map_shared; + + /* + * Check attributes in the host stage-2 PTE. We need the page to be: + * - mapped RWX as we're sharing memory; + * - not borrowed, as that implies absence of ownership. + * Otherwise, we can't let it got through + */ + cur = kvm_pgtable_stage2_pte_prot(pte); + prot = pkvm_mkstate(0, PKVM_PAGE_SHARED_BORROWED); + if (!check_prot(cur, PKVM_HOST_MEM_PROT, prot)) { + ret = -EPERM; + goto unlock; + } + + state = pkvm_getstate(cur); + if (state == PKVM_PAGE_OWNED) + goto map_shared; + + /* + * Tolerate double-sharing the same page, but this requires + * cross-checking the hypervisor stage-1. + */ + if (state != PKVM_PAGE_SHARED_OWNED) { + ret = -EPERM; + goto unlock; + } + + ret = kvm_pgtable_get_leaf(&pkvm_pgtable, (u64)virt, &pte, NULL); + if (ret) + goto unlock; + + /* + * If the page has been shared with the hypervisor, it must be + * already mapped as SHARED_BORROWED in its stage-1. + */ + cur = kvm_pgtable_hyp_pte_prot(pte); + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + if (!check_prot(cur, prot, ~prot)) + ret = -EPERM; + goto unlock; + +map_shared: + /* + * If the page is not yet shared, adjust mappings in both page-tables + * while both locks are held. + */ + prot = pkvm_mkstate(PAGE_HYP, PKVM_PAGE_SHARED_BORROWED); + ret = pkvm_create_mappings_locked(virt, virt + PAGE_SIZE, prot); + BUG_ON(ret); + + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + ret = host_stage2_idmap_locked(addr, PAGE_SIZE, prot); + BUG_ON(ret); + +unlock: + hyp_spin_unlock(&pkvm_pgd_lock); hyp_spin_unlock(&host_kvm.lock); - return ret != -EAGAIN ? ret : 0; + return ret; } void handle_host_mem_abort(struct kvm_cpu_context *host_ctxt) diff --git a/arch/arm64/kvm/hyp/nvhe/mm.c b/arch/arm64/kvm/hyp/nvhe/mm.c index a8efdf0f9003..2fabeceb889a 100644 --- a/arch/arm64/kvm/hyp/nvhe/mm.c +++ b/arch/arm64/kvm/hyp/nvhe/mm.c @@ -23,8 +23,8 @@ u64 __io_map_base; struct memblock_region hyp_memory[HYP_MEMBLOCK_REGIONS]; unsigned int hyp_memblock_nr; -int __pkvm_create_mappings(unsigned long start, unsigned long size, - unsigned long phys, enum kvm_pgtable_prot prot) +static int __pkvm_create_mappings(unsigned long start, unsigned long size, + unsigned long phys, enum kvm_pgtable_prot prot) { int err; @@ -67,13 +67,15 @@ out: return addr; } -int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +int pkvm_create_mappings_locked(void *from, void *to, enum kvm_pgtable_prot prot) { unsigned long start = (unsigned long)from; unsigned long end = (unsigned long)to; unsigned long virt_addr; phys_addr_t phys; + hyp_assert_lock_held(&pkvm_pgd_lock); + start = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -81,7 +83,8 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) int err; phys = hyp_virt_to_phys((void *)virt_addr); - err = __pkvm_create_mappings(virt_addr, PAGE_SIZE, phys, prot); + err = kvm_pgtable_hyp_map(&pkvm_pgtable, virt_addr, PAGE_SIZE, + phys, prot); if (err) return err; } @@ -89,6 +92,17 @@ int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) return 0; } +int pkvm_create_mappings(void *from, void *to, enum kvm_pgtable_prot prot) +{ + int ret; + + hyp_spin_lock(&pkvm_pgd_lock); + ret = pkvm_create_mappings_locked(from, to, prot); + hyp_spin_unlock(&pkvm_pgd_lock); + + return ret; +} + int hyp_back_vmemmap(phys_addr_t phys, unsigned long size, phys_addr_t back) { unsigned long start, end; diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 0b574d106519..57c27846320f 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -58,6 +58,7 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, { void *start, *end, *virt = hyp_phys_to_virt(phys); unsigned long pgt_size = hyp_s1_pgtable_pages() << PAGE_SHIFT; + enum kvm_pgtable_prot prot; int ret, i; /* Recreate the hyp page-table using the early page allocator */ @@ -83,10 +84,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = pkvm_create_mappings(__start_rodata, __end_rodata, PAGE_HYP_RO); - if (ret) - return ret; - ret = pkvm_create_mappings(__hyp_rodata_start, __hyp_rodata_end, PAGE_HYP_RO); if (ret) return ret; @@ -95,10 +92,6 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, if (ret) return ret; - ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, PAGE_HYP_RO); - if (ret) - return ret; - ret = pkvm_create_mappings(virt, virt + size, PAGE_HYP); if (ret) return ret; @@ -117,6 +110,24 @@ static int recreate_hyp_mappings(phys_addr_t phys, unsigned long size, return ret; } + /* + * Map the host's .bss and .rodata sections RO in the hypervisor, but + * transfer the ownership from the host to the hypervisor itself to + * make sure it can't be donated or shared with another entity. + * + * The ownership transition requires matching changes in the host + * stage-2. This will be done later (see finalize_host_mappings()) once + * the hyp_vmemmap is addressable. + */ + prot = pkvm_mkstate(PAGE_HYP_RO, PKVM_PAGE_SHARED_OWNED); + ret = pkvm_create_mappings(__start_rodata, __end_rodata, prot); + if (ret) + return ret; + + ret = pkvm_create_mappings(__hyp_bss_end, __bss_stop, prot); + if (ret) + return ret; + return 0; } @@ -148,6 +159,57 @@ static void hpool_put_page(void *addr) hyp_put_page(&hpool, addr); } +static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, + kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, + void * const arg) +{ + enum kvm_pgtable_prot prot; + enum pkvm_page_state state; + kvm_pte_t pte = *ptep; + phys_addr_t phys; + + if (!kvm_pte_valid(pte)) + return 0; + + if (level != (KVM_PGTABLE_MAX_LEVELS - 1)) + return -EINVAL; + + phys = kvm_pte_to_phys(pte); + if (!addr_is_memory(phys)) + return 0; + + /* + * Adjust the host stage-2 mappings to match the ownership attributes + * configured in the hypervisor stage-1. + */ + state = pkvm_getstate(kvm_pgtable_hyp_pte_prot(pte)); + switch (state) { + case PKVM_PAGE_OWNED: + return host_stage2_set_owner_locked(phys, PAGE_SIZE, pkvm_hyp_id); + case PKVM_PAGE_SHARED_OWNED: + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_BORROWED); + break; + case PKVM_PAGE_SHARED_BORROWED: + prot = pkvm_mkstate(PKVM_HOST_MEM_PROT, PKVM_PAGE_SHARED_OWNED); + break; + default: + return -EINVAL; + } + + return host_stage2_idmap_locked(phys, PAGE_SIZE, prot); +} + +static int finalize_host_mappings(void) +{ + struct kvm_pgtable_walker walker = { + .cb = finalize_host_mappings_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + }; + + return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker); +} + void __noreturn __pkvm_init_finalise(void) { struct kvm_host_data *host_data = this_cpu_ptr(&kvm_host_data); @@ -167,6 +229,10 @@ void __noreturn __pkvm_init_finalise(void) if (ret) goto out; + ret = finalize_host_mappings(); + if (ret) + goto out; + pkvm_pgtable_mm_ops = (struct kvm_pgtable_mm_ops) { .zalloc_page = hyp_zalloc_hyp_page, .phys_to_virt = hyp_phys_to_virt, diff --git a/arch/arm64/kvm/hyp/nvhe/switch.c b/arch/arm64/kvm/hyp/nvhe/switch.c index f7af9688c1f7..a34b01cc8ab9 100644 --- a/arch/arm64/kvm/hyp/nvhe/switch.c +++ b/arch/arm64/kvm/hyp/nvhe/switch.c @@ -41,7 +41,7 @@ static void __activate_traps(struct kvm_vcpu *vcpu) ___activate_traps(vcpu); __activate_traps_common(vcpu); - val = CPTR_EL2_DEFAULT; + val = vcpu->arch.cptr_el2; val |= CPTR_EL2_TTA | CPTR_EL2_TAM; if (!update_fp_enabled(vcpu)) { val |= CPTR_EL2_TFP | CPTR_EL2_TZ; @@ -69,12 +69,10 @@ static void __activate_traps(struct kvm_vcpu *vcpu) static void __deactivate_traps(struct kvm_vcpu *vcpu) { extern char __kvm_hyp_host_vector[]; - u64 mdcr_el2, cptr; + u64 cptr; ___deactivate_traps(vcpu); - mdcr_el2 = read_sysreg(mdcr_el2); - if (cpus_have_final_cap(ARM64_WORKAROUND_SPECULATIVE_AT)) { u64 val; @@ -92,13 +90,8 @@ static void __deactivate_traps(struct kvm_vcpu *vcpu) isb(); } - __deactivate_traps_common(); + __deactivate_traps_common(vcpu); - mdcr_el2 &= MDCR_EL2_HPMN_MASK; - mdcr_el2 |= MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT; - mdcr_el2 |= MDCR_EL2_E2TB_MASK << MDCR_EL2_E2TB_SHIFT; - - write_sysreg(mdcr_el2, mdcr_el2); write_sysreg(this_cpu_ptr(&kvm_init_params)->hcr_el2, hcr_el2); cptr = CPTR_EL2_DEFAULT; @@ -170,6 +163,7 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) { struct kvm_cpu_context *host_ctxt; struct kvm_cpu_context *guest_ctxt; + struct kvm_s2_mmu *mmu; bool pmu_switch_needed; u64 exit_code; @@ -213,7 +207,8 @@ int __kvm_vcpu_run(struct kvm_vcpu *vcpu) __sysreg32_restore_state(vcpu); __sysreg_restore_state_nvhe(guest_ctxt); - __load_guest_stage2(kern_hyp_va(vcpu->arch.hw_mmu)); + mmu = kern_hyp_va(vcpu->arch.hw_mmu); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); __activate_traps(vcpu); __hyp_vgic_restore_state(vcpu); diff --git a/arch/arm64/kvm/hyp/nvhe/tlb.c b/arch/arm64/kvm/hyp/nvhe/tlb.c index 38ed0f6f2703..d296d617f589 100644 --- a/arch/arm64/kvm/hyp/nvhe/tlb.c +++ b/arch/arm64/kvm/hyp/nvhe/tlb.c @@ -34,12 +34,12 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, } /* - * __load_guest_stage2() includes an ISB only when the AT + * __load_stage2() includes an ISB only when the AT * workaround is applied. Take care of the opposite condition, * ensuring that we always have an ISB, but not two ISBs back * to back. */ - __load_guest_stage2(mmu); + __load_stage2(mmu, kern_hyp_va(mmu->arch)); asm(ALTERNATIVE("isb", "nop", ARM64_WORKAROUND_SPECULATIVE_AT)); } diff --git a/arch/arm64/kvm/hyp/pgtable.c b/arch/arm64/kvm/hyp/pgtable.c index 05321f4165e3..f8ceebe4982e 100644 --- a/arch/arm64/kvm/hyp/pgtable.c +++ b/arch/arm64/kvm/hyp/pgtable.c @@ -11,16 +11,12 @@ #include #include -#define KVM_PTE_VALID BIT(0) #define KVM_PTE_TYPE BIT(1) #define KVM_PTE_TYPE_BLOCK 0 #define KVM_PTE_TYPE_PAGE 1 #define KVM_PTE_TYPE_TABLE 1 -#define KVM_PTE_ADDR_MASK GENMASK(47, PAGE_SHIFT) -#define KVM_PTE_ADDR_51_48 GENMASK(15, 12) - #define KVM_PTE_LEAF_ATTR_LO GENMASK(11, 2) #define KVM_PTE_LEAF_ATTR_LO_S1_ATTRIDX GENMASK(4, 2) @@ -40,6 +36,8 @@ #define KVM_PTE_LEAF_ATTR_HI GENMASK(63, 51) +#define KVM_PTE_LEAF_ATTR_HI_SW GENMASK(58, 55) + #define KVM_PTE_LEAF_ATTR_HI_S1_XN BIT(54) #define KVM_PTE_LEAF_ATTR_HI_S2_XN BIT(54) @@ -48,9 +46,7 @@ KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W | \ KVM_PTE_LEAF_ATTR_HI_S2_XN) -#define KVM_PTE_LEAF_ATTR_S2_IGNORED GENMASK(58, 55) - -#define KVM_INVALID_PTE_OWNER_MASK GENMASK(63, 56) +#define KVM_INVALID_PTE_OWNER_MASK GENMASK(9, 2) #define KVM_MAX_OWNER_ID 1 struct kvm_pgtable_walk_data { @@ -61,17 +57,6 @@ struct kvm_pgtable_walk_data { u64 end; }; -static u64 kvm_granule_shift(u32 level) -{ - /* Assumes KVM_PGTABLE_MAX_LEVELS is 4 */ - return ARM64_HW_PGTABLE_LEVEL_SHIFT(level); -} - -static u64 kvm_granule_size(u32 level) -{ - return BIT(kvm_granule_shift(level)); -} - #define KVM_PHYS_INVALID (-1ULL) static bool kvm_phys_is_valid(u64 phys) @@ -79,15 +64,6 @@ static bool kvm_phys_is_valid(u64 phys) return phys < BIT(id_aa64mmfr0_parange_to_phys_shift(ID_AA64MMFR0_PARANGE_MAX)); } -static bool kvm_level_supports_block_mapping(u32 level) -{ - /* - * Reject invalid block mappings and don't bother with 4TB mappings for - * 52-bit PAs. - */ - return !(level == 0 || (PAGE_SIZE != SZ_4K && level == 1)); -} - static bool kvm_block_mapping_supported(u64 addr, u64 end, u64 phys, u32 level) { u64 granule = kvm_granule_size(level); @@ -135,11 +111,6 @@ static u32 kvm_pgd_pages(u32 ia_bits, u32 start_level) return __kvm_pgd_page_idx(&pgt, -1ULL) + 1; } -static bool kvm_pte_valid(kvm_pte_t pte) -{ - return pte & KVM_PTE_VALID; -} - static bool kvm_pte_table(kvm_pte_t pte, u32 level) { if (level == KVM_PGTABLE_MAX_LEVELS - 1) @@ -151,16 +122,6 @@ static bool kvm_pte_table(kvm_pte_t pte, u32 level) return FIELD_GET(KVM_PTE_TYPE, pte) == KVM_PTE_TYPE_TABLE; } -static u64 kvm_pte_to_phys(kvm_pte_t pte) -{ - u64 pa = pte & KVM_PTE_ADDR_MASK; - - if (PAGE_SHIFT == 16) - pa |= FIELD_GET(KVM_PTE_ADDR_51_48, pte) << 48; - - return pa; -} - static kvm_pte_t kvm_phys_to_pte(u64 pa) { kvm_pte_t pte = pa & KVM_PTE_ADDR_MASK; @@ -326,6 +287,45 @@ int kvm_pgtable_walk(struct kvm_pgtable *pgt, u64 addr, u64 size, return _kvm_pgtable_walk(&walk_data); } +struct leaf_walk_data { + kvm_pte_t pte; + u32 level; +}; + +static int leaf_walker(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, + enum kvm_pgtable_walk_flags flag, void * const arg) +{ + struct leaf_walk_data *data = arg; + + data->pte = *ptep; + data->level = level; + + return 0; +} + +int kvm_pgtable_get_leaf(struct kvm_pgtable *pgt, u64 addr, + kvm_pte_t *ptep, u32 *level) +{ + struct leaf_walk_data data; + struct kvm_pgtable_walker walker = { + .cb = leaf_walker, + .flags = KVM_PGTABLE_WALK_LEAF, + .arg = &data, + }; + int ret; + + ret = kvm_pgtable_walk(pgt, ALIGN_DOWN(addr, PAGE_SIZE), + PAGE_SIZE, &walker); + if (!ret) { + if (ptep) + *ptep = data.pte; + if (level) + *level = data.level; + } + + return ret; +} + struct hyp_map_data { u64 phys; kvm_pte_t attr; @@ -357,11 +357,47 @@ static int hyp_set_prot_attr(enum kvm_pgtable_prot prot, kvm_pte_t *ptep) attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_AP, ap); attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S1_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S1_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } +enum kvm_pgtable_prot kvm_pgtable_hyp_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + u32 ap; + + if (!kvm_pte_valid(pte)) + return prot; + + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S1_XN)) + prot |= KVM_PGTABLE_PROT_X; + + ap = FIELD_GET(KVM_PTE_LEAF_ATTR_LO_S1_AP, pte); + if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RO) + prot |= KVM_PGTABLE_PROT_R; + else if (ap == KVM_PTE_LEAF_ATTR_LO_S1_AP_RW) + prot |= KVM_PGTABLE_PROT_RW; + + return prot; +} + +static bool hyp_pte_needs_update(kvm_pte_t old, kvm_pte_t new) +{ + /* + * Tolerate KVM recreating the exact same mapping, or changing software + * bits if the existing mapping was valid. + */ + if (old == new) + return false; + + if (!kvm_pte_valid(old)) + return true; + + return !WARN_ON((old ^ new) & ~KVM_PTE_LEAF_ATTR_HI_SW); +} + static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct hyp_map_data *data) { @@ -371,9 +407,8 @@ static bool hyp_map_walker_try_leaf(u64 addr, u64 end, u32 level, if (!kvm_block_mapping_supported(addr, end, phys, level)) return false; - /* Tolerate KVM recreating the exact same mapping */ new = kvm_init_valid_leaf_pte(phys, data->attr, level); - if (old != new && !WARN_ON(kvm_pte_valid(old))) + if (hyp_pte_needs_update(old, new)) smp_store_release(ptep, new); data->phys += granule; @@ -438,6 +473,8 @@ int kvm_pgtable_hyp_init(struct kvm_pgtable *pgt, u32 va_bits, pgt->start_level = KVM_PGTABLE_MAX_LEVELS - levels; pgt->mm_ops = mm_ops; pgt->mmu = NULL; + pgt->force_pte_cb = NULL; + return 0; } @@ -475,6 +512,9 @@ struct stage2_map_data { void *memcache; struct kvm_pgtable_mm_ops *mm_ops; + + /* Force mappings to page granularity */ + bool force_pte; }; u64 kvm_get_vtcr(u64 mmfr0, u64 mmfr1, u32 phys_shift) @@ -539,11 +579,29 @@ static int stage2_set_prot_attr(struct kvm_pgtable *pgt, enum kvm_pgtable_prot p attr |= FIELD_PREP(KVM_PTE_LEAF_ATTR_LO_S2_SH, sh); attr |= KVM_PTE_LEAF_ATTR_LO_S2_AF; + attr |= prot & KVM_PTE_LEAF_ATTR_HI_SW; *ptep = attr; return 0; } +enum kvm_pgtable_prot kvm_pgtable_stage2_pte_prot(kvm_pte_t pte) +{ + enum kvm_pgtable_prot prot = pte & KVM_PTE_LEAF_ATTR_HI_SW; + + if (!kvm_pte_valid(pte)) + return prot; + + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R) + prot |= KVM_PGTABLE_PROT_R; + if (pte & KVM_PTE_LEAF_ATTR_LO_S2_S2AP_W) + prot |= KVM_PGTABLE_PROT_W; + if (!(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN)) + prot |= KVM_PGTABLE_PROT_X; + + return prot; +} + static bool stage2_pte_needs_update(kvm_pte_t old, kvm_pte_t new) { if (!kvm_pte_valid(old) || !kvm_pte_valid(new)) @@ -588,6 +646,15 @@ static bool stage2_pte_executable(kvm_pte_t pte) return !(pte & KVM_PTE_LEAF_ATTR_HI_S2_XN); } +static bool stage2_leaf_mapping_allowed(u64 addr, u64 end, u32 level, + struct stage2_map_data *data) +{ + if (data->force_pte && (level < (KVM_PGTABLE_MAX_LEVELS - 1))) + return false; + + return kvm_block_mapping_supported(addr, end, data->phys, level); +} + static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, kvm_pte_t *ptep, struct stage2_map_data *data) @@ -597,7 +664,7 @@ static int stage2_map_walker_try_leaf(u64 addr, u64 end, u32 level, struct kvm_pgtable *pgt = data->mmu->pgt; struct kvm_pgtable_mm_ops *mm_ops = data->mm_ops; - if (!kvm_block_mapping_supported(addr, end, phys, level)) + if (!stage2_leaf_mapping_allowed(addr, end, level, data)) return -E2BIG; if (kvm_phys_is_valid(phys)) @@ -641,7 +708,7 @@ static int stage2_map_walk_table_pre(u64 addr, u64 end, u32 level, if (data->anchor) return 0; - if (!kvm_block_mapping_supported(addr, end, data->phys, level)) + if (!stage2_leaf_mapping_allowed(addr, end, level, data)) return 0; data->childp = kvm_pte_follow(*ptep, data->mm_ops); @@ -771,6 +838,7 @@ int kvm_pgtable_stage2_map(struct kvm_pgtable *pgt, u64 addr, u64 size, .mmu = pgt->mmu, .memcache = mc, .mm_ops = pgt->mm_ops, + .force_pte = pgt->force_pte_cb && pgt->force_pte_cb(addr, addr + size, prot), }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -802,6 +870,7 @@ int kvm_pgtable_stage2_set_owner(struct kvm_pgtable *pgt, u64 addr, u64 size, .memcache = mc, .mm_ops = pgt->mm_ops, .owner_id = owner_id, + .force_pte = true, }; struct kvm_pgtable_walker walker = { .cb = stage2_map_walker, @@ -995,6 +1064,9 @@ int kvm_pgtable_stage2_relax_perms(struct kvm_pgtable *pgt, u64 addr, u32 level; kvm_pte_t set = 0, clr = 0; + if (prot & KVM_PTE_LEAF_ATTR_HI_SW) + return -EINVAL; + if (prot & KVM_PGTABLE_PROT_R) set |= KVM_PTE_LEAF_ATTR_LO_S2_S2AP_R; @@ -1043,9 +1115,11 @@ int kvm_pgtable_stage2_flush(struct kvm_pgtable *pgt, u64 addr, u64 size) return kvm_pgtable_walk(pgt, addr, size, &walker); } -int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch, - struct kvm_pgtable_mm_ops *mm_ops, - enum kvm_pgtable_stage2_flags flags) + +int __kvm_pgtable_stage2_init(struct kvm_pgtable *pgt, struct kvm_arch *arch, + struct kvm_pgtable_mm_ops *mm_ops, + enum kvm_pgtable_stage2_flags flags, + kvm_pgtable_force_pte_cb_t force_pte_cb) { size_t pgd_sz; u64 vtcr = arch->vtcr; @@ -1063,6 +1137,7 @@ int kvm_pgtable_stage2_init_flags(struct kvm_pgtable *pgt, struct kvm_arch *arch pgt->mm_ops = mm_ops; pgt->mmu = &arch->mmu; pgt->flags = flags; + pgt->force_pte_cb = force_pte_cb; /* Ensure zeroed PGD pages are visible to the hardware walker */ dsb(ishst); @@ -1102,77 +1177,3 @@ void kvm_pgtable_stage2_destroy(struct kvm_pgtable *pgt) pgt->mm_ops->free_pages_exact(pgt->pgd, pgd_sz); pgt->pgd = NULL; } - -#define KVM_PTE_LEAF_S2_COMPAT_MASK (KVM_PTE_LEAF_ATTR_S2_PERMS | \ - KVM_PTE_LEAF_ATTR_LO_S2_MEMATTR | \ - KVM_PTE_LEAF_ATTR_S2_IGNORED) - -static int stage2_check_permission_walker(u64 addr, u64 end, u32 level, - kvm_pte_t *ptep, - enum kvm_pgtable_walk_flags flag, - void * const arg) -{ - kvm_pte_t old_attr, pte = *ptep, *new_attr = arg; - - /* - * Compatible mappings are either invalid and owned by the page-table - * owner (whose id is 0), or valid with matching permission attributes. - */ - if (kvm_pte_valid(pte)) { - old_attr = pte & KVM_PTE_LEAF_S2_COMPAT_MASK; - if (old_attr != *new_attr) - return -EEXIST; - } else if (pte) { - return -EEXIST; - } - - return 0; -} - -int kvm_pgtable_stage2_find_range(struct kvm_pgtable *pgt, u64 addr, - enum kvm_pgtable_prot prot, - struct kvm_mem_range *range) -{ - kvm_pte_t attr; - struct kvm_pgtable_walker check_perm_walker = { - .cb = stage2_check_permission_walker, - .flags = KVM_PGTABLE_WALK_LEAF, - .arg = &attr, - }; - u64 granule, start, end; - u32 level; - int ret; - - ret = stage2_set_prot_attr(pgt, prot, &attr); - if (ret) - return ret; - attr &= KVM_PTE_LEAF_S2_COMPAT_MASK; - - for (level = pgt->start_level; level < KVM_PGTABLE_MAX_LEVELS; level++) { - granule = kvm_granule_size(level); - start = ALIGN_DOWN(addr, granule); - end = start + granule; - - if (!kvm_level_supports_block_mapping(level)) - continue; - - if (start < range->start || range->end < end) - continue; - - /* - * Check the presence of existing mappings with incompatible - * permissions within the current block range, and try one level - * deeper if one is found. - */ - ret = kvm_pgtable_walk(pgt, start, granule, &check_perm_walker); - if (ret != -EEXIST) - break; - } - - if (!ret) { - range->start = start; - range->end = end; - } - - return ret; -} diff --git a/arch/arm64/kvm/hyp/vhe/debug-sr.c b/arch/arm64/kvm/hyp/vhe/debug-sr.c index f1e2e5a00933..289689b2682d 100644 --- a/arch/arm64/kvm/hyp/vhe/debug-sr.c +++ b/arch/arm64/kvm/hyp/vhe/debug-sr.c @@ -20,7 +20,7 @@ void __debug_switch_to_host(struct kvm_vcpu *vcpu) __debug_switch_to_host_common(vcpu); } -u32 __kvm_get_mdcr_el2(void) +u64 __kvm_get_mdcr_el2(void) { return read_sysreg(mdcr_el2); } diff --git a/arch/arm64/kvm/hyp/vhe/switch.c b/arch/arm64/kvm/hyp/vhe/switch.c index b3229924d243..ded2c66675f0 100644 --- a/arch/arm64/kvm/hyp/vhe/switch.c +++ b/arch/arm64/kvm/hyp/vhe/switch.c @@ -91,17 +91,9 @@ void activate_traps_vhe_load(struct kvm_vcpu *vcpu) __activate_traps_common(vcpu); } -void deactivate_traps_vhe_put(void) +void deactivate_traps_vhe_put(struct kvm_vcpu *vcpu) { - u64 mdcr_el2 = read_sysreg(mdcr_el2); - - mdcr_el2 &= MDCR_EL2_HPMN_MASK | - MDCR_EL2_E2PB_MASK << MDCR_EL2_E2PB_SHIFT | - MDCR_EL2_TPMS; - - write_sysreg(mdcr_el2, mdcr_el2); - - __deactivate_traps_common(); + __deactivate_traps_common(vcpu); } /* Switch to the guest for VHE systems running in EL2 */ @@ -124,11 +116,11 @@ static int __kvm_vcpu_run_vhe(struct kvm_vcpu *vcpu) * * We have already configured the guest's stage 1 translation in * kvm_vcpu_load_sysregs_vhe above. We must now call - * __load_guest_stage2 before __activate_traps, because - * __load_guest_stage2 configures stage 2 translation, and + * __load_stage2 before __activate_traps, because + * __load_stage2 configures stage 2 translation, and * __activate_traps clear HCR_EL2.TGE (among other things). */ - __load_guest_stage2(vcpu->arch.hw_mmu); + __load_stage2(vcpu->arch.hw_mmu, vcpu->arch.hw_mmu->arch); __activate_traps(vcpu); __kvm_adjust_pc(vcpu); diff --git a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c index 2a0b8c88d74f..007a12dd4351 100644 --- a/arch/arm64/kvm/hyp/vhe/sysreg-sr.c +++ b/arch/arm64/kvm/hyp/vhe/sysreg-sr.c @@ -101,7 +101,7 @@ void kvm_vcpu_put_sysregs_vhe(struct kvm_vcpu *vcpu) struct kvm_cpu_context *host_ctxt; host_ctxt = &this_cpu_ptr(&kvm_host_data)->host_ctxt; - deactivate_traps_vhe_put(); + deactivate_traps_vhe_put(vcpu); __sysreg_save_el1_state(guest_ctxt); __sysreg_save_user_state(guest_ctxt); diff --git a/arch/arm64/kvm/hyp/vhe/tlb.c b/arch/arm64/kvm/hyp/vhe/tlb.c index 66f17349f0c3..24cef9b87f9e 100644 --- a/arch/arm64/kvm/hyp/vhe/tlb.c +++ b/arch/arm64/kvm/hyp/vhe/tlb.c @@ -50,10 +50,10 @@ static void __tlb_switch_to_guest(struct kvm_s2_mmu *mmu, * * ARM erratum 1165522 requires some special handling (again), * as we need to make sure both stages of translation are in - * place before clearing TGE. __load_guest_stage2() already + * place before clearing TGE. __load_stage2() already * has an ISB in order to deal with this. */ - __load_guest_stage2(mmu); + __load_stage2(mmu, mmu->arch); val = read_sysreg(hcr_el2); val &= ~HCR_TGE; write_sysreg(val, hcr_el2); diff --git a/arch/arm64/kvm/mmu.c b/arch/arm64/kvm/mmu.c index 0625bf2353c2..1a94a7ca48f2 100644 --- a/arch/arm64/kvm/mmu.c +++ b/arch/arm64/kvm/mmu.c @@ -80,6 +80,7 @@ static bool memslot_is_logging(struct kvm_memory_slot *memslot) */ void kvm_flush_remote_tlbs(struct kvm *kvm) { + ++kvm->stat.generic.remote_tlb_flush_requests; kvm_call_hyp(__kvm_tlb_flush_vmid, &kvm->arch.mmu); } @@ -259,10 +260,8 @@ static int __create_hyp_mappings(unsigned long start, unsigned long size, { int err; - if (!kvm_host_owns_hyp_mappings()) { - return kvm_call_hyp_nvhe(__pkvm_create_mappings, - start, size, phys, prot); - } + if (WARN_ON(!kvm_host_owns_hyp_mappings())) + return -EINVAL; mutex_lock(&kvm_hyp_pgd_mutex); err = kvm_pgtable_hyp_map(hyp_pgtable, start, size, phys, prot); @@ -282,6 +281,21 @@ static phys_addr_t kvm_kaddr_to_phys(void *kaddr) } } +static int pkvm_share_hyp(phys_addr_t start, phys_addr_t end) +{ + phys_addr_t addr; + int ret; + + for (addr = ALIGN_DOWN(start, PAGE_SIZE); addr < end; addr += PAGE_SIZE) { + ret = kvm_call_hyp_nvhe(__pkvm_host_share_hyp, + __phys_to_pfn(addr)); + if (ret) + return ret; + } + + return 0; +} + /** * create_hyp_mappings - duplicate a kernel virtual address range in Hyp mode * @from: The virtual kernel start address of the range @@ -302,6 +316,13 @@ int create_hyp_mappings(void *from, void *to, enum kvm_pgtable_prot prot) if (is_kernel_in_hyp_mode()) return 0; + if (!kvm_host_owns_hyp_mappings()) { + if (WARN_ON(prot != PAGE_HYP)) + return -EPERM; + return pkvm_share_hyp(kvm_kaddr_to_phys(from), + kvm_kaddr_to_phys(to)); + } + start = start & PAGE_MASK; end = PAGE_ALIGN(end); @@ -433,6 +454,32 @@ int create_hyp_exec_mappings(phys_addr_t phys_addr, size_t size, return 0; } +static struct kvm_pgtable_mm_ops kvm_user_mm_ops = { + /* We shouldn't need any other callback to walk the PT */ + .phys_to_virt = kvm_host_va, +}; + +static int get_user_mapping_size(struct kvm *kvm, u64 addr) +{ + struct kvm_pgtable pgt = { + .pgd = (kvm_pte_t *)kvm->mm->pgd, + .ia_bits = VA_BITS, + .start_level = (KVM_PGTABLE_MAX_LEVELS - + CONFIG_PGTABLE_LEVELS), + .mm_ops = &kvm_user_mm_ops, + }; + kvm_pte_t pte = 0; /* Keep GCC quiet... */ + u32 level = ~0; + int ret; + + ret = kvm_pgtable_get_leaf(&pgt, addr, &pte, &level); + VM_BUG_ON(ret); + VM_BUG_ON(level >= KVM_PGTABLE_MAX_LEVELS); + VM_BUG_ON(!(pte & PTE_VALID)); + + return BIT(ARM64_HW_PGTABLE_LEVEL_SHIFT(level)); +} + static struct kvm_pgtable_mm_ops kvm_s2_mm_ops = { .zalloc_page = stage2_memcache_zalloc_page, .zalloc_pages_exact = kvm_host_zalloc_pages_exact, @@ -485,7 +532,7 @@ int kvm_init_stage2_mmu(struct kvm *kvm, struct kvm_s2_mmu *mmu) mmu->arch = &kvm->arch; mmu->pgt = pgt; mmu->pgd_phys = __pa(pgt->pgd); - mmu->vmid.vmid_gen = 0; + WRITE_ONCE(mmu->vmid.vmid_gen, 0); return 0; out_destroy_pgtable: @@ -780,7 +827,7 @@ static bool fault_supports_stage2_huge_mapping(struct kvm_memory_slot *memslot, * Returns the size of the mapping. */ static unsigned long -transparent_hugepage_adjust(struct kvm_memory_slot *memslot, +transparent_hugepage_adjust(struct kvm *kvm, struct kvm_memory_slot *memslot, unsigned long hva, kvm_pfn_t *pfnp, phys_addr_t *ipap) { @@ -791,8 +838,8 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot, * sure that the HVA and IPA are sufficiently aligned and that the * block map is contained within the memslot. */ - if (kvm_is_transparent_hugepage(pfn) && - fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE)) { + if (fault_supports_stage2_huge_mapping(memslot, hva, PMD_SIZE) && + get_user_mapping_size(kvm, hva) >= PMD_SIZE) { /* * The address we faulted on is backed by a transparent huge * page. However, because we map the compound huge page and @@ -814,7 +861,7 @@ transparent_hugepage_adjust(struct kvm_memory_slot *memslot, *ipap &= PMD_MASK; kvm_release_pfn_clean(pfn); pfn &= ~(PTRS_PER_PMD - 1); - kvm_get_pfn(pfn); + get_page(pfn_to_page(pfn)); *pfnp = pfn; return PMD_SIZE; @@ -1050,9 +1097,14 @@ static int user_mem_abort(struct kvm_vcpu *vcpu, phys_addr_t fault_ipa, * If we are not forced to use page mapping, check if we are * backed by a THP and thus use block mapping if possible. */ - if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) - vma_pagesize = transparent_hugepage_adjust(memslot, hva, - &pfn, &fault_ipa); + if (vma_pagesize == PAGE_SIZE && !(force_pte || device)) { + if (fault_status == FSC_PERM && fault_granule > PAGE_SIZE) + vma_pagesize = fault_granule; + else + vma_pagesize = transparent_hugepage_adjust(kvm, memslot, + hva, &pfn, + &fault_ipa); + } if (fault_status != FSC_PERM && !device && kvm_has_mte(kvm)) { /* Check the VMM hasn't introduced a new VM_SHARED VMA */ diff --git a/arch/arm64/kvm/perf.c b/arch/arm64/kvm/perf.c index 151c31fb9860..f9bb3b14130e 100644 --- a/arch/arm64/kvm/perf.c +++ b/arch/arm64/kvm/perf.c @@ -50,7 +50,7 @@ static struct perf_guest_info_callbacks kvm_guest_cbs = { int kvm_perf_init(void) { - if (kvm_pmu_probe_pmuver() != 0xf && !is_protected_kvm_enabled()) + if (kvm_pmu_probe_pmuver() != ID_AA64DFR0_PMUVER_IMP_DEF && !is_protected_kvm_enabled()) static_branch_enable(&kvm_arm_pmu_available); return perf_register_guest_info_callbacks(&kvm_guest_cbs); diff --git a/arch/arm64/kvm/pmu-emul.c b/arch/arm64/kvm/pmu-emul.c index f33825c995cb..f5065f23b413 100644 --- a/arch/arm64/kvm/pmu-emul.c +++ b/arch/arm64/kvm/pmu-emul.c @@ -373,7 +373,6 @@ static u64 kvm_pmu_overflow_status(struct kvm_vcpu *vcpu) reg = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); reg &= __vcpu_sys_reg(vcpu, PMINTENSET_EL1); - reg &= kvm_pmu_valid_counter_mask(vcpu); } return reg; @@ -564,20 +563,21 @@ void kvm_pmu_software_increment(struct kvm_vcpu *vcpu, u64 val) */ void kvm_pmu_handle_pmcr(struct kvm_vcpu *vcpu, u64 val) { - unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); int i; if (val & ARMV8_PMU_PMCR_E) { kvm_pmu_enable_counter_mask(vcpu, - __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask); + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } else { - kvm_pmu_disable_counter_mask(vcpu, mask); + kvm_pmu_disable_counter_mask(vcpu, + __vcpu_sys_reg(vcpu, PMCNTENSET_EL0)); } if (val & ARMV8_PMU_PMCR_C) kvm_pmu_set_counter_value(vcpu, ARMV8_PMU_CYCLE_IDX, 0); if (val & ARMV8_PMU_PMCR_P) { + unsigned long mask = kvm_pmu_valid_counter_mask(vcpu); mask &= ~BIT(ARMV8_PMU_CYCLE_IDX); for_each_set_bit(i, &mask, 32) kvm_pmu_set_counter_value(vcpu, i, 0); @@ -745,7 +745,7 @@ int kvm_pmu_probe_pmuver(void) struct perf_event_attr attr = { }; struct perf_event *event; struct arm_pmu *pmu; - int pmuver = 0xf; + int pmuver = ID_AA64DFR0_PMUVER_IMP_DEF; /* * Create a dummy event that only counts user cycles. As we'll never @@ -770,7 +770,7 @@ int kvm_pmu_probe_pmuver(void) if (IS_ERR(event)) { pr_err_once("kvm: pmu event creation failed %ld\n", PTR_ERR(event)); - return 0xf; + return ID_AA64DFR0_PMUVER_IMP_DEF; } if (event->pmu) { @@ -923,7 +923,7 @@ int kvm_arm_pmu_v3_set_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr) if (!vcpu->kvm->arch.pmuver) vcpu->kvm->arch.pmuver = kvm_pmu_probe_pmuver(); - if (vcpu->kvm->arch.pmuver == 0xf) + if (vcpu->kvm->arch.pmuver == ID_AA64DFR0_PMUVER_IMP_DEF) return -ENODEV; switch (attr->attr) { diff --git a/arch/arm64/kvm/psci.c b/arch/arm64/kvm/psci.c index db4056ecccfd..74c47d420253 100644 --- a/arch/arm64/kvm/psci.c +++ b/arch/arm64/kvm/psci.c @@ -59,6 +59,12 @@ static void kvm_psci_vcpu_off(struct kvm_vcpu *vcpu) kvm_vcpu_kick(vcpu); } +static inline bool kvm_psci_valid_affinity(struct kvm_vcpu *vcpu, + unsigned long affinity) +{ + return !(affinity & ~MPIDR_HWID_BITMASK); +} + static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) { struct vcpu_reset_state *reset_state; @@ -66,9 +72,9 @@ static unsigned long kvm_psci_vcpu_on(struct kvm_vcpu *source_vcpu) struct kvm_vcpu *vcpu = NULL; unsigned long cpu_id; - cpu_id = smccc_get_arg1(source_vcpu) & MPIDR_HWID_BITMASK; - if (vcpu_mode_is_32bit(source_vcpu)) - cpu_id &= ~((u32) 0); + cpu_id = smccc_get_arg1(source_vcpu); + if (!kvm_psci_valid_affinity(source_vcpu, cpu_id)) + return PSCI_RET_INVALID_PARAMS; vcpu = kvm_mpidr_to_vcpu(kvm, cpu_id); @@ -126,6 +132,9 @@ static unsigned long kvm_psci_vcpu_affinity_info(struct kvm_vcpu *vcpu) target_affinity = smccc_get_arg1(vcpu); lowest_affinity_level = smccc_get_arg2(vcpu); + if (!kvm_psci_valid_affinity(vcpu, target_affinity)) + return PSCI_RET_INVALID_PARAMS; + /* Determine target affinity mask */ target_affinity_mask = psci_affinity_mask(lowest_affinity_level); if (!target_affinity_mask) diff --git a/arch/arm64/kvm/reset.c b/arch/arm64/kvm/reset.c index cba7872d69a8..5ce36b0a3343 100644 --- a/arch/arm64/kvm/reset.c +++ b/arch/arm64/kvm/reset.c @@ -210,10 +210,16 @@ static bool vcpu_allowed_register_width(struct kvm_vcpu *vcpu) */ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) { + struct vcpu_reset_state reset_state; int ret; bool loaded; u32 pstate; + mutex_lock(&vcpu->kvm->lock); + reset_state = vcpu->arch.reset_state; + WRITE_ONCE(vcpu->arch.reset_state.reset, false); + mutex_unlock(&vcpu->kvm->lock); + /* Reset PMU outside of the non-preemptible section */ kvm_pmu_vcpu_reset(vcpu); @@ -276,8 +282,8 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) * Additional reset state handling that PSCI may have imposed on us. * Must be done after all the sys_reg reset. */ - if (vcpu->arch.reset_state.reset) { - unsigned long target_pc = vcpu->arch.reset_state.pc; + if (reset_state.reset) { + unsigned long target_pc = reset_state.pc; /* Gracefully handle Thumb2 entry point */ if (vcpu_mode_is_32bit(vcpu) && (target_pc & 1)) { @@ -286,13 +292,11 @@ int kvm_reset_vcpu(struct kvm_vcpu *vcpu) } /* Propagate caller endianness */ - if (vcpu->arch.reset_state.be) + if (reset_state.be) kvm_vcpu_set_be(vcpu); *vcpu_pc(vcpu) = target_pc; - vcpu_set_reg(vcpu, 0, vcpu->arch.reset_state.r0); - - vcpu->arch.reset_state.reset = false; + vcpu_set_reg(vcpu, 0, reset_state.r0); } /* Reset timer */ @@ -311,31 +315,26 @@ u32 get_kvm_ipa_limit(void) int kvm_set_ipa_limit(void) { - unsigned int parange, tgran_2; + unsigned int parange; u64 mmfr0; mmfr0 = read_sanitised_ftr_reg(SYS_ID_AA64MMFR0_EL1); parange = cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_PARANGE_SHIFT); + /* + * IPA size beyond 48 bits could not be supported + * on either 4K or 16K page size. Hence let's cap + * it to 48 bits, in case it's reported as larger + * on the system. + */ + if (PAGE_SIZE != SZ_64K) + parange = min(parange, (unsigned int)ID_AA64MMFR0_PARANGE_48); /* * Check with ARMv8.5-GTG that our PAGE_SIZE is supported at * Stage-2. If not, things will stop very quickly. */ - switch (PAGE_SIZE) { - default: - case SZ_4K: - tgran_2 = ID_AA64MMFR0_TGRAN4_2_SHIFT; - break; - case SZ_16K: - tgran_2 = ID_AA64MMFR0_TGRAN16_2_SHIFT; - break; - case SZ_64K: - tgran_2 = ID_AA64MMFR0_TGRAN64_2_SHIFT; - break; - } - - switch (cpuid_feature_extract_unsigned_field(mmfr0, tgran_2)) { + switch (cpuid_feature_extract_unsigned_field(mmfr0, ID_AA64MMFR0_TGRAN_2_SHIFT)) { case ID_AA64MMFR0_TGRAN_2_SUPPORTED_NONE: kvm_err("PAGE_SIZE not supported at Stage-2, giving up\n"); return -EINVAL; @@ -369,7 +368,7 @@ int kvm_arm_setup_stage2(struct kvm *kvm, unsigned long type) phys_shift = KVM_VM_TYPE_ARM_IPA_SIZE(type); if (phys_shift) { if (phys_shift > kvm_ipa_limit || - phys_shift < 32) + phys_shift < ARM64_MIN_PARANGE_BITS) return -EINVAL; } else { phys_shift = KVM_PHYS_SHIFT; diff --git a/arch/arm64/kvm/sys_regs.c b/arch/arm64/kvm/sys_regs.c index f6f126eb6ac1..1d46e185f31e 100644 --- a/arch/arm64/kvm/sys_regs.c +++ b/arch/arm64/kvm/sys_regs.c @@ -44,10 +44,6 @@ * 64bit interface. */ -#define reg_to_encoding(x) \ - sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ - (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) - static bool read_from_write_only(struct kvm_vcpu *vcpu, struct sys_reg_params *params, const struct sys_reg_desc *r) @@ -318,14 +314,14 @@ static bool trap_dbgauthstatus_el1(struct kvm_vcpu *vcpu, /* * We want to avoid world-switching all the DBG registers all the * time: - * + * * - If we've touched any debug register, it is likely that we're * going to touch more of them. It then makes sense to disable the * traps and start doing the save/restore dance * - If debug is active (DBG_MDSCR_KDE or DBG_MDSCR_MDE set), it is * then mandatory to save/restore the registers, as the guest * depends on them. - * + * * For this, we use a DIRTY bit, indicating the guest has modified the * debug registers, used as follow: * @@ -603,6 +599,41 @@ static unsigned int pmu_visibility(const struct kvm_vcpu *vcpu, return REG_HIDDEN; } +static void reset_pmu_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + u64 n, mask = BIT(ARMV8_PMU_CYCLE_IDX); + + /* No PMU available, any PMU reg may UNDEF... */ + if (!kvm_arm_support_pmu_v3()) + return; + + n = read_sysreg(pmcr_el0) >> ARMV8_PMU_PMCR_N_SHIFT; + n &= ARMV8_PMU_PMCR_N_MASK; + if (n) + mask |= GENMASK(n - 1, 0); + + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= mask; +} + +static void reset_pmevcntr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= GENMASK(31, 0); +} + +static void reset_pmevtyper(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_EVTYPE_MASK; +} + +static void reset_pmselr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) +{ + reset_unknown(vcpu, r); + __vcpu_sys_reg(vcpu, r->reg) &= ARMV8_PMU_COUNTER_MASK; +} + static void reset_pmcr(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) { u64 pmcr, val; @@ -845,7 +876,7 @@ static bool access_pmcnten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, kvm_pmu_disable_counter_mask(vcpu, val); } } else { - p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMCNTENSET_EL0); } return true; @@ -869,7 +900,7 @@ static bool access_pminten(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMINTENCLR_EL1 */ __vcpu_sys_reg(vcpu, PMINTENSET_EL1) &= ~val; } else { - p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMINTENSET_EL1); } return true; @@ -891,7 +922,7 @@ static bool access_pmovs(struct kvm_vcpu *vcpu, struct sys_reg_params *p, /* accessing PMOVSCLR_EL0 */ __vcpu_sys_reg(vcpu, PMOVSSET_EL0) &= ~(p->regval & mask); } else { - p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0) & mask; + p->regval = __vcpu_sys_reg(vcpu, PMOVSSET_EL0); } return true; @@ -944,16 +975,18 @@ static bool access_pmuserenr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, trap_wcr, reset_wcr, 0, 0, get_wcr, set_wcr } #define PMU_SYS_REG(r) \ - SYS_DESC(r), .reset = reset_unknown, .visibility = pmu_visibility + SYS_DESC(r), .reset = reset_pmu_reg, .visibility = pmu_visibility /* Macro to expand the PMEVCNTRn_EL0 register */ #define PMU_PMEVCNTR_EL0(n) \ { PMU_SYS_REG(SYS_PMEVCNTRn_EL0(n)), \ + .reset = reset_pmevcntr, \ .access = access_pmu_evcntr, .reg = (PMEVCNTR0_EL0 + n), } /* Macro to expand the PMEVTYPERn_EL0 register */ #define PMU_PMEVTYPER_EL0(n) \ { PMU_SYS_REG(SYS_PMEVTYPERn_EL0(n)), \ + .reset = reset_pmevtyper, \ .access = access_pmu_evtyper, .reg = (PMEVTYPER0_EL0 + n), } static bool undef_access(struct kvm_vcpu *vcpu, struct sys_reg_params *p, @@ -1026,8 +1059,6 @@ static bool access_arch_timer(struct kvm_vcpu *vcpu, return true; } -#define FEATURE(x) (GENMASK_ULL(x##_SHIFT + 3, x##_SHIFT)) - /* Read a sanitised cpufeature ID register by sys_reg_desc */ static u64 read_id_reg(const struct kvm_vcpu *vcpu, struct sys_reg_desc const *r, bool raz) @@ -1038,40 +1069,40 @@ static u64 read_id_reg(const struct kvm_vcpu *vcpu, switch (id) { case SYS_ID_AA64PFR0_EL1: if (!vcpu_has_sve(vcpu)) - val &= ~FEATURE(ID_AA64PFR0_SVE); - val &= ~FEATURE(ID_AA64PFR0_AMU); - val &= ~FEATURE(ID_AA64PFR0_CSV2); - val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); - val &= ~FEATURE(ID_AA64PFR0_CSV3); - val |= FIELD_PREP(FEATURE(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_SVE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_AMU); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV2), (u64)vcpu->kvm->arch.pfr0_csv2); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR0_CSV3), (u64)vcpu->kvm->arch.pfr0_csv3); break; case SYS_ID_AA64PFR1_EL1: - val &= ~FEATURE(ID_AA64PFR1_MTE); + val &= ~ARM64_FEATURE_MASK(ID_AA64PFR1_MTE); if (kvm_has_mte(vcpu->kvm)) { u64 pfr, mte; pfr = read_sanitised_ftr_reg(SYS_ID_AA64PFR1_EL1); mte = cpuid_feature_extract_unsigned_field(pfr, ID_AA64PFR1_MTE_SHIFT); - val |= FIELD_PREP(FEATURE(ID_AA64PFR1_MTE), mte); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64PFR1_MTE), mte); } break; case SYS_ID_AA64ISAR1_EL1: if (!vcpu_has_ptrauth(vcpu)) - val &= ~(FEATURE(ID_AA64ISAR1_APA) | - FEATURE(ID_AA64ISAR1_API) | - FEATURE(ID_AA64ISAR1_GPA) | - FEATURE(ID_AA64ISAR1_GPI)); + val &= ~(ARM64_FEATURE_MASK(ID_AA64ISAR1_APA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_API) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPA) | + ARM64_FEATURE_MASK(ID_AA64ISAR1_GPI)); break; case SYS_ID_AA64DFR0_EL1: /* Limit debug to ARMv8.0 */ - val &= ~FEATURE(ID_AA64DFR0_DEBUGVER); - val |= FIELD_PREP(FEATURE(ID_AA64DFR0_DEBUGVER), 6); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER); + val |= FIELD_PREP(ARM64_FEATURE_MASK(ID_AA64DFR0_DEBUGVER), 6); /* Limit guests to PMUv3 for ARMv8.4 */ val = cpuid_feature_cap_perfmon_field(val, ID_AA64DFR0_PMUVER_SHIFT, kvm_vcpu_has_pmu(vcpu) ? ID_AA64DFR0_PMUVER_8_4 : 0); /* Hide SPE from guests */ - val &= ~FEATURE(ID_AA64DFR0_PMSVER); + val &= ~ARM64_FEATURE_MASK(ID_AA64DFR0_PMSVER); break; case SYS_ID_DFR0_EL1: /* Limit guests to PMUv3 for ARMv8.4 */ @@ -1249,6 +1280,20 @@ static int set_raz_id_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, return __set_id_reg(vcpu, rd, uaddr, true); } +static int set_wi_reg(struct kvm_vcpu *vcpu, const struct sys_reg_desc *rd, + const struct kvm_one_reg *reg, void __user *uaddr) +{ + int err; + u64 val; + + /* Perform the access even if we are going to ignore the value */ + err = reg_from_user(&val, uaddr, sys_reg_to_index(rd)); + if (err) + return err; + + return 0; +} + static bool access_ctr(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r) { @@ -1592,16 +1637,21 @@ static const struct sys_reg_desc sys_reg_descs[] = { .access = access_pmcnten, .reg = PMCNTENSET_EL0 }, { PMU_SYS_REG(SYS_PMOVSCLR_EL0), .access = access_pmovs, .reg = PMOVSSET_EL0 }, + /* + * PM_SWINC_EL0 is exposed to userspace as RAZ/WI, as it was + * previously (and pointlessly) advertised in the past... + */ { PMU_SYS_REG(SYS_PMSWINC_EL0), - .access = access_pmswinc, .reg = PMSWINC_EL0 }, + .get_user = get_raz_id_reg, .set_user = set_wi_reg, + .access = access_pmswinc, .reset = NULL }, { PMU_SYS_REG(SYS_PMSELR_EL0), - .access = access_pmselr, .reg = PMSELR_EL0 }, + .access = access_pmselr, .reset = reset_pmselr, .reg = PMSELR_EL0 }, { PMU_SYS_REG(SYS_PMCEID0_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCEID1_EL0), .access = access_pmceid, .reset = NULL }, { PMU_SYS_REG(SYS_PMCCNTR_EL0), - .access = access_pmu_evcntr, .reg = PMCCNTR_EL0 }, + .access = access_pmu_evcntr, .reset = reset_unknown, .reg = PMCCNTR_EL0 }, { PMU_SYS_REG(SYS_PMXEVTYPER_EL0), .access = access_pmu_evtyper, .reset = NULL }, { PMU_SYS_REG(SYS_PMXEVCNTR_EL0), @@ -2106,23 +2156,6 @@ static int check_sysreg_table(const struct sys_reg_desc *table, unsigned int n, return 0; } -static int match_sys_reg(const void *key, const void *elt) -{ - const unsigned long pval = (unsigned long)key; - const struct sys_reg_desc *r = elt; - - return pval - reg_to_encoding(r); -} - -static const struct sys_reg_desc *find_reg(const struct sys_reg_params *params, - const struct sys_reg_desc table[], - unsigned int num) -{ - unsigned long pval = reg_to_encoding(params); - - return bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); -} - int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu) { kvm_inject_undefined(vcpu); @@ -2365,13 +2398,8 @@ int kvm_handle_sys_reg(struct kvm_vcpu *vcpu) trace_kvm_handle_sys_reg(esr); - params.Op0 = (esr >> 20) & 3; - params.Op1 = (esr >> 14) & 0x7; - params.CRn = (esr >> 10) & 0xf; - params.CRm = (esr >> 1) & 0xf; - params.Op2 = (esr >> 17) & 0x7; + params = esr_sys64_to_params(esr); params.regval = vcpu_get_reg(vcpu, Rt); - params.is_write = !(esr & 1); ret = emulate_sys_reg(vcpu, ¶ms); diff --git a/arch/arm64/kvm/sys_regs.h b/arch/arm64/kvm/sys_regs.h index 9d0621417c2a..cc0cc95a0280 100644 --- a/arch/arm64/kvm/sys_regs.h +++ b/arch/arm64/kvm/sys_regs.h @@ -11,6 +11,12 @@ #ifndef __ARM64_KVM_SYS_REGS_LOCAL_H__ #define __ARM64_KVM_SYS_REGS_LOCAL_H__ +#include + +#define reg_to_encoding(x) \ + sys_reg((u32)(x)->Op0, (u32)(x)->Op1, \ + (u32)(x)->CRn, (u32)(x)->CRm, (u32)(x)->Op2) + struct sys_reg_params { u8 Op0; u8 Op1; @@ -21,6 +27,14 @@ struct sys_reg_params { bool is_write; }; +#define esr_sys64_to_params(esr) \ + ((struct sys_reg_params){ .Op0 = ((esr) >> 20) & 3, \ + .Op1 = ((esr) >> 14) & 0x7, \ + .CRn = ((esr) >> 10) & 0xf, \ + .CRm = ((esr) >> 1) & 0xf, \ + .Op2 = ((esr) >> 17) & 0x7, \ + .is_write = !((esr) & 1) }) + struct sys_reg_desc { /* Sysreg string for debug */ const char *name; @@ -152,6 +166,23 @@ static inline int cmp_sys_reg(const struct sys_reg_desc *i1, return i1->Op2 - i2->Op2; } +static inline int match_sys_reg(const void *key, const void *elt) +{ + const unsigned long pval = (unsigned long)key; + const struct sys_reg_desc *r = elt; + + return pval - reg_to_encoding(r); +} + +static inline const struct sys_reg_desc * +find_reg(const struct sys_reg_params *params, const struct sys_reg_desc table[], + unsigned int num) +{ + unsigned long pval = reg_to_encoding(params); + + return __inline_bsearch((void *)pval, table, num, sizeof(table[0]), match_sys_reg); +} + const struct sys_reg_desc *find_reg_by_id(u64 id, struct sys_reg_params *params, const struct sys_reg_desc table[], diff --git a/arch/arm64/kvm/trace_handle_exit.h b/arch/arm64/kvm/trace_handle_exit.h index 8d78acc4fba7..064a58c19f48 100644 --- a/arch/arm64/kvm/trace_handle_exit.h +++ b/arch/arm64/kvm/trace_handle_exit.h @@ -78,13 +78,17 @@ TRACE_EVENT(kvm_arm_clear_debug, TP_printk("flags: 0x%08x", __entry->guest_debug) ); +/* + * The dreg32 name is a leftover from a distant past. This will really + * output a 64bit value... + */ TRACE_EVENT(kvm_arm_set_dreg32, - TP_PROTO(const char *name, __u32 value), + TP_PROTO(const char *name, __u64 value), TP_ARGS(name, value), TP_STRUCT__entry( __field(const char *, name) - __field(__u32, value) + __field(__u64, value) ), TP_fast_assign( @@ -92,7 +96,7 @@ TRACE_EVENT(kvm_arm_set_dreg32, __entry->value = value; ), - TP_printk("%s: 0x%08x", __entry->name, __entry->value) + TP_printk("%s: 0x%llx", __entry->name, __entry->value) ); TRACE_DEFINE_SIZEOF(__u64); diff --git a/arch/arm64/kvm/vgic/vgic-mmio-v2.c b/arch/arm64/kvm/vgic/vgic-mmio-v2.c index a016f07adc28..5f9014ae595b 100644 --- a/arch/arm64/kvm/vgic/vgic-mmio-v2.c +++ b/arch/arm64/kvm/vgic/vgic-mmio-v2.c @@ -282,7 +282,7 @@ static unsigned long vgic_mmio_read_vcpuif(struct kvm_vcpu *vcpu, case GIC_CPU_PRIMASK: /* * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than + * PMR field as GICH_VMCR.VMPriMask rather than * GICC_PMR.Priority, so we expose the upper five bits of * priority mask to userspace using the lower bits in the * unsigned long. @@ -329,7 +329,7 @@ static void vgic_mmio_write_vcpuif(struct kvm_vcpu *vcpu, case GIC_CPU_PRIMASK: /* * Our KVM_DEV_TYPE_ARM_VGIC_V2 device ABI exports the - * the PMR field as GICH_VMCR.VMPriMask rather than + * PMR field as GICH_VMCR.VMPriMask rather than * GICC_PMR.Priority, so we expose the upper five bits of * priority mask to userspace using the lower bits in the * unsigned long. diff --git a/arch/arm64/kvm/vgic/vgic-v2.c b/arch/arm64/kvm/vgic/vgic-v2.c index 2c580204f1dc..95a18cec14a3 100644 --- a/arch/arm64/kvm/vgic/vgic-v2.c +++ b/arch/arm64/kvm/vgic/vgic-v2.c @@ -60,6 +60,7 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) u32 val = cpuif->vgic_lr[lr]; u32 cpuid, intid = val & GICH_LR_VIRTUALID; struct vgic_irq *irq; + bool deactivated; /* Extract the source vCPU id from the LR */ cpuid = val & GICH_LR_PHYSID_CPUID; @@ -75,7 +76,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit */ + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & GICH_LR_ACTIVE_BIT); irq->active = !!(val & GICH_LR_ACTIVE_BIT); if (irq->active && vgic_irq_is_sgi(intid)) @@ -96,36 +98,8 @@ void vgic_v2_fold_lr_state(struct kvm_vcpu *vcpu) if (irq->config == VGIC_CONFIG_LEVEL && !(val & GICH_LR_STATE)) irq->pending_latch = false; - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - * - * Another case is when the interrupt requires a helping hand - * on deactivation (no HW deactivation, for example). - */ - if (vgic_irq_is_mapped_level(irq)) { - bool resample = false; - - if (val & GICH_LR_PENDING_BIT) { - irq->line_level = vgic_get_phys_line_level(irq); - resample = !irq->line_level; - } else if (vgic_irq_needs_resampling(irq) && - !(irq->active || irq->pending_latch)) { - resample = true; - } - - if (resample) - vgic_irq_set_phys_active(irq, false); - } + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & GICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/arch/arm64/kvm/vgic/vgic-v3.c b/arch/arm64/kvm/vgic/vgic-v3.c index 66004f61cd83..21a6207fb2ee 100644 --- a/arch/arm64/kvm/vgic/vgic-v3.c +++ b/arch/arm64/kvm/vgic/vgic-v3.c @@ -46,6 +46,7 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) u32 intid, cpuid; struct vgic_irq *irq; bool is_v2_sgi = false; + bool deactivated; cpuid = val & GICH_LR_PHYSID_CPUID; cpuid >>= GICH_LR_PHYSID_CPUID_SHIFT; @@ -68,7 +69,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) raw_spin_lock(&irq->irq_lock); - /* Always preserve the active bit */ + /* Always preserve the active bit, note deactivation */ + deactivated = irq->active && !(val & ICH_LR_ACTIVE_BIT); irq->active = !!(val & ICH_LR_ACTIVE_BIT); if (irq->active && is_v2_sgi) @@ -89,36 +91,8 @@ void vgic_v3_fold_lr_state(struct kvm_vcpu *vcpu) if (irq->config == VGIC_CONFIG_LEVEL && !(val & ICH_LR_STATE)) irq->pending_latch = false; - /* - * Level-triggered mapped IRQs are special because we only - * observe rising edges as input to the VGIC. - * - * If the guest never acked the interrupt we have to sample - * the physical line and set the line level, because the - * device state could have changed or we simply need to - * process the still pending interrupt later. - * - * If this causes us to lower the level, we have to also clear - * the physical active state, since we will otherwise never be - * told when the interrupt becomes asserted again. - * - * Another case is when the interrupt requires a helping hand - * on deactivation (no HW deactivation, for example). - */ - if (vgic_irq_is_mapped_level(irq)) { - bool resample = false; - - if (val & ICH_LR_PENDING_BIT) { - irq->line_level = vgic_get_phys_line_level(irq); - resample = !irq->line_level; - } else if (vgic_irq_needs_resampling(irq) && - !(irq->active || irq->pending_latch)) { - resample = true; - } - - if (resample) - vgic_irq_set_phys_active(irq, false); - } + /* Handle resampling for mapped interrupts if required */ + vgic_irq_handle_resampling(irq, deactivated, val & ICH_LR_PENDING_BIT); raw_spin_unlock(&irq->irq_lock); vgic_put_irq(vcpu->kvm, irq); diff --git a/arch/arm64/kvm/vgic/vgic.c b/arch/arm64/kvm/vgic/vgic.c index 111bff47e471..5dad4996cfb2 100644 --- a/arch/arm64/kvm/vgic/vgic.c +++ b/arch/arm64/kvm/vgic/vgic.c @@ -106,7 +106,6 @@ struct vgic_irq *vgic_get_irq(struct kvm *kvm, struct kvm_vcpu *vcpu, if (intid >= VGIC_MIN_LPI) return vgic_get_lpi(kvm, intid); - WARN(1, "Looking up struct vgic_irq for reserved INTID"); return NULL; } @@ -1022,3 +1021,41 @@ bool kvm_vgic_map_is_active(struct kvm_vcpu *vcpu, unsigned int vintid) return map_is_active; } + +/* + * Level-triggered mapped IRQs are special because we only observe rising + * edges as input to the VGIC. + * + * If the guest never acked the interrupt we have to sample the physical + * line and set the line level, because the device state could have changed + * or we simply need to process the still pending interrupt later. + * + * We could also have entered the guest with the interrupt active+pending. + * On the next exit, we need to re-evaluate the pending state, as it could + * otherwise result in a spurious interrupt by injecting a now potentially + * stale pending state. + * + * If this causes us to lower the level, we have to also clear the physical + * active state, since we will otherwise never be told when the interrupt + * becomes asserted again. + * + * Another case is when the interrupt requires a helping hand on + * deactivation (no HW deactivation, for example). + */ +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending) +{ + if (vgic_irq_is_mapped_level(irq)) { + bool resample = false; + + if (unlikely(vgic_irq_needs_resampling(irq))) { + resample = !(irq->active || irq->pending_latch); + } else if (lr_pending || (lr_deactivated && irq->line_level)) { + irq->line_level = vgic_get_phys_line_level(irq); + resample = !irq->line_level; + } + + if (resample) + vgic_irq_set_phys_active(irq, false); + } +} diff --git a/arch/arm64/kvm/vgic/vgic.h b/arch/arm64/kvm/vgic/vgic.h index dc1f3d1657ee..14a9218641f5 100644 --- a/arch/arm64/kvm/vgic/vgic.h +++ b/arch/arm64/kvm/vgic/vgic.h @@ -169,6 +169,8 @@ void vgic_irq_set_phys_active(struct vgic_irq *irq, bool active); bool vgic_queue_irq_unlock(struct kvm *kvm, struct vgic_irq *irq, unsigned long flags); void vgic_kick_vcpus(struct kvm *kvm); +void vgic_irq_handle_resampling(struct vgic_irq *irq, + bool lr_deactivated, bool lr_pending); int vgic_check_ioaddr(struct kvm *kvm, phys_addr_t *ioaddr, phys_addr_t addr, phys_addr_t alignment); diff --git a/arch/mips/kvm/mips.c b/arch/mips/kvm/mips.c index af9dd029a4e1..75c6f264c626 100644 --- a/arch/mips/kvm/mips.c +++ b/arch/mips/kvm/mips.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { KVM_GENERIC_VM_STATS() }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -85,8 +83,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, vz_cpucfg_exits), #endif }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/mips/kvm/vz.c b/arch/mips/kvm/vz.c index 43cad10b877d..4adca5abbc72 100644 --- a/arch/mips/kvm/vz.c +++ b/arch/mips/kvm/vz.c @@ -388,7 +388,6 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, u32 compare, u32 cause) { u32 start_count, after_count; - ktime_t freeze_time; unsigned long flags; /* @@ -396,7 +395,7 @@ static void _kvm_vz_restore_htimer(struct kvm_vcpu *vcpu, * this with interrupts disabled to avoid latency. */ local_irq_save(flags); - freeze_time = kvm_mips_freeze_hrtimer(vcpu, &start_count); + kvm_mips_freeze_hrtimer(vcpu, &start_count); write_c0_gtoffset(start_count - read_c0_count()); local_irq_restore(flags); diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index a779f7849cfb..080a7feb7731 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -103,7 +103,6 @@ struct kvm_vcpu_stat { u64 emulated_inst_exits; u64 dec_exits; u64 ext_intr_exits; - u64 halt_wait_ns; u64 halt_successful_wait; u64 dbell_exits; u64 gdbell_exits; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 79833f78d1da..b785f6772391 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -43,8 +43,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -71,7 +69,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, emulated_inst_exits), STATS_DESC_COUNTER(VCPU, dec_exits), STATS_DESC_COUNTER(VCPU, ext_intr_exits), - STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), STATS_DESC_COUNTER(VCPU, halt_successful_wait), STATS_DESC_COUNTER(VCPU, dbell_exits), STATS_DESC_COUNTER(VCPU, gdbell_exits), @@ -88,8 +85,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 8da93fdfa59e..6365087f3160 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -346,7 +346,7 @@ static long kvmppc_tce_to_ua(struct kvm *kvm, unsigned long tce, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c b/arch/powerpc/kvm/book3s_64_vio_hv.c index 636c6ae0939b..870b7f0c7ea5 100644 --- a/arch/powerpc/kvm/book3s_64_vio_hv.c +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -80,7 +80,7 @@ static long kvmppc_rm_tce_to_ua(struct kvm *kvm, unsigned long gfn = tce >> PAGE_SHIFT; struct kvm_memory_slot *memslot; - memslot = search_memslots(kvm_memslots_raw(kvm), gfn); + memslot = __gfn_to_memslot(kvm_memslots_raw(kvm), gfn); if (!memslot) return -EINVAL; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index bb0dacf7cbec..2acb1c96cfaf 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4202,19 +4202,31 @@ out: /* Attribute wait time */ if (do_sleep) { - vc->runner->stat.halt_wait_ns += + vc->runner->stat.generic.halt_wait_ns += ktime_to_ns(cur) - ktime_to_ns(start_wait); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(start_wait)); /* Attribute failed poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_fail_ns += ktime_to_ns(start_wait) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_fail_hist, + ktime_to_ns(start_wait) - + ktime_to_ns(start_poll)); + } } else { /* Attribute successful poll time */ - if (vc->halt_poll_ns) + if (vc->halt_poll_ns) { vc->runner->stat.generic.halt_poll_success_ns += ktime_to_ns(cur) - ktime_to_ns(start_poll); + KVM_STATS_LOG_HIST_UPDATE( + vc->runner->stat.generic.halt_poll_success_hist, + ktime_to_ns(cur) - ktime_to_ns(start_poll)); + } } /* Adjust poll time */ diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c index 551b30d84aee..977801c83aff 100644 --- a/arch/powerpc/kvm/booke.c +++ b/arch/powerpc/kvm/booke.c @@ -41,8 +41,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_ICOUNTER(VM, num_2M_pages), STATS_DESC_ICOUNTER(VM, num_1G_pages) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -69,7 +67,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, emulated_inst_exits), STATS_DESC_COUNTER(VCPU, dec_exits), STATS_DESC_COUNTER(VCPU, ext_intr_exits), - STATS_DESC_TIME_NSEC(VCPU, halt_wait_ns), STATS_DESC_COUNTER(VCPU, halt_successful_wait), STATS_DESC_COUNTER(VCPU, dbell_exits), STATS_DESC_COUNTER(VCPU, gdbell_exits), @@ -79,8 +76,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, pthru_host), STATS_DESC_COUNTER(VCPU, pthru_bad_aff) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h index d681ae462350..a604d51acfc8 100644 --- a/arch/s390/include/asm/kvm_host.h +++ b/arch/s390/include/asm/kvm_host.h @@ -244,6 +244,7 @@ struct kvm_s390_sie_block { __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 +#define ECB_SPECI 0x08 #define ECB_SRSI 0x04 #define ECB_HOSTPROTINT 0x02 __u8 ecb; /* 0x0061 */ @@ -955,6 +956,7 @@ struct kvm_arch{ atomic64_t cmma_dirty_pages; /* subset of available cpu features enabled by user space */ DECLARE_BITMAP(cpu_feat, KVM_S390_VM_CPU_FEAT_NR_BITS); + /* indexed by vcpu_idx */ DECLARE_BITMAP(idle_mask, KVM_MAX_VCPUS); struct kvm_s390_gisa_interrupt gisa_int; struct kvm_s390_pv pv; diff --git a/arch/s390/kvm/interrupt.c b/arch/s390/kvm/interrupt.c index d548d60caed2..16256e17a544 100644 --- a/arch/s390/kvm/interrupt.c +++ b/arch/s390/kvm/interrupt.c @@ -419,13 +419,13 @@ static unsigned long deliverable_irqs(struct kvm_vcpu *vcpu) static void __set_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_set_cpuflags(vcpu, CPUSTAT_WAIT); - set_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + set_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __unset_cpu_idle(struct kvm_vcpu *vcpu) { kvm_s390_clear_cpuflags(vcpu, CPUSTAT_WAIT); - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static void __reset_intercept_indicators(struct kvm_vcpu *vcpu) @@ -3050,18 +3050,18 @@ int kvm_s390_get_irq_state(struct kvm_vcpu *vcpu, __u8 __user *buf, int len) static void __airqs_kick_single_vcpu(struct kvm *kvm, u8 deliverable_mask) { - int vcpu_id, online_vcpus = atomic_read(&kvm->online_vcpus); + int vcpu_idx, online_vcpus = atomic_read(&kvm->online_vcpus); struct kvm_s390_gisa_interrupt *gi = &kvm->arch.gisa_int; struct kvm_vcpu *vcpu; - for_each_set_bit(vcpu_id, kvm->arch.idle_mask, online_vcpus) { - vcpu = kvm_get_vcpu(kvm, vcpu_id); + for_each_set_bit(vcpu_idx, kvm->arch.idle_mask, online_vcpus) { + vcpu = kvm_get_vcpu(kvm, vcpu_idx); if (psw_ioint_disabled(vcpu)) continue; deliverable_mask &= (u8)(vcpu->arch.sie_block->gcr[6] >> 24); if (deliverable_mask) { /* lately kicked but not yet running */ - if (test_and_set_bit(vcpu_id, gi->kicked_mask)) + if (test_and_set_bit(vcpu_idx, gi->kicked_mask)) return; kvm_s390_vcpu_wakeup(vcpu); return; diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c index efda0615741f..752a0ffab9bf 100644 --- a/arch/s390/kvm/kvm-s390.c +++ b/arch/s390/kvm/kvm-s390.c @@ -66,8 +66,6 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, inject_service_signal), STATS_DESC_COUNTER(VM, inject_virtio) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -174,8 +172,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, instruction_diagnose_other), STATS_DESC_COUNTER(VCPU, pfault_sync) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -1953,7 +1949,7 @@ out: static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); + int slot = atomic_read(&slots->last_used_slot); struct kvm_memory_slot *memslots = slots->memslots; if (gfn >= memslots[slot].base_gfn && @@ -1974,7 +1970,7 @@ static int gfn_to_memslot_approx(struct kvm_memslots *slots, gfn_t gfn) if (gfn >= memslots[start].base_gfn && gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); + atomic_set(&slots->last_used_slot, start); } return start; @@ -3224,6 +3220,8 @@ static int kvm_s390_vcpu_setup(struct kvm_vcpu *vcpu) vcpu->arch.sie_block->ecb |= ECB_SRSI; if (test_kvm_facility(vcpu->kvm, 73)) vcpu->arch.sie_block->ecb |= ECB_TE; + if (!kvm_is_ucontrol(vcpu->kvm)) + vcpu->arch.sie_block->ecb |= ECB_SPECI; if (test_kvm_facility(vcpu->kvm, 8) && vcpu->kvm->arch.use_pfmfi) vcpu->arch.sie_block->ecb2 |= ECB2_PFMFI; @@ -4068,7 +4066,7 @@ static int vcpu_pre_run(struct kvm_vcpu *vcpu) kvm_s390_patch_guest_per_regs(vcpu); } - clear_bit(vcpu->vcpu_id, vcpu->kvm->arch.gisa_int.kicked_mask); + clear_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.gisa_int.kicked_mask); vcpu->arch.sie_block->icptcode = 0; cpuflags = atomic_read(&vcpu->arch.sie_block->cpuflags); diff --git a/arch/s390/kvm/kvm-s390.h b/arch/s390/kvm/kvm-s390.h index 9fad25109b0d..ecd741ee3276 100644 --- a/arch/s390/kvm/kvm-s390.h +++ b/arch/s390/kvm/kvm-s390.h @@ -79,7 +79,7 @@ static inline int is_vcpu_stopped(struct kvm_vcpu *vcpu) static inline int is_vcpu_idle(struct kvm_vcpu *vcpu) { - return test_bit(vcpu->vcpu_id, vcpu->kvm->arch.idle_mask); + return test_bit(kvm_vcpu_get_idx(vcpu), vcpu->kvm->arch.idle_mask); } static inline int kvm_is_ucontrol(struct kvm *kvm) diff --git a/arch/s390/kvm/vsie.c b/arch/s390/kvm/vsie.c index 4002a24bc43a..acda4b6fc851 100644 --- a/arch/s390/kvm/vsie.c +++ b/arch/s390/kvm/vsie.c @@ -510,6 +510,8 @@ static int shadow_scb(struct kvm_vcpu *vcpu, struct vsie_page *vsie_page) prefix_unmapped(vsie_page); scb_s->ecb |= ECB_TE; } + /* specification exception interpretation */ + scb_s->ecb |= scb_o->ecb & ECB_SPECI; /* branch prediction */ if (test_kvm_facility(vcpu->kvm, 82)) scb_s->fpf |= scb_o->fpf & FPF_BPBC; diff --git a/arch/x86/include/asm/kvm-x86-ops.h b/arch/x86/include/asm/kvm-x86-ops.h index a12a4987154e..cefe1d81e2e8 100644 --- a/arch/x86/include/asm/kvm-x86-ops.h +++ b/arch/x86/include/asm/kvm-x86-ops.h @@ -72,7 +72,6 @@ KVM_X86_OP(enable_nmi_window) KVM_X86_OP(enable_irq_window) KVM_X86_OP(update_cr8_intercept) KVM_X86_OP(check_apicv_inhibit_reasons) -KVM_X86_OP_NULL(pre_update_apicv_exec_ctrl) KVM_X86_OP(refresh_apicv_exec_ctrl) KVM_X86_OP(hwapic_irr_update) KVM_X86_OP(hwapic_isr_update) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index af6ce8d4c86a..f8f48a7ec577 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -37,9 +37,21 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS -#define KVM_MAX_VCPUS 288 -#define KVM_SOFT_MAX_VCPUS 240 -#define KVM_MAX_VCPU_ID 1023 +#define KVM_MAX_VCPUS 1024 +#define KVM_SOFT_MAX_VCPUS 710 + +/* + * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs + * might be larger than the actual number of VCPUs because the + * APIC ID encodes CPU topology information. + * + * In the worst case, we'll need less than one extra bit for the + * Core ID, and less than one extra bit for the Package (Die) ID, + * so ratio of 4 should be enough. + */ +#define KVM_VCPU_ID_RATIO 4 +#define KVM_MAX_VCPU_ID (KVM_MAX_VCPUS * KVM_VCPU_ID_RATIO) + /* memory slots that are not exposed to userspace */ #define KVM_PRIVATE_MEM_SLOTS 3 @@ -124,13 +136,6 @@ #define KVM_HPAGE_MASK(x) (~(KVM_HPAGE_SIZE(x) - 1)) #define KVM_PAGES_PER_HPAGE(x) (KVM_HPAGE_SIZE(x) / PAGE_SIZE) -static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) -{ - /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ - return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - - (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); -} - #define KVM_PERMILLE_MMU_PAGES 20 #define KVM_MIN_ALLOC_MMU_PAGES 64UL #define KVM_MMU_HASH_SHIFT 12 @@ -229,7 +234,8 @@ enum x86_intercept_stage; KVM_GUESTDBG_USE_HW_BP | \ KVM_GUESTDBG_USE_SW_BP | \ KVM_GUESTDBG_INJECT_BP | \ - KVM_GUESTDBG_INJECT_DB) + KVM_GUESTDBG_INJECT_DB | \ + KVM_GUESTDBG_BLOCKIRQ) #define PFERR_PRESENT_BIT 0 @@ -447,6 +453,7 @@ struct kvm_mmu { u64 *pae_root; u64 *pml4_root; + u64 *pml5_root; /* * check zero bits on shadow page table entries, these @@ -482,6 +489,7 @@ struct kvm_pmc { * ctrl value for fixed counters. */ u64 current_config; + bool is_paused; }; struct kvm_pmu { @@ -522,7 +530,6 @@ struct kvm_pmu_ops; enum { KVM_DEBUGREG_BP_ENABLED = 1, KVM_DEBUGREG_WONT_EXIT = 2, - KVM_DEBUGREG_RELOAD = 4, }; struct kvm_mtrr_range { @@ -723,7 +730,6 @@ struct kvm_vcpu_arch { u64 reserved_gpa_bits; int maxphyaddr; - int max_tdp_level; /* emulate context */ @@ -988,6 +994,12 @@ struct kvm_hv { /* How many vCPUs have VP index != vCPU index */ atomic_t num_mismatched_vp_indexes; + /* + * How many SynICs use 'AutoEOI' feature + * (protected by arch.apicv_update_lock) + */ + unsigned int synic_auto_eoi_used; + struct hv_partition_assist_pg *hv_pa_pg; struct kvm_hv_syndbg hv_syndbg; }; @@ -1002,9 +1014,8 @@ struct msr_bitmap_range { /* Xen emulation context */ struct kvm_xen { bool long_mode; - bool shinfo_set; u8 upcall_vector; - struct gfn_to_hva_cache shinfo_cache; + gfn_t shinfo_gfn; }; enum kvm_irqchip_mode { @@ -1061,6 +1072,9 @@ struct kvm_arch { struct kvm_apic_map __rcu *apic_map; atomic_t apic_map_dirty; + /* Protects apic_access_memslot_enabled and apicv_inhibit_reasons */ + struct mutex apicv_update_lock; + bool apic_access_memslot_enabled; unsigned long apicv_inhibit_reasons; @@ -1213,9 +1227,17 @@ struct kvm_vm_stat { u64 mmu_recycled; u64 mmu_cache_miss; u64 mmu_unsync; - u64 lpages; + union { + struct { + atomic64_t pages_4k; + atomic64_t pages_2m; + atomic64_t pages_1g; + }; + atomic64_t pages[KVM_NR_PAGE_SIZES]; + }; u64 nx_lpage_splits; u64 max_mmu_page_hash_collisions; + u64 max_mmu_rmap_size; }; struct kvm_vcpu_stat { @@ -1359,7 +1381,6 @@ struct kvm_x86_ops { void (*enable_irq_window)(struct kvm_vcpu *vcpu); void (*update_cr8_intercept)(struct kvm_vcpu *vcpu, int tpr, int irr); bool (*check_apicv_inhibit_reasons)(ulong bit); - void (*pre_update_apicv_exec_ctrl)(struct kvm *kvm, bool activate); void (*refresh_apicv_exec_ctrl)(struct kvm_vcpu *vcpu); void (*hwapic_irr_update)(struct kvm_vcpu *vcpu, int max_irr); void (*hwapic_isr_update)(struct kvm_vcpu *vcpu, int isr); @@ -1543,12 +1564,12 @@ void kvm_mmu_uninit_vm(struct kvm *kvm); void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu); void kvm_mmu_reset_context(struct kvm_vcpu *vcpu); void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level); void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, const struct kvm_memory_slot *memslot); void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot); + const struct kvm_memory_slot *memslot); void kvm_mmu_zap_all(struct kvm *kvm); void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen); unsigned long kvm_mmu_calculate_default_mmu_pages(struct kvm *kvm); @@ -1744,6 +1765,9 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu); void kvm_request_apicv_update(struct kvm *kvm, bool activate, unsigned long bit); +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, + unsigned long bit); + int kvm_emulate_hypercall(struct kvm_vcpu *vcpu); int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, @@ -1754,8 +1778,8 @@ void kvm_mmu_invalidate_gva(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid); void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd); -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level); +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level); static inline u16 kvm_read_ldt(void) { @@ -1779,11 +1803,6 @@ static inline unsigned long read_msr(unsigned long msr) } #endif -static inline u32 get_rdx_init_val(void) -{ - return 0x600; /* P6 family */ -} - static inline void kvm_inject_gp(struct kvm_vcpu *vcpu, u32 error_code) { kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); @@ -1816,31 +1835,6 @@ enum { #define kvm_arch_vcpu_memslots_id(vcpu) ((vcpu)->arch.hflags & HF_SMM_MASK ? 1 : 0) #define kvm_memslots_for_spte_role(kvm, role) __kvm_memslots(kvm, (role).smm) -asmlinkage void kvm_spurious_fault(void); - -/* - * Hardware virtualization extension instructions may fault if a - * reboot turns off virtualization while processes are running. - * Usually after catching the fault we just panic; during reboot - * instead the instruction is ignored. - */ -#define __kvm_handle_fault_on_reboot(insn) \ - "666: \n\t" \ - insn "\n\t" \ - "jmp 668f \n\t" \ - "667: \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_begin \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t" \ - "call kvm_spurious_fault \n\t" \ - "1: \n\t" \ - ".pushsection .discard.instr_end \n\t" \ - ".long 1b - . \n\t" \ - ".popsection \n\t" \ - "668: \n\t" \ - _ASM_EXTABLE(666b, 667b) - #define KVM_ARCH_WANT_MMU_NOTIFIER int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v); diff --git a/arch/x86/include/uapi/asm/kvm.h b/arch/x86/include/uapi/asm/kvm.h index a6c327f8ad9e..2ef1f6513c68 100644 --- a/arch/x86/include/uapi/asm/kvm.h +++ b/arch/x86/include/uapi/asm/kvm.h @@ -295,6 +295,7 @@ struct kvm_debug_exit_arch { #define KVM_GUESTDBG_USE_HW_BP 0x00020000 #define KVM_GUESTDBG_INJECT_DB 0x00040000 #define KVM_GUESTDBG_INJECT_BP 0x00080000 +#define KVM_GUESTDBG_BLOCKIRQ 0x00100000 /* for KVM_SET_GUEST_DEBUG */ struct kvm_guest_debug_arch { diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index a26643dc6bd6..b656456c3a94 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -884,10 +884,11 @@ static void kvm_wait(u8 *ptr, u8 val) } else { local_irq_disable(); + /* safe_halt() will enable IRQ */ if (READ_ONCE(*ptr) == val) safe_halt(); - - local_irq_enable(); + else + local_irq_enable(); } } diff --git a/arch/x86/kvm/debugfs.c b/arch/x86/kvm/debugfs.c index 95a98413dc32..54a83a744538 100644 --- a/arch/x86/kvm/debugfs.c +++ b/arch/x86/kvm/debugfs.c @@ -7,6 +7,8 @@ #include #include #include "lapic.h" +#include "mmu.h" +#include "mmu/mmu_internal.h" static int vcpu_get_timer_advance_ns(void *data, u64 *val) { @@ -73,3 +75,112 @@ void kvm_arch_create_vcpu_debugfs(struct kvm_vcpu *vcpu, struct dentry *debugfs_ &vcpu_tsc_scaling_frac_fops); } } + +/* + * This covers statistics <1024 (11=log(1024)+1), which should be enough to + * cover RMAP_RECYCLE_THRESHOLD. + */ +#define RMAP_LOG_SIZE 11 + +static const char *kvm_lpage_str[KVM_NR_PAGE_SIZES] = { "4K", "2M", "1G" }; + +static int kvm_mmu_rmaps_stat_show(struct seq_file *m, void *v) +{ + struct kvm_rmap_head *rmap; + struct kvm *kvm = m->private; + struct kvm_memory_slot *slot; + struct kvm_memslots *slots; + unsigned int lpage_size, index; + /* Still small enough to be on the stack */ + unsigned int *log[KVM_NR_PAGE_SIZES], *cur; + int i, j, k, l, ret; + + ret = -ENOMEM; + memset(log, 0, sizeof(log)); + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + log[i] = kcalloc(RMAP_LOG_SIZE, sizeof(unsigned int), GFP_KERNEL); + if (!log[i]) + goto out; + } + + mutex_lock(&kvm->slots_lock); + write_lock(&kvm->mmu_lock); + + for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { + slots = __kvm_memslots(kvm, i); + for (j = 0; j < slots->used_slots; j++) { + slot = &slots->memslots[j]; + for (k = 0; k < KVM_NR_PAGE_SIZES; k++) { + rmap = slot->arch.rmap[k]; + lpage_size = kvm_mmu_slot_lpages(slot, k + 1); + cur = log[k]; + for (l = 0; l < lpage_size; l++) { + index = ffs(pte_list_count(&rmap[l])); + if (WARN_ON_ONCE(index >= RMAP_LOG_SIZE)) + index = RMAP_LOG_SIZE - 1; + cur[index]++; + } + } + } + } + + write_unlock(&kvm->mmu_lock); + mutex_unlock(&kvm->slots_lock); + + /* index=0 counts no rmap; index=1 counts 1 rmap */ + seq_printf(m, "Rmap_Count:\t0\t1\t"); + for (i = 2; i < RMAP_LOG_SIZE; i++) { + j = 1 << (i - 1); + k = (1 << i) - 1; + seq_printf(m, "%d-%d\t", j, k); + } + seq_printf(m, "\n"); + + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) { + seq_printf(m, "Level=%s:\t", kvm_lpage_str[i]); + cur = log[i]; + for (j = 0; j < RMAP_LOG_SIZE; j++) + seq_printf(m, "%d\t", cur[j]); + seq_printf(m, "\n"); + } + + ret = 0; +out: + for (i = 0; i < KVM_NR_PAGE_SIZES; i++) + kfree(log[i]); + + return ret; +} + +static int kvm_mmu_rmaps_stat_open(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + if (!kvm_get_kvm_safe(kvm)) + return -ENOENT; + + return single_open(file, kvm_mmu_rmaps_stat_show, kvm); +} + +static int kvm_mmu_rmaps_stat_release(struct inode *inode, struct file *file) +{ + struct kvm *kvm = inode->i_private; + + kvm_put_kvm(kvm); + + return single_release(inode, file); +} + +static const struct file_operations mmu_rmaps_stat_fops = { + .open = kvm_mmu_rmaps_stat_open, + .read = seq_read, + .llseek = seq_lseek, + .release = kvm_mmu_rmaps_stat_release, +}; + +int kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + debugfs_create_file("mmu_rmaps_stat", 0644, kvm->debugfs_dentry, kvm, + &mmu_rmaps_stat_fops); + return 0; +} diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 41d2a53c5dea..232a86a6faaf 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -88,6 +88,10 @@ static bool synic_has_vector_auto_eoi(struct kvm_vcpu_hv_synic *synic, static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, int vector) { + struct kvm_vcpu *vcpu = hv_synic_to_vcpu(synic); + struct kvm_hv *hv = to_kvm_hv(vcpu->kvm); + int auto_eoi_old, auto_eoi_new; + if (vector < HV_SYNIC_FIRST_VALID_VECTOR) return; @@ -96,10 +100,30 @@ static void synic_update_vector(struct kvm_vcpu_hv_synic *synic, else __clear_bit(vector, synic->vec_bitmap); + auto_eoi_old = bitmap_weight(synic->auto_eoi_bitmap, 256); + if (synic_has_vector_auto_eoi(synic, vector)) __set_bit(vector, synic->auto_eoi_bitmap); else __clear_bit(vector, synic->auto_eoi_bitmap); + + auto_eoi_new = bitmap_weight(synic->auto_eoi_bitmap, 256); + + if (!!auto_eoi_old == !!auto_eoi_new) + return; + + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + if (auto_eoi_new) + hv->synic_auto_eoi_used++; + else + hv->synic_auto_eoi_used--; + + __kvm_request_apicv_update(vcpu->kvm, + !hv->synic_auto_eoi_used, + APICV_INHIBIT_REASON_HYPERV); + + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } static int synic_set_sint(struct kvm_vcpu_hv_synic *synic, int sint, @@ -933,12 +957,6 @@ int kvm_hv_activate_synic(struct kvm_vcpu *vcpu, bool dont_zero_synic_pages) synic = to_hv_synic(vcpu); - /* - * Hyper-V SynIC auto EOI SINT's are - * not compatible with APICV, so request - * to deactivate APICV permanently. - */ - kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_HYPERV); synic->active = true; synic->dont_zero_synic_pages = dont_zero_synic_pages; synic->control = HV_SYNIC_CONTROL_ENABLE; @@ -2476,6 +2494,8 @@ int kvm_get_hv_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid2 *cpuid, ent->eax |= HV_X64_ENLIGHTENED_VMCS_RECOMMENDED; if (!cpu_smt_possible()) ent->eax |= HV_X64_NO_NONARCH_CORESHARING; + + ent->eax |= HV_DEPRECATING_AEOI_RECOMMENDED; /* * Default number of spinlock retry attempts, matches * HyperV 2016. diff --git a/arch/x86/kvm/i8254.c b/arch/x86/kvm/i8254.c index a6e218c6140d..5a69cce4d72d 100644 --- a/arch/x86/kvm/i8254.c +++ b/arch/x86/kvm/i8254.c @@ -220,7 +220,8 @@ void __kvm_migrate_pit_timer(struct kvm_vcpu *vcpu) struct kvm_pit *pit = vcpu->kvm->arch.vpit; struct hrtimer *timer; - if (!kvm_vcpu_is_bsp(vcpu) || !pit) + /* Somewhat arbitrarily make vcpu0 the owner of the PIT. */ + if (vcpu->vcpu_id || !pit) return; timer = &pit->pit_state.timer; diff --git a/arch/x86/kvm/ioapic.h b/arch/x86/kvm/ioapic.h index 11e4065e1617..bbd4a5d18b5d 100644 --- a/arch/x86/kvm/ioapic.h +++ b/arch/x86/kvm/ioapic.h @@ -35,11 +35,7 @@ struct kvm_vcpu; #define IOAPIC_INIT 0x5 #define IOAPIC_EXTINT 0x7 -#ifdef CONFIG_X86 #define RTC_GSI 8 -#else -#define RTC_GSI -1U -#endif struct dest_map { /* vcpu bitmap where IRQ has been sent */ diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index ba5a27879f1d..76fb00921203 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -192,6 +192,9 @@ void kvm_recalculate_apic_map(struct kvm *kvm) if (atomic_read_acquire(&kvm->arch.apic_map_dirty) == CLEAN) return; + WARN_ONCE(!irqchip_in_kernel(kvm), + "Dirty APIC map without an in-kernel local APIC"); + mutex_lock(&kvm->arch.apic_map_lock); /* * Read kvm->arch.apic_map_dirty before kvm->arch.apic_map @@ -2265,9 +2268,6 @@ void kvm_lapic_set_base(struct kvm_vcpu *vcpu, u64 value) u64 old_value = vcpu->arch.apic_base; struct kvm_lapic *apic = vcpu->arch.apic; - if (!apic) - value |= MSR_IA32_APICBASE_BSP; - vcpu->arch.apic_base = value; if ((old_value ^ value) & MSR_IA32_APICBASE_ENABLE) @@ -2323,6 +2323,13 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) struct kvm_lapic *apic = vcpu->arch.apic; int i; + if (!init_event) { + vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | + MSR_IA32_APICBASE_ENABLE; + if (kvm_vcpu_is_reset_bsp(vcpu)) + vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; + } + if (!apic) return; @@ -2330,8 +2337,8 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) hrtimer_cancel(&apic->lapic_timer.timer); if (!init_event) { - kvm_lapic_set_base(vcpu, APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE); + apic->base_address = APIC_DEFAULT_PHYS_BASE; + kvm_apic_set_xapic_id(apic, vcpu->vcpu_id); } kvm_apic_set_version(apic->vcpu); @@ -2364,9 +2371,7 @@ void kvm_lapic_reset(struct kvm_vcpu *vcpu, bool init_event) apic->highest_isr_cache = -1; update_divide_count(apic); atomic_set(&apic->lapic_timer.pending, 0); - if (kvm_vcpu_is_bsp(vcpu)) - kvm_lapic_set_base(vcpu, - vcpu->arch.apic_base | MSR_IA32_APICBASE_BSP); + vcpu->arch.pv_eoi.msr_val = 0; apic_update_ppr(apic); if (vcpu->arch.apicv_active) { @@ -2476,11 +2481,6 @@ int kvm_create_lapic(struct kvm_vcpu *vcpu, int timer_advance_ns) lapic_timer_advance_dynamic = false; } - /* - * APIC is created enabled. This will prevent kvm_lapic_set_base from - * thinking that APIC state has changed. - */ - vcpu->arch.apic_base = MSR_IA32_APICBASE_ENABLE; static_branch_inc(&apic_sw_disabled.key); /* sw disabled at reset */ kvm_iodevice_init(&apic->dev, &apic_mmio_ops); diff --git a/arch/x86/kvm/mmu.h b/arch/x86/kvm/mmu.h index 83e6c6965f1e..e9688a9f7b57 100644 --- a/arch/x86/kvm/mmu.h +++ b/arch/x86/kvm/mmu.h @@ -240,4 +240,29 @@ static inline bool kvm_memslots_have_rmaps(struct kvm *kvm) return smp_load_acquire(&kvm->arch.memslots_have_rmaps); } +static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t base_gfn, int level) +{ + /* KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K) must be 0. */ + return (gfn >> KVM_HPAGE_GFN_SHIFT(level)) - + (base_gfn >> KVM_HPAGE_GFN_SHIFT(level)); +} + +static inline unsigned long +__kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, unsigned long npages, + int level) +{ + return gfn_to_index(slot->base_gfn + npages - 1, + slot->base_gfn, level) + 1; +} + +static inline unsigned long +kvm_mmu_slot_lpages(struct kvm_memory_slot *slot, int level) +{ + return __kvm_mmu_slot_lpages(slot, slot->npages, level); +} + +static inline void kvm_update_page_stats(struct kvm *kvm, int level, int count) +{ + atomic64_add(count, &kvm->stat.pages[level - 1]); +} #endif diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 47b765270239..2d7e61122af8 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -97,6 +97,7 @@ module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, 0644); bool tdp_enabled = false; static int max_huge_page_level __read_mostly; +static int tdp_root_level __read_mostly; static int max_tdp_level __read_mostly; enum { @@ -137,12 +138,22 @@ module_param(dbg, bool, 0644); #include -/* make pte_list_desc fit well in cache line */ -#define PTE_LIST_EXT 3 +/* make pte_list_desc fit well in cache lines */ +#define PTE_LIST_EXT 14 +/* + * Slight optimization of cacheline layout, by putting `more' and `spte_count' + * at the start; then accessing it will only use one single cacheline for + * either full (entries==PTE_LIST_EXT) case or entries<=6. + */ struct pte_list_desc { - u64 *sptes[PTE_LIST_EXT]; struct pte_list_desc *more; + /* + * Stores number of entries stored in the pte_list_desc. No need to be + * u64 but just for easier alignment. When PTE_LIST_EXT, means full. + */ + u64 spte_count; + u64 *sptes[PTE_LIST_EXT]; }; struct kvm_shadow_walk_iterator { @@ -193,7 +204,7 @@ struct kvm_mmu_role_regs { * the single source of truth for the MMU's state. */ #define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \ -static inline bool ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ +static inline bool __maybe_unused ____is_##reg##_##name(struct kvm_mmu_role_regs *regs)\ { \ return !!(regs->reg & flag); \ } @@ -215,7 +226,7 @@ BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA); * and the vCPU may be incorrect/irrelevant. */ #define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \ -static inline bool is_##reg##_##name(struct kvm_mmu *mmu) \ +static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \ { \ return !!(mmu->mmu_role. base_or_ext . reg##_##name); \ } @@ -323,12 +334,6 @@ static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte) static gpa_t translate_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u32 access, struct x86_exception *exception) { - /* Check if guest physical address doesn't exceed guest maximum */ - if (kvm_vcpu_is_illegal_gpa(vcpu, gpa)) { - exception->error_code |= PFERR_RSVD_MASK; - return UNMAPPED_GVA; - } - return gpa; } @@ -592,12 +597,13 @@ static bool mmu_spte_update(u64 *sptep, u64 new_spte) * Rules for using mmu_spte_clear_track_bits: * It sets the sptep from present to nonpresent, and track the * state bits, it is used to clear the last level sptep. - * Returns non-zero if the PTE was previously valid. + * Returns the old PTE. */ -static int mmu_spte_clear_track_bits(u64 *sptep) +static int mmu_spte_clear_track_bits(struct kvm *kvm, u64 *sptep) { kvm_pfn_t pfn; u64 old_spte = *sptep; + int level = sptep_to_sp(sptep)->role.level; if (!spte_has_volatile_bits(old_spte)) __update_clear_spte_fast(sptep, 0ull); @@ -605,7 +611,9 @@ static int mmu_spte_clear_track_bits(u64 *sptep) old_spte = __update_clear_spte_slow(sptep, 0ull); if (!is_shadow_present_pte(old_spte)) - return 0; + return old_spte; + + kvm_update_page_stats(kvm, level, -1); pfn = spte_to_pfn(old_spte); @@ -622,7 +630,7 @@ static int mmu_spte_clear_track_bits(u64 *sptep) if (is_dirty_spte(old_spte)) kvm_set_pfn_dirty(pfn); - return 1; + return old_spte; } /* @@ -686,28 +694,36 @@ static bool mmu_spte_age(u64 *sptep) static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu) { - /* - * Prevent page table teardown by making any free-er wait during - * kvm_flush_remote_tlbs() IPI to all active vcpus. - */ - local_irq_disable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_begin(); + } else { + /* + * Prevent page table teardown by making any free-er wait during + * kvm_flush_remote_tlbs() IPI to all active vcpus. + */ + local_irq_disable(); - /* - * Make sure a following spte read is not reordered ahead of the write - * to vcpu->mode. - */ - smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + /* + * Make sure a following spte read is not reordered ahead of the write + * to vcpu->mode. + */ + smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES); + } } static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu) { - /* - * Make sure the write to vcpu->mode is not reordered in front of - * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us - * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. - */ - smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); - local_irq_enable(); + if (is_tdp_mmu(vcpu->arch.mmu)) { + kvm_tdp_mmu_walk_lockless_end(); + } else { + /* + * Make sure the write to vcpu->mode is not reordered in front of + * reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us + * OUTSIDE_GUEST_MODE and proceed to free the shadow page table. + */ + smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE); + local_irq_enable(); + } } static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect) @@ -786,7 +802,7 @@ static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn, return &slot->arch.lpage_info[level - 2][idx]; } -static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, +static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot, gfn_t gfn, int count) { struct kvm_lpage_info *linfo; @@ -799,12 +815,12 @@ static void update_gfn_disallow_lpage_count(struct kvm_memory_slot *slot, } } -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, 1); } -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn) +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn) { update_gfn_disallow_lpage_count(slot, gfn, -1); } @@ -893,7 +909,7 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, struct kvm_rmap_head *rmap_head) { struct pte_list_desc *desc; - int i, count = 0; + int count = 0; if (!rmap_head->val) { rmap_printk("%p %llx 0->1\n", spte, *spte); @@ -903,24 +919,24 @@ static int pte_list_add(struct kvm_vcpu *vcpu, u64 *spte, desc = mmu_alloc_pte_list_desc(vcpu); desc->sptes[0] = (u64 *)rmap_head->val; desc->sptes[1] = spte; + desc->spte_count = 2; rmap_head->val = (unsigned long)desc | 1; ++count; } else { rmap_printk("%p %llx many->many\n", spte, *spte); desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); - while (desc->sptes[PTE_LIST_EXT-1]) { + while (desc->spte_count == PTE_LIST_EXT) { count += PTE_LIST_EXT; - if (!desc->more) { desc->more = mmu_alloc_pte_list_desc(vcpu); desc = desc->more; + desc->spte_count = 0; break; } desc = desc->more; } - for (i = 0; desc->sptes[i]; ++i) - ++count; - desc->sptes[i] = spte; + count += desc->spte_count; + desc->sptes[desc->spte_count++] = spte; } return count; } @@ -930,13 +946,12 @@ pte_list_desc_remove_entry(struct kvm_rmap_head *rmap_head, struct pte_list_desc *desc, int i, struct pte_list_desc *prev_desc) { - int j; + int j = desc->spte_count - 1; - for (j = PTE_LIST_EXT - 1; !desc->sptes[j] && j > i; --j) - ; desc->sptes[i] = desc->sptes[j]; desc->sptes[j] = NULL; - if (j != 0) + desc->spte_count--; + if (desc->spte_count) return; if (!prev_desc && !desc->more) rmap_head->val = 0; @@ -969,7 +984,7 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); prev_desc = NULL; while (desc) { - for (i = 0; i < PTE_LIST_EXT && desc->sptes[i]; ++i) { + for (i = 0; i < desc->spte_count; ++i) { if (desc->sptes[i] == spte) { pte_list_desc_remove_entry(rmap_head, desc, i, prev_desc); @@ -984,14 +999,63 @@ static void __pte_list_remove(u64 *spte, struct kvm_rmap_head *rmap_head) } } -static void pte_list_remove(struct kvm_rmap_head *rmap_head, u64 *sptep) +static void pte_list_remove(struct kvm *kvm, struct kvm_rmap_head *rmap_head, + u64 *sptep) { - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); __pte_list_remove(sptep, rmap_head); } -static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, - struct kvm_memory_slot *slot) +/* Return true if rmap existed, false otherwise */ +static bool pte_list_destroy(struct kvm *kvm, struct kvm_rmap_head *rmap_head) +{ + struct pte_list_desc *desc, *next; + int i; + + if (!rmap_head->val) + return false; + + if (!(rmap_head->val & 1)) { + mmu_spte_clear_track_bits(kvm, (u64 *)rmap_head->val); + goto out; + } + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + for (; desc; desc = next) { + for (i = 0; i < desc->spte_count; i++) + mmu_spte_clear_track_bits(kvm, desc->sptes[i]); + next = desc->more; + mmu_free_pte_list_desc(desc); + } +out: + /* rmap_head is meaningless now, remember to reset it */ + rmap_head->val = 0; + return true; +} + +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head) +{ + struct pte_list_desc *desc; + unsigned int count = 0; + + if (!rmap_head->val) + return 0; + else if (!(rmap_head->val & 1)) + return 1; + + desc = (struct pte_list_desc *)(rmap_head->val & ~1ul); + + while (desc) { + count += desc->spte_count; + desc = desc->more; + } + + return count; +} + +static struct kvm_rmap_head *gfn_to_rmap(gfn_t gfn, int level, + const struct kvm_memory_slot *slot) { unsigned long idx; @@ -999,17 +1063,6 @@ static struct kvm_rmap_head *__gfn_to_rmap(gfn_t gfn, int level, return &slot->arch.rmap[level - PG_LEVEL_4K][idx]; } -static struct kvm_rmap_head *gfn_to_rmap(struct kvm *kvm, gfn_t gfn, - struct kvm_mmu_page *sp) -{ - struct kvm_memslots *slots; - struct kvm_memory_slot *slot; - - slots = kvm_memslots_for_spte_role(kvm, sp->role); - slot = __gfn_to_memslot(slots, gfn); - return __gfn_to_rmap(gfn, sp->role.level, slot); -} - static bool rmap_can_add(struct kvm_vcpu *vcpu) { struct kvm_mmu_memory_cache *mc; @@ -1020,24 +1073,39 @@ static bool rmap_can_add(struct kvm_vcpu *vcpu) static int rmap_add(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); kvm_mmu_page_set_gfn(sp, spte - sp->spt, gfn); - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); return pte_list_add(vcpu, spte, rmap_head); } + static void rmap_remove(struct kvm *kvm, u64 *spte) { + struct kvm_memslots *slots; + struct kvm_memory_slot *slot; struct kvm_mmu_page *sp; gfn_t gfn; struct kvm_rmap_head *rmap_head; sp = sptep_to_sp(spte); gfn = kvm_mmu_page_get_gfn(sp, spte - sp->spt); - rmap_head = gfn_to_rmap(kvm, gfn, sp); + + /* + * Unlike rmap_add and rmap_recycle, rmap_remove does not run in the + * context of a vCPU so have to determine which memslots to use based + * on context information in sp->role. + */ + slots = kvm_memslots_for_spte_role(kvm, sp->role); + + slot = __gfn_to_memslot(slots, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); + __pte_list_remove(spte, rmap_head); } @@ -1119,7 +1187,9 @@ out: static void drop_spte(struct kvm *kvm, u64 *sptep) { - if (mmu_spte_clear_track_bits(sptep)) + u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep); + + if (is_shadow_present_pte(old_spte)) rmap_remove(kvm, sptep); } @@ -1129,7 +1199,6 @@ static bool __drop_large_spte(struct kvm *kvm, u64 *sptep) if (is_large_pte(*sptep)) { WARN_ON(sptep_to_sp(sptep)->role.level == PG_LEVEL_4K); drop_spte(kvm, sptep); - --kvm->stat.lpages; return true; } @@ -1218,7 +1287,7 @@ static bool spte_wrprot_for_clear_dirty(u64 *sptep) * Returns true iff any D or W bits were cleared. */ static bool __rmap_clear_dirty(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -1256,8 +1325,8 @@ static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_write_protect(kvm, rmap_head, false); /* clear the first set bit */ @@ -1289,8 +1358,8 @@ static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm, return; while (mask) { - rmap_head = __gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), - PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(slot->base_gfn + gfn_offset + __ffs(mask), + PG_LEVEL_4K, slot); __rmap_clear_dirty(kvm, rmap_head, slot); /* clear the first set bit */ @@ -1356,7 +1425,7 @@ bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, if (kvm_memslots_have_rmaps(kvm)) { for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) { - rmap_head = __gfn_to_rmap(gfn, i, slot); + rmap_head = gfn_to_rmap(gfn, i, slot); write_protected |= __rmap_write_protect(kvm, rmap_head, true); } } @@ -1377,20 +1446,9 @@ static bool rmap_write_protect(struct kvm_vcpu *vcpu, u64 gfn) } static bool kvm_zap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { - u64 *sptep; - struct rmap_iterator iter; - bool flush = false; - - while ((sptep = rmap_get_first(rmap_head, &iter))) { - rmap_printk("spte %p %llx.\n", sptep, *sptep); - - pte_list_remove(rmap_head, sptep); - flush = true; - } - - return flush; + return pte_list_destroy(kvm, rmap_head); } static bool kvm_unmap_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, @@ -1421,13 +1479,13 @@ restart: need_flush = 1; if (pte_write(pte)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); goto restart; } else { new_spte = kvm_mmu_changed_pte_notifier_make_spte( *sptep, new_pfn); - mmu_spte_clear_track_bits(sptep); + mmu_spte_clear_track_bits(kvm, sptep); mmu_spte_set(sptep, new_spte); } } @@ -1442,7 +1500,7 @@ restart: struct slot_rmap_walk_iterator { /* input fields. */ - struct kvm_memory_slot *slot; + const struct kvm_memory_slot *slot; gfn_t start_gfn; gfn_t end_gfn; int start_level; @@ -1462,14 +1520,13 @@ rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator, int level) { iterator->level = level; iterator->gfn = iterator->start_gfn; - iterator->rmap = __gfn_to_rmap(iterator->gfn, level, iterator->slot); - iterator->end_rmap = __gfn_to_rmap(iterator->end_gfn, level, - iterator->slot); + iterator->rmap = gfn_to_rmap(iterator->gfn, level, iterator->slot); + iterator->end_rmap = gfn_to_rmap(iterator->end_gfn, level, iterator->slot); } static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator, - struct kvm_memory_slot *slot, int start_level, + const struct kvm_memory_slot *slot, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn) { iterator->slot = slot; @@ -1584,12 +1641,13 @@ static bool kvm_test_age_rmapp(struct kvm *kvm, struct kvm_rmap_head *rmap_head, static void rmap_recycle(struct kvm_vcpu *vcpu, u64 *spte, gfn_t gfn) { + struct kvm_memory_slot *slot; struct kvm_rmap_head *rmap_head; struct kvm_mmu_page *sp; sp = sptep_to_sp(spte); - - rmap_head = gfn_to_rmap(vcpu->kvm, gfn, sp); + slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); + rmap_head = gfn_to_rmap(gfn, sp->role.level, slot); kvm_unmap_rmapp(vcpu->kvm, rmap_head, NULL, gfn, sp->role.level, __pte(0)); kvm_flush_remote_tlbs_with_address(vcpu->kvm, sp->gfn, @@ -2232,8 +2290,6 @@ static int mmu_page_zap_pte(struct kvm *kvm, struct kvm_mmu_page *sp, if (is_shadow_present_pte(pte)) { if (is_last_spte(pte, sp->role.level)) { drop_spte(kvm, spte); - if (is_large_pte(pte)) - --kvm->stat.lpages; } else { child = to_shadow_page(pte & PT64_BASE_ADDR_MASK); drop_parent_pte(child, spte); @@ -2716,15 +2772,12 @@ static int mmu_set_spte(struct kvm_vcpu *vcpu, u64 *sptep, pgprintk("%s: setting spte %llx\n", __func__, *sptep); trace_kvm_mmu_set_spte(level, gfn, sptep); - if (!was_rmapped && is_large_pte(*sptep)) - ++vcpu->kvm->stat.lpages; - if (is_shadow_present_pte(*sptep)) { - if (!was_rmapped) { - rmap_count = rmap_add(vcpu, sptep, gfn); - if (rmap_count > RMAP_RECYCLE_THRESHOLD) - rmap_recycle(vcpu, sptep, gfn); - } + if (!was_rmapped) { + kvm_update_page_stats(vcpu->kvm, level, 1); + rmap_count = rmap_add(vcpu, sptep, gfn); + if (rmap_count > RMAP_RECYCLE_THRESHOLD) + rmap_recycle(vcpu, sptep, gfn); } return ret; @@ -2852,6 +2905,7 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, kvm_pfn_t pfn, int max_level) { struct kvm_lpage_info *linfo; + int host_level; max_level = min(max_level, max_huge_page_level); for ( ; max_level > PG_LEVEL_4K; max_level--) { @@ -2863,7 +2917,8 @@ int kvm_mmu_max_mapping_level(struct kvm *kvm, if (max_level == PG_LEVEL_4K) return PG_LEVEL_4K; - return host_pfn_mapping_level(kvm, gfn, pfn, slot); + host_level = host_pfn_mapping_level(kvm, gfn, pfn, slot); + return min(host_level, max_level); } int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, @@ -2887,17 +2942,12 @@ int kvm_mmu_hugepage_adjust(struct kvm_vcpu *vcpu, gfn_t gfn, if (!slot) return PG_LEVEL_4K; - level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); - if (level == PG_LEVEL_4K) - return level; - - *req_level = level = min(level, max_level); - /* * Enforce the iTLB multihit workaround after capturing the requested * level, which will be used to do precise, accurate accounting. */ - if (huge_page_disallowed) + *req_level = level = kvm_mmu_max_mapping_level(vcpu->kvm, slot, gfn, pfn, max_level); + if (level == PG_LEVEL_4K || huge_page_disallowed) return PG_LEVEL_4K; /* @@ -2965,15 +3015,16 @@ static int __direct_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, break; drop_large_spte(vcpu, it.sptep); - if (!is_shadow_present_pte(*it.sptep)) { - sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, - it.level - 1, true, ACC_ALL); + if (is_shadow_present_pte(*it.sptep)) + continue; - link_shadow_page(vcpu, it.sptep, sp); - if (is_tdp && huge_page_disallowed && - req_level >= it.level) - account_huge_nx_page(vcpu->kvm, sp); - } + sp = kvm_mmu_get_page(vcpu, base_gfn, it.addr, + it.level - 1, true, ACC_ALL); + + link_shadow_page(vcpu, it.sptep, sp); + if (is_tdp && huge_page_disallowed && + req_level >= it.level) + account_huge_nx_page(vcpu->kvm, sp); } ret = mmu_set_spte(vcpu, it.sptep, ACC_ALL, @@ -3122,15 +3173,40 @@ static bool is_access_allowed(u32 fault_err_code, u64 spte) } /* - * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between walk_shadow_page_lockless_{begin,end}. + * - The returned sptep must not be used after walk_shadow_page_lockless_end. */ -static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, - u32 error_code) +static u64 *fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, gpa_t gpa, u64 *spte) { struct kvm_shadow_walk_iterator iterator; + u64 old_spte; + u64 *sptep = NULL; + + for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) { + sptep = iterator.sptep; + *spte = old_spte; + + if (!is_shadow_present_pte(old_spte)) + break; + } + + return sptep; +} + +/* + * Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS. + */ +static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code) +{ struct kvm_mmu_page *sp; int ret = RET_PF_INVALID; u64 spte = 0ull; + u64 *sptep = NULL; uint retry_count = 0; if (!page_fault_can_be_fast(error_code)) @@ -3141,14 +3217,15 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, do { u64 new_spte; - for_each_shadow_entry_lockless(vcpu, cr2_or_gpa, iterator, spte) - if (!is_shadow_present_pte(spte)) - break; + if (is_tdp_mmu(vcpu->arch.mmu)) + sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, gpa, &spte); + else + sptep = fast_pf_get_last_sptep(vcpu, gpa, &spte); if (!is_shadow_present_pte(spte)) break; - sp = sptep_to_sp(iterator.sptep); + sp = sptep_to_sp(sptep); if (!is_last_spte(spte, sp->role.level)) break; @@ -3206,8 +3283,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, * since the gfn is not stable for indirect shadow page. See * Documentation/virt/kvm/locking.rst to get more detail. */ - if (fast_pf_fix_direct_spte(vcpu, sp, iterator.sptep, spte, - new_spte)) { + if (fast_pf_fix_direct_spte(vcpu, sp, sptep, spte, new_spte)) { ret = RET_PF_FIXED; break; } @@ -3220,8 +3296,7 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, } while (true); - trace_fast_page_fault(vcpu, cr2_or_gpa, error_code, iterator.sptep, - spte, ret); + trace_fast_page_fault(vcpu, gpa, error_code, sptep, spte, ret); walk_shadow_page_lockless_end(vcpu); return ret; @@ -3455,15 +3530,22 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) * the shadow page table may be a PAE or a long mode page table. */ pm_mask = PT_PRESENT_MASK | shadow_me_mask; - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) { + if (mmu->shadow_root_level >= PT64_ROOT_4LEVEL) { pm_mask |= PT_ACCESSED_MASK | PT_WRITABLE_MASK | PT_USER_MASK; if (WARN_ON_ONCE(!mmu->pml4_root)) { r = -EIO; goto out_unlock; } - mmu->pml4_root[0] = __pa(mmu->pae_root) | pm_mask; + + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) { + if (WARN_ON_ONCE(!mmu->pml5_root)) { + r = -EIO; + goto out_unlock; + } + mmu->pml5_root[0] = __pa(mmu->pml4_root) | pm_mask; + } } for (i = 0; i < 4; ++i) { @@ -3482,7 +3564,9 @@ static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu) mmu->pae_root[i] = root | pm_mask; } - if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) + if (mmu->shadow_root_level == PT64_ROOT_5LEVEL) + mmu->root_hpa = __pa(mmu->pml5_root); + else if (mmu->shadow_root_level == PT64_ROOT_4LEVEL) mmu->root_hpa = __pa(mmu->pml4_root); else mmu->root_hpa = __pa(mmu->pae_root); @@ -3498,7 +3582,10 @@ out_unlock: static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) { struct kvm_mmu *mmu = vcpu->arch.mmu; - u64 *pml4_root, *pae_root; + bool need_pml5 = mmu->shadow_root_level > PT64_ROOT_4LEVEL; + u64 *pml5_root = NULL; + u64 *pml4_root = NULL; + u64 *pae_root; /* * When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP @@ -3511,20 +3598,21 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) return 0; /* - * This mess only works with 4-level paging and needs to be updated to - * work with 5-level paging. + * NPT, the only paging mode that uses this horror, uses a fixed number + * of levels for the shadow page tables, e.g. all MMUs are 4-level or + * all MMus are 5-level. Thus, this can safely require that pml5_root + * is allocated if the other roots are valid and pml5 is needed, as any + * prior MMU would also have required pml5. */ - if (WARN_ON_ONCE(mmu->shadow_root_level != PT64_ROOT_4LEVEL)) - return -EIO; - - if (mmu->pae_root && mmu->pml4_root) + if (mmu->pae_root && mmu->pml4_root && (!need_pml5 || mmu->pml5_root)) return 0; /* * The special roots should always be allocated in concert. Yell and * bail if KVM ends up in a state where only one of the roots is valid. */ - if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root)) + if (WARN_ON_ONCE(!tdp_enabled || mmu->pae_root || mmu->pml4_root || + (need_pml5 && mmu->pml5_root))) return -EIO; /* @@ -3535,16 +3623,31 @@ static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu) if (!pae_root) return -ENOMEM; +#ifdef CONFIG_X86_64 pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); - if (!pml4_root) { - free_page((unsigned long)pae_root); - return -ENOMEM; + if (!pml4_root) + goto err_pml4; + + if (need_pml5) { + pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT); + if (!pml5_root) + goto err_pml5; } +#endif mmu->pae_root = pae_root; mmu->pml4_root = pml4_root; + mmu->pml5_root = pml5_root; return 0; + +#ifdef CONFIG_X86_64 +err_pml5: + free_page((unsigned long)pml4_root); +err_pml4: + free_page((unsigned long)pae_root); + return -ENOMEM; +#endif } void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu) @@ -3640,6 +3743,8 @@ static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct) /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between walk_shadow_page_lockless_{begin,end}. */ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) { @@ -3647,8 +3752,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level int leaf = -1; u64 spte; - walk_shadow_page_lockless_begin(vcpu); - for (shadow_walk_init(&iterator, vcpu, addr), *root_level = iterator.level; shadow_walk_okay(&iterator); @@ -3662,8 +3765,6 @@ static int get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level break; } - walk_shadow_page_lockless_end(vcpu); - return leaf; } @@ -3675,11 +3776,15 @@ static bool get_mmio_spte(struct kvm_vcpu *vcpu, u64 addr, u64 *sptep) int root, leaf, level; bool reserved = false; + walk_shadow_page_lockless_begin(vcpu); + if (is_tdp_mmu(vcpu->arch.mmu)) leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, &root); else leaf = get_walk(vcpu, addr, sptes, &root); + walk_shadow_page_lockless_end(vcpu); + if (unlikely(leaf < 0)) { *sptep = 0ull; return reserved; @@ -3795,9 +3900,9 @@ static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, kvm_vcpu_gfn_to_hva(vcpu, gfn), &arch); } -static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, +static bool kvm_faultin_pfn(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, gpa_t cr2_or_gpa, kvm_pfn_t *pfn, hva_t *hva, - bool write, bool *writable) + bool write, bool *writable, int *r) { struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn); bool async; @@ -3808,13 +3913,26 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, * be zapped before KVM inserts a new MMIO SPTE for the gfn. */ if (slot && (slot->flags & KVM_MEMSLOT_INVALID)) - return true; + goto out_retry; - /* Don't expose private memslots to L2. */ - if (is_guest_mode(vcpu) && !kvm_is_visible_memslot(slot)) { - *pfn = KVM_PFN_NOSLOT; - *writable = false; - return false; + if (!kvm_is_visible_memslot(slot)) { + /* Don't expose private memslots to L2. */ + if (is_guest_mode(vcpu)) { + *pfn = KVM_PFN_NOSLOT; + *writable = false; + return false; + } + /* + * If the APIC access page exists but is disabled, go directly + * to emulation without caching the MMIO access or creating a + * MMIO SPTE. That way the cache doesn't need to be purged + * when the AVIC is re-enabled. + */ + if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT && + !kvm_apicv_activated(vcpu->kvm)) { + *r = RET_PF_EMULATE; + return true; + } } async = false; @@ -3828,14 +3946,17 @@ static bool try_async_pf(struct kvm_vcpu *vcpu, bool prefault, gfn_t gfn, if (kvm_find_async_pf_gfn(vcpu, gfn)) { trace_kvm_async_pf_doublefault(cr2_or_gpa, gfn); kvm_make_request(KVM_REQ_APF_HALT, vcpu); - return true; + goto out_retry; } else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa, gfn)) - return true; + goto out_retry; } *pfn = __gfn_to_pfn_memslot(slot, gfn, false, NULL, write, writable, hva); - return false; + +out_retry: + *r = RET_PF_RETRY; + return true; } static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, @@ -3854,11 +3975,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, if (page_fault_handle_page_track(vcpu, error_code, gfn)) return RET_PF_EMULATE; - if (!is_tdp_mmu_fault) { - r = fast_page_fault(vcpu, gpa, error_code); - if (r != RET_PF_INVALID) - return r; - } + r = fast_page_fault(vcpu, gpa, error_code); + if (r != RET_PF_INVALID) + return r; r = mmu_topup_memory_caches(vcpu, false); if (r) @@ -3867,9 +3986,9 @@ static int direct_page_fault(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, gfn, gpa, &pfn, &hva, - write, &map_writable)) - return RET_PF_RETRY; + if (kvm_faultin_pfn(vcpu, prefault, gfn, gpa, &pfn, &hva, + write, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, is_tdp ? 0 : gpa, gfn, pfn, ACC_ALL, &r)) return r; @@ -4588,6 +4707,10 @@ static union kvm_mmu_role kvm_calc_mmu_role_common(struct kvm_vcpu *vcpu, static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu) { + /* tdp_root_level is architecture forced level, use it if nonzero */ + if (tdp_root_level) + return tdp_root_level; + /* Use 5-level TDP if and only if it's useful/necessary. */ if (max_tdp_level == 5 && cpuid_maxphyaddr(vcpu) <= 48) return 4; @@ -5160,7 +5283,7 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code, if (r == RET_PF_INVALID) { r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa, lower_32_bits(error_code), false); - if (WARN_ON_ONCE(r == RET_PF_INVALID)) + if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm)) return -EIO; } @@ -5279,10 +5402,11 @@ void kvm_mmu_invpcid_gva(struct kvm_vcpu *vcpu, gva_t gva, unsigned long pcid) */ } -void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, - int tdp_huge_page_level) +void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level, + int tdp_max_root_level, int tdp_huge_page_level) { tdp_enabled = enable_tdp; + tdp_root_level = tdp_forced_root_level; max_tdp_level = tdp_max_root_level; /* @@ -5302,12 +5426,13 @@ void kvm_configure_mmu(bool enable_tdp, int tdp_max_root_level, EXPORT_SYMBOL_GPL(kvm_configure_mmu); /* The return value indicates if tlb flush on all vcpus is needed. */ -typedef bool (*slot_level_handler) (struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot); +typedef bool (*slot_level_handler) (struct kvm *kvm, + struct kvm_rmap_head *rmap_head, + const struct kvm_memory_slot *slot); /* The caller should hold mmu-lock before calling this function. */ static __always_inline bool -slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level_range(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, gfn_t start_gfn, gfn_t end_gfn, bool flush_on_yield, bool flush) @@ -5334,7 +5459,7 @@ slot_handle_level_range(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_level(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, int start_level, int end_level, bool flush_on_yield) { @@ -5345,7 +5470,7 @@ slot_handle_level(struct kvm *kvm, struct kvm_memory_slot *memslot, } static __always_inline bool -slot_handle_leaf(struct kvm *kvm, struct kvm_memory_slot *memslot, +slot_handle_leaf(struct kvm *kvm, const struct kvm_memory_slot *memslot, slot_level_handler fn, bool flush_on_yield) { return slot_handle_level(kvm, memslot, fn, PG_LEVEL_4K, @@ -5358,6 +5483,7 @@ static void free_mmu_pages(struct kvm_mmu *mmu) set_memory_encrypted((unsigned long)mmu->pae_root, 1); free_page((unsigned long)mmu->pae_root); free_page((unsigned long)mmu->pml4_root); + free_page((unsigned long)mmu->pml5_root); } static int __kvm_mmu_create(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu) @@ -5587,6 +5713,10 @@ void kvm_mmu_uninit_vm(struct kvm *kvm) kvm_mmu_uninit_tdp_mmu(kvm); } +/* + * Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end + * (not including it) + */ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) { struct kvm_memslots *slots; @@ -5594,8 +5724,11 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) int i; bool flush = false; + write_lock(&kvm->mmu_lock); + + kvm_inc_notifier_count(kvm, gfn_start, gfn_end); + if (kvm_memslots_have_rmaps(kvm)) { - write_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(memslot, slots) { @@ -5606,41 +5739,44 @@ void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end) if (start >= end) continue; - flush = slot_handle_level_range(kvm, memslot, + flush = slot_handle_level_range(kvm, + (const struct kvm_memory_slot *) memslot, kvm_zap_rmapp, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL, start, end - 1, true, flush); } } if (flush) - kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); - write_unlock(&kvm->mmu_lock); + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, + gfn_end - gfn_start); } if (is_tdp_mmu_enabled(kvm)) { - flush = false; - - read_lock(&kvm->mmu_lock); for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, gfn_start, - gfn_end, flush, true); + gfn_end, flush); if (flush) kvm_flush_remote_tlbs_with_address(kvm, gfn_start, - gfn_end); - - read_unlock(&kvm->mmu_lock); + gfn_end - gfn_start); } + + if (flush) + kvm_flush_remote_tlbs_with_address(kvm, gfn_start, gfn_end); + + kvm_dec_notifier_count(kvm, gfn_start, gfn_end); + + write_unlock(&kvm->mmu_lock); } static bool slot_rmap_write_protect(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { return __rmap_write_protect(kvm, rmap_head, false); } void kvm_mmu_slot_remove_write_access(struct kvm *kvm, - struct kvm_memory_slot *memslot, + const struct kvm_memory_slot *memslot, int start_level) { bool flush = false; @@ -5676,7 +5812,7 @@ void kvm_mmu_slot_remove_write_access(struct kvm *kvm, static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm, struct kvm_rmap_head *rmap_head, - struct kvm_memory_slot *slot) + const struct kvm_memory_slot *slot) { u64 *sptep; struct rmap_iterator iter; @@ -5699,7 +5835,7 @@ restart: if (sp->role.direct && !kvm_is_reserved_pfn(pfn) && sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, sp->gfn, pfn, PG_LEVEL_NUM)) { - pte_list_remove(rmap_head, sptep); + pte_list_remove(kvm, rmap_head, sptep); if (kvm_available_flush_tlb_with_range()) kvm_flush_remote_tlbs_with_address(kvm, sp->gfn, @@ -5715,10 +5851,8 @@ restart: } void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm, - const struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *slot) { - /* FIXME: const-ify all uses of struct kvm_memory_slot. */ - struct kvm_memory_slot *slot = (struct kvm_memory_slot *)memslot; bool flush = false; if (kvm_memslots_have_rmaps(kvm)) { @@ -5754,7 +5888,7 @@ void kvm_arch_flush_remote_tlbs_memslot(struct kvm *kvm, } void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm, - struct kvm_memory_slot *memslot) + const struct kvm_memory_slot *memslot) { bool flush = false; diff --git a/arch/x86/kvm/mmu/mmu_audit.c b/arch/x86/kvm/mmu/mmu_audit.c index cedc17b2f60e..9e7dcf999f08 100644 --- a/arch/x86/kvm/mmu/mmu_audit.c +++ b/arch/x86/kvm/mmu/mmu_audit.c @@ -147,7 +147,7 @@ static void inspect_spte_has_rmap(struct kvm *kvm, u64 *sptep) return; } - rmap_head = __gfn_to_rmap(gfn, rev_sp->role.level, slot); + rmap_head = gfn_to_rmap(gfn, rev_sp->role.level, slot); if (!rmap_head->val) { if (!__ratelimit(&ratelimit_state)) return; @@ -200,7 +200,7 @@ static void audit_write_protection(struct kvm *kvm, struct kvm_mmu_page *sp) slots = kvm_memslots_for_spte_role(kvm, sp->role); slot = __gfn_to_memslot(slots, sp->gfn); - rmap_head = __gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); + rmap_head = gfn_to_rmap(sp->gfn, PG_LEVEL_4K, slot); for_each_rmap_spte(rmap_head, &iter, sptep) { if (is_writable_pte(*sptep)) diff --git a/arch/x86/kvm/mmu/mmu_internal.h b/arch/x86/kvm/mmu/mmu_internal.h index 35567293c1fd..bf2bdbf333c2 100644 --- a/arch/x86/kvm/mmu/mmu_internal.h +++ b/arch/x86/kvm/mmu/mmu_internal.h @@ -31,13 +31,16 @@ extern bool dbg; #define IS_VALID_PAE_ROOT(x) (!!(x)) struct kvm_mmu_page { + /* + * Note, "link" through "spt" fit in a single 64 byte cache line on + * 64-bit kernels, keep it that way unless there's a reason not to. + */ struct list_head link; struct hlist_node hash_link; - struct list_head lpage_disallowed_link; + bool tdp_mmu_page; bool unsync; u8 mmu_valid_gen; - bool mmio_cached; bool lpage_disallowed; /* Can't be replaced by an equiv large page */ /* @@ -59,6 +62,7 @@ struct kvm_mmu_page { struct kvm_rmap_head parent_ptes; /* rmap pointers to parent sptes */ DECLARE_BITMAP(unsync_child_bitmap, 512); + struct list_head lpage_disallowed_link; #ifdef CONFIG_X86_32 /* * Used out of the mmu-lock to avoid reading spte values while an @@ -71,8 +75,6 @@ struct kvm_mmu_page { atomic_t write_flooding_count; #ifdef CONFIG_X86_64 - bool tdp_mmu_page; - /* Used for freeing the page asynchronously if it is a TDP MMU page. */ struct rcu_head rcu_head; #endif @@ -124,13 +126,14 @@ static inline bool is_nx_huge_page_enabled(void) int mmu_try_to_unsync_pages(struct kvm_vcpu *vcpu, gfn_t gfn, bool can_unsync); -void kvm_mmu_gfn_disallow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); -void kvm_mmu_gfn_allow_lpage(struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); +void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn); bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm, struct kvm_memory_slot *slot, u64 gfn, int min_level); void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, u64 start_gfn, u64 pages); +unsigned int pte_list_count(struct kvm_rmap_head *rmap_head); /* * Return values of handle_mmio_page_fault, mmu.page_fault, and fast_page_fault(). @@ -140,6 +143,9 @@ void kvm_flush_remote_tlbs_with_address(struct kvm *kvm, * RET_PF_INVALID: the spte is invalid, let the real page fault path update it. * RET_PF_FIXED: The faulting entry has been fixed. * RET_PF_SPURIOUS: The faulting entry was already fixed, e.g. by another vCPU. + * + * Any names added to this enum should be exported to userspace for use in + * tracepoints via TRACE_DEFINE_ENUM() in mmutrace.h */ enum { RET_PF_RETRY = 0, diff --git a/arch/x86/kvm/mmu/mmutrace.h b/arch/x86/kvm/mmu/mmutrace.h index efbad33a0645..2924a4081a19 100644 --- a/arch/x86/kvm/mmu/mmutrace.h +++ b/arch/x86/kvm/mmu/mmutrace.h @@ -54,6 +54,12 @@ { PFERR_RSVD_MASK, "RSVD" }, \ { PFERR_FETCH_MASK, "F" } +TRACE_DEFINE_ENUM(RET_PF_RETRY); +TRACE_DEFINE_ENUM(RET_PF_EMULATE); +TRACE_DEFINE_ENUM(RET_PF_INVALID); +TRACE_DEFINE_ENUM(RET_PF_FIXED); +TRACE_DEFINE_ENUM(RET_PF_SPURIOUS); + /* * A pagetable walk has started */ diff --git a/arch/x86/kvm/mmu/page_track.c b/arch/x86/kvm/mmu/page_track.c index 91a9f7e0fd91..269f11f92fd0 100644 --- a/arch/x86/kvm/mmu/page_track.c +++ b/arch/x86/kvm/mmu/page_track.c @@ -16,6 +16,7 @@ #include +#include "mmu.h" #include "mmu_internal.h" void kvm_page_track_free_memslot(struct kvm_memory_slot *slot) diff --git a/arch/x86/kvm/mmu/paging_tmpl.h b/arch/x86/kvm/mmu/paging_tmpl.h index ee044d357b5f..7d03e9b7ccfa 100644 --- a/arch/x86/kvm/mmu/paging_tmpl.h +++ b/arch/x86/kvm/mmu/paging_tmpl.h @@ -881,9 +881,9 @@ static int FNAME(page_fault)(struct kvm_vcpu *vcpu, gpa_t addr, u32 error_code, mmu_seq = vcpu->kvm->mmu_notifier_seq; smp_rmb(); - if (try_async_pf(vcpu, prefault, walker.gfn, addr, &pfn, &hva, - write_fault, &map_writable)) - return RET_PF_RETRY; + if (kvm_faultin_pfn(vcpu, prefault, walker.gfn, addr, &pfn, &hva, + write_fault, &map_writable, &r)) + return r; if (handle_abnormal_pfn(vcpu, addr, walker.gfn, pfn, walker.pte_access, &r)) return r; diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index d80cb122b5f3..64ccfc1fa553 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -10,7 +10,7 @@ #include #include -static bool __read_mostly tdp_mmu_enabled = false; +static bool __read_mostly tdp_mmu_enabled = true; module_param_named(tdp_mmu, tdp_mmu_enabled, bool, 0644); /* Initializes the TDP MMU for the VM, if enabled. */ @@ -255,26 +255,17 @@ static void handle_changed_spte_dirty_log(struct kvm *kvm, int as_id, gfn_t gfn, * * @kvm: kvm instance * @sp: the new page - * @shared: This operation may not be running under the exclusive use of - * the MMU lock and the operation must synchronize with other - * threads that might be adding or removing pages. * @account_nx: This page replaces a NX large page and should be marked for * eventual reclaim. */ static void tdp_mmu_link_page(struct kvm *kvm, struct kvm_mmu_page *sp, - bool shared, bool account_nx) + bool account_nx) { - if (shared) - spin_lock(&kvm->arch.tdp_mmu_pages_lock); - else - lockdep_assert_held_write(&kvm->mmu_lock); - + spin_lock(&kvm->arch.tdp_mmu_pages_lock); list_add(&sp->link, &kvm->arch.tdp_mmu_pages); if (account_nx) account_huge_nx_page(kvm, sp); - - if (shared) - spin_unlock(&kvm->arch.tdp_mmu_pages_lock); + spin_unlock(&kvm->arch.tdp_mmu_pages_lock); } /** @@ -445,13 +436,6 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, trace_kvm_tdp_mmu_spte_changed(as_id, gfn, level, old_spte, new_spte); - if (is_large_pte(old_spte) != is_large_pte(new_spte)) { - if (is_large_pte(old_spte)) - atomic64_sub(1, (atomic64_t*)&kvm->stat.lpages); - else - atomic64_add(1, (atomic64_t*)&kvm->stat.lpages); - } - /* * The only times a SPTE should be changed from a non-present to * non-present state is when an MMIO entry is installed/modified/ @@ -477,6 +461,8 @@ static void __handle_changed_spte(struct kvm *kvm, int as_id, gfn_t gfn, return; } + if (is_leaf != was_leaf) + kvm_update_page_stats(kvm, level, is_leaf ? 1 : -1); if (was_leaf && is_dirty_spte(old_spte) && (!is_present || !is_dirty_spte(new_spte) || pfn_changed)) @@ -526,6 +512,10 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, if (is_removed_spte(iter->old_spte)) return false; + /* + * Note, fast_pf_fix_direct_spte() can also modify TDP MMU SPTEs and + * does not hold the mmu_lock. + */ if (cmpxchg64(rcu_dereference(iter->sptep), iter->old_spte, new_spte) != iter->old_spte) return false; @@ -537,15 +527,40 @@ static inline bool tdp_mmu_set_spte_atomic_no_dirty_log(struct kvm *kvm, return true; } -static inline bool tdp_mmu_set_spte_atomic(struct kvm *kvm, - struct tdp_iter *iter, - u64 new_spte) +/* + * tdp_mmu_map_set_spte_atomic - Set a leaf TDP MMU SPTE atomically to resolve a + * TDP page fault. + * + * @vcpu: The vcpu instance that took the TDP page fault. + * @iter: a tdp_iter instance currently on the SPTE that should be set + * @new_spte: The value the SPTE should be set to + * + * Returns: true if the SPTE was set, false if it was not. If false is returned, + * this function will have no side-effects. + */ +static inline bool tdp_mmu_map_set_spte_atomic(struct kvm_vcpu *vcpu, + struct tdp_iter *iter, + u64 new_spte) { + struct kvm *kvm = vcpu->kvm; + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, new_spte)) return false; - handle_changed_spte_dirty_log(kvm, iter->as_id, iter->gfn, - iter->old_spte, new_spte, iter->level); + /* + * Use kvm_vcpu_gfn_to_memslot() instead of going through + * handle_changed_spte_dirty_log() to leverage vcpu->last_used_slot. + */ + if (is_writable_pte(new_spte)) { + struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, iter->gfn); + + if (slot && kvm_slot_dirty_track_enabled(slot)) { + /* Enforced by kvm_mmu_hugepage_adjust. */ + WARN_ON_ONCE(iter->level > PG_LEVEL_4K); + mark_page_dirty_in_slot(kvm, slot, iter->gfn); + } + } + return true; } @@ -558,7 +573,7 @@ static inline bool tdp_mmu_zap_spte_atomic(struct kvm *kvm, * immediately installing a present entry in its place * before the TLBs are flushed. */ - if (!tdp_mmu_set_spte_atomic(kvm, iter, REMOVED_SPTE)) + if (!tdp_mmu_set_spte_atomic_no_dirty_log(kvm, iter, REMOVED_SPTE)) return false; kvm_flush_remote_tlbs_with_address(kvm, iter->gfn, @@ -789,21 +804,15 @@ retry: * non-root pages mapping GFNs strictly within that range. Returns true if * SPTEs have been cleared and a TLB flush is needed before releasing the * MMU lock. - * - * If shared is true, this thread holds the MMU lock in read mode and must - * account for the possibility that other threads are modifying the paging - * structures concurrently. If shared is false, this thread should hold the - * MMU in write mode. */ bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared) + gfn_t end, bool can_yield, bool flush) { struct kvm_mmu_page *root; - for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, shared) + for_each_tdp_mmu_root_yield_safe(kvm, root, as_id, false) flush = zap_gfn_range(kvm, root, start, end, can_yield, flush, - shared); + false); return flush; } @@ -814,8 +823,7 @@ void kvm_tdp_mmu_zap_all(struct kvm *kvm) int i; for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) - flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, - flush, false); + flush = kvm_tdp_mmu_zap_gfn_range(kvm, i, 0, -1ull, flush); if (flush) kvm_flush_remote_tlbs(kvm); @@ -940,7 +948,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, int write, if (new_spte == iter->old_spte) ret = RET_PF_SPURIOUS; - else if (!tdp_mmu_set_spte_atomic(vcpu->kvm, iter, new_spte)) + else if (!tdp_mmu_map_set_spte_atomic(vcpu, iter, new_spte)) return RET_PF_RETRY; /* @@ -1044,9 +1052,8 @@ int kvm_tdp_mmu_map(struct kvm_vcpu *vcpu, gpa_t gpa, u32 error_code, new_spte = make_nonleaf_spte(child_pt, !shadow_accessed_mask); - if (tdp_mmu_set_spte_atomic(vcpu->kvm, &iter, - new_spte)) { - tdp_mmu_link_page(vcpu->kvm, sp, true, + if (tdp_mmu_set_spte_atomic_no_dirty_log(vcpu->kvm, &iter, new_spte)) { + tdp_mmu_link_page(vcpu->kvm, sp, huge_page_disallowed && req_level >= iter.level); @@ -1255,8 +1262,8 @@ retry: * only affect leaf SPTEs down to min_level. * Returns true if an SPTE has been changed and the TLBs need to be flushed. */ -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level) +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1326,7 +1333,8 @@ retry: * each SPTE. Returns true if an SPTE has been changed and the TLBs need to * be flushed. */ -bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, struct kvm_memory_slot *slot) +bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot) { struct kvm_mmu_page *root; bool spte_set = false; @@ -1529,6 +1537,8 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, /* * Return the level of the lowest level SPTE added to sptes. * That SPTE may be non-present. + * + * Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. */ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level) @@ -1540,14 +1550,47 @@ int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, *root_level = vcpu->arch.mmu->shadow_root_level; - rcu_read_lock(); - tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { leaf = iter.level; sptes[leaf] = iter.old_spte; } - rcu_read_unlock(); - return leaf; } + +/* + * Returns the last level spte pointer of the shadow page walk for the given + * gpa, and sets *spte to the spte value. This spte may be non-preset. If no + * walk could be performed, returns NULL and *spte does not contain valid data. + * + * Contract: + * - Must be called between kvm_tdp_mmu_walk_lockless_{begin,end}. + * - The returned sptep must not be used after kvm_tdp_mmu_walk_lockless_end. + * + * WARNING: This function is only intended to be called during fast_page_fault. + */ +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte) +{ + struct tdp_iter iter; + struct kvm_mmu *mmu = vcpu->arch.mmu; + gfn_t gfn = addr >> PAGE_SHIFT; + tdp_ptep_t sptep = NULL; + + tdp_mmu_for_each_pte(iter, mmu, gfn, gfn + 1) { + *spte = iter.old_spte; + sptep = iter.sptep; + } + + /* + * Perform the rcu_dereference to get the raw spte pointer value since + * we are passing it up to fast_page_fault, which is shared with the + * legacy MMU and thus does not retain the TDP MMU-specific __rcu + * annotation. + * + * This is safe since fast_page_fault obeys the contracts of this + * function as well as all TDP MMU contracts around modifying SPTEs + * outside of mmu_lock. + */ + return rcu_dereference(sptep); +} diff --git a/arch/x86/kvm/mmu/tdp_mmu.h b/arch/x86/kvm/mmu/tdp_mmu.h index 1cae4485b3bc..358f447d4012 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.h +++ b/arch/x86/kvm/mmu/tdp_mmu.h @@ -20,14 +20,11 @@ void kvm_tdp_mmu_put_root(struct kvm *kvm, struct kvm_mmu_page *root, bool shared); bool __kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, gfn_t start, - gfn_t end, bool can_yield, bool flush, - bool shared); + gfn_t end, bool can_yield, bool flush); static inline bool kvm_tdp_mmu_zap_gfn_range(struct kvm *kvm, int as_id, - gfn_t start, gfn_t end, bool flush, - bool shared) + gfn_t start, gfn_t end, bool flush) { - return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush, - shared); + return __kvm_tdp_mmu_zap_gfn_range(kvm, as_id, start, end, true, flush); } static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) { @@ -44,7 +41,7 @@ static inline bool kvm_tdp_mmu_zap_sp(struct kvm *kvm, struct kvm_mmu_page *sp) */ lockdep_assert_held_write(&kvm->mmu_lock); return __kvm_tdp_mmu_zap_gfn_range(kvm, kvm_mmu_page_as_id(sp), - sp->gfn, end, false, false, false); + sp->gfn, end, false, false); } void kvm_tdp_mmu_zap_all(struct kvm *kvm); @@ -61,10 +58,10 @@ bool kvm_tdp_mmu_age_gfn_range(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_test_age_gfn(struct kvm *kvm, struct kvm_gfn_range *range); bool kvm_tdp_mmu_set_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range); -bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, struct kvm_memory_slot *slot, - int min_level); +bool kvm_tdp_mmu_wrprot_slot(struct kvm *kvm, + const struct kvm_memory_slot *slot, int min_level); bool kvm_tdp_mmu_clear_dirty_slot(struct kvm *kvm, - struct kvm_memory_slot *slot); + const struct kvm_memory_slot *slot); void kvm_tdp_mmu_clear_dirty_pt_masked(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, unsigned long mask, @@ -77,8 +74,20 @@ bool kvm_tdp_mmu_write_protect_gfn(struct kvm *kvm, struct kvm_memory_slot *slot, gfn_t gfn, int min_level); +static inline void kvm_tdp_mmu_walk_lockless_begin(void) +{ + rcu_read_lock(); +} + +static inline void kvm_tdp_mmu_walk_lockless_end(void) +{ + rcu_read_unlock(); +} + int kvm_tdp_mmu_get_walk(struct kvm_vcpu *vcpu, u64 addr, u64 *sptes, int *root_level); +u64 *kvm_tdp_mmu_fast_pf_get_last_sptep(struct kvm_vcpu *vcpu, u64 addr, + u64 *spte); #ifdef CONFIG_X86_64 bool kvm_mmu_init_tdp_mmu(struct kvm *kvm); diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 827886c12c16..0772bad9165c 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -137,18 +137,20 @@ static void pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, pmc->perf_event = event; pmc_to_pmu(pmc)->event_count++; clear_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi); + pmc->is_paused = false; } static void pmc_pause_counter(struct kvm_pmc *pmc) { u64 counter = pmc->counter; - if (!pmc->perf_event) + if (!pmc->perf_event || pmc->is_paused) return; /* update counter, reset event value to avoid redundant accumulation */ counter += perf_event_pause(pmc->perf_event, true); pmc->counter = counter & pmc_bitmask(pmc); + pmc->is_paused = true; } static bool pmc_resume_counter(struct kvm_pmc *pmc) @@ -163,6 +165,7 @@ static bool pmc_resume_counter(struct kvm_pmc *pmc) /* reuse perf_event to serve as pmc_reprogram_counter() does*/ perf_event_enable(pmc->perf_event); + pmc->is_paused = false; clear_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->reprogram_pmi); return true; diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 67e753edfa22..0e4f2b1fa9fb 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -55,7 +55,7 @@ static inline u64 pmc_read_counter(struct kvm_pmc *pmc) u64 counter, enabled, running; counter = pmc->counter; - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) counter += perf_event_read_value(pmc->perf_event, &enabled, &running); /* FIXME: Scaling needed? */ diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index a8ad78a2faa1..8052d92069e0 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -197,6 +197,8 @@ void avic_init_vmcb(struct vcpu_svm *svm) vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID_COUNT; + vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; + if (kvm_apicv_activated(svm->vcpu.kvm)) vmcb->control.int_ctl |= AVIC_ENABLE_MASK; else @@ -225,31 +227,26 @@ static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, * field of the VMCB. Therefore, we set up the * APIC_ACCESS_PAGE_PRIVATE_MEMSLOT (4KB) here. */ -static int avic_update_access_page(struct kvm *kvm, bool activate) +static int avic_alloc_access_page(struct kvm *kvm) { void __user *ret; int r = 0; mutex_lock(&kvm->slots_lock); - /* - * During kvm_destroy_vm(), kvm_pit_set_reinject() could trigger - * APICv mode change, which update APIC_ACCESS_PAGE_PRIVATE_MEMSLOT - * memory region. So, we need to ensure that kvm->mm == current->mm. - */ - if ((kvm->arch.apic_access_memslot_enabled == activate) || - (kvm->mm != current->mm)) + + if (kvm->arch.apic_access_memslot_enabled) goto out; ret = __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, APIC_DEFAULT_PHYS_BASE, - activate ? PAGE_SIZE : 0); + PAGE_SIZE); if (IS_ERR(ret)) { r = PTR_ERR(ret); goto out; } - kvm->arch.apic_access_memslot_enabled = activate; + kvm->arch.apic_access_memslot_enabled = true; out: mutex_unlock(&kvm->slots_lock); return r; @@ -270,7 +267,7 @@ static int avic_init_backing_page(struct kvm_vcpu *vcpu) if (kvm_apicv_activated(vcpu->kvm)) { int ret; - ret = avic_update_access_page(vcpu->kvm, true); + ret = avic_alloc_access_page(vcpu->kvm); if (ret) return ret; } @@ -587,17 +584,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu) avic_handle_ldr_update(vcpu); } -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate) -{ - if (!enable_apicv || !lapic_in_kernel(vcpu)) - return; - - srcu_read_unlock(&vcpu->kvm->srcu, vcpu->srcu_idx); - kvm_request_apicv_update(vcpu->kvm, activate, - APICV_INHIBIT_REASON_IRQWIN); - vcpu->srcu_idx = srcu_read_lock(&vcpu->kvm->srcu); -} - void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu) { return; @@ -667,6 +653,11 @@ void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) } vmcb_mark_dirty(vmcb, VMCB_AVIC); + if (activated) + avic_vcpu_load(vcpu, vcpu->cpu); + else + avic_vcpu_put(vcpu); + svm_set_pi_irte_mode(vcpu, activated); } @@ -918,10 +909,6 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) return supported & BIT(bit); } -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate) -{ - avic_update_access_page(kvm, activate); -} static inline int avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) @@ -960,9 +947,6 @@ void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) int h_physical_id = kvm_cpu_get_apicid(cpu); struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - /* * Since the host physical APIC id is 8 bits, * we can support host APIC ID upto 255. @@ -990,9 +974,6 @@ void avic_vcpu_put(struct kvm_vcpu *vcpu) u64 entry; struct vcpu_svm *svm = to_svm(vcpu); - if (!kvm_vcpu_apicv_active(vcpu)) - return; - entry = READ_ONCE(*(svm->avic_physical_id_cache)); if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) avic_update_iommu_vcpu_affinity(vcpu, -1, 0); @@ -1009,6 +990,10 @@ static void avic_set_running(struct kvm_vcpu *vcpu, bool is_run) struct vcpu_svm *svm = to_svm(vcpu); svm->avic_is_running = is_run; + + if (!kvm_vcpu_apicv_active(vcpu)) + return; + if (is_run) avic_vcpu_load(vcpu, vcpu->cpu); else diff --git a/arch/x86/kvm/svm/nested.c b/arch/x86/kvm/svm/nested.c index e5515477c30a..2545d0c61985 100644 --- a/arch/x86/kvm/svm/nested.c +++ b/arch/x86/kvm/svm/nested.c @@ -666,11 +666,6 @@ int nested_svm_vmrun(struct kvm_vcpu *vcpu) goto out; } - - /* Clear internal status */ - kvm_clear_exception_queue(vcpu); - kvm_clear_interrupt_queue(vcpu); - /* * Since vmcb01 is not in use, we can use it to store some of the L1 * state. diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 7fbce342eec4..75e0b21ad07c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -28,8 +28,6 @@ #include "cpuid.h" #include "trace.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - #ifndef CONFIG_KVM_AMD_SEV /* * When this config is not defined, SEV feature is not supported and APIs in @@ -584,6 +582,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) save->xcr0 = svm->vcpu.arch.xcr0; save->pkru = svm->vcpu.arch.pkru; save->xss = svm->vcpu.arch.ia32_xss; + save->dr6 = svm->vcpu.arch.dr6; /* * SEV-ES will use a VMSA that is pointed to by the VMCB, not diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index 69639f9624f5..05e8d4d27969 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -46,8 +46,6 @@ #include "kvm_onhyperv.h" #include "svm_onhyperv.h" -#define __ex(x) __kvm_handle_fault_on_reboot(x) - MODULE_AUTHOR("Qumranet"); MODULE_LICENSE("GPL"); @@ -261,7 +259,7 @@ u32 svm_msrpm_offset(u32 msr) static int get_max_npt_level(void) { #ifdef CONFIG_X86_64 - return PT64_ROOT_4LEVEL; + return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL; #else return PT32E_ROOT_LEVEL; #endif @@ -462,11 +460,6 @@ static int has_svm(void) return 0; } - if (pgtable_l5_enabled()) { - pr_info("KVM doesn't yet support 5-level paging on AMD SVM\n"); - return 0; - } - return 1; } @@ -1015,7 +1008,9 @@ static __init int svm_hardware_setup(void) if (!boot_cpu_has(X86_FEATURE_NPT)) npt_enabled = false; - kvm_configure_mmu(npt_enabled, get_max_npt_level(), PG_LEVEL_1G); + /* Force VM NPT level equal to the host's max NPT level */ + kvm_configure_mmu(npt_enabled, get_max_npt_level(), + get_max_npt_level(), PG_LEVEL_1G); pr_info("kvm: Nested Paging %sabled\n", npt_enabled ? "en" : "dis"); /* Note, SEV setup consumes npt_enabled. */ @@ -1161,8 +1156,6 @@ static void init_vmcb(struct kvm_vcpu *vcpu) struct vmcb_control_area *control = &svm->vmcb->control; struct vmcb_save_area *save = &svm->vmcb->save; - vcpu->arch.hflags = 0; - svm_set_intercept(svm, INTERCEPT_CR0_READ); svm_set_intercept(svm, INTERCEPT_CR3_READ); svm_set_intercept(svm, INTERCEPT_CR4_READ); @@ -1241,29 +1234,14 @@ static void init_vmcb(struct kvm_vcpu *vcpu) SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK; save->cs.limit = 0xffff; + save->gdtr.base = 0; save->gdtr.limit = 0xffff; + save->idtr.base = 0; save->idtr.limit = 0xffff; init_sys_seg(&save->ldtr, SEG_TYPE_LDT); init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16); - svm_set_cr4(vcpu, 0); - svm_set_efer(vcpu, 0); - save->dr6 = 0xffff0ff0; - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - save->rip = 0x0000fff0; - vcpu->arch.regs[VCPU_REGS_RIP] = save->rip; - - /* - * svm_set_cr0() sets PG and WP and clears NW and CD on save->cr0. - * It also updates the guest-visible cr0 value. - */ - svm_set_cr0(vcpu, X86_CR0_NW | X86_CR0_CD | X86_CR0_ET); - kvm_mmu_reset_context(vcpu); - - save->cr4 = X86_CR4_PAE; - /* rdx = ?? */ - if (npt_enabled) { /* Setup VMCB for Nested Paging */ control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE; @@ -1273,14 +1251,12 @@ static void init_vmcb(struct kvm_vcpu *vcpu) svm_clr_intercept(svm, INTERCEPT_CR3_WRITE); save->g_pat = vcpu->arch.pat; save->cr3 = 0; - save->cr4 = 0; } svm->current_vmcb->asid_generation = 0; svm->asid = 0; svm->nested.vmcb12_gpa = INVALID_GPA; svm->nested.last_vmcb12_gpa = INVALID_GPA; - vcpu->arch.hflags = 0; if (!kvm_pause_in_guest(vcpu->kvm)) { control->pause_filter_count = pause_filter_count; @@ -1330,25 +1306,11 @@ static void init_vmcb(struct kvm_vcpu *vcpu) static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_svm *svm = to_svm(vcpu); - u32 dummy; - u32 eax = 1; svm->spec_ctrl = 0; svm->virt_spec_ctrl = 0; - if (!init_event) { - vcpu->arch.apic_base = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - vcpu->arch.apic_base |= MSR_IA32_APICBASE_BSP; - } init_vmcb(vcpu); - - kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, false); - kvm_rdx_write(vcpu, eax); - - if (kvm_vcpu_apicv_active(vcpu) && !init_event) - avic_update_vapic_bar(svm, APIC_DEFAULT_PHYS_BASE); } void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb) @@ -1513,12 +1475,15 @@ static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) sd->current_vmcb = svm->vmcb; indirect_branch_prediction_barrier(); } - avic_vcpu_load(vcpu, cpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_load(vcpu, cpu); } static void svm_vcpu_put(struct kvm_vcpu *vcpu) { - avic_vcpu_put(vcpu); + if (kvm_vcpu_apicv_active(vcpu)) + avic_vcpu_put(vcpu); + svm_prepare_host_switch(vcpu); ++vcpu->stat.host_state_reload; @@ -1560,7 +1525,7 @@ static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) load_pdptrs(vcpu, vcpu->arch.walk_mmu, kvm_read_cr3(vcpu)); break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); } } @@ -2078,11 +2043,15 @@ static int shutdown_interception(struct kvm_vcpu *vcpu) return -EINVAL; /* - * VMCB is undefined after a SHUTDOWN intercept - * so reinitialize it. + * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put + * the VMCB in a known good state. Unfortuately, KVM doesn't have + * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking + * userspace. At a platform view, INIT is acceptable behavior as + * there exist bare metal platforms that automatically INIT the CPU + * in response to shutdown. */ clear_page(svm->vmcb); - init_vmcb(vcpu); + kvm_vcpu_reset(vcpu, true); kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; return 0; @@ -2993,10 +2962,6 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) svm->msr_decfg = data; break; } - case MSR_IA32_APICBASE: - if (kvm_vcpu_apicv_active(vcpu)) - avic_update_vapic_bar(to_svm(vcpu), data); - fallthrough; default: return kvm_set_msr_common(vcpu, msr); } @@ -3021,7 +2986,7 @@ static int interrupt_window_interception(struct kvm_vcpu *vcpu) * In this case AVIC was temporarily disabled for * requesting the IRQ window and we have to re-enable it. */ - svm_toggle_avic_for_irq_window(vcpu, true); + kvm_request_apicv_update(vcpu->kvm, true, APICV_INHIBIT_REASON_IRQWIN); ++vcpu->stat.irq_window_exits; return 1; @@ -3269,12 +3234,14 @@ static void dump_vmcb(struct kvm_vcpu *vcpu) "excp_to:", save->last_excp_to); } +static bool svm_check_exit_valid(struct kvm_vcpu *vcpu, u64 exit_code) +{ + return (exit_code < ARRAY_SIZE(svm_exit_handlers) && + svm_exit_handlers[exit_code]); +} + static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) { - if (exit_code < ARRAY_SIZE(svm_exit_handlers) && - svm_exit_handlers[exit_code]) - return 0; - vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n", exit_code); dump_vmcb(vcpu); vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; @@ -3282,14 +3249,13 @@ static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) vcpu->run->internal.ndata = 2; vcpu->run->internal.data[0] = exit_code; vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; - - return -EINVAL; + return 0; } int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) { - if (svm_handle_invalid_exit(vcpu, exit_code)) - return 0; + if (!svm_check_exit_valid(vcpu, exit_code)) + return svm_handle_invalid_exit(vcpu, exit_code); #ifdef CONFIG_RETPOLINE if (exit_code == SVM_EXIT_MSR) @@ -3573,7 +3539,7 @@ static void svm_enable_irq_window(struct kvm_vcpu *vcpu) * via AVIC. In such case, we need to temporarily disable AVIC, * and fallback to injecting IRQ via V_IRQ. */ - svm_toggle_avic_for_irq_window(vcpu, false); + kvm_request_apicv_update(vcpu->kvm, false, APICV_INHIBIT_REASON_IRQWIN); svm_set_vintr(svm); } } @@ -3808,6 +3774,8 @@ static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu) pre_svm_run(vcpu); + WARN_ON_ONCE(kvm_apicv_activated(vcpu->kvm) != kvm_vcpu_apicv_active(vcpu)); + sync_lapic_to_cr8(vcpu); if (unlikely(svm->asid != svm->vmcb->control.asid)) { @@ -4610,7 +4578,6 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .set_virtual_apic_mode = svm_set_virtual_apic_mode, .refresh_apicv_exec_ctrl = svm_refresh_apicv_exec_ctrl, .check_apicv_inhibit_reasons = svm_check_apicv_inhibit_reasons, - .pre_update_apicv_exec_ctrl = svm_pre_update_apicv_exec_ctrl, .load_eoi_exitmap = svm_load_eoi_exitmap, .hwapic_irr_update = svm_hwapic_irr_update, .hwapic_isr_update = svm_hwapic_isr_update, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index bd0fe94c2920..524d943f3efc 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -503,12 +503,6 @@ extern struct kvm_x86_nested_ops svm_nested_ops; #define VMCB_AVIC_APIC_BAR_MASK 0xFFFFFFFFFF000ULL -static inline void avic_update_vapic_bar(struct vcpu_svm *svm, u64 data) -{ - svm->vmcb->control.avic_vapic_bar = data & VMCB_AVIC_APIC_BAR_MASK; - vmcb_mark_dirty(svm->vmcb, VMCB_AVIC); -} - static inline bool avic_vcpu_is_running(struct kvm_vcpu *vcpu) { struct vcpu_svm *svm = to_svm(vcpu); @@ -524,7 +518,6 @@ int avic_ga_log_notifier(u32 ga_tag); void avic_vm_destroy(struct kvm *kvm); int avic_vm_init(struct kvm *kvm); void avic_init_vmcb(struct vcpu_svm *svm); -void svm_toggle_avic_for_irq_window(struct kvm_vcpu *vcpu, bool activate); int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu); int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu); int avic_init_vcpu(struct vcpu_svm *svm); @@ -534,7 +527,6 @@ void avic_post_state_restore(struct kvm_vcpu *vcpu); void svm_set_virtual_apic_mode(struct kvm_vcpu *vcpu); void svm_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu); bool svm_check_apicv_inhibit_reasons(ulong bit); -void svm_pre_update_apicv_exec_ctrl(struct kvm *kvm, bool activate); void svm_load_eoi_exitmap(struct kvm_vcpu *vcpu, u64 *eoi_exit_bitmap); void svm_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr); void svm_hwapic_isr_update(struct kvm_vcpu *vcpu, int max_isr); diff --git a/arch/x86/kvm/svm/svm_ops.h b/arch/x86/kvm/svm/svm_ops.h index 8170f2a5a16f..22e2b019de37 100644 --- a/arch/x86/kvm/svm/svm_ops.h +++ b/arch/x86/kvm/svm/svm_ops.h @@ -4,7 +4,7 @@ #include -#include +#include "x86.h" #define svm_asm(insn, clobber...) \ do { \ diff --git a/arch/x86/kvm/vmx/evmcs.c b/arch/x86/kvm/vmx/evmcs.c index 896b2a50b4aa..0dab1b7b529f 100644 --- a/arch/x86/kvm/vmx/evmcs.c +++ b/arch/x86/kvm/vmx/evmcs.c @@ -14,7 +14,6 @@ DEFINE_STATIC_KEY_FALSE(enable_evmcs); #if IS_ENABLED(CONFIG_HYPERV) -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define EVMCS1_OFFSET(x) offsetof(struct hv_enlightened_vmcs, x) #define EVMCS1_FIELD(number, name, clean_field)[ROL16(number, 6)] = \ {EVMCS1_OFFSET(name), clean_field} diff --git a/arch/x86/kvm/vmx/evmcs.h b/arch/x86/kvm/vmx/evmcs.h index 2ec9b46f0d0c..152ab0aa82cf 100644 --- a/arch/x86/kvm/vmx/evmcs.h +++ b/arch/x86/kvm/vmx/evmcs.h @@ -73,8 +73,6 @@ struct evmcs_field { extern const struct evmcs_field vmcs_field_to_evmcs_1[]; extern const unsigned int nr_evmcs_1_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static __always_inline int get_evmcs_offset(unsigned long field, u16 *clean_field) { @@ -95,8 +93,6 @@ static __always_inline int get_evmcs_offset(unsigned long field, return evmcs_field->offset; } -#undef ROL16 - static inline void evmcs_write64(unsigned long field, u64 value) { u16 clean_field; diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b3f77d18eb5a..ccb03d69546c 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -2207,7 +2207,8 @@ static void prepare_vmcs02_early_rare(struct vcpu_vmx *vmx, } } -static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) +static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct loaded_vmcs *vmcs01, + struct vmcs12 *vmcs12) { u32 exec_control; u64 guest_efer = nested_vmx_calc_efer(vmx, vmcs12); @@ -2218,23 +2219,22 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) /* * PIN CONTROLS */ - exec_control = vmx_pin_based_exec_ctrl(vmx); + exec_control = __pin_controls_get(vmcs01); exec_control |= (vmcs12->pin_based_vm_exec_control & ~PIN_BASED_VMX_PREEMPTION_TIMER); /* Posted interrupts setting is only taken from vmcs12. */ - if (nested_cpu_has_posted_intr(vmcs12)) { + vmx->nested.pi_pending = false; + if (nested_cpu_has_posted_intr(vmcs12)) vmx->nested.posted_intr_nv = vmcs12->posted_intr_nv; - vmx->nested.pi_pending = false; - } else { + else exec_control &= ~PIN_BASED_POSTED_INTR; - } pin_controls_set(vmx, exec_control); /* * EXEC CONTROLS */ - exec_control = vmx_exec_control(vmx); /* L0's desires */ + exec_control = __exec_controls_get(vmcs01); /* L0's desires */ exec_control &= ~CPU_BASED_INTR_WINDOW_EXITING; exec_control &= ~CPU_BASED_NMI_WINDOW_EXITING; exec_control &= ~CPU_BASED_TPR_SHADOW; @@ -2271,10 +2271,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * SECONDARY EXEC CONTROLS */ if (cpu_has_secondary_exec_ctrls()) { - exec_control = vmx->secondary_exec_control; + exec_control = __secondary_exec_controls_get(vmcs01); /* Take the following fields only from vmcs12 */ exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE | SECONDARY_EXEC_ENABLE_INVPCID | SECONDARY_EXEC_ENABLE_RDTSCP | SECONDARY_EXEC_XSAVES | @@ -2282,7 +2283,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY | SECONDARY_EXEC_APIC_REGISTER_VIRT | SECONDARY_EXEC_ENABLE_VMFUNC | - SECONDARY_EXEC_TSC_SCALING); + SECONDARY_EXEC_TSC_SCALING | + SECONDARY_EXEC_DESC); + if (nested_cpu_has(vmcs12, CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)) exec_control |= vmcs12->secondary_vm_exec_control; @@ -2322,8 +2325,9 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * on the related bits (if supported by the CPU) in the hope that * we can avoid VMWrites during vmx_set_efer(). */ - exec_control = (vmcs12->vm_entry_controls | vmx_vmentry_ctrl()) & - ~VM_ENTRY_IA32E_MODE & ~VM_ENTRY_LOAD_IA32_EFER; + exec_control = __vm_entry_controls_get(vmcs01); + exec_control |= vmcs12->vm_entry_controls; + exec_control &= ~(VM_ENTRY_IA32E_MODE | VM_ENTRY_LOAD_IA32_EFER); if (cpu_has_load_ia32_efer()) { if (guest_efer & EFER_LMA) exec_control |= VM_ENTRY_IA32E_MODE; @@ -2339,9 +2343,11 @@ static void prepare_vmcs02_early(struct vcpu_vmx *vmx, struct vmcs12 *vmcs12) * we should use its exit controls. Note that VM_EXIT_LOAD_IA32_EFER * bits may be modified by vmx_set_efer() in prepare_vmcs02(). */ - exec_control = vmx_vmexit_ctrl(); + exec_control = __vm_exit_controls_get(vmcs01); if (cpu_has_load_ia32_efer() && guest_efer != host_efer) exec_control |= VM_EXIT_LOAD_IA32_EFER; + else + exec_control &= ~VM_EXIT_LOAD_IA32_EFER; vm_exit_controls_set(vmx, exec_control); /* @@ -3384,7 +3390,7 @@ enum nvmx_vmentry_status nested_vmx_enter_non_root_mode(struct kvm_vcpu *vcpu, vmx_switch_vmcs(vcpu, &vmx->nested.vmcs02); - prepare_vmcs02_early(vmx, vmcs12); + prepare_vmcs02_early(vmx, &vmx->vmcs01, vmcs12); if (from_vmentry) { if (unlikely(!nested_get_vmcs12_pages(vcpu))) { @@ -4304,7 +4310,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, seg.l = 1; else seg.db = 1; - vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_CS); seg = (struct kvm_segment) { .base = 0, .limit = 0xFFFFFFFF, @@ -4315,17 +4321,17 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .g = 1 }; seg.selector = vmcs12->host_ds_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_DS); seg.selector = vmcs12->host_es_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_ES); seg.selector = vmcs12->host_ss_selector; - vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_SS); seg.selector = vmcs12->host_fs_selector; seg.base = vmcs12->host_fs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_FS); seg.selector = vmcs12->host_gs_selector; seg.base = vmcs12->host_gs_base; - vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_GS); seg = (struct kvm_segment) { .base = vmcs12->host_tr_base, .limit = 0x67, @@ -4333,14 +4339,15 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, .type = 11, .present = 1 }; - vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + __vmx_set_segment(vcpu, &seg, VCPU_SREG_TR); + + memset(&seg, 0, sizeof(seg)); + seg.unusable = 1; + __vmx_set_segment(vcpu, &seg, VCPU_SREG_LDTR); kvm_set_dr(vcpu, 7, 0x400); vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - if (nested_vmx_load_msr(vcpu, vmcs12->vm_exit_msr_load_addr, vmcs12->vm_exit_msr_load_count)) nested_vmx_abort(vcpu, VMX_ABORT_LOAD_HOST_MSR_FAIL); @@ -4419,9 +4426,6 @@ static void nested_vmx_restore_host_state(struct kvm_vcpu *vcpu) kvm_mmu_reset_context(vcpu); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); - /* * This nasty bit of open coding is a compromise between blindly * loading L1's MSRs using the exit load lists (incorrect emulation diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index 9efc1a6b8693..10cc4f65c4ef 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -437,13 +437,13 @@ static int intel_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) !(msr & MSR_PMC_FULL_WIDTH_BIT)) data = (s64)(s32)data; pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; } else if ((pmc = get_fixed_pmc(pmu, msr))) { pmc->counter += data - pmc_read_counter(pmc); - if (pmc->perf_event) + if (pmc->perf_event && !pmc->is_paused) perf_event_period(pmc->perf_event, get_sample_period(pmc, data)); return 0; diff --git a/arch/x86/kvm/vmx/vmcs.h b/arch/x86/kvm/vmx/vmcs.h index 4b9957e2bf5b..6e5de2e2b0da 100644 --- a/arch/x86/kvm/vmx/vmcs.h +++ b/arch/x86/kvm/vmx/vmcs.h @@ -11,6 +11,8 @@ #include "capabilities.h" +#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) + struct vmcs_hdr { u32 revision_id:31; u32 shadow_vmcs:1; diff --git a/arch/x86/kvm/vmx/vmcs12.c b/arch/x86/kvm/vmx/vmcs12.c index d9f5d7c56ae3..cab6ba7a5005 100644 --- a/arch/x86/kvm/vmx/vmcs12.c +++ b/arch/x86/kvm/vmx/vmcs12.c @@ -2,7 +2,6 @@ #include "vmcs12.h" -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) #define VMCS12_OFFSET(x) offsetof(struct vmcs12, x) #define FIELD(number, name) [ROL16(number, 6)] = VMCS12_OFFSET(name) #define FIELD64(number, name) \ diff --git a/arch/x86/kvm/vmx/vmcs12.h b/arch/x86/kvm/vmx/vmcs12.h index 5e0e1b39f495..2a45f026ee11 100644 --- a/arch/x86/kvm/vmx/vmcs12.h +++ b/arch/x86/kvm/vmx/vmcs12.h @@ -364,8 +364,6 @@ static inline void vmx_check_vmcs12_offsets(void) extern const unsigned short vmcs_field_to_offset_table[]; extern const unsigned int nr_vmcs12_fields; -#define ROL16(val, n) ((u16)(((u16)(val) << (n)) | ((u16)(val) >> (16 - (n))))) - static inline short vmcs_field_to_offset(unsigned long field) { unsigned short offset; @@ -385,8 +383,6 @@ static inline short vmcs_field_to_offset(unsigned long field) return offset; } -#undef ROL16 - static inline u64 vmcs12_read_any(struct vmcs12 *vmcs12, unsigned long field, u16 offset) { diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 927a552393b9..0c2c0d5ae873 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -136,8 +136,7 @@ module_param(allow_smaller_maxphyaddr, bool, S_IRUGO); #define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD) #define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE #define KVM_VM_CR0_ALWAYS_ON \ - (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | \ - X86_CR0_WP | X86_CR0_PG | X86_CR0_PE) + (KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST | X86_CR0_PG | X86_CR0_PE) #define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE #define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE) @@ -1648,11 +1647,12 @@ static void vmx_setup_uret_msr(struct vcpu_vmx *vmx, unsigned int msr, } /* - * Set up the vmcs to automatically save and restore system - * msrs. Don't touch the 64-bit msrs if the guest is in legacy - * mode, as fiddling with msrs is very expensive. + * Configuring user return MSRs to automatically save, load, and restore MSRs + * that need to be shoved into hardware when running the guest. Note, omitting + * an MSR here does _NOT_ mean it's not emulated, only that it will not be + * loaded into hardware when running the guest. */ -static void setup_msrs(struct vcpu_vmx *vmx) +static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx) { #ifdef CONFIG_X86_64 bool load_syscall_msrs; @@ -1682,9 +1682,6 @@ static void setup_msrs(struct vcpu_vmx *vmx) */ vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM)); - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(&vmx->vcpu); - /* * The set of MSRs to load may have changed, reload MSRs before the * next VM-Enter. @@ -2263,8 +2260,11 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr0 |= vmcs_readl(GUEST_CR0) & guest_owned_bits; break; case VCPU_EXREG_CR3: - if (is_unrestricted_guest(vcpu) || - (enable_ept && is_paging(vcpu))) + /* + * When intercepting CR3 loads, e.g. for shadowing paging, KVM's + * CR3 is loaded into hardware, not the guest's CR3. + */ + if (!(exec_controls_get(to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING)) vcpu->arch.cr3 = vmcs_readl(GUEST_CR3); break; case VCPU_EXREG_CR4: @@ -2274,7 +2274,7 @@ static void vmx_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) vcpu->arch.cr4 |= vmcs_readl(GUEST_CR4) & guest_owned_bits; break; default: - WARN_ON_ONCE(1); + KVM_BUG_ON(1, vcpu->kvm); break; } } @@ -2733,7 +2733,7 @@ static void fix_pmode_seg(struct kvm_vcpu *vcpu, int seg, save->dpl = save->selector & SEGMENT_RPL_MASK; save->s = 1; } - vmx_set_segment(vcpu, save, seg); + __vmx_set_segment(vcpu, save, seg); } static void enter_pmode(struct kvm_vcpu *vcpu) @@ -2754,7 +2754,7 @@ static void enter_pmode(struct kvm_vcpu *vcpu) vmx->rmode.vm86_active = 0; - vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); + __vmx_set_segment(vcpu, &vmx->rmode.segs[VCPU_SREG_TR], VCPU_SREG_TR); flags = vmcs_readl(GUEST_RFLAGS); flags &= RMODE_GUEST_OWNED_EFLAGS_BITS; @@ -2852,8 +2852,6 @@ static void enter_rmode(struct kvm_vcpu *vcpu) fix_rmode_seg(VCPU_SREG_DS, &vmx->rmode.segs[VCPU_SREG_DS]); fix_rmode_seg(VCPU_SREG_GS, &vmx->rmode.segs[VCPU_SREG_GS]); fix_rmode_seg(VCPU_SREG_FS, &vmx->rmode.segs[VCPU_SREG_FS]); - - kvm_mmu_reset_context(vcpu); } int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) @@ -2874,7 +2872,7 @@ int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer) msr->data = efer & ~EFER_LME; } - setup_msrs(vmx); + vmx_setup_uret_msrs(vmx); return 0; } @@ -2997,42 +2995,24 @@ void ept_save_pdptrs(struct kvm_vcpu *vcpu) kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR); } -static void ept_update_paging_mode_cr0(unsigned long *hw_cr0, - unsigned long cr0, - struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - - if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) - vmx_cache_reg(vcpu, VCPU_EXREG_CR3); - if (!(cr0 & X86_CR0_PG)) { - /* From paging/starting to nonpaging */ - exec_controls_setbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } else if (!is_paging(vcpu)) { - /* From nonpaging to paging */ - exec_controls_clearbit(vmx, CPU_BASED_CR3_LOAD_EXITING | - CPU_BASED_CR3_STORE_EXITING); - vcpu->arch.cr0 = cr0; - vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); - } - - if (!(cr0 & X86_CR0_WP)) - *hw_cr0 &= ~X86_CR0_WP; -} +#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \ + CPU_BASED_CR3_STORE_EXITING) void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) { struct vcpu_vmx *vmx = to_vmx(vcpu); - unsigned long hw_cr0; + unsigned long hw_cr0, old_cr0_pg; + u32 tmp; + + old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG); hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF); if (is_unrestricted_guest(vcpu)) hw_cr0 |= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST; else { hw_cr0 |= KVM_VM_CR0_ALWAYS_ON; + if (!enable_ept) + hw_cr0 |= X86_CR0_WP; if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE)) enter_pmode(vcpu); @@ -3041,23 +3021,61 @@ void vmx_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) enter_rmode(vcpu); } -#ifdef CONFIG_X86_64 - if (vcpu->arch.efer & EFER_LME) { - if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) - enter_lmode(vcpu); - if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) - exit_lmode(vcpu); - } -#endif - - if (enable_ept && !is_unrestricted_guest(vcpu)) - ept_update_paging_mode_cr0(&hw_cr0, cr0, vcpu); - vmcs_writel(CR0_READ_SHADOW, cr0); vmcs_writel(GUEST_CR0, hw_cr0); vcpu->arch.cr0 = cr0; kvm_register_mark_available(vcpu, VCPU_EXREG_CR0); +#ifdef CONFIG_X86_64 + if (vcpu->arch.efer & EFER_LME) { + if (!old_cr0_pg && (cr0 & X86_CR0_PG)) + enter_lmode(vcpu); + else if (old_cr0_pg && !(cr0 & X86_CR0_PG)) + exit_lmode(vcpu); + } +#endif + + if (enable_ept && !is_unrestricted_guest(vcpu)) { + /* + * Ensure KVM has an up-to-date snapshot of the guest's CR3. If + * the below code _enables_ CR3 exiting, vmx_cache_reg() will + * (correctly) stop reading vmcs.GUEST_CR3 because it thinks + * KVM's CR3 is installed. + */ + if (!kvm_register_is_available(vcpu, VCPU_EXREG_CR3)) + vmx_cache_reg(vcpu, VCPU_EXREG_CR3); + + /* + * When running with EPT but not unrestricted guest, KVM must + * intercept CR3 accesses when paging is _disabled_. This is + * necessary because restricted guests can't actually run with + * paging disabled, and so KVM stuffs its own CR3 in order to + * run the guest when identity mapped page tables. + * + * Do _NOT_ check the old CR0.PG, e.g. to optimize away the + * update, it may be stale with respect to CR3 interception, + * e.g. after nested VM-Enter. + * + * Lastly, honor L1's desires, i.e. intercept CR3 loads and/or + * stores to forward them to L1, even if KVM does not need to + * intercept them to preserve its identity mapped page tables. + */ + if (!(cr0 & X86_CR0_PG)) { + exec_controls_setbit(vmx, CR3_EXITING_BITS); + } else if (!is_guest_mode(vcpu)) { + exec_controls_clearbit(vmx, CR3_EXITING_BITS); + } else { + tmp = exec_controls_get(vmx); + tmp &= ~CR3_EXITING_BITS; + tmp |= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS; + exec_controls_set(vmx, tmp); + } + + /* Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. */ + if ((old_cr0_pg ^ cr0) & X86_CR0_PG) + vmx_set_cr4(vcpu, kvm_read_cr4(vcpu)); + } + /* depends on vcpu->arch.cr0 to be set to a new value */ vmx->emulation_required = emulation_required(vcpu); } @@ -3271,7 +3289,7 @@ static u32 vmx_segment_access_rights(struct kvm_segment *var) return ar; } -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) { struct vcpu_vmx *vmx = to_vmx(vcpu); const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg]; @@ -3284,7 +3302,7 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) vmcs_write16(sf->selector, var->selector); else if (var->s) fix_rmode_seg(seg, &vmx->rmode.segs[seg]); - goto out; + return; } vmcs_writel(sf->base, var->base); @@ -3306,9 +3324,13 @@ void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) var->type |= 0x1; /* Accessed */ vmcs_write32(sf->ar_bytes, vmx_segment_access_rights(var)); +} -out: - vmx->emulation_required = emulation_required(vcpu); +static void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg) +{ + __vmx_set_segment(vcpu, var, seg); + + to_vmx(vcpu)->emulation_required = emulation_required(vcpu); } static void vmx_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) @@ -3790,21 +3812,6 @@ void vmx_enable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) vmx_set_msr_bitmap_write(msr_bitmap, msr); } -static u8 vmx_msr_bitmap_mode(struct kvm_vcpu *vcpu) -{ - u8 mode = 0; - - if (cpu_has_secondary_exec_ctrls() && - (secondary_exec_controls_get(to_vmx(vcpu)) & - SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { - mode |= MSR_BITMAP_MODE_X2APIC; - if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) - mode |= MSR_BITMAP_MODE_X2APIC_APICV; - } - - return mode; -} - static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) { unsigned long *msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; @@ -3822,11 +3829,29 @@ static void vmx_reset_x2apic_msrs(struct kvm_vcpu *vcpu, u8 mode) } } -static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) +static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu) { + struct vcpu_vmx *vmx = to_vmx(vcpu); + u8 mode; + if (!cpu_has_vmx_msr_bitmap()) return; + if (cpu_has_secondary_exec_ctrls() && + (secondary_exec_controls_get(vmx) & + SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) { + mode = MSR_BITMAP_MODE_X2APIC; + if (enable_apicv && kvm_vcpu_apicv_active(vcpu)) + mode |= MSR_BITMAP_MODE_X2APIC_APICV; + } else { + mode = 0; + } + + if (mode == vmx->x2apic_msr_bitmap_mode) + return; + + vmx->x2apic_msr_bitmap_mode = mode; + vmx_reset_x2apic_msrs(vcpu, mode); /* @@ -3843,21 +3868,6 @@ static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu, u8 mode) } } -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu) -{ - struct vcpu_vmx *vmx = to_vmx(vcpu); - u8 mode = vmx_msr_bitmap_mode(vcpu); - u8 changed = mode ^ vmx->msr_bitmap_mode; - - if (!changed) - return; - - if (changed & (MSR_BITMAP_MODE_X2APIC | MSR_BITMAP_MODE_X2APIC_APICV)) - vmx_update_msr_bitmap_x2apic(vcpu, mode); - - vmx->msr_bitmap_mode = mode; -} - void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -3914,7 +3924,6 @@ static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu) } pt_update_intercept_for_msr(vcpu); - vmx_update_msr_bitmap_x2apic(vcpu, vmx_msr_bitmap_mode(vcpu)); } static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu, @@ -4086,7 +4095,7 @@ void set_cr4_guest_host_mask(struct vcpu_vmx *vmx) vmcs_writel(CR4_GUEST_HOST_MASK, ~vcpu->arch.cr4_guest_owned_bits); } -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) +static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) { u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl; @@ -4102,6 +4111,30 @@ u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx) return pin_based_exec_ctrl; } +static u32 vmx_vmentry_ctrl(void) +{ + u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; + + if (vmx_pt_mode_is_system()) + vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | + VM_ENTRY_LOAD_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmentry_ctrl & + ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); +} + +static u32 vmx_vmexit_ctrl(void) +{ + u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; + + if (vmx_pt_mode_is_system()) + vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | + VM_EXIT_CLEAR_IA32_RTIT_CTL); + /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ + return vmexit_ctrl & + ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); +} + static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -4118,11 +4151,10 @@ static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY); } - if (cpu_has_vmx_msr_bitmap()) - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } -u32 vmx_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_exec_control(struct vcpu_vmx *vmx) { u32 exec_control = vmcs_config.cpu_based_exec_ctrl; @@ -4204,7 +4236,7 @@ vmx_adjust_secondary_exec_control(struct vcpu_vmx *vmx, u32 *exec_control, #define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \ vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true) -static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) +static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx) { struct kvm_vcpu *vcpu = &vmx->vcpu; @@ -4290,7 +4322,7 @@ static void vmx_compute_secondary_exec_control(struct vcpu_vmx *vmx) if (!vcpu->kvm->arch.bus_lock_detection_enabled) exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION; - vmx->secondary_exec_control = exec_control; + return exec_control; } #define VMX_XSS_EXIT_BITMAP 0 @@ -4314,10 +4346,8 @@ static void init_vmcs(struct vcpu_vmx *vmx) exec_controls_set(vmx, vmx_exec_control(vmx)); - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - secondary_exec_controls_set(vmx, vmx->secondary_exec_control); - } + if (cpu_has_secondary_exec_ctrls()) + secondary_exec_controls_set(vmx, vmx_secondary_exec_control(vmx)); if (kvm_vcpu_apicv_active(&vmx->vcpu)) { vmcs_write64(EOI_EXIT_BITMAP0, 0); @@ -4388,32 +4418,35 @@ static void init_vmcs(struct vcpu_vmx *vmx) vmx->pt_desc.guest.output_mask = 0x7F; vmcs_write64(GUEST_IA32_RTIT_CTL, 0); } + + vmcs_write32(GUEST_SYSENTER_CS, 0); + vmcs_writel(GUEST_SYSENTER_ESP, 0); + vmcs_writel(GUEST_SYSENTER_EIP, 0); + vmcs_write64(GUEST_IA32_DEBUGCTL, 0); + + if (cpu_has_vmx_tpr_shadow()) { + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); + if (cpu_need_tpr_shadow(&vmx->vcpu)) + vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, + __pa(vmx->vcpu.arch.apic->regs)); + vmcs_write32(TPR_THRESHOLD, 0); + } + + vmx_setup_uret_msrs(vmx); } static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { struct vcpu_vmx *vmx = to_vmx(vcpu); - struct msr_data apic_base_msr; - u64 cr0; vmx->rmode.vm86_active = 0; vmx->spec_ctrl = 0; vmx->msr_ia32_umwait_control = 0; - vmx->vcpu.arch.regs[VCPU_REGS_RDX] = get_rdx_init_val(); vmx->hv_deadline_tsc = -1; kvm_set_cr8(vcpu, 0); - if (!init_event) { - apic_base_msr.data = APIC_DEFAULT_PHYS_BASE | - MSR_IA32_APICBASE_ENABLE; - if (kvm_vcpu_is_reset_bsp(vcpu)) - apic_base_msr.data |= MSR_IA32_APICBASE_BSP; - apic_base_msr.host_initiated = true; - kvm_set_apic_base(vcpu, &apic_base_msr); - } - vmx_segment_cache_clear(vmx); seg_setup(VCPU_SREG_CS); @@ -4436,16 +4469,6 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vmcs_write32(GUEST_LDTR_LIMIT, 0xffff); vmcs_write32(GUEST_LDTR_AR_BYTES, 0x00082); - if (!init_event) { - vmcs_write32(GUEST_SYSENTER_CS, 0); - vmcs_writel(GUEST_SYSENTER_ESP, 0); - vmcs_writel(GUEST_SYSENTER_EIP, 0); - vmcs_write64(GUEST_IA32_DEBUGCTL, 0); - } - - kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); - kvm_rip_write(vcpu, 0xfff0); - vmcs_writel(GUEST_GDTR_BASE, 0); vmcs_write32(GUEST_GDTR_LIMIT, 0xffff); @@ -4458,31 +4481,11 @@ static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) if (kvm_mpx_supported()) vmcs_write64(GUEST_BNDCFGS, 0); - setup_msrs(vmx); - vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0); /* 22.2.1 */ - if (cpu_has_vmx_tpr_shadow() && !init_event) { - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, 0); - if (cpu_need_tpr_shadow(vcpu)) - vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, - __pa(vcpu->arch.apic->regs)); - vmcs_write32(TPR_THRESHOLD, 0); - } - kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu); - cr0 = X86_CR0_NW | X86_CR0_CD | X86_CR0_ET; - vmx->vcpu.arch.cr0 = cr0; - vmx_set_cr0(vcpu, cr0); /* enter rmode */ - vmx_set_cr4(vcpu, 0); - vmx_set_efer(vcpu, 0); - - vmx_update_exception_bitmap(vcpu); - vpid_sync_context(vmx->vpid); - if (init_event) - vmx_clear_hlt(vcpu); } static void vmx_enable_irq_window(struct kvm_vcpu *vcpu) @@ -4996,6 +4999,7 @@ static int handle_cr(struct kvm_vcpu *vcpu) return kvm_complete_insn_gp(vcpu, err); case 3: WARN_ON_ONCE(enable_unrestricted_guest); + err = kvm_set_cr3(vcpu, val); return kvm_complete_insn_gp(vcpu, err); case 4: @@ -5021,14 +5025,13 @@ static int handle_cr(struct kvm_vcpu *vcpu) } break; case 2: /* clts */ - WARN_ONCE(1, "Guest should always own CR0.TS"); - vmx_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~X86_CR0_TS)); - trace_kvm_cr_write(0, kvm_read_cr0(vcpu)); - return kvm_skip_emulated_instruction(vcpu); + KVM_BUG(1, vcpu->kvm, "Guest always owns CR0.TS"); + return -EIO; case 1: /*mov from cr*/ switch (cr) { case 3: WARN_ON_ONCE(enable_unrestricted_guest); + val = kvm_read_cr3(vcpu); kvm_register_write(vcpu, reg, val); trace_kvm_cr_read(cr, val); @@ -5129,6 +5132,12 @@ static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; exec_controls_setbit(to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING); + + /* + * exc_debug expects dr6 to be cleared after it runs, avoid that it sees + * a stale dr6 from the guest. + */ + set_debugreg(DR6_RESERVED, 6); } static void vmx_set_dr7(struct kvm_vcpu *vcpu, unsigned long val) @@ -5338,7 +5347,9 @@ static int handle_ept_misconfig(struct kvm_vcpu *vcpu) static int handle_nmi_window(struct kvm_vcpu *vcpu) { - WARN_ON_ONCE(!enable_vnmi); + if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm)) + return -EIO; + exec_controls_clearbit(to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING); ++vcpu->stat.nmi_window_exits; kvm_make_request(KVM_REQ_EVENT, vcpu); @@ -5896,7 +5907,8 @@ static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) * below) should never happen as that means we incorrectly allowed a * nested VM-Enter with an invalid vmcs12. */ - WARN_ON_ONCE(vmx->nested.nested_run_pending); + if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm)) + return -EIO; /* If guest state is invalid, start emulating */ if (vmx->emulation_required) @@ -6189,7 +6201,7 @@ void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu) } secondary_exec_controls_set(vmx, sec_exec_control); - vmx_update_msr_bitmap(vcpu); + vmx_update_msr_bitmap_x2apic(vcpu); } static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu) @@ -6274,7 +6286,9 @@ static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu) int max_irr; bool max_irr_updated; - WARN_ON(!vcpu->arch.apicv_active); + if (KVM_BUG_ON(!vcpu->arch.apicv_active, vcpu->kvm)) + return -EIO; + if (pi_test_on(&vmx->pi_desc)) { pi_clear_on(&vmx->pi_desc); /* @@ -6357,7 +6371,7 @@ static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu) unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK; gate_desc *desc = (gate_desc *)host_idt_base + vector; - if (WARN_ONCE(!is_external_intr(intr_info), + if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm, "KVM: unexpected VM-Exit interrupt info: 0x%x", intr_info)) return; @@ -6368,6 +6382,9 @@ static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu) { struct vcpu_vmx *vmx = to_vmx(vcpu); + if (vmx->emulation_required) + return; + if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT) handle_external_interrupt_irqoff(vcpu); else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI) @@ -6639,6 +6656,10 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) vmx->loaded_vmcs->host_state.cr4 = cr4; } + /* When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. */ + if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) + set_debugreg(vcpu->arch.dr6, 6); + /* When single-stepping over STI and MOV SS, we must clear the * corresponding interruptibility bits in the guest state. Otherwise * vmentry fails as it then expects bit 14 (BS) in pending debug @@ -6838,7 +6859,6 @@ static int vmx_create_vcpu(struct kvm_vcpu *vcpu) vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R); vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R); } - vmx->msr_bitmap_mode = 0; vmx->loaded_vmcs = &vmx->vmcs01; cpu = get_cpu(); @@ -6997,7 +7017,7 @@ exit: return (cache << VMX_EPT_MT_EPTE_SHIFT) | ipat; } -static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) +static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl) { /* * These bits in the secondary execution controls field @@ -7011,7 +7031,6 @@ static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx) SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES | SECONDARY_EXEC_DESC; - u32 new_ctl = vmx->secondary_exec_control; u32 cur_ctl = secondary_exec_controls_get(vmx); secondary_exec_controls_set(vmx, (new_ctl & ~mask) | (cur_ctl & mask)); @@ -7154,10 +7173,11 @@ static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) /* xsaves_enabled is recomputed in vmx_compute_secondary_exec_control(). */ vcpu->arch.xsaves_enabled = false; - if (cpu_has_secondary_exec_ctrls()) { - vmx_compute_secondary_exec_control(vmx); - vmcs_set_secondary_exec_control(vmx); - } + vmx_setup_uret_msrs(vmx); + + if (cpu_has_secondary_exec_ctrls()) + vmcs_set_secondary_exec_control(vmx, + vmx_secondary_exec_control(vmx)); if (nested_vmx_allowed(vcpu)) to_vmx(vcpu)->msr_ia32_feature_control_valid_bits |= @@ -7803,7 +7823,8 @@ static __init int hardware_setup(void) ept_lpage_level = PG_LEVEL_2M; else ept_lpage_level = PG_LEVEL_4K; - kvm_configure_mmu(enable_ept, vmx_get_max_tdp_level(), ept_lpage_level); + kvm_configure_mmu(enable_ept, 0, vmx_get_max_tdp_level(), + ept_lpage_level); /* * Only enable PML when hardware supports PML feature, and both EPT diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index 17a1cb4b059d..4858c5fd95f2 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -227,7 +227,7 @@ struct nested_vmx { struct vcpu_vmx { struct kvm_vcpu vcpu; u8 fail; - u8 msr_bitmap_mode; + u8 x2apic_msr_bitmap_mode; /* * If true, host state has been stored in vmx->loaded_vmcs for @@ -263,8 +263,6 @@ struct vcpu_vmx { u64 spec_ctrl; u32 msr_ia32_umwait_control; - u32 secondary_exec_control; - /* * loaded_vmcs points to the VMCS currently used in this vcpu. For a * non-nested (L1) guest, it always points to vmcs01. For a nested @@ -371,12 +369,11 @@ void vmx_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4); void set_cr4_guest_host_mask(struct vcpu_vmx *vmx); void ept_save_pdptrs(struct kvm_vcpu *vcpu); void vmx_get_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); -void vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); +void __vmx_set_segment(struct kvm_vcpu *vcpu, struct kvm_segment *var, int seg); u64 construct_eptp(struct kvm_vcpu *vcpu, hpa_t root_hpa, int root_level); bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu); void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu); -void vmx_update_msr_bitmap(struct kvm_vcpu *vcpu); bool vmx_nmi_blocked(struct kvm_vcpu *vcpu); bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu); bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu); @@ -419,9 +416,13 @@ static inline void lname##_controls_set(struct vcpu_vmx *vmx, u32 val) \ vmx->loaded_vmcs->controls_shadow.lname = val; \ } \ } \ +static inline u32 __##lname##_controls_get(struct loaded_vmcs *vmcs) \ +{ \ + return vmcs->controls_shadow.lname; \ +} \ static inline u32 lname##_controls_get(struct vcpu_vmx *vmx) \ { \ - return vmx->loaded_vmcs->controls_shadow.lname; \ + return __##lname##_controls_get(vmx->loaded_vmcs); \ } \ static inline void lname##_controls_setbit(struct vcpu_vmx *vmx, u32 val) \ { \ @@ -451,31 +452,6 @@ static inline void vmx_register_cache_reset(struct kvm_vcpu *vcpu) vcpu->arch.regs_dirty = 0; } -static inline u32 vmx_vmentry_ctrl(void) -{ - u32 vmentry_ctrl = vmcs_config.vmentry_ctrl; - if (vmx_pt_mode_is_system()) - vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP | - VM_ENTRY_LOAD_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmentry_ctrl & - ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_IA32_EFER); -} - -static inline u32 vmx_vmexit_ctrl(void) -{ - u32 vmexit_ctrl = vmcs_config.vmexit_ctrl; - if (vmx_pt_mode_is_system()) - vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP | - VM_EXIT_CLEAR_IA32_RTIT_CTL); - /* Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically */ - return vmexit_ctrl & - ~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL | VM_EXIT_LOAD_IA32_EFER); -} - -u32 vmx_exec_control(struct vcpu_vmx *vmx); -u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx); - static inline struct kvm_vmx *to_kvm_vmx(struct kvm *kvm) { return container_of(kvm, struct kvm_vmx, kvm); diff --git a/arch/x86/kvm/vmx/vmx_ops.h b/arch/x86/kvm/vmx/vmx_ops.h index 164b64f65a8f..9e9ef47e988c 100644 --- a/arch/x86/kvm/vmx/vmx_ops.h +++ b/arch/x86/kvm/vmx/vmx_ops.h @@ -4,13 +4,11 @@ #include -#include #include #include "evmcs.h" #include "vmcs.h" - -#define __ex(x) __kvm_handle_fault_on_reboot(x) +#include "x86.h" asmlinkage void vmread_error(unsigned long field, bool fault); __attribute__((regparm(0))) void vmread_error_trampoline(unsigned long field, diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index e5d5c5ed7dd4..28ef14155726 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -233,12 +233,13 @@ const struct _kvm_stats_desc kvm_vm_stats_desc[] = { STATS_DESC_COUNTER(VM, mmu_recycled), STATS_DESC_COUNTER(VM, mmu_cache_miss), STATS_DESC_ICOUNTER(VM, mmu_unsync), - STATS_DESC_ICOUNTER(VM, lpages), + STATS_DESC_ICOUNTER(VM, pages_4k), + STATS_DESC_ICOUNTER(VM, pages_2m), + STATS_DESC_ICOUNTER(VM, pages_1g), STATS_DESC_ICOUNTER(VM, nx_lpage_splits), + STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size), STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions) }; -static_assert(ARRAY_SIZE(kvm_vm_stats_desc) == - sizeof(struct kvm_vm_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vm_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -278,8 +279,6 @@ const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = { STATS_DESC_COUNTER(VCPU, directed_yield_successful), STATS_DESC_ICOUNTER(VCPU, guest_mode) }; -static_assert(ARRAY_SIZE(kvm_vcpu_stats_desc) == - sizeof(struct kvm_vcpu_stat) / sizeof(u64)); const struct kvm_stats_header kvm_vcpu_stats_header = { .name_size = KVM_STATS_NAME_SIZE, @@ -485,7 +484,14 @@ int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info) } EXPORT_SYMBOL_GPL(kvm_set_apic_base); -asmlinkage __visible noinstr void kvm_spurious_fault(void) +/* + * Handle a fault on a hardware virtualization (VMX or SVM) instruction. + * + * Hardware virtualization extension instructions may fault if a reboot turns + * off virtualization while processes are running. Usually after catching the + * fault we just panic; during reboot instead the instruction is ignored. + */ +noinstr void kvm_spurious_fault(void) { /* Fault while not rebooting. We want the trace. */ BUG_ON(!kvm_rebooting); @@ -1180,7 +1186,6 @@ static void kvm_update_dr0123(struct kvm_vcpu *vcpu) if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) { for (i = 0; i < KVM_NR_DB_REGS; i++) vcpu->arch.eff_db[i] = vcpu->arch.db[i]; - vcpu->arch.switch_db_regs |= KVM_DEBUGREG_RELOAD; } } @@ -3316,6 +3321,10 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!msr_info->host_initiated) { s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; adjust_tsc_offset_guest(vcpu, adj); + /* Before back to guest, tsc_timestamp must be adjusted + * as well, otherwise guest's percpu pvclock time could jump. + */ + kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); } vcpu->arch.ia32_tsc_adjust_msr = data; } @@ -4310,12 +4319,6 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) static_call(kvm_x86_vcpu_put)(vcpu); vcpu->arch.last_host_tsc = rdtsc(); - /* - * If userspace has set any breakpoints or watchpoints, dr6 is restored - * on every vmexit, but if not, we might have a stale dr6 from the - * guest. do_debug expects dr6 to be cleared after it runs, do the same. - */ - set_debugreg(0, 6); } static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, @@ -6567,9 +6570,9 @@ static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, * there is no pkey in EPT page table for L1 guest or EPT * shadow page table for L2 guest. */ - if (vcpu_match_mmio_gva(vcpu, gva) - && !permission_fault(vcpu, vcpu->arch.walk_mmu, - vcpu->arch.mmio_access, 0, access)) { + if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || + !permission_fault(vcpu, vcpu->arch.walk_mmu, + vcpu->arch.mmio_access, 0, access))) { *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | (gva & (PAGE_SIZE - 1)); trace_vcpu_match_mmio(gva, *gpa, write, false); @@ -8578,6 +8581,8 @@ EXPORT_SYMBOL_GPL(kvm_apicv_activated); static void kvm_apicv_init(struct kvm *kvm) { + mutex_init(&kvm->arch.apicv_update_lock); + if (enable_apicv) clear_bit(APICV_INHIBIT_REASON_DISABLE, &kvm->arch.apicv_inhibit_reasons); @@ -8891,6 +8896,10 @@ static int inject_pending_event(struct kvm_vcpu *vcpu, bool *req_immediate_exit) can_inject = false; } + /* Don't inject interrupts if the user asked to avoid doing so */ + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) + return 0; + /* * Finally, inject interrupt events. If an event cannot be injected * due to architectural conditions (e.g. IF=0) a window-open exit @@ -9236,10 +9245,18 @@ void kvm_make_scan_ioapic_request(struct kvm *kvm) void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) { + bool activate; + if (!lapic_in_kernel(vcpu)) return; - vcpu->arch.apicv_active = kvm_apicv_activated(vcpu->kvm); + mutex_lock(&vcpu->kvm->arch.apicv_update_lock); + + activate = kvm_apicv_activated(vcpu->kvm); + if (vcpu->arch.apicv_active == activate) + goto out; + + vcpu->arch.apicv_active = activate; kvm_apic_update_apicv(vcpu); static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); @@ -9251,54 +9268,45 @@ void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) */ if (!vcpu->arch.apicv_active) kvm_make_request(KVM_REQ_EVENT, vcpu); + +out: + mutex_unlock(&vcpu->kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_vcpu_update_apicv); -/* - * NOTE: Do not hold any lock prior to calling this. - * - * In particular, kvm_request_apicv_update() expects kvm->srcu not to be - * locked, because it calls __x86_set_memory_region() which does - * synchronize_srcu(&kvm->srcu). - */ -void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +void __kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) { - struct kvm_vcpu *except; - unsigned long old, new, expected; + unsigned long old, new; if (!kvm_x86_ops.check_apicv_inhibit_reasons || !static_call(kvm_x86_check_apicv_inhibit_reasons)(bit)) return; - old = READ_ONCE(kvm->arch.apicv_inhibit_reasons); - do { - expected = new = old; - if (activate) - __clear_bit(bit, &new); - else - __set_bit(bit, &new); - if (new == old) - break; - old = cmpxchg(&kvm->arch.apicv_inhibit_reasons, expected, new); - } while (old != expected); + old = new = kvm->arch.apicv_inhibit_reasons; - if (!!old == !!new) - return; + if (activate) + __clear_bit(bit, &new); + else + __set_bit(bit, &new); - trace_kvm_apicv_update_request(activate, bit); - if (kvm_x86_ops.pre_update_apicv_exec_ctrl) - static_call(kvm_x86_pre_update_apicv_exec_ctrl)(kvm, activate); + if (!!old != !!new) { + trace_kvm_apicv_update_request(activate, bit); + kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); + kvm->arch.apicv_inhibit_reasons = new; + if (new) { + unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE); + kvm_zap_gfn_range(kvm, gfn, gfn+1); + } + } else + kvm->arch.apicv_inhibit_reasons = new; +} +EXPORT_SYMBOL_GPL(__kvm_request_apicv_update); - /* - * Sending request to update APICV for all other vcpus, - * while update the calling vcpu immediately instead of - * waiting for another #VMEXIT to handle the request. - */ - except = kvm_get_running_vcpu(); - kvm_make_all_cpus_request_except(kvm, KVM_REQ_APICV_UPDATE, - except); - if (except) - kvm_vcpu_update_apicv(except); +void kvm_request_apicv_update(struct kvm *kvm, bool activate, ulong bit) +{ + mutex_lock(&kvm->arch.apicv_update_lock); + __kvm_request_apicv_update(kvm, activate, bit); + mutex_unlock(&kvm->arch.apicv_update_lock); } EXPORT_SYMBOL_GPL(kvm_request_apicv_update); @@ -9395,6 +9403,10 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { + if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + r = -EIO; + goto out; + } if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { r = 0; @@ -9608,8 +9620,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) set_debugreg(vcpu->arch.eff_db[1], 1); set_debugreg(vcpu->arch.eff_db[2], 2); set_debugreg(vcpu->arch.eff_db[3], 3); - set_debugreg(vcpu->arch.dr6, 6); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } else if (unlikely(hw_breakpoint_active())) { set_debugreg(0, 7); } @@ -9639,7 +9649,6 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); kvm_update_dr0123(vcpu); kvm_update_dr7(vcpu); - vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_RELOAD; } /* @@ -9976,7 +9985,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) goto out; } - if (kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) { + if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || + (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { r = -EINVAL; goto out; } @@ -10581,9 +10591,6 @@ static void store_regs(struct kvm_vcpu *vcpu) static int sync_regs(struct kvm_vcpu *vcpu) { - if (vcpu->run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS) - return -EINVAL; - if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { __set_regs(vcpu, &vcpu->run->s.regs.regs); vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; @@ -10799,6 +10806,8 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) { unsigned long old_cr0 = kvm_read_cr0(vcpu); + unsigned long new_cr0; + u32 eax, dummy; kvm_lapic_reset(vcpu, init_event); @@ -10865,10 +10874,41 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) vcpu->arch.regs_avail = ~0; vcpu->arch.regs_dirty = ~0; + /* + * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) + * if no CPUID match is found. Note, it's impossible to get a match at + * RESET since KVM emulates RESET before exposing the vCPU to userspace, + * i.e. it'simpossible for kvm_cpuid() to find a valid entry on RESET. + * But, go through the motions in case that's ever remedied. + */ + eax = 1; + if (!kvm_cpuid(vcpu, &eax, &dummy, &dummy, &dummy, true)) + eax = 0x600; + kvm_rdx_write(vcpu, eax); + vcpu->arch.ia32_xss = 0; static_call(kvm_x86_vcpu_reset)(vcpu, init_event); + kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); + kvm_rip_write(vcpu, 0xfff0); + + /* + * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions + * of Intel's SDM list CD/NW as being set on INIT, but they contradict + * (or qualify) that with a footnote stating that CD/NW are preserved. + */ + new_cr0 = X86_CR0_ET; + if (init_event) + new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); + else + new_cr0 |= X86_CR0_NW | X86_CR0_CD; + + static_call(kvm_x86_set_cr0)(vcpu, new_cr0); + static_call(kvm_x86_set_cr4)(vcpu, 0); + static_call(kvm_x86_set_efer)(vcpu, 0); + static_call(kvm_x86_update_exception_bitmap)(vcpu); + /* * Reset the MMU context if paging was enabled prior to INIT (which is * implied if CR0.PG=1 as CR0 will be '0' prior to RESET). Unlike the @@ -10879,7 +10919,20 @@ void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) */ if (old_cr0 & X86_CR0_PG) kvm_mmu_reset_context(vcpu); + + /* + * Intel's SDM states that all TLB entries are flushed on INIT. AMD's + * APM states the TLBs are untouched by INIT, but it also states that + * the TLBs are flushed on "External initialization of the processor." + * Flush the guest TLB regardless of vendor, there is no meaningful + * benefit in relying on the guest to flush the TLB immediately after + * INIT. A spurious TLB flush is benign and likely negligible from a + * performance perspective. + */ + if (init_event) + kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); } +EXPORT_SYMBOL_GPL(kvm_vcpu_reset); void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) { @@ -11123,6 +11176,7 @@ int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) kvm_hv_init_vm(kvm); kvm_page_track_init(kvm); kvm_mmu_init_vm(kvm); + kvm_xen_init_vm(kvm); return static_call(kvm_x86_vm_init)(kvm); } @@ -11312,8 +11366,7 @@ static int memslot_rmap_alloc(struct kvm_memory_slot *slot, for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { int level = i + 1; - int lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + int lpages = __kvm_mmu_slot_lpages(slot, npages, level); WARN_ON(slot->arch.rmap[i]); @@ -11396,8 +11449,7 @@ static int kvm_alloc_memslot_metadata(struct kvm *kvm, int lpages; int level = i + 1; - lpages = gfn_to_index(slot->base_gfn + npages - 1, - slot->base_gfn, level) + 1; + lpages = __kvm_mmu_slot_lpages(slot, npages, level); linfo = kvcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT); if (!linfo) @@ -11481,7 +11533,7 @@ static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable) static void kvm_mmu_slot_apply_flags(struct kvm *kvm, struct kvm_memory_slot *old, - struct kvm_memory_slot *new, + const struct kvm_memory_slot *new, enum kvm_mr_change change) { bool log_dirty_pages = new->flags & KVM_MEM_LOG_DIRTY_PAGES; @@ -11561,10 +11613,7 @@ void kvm_arch_commit_memory_region(struct kvm *kvm, kvm_mmu_change_mmu_pages(kvm, kvm_mmu_calculate_default_mmu_pages(kvm)); - /* - * FIXME: const-ify all uses of struct kvm_memory_slot. - */ - kvm_mmu_slot_apply_flags(kvm, old, (struct kvm_memory_slot *) new, change); + kvm_mmu_slot_apply_flags(kvm, old, new, change); /* Free the arrays associated with the old memslot. */ if (change == KVM_MR_MOVE) diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index 44ae10312740..7d66d63dc55a 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -8,6 +8,8 @@ #include "kvm_cache_regs.h" #include "kvm_emulate.h" +void kvm_spurious_fault(void); + static __always_inline void kvm_guest_enter_irqoff(void) { /* diff --git a/arch/x86/kvm/xen.c b/arch/x86/kvm/xen.c index ae17250e1efe..9ea9c3dabe37 100644 --- a/arch/x86/kvm/xen.c +++ b/arch/x86/kvm/xen.c @@ -25,15 +25,14 @@ static int kvm_xen_shared_info_init(struct kvm *kvm, gfn_t gfn) { gpa_t gpa = gfn_to_gpa(gfn); int wc_ofs, sec_hi_ofs; - int ret; + int ret = 0; int idx = srcu_read_lock(&kvm->srcu); - ret = kvm_gfn_to_hva_cache_init(kvm, &kvm->arch.xen.shinfo_cache, - gpa, PAGE_SIZE); - if (ret) + if (kvm_is_error_hva(gfn_to_hva(kvm, gfn))) { + ret = -EFAULT; goto out; - - kvm->arch.xen.shinfo_set = true; + } + kvm->arch.xen.shinfo_gfn = gfn; /* Paranoia checks on the 32-bit struct layout */ BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900); @@ -245,7 +244,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) case KVM_XEN_ATTR_TYPE_SHARED_INFO: if (data->u.shared_info.gfn == GPA_INVALID) { - kvm->arch.xen.shinfo_set = false; + kvm->arch.xen.shinfo_gfn = GPA_INVALID; r = 0; break; } @@ -283,10 +282,7 @@ int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data) break; case KVM_XEN_ATTR_TYPE_SHARED_INFO: - if (kvm->arch.xen.shinfo_set) - data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa); - else - data->u.shared_info.gfn = GPA_INVALID; + data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_gfn); r = 0; break; @@ -646,6 +642,11 @@ int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc) return 0; } +void kvm_xen_init_vm(struct kvm *kvm) +{ + kvm->arch.xen.shinfo_gfn = GPA_INVALID; +} + void kvm_xen_destroy_vm(struct kvm *kvm) { if (kvm->arch.xen_hvm_config.msr) diff --git a/arch/x86/kvm/xen.h b/arch/x86/kvm/xen.h index 463a7844a8ca..cc0cf5f37450 100644 --- a/arch/x86/kvm/xen.h +++ b/arch/x86/kvm/xen.h @@ -21,6 +21,7 @@ int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data); int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data); int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc); +void kvm_xen_init_vm(struct kvm *kvm); void kvm_xen_destroy_vm(struct kvm *kvm); static inline bool kvm_xen_msr_enabled(struct kvm *kvm) @@ -50,6 +51,10 @@ static inline int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data) return 1; } +static inline void kvm_xen_init_vm(struct kvm *kvm) +{ +} + static inline void kvm_xen_destroy_vm(struct kvm *kvm) { } diff --git a/drivers/firmware/dmi-id.c b/drivers/firmware/dmi-id.c index 4d5421d14a41..940ddf916202 100644 --- a/drivers/firmware/dmi-id.c +++ b/drivers/firmware/dmi-id.c @@ -73,6 +73,10 @@ static void ascii_filter(char *d, const char *s) static ssize_t get_modalias(char *buffer, size_t buffer_size) { + /* + * Note new fields need to be added at the end to keep compatibility + * with udev's hwdb which does matches on "`cat dmi/id/modalias`*". + */ static const struct mafield { const char *prefix; int field; @@ -85,13 +89,13 @@ static ssize_t get_modalias(char *buffer, size_t buffer_size) { "svn", DMI_SYS_VENDOR }, { "pn", DMI_PRODUCT_NAME }, { "pvr", DMI_PRODUCT_VERSION }, - { "sku", DMI_PRODUCT_SKU }, { "rvn", DMI_BOARD_VENDOR }, { "rn", DMI_BOARD_NAME }, { "rvr", DMI_BOARD_VERSION }, { "cvn", DMI_CHASSIS_VENDOR }, { "ct", DMI_CHASSIS_TYPE }, { "cvr", DMI_CHASSIS_VERSION }, + { "sku", DMI_PRODUCT_SKU }, { NULL, DMI_NONE } }; diff --git a/drivers/net/bonding/bond_3ad.c b/drivers/net/bonding/bond_3ad.c index a4a202b9a0a2..6006c2e8fa2b 100644 --- a/drivers/net/bonding/bond_3ad.c +++ b/drivers/net/bonding/bond_3ad.c @@ -96,7 +96,7 @@ static int ad_marker_send(struct port *port, struct bond_marker *marker); static void ad_mux_machine(struct port *port, bool *update_slave_arr); static void ad_rx_machine(struct lacpdu *lacpdu, struct port *port); static void ad_tx_machine(struct port *port); -static void ad_periodic_machine(struct port *port, struct bond_params bond_params); +static void ad_periodic_machine(struct port *port, struct bond_params *bond_params); static void ad_port_selection_logic(struct port *port, bool *update_slave_arr); static void ad_agg_selection_logic(struct aggregator *aggregator, bool *update_slave_arr); @@ -1298,7 +1298,7 @@ static void ad_tx_machine(struct port *port) * * Turn ntt flag on priodically to perform periodic transmission of lacpdu's. */ -static void ad_periodic_machine(struct port *port, struct bond_params bond_params) +static void ad_periodic_machine(struct port *port, struct bond_params *bond_params) { periodic_states_t last_state; @@ -1308,7 +1308,7 @@ static void ad_periodic_machine(struct port *port, struct bond_params bond_param /* check if port was reinitialized */ if (((port->sm_vars & AD_PORT_BEGIN) || !(port->sm_vars & AD_PORT_LACP_ENABLED) || !port->is_enabled) || (!(port->actor_oper_port_state & LACP_STATE_LACP_ACTIVITY) && !(port->partner_oper.port_state & LACP_STATE_LACP_ACTIVITY)) || - !bond_params.lacp_active) { + !bond_params->lacp_active) { port->sm_periodic_state = AD_NO_PERIODIC; } /* check if state machine should change state */ @@ -2342,7 +2342,7 @@ void bond_3ad_state_machine_handler(struct work_struct *work) } ad_rx_machine(NULL, port); - ad_periodic_machine(port, bond->params); + ad_periodic_machine(port, &bond->params); ad_port_selection_logic(port, &update_slave_arr); ad_mux_machine(port, &update_slave_arr); ad_tx_machine(port); diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index b0966e733926..77dc79a7f574 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -2169,7 +2169,7 @@ int bond_enslave(struct net_device *bond_dev, struct net_device *slave_dev, res = -EOPNOTSUPP; goto err_sysfs_del; } - } else { + } else if (bond->xdp_prog) { struct netdev_bpf xdp = { .command = XDP_SETUP_PROG, .flags = 0, @@ -2910,9 +2910,9 @@ static void bond_arp_send_all(struct bonding *bond, struct slave *slave) * probe to generate any traffic (arp_validate=0) */ if (bond->params.arp_validate) - net_warn_ratelimited("%s: no route to arp_ip_target %pI4 and arp_validate is set\n", - bond->dev->name, - &targets[i]); + pr_warn_once("%s: no route to arp_ip_target %pI4 and arp_validate is set\n", + bond->dev->name, + &targets[i]); bond_arp_send(slave, ARPOP_REQUEST, targets[i], 0, tags); continue; @@ -5224,13 +5224,12 @@ static int bond_xdp_set(struct net_device *dev, struct bpf_prog *prog, bpf_prog_inc(prog); } - if (old_prog) - bpf_prog_put(old_prog); - - if (prog) + if (prog) { static_branch_inc(&bpf_master_redirect_enabled_key); - else + } else if (old_prog) { + bpf_prog_put(old_prog); static_branch_dec(&bpf_master_redirect_enabled_key); + } return 0; diff --git a/drivers/net/can/c_can/c_can_ethtool.c b/drivers/net/can/c_can/c_can_ethtool.c index cd5f07fca2a5..377c7d2e7612 100644 --- a/drivers/net/can/c_can/c_can_ethtool.c +++ b/drivers/net/can/c_can/c_can_ethtool.c @@ -15,10 +15,8 @@ static void c_can_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *info) { struct c_can_priv *priv = netdev_priv(netdev); - struct platform_device *pdev = to_platform_device(priv->device); - strscpy(info->driver, "c_can", sizeof(info->driver)); - strscpy(info->bus_info, pdev->name, sizeof(info->bus_info)); + strscpy(info->bus_info, dev_name(priv->device), sizeof(info->bus_info)); } static void c_can_get_ringparam(struct net_device *netdev, diff --git a/drivers/net/can/rcar/rcar_canfd.c b/drivers/net/can/rcar/rcar_canfd.c index c47988d3674e..ff9d0f5ae0dd 100644 --- a/drivers/net/can/rcar/rcar_canfd.c +++ b/drivers/net/can/rcar/rcar_canfd.c @@ -2017,7 +2017,7 @@ static int __maybe_unused rcar_canfd_resume(struct device *dev) static SIMPLE_DEV_PM_OPS(rcar_canfd_pm_ops, rcar_canfd_suspend, rcar_canfd_resume); -static const struct of_device_id rcar_canfd_of_table[] = { +static const __maybe_unused struct of_device_id rcar_canfd_of_table[] = { { .compatible = "renesas,rcar-gen3-canfd", .data = (void *)RENESAS_RCAR_GEN3 }, { .compatible = "renesas,rzg2l-canfd", .data = (void *)RENESAS_RZG2L }, { } diff --git a/drivers/net/dsa/b53/b53_common.c b/drivers/net/dsa/b53/b53_common.c index bd1417a66cbf..604f54112665 100644 --- a/drivers/net/dsa/b53/b53_common.c +++ b/drivers/net/dsa/b53/b53_common.c @@ -1144,7 +1144,7 @@ static void b53_force_link(struct b53_device *dev, int port, int link) u8 reg, val, off; /* Override the port settings */ - if (port == dev->cpu_port) { + if (port == dev->imp_port) { off = B53_PORT_OVERRIDE_CTRL; val = PORT_OVERRIDE_EN; } else { @@ -1168,7 +1168,7 @@ static void b53_force_port_config(struct b53_device *dev, int port, u8 reg, val, off; /* Override the port settings */ - if (port == dev->cpu_port) { + if (port == dev->imp_port) { off = B53_PORT_OVERRIDE_CTRL; val = PORT_OVERRIDE_EN; } else { @@ -1236,7 +1236,7 @@ static void b53_adjust_link(struct dsa_switch *ds, int port, b53_force_link(dev, port, phydev->link); if (is531x5(dev) && phy_interface_is_rgmii(phydev)) { - if (port == 8) + if (port == dev->imp_port) off = B53_RGMII_CTRL_IMP; else off = B53_RGMII_CTRL_P(port); @@ -2280,6 +2280,7 @@ struct b53_chip_data { const char *dev_name; u16 vlans; u16 enabled_ports; + u8 imp_port; u8 cpu_port; u8 vta_regs[3]; u8 arl_bins; @@ -2304,6 +2305,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1f, .arl_bins = 2, .arl_buckets = 1024, + .imp_port = 5, .cpu_port = B53_CPU_PORT_25, .duplex_reg = B53_DUPLEX_STAT_FE, }, @@ -2314,6 +2316,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1f, .arl_bins = 2, .arl_buckets = 1024, + .imp_port = 5, .cpu_port = B53_CPU_PORT_25, .duplex_reg = B53_DUPLEX_STAT_FE, }, @@ -2324,6 +2327,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1f, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2337,6 +2341,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1f, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2350,6 +2355,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1f, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_9798, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2363,6 +2369,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x7f, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_9798, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2377,6 +2384,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .arl_bins = 4, .arl_buckets = 1024, .vta_regs = B53_VTA_REGS, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .duplex_reg = B53_DUPLEX_STAT_GE, .jumbo_pm_reg = B53_JUMBO_PORT_MASK, @@ -2389,6 +2397,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0xff, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2402,6 +2411,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1ff, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2415,6 +2425,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0, /* pdata must provide them */ .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS_63XX, .duplex_reg = B53_DUPLEX_STAT_63XX, @@ -2428,6 +2439,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1f, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2441,6 +2453,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1bf, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2454,6 +2467,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1bf, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2467,6 +2481,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1f, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2480,6 +2495,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1f, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT_25, /* TODO: auto detect */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2493,6 +2509,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1ff, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2506,6 +2523,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x103, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2520,6 +2538,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1bf, .arl_bins = 4, .arl_buckets = 256, + .imp_port = 8, .cpu_port = 8, /* TODO: ports 4, 5, 8 */ .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2533,6 +2552,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1ff, .arl_bins = 4, .arl_buckets = 1024, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2546,6 +2566,7 @@ static const struct b53_chip_data b53_switch_chips[] = { .enabled_ports = 0x1ff, .arl_bins = 4, .arl_buckets = 256, + .imp_port = 8, .cpu_port = B53_CPU_PORT, .vta_regs = B53_VTA_REGS, .duplex_reg = B53_DUPLEX_STAT_GE, @@ -2571,6 +2592,7 @@ static int b53_switch_init(struct b53_device *dev) dev->vta_regs[1] = chip->vta_regs[1]; dev->vta_regs[2] = chip->vta_regs[2]; dev->jumbo_pm_reg = chip->jumbo_pm_reg; + dev->imp_port = chip->imp_port; dev->cpu_port = chip->cpu_port; dev->num_vlans = chip->vlans; dev->num_arl_bins = chip->arl_bins; @@ -2612,9 +2634,10 @@ static int b53_switch_init(struct b53_device *dev) dev->cpu_port = 5; } - /* cpu port is always last */ - dev->num_ports = dev->cpu_port + 1; dev->enabled_ports |= BIT(dev->cpu_port); + dev->num_ports = fls(dev->enabled_ports); + + dev->ds->num_ports = min_t(unsigned int, dev->num_ports, DSA_MAX_PORTS); /* Include non standard CPU port built-in PHYs to be probed */ if (is539x(dev) || is531x5(dev)) { @@ -2660,7 +2683,6 @@ struct b53_device *b53_switch_alloc(struct device *base, return NULL; ds->dev = base; - ds->num_ports = DSA_MAX_PORTS; dev = devm_kzalloc(base, sizeof(*dev), GFP_KERNEL); if (!dev) diff --git a/drivers/net/dsa/b53/b53_priv.h b/drivers/net/dsa/b53/b53_priv.h index 9bf8319342b0..5d068acf7cf8 100644 --- a/drivers/net/dsa/b53/b53_priv.h +++ b/drivers/net/dsa/b53/b53_priv.h @@ -123,6 +123,7 @@ struct b53_device { /* used ports mask */ u16 enabled_ports; + unsigned int imp_port; unsigned int cpu_port; /* connect specific data */ diff --git a/drivers/net/dsa/lantiq_gswip.c b/drivers/net/dsa/lantiq_gswip.c index e78026ef6d8c..64d6dfa83122 100644 --- a/drivers/net/dsa/lantiq_gswip.c +++ b/drivers/net/dsa/lantiq_gswip.c @@ -843,7 +843,8 @@ static int gswip_setup(struct dsa_switch *ds) gswip_switch_mask(priv, 0, GSWIP_MAC_CTRL_2_MLEN, GSWIP_MAC_CTRL_2p(cpu_port)); - gswip_switch_w(priv, VLAN_ETH_FRAME_LEN + 8, GSWIP_MAC_FLEN); + gswip_switch_w(priv, VLAN_ETH_FRAME_LEN + 8 + ETH_FCS_LEN, + GSWIP_MAC_FLEN); gswip_switch_mask(priv, 0, GSWIP_BM_QUEUE_GCTRL_GL_MOD, GSWIP_BM_QUEUE_GCTRL); diff --git a/drivers/net/ethernet/3com/3c59x.c b/drivers/net/ethernet/3com/3c59x.c index 17c16333a412..7b0ae9efc004 100644 --- a/drivers/net/ethernet/3com/3c59x.c +++ b/drivers/net/ethernet/3com/3c59x.c @@ -2786,7 +2786,7 @@ static void dump_tx_ring(struct net_device *dev) { if (vortex_debug > 0) { - struct vortex_private *vp = netdev_priv(dev); + struct vortex_private *vp = netdev_priv(dev); void __iomem *ioaddr = vp->ioaddr; if (vp->full_bus_master_tx) { diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index 627f85ee3922..9b86516e59a1 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -305,13 +305,15 @@ static bool bnxt_vf_pciid(enum board_idx idx) writel(DB_CP_FLAGS | RING_CMP(idx), (db)->doorbell) #define BNXT_DB_NQ_P5(db, idx) \ - writeq((db)->db_key64 | DBR_TYPE_NQ | RING_CMP(idx), (db)->doorbell) + bnxt_writeq(bp, (db)->db_key64 | DBR_TYPE_NQ | RING_CMP(idx), \ + (db)->doorbell) #define BNXT_DB_CQ_ARM(db, idx) \ writel(DB_CP_REARM_FLAGS | RING_CMP(idx), (db)->doorbell) #define BNXT_DB_NQ_ARM_P5(db, idx) \ - writeq((db)->db_key64 | DBR_TYPE_NQ_ARM | RING_CMP(idx), (db)->doorbell) + bnxt_writeq(bp, (db)->db_key64 | DBR_TYPE_NQ_ARM | RING_CMP(idx),\ + (db)->doorbell) static void bnxt_db_nq(struct bnxt *bp, struct bnxt_db_info *db, u32 idx) { @@ -332,8 +334,8 @@ static void bnxt_db_nq_arm(struct bnxt *bp, struct bnxt_db_info *db, u32 idx) static void bnxt_db_cq(struct bnxt *bp, struct bnxt_db_info *db, u32 idx) { if (bp->flags & BNXT_FLAG_CHIP_P5) - writeq(db->db_key64 | DBR_TYPE_CQ_ARMALL | RING_CMP(idx), - db->doorbell); + bnxt_writeq(bp, db->db_key64 | DBR_TYPE_CQ_ARMALL | + RING_CMP(idx), db->doorbell); else BNXT_DB_CQ(db, idx); } @@ -2200,25 +2202,34 @@ static int bnxt_async_event_process(struct bnxt *bp, if (!fw_health) goto async_event_process_exit; - fw_health->enabled = EVENT_DATA1_RECOVERY_ENABLED(data1); - fw_health->master = EVENT_DATA1_RECOVERY_MASTER_FUNC(data1); - if (!fw_health->enabled) { + if (!EVENT_DATA1_RECOVERY_ENABLED(data1)) { + fw_health->enabled = false; netif_info(bp, drv, bp->dev, "Error recovery info: error recovery[0]\n"); break; } + fw_health->master = EVENT_DATA1_RECOVERY_MASTER_FUNC(data1); fw_health->tmr_multiplier = DIV_ROUND_UP(fw_health->polling_dsecs * HZ, bp->current_interval * 10); fw_health->tmr_counter = fw_health->tmr_multiplier; - fw_health->last_fw_heartbeat = - bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG); - fw_health->last_fw_reset_cnt = - bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); + if (!fw_health->enabled) { + fw_health->last_fw_heartbeat = + bnxt_fw_health_readl(bp, BNXT_FW_HEARTBEAT_REG); + fw_health->last_fw_reset_cnt = + bnxt_fw_health_readl(bp, BNXT_FW_RESET_CNT_REG); + } netif_info(bp, drv, bp->dev, "Error recovery info: error recovery[1], master[%d], reset count[%u], health status: 0x%x\n", fw_health->master, fw_health->last_fw_reset_cnt, bnxt_fw_health_readl(bp, BNXT_FW_HEALTH_REG)); + if (!fw_health->enabled) { + /* Make sure tmr_counter is set and visible to + * bnxt_health_check() before setting enabled to true. + */ + smp_wmb(); + fw_health->enabled = true; + } goto async_event_process_exit; } case ASYNC_EVENT_CMPL_EVENT_ID_DEBUG_NOTIFICATION: @@ -2638,8 +2649,8 @@ static void __bnxt_poll_cqs_done(struct bnxt *bp, struct bnxt_napi *bnapi, if (cpr2 && cpr2->had_work_done) { db = &cpr2->cp_db; - writeq(db->db_key64 | dbr_type | - RING_CMP(cpr2->cp_raw_cons), db->doorbell); + bnxt_writeq(bp, db->db_key64 | dbr_type | + RING_CMP(cpr2->cp_raw_cons), db->doorbell); cpr2->had_work_done = 0; } } @@ -4639,6 +4650,13 @@ static int bnxt_hwrm_tunnel_dst_port_free(struct bnxt *bp, u8 tunnel_type) struct hwrm_tunnel_dst_port_free_input *req; int rc; + if (tunnel_type == TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN && + bp->vxlan_fw_dst_port_id == INVALID_HW_RING_ID) + return 0; + if (tunnel_type == TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE && + bp->nge_fw_dst_port_id == INVALID_HW_RING_ID) + return 0; + rc = hwrm_req_init(bp, req, HWRM_TUNNEL_DST_PORT_FREE); if (rc) return rc; @@ -4648,10 +4666,12 @@ static int bnxt_hwrm_tunnel_dst_port_free(struct bnxt *bp, u8 tunnel_type) switch (tunnel_type) { case TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN: req->tunnel_dst_port_id = cpu_to_le16(bp->vxlan_fw_dst_port_id); + bp->vxlan_port = 0; bp->vxlan_fw_dst_port_id = INVALID_HW_RING_ID; break; case TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE: req->tunnel_dst_port_id = cpu_to_le16(bp->nge_fw_dst_port_id); + bp->nge_port = 0; bp->nge_fw_dst_port_id = INVALID_HW_RING_ID; break; default: @@ -4689,10 +4709,12 @@ static int bnxt_hwrm_tunnel_dst_port_alloc(struct bnxt *bp, __be16 port, switch (tunnel_type) { case TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_VXLAN: + bp->vxlan_port = port; bp->vxlan_fw_dst_port_id = le16_to_cpu(resp->tunnel_dst_port_id); break; case TUNNEL_DST_PORT_ALLOC_REQ_TUNNEL_TYPE_GENEVE: + bp->nge_port = port; bp->nge_fw_dst_port_id = le16_to_cpu(resp->tunnel_dst_port_id); break; default: @@ -8221,12 +8243,10 @@ static int bnxt_hwrm_port_qstats_ext(struct bnxt *bp, u8 flags) static void bnxt_hwrm_free_tunnel_ports(struct bnxt *bp) { - if (bp->vxlan_fw_dst_port_id != INVALID_HW_RING_ID) - bnxt_hwrm_tunnel_dst_port_free( - bp, TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN); - if (bp->nge_fw_dst_port_id != INVALID_HW_RING_ID) - bnxt_hwrm_tunnel_dst_port_free( - bp, TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE); + bnxt_hwrm_tunnel_dst_port_free(bp, + TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN); + bnxt_hwrm_tunnel_dst_port_free(bp, + TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE); } static int bnxt_set_tpa(struct bnxt *bp, bool set_tpa) @@ -11247,6 +11267,8 @@ static void bnxt_fw_health_check(struct bnxt *bp) if (!fw_health->enabled || test_bit(BNXT_STATE_IN_FW_RESET, &bp->state)) return; + /* Make sure it is enabled before checking the tmr_counter. */ + smp_rmb(); if (fw_health->tmr_counter) { fw_health->tmr_counter--; return; @@ -12625,13 +12647,10 @@ static int bnxt_udp_tunnel_sync(struct net_device *netdev, unsigned int table) unsigned int cmd; udp_tunnel_nic_get_port(netdev, table, 0, &ti); - if (ti.type == UDP_TUNNEL_TYPE_VXLAN) { - bp->vxlan_port = ti.port; + if (ti.type == UDP_TUNNEL_TYPE_VXLAN) cmd = TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_VXLAN; - } else { - bp->nge_port = ti.port; + else cmd = TUNNEL_DST_PORT_FREE_REQ_TUNNEL_TYPE_GENEVE; - } if (ti.port) return bnxt_hwrm_tunnel_dst_port_alloc(bp, ti.port, cmd); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.h b/drivers/net/ethernet/broadcom/bnxt/bnxt.h index a8212dcdad5f..ec046e7a2484 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.h @@ -28,6 +28,7 @@ #include #include #include +#include #ifdef CONFIG_TEE_BNXT_FW #include #endif @@ -1981,7 +1982,7 @@ struct bnxt { struct mutex sriov_lock; #endif -#ifndef writeq +#if BITS_PER_LONG == 32 /* ensure atomic 64-bit doorbell writes on 32-bit systems. */ spinlock_t db_lock; #endif @@ -2110,24 +2111,36 @@ static inline u32 bnxt_tx_avail(struct bnxt *bp, struct bnxt_tx_ring_info *txr) ((txr->tx_prod - txr->tx_cons) & bp->tx_ring_mask); } -#ifndef writeq -#define writeq(val64, db) \ -do { \ - spin_lock(&bp->db_lock); \ - writel((val64) & 0xffffffff, db); \ - writel((val64) >> 32, (db) + 4); \ - spin_unlock(&bp->db_lock); \ -} while (0) - -#define writeq_relaxed writeq +static inline void bnxt_writeq(struct bnxt *bp, u64 val, + volatile void __iomem *addr) +{ +#if BITS_PER_LONG == 32 + spin_lock(&bp->db_lock); + lo_hi_writeq(val, addr); + spin_unlock(&bp->db_lock); +#else + writeq(val, addr); #endif +} + +static inline void bnxt_writeq_relaxed(struct bnxt *bp, u64 val, + volatile void __iomem *addr) +{ +#if BITS_PER_LONG == 32 + spin_lock(&bp->db_lock); + lo_hi_writeq_relaxed(val, addr); + spin_unlock(&bp->db_lock); +#else + writeq_relaxed(val, addr); +#endif +} /* For TX and RX ring doorbells with no ordering guarantee*/ static inline void bnxt_db_write_relaxed(struct bnxt *bp, struct bnxt_db_info *db, u32 idx) { if (bp->flags & BNXT_FLAG_CHIP_P5) { - writeq_relaxed(db->db_key64 | idx, db->doorbell); + bnxt_writeq_relaxed(bp, db->db_key64 | idx, db->doorbell); } else { u32 db_val = db->db_key32 | idx; @@ -2142,7 +2155,7 @@ static inline void bnxt_db_write(struct bnxt *bp, struct bnxt_db_info *db, u32 idx) { if (bp->flags & BNXT_FLAG_CHIP_P5) { - writeq(db->db_key64 | idx, db->doorbell); + bnxt_writeq(bp, db->db_key64 | idx, db->doorbell); } else { u32 db_val = db->db_key32 | idx; diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c index 1423cc617d93..9576547df4ab 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.c @@ -352,13 +352,16 @@ static void bnxt_copy_from_nvm_data(union devlink_param_value *dst, dst->vu8 = (u8)val32; } -static int bnxt_hwrm_get_nvm_cfg_ver(struct bnxt *bp, - union devlink_param_value *nvm_cfg_ver) +static int bnxt_hwrm_get_nvm_cfg_ver(struct bnxt *bp, u32 *nvm_cfg_ver) { struct hwrm_nvm_get_variable_input *req; + u16 bytes = BNXT_NVM_CFG_VER_BYTES; + u16 bits = BNXT_NVM_CFG_VER_BITS; + union devlink_param_value ver; union bnxt_nvm_data *data; dma_addr_t data_dma_addr; - int rc; + int rc, i = 2; + u16 dim = 1; rc = hwrm_req_init(bp, req, HWRM_NVM_GET_VARIABLE); if (rc) @@ -370,16 +373,34 @@ static int bnxt_hwrm_get_nvm_cfg_ver(struct bnxt *bp, goto exit; } + /* earlier devices present as an array of raw bytes */ + if (!BNXT_CHIP_P5(bp)) { + dim = 0; + i = 0; + bits *= 3; /* array of 3 version components */ + bytes *= 4; /* copy whole word */ + } + hwrm_req_hold(bp, req); req->dest_data_addr = cpu_to_le64(data_dma_addr); - req->data_len = cpu_to_le16(BNXT_NVM_CFG_VER_BITS); + req->data_len = cpu_to_le16(bits); req->option_num = cpu_to_le16(NVM_OFF_NVM_CFG_VER); + req->dimensions = cpu_to_le16(dim); - rc = hwrm_req_send_silent(bp, req); - if (!rc) - bnxt_copy_from_nvm_data(nvm_cfg_ver, data, - BNXT_NVM_CFG_VER_BITS, - BNXT_NVM_CFG_VER_BYTES); + while (i >= 0) { + req->index_0 = cpu_to_le16(i--); + rc = hwrm_req_send_silent(bp, req); + if (rc) + goto exit; + bnxt_copy_from_nvm_data(&ver, data, bits, bytes); + + if (BNXT_CHIP_P5(bp)) { + *nvm_cfg_ver <<= 8; + *nvm_cfg_ver |= ver.vu8; + } else { + *nvm_cfg_ver = ver.vu32; + } + } exit: hwrm_req_drop(bp, req); @@ -416,12 +437,12 @@ static int bnxt_dl_info_get(struct devlink *dl, struct devlink_info_req *req, { struct hwrm_nvm_get_dev_info_output nvm_dev_info; struct bnxt *bp = bnxt_get_bp_from_dl(dl); - union devlink_param_value nvm_cfg_ver; struct hwrm_ver_get_output *ver_resp; char mgmt_ver[FW_VER_STR_LEN]; char roce_ver[FW_VER_STR_LEN]; char ncsi_ver[FW_VER_STR_LEN]; char buf[32]; + u32 ver = 0; int rc; rc = devlink_info_driver_name_put(req, DRV_MODULE_NAME); @@ -456,7 +477,7 @@ static int bnxt_dl_info_get(struct devlink *dl, struct devlink_info_req *req, return rc; ver_resp = &bp->ver_resp; - sprintf(buf, "%X", ver_resp->chip_rev); + sprintf(buf, "%c%d", 'A' + ver_resp->chip_rev, ver_resp->chip_metal); rc = bnxt_dl_info_put(bp, req, BNXT_VERSION_FIXED, DEVLINK_INFO_VERSION_GENERIC_ASIC_REV, buf); if (rc) @@ -475,11 +496,9 @@ static int bnxt_dl_info_get(struct devlink *dl, struct devlink_info_req *req, if (rc) return rc; - if (BNXT_PF(bp) && !bnxt_hwrm_get_nvm_cfg_ver(bp, &nvm_cfg_ver)) { - u32 ver = nvm_cfg_ver.vu32; - - sprintf(buf, "%d.%d.%d", (ver >> 16) & 0xf, (ver >> 8) & 0xf, - ver & 0xf); + if (BNXT_PF(bp) && !bnxt_hwrm_get_nvm_cfg_ver(bp, &ver)) { + sprintf(buf, "%d.%d.%d", (ver >> 16) & 0xff, (ver >> 8) & 0xff, + ver & 0xff); rc = bnxt_dl_info_put(bp, req, BNXT_VERSION_STORED, DEVLINK_INFO_VERSION_GENERIC_FW_PSID, buf); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h index d22cab5d6856..d889f240da2b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_devlink.h @@ -40,8 +40,8 @@ static inline void bnxt_link_bp_to_dl(struct bnxt *bp, struct devlink *dl) #define NVM_OFF_ENABLE_SRIOV 401 #define NVM_OFF_NVM_CFG_VER 602 -#define BNXT_NVM_CFG_VER_BITS 24 -#define BNXT_NVM_CFG_VER_BYTES 4 +#define BNXT_NVM_CFG_VER_BITS 8 +#define BNXT_NVM_CFG_VER_BYTES 1 #define BNXT_MSIX_VEC_MAX 512 #define BNXT_MSIX_VEC_MIN_MAX 128 diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c index acef61abe35d..bb7327b82d0b 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_hwrm.c @@ -145,11 +145,11 @@ void hwrm_req_timeout(struct bnxt *bp, void *req, unsigned int timeout) * @bp: The driver context. * @req: The request for which calls to hwrm_req_dma_slice() will have altered * allocation flags. - * @flags: A bitmask of GFP flags. These flags are passed to - * dma_alloc_coherent() whenever it is used to allocate backing memory - * for slices. Note that calls to hwrm_req_dma_slice() will not always - * result in new allocations, however, memory suballocated from the - * request buffer is already __GFP_ZERO. + * @gfp: A bitmask of GFP flags. These flags are passed to dma_alloc_coherent() + * whenever it is used to allocate backing memory for slices. Note that + * calls to hwrm_req_dma_slice() will not always result in new allocations, + * however, memory suballocated from the request buffer is already + * __GFP_ZERO. * * Sets the GFP allocation flags associated with the request for subsequent * calls to hwrm_req_dma_slice(). This can be useful for specifying __GFP_ZERO @@ -698,8 +698,8 @@ int hwrm_req_send_silent(struct bnxt *bp, void *req) * @bp: The driver context. * @req: The request for which indirect data will be associated. * @size: The size of the allocation. - * @dma: The bus address associated with the allocation. The HWRM API has no - * knowledge about the type of the request and so cannot infer how the + * @dma_handle: The bus address associated with the allocation. The HWRM API has + * no knowledge about the type of the request and so cannot infer how the * caller intends to use the indirect data. Thus, the caller is * responsible for configuring the request object appropriately to * point to the associated indirect memory. Note, DMA handle has the diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 73c016166f06..d246eee4b6d5 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -1111,6 +1111,7 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (!adapter->registered_device_map) { pr_err("%s: could not register any net devices\n", pci_name(pdev)); + err = -EINVAL; goto out_release_adapter_res; } diff --git a/drivers/net/ethernet/chelsio/cxgb3/sge.c b/drivers/net/ethernet/chelsio/cxgb3/sge.c index e21a2e691382..c3afec1041f8 100644 --- a/drivers/net/ethernet/chelsio/cxgb3/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb3/sge.c @@ -3301,6 +3301,9 @@ void t3_sge_stop(struct adapter *adap) t3_sge_stop_dma(adap); + /* workqueues aren't initialized otherwise */ + if (!(adap->flags & FULL_INIT_DONE)) + return; for (i = 0; i < SGE_QSETS; ++i) { struct sge_qset *qs = &adap->sge.qs[i]; diff --git a/drivers/net/ethernet/cirrus/Kconfig b/drivers/net/ethernet/cirrus/Kconfig index dac1764ba740..5bdf731d9503 100644 --- a/drivers/net/ethernet/cirrus/Kconfig +++ b/drivers/net/ethernet/cirrus/Kconfig @@ -38,7 +38,7 @@ config CS89x0_ISA config CS89x0_PLATFORM tristate "CS89x0 platform driver support" - depends on ARM || COMPILE_TEST + depends on ARM || (COMPILE_TEST && !PPC) select CS89x0 help Say Y to compile the cs89x0 platform driver. This makes this driver diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 474c6d1664e7..ac9b69513332 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -362,7 +362,7 @@ static void hclge_set_default_capability(struct hclge_dev *hdev) } } -const struct hclge_caps_bit_map hclge_cmd_caps_bit_map0[] = { +static const struct hclge_caps_bit_map hclge_cmd_caps_bit_map0[] = { {HCLGE_CAP_UDP_GSO_B, HNAE3_DEV_SUPPORT_UDP_GSO_B}, {HCLGE_CAP_PTP_B, HNAE3_DEV_SUPPORT_PTP_B}, {HCLGE_CAP_INT_QL_B, HNAE3_DEV_SUPPORT_INT_QL_B}, diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c index 59772b0e9531..f89bfb352adf 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c @@ -342,7 +342,7 @@ static void hclgevf_set_default_capability(struct hclgevf_dev *hdev) set_bit(HNAE3_DEV_SUPPORT_FEC_B, ae_dev->caps); } -const struct hclgevf_caps_bit_map hclgevf_cmd_caps_bit_map0[] = { +static const struct hclgevf_caps_bit_map hclgevf_cmd_caps_bit_map0[] = { {HCLGEVF_CAP_UDP_GSO_B, HNAE3_DEV_SUPPORT_UDP_GSO_B}, {HCLGEVF_CAP_INT_QL_B, HNAE3_DEV_SUPPORT_INT_QL_B}, {HCLGEVF_CAP_TQP_TXRX_INDEP_B, HNAE3_DEV_SUPPORT_TQP_TXRX_INDEP_B}, diff --git a/drivers/net/ethernet/i825xx/sun3_82586.c b/drivers/net/ethernet/i825xx/sun3_82586.c index 893e0ddcb611..0696f723228a 100644 --- a/drivers/net/ethernet/i825xx/sun3_82586.c +++ b/drivers/net/ethernet/i825xx/sun3_82586.c @@ -314,7 +314,7 @@ static int __init sun3_82586_probe(void) err = register_netdev(dev); if (err) goto out2; - return dev; + return 0; out2: release_region(ioaddr, SUN3_82586_TOTAL_SIZE); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c index 7f3d01059e19..34a089b71e55 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/cgx.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/cgx.c @@ -1487,7 +1487,7 @@ static int cgx_lmac_init(struct cgx *cgx) MAX_DMAC_ENTRIES_PER_CGX / cgx->lmac_count; err = rvu_alloc_bitmap(&lmac->mac_to_index_bmap); if (err) - return err; + goto err_name_free; /* Reserve first entry for default MAC address */ set_bit(0, lmac->mac_to_index_bmap.bmap); @@ -1497,7 +1497,7 @@ static int cgx_lmac_init(struct cgx *cgx) spin_lock_init(&lmac->event_cb_lock); err = cgx_configure_interrupt(cgx, lmac, lmac->lmac_id, false); if (err) - goto err_irq; + goto err_bitmap_free; /* Add reference */ cgx->lmac_idmap[lmac->lmac_id] = lmac; @@ -1507,7 +1507,9 @@ static int cgx_lmac_init(struct cgx *cgx) return cgx_lmac_verify_fwi_version(cgx); -err_irq: +err_bitmap_free: + rvu_free_bitmap(&lmac->mac_to_index_bmap); +err_name_free: kfree(lmac->name); err_lmac_free: kfree(lmac); diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c index ce647e037f4d..35836903b7fb 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.c +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.c @@ -92,7 +92,8 @@ static void rvu_setup_hw_capabilities(struct rvu *rvu) */ int rvu_poll_reg(struct rvu *rvu, u64 block, u64 offset, u64 mask, bool zero) { - unsigned long timeout = jiffies + usecs_to_jiffies(10000); + unsigned long timeout = jiffies + usecs_to_jiffies(20000); + bool twice = false; void __iomem *reg; u64 reg_val; @@ -107,6 +108,15 @@ again: usleep_range(1, 5); goto again; } + /* In scenarios where CPU is scheduled out before checking + * 'time_before' (above) and gets scheduled in such that + * jiffies are beyond timeout value, then check again if HW is + * done with the operation in the meantime. + */ + if (!twice) { + twice = true; + goto again; + } return -EBUSY; } @@ -201,6 +211,11 @@ int rvu_alloc_bitmap(struct rsrc_bmap *rsrc) return 0; } +void rvu_free_bitmap(struct rsrc_bmap *rsrc) +{ + kfree(rsrc->bmap); +} + /* Get block LF's HW index from a PF_FUNC's block slot number */ int rvu_get_lf(struct rvu *rvu, struct rvu_block *block, u16 pcifunc, u16 slot) { diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h index d38e5c980c30..1d9411232f1d 100644 --- a/drivers/net/ethernet/marvell/octeontx2/af/rvu.h +++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu.h @@ -638,6 +638,7 @@ static inline bool is_rvu_fwdata_valid(struct rvu *rvu) } int rvu_alloc_bitmap(struct rsrc_bmap *rsrc); +void rvu_free_bitmap(struct rsrc_bmap *rsrc); int rvu_alloc_rsrc(struct rsrc_bmap *rsrc); void rvu_free_rsrc(struct rsrc_bmap *rsrc, int id); bool is_rsrc_free(struct rsrc_bmap *rsrc, int id); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c index 3cc76f14d2fd..95f21dfdba48 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/cn10k.c @@ -27,7 +27,8 @@ int cn10k_lmtst_init(struct otx2_nic *pfvf) { struct lmtst_tbl_setup_req *req; - int qcount, err; + struct otx2_lmt_info *lmt_info; + int err, cpu; if (!test_bit(CN10K_LMTST, &pfvf->hw.cap_flag)) { pfvf->hw_ops = &otx2_hw_ops; @@ -35,15 +36,9 @@ int cn10k_lmtst_init(struct otx2_nic *pfvf) } pfvf->hw_ops = &cn10k_hw_ops; - qcount = pfvf->hw.max_queues; - /* LMTST lines allocation - * qcount = num_online_cpus(); - * NPA = TX + RX + XDP. - * NIX = TX * 32 (For Burst SQE flush). - */ - pfvf->tot_lmt_lines = (qcount * 3) + (qcount * 32); - pfvf->npa_lmt_lines = qcount * 3; - pfvf->nix_lmt_size = LMT_BURST_SIZE * LMT_LINE_SIZE; + /* Total LMTLINES = num_online_cpus() * 32 (For Burst flush).*/ + pfvf->tot_lmt_lines = (num_online_cpus() * LMT_BURST_SIZE); + pfvf->hw.lmt_info = alloc_percpu(struct otx2_lmt_info); mutex_lock(&pfvf->mbox.lock); req = otx2_mbox_alloc_msg_lmtst_tbl_setup(&pfvf->mbox); @@ -66,6 +61,13 @@ int cn10k_lmtst_init(struct otx2_nic *pfvf) err = otx2_sync_mbox_msg(&pfvf->mbox); mutex_unlock(&pfvf->mbox.lock); + for_each_possible_cpu(cpu) { + lmt_info = per_cpu_ptr(pfvf->hw.lmt_info, cpu); + lmt_info->lmt_addr = ((u64)pfvf->hw.lmt_base + + (cpu * LMT_BURST_SIZE * LMT_LINE_SIZE)); + lmt_info->lmt_id = cpu * LMT_BURST_SIZE; + } + return 0; } EXPORT_SYMBOL(cn10k_lmtst_init); @@ -74,13 +76,6 @@ int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura) { struct nix_cn10k_aq_enq_req *aq; struct otx2_nic *pfvf = dev; - struct otx2_snd_queue *sq; - - sq = &pfvf->qset.sq[qidx]; - sq->lmt_addr = (u64 *)((u64)pfvf->hw.nix_lmt_base + - (qidx * pfvf->nix_lmt_size)); - - sq->lmt_id = pfvf->npa_lmt_lines + (qidx * LMT_BURST_SIZE); /* Get memory to put this msg */ aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox); @@ -125,8 +120,7 @@ void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) if (otx2_alloc_buffer(pfvf, cq, &bufptr)) { if (num_ptrs--) __cn10k_aura_freeptr(pfvf, cq->cq_idx, ptrs, - num_ptrs, - cq->rbpool->lmt_addr); + num_ptrs); break; } cq->pool_ptrs--; @@ -134,8 +128,7 @@ void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) num_ptrs++; if (num_ptrs == NPA_MAX_BURST || cq->pool_ptrs == 0) { __cn10k_aura_freeptr(pfvf, cq->cq_idx, ptrs, - num_ptrs, - cq->rbpool->lmt_addr); + num_ptrs); num_ptrs = 1; } } @@ -143,20 +136,23 @@ void cn10k_refill_pool_ptrs(void *dev, struct otx2_cq_queue *cq) void cn10k_sqe_flush(void *dev, struct otx2_snd_queue *sq, int size, int qidx) { + struct otx2_lmt_info *lmt_info; + struct otx2_nic *pfvf = dev; u64 val = 0, tar_addr = 0; + lmt_info = per_cpu_ptr(pfvf->hw.lmt_info, smp_processor_id()); /* FIXME: val[0:10] LMT_ID. * [12:15] no of LMTST - 1 in the burst. * [19:63] data size of each LMTST in the burst except first. */ - val = (sq->lmt_id & 0x7FF); + val = (lmt_info->lmt_id & 0x7FF); /* Target address for LMTST flush tells HW how many 128bit * words are present. * tar_addr[6:4] size of first LMTST - 1 in units of 128b. */ tar_addr |= sq->io_addr | (((size / 16) - 1) & 0x7) << 4; dma_wmb(); - memcpy(sq->lmt_addr, sq->sqe_base, size); + memcpy((u64 *)lmt_info->lmt_addr, sq->sqe_base, size); cn10k_lmt_flush(val, tar_addr); sq->head++; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c index ce25c2744435..78df173e6df2 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c @@ -1230,11 +1230,6 @@ static int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, pool->rbsize = buf_size; - /* Set LMTST addr for NPA batch free */ - if (test_bit(CN10K_LMTST, &pfvf->hw.cap_flag)) - pool->lmt_addr = (__force u64 *)((u64)pfvf->hw.npa_lmt_base + - (pool_id * LMT_LINE_SIZE)); - /* Initialize this pool's context via AF */ aq = otx2_mbox_alloc_msg_npa_aq_enq(&pfvf->mbox); if (!aq) { diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h index 48227cec06ee..a51ecd771d07 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h @@ -53,6 +53,10 @@ enum arua_mapped_qtypes { /* Send skid of 2000 packets required for CQ size of 4K CQEs. */ #define SEND_CQ_SKID 2000 +struct otx2_lmt_info { + u64 lmt_addr; + u16 lmt_id; +}; /* RSS configuration */ struct otx2_rss_ctx { u8 ind_tbl[MAX_RSS_INDIR_TBL_SIZE]; @@ -224,8 +228,7 @@ struct otx2_hw { #define LMT_LINE_SIZE 128 #define LMT_BURST_SIZE 32 /* 32 LMTST lines for burst SQE flush */ u64 *lmt_base; - u64 *npa_lmt_base; - u64 *nix_lmt_base; + struct otx2_lmt_info __percpu *lmt_info; }; enum vfperm { @@ -407,17 +410,18 @@ static inline bool is_96xx_B0(struct pci_dev *pdev) */ #define PCI_REVISION_ID_96XX 0x00 #define PCI_REVISION_ID_95XX 0x10 -#define PCI_REVISION_ID_LOKI 0x20 +#define PCI_REVISION_ID_95XXN 0x20 #define PCI_REVISION_ID_98XX 0x30 #define PCI_REVISION_ID_95XXMM 0x40 +#define PCI_REVISION_ID_95XXO 0xE0 static inline bool is_dev_otx2(struct pci_dev *pdev) { u8 midr = pdev->revision & 0xF0; return (midr == PCI_REVISION_ID_96XX || midr == PCI_REVISION_ID_95XX || - midr == PCI_REVISION_ID_LOKI || midr == PCI_REVISION_ID_98XX || - midr == PCI_REVISION_ID_95XXMM); + midr == PCI_REVISION_ID_95XXN || midr == PCI_REVISION_ID_98XX || + midr == PCI_REVISION_ID_95XXMM || midr == PCI_REVISION_ID_95XXO); } static inline void otx2_setup_dev_hw_settings(struct otx2_nic *pfvf) @@ -562,15 +566,16 @@ static inline u64 otx2_atomic64_add(u64 incr, u64 *ptr) #endif static inline void __cn10k_aura_freeptr(struct otx2_nic *pfvf, u64 aura, - u64 *ptrs, u64 num_ptrs, - u64 *lmt_addr) + u64 *ptrs, u64 num_ptrs) { + struct otx2_lmt_info *lmt_info; u64 size = 0, count_eot = 0; u64 tar_addr, val = 0; + lmt_info = per_cpu_ptr(pfvf->hw.lmt_info, smp_processor_id()); tar_addr = (__force u64)otx2_get_regaddr(pfvf, NPA_LF_AURA_BATCH_FREE0); /* LMTID is same as AURA Id */ - val = (aura & 0x7FF) | BIT_ULL(63); + val = (lmt_info->lmt_id & 0x7FF) | BIT_ULL(63); /* Set if [127:64] of last 128bit word has a valid pointer */ count_eot = (num_ptrs % 2) ? 0ULL : 1ULL; /* Set AURA ID to free pointer */ @@ -586,7 +591,7 @@ static inline void __cn10k_aura_freeptr(struct otx2_nic *pfvf, u64 aura, size++; tar_addr |= ((size - 1) & 0x7) << 4; } - memcpy(lmt_addr, ptrs, sizeof(u64) * num_ptrs); + memcpy((u64 *)lmt_info->lmt_addr, ptrs, sizeof(u64) * num_ptrs); /* Perform LMTST flush */ cn10k_lmt_flush(val, tar_addr); } @@ -594,12 +599,11 @@ static inline void __cn10k_aura_freeptr(struct otx2_nic *pfvf, u64 aura, static inline void cn10k_aura_freeptr(void *dev, int aura, u64 buf) { struct otx2_nic *pfvf = dev; - struct otx2_pool *pool; u64 ptrs[2]; - pool = &pfvf->qset.pool[aura]; ptrs[1] = buf; - __cn10k_aura_freeptr(pfvf, aura, ptrs, 2, pool->lmt_addr); + /* Free only one buffer at time during init and teardown */ + __cn10k_aura_freeptr(pfvf, aura, ptrs, 2); } /* Alloc pointer from pool/aura */ diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c index 799486c72177..dbfa3bc39e34 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_ethtool.c @@ -16,8 +16,8 @@ #include "otx2_common.h" #include "otx2_ptp.h" -#define DRV_NAME "octeontx2-nicpf" -#define DRV_VF_NAME "octeontx2-nicvf" +#define DRV_NAME "rvu-nicpf" +#define DRV_VF_NAME "rvu-nicvf" struct otx2_stat { char name[ETH_GSTRING_LEN]; diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c index 2f2e8a3d7924..53df7fff92c4 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c @@ -1533,14 +1533,6 @@ int otx2_open(struct net_device *netdev) if (!qset->rq) goto err_free_mem; - if (test_bit(CN10K_LMTST, &pf->hw.cap_flag)) { - /* Reserve LMT lines for NPA AURA batch free */ - pf->hw.npa_lmt_base = pf->hw.lmt_base; - /* Reserve LMT lines for NIX TX */ - pf->hw.nix_lmt_base = (u64 *)((u64)pf->hw.npa_lmt_base + - (pf->npa_lmt_lines * LMT_LINE_SIZE)); - } - err = otx2_init_hw_resources(pf); if (err) goto err_free_mem; @@ -2668,6 +2660,8 @@ err_del_mcam_entries: err_ptp_destroy: otx2_ptp_destroy(pf); err_detach_rsrc: + if (pf->hw.lmt_info) + free_percpu(pf->hw.lmt_info); if (test_bit(CN10K_LMTST, &pf->hw.cap_flag)) qmem_free(pf->dev, pf->dync_lmt); otx2_detach_resources(&pf->mbox); @@ -2811,6 +2805,8 @@ static void otx2_remove(struct pci_dev *pdev) otx2_mcam_flow_del(pf); otx2_shutdown_tc(pf); otx2_detach_resources(&pf->mbox); + if (pf->hw.lmt_info) + free_percpu(pf->hw.lmt_info); if (test_bit(CN10K_LMTST, &pf->hw.cap_flag)) qmem_free(pf->dev, pf->dync_lmt); otx2_disable_mbox_intr(pf); diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h index 869de5f59e73..3ff1ad79c001 100644 --- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h @@ -80,7 +80,6 @@ struct otx2_snd_queue { u16 num_sqbs; u16 sqe_thresh; u8 sqe_per_sqb; - u32 lmt_id; u64 io_addr; u64 *aura_fc_addr; u64 *lmt_addr; @@ -111,7 +110,6 @@ struct otx2_cq_poll { struct otx2_pool { struct qmem *stack; struct qmem *fc_addr; - u64 *lmt_addr; u16 rbsize; }; diff --git a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c index e91b4874a57f..3de1a03839e2 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_ethtool.c @@ -582,7 +582,10 @@ static int ionic_set_ringparam(struct net_device *netdev, qparam.ntxq_descs = ring->tx_pending; qparam.nrxq_descs = ring->rx_pending; + + mutex_lock(&lif->queue_lock); err = ionic_reconfigure_queues(lif, &qparam); + mutex_unlock(&lif->queue_lock); if (err) netdev_info(netdev, "Ring reconfiguration failed, changes canceled: %d\n", err); @@ -679,7 +682,9 @@ static int ionic_set_channels(struct net_device *netdev, return 0; } + mutex_lock(&lif->queue_lock); err = ionic_reconfigure_queues(lif, &qparam); + mutex_unlock(&lif->queue_lock); if (err) netdev_info(netdev, "Queue reconfiguration failed, changes canceled: %d\n", err); diff --git a/drivers/net/ethernet/pensando/ionic/ionic_lif.c b/drivers/net/ethernet/pensando/ionic/ionic_lif.c index 23c9e196a784..381966e8f557 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_lif.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_lif.c @@ -1715,7 +1715,6 @@ static int ionic_set_mac_address(struct net_device *netdev, void *sa) static void ionic_stop_queues_reconfig(struct ionic_lif *lif) { /* Stop and clean the queues before reconfiguration */ - mutex_lock(&lif->queue_lock); netif_device_detach(lif->netdev); ionic_stop_queues(lif); ionic_txrx_deinit(lif); @@ -1734,8 +1733,7 @@ static int ionic_start_queues_reconfig(struct ionic_lif *lif) * DOWN and UP to try to reset and clear the issue. */ err = ionic_txrx_init(lif); - mutex_unlock(&lif->queue_lock); - ionic_link_status_check_request(lif, CAN_SLEEP); + ionic_link_status_check_request(lif, CAN_NOT_SLEEP); netif_device_attach(lif->netdev); return err; @@ -1765,9 +1763,13 @@ static int ionic_change_mtu(struct net_device *netdev, int new_mtu) return 0; } + mutex_lock(&lif->queue_lock); ionic_stop_queues_reconfig(lif); netdev->mtu = new_mtu; - return ionic_start_queues_reconfig(lif); + err = ionic_start_queues_reconfig(lif); + mutex_unlock(&lif->queue_lock); + + return err; } static void ionic_tx_timeout_work(struct work_struct *ws) @@ -1783,8 +1785,10 @@ static void ionic_tx_timeout_work(struct work_struct *ws) if (!netif_running(lif->netdev)) return; + mutex_lock(&lif->queue_lock); ionic_stop_queues_reconfig(lif); ionic_start_queues_reconfig(lif); + mutex_unlock(&lif->queue_lock); } static void ionic_tx_timeout(struct net_device *netdev, unsigned int txqueue) diff --git a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c index 7e3a5634c161..25ecfcfa1281 100644 --- a/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c +++ b/drivers/net/ethernet/pensando/ionic/ionic_rx_filter.c @@ -318,7 +318,7 @@ void ionic_rx_filter_sync(struct ionic_lif *lif) if (f->state == IONIC_FILTER_STATE_NEW || f->state == IONIC_FILTER_STATE_OLD) { sync_item = devm_kzalloc(dev, sizeof(*sync_item), - GFP_KERNEL); + GFP_ATOMIC); if (!sync_item) goto loop_out; diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c index 3d61a767a8a3..09f20c794754 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_init.c @@ -437,7 +437,6 @@ int qlcnic_pinit_from_rom(struct qlcnic_adapter *adapter) QLCWR32(adapter, QLCNIC_CRB_PEG_NET_4 + 0x3c, 1); msleep(20); - qlcnic_rom_unlock(adapter); /* big hammer don't reset CAM block on reset */ QLCWR32(adapter, QLCNIC_ROMUSB_GLB_SW_RESET, 0xfeffffff); diff --git a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c index 79e50079ed03..f72e13b83869 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-ethtool.c @@ -100,7 +100,7 @@ static void emac_get_strings(struct net_device *netdev, u32 stringset, u8 *data) case ETH_SS_STATS: for (i = 0; i < EMAC_STATS_LEN; i++) { - strlcpy(data, emac_ethtool_stat_strings[i], + strscpy(data, emac_ethtool_stat_strings[i], ETH_GSTRING_LEN); data += ETH_GSTRING_LEN; } diff --git a/drivers/net/ethernet/renesas/sh_eth.c b/drivers/net/ethernet/renesas/sh_eth.c index 6c8ba916d1a6..1374faa229a2 100644 --- a/drivers/net/ethernet/renesas/sh_eth.c +++ b/drivers/net/ethernet/renesas/sh_eth.c @@ -2533,6 +2533,7 @@ static netdev_tx_t sh_eth_start_xmit(struct sk_buff *skb, else txdesc->status |= cpu_to_le32(TD_TACT); + wmb(); /* cur_tx must be incremented after TACT bit was set */ mdp->cur_tx++; if (!(sh_eth_read(ndev, EDTRR) & mdp->cd->edtrr_trns)) diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c index 22cdbf12c823..b008b4e8a2a5 100644 --- a/drivers/net/ethernet/smsc/smc911x.c +++ b/drivers/net/ethernet/smsc/smc911x.c @@ -1550,7 +1550,7 @@ static int smc911x_ethtool_getregslen(struct net_device *dev) } static void smc911x_ethtool_getregs(struct net_device *dev, - struct ethtool_regs* regs, void *buf) + struct ethtool_regs *regs, void *buf) { struct smc911x_local *lp = netdev_priv(dev); unsigned long flags; @@ -1600,7 +1600,7 @@ static int smc911x_ethtool_wait_eeprom_ready(struct net_device *dev) } static inline int smc911x_ethtool_write_eeprom_cmd(struct net_device *dev, - int cmd, int addr) + int cmd, int addr) { struct smc911x_local *lp = netdev_priv(dev); int ret; @@ -1614,7 +1614,7 @@ static inline int smc911x_ethtool_write_eeprom_cmd(struct net_device *dev, } static inline int smc911x_ethtool_read_eeprom_byte(struct net_device *dev, - u8 *data) + u8 *data) { struct smc911x_local *lp = netdev_priv(dev); int ret; @@ -1626,7 +1626,7 @@ static inline int smc911x_ethtool_read_eeprom_byte(struct net_device *dev, } static inline int smc911x_ethtool_write_eeprom_byte(struct net_device *dev, - u8 data) + u8 data) { struct smc911x_local *lp = netdev_priv(dev); int ret; @@ -1638,7 +1638,7 @@ static inline int smc911x_ethtool_write_eeprom_byte(struct net_device *dev, } static int smc911x_ethtool_geteeprom(struct net_device *dev, - struct ethtool_eeprom *eeprom, u8 *data) + struct ethtool_eeprom *eeprom, u8 *data) { u8 eebuf[SMC911X_EEPROM_LEN]; int i, ret; @@ -1654,7 +1654,7 @@ static int smc911x_ethtool_geteeprom(struct net_device *dev, } static int smc911x_ethtool_seteeprom(struct net_device *dev, - struct ethtool_eeprom *eeprom, u8 *data) + struct ethtool_eeprom *eeprom, u8 *data) { int i, ret; diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c index 4c9a37dd0d3f..ecf759ee1c9f 100644 --- a/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c +++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-loongson.c @@ -109,8 +109,10 @@ static int loongson_dwmac_probe(struct pci_dev *pdev, const struct pci_device_id plat->bus_id = pci_dev_id(pdev); phy_mode = device_get_phy_mode(&pdev->dev); - if (phy_mode < 0) + if (phy_mode < 0) { dev_err(&pdev->dev, "phy_mode not found\n"); + return phy_mode; + } plat->phy_interface = phy_mode; plat->interface = PHY_INTERFACE_MODE_GMII; diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index ed0cd3920171..ece02b35a6ce 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -5347,7 +5347,7 @@ static int stmmac_napi_poll_rxtx(struct napi_struct *napi, int budget) struct stmmac_channel *ch = container_of(napi, struct stmmac_channel, rxtx_napi); struct stmmac_priv *priv = ch->priv_data; - int rx_done, tx_done; + int rx_done, tx_done, rxtx_done; u32 chan = ch->index; priv->xstats.napi_poll++; @@ -5357,14 +5357,16 @@ static int stmmac_napi_poll_rxtx(struct napi_struct *napi, int budget) rx_done = stmmac_rx_zc(priv, budget, chan); + rxtx_done = max(tx_done, rx_done); + /* If either TX or RX work is not complete, return budget * and keep pooling */ - if (tx_done >= budget || rx_done >= budget) + if (rxtx_done >= budget) return budget; /* all work done, exit the polling mode */ - if (napi_complete_done(napi, rx_done)) { + if (napi_complete_done(napi, rxtx_done)) { unsigned long flags; spin_lock_irqsave(&ch->lock, flags); @@ -5375,7 +5377,7 @@ static int stmmac_napi_poll_rxtx(struct napi_struct *napi, int budget) spin_unlock_irqrestore(&ch->lock, flags); } - return min(rx_done, budget - 1); + return min(rxtx_done, budget - 1); } /** @@ -7121,8 +7123,6 @@ int stmmac_suspend(struct device *dev) if (!ndev || !netif_running(ndev)) return 0; - phylink_mac_change(priv->phylink, false); - mutex_lock(&priv->lock); netif_device_detach(ndev); @@ -7148,14 +7148,6 @@ int stmmac_suspend(struct device *dev) stmmac_pmt(priv, priv->hw, priv->wolopts); priv->irq_wake = 1; } else { - mutex_unlock(&priv->lock); - rtnl_lock(); - if (device_may_wakeup(priv->device)) - phylink_speed_down(priv->phylink, false); - phylink_stop(priv->phylink); - rtnl_unlock(); - mutex_lock(&priv->lock); - stmmac_mac_set(priv, priv->ioaddr, false); pinctrl_pm_select_sleep_state(priv->device); /* Disable clock in case of PWM is off */ @@ -7169,6 +7161,16 @@ int stmmac_suspend(struct device *dev) mutex_unlock(&priv->lock); + rtnl_lock(); + if (device_may_wakeup(priv->device) && priv->plat->pmt) { + phylink_suspend(priv->phylink, true); + } else { + if (device_may_wakeup(priv->device)) + phylink_speed_down(priv->phylink, false); + phylink_suspend(priv->phylink, false); + } + rtnl_unlock(); + if (priv->dma_cap.fpesel) { /* Disable FPE */ stmmac_fpe_configure(priv, priv->ioaddr, @@ -7259,13 +7261,15 @@ int stmmac_resume(struct device *dev) return ret; } - if (!device_may_wakeup(priv->device) || !priv->plat->pmt) { - rtnl_lock(); - phylink_start(priv->phylink); - /* We may have called phylink_speed_down before */ - phylink_speed_up(priv->phylink); - rtnl_unlock(); + rtnl_lock(); + if (device_may_wakeup(priv->device) && priv->plat->pmt) { + phylink_resume(priv->phylink); + } else { + phylink_resume(priv->phylink); + if (device_may_wakeup(priv->device)) + phylink_speed_up(priv->phylink); } + rtnl_unlock(); rtnl_lock(); mutex_lock(&priv->lock); @@ -7286,8 +7290,6 @@ int stmmac_resume(struct device *dev) mutex_unlock(&priv->lock); rtnl_unlock(); - phylink_mac_change(priv->phylink, true); - netif_device_attach(ndev); return 0; diff --git a/drivers/net/ethernet/xscale/ptp_ixp46x.c b/drivers/net/ethernet/xscale/ptp_ixp46x.c index ecece21315c3..39234852e01b 100644 --- a/drivers/net/ethernet/xscale/ptp_ixp46x.c +++ b/drivers/net/ethernet/xscale/ptp_ixp46x.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include "ixp46x_ts.h" diff --git a/drivers/net/phy/phylink.c b/drivers/net/phy/phylink.c index 2cdf9f989dec..a1464b764d4d 100644 --- a/drivers/net/phy/phylink.c +++ b/drivers/net/phy/phylink.c @@ -33,6 +33,7 @@ enum { PHYLINK_DISABLE_STOPPED, PHYLINK_DISABLE_LINK, + PHYLINK_DISABLE_MAC_WOL, }; /** @@ -1282,6 +1283,9 @@ EXPORT_SYMBOL_GPL(phylink_start); * network device driver's &struct net_device_ops ndo_stop() method. The * network device's carrier state should not be changed prior to calling this * function. + * + * This will synchronously bring down the link if the link is not already + * down (in other words, it will trigger a mac_link_down() method call.) */ void phylink_stop(struct phylink *pl) { @@ -1301,6 +1305,84 @@ void phylink_stop(struct phylink *pl) } EXPORT_SYMBOL_GPL(phylink_stop); +/** + * phylink_suspend() - handle a network device suspend event + * @pl: a pointer to a &struct phylink returned from phylink_create() + * @mac_wol: true if the MAC needs to receive packets for Wake-on-Lan + * + * Handle a network device suspend event. There are several cases: + * - If Wake-on-Lan is not active, we can bring down the link between + * the MAC and PHY by calling phylink_stop(). + * - If Wake-on-Lan is active, and being handled only by the PHY, we + * can also bring down the link between the MAC and PHY. + * - If Wake-on-Lan is active, but being handled by the MAC, the MAC + * still needs to receive packets, so we can not bring the link down. + */ +void phylink_suspend(struct phylink *pl, bool mac_wol) +{ + ASSERT_RTNL(); + + if (mac_wol && (!pl->netdev || pl->netdev->wol_enabled)) { + /* Wake-on-Lan enabled, MAC handling */ + mutex_lock(&pl->state_mutex); + + /* Stop the resolver bringing the link up */ + __set_bit(PHYLINK_DISABLE_MAC_WOL, &pl->phylink_disable_state); + + /* Disable the carrier, to prevent transmit timeouts, + * but one would hope all packets have been sent. This + * also means phylink_resolve() will do nothing. + */ + netif_carrier_off(pl->netdev); + + /* We do not call mac_link_down() here as we want the + * link to remain up to receive the WoL packets. + */ + mutex_unlock(&pl->state_mutex); + } else { + phylink_stop(pl); + } +} +EXPORT_SYMBOL_GPL(phylink_suspend); + +/** + * phylink_resume() - handle a network device resume event + * @pl: a pointer to a &struct phylink returned from phylink_create() + * + * Undo the effects of phylink_suspend(), returning the link to an + * operational state. + */ +void phylink_resume(struct phylink *pl) +{ + ASSERT_RTNL(); + + if (test_bit(PHYLINK_DISABLE_MAC_WOL, &pl->phylink_disable_state)) { + /* Wake-on-Lan enabled, MAC handling */ + + /* Call mac_link_down() so we keep the overall state balanced. + * Do this under the state_mutex lock for consistency. This + * will cause a "Link Down" message to be printed during + * resume, which is harmless - the true link state will be + * printed when we run a resolve. + */ + mutex_lock(&pl->state_mutex); + phylink_link_down(pl); + mutex_unlock(&pl->state_mutex); + + /* Re-apply the link parameters so that all the settings get + * restored to the MAC. + */ + phylink_mac_initial_config(pl, true); + + /* Re-enable and re-resolve the link parameters */ + clear_bit(PHYLINK_DISABLE_MAC_WOL, &pl->phylink_disable_state); + phylink_run_resolve(pl); + } else { + phylink_start(pl); + } +} +EXPORT_SYMBOL_GPL(phylink_resume); + /** * phylink_ethtool_get_wol() - get the wake on lan parameters for the PHY * @pl: a pointer to a &struct phylink returned from phylink_create() diff --git a/drivers/net/usb/cdc_mbim.c b/drivers/net/usb/cdc_mbim.c index 4c4ab7b38d78..82bb5ed94c48 100644 --- a/drivers/net/usb/cdc_mbim.c +++ b/drivers/net/usb/cdc_mbim.c @@ -654,6 +654,11 @@ static const struct usb_device_id mbim_devs[] = { .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, }, + /* Telit LN920 */ + { USB_DEVICE_AND_INTERFACE_INFO(0x1bc7, 0x1061, USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), + .driver_info = (unsigned long)&cdc_mbim_info_avoid_altsetting_toggle, + }, + /* default entry */ { USB_INTERFACE_INFO(USB_CLASS_COMM, USB_CDC_SUBCLASS_MBIM, USB_CDC_PROTO_NONE), .driver_info = (unsigned long)&cdc_mbim_info_zlp, diff --git a/drivers/net/usb/hso.c b/drivers/net/usb/hso.c index 7dc1ef3f93c3..a57251ba5991 100644 --- a/drivers/net/usb/hso.c +++ b/drivers/net/usb/hso.c @@ -2535,13 +2535,17 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface, if (!hso_net->mux_bulk_tx_buf) goto err_free_tx_urb; - add_net_device(hso_dev); + result = add_net_device(hso_dev); + if (result) { + dev_err(&interface->dev, "Failed to add net device\n"); + goto err_free_tx_buf; + } /* registering our net device */ result = register_netdev(net); if (result) { dev_err(&interface->dev, "Failed to register device\n"); - goto err_free_tx_buf; + goto err_rmv_ndev; } hso_log_port(hso_dev); @@ -2550,8 +2554,9 @@ static struct hso_device *hso_create_net_device(struct usb_interface *interface, return hso_dev; -err_free_tx_buf: +err_rmv_ndev: remove_net_device(hso_dev); +err_free_tx_buf: kfree(hso_net->mux_bulk_tx_buf); err_free_tx_urb: usb_free_urb(hso_net->mux_bulk_tx_urb); diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 6a2e4f884b12..33ada2c59952 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1354,6 +1354,7 @@ static const struct usb_device_id products[] = { {QMI_QUIRK_SET_DTR(0x1bc7, 0x1031, 3)}, /* Telit LE910C1-EUX */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1050, 2)}, /* Telit FN980 */ + {QMI_QUIRK_SET_DTR(0x1bc7, 0x1060, 2)}, /* Telit LN920 */ {QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */ {QMI_FIXED_INTF(0x1bc7, 0x1101, 3)}, /* Telit ME910 dual modem */ {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */ diff --git a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c index 52d1d391f4c6..d8231cc821ae 100644 --- a/drivers/net/wireless/intel/iwlwifi/cfg/22000.c +++ b/drivers/net/wireless/intel/iwlwifi/cfg/22000.c @@ -9,7 +9,7 @@ #include "iwl-prph.h" /* Highest firmware API version supported */ -#define IWL_22000_UCODE_API_MAX 65 +#define IWL_22000_UCODE_API_MAX 66 /* Lowest firmware API version supported */ #define IWL_22000_UCODE_API_MIN 39 diff --git a/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c b/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c index 314ed90c23dd..dde22bdc8703 100644 --- a/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c +++ b/drivers/net/wireless/intel/iwlwifi/fw/pnvm.c @@ -231,6 +231,7 @@ static int iwl_pnvm_get_from_fs(struct iwl_trans *trans, u8 **data, size_t *len) { const struct firmware *pnvm; char pnvm_name[MAX_PNVM_NAME]; + size_t new_len; int ret; iwl_pnvm_get_fs_name(trans, pnvm_name, sizeof(pnvm_name)); @@ -242,11 +243,14 @@ static int iwl_pnvm_get_from_fs(struct iwl_trans *trans, u8 **data, size_t *len) return ret; } + new_len = pnvm->size; *data = kmemdup(pnvm->data, pnvm->size, GFP_KERNEL); + release_firmware(pnvm); + if (!*data) return -ENOMEM; - *len = pnvm->size; + *len = new_len; return 0; } diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 8dc1b8eecb86..61b2797a34a8 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -558,6 +558,7 @@ static const struct iwl_dev_info iwl_dev_info_table[] = { IWL_DEV_INFO(0xA0F0, 0x1652, killer1650i_2ax_cfg_qu_b0_hr_b0, NULL), IWL_DEV_INFO(0xA0F0, 0x2074, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0xA0F0, 0x4070, iwl_ax201_cfg_qu_hr, NULL), + IWL_DEV_INFO(0xA0F0, 0x6074, iwl_ax201_cfg_qu_hr, NULL), IWL_DEV_INFO(0x02F0, 0x0070, iwl_ax201_cfg_quz_hr, NULL), IWL_DEV_INFO(0x02F0, 0x0074, iwl_ax201_cfg_quz_hr, NULL), IWL_DEV_INFO(0x02F0, 0x6074, iwl_ax201_cfg_quz_hr, NULL), diff --git a/drivers/net/wwan/iosm/iosm_ipc_mmio.c b/drivers/net/wwan/iosm/iosm_ipc_mmio.c index 06c94b1720b6..09f94c123531 100644 --- a/drivers/net/wwan/iosm/iosm_ipc_mmio.c +++ b/drivers/net/wwan/iosm/iosm_ipc_mmio.c @@ -69,7 +69,7 @@ void ipc_mmio_update_cp_capability(struct iosm_mmio *ipc_mmio) unsigned int ver; ver = ipc_mmio_get_cp_version(ipc_mmio); - cp_cap = readl(ipc_mmio->base + ipc_mmio->offset.cp_capability); + cp_cap = ioread32(ipc_mmio->base + ipc_mmio->offset.cp_capability); ipc_mmio->has_mux_lite = (ver >= IOSM_CP_VERSION) && !(cp_cap & DL_AGGR) && !(cp_cap & UL_AGGR); @@ -150,8 +150,8 @@ enum ipc_mem_exec_stage ipc_mmio_get_exec_stage(struct iosm_mmio *ipc_mmio) if (!ipc_mmio) return IPC_MEM_EXEC_STAGE_INVALID; - return (enum ipc_mem_exec_stage)readl(ipc_mmio->base + - ipc_mmio->offset.exec_stage); + return (enum ipc_mem_exec_stage)ioread32(ipc_mmio->base + + ipc_mmio->offset.exec_stage); } void ipc_mmio_copy_chip_info(struct iosm_mmio *ipc_mmio, void *dest, @@ -167,8 +167,8 @@ enum ipc_mem_device_ipc_state ipc_mmio_get_ipc_state(struct iosm_mmio *ipc_mmio) if (!ipc_mmio) return IPC_MEM_DEVICE_IPC_INVALID; - return (enum ipc_mem_device_ipc_state) - readl(ipc_mmio->base + ipc_mmio->offset.ipc_status); + return (enum ipc_mem_device_ipc_state)ioread32(ipc_mmio->base + + ipc_mmio->offset.ipc_status); } enum rom_exit_code ipc_mmio_get_rom_exit_code(struct iosm_mmio *ipc_mmio) @@ -176,8 +176,8 @@ enum rom_exit_code ipc_mmio_get_rom_exit_code(struct iosm_mmio *ipc_mmio) if (!ipc_mmio) return IMEM_ROM_EXIT_FAIL; - return (enum rom_exit_code)readl(ipc_mmio->base + - ipc_mmio->offset.rom_exit_code); + return (enum rom_exit_code)ioread32(ipc_mmio->base + + ipc_mmio->offset.rom_exit_code); } void ipc_mmio_config(struct iosm_mmio *ipc_mmio) @@ -188,10 +188,10 @@ void ipc_mmio_config(struct iosm_mmio *ipc_mmio) /* AP memory window (full window is open and active so that modem checks * each AP address) 0 means don't check on modem side. */ - iowrite64_lo_hi(0, ipc_mmio->base + ipc_mmio->offset.ap_win_base); - iowrite64_lo_hi(0, ipc_mmio->base + ipc_mmio->offset.ap_win_end); + iowrite64(0, ipc_mmio->base + ipc_mmio->offset.ap_win_base); + iowrite64(0, ipc_mmio->base + ipc_mmio->offset.ap_win_end); - iowrite64_lo_hi(ipc_mmio->context_info_addr, + iowrite64(ipc_mmio->context_info_addr, ipc_mmio->base + ipc_mmio->offset.context_info); } @@ -201,8 +201,8 @@ void ipc_mmio_set_psi_addr_and_size(struct iosm_mmio *ipc_mmio, dma_addr_t addr, if (!ipc_mmio) return; - iowrite64_lo_hi(addr, ipc_mmio->base + ipc_mmio->offset.psi_address); - writel(size, ipc_mmio->base + ipc_mmio->offset.psi_size); + iowrite64(addr, ipc_mmio->base + ipc_mmio->offset.psi_address); + iowrite32(size, ipc_mmio->base + ipc_mmio->offset.psi_size); } void ipc_mmio_set_contex_info_addr(struct iosm_mmio *ipc_mmio, phys_addr_t addr) @@ -218,6 +218,8 @@ void ipc_mmio_set_contex_info_addr(struct iosm_mmio *ipc_mmio, phys_addr_t addr) int ipc_mmio_get_cp_version(struct iosm_mmio *ipc_mmio) { - return ipc_mmio ? readl(ipc_mmio->base + ipc_mmio->offset.cp_version) : - -EFAULT; + if (ipc_mmio) + return ioread32(ipc_mmio->base + ipc_mmio->offset.cp_version); + + return -EFAULT; } diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.c b/drivers/ntb/hw/amd/ntb_hw_amd.c index 71428d8cbcfc..87847c380051 100644 --- a/drivers/ntb/hw/amd/ntb_hw_amd.c +++ b/drivers/ntb/hw/amd/ntb_hw_amd.c @@ -1176,22 +1176,14 @@ static int amd_ntb_init_pci(struct amd_ntb_dev *ndev, pci_set_master(pdev); - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (rc) { - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (rc) goto err_dma_mask; dev_warn(&pdev->dev, "Cannot DMA highmem\n"); } - rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (rc) { - rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - if (rc) - goto err_dma_mask; - dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n"); - } - ndev->self_mmio = pci_iomap(pdev, 0, 0); if (!ndev->self_mmio) { rc = -EIO; diff --git a/drivers/ntb/hw/idt/ntb_hw_idt.c b/drivers/ntb/hw/idt/ntb_hw_idt.c index e7a4c2aa8baa..733557231ed0 100644 --- a/drivers/ntb/hw/idt/ntb_hw_idt.c +++ b/drivers/ntb/hw/idt/ntb_hw_idt.c @@ -2640,26 +2640,15 @@ static int idt_init_pci(struct idt_ntb_dev *ndev) int ret; /* Initialize the bit mask of PCI/NTB DMA */ - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (ret != 0) { - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + ret = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (ret != 0) { dev_err(&pdev->dev, "Failed to set DMA bit mask\n"); return ret; } dev_warn(&pdev->dev, "Cannot set DMA highmem bit mask\n"); } - ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (ret != 0) { - ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - if (ret != 0) { - dev_err(&pdev->dev, - "Failed to set consistent DMA bit mask\n"); - return ret; - } - dev_warn(&pdev->dev, - "Cannot set consistent DMA highmem bit mask\n"); - } /* * Enable the device advanced error reporting. It's not critical to diff --git a/drivers/ntb/hw/intel/ntb_hw_gen1.c b/drivers/ntb/hw/intel/ntb_hw_gen1.c index 093dd20057b9..e5f14e20a9ff 100644 --- a/drivers/ntb/hw/intel/ntb_hw_gen1.c +++ b/drivers/ntb/hw/intel/ntb_hw_gen1.c @@ -1771,22 +1771,14 @@ static int intel_ntb_init_pci(struct intel_ntb_dev *ndev, struct pci_dev *pdev) pci_set_master(pdev); - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)); if (rc) { - rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + rc = dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32)); if (rc) goto err_dma_mask; dev_warn(&pdev->dev, "Cannot DMA highmem\n"); } - rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (rc) { - rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); - if (rc) - goto err_dma_mask; - dev_warn(&pdev->dev, "Cannot DMA consistent highmem\n"); - } - ndev->self_mmio = pci_iomap(pdev, 0, 0); if (!ndev->self_mmio) { rc = -EIO; diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h index 05e2335c9596..b233d1c6ba2d 100644 --- a/drivers/ntb/hw/intel/ntb_hw_intel.h +++ b/drivers/ntb/hw/intel/ntb_hw_intel.h @@ -43,9 +43,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * * Intel PCIe NTB Linux driver - * - * Contact Information: - * Jon Mason */ #ifndef NTB_HW_INTEL_H diff --git a/drivers/ntb/test/ntb_msi_test.c b/drivers/ntb/test/ntb_msi_test.c index 7095ecd6223a..4e18e08776c9 100644 --- a/drivers/ntb/test/ntb_msi_test.c +++ b/drivers/ntb/test/ntb_msi_test.c @@ -369,8 +369,10 @@ static int ntb_msit_probe(struct ntb_client *client, struct ntb_dev *ntb) if (ret) goto remove_dbgfs; - if (!nm->isr_ctx) + if (!nm->isr_ctx) { + ret = -ENOMEM; goto remove_dbgfs; + } ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c index 89df1350fefd..65e1e5cf1b29 100644 --- a/drivers/ntb/test/ntb_perf.c +++ b/drivers/ntb/test/ntb_perf.c @@ -598,6 +598,7 @@ static int perf_setup_inbuf(struct perf_peer *peer) return -ENOMEM; } if (!IS_ALIGNED(peer->inbuf_xlat, xlat_align)) { + ret = -EINVAL; dev_err(&perf->ntb->dev, "Unaligned inbuf allocated\n"); goto err_free_inbuf; } diff --git a/drivers/ntb/test/ntb_pingpong.c b/drivers/ntb/test/ntb_pingpong.c index 2164e8492772..8aeca7914050 100644 --- a/drivers/ntb/test/ntb_pingpong.c +++ b/drivers/ntb/test/ntb_pingpong.c @@ -187,7 +187,7 @@ static void pp_ping(struct pp_ctx *pp) static void pp_pong(struct pp_ctx *pp) { - u32 msg_data = -1, spad_data = -1; + u32 msg_data, spad_data; int pidx = 0; /* Read pong data */ diff --git a/drivers/remoteproc/qcom_q6v5_pas.c b/drivers/remoteproc/qcom_q6v5_pas.c index a79bee901e9b..401b1ec90785 100644 --- a/drivers/remoteproc/qcom_q6v5_pas.c +++ b/drivers/remoteproc/qcom_q6v5_pas.c @@ -833,6 +833,7 @@ static const struct of_device_id adsp_of_match[] = { { .compatible = "qcom,sc8180x-adsp-pas", .data = &sm8150_adsp_resource}, { .compatible = "qcom,sc8180x-cdsp-pas", .data = &sm8150_cdsp_resource}, { .compatible = "qcom,sc8180x-mpss-pas", .data = &sc8180x_mpss_resource}, + { .compatible = "qcom,sdm660-adsp-pas", .data = &adsp_resource_init}, { .compatible = "qcom,sdm845-adsp-pas", .data = &adsp_resource_init}, { .compatible = "qcom,sdm845-cdsp-pas", .data = &cdsp_resource_init}, { .compatible = "qcom,sdx55-mpss-pas", .data = &sdx55_mpss_resource}, diff --git a/drivers/remoteproc/qcom_wcnss.c b/drivers/remoteproc/qcom_wcnss.c index f1cbc6b2edbb..ebadc6c08e11 100644 --- a/drivers/remoteproc/qcom_wcnss.c +++ b/drivers/remoteproc/qcom_wcnss.c @@ -142,18 +142,6 @@ static const struct wcnss_data pronto_v2_data = { .num_vregs = 1, }; -void qcom_wcnss_assign_iris(struct qcom_wcnss *wcnss, - struct qcom_iris *iris, - bool use_48mhz_xo) -{ - mutex_lock(&wcnss->iris_lock); - - wcnss->iris = iris; - wcnss->use_48mhz_xo = use_48mhz_xo; - - mutex_unlock(&wcnss->iris_lock); -} - static int wcnss_load(struct rproc *rproc, const struct firmware *fw) { struct qcom_wcnss *wcnss = (struct qcom_wcnss *)rproc->priv; @@ -639,12 +627,20 @@ static int wcnss_probe(struct platform_device *pdev) goto detach_pds; } + wcnss->iris = qcom_iris_probe(&pdev->dev, &wcnss->use_48mhz_xo); + if (IS_ERR(wcnss->iris)) { + ret = PTR_ERR(wcnss->iris); + goto detach_pds; + } + ret = rproc_add(rproc); if (ret) - goto detach_pds; + goto remove_iris; - return of_platform_populate(pdev->dev.of_node, NULL, NULL, &pdev->dev); + return 0; +remove_iris: + qcom_iris_remove(wcnss->iris); detach_pds: wcnss_release_pds(wcnss); free_rproc: @@ -657,7 +653,7 @@ static int wcnss_remove(struct platform_device *pdev) { struct qcom_wcnss *wcnss = platform_get_drvdata(pdev); - of_platform_depopulate(&pdev->dev); + qcom_iris_remove(wcnss->iris); rproc_del(wcnss->rproc); @@ -686,28 +682,7 @@ static struct platform_driver wcnss_driver = { }, }; -static int __init wcnss_init(void) -{ - int ret; - - ret = platform_driver_register(&wcnss_driver); - if (ret) - return ret; - - ret = platform_driver_register(&qcom_iris_driver); - if (ret) - platform_driver_unregister(&wcnss_driver); - - return ret; -} -module_init(wcnss_init); - -static void __exit wcnss_exit(void) -{ - platform_driver_unregister(&qcom_iris_driver); - platform_driver_unregister(&wcnss_driver); -} -module_exit(wcnss_exit); +module_platform_driver(wcnss_driver); MODULE_DESCRIPTION("Qualcomm Peripheral Image Loader for Wireless Subsystem"); MODULE_LICENSE("GPL v2"); diff --git a/drivers/remoteproc/qcom_wcnss.h b/drivers/remoteproc/qcom_wcnss.h index 62c8682d0a92..6d01ee6afa7f 100644 --- a/drivers/remoteproc/qcom_wcnss.h +++ b/drivers/remoteproc/qcom_wcnss.h @@ -17,9 +17,9 @@ struct wcnss_vreg_info { bool super_turbo; }; +struct qcom_iris *qcom_iris_probe(struct device *parent, bool *use_48mhz_xo); +void qcom_iris_remove(struct qcom_iris *iris); int qcom_iris_enable(struct qcom_iris *iris); void qcom_iris_disable(struct qcom_iris *iris); -void qcom_wcnss_assign_iris(struct qcom_wcnss *wcnss, struct qcom_iris *iris, bool use_48mhz_xo); - #endif diff --git a/drivers/remoteproc/qcom_wcnss_iris.c b/drivers/remoteproc/qcom_wcnss_iris.c index 169acd305ae3..09720ddddc85 100644 --- a/drivers/remoteproc/qcom_wcnss_iris.c +++ b/drivers/remoteproc/qcom_wcnss_iris.c @@ -17,7 +17,7 @@ #include "qcom_wcnss.h" struct qcom_iris { - struct device *dev; + struct device dev; struct clk *xo_clk; @@ -75,7 +75,7 @@ int qcom_iris_enable(struct qcom_iris *iris) ret = clk_prepare_enable(iris->xo_clk); if (ret) { - dev_err(iris->dev, "failed to enable xo clk\n"); + dev_err(&iris->dev, "failed to enable xo clk\n"); goto disable_regulators; } @@ -93,43 +93,90 @@ void qcom_iris_disable(struct qcom_iris *iris) regulator_bulk_disable(iris->num_vregs, iris->vregs); } -static int qcom_iris_probe(struct platform_device *pdev) +static const struct of_device_id iris_of_match[] = { + { .compatible = "qcom,wcn3620", .data = &wcn3620_data }, + { .compatible = "qcom,wcn3660", .data = &wcn3660_data }, + { .compatible = "qcom,wcn3660b", .data = &wcn3680_data }, + { .compatible = "qcom,wcn3680", .data = &wcn3680_data }, + {} +}; + +static void qcom_iris_release(struct device *dev) { + struct qcom_iris *iris = container_of(dev, struct qcom_iris, dev); + + of_node_put(iris->dev.of_node); + kfree(iris); +} + +struct qcom_iris *qcom_iris_probe(struct device *parent, bool *use_48mhz_xo) +{ + const struct of_device_id *match; const struct iris_data *data; - struct qcom_wcnss *wcnss; + struct device_node *of_node; struct qcom_iris *iris; int ret; int i; - iris = devm_kzalloc(&pdev->dev, sizeof(struct qcom_iris), GFP_KERNEL); - if (!iris) - return -ENOMEM; + of_node = of_get_child_by_name(parent->of_node, "iris"); + if (!of_node) { + dev_err(parent, "No child node \"iris\" found\n"); + return ERR_PTR(-EINVAL); + } - data = of_device_get_match_data(&pdev->dev); - wcnss = dev_get_drvdata(pdev->dev.parent); + iris = kzalloc(sizeof(*iris), GFP_KERNEL); + if (!iris) { + of_node_put(of_node); + return ERR_PTR(-ENOMEM); + } - iris->xo_clk = devm_clk_get(&pdev->dev, "xo"); + device_initialize(&iris->dev); + iris->dev.parent = parent; + iris->dev.release = qcom_iris_release; + iris->dev.of_node = of_node; + + dev_set_name(&iris->dev, "%s.iris", dev_name(parent)); + + ret = device_add(&iris->dev); + if (ret) { + put_device(&iris->dev); + return ERR_PTR(ret); + } + + match = of_match_device(iris_of_match, &iris->dev); + if (!match) { + dev_err(&iris->dev, "no matching compatible for iris\n"); + ret = -EINVAL; + goto err_device_del; + } + + data = match->data; + + iris->xo_clk = devm_clk_get(&iris->dev, "xo"); if (IS_ERR(iris->xo_clk)) { - if (PTR_ERR(iris->xo_clk) != -EPROBE_DEFER) - dev_err(&pdev->dev, "failed to acquire xo clk\n"); - return PTR_ERR(iris->xo_clk); + ret = PTR_ERR(iris->xo_clk); + if (ret != -EPROBE_DEFER) + dev_err(&iris->dev, "failed to acquire xo clk\n"); + goto err_device_del; } iris->num_vregs = data->num_vregs; - iris->vregs = devm_kcalloc(&pdev->dev, + iris->vregs = devm_kcalloc(&iris->dev, iris->num_vregs, sizeof(struct regulator_bulk_data), GFP_KERNEL); - if (!iris->vregs) - return -ENOMEM; + if (!iris->vregs) { + ret = -ENOMEM; + goto err_device_del; + } for (i = 0; i < iris->num_vregs; i++) iris->vregs[i].supply = data->vregs[i].name; - ret = devm_regulator_bulk_get(&pdev->dev, iris->num_vregs, iris->vregs); + ret = devm_regulator_bulk_get(&iris->dev, iris->num_vregs, iris->vregs); if (ret) { - dev_err(&pdev->dev, "failed to get regulators\n"); - return ret; + dev_err(&iris->dev, "failed to get regulators\n"); + goto err_device_del; } for (i = 0; i < iris->num_vregs; i++) { @@ -143,34 +190,17 @@ static int qcom_iris_probe(struct platform_device *pdev) data->vregs[i].load_uA); } - qcom_wcnss_assign_iris(wcnss, iris, data->use_48mhz_xo); + *use_48mhz_xo = data->use_48mhz_xo; - return 0; + return iris; + +err_device_del: + device_del(&iris->dev); + + return ERR_PTR(ret); } -static int qcom_iris_remove(struct platform_device *pdev) +void qcom_iris_remove(struct qcom_iris *iris) { - struct qcom_wcnss *wcnss = dev_get_drvdata(pdev->dev.parent); - - qcom_wcnss_assign_iris(wcnss, NULL, false); - - return 0; + device_del(&iris->dev); } - -static const struct of_device_id iris_of_match[] = { - { .compatible = "qcom,wcn3620", .data = &wcn3620_data }, - { .compatible = "qcom,wcn3660", .data = &wcn3660_data }, - { .compatible = "qcom,wcn3660b", .data = &wcn3680_data }, - { .compatible = "qcom,wcn3680", .data = &wcn3680_data }, - {} -}; -MODULE_DEVICE_TABLE(of, iris_of_match); - -struct platform_driver qcom_iris_driver = { - .probe = qcom_iris_probe, - .remove = qcom_iris_remove, - .driver = { - .name = "qcom-iris", - .of_match_table = iris_of_match, - }, -}; diff --git a/drivers/remoteproc/remoteproc_core.c b/drivers/remoteproc/remoteproc_core.c index 7de5905d276a..502b6604b757 100644 --- a/drivers/remoteproc/remoteproc_core.c +++ b/drivers/remoteproc/remoteproc_core.c @@ -2750,8 +2750,8 @@ void rproc_report_crash(struct rproc *rproc, enum rproc_crash_type type) dev_err(&rproc->dev, "crash detected in %s: type %s\n", rproc->name, rproc_crash_to_string(type)); - /* create a new task to handle the error */ - schedule_work(&rproc->crash_handler); + /* Have a worker handle the error; ensure system is not suspended */ + queue_work(system_freezable_wq, &rproc->crash_handler); } EXPORT_SYMBOL(rproc_report_crash); diff --git a/drivers/remoteproc/remoteproc_elf_helpers.h b/drivers/remoteproc/remoteproc_elf_helpers.h index 26404e68e17a..e6de53a5000c 100644 --- a/drivers/remoteproc/remoteproc_elf_helpers.h +++ b/drivers/remoteproc/remoteproc_elf_helpers.h @@ -15,7 +15,7 @@ * fw_elf_get_class - Get elf class * @fw: the ELF firmware image * - * Note that we use and elf32_hdr to access the class since the start of the + * Note that we use elf32_hdr to access the class since the start of the * struct is the same for both elf class * * Return: elf class of the firmware diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig index 0bc7046ab942..b81fe4f7d434 100644 --- a/drivers/watchdog/Kconfig +++ b/drivers/watchdog/Kconfig @@ -168,18 +168,6 @@ config SOFT_WATCHDOG_PRETIMEOUT watchdog. Be aware that governors might affect the watchdog because it is purely software, e.g. the panic governor will stall it! -config BD70528_WATCHDOG - tristate "ROHM BD70528 PMIC Watchdog" - depends on MFD_ROHM_BD70528 - select WATCHDOG_CORE - help - Support for the watchdog in the ROHM BD70528 PMIC. Watchdog trigger - cause system reset. - - Say Y here to include support for the ROHM BD70528 watchdog. - Alternatively say M to compile the driver as a module, - which will be called bd70528_wdt. - config BD957XMUF_WATCHDOG tristate "ROHM BD9576MUF and BD9573MUF PMIC Watchdog" depends on MFD_ROHM_BD957XMUF diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile index abaf2ebd814e..1bd2d6f37c53 100644 --- a/drivers/watchdog/Makefile +++ b/drivers/watchdog/Makefile @@ -204,7 +204,6 @@ obj-$(CONFIG_WATCHDOG_SUN4V) += sun4v_wdt.o obj-$(CONFIG_XEN_WDT) += xen_wdt.o # Architecture Independent -obj-$(CONFIG_BD70528_WATCHDOG) += bd70528_wdt.o obj-$(CONFIG_BD957XMUF_WATCHDOG) += bd9576_wdt.o obj-$(CONFIG_DA9052_WATCHDOG) += da9052_wdt.o obj-$(CONFIG_DA9055_WATCHDOG) += da9055_wdt.o diff --git a/drivers/watchdog/bcm2835_wdt.c b/drivers/watchdog/bcm2835_wdt.c index dec6ca019bea..94907176a0e4 100644 --- a/drivers/watchdog/bcm2835_wdt.c +++ b/drivers/watchdog/bcm2835_wdt.c @@ -205,9 +205,13 @@ static int bcm2835_wdt_probe(struct platform_device *pdev) if (err) return err; - if (pm_power_off == NULL) { - pm_power_off = bcm2835_power_off; - bcm2835_power_off_wdt = wdt; + if (of_device_is_system_power_controller(pdev->dev.parent->of_node)) { + if (!pm_power_off) { + pm_power_off = bcm2835_power_off; + bcm2835_power_off_wdt = wdt; + } else { + dev_info(dev, "Poweroff handler already present!\n"); + } } dev_info(dev, "Broadcom BCM2835 watchdog timer"); diff --git a/drivers/watchdog/bd70528_wdt.c b/drivers/watchdog/bd70528_wdt.c deleted file mode 100644 index 0170b37e6674..000000000000 --- a/drivers/watchdog/bd70528_wdt.c +++ /dev/null @@ -1,291 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2018 ROHM Semiconductors -// ROHM BD70528MWV watchdog driver - -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * Max time we can set is 1 hour, 59 minutes and 59 seconds - * and Minimum time is 1 second - */ -#define WDT_MAX_MS ((2 * 60 * 60 - 1) * 1000) -#define WDT_MIN_MS 1000 -#define DEFAULT_TIMEOUT 60 - -#define WD_CTRL_MAGIC1 0x55 -#define WD_CTRL_MAGIC2 0xAA - -struct wdtbd70528 { - struct device *dev; - struct regmap *regmap; - struct rohm_regmap_dev *mfd; - struct watchdog_device wdt; -}; - -/** - * bd70528_wdt_set - arm or disarm watchdog timer - * - * @data: device data for the PMIC instance we want to operate on - * @enable: new state of WDT. zero to disable, non zero to enable - * @old_state: previous state of WDT will be filled here - * - * Arm or disarm WDT on BD70528 PMIC. Expected to be called only by - * BD70528 RTC and BD70528 WDT drivers. The rtc_timer_lock must be taken - * by calling bd70528_wdt_lock before calling bd70528_wdt_set. - */ -int bd70528_wdt_set(struct rohm_regmap_dev *data, int enable, int *old_state) -{ - int ret, i; - unsigned int tmp; - struct bd70528_data *bd70528 = container_of(data, struct bd70528_data, - chip); - u8 wd_ctrl_arr[3] = { WD_CTRL_MAGIC1, WD_CTRL_MAGIC2, 0 }; - u8 *wd_ctrl = &wd_ctrl_arr[2]; - - ret = regmap_read(bd70528->chip.regmap, BD70528_REG_WDT_CTRL, &tmp); - if (ret) - return ret; - - *wd_ctrl = (u8)tmp; - - if (old_state) { - if (*wd_ctrl & BD70528_MASK_WDT_EN) - *old_state |= BD70528_WDT_STATE_BIT; - else - *old_state &= ~BD70528_WDT_STATE_BIT; - if ((!enable) == (!(*old_state & BD70528_WDT_STATE_BIT))) - return 0; - } - - if (enable) { - if (*wd_ctrl & BD70528_MASK_WDT_EN) - return 0; - *wd_ctrl |= BD70528_MASK_WDT_EN; - } else { - if (*wd_ctrl & BD70528_MASK_WDT_EN) - *wd_ctrl &= ~BD70528_MASK_WDT_EN; - else - return 0; - } - - for (i = 0; i < 3; i++) { - ret = regmap_write(bd70528->chip.regmap, BD70528_REG_WDT_CTRL, - wd_ctrl_arr[i]); - if (ret) - return ret; - } - - ret = regmap_read(bd70528->chip.regmap, BD70528_REG_WDT_CTRL, &tmp); - if ((tmp & BD70528_MASK_WDT_EN) != (*wd_ctrl & BD70528_MASK_WDT_EN)) { - dev_err(bd70528->chip.dev, - "Watchdog ctrl mismatch (hw) 0x%x (set) 0x%x\n", - tmp, *wd_ctrl); - ret = -EIO; - } - - return ret; -} -EXPORT_SYMBOL(bd70528_wdt_set); - -/** - * bd70528_wdt_lock - take WDT lock - * - * @data: device data for the PMIC instance we want to operate on - * - * Lock WDT for arming/disarming in order to avoid race condition caused - * by WDT state changes initiated by WDT and RTC drivers. - */ -void bd70528_wdt_lock(struct rohm_regmap_dev *data) -{ - struct bd70528_data *bd70528 = container_of(data, struct bd70528_data, - chip); - - mutex_lock(&bd70528->rtc_timer_lock); -} -EXPORT_SYMBOL(bd70528_wdt_lock); - -/** - * bd70528_wdt_unlock - unlock WDT lock - * - * @data: device data for the PMIC instance we want to operate on - * - * Unlock WDT lock which has previously been taken by call to - * bd70528_wdt_lock. - */ -void bd70528_wdt_unlock(struct rohm_regmap_dev *data) -{ - struct bd70528_data *bd70528 = container_of(data, struct bd70528_data, - chip); - - mutex_unlock(&bd70528->rtc_timer_lock); -} -EXPORT_SYMBOL(bd70528_wdt_unlock); - -static int bd70528_wdt_set_locked(struct wdtbd70528 *w, int enable) -{ - return bd70528_wdt_set(w->mfd, enable, NULL); -} - -static int bd70528_wdt_change(struct wdtbd70528 *w, int enable) -{ - int ret; - - bd70528_wdt_lock(w->mfd); - ret = bd70528_wdt_set_locked(w, enable); - bd70528_wdt_unlock(w->mfd); - - return ret; -} - -static int bd70528_wdt_start(struct watchdog_device *wdt) -{ - struct wdtbd70528 *w = watchdog_get_drvdata(wdt); - - dev_dbg(w->dev, "WDT ping...\n"); - return bd70528_wdt_change(w, 1); -} - -static int bd70528_wdt_stop(struct watchdog_device *wdt) -{ - struct wdtbd70528 *w = watchdog_get_drvdata(wdt); - - dev_dbg(w->dev, "WDT stopping...\n"); - return bd70528_wdt_change(w, 0); -} - -static int bd70528_wdt_set_timeout(struct watchdog_device *wdt, - unsigned int timeout) -{ - unsigned int hours; - unsigned int minutes; - unsigned int seconds; - int ret; - struct wdtbd70528 *w = watchdog_get_drvdata(wdt); - - seconds = timeout; - hours = timeout / (60 * 60); - /* Maximum timeout is 1h 59m 59s => hours is 1 or 0 */ - if (hours) - seconds -= (60 * 60); - minutes = seconds / 60; - seconds = seconds % 60; - - bd70528_wdt_lock(w->mfd); - - ret = bd70528_wdt_set_locked(w, 0); - if (ret) - goto out_unlock; - - ret = regmap_update_bits(w->regmap, BD70528_REG_WDT_HOUR, - BD70528_MASK_WDT_HOUR, hours); - if (ret) { - dev_err(w->dev, "Failed to set WDT hours\n"); - goto out_en_unlock; - } - ret = regmap_update_bits(w->regmap, BD70528_REG_WDT_MINUTE, - BD70528_MASK_WDT_MINUTE, bin2bcd(minutes)); - if (ret) { - dev_err(w->dev, "Failed to set WDT minutes\n"); - goto out_en_unlock; - } - ret = regmap_update_bits(w->regmap, BD70528_REG_WDT_SEC, - BD70528_MASK_WDT_SEC, bin2bcd(seconds)); - if (ret) - dev_err(w->dev, "Failed to set WDT seconds\n"); - else - dev_dbg(w->dev, "WDT tmo set to %u\n", timeout); - -out_en_unlock: - ret = bd70528_wdt_set_locked(w, 1); -out_unlock: - bd70528_wdt_unlock(w->mfd); - - return ret; -} - -static const struct watchdog_info bd70528_wdt_info = { - .identity = "bd70528-wdt", - .options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE, -}; - -static const struct watchdog_ops bd70528_wdt_ops = { - .start = bd70528_wdt_start, - .stop = bd70528_wdt_stop, - .set_timeout = bd70528_wdt_set_timeout, -}; - -static int bd70528_wdt_probe(struct platform_device *pdev) -{ - struct rohm_regmap_dev *bd70528; - struct wdtbd70528 *w; - int ret; - unsigned int reg; - - bd70528 = dev_get_drvdata(pdev->dev.parent); - if (!bd70528) { - dev_err(&pdev->dev, "No MFD driver data\n"); - return -EINVAL; - } - w = devm_kzalloc(&pdev->dev, sizeof(*w), GFP_KERNEL); - if (!w) - return -ENOMEM; - - w->regmap = bd70528->regmap; - w->mfd = bd70528; - w->dev = &pdev->dev; - - w->wdt.info = &bd70528_wdt_info; - w->wdt.ops = &bd70528_wdt_ops; - w->wdt.min_hw_heartbeat_ms = WDT_MIN_MS; - w->wdt.max_hw_heartbeat_ms = WDT_MAX_MS; - w->wdt.parent = pdev->dev.parent; - w->wdt.timeout = DEFAULT_TIMEOUT; - watchdog_set_drvdata(&w->wdt, w); - watchdog_init_timeout(&w->wdt, 0, pdev->dev.parent); - - ret = bd70528_wdt_set_timeout(&w->wdt, w->wdt.timeout); - if (ret) { - dev_err(&pdev->dev, "Failed to set the watchdog timeout\n"); - return ret; - } - - bd70528_wdt_lock(w->mfd); - ret = regmap_read(w->regmap, BD70528_REG_WDT_CTRL, ®); - bd70528_wdt_unlock(w->mfd); - - if (ret) { - dev_err(&pdev->dev, "Failed to get the watchdog state\n"); - return ret; - } - if (reg & BD70528_MASK_WDT_EN) { - dev_dbg(&pdev->dev, "watchdog was running during probe\n"); - set_bit(WDOG_HW_RUNNING, &w->wdt.status); - } - - ret = devm_watchdog_register_device(&pdev->dev, &w->wdt); - if (ret < 0) - dev_err(&pdev->dev, "watchdog registration failed: %d\n", ret); - - return ret; -} - -static struct platform_driver bd70528_wdt = { - .driver = { - .name = "bd70528-wdt" - }, - .probe = bd70528_wdt_probe, -}; - -module_platform_driver(bd70528_wdt); - -MODULE_AUTHOR("Matti Vaittinen "); -MODULE_DESCRIPTION("BD70528 watchdog driver"); -MODULE_LICENSE("GPL"); -MODULE_ALIAS("platform:bd70528-wdt"); diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c index b3f604669e2c..643c6c2d0b72 100644 --- a/drivers/watchdog/iTCO_wdt.c +++ b/drivers/watchdog/iTCO_wdt.c @@ -362,7 +362,7 @@ static int iTCO_wdt_set_timeout(struct watchdog_device *wd_dev, unsigned int t) * Otherwise, the BIOS generally reboots when the SMI triggers. */ if (p->smi_res && - (SMI_EN(p) & (TCO_EN | GBL_SMI_EN)) != (TCO_EN | GBL_SMI_EN)) + (inl(SMI_EN(p)) & (TCO_EN | GBL_SMI_EN)) != (TCO_EN | GBL_SMI_EN)) tmrval /= 2; /* from the specs: */ diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c index cc86018c5eb5..51bfb796898b 100644 --- a/drivers/watchdog/imx2_wdt.c +++ b/drivers/watchdog/imx2_wdt.c @@ -317,6 +317,7 @@ static int __init imx2_wdt_probe(struct platform_device *pdev) watchdog_set_nowayout(wdog, nowayout); watchdog_set_restart_priority(wdog, 128); watchdog_init_timeout(wdog, timeout, dev); + watchdog_stop_ping_on_suspend(wdog); if (imx2_wdt_is_running(wdev)) { imx2_wdt_set_timeout(wdog, wdog->timeout); diff --git a/drivers/watchdog/max63xx_wdt.c b/drivers/watchdog/max63xx_wdt.c index 3a899628a834..9e1541cfae0d 100644 --- a/drivers/watchdog/max63xx_wdt.c +++ b/drivers/watchdog/max63xx_wdt.c @@ -26,6 +26,7 @@ #include #include #include +#include #define DEFAULT_HEARTBEAT 60 #define MAX_HEARTBEAT 60 @@ -99,8 +100,8 @@ static const struct max63xx_timeout max6373_table[] = { { }, }; -static struct max63xx_timeout * -max63xx_select_timeout(struct max63xx_timeout *table, int value) +static const struct max63xx_timeout * +max63xx_select_timeout(const struct max63xx_timeout *table, int value) { while (table->twd) { if (value <= table->twd) { @@ -202,14 +203,17 @@ static int max63xx_wdt_probe(struct platform_device *pdev) { struct device *dev = &pdev->dev; struct max63xx_wdt *wdt; - struct max63xx_timeout *table; + const struct max63xx_timeout *table; int err; wdt = devm_kzalloc(dev, sizeof(*wdt), GFP_KERNEL); if (!wdt) return -ENOMEM; - table = (struct max63xx_timeout *)pdev->id_entry->driver_data; + /* Attempt to use fwnode first */ + table = device_get_match_data(dev); + if (!table) + table = (struct max63xx_timeout *)pdev->id_entry->driver_data; if (heartbeat < 1 || heartbeat > MAX_HEARTBEAT) heartbeat = DEFAULT_HEARTBEAT; @@ -255,11 +259,23 @@ static const struct platform_device_id max63xx_id_table[] = { }; MODULE_DEVICE_TABLE(platform, max63xx_id_table); +static const struct of_device_id max63xx_dt_id_table[] = { + { .compatible = "maxim,max6369", .data = max6369_table, }, + { .compatible = "maxim,max6370", .data = max6369_table, }, + { .compatible = "maxim,max6371", .data = max6371_table, }, + { .compatible = "maxim,max6372", .data = max6371_table, }, + { .compatible = "maxim,max6373", .data = max6373_table, }, + { .compatible = "maxim,max6374", .data = max6373_table, }, + { } +}; +MODULE_DEVICE_TABLE(of, max63xx_dt_id_table); + static struct platform_driver max63xx_wdt_driver = { .probe = max63xx_wdt_probe, .id_table = max63xx_id_table, .driver = { .name = "max63xx_wdt", + .of_match_table = max63xx_dt_id_table, }, }; diff --git a/drivers/watchdog/mpc8xxx_wdt.c b/drivers/watchdog/mpc8xxx_wdt.c index 2f7ded32e878..1c569be72ea2 100644 --- a/drivers/watchdog/mpc8xxx_wdt.c +++ b/drivers/watchdog/mpc8xxx_wdt.c @@ -118,7 +118,7 @@ static struct watchdog_info mpc8xxx_wdt_info = { .identity = "MPC8xxx", }; -static struct watchdog_ops mpc8xxx_wdt_ops = { +static const struct watchdog_ops mpc8xxx_wdt_ops = { .owner = THIS_MODULE, .start = mpc8xxx_wdt_start, .ping = mpc8xxx_wdt_ping, diff --git a/drivers/watchdog/mtk_wdt.c b/drivers/watchdog/mtk_wdt.c index 16b6aff324a7..796fbb048cbe 100644 --- a/drivers/watchdog/mtk_wdt.c +++ b/drivers/watchdog/mtk_wdt.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include @@ -82,6 +83,10 @@ static const struct mtk_wdt_data mt8192_data = { .toprgu_sw_rst_num = MT8192_TOPRGU_SW_RST_NUM, }; +static const struct mtk_wdt_data mt8195_data = { + .toprgu_sw_rst_num = MT8195_TOPRGU_SW_RST_NUM, +}; + static int toprgu_reset_update(struct reset_controller_dev *rcdev, unsigned long id, bool assert) { @@ -408,6 +413,7 @@ static const struct of_device_id mtk_wdt_dt_ids[] = { { .compatible = "mediatek,mt6589-wdt" }, { .compatible = "mediatek,mt8183-wdt", .data = &mt8183_data }, { .compatible = "mediatek,mt8192-wdt", .data = &mt8192_data }, + { .compatible = "mediatek,mt8195-wdt", .data = &mt8195_data }, { /* sentinel */ } }; MODULE_DEVICE_TABLE(of, mtk_wdt_dt_ids); diff --git a/drivers/watchdog/sl28cpld_wdt.c b/drivers/watchdog/sl28cpld_wdt.c index 2de93298475f..9ce456f09f73 100644 --- a/drivers/watchdog/sl28cpld_wdt.c +++ b/drivers/watchdog/sl28cpld_wdt.c @@ -108,7 +108,7 @@ static const struct watchdog_info sl28cpld_wdt_info = { .identity = "sl28cpld watchdog", }; -static struct watchdog_ops sl28cpld_wdt_ops = { +static const struct watchdog_ops sl28cpld_wdt_ops = { .owner = THIS_MODULE, .start = sl28cpld_wdt_start, .stop = sl28cpld_wdt_stop, diff --git a/drivers/watchdog/tqmx86_wdt.c b/drivers/watchdog/tqmx86_wdt.c index 72d0b0adde38..83860e94ce9d 100644 --- a/drivers/watchdog/tqmx86_wdt.c +++ b/drivers/watchdog/tqmx86_wdt.c @@ -62,7 +62,7 @@ static const struct watchdog_info tqmx86_wdt_info = { .identity = "TQMx86 Watchdog", }; -static struct watchdog_ops tqmx86_wdt_ops = { +static const struct watchdog_ops tqmx86_wdt_ops = { .owner = THIS_MODULE, .start = tqmx86_wdt_start, .set_timeout = tqmx86_wdt_set_timeout, diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c index 5df0a22e2cb4..3fe8a7edc252 100644 --- a/drivers/watchdog/watchdog_core.c +++ b/drivers/watchdog/watchdog_core.c @@ -34,6 +34,7 @@ #include /* For ida_* macros */ #include /* For IS_ERR macros */ #include /* For of_get_timeout_sec */ +#include #include "watchdog_core.h" /* For watchdog_dev_register/... */ @@ -185,6 +186,33 @@ static int watchdog_restart_notifier(struct notifier_block *nb, return NOTIFY_DONE; } +static int watchdog_pm_notifier(struct notifier_block *nb, unsigned long mode, + void *data) +{ + struct watchdog_device *wdd; + int ret = 0; + + wdd = container_of(nb, struct watchdog_device, pm_nb); + + switch (mode) { + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + case PM_SUSPEND_PREPARE: + ret = watchdog_dev_suspend(wdd); + break; + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + case PM_POST_SUSPEND: + ret = watchdog_dev_resume(wdd); + break; + } + + if (ret) + return NOTIFY_BAD; + + return NOTIFY_DONE; +} + /** * watchdog_set_restart_priority - Change priority of restart handler * @wdd: watchdog device @@ -292,6 +320,15 @@ static int __watchdog_register_device(struct watchdog_device *wdd) wdd->id, ret); } + if (test_bit(WDOG_NO_PING_ON_SUSPEND, &wdd->status)) { + wdd->pm_nb.notifier_call = watchdog_pm_notifier; + + ret = register_pm_notifier(&wdd->pm_nb); + if (ret) + pr_warn("watchdog%d: Cannot register pm handler (%d)\n", + wdd->id, ret); + } + return 0; } diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c index 3bab32485273..3a3d8b5c7ad5 100644 --- a/drivers/watchdog/watchdog_dev.c +++ b/drivers/watchdog/watchdog_dev.c @@ -401,7 +401,7 @@ static int watchdog_set_pretimeout(struct watchdog_device *wdd, if (watchdog_pretimeout_invalid(wdd, timeout)) return -EINVAL; - if (wdd->ops->set_pretimeout) + if (wdd->ops->set_pretimeout && (wdd->info->options & WDIOF_PRETIMEOUT)) err = wdd->ops->set_pretimeout(wdd, timeout); else wdd->pretimeout = timeout; @@ -1096,6 +1096,8 @@ static void watchdog_cdev_unregister(struct watchdog_device *wdd) watchdog_stop(wdd); } + watchdog_hrtimer_pretimeout_stop(wdd); + mutex_lock(&wd_data->lock); wd_data->wdd = NULL; wdd->wd_data = NULL; @@ -1103,7 +1105,6 @@ static void watchdog_cdev_unregister(struct watchdog_device *wdd) hrtimer_cancel(&wd_data->timer); kthread_cancel_work_sync(&wd_data->work); - watchdog_hrtimer_pretimeout_stop(wdd); put_device(&wd_data->dev); } @@ -1172,7 +1173,10 @@ int watchdog_set_last_hw_keepalive(struct watchdog_device *wdd, wd_data->last_hw_keepalive = ktime_sub(now, ms_to_ktime(last_ping_ms)); - return __watchdog_ping(wdd); + if (watchdog_hw_running(wdd) && handle_boot_enabled) + return __watchdog_ping(wdd); + + return 0; } EXPORT_SYMBOL_GPL(watchdog_set_last_hw_keepalive); @@ -1227,6 +1231,53 @@ void __exit watchdog_dev_exit(void) kthread_destroy_worker(watchdog_kworker); } +int watchdog_dev_suspend(struct watchdog_device *wdd) +{ + struct watchdog_core_data *wd_data = wdd->wd_data; + int ret = 0; + + if (!wdd->wd_data) + return -ENODEV; + + /* ping for the last time before suspend */ + mutex_lock(&wd_data->lock); + if (watchdog_worker_should_ping(wd_data)) + ret = __watchdog_ping(wd_data->wdd); + mutex_unlock(&wd_data->lock); + + if (ret) + return ret; + + /* + * make sure that watchdog worker will not kick in when the wdog is + * suspended + */ + hrtimer_cancel(&wd_data->timer); + kthread_cancel_work_sync(&wd_data->work); + + return 0; +} + +int watchdog_dev_resume(struct watchdog_device *wdd) +{ + struct watchdog_core_data *wd_data = wdd->wd_data; + int ret = 0; + + if (!wdd->wd_data) + return -ENODEV; + + /* + * __watchdog_ping will also retrigger hrtimer and therefore restore the + * ping worker if needed. + */ + mutex_lock(&wd_data->lock); + if (watchdog_worker_should_ping(wd_data)) + ret = __watchdog_ping(wd_data->wdd); + mutex_unlock(&wd_data->lock); + + return ret; +} + module_param(handle_boot_enabled, bool, 0444); MODULE_PARM_DESC(handle_boot_enabled, "Watchdog core auto-updates boot enabled watchdogs before userspace takes over (default=" diff --git a/include/dt-bindings/reset/mt8195-resets.h b/include/dt-bindings/reset/mt8195-resets.h new file mode 100644 index 000000000000..a26bccc8b957 --- /dev/null +++ b/include/dt-bindings/reset/mt8195-resets.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: (GPL-2.0+ OR BSD-3-Clause)*/ +/* + * Copyright (c) 2021 MediaTek Inc. + * Author: Christine Zhu + */ + +#ifndef _DT_BINDINGS_RESET_CONTROLLER_MT8195 +#define _DT_BINDINGS_RESET_CONTROLLER_MT8195 + +#define MT8195_TOPRGU_CONN_MCU_SW_RST 0 +#define MT8195_TOPRGU_INFRA_GRST_SW_RST 1 +#define MT8195_TOPRGU_APU_SW_RST 2 +#define MT8195_TOPRGU_INFRA_AO_GRST_SW_RST 6 +#define MT8195_TOPRGU_MMSYS_SW_RST 7 +#define MT8195_TOPRGU_MFG_SW_RST 8 +#define MT8195_TOPRGU_VENC_SW_RST 9 +#define MT8195_TOPRGU_VDEC_SW_RST 10 +#define MT8195_TOPRGU_IMG_SW_RST 11 +#define MT8195_TOPRGU_APMIXEDSYS_SW_RST 13 +#define MT8195_TOPRGU_AUDIO_SW_RST 14 +#define MT8195_TOPRGU_CAMSYS_SW_RST 15 +#define MT8195_TOPRGU_EDPTX_SW_RST 16 +#define MT8195_TOPRGU_ADSPSYS_SW_RST 21 +#define MT8195_TOPRGU_DPTX_SW_RST 22 +#define MT8195_TOPRGU_SPMI_MST_SW_RST 23 + +#define MT8195_TOPRGU_SW_RST_NUM 16 + +#endif /* _DT_BINDINGS_RESET_CONTROLLER_MT8195 */ diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h index 136b8d97d8c0..0d7865a0731c 100644 --- a/include/linux/entry-kvm.h +++ b/include/linux/entry-kvm.h @@ -2,7 +2,11 @@ #ifndef __LINUX_ENTRYKVM_H #define __LINUX_ENTRYKVM_H -#include +#include +#include +#include +#include +#include #include /* Transfer to guest mode work */ diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 330345b1be54..928c411bd509 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -299,6 +299,18 @@ static inline void ether_addr_copy(u8 *dst, const u8 *src) #endif } +/** + * eth_hw_addr_set - Assign Ethernet address to a net_device + * @dev: pointer to net_device structure + * @addr: address to assign + * + * Assign given address to the net_device, addr_assign_type is not changed. + */ +static inline void eth_hw_addr_set(struct net_device *dev, const u8 *addr) +{ + ether_addr_copy(dev->dev_addr, addr); +} + /** * eth_hw_addr_inherit - Copy dev_addr from another net_device * @dst: pointer to net_device to copy dev_addr to diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index ae7735b490b4..041ca7f15ea4 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,6 +150,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 +#define KVM_REQ_VM_BUGGED (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ @@ -158,6 +159,15 @@ static inline bool is_error_page(struct page *page) }) #define KVM_ARCH_REQ(nr) KVM_ARCH_REQ_FLAGS(nr, 0) +bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except, + unsigned long *vcpu_bitmap, cpumask_var_t tmp); +bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); +bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, + struct kvm_vcpu *except); +bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, + unsigned long *vcpu_bitmap); + #define KVM_USERSPACE_IRQ_SOURCE_ID 0 #define KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID 1 @@ -344,6 +354,13 @@ struct kvm_vcpu { struct kvm_vcpu_stat stat; char stats_id[KVM_STATS_NAME_SIZE]; struct kvm_dirty_ring dirty_ring; + + /* + * The index of the most recently used memslot by this vCPU. It's ok + * if this becomes stale due to memslot changes since we always check + * it is a valid slot. + */ + int last_used_slot; }; /* must be called with irqs disabled */ @@ -512,7 +529,7 @@ struct kvm_memslots { u64 generation; /* The mapping table from slot id to the index in memslots[]. */ short id_to_index[KVM_MEM_SLOTS_NUM]; - atomic_t lru_slot; + atomic_t last_used_slot; int used_slots; struct kvm_memory_slot memslots[]; }; @@ -538,6 +555,11 @@ struct kvm { struct kvm_memslots __rcu *memslots[KVM_ADDRESS_SPACE_NUM]; struct kvm_vcpu *vcpus[KVM_MAX_VCPUS]; + /* Used to wait for completion of MMU notifiers. */ + spinlock_t mn_invalidate_lock; + unsigned long mn_active_invalidate_count; + struct rcuwait mn_memslots_update_rcuwait; + /* * created_vcpus is protected by kvm->lock, and is incremented * at the beginning of KVM_CREATE_VCPU. online_vcpus is only @@ -596,6 +618,7 @@ struct kvm { pid_t userspace_pid; unsigned int max_halt_poll_ns; u32 dirty_ring_size; + bool vm_bugged; #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; @@ -629,6 +652,30 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +static inline void kvm_vm_bugged(struct kvm *kvm) +{ + kvm->vm_bugged = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED); +} + +#define KVM_BUG(cond, kvm, fmt...) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ONCE(__ret && !(kvm)->vm_bugged, fmt)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + +#define KVM_BUG_ON(cond, kvm) \ +({ \ + int __ret = (cond); \ + \ + if (WARN_ON_ONCE(__ret && !(kvm)->vm_bugged)) \ + kvm_vm_bugged(kvm); \ + unlikely(__ret); \ +}) + static inline bool kvm_dirty_log_manual_protect_and_init_set(struct kvm *kvm) { return !!(kvm->manual_dirty_log_protect & KVM_DIRTY_LOG_INITIALLY_SET); @@ -720,6 +767,7 @@ int kvm_init(void *opaque, unsigned vcpu_size, unsigned vcpu_align, void kvm_exit(void); void kvm_get_kvm(struct kvm *kvm); +bool kvm_get_kvm_safe(struct kvm *kvm); void kvm_put_kvm(struct kvm *kvm); bool file_is_kvm(struct file *file); void kvm_put_kvm_no_destroy(struct kvm *kvm); @@ -824,7 +872,6 @@ void kvm_release_pfn_clean(kvm_pfn_t pfn); void kvm_release_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_dirty(kvm_pfn_t pfn); void kvm_set_pfn_accessed(kvm_pfn_t pfn); -void kvm_get_pfn(kvm_pfn_t pfn); void kvm_release_pfn(kvm_pfn_t pfn, bool dirty, struct gfn_to_pfn_cache *cache); int kvm_read_guest_page(struct kvm *kvm, gfn_t gfn, void *data, int offset, @@ -943,14 +990,10 @@ void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc); void *kvm_mmu_memory_cache_alloc(struct kvm_mmu_memory_cache *mc); #endif -bool kvm_make_vcpus_request_mask(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except, - unsigned long *vcpu_bitmap, cpumask_var_t tmp); -bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req); -bool kvm_make_all_cpus_request_except(struct kvm *kvm, unsigned int req, - struct kvm_vcpu *except); -bool kvm_make_cpus_request_mask(struct kvm *kvm, unsigned int req, - unsigned long *vcpu_bitmap); +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, + unsigned long end); long kvm_arch_dev_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg); @@ -1034,6 +1077,7 @@ bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu); bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu); int kvm_arch_post_init_vm(struct kvm *kvm); void kvm_arch_pre_destroy_vm(struct kvm *kvm); +int kvm_arch_create_vm_debugfs(struct kvm *kvm); #ifndef __KVM_HAVE_ARCH_VM_ALLOC /* @@ -1157,29 +1201,49 @@ void kvm_free_irq_source_id(struct kvm *kvm, int irq_source_id); bool kvm_arch_irqfd_allowed(struct kvm *kvm, struct kvm_irqfd *args); /* - * search_memslots() and __gfn_to_memslot() are here because they are - * used in non-modular code in arch/powerpc/kvm/book3s_hv_rm_mmu.c. - * gfn_to_memslot() itself isn't here as an inline because that would - * bloat other code too much. + * Returns a pointer to the memslot at slot_index if it contains gfn. + * Otherwise returns NULL. + */ +static inline struct kvm_memory_slot * +try_get_memslot(struct kvm_memslots *slots, int slot_index, gfn_t gfn) +{ + struct kvm_memory_slot *slot; + + if (slot_index < 0 || slot_index >= slots->used_slots) + return NULL; + + /* + * slot_index can come from vcpu->last_used_slot which is not kept + * in sync with userspace-controllable memslot deletion. So use nospec + * to prevent the CPU from speculating past the end of memslots[]. + */ + slot_index = array_index_nospec(slot_index, slots->used_slots); + slot = &slots->memslots[slot_index]; + + if (gfn >= slot->base_gfn && gfn < slot->base_gfn + slot->npages) + return slot; + else + return NULL; +} + +/* + * Returns a pointer to the memslot that contains gfn and records the index of + * the slot in index. Otherwise returns NULL. * * IMPORTANT: Slots are sorted from highest GFN to lowest GFN! */ static inline struct kvm_memory_slot * -search_memslots(struct kvm_memslots *slots, gfn_t gfn) +search_memslots(struct kvm_memslots *slots, gfn_t gfn, int *index) { int start = 0, end = slots->used_slots; - int slot = atomic_read(&slots->lru_slot); struct kvm_memory_slot *memslots = slots->memslots; + struct kvm_memory_slot *slot; if (unlikely(!slots->used_slots)) return NULL; - if (gfn >= memslots[slot].base_gfn && - gfn < memslots[slot].base_gfn + memslots[slot].npages) - return &memslots[slot]; - while (start < end) { - slot = start + (end - start) / 2; + int slot = start + (end - start) / 2; if (gfn >= memslots[slot].base_gfn) end = slot; @@ -1187,19 +1251,37 @@ search_memslots(struct kvm_memslots *slots, gfn_t gfn) start = slot + 1; } - if (start < slots->used_slots && gfn >= memslots[start].base_gfn && - gfn < memslots[start].base_gfn + memslots[start].npages) { - atomic_set(&slots->lru_slot, start); - return &memslots[start]; + slot = try_get_memslot(slots, start, gfn); + if (slot) { + *index = start; + return slot; } return NULL; } +/* + * __gfn_to_memslot() and its descendants are here because it is called from + * non-modular code in arch/powerpc/kvm/book3s_64_vio{,_hv}.c. gfn_to_memslot() + * itself isn't here as an inline because that would bloat other code too much. + */ static inline struct kvm_memory_slot * __gfn_to_memslot(struct kvm_memslots *slots, gfn_t gfn) { - return search_memslots(slots, gfn); + struct kvm_memory_slot *slot; + int slot_index = atomic_read(&slots->last_used_slot); + + slot = try_get_memslot(slots, slot_index, gfn); + if (slot) + return slot; + + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + atomic_set(&slots->last_used_slot, slot_index); + return slot; + } + + return NULL; } static inline unsigned long @@ -1273,56 +1355,66 @@ struct _kvm_stats_desc { char name[KVM_STATS_NAME_SIZE]; }; -#define STATS_DESC_COMMON(type, unit, base, exp) \ +#define STATS_DESC_COMMON(type, unit, base, exp, sz, bsz) \ .flags = type | unit | base | \ BUILD_BUG_ON_ZERO(type & ~KVM_STATS_TYPE_MASK) | \ BUILD_BUG_ON_ZERO(unit & ~KVM_STATS_UNIT_MASK) | \ BUILD_BUG_ON_ZERO(base & ~KVM_STATS_BASE_MASK), \ .exponent = exp, \ - .size = 1 + .size = sz, \ + .bucket_size = bsz -#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_GENERIC_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, generic.stat) \ }, \ .name = #stat, \ } -#define VM_STATS_DESC(stat, type, unit, base, exp) \ +#define VM_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vm_stat, stat) \ }, \ .name = #stat, \ } -#define VCPU_STATS_DESC(stat, type, unit, base, exp) \ +#define VCPU_STATS_DESC(stat, type, unit, base, exp, sz, bsz) \ { \ { \ - STATS_DESC_COMMON(type, unit, base, exp), \ + STATS_DESC_COMMON(type, unit, base, exp, sz, bsz), \ .offset = offsetof(struct kvm_vcpu_stat, stat) \ }, \ .name = #stat, \ } /* SCOPE: VM, VM_GENERIC, VCPU, VCPU_GENERIC */ -#define STATS_DESC(SCOPE, stat, type, unit, base, exp) \ - SCOPE##_STATS_DESC(stat, type, unit, base, exp) +#define STATS_DESC(SCOPE, stat, type, unit, base, exp, sz, bsz) \ + SCOPE##_STATS_DESC(stat, type, unit, base, exp, sz, bsz) #define STATS_DESC_CUMULATIVE(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_CUMULATIVE, \ + unit, base, exponent, 1, 0) #define STATS_DESC_INSTANT(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_INSTANT, \ + unit, base, exponent, 1, 0) #define STATS_DESC_PEAK(SCOPE, name, unit, base, exponent) \ - STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, unit, base, exponent) + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_PEAK, \ + unit, base, exponent, 1, 0) +#define STATS_DESC_LINEAR_HIST(SCOPE, name, unit, base, exponent, sz, bsz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LINEAR_HIST, \ + unit, base, exponent, sz, bsz) +#define STATS_DESC_LOG_HIST(SCOPE, name, unit, base, exponent, sz) \ + STATS_DESC(SCOPE, name, KVM_STATS_TYPE_LOG_HIST, \ + unit, base, exponent, sz, 0) /* Cumulative counter, read/write */ #define STATS_DESC_COUNTER(SCOPE, name) \ @@ -1341,9 +1433,18 @@ struct _kvm_stats_desc { #define STATS_DESC_TIME_NSEC(SCOPE, name) \ STATS_DESC_CUMULATIVE(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ KVM_STATS_BASE_POW10, -9) +/* Linear histogram for time in nanosecond */ +#define STATS_DESC_LINHIST_TIME_NSEC(SCOPE, name, sz, bsz) \ + STATS_DESC_LINEAR_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz, bsz) +/* Logarithmic histogram for time in nanosecond */ +#define STATS_DESC_LOGHIST_TIME_NSEC(SCOPE, name, sz) \ + STATS_DESC_LOG_HIST(SCOPE, name, KVM_STATS_UNIT_SECONDS, \ + KVM_STATS_BASE_POW10, -9, sz) #define KVM_GENERIC_VM_STATS() \ - STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush) + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush), \ + STATS_DESC_COUNTER(VM_GENERIC, remote_tlb_flush_requests) #define KVM_GENERIC_VCPU_STATS() \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_successful_poll), \ @@ -1351,13 +1452,62 @@ struct _kvm_stats_desc { STATS_DESC_COUNTER(VCPU_GENERIC, halt_poll_invalid), \ STATS_DESC_COUNTER(VCPU_GENERIC, halt_wakeup), \ STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_success_ns), \ - STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns) + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_ns), \ + STATS_DESC_TIME_NSEC(VCPU_GENERIC, halt_wait_ns), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_success_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_poll_fail_hist, \ + HALT_POLL_HIST_COUNT), \ + STATS_DESC_LOGHIST_TIME_NSEC(VCPU_GENERIC, halt_wait_hist, \ + HALT_POLL_HIST_COUNT) extern struct dentry *kvm_debugfs_dir; + ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, const struct _kvm_stats_desc *desc, void *stats, size_t size_stats, char __user *user_buffer, size_t size, loff_t *offset); + +/** + * kvm_stats_linear_hist_update() - Update bucket value for linear histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the linear histogram's bucket + * @bucket_size: the size (width) of a bucket + */ +static inline void kvm_stats_linear_hist_update(u64 *data, size_t size, + u64 value, size_t bucket_size) +{ + size_t index = div64_u64(value, bucket_size); + + index = min(index, size - 1); + ++data[index]; +} + +/** + * kvm_stats_log_hist_update() - Update bucket value for logarithmic histogram + * statistics data. + * + * @data: start address of the stats data + * @size: the number of bucket of the stats data + * @value: the new value used to update the logarithmic histogram's bucket + */ +static inline void kvm_stats_log_hist_update(u64 *data, size_t size, u64 value) +{ + size_t index = fls64(value); + + index = min(index, size - 1); + ++data[index]; +} + +#define KVM_STATS_LINEAR_HIST_UPDATE(array, value, bsize) \ + kvm_stats_linear_hist_update(array, ARRAY_SIZE(array), value, bsize) +#define KVM_STATS_LOG_HIST_UPDATE(array, value) \ + kvm_stats_log_hist_update(array, ARRAY_SIZE(array), value) + + extern const struct kvm_stats_header kvm_vm_stats_header; extern const struct _kvm_stats_desc kvm_vm_stats_desc[]; extern const struct kvm_stats_header kvm_vcpu_stats_header; diff --git a/include/linux/kvm_types.h b/include/linux/kvm_types.h index ed6a985c5680..2237abb93ccd 100644 --- a/include/linux/kvm_types.h +++ b/include/linux/kvm_types.h @@ -76,8 +76,11 @@ struct kvm_mmu_memory_cache { }; #endif +#define HALT_POLL_HIST_COUNT 32 + struct kvm_vm_stat_generic { u64 remote_tlb_flush; + u64 remote_tlb_flush_requests; }; struct kvm_vcpu_stat_generic { @@ -87,6 +90,10 @@ struct kvm_vcpu_stat_generic { u64 halt_wakeup; u64 halt_poll_success_ns; u64 halt_poll_fail_ns; + u64 halt_wait_ns; + u64 halt_poll_success_hist[HALT_POLL_HIST_COUNT]; + u64 halt_poll_fail_hist[HALT_POLL_HIST_COUNT]; + u64 halt_wait_hist[HALT_POLL_HIST_COUNT]; }; #define KVM_STATS_NAME_SIZE 48 diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 7c41593c1d6a..d79163208dfd 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -4641,6 +4641,24 @@ void __hw_addr_unsync_dev(struct netdev_hw_addr_list *list, void __hw_addr_init(struct netdev_hw_addr_list *list); /* Functions used for device addresses handling */ +static inline void +__dev_addr_set(struct net_device *dev, const u8 *addr, size_t len) +{ + memcpy(dev->dev_addr, addr, len); +} + +static inline void dev_addr_set(struct net_device *dev, const u8 *addr) +{ + __dev_addr_set(dev, addr, dev->addr_len); +} + +static inline void +dev_addr_mod(struct net_device *dev, unsigned int offset, + const u8 *addr, size_t len) +{ + memcpy(&dev->dev_addr[offset], addr, len); +} + int dev_addr_add(struct net_device *dev, const unsigned char *addr, unsigned char addr_type); int dev_addr_del(struct net_device *dev, const unsigned char *addr, diff --git a/include/linux/netfilter/nf_conntrack_common.h b/include/linux/netfilter/nf_conntrack_common.h index 0c7d8d1e945d..700ea077ce2d 100644 --- a/include/linux/netfilter/nf_conntrack_common.h +++ b/include/linux/netfilter/nf_conntrack_common.h @@ -18,6 +18,7 @@ struct ip_conntrack_stat { unsigned int expect_create; unsigned int expect_delete; unsigned int search_restart; + unsigned int chaintoolong; }; #define NFCT_INFOMASK 7UL diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 5922031ffab6..1ace27c4a8e0 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -632,43 +632,6 @@ static inline int PageTransCompound(struct page *page) return PageCompound(page); } -/* - * PageTransCompoundMap is the same as PageTransCompound, but it also - * guarantees the primary MMU has the entire compound page mapped - * through pmd_trans_huge, which in turn guarantees the secondary MMUs - * can also map the entire compound page. This allows the secondary - * MMUs to call get_user_pages() only once for each compound page and - * to immediately map the entire compound page with a single secondary - * MMU fault. If there will be a pmd split later, the secondary MMUs - * will get an update through the MMU notifier invalidation through - * split_huge_pmd(). - * - * Unlike PageTransCompound, this is safe to be called only while - * split_huge_pmd() cannot run from under us, like if protected by the - * MMU notifier, otherwise it may result in page->_mapcount check false - * positives. - * - * We have to treat page cache THP differently since every subpage of it - * would get _mapcount inc'ed once it is PMD mapped. But, it may be PTE - * mapped in the current process so comparing subpage's _mapcount to - * compound_mapcount to filter out PTE mapped case. - */ -static inline int PageTransCompoundMap(struct page *page) -{ - struct page *head; - - if (!PageTransCompound(page)) - return 0; - - if (PageAnon(page)) - return atomic_read(&page->_mapcount) < 0; - - head = compound_head(page); - /* File THP is PMD mapped and not PTE mapped */ - return atomic_read(&page->_mapcount) == - atomic_read(compound_mapcount_ptr(head)); -} - /* * PageTransTail returns true for both transparent huge pages * and hugetlbfs pages, so it should only be called when it's known diff --git a/include/linux/phylink.h b/include/linux/phylink.h index afb3ded0b691..237291196ce2 100644 --- a/include/linux/phylink.h +++ b/include/linux/phylink.h @@ -451,6 +451,9 @@ void phylink_mac_change(struct phylink *, bool up); void phylink_start(struct phylink *); void phylink_stop(struct phylink *); +void phylink_suspend(struct phylink *pl, bool mac_wol); +void phylink_resume(struct phylink *pl); + void phylink_ethtool_get_wol(struct phylink *, struct ethtool_wolinfo *); int phylink_ethtool_set_wol(struct phylink *, struct ethtool_wolinfo *); diff --git a/include/linux/remoteproc.h b/include/linux/remoteproc.h index a5b37bc10865..83c09ac36b13 100644 --- a/include/linux/remoteproc.h +++ b/include/linux/remoteproc.h @@ -369,9 +369,8 @@ enum rsc_handling_status { * @da_to_va: optional platform hook to perform address translations * @parse_fw: parse firmware to extract information (e.g. resource table) * @handle_rsc: optional platform hook to handle vendor resources. Should return - * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled and a - * negative value on error - * @load_rsc_table: load resource table from firmware image + * RSC_HANDLED if resource was handled, RSC_IGNORED if not handled + * and a negative value on error * @find_loaded_rsc_table: find the loaded resource table from firmware image * @get_loaded_rsc_table: get resource table installed in memory * by external entity diff --git a/include/linux/soc/marvell/octeontx2/asm.h b/include/linux/soc/marvell/octeontx2/asm.h index 28c04d918f0f..fa1d6af0164e 100644 --- a/include/linux/soc/marvell/octeontx2/asm.h +++ b/include/linux/soc/marvell/octeontx2/asm.h @@ -22,12 +22,17 @@ : [rs]"r" (ioaddr)); \ (result); \ }) +/* + * STEORL store to memory with release semantics. + * This will avoid using DMB barrier after each LMTST + * operation. + */ #define cn10k_lmt_flush(val, addr) \ ({ \ __asm__ volatile(".cpu generic+lse\n" \ - "steor %x[rf],[%[rs]]" \ - : [rf]"+r"(val) \ - : [rs]"r"(addr)); \ + "steorl %x[rf],[%[rs]]" \ + : [rf] "+r"(val) \ + : [rs] "r"(addr)); \ }) #else #define otx2_lmt_flush(ioaddr) ({ 0; }) diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h index 9b19e6bb68b5..99660197a36c 100644 --- a/include/linux/watchdog.h +++ b/include/linux/watchdog.h @@ -107,6 +107,7 @@ struct watchdog_device { unsigned int max_hw_heartbeat_ms; struct notifier_block reboot_nb; struct notifier_block restart_nb; + struct notifier_block pm_nb; void *driver_data; struct watchdog_core_data *wd_data; unsigned long status; @@ -116,6 +117,7 @@ struct watchdog_device { #define WDOG_STOP_ON_REBOOT 2 /* Should be stopped on reboot */ #define WDOG_HW_RUNNING 3 /* True if HW watchdog running */ #define WDOG_STOP_ON_UNREGISTER 4 /* Should be stopped on unregister */ +#define WDOG_NO_PING_ON_SUSPEND 5 /* Ping worker should be stopped on suspend */ struct list_head deferred; }; @@ -156,6 +158,12 @@ static inline void watchdog_stop_on_unregister(struct watchdog_device *wdd) set_bit(WDOG_STOP_ON_UNREGISTER, &wdd->status); } +/* Use the following function to stop the wdog ping worker when suspending */ +static inline void watchdog_stop_ping_on_suspend(struct watchdog_device *wdd) +{ + set_bit(WDOG_NO_PING_ON_SUSPEND, &wdd->status); +} + /* Use the following function to check if a timeout value is invalid */ static inline bool watchdog_timeout_invalid(struct watchdog_device *wdd, unsigned int t) { @@ -209,6 +217,8 @@ extern int watchdog_init_timeout(struct watchdog_device *wdd, unsigned int timeout_parm, struct device *dev); extern int watchdog_register_device(struct watchdog_device *); extern void watchdog_unregister_device(struct watchdog_device *); +int watchdog_dev_suspend(struct watchdog_device *wdd); +int watchdog_dev_resume(struct watchdog_device *wdd); int watchdog_set_last_hw_keepalive(struct watchdog_device *, unsigned int); diff --git a/include/net/flow.h b/include/net/flow.h index 6f5e70240071..58beb16a49b8 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -194,7 +194,7 @@ static inline struct flowi *flowi4_to_flowi(struct flowi4 *fl4) static inline struct flowi_common *flowi4_to_flowi_common(struct flowi4 *fl4) { - return &(flowi4_to_flowi(fl4)->u.__fl_common); + return &(fl4->__fl_common); } static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) @@ -204,7 +204,7 @@ static inline struct flowi *flowi6_to_flowi(struct flowi6 *fl6) static inline struct flowi_common *flowi6_to_flowi_common(struct flowi6 *fl6) { - return &(flowi6_to_flowi(fl6)->u.__fl_common); + return &(fl6->__fl_common); } static inline struct flowi *flowidn_to_flowi(struct flowidn *fldn) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index d9e4aabcb31a..a067410ebea5 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1965,7 +1965,9 @@ struct kvm_stats_header { #define KVM_STATS_TYPE_CUMULATIVE (0x0 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_INSTANT (0x1 << KVM_STATS_TYPE_SHIFT) #define KVM_STATS_TYPE_PEAK (0x2 << KVM_STATS_TYPE_SHIFT) -#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_PEAK +#define KVM_STATS_TYPE_LINEAR_HIST (0x3 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_LOG_HIST (0x4 << KVM_STATS_TYPE_SHIFT) +#define KVM_STATS_TYPE_MAX KVM_STATS_TYPE_LOG_HIST #define KVM_STATS_UNIT_SHIFT 4 #define KVM_STATS_UNIT_MASK (0xF << KVM_STATS_UNIT_SHIFT) @@ -1988,8 +1990,9 @@ struct kvm_stats_header { * @size: The number of data items for this stats. * Every data item is of type __u64. * @offset: The offset of the stats to the start of stat structure in - * struture kvm or kvm_vcpu. - * @unused: Unused field for future usage. Always 0 for now. + * structure kvm or kvm_vcpu. + * @bucket_size: A parameter value used for histogram stats. It is only used + * for linear histogram stats, specifying the size of the bucket; * @name: The name string for the stats. Its size is indicated by the * &kvm_stats_header->name_size. */ @@ -1998,7 +2001,7 @@ struct kvm_stats_desc { __s16 exponent; __u16 size; __u32 offset; - __u32 unused; + __u32 bucket_size; char name[]; }; diff --git a/include/uapi/linux/netfilter/nfnetlink_conntrack.h b/include/uapi/linux/netfilter/nfnetlink_conntrack.h index c6e6d7d7d538..c2ac7269acf7 100644 --- a/include/uapi/linux/netfilter/nfnetlink_conntrack.h +++ b/include/uapi/linux/netfilter/nfnetlink_conntrack.h @@ -258,6 +258,7 @@ enum ctattr_stats_cpu { CTA_STATS_ERROR, CTA_STATS_SEARCH_RESTART, CTA_STATS_CLASH_RESOLVE, + CTA_STATS_CHAIN_TOOLONG, __CTA_STATS_MAX, }; #define CTA_STATS_MAX (__CTA_STATS_MAX - 1) diff --git a/include/uapi/linux/pkt_sched.h b/include/uapi/linux/pkt_sched.h index 79a699f106b1..ec88590b3198 100644 --- a/include/uapi/linux/pkt_sched.h +++ b/include/uapi/linux/pkt_sched.h @@ -827,6 +827,8 @@ struct tc_codel_xstats { /* FQ_CODEL */ +#define FQ_CODEL_QUANTUM_MAX (1 << 20) + enum { TCA_FQ_CODEL_UNSPEC, TCA_FQ_CODEL_TARGET, diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 9231617a16e4..3523c8c7068f 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -4255,7 +4255,7 @@ int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, bool del = false; brmctx = br_multicast_port_ctx_get_global(pmctx); - spin_lock(&brmctx->br->multicast_lock); + spin_lock_bh(&brmctx->br->multicast_lock); if (pmctx->multicast_router == val) { /* Refresh the temp router port timer */ if (pmctx->multicast_router == MDB_RTR_TYPE_TEMP) { @@ -4305,7 +4305,7 @@ int br_multicast_set_port_router(struct net_bridge_mcast_port *pmctx, } err = 0; unlock: - spin_unlock(&brmctx->br->multicast_lock); + spin_unlock_bh(&brmctx->br->multicast_lock); return err; } diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 9e5a3249373c..a3d74e2704c4 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3602,7 +3602,6 @@ out: static int pktgen_thread_worker(void *arg) { - DEFINE_WAIT(wait); struct pktgen_thread *t = arg; struct pktgen_dev *pkt_dev = NULL; int cpu = t->cpu; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 9ae9f7258826..d4f9a80ec79a 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3886,7 +3886,7 @@ struct sk_buff *skb_segment_list(struct sk_buff *skb, skb_push(nskb, -skb_network_offset(nskb) + offset); skb_release_head_state(nskb); - __copy_skb_header(nskb, skb); + __copy_skb_header(nskb, skb); skb_headers_offset_update(nskb, skb_headroom(nskb) - skb_headroom(skb)); skb_copy_from_linear_data_offset(skb, -tnl_hlen, diff --git a/net/dsa/tag_rtl4_a.c b/net/dsa/tag_rtl4_a.c index 40811bab4d09..f920487ae145 100644 --- a/net/dsa/tag_rtl4_a.c +++ b/net/dsa/tag_rtl4_a.c @@ -54,9 +54,10 @@ static struct sk_buff *rtl4a_tag_xmit(struct sk_buff *skb, p = (__be16 *)tag; *p = htons(RTL4_A_ETHERTYPE); - out = (RTL4_A_PROTOCOL_RTL8366RB << 12) | (2 << 8); - /* The lower bits is the port number */ - out |= (u8)dp->index; + out = (RTL4_A_PROTOCOL_RTL8366RB << RTL4_A_PROTOCOL_SHIFT) | (2 << 8); + /* The lower bits indicate the port number */ + out |= BIT(dp->index); + p = (__be16 *)(tag + 2); *p = htons(out); diff --git a/net/ipv4/cipso_ipv4.c b/net/ipv4/cipso_ipv4.c index 7fbd0b532f52..099259fc826a 100644 --- a/net/ipv4/cipso_ipv4.c +++ b/net/ipv4/cipso_ipv4.c @@ -465,16 +465,14 @@ void cipso_v4_doi_free(struct cipso_v4_doi *doi_def) if (!doi_def) return; - if (doi_def->map.std) { - switch (doi_def->type) { - case CIPSO_V4_MAP_TRANS: - kfree(doi_def->map.std->lvl.cipso); - kfree(doi_def->map.std->lvl.local); - kfree(doi_def->map.std->cat.cipso); - kfree(doi_def->map.std->cat.local); - kfree(doi_def->map.std); - break; - } + switch (doi_def->type) { + case CIPSO_V4_MAP_TRANS: + kfree(doi_def->map.std->lvl.cipso); + kfree(doi_def->map.std->lvl.local); + kfree(doi_def->map.std->cat.cipso); + kfree(doi_def->map.std->cat.local); + kfree(doi_def->map.std); + break; } kfree(doi_def); } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 177d26d8fb9c..0fe6c936dc54 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -473,8 +473,6 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev, static int gre_handle_offloads(struct sk_buff *skb, bool csum) { - if (csum && skb_checksum_start(skb) < skb->data) - return -EINVAL; return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } @@ -632,15 +630,20 @@ static netdev_tx_t ipgre_xmit(struct sk_buff *skb, } if (dev->header_ops) { + const int pull_len = tunnel->hlen + sizeof(struct iphdr); + if (skb_cow_head(skb, 0)) goto free_skb; tnl_params = (const struct iphdr *)skb->data; + if (pull_len > skb_transport_offset(skb)) + goto free_skb; + /* Pull skb since ip_tunnel_xmit() needs skb->data pointing * to gre header. */ - skb_pull(skb, tunnel->hlen + sizeof(struct iphdr)); + skb_pull(skb, pull_len); skb_reset_mac_header(skb); } else { if (skb_cow_head(skb, dev->needed_headroom)) diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 4075230b14c6..75ca4b6e484f 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2490,6 +2490,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, .fc_gw4 = cfg->gw.ipv4, .fc_gw_family = cfg->gw.ipv4 ? AF_INET : 0, .fc_flags = cfg->nh_flags, + .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, }; @@ -2528,6 +2529,7 @@ static int nh_create_ipv6(struct net *net, struct nexthop *nh, .fc_ifindex = cfg->nh_ifindex, .fc_gateway = cfg->gw.ipv6, .fc_flags = cfg->nh_flags, + .fc_nlinfo = cfg->nlinfo, .fc_encap = cfg->nh_encap, .fc_encap_type = cfg->nh_encap_type, .fc_is_fdb = cfg->nh_fdb, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 9ca2ea4a2028..26ca9dc132ae 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3114,19 +3114,22 @@ static void add_addr(struct inet6_dev *idev, const struct in6_addr *addr, } } -#if IS_ENABLED(CONFIG_IPV6_SIT) -static void sit_add_v4_addrs(struct inet6_dev *idev) +#if IS_ENABLED(CONFIG_IPV6_SIT) || IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) +static void add_v4_addrs(struct inet6_dev *idev) { struct in6_addr addr; struct net_device *dev; struct net *net = dev_net(idev->dev); - int scope, plen; + int scope, plen, offset = 0; u32 pflags = 0; ASSERT_RTNL(); memset(&addr, 0, sizeof(struct in6_addr)); - memcpy(&addr.s6_addr32[3], idev->dev->dev_addr, 4); + /* in case of IP6GRE the dev_addr is an IPv6 and therefore we use only the last 4 bytes */ + if (idev->dev->addr_len == sizeof(struct in6_addr)) + offset = sizeof(struct in6_addr) - 4; + memcpy(&addr.s6_addr32[3], idev->dev->dev_addr + offset, 4); if (idev->dev->flags&IFF_POINTOPOINT) { addr.s6_addr32[0] = htonl(0xfe800000); @@ -3364,8 +3367,6 @@ static void addrconf_dev_config(struct net_device *dev) (dev->type != ARPHRD_IEEE1394) && (dev->type != ARPHRD_TUNNEL6) && (dev->type != ARPHRD_6LOWPAN) && - (dev->type != ARPHRD_IP6GRE) && - (dev->type != ARPHRD_IPGRE) && (dev->type != ARPHRD_TUNNEL) && (dev->type != ARPHRD_NONE) && (dev->type != ARPHRD_RAWIP)) { @@ -3413,14 +3414,14 @@ static void addrconf_sit_config(struct net_device *dev) return; } - sit_add_v4_addrs(idev); + add_v4_addrs(idev); if (dev->flags&IFF_POINTOPOINT) addrconf_add_mroute(dev); } #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) static void addrconf_gre_config(struct net_device *dev) { struct inet6_dev *idev; @@ -3433,7 +3434,13 @@ static void addrconf_gre_config(struct net_device *dev) return; } - addrconf_addr_gen(idev, true); + if (dev->type == ARPHRD_ETHER) { + addrconf_addr_gen(idev, true); + return; + } + + add_v4_addrs(idev); + if (dev->flags & IFF_POINTOPOINT) addrconf_add_mroute(dev); } @@ -3609,7 +3616,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, addrconf_sit_config(dev); break; #endif -#if IS_ENABLED(CONFIG_NET_IPGRE) +#if IS_ENABLED(CONFIG_NET_IPGRE) || IS_ENABLED(CONFIG_IPV6_GRE) + case ARPHRD_IP6GRE: case ARPHRD_IPGRE: addrconf_gre_config(dev); break; diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 7baf41d160f5..3ad201d372d8 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -629,8 +629,6 @@ drop: static int gre_handle_offloads(struct sk_buff *skb, bool csum) { - if (csum && skb_checksum_start(skb) < skb->data) - return -EINVAL; return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE); } diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index cd951faa2fac..bed8155508c8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -1356,8 +1356,8 @@ static int mld_process_v1(struct inet6_dev *idev, struct mld_msg *mld, return 0; } -static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, - unsigned long *max_delay) +static void mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, + unsigned long *max_delay) { *max_delay = max(msecs_to_jiffies(mldv2_mrc(mld)), 1UL); @@ -1367,7 +1367,7 @@ static int mld_process_v2(struct inet6_dev *idev, struct mld2_query *mld, idev->mc_maxdelay = *max_delay; - return 0; + return; } /* called with rcu_read_lock() */ @@ -1454,9 +1454,7 @@ static void __mld_query_work(struct sk_buff *skb) mlh2 = (struct mld2_query *)skb_transport_header(skb); - err = mld_process_v2(idev, mlh2, &max_delay); - if (err < 0) - goto out; + mld_process_v2(idev, mlh2, &max_delay); if (group_type == IPV6_ADDR_ANY) { /* general query */ if (mlh2->mld2q_nsrcs) diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index 6fd54744cbc3..aa5bb8789ba0 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -99,7 +99,7 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, { __be16 dport, sport; const struct in6_addr *daddr = NULL, *saddr = NULL; - struct ipv6hdr *iph = ipv6_hdr(skb); + struct ipv6hdr *iph = ipv6_hdr(skb), ipv6_var; struct sk_buff *data_skb = NULL; int doff = 0; int thoff = 0, tproto; @@ -129,8 +129,6 @@ struct sock *nf_sk_lookup_slow_v6(struct net *net, const struct sk_buff *skb, thoff + sizeof(*hp); } else if (tproto == IPPROTO_ICMPV6) { - struct ipv6hdr ipv6_var; - if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, &sport, &dport, &ipv6_var)) return NULL; diff --git a/net/ipv6/seg6_iptunnel.c b/net/ipv6/seg6_iptunnel.c index 1bf5f5ae75ac..3adc5d9211ad 100644 --- a/net/ipv6/seg6_iptunnel.c +++ b/net/ipv6/seg6_iptunnel.c @@ -385,7 +385,7 @@ static int seg6_output_core(struct net *net, struct sock *sk, struct dst_entry *orig_dst = skb_dst(skb); struct dst_entry *dst = NULL; struct seg6_lwt *slwt; - int err = -EINVAL; + int err; err = seg6_do_srh(skb); if (unlikely(err)) diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 1cf5ac09edcb..323d3d2d986f 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -617,7 +617,7 @@ ieee802154_if_add(struct ieee802154_local *local, const char *name, { struct net_device *ndev = NULL; struct ieee802154_sub_if_data *sdata = NULL; - int ret = -ENOMEM; + int ret; ASSERT_RTNL(); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 1e4289c507ff..c4f9a5ce3815 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -644,15 +644,12 @@ void mptcp_pm_nl_addr_send_ack(struct mptcp_sock *msk) subflow = list_first_entry_or_null(&msk->conn_list, typeof(*subflow), node); if (subflow) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow; spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for %s", mptcp_pm_should_add_signal(msk) ? "add_addr" : "rm_addr"); - slow = lock_sock_fast(ssk); - tcp_send_ack(ssk); - unlock_sock_fast(ssk, slow); + mptcp_subflow_send_ack(ssk); spin_lock_bh(&msk->pm.lock); } } @@ -669,7 +666,6 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, struct sock *ssk = mptcp_subflow_tcp_sock(subflow); struct sock *sk = (struct sock *)msk; struct mptcp_addr_info local; - bool slow; local_address((struct sock_common *)ssk, &local); if (!addresses_equal(&local, addr, addr->port)) @@ -682,9 +678,7 @@ int mptcp_pm_nl_mp_prio_send_ack(struct mptcp_sock *msk, spin_unlock_bh(&msk->pm.lock); pr_debug("send ack for mp_prio"); - slow = lock_sock_fast(ssk); - tcp_send_ack(ssk); - unlock_sock_fast(ssk, slow); + mptcp_subflow_send_ack(ssk); spin_lock_bh(&msk->pm.lock); return 0; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index ade648c3512b..2602f1386160 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -440,19 +440,22 @@ static bool tcp_can_send_ack(const struct sock *ssk) (TCPF_SYN_SENT | TCPF_SYN_RECV | TCPF_TIME_WAIT | TCPF_CLOSE | TCPF_LISTEN)); } +void mptcp_subflow_send_ack(struct sock *ssk) +{ + bool slow; + + slow = lock_sock_fast(ssk); + if (tcp_can_send_ack(ssk)) + tcp_send_ack(ssk); + unlock_sock_fast(ssk, slow); +} + static void mptcp_send_ack(struct mptcp_sock *msk) { struct mptcp_subflow_context *subflow; - mptcp_for_each_subflow(msk, subflow) { - struct sock *ssk = mptcp_subflow_tcp_sock(subflow); - bool slow; - - slow = lock_sock_fast(ssk); - if (tcp_can_send_ack(ssk)) - tcp_send_ack(ssk); - unlock_sock_fast(ssk, slow); - } + mptcp_for_each_subflow(msk, subflow) + mptcp_subflow_send_ack(mptcp_subflow_tcp_sock(subflow)); } static void mptcp_subflow_cleanup_rbuf(struct sock *ssk) @@ -1003,6 +1006,13 @@ static void mptcp_wmem_uncharge(struct sock *sk, int size) msk->wmem_reserved += size; } +static void __mptcp_mem_reclaim_partial(struct sock *sk) +{ + lockdep_assert_held_once(&sk->sk_lock.slock); + __mptcp_update_wmem(sk); + sk_mem_reclaim_partial(sk); +} + static void mptcp_mem_reclaim_partial(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -1094,12 +1104,8 @@ static void __mptcp_clean_una(struct sock *sk) msk->recovery = false; out: - if (cleaned) { - if (tcp_under_memory_pressure(sk)) { - __mptcp_update_wmem(sk); - sk_mem_reclaim_partial(sk); - } - } + if (cleaned && tcp_under_memory_pressure(sk)) + __mptcp_mem_reclaim_partial(sk); if (snd_una == READ_ONCE(msk->snd_nxt) && !msk->recovery) { if (mptcp_timer_pending(sk) && !mptcp_data_fin_enabled(msk)) @@ -1179,6 +1185,7 @@ struct mptcp_sendmsg_info { u16 limit; u16 sent; unsigned int flags; + bool data_lock_held; }; static int mptcp_check_allowed_size(struct mptcp_sock *msk, u64 data_seq, @@ -1250,17 +1257,17 @@ static bool __mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, gfp_t gfp) return false; } -static bool mptcp_must_reclaim_memory(struct sock *sk, struct sock *ssk) +static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk, bool data_lock_held) { - return !ssk->sk_tx_skb_cache && - tcp_under_memory_pressure(sk); -} + gfp_t gfp = data_lock_held ? GFP_ATOMIC : sk->sk_allocation; -static bool mptcp_alloc_tx_skb(struct sock *sk, struct sock *ssk) -{ - if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) - mptcp_mem_reclaim_partial(sk); - return __mptcp_alloc_tx_skb(sk, ssk, sk->sk_allocation); + if (unlikely(tcp_under_memory_pressure(sk))) { + if (data_lock_held) + __mptcp_mem_reclaim_partial(sk); + else + mptcp_mem_reclaim_partial(sk); + } + return __mptcp_alloc_tx_skb(sk, ssk, gfp); } /* note: this always recompute the csum on the whole skb, even @@ -1284,7 +1291,7 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, bool zero_window_probe = false; struct mptcp_ext *mpext = NULL; struct sk_buff *skb, *tail; - bool can_collapse = false; + bool must_collapse = false; int size_bias = 0; int avail_size; size_t ret = 0; @@ -1304,16 +1311,24 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, * SSN association set here */ mpext = skb_ext_find(skb, SKB_EXT_MPTCP); - can_collapse = (info->size_goal - skb->len > 0) && - mptcp_skb_can_collapse_to(data_seq, skb, mpext); - if (!can_collapse) { + if (!mptcp_skb_can_collapse_to(data_seq, skb, mpext)) { TCP_SKB_CB(skb)->eor = 1; - } else { + goto alloc_skb; + } + + must_collapse = (info->size_goal - skb->len > 0) && + (skb_shinfo(skb)->nr_frags < sysctl_max_skb_frags); + if (must_collapse) { size_bias = skb->len; avail_size = info->size_goal - skb->len; } } +alloc_skb: + if (!must_collapse && !ssk->sk_tx_skb_cache && + !mptcp_alloc_tx_skb(sk, ssk, info->data_lock_held)) + return 0; + /* Zero window and all data acked? Probe. */ avail_size = mptcp_check_allowed_size(msk, data_seq, avail_size); if (avail_size == 0) { @@ -1343,7 +1358,6 @@ static int mptcp_sendmsg_frag(struct sock *sk, struct sock *ssk, if (skb == tail) { TCP_SKB_CB(tail)->tcp_flags &= ~TCPHDR_PSH; mpext->data_len += ret; - WARN_ON_ONCE(!can_collapse); WARN_ON_ONCE(zero_window_probe); goto out; } @@ -1530,15 +1544,6 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) if (ssk != prev_ssk) lock_sock(ssk); - /* keep it simple and always provide a new skb for the - * subflow, even if we will not use it when collapsing - * on the pending one - */ - if (!mptcp_alloc_tx_skb(sk, ssk)) { - mptcp_push_release(sk, ssk, &info); - goto out; - } - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) { mptcp_push_release(sk, ssk, &info); @@ -1571,7 +1576,9 @@ out: static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) { struct mptcp_sock *msk = mptcp_sk(sk); - struct mptcp_sendmsg_info info; + struct mptcp_sendmsg_info info = { + .data_lock_held = true, + }; struct mptcp_data_frag *dfrag; struct sock *xmit_ssk; int len, copied = 0; @@ -1597,13 +1604,6 @@ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk) goto out; } - if (unlikely(mptcp_must_reclaim_memory(sk, ssk))) { - __mptcp_update_wmem(sk); - sk_mem_reclaim_partial(sk); - } - if (!__mptcp_alloc_tx_skb(sk, ssk, GFP_ATOMIC)) - goto out; - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) goto out; @@ -2409,9 +2409,6 @@ static void __mptcp_retrans(struct sock *sk) info.sent = 0; info.limit = READ_ONCE(msk->csum_enabled) ? dfrag->data_len : dfrag->already_sent; while (info.sent < info.limit) { - if (!mptcp_alloc_tx_skb(sk, ssk)) - break; - ret = mptcp_sendmsg_frag(sk, ssk, dfrag, &info); if (ret <= 0) break; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index d7aba1c4dc48..d3e6fd1615f1 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -34,7 +34,7 @@ #define OPTIONS_MPTCP_MPC (OPTION_MPTCP_MPC_SYN | OPTION_MPTCP_MPC_SYNACK | \ OPTION_MPTCP_MPC_ACK) #define OPTIONS_MPTCP_MPJ (OPTION_MPTCP_MPJ_SYN | OPTION_MPTCP_MPJ_SYNACK | \ - OPTION_MPTCP_MPJ_SYNACK) + OPTION_MPTCP_MPJ_ACK) /* MPTCP option subtypes */ #define MPTCPOPT_MP_CAPABLE 0 @@ -573,6 +573,7 @@ void __init mptcp_subflow_init(void); void mptcp_subflow_shutdown(struct sock *sk, struct sock *ssk, int how); void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); +void mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); diff --git a/net/ncsi/internal.h b/net/ncsi/internal.h index 0b6cfd3b31e0..03757e76bb6b 100644 --- a/net/ncsi/internal.h +++ b/net/ncsi/internal.h @@ -80,6 +80,7 @@ enum { #define NCSI_OEM_MFR_BCM_ID 0x113d #define NCSI_OEM_MFR_INTEL_ID 0x157 /* Intel specific OEM command */ +#define NCSI_OEM_INTEL_CMD_GMA 0x06 /* CMD ID for Get MAC */ #define NCSI_OEM_INTEL_CMD_KEEP_PHY 0x20 /* CMD ID for Keep PHY up */ /* Broadcom specific OEM Command */ #define NCSI_OEM_BCM_CMD_GMA 0x01 /* CMD ID for Get MAC */ @@ -89,6 +90,7 @@ enum { #define NCSI_OEM_MLX_CMD_SMAF 0x01 /* CMD ID for Set MC Affinity */ #define NCSI_OEM_MLX_CMD_SMAF_PARAM 0x07 /* Parameter for SMAF */ /* OEM Command payload lengths*/ +#define NCSI_OEM_INTEL_CMD_GMA_LEN 5 #define NCSI_OEM_INTEL_CMD_KEEP_PHY_LEN 7 #define NCSI_OEM_BCM_CMD_GMA_LEN 12 #define NCSI_OEM_MLX_CMD_GMA_LEN 8 @@ -99,6 +101,7 @@ enum { /* Mac address offset in OEM response */ #define BCM_MAC_ADDR_OFFSET 28 #define MLX_MAC_ADDR_OFFSET 8 +#define INTEL_MAC_ADDR_OFFSET 1 struct ncsi_channel_version { diff --git a/net/ncsi/ncsi-manage.c b/net/ncsi/ncsi-manage.c index 89c7742cd72e..7121ce2a47c0 100644 --- a/net/ncsi/ncsi-manage.c +++ b/net/ncsi/ncsi-manage.c @@ -795,13 +795,36 @@ static int ncsi_oem_smaf_mlx(struct ncsi_cmd_arg *nca) return ret; } +static int ncsi_oem_gma_handler_intel(struct ncsi_cmd_arg *nca) +{ + unsigned char data[NCSI_OEM_INTEL_CMD_GMA_LEN]; + int ret = 0; + + nca->payload = NCSI_OEM_INTEL_CMD_GMA_LEN; + + memset(data, 0, NCSI_OEM_INTEL_CMD_GMA_LEN); + *(unsigned int *)data = ntohl((__force __be32)NCSI_OEM_MFR_INTEL_ID); + data[4] = NCSI_OEM_INTEL_CMD_GMA; + + nca->data = data; + + ret = ncsi_xmit_cmd(nca); + if (ret) + netdev_err(nca->ndp->ndev.dev, + "NCSI: Failed to transmit cmd 0x%x during configure\n", + nca->type); + + return ret; +} + /* OEM Command handlers initialization */ static struct ncsi_oem_gma_handler { unsigned int mfr_id; int (*handler)(struct ncsi_cmd_arg *nca); } ncsi_oem_gma_handlers[] = { { NCSI_OEM_MFR_BCM_ID, ncsi_oem_gma_handler_bcm }, - { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx } + { NCSI_OEM_MFR_MLX_ID, ncsi_oem_gma_handler_mlx }, + { NCSI_OEM_MFR_INTEL_ID, ncsi_oem_gma_handler_intel } }; static int ncsi_gma_handler(struct ncsi_cmd_arg *nca, unsigned int mf_id) diff --git a/net/ncsi/ncsi-pkt.h b/net/ncsi/ncsi-pkt.h index 80938b338fee..ba66c7dc3a21 100644 --- a/net/ncsi/ncsi-pkt.h +++ b/net/ncsi/ncsi-pkt.h @@ -178,6 +178,12 @@ struct ncsi_rsp_oem_bcm_pkt { unsigned char data[]; /* Cmd specific Data */ }; +/* Intel Response Data */ +struct ncsi_rsp_oem_intel_pkt { + unsigned char cmd; /* OEM Command ID */ + unsigned char data[]; /* Cmd specific Data */ +}; + /* Get Link Status */ struct ncsi_rsp_gls_pkt { struct ncsi_rsp_pkt_hdr rsp; /* Response header */ diff --git a/net/ncsi/ncsi-rsp.c b/net/ncsi/ncsi-rsp.c index d48374894817..6447a09932f5 100644 --- a/net/ncsi/ncsi-rsp.c +++ b/net/ncsi/ncsi-rsp.c @@ -699,9 +699,51 @@ static int ncsi_rsp_handler_oem_bcm(struct ncsi_request *nr) return 0; } +/* Response handler for Intel command Get Mac Address */ +static int ncsi_rsp_handler_oem_intel_gma(struct ncsi_request *nr) +{ + struct ncsi_dev_priv *ndp = nr->ndp; + struct net_device *ndev = ndp->ndev.dev; + const struct net_device_ops *ops = ndev->netdev_ops; + struct ncsi_rsp_oem_pkt *rsp; + struct sockaddr saddr; + int ret = 0; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + + saddr.sa_family = ndev->type; + ndev->priv_flags |= IFF_LIVE_ADDR_CHANGE; + memcpy(saddr.sa_data, &rsp->data[INTEL_MAC_ADDR_OFFSET], ETH_ALEN); + /* Increase mac address by 1 for BMC's address */ + eth_addr_inc((u8 *)saddr.sa_data); + if (!is_valid_ether_addr((const u8 *)saddr.sa_data)) + return -ENXIO; + + /* Set the flag for GMA command which should only be called once */ + ndp->gma_flag = 1; + + ret = ops->ndo_set_mac_address(ndev, &saddr); + if (ret < 0) + netdev_warn(ndev, + "NCSI: 'Writing mac address to device failed\n"); + + return ret; +} + /* Response handler for Intel card */ static int ncsi_rsp_handler_oem_intel(struct ncsi_request *nr) { + struct ncsi_rsp_oem_intel_pkt *intel; + struct ncsi_rsp_oem_pkt *rsp; + + /* Get the response header */ + rsp = (struct ncsi_rsp_oem_pkt *)skb_network_header(nr->rsp); + intel = (struct ncsi_rsp_oem_intel_pkt *)(rsp->data); + + if (intel->cmd == NCSI_OEM_INTEL_CMD_GMA) + return ncsi_rsp_handler_oem_intel_gma(nr); + return 0; } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index d31dbccbe7bd..94e18fb9690d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -21,7 +21,6 @@ #include #include #include -#include #include #include #include @@ -78,6 +77,8 @@ static __read_mostly bool nf_conntrack_locks_all; #define GC_SCAN_INTERVAL (120u * HZ) #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) +#define MAX_CHAINLEN 64u + static struct conntrack_gc_work conntrack_gc_work; void nf_conntrack_lock(spinlock_t *lock) __acquires(lock) @@ -184,25 +185,31 @@ EXPORT_SYMBOL_GPL(nf_conntrack_htable_size); unsigned int nf_conntrack_max __read_mostly; EXPORT_SYMBOL_GPL(nf_conntrack_max); seqcount_spinlock_t nf_conntrack_generation __read_mostly; -static unsigned int nf_conntrack_hash_rnd __read_mostly; +static siphash_key_t nf_conntrack_hash_rnd __read_mostly; static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, const struct net *net) { - unsigned int n; - u32 seed; + struct { + struct nf_conntrack_man src; + union nf_inet_addr dst_addr; + u32 net_mix; + u16 dport; + u16 proto; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_conntrack_hash_rnd, sizeof(nf_conntrack_hash_rnd)); - /* The direction must be ignored, so we hash everything up to the - * destination ports (which is a multiple of 4) and treat the last - * three bytes manually. - */ - seed = nf_conntrack_hash_rnd ^ net_hash_mix(net); - n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - return jhash2((u32 *)tuple, n, seed ^ - (((__force __u16)tuple->dst.u.all << 16) | - tuple->dst.protonum)); + memset(&combined, 0, sizeof(combined)); + + /* The direction must be ignored, so handle usable members manually. */ + combined.src = tuple->src; + combined.dst_addr = tuple->dst.u3; + combined.net_mix = net_hash_mix(net); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.proto = tuple->dst.protonum; + + return (u32)siphash(&combined, sizeof(combined), &nf_conntrack_hash_rnd); } static u32 scale_hash(u32 hash) @@ -835,7 +842,9 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; + unsigned int chainlen = 0; unsigned int sequence; + int err = -EEXIST; zone = nf_ct_zone(ct); @@ -849,15 +858,24 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) } while (nf_conntrack_double_lock(net, hash, reply_hash, sequence)); /* See if there's one in the list already, including reverse */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } + + chainlen = 0; + + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } smp_wmb(); /* The caller holds a reference to this object */ @@ -867,11 +885,13 @@ nf_conntrack_hash_check_insert(struct nf_conn *ct) NF_CT_STAT_INC(net, insert); local_bh_enable(); return 0; - +chaintoolong: + NF_CT_STAT_INC(net, chaintoolong); + err = -ENOSPC; out: nf_conntrack_double_unlock(hash, reply_hash); local_bh_enable(); - return -EEXIST; + return err; } EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); @@ -1084,6 +1104,7 @@ int __nf_conntrack_confirm(struct sk_buff *skb) { const struct nf_conntrack_zone *zone; + unsigned int chainlen = 0, sequence; unsigned int hash, reply_hash; struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; @@ -1091,7 +1112,6 @@ __nf_conntrack_confirm(struct sk_buff *skb) struct hlist_nulls_node *n; enum ip_conntrack_info ctinfo; struct net *net; - unsigned int sequence; int ret = NF_DROP; ct = nf_ct_get(skb, &ctinfo); @@ -1151,15 +1171,28 @@ __nf_conntrack_confirm(struct sk_buff *skb) /* See if there's one in the list already, including reverse: NAT could have grabbed it without realizing, since we're not in the hash. If there is, we lost race. */ - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) + goto chaintoolong; + } - hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) + chainlen = 0; + hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[reply_hash], hnnode) { if (nf_ct_key_equal(h, &ct->tuplehash[IP_CT_DIR_REPLY].tuple, zone, net)) goto out; + if (chainlen++ > MAX_CHAINLEN) { +chaintoolong: + nf_ct_add_to_dying_list(ct); + NF_CT_STAT_INC(net, chaintoolong); + NF_CT_STAT_INC(net, insert_failed); + ret = NF_DROP; + goto dying; + } + } /* Timer relative to confirmation time, not original setting time, otherwise we'd get timer wrap in @@ -2594,26 +2627,24 @@ int nf_conntrack_init_start(void) spin_lock_init(&nf_conntrack_locks[i]); if (!nf_conntrack_htable_size) { - /* Idea from tcp.c: use 1/16384 of memory. - * On i386: 32MB machine has 512 buckets. - * >= 1GB machines have 16384 buckets. - * >= 4GB machines have 65536 buckets. - */ nf_conntrack_htable_size = (((nr_pages << PAGE_SHIFT) / 16384) / sizeof(struct hlist_head)); - if (nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) - nf_conntrack_htable_size = 65536; + if (BITS_PER_LONG >= 64 && + nr_pages > (4 * (1024 * 1024 * 1024 / PAGE_SIZE))) + nf_conntrack_htable_size = 262144; else if (nr_pages > (1024 * 1024 * 1024 / PAGE_SIZE)) - nf_conntrack_htable_size = 16384; - if (nf_conntrack_htable_size < 32) - nf_conntrack_htable_size = 32; + nf_conntrack_htable_size = 65536; - /* Use a max. factor of four by default to get the same max as - * with the old struct list_heads. When a table size is given - * we use the old value of 8 to avoid reducing the max. - * entries. */ - max_factor = 4; + if (nf_conntrack_htable_size < 1024) + nf_conntrack_htable_size = 1024; + /* Use a max. factor of one by default to keep the average + * hash chain length at 2 entries. Each entry has to be added + * twice (once for original direction, once for reply). + * When a table size is given we use the old value of 8 to + * avoid implicit reduction of the max entries setting. + */ + max_factor = 1; } nf_conntrack_hash = nf_ct_alloc_hashtable(&nf_conntrack_htable_size, 1); diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index 1e851bc2e61a..f562eeef4234 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -41,7 +41,7 @@ EXPORT_SYMBOL_GPL(nf_ct_expect_hash); unsigned int nf_ct_expect_max __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; -static unsigned int nf_ct_expect_hashrnd __read_mostly; +static siphash_key_t nf_ct_expect_hashrnd __read_mostly; /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, @@ -81,15 +81,26 @@ static void nf_ct_expectation_timed_out(struct timer_list *t) static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple) { - unsigned int hash, seed; + struct { + union nf_inet_addr dst_addr; + u32 net_mix; + u16 dport; + u8 l3num; + u8 protonum; + } __aligned(SIPHASH_ALIGNMENT) combined; + u32 hash; get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd)); - seed = nf_ct_expect_hashrnd ^ net_hash_mix(n); + memset(&combined, 0, sizeof(combined)); - hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all), - (((tuple->dst.protonum ^ tuple->src.l3num) << 16) | - (__force __u16)tuple->dst.u.all) ^ seed); + combined.dst_addr = tuple->dst.u3; + combined.net_mix = net_hash_mix(n); + combined.dport = (__force __u16)tuple->dst.u.all; + combined.l3num = tuple->src.l3num; + combined.protonum = tuple->dst.protonum; + + hash = siphash(&combined, sizeof(combined), &nf_ct_expect_hashrnd); return reciprocal_scale(hash, nf_ct_expect_hsize); } diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5f9fc6b94855..f1e5443fe7c7 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2528,7 +2528,9 @@ ctnetlink_ct_stat_cpu_fill_info(struct sk_buff *skb, u32 portid, u32 seq, nla_put_be32(skb, CTA_STATS_SEARCH_RESTART, htonl(st->search_restart)) || nla_put_be32(skb, CTA_STATS_CLASH_RESOLVE, - htonl(st->clash_resolve))) + htonl(st->clash_resolve)) || + nla_put_be32(skb, CTA_STATS_CHAIN_TOOLONG, + htonl(st->chaintoolong))) goto nla_put_failure; nlmsg_end(skb, nlh); diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 7e0d956da51d..80f675d884b2 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -432,7 +432,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) unsigned int nr_conntracks; if (v == SEQ_START_TOKEN) { - seq_puts(seq, "entries clashres found new invalid ignore delete delete_list insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); + seq_puts(seq, "entries clashres found new invalid ignore delete chainlength insert insert_failed drop early_drop icmp_error expect_new expect_create expect_delete search_restart\n"); return 0; } @@ -447,7 +447,7 @@ static int ct_cpu_seq_show(struct seq_file *seq, void *v) st->invalid, 0, 0, - 0, + st->chaintoolong, st->insert, st->insert_failed, st->drop, diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7de595ead06a..7008961f5cb0 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -34,7 +34,7 @@ static unsigned int nat_net_id __read_mostly; static struct hlist_head *nf_nat_bysource __read_mostly; static unsigned int nf_nat_htable_size __read_mostly; -static unsigned int nf_nat_hash_rnd __read_mostly; +static siphash_key_t nf_nat_hash_rnd __read_mostly; struct nf_nat_lookup_hook_priv { struct nf_hook_entries __rcu *entries; @@ -153,12 +153,22 @@ static unsigned int hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple) { unsigned int hash; + struct { + struct nf_conntrack_man src; + u32 net_mix; + u32 protonum; + } __aligned(SIPHASH_ALIGNMENT) combined; get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd)); + memset(&combined, 0, sizeof(combined)); + /* Original src, to ensure we map it consistently if poss. */ - hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32), - tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n)); + combined.src = tuple->src; + combined.net_mix = net_hash_mix(n); + combined.protonum = tuple->dst.protonum; + + hash = siphash(&combined, sizeof(combined), &nf_nat_hash_rnd); return reciprocal_scale(hash, nf_nat_htable_size); } diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index 337e22d8b40b..99b1de14ff7e 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -41,6 +41,7 @@ struct nft_ct_helper_obj { #ifdef CONFIG_NF_CONNTRACK_ZONES static DEFINE_PER_CPU(struct nf_conn *, nft_ct_pcpu_template); static unsigned int nft_ct_pcpu_template_refcnt __read_mostly; +static DEFINE_MUTEX(nft_ct_pcpu_mutex); #endif static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c, @@ -525,8 +526,10 @@ static void __nft_ct_set_destroy(const struct nft_ctx *ctx, struct nft_ct *priv) #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: + mutex_lock(&nft_ct_pcpu_mutex); if (--nft_ct_pcpu_template_refcnt == 0) nft_ct_tmpl_put_pcpu(); + mutex_unlock(&nft_ct_pcpu_mutex); break; #endif default: @@ -564,9 +567,13 @@ static int nft_ct_set_init(const struct nft_ctx *ctx, #endif #ifdef CONFIG_NF_CONNTRACK_ZONES case NFT_CT_ZONE: - if (!nft_ct_tmpl_alloc_pcpu()) + mutex_lock(&nft_ct_pcpu_mutex); + if (!nft_ct_tmpl_alloc_pcpu()) { + mutex_unlock(&nft_ct_pcpu_mutex); return -ENOMEM; + } nft_ct_pcpu_template_refcnt++; + mutex_unlock(&nft_ct_pcpu_mutex); len = sizeof(u16); break; #endif diff --git a/net/qrtr/qrtr.c b/net/qrtr/qrtr.c index 525e3ea063b1..ec2322529727 100644 --- a/net/qrtr/qrtr.c +++ b/net/qrtr/qrtr.c @@ -493,7 +493,7 @@ int qrtr_endpoint_post(struct qrtr_endpoint *ep, const void *data, size_t len) goto err; } - if (!size || size & 3 || len != size + hdrlen) + if (!size || len != ALIGN(size, 4) + hdrlen) goto err; if (cb->dst_port != QRTR_PORT_CTRL && cb->type != QRTR_TYPE_DATA && diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index c4afdd026f51..bb0cd6d3d2c2 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -369,6 +369,7 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, { struct fq_codel_sched_data *q = qdisc_priv(sch); struct nlattr *tb[TCA_FQ_CODEL_MAX + 1]; + u32 quantum = 0; int err; if (!opt) @@ -386,6 +387,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, q->flows_cnt > 65536) return -EINVAL; } + if (tb[TCA_FQ_CODEL_QUANTUM]) { + quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum > FQ_CODEL_QUANTUM_MAX) { + NL_SET_ERR_MSG(extack, "Invalid quantum"); + return -EINVAL; + } + } sch_tree_lock(sch); if (tb[TCA_FQ_CODEL_TARGET]) { @@ -412,8 +420,8 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_FQ_CODEL_ECN]) q->cparams.ecn = !!nla_get_u32(tb[TCA_FQ_CODEL_ECN]); - if (tb[TCA_FQ_CODEL_QUANTUM]) - q->quantum = max(256U, nla_get_u32(tb[TCA_FQ_CODEL_QUANTUM])); + if (quantum) + q->quantum = quantum; if (tb[TCA_FQ_CODEL_DROP_BATCH_SIZE]) q->drop_batch_size = max(1U, nla_get_u32(tb[TCA_FQ_CODEL_DROP_BATCH_SIZE])); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index e3105ba407c7..a0a27d87f631 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1426,7 +1426,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dlen) if (ua) { if (!tipc_uaddr_valid(ua, m->msg_namelen)) return -EINVAL; - atype = ua->addrtype; + atype = ua->addrtype; } /* If socket belongs to a communication group follow other paths */ diff --git a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c index 370d220288a6..ad3ba81b4048 100644 --- a/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c +++ b/tools/testing/selftests/bpf/prog_tests/xdp_bonding.c @@ -384,8 +384,7 @@ static void test_xdp_bonding_attach(struct skeletons *skeletons) { struct bpf_link *link = NULL; struct bpf_link *link2 = NULL; - int veth, bond; - int err; + int veth, bond, err; if (!ASSERT_OK(system("ip link add veth type veth"), "add veth")) goto out; @@ -399,22 +398,18 @@ static void test_xdp_bonding_attach(struct skeletons *skeletons) if (!ASSERT_GE(bond, 0, "if_nametoindex bond")) goto out; - /* enslaving with a XDP program loaded fails */ + /* enslaving with a XDP program loaded is allowed */ link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); if (!ASSERT_OK_PTR(link, "attach program to veth")) goto out; err = system("ip link set veth master bond"); - if (!ASSERT_NEQ(err, 0, "attaching slave with xdp program expected to fail")) + if (!ASSERT_OK(err, "set veth master")) goto out; bpf_link__destroy(link); link = NULL; - err = system("ip link set veth master bond"); - if (!ASSERT_OK(err, "set veth master")) - goto out; - /* attaching to slave when master has no program is allowed */ link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); if (!ASSERT_OK_PTR(link, "attach program to slave when enslaved")) @@ -434,8 +429,26 @@ static void test_xdp_bonding_attach(struct skeletons *skeletons) goto out; /* attaching to slave not allowed when master has program loaded */ - link2 = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); - ASSERT_ERR_PTR(link2, "attach program to slave when master has program"); + link2 = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, veth); + if (!ASSERT_ERR_PTR(link2, "attach program to slave when master has program")) + goto out; + + bpf_link__destroy(link); + link = NULL; + + /* test program unwinding with a non-XDP slave */ + if (!ASSERT_OK(system("ip link add vxlan type vxlan id 1 remote 1.2.3.4 dstport 0 dev lo"), + "add vxlan")) + goto out; + + err = system("ip link set vxlan master bond"); + if (!ASSERT_OK(err, "set vxlan master")) + goto out; + + /* attaching not allowed when one slave does not support XDP */ + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); + if (!ASSERT_ERR_PTR(link, "attach program to master when slave does not support XDP")) + goto out; out: bpf_link__destroy(link); @@ -443,6 +456,44 @@ out: system("ip link del veth"); system("ip link del bond"); + system("ip link del vxlan"); +} + +/* Test with nested bonding devices to catch issue with negative jump label count */ +static void test_xdp_bonding_nested(struct skeletons *skeletons) +{ + struct bpf_link *link = NULL; + int bond, err; + + if (!ASSERT_OK(system("ip link add bond type bond"), "add bond")) + goto out; + + bond = if_nametoindex("bond"); + if (!ASSERT_GE(bond, 0, "if_nametoindex bond")) + goto out; + + if (!ASSERT_OK(system("ip link add bond_nest1 type bond"), "add bond_nest1")) + goto out; + + err = system("ip link set bond_nest1 master bond"); + if (!ASSERT_OK(err, "set bond_nest1 master")) + goto out; + + if (!ASSERT_OK(system("ip link add bond_nest2 type bond"), "add bond_nest1")) + goto out; + + err = system("ip link set bond_nest2 master bond_nest1"); + if (!ASSERT_OK(err, "set bond_nest2 master")) + goto out; + + link = bpf_program__attach_xdp(skeletons->xdp_dummy->progs.xdp_dummy_prog, bond); + ASSERT_OK_PTR(link, "attach program to master"); + +out: + bpf_link__destroy(link); + system("ip link del bond"); + system("ip link del bond_nest1"); + system("ip link del bond_nest2"); } static int libbpf_debug_print(enum libbpf_print_level level, @@ -496,6 +547,9 @@ void test_xdp_bonding(void) if (test__start_subtest("xdp_bonding_attach")) test_xdp_bonding_attach(&skeletons); + if (test__start_subtest("xdp_bonding_nested")) + test_xdp_bonding_nested(&skeletons); + for (i = 0; i < ARRAY_SIZE(bond_test_cases); i++) { struct bond_test_case *test_case = &bond_test_cases[i]; diff --git a/tools/testing/selftests/kvm/.gitignore b/tools/testing/selftests/kvm/.gitignore index 0709af0144c8..98053d3afbda 100644 --- a/tools/testing/selftests/kvm/.gitignore +++ b/tools/testing/selftests/kvm/.gitignore @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only /aarch64/debug-exceptions /aarch64/get-reg-list +/aarch64/psci_cpu_on_test /aarch64/vgic_init /s390x/memop /s390x/resets diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index 5832f510a16c..5d05801ab816 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -86,6 +86,7 @@ TEST_GEN_PROGS_x86_64 += kvm_binary_stats_test TEST_GEN_PROGS_aarch64 += aarch64/debug-exceptions TEST_GEN_PROGS_aarch64 += aarch64/get-reg-list +TEST_GEN_PROGS_aarch64 += aarch64/psci_cpu_on_test TEST_GEN_PROGS_aarch64 += aarch64/vgic_init TEST_GEN_PROGS_aarch64 += demand_paging_test TEST_GEN_PROGS_aarch64 += dirty_log_test diff --git a/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c b/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c new file mode 100644 index 000000000000..018c269990e1 --- /dev/null +++ b/tools/testing/selftests/kvm/aarch64/psci_cpu_on_test.c @@ -0,0 +1,121 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * psci_cpu_on_test - Test that the observable state of a vCPU targeted by the + * CPU_ON PSCI call matches what the caller requested. + * + * Copyright (c) 2021 Google LLC. + * + * This is a regression test for a race between KVM servicing the PSCI call and + * userspace reading the vCPUs registers. + */ + +#define _GNU_SOURCE + +#include + +#include "kvm_util.h" +#include "processor.h" +#include "test_util.h" + +#define VCPU_ID_SOURCE 0 +#define VCPU_ID_TARGET 1 + +#define CPU_ON_ENTRY_ADDR 0xfeedf00dul +#define CPU_ON_CONTEXT_ID 0xdeadc0deul + +static uint64_t psci_cpu_on(uint64_t target_cpu, uint64_t entry_addr, + uint64_t context_id) +{ + register uint64_t x0 asm("x0") = PSCI_0_2_FN64_CPU_ON; + register uint64_t x1 asm("x1") = target_cpu; + register uint64_t x2 asm("x2") = entry_addr; + register uint64_t x3 asm("x3") = context_id; + + asm("hvc #0" + : "=r"(x0) + : "r"(x0), "r"(x1), "r"(x2), "r"(x3) + : "memory"); + + return x0; +} + +static uint64_t psci_affinity_info(uint64_t target_affinity, + uint64_t lowest_affinity_level) +{ + register uint64_t x0 asm("x0") = PSCI_0_2_FN64_AFFINITY_INFO; + register uint64_t x1 asm("x1") = target_affinity; + register uint64_t x2 asm("x2") = lowest_affinity_level; + + asm("hvc #0" + : "=r"(x0) + : "r"(x0), "r"(x1), "r"(x2) + : "memory"); + + return x0; +} + +static void guest_main(uint64_t target_cpu) +{ + GUEST_ASSERT(!psci_cpu_on(target_cpu, CPU_ON_ENTRY_ADDR, CPU_ON_CONTEXT_ID)); + uint64_t target_state; + + do { + target_state = psci_affinity_info(target_cpu, 0); + + GUEST_ASSERT((target_state == PSCI_0_2_AFFINITY_LEVEL_ON) || + (target_state == PSCI_0_2_AFFINITY_LEVEL_OFF)); + } while (target_state != PSCI_0_2_AFFINITY_LEVEL_ON); + + GUEST_DONE(); +} + +int main(void) +{ + uint64_t target_mpidr, obs_pc, obs_x0; + struct kvm_vcpu_init init; + struct kvm_vm *vm; + struct ucall uc; + + vm = vm_create(VM_MODE_DEFAULT, DEFAULT_GUEST_PHY_PAGES, O_RDWR); + kvm_vm_elf_load(vm, program_invocation_name); + ucall_init(vm, NULL); + + vm_ioctl(vm, KVM_ARM_PREFERRED_TARGET, &init); + init.features[0] |= (1 << KVM_ARM_VCPU_PSCI_0_2); + + aarch64_vcpu_add_default(vm, VCPU_ID_SOURCE, &init, guest_main); + + /* + * make sure the target is already off when executing the test. + */ + init.features[0] |= (1 << KVM_ARM_VCPU_POWER_OFF); + aarch64_vcpu_add_default(vm, VCPU_ID_TARGET, &init, guest_main); + + get_reg(vm, VCPU_ID_TARGET, ARM64_SYS_REG(MPIDR_EL1), &target_mpidr); + vcpu_args_set(vm, VCPU_ID_SOURCE, 1, target_mpidr & MPIDR_HWID_BITMASK); + vcpu_run(vm, VCPU_ID_SOURCE); + + switch (get_ucall(vm, VCPU_ID_SOURCE, &uc)) { + case UCALL_DONE: + break; + case UCALL_ABORT: + TEST_FAIL("%s at %s:%ld", (const char *)uc.args[0], __FILE__, + uc.args[1]); + break; + default: + TEST_FAIL("Unhandled ucall: %lu", uc.cmd); + } + + get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.pc), &obs_pc); + get_reg(vm, VCPU_ID_TARGET, ARM64_CORE_REG(regs.regs[0]), &obs_x0); + + TEST_ASSERT(obs_pc == CPU_ON_ENTRY_ADDR, + "unexpected target cpu pc: %lx (expected: %lx)", + obs_pc, CPU_ON_ENTRY_ADDR); + TEST_ASSERT(obs_x0 == CPU_ON_CONTEXT_ID, + "unexpected target context id: %lx (expected: %lx)", + obs_x0, CPU_ON_CONTEXT_ID); + + kvm_vm_free(vm); + return 0; +} diff --git a/tools/testing/selftests/kvm/access_tracking_perf_test.c b/tools/testing/selftests/kvm/access_tracking_perf_test.c index e2baa187a21e..71e277c7c3f3 100644 --- a/tools/testing/selftests/kvm/access_tracking_perf_test.c +++ b/tools/testing/selftests/kvm/access_tracking_perf_test.c @@ -222,8 +222,6 @@ static void *vcpu_thread_main(void *arg) int vcpu_id = vcpu_args->vcpu_id; int current_iteration = -1; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); - while (spin_wait_for_next_iteration(¤t_iteration)) { switch (READ_ONCE(iteration_work)) { case ITERATION_ACCESS_MEMORY: @@ -333,7 +331,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) pthread_t *vcpu_threads; int vcpus = params->vcpus; - vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, + vm = perf_test_create_vm(mode, vcpus, params->vcpu_memory_bytes, 1, params->backing_src); perf_test_setup_vcpus(vm, vcpus, params->vcpu_memory_bytes, diff --git a/tools/testing/selftests/kvm/demand_paging_test.c b/tools/testing/selftests/kvm/demand_paging_test.c index b74704305835..e79c1b64977f 100644 --- a/tools/testing/selftests/kvm/demand_paging_test.c +++ b/tools/testing/selftests/kvm/demand_paging_test.c @@ -52,7 +52,6 @@ static void *vcpu_worker(void *data) struct timespec start; struct timespec ts_diff; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); clock_gettime(CLOCK_MONOTONIC, &start); @@ -293,7 +292,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) int vcpu_id; int r; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, p->src_type); perf_test_args.wr_fract = 1; diff --git a/tools/testing/selftests/kvm/dirty_log_perf_test.c b/tools/testing/selftests/kvm/dirty_log_perf_test.c index 80cbd3a748c0..3c30d0045d8d 100644 --- a/tools/testing/selftests/kvm/dirty_log_perf_test.c +++ b/tools/testing/selftests/kvm/dirty_log_perf_test.c @@ -44,7 +44,6 @@ static void *vcpu_worker(void *data) struct perf_test_vcpu_args *vcpu_args = (struct perf_test_vcpu_args *)data; int vcpu_id = vcpu_args->vcpu_id; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); while (!READ_ONCE(host_quit)) { @@ -94,8 +93,59 @@ struct test_params { int wr_fract; bool partition_vcpu_memory_access; enum vm_mem_backing_src_type backing_src; + int slots; }; +static void toggle_dirty_logging(struct kvm_vm *vm, int slots, bool enable) +{ + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + int flags = enable ? KVM_MEM_LOG_DIRTY_PAGES : 0; + + vm_mem_region_set_flags(vm, slot, flags); + } +} + +static inline void enable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, true); +} + +static inline void disable_dirty_logging(struct kvm_vm *vm, int slots) +{ + toggle_dirty_logging(vm, slots, false); +} + +static void get_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, + uint64_t nr_pages) +{ + uint64_t slot_pages = nr_pages / slots; + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + unsigned long *slot_bitmap = bitmap + i * slot_pages; + + kvm_vm_get_dirty_log(vm, slot, slot_bitmap); + } +} + +static void clear_dirty_log(struct kvm_vm *vm, int slots, unsigned long *bitmap, + uint64_t nr_pages) +{ + uint64_t slot_pages = nr_pages / slots; + int i; + + for (i = 0; i < slots; i++) { + int slot = PERF_TEST_MEM_SLOT_INDEX + i; + unsigned long *slot_bitmap = bitmap + i * slot_pages; + + kvm_vm_clear_dirty_log(vm, slot, slot_bitmap, 0, slot_pages); + } +} + static void run_test(enum vm_guest_mode mode, void *arg) { struct test_params *p = arg; @@ -114,7 +164,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct timespec clear_dirty_log_total = (struct timespec){0}; vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, - p->backing_src); + p->slots, p->backing_src); perf_test_args.wr_fract = p->wr_fract; @@ -163,8 +213,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Enable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, - KVM_MEM_LOG_DIRTY_PAGES); + enable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Enabling dirty logging time: %ld.%.9lds\n\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -190,8 +239,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) iteration, ts_diff.tv_sec, ts_diff.tv_nsec); clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_get_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap); - + get_dirty_log(vm, p->slots, bmap, host_num_pages); ts_diff = timespec_elapsed(start); get_dirty_log_total = timespec_add(get_dirty_log_total, ts_diff); @@ -200,9 +248,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) if (dirty_log_manual_caps) { clock_gettime(CLOCK_MONOTONIC, &start); - kvm_vm_clear_dirty_log(vm, PERF_TEST_MEM_SLOT_INDEX, bmap, 0, - host_num_pages); - + clear_dirty_log(vm, p->slots, bmap, host_num_pages); ts_diff = timespec_elapsed(start); clear_dirty_log_total = timespec_add(clear_dirty_log_total, ts_diff); @@ -213,7 +259,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) /* Disable dirty logging */ clock_gettime(CLOCK_MONOTONIC, &start); - vm_mem_region_set_flags(vm, PERF_TEST_MEM_SLOT_INDEX, 0); + disable_dirty_logging(vm, p->slots); ts_diff = timespec_elapsed(start); pr_info("Disabling dirty logging time: %ld.%.9lds\n", ts_diff.tv_sec, ts_diff.tv_nsec); @@ -244,7 +290,8 @@ static void help(char *name) { puts(""); printf("usage: %s [-h] [-i iterations] [-p offset] " - "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]\n", name); + "[-m mode] [-b vcpu bytes] [-v vcpus] [-o] [-s mem type]" + "[-x memslots]\n", name); puts(""); printf(" -i: specify iteration counts (default: %"PRIu64")\n", TEST_HOST_LOOP_N); @@ -263,6 +310,8 @@ static void help(char *name) " them into a separate region of memory for each vCPU.\n"); printf(" -s: specify the type of memory that should be used to\n" " back the guest data region.\n\n"); + printf(" -x: Split the memory region into this number of memslots.\n" + " (default: 1)"); backing_src_help(); puts(""); exit(0); @@ -276,6 +325,7 @@ int main(int argc, char *argv[]) .wr_fract = 1, .partition_vcpu_memory_access = true, .backing_src = VM_MEM_SRC_ANONYMOUS, + .slots = 1, }; int opt; @@ -286,7 +336,7 @@ int main(int argc, char *argv[]) guest_modes_append_default(); - while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:")) != -1) { + while ((opt = getopt(argc, argv, "hi:p:m:b:f:v:os:x:")) != -1) { switch (opt) { case 'i': p.iterations = atoi(optarg); @@ -316,6 +366,9 @@ int main(int argc, char *argv[]) case 's': p.backing_src = parse_backing_src_type(optarg); break; + case 'x': + p.slots = atoi(optarg); + break; case 'h': default: help(argv[0]); diff --git a/tools/testing/selftests/kvm/include/aarch64/processor.h b/tools/testing/selftests/kvm/include/aarch64/processor.h index 27dc5c2e56b9..c0273aefa63d 100644 --- a/tools/testing/selftests/kvm/include/aarch64/processor.h +++ b/tools/testing/selftests/kvm/include/aarch64/processor.h @@ -17,6 +17,7 @@ #define CPACR_EL1 3, 0, 1, 0, 2 #define TCR_EL1 3, 0, 2, 0, 2 #define MAIR_EL1 3, 0, 10, 2, 0 +#define MPIDR_EL1 3, 0, 0, 0, 5 #define TTBR0_EL1 3, 0, 2, 0, 0 #define SCTLR_EL1 3, 0, 1, 0, 0 #define VBAR_EL1 3, 0, 12, 0, 0 @@ -40,6 +41,8 @@ (0xfful << (4 * 8)) | \ (0xbbul << (5 * 8))) +#define MPIDR_HWID_BITMASK (0xff00fffffful) + static inline void get_reg(struct kvm_vm *vm, uint32_t vcpuid, uint64_t id, uint64_t *addr) { struct kvm_one_reg reg; diff --git a/tools/testing/selftests/kvm/include/perf_test_util.h b/tools/testing/selftests/kvm/include/perf_test_util.h index 005f2143adeb..df9f1a3a3ffb 100644 --- a/tools/testing/selftests/kvm/include/perf_test_util.h +++ b/tools/testing/selftests/kvm/include/perf_test_util.h @@ -44,7 +44,7 @@ extern struct perf_test_args perf_test_args; extern uint64_t guest_test_phys_mem; struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes, + uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src); void perf_test_destroy_vm(struct kvm_vm *vm); void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, diff --git a/tools/testing/selftests/kvm/kvm_binary_stats_test.c b/tools/testing/selftests/kvm/kvm_binary_stats_test.c index 5906bbc08483..17f65d514915 100644 --- a/tools/testing/selftests/kvm/kvm_binary_stats_test.c +++ b/tools/testing/selftests/kvm/kvm_binary_stats_test.c @@ -109,6 +109,18 @@ static void stats_test(int stats_fd) /* Check size field, which should not be zero */ TEST_ASSERT(pdesc->size, "KVM descriptor(%s) with size of 0", pdesc->name); + /* Check bucket_size field */ + switch (pdesc->flags & KVM_STATS_TYPE_MASK) { + case KVM_STATS_TYPE_LINEAR_HIST: + TEST_ASSERT(pdesc->bucket_size, + "Bucket size of Linear Histogram stats (%s) is zero", + pdesc->name); + break; + default: + TEST_ASSERT(!pdesc->bucket_size, + "Bucket size of stats (%s) is not zero", + pdesc->name); + } size_data += pdesc->size * sizeof(*stats_data); } /* Check overlap */ diff --git a/tools/testing/selftests/kvm/lib/perf_test_util.c b/tools/testing/selftests/kvm/lib/perf_test_util.c index b488f4aefea8..0ef80dbdc116 100644 --- a/tools/testing/selftests/kvm/lib/perf_test_util.c +++ b/tools/testing/selftests/kvm/lib/perf_test_util.c @@ -50,11 +50,12 @@ static void guest_code(uint32_t vcpu_id) } struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, - uint64_t vcpu_memory_bytes, + uint64_t vcpu_memory_bytes, int slots, enum vm_mem_backing_src_type backing_src) { struct kvm_vm *vm; uint64_t guest_num_pages; + int i; pr_info("Testing guest mode: %s\n", vm_guest_mode_string(mode)); @@ -68,6 +69,9 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, "Guest memory size is not host page size aligned."); TEST_ASSERT(vcpu_memory_bytes % perf_test_args.guest_page_size == 0, "Guest memory size is not guest page size aligned."); + TEST_ASSERT(guest_num_pages % slots == 0, + "Guest memory cannot be evenly divided into %d slots.", + slots); vm = vm_create_with_vcpus(mode, vcpus, DEFAULT_GUEST_PHY_PAGES, (vcpus * vcpu_memory_bytes) / perf_test_args.guest_page_size, @@ -95,10 +99,16 @@ struct kvm_vm *perf_test_create_vm(enum vm_guest_mode mode, int vcpus, #endif pr_info("guest physical test memory offset: 0x%lx\n", guest_test_phys_mem); - /* Add an extra memory slot for testing */ - vm_userspace_mem_region_add(vm, backing_src, guest_test_phys_mem, - PERF_TEST_MEM_SLOT_INDEX, - guest_num_pages, 0); + /* Add extra memory slots for testing */ + for (i = 0; i < slots; i++) { + uint64_t region_pages = guest_num_pages / slots; + vm_paddr_t region_start = guest_test_phys_mem + + region_pages * perf_test_args.guest_page_size * i; + + vm_userspace_mem_region_add(vm, backing_src, region_start, + PERF_TEST_MEM_SLOT_INDEX + i, + region_pages, 0); + } /* Do mapping for the demand paging memory slot */ virt_map(vm, guest_test_virt_mem, guest_test_phys_mem, guest_num_pages); @@ -140,6 +150,8 @@ void perf_test_setup_vcpus(struct kvm_vm *vm, int vcpus, vcpu_gpa = guest_test_phys_mem; } + vcpu_args_set(vm, vcpu_id, 1, vcpu_id); + pr_debug("Added VCPU %d with test mem gpa [%lx, %lx)\n", vcpu_id, vcpu_gpa, vcpu_gpa + (vcpu_args->pages * perf_test_args.guest_page_size)); diff --git a/tools/testing/selftests/kvm/memslot_modification_stress_test.c b/tools/testing/selftests/kvm/memslot_modification_stress_test.c index 98351ba0933c..4cfcafea9f5a 100644 --- a/tools/testing/selftests/kvm/memslot_modification_stress_test.c +++ b/tools/testing/selftests/kvm/memslot_modification_stress_test.c @@ -45,7 +45,6 @@ static void *vcpu_worker(void *data) struct kvm_vm *vm = perf_test_args.vm; struct kvm_run *run; - vcpu_args_set(vm, vcpu_id, 1, vcpu_id); run = vcpu_state(vm, vcpu_id); /* Let the guest access its memory until a stop signal is received */ @@ -105,7 +104,7 @@ static void run_test(enum vm_guest_mode mode, void *arg) struct kvm_vm *vm; int vcpu_id; - vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, + vm = perf_test_create_vm(mode, nr_vcpus, guest_percpu_mem_size, 1, VM_MEM_SRC_ANONYMOUS); perf_test_args.wr_fract = 1; diff --git a/tools/testing/selftests/kvm/x86_64/debug_regs.c b/tools/testing/selftests/kvm/x86_64/debug_regs.c index 6097a8283377..5f078db1bcba 100644 --- a/tools/testing/selftests/kvm/x86_64/debug_regs.c +++ b/tools/testing/selftests/kvm/x86_64/debug_regs.c @@ -8,12 +8,15 @@ #include #include "kvm_util.h" #include "processor.h" +#include "apic.h" #define VCPU_ID 0 #define DR6_BD (1 << 13) #define DR7_GD (1 << 13) +#define IRQ_VECTOR 0xAA + /* For testing data access debug BP */ uint32_t guest_value; @@ -21,6 +24,11 @@ extern unsigned char sw_bp, hw_bp, write_data, ss_start, bd_start; static void guest_code(void) { + /* Create a pending interrupt on current vCPU */ + x2apic_enable(); + x2apic_write_reg(APIC_ICR, APIC_DEST_SELF | APIC_INT_ASSERT | + APIC_DM_FIXED | IRQ_VECTOR); + /* * Software BP tests. * @@ -38,12 +46,19 @@ static void guest_code(void) "mov %%rax,%0;\n\t write_data:" : "=m" (guest_value) : : "rax"); - /* Single step test, covers 2 basic instructions and 2 emulated */ + /* + * Single step test, covers 2 basic instructions and 2 emulated + * + * Enable interrupts during the single stepping to see that + * pending interrupt we raised is not handled due to KVM_GUESTDBG_BLOCKIRQ + */ asm volatile("ss_start: " + "sti\n\t" "xor %%eax,%%eax\n\t" "cpuid\n\t" "movl $0x1a0,%%ecx\n\t" "rdmsr\n\t" + "cli\n\t" : : : "eax", "ebx", "ecx", "edx"); /* DR6.BD test */ @@ -72,11 +87,13 @@ int main(void) uint64_t cmd; int i; /* Instruction lengths starting at ss_start */ - int ss_size[4] = { + int ss_size[6] = { + 1, /* sti*/ 2, /* xor */ 2, /* cpuid */ 5, /* mov */ 2, /* rdmsr */ + 1, /* cli */ }; if (!kvm_check_cap(KVM_CAP_SET_GUEST_DEBUG)) { @@ -154,7 +171,8 @@ int main(void) for (i = 0; i < (sizeof(ss_size) / sizeof(ss_size[0])); i++) { target_rip += ss_size[i]; CLEAR_DEBUG(); - debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP; + debug.control = KVM_GUESTDBG_ENABLE | KVM_GUESTDBG_SINGLESTEP | + KVM_GUESTDBG_BLOCKIRQ; debug.arch.debugreg[7] = 0x00000400; APPLY_DEBUG(); vcpu_run(vm, VCPU_ID); diff --git a/tools/testing/selftests/net/Makefile b/tools/testing/selftests/net/Makefile index 378c0aac5a1a..492b273743b4 100644 --- a/tools/testing/selftests/net/Makefile +++ b/tools/testing/selftests/net/Makefile @@ -27,6 +27,7 @@ TEST_PROGS += udpgro_fwd.sh TEST_PROGS += veth.sh TEST_PROGS += ioam6.sh TEST_PROGS += gro.sh +TEST_PROGS += gre_gso.sh TEST_PROGS_EXTENDED := in_netns.sh TEST_GEN_FILES = socket nettest TEST_GEN_FILES += psock_fanout psock_tpacket msg_zerocopy reuseport_addr_any diff --git a/tools/testing/selftests/net/gre_gso.sh b/tools/testing/selftests/net/gre_gso.sh new file mode 100755 index 000000000000..facbb0c80443 --- /dev/null +++ b/tools/testing/selftests/net/gre_gso.sh @@ -0,0 +1,236 @@ +#!/bin/bash +# SPDX-License-Identifier: GPL-2.0 + +# This test is for checking GRE GSO. + +ret=0 +# Kselftest framework requirement - SKIP code is 4. +ksft_skip=4 + +# all tests in this script. Can be overridden with -t option +TESTS="gre_gso" + +VERBOSE=0 +PAUSE_ON_FAIL=no +PAUSE=no +IP="ip -netns ns1" +NS_EXEC="ip netns exec ns1" +TMPFILE=`mktemp` +PID= + +log_test() +{ + local rc=$1 + local expected=$2 + local msg="$3" + + if [ ${rc} -eq ${expected} ]; then + printf " TEST: %-60s [ OK ]\n" "${msg}" + nsuccess=$((nsuccess+1)) + else + ret=1 + nfail=$((nfail+1)) + printf " TEST: %-60s [FAIL]\n" "${msg}" + if [ "${PAUSE_ON_FAIL}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi + fi + + if [ "${PAUSE}" = "yes" ]; then + echo + echo "hit enter to continue, 'q' to quit" + read a + [ "$a" = "q" ] && exit 1 + fi +} + +setup() +{ + set -e + ip netns add ns1 + ip netns set ns1 auto + $IP link set dev lo up + + ip link add veth0 type veth peer name veth1 + ip link set veth0 up + ip link set veth1 netns ns1 + $IP link set veth1 name veth0 + $IP link set veth0 up + + dd if=/dev/urandom of=$TMPFILE bs=1024 count=2048 &>/dev/null + set +e +} + +cleanup() +{ + rm -rf $TMPFILE + [ -n "$PID" ] && kill $PID + ip link del dev gre1 &> /dev/null + ip link del dev veth0 &> /dev/null + ip netns del ns1 +} + +get_linklocal() +{ + local dev=$1 + local ns=$2 + local addr + + [ -n "$ns" ] && ns="-netns $ns" + + addr=$(ip -6 -br $ns addr show dev ${dev} | \ + awk '{ + for (i = 3; i <= NF; ++i) { + if ($i ~ /^fe80/) + print $i + } + }' + ) + addr=${addr/\/*} + + [ -z "$addr" ] && return 1 + + echo $addr + + return 0 +} + +gre_create_tun() +{ + local a1=$1 + local a2=$2 + local mode + + [[ $a1 =~ ^[0-9.]*$ ]] && mode=gre || mode=ip6gre + + ip tunnel add gre1 mode $mode local $a1 remote $a2 dev veth0 + ip link set gre1 up + $IP tunnel add gre1 mode $mode local $a2 remote $a1 dev veth0 + $IP link set gre1 up +} + +gre_gst_test_checks() +{ + local name=$1 + local addr=$2 + + $NS_EXEC nc -kl $port >/dev/null & + PID=$! + while ! $NS_EXEC ss -ltn | grep -q $port; do ((i++)); sleep 0.01; done + + cat $TMPFILE | timeout 1 nc $addr $port + log_test $? 0 "$name - copy file w/ TSO" + + ethtool -K veth0 tso off + + cat $TMPFILE | timeout 1 nc $addr $port + log_test $? 0 "$name - copy file w/ GSO" + + ethtool -K veth0 tso on + + kill $PID + PID= +} + +gre6_gso_test() +{ + local port=7777 + + setup + + a1=$(get_linklocal veth0) + a2=$(get_linklocal veth0 ns1) + + gre_create_tun $a1 $a2 + + ip addr add 172.16.2.1/24 dev gre1 + $IP addr add 172.16.2.2/24 dev gre1 + + ip -6 addr add 2001:db8:1::1/64 dev gre1 nodad + $IP -6 addr add 2001:db8:1::2/64 dev gre1 nodad + + sleep 2 + + gre_gst_test_checks GREv6/v4 172.16.2.2 + gre_gst_test_checks GREv6/v6 2001:db8:1::2 + + cleanup +} + +gre_gso_test() +{ + gre6_gso_test +} + +################################################################################ +# usage + +usage() +{ + cat < Test(s) to run (default: all) + (options: $TESTS) + -p Pause on fail + -P Pause after each test before cleanup + -v verbose mode (show commands and output) +EOF +} + +################################################################################ +# main + +while getopts :t:pPhv o +do + case $o in + t) TESTS=$OPTARG;; + p) PAUSE_ON_FAIL=yes;; + P) PAUSE=yes;; + v) VERBOSE=$(($VERBOSE + 1));; + h) usage; exit 0;; + *) usage; exit 1;; + esac +done + +PEER_CMD="ip netns exec ${PEER_NS}" + +# make sure we don't pause twice +[ "${PAUSE}" = "yes" ] && PAUSE_ON_FAIL=no + +if [ "$(id -u)" -ne 0 ];then + echo "SKIP: Need root privileges" + exit $ksft_skip; +fi + +if [ ! -x "$(command -v ip)" ]; then + echo "SKIP: Could not run test without ip tool" + exit $ksft_skip +fi + +if [ ! -x "$(command -v nc)" ]; then + echo "SKIP: Could not run test without nc tool" + exit $ksft_skip +fi + +# start clean +cleanup &> /dev/null + +for t in $TESTS +do + case $t in + gre_gso) gre_gso_test;; + + help) echo "Test names: $TESTS"; exit 0;; + esac +done + +if [ "$TESTS" != "none" ]; then + printf "\nTests passed: %3d\n" ${nsuccess} + printf "Tests failed: %3d\n" ${nfail} +fi + +exit $ret diff --git a/tools/testing/selftests/net/mptcp/simult_flows.sh b/tools/testing/selftests/net/mptcp/simult_flows.sh index fd63ebfe9a2b..910d8126af8f 100755 --- a/tools/testing/selftests/net/mptcp/simult_flows.sh +++ b/tools/testing/selftests/net/mptcp/simult_flows.sh @@ -22,8 +22,8 @@ usage() { cleanup() { - rm -f "$cin" "$cout" - rm -f "$sin" "$sout" + rm -f "$cout" "$sout" + rm -f "$large" "$small" rm -f "$capout" local netns diff --git a/virt/kvm/binary_stats.c b/virt/kvm/binary_stats.c index e609d428811a..eefca6c69f51 100644 --- a/virt/kvm/binary_stats.c +++ b/virt/kvm/binary_stats.c @@ -136,9 +136,7 @@ ssize_t kvm_stats_read(char *id, const struct kvm_stats_header *header, src = stats + pos - header->data_offset; if (copy_to_user(dest, src, copylen)) return -EFAULT; - remain -= copylen; pos += copylen; - dest += copylen; } *offset = pos; diff --git a/virt/kvm/dirty_ring.c b/virt/kvm/dirty_ring.c index 7aafefc50aa7..88f4683198ea 100644 --- a/virt/kvm/dirty_ring.c +++ b/virt/kvm/dirty_ring.c @@ -91,11 +91,6 @@ static inline void kvm_dirty_gfn_set_dirtied(struct kvm_dirty_gfn *gfn) gfn->flags = KVM_DIRTY_GFN_F_DIRTY; } -static inline bool kvm_dirty_gfn_invalid(struct kvm_dirty_gfn *gfn) -{ - return gfn->flags == 0; -} - static inline bool kvm_dirty_gfn_harvested(struct kvm_dirty_gfn *gfn) { return gfn->flags & KVM_DIRTY_GFN_F_RESET; diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index b50dbe269f4b..439d3b4cd1a9 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -189,16 +189,6 @@ bool kvm_is_reserved_pfn(kvm_pfn_t pfn) return true; } -bool kvm_is_transparent_hugepage(kvm_pfn_t pfn) -{ - struct page *page = pfn_to_page(pfn); - - if (!PageTransCompoundMap(page)) - return false; - - return is_transparent_hugepage(compound_head(page)); -} - /* * Switches to specified vcpu, until a matching vcpu_put() */ @@ -318,6 +308,7 @@ void kvm_flush_remote_tlbs(struct kvm *kvm) */ long dirty_count = smp_load_acquire(&kvm->tlbs_dirty); + ++kvm->stat.generic.remote_tlb_flush_requests; /* * We want to publish modifications to the page tables before reading * mode. Pairs with a memory barrier in arch-specific code. @@ -415,6 +406,7 @@ static void kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id) vcpu->preempted = false; vcpu->ready = false; preempt_notifier_init(&vcpu->preempt_notifier, &kvm_preempt_ops); + vcpu->last_used_slot = 0; } void kvm_vcpu_destroy(struct kvm_vcpu *vcpu) @@ -496,17 +488,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, idx = srcu_read_lock(&kvm->srcu); - /* The on_lock() path does not yet support lock elision. */ - if (!IS_KVM_NULL_FN(range->on_lock)) { - locked = true; - KVM_MMU_LOCK(kvm); - - range->on_lock(kvm, range->start, range->end); - - if (IS_KVM_NULL_FN(range->handler)) - goto out_unlock; - } - for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) { slots = __kvm_memslots(kvm, i); kvm_for_each_memslot(slot, slots) { @@ -538,6 +519,10 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (!locked) { locked = true; KVM_MMU_LOCK(kvm); + if (!IS_KVM_NULL_FN(range->on_lock)) + range->on_lock(kvm, range->start, range->end); + if (IS_KVM_NULL_FN(range->handler)) + break; } ret |= range->handler(kvm, &gfn_range); } @@ -546,7 +531,6 @@ static __always_inline int __kvm_handle_hva_range(struct kvm *kvm, if (range->flush_on_ret && (ret || kvm->tlbs_dirty)) kvm_flush_remote_tlbs(kvm); -out_unlock: if (locked) KVM_MMU_UNLOCK(kvm); @@ -604,16 +588,20 @@ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn, trace_kvm_set_spte_hva(address); /* - * .change_pte() must be surrounded by .invalidate_range_{start,end}(), - * and so always runs with an elevated notifier count. This obviates - * the need to bump the sequence count. + * .change_pte() must be surrounded by .invalidate_range_{start,end}(). + * If mmu_notifier_count is zero, then no in-progress invalidations, + * including this one, found a relevant memslot at start(); rechecking + * memslots here is unnecessary. Note, a false positive (count elevated + * by a different invalidation) is sub-optimal but functionally ok. */ - WARN_ON_ONCE(!kvm->mmu_notifier_count); + WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count)); + if (!READ_ONCE(kvm->mmu_notifier_count)) + return; kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn); } -static void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, +void kvm_inc_notifier_count(struct kvm *kvm, unsigned long start, unsigned long end) { /* @@ -658,12 +646,24 @@ static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn, trace_kvm_unmap_hva_range(range->start, range->end); + /* + * Prevent memslot modification between range_start() and range_end() + * so that conditionally locking provides the same result in both + * functions. Without that guarantee, the mmu_notifier_count + * adjustments will be imbalanced. + * + * Pairs with the decrement in range_end(). + */ + spin_lock(&kvm->mn_invalidate_lock); + kvm->mn_active_invalidate_count++; + spin_unlock(&kvm->mn_invalidate_lock); + __kvm_handle_hva_range(kvm, &hva_range); return 0; } -static void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, +void kvm_dec_notifier_count(struct kvm *kvm, unsigned long start, unsigned long end) { /* @@ -694,9 +694,22 @@ static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn, .flush_on_ret = false, .may_block = mmu_notifier_range_blockable(range), }; + bool wake; __kvm_handle_hva_range(kvm, &hva_range); + /* Pairs with the increment in range_start(). */ + spin_lock(&kvm->mn_invalidate_lock); + wake = (--kvm->mn_active_invalidate_count == 0); + spin_unlock(&kvm->mn_invalidate_lock); + + /* + * There can only be one waiter, since the wait happens under + * slots_lock. + */ + if (wake) + rcuwait_wake_up(&kvm->mn_memslots_update_rcuwait); + BUG_ON(kvm->mmu_notifier_count < 0); } @@ -897,7 +910,7 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) char dir_name[ITOA_MAX_LEN * 2]; struct kvm_stat_data *stat_data; const struct _kvm_stats_desc *pdesc; - int i; + int i, ret; int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc + kvm_vcpu_stats_header.num_desc; @@ -954,6 +967,13 @@ static int kvm_create_vm_debugfs(struct kvm *kvm, int fd) kvm->debugfs_dentry, stat_data, &stat_fops_per_vm); } + + ret = kvm_arch_create_vm_debugfs(kvm); + if (ret) { + kvm_destroy_vm_debugfs(kvm); + return i; + } + return 0; } @@ -974,6 +994,17 @@ void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm) { } +/* + * Called after per-vm debugfs created. When called kvm->debugfs_dentry should + * be setup already, so we can create arch-specific debugfs entries under it. + * Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so + * a per-arch destroy interface is not needed. + */ +int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm) +{ + return 0; +} + static struct kvm *kvm_create_vm(unsigned long type) { struct kvm *kvm = kvm_arch_alloc_vm(); @@ -991,6 +1022,9 @@ static struct kvm *kvm_create_vm(unsigned long type) mutex_init(&kvm->irq_lock); mutex_init(&kvm->slots_lock); mutex_init(&kvm->slots_arch_lock); + spin_lock_init(&kvm->mn_invalidate_lock); + rcuwait_init(&kvm->mn_memslots_update_rcuwait); + INIT_LIST_HEAD(&kvm->devices); BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX); @@ -1113,6 +1147,16 @@ static void kvm_destroy_vm(struct kvm *kvm) kvm_coalesced_mmio_free(kvm); #if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER) mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm); + /* + * At this point, pending calls to invalidate_range_start() + * have completed but no more MMU notifiers will run, so + * mn_active_invalidate_count may remain unbalanced. + * No threads can be waiting in install_new_memslots as the + * last reference on KVM has been dropped, but freeing + * memslots would deadlock without this manual intervention. + */ + WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait)); + kvm->mn_active_invalidate_count = 0; #else kvm_arch_flush_shadow_all(kvm); #endif @@ -1134,6 +1178,16 @@ void kvm_get_kvm(struct kvm *kvm) } EXPORT_SYMBOL_GPL(kvm_get_kvm); +/* + * Make sure the vm is not during destruction, which is a safe version of + * kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise. + */ +bool kvm_get_kvm_safe(struct kvm *kvm) +{ + return refcount_inc_not_zero(&kvm->users_count); +} +EXPORT_SYMBOL_GPL(kvm_get_kvm_safe); + void kvm_put_kvm(struct kvm *kvm) { if (refcount_dec_and_test(&kvm->users_count)) @@ -1194,8 +1248,8 @@ static inline void kvm_memslot_delete(struct kvm_memslots *slots, slots->used_slots--; - if (atomic_read(&slots->lru_slot) >= slots->used_slots) - atomic_set(&slots->lru_slot, 0); + if (atomic_read(&slots->last_used_slot) >= slots->used_slots) + atomic_set(&slots->last_used_slot, 0); for (i = slots->id_to_index[memslot->id]; i < slots->used_slots; i++) { mslots[i] = mslots[i + 1]; @@ -1364,7 +1418,22 @@ static struct kvm_memslots *install_new_memslots(struct kvm *kvm, WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS); slots->generation = gen | KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS; + /* + * Do not store the new memslots while there are invalidations in + * progress, otherwise the locking in invalidate_range_start and + * invalidate_range_end will be unbalanced. + */ + spin_lock(&kvm->mn_invalidate_lock); + prepare_to_rcuwait(&kvm->mn_memslots_update_rcuwait); + while (kvm->mn_active_invalidate_count) { + set_current_state(TASK_UNINTERRUPTIBLE); + spin_unlock(&kvm->mn_invalidate_lock); + schedule(); + spin_lock(&kvm->mn_invalidate_lock); + } + finish_rcuwait(&kvm->mn_memslots_update_rcuwait); rcu_assign_pointer(kvm->memslots[as_id], slots); + spin_unlock(&kvm->mn_invalidate_lock); /* * Acquired in kvm_set_memslot. Must be released before synchronize @@ -1980,7 +2049,26 @@ EXPORT_SYMBOL_GPL(gfn_to_memslot); struct kvm_memory_slot *kvm_vcpu_gfn_to_memslot(struct kvm_vcpu *vcpu, gfn_t gfn) { - return __gfn_to_memslot(kvm_vcpu_memslots(vcpu), gfn); + struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu); + struct kvm_memory_slot *slot; + int slot_index; + + slot = try_get_memslot(slots, vcpu->last_used_slot, gfn); + if (slot) + return slot; + + /* + * Fall back to searching all memslots. We purposely use + * search_memslots() instead of __gfn_to_memslot() to avoid + * thrashing the VM-wide last_used_index in kvm_memslots. + */ + slot = search_memslots(slots, gfn, &slot_index); + if (slot) { + vcpu->last_used_slot = slot_index; + return slot; + } + + return NULL; } EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_memslot); @@ -2239,7 +2327,7 @@ static int hva_to_pfn_remapped(struct vm_area_struct *vma, * Get a reference here because callers of *hva_to_pfn* and * *gfn_to_pfn* ultimately call kvm_release_pfn_clean on the * returned pfn. This is only needed if the VMA has VM_MIXEDMAP - * set, but the kvm_get_pfn/kvm_release_pfn_clean pair will + * set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will * simply do nothing for reserved pfns. * * Whoever called remap_pfn_range is also going to call e.g. @@ -2636,13 +2724,6 @@ void kvm_set_pfn_accessed(kvm_pfn_t pfn) } EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed); -void kvm_get_pfn(kvm_pfn_t pfn) -{ - if (!kvm_is_reserved_pfn(pfn)) - get_page(pfn_to_page(pfn)); -} -EXPORT_SYMBOL_GPL(kvm_get_pfn); - static int next_segment(unsigned long len, int offset) { if (len > PAGE_SIZE - offset) @@ -3122,13 +3203,23 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) ++vcpu->stat.generic.halt_successful_poll; if (!vcpu_valid_wakeup(vcpu)) ++vcpu->stat.generic.halt_poll_invalid; + + KVM_STATS_LOG_HIST_UPDATE( + vcpu->stat.generic.halt_poll_success_hist, + ktime_to_ns(ktime_get()) - + ktime_to_ns(start)); goto out; } cpu_relax(); poll_end = cur = ktime_get(); } while (kvm_vcpu_can_poll(cur, stop)); + + KVM_STATS_LOG_HIST_UPDATE( + vcpu->stat.generic.halt_poll_fail_hist, + ktime_to_ns(ktime_get()) - ktime_to_ns(start)); } + prepare_to_rcuwait(&vcpu->wait); for (;;) { set_current_state(TASK_INTERRUPTIBLE); @@ -3141,6 +3232,12 @@ void kvm_vcpu_block(struct kvm_vcpu *vcpu) } finish_rcuwait(&vcpu->wait); cur = ktime_get(); + if (waited) { + vcpu->stat.generic.halt_wait_ns += + ktime_to_ns(cur) - ktime_to_ns(poll_end); + KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist, + ktime_to_ns(cur) - ktime_to_ns(poll_end)); + } out: kvm_arch_vcpu_unblocking(vcpu); block_ns = ktime_to_ns(cur) - ktime_to_ns(start); @@ -3612,7 +3709,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_fpu *fpu = NULL; struct kvm_sregs *kvm_sregs = NULL; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) return -EIO; if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) @@ -3822,7 +3919,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, void __user *argp = compat_ptr(arg); int r; - if (vcpu->kvm->mm != current->mm) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) return -EIO; switch (ioctl) { @@ -3888,7 +3985,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_device *dev = filp->private_data; - if (dev->kvm->mm != current->mm) + if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged) return -EIO; switch (ioctl) { @@ -4210,7 +4307,7 @@ static long kvm_vm_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_bugged) return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: @@ -4421,7 +4518,7 @@ static long kvm_vm_compat_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; int r; - if (kvm->mm != current->mm) + if (kvm->mm != current->mm || kvm->vm_bugged) return -EIO; switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT @@ -4983,12 +5080,12 @@ static int kvm_debugfs_open(struct inode *inode, struct file *file, struct kvm_stat_data *stat_data = (struct kvm_stat_data *) inode->i_private; - /* The debugfs files are a reference to the kvm struct which - * is still valid when kvm_destroy_vm is called. - * To avoid the race between open and the removal of the debugfs - * directory we test against the users count. + /* + * The debugfs files are a reference to the kvm struct which + * is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe + * avoids the race between open and the removal of the debugfs directory. */ - if (!refcount_inc_not_zero(&stat_data->kvm->users_count)) + if (!kvm_get_kvm_safe(stat_data->kvm)) return -ENOENT; if (simple_attr_open(inode, file, get,