diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index f627e705e663..b268e3e18b4a 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -512,3 +512,19 @@ Date: July 2021 Contact: "Daeho Jeong" Description: You can control the multiplier value of bdi device readahead window size between 2 (default) and 256 for POSIX_FADV_SEQUENTIAL advise option. + +What: /sys/fs/f2fs//max_fragment_chunk +Date: August 2021 +Contact: "Daeho Jeong" +Description: With "mode=fragment:block" mount options, we can scatter block allocation. + f2fs will allocate 1.. blocks in a chunk and make a hole + in the length of 1.. by turns. This value can be set + between 1..512 and the default value is 4. + +What: /sys/fs/f2fs//max_fragment_hole +Date: August 2021 +Contact: "Daeho Jeong" +Description: With "mode=fragment:block" mount options, we can scatter block allocation. + f2fs will allocate 1.. blocks in a chunk and make a hole + in the length of 1.. by turns. This value can be set + between 1..512 and the default value is 4. diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst index 6f3c6e91346d..d7b84695f56a 100644 --- a/Documentation/filesystems/f2fs.rst +++ b/Documentation/filesystems/f2fs.rst @@ -197,10 +197,29 @@ fault_type=%d Support configuring fault injection type, should be FAULT_DISCARD 0x000002000 FAULT_WRITE_IO 0x000004000 FAULT_SLAB_ALLOC 0x000008000 + FAULT_DQUOT_INIT 0x000010000 =================== =========== mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. + "fragment:segment" and "fragment:block" are newly added here. + These are developer options for experiments to simulate filesystem + fragmentation/after-GC situation itself. The developers use these + modes to understand filesystem fragmentation/after-GC condition well, + and eventually get some insights to handle them better. + In "fragment:segment", f2fs allocates a new segment in ramdom + position. With this, we can simulate the after-GC condition. + In "fragment:block", we can scatter block allocation with + "max_fragment_chunk" and "max_fragment_hole" sysfs nodes. + We added some randomness to both chunk and hole size to make + it close to realistic IO pattern. So, in this mode, f2fs will allocate + 1.. blocks in a chunk and make a hole in the + length of 1.. by turns. With this, the newly + allocated blocks will be scattered throughout the whole partition. + Note that "fragment:block" implicitly enables "fragment:segment" + option for more randomness. + Please, use these options for your experiments and we strongly + recommend to re-format the filesystem after using these options. io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". usrquota Enable plain user disk quota accounting. diff --git a/Documentation/security/SCTP.rst b/Documentation/security/SCTP.rst index 406cc68b8808..d5fd6ccc3dcb 100644 --- a/Documentation/security/SCTP.rst +++ b/Documentation/security/SCTP.rst @@ -15,7 +15,10 @@ For security module support, three SCTP specific hooks have been implemented:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - security_sctp_assoc_established() + +Also the following security hook has been utilised:: + + security_inet_conn_established() The usage of these hooks are described below with the SELinux implementation described in the `SCTP SELinux Support`_ chapter. @@ -119,12 +122,11 @@ calls **sctp_peeloff**\(3). @newsk - pointer to new sock structure. -security_sctp_assoc_established() +security_inet_conn_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Called when a COOKIE ACK is received, and the peer secid will be -saved into ``@asoc->peer_secid`` for client:: +Called when a COOKIE ACK is received:: - @asoc - pointer to sctp association structure. + @sk - pointer to sock structure. @skb - pointer to skbuff of the COOKIE ACK packet. @@ -132,7 +134,7 @@ Security Hooks used for Association Establishment ------------------------------------------------- The following diagram shows the use of ``security_sctp_bind_connect()``, -``security_sctp_assoc_request()``, ``security_sctp_assoc_established()`` when +``security_sctp_assoc_request()``, ``security_inet_conn_established()`` when establishing an association. :: @@ -170,7 +172,7 @@ establishing an association. <------------------------------------------- COOKIE ACK | | sctp_sf_do_5_1E_ca | - Call security_sctp_assoc_established() | + Call security_inet_conn_established() | to set the peer label. | | | | If SCTP_SOCKET_TCP or peeled off @@ -196,7 +198,7 @@ hooks with the SELinux specifics expanded below:: security_sctp_assoc_request() security_sctp_bind_connect() security_sctp_sk_clone() - security_sctp_assoc_established() + security_inet_conn_established() security_sctp_assoc_request() @@ -269,12 +271,12 @@ sockets sid and peer sid to that contained in the ``@asoc sid`` and @newsk - pointer to new sock structure. -security_sctp_assoc_established() +security_inet_conn_established() ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Called when a COOKIE ACK is received where it sets the connection's peer sid to that in ``@skb``:: - @asoc - pointer to sctp association structure. + @sk - pointer to sock structure. @skb - pointer to skbuff of the COOKIE ACK packet. diff --git a/Documentation/virt/kvm/api.rst b/Documentation/virt/kvm/api.rst index 3b093d6dbe22..aeeb071c7688 100644 --- a/Documentation/virt/kvm/api.rst +++ b/Documentation/virt/kvm/api.rst @@ -6911,6 +6911,20 @@ MAP_SHARED mmap will result in an -EINVAL return. When enabled the VMM may make use of the ``KVM_ARM_MTE_COPY_TAGS`` ioctl to perform a bulk copy of tags to/from the guest. +7.29 KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM +------------------------------------- + +Architectures: x86 SEV enabled +Type: vm +Parameters: args[0] is the fd of the source vm +Returns: 0 on success + +This capability enables userspace to migrate the encryption context from the VM +indicated by the fd to the VM this is called on. + +This is intended to support intra-host migration of VMs between userspace VMMs, +upgrading the VMM process without interrupting the guest. + 8. Other capabilities. ====================== diff --git a/MAINTAINERS b/MAINTAINERS index d2b46613399a..41c469cf5f72 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -4676,11 +4676,10 @@ COCCINELLE/Semantic Patches (SmPL) M: Julia Lawall M: Gilles Muller M: Nicolas Palix -M: Michal Marek -L: cocci@systeme.lip6.fr (moderated for non-subscribers) +L: cocci@inria.fr (moderated for non-subscribers) S: Supported -W: http://coccinelle.lip6.fr/ -T: git git://git.kernel.org/pub/scm/linux/kernel/git/mmarek/kbuild.git misc +W: https://coccinelle.gitlabpages.inria.fr/website/ +T: git git://git.kernel.org/pub/scm/linux/kernel/git/jlawall/linux.git F: Documentation/dev-tools/coccinelle.rst F: scripts/coccicheck F: scripts/coccinelle/ diff --git a/arch/arm64/include/asm/esr.h b/arch/arm64/include/asm/esr.h index a305ce256090..d52a0b269ee8 100644 --- a/arch/arm64/include/asm/esr.h +++ b/arch/arm64/include/asm/esr.h @@ -68,6 +68,7 @@ #define ESR_ELx_EC_MAX (0x3F) #define ESR_ELx_EC_SHIFT (26) +#define ESR_ELx_EC_WIDTH (6) #define ESR_ELx_EC_MASK (UL(0x3F) << ESR_ELx_EC_SHIFT) #define ESR_ELx_EC(esr) (((esr) & ESR_ELx_EC_MASK) >> ESR_ELx_EC_SHIFT) diff --git a/arch/arm64/include/asm/kvm_host.h b/arch/arm64/include/asm/kvm_host.h index 4be8486042a7..2a5f7f38006f 100644 --- a/arch/arm64/include/asm/kvm_host.h +++ b/arch/arm64/include/asm/kvm_host.h @@ -584,7 +584,7 @@ struct kvm_vcpu_stat { u64 exits; }; -int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); +void kvm_vcpu_preferred_target(struct kvm_vcpu_init *init); unsigned long kvm_arm_num_regs(struct kvm_vcpu *vcpu); int kvm_arm_copy_reg_indices(struct kvm_vcpu *vcpu, u64 __user *indices); int kvm_arm_get_reg(struct kvm_vcpu *vcpu, const struct kvm_one_reg *reg); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index f5490afe1ebf..2f03cbfefe67 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -1389,12 +1389,9 @@ long kvm_arch_vm_ioctl(struct file *filp, return kvm_vm_ioctl_set_device_addr(kvm, &dev_addr); } case KVM_ARM_PREFERRED_TARGET: { - int err; struct kvm_vcpu_init init; - err = kvm_vcpu_preferred_target(&init); - if (err) - return err; + kvm_vcpu_preferred_target(&init); if (copy_to_user(argp, &init, sizeof(init))) return -EFAULT; diff --git a/arch/arm64/kvm/guest.c b/arch/arm64/kvm/guest.c index 5ce26bedf23c..e116c7767730 100644 --- a/arch/arm64/kvm/guest.c +++ b/arch/arm64/kvm/guest.c @@ -869,13 +869,10 @@ u32 __attribute_const__ kvm_target_cpu(void) return KVM_ARM_TARGET_GENERIC_V8; } -int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) +void kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) { u32 target = kvm_target_cpu(); - if (target < 0) - return -ENODEV; - memset(init, 0, sizeof(*init)); /* @@ -885,8 +882,6 @@ int kvm_vcpu_preferred_target(struct kvm_vcpu_init *init) * target type. */ init->target = (__u32)target; - - return 0; } int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) diff --git a/arch/arm64/kvm/hyp/hyp-entry.S b/arch/arm64/kvm/hyp/hyp-entry.S index 9aa9b73475c9..b6b6801d96d5 100644 --- a/arch/arm64/kvm/hyp/hyp-entry.S +++ b/arch/arm64/kvm/hyp/hyp-entry.S @@ -44,7 +44,7 @@ el1_sync: // Guest trapped into EL2 mrs x0, esr_el2 - lsr x0, x0, #ESR_ELx_EC_SHIFT + ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH cmp x0, #ESR_ELx_EC_HVC64 ccmp x0, #ESR_ELx_EC_HVC32, #4, ne b.ne el1_trap diff --git a/arch/arm64/kvm/hyp/nvhe/host.S b/arch/arm64/kvm/hyp/nvhe/host.S index 0c6116d34e18..3d613e721a75 100644 --- a/arch/arm64/kvm/hyp/nvhe/host.S +++ b/arch/arm64/kvm/hyp/nvhe/host.S @@ -141,7 +141,7 @@ SYM_FUNC_END(__host_hvc) .L__vect_start\@: stp x0, x1, [sp, #-16]! mrs x0, esr_el2 - lsr x0, x0, #ESR_ELx_EC_SHIFT + ubfx x0, x0, #ESR_ELx_EC_SHIFT, #ESR_ELx_EC_WIDTH cmp x0, #ESR_ELx_EC_HVC64 b.eq __host_hvc b __host_exit diff --git a/arch/arm64/kvm/hyp/nvhe/setup.c b/arch/arm64/kvm/hyp/nvhe/setup.c index 862c7b514e20..578f71798c2e 100644 --- a/arch/arm64/kvm/hyp/nvhe/setup.c +++ b/arch/arm64/kvm/hyp/nvhe/setup.c @@ -178,7 +178,7 @@ static int finalize_host_mappings_walker(u64 addr, u64 end, u32 level, phys = kvm_pte_to_phys(pte); if (!addr_is_memory(phys)) - return 0; + return -EINVAL; /* * Adjust the host stage-2 mappings to match the ownership attributes @@ -207,8 +207,18 @@ static int finalize_host_mappings(void) .cb = finalize_host_mappings_walker, .flags = KVM_PGTABLE_WALK_LEAF, }; + int i, ret; - return kvm_pgtable_walk(&pkvm_pgtable, 0, BIT(pkvm_pgtable.ia_bits), &walker); + for (i = 0; i < hyp_memblock_nr; i++) { + struct memblock_region *reg = &hyp_memory[i]; + u64 start = (u64)hyp_phys_to_virt(reg->base); + + ret = kvm_pgtable_walk(&pkvm_pgtable, start, reg->size, &walker); + if (ret) + return ret; + } + + return 0; } void __noreturn __pkvm_init_finalise(void) diff --git a/arch/arm64/kvm/hyp/nvhe/sys_regs.c b/arch/arm64/kvm/hyp/nvhe/sys_regs.c index 3787ee6fb1a2..792cf6e6ac92 100644 --- a/arch/arm64/kvm/hyp/nvhe/sys_regs.c +++ b/arch/arm64/kvm/hyp/nvhe/sys_regs.c @@ -474,7 +474,7 @@ bool kvm_handle_pvm_sysreg(struct kvm_vcpu *vcpu, u64 *exit_code) return true; } -/** +/* * Handler for protected VM restricted exceptions. * * Inject an undefined exception into the guest and return true to indicate that diff --git a/arch/mips/Kbuild.platforms b/arch/mips/Kbuild.platforms index 2c57994b5217..30193bcf9caa 100644 --- a/arch/mips/Kbuild.platforms +++ b/arch/mips/Kbuild.platforms @@ -37,4 +37,4 @@ platform-$(CONFIG_MACH_TX49XX) += txx9/ platform-$(CONFIG_MACH_VR41XX) += vr41xx/ # include the platform specific files -include $(patsubst %, $(srctree)/arch/mips/%/Platform, $(platform-y)) +include $(patsubst %/, $(srctree)/arch/mips/%/Platform, $(platform-y)) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 86510741d49d..de60ad190057 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -292,6 +292,8 @@ config BMIPS_GENERIC select USB_OHCI_BIG_ENDIAN_DESC if CPU_BIG_ENDIAN select USB_OHCI_BIG_ENDIAN_MMIO if CPU_BIG_ENDIAN select HARDIRQS_SW_RESEND + select HAVE_PCI + select PCI_DRIVERS_GENERIC help Build a generic DT-based kernel image that boots on select BCM33xx cable modem chips, BCM63xx DSL chips, and BCM7xxx set-top @@ -333,6 +335,9 @@ config BCM63XX select SYS_SUPPORTS_32BIT_KERNEL select SYS_SUPPORTS_BIG_ENDIAN select SYS_HAS_EARLY_PRINTK + select SYS_HAS_CPU_BMIPS32_3300 + select SYS_HAS_CPU_BMIPS4350 + select SYS_HAS_CPU_BMIPS4380 select SWAP_IO_SPACE select GPIOLIB select MIPS_L1_CACHE_SHIFT_4 diff --git a/arch/mips/Makefile b/arch/mips/Makefile index e036fc025ccc..ace7f033de07 100644 --- a/arch/mips/Makefile +++ b/arch/mips/Makefile @@ -253,7 +253,9 @@ endif # # Board-dependent options and extra files # +ifdef need-compiler include $(srctree)/arch/mips/Kbuild.platforms +endif ifdef CONFIG_PHYSICAL_START load-y = $(CONFIG_PHYSICAL_START) diff --git a/arch/mips/boot/compressed/.gitignore b/arch/mips/boot/compressed/.gitignore deleted file mode 100644 index d358395614c9..000000000000 --- a/arch/mips/boot/compressed/.gitignore +++ /dev/null @@ -1,3 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -ashldi3.c -bswapsi.c diff --git a/arch/mips/boot/compressed/Makefile b/arch/mips/boot/compressed/Makefile index 3548b3b45269..2861a05c2e0c 100644 --- a/arch/mips/boot/compressed/Makefile +++ b/arch/mips/boot/compressed/Makefile @@ -50,19 +50,9 @@ vmlinuzobjs-$(CONFIG_MIPS_ALCHEMY) += $(obj)/uart-alchemy.o vmlinuzobjs-$(CONFIG_ATH79) += $(obj)/uart-ath79.o endif -extra-y += uart-ath79.c -$(obj)/uart-ath79.c: $(srctree)/arch/mips/ath79/early_printk.c - $(call cmd,shipped) - vmlinuzobjs-$(CONFIG_KERNEL_XZ) += $(obj)/ashldi3.o -extra-y += ashldi3.c -$(obj)/ashldi3.c: $(obj)/%.c: $(srctree)/lib/%.c FORCE - $(call if_changed,shipped) - -extra-y += bswapsi.c -$(obj)/bswapsi.c: $(obj)/%.c: $(srctree)/arch/mips/lib/%.c FORCE - $(call if_changed,shipped) +vmlinuzobjs-$(CONFIG_KERNEL_ZSTD) += $(obj)/bswapdi.o targets := $(notdir $(vmlinuzobjs-y)) diff --git a/arch/mips/boot/compressed/ashldi3.c b/arch/mips/boot/compressed/ashldi3.c new file mode 100644 index 000000000000..f7bf6a7aae31 --- /dev/null +++ b/arch/mips/boot/compressed/ashldi3.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "../../../../lib/ashldi3.c" diff --git a/arch/mips/boot/compressed/bswapdi.c b/arch/mips/boot/compressed/bswapdi.c new file mode 100644 index 000000000000..acb28aebb025 --- /dev/null +++ b/arch/mips/boot/compressed/bswapdi.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "../../lib/bswapdi.c" diff --git a/arch/mips/boot/compressed/bswapsi.c b/arch/mips/boot/compressed/bswapsi.c new file mode 100644 index 000000000000..fdb9c6476904 --- /dev/null +++ b/arch/mips/boot/compressed/bswapsi.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "../../lib/bswapsi.c" diff --git a/arch/mips/boot/compressed/uart-ath79.c b/arch/mips/boot/compressed/uart-ath79.c new file mode 100644 index 000000000000..d686820921be --- /dev/null +++ b/arch/mips/boot/compressed/uart-ath79.c @@ -0,0 +1,2 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include "../../ath79/early_printk.c" diff --git a/arch/mips/configs/bmips_stb_defconfig b/arch/mips/configs/bmips_stb_defconfig index 625bd2d7e685..5956fb95c19f 100644 --- a/arch/mips/configs/bmips_stb_defconfig +++ b/arch/mips/configs/bmips_stb_defconfig @@ -1,6 +1,7 @@ # CONFIG_LOCALVERSION_AUTO is not set # CONFIG_SWAP is not set CONFIG_NO_HZ=y +CONFIG_HZ=1000 CONFIG_BLK_DEV_INITRD=y CONFIG_EXPERT=y # CONFIG_VM_EVENT_COUNTERS is not set @@ -8,17 +9,34 @@ CONFIG_EXPERT=y CONFIG_BMIPS_GENERIC=y CONFIG_CPU_LITTLE_ENDIAN=y CONFIG_HIGHMEM=y +CONFIG_HIGH_RES_TIMERS=y CONFIG_SMP=y CONFIG_NR_CPUS=4 +CONFIG_CC_STACKPROTECTOR_STRONG=y # CONFIG_SECCOMP is not set CONFIG_MIPS_O32_FP64_SUPPORT=y +# CONFIG_RD_GZIP is not set +# CONFIG_RD_BZIP2 is not set +# CONFIG_RD_LZMA is not set +CONFIG_RD_XZ=y +# CONFIG_RD_LZO is not set +# CONFIG_RD_LZ4 is not set +# CONFIG_IOSCHED_DEADLINE is not set +# CONFIG_IOSCHED_CFQ is not set +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_PCIEASPM_POWERSAVE=y +CONFIG_PCIEPORTBUS=y +CONFIG_PCIE_BRCMSTB=y CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_STAT_DETAILS=y +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y CONFIG_CPU_FREQ_GOV_POWERSAVE=y CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_CPU_FREQ_GOV_ONDEMAND=y CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y -CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y CONFIG_BMIPS_CPUFREQ=y # CONFIG_BLK_DEV_BSG is not set CONFIG_NET=y @@ -32,32 +50,99 @@ CONFIG_INET=y # CONFIG_INET_DIAG is not set CONFIG_CFG80211=y CONFIG_NL80211_TESTMODE=y +CONFIG_WIRELESS=y CONFIG_MAC80211=y +CONFIG_NL80211=y CONFIG_DEVTMPFS=y CONFIG_DEVTMPFS_MOUNT=y # CONFIG_STANDALONE is not set # CONFIG_PREVENT_FIRMWARE_BUILD is not set +CONFIG_BRCMSTB_GISB_ARB=y +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_IP_MROUTE=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +# CONFIG_INET_XFRM_MODE_TRANSPORT is not set +# CONFIG_INET_XFRM_MODE_TUNNEL is not set +# CONFIG_INET_XFRM_MODE_BEET is not set +# CONFIG_INET_LRO is not set +CONFIG_INET_UDP_DIAG=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=y +# CONFIG_TCP_CONG_WESTWOOD is not set +# CONFIG_TCP_CONG_HTCP is not set +# CONFIG_IPV6 is not set +CONFIG_IP_NF_IPTABLES=y +CONFIG_IP_NF_FILTER=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_XTABLES=y +CONFIG_BRIDGE=y +CONFIG_BRIDGE_NETFILTER=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_NET_DSA=y +CONFIG_NET_SWITCHDEV=y +CONFIG_DMA_CMA=y +CONFIG_CMA_ALIGNMENT=12 +CONFIG_SPI=y +CONFIG_SPI_BRCMSTB=y CONFIG_MTD=y +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_BLOCK=y CONFIG_MTD_CFI=y +CONFIG_MTD_JEDECPROBE=y CONFIG_MTD_CFI_INTELEXT=y CONFIG_MTD_CFI_AMDSTD=y -CONFIG_MTD_PHYSMAP=y +CONFIG_MTD_CFI_STAA=y +CONFIG_MTD_ROM=y +CONFIG_MTD_ABSENT=y +CONFIG_MTD_PHYSMAP_OF=y +CONFIG_MTD_M25P80=y +CONFIG_MTD_NAND=y +CONFIG_MTD_NAND_BRCMNAND=y +CONFIG_MTD_SPI_NOR=y +# CONFIG_MTD_SPI_NOR_USE_4K_SECTORS is not set +CONFIG_MTD_UBI=y +CONFIG_MTD_UBI_GLUEBI=y +CONFIG_PROC_DEVICETREE=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_RAM=y +CONFIG_BLK_DEV_RAM_SIZE=8192 # CONFIG_BLK_DEV is not set CONFIG_SCSI=y CONFIG_BLK_DEV_SD=y +CONFIG_CHR_DEV_SG=y +CONFIG_SCSI_MULTI_LUN=y # CONFIG_SCSI_LOWLEVEL is not set CONFIG_NETDEVICES=y +CONFIG_VLAN_8021Q=y +CONFIG_MACVLAN=y CONFIG_BCMGENET=y CONFIG_USB_USBNET=y -# CONFIG_INPUT is not set +CONFIG_INPUT_EVDEV=y +# CONFIG_INPUT_KEYBOARD is not set +# CONFIG_INPUT_MOUSE is not set +CONFIG_INPUT_MISC=y +CONFIG_INPUT_UINPUT=y # CONFIG_SERIO is not set -# CONFIG_VT is not set +CONFIG_VT=y +CONFIG_VT_HW_CONSOLE_BINDING=y +# CONFIG_DEVKMEM is not set CONFIG_SERIAL_8250=y # CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_OF_PLATFORM=y # CONFIG_HW_RANDOM is not set CONFIG_POWER_RESET=y +CONFIG_POWER_RESET_BRCMSTB=y CONFIG_POWER_RESET_SYSCON=y CONFIG_POWER_SUPPLY=y # CONFIG_HWMON is not set @@ -69,22 +154,76 @@ CONFIG_USB_OHCI_HCD=y CONFIG_USB_OHCI_HCD_PLATFORM=y CONFIG_USB_STORAGE=y CONFIG_SOC_BRCMSTB=y +CONFIG_MMC=y +CONFIG_MMC_BLOCK_MINORS=16 +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y CONFIG_EXT4_FS=y CONFIG_EXT4_FS_POSIX_ACL=y CONFIG_EXT4_FS_SECURITY=y # CONFIG_DNOTIFY is not set -CONFIG_FUSE_FS=y -CONFIG_VFAT_FS=y CONFIG_PROC_KCORE=y -CONFIG_TMPFS=y -CONFIG_NFS_FS=y CONFIG_CIFS=y +CONFIG_JBD2_DEBUG=y +CONFIG_FUSE_FS=y +CONFIG_FHANDLE=y +CONFIG_CGROUPS=y +CONFIG_CUSE=y +CONFIG_ISO9660_FS=y +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_JFFS2_FS=y +CONFIG_UBIFS_FS=y +CONFIG_SQUASHFS=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_NFS_FS=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_ROOT_NFS=y CONFIG_NLS_CODEPAGE_437=y -CONFIG_NLS_ASCII=y CONFIG_NLS_ISO8859_1=y -# CONFIG_CRYPTO_HW is not set CONFIG_PRINTK_TIME=y +CONFIG_DYNAMIC_DEBUG=y +# CONFIG_DEBUG_INFO is not set +# CONFIG_DEBUG_INFO_REDUCED is not set CONFIG_DEBUG_FS=y CONFIG_MAGIC_SYSRQ=y +CONFIG_LOCKUP_DETECTOR=y +CONFIG_DEBUG_USER=y CONFIG_CMDLINE_BOOL=y CONFIG_CMDLINE="earlycon" +# CONFIG_MIPS_CMDLINE_FROM_DTB is not set +CONFIG_MIPS_CMDLINE_DTB_EXTEND=y +# CONFIG_MIPS_CMDLINE_FROM_BOOTLOADER is not set +# CONFIG_CRYPTO_HW is not set +CONFIG_DT_BCM974XX=y +CONFIG_FW_CFE=y +CONFIG_ATA=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_AHCI_BRCMSTB=y +CONFIG_GENERIC_PHY=y +CONFIG_GPIOLIB=y +CONFIG_GPIO_SYSFS=y +CONFIG_PHY_BRCM_USB=y +CONFIG_PHY_BRCM_SATA=y +CONFIG_PM_RUNTIME=y +CONFIG_PM_DEBUG=y +CONFIG_SYSVIPC=y +CONFIG_FUNCTION_GRAPH_TRACER=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_FUNCTION_TRACER=y +CONFIG_FUNCTION_PROFILER=y +CONFIG_IRQSOFF_TRACER=y +CONFIG_SCHED_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_FTRACE_SYSCALLS=y +CONFIG_TRACER_SNAPSHOT=y +CONFIG_TRACER_SNAPSHOT_PER_CPU_SWAP=y +CONFIG_STACK_TRACER=y diff --git a/arch/mips/dec/setup.c b/arch/mips/dec/setup.c index eaad0ed4b523..a8a30bb1dee8 100644 --- a/arch/mips/dec/setup.c +++ b/arch/mips/dec/setup.c @@ -117,21 +117,21 @@ static void __init dec_be_init(void) { switch (mips_machtype) { case MACH_DS23100: /* DS2100/DS3100 Pmin/Pmax */ - board_be_handler = dec_kn01_be_handler; + mips_set_be_handler(dec_kn01_be_handler); busirq_handler = dec_kn01_be_interrupt; busirq_flags |= IRQF_SHARED; dec_kn01_be_init(); break; case MACH_DS5000_1XX: /* DS5000/1xx 3min */ case MACH_DS5000_XX: /* DS5000/xx Maxine */ - board_be_handler = dec_kn02xa_be_handler; + mips_set_be_handler(dec_kn02xa_be_handler); busirq_handler = dec_kn02xa_be_interrupt; dec_kn02xa_be_init(); break; case MACH_DS5000_200: /* DS5000/200 3max */ case MACH_DS5000_2X0: /* DS5000/240 3max+ */ case MACH_DS5900: /* DS5900 bigmax */ - board_be_handler = dec_ecc_be_handler; + mips_set_be_handler(dec_ecc_be_handler); busirq_handler = dec_ecc_be_interrupt; dec_ecc_be_init(); break; diff --git a/arch/mips/include/asm/traps.h b/arch/mips/include/asm/traps.h index b710e76c9c65..15cde638b407 100644 --- a/arch/mips/include/asm/traps.h +++ b/arch/mips/include/asm/traps.h @@ -15,7 +15,7 @@ #define MIPS_BE_FATAL 2 /* treat as an unrecoverable error */ extern void (*board_be_init)(void); -extern int (*board_be_handler)(struct pt_regs *regs, int is_fixup); +void mips_set_be_handler(int (*handler)(struct pt_regs *reg, int is_fixup)); extern void (*board_nmi_handler_setup)(void); extern void (*board_ejtag_handler_setup)(void); diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 6f07362de5ce..d26b0fb8ea06 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -103,13 +103,19 @@ extern asmlinkage void handle_reserved(void); extern void tlb_do_page_fault_0(void); void (*board_be_init)(void); -int (*board_be_handler)(struct pt_regs *regs, int is_fixup); +static int (*board_be_handler)(struct pt_regs *regs, int is_fixup); void (*board_nmi_handler_setup)(void); void (*board_ejtag_handler_setup)(void); void (*board_bind_eic_interrupt)(int irq, int regset); void (*board_ebase_setup)(void); void(*board_cache_error_setup)(void); +void mips_set_be_handler(int (*handler)(struct pt_regs *regs, int is_fixup)) +{ + board_be_handler = handler; +} +EXPORT_SYMBOL_GPL(mips_set_be_handler); + static void show_raw_backtrace(unsigned long reg29, const char *loglvl, bool user) { diff --git a/arch/mips/sgi-ip22/ip22-berr.c b/arch/mips/sgi-ip22/ip22-berr.c index dc0110a607a5..afe8a61078e4 100644 --- a/arch/mips/sgi-ip22/ip22-berr.c +++ b/arch/mips/sgi-ip22/ip22-berr.c @@ -112,5 +112,5 @@ static int ip22_be_handler(struct pt_regs *regs, int is_fixup) void __init ip22_be_init(void) { - board_be_handler = ip22_be_handler; + mips_set_be_handler(ip22_be_handler); } diff --git a/arch/mips/sgi-ip22/ip28-berr.c b/arch/mips/sgi-ip22/ip28-berr.c index c61362d9ea95..16ca470deb80 100644 --- a/arch/mips/sgi-ip22/ip28-berr.c +++ b/arch/mips/sgi-ip22/ip28-berr.c @@ -468,7 +468,7 @@ static int ip28_be_handler(struct pt_regs *regs, int is_fixup) void __init ip22_be_init(void) { - board_be_handler = ip28_be_handler; + mips_set_be_handler(ip28_be_handler); } int ip28_show_be_info(struct seq_file *m) diff --git a/arch/mips/sgi-ip27/ip27-berr.c b/arch/mips/sgi-ip27/ip27-berr.c index 5a38ae6bdfa9..923a63a51cda 100644 --- a/arch/mips/sgi-ip27/ip27-berr.c +++ b/arch/mips/sgi-ip27/ip27-berr.c @@ -85,7 +85,7 @@ void __init ip27_be_init(void) int cpu = LOCAL_HUB_L(PI_CPU_NUM); int cpuoff = cpu << 8; - board_be_handler = ip27_be_handler; + mips_set_be_handler(ip27_be_handler); LOCAL_HUB_S(PI_ERR_INT_PEND, cpu ? PI_ERR_CLEAR_ALL_B : PI_ERR_CLEAR_ALL_A); diff --git a/arch/mips/sgi-ip32/ip32-berr.c b/arch/mips/sgi-ip32/ip32-berr.c index c860f95ab7ed..478b63b4c808 100644 --- a/arch/mips/sgi-ip32/ip32-berr.c +++ b/arch/mips/sgi-ip32/ip32-berr.c @@ -34,5 +34,5 @@ static int ip32_be_handler(struct pt_regs *regs, int is_fixup) void __init ip32_be_init(void) { - board_be_handler = ip32_be_handler; + mips_set_be_handler(ip32_be_handler); } diff --git a/arch/mips/sibyte/swarm/setup.c b/arch/mips/sibyte/swarm/setup.c index f07b15dd1c1a..72a31eeeebba 100644 --- a/arch/mips/sibyte/swarm/setup.c +++ b/arch/mips/sibyte/swarm/setup.c @@ -122,7 +122,7 @@ void __init plat_mem_setup(void) #error invalid SiByte board configuration #endif - board_be_handler = swarm_be_handler; + mips_set_be_handler(swarm_be_handler); if (xicor_probe()) swarm_rtc_type = RTC_XICOR; diff --git a/arch/mips/txx9/generic/setup_tx4927.c b/arch/mips/txx9/generic/setup_tx4927.c index 46e9c4101386..63f9725b2eb0 100644 --- a/arch/mips/txx9/generic/setup_tx4927.c +++ b/arch/mips/txx9/generic/setup_tx4927.c @@ -80,7 +80,7 @@ static int tx4927_be_handler(struct pt_regs *regs, int is_fixup) } static void __init tx4927_be_init(void) { - board_be_handler = tx4927_be_handler; + mips_set_be_handler(tx4927_be_handler); } static struct resource tx4927_sdram_resource[4]; diff --git a/arch/mips/txx9/generic/setup_tx4938.c b/arch/mips/txx9/generic/setup_tx4938.c index 17395d5d15ca..ba646548c5f6 100644 --- a/arch/mips/txx9/generic/setup_tx4938.c +++ b/arch/mips/txx9/generic/setup_tx4938.c @@ -82,7 +82,7 @@ static int tx4938_be_handler(struct pt_regs *regs, int is_fixup) } static void __init tx4938_be_init(void) { - board_be_handler = tx4938_be_handler; + mips_set_be_handler(tx4938_be_handler); } static struct resource tx4938_sdram_resource[4]; diff --git a/arch/mips/txx9/generic/setup_tx4939.c b/arch/mips/txx9/generic/setup_tx4939.c index bf8a3cdababf..f5f59b7401a3 100644 --- a/arch/mips/txx9/generic/setup_tx4939.c +++ b/arch/mips/txx9/generic/setup_tx4939.c @@ -86,7 +86,7 @@ static int tx4939_be_handler(struct pt_regs *regs, int is_fixup) } static void __init tx4939_be_init(void) { - board_be_handler = tx4939_be_handler; + mips_set_be_handler(tx4939_be_handler); } static struct resource tx4939_sdram_resource[4]; diff --git a/arch/mips/vdso/Makefile b/arch/mips/vdso/Makefile index 1b2ea34c3d3b..d65f55f67e19 100644 --- a/arch/mips/vdso/Makefile +++ b/arch/mips/vdso/Makefile @@ -57,7 +57,7 @@ endif # VDSO linker flags. ldflags-y := -Bsymbolic --no-undefined -soname=linux-vdso.so.1 \ - $(filter -E%,$(KBUILD_CFLAGS)) -nostdlib -shared \ + $(filter -E%,$(KBUILD_CFLAGS)) -shared \ -G 0 --eh-frame-hdr --hash-style=sysv --build-id=sha1 -T CFLAGS_REMOVE_vdso.o = $(CC_FLAGS_FTRACE) diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index a34c531be4e7..821252b65f89 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -62,6 +62,7 @@ config RISCV select GENERIC_SCHED_CLOCK select GENERIC_SMP_IDLE_THREAD select GENERIC_TIME_VSYSCALL if MMU && 64BIT + select GENERIC_VDSO_TIME_NS if HAVE_GENERIC_VDSO select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL select HAVE_ARCH_JUMP_LABEL_RELATIVE if !XIP_KERNEL diff --git a/arch/riscv/Makefile b/arch/riscv/Makefile index 7f19b784e649..5927c94302b8 100644 --- a/arch/riscv/Makefile +++ b/arch/riscv/Makefile @@ -136,3 +136,13 @@ zinstall: install-image = Image.gz install zinstall: $(CONFIG_SHELL) $(srctree)/$(boot)/install.sh $(KERNELRELEASE) \ $(boot)/$(install-image) System.map "$(INSTALL_PATH)" + +PHONY += rv32_randconfig +rv32_randconfig: + $(Q)$(MAKE) KCONFIG_ALLCONFIG=$(srctree)/arch/riscv/configs/32-bit.config \ + -f $(srctree)/Makefile randconfig + +PHONY += rv64_randconfig +rv64_randconfig: + $(Q)$(MAKE) KCONFIG_ALLCONFIG=$(srctree)/arch/riscv/configs/64-bit.config \ + -f $(srctree)/Makefile randconfig diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts index b254c60589a1..fc1e5869df1b 100644 --- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts @@ -9,10 +9,8 @@ #define RTCCLK_FREQ 1000000 / { - #address-cells = <2>; - #size-cells = <2>; model = "Microchip PolarFire-SoC Icicle Kit"; - compatible = "microchip,mpfs-icicle-kit"; + compatible = "microchip,mpfs-icicle-kit", "microchip,mpfs"; aliases { ethernet0 = &emac1; @@ -35,9 +33,6 @@ reg = <0x0 0x80000000 0x0 0x40000000>; clocks = <&clkcfg 26>; }; - - soc { - }; }; &serial0 { @@ -56,8 +51,17 @@ status = "okay"; }; -&sdcard { +&mmc { status = "okay"; + + bus-width = <4>; + disable-wp; + cap-sd-highspeed; + card-detect-delay = <200>; + sd-uhs-sdr12; + sd-uhs-sdr25; + sd-uhs-sdr50; + sd-uhs-sdr104; }; &emac0 { diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi index 9d2fbbc1f777..c9f6d205d2ba 100644 --- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi +++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi @@ -6,8 +6,8 @@ / { #address-cells = <2>; #size-cells = <2>; - model = "Microchip MPFS Icicle Kit"; - compatible = "microchip,mpfs-icicle-kit"; + model = "Microchip PolarFire SoC"; + compatible = "microchip,mpfs"; chosen { }; @@ -161,7 +161,7 @@ }; clint@2000000 { - compatible = "sifive,clint0"; + compatible = "sifive,fu540-c000-clint", "sifive,clint0"; reg = <0x0 0x2000000 0x0 0xC000>; interrupts-extended = <&cpu0_intc 3 &cpu0_intc 7 &cpu1_intc 3 &cpu1_intc 7 @@ -172,7 +172,7 @@ plic: interrupt-controller@c000000 { #interrupt-cells = <1>; - compatible = "sifive,plic-1.0.0"; + compatible = "sifive,fu540-c000-plic", "sifive,plic-1.0.0"; reg = <0x0 0xc000000 0x0 0x4000000>; riscv,ndev = <186>; interrupt-controller; @@ -262,39 +262,13 @@ status = "disabled"; }; - emmc: mmc@20008000 { - compatible = "cdns,sd4hc"; + /* Common node entry for emmc/sd */ + mmc: mmc@20008000 { + compatible = "microchip,mpfs-sd4hc", "cdns,sd4hc"; reg = <0x0 0x20008000 0x0 0x1000>; interrupt-parent = <&plic>; interrupts = <88 89>; - pinctrl-names = "default"; clocks = <&clkcfg 6>; - bus-width = <4>; - cap-mmc-highspeed; - mmc-ddr-3_3v; - max-frequency = <200000000>; - non-removable; - no-sd; - no-sdio; - voltage-ranges = <3300 3300>; - status = "disabled"; - }; - - sdcard: sdhc@20008000 { - compatible = "cdns,sd4hc"; - reg = <0x0 0x20008000 0x0 0x1000>; - interrupt-parent = <&plic>; - interrupts = <88>; - pinctrl-names = "default"; - clocks = <&clkcfg 6>; - bus-width = <4>; - disable-wp; - cap-sd-highspeed; - card-detect-delay = <200>; - sd-uhs-sdr12; - sd-uhs-sdr25; - sd-uhs-sdr50; - sd-uhs-sdr104; max-frequency = <200000000>; status = "disabled"; }; diff --git a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi index 7db861053483..0655b5c4201d 100644 --- a/arch/riscv/boot/dts/sifive/fu540-c000.dtsi +++ b/arch/riscv/boot/dts/sifive/fu540-c000.dtsi @@ -141,7 +141,7 @@ ranges; plic0: interrupt-controller@c000000 { #interrupt-cells = <1>; - compatible = "sifive,plic-1.0.0"; + compatible = "sifive,fu540-c000-plic", "sifive,plic-1.0.0"; reg = <0x0 0xc000000 0x0 0x4000000>; riscv,ndev = <53>; interrupt-controller; diff --git a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts index 60846e88ae4b..ba304d4c455c 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unleashed-a00.dts @@ -8,10 +8,9 @@ #define RTCCLK_FREQ 1000000 / { - #address-cells = <2>; - #size-cells = <2>; model = "SiFive HiFive Unleashed A00"; - compatible = "sifive,hifive-unleashed-a00", "sifive,fu540-c000"; + compatible = "sifive,hifive-unleashed-a00", "sifive,fu540-c000", + "sifive,fu540"; chosen { stdout-path = "serial0"; @@ -26,9 +25,6 @@ reg = <0x0 0x80000000 0x2 0x00000000>; }; - soc { - }; - hfclk: hfclk { #clock-cells = <0>; compatible = "fixed-clock"; @@ -63,7 +59,7 @@ &qspi0 { status = "okay"; flash@0 { - compatible = "issi,is25wp256", "jedec,spi-nor"; + compatible = "jedec,spi-nor"; reg = <0>; spi-max-frequency = <50000000>; m25p,fast-read; diff --git a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts index 2e4ea84f27e7..4f66919215f6 100644 --- a/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts +++ b/arch/riscv/boot/dts/sifive/hifive-unmatched-a00.dts @@ -8,8 +8,6 @@ #define RTCCLK_FREQ 1000000 / { - #address-cells = <2>; - #size-cells = <2>; model = "SiFive HiFive Unmatched A00"; compatible = "sifive,hifive-unmatched-a00", "sifive,fu740-c000", "sifive,fu740"; @@ -27,9 +25,6 @@ reg = <0x0 0x80000000 0x4 0x00000000>; }; - soc { - }; - hfclk: hfclk { #clock-cells = <0>; compatible = "fixed-clock"; @@ -211,7 +206,7 @@ &qspi0 { status = "okay"; flash@0 { - compatible = "issi,is25wp256", "jedec,spi-nor"; + compatible = "jedec,spi-nor"; reg = <0>; spi-max-frequency = <50000000>; m25p,fast-read; diff --git a/arch/riscv/configs/32-bit.config b/arch/riscv/configs/32-bit.config new file mode 100644 index 000000000000..43f41323b67e --- /dev/null +++ b/arch/riscv/configs/32-bit.config @@ -0,0 +1,2 @@ +CONFIG_ARCH_RV32I=y +CONFIG_32BIT=y diff --git a/arch/riscv/configs/64-bit.config b/arch/riscv/configs/64-bit.config new file mode 100644 index 000000000000..313edc554d84 --- /dev/null +++ b/arch/riscv/configs/64-bit.config @@ -0,0 +1,2 @@ +CONFIG_ARCH_RV64I=y +CONFIG_64BIT=y diff --git a/arch/riscv/configs/defconfig b/arch/riscv/configs/defconfig index 4ebc80315f01..c252fd5706d2 100644 --- a/arch/riscv/configs/defconfig +++ b/arch/riscv/configs/defconfig @@ -72,9 +72,10 @@ CONFIG_GPIOLIB=y CONFIG_GPIO_SIFIVE=y # CONFIG_PTP_1588_CLOCK is not set CONFIG_POWER_RESET=y -CONFIG_DRM=y -CONFIG_DRM_RADEON=y -CONFIG_DRM_VIRTIO_GPU=y +CONFIG_DRM=m +CONFIG_DRM_RADEON=m +CONFIG_DRM_NOUVEAU=m +CONFIG_DRM_VIRTIO_GPU=m CONFIG_FRAMEBUFFER_CONSOLE=y CONFIG_USB=y CONFIG_USB_XHCI_HCD=y diff --git a/arch/riscv/include/asm/page.h b/arch/riscv/include/asm/page.h index 109c97e991a6..b3e5ff0125fe 100644 --- a/arch/riscv/include/asm/page.h +++ b/arch/riscv/include/asm/page.h @@ -157,6 +157,8 @@ extern phys_addr_t __phys_addr_symbol(unsigned long x); #define page_to_bus(page) (page_to_phys(page)) #define phys_to_page(paddr) (pfn_to_page(phys_to_pfn(paddr))) +#define sym_to_pfn(x) __phys_to_pfn(__pa_symbol(x)) + #ifdef CONFIG_FLATMEM #define pfn_valid(pfn) \ (((pfn) >= ARCH_PFN_OFFSET) && (((pfn) - ARCH_PFN_OFFSET) < max_mapnr)) diff --git a/arch/riscv/include/asm/pgtable.h b/arch/riscv/include/asm/pgtable.h index 39b550310ec6..bf204e7c1f74 100644 --- a/arch/riscv/include/asm/pgtable.h +++ b/arch/riscv/include/asm/pgtable.h @@ -75,7 +75,8 @@ #endif #ifdef CONFIG_XIP_KERNEL -#define XIP_OFFSET SZ_8M +#define XIP_OFFSET SZ_32M +#define XIP_OFFSET_MASK (SZ_32M - 1) #else #define XIP_OFFSET 0 #endif @@ -97,7 +98,8 @@ #ifdef CONFIG_XIP_KERNEL #define XIP_FIXUP(addr) ({ \ uintptr_t __a = (uintptr_t)(addr); \ - (__a >= CONFIG_XIP_PHYS_ADDR && __a < CONFIG_XIP_PHYS_ADDR + SZ_16M) ? \ + (__a >= CONFIG_XIP_PHYS_ADDR && \ + __a < CONFIG_XIP_PHYS_ADDR + XIP_OFFSET * 2) ? \ __a - CONFIG_XIP_PHYS_ADDR + CONFIG_PHYS_RAM_BASE - XIP_OFFSET :\ __a; \ }) diff --git a/arch/riscv/include/asm/vdso.h b/arch/riscv/include/asm/vdso.h index 208e31bc5d1c..bc6f75f3a199 100644 --- a/arch/riscv/include/asm/vdso.h +++ b/arch/riscv/include/asm/vdso.h @@ -8,30 +8,19 @@ #ifndef _ASM_RISCV_VDSO_H #define _ASM_RISCV_VDSO_H - /* * All systems with an MMU have a VDSO, but systems without an MMU don't * support shared libraries and therefor don't have one. */ #ifdef CONFIG_MMU -#include -/* - * All systems with an MMU have a VDSO, but systems without an MMU don't - * support shared libraries and therefor don't have one. - */ -#ifdef CONFIG_MMU - -#define __VVAR_PAGES 1 +#define __VVAR_PAGES 2 #ifndef __ASSEMBLY__ #include #define VDSO_SYMBOL(base, name) \ (void __user *)((unsigned long)(base) + __vdso_##name##_offset) - -#endif /* CONFIG_MMU */ - #endif /* !__ASSEMBLY__ */ #endif /* CONFIG_MMU */ diff --git a/arch/riscv/include/asm/vdso/gettimeofday.h b/arch/riscv/include/asm/vdso/gettimeofday.h index f839f16e0d2a..77d9c2f721c4 100644 --- a/arch/riscv/include/asm/vdso/gettimeofday.h +++ b/arch/riscv/include/asm/vdso/gettimeofday.h @@ -76,6 +76,13 @@ static __always_inline const struct vdso_data *__arch_get_vdso_data(void) return _vdso_data; } +#ifdef CONFIG_TIME_NS +static __always_inline +const struct vdso_data *__arch_get_timens_vdso_data(const struct vdso_data *vd) +{ + return _timens_data; +} +#endif #endif /* !__ASSEMBLY__ */ #endif /* __ASM_VDSO_GETTIMEOFDAY_H */ diff --git a/arch/riscv/kernel/head.S b/arch/riscv/kernel/head.S index 25ec50573957..f52f01ecbeea 100644 --- a/arch/riscv/kernel/head.S +++ b/arch/riscv/kernel/head.S @@ -20,10 +20,20 @@ REG_L t0, _xip_fixup add \reg, \reg, t0 .endm +.macro XIP_FIXUP_FLASH_OFFSET reg + la t1, __data_loc + li t0, XIP_OFFSET_MASK + and t1, t1, t0 + li t1, XIP_OFFSET + sub t0, t0, t1 + sub \reg, \reg, t0 +.endm _xip_fixup: .dword CONFIG_PHYS_RAM_BASE - CONFIG_XIP_PHYS_ADDR - XIP_OFFSET #else .macro XIP_FIXUP_OFFSET reg .endm +.macro XIP_FIXUP_FLASH_OFFSET reg +.endm #endif /* CONFIG_XIP_KERNEL */ __HEAD @@ -267,6 +277,7 @@ pmp_done: la a3, hart_lottery mv a2, a3 XIP_FIXUP_OFFSET a2 + XIP_FIXUP_FLASH_OFFSET a3 lw t1, (a3) amoswap.w t0, t1, (a2) /* first time here if hart_lottery in RAM is not set */ @@ -305,6 +316,7 @@ clear_bss_done: XIP_FIXUP_OFFSET sp #ifdef CONFIG_BUILTIN_DTB la a0, __dtb_start + XIP_FIXUP_OFFSET a0 #else mv a0, s1 #endif /* CONFIG_BUILTIN_DTB */ diff --git a/arch/riscv/kernel/reset.c b/arch/riscv/kernel/reset.c index ee5878d968cc..9c842c41684a 100644 --- a/arch/riscv/kernel/reset.c +++ b/arch/riscv/kernel/reset.c @@ -12,7 +12,7 @@ static void default_power_off(void) wait_for_interrupt(); } -void (*pm_power_off)(void) = default_power_off; +void (*pm_power_off)(void) = NULL; EXPORT_SYMBOL(pm_power_off); void machine_restart(char *cmd) @@ -23,10 +23,16 @@ void machine_restart(char *cmd) void machine_halt(void) { - pm_power_off(); + if (pm_power_off != NULL) + pm_power_off(); + else + default_power_off(); } void machine_power_off(void) { - pm_power_off(); + if (pm_power_off != NULL) + pm_power_off(); + else + default_power_off(); } diff --git a/arch/riscv/kernel/vdso.c b/arch/riscv/kernel/vdso.c index b70956d80408..a9436a65161a 100644 --- a/arch/riscv/kernel/vdso.c +++ b/arch/riscv/kernel/vdso.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_GENERIC_TIME_VSYSCALL #include @@ -25,14 +26,12 @@ extern char vdso_start[], vdso_end[]; enum vvar_pages { VVAR_DATA_PAGE_OFFSET, + VVAR_TIMENS_PAGE_OFFSET, VVAR_NR_PAGES, }; #define VVAR_SIZE (VVAR_NR_PAGES << PAGE_SHIFT) -static unsigned int vdso_pages __ro_after_init; -static struct page **vdso_pagelist __ro_after_init; - /* * The vDSO data page. */ @@ -42,83 +41,228 @@ static union { } vdso_data_store __page_aligned_data; struct vdso_data *vdso_data = &vdso_data_store.data; -static int __init vdso_init(void) +struct __vdso_info { + const char *name; + const char *vdso_code_start; + const char *vdso_code_end; + unsigned long vdso_pages; + /* Data Mapping */ + struct vm_special_mapping *dm; + /* Code Mapping */ + struct vm_special_mapping *cm; +}; + +static struct __vdso_info vdso_info __ro_after_init = { + .name = "vdso", + .vdso_code_start = vdso_start, + .vdso_code_end = vdso_end, +}; + +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) { - unsigned int i; - - vdso_pages = (vdso_end - vdso_start) >> PAGE_SHIFT; - vdso_pagelist = - kcalloc(vdso_pages + VVAR_NR_PAGES, sizeof(struct page *), GFP_KERNEL); - if (unlikely(vdso_pagelist == NULL)) { - pr_err("vdso: pagelist allocation failed\n"); - return -ENOMEM; - } - - for (i = 0; i < vdso_pages; i++) { - struct page *pg; - - pg = virt_to_page(vdso_start + (i << PAGE_SHIFT)); - vdso_pagelist[i] = pg; - } - vdso_pagelist[i] = virt_to_page(vdso_data); + current->mm->context.vdso = (void *)new_vma->vm_start; return 0; } + +static int __init __vdso_init(void) +{ + unsigned int i; + struct page **vdso_pagelist; + unsigned long pfn; + + if (memcmp(vdso_info.vdso_code_start, "\177ELF", 4)) { + pr_err("vDSO is not a valid ELF object!\n"); + return -EINVAL; + } + + vdso_info.vdso_pages = ( + vdso_info.vdso_code_end - + vdso_info.vdso_code_start) >> + PAGE_SHIFT; + + vdso_pagelist = kcalloc(vdso_info.vdso_pages, + sizeof(struct page *), + GFP_KERNEL); + if (vdso_pagelist == NULL) + return -ENOMEM; + + /* Grab the vDSO code pages. */ + pfn = sym_to_pfn(vdso_info.vdso_code_start); + + for (i = 0; i < vdso_info.vdso_pages; i++) + vdso_pagelist[i] = pfn_to_page(pfn + i); + + vdso_info.cm->pages = vdso_pagelist; + + return 0; +} + +#ifdef CONFIG_TIME_NS +struct vdso_data *arch_get_vdso_data(void *vvar_page) +{ + return (struct vdso_data *)(vvar_page); +} + +/* + * The vvar mapping contains data for a specific time namespace, so when a task + * changes namespace we must unmap its vvar data for the old namespace. + * Subsequent faults will map in data for the new namespace. + * + * For more details see timens_setup_vdso_data(). + */ +int vdso_join_timens(struct task_struct *task, struct time_namespace *ns) +{ + struct mm_struct *mm = task->mm; + struct vm_area_struct *vma; + + mmap_read_lock(mm); + + for (vma = mm->mmap; vma; vma = vma->vm_next) { + unsigned long size = vma->vm_end - vma->vm_start; + + if (vma_is_special_mapping(vma, vdso_info.dm)) + zap_page_range(vma, vma->vm_start, size); + } + + mmap_read_unlock(mm); + return 0; +} + +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + if (likely(vma->vm_mm == current->mm)) + return current->nsproxy->time_ns->vvar_page; + + /* + * VM_PFNMAP | VM_IO protect .fault() handler from being called + * through interfaces like /proc/$pid/mem or + * process_vm_{readv,writev}() as long as there's no .access() + * in special_mapping_vmops. + * For more details check_vma_flags() and __access_remote_vm() + */ + WARN(1, "vvar_page accessed remotely"); + + return NULL; +} +#else +static struct page *find_timens_vvar_page(struct vm_area_struct *vma) +{ + return NULL; +} +#endif + +static vm_fault_t vvar_fault(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct page *timens_page = find_timens_vvar_page(vma); + unsigned long pfn; + + switch (vmf->pgoff) { + case VVAR_DATA_PAGE_OFFSET: + if (timens_page) + pfn = page_to_pfn(timens_page); + else + pfn = sym_to_pfn(vdso_data); + break; +#ifdef CONFIG_TIME_NS + case VVAR_TIMENS_PAGE_OFFSET: + /* + * If a task belongs to a time namespace then a namespace + * specific VVAR is mapped with the VVAR_DATA_PAGE_OFFSET and + * the real VVAR page is mapped with the VVAR_TIMENS_PAGE_OFFSET + * offset. + * See also the comment near timens_setup_vdso_data(). + */ + if (!timens_page) + return VM_FAULT_SIGBUS; + pfn = sym_to_pfn(vdso_data); + break; +#endif /* CONFIG_TIME_NS */ + default: + return VM_FAULT_SIGBUS; + } + + return vmf_insert_pfn(vma, vmf->address, pfn); +} + +enum rv_vdso_map { + RV_VDSO_MAP_VVAR, + RV_VDSO_MAP_VDSO, +}; + +static struct vm_special_mapping rv_vdso_maps[] __ro_after_init = { + [RV_VDSO_MAP_VVAR] = { + .name = "[vvar]", + .fault = vvar_fault, + }, + [RV_VDSO_MAP_VDSO] = { + .name = "[vdso]", + .mremap = vdso_mremap, + }, +}; + +static int __init vdso_init(void) +{ + vdso_info.dm = &rv_vdso_maps[RV_VDSO_MAP_VVAR]; + vdso_info.cm = &rv_vdso_maps[RV_VDSO_MAP_VDSO]; + + return __vdso_init(); +} arch_initcall(vdso_init); -int arch_setup_additional_pages(struct linux_binprm *bprm, - int uses_interp) +static int __setup_additional_pages(struct mm_struct *mm, + struct linux_binprm *bprm, + int uses_interp) { - struct mm_struct *mm = current->mm; - unsigned long vdso_base, vdso_len; - int ret; + unsigned long vdso_base, vdso_text_len, vdso_mapping_len; + void *ret; BUILD_BUG_ON(VVAR_NR_PAGES != __VVAR_PAGES); - vdso_len = (vdso_pages + VVAR_NR_PAGES) << PAGE_SHIFT; + vdso_text_len = vdso_info.vdso_pages << PAGE_SHIFT; + /* Be sure to map the data page */ + vdso_mapping_len = vdso_text_len + VVAR_SIZE; + + vdso_base = get_unmapped_area(NULL, 0, vdso_mapping_len, 0, 0); + if (IS_ERR_VALUE(vdso_base)) { + ret = ERR_PTR(vdso_base); + goto up_fail; + } + + ret = _install_special_mapping(mm, vdso_base, VVAR_SIZE, + (VM_READ | VM_MAYREAD | VM_PFNMAP), vdso_info.dm); + if (IS_ERR(ret)) + goto up_fail; + + vdso_base += VVAR_SIZE; + mm->context.vdso = (void *)vdso_base; + ret = + _install_special_mapping(mm, vdso_base, vdso_text_len, + (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), + vdso_info.cm); + + if (IS_ERR(ret)) + goto up_fail; + + return 0; + +up_fail: + mm->context.vdso = NULL; + return PTR_ERR(ret); +} + +int arch_setup_additional_pages(struct linux_binprm *bprm, int uses_interp) +{ + struct mm_struct *mm = current->mm; + int ret; if (mmap_write_lock_killable(mm)) return -EINTR; - vdso_base = get_unmapped_area(NULL, 0, vdso_len, 0, 0); - if (IS_ERR_VALUE(vdso_base)) { - ret = vdso_base; - goto end; - } - - mm->context.vdso = NULL; - ret = install_special_mapping(mm, vdso_base, VVAR_SIZE, - (VM_READ | VM_MAYREAD), &vdso_pagelist[vdso_pages]); - if (unlikely(ret)) - goto end; - - ret = - install_special_mapping(mm, vdso_base + VVAR_SIZE, - vdso_pages << PAGE_SHIFT, - (VM_READ | VM_EXEC | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC), - vdso_pagelist); - - if (unlikely(ret)) - goto end; - - /* - * Put vDSO base into mm struct. We need to do this before calling - * install_special_mapping or the perf counter mmap tracking code - * will fail to recognise it as a vDSO (since arch_vma_name fails). - */ - mm->context.vdso = (void *)vdso_base + VVAR_SIZE; - -end: + ret = __setup_additional_pages(mm, bprm, uses_interp); mmap_write_unlock(mm); + return ret; } - -const char *arch_vma_name(struct vm_area_struct *vma) -{ - if (vma->vm_mm && (vma->vm_start == (long)vma->vm_mm->context.vdso)) - return "[vdso]"; - if (vma->vm_mm && (vma->vm_start == - (long)vma->vm_mm->context.vdso - VVAR_SIZE)) - return "[vdso_data]"; - return NULL; -} diff --git a/arch/riscv/kernel/vdso/vdso.lds.S b/arch/riscv/kernel/vdso/vdso.lds.S index e9111f700af0..01d94aae5bf5 100644 --- a/arch/riscv/kernel/vdso/vdso.lds.S +++ b/arch/riscv/kernel/vdso/vdso.lds.S @@ -10,6 +10,9 @@ OUTPUT_ARCH(riscv) SECTIONS { PROVIDE(_vdso_data = . - __VVAR_PAGES * PAGE_SIZE); +#ifdef CONFIG_TIME_NS + PROVIDE(_timens_data = _vdso_data + PAGE_SIZE); +#endif . = SIZEOF_HEADERS; .hash : { *(.hash) } :text diff --git a/arch/riscv/kernel/vmlinux-xip.lds.S b/arch/riscv/kernel/vmlinux-xip.lds.S index 9c9f35091ef0..f5ed08262139 100644 --- a/arch/riscv/kernel/vmlinux-xip.lds.S +++ b/arch/riscv/kernel/vmlinux-xip.lds.S @@ -64,8 +64,11 @@ SECTIONS /* * From this point, stuff is considered writable and will be copied to RAM */ - __data_loc = ALIGN(16); /* location in file */ - . = LOAD_OFFSET + XIP_OFFSET; /* location in memory */ + __data_loc = ALIGN(PAGE_SIZE); /* location in file */ + . = KERNEL_LINK_ADDR + XIP_OFFSET; /* location in memory */ + +#undef LOAD_OFFSET +#define LOAD_OFFSET (KERNEL_LINK_ADDR + XIP_OFFSET - (__data_loc & XIP_OFFSET_MASK)) _sdata = .; /* Start of data section */ _data = .; @@ -96,7 +99,6 @@ SECTIONS KEEP(*(__soc_builtin_dtb_table)) __soc_builtin_dtb_table_end = .; } - PERCPU_SECTION(L1_CACHE_BYTES) . = ALIGN(8); .alternative : { @@ -122,6 +124,8 @@ SECTIONS BSS_SECTION(PAGE_SIZE, PAGE_SIZE, 0) + PERCPU_SECTION(L1_CACHE_BYTES) + .rel.dyn : AT(ADDR(.rel.dyn) - LOAD_OFFSET) { *(.rel.dyn*) } diff --git a/arch/riscv/mm/context.c b/arch/riscv/mm/context.c index ee3459cb6750..ea54cc0c9106 100644 --- a/arch/riscv/mm/context.c +++ b/arch/riscv/mm/context.c @@ -233,8 +233,10 @@ static int __init asids_init(void) local_flush_tlb_all(); /* Pre-compute ASID details */ - num_asids = 1 << asid_bits; - asid_mask = num_asids - 1; + if (asid_bits) { + num_asids = 1 << asid_bits; + asid_mask = num_asids - 1; + } /* * Use ASID allocator only if number of HW ASIDs are @@ -255,7 +257,7 @@ static int __init asids_init(void) pr_info("ASID allocator using %lu bits (%lu entries)\n", asid_bits, num_asids); } else { - pr_info("ASID allocator disabled\n"); + pr_info("ASID allocator disabled (%lu bits)\n", asid_bits); } return 0; diff --git a/arch/riscv/mm/init.c b/arch/riscv/mm/init.c index c0cddf0fc22d..24b2b8044602 100644 --- a/arch/riscv/mm/init.c +++ b/arch/riscv/mm/init.c @@ -41,7 +41,7 @@ phys_addr_t phys_ram_base __ro_after_init; EXPORT_SYMBOL(phys_ram_base); #ifdef CONFIG_XIP_KERNEL -extern char _xiprom[], _exiprom[]; +extern char _xiprom[], _exiprom[], __data_loc; #endif unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)] @@ -454,10 +454,9 @@ static uintptr_t __init best_map_size(phys_addr_t base, phys_addr_t size) /* called from head.S with MMU off */ asmlinkage void __init __copy_data(void) { - void *from = (void *)(&_sdata); - void *end = (void *)(&_end); + void *from = (void *)(&__data_loc); void *to = (void *)CONFIG_PHYS_RAM_BASE; - size_t sz = (size_t)(end - from + 1); + size_t sz = (size_t)((uintptr_t)(&_end) - (uintptr_t)(&_sdata)); memcpy(to, from, sz); } diff --git a/arch/s390/include/asm/pci.h b/arch/s390/include/asm/pci.h index 6b3c366af78e..90824be5ce9a 100644 --- a/arch/s390/include/asm/pci.h +++ b/arch/s390/include/asm/pci.h @@ -210,9 +210,11 @@ int zpci_deconfigure_device(struct zpci_dev *zdev); void zpci_device_reserved(struct zpci_dev *zdev); bool zpci_is_device_configured(struct zpci_dev *zdev); +int zpci_hot_reset_device(struct zpci_dev *zdev); int zpci_register_ioat(struct zpci_dev *, u8, u64, u64, u64); int zpci_unregister_ioat(struct zpci_dev *, u8); void zpci_remove_reserved_devices(void); +void zpci_update_fh(struct zpci_dev *zdev, u32 fh); /* CLP */ int clp_setup_writeback_mio(void); @@ -294,8 +296,10 @@ void zpci_debug_exit(void); void zpci_debug_init_device(struct zpci_dev *, const char *); void zpci_debug_exit_device(struct zpci_dev *); -/* Error reporting */ +/* Error handling */ int zpci_report_error(struct pci_dev *, struct zpci_report_error_header *); +int zpci_clear_error_state(struct zpci_dev *zdev); +int zpci_reset_load_store_blocked(struct zpci_dev *zdev); #ifdef CONFIG_NUMA diff --git a/arch/s390/kernel/perf_cpum_cf.c b/arch/s390/kernel/perf_cpum_cf.c index 6f431fa9e4d7..ee8707abdb6a 100644 --- a/arch/s390/kernel/perf_cpum_cf.c +++ b/arch/s390/kernel/perf_cpum_cf.c @@ -687,8 +687,10 @@ static void cpumf_pmu_stop(struct perf_event *event, int flags) false); if (cfdiag_diffctr(cpuhw, event->hw.config_base)) cfdiag_push_sample(event, cpuhw); - } else + } else if (cpuhw->flags & PMU_F_RESERVED) { + /* Only update when PMU not hotplugged off */ hw_perf_event_update(event); + } hwc->state |= PERF_HES_UPTODATE; } } diff --git a/arch/s390/pci/pci.c b/arch/s390/pci/pci.c index 872d772b73d2..2f9b78fa82a5 100644 --- a/arch/s390/pci/pci.c +++ b/arch/s390/pci/pci.c @@ -481,6 +481,34 @@ static void zpci_free_iomap(struct zpci_dev *zdev, int entry) spin_unlock(&zpci_iomap_lock); } +static void zpci_do_update_iomap_fh(struct zpci_dev *zdev, u32 fh) +{ + int bar, idx; + + spin_lock(&zpci_iomap_lock); + for (bar = 0; bar < PCI_STD_NUM_BARS; bar++) { + if (!zdev->bars[bar].size) + continue; + idx = zdev->bars[bar].map_idx; + if (!zpci_iomap_start[idx].count) + continue; + WRITE_ONCE(zpci_iomap_start[idx].fh, zdev->fh); + } + spin_unlock(&zpci_iomap_lock); +} + +void zpci_update_fh(struct zpci_dev *zdev, u32 fh) +{ + if (!fh || zdev->fh == fh) + return; + + zdev->fh = fh; + if (zpci_use_mio(zdev)) + return; + if (zdev->has_resources && zdev_enabled(zdev)) + zpci_do_update_iomap_fh(zdev, fh); +} + static struct resource *__alloc_res(struct zpci_dev *zdev, unsigned long start, unsigned long size, unsigned long flags) { @@ -668,7 +696,7 @@ int zpci_enable_device(struct zpci_dev *zdev) if (clp_enable_fh(zdev, &fh, ZPCI_NR_DMA_SPACES)) rc = -EIO; else - zdev->fh = fh; + zpci_update_fh(zdev, fh); return rc; } @@ -679,14 +707,14 @@ int zpci_disable_device(struct zpci_dev *zdev) cc = clp_disable_fh(zdev, &fh); if (!cc) { - zdev->fh = fh; + zpci_update_fh(zdev, fh); } else if (cc == CLP_RC_SETPCIFN_ALRDY) { pr_info("Disabling PCI function %08x had no effect as it was already disabled\n", zdev->fid); /* Function is already disabled - update handle */ rc = clp_refresh_fh(zdev->fid, &fh); if (!rc) { - zdev->fh = fh; + zpci_update_fh(zdev, fh); rc = -EINVAL; } } else { @@ -695,6 +723,65 @@ int zpci_disable_device(struct zpci_dev *zdev) return rc; } +/** + * zpci_hot_reset_device - perform a reset of the given zPCI function + * @zdev: the slot which should be reset + * + * Performs a low level reset of the zPCI function. The reset is low level in + * the sense that the zPCI function can be reset without detaching it from the + * common PCI subsystem. The reset may be performed while under control of + * either DMA or IOMMU APIs in which case the existing DMA/IOMMU translation + * table is reinstated at the end of the reset. + * + * After the reset the functions internal state is reset to an initial state + * equivalent to its state during boot when first probing a driver. + * Consequently after reset the PCI function requires re-initialization via the + * common PCI code including re-enabling IRQs via pci_alloc_irq_vectors() + * and enabling the function via e.g.pci_enablde_device_flags().The caller + * must guard against concurrent reset attempts. + * + * In most cases this function should not be called directly but through + * pci_reset_function() or pci_reset_bus() which handle the save/restore and + * locking. + * + * Return: 0 on success and an error value otherwise + */ +int zpci_hot_reset_device(struct zpci_dev *zdev) +{ + int rc; + + zpci_dbg(3, "rst fid:%x, fh:%x\n", zdev->fid, zdev->fh); + if (zdev_enabled(zdev)) { + /* Disables device access, DMAs and IRQs (reset state) */ + rc = zpci_disable_device(zdev); + /* + * Due to a z/VM vs LPAR inconsistency in the error state the + * FH may indicate an enabled device but disable says the + * device is already disabled don't treat it as an error here. + */ + if (rc == -EINVAL) + rc = 0; + if (rc) + return rc; + } + + rc = zpci_enable_device(zdev); + if (rc) + return rc; + + if (zdev->dma_table) + rc = zpci_register_ioat(zdev, 0, zdev->start_dma, zdev->end_dma, + (u64)zdev->dma_table); + else + rc = zpci_dma_init_device(zdev); + if (rc) { + zpci_disable_device(zdev); + return rc; + } + + return 0; +} + /** * zpci_create_device() - Create a new zpci_dev and add it to the zbus * @fid: Function ID of the device to be created @@ -776,7 +863,7 @@ int zpci_scan_configured_device(struct zpci_dev *zdev, u32 fh) { int rc; - zdev->fh = fh; + zpci_update_fh(zdev, fh); /* the PCI function will be scanned once function 0 appears */ if (!zdev->zbus->bus) return 0; @@ -903,6 +990,59 @@ int zpci_report_error(struct pci_dev *pdev, } EXPORT_SYMBOL(zpci_report_error); +/** + * zpci_clear_error_state() - Clears the zPCI error state of the device + * @zdev: The zdev for which the zPCI error state should be reset + * + * Clear the zPCI error state of the device. If clearing the zPCI error state + * fails the device is left in the error state. In this case it may make sense + * to call zpci_io_perm_failure() on the associated pdev if it exists. + * + * Returns: 0 on success, -EIO otherwise + */ +int zpci_clear_error_state(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_ERROR); + struct zpci_fib fib = {0}; + u8 status; + int cc; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + zpci_dbg(3, "ces fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status); + return -EIO; + } + + return 0; +} + +/** + * zpci_reset_load_store_blocked() - Re-enables L/S from error state + * @zdev: The zdev for which to unblock load/store access + * + * Re-enables load/store access for a PCI function in the error state while + * keeping DMA blocked. In this state drivers can poke MMIO space to determine + * if error recovery is possible while catching any rogue DMA access from the + * device. + * + * Returns: 0 on success, -EIO otherwise + */ +int zpci_reset_load_store_blocked(struct zpci_dev *zdev) +{ + u64 req = ZPCI_CREATE_REQ(zdev->fh, 0, ZPCI_MOD_FC_RESET_BLOCK); + struct zpci_fib fib = {0}; + u8 status; + int cc; + + cc = zpci_mod_fc(req, &fib, &status); + if (cc) { + zpci_dbg(3, "rls fid:%x, cc:%d, status:%x\n", zdev->fid, cc, status); + return -EIO; + } + + return 0; +} + static int zpci_mem_init(void) { BUILD_BUG_ON(!is_power_of_2(__alignof__(struct zpci_fmb)) || diff --git a/arch/s390/pci/pci_event.c b/arch/s390/pci/pci_event.c index 6a5bfa9dc1f2..2e3e5b278925 100644 --- a/arch/s390/pci/pci_event.c +++ b/arch/s390/pci/pci_event.c @@ -47,18 +47,223 @@ struct zpci_ccdf_avail { u16 pec; /* PCI event code */ } __packed; +static inline bool ers_result_indicates_abort(pci_ers_result_t ers_res) +{ + switch (ers_res) { + case PCI_ERS_RESULT_CAN_RECOVER: + case PCI_ERS_RESULT_RECOVERED: + case PCI_ERS_RESULT_NEED_RESET: + return false; + default: + return true; + } +} + +static bool is_passed_through(struct zpci_dev *zdev) +{ + return zdev->s390_domain; +} + +static bool is_driver_supported(struct pci_driver *driver) +{ + if (!driver || !driver->err_handler) + return false; + if (!driver->err_handler->error_detected) + return false; + if (!driver->err_handler->slot_reset) + return false; + if (!driver->err_handler->resume) + return false; + return true; +} + +static pci_ers_result_t zpci_event_notify_error_detected(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + + ers_res = driver->err_handler->error_detected(pdev, pdev->error_state); + if (ers_result_indicates_abort(ers_res)) + pr_info("%s: Automatic recovery failed after initial reporting\n", pci_name(pdev)); + else if (ers_res == PCI_ERS_RESULT_NEED_RESET) + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + + return ers_res; +} + +static pci_ers_result_t zpci_event_do_error_state_clear(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct zpci_dev *zdev = to_zpci(pdev); + int rc; + + pr_info("%s: Unblocking device access for examination\n", pci_name(pdev)); + rc = zpci_reset_load_store_blocked(zdev); + if (rc) { + pr_err("%s: Unblocking device access failed\n", pci_name(pdev)); + /* Let's try a full reset instead */ + return PCI_ERS_RESULT_NEED_RESET; + } + + if (driver->err_handler->mmio_enabled) { + ers_res = driver->err_handler->mmio_enabled(pdev); + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after MMIO re-enable\n", + pci_name(pdev)); + return ers_res; + } else if (ers_res == PCI_ERS_RESULT_NEED_RESET) { + pr_debug("%s: Driver needs reset to recover\n", pci_name(pdev)); + return ers_res; + } + } + + pr_debug("%s: Unblocking DMA\n", pci_name(pdev)); + rc = zpci_clear_error_state(zdev); + if (!rc) { + pdev->error_state = pci_channel_io_normal; + } else { + pr_err("%s: Unblocking DMA failed\n", pci_name(pdev)); + /* Let's try a full reset instead */ + return PCI_ERS_RESULT_NEED_RESET; + } + + return ers_res; +} + +static pci_ers_result_t zpci_event_do_reset(struct pci_dev *pdev, + struct pci_driver *driver) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + + pr_info("%s: Initiating reset\n", pci_name(pdev)); + if (zpci_hot_reset_device(to_zpci(pdev))) { + pr_err("%s: The reset request failed\n", pci_name(pdev)); + return ers_res; + } + pdev->error_state = pci_channel_io_normal; + ers_res = driver->err_handler->slot_reset(pdev); + if (ers_result_indicates_abort(ers_res)) { + pr_info("%s: Automatic recovery failed after slot reset\n", pci_name(pdev)); + return ers_res; + } + + return ers_res; +} + +/* zpci_event_attempt_error_recovery - Try to recover the given PCI function + * @pdev: PCI function to recover currently in the error state + * + * We follow the scheme outlined in Documentation/PCI/pci-error-recovery.rst. + * With the simplification that recovery always happens per function + * and the platform determines which functions are affected for + * multi-function devices. + */ +static pci_ers_result_t zpci_event_attempt_error_recovery(struct pci_dev *pdev) +{ + pci_ers_result_t ers_res = PCI_ERS_RESULT_DISCONNECT; + struct pci_driver *driver; + + /* + * Ensure that the PCI function is not removed concurrently, no driver + * is unbound or probed and that userspace can't access its + * configuration space while we perform recovery. + */ + pci_dev_lock(pdev); + if (pdev->error_state == pci_channel_io_perm_failure) { + ers_res = PCI_ERS_RESULT_DISCONNECT; + goto out_unlock; + } + pdev->error_state = pci_channel_io_frozen; + + if (is_passed_through(to_zpci(pdev))) { + pr_info("%s: Cannot be recovered in the host because it is a pass-through device\n", + pci_name(pdev)); + goto out_unlock; + } + + driver = to_pci_driver(pdev->dev.driver); + if (!is_driver_supported(driver)) { + if (!driver) + pr_info("%s: Cannot be recovered because no driver is bound to the device\n", + pci_name(pdev)); + else + pr_info("%s: The %s driver bound to the device does not support error recovery\n", + pci_name(pdev), + driver->name); + goto out_unlock; + } + + ers_res = zpci_event_notify_error_detected(pdev, driver); + if (ers_result_indicates_abort(ers_res)) + goto out_unlock; + + if (ers_res == PCI_ERS_RESULT_CAN_RECOVER) { + ers_res = zpci_event_do_error_state_clear(pdev, driver); + if (ers_result_indicates_abort(ers_res)) + goto out_unlock; + } + + if (ers_res == PCI_ERS_RESULT_NEED_RESET) + ers_res = zpci_event_do_reset(pdev, driver); + + if (ers_res != PCI_ERS_RESULT_RECOVERED) { + pr_err("%s: Automatic recovery failed; operator intervention is required\n", + pci_name(pdev)); + goto out_unlock; + } + + pr_info("%s: The device is ready to resume operations\n", pci_name(pdev)); + if (driver->err_handler->resume) + driver->err_handler->resume(pdev); +out_unlock: + pci_dev_unlock(pdev); + + return ers_res; +} + +/* zpci_event_io_failure - Report PCI channel failure state to driver + * @pdev: PCI function for which to report + * @es: PCI channel failure state to report + */ +static void zpci_event_io_failure(struct pci_dev *pdev, pci_channel_state_t es) +{ + struct pci_driver *driver; + + pci_dev_lock(pdev); + pdev->error_state = es; + /** + * While vfio-pci's error_detected callback notifies user-space QEMU + * reacts to this by freezing the guest. In an s390 environment PCI + * errors are rarely fatal so this is overkill. Instead in the future + * we will inject the error event and let the guest recover the device + * itself. + */ + if (is_passed_through(to_zpci(pdev))) + goto out; + driver = to_pci_driver(pdev->dev.driver); + if (driver && driver->err_handler && driver->err_handler->error_detected) + driver->err_handler->error_detected(pdev, pdev->error_state); +out: + pci_dev_unlock(pdev); +} + static void __zpci_event_error(struct zpci_ccdf_err *ccdf) { struct zpci_dev *zdev = get_zdev_by_fid(ccdf->fid); struct pci_dev *pdev = NULL; + pci_ers_result_t ers_res; zpci_dbg(3, "err fid:%x, fh:%x, pec:%x\n", ccdf->fid, ccdf->fh, ccdf->pec); zpci_err("error CCDF:\n"); zpci_err_hex(ccdf, sizeof(*ccdf)); - if (zdev) - pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); + if (zdev) { + zpci_update_fh(zdev, ccdf->fh); + if (zdev->zbus->bus) + pdev = pci_get_slot(zdev->zbus->bus, zdev->devfn); + } pr_err("%s: Event 0x%x reports an error for PCI function 0x%x\n", pdev ? pci_name(pdev) : "n/a", ccdf->pec, ccdf->fid); @@ -66,7 +271,20 @@ static void __zpci_event_error(struct zpci_ccdf_err *ccdf) if (!pdev) return; - pdev->error_state = pci_channel_io_perm_failure; + switch (ccdf->pec) { + case 0x003a: /* Service Action or Error Recovery Successful */ + ers_res = zpci_event_attempt_error_recovery(pdev); + if (ers_res != PCI_ERS_RESULT_RECOVERED) + zpci_event_io_failure(pdev, pci_channel_io_perm_failure); + break; + default: + /* + * Mark as frozen not permanently failed because the device + * could be subsequently recovered by the platform. + */ + zpci_event_io_failure(pdev, pci_channel_io_frozen); + break; + } pci_dev_put(pdev); } @@ -78,7 +296,7 @@ void zpci_event_error(void *data) static void zpci_event_hard_deconfigured(struct zpci_dev *zdev, u32 fh) { - zdev->fh = fh; + zpci_update_fh(zdev, fh); /* Give the driver a hint that the function is * already unusable. */ @@ -121,7 +339,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) if (!zdev) zpci_create_device(ccdf->fid, ccdf->fh, ZPCI_FN_STATE_STANDBY); else - zdev->fh = ccdf->fh; + zpci_update_fh(zdev, ccdf->fh); break; case 0x0303: /* Deconfiguration requested */ if (zdev) { @@ -130,7 +348,7 @@ static void __zpci_event_availability(struct zpci_ccdf_avail *ccdf) */ if (zdev->state != ZPCI_FN_STATE_CONFIGURED) break; - zdev->fh = ccdf->fh; + zpci_update_fh(zdev, ccdf->fh); zpci_deconfigure_device(zdev); } break; diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c index 2e43996159f0..28d863aaafea 100644 --- a/arch/s390/pci/pci_insn.c +++ b/arch/s390/pci/pci_insn.c @@ -163,7 +163,7 @@ static inline int zpci_load_fh(u64 *data, const volatile void __iomem *addr, unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len); return __zpci_load(data, req, ZPCI_OFFSET(addr)); } @@ -244,7 +244,7 @@ static inline int zpci_store_fh(const volatile void __iomem *addr, u64 data, unsigned long len) { struct zpci_iomap_entry *entry = &zpci_iomap_start[ZPCI_IDX(addr)]; - u64 req = ZPCI_CREATE_REQ(entry->fh, entry->bar, len); + u64 req = ZPCI_CREATE_REQ(READ_ONCE(entry->fh), entry->bar, len); return __zpci_store(data, req, ZPCI_OFFSET(addr)); } diff --git a/arch/s390/pci/pci_irq.c b/arch/s390/pci/pci_irq.c index 3823e159bf74..954bb7a83124 100644 --- a/arch/s390/pci/pci_irq.c +++ b/arch/s390/pci/pci_irq.c @@ -387,6 +387,15 @@ void arch_teardown_msi_irqs(struct pci_dev *pdev) airq_iv_free(zpci_ibv[0], zdev->msi_first_bit, zdev->msi_nr_irqs); } +void arch_restore_msi_irqs(struct pci_dev *pdev) +{ + struct zpci_dev *zdev = to_zpci(pdev); + + if (!zdev->irqs_registered) + zpci_set_irq(zdev); + default_restore_msi_irqs(pdev); +} + static struct airq_struct zpci_airq = { .handler = zpci_floating_irq_handler, .isc = PCI_ISC, diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 2acf37cc1991..e5d8700319cc 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -38,7 +38,6 @@ #define __KVM_HAVE_ARCH_VCPU_DEBUGFS #define KVM_MAX_VCPUS 1024 -#define KVM_SOFT_MAX_VCPUS 710 /* * In x86, the VCPU ID corresponds to the APIC ID, and APIC IDs @@ -725,6 +724,7 @@ struct kvm_vcpu_arch { int cpuid_nent; struct kvm_cpuid_entry2 *cpuid_entries; + u32 kvm_cpuid_base; u64 reserved_gpa_bits; int maxphyaddr; @@ -748,7 +748,7 @@ struct kvm_vcpu_arch { u8 preempted; u64 msr_val; u64 last_steal; - struct gfn_to_pfn_cache cache; + struct gfn_to_hva_cache cache; } st; u64 l1_tsc_offset; @@ -1034,6 +1034,7 @@ struct kvm_x86_msr_filter { #define APICV_INHIBIT_REASON_IRQWIN 3 #define APICV_INHIBIT_REASON_PIT_REINJ 4 #define APICV_INHIBIT_REASON_X2APIC 5 +#define APICV_INHIBIT_REASON_BLOCKIRQ 6 struct kvm_arch { unsigned long n_used_mmu_pages; @@ -1476,6 +1477,7 @@ struct kvm_x86_ops { int (*mem_enc_reg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*mem_enc_unreg_region)(struct kvm *kvm, struct kvm_enc_region *argp); int (*vm_copy_enc_context_from)(struct kvm *kvm, unsigned int source_fd); + int (*vm_move_enc_context_from)(struct kvm *kvm, unsigned int source_fd); int (*get_msr_feature)(struct kvm_msr_entry *entry); diff --git a/arch/x86/include/asm/kvm_para.h b/arch/x86/include/asm/kvm_para.h index 69299878b200..56935ebb1dfe 100644 --- a/arch/x86/include/asm/kvm_para.h +++ b/arch/x86/include/asm/kvm_para.h @@ -83,6 +83,18 @@ static inline long kvm_hypercall4(unsigned int nr, unsigned long p1, return ret; } +static inline long kvm_sev_hypercall3(unsigned int nr, unsigned long p1, + unsigned long p2, unsigned long p3) +{ + long ret; + + asm volatile("vmmcall" + : "=a"(ret) + : "a"(nr), "b"(p1), "c"(p2), "d"(p3) + : "memory"); + return ret; +} + #ifdef CONFIG_KVM_GUEST void kvmclock_init(void); void kvmclock_disable(void); diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h index 2d4f5c17d79c..e2c6f433ed10 100644 --- a/arch/x86/include/asm/mem_encrypt.h +++ b/arch/x86/include/asm/mem_encrypt.h @@ -44,6 +44,8 @@ void __init sme_enable(struct boot_params *bp); int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size); int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size); +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, + bool enc); void __init mem_encrypt_free_decrypted_mem(void); @@ -78,6 +80,8 @@ static inline int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; } static inline int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; } +static inline void __init +early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) {} static inline void mem_encrypt_free_decrypted_mem(void) { } diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index cebec95a7124..21c4a694ca11 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -97,6 +97,12 @@ static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) PVOP_VCALL1(mmu.exit_mmap, mm); } +static inline void notify_page_enc_status_changed(unsigned long pfn, + int npages, bool enc) +{ + PVOP_VCALL3(mmu.notify_page_enc_status_changed, pfn, npages, enc); +} + #ifdef CONFIG_PARAVIRT_XXL static inline void load_sp0(unsigned long sp0) { diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index fc1151e77569..a69012e1903f 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -168,6 +168,7 @@ struct pv_mmu_ops { /* Hook for intercepting the destruction of an mm_struct. */ void (*exit_mmap)(struct mm_struct *mm); + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); #ifdef CONFIG_PARAVIRT_XXL struct paravirt_callee_save read_cr2; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 191878a65c61..355d38c0cf60 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -806,11 +806,14 @@ static inline u32 amd_get_nodes_per_socket(void) { return 0; } static inline u32 amd_get_highest_perf(void) { return 0; } #endif +#define for_each_possible_hypervisor_cpuid_base(function) \ + for (function = 0x40000000; function < 0x40010000; function += 0x100) + static inline uint32_t hypervisor_cpuid_base(const char *sig, uint32_t leaves) { uint32_t base, eax, signature[3]; - for (base = 0x40000000; base < 0x40010000; base += 0x100) { + for_each_possible_hypervisor_cpuid_base(base) { cpuid(base, &eax, &signature[0], &signature[1], &signature[2]); if (!memcmp(sig, signature, 12) && diff --git a/arch/x86/include/asm/set_memory.h b/arch/x86/include/asm/set_memory.h index 43fa081a1adb..872617542bbc 100644 --- a/arch/x86/include/asm/set_memory.h +++ b/arch/x86/include/asm/set_memory.h @@ -83,6 +83,7 @@ int set_pages_rw(struct page *page, int numpages); int set_direct_map_invalid_noflush(struct page *page); int set_direct_map_default_noflush(struct page *page); bool kernel_page_present(struct page *page); +void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc); extern int kernel_set_to_readonly; diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h index 5146bbab84d4..6e64b27b2c1e 100644 --- a/arch/x86/include/uapi/asm/kvm_para.h +++ b/arch/x86/include/uapi/asm/kvm_para.h @@ -8,6 +8,7 @@ * should be used to determine that a VM is running under KVM. */ #define KVM_CPUID_SIGNATURE 0x40000000 +#define KVM_SIGNATURE "KVMKVMKVM\0\0\0" /* This CPUID returns two feature bitmaps in eax, edx. Before enabling * a particular paravirtualization, the appropriate feature bit should diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 8863d1941f1b..59abbdad7729 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include DEFINE_STATIC_KEY_FALSE(kvm_async_pf_enabled); @@ -434,6 +436,8 @@ static void kvm_guest_cpu_offline(bool shutdown) kvm_disable_steal_time(); if (kvm_para_has_feature(KVM_FEATURE_PV_EOI)) wrmsrl(MSR_KVM_PV_EOI_EN, 0); + if (kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, 0); kvm_pv_disable_apf(); if (!shutdown) apf_task_wake_all(); @@ -548,6 +552,55 @@ static void kvm_send_ipi_mask_allbutself(const struct cpumask *mask, int vector) __send_ipi_mask(local_mask, vector); } +static int __init setup_efi_kvm_sev_migration(void) +{ + efi_char16_t efi_sev_live_migration_enabled[] = L"SevLiveMigrationEnabled"; + efi_guid_t efi_variable_guid = AMD_SEV_MEM_ENCRYPT_GUID; + efi_status_t status; + unsigned long size; + bool enabled; + + if (!cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) || + !kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) + return 0; + + if (!efi_enabled(EFI_BOOT)) + return 0; + + if (!efi_enabled(EFI_RUNTIME_SERVICES)) { + pr_info("%s : EFI runtime services are not enabled\n", __func__); + return 0; + } + + size = sizeof(enabled); + + /* Get variable contents into buffer */ + status = efi.get_variable(efi_sev_live_migration_enabled, + &efi_variable_guid, NULL, &size, &enabled); + + if (status == EFI_NOT_FOUND) { + pr_info("%s : EFI live migration variable not found\n", __func__); + return 0; + } + + if (status != EFI_SUCCESS) { + pr_info("%s : EFI variable retrieval failed\n", __func__); + return 0; + } + + if (enabled == 0) { + pr_info("%s: live migration disabled in EFI\n", __func__); + return 0; + } + + pr_info("%s : live migration enabled in EFI\n", __func__); + wrmsrl(MSR_KVM_MIGRATION_CONTROL, KVM_MIGRATION_READY); + + return 1; +} + +late_initcall(setup_efi_kvm_sev_migration); + /* * Set the IPI entry points */ @@ -756,7 +809,7 @@ static noinline uint32_t __kvm_cpuid_base(void) return 0; /* So we don't blow up on old processors */ if (boot_cpu_has(X86_FEATURE_HYPERVISOR)) - return hypervisor_cpuid_base("KVMKVMKVM\0\0\0", 0); + return hypervisor_cpuid_base(KVM_SIGNATURE, 0); return 0; } @@ -806,8 +859,62 @@ static bool __init kvm_msi_ext_dest_id(void) return kvm_para_has_feature(KVM_FEATURE_MSI_EXT_DEST_ID); } +static void kvm_sev_hc_page_enc_status(unsigned long pfn, int npages, bool enc) +{ + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, pfn << PAGE_SHIFT, npages, + KVM_MAP_GPA_RANGE_ENC_STAT(enc) | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); +} + static void __init kvm_init_platform(void) { + if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT) && + kvm_para_has_feature(KVM_FEATURE_MIGRATION_CONTROL)) { + unsigned long nr_pages; + int i; + + pv_ops.mmu.notify_page_enc_status_changed = + kvm_sev_hc_page_enc_status; + + /* + * Reset the host's shared pages list related to kernel + * specific page encryption status settings before we load a + * new kernel by kexec. Reset the page encryption status + * during early boot intead of just before kexec to avoid SMP + * races during kvm_pv_guest_cpu_reboot(). + * NOTE: We cannot reset the complete shared pages list + * here as we need to retain the UEFI/OVMF firmware + * specific settings. + */ + + for (i = 0; i < e820_table->nr_entries; i++) { + struct e820_entry *entry = &e820_table->entries[i]; + + if (entry->type != E820_TYPE_RAM) + continue; + + nr_pages = DIV_ROUND_UP(entry->size, PAGE_SIZE); + + kvm_sev_hypercall3(KVM_HC_MAP_GPA_RANGE, entry->addr, + nr_pages, + KVM_MAP_GPA_RANGE_ENCRYPTED | KVM_MAP_GPA_RANGE_PAGE_SZ_4K); + } + + /* + * Ensure that _bss_decrypted section is marked as decrypted in the + * shared pages list. + */ + nr_pages = DIV_ROUND_UP(__end_bss_decrypted - __start_bss_decrypted, + PAGE_SIZE); + early_set_mem_enc_dec_hypercall((unsigned long)__start_bss_decrypted, + nr_pages, 0); + + /* + * If not booted using EFI, enable Live migration support. + */ + if (!efi_enabled(EFI_BOOT)) + wrmsrl(MSR_KVM_MIGRATION_CONTROL, + KVM_MIGRATION_READY); + } kvmclock_init(); x86_platform.apic_post_init = kvm_apic_init; } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 7157c2df3bc2..7f7636aac620 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -337,6 +337,7 @@ struct paravirt_patch_template pv_ops = { (void (*)(struct mmu_gather *, void *))tlb_remove_page, .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, #ifdef CONFIG_PARAVIRT_XXL .mmu.read_cr2 = __PV_IS_CALLEE_SAVE(pv_native_read_cr2), diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index f14f69d7aa3c..cce1c89cb7df 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -106,7 +106,7 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval) */ local_irq_enable(); - BUG_ON(!vm86 || !vm86->user_vm86); + BUG_ON(!vm86); set_flags(regs->pt.flags, VEFLAGS, X86_EFLAGS_VIF | vm86->veflags_mask); user = vm86->user_vm86; diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 2d70edb0f323..e19dabf1848b 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -99,11 +99,45 @@ static int kvm_check_cpuid(struct kvm_cpuid_entry2 *entries, int nent) return 0; } +static void kvm_update_kvm_cpuid_base(struct kvm_vcpu *vcpu) +{ + u32 function; + struct kvm_cpuid_entry2 *entry; + + vcpu->arch.kvm_cpuid_base = 0; + + for_each_possible_hypervisor_cpuid_base(function) { + entry = kvm_find_cpuid_entry(vcpu, function, 0); + + if (entry) { + u32 signature[3]; + + signature[0] = entry->ebx; + signature[1] = entry->ecx; + signature[2] = entry->edx; + + BUILD_BUG_ON(sizeof(signature) > sizeof(KVM_SIGNATURE)); + if (!memcmp(signature, KVM_SIGNATURE, sizeof(signature))) { + vcpu->arch.kvm_cpuid_base = function; + break; + } + } + } +} + +struct kvm_cpuid_entry2 *kvm_find_kvm_cpuid_features(struct kvm_vcpu *vcpu) +{ + u32 base = vcpu->arch.kvm_cpuid_base; + + if (!base) + return NULL; + + return kvm_find_cpuid_entry(vcpu, base | KVM_CPUID_FEATURES, 0); +} + void kvm_update_pv_runtime(struct kvm_vcpu *vcpu) { - struct kvm_cpuid_entry2 *best; - - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + struct kvm_cpuid_entry2 *best = kvm_find_kvm_cpuid_features(vcpu); /* * save the feature bitmap to avoid cpuid lookup for every PV @@ -142,7 +176,7 @@ void kvm_update_cpuid_runtime(struct kvm_vcpu *vcpu) cpuid_entry_has(best, X86_FEATURE_XSAVEC))) best->ebx = xstate_required_size(vcpu->arch.xcr0, true); - best = kvm_find_cpuid_entry(vcpu, KVM_CPUID_FEATURES, 0); + best = kvm_find_kvm_cpuid_features(vcpu); if (kvm_hlt_in_guest(vcpu->kvm) && best && (best->eax & (1 << KVM_FEATURE_PV_UNHALT))) best->eax &= ~(1 << KVM_FEATURE_PV_UNHALT); @@ -239,6 +273,26 @@ u64 kvm_vcpu_reserved_gpa_bits_raw(struct kvm_vcpu *vcpu) return rsvd_bits(cpuid_maxphyaddr(vcpu), 63); } +static int kvm_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid_entry2 *e2, + int nent) +{ + int r; + + r = kvm_check_cpuid(e2, nent); + if (r) + return r; + + kvfree(vcpu->arch.cpuid_entries); + vcpu->arch.cpuid_entries = e2; + vcpu->arch.cpuid_nent = nent; + + kvm_update_kvm_cpuid_base(vcpu); + kvm_update_cpuid_runtime(vcpu); + kvm_vcpu_after_set_cpuid(vcpu); + + return 0; +} + /* when an old userspace process fills a new kernel module */ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, struct kvm_cpuid *cpuid, @@ -275,18 +329,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, e2[i].padding[2] = 0; } - r = kvm_check_cpuid(e2, cpuid->nent); - if (r) { + r = kvm_set_cpuid(vcpu, e2, cpuid->nent); + if (r) kvfree(e2); - goto out_free_cpuid; - } - - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = cpuid->nent; - - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); out_free_cpuid: kvfree(e); @@ -310,20 +355,11 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, return PTR_ERR(e2); } - r = kvm_check_cpuid(e2, cpuid->nent); - if (r) { + r = kvm_set_cpuid(vcpu, e2, cpuid->nent); + if (r) kvfree(e2); - return r; - } - kvfree(vcpu->arch.cpuid_entries); - vcpu->arch.cpuid_entries = e2; - vcpu->arch.cpuid_nent = cpuid->nent; - - kvm_update_cpuid_runtime(vcpu); - kvm_vcpu_after_set_cpuid(vcpu); - - return 0; + return r; } int kvm_vcpu_ioctl_get_cpuid2(struct kvm_vcpu *vcpu, @@ -871,8 +907,7 @@ static inline int __do_cpuid_func(struct kvm_cpuid_array *array, u32 function) } break; case KVM_CPUID_SIGNATURE: { - static const char signature[12] = "KVMKVMKVM\0\0"; - const u32 *sigptr = (const u32 *)signature; + const u32 *sigptr = (const u32 *)KVM_SIGNATURE; entry->eax = KVM_CPUID_FEATURES; entry->ebx = sigptr[0]; entry->ecx = sigptr[1]; diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c index 4f15c0165c05..4a555f32885a 100644 --- a/arch/x86/kvm/hyperv.c +++ b/arch/x86/kvm/hyperv.c @@ -1472,7 +1472,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) if (!(data & HV_X64_MSR_VP_ASSIST_PAGE_ENABLE)) { hv_vcpu->hv_vapic = data; - if (kvm_lapic_enable_pv_eoi(vcpu, 0, 0)) + if (kvm_lapic_set_pv_eoi(vcpu, 0, 0)) return 1; break; } @@ -1490,7 +1490,7 @@ static int kvm_hv_set_msr(struct kvm_vcpu *vcpu, u32 msr, u64 data, bool host) return 1; hv_vcpu->hv_vapic = data; kvm_vcpu_mark_page_dirty(vcpu, gfn); - if (kvm_lapic_enable_pv_eoi(vcpu, + if (kvm_lapic_set_pv_eoi(vcpu, gfn_to_gpa(gfn) | KVM_MSR_ENABLED, sizeof(struct hv_vp_assist_page))) return 1; diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c index d6ac32f3f650..759952dd1222 100644 --- a/arch/x86/kvm/lapic.c +++ b/arch/x86/kvm/lapic.c @@ -2856,25 +2856,30 @@ int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 reg, u64 *data) return 0; } -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len) { u64 addr = data & ~KVM_MSR_ENABLED; struct gfn_to_hva_cache *ghc = &vcpu->arch.pv_eoi.data; unsigned long new_len; + int ret; if (!IS_ALIGNED(addr, 4)) return 1; + if (data & KVM_MSR_ENABLED) { + if (addr == ghc->gpa && len <= ghc->len) + new_len = ghc->len; + else + new_len = len; + + ret = kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); + if (ret) + return ret; + } + vcpu->arch.pv_eoi.msr_val = data; - if (!pv_eoi_enabled(vcpu)) - return 0; - if (addr == ghc->gpa && len <= ghc->len) - new_len = ghc->len; - else - new_len = len; - - return kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, addr, new_len); + return 0; } int kvm_apic_accept_events(struct kvm_vcpu *vcpu) diff --git a/arch/x86/kvm/lapic.h b/arch/x86/kvm/lapic.h index d7c25d0c1354..2b44e533fc8d 100644 --- a/arch/x86/kvm/lapic.h +++ b/arch/x86/kvm/lapic.h @@ -127,7 +127,7 @@ int kvm_x2apic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); int kvm_hv_vapic_msr_write(struct kvm_vcpu *vcpu, u32 msr, u64 data); int kvm_hv_vapic_msr_read(struct kvm_vcpu *vcpu, u32 msr, u64 *data); -int kvm_lapic_enable_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); +int kvm_lapic_set_pv_eoi(struct kvm_vcpu *vcpu, u64 data, unsigned long len); void kvm_lapic_exit(void); #define VEC_POS(v) ((v) & (32 - 1)) diff --git a/arch/x86/kvm/mmu/mmu.c b/arch/x86/kvm/mmu/mmu.c index 323b5057d08f..33794379949e 100644 --- a/arch/x86/kvm/mmu/mmu.c +++ b/arch/x86/kvm/mmu/mmu.c @@ -3191,17 +3191,17 @@ static int fast_page_fault(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault) new_spte |= PT_WRITABLE_MASK; /* - * Do not fix write-permission on the large spte. Since - * we only dirty the first page into the dirty-bitmap in + * Do not fix write-permission on the large spte when + * dirty logging is enabled. Since we only dirty the + * first page into the dirty-bitmap in * fast_pf_fix_direct_spte(), other pages are missed * if its slot has dirty logging enabled. * * Instead, we let the slow page fault path create a * normal spte to fix the access. - * - * See the comments in kvm_arch_commit_memory_region(). */ - if (sp->role.level > PG_LEVEL_4K) + if (sp->role.level > PG_LEVEL_4K && + kvm_slot_dirty_track_enabled(fault->slot)) break; } diff --git a/arch/x86/kvm/mmu/tdp_mmu.c b/arch/x86/kvm/mmu/tdp_mmu.c index 7c5dd83e52de..a54c3491af42 100644 --- a/arch/x86/kvm/mmu/tdp_mmu.c +++ b/arch/x86/kvm/mmu/tdp_mmu.c @@ -897,7 +897,7 @@ static int tdp_mmu_map_handle_target_level(struct kvm_vcpu *vcpu, struct kvm_page_fault *fault, struct tdp_iter *iter) { - struct kvm_mmu_page *sp = sptep_to_sp(iter->sptep); + struct kvm_mmu_page *sp = sptep_to_sp(rcu_dereference(iter->sptep)); u64 new_spte; int ret = RET_PF_FIXED; bool wrprot = false; diff --git a/arch/x86/kvm/pmu.c b/arch/x86/kvm/pmu.c index 0772bad9165c..09873f6488f7 100644 --- a/arch/x86/kvm/pmu.c +++ b/arch/x86/kvm/pmu.c @@ -319,7 +319,7 @@ void kvm_pmu_handle_event(struct kvm_vcpu *vcpu) } /* check if idx is a valid index to access PMU */ -int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { return kvm_x86_ops.pmu_ops->is_valid_rdpmc_ecx(vcpu, idx); } diff --git a/arch/x86/kvm/pmu.h b/arch/x86/kvm/pmu.h index 0e4f2b1fa9fb..59d6b76203d5 100644 --- a/arch/x86/kvm/pmu.h +++ b/arch/x86/kvm/pmu.h @@ -32,7 +32,7 @@ struct kvm_pmu_ops { struct kvm_pmc *(*rdpmc_ecx_to_pmc)(struct kvm_vcpu *vcpu, unsigned int idx, u64 *mask); struct kvm_pmc *(*msr_idx_to_pmc)(struct kvm_vcpu *vcpu, u32 msr); - int (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); + bool (*is_valid_rdpmc_ecx)(struct kvm_vcpu *vcpu, unsigned int idx); bool (*is_valid_msr)(struct kvm_vcpu *vcpu, u32 msr); int (*get_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int (*set_msr)(struct kvm_vcpu *vcpu, struct msr_data *msr_info); @@ -149,7 +149,7 @@ void reprogram_counter(struct kvm_pmu *pmu, int pmc_idx); void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu); void kvm_pmu_handle_event(struct kvm_vcpu *vcpu); int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned pmc, u64 *data); -int kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); +bool kvm_pmu_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx); bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr); int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info); diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index 8052d92069e0..affc0ea98d30 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -904,7 +904,8 @@ bool svm_check_apicv_inhibit_reasons(ulong bit) BIT(APICV_INHIBIT_REASON_NESTED) | BIT(APICV_INHIBIT_REASON_IRQWIN) | BIT(APICV_INHIBIT_REASON_PIT_REINJ) | - BIT(APICV_INHIBIT_REASON_X2APIC); + BIT(APICV_INHIBIT_REASON_X2APIC) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ); return supported & BIT(bit); } diff --git a/arch/x86/kvm/svm/pmu.c b/arch/x86/kvm/svm/pmu.c index fdf587f19c5f..871c426ec389 100644 --- a/arch/x86/kvm/svm/pmu.c +++ b/arch/x86/kvm/svm/pmu.c @@ -181,14 +181,13 @@ static struct kvm_pmc *amd_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) return get_gp_pmc_amd(pmu, base + pmc_idx, PMU_TYPE_COUNTER); } -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static bool amd_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); idx &= ~(3u << 30); - return (idx >= pmu->nr_arch_gp_counters); + return idx < pmu->nr_arch_gp_counters; } /* idx is the ECX register of RDPMC instruction */ diff --git a/arch/x86/kvm/svm/sev.c b/arch/x86/kvm/svm/sev.c index 1964b9a174be..902c52a8dd0c 100644 --- a/arch/x86/kvm/svm/sev.c +++ b/arch/x86/kvm/svm/sev.c @@ -120,16 +120,26 @@ static bool __sev_recycle_asids(int min_asid, int max_asid) return true; } +static int sev_misc_cg_try_charge(struct kvm_sev_info *sev) +{ + enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + return misc_cg_try_charge(type, sev->misc_cg, 1); +} + +static void sev_misc_cg_uncharge(struct kvm_sev_info *sev) +{ + enum misc_res_type type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; + misc_cg_uncharge(type, sev->misc_cg, 1); +} + static int sev_asid_new(struct kvm_sev_info *sev) { int asid, min_asid, max_asid, ret; bool retry = true; - enum misc_res_type type; - type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; WARN_ON(sev->misc_cg); sev->misc_cg = get_current_misc_cg(); - ret = misc_cg_try_charge(type, sev->misc_cg, 1); + ret = sev_misc_cg_try_charge(sev); if (ret) { put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; @@ -162,7 +172,7 @@ again: return asid; e_uncharge: - misc_cg_uncharge(type, sev->misc_cg, 1); + sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; return ret; @@ -179,7 +189,6 @@ static void sev_asid_free(struct kvm_sev_info *sev) { struct svm_cpu_data *sd; int cpu; - enum misc_res_type type; mutex_lock(&sev_bitmap_lock); @@ -192,8 +201,7 @@ static void sev_asid_free(struct kvm_sev_info *sev) mutex_unlock(&sev_bitmap_lock); - type = sev->es_active ? MISC_CG_RES_SEV_ES : MISC_CG_RES_SEV; - misc_cg_uncharge(type, sev->misc_cg, 1); + sev_misc_cg_uncharge(sev); put_misc_cg(sev->misc_cg); sev->misc_cg = NULL; } @@ -590,7 +598,7 @@ static int sev_es_sync_vmsa(struct vcpu_svm *svm) * traditional VMSA as it has been built so far (in prep * for LAUNCH_UPDATE_VMSA) to be the initial SEV-ES state. */ - memcpy(svm->vmsa, save, sizeof(*save)); + memcpy(svm->sev_es.vmsa, save, sizeof(*save)); return 0; } @@ -612,11 +620,11 @@ static int __sev_launch_update_vmsa(struct kvm *kvm, struct kvm_vcpu *vcpu, * the VMSA memory content (i.e it will write the same memory region * with the guest's key), so invalidate it first. */ - clflush_cache_range(svm->vmsa, PAGE_SIZE); + clflush_cache_range(svm->sev_es.vmsa, PAGE_SIZE); vmsa.reserved = 0; vmsa.handle = to_kvm_svm(kvm)->sev_info.handle; - vmsa.address = __sme_pa(svm->vmsa); + vmsa.address = __sme_pa(svm->sev_es.vmsa); vmsa.len = PAGE_SIZE; ret = sev_issue_cmd(kvm, SEV_CMD_LAUNCH_UPDATE_VMSA, &vmsa, error); if (ret) @@ -1536,6 +1544,201 @@ static bool cmd_allowed_from_miror(u32 cmd_id) return false; } +static int sev_lock_for_migration(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + /* + * Bail if this VM is already involved in a migration to avoid deadlock + * between two VMs trying to migrate to/from each other. + */ + if (atomic_cmpxchg_acquire(&sev->migration_in_progress, 0, 1)) + return -EBUSY; + + mutex_lock(&kvm->lock); + + return 0; +} + +static void sev_unlock_after_migration(struct kvm *kvm) +{ + struct kvm_sev_info *sev = &to_kvm_svm(kvm)->sev_info; + + mutex_unlock(&kvm->lock); + atomic_set_release(&sev->migration_in_progress, 0); +} + + +static int sev_lock_vcpus_for_migration(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i, j; + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (mutex_lock_killable(&vcpu->mutex)) + goto out_unlock; + } + + return 0; + +out_unlock: + kvm_for_each_vcpu(j, vcpu, kvm) { + if (i == j) + break; + + mutex_unlock(&vcpu->mutex); + } + return -EINTR; +} + +static void sev_unlock_vcpus_for_migration(struct kvm *kvm) +{ + struct kvm_vcpu *vcpu; + int i; + + kvm_for_each_vcpu(i, vcpu, kvm) { + mutex_unlock(&vcpu->mutex); + } +} + +static void sev_migrate_from(struct kvm_sev_info *dst, + struct kvm_sev_info *src) +{ + dst->active = true; + dst->asid = src->asid; + dst->handle = src->handle; + dst->pages_locked = src->pages_locked; + + src->asid = 0; + src->active = false; + src->handle = 0; + src->pages_locked = 0; + + INIT_LIST_HEAD(&dst->regions_list); + list_replace_init(&src->regions_list, &dst->regions_list); +} + +static int sev_es_migrate_from(struct kvm *dst, struct kvm *src) +{ + int i; + struct kvm_vcpu *dst_vcpu, *src_vcpu; + struct vcpu_svm *dst_svm, *src_svm; + + if (atomic_read(&src->online_vcpus) != atomic_read(&dst->online_vcpus)) + return -EINVAL; + + kvm_for_each_vcpu(i, src_vcpu, src) { + if (!src_vcpu->arch.guest_state_protected) + return -EINVAL; + } + + kvm_for_each_vcpu(i, src_vcpu, src) { + src_svm = to_svm(src_vcpu); + dst_vcpu = kvm_get_vcpu(dst, i); + dst_svm = to_svm(dst_vcpu); + + /* + * Transfer VMSA and GHCB state to the destination. Nullify and + * clear source fields as appropriate, the state now belongs to + * the destination. + */ + memcpy(&dst_svm->sev_es, &src_svm->sev_es, sizeof(src_svm->sev_es)); + dst_svm->vmcb->control.ghcb_gpa = src_svm->vmcb->control.ghcb_gpa; + dst_svm->vmcb->control.vmsa_pa = src_svm->vmcb->control.vmsa_pa; + dst_vcpu->arch.guest_state_protected = true; + + memset(&src_svm->sev_es, 0, sizeof(src_svm->sev_es)); + src_svm->vmcb->control.ghcb_gpa = INVALID_PAGE; + src_svm->vmcb->control.vmsa_pa = INVALID_PAGE; + src_vcpu->arch.guest_state_protected = false; + } + to_kvm_svm(src)->sev_info.es_active = false; + to_kvm_svm(dst)->sev_info.es_active = true; + + return 0; +} + +int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd) +{ + struct kvm_sev_info *dst_sev = &to_kvm_svm(kvm)->sev_info; + struct kvm_sev_info *src_sev, *cg_cleanup_sev; + struct file *source_kvm_file; + struct kvm *source_kvm; + bool charged = false; + int ret; + + ret = sev_lock_for_migration(kvm); + if (ret) + return ret; + + if (sev_guest(kvm)) { + ret = -EINVAL; + goto out_unlock; + } + + source_kvm_file = fget(source_fd); + if (!file_is_kvm(source_kvm_file)) { + ret = -EBADF; + goto out_fput; + } + + source_kvm = source_kvm_file->private_data; + ret = sev_lock_for_migration(source_kvm); + if (ret) + goto out_fput; + + if (!sev_guest(source_kvm)) { + ret = -EINVAL; + goto out_source; + } + + src_sev = &to_kvm_svm(source_kvm)->sev_info; + dst_sev->misc_cg = get_current_misc_cg(); + cg_cleanup_sev = dst_sev; + if (dst_sev->misc_cg != src_sev->misc_cg) { + ret = sev_misc_cg_try_charge(dst_sev); + if (ret) + goto out_dst_cgroup; + charged = true; + } + + ret = sev_lock_vcpus_for_migration(kvm); + if (ret) + goto out_dst_cgroup; + ret = sev_lock_vcpus_for_migration(source_kvm); + if (ret) + goto out_dst_vcpu; + + if (sev_es_guest(source_kvm)) { + ret = sev_es_migrate_from(kvm, source_kvm); + if (ret) + goto out_source_vcpu; + } + sev_migrate_from(dst_sev, src_sev); + kvm_vm_dead(source_kvm); + cg_cleanup_sev = src_sev; + ret = 0; + +out_source_vcpu: + sev_unlock_vcpus_for_migration(source_kvm); +out_dst_vcpu: + sev_unlock_vcpus_for_migration(kvm); +out_dst_cgroup: + /* Operates on the source on success, on the destination on failure. */ + if (charged) + sev_misc_cg_uncharge(cg_cleanup_sev); + put_misc_cg(cg_cleanup_sev->misc_cg); + cg_cleanup_sev->misc_cg = NULL; +out_source: + sev_unlock_after_migration(source_kvm); +out_fput: + if (source_kvm_file) + fput(source_kvm_file); +out_unlock: + sev_unlock_after_migration(kvm); + return ret; +} + int svm_mem_enc_op(struct kvm *kvm, void __user *argp) { struct kvm_sev_cmd sev_cmd; @@ -2038,16 +2241,16 @@ void sev_free_vcpu(struct kvm_vcpu *vcpu) svm = to_svm(vcpu); if (vcpu->arch.guest_state_protected) - sev_flush_guest_memory(svm, svm->vmsa, PAGE_SIZE); - __free_page(virt_to_page(svm->vmsa)); + sev_flush_guest_memory(svm, svm->sev_es.vmsa, PAGE_SIZE); + __free_page(virt_to_page(svm->sev_es.vmsa)); - if (svm->ghcb_sa_free) - kfree(svm->ghcb_sa); + if (svm->sev_es.ghcb_sa_free) + kfree(svm->sev_es.ghcb_sa); } static void dump_ghcb(struct vcpu_svm *svm) { - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; unsigned int nbits; /* Re-use the dump_invalid_vmcb module parameter */ @@ -2073,7 +2276,7 @@ static void dump_ghcb(struct vcpu_svm *svm) static void sev_es_sync_to_ghcb(struct vcpu_svm *svm) { struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; /* * The GHCB protocol so far allows for the following data @@ -2093,7 +2296,7 @@ static void sev_es_sync_from_ghcb(struct vcpu_svm *svm) { struct vmcb_control_area *control = &svm->vmcb->control; struct kvm_vcpu *vcpu = &svm->vcpu; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; u64 exit_code; /* @@ -2140,7 +2343,7 @@ static int sev_es_validate_vmgexit(struct vcpu_svm *svm) struct ghcb *ghcb; u64 exit_code = 0; - ghcb = svm->ghcb; + ghcb = svm->sev_es.ghcb; /* Only GHCB Usage code 0 is supported */ if (ghcb->ghcb_usage) @@ -2258,33 +2461,34 @@ vmgexit_err: void sev_es_unmap_ghcb(struct vcpu_svm *svm) { - if (!svm->ghcb) + if (!svm->sev_es.ghcb) return; - if (svm->ghcb_sa_free) { + if (svm->sev_es.ghcb_sa_free) { /* * The scratch area lives outside the GHCB, so there is a * buffer that, depending on the operation performed, may * need to be synced, then freed. */ - if (svm->ghcb_sa_sync) { + if (svm->sev_es.ghcb_sa_sync) { kvm_write_guest(svm->vcpu.kvm, - ghcb_get_sw_scratch(svm->ghcb), - svm->ghcb_sa, svm->ghcb_sa_len); - svm->ghcb_sa_sync = false; + ghcb_get_sw_scratch(svm->sev_es.ghcb), + svm->sev_es.ghcb_sa, + svm->sev_es.ghcb_sa_len); + svm->sev_es.ghcb_sa_sync = false; } - kfree(svm->ghcb_sa); - svm->ghcb_sa = NULL; - svm->ghcb_sa_free = false; + kfree(svm->sev_es.ghcb_sa); + svm->sev_es.ghcb_sa = NULL; + svm->sev_es.ghcb_sa_free = false; } - trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->ghcb); + trace_kvm_vmgexit_exit(svm->vcpu.vcpu_id, svm->sev_es.ghcb); sev_es_sync_to_ghcb(svm); - kvm_vcpu_unmap(&svm->vcpu, &svm->ghcb_map, true); - svm->ghcb = NULL; + kvm_vcpu_unmap(&svm->vcpu, &svm->sev_es.ghcb_map, true); + svm->sev_es.ghcb = NULL; } void pre_sev_run(struct vcpu_svm *svm, int cpu) @@ -2314,7 +2518,7 @@ void pre_sev_run(struct vcpu_svm *svm, int cpu) static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) { struct vmcb_control_area *control = &svm->vmcb->control; - struct ghcb *ghcb = svm->ghcb; + struct ghcb *ghcb = svm->sev_es.ghcb; u64 ghcb_scratch_beg, ghcb_scratch_end; u64 scratch_gpa_beg, scratch_gpa_end; void *scratch_va; @@ -2350,7 +2554,7 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) return false; } - scratch_va = (void *)svm->ghcb; + scratch_va = (void *)svm->sev_es.ghcb; scratch_va += (scratch_gpa_beg - control->ghcb_gpa); } else { /* @@ -2380,12 +2584,12 @@ static bool setup_vmgexit_scratch(struct vcpu_svm *svm, bool sync, u64 len) * the vCPU next time (i.e. a read was requested so the data * must be written back to the guest memory). */ - svm->ghcb_sa_sync = sync; - svm->ghcb_sa_free = true; + svm->sev_es.ghcb_sa_sync = sync; + svm->sev_es.ghcb_sa_free = true; } - svm->ghcb_sa = scratch_va; - svm->ghcb_sa_len = len; + svm->sev_es.ghcb_sa = scratch_va; + svm->sev_es.ghcb_sa_len = len; return true; } @@ -2504,15 +2708,15 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) return -EINVAL; } - if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->ghcb_map)) { + if (kvm_vcpu_map(vcpu, ghcb_gpa >> PAGE_SHIFT, &svm->sev_es.ghcb_map)) { /* Unable to map GHCB from guest */ vcpu_unimpl(vcpu, "vmgexit: error mapping GHCB [%#llx] from guest\n", ghcb_gpa); return -EINVAL; } - svm->ghcb = svm->ghcb_map.hva; - ghcb = svm->ghcb_map.hva; + svm->sev_es.ghcb = svm->sev_es.ghcb_map.hva; + ghcb = svm->sev_es.ghcb_map.hva; trace_kvm_vmgexit_enter(vcpu->vcpu_id, ghcb); @@ -2535,7 +2739,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_sev_es_mmio_read(vcpu, control->exit_info_1, control->exit_info_2, - svm->ghcb_sa); + svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_MMIO_WRITE: if (!setup_vmgexit_scratch(svm, false, control->exit_info_2)) @@ -2544,7 +2748,7 @@ int sev_handle_vmgexit(struct kvm_vcpu *vcpu) ret = kvm_sev_es_mmio_write(vcpu, control->exit_info_1, control->exit_info_2, - svm->ghcb_sa); + svm->sev_es.ghcb_sa); break; case SVM_VMGEXIT_NMI_COMPLETE: ret = svm_invoke_exit_handler(vcpu, SVM_EXIT_IRET); @@ -2604,7 +2808,8 @@ int sev_es_string_io(struct vcpu_svm *svm, int size, unsigned int port, int in) if (!setup_vmgexit_scratch(svm, in, bytes)) return -EINVAL; - return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->ghcb_sa, count, in); + return kvm_sev_es_string_io(&svm->vcpu, size, port, svm->sev_es.ghcb_sa, + count, in); } void sev_es_init_vmcb(struct vcpu_svm *svm) @@ -2619,7 +2824,7 @@ void sev_es_init_vmcb(struct vcpu_svm *svm) * VMCB page. Do not include the encryption mask on the VMSA physical * address since hardware will access it using the guest key. */ - svm->vmcb->control.vmsa_pa = __pa(svm->vmsa); + svm->vmcb->control.vmsa_pa = __pa(svm->sev_es.vmsa); /* Can't intercept CR register access, HV can't modify CR registers */ svm_clr_intercept(svm, INTERCEPT_CR0_READ); @@ -2691,8 +2896,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) struct vcpu_svm *svm = to_svm(vcpu); /* First SIPI: Use the values as initially set by the VMM */ - if (!svm->received_first_sipi) { - svm->received_first_sipi = true; + if (!svm->sev_es.received_first_sipi) { + svm->sev_es.received_first_sipi = true; return; } @@ -2701,8 +2906,8 @@ void sev_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) * the guest will set the CS and RIP. Set SW_EXIT_INFO_2 to a * non-zero value. */ - if (!svm->ghcb) + if (!svm->sev_es.ghcb) return; - ghcb_set_sw_exit_info_2(svm->ghcb, 1); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, 1); } diff --git a/arch/x86/kvm/svm/svm.c b/arch/x86/kvm/svm/svm.c index b36ca4e476c2..5630c241d5f6 100644 --- a/arch/x86/kvm/svm/svm.c +++ b/arch/x86/kvm/svm/svm.c @@ -1452,7 +1452,7 @@ static int svm_create_vcpu(struct kvm_vcpu *vcpu) svm_switch_vmcb(svm, &svm->vmcb01); if (vmsa_page) - svm->vmsa = page_address(vmsa_page); + svm->sev_es.vmsa = page_address(vmsa_page); svm->guest_state_loaded = false; @@ -2835,11 +2835,11 @@ static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) { struct vcpu_svm *svm = to_svm(vcpu); - if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->ghcb)) + if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb)) return kvm_complete_insn_gp(vcpu, err); - ghcb_set_sw_exit_info_1(svm->ghcb, 1); - ghcb_set_sw_exit_info_2(svm->ghcb, + ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1); + ghcb_set_sw_exit_info_2(svm->sev_es.ghcb, X86_TRAP_GP | SVM_EVTINJ_TYPE_EXEPT | SVM_EVTINJ_VALID); @@ -3121,11 +3121,6 @@ static int invpcid_interception(struct kvm_vcpu *vcpu) type = svm->vmcb->control.exit_info_2; gva = svm->vmcb->control.exit_info_1; - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } - return kvm_handle_invpcid(vcpu, type, gva); } @@ -4701,6 +4696,7 @@ static struct kvm_x86_ops svm_x86_ops __initdata = { .mem_enc_unreg_region = svm_unregister_enc_region, .vm_copy_enc_context_from = svm_vm_copy_asid_from, + .vm_move_enc_context_from = svm_vm_migrate_from, .can_emulate_instruction = svm_can_emulate_instruction, diff --git a/arch/x86/kvm/svm/svm.h b/arch/x86/kvm/svm/svm.h index 5e9510d4574e..437e68504e66 100644 --- a/arch/x86/kvm/svm/svm.h +++ b/arch/x86/kvm/svm/svm.h @@ -80,6 +80,7 @@ struct kvm_sev_info { u64 ap_jump_table; /* SEV-ES AP Jump Table address */ struct kvm *enc_context_owner; /* Owner of copied encryption context */ struct misc_cg *misc_cg; /* For misc cgroup accounting */ + atomic_t migration_in_progress; }; struct kvm_svm { @@ -123,6 +124,20 @@ struct svm_nested_state { bool initialized; }; +struct vcpu_sev_es_state { + /* SEV-ES support */ + struct vmcb_save_area *vmsa; + struct ghcb *ghcb; + struct kvm_host_map ghcb_map; + bool received_first_sipi; + + /* SEV-ES scratch area support */ + void *ghcb_sa; + u32 ghcb_sa_len; + bool ghcb_sa_sync; + bool ghcb_sa_free; +}; + struct vcpu_svm { struct kvm_vcpu vcpu; /* vmcb always points at current_vmcb->ptr, it's purely a shorthand. */ @@ -186,17 +201,7 @@ struct vcpu_svm { DECLARE_BITMAP(write, MAX_DIRECT_ACCESS_MSRS); } shadow_msr_intercept; - /* SEV-ES support */ - struct vmcb_save_area *vmsa; - struct ghcb *ghcb; - struct kvm_host_map ghcb_map; - bool received_first_sipi; - - /* SEV-ES scratch area support */ - void *ghcb_sa; - u32 ghcb_sa_len; - bool ghcb_sa_sync; - bool ghcb_sa_free; + struct vcpu_sev_es_state sev_es; bool guest_state_loaded; }; @@ -558,6 +563,7 @@ int svm_register_enc_region(struct kvm *kvm, int svm_unregister_enc_region(struct kvm *kvm, struct kvm_enc_region *range); int svm_vm_copy_asid_from(struct kvm *kvm, unsigned int source_fd); +int svm_vm_migrate_from(struct kvm *kvm, unsigned int source_fd); void pre_sev_run(struct vcpu_svm *svm, int cpu); void __init sev_set_cpu_caps(void); void __init sev_hardware_setup(void); diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index b4ee5e9f9e20..b213ca966d41 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -525,67 +525,19 @@ static int nested_vmx_check_tpr_shadow_controls(struct kvm_vcpu *vcpu, } /* - * Check if MSR is intercepted for L01 MSR bitmap. + * For x2APIC MSRs, ignore the vmcs01 bitmap. L1 can enable x2APIC without L1 + * itself utilizing x2APIC. All MSRs were previously set to be intercepted, + * only the "disable intercept" case needs to be handled. */ -static bool msr_write_intercepted_l01(struct kvm_vcpu *vcpu, u32 msr) +static void nested_vmx_disable_intercept_for_x2apic_msr(unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int type) { - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); + if (type & MSR_TYPE_R && !vmx_test_msr_bitmap_read(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_read(msr_bitmap_l0, msr); - if (!cpu_has_vmx_msr_bitmap()) - return true; - - msr_bitmap = to_vmx(vcpu)->vmcs01.msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; -} - -/* - * If a msr is allowed by L0, we should check whether it is allowed by L1. - * The corresponding bit will be cleared unless both of L0 and L1 allow it. - */ -static void nested_vmx_disable_intercept_for_msr(unsigned long *msr_bitmap_l1, - unsigned long *msr_bitmap_nested, - u32 msr, int type) -{ - int f = sizeof(unsigned long); - - /* - * See Intel PRM Vol. 3, 20.6.9 (MSR-Bitmap Address). Early manuals - * have the write-low and read-high bitmap offsets the wrong way round. - * We can control MSRs 0x00000000-0x00001fff and 0xc0000000-0xc0001fff. - */ - if (msr <= 0x1fff) { - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x000 / f)) - /* read-low */ - __clear_bit(msr, msr_bitmap_nested + 0x000 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0x800 / f)) - /* write-low */ - __clear_bit(msr, msr_bitmap_nested + 0x800 / f); - - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - if (type & MSR_TYPE_R && - !test_bit(msr, msr_bitmap_l1 + 0x400 / f)) - /* read-high */ - __clear_bit(msr, msr_bitmap_nested + 0x400 / f); - - if (type & MSR_TYPE_W && - !test_bit(msr, msr_bitmap_l1 + 0xc00 / f)) - /* write-high */ - __clear_bit(msr, msr_bitmap_nested + 0xc00 / f); - - } + if (type & MSR_TYPE_W && !vmx_test_msr_bitmap_write(msr_bitmap_l1, msr)) + vmx_clear_msr_bitmap_write(msr_bitmap_l0, msr); } static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) @@ -600,6 +552,34 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) } } +#define BUILD_NVMX_MSR_INTERCEPT_HELPER(rw) \ +static inline \ +void nested_vmx_set_msr_##rw##_intercept(struct vcpu_vmx *vmx, \ + unsigned long *msr_bitmap_l1, \ + unsigned long *msr_bitmap_l0, u32 msr) \ +{ \ + if (vmx_test_msr_bitmap_##rw(vmx->vmcs01.msr_bitmap, msr) || \ + vmx_test_msr_bitmap_##rw(msr_bitmap_l1, msr)) \ + vmx_set_msr_bitmap_##rw(msr_bitmap_l0, msr); \ + else \ + vmx_clear_msr_bitmap_##rw(msr_bitmap_l0, msr); \ +} +BUILD_NVMX_MSR_INTERCEPT_HELPER(read) +BUILD_NVMX_MSR_INTERCEPT_HELPER(write) + +static inline void nested_vmx_set_intercept_for_msr(struct vcpu_vmx *vmx, + unsigned long *msr_bitmap_l1, + unsigned long *msr_bitmap_l0, + u32 msr, int types) +{ + if (types & MSR_TYPE_R) + nested_vmx_set_msr_read_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); + if (types & MSR_TYPE_W) + nested_vmx_set_msr_write_intercept(vmx, msr_bitmap_l1, + msr_bitmap_l0, msr); +} + /* * Merge L0's and L1's MSR bitmap, return false to indicate that * we do not use the hardware. @@ -607,10 +587,11 @@ static inline void enable_x2apic_msr_intercepts(unsigned long *msr_bitmap) static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) { + struct vcpu_vmx *vmx = to_vmx(vcpu); int msr; unsigned long *msr_bitmap_l1; - unsigned long *msr_bitmap_l0 = to_vmx(vcpu)->nested.vmcs02.msr_bitmap; - struct kvm_host_map *map = &to_vmx(vcpu)->nested.msr_bitmap_map; + unsigned long *msr_bitmap_l0 = vmx->nested.vmcs02.msr_bitmap; + struct kvm_host_map *map = &vmx->nested.msr_bitmap_map; /* Nothing to do if the MSR bitmap is not in use. */ if (!cpu_has_vmx_msr_bitmap() || @@ -625,7 +606,7 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, /* * To keep the control flow simple, pay eight 8-byte writes (sixteen * 4-byte writes on 32-bit systems) up front to enable intercepts for - * the x2APIC MSR range and selectively disable them below. + * the x2APIC MSR range and selectively toggle those relevant to L2. */ enable_x2apic_msr_intercepts(msr_bitmap_l0); @@ -644,61 +625,44 @@ static inline bool nested_vmx_prepare_msr_bitmap(struct kvm_vcpu *vcpu, } } - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_R | MSR_TYPE_W); if (nested_cpu_has_vid(vmcs12)) { - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_EOI), MSR_TYPE_W); - nested_vmx_disable_intercept_for_msr( + nested_vmx_disable_intercept_for_x2apic_msr( msr_bitmap_l1, msr_bitmap_l0, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W); } } - /* KVM unconditionally exposes the FS/GS base MSRs to L1. */ -#ifdef CONFIG_X86_64 - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_FS_BASE, MSR_TYPE_RW); - - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_GS_BASE, MSR_TYPE_RW); - - nested_vmx_disable_intercept_for_msr(msr_bitmap_l1, msr_bitmap_l0, - MSR_KERNEL_GS_BASE, MSR_TYPE_RW); -#endif - /* - * Checking the L0->L1 bitmap is trying to verify two things: - * - * 1. L0 gave a permission to L1 to actually passthrough the MSR. This - * ensures that we do not accidentally generate an L02 MSR bitmap - * from the L12 MSR bitmap that is too permissive. - * 2. That L1 or L2s have actually used the MSR. This avoids - * unnecessarily merging of the bitmap if the MSR is unused. This - * works properly because we only update the L01 MSR bitmap lazily. - * So even if L0 should pass L1 these MSRs, the L01 bitmap is only - * updated to reflect this when L1 (or its L2s) actually write to - * the MSR. + * Always check vmcs01's bitmap to honor userspace MSR filters and any + * other runtime changes to vmcs01's bitmap, e.g. dynamic pass-through. */ - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_SPEC_CTRL)) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_SPEC_CTRL, - MSR_TYPE_R | MSR_TYPE_W); +#ifdef CONFIG_X86_64 + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_FS_BASE, MSR_TYPE_RW); - if (!msr_write_intercepted_l01(vcpu, MSR_IA32_PRED_CMD)) - nested_vmx_disable_intercept_for_msr( - msr_bitmap_l1, msr_bitmap_l0, - MSR_IA32_PRED_CMD, - MSR_TYPE_W); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_GS_BASE, MSR_TYPE_RW); - kvm_vcpu_unmap(vcpu, &to_vmx(vcpu)->nested.msr_bitmap_map, false); + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_KERNEL_GS_BASE, MSR_TYPE_RW); +#endif + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_SPEC_CTRL, MSR_TYPE_RW); + + nested_vmx_set_intercept_for_msr(vmx, msr_bitmap_l1, msr_bitmap_l0, + MSR_IA32_PRED_CMD, MSR_TYPE_W); + + kvm_vcpu_unmap(vcpu, &vmx->nested.msr_bitmap_map, false); return true; } @@ -5379,7 +5343,7 @@ static int handle_invept(struct kvm_vcpu *vcpu) struct { u64 eptp, gpa; } operand; - int i, r; + int i, r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_EPT) || @@ -5392,7 +5356,8 @@ static int handle_invept(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.ept_caps >> VMX_EPT_EXTENT_SHIFT) & 6; @@ -5459,7 +5424,7 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) u64 gla; } operand; u16 vpid02; - int r; + int r, gpr_index; if (!(vmx->nested.msrs.secondary_ctls_high & SECONDARY_EXEC_ENABLE_VPID) || @@ -5472,7 +5437,8 @@ static int handle_invvpid(struct kvm_vcpu *vcpu) return 1; vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); types = (vmx->nested.msrs.vpid_caps & VMX_VPID_EXTENT_SUPPORTED_MASK) >> 8; diff --git a/arch/x86/kvm/vmx/pmu_intel.c b/arch/x86/kvm/vmx/pmu_intel.c index b8e0d21b7c8a..1b7456b2177b 100644 --- a/arch/x86/kvm/vmx/pmu_intel.c +++ b/arch/x86/kvm/vmx/pmu_intel.c @@ -118,16 +118,15 @@ static struct kvm_pmc *intel_pmc_idx_to_pmc(struct kvm_pmu *pmu, int pmc_idx) } } -/* returns 0 if idx's corresponding MSR exists; otherwise returns 1. */ -static int intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) +static bool intel_is_valid_rdpmc_ecx(struct kvm_vcpu *vcpu, unsigned int idx) { struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); bool fixed = idx & (1u << 30); idx &= ~(3u << 30); - return (!fixed && idx >= pmu->nr_arch_gp_counters) || - (fixed && idx >= pmu->nr_arch_fixed_counters); + return fixed ? idx < pmu->nr_arch_fixed_counters + : idx < pmu->nr_arch_gp_counters; } static struct kvm_pmc *intel_rdpmc_ecx_to_pmc(struct kvm_vcpu *vcpu, diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index 76861b66bbcf..ba66c171d951 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -769,24 +769,13 @@ void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu) /* * Check if MSR is intercepted for currently loaded MSR bitmap. */ -static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr) +static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr) { - unsigned long *msr_bitmap; - int f = sizeof(unsigned long); - - if (!cpu_has_vmx_msr_bitmap()) + if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS)) return true; - msr_bitmap = to_vmx(vcpu)->loaded_vmcs->msr_bitmap; - - if (msr <= 0x1fff) { - return !!test_bit(msr, msr_bitmap + 0x800 / f); - } else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) { - msr &= 0x1fff; - return !!test_bit(msr, msr_bitmap + 0xc00 / f); - } - - return true; + return vmx_test_msr_bitmap_write(vmx->loaded_vmcs->msr_bitmap, + MSR_IA32_SPEC_CTRL); } static void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx, @@ -3697,46 +3686,6 @@ void free_vpid(int vpid) spin_unlock(&vmx_vpid_lock); } -static void vmx_clear_msr_bitmap_read(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __clear_bit(msr, msr_bitmap + 0x000 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __clear_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); -} - -static void vmx_clear_msr_bitmap_write(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __clear_bit(msr, msr_bitmap + 0x800 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __clear_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); -} - -static void vmx_set_msr_bitmap_read(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __set_bit(msr, msr_bitmap + 0x000 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __set_bit(msr & 0x1fff, msr_bitmap + 0x400 / f); -} - -static void vmx_set_msr_bitmap_write(ulong *msr_bitmap, u32 msr) -{ - int f = sizeof(unsigned long); - - if (msr <= 0x1fff) - __set_bit(msr, msr_bitmap + 0x800 / f); - else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) - __set_bit(msr & 0x1fff, msr_bitmap + 0xc00 / f); -} - void vmx_disable_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, int type) { struct vcpu_vmx *vmx = to_vmx(vcpu); @@ -5494,6 +5443,7 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) u64 pcid; u64 gla; } operand; + int gpr_index; if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { kvm_queue_exception(vcpu, UD_VECTOR); @@ -5501,12 +5451,8 @@ static int handle_invpcid(struct kvm_vcpu *vcpu) } vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO); - type = kvm_register_read(vcpu, (vmx_instruction_info >> 28) & 0xf); - - if (type > 3) { - kvm_inject_gp(vcpu, 0); - return 1; - } + gpr_index = vmx_get_instr_info_reg2(vmx_instruction_info); + type = kvm_register_read(vcpu, gpr_index); /* According to the Intel instruction reference, the memory operand * is read even if it isn't needed (e.g., for type==all) @@ -6749,7 +6695,7 @@ static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu) * If the L02 MSR bitmap does not intercept the MSR, then we need to * save it. */ - if (unlikely(!msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL))) + if (unlikely(!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))) vmx->spec_ctrl = native_read_msr(MSR_IA32_SPEC_CTRL); x86_spec_ctrl_restore_host(vmx->spec_ctrl, 0); @@ -7563,7 +7509,8 @@ static void hardware_unsetup(void) static bool vmx_check_apicv_inhibit_reasons(ulong bit) { ulong supported = BIT(APICV_INHIBIT_REASON_DISABLE) | - BIT(APICV_INHIBIT_REASON_HYPERV); + BIT(APICV_INHIBIT_REASON_HYPERV) | + BIT(APICV_INHIBIT_REASON_BLOCKIRQ); return supported & BIT(bit); } diff --git a/arch/x86/kvm/vmx/vmx.h b/arch/x86/kvm/vmx/vmx.h index e7db42e3b0ce..a4ead6023133 100644 --- a/arch/x86/kvm/vmx/vmx.h +++ b/arch/x86/kvm/vmx/vmx.h @@ -400,6 +400,34 @@ static inline void vmx_set_intercept_for_msr(struct kvm_vcpu *vcpu, u32 msr, void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu); +/* + * Note, early Intel manuals have the write-low and read-high bitmap offsets + * the wrong way round. The bitmaps control MSRs 0x00000000-0x00001fff and + * 0xc0000000-0xc0001fff. The former (low) uses bytes 0-0x3ff for reads and + * 0x800-0xbff for writes. The latter (high) uses 0x400-0x7ff for reads and + * 0xc00-0xfff for writes. MSRs not covered by either of the ranges always + * VM-Exit. + */ +#define __BUILD_VMX_MSR_BITMAP_HELPER(rtype, action, bitop, access, base) \ +static inline rtype vmx_##action##_msr_bitmap_##access(unsigned long *bitmap, \ + u32 msr) \ +{ \ + int f = sizeof(unsigned long); \ + \ + if (msr <= 0x1fff) \ + return bitop##_bit(msr, bitmap + base / f); \ + else if ((msr >= 0xc0000000) && (msr <= 0xc0001fff)) \ + return bitop##_bit(msr & 0x1fff, bitmap + (base + 0x400) / f); \ + return (rtype)true; \ +} +#define BUILD_VMX_MSR_BITMAP_HELPERS(ret_type, action, bitop) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, read, 0x0) \ + __BUILD_VMX_MSR_BITMAP_HELPER(ret_type, action, bitop, write, 0x800) + +BUILD_VMX_MSR_BITMAP_HELPERS(bool, test, test) +BUILD_VMX_MSR_BITMAP_HELPERS(void, clear, __clear) +BUILD_VMX_MSR_BITMAP_HELPERS(void, set, __set) + static inline u8 vmx_get_rvi(void) { return vmcs_read16(GUEST_INTR_STATUS) & 0xff; @@ -522,4 +550,9 @@ static inline bool vmx_guest_state_valid(struct kvm_vcpu *vcpu) void dump_vmcs(struct kvm_vcpu *vcpu); +static inline int vmx_get_instr_info_reg2(u32 vmx_instr_info) +{ + return (vmx_instr_info >> 28) & 0xf; +} + #endif /* __KVM_X86_VMX_H */ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index c1c4e2b05a63..dc7eb5fddfd3 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -3260,8 +3260,11 @@ static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) static void record_steal_time(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + u64 steal; + u32 version; if (kvm_xen_msr_enabled(vcpu->kvm)) { kvm_xen_runstate_set_running(vcpu); @@ -3271,47 +3274,86 @@ static void record_steal_time(struct kvm_vcpu *vcpu) if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; - /* -EAGAIN is returned in atomic context so we can just return. */ - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, - &map, &vcpu->arch.st.cache, false)) + if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); + if (unlikely(slots->generation != ghc->generation || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { + gfn_t gfn = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; + + /* We rely on the fact that it fits in a single page. */ + BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); + + if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gfn, sizeof(*st)) || + kvm_is_error_hva(ghc->hva) || !ghc->memslot) + return; + } + + st = (struct kvm_steal_time __user *)ghc->hva; /* * Doing a TLB flush here, on the guest's behalf, can avoid * expensive IPIs. */ if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { - u8 st_preempted = xchg(&st->preempted, 0); + u8 st_preempted = 0; + int err = -EFAULT; + + if (!user_access_begin(st, sizeof(*st))) + return; + + asm volatile("1: xchgb %0, %2\n" + "xor %1, %1\n" + "2:\n" + _ASM_EXTABLE_UA(1b, 2b) + : "+r" (st_preempted), + "+&r" (err) + : "m" (st->preempted)); + if (err) + goto out; + + user_access_end(); + + vcpu->arch.st.preempted = 0; trace_kvm_pv_tlb_flush(vcpu->vcpu_id, st_preempted & KVM_VCPU_FLUSH_TLB); if (st_preempted & KVM_VCPU_FLUSH_TLB) kvm_vcpu_flush_tlb_guest(vcpu); + + if (!user_access_begin(st, sizeof(*st))) + goto dirty; } else { - st->preempted = 0; + if (!user_access_begin(st, sizeof(*st))) + return; + + unsafe_put_user(0, &st->preempted, out); + vcpu->arch.st.preempted = 0; } - vcpu->arch.st.preempted = 0; + unsafe_get_user(version, &st->version, out); + if (version & 1) + version += 1; /* first time write, random junk */ - if (st->version & 1) - st->version += 1; /* first time write, random junk */ - - st->version += 1; + version += 1; + unsafe_put_user(version, &st->version, out); smp_wmb(); - st->steal += current->sched_info.run_delay - + unsafe_get_user(steal, &st->steal, out); + steal += current->sched_info.run_delay - vcpu->arch.st.last_steal; vcpu->arch.st.last_steal = current->sched_info.run_delay; + unsafe_put_user(steal, &st->steal, out); - smp_wmb(); + version += 1; + unsafe_put_user(version, &st->version, out); - st->version += 1; - - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, false); + out: + user_access_end(); + dirty: + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) @@ -3517,7 +3559,7 @@ int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) return 1; - if (kvm_lapic_enable_pv_eoi(vcpu, data, sizeof(u8))) + if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8))) return 1; break; @@ -4137,7 +4179,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) r = !static_call(kvm_x86_cpu_has_accelerated_tpr)(); break; case KVM_CAP_NR_VCPUS: - r = KVM_SOFT_MAX_VCPUS; + r = num_online_cpus(); break; case KVM_CAP_MAX_VCPUS: r = KVM_MAX_VCPUS; @@ -4351,8 +4393,10 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) { - struct kvm_host_map map; - struct kvm_steal_time *st; + struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; + struct kvm_steal_time __user *st; + struct kvm_memslots *slots; + static const u8 preempted = KVM_VCPU_PREEMPTED; if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) return; @@ -4360,16 +4404,23 @@ static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) if (vcpu->arch.st.preempted) return; - if (kvm_map_gfn(vcpu, vcpu->arch.st.msr_val >> PAGE_SHIFT, &map, - &vcpu->arch.st.cache, true)) + /* This happens on process exit */ + if (unlikely(current->mm != vcpu->kvm->mm)) return; - st = map.hva + - offset_in_page(vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS); + slots = kvm_memslots(vcpu->kvm); - st->preempted = vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + if (unlikely(slots->generation != ghc->generation || + kvm_is_error_hva(ghc->hva) || !ghc->memslot)) + return; - kvm_unmap_gfn(vcpu, &map, &vcpu->arch.st.cache, true, true); + st = (struct kvm_steal_time __user *)ghc->hva; + BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); + + if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted))) + vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; + + mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa)); } void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) @@ -5728,6 +5779,12 @@ split_irqchip_unlock: if (kvm_x86_ops.vm_copy_enc_context_from) r = kvm_x86_ops.vm_copy_enc_context_from(kvm, cap->args[0]); return r; + case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: + r = -EINVAL; + if (kvm_x86_ops.vm_move_enc_context_from) + r = kvm_x86_ops.vm_move_enc_context_from( + kvm, cap->args[0]); + return r; case KVM_CAP_EXIT_HYPERCALL: if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { r = -EINVAL; @@ -7328,7 +7385,9 @@ static void emulator_set_smbase(struct x86_emulate_ctxt *ctxt, u64 smbase) static int emulator_check_pmc(struct x86_emulate_ctxt *ctxt, u32 pmc) { - return kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc); + if (kvm_pmu_is_valid_rdpmc_ecx(emul_to_vcpu(ctxt), pmc)) + return 0; + return -EINVAL; } static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, @@ -9552,7 +9611,7 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) } if (kvm_request_pending(vcpu)) { - if (kvm_check_request(KVM_REQ_VM_BUGGED, vcpu)) { + if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { r = -EIO; goto out; } @@ -10564,6 +10623,24 @@ int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, return ret; } +static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) +{ + bool inhibit = false; + struct kvm_vcpu *vcpu; + int i; + + down_write(&kvm->arch.apicv_update_lock); + + kvm_for_each_vcpu(i, vcpu, kvm) { + if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { + inhibit = true; + break; + } + } + __kvm_request_apicv_update(kvm, !inhibit, APICV_INHIBIT_REASON_BLOCKIRQ); + up_write(&kvm->arch.apicv_update_lock); +} + int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, struct kvm_guest_debug *dbg) { @@ -10616,6 +10693,8 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, static_call(kvm_x86_update_exception_bitmap)(vcpu); + kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm); + r = 0; out: @@ -10859,11 +10938,8 @@ void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) { - struct gfn_to_pfn_cache *cache = &vcpu->arch.st.cache; int idx; - kvm_release_pfn(cache->pfn, cache->dirty, cache); - kvmclock_reset(vcpu); static_call(kvm_x86_vcpu_free)(vcpu); @@ -12275,7 +12351,8 @@ int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva) return kvm_skip_emulated_instruction(vcpu); default: - BUG(); /* We have already checked above that type <= 3 */ + kvm_inject_gp(vcpu, 0); + return 1; } } EXPORT_SYMBOL_GPL(kvm_handle_invpcid); diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c index 23d54b810f08..35487305d8af 100644 --- a/arch/x86/mm/mem_encrypt.c +++ b/arch/x86/mm/mem_encrypt.c @@ -229,28 +229,75 @@ void __init sev_setup_arch(void) swiotlb_adjust_size(size); } +static unsigned long pg_level_to_pfn(int level, pte_t *kpte, pgprot_t *ret_prot) +{ + unsigned long pfn = 0; + pgprot_t prot; + + switch (level) { + case PG_LEVEL_4K: + pfn = pte_pfn(*kpte); + prot = pte_pgprot(*kpte); + break; + case PG_LEVEL_2M: + pfn = pmd_pfn(*(pmd_t *)kpte); + prot = pmd_pgprot(*(pmd_t *)kpte); + break; + case PG_LEVEL_1G: + pfn = pud_pfn(*(pud_t *)kpte); + prot = pud_pgprot(*(pud_t *)kpte); + break; + default: + WARN_ONCE(1, "Invalid level for kpte\n"); + return 0; + } + + if (ret_prot) + *ret_prot = prot; + + return pfn; +} + +void notify_range_enc_status_changed(unsigned long vaddr, int npages, bool enc) +{ +#ifdef CONFIG_PARAVIRT + unsigned long sz = npages << PAGE_SHIFT; + unsigned long vaddr_end = vaddr + sz; + + while (vaddr < vaddr_end) { + int psize, pmask, level; + unsigned long pfn; + pte_t *kpte; + + kpte = lookup_address(vaddr, &level); + if (!kpte || pte_none(*kpte)) { + WARN_ONCE(1, "kpte lookup for vaddr\n"); + return; + } + + pfn = pg_level_to_pfn(level, kpte, NULL); + if (!pfn) + continue; + + psize = page_level_size(level); + pmask = page_level_mask(level); + + notify_page_enc_status_changed(pfn, psize >> PAGE_SHIFT, enc); + + vaddr = (vaddr & pmask) + psize; + } +#endif +} + static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) { pgprot_t old_prot, new_prot; unsigned long pfn, pa, size; pte_t new_pte; - switch (level) { - case PG_LEVEL_4K: - pfn = pte_pfn(*kpte); - old_prot = pte_pgprot(*kpte); - break; - case PG_LEVEL_2M: - pfn = pmd_pfn(*(pmd_t *)kpte); - old_prot = pmd_pgprot(*(pmd_t *)kpte); - break; - case PG_LEVEL_1G: - pfn = pud_pfn(*(pud_t *)kpte); - old_prot = pud_pgprot(*(pud_t *)kpte); - break; - default: + pfn = pg_level_to_pfn(level, kpte, &old_prot); + if (!pfn) return; - } new_prot = old_prot; if (enc) @@ -286,12 +333,13 @@ static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc) static int __init early_set_memory_enc_dec(unsigned long vaddr, unsigned long size, bool enc) { - unsigned long vaddr_end, vaddr_next; + unsigned long vaddr_end, vaddr_next, start; unsigned long psize, pmask; int split_page_size_mask; int level, ret; pte_t *kpte; + start = vaddr; vaddr_next = vaddr; vaddr_end = vaddr + size; @@ -346,6 +394,7 @@ static int __init early_set_memory_enc_dec(unsigned long vaddr, ret = 0; + notify_range_enc_status_changed(start, PAGE_ALIGN(size) >> PAGE_SHIFT, enc); out: __flush_tlb_all(); return ret; @@ -361,6 +410,11 @@ int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size) return early_set_memory_enc_dec(vaddr, size, true); } +void __init early_set_mem_enc_dec_hypercall(unsigned long vaddr, int npages, bool enc) +{ + notify_range_enc_status_changed(vaddr, npages, enc); +} + /* Override for DMA direct allocation check - ARCH_HAS_FORCE_DMA_UNENCRYPTED */ bool force_dma_unencrypted(struct device *dev) { diff --git a/arch/x86/mm/pat/set_memory.c b/arch/x86/mm/pat/set_memory.c index 934dc5b2df36..b4072115c8ef 100644 --- a/arch/x86/mm/pat/set_memory.c +++ b/arch/x86/mm/pat/set_memory.c @@ -2023,6 +2023,12 @@ static int __set_memory_enc_pgtable(unsigned long addr, int numpages, bool enc) */ cpa_flush(&cpa, 0); + /* + * Notify hypervisor that a given memory range is mapped encrypted + * or decrypted. + */ + notify_range_enc_status_changed(addr, numpages, enc); + return ret; } diff --git a/crypto/algapi.c b/crypto/algapi.c index d379fd91fb7b..a366cb3e8aa1 100644 --- a/crypto/algapi.c +++ b/crypto/algapi.c @@ -284,6 +284,8 @@ static struct crypto_larval *__crypto_register_alg(struct crypto_alg *alg) if (larval) list_add(&larval->alg.cra_list, &crypto_alg_list); + else + alg->cra_flags |= CRYPTO_ALG_TESTED; crypto_stats_init(alg); diff --git a/drivers/bus/brcmstb_gisb.c b/drivers/bus/brcmstb_gisb.c index 4c2f7d61cb9b..183d5cc37d42 100644 --- a/drivers/bus/brcmstb_gisb.c +++ b/drivers/bus/brcmstb_gisb.c @@ -485,7 +485,7 @@ static int __init brcmstb_gisb_arb_probe(struct platform_device *pdev) list_add_tail(&gdev->next, &brcmstb_gisb_arb_device_list); #ifdef CONFIG_MIPS - board_be_handler = brcmstb_bus_error_handler; + mips_set_be_handler(brcmstb_bus_error_handler); #endif if (list_is_singular(&brcmstb_gisb_arb_device_list)) { diff --git a/drivers/pci/controller/Kconfig b/drivers/pci/controller/Kconfig index e917bb3652bb..93b141110537 100644 --- a/drivers/pci/controller/Kconfig +++ b/drivers/pci/controller/Kconfig @@ -270,7 +270,8 @@ config VMD config PCIE_BRCMSTB tristate "Broadcom Brcmstb PCIe host controller" - depends on ARCH_BRCMSTB || ARCH_BCM2835 || ARCH_BCM4908 || COMPILE_TEST + depends on ARCH_BRCMSTB || ARCH_BCM2835 || ARCH_BCM4908 || \ + BMIPS_GENERIC || COMPILE_TEST depends on OF depends on PCI_MSI_IRQ_DOMAIN default ARCH_BRCMSTB diff --git a/drivers/pci/hotplug/s390_pci_hpc.c b/drivers/pci/hotplug/s390_pci_hpc.c index dcefdb42ac46..a89b7de72dcf 100644 --- a/drivers/pci/hotplug/s390_pci_hpc.c +++ b/drivers/pci/hotplug/s390_pci_hpc.c @@ -57,6 +57,29 @@ static int disable_slot(struct hotplug_slot *hotplug_slot) return zpci_deconfigure_device(zdev); } +static int reset_slot(struct hotplug_slot *hotplug_slot, bool probe) +{ + struct zpci_dev *zdev = container_of(hotplug_slot, struct zpci_dev, + hotplug_slot); + + if (zdev->state != ZPCI_FN_STATE_CONFIGURED) + return -EIO; + /* + * We can't take the zdev->lock as reset_slot may be called during + * probing and/or device removal which already happens under the + * zdev->lock. Instead the user should use the higher level + * pci_reset_function() or pci_bus_reset() which hold the PCI device + * lock preventing concurrent removal. If not using these functions + * holding the PCI device lock is required. + */ + + /* As long as the function is configured we can reset */ + if (probe) + return 0; + + return zpci_hot_reset_device(zdev); +} + static int get_power_status(struct hotplug_slot *hotplug_slot, u8 *value) { struct zpci_dev *zdev = container_of(hotplug_slot, struct zpci_dev, @@ -76,6 +99,7 @@ static int get_adapter_status(struct hotplug_slot *hotplug_slot, u8 *value) static const struct hotplug_slot_ops s390_hotplug_slot_ops = { .enable_slot = enable_slot, .disable_slot = disable_slot, + .reset_slot = reset_slot, .get_power_status = get_power_status, .get_adapter_status = get_adapter_status, }; diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c index 1579a3724eb4..3d2fb394986a 100644 --- a/drivers/pci/pci.c +++ b/drivers/pci/pci.c @@ -5106,12 +5106,13 @@ static int pci_reset_bus_function(struct pci_dev *dev, bool probe) return pci_parent_bus_reset(dev, probe); } -static void pci_dev_lock(struct pci_dev *dev) +void pci_dev_lock(struct pci_dev *dev) { pci_cfg_access_lock(dev); /* block PM suspend, driver probe, etc. */ device_lock(&dev->dev); } +EXPORT_SYMBOL_GPL(pci_dev_lock); /* Return 1 on successful lock, 0 on contention */ int pci_dev_trylock(struct pci_dev *dev) diff --git a/drivers/s390/char/tape_std.c b/drivers/s390/char/tape_std.c index 1f5fab617b67..f7e75d9fedf6 100644 --- a/drivers/s390/char/tape_std.c +++ b/drivers/s390/char/tape_std.c @@ -53,7 +53,6 @@ int tape_std_assign(struct tape_device *device) { int rc; - struct timer_list timeout; struct tape_request *request; request = tape_alloc_request(2, 11); @@ -70,7 +69,7 @@ tape_std_assign(struct tape_device *device) * So we set up a timeout for this call. */ timer_setup(&request->timer, tape_std_assign_timeout, 0); - mod_timer(&timeout, jiffies + 2 * HZ); + mod_timer(&request->timer, jiffies + msecs_to_jiffies(2000)); rc = tape_do_io_interruptible(device, request); diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 2bc55ccf3f23..ce9e7517430f 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -437,8 +437,8 @@ static ssize_t dev_busid_show(struct device *dev, struct subchannel *sch = to_subchannel(dev); struct pmcw *pmcw = &sch->schib.pmcw; - if ((pmcw->st == SUBCHANNEL_TYPE_IO || - pmcw->st == SUBCHANNEL_TYPE_MSG) && pmcw->dnv) + if ((pmcw->st == SUBCHANNEL_TYPE_IO && pmcw->dnv) || + (pmcw->st == SUBCHANNEL_TYPE_MSG && pmcw->w)) return sysfs_emit(buf, "0.%x.%04x\n", sch->schid.ssid, pmcw->dev); else diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index adafdf86f42f..fac918ccb305 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -108,7 +108,9 @@ static const struct netfs_read_request_ops v9fs_req_ops = { */ static int v9fs_vfs_readpage(struct file *file, struct page *page) { - return netfs_readpage(file, page, &v9fs_req_ops, NULL); + struct folio *folio = page_folio(page); + + return netfs_readpage(file, folio, &v9fs_req_ops, NULL); } /** @@ -130,13 +132,15 @@ static void v9fs_vfs_readahead(struct readahead_control *ractl) static int v9fs_release_page(struct page *page, gfp_t gfp) { - if (PagePrivate(page)) + struct folio *folio = page_folio(page); + + if (folio_test_private(folio)) return 0; #ifdef CONFIG_9P_FSCACHE - if (PageFsCache(page)) { + if (folio_test_fscache(folio)) { if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) return 0; - wait_on_page_fscache(page); + folio_wait_fscache(folio); } #endif return 1; @@ -152,55 +156,58 @@ static int v9fs_release_page(struct page *page, gfp_t gfp) static void v9fs_invalidate_page(struct page *page, unsigned int offset, unsigned int length) { - wait_on_page_fscache(page); + struct folio *folio = page_folio(page); + + folio_wait_fscache(folio); } -static int v9fs_vfs_writepage_locked(struct page *page) +static int v9fs_vfs_write_folio_locked(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio_inode(folio); struct v9fs_inode *v9inode = V9FS_I(inode); - loff_t start = page_offset(page); - loff_t size = i_size_read(inode); + loff_t start = folio_pos(folio); + loff_t i_size = i_size_read(inode); struct iov_iter from; - int err, len; + size_t len = folio_size(folio); + int err; - if (page->index == size >> PAGE_SHIFT) - len = size & ~PAGE_MASK; - else - len = PAGE_SIZE; + if (start >= i_size) + return 0; /* Simultaneous truncation occurred */ - iov_iter_xarray(&from, WRITE, &page->mapping->i_pages, start, len); + len = min_t(loff_t, i_size - start, len); + + iov_iter_xarray(&from, WRITE, &folio_mapping(folio)->i_pages, start, len); /* We should have writeback_fid always set */ BUG_ON(!v9inode->writeback_fid); - set_page_writeback(page); + folio_start_writeback(folio); p9_client_write(v9inode->writeback_fid, start, &from, &err); - end_page_writeback(page); + folio_end_writeback(folio); return err; } static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) { + struct folio *folio = page_folio(page); int retval; - p9_debug(P9_DEBUG_VFS, "page %p\n", page); + p9_debug(P9_DEBUG_VFS, "folio %p\n", folio); - retval = v9fs_vfs_writepage_locked(page); + retval = v9fs_vfs_write_folio_locked(folio); if (retval < 0) { if (retval == -EAGAIN) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); retval = 0; } else { - SetPageError(page); - mapping_set_error(page->mapping, retval); + mapping_set_error(folio_mapping(folio), retval); } } else retval = 0; - unlock_page(page); + folio_unlock(folio); return retval; } @@ -213,14 +220,15 @@ static int v9fs_vfs_writepage(struct page *page, struct writeback_control *wbc) static int v9fs_launder_page(struct page *page) { + struct folio *folio = page_folio(page); int retval; - if (clear_page_dirty_for_io(page)) { - retval = v9fs_vfs_writepage_locked(page); + if (folio_clear_dirty_for_io(folio)) { + retval = v9fs_vfs_write_folio_locked(folio); if (retval) return retval; } - wait_on_page_fscache(page); + folio_wait_fscache(folio); return 0; } @@ -265,10 +273,10 @@ v9fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) static int v9fs_write_begin(struct file *filp, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int flags, - struct page **pagep, void **fsdata) + struct page **subpagep, void **fsdata) { int retval; - struct page *page; + struct folio *folio; struct v9fs_inode *v9inode = V9FS_I(mapping->host); p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); @@ -279,31 +287,32 @@ static int v9fs_write_begin(struct file *filp, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - retval = netfs_write_begin(filp, mapping, pos, len, flags, &page, fsdata, + retval = netfs_write_begin(filp, mapping, pos, len, flags, &folio, fsdata, &v9fs_req_ops, NULL); if (retval < 0) return retval; - *pagep = find_subpage(page, pos / PAGE_SIZE); + *subpagep = &folio->page; return retval; } static int v9fs_write_end(struct file *filp, struct address_space *mapping, loff_t pos, unsigned int len, unsigned int copied, - struct page *page, void *fsdata) + struct page *subpage, void *fsdata) { loff_t last_pos = pos + copied; - struct inode *inode = page->mapping->host; + struct folio *folio = page_folio(subpage); + struct inode *inode = mapping->host; p9_debug(P9_DEBUG_VFS, "filp %p, mapping %p\n", filp, mapping); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (unlikely(copied < len)) { copied = 0; goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* @@ -314,10 +323,10 @@ static int v9fs_write_end(struct file *filp, struct address_space *mapping, inode_add_bytes(inode, last_pos - inode->i_size); i_size_write(inode, last_pos); } - set_page_dirty(page); + folio_mark_dirty(folio); out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 4244d48398ef..612e297f3763 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -528,13 +528,13 @@ static vm_fault_t v9fs_vm_page_mkwrite(struct vm_fault *vmf) { struct v9fs_inode *v9inode; - struct page *page = vmf->page; + struct folio *folio = page_folio(vmf->page); struct file *filp = vmf->vma->vm_file; struct inode *inode = file_inode(filp); - p9_debug(P9_DEBUG_VFS, "page %p fid %lx\n", - page, (unsigned long)filp->private_data); + p9_debug(P9_DEBUG_VFS, "folio %p fid %lx\n", + folio, (unsigned long)filp->private_data); v9inode = V9FS_I(inode); @@ -542,24 +542,24 @@ v9fs_vm_page_mkwrite(struct vm_fault *vmf) * be modified. We then assume the entire page will need writing back. */ #ifdef CONFIG_9P_FSCACHE - if (PageFsCache(page) && - wait_on_page_fscache_killable(page) < 0) - return VM_FAULT_RETRY; + if (folio_test_fscache(folio) && + folio_wait_fscache_killable(folio) < 0) + return VM_FAULT_NOPAGE; #endif /* Update file times before taking page lock */ file_update_time(filp); BUG_ON(!v9inode->writeback_fid); - if (lock_page_killable(page) < 0) + if (folio_lock_killable(folio) < 0) return VM_FAULT_RETRY; - if (page->mapping != inode->i_mapping) + if (folio_mapping(folio) != inode->i_mapping) goto out_unlock; - wait_for_stable_page(page); + folio_wait_stable(folio); return VM_FAULT_LOCKED; out_unlock: - unlock_page(page); + folio_unlock(folio); return VM_FAULT_NOPAGE; } diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 4579bbda4634..da9b4f8577a1 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -103,13 +103,13 @@ struct afs_lookup_cookie { }; /* - * Drop the refs that we're holding on the pages we were reading into. We've + * Drop the refs that we're holding on the folios we were reading into. We've * got refs on the first nr_pages pages. */ static void afs_dir_read_cleanup(struct afs_read *req) { struct address_space *mapping = req->vnode->vfs_inode.i_mapping; - struct page *page; + struct folio *folio; pgoff_t last = req->nr_pages - 1; XA_STATE(xas, &mapping->i_pages, 0); @@ -118,65 +118,56 @@ static void afs_dir_read_cleanup(struct afs_read *req) return; rcu_read_lock(); - xas_for_each(&xas, page, last) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, last) { + if (xas_retry(&xas, folio)) continue; - BUG_ON(xa_is_value(page)); - BUG_ON(PageCompound(page)); - ASSERTCMP(page->mapping, ==, mapping); + BUG_ON(xa_is_value(folio)); + ASSERTCMP(folio_file_mapping(folio), ==, mapping); - put_page(page); + folio_put(folio); } rcu_read_unlock(); } /* - * check that a directory page is valid + * check that a directory folio is valid */ -static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, - loff_t i_size) +static bool afs_dir_check_folio(struct afs_vnode *dvnode, struct folio *folio, + loff_t i_size) { - struct afs_xdr_dir_page *dbuf; - loff_t latter, off; - int tmp, qty; + union afs_xdr_dir_block *block; + size_t offset, size; + loff_t pos; - /* Determine how many magic numbers there should be in this page, but + /* Determine how many magic numbers there should be in this folio, but * we must take care because the directory may change size under us. */ - off = page_offset(page); - if (i_size <= off) + pos = folio_pos(folio); + if (i_size <= pos) goto checked; - latter = i_size - off; - if (latter >= PAGE_SIZE) - qty = PAGE_SIZE; - else - qty = latter; - qty /= sizeof(union afs_xdr_dir_block); - - /* check them */ - dbuf = kmap_atomic(page); - for (tmp = 0; tmp < qty; tmp++) { - if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) { - printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", - __func__, dvnode->vfs_inode.i_ino, tmp, qty, - ntohs(dbuf->blocks[tmp].hdr.magic)); - trace_afs_dir_check_failed(dvnode, off, i_size); - kunmap(page); + size = min_t(loff_t, folio_size(folio), i_size - pos); + for (offset = 0; offset < size; offset += sizeof(*block)) { + block = kmap_local_folio(folio, offset); + if (block->hdr.magic != AFS_DIR_MAGIC) { + printk("kAFS: %s(%lx): [%llx] bad magic %zx/%zx is %04hx\n", + __func__, dvnode->vfs_inode.i_ino, + pos, offset, size, ntohs(block->hdr.magic)); + trace_afs_dir_check_failed(dvnode, pos + offset, i_size); + kunmap_local(block); trace_afs_file_error(dvnode, -EIO, afs_file_error_dir_bad_magic); goto error; } /* Make sure each block is NUL terminated so we can reasonably - * use string functions on it. The filenames in the page + * use string functions on it. The filenames in the folio * *should* be NUL-terminated anyway. */ - ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0; + ((u8 *)block)[AFS_DIR_BLOCK_SIZE - 1] = 0; + + kunmap_local(block); } - - kunmap_atomic(dbuf); - checked: afs_stat_v(dvnode, n_read_dir); return true; @@ -190,11 +181,11 @@ error: */ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) { - struct afs_xdr_dir_page *dbuf; + union afs_xdr_dir_block *block; struct address_space *mapping = dvnode->vfs_inode.i_mapping; - struct page *page; - unsigned int i, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block); + struct folio *folio; pgoff_t last = req->nr_pages - 1; + size_t offset, size; XA_STATE(xas, &mapping->i_pages, 0); @@ -205,30 +196,28 @@ static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) req->pos, req->nr_pages, req->iter->iov_offset, iov_iter_count(req->iter)); - xas_for_each(&xas, page, last) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, last) { + if (xas_retry(&xas, folio)) continue; - BUG_ON(PageCompound(page)); - BUG_ON(page->mapping != mapping); + BUG_ON(folio_file_mapping(folio) != mapping); - dbuf = kmap_atomic(page); - for (i = 0; i < qty; i++) { - union afs_xdr_dir_block *block = &dbuf->blocks[i]; - - pr_warn("[%02lx] %32phN\n", page->index * qty + i, block); + size = min_t(loff_t, folio_size(folio), req->actual_len - folio_pos(folio)); + for (offset = 0; offset < size; offset += sizeof(*block)) { + block = kmap_local_folio(folio, offset); + pr_warn("[%02lx] %32phN\n", folio_index(folio) + offset, block); + kunmap_local(block); } - kunmap_atomic(dbuf); } } /* - * Check all the pages in a directory. All the pages are held pinned. + * Check all the blocks in a directory. All the folios are held pinned. */ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) { struct address_space *mapping = dvnode->vfs_inode.i_mapping; - struct page *page; + struct folio *folio; pgoff_t last = req->nr_pages - 1; int ret = 0; @@ -238,14 +227,13 @@ static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) return 0; rcu_read_lock(); - xas_for_each(&xas, page, last) { - if (xas_retry(&xas, page)) + xas_for_each(&xas, folio, last) { + if (xas_retry(&xas, folio)) continue; - BUG_ON(PageCompound(page)); - BUG_ON(page->mapping != mapping); + BUG_ON(folio_file_mapping(folio) != mapping); - if (!afs_dir_check_page(dvnode, page, req->file_size)) { + if (!afs_dir_check_folio(dvnode, folio, req->actual_len)) { afs_dir_dump(dvnode, req); ret = -EIO; break; @@ -274,15 +262,16 @@ static int afs_dir_open(struct inode *inode, struct file *file) /* * Read the directory into the pagecache in one go, scrubbing the previous - * contents. The list of pages is returned, pinning them so that they don't + * contents. The list of folios is returned, pinning them so that they don't * get reclaimed during the iteration. */ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) __acquires(&dvnode->validate_lock) { + struct address_space *mapping = dvnode->vfs_inode.i_mapping; struct afs_read *req; loff_t i_size; - int nr_pages, i, n; + int nr_pages, i; int ret; _enter(""); @@ -320,43 +309,30 @@ expand: req->iter = &req->def_iter; /* Fill in any gaps that we might find where the memory reclaimer has - * been at work and pin all the pages. If there are any gaps, we will + * been at work and pin all the folios. If there are any gaps, we will * need to reread the entire directory contents. */ i = req->nr_pages; while (i < nr_pages) { - struct page *pages[8], *page; - - n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i, - min_t(unsigned int, nr_pages - i, - ARRAY_SIZE(pages)), - pages); - _debug("find %u at %u/%u", n, i, nr_pages); - - if (n == 0) { - gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask; + struct folio *folio; + folio = filemap_get_folio(mapping, i); + if (!folio) { if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) afs_stat_v(dvnode, n_inval); ret = -ENOMEM; - page = __page_cache_alloc(gfp); - if (!page) + folio = __filemap_get_folio(mapping, + i, FGP_LOCK | FGP_CREAT, + mapping->gfp_mask); + if (!folio) goto error; - ret = add_to_page_cache_lru(page, - dvnode->vfs_inode.i_mapping, - i, gfp); - if (ret < 0) - goto error; - - attach_page_private(page, (void *)1); - unlock_page(page); - req->nr_pages++; - i++; - } else { - req->nr_pages += n; - i += n; + folio_attach_private(folio, (void *)1); + folio_unlock(folio); } + + req->nr_pages += folio_nr_pages(folio); + i += folio_nr_pages(folio); } /* If we're going to reload, we need to lock all the pages to prevent @@ -424,7 +400,7 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, size_t nlen; int tmp; - _enter("%u,%x,%p,,",(unsigned)ctx->pos,blkoff,block); + _enter("%llx,%x", ctx->pos, blkoff); curr = (ctx->pos - blkoff) / sizeof(union afs_xdr_dirent); @@ -513,12 +489,10 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, struct key *key, afs_dataversion_t *_dir_version) { struct afs_vnode *dvnode = AFS_FS_I(dir); - struct afs_xdr_dir_page *dbuf; union afs_xdr_dir_block *dblock; struct afs_read *req; - struct page *page; - unsigned blkoff, limit; - void __rcu **slot; + struct folio *folio; + unsigned offset, size; int ret; _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); @@ -540,43 +514,30 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, /* walk through the blocks in sequence */ ret = 0; while (ctx->pos < req->actual_len) { - blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1); - - /* Fetch the appropriate page from the directory and re-add it + /* Fetch the appropriate folio from the directory and re-add it * to the LRU. We have all the pages pinned with an extra ref. */ - rcu_read_lock(); - page = NULL; - slot = radix_tree_lookup_slot(&dvnode->vfs_inode.i_mapping->i_pages, - blkoff / PAGE_SIZE); - if (slot) - page = radix_tree_deref_slot(slot); - rcu_read_unlock(); - if (!page) { + folio = __filemap_get_folio(dir->i_mapping, ctx->pos / PAGE_SIZE, + FGP_ACCESSED, 0); + if (!folio) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; } - mark_page_accessed(page); - limit = blkoff & ~(PAGE_SIZE - 1); + offset = round_down(ctx->pos, sizeof(*dblock)) - folio_file_pos(folio); + size = min_t(loff_t, folio_size(folio), + req->actual_len - folio_file_pos(folio)); - dbuf = kmap(page); - - /* deal with the individual blocks stashed on this page */ do { - dblock = &dbuf->blocks[(blkoff % PAGE_SIZE) / - sizeof(union afs_xdr_dir_block)]; - ret = afs_dir_iterate_block(dvnode, ctx, dblock, blkoff); - if (ret != 1) { - kunmap(page); + dblock = kmap_local_folio(folio, offset); + ret = afs_dir_iterate_block(dvnode, ctx, dblock, + folio_file_pos(folio) + offset); + kunmap_local(dblock); + if (ret != 1) goto out; - } - blkoff += sizeof(union afs_xdr_dir_block); + } while (offset += sizeof(*dblock), offset < size); - } while (ctx->pos < dir->i_size && blkoff < limit); - - kunmap(page); ret = 0; } @@ -2037,42 +1998,42 @@ error: } /* - * Release a directory page and clean up its private state if it's not busy - * - return true if the page can now be released, false if not + * Release a directory folio and clean up its private state if it's not busy + * - return true if the folio can now be released, false if not */ -static int afs_dir_releasepage(struct page *page, gfp_t gfp_flags) +static int afs_dir_releasepage(struct page *subpage, gfp_t gfp_flags) { - struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); + struct folio *folio = page_folio(subpage); + struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, page->index); + _enter("{{%llx:%llu}[%lu]}", dvnode->fid.vid, dvnode->fid.vnode, folio_index(folio)); - detach_page_private(page); + folio_detach_private(folio); /* The directory will need reloading. */ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) afs_stat_v(dvnode, n_relpg); - return 1; + return true; } /* - * invalidate part or all of a page - * - release a page and clean up its private data if offset is 0 (indicating - * the entire page) + * Invalidate part or all of a folio. */ -static void afs_dir_invalidatepage(struct page *page, unsigned int offset, +static void afs_dir_invalidatepage(struct page *subpage, unsigned int offset, unsigned int length) { - struct afs_vnode *dvnode = AFS_FS_I(page->mapping->host); + struct folio *folio = page_folio(subpage); + struct afs_vnode *dvnode = AFS_FS_I(folio_inode(folio)); - _enter("{%lu},%u,%u", page->index, offset, length); + _enter("{%lu},%u,%u", folio_index(folio), offset, length); - BUG_ON(!PageLocked(page)); + BUG_ON(!folio_test_locked(folio)); /* The directory will need reloading. */ if (test_and_clear_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) afs_stat_v(dvnode, n_inval); - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == thp_size(page)) - detach_page_private(page); + /* we clean up only if the entire folio is being invalidated */ + if (offset == 0 && length == folio_size(folio)) + folio_detach_private(folio); } diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 540b9fc96824..d98e109ecee9 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -104,6 +104,25 @@ static void afs_clear_contig_bits(union afs_xdr_dir_block *block, block->hdr.bitmap[7] &= ~(u8)(mask >> 7 * 8); } +/* + * Get a new directory folio. + */ +static struct folio *afs_dir_get_folio(struct afs_vnode *vnode, pgoff_t index) +{ + struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct folio *folio; + + folio = __filemap_get_folio(mapping, index, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mapping->gfp_mask); + if (!folio) + clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + else if (folio && !folio_test_private(folio)) + folio_attach_private(folio, (void *)1); + + return folio; +} + /* * Scan a directory block looking for a dirent of the right name. */ @@ -188,13 +207,11 @@ void afs_edit_dir_add(struct afs_vnode *vnode, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *meta, *block; - struct afs_xdr_dir_page *meta_page, *dir_page; union afs_xdr_dirent *de; - struct page *page0, *page; + struct folio *folio0, *folio; unsigned int need_slots, nr_blocks, b; pgoff_t index; loff_t i_size; - gfp_t gfp; int slot; _enter(",,{%d,%s},", name->len, name->name); @@ -206,10 +223,8 @@ void afs_edit_dir_add(struct afs_vnode *vnode, return; } - gfp = vnode->vfs_inode.i_mapping->gfp_mask; - page0 = find_or_create_page(vnode->vfs_inode.i_mapping, 0, gfp); - if (!page0) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + folio0 = afs_dir_get_folio(vnode, 0); + if (!folio0) { _leave(" [fgp]"); return; } @@ -217,42 +232,35 @@ void afs_edit_dir_add(struct afs_vnode *vnode, /* Work out how many slots we're going to need. */ need_slots = afs_dir_calc_slots(name->len); - meta_page = kmap(page0); - meta = &meta_page->blocks[0]; + meta = kmap_local_folio(folio0, 0); if (i_size == 0) goto new_directory; nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; - /* Find a block that has sufficient slots available. Each VM page + /* Find a block that has sufficient slots available. Each folio * contains two or more directory blocks. */ for (b = 0; b < nr_blocks + 1; b++) { - /* If the directory extended into a new page, then we need to - * tack a new page on the end. + /* If the directory extended into a new folio, then we need to + * tack a new folio on the end. */ index = b / AFS_DIR_BLOCKS_PER_PAGE; - if (index == 0) { - page = page0; - dir_page = meta_page; + if (nr_blocks >= AFS_DIR_MAX_BLOCKS) + goto error; + if (index >= folio_nr_pages(folio0)) { + folio = afs_dir_get_folio(vnode, index); + if (!folio) + goto error; } else { - if (nr_blocks >= AFS_DIR_MAX_BLOCKS) - goto error; - gfp = vnode->vfs_inode.i_mapping->gfp_mask; - page = find_or_create_page(vnode->vfs_inode.i_mapping, - index, gfp); - if (!page) - goto error; - if (!PagePrivate(page)) - attach_page_private(page, (void *)1); - dir_page = kmap(page); + folio = folio0; } + block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio)); + /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) goto invalidated; - block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE]; - _debug("block %u: %2u %3u %u", b, (b < AFS_DIR_BLOCKS_WITH_CTR) ? meta->meta.alloc_ctrs[b] : 99, @@ -266,7 +274,7 @@ void afs_edit_dir_add(struct afs_vnode *vnode, afs_set_i_size(vnode, (b + 1) * AFS_DIR_BLOCK_SIZE); } - /* Only lower dir pages have a counter in the header. */ + /* Only lower dir blocks have a counter in the header. */ if (b >= AFS_DIR_BLOCKS_WITH_CTR || meta->meta.alloc_ctrs[b] >= need_slots) { /* We need to try and find one or more consecutive @@ -279,10 +287,10 @@ void afs_edit_dir_add(struct afs_vnode *vnode, } } - if (page != page0) { - unlock_page(page); - kunmap(page); - put_page(page); + kunmap_local(block); + if (folio != folio0) { + folio_unlock(folio); + folio_put(folio); } } @@ -298,8 +306,8 @@ new_directory: i_size = AFS_DIR_BLOCK_SIZE; afs_set_i_size(vnode, i_size); slot = AFS_DIR_RESV_BLOCKS0; - page = page0; - block = meta; + folio = folio0; + block = kmap_local_folio(folio, 0); nr_blocks = 1; b = 0; @@ -318,10 +326,10 @@ found_space: /* Adjust the bitmap. */ afs_set_contig_bits(block, slot, need_slots); - if (page != page0) { - unlock_page(page); - kunmap(page); - put_page(page); + kunmap_local(block); + if (folio != folio0) { + folio_unlock(folio); + folio_put(folio); } /* Adjust the allocation counter. */ @@ -333,18 +341,19 @@ found_space: _debug("Insert %s in %u[%u]", name->name, b, slot); out_unmap: - unlock_page(page0); - kunmap(page0); - put_page(page0); + kunmap_local(meta); + folio_unlock(folio0); + folio_put(folio0); _leave(""); return; invalidated: trace_afs_edit_dir(vnode, why, afs_edit_dir_create_inval, 0, 0, 0, 0, name->name); clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - if (page != page0) { - kunmap(page); - put_page(page); + kunmap_local(block); + if (folio != folio0) { + folio_unlock(folio); + folio_put(folio); } goto out_unmap; @@ -364,10 +373,9 @@ error: void afs_edit_dir_remove(struct afs_vnode *vnode, struct qstr *name, enum afs_edit_dir_reason why) { - struct afs_xdr_dir_page *meta_page, *dir_page; union afs_xdr_dir_block *meta, *block; union afs_xdr_dirent *de; - struct page *page0, *page; + struct folio *folio0, *folio; unsigned int need_slots, nr_blocks, b; pgoff_t index; loff_t i_size; @@ -384,9 +392,8 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, } nr_blocks = i_size / AFS_DIR_BLOCK_SIZE; - page0 = find_lock_page(vnode->vfs_inode.i_mapping, 0); - if (!page0) { - clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); + folio0 = afs_dir_get_folio(vnode, 0); + if (!folio0) { _leave(" [fgp]"); return; } @@ -394,30 +401,27 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, /* Work out how many slots we're going to discard. */ need_slots = afs_dir_calc_slots(name->len); - meta_page = kmap(page0); - meta = &meta_page->blocks[0]; + meta = kmap_local_folio(folio0, 0); - /* Find a page that has sufficient slots available. Each VM page + /* Find a block that has sufficient slots available. Each folio * contains two or more directory blocks. */ for (b = 0; b < nr_blocks; b++) { index = b / AFS_DIR_BLOCKS_PER_PAGE; - if (index != 0) { - page = find_lock_page(vnode->vfs_inode.i_mapping, index); - if (!page) + if (index >= folio_nr_pages(folio0)) { + folio = afs_dir_get_folio(vnode, index); + if (!folio) goto error; - dir_page = kmap(page); } else { - page = page0; - dir_page = meta_page; + folio = folio0; } + block = kmap_local_folio(folio, b * AFS_DIR_BLOCK_SIZE - folio_file_pos(folio)); + /* Abandon the edit if we got a callback break. */ if (!test_bit(AFS_VNODE_DIR_VALID, &vnode->flags)) goto invalidated; - block = &dir_page->blocks[b % AFS_DIR_BLOCKS_PER_PAGE]; - if (b > AFS_DIR_BLOCKS_WITH_CTR || meta->meta.alloc_ctrs[b] <= AFS_DIR_SLOTS_PER_BLOCK - 1 - need_slots) { slot = afs_dir_scan_block(block, name, b); @@ -425,10 +429,10 @@ void afs_edit_dir_remove(struct afs_vnode *vnode, goto found_dirent; } - if (page != page0) { - unlock_page(page); - kunmap(page); - put_page(page); + kunmap_local(block); + if (folio != folio0) { + folio_unlock(folio); + folio_put(folio); } } @@ -449,10 +453,10 @@ found_dirent: /* Adjust the bitmap. */ afs_clear_contig_bits(block, slot, need_slots); - if (page != page0) { - unlock_page(page); - kunmap(page); - put_page(page); + kunmap_local(block); + if (folio != folio0) { + folio_unlock(folio); + folio_put(folio); } /* Adjust the allocation counter. */ @@ -464,9 +468,9 @@ found_dirent: _debug("Remove %s from %u[%u]", name->name, b, slot); out_unmap: - unlock_page(page0); - kunmap(page0); - put_page(page0); + kunmap_local(meta); + folio_unlock(folio0); + folio_put(folio0); _leave(""); return; @@ -474,10 +478,10 @@ invalidated: trace_afs_edit_dir(vnode, why, afs_edit_dir_delete_inval, 0, 0, 0, 0, name->name); clear_bit(AFS_VNODE_DIR_VALID, &vnode->flags); - if (page != page0) { - unlock_page(page); - kunmap(page); - put_page(page); + kunmap_local(block); + if (folio != folio0) { + folio_unlock(folio); + folio_put(folio); } goto out_unmap; diff --git a/fs/afs/file.c b/fs/afs/file.c index eb11d047c0ae..cb6ad61eec3b 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -324,21 +324,24 @@ static int afs_symlink_readpage(struct file *file, struct page *page) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); struct afs_read *fsreq; + struct folio *folio = page_folio(page); int ret; fsreq = afs_alloc_read(GFP_NOFS); if (!fsreq) return -ENOMEM; - fsreq->pos = page->index * PAGE_SIZE; - fsreq->len = PAGE_SIZE; + fsreq->pos = folio_pos(folio); + fsreq->len = folio_size(folio); fsreq->vnode = vnode; fsreq->iter = &fsreq->def_iter; iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages, fsreq->pos, fsreq->len); ret = afs_fetch_data(fsreq->vnode, fsreq); - page_endio(page, false, ret); + if (ret == 0) + SetPageUptodate(page); + unlock_page(page); return ret; } @@ -362,7 +365,7 @@ static int afs_begin_cache_operation(struct netfs_read_request *rreq) } static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, - struct page *page, void **_fsdata) + struct folio *folio, void **_fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); @@ -385,7 +388,9 @@ const struct netfs_read_request_ops afs_req_ops = { static int afs_readpage(struct file *file, struct page *page) { - return netfs_readpage(file, page, &afs_req_ops, NULL); + struct folio *folio = page_folio(page); + + return netfs_readpage(file, folio, &afs_req_ops, NULL); } static void afs_readahead(struct readahead_control *ractl) @@ -397,29 +402,29 @@ static void afs_readahead(struct readahead_control *ractl) * Adjust the dirty region of the page on truncation or full invalidation, * getting rid of the markers altogether if the region is entirely invalidated. */ -static void afs_invalidate_dirty(struct page *page, unsigned int offset, +static void afs_invalidate_dirty(struct folio *folio, unsigned int offset, unsigned int length) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); unsigned long priv; unsigned int f, t, end = offset + length; - priv = page_private(page); + priv = (unsigned long)folio_get_private(folio); /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == thp_size(page)) + if (offset == 0 && length == folio_size(folio)) goto full_invalidate; /* If the page was dirtied by page_mkwrite(), the PTE stays writable * and we don't get another notification to tell us to expand it * again. */ - if (afs_is_page_dirty_mmapped(priv)) + if (afs_is_folio_dirty_mmapped(priv)) return; /* We may need to shorten the dirty region */ - f = afs_page_dirty_from(page, priv); - t = afs_page_dirty_to(page, priv); + f = afs_folio_dirty_from(folio, priv); + t = afs_folio_dirty_to(folio, priv); if (t <= offset || f >= end) return; /* Doesn't overlap */ @@ -437,17 +442,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset, if (f == t) goto undirty; - priv = afs_page_dirty(page, f, t); - set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page); + priv = afs_folio_dirty(folio, f, t); + folio_change_private(folio, (void *)priv); + trace_afs_folio_dirty(vnode, tracepoint_string("trunc"), folio); return; undirty: - trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page); - clear_page_dirty_for_io(page); + trace_afs_folio_dirty(vnode, tracepoint_string("undirty"), folio); + folio_clear_dirty_for_io(folio); full_invalidate: - trace_afs_page_dirty(vnode, tracepoint_string("inval"), page); - detach_page_private(page); + trace_afs_folio_dirty(vnode, tracepoint_string("inval"), folio); + folio_detach_private(folio); } /* @@ -458,14 +463,16 @@ full_invalidate: static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - _enter("{%lu},%u,%u", page->index, offset, length); + struct folio *folio = page_folio(page); + + _enter("{%lu},%u,%u", folio_index(folio), offset, length); BUG_ON(!PageLocked(page)); if (PagePrivate(page)) - afs_invalidate_dirty(page, offset, length); + afs_invalidate_dirty(folio, offset, length); - wait_on_page_fscache(page); + folio_wait_fscache(folio); _leave(""); } @@ -475,30 +482,31 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, */ static int afs_releasepage(struct page *page, gfp_t gfp_flags) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct folio *folio = page_folio(page); + struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); _enter("{{%llx:%llu}[%lu],%lx},%x", - vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, + vnode->fid.vid, vnode->fid.vnode, folio_index(folio), folio->flags, gfp_flags); /* deny if page is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page)) { + if (folio_test_fscache(folio)) { if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS)) return false; - wait_on_page_fscache(page); + folio_wait_fscache(folio); } #endif - if (PagePrivate(page)) { - trace_afs_page_dirty(vnode, tracepoint_string("rel"), page); - detach_page_private(page); + if (folio_test_private(folio)) { + trace_afs_folio_dirty(vnode, tracepoint_string("rel"), folio); + folio_detach_private(folio); } - /* indicate that the page can be released */ + /* Indicate that the folio can be released */ _leave(" = T"); - return 1; + return true; } static void afs_add_open_mmap(struct afs_vnode *vnode) diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 9357c53faa69..aa4c0d6c9780 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -876,59 +876,59 @@ struct afs_vnode_cache_aux { } __packed; /* - * We use page->private to hold the amount of the page that we've written to, + * We use folio->private to hold the amount of the folio that we've written to, * splitting the field into two parts. However, we need to represent a range - * 0...PAGE_SIZE, so we reduce the resolution if the size of the page + * 0...FOLIO_SIZE, so we reduce the resolution if the size of the folio * exceeds what we can encode. */ #ifdef CONFIG_64BIT -#define __AFS_PAGE_PRIV_MASK 0x7fffffffUL -#define __AFS_PAGE_PRIV_SHIFT 32 -#define __AFS_PAGE_PRIV_MMAPPED 0x80000000UL +#define __AFS_FOLIO_PRIV_MASK 0x7fffffffUL +#define __AFS_FOLIO_PRIV_SHIFT 32 +#define __AFS_FOLIO_PRIV_MMAPPED 0x80000000UL #else -#define __AFS_PAGE_PRIV_MASK 0x7fffUL -#define __AFS_PAGE_PRIV_SHIFT 16 -#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL +#define __AFS_FOLIO_PRIV_MASK 0x7fffUL +#define __AFS_FOLIO_PRIV_SHIFT 16 +#define __AFS_FOLIO_PRIV_MMAPPED 0x8000UL #endif -static inline unsigned int afs_page_dirty_resolution(struct page *page) +static inline unsigned int afs_folio_dirty_resolution(struct folio *folio) { - int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); + int shift = folio_shift(folio) - (__AFS_FOLIO_PRIV_SHIFT - 1); return (shift > 0) ? shift : 0; } -static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv) +static inline size_t afs_folio_dirty_from(struct folio *folio, unsigned long priv) { - unsigned long x = priv & __AFS_PAGE_PRIV_MASK; + unsigned long x = priv & __AFS_FOLIO_PRIV_MASK; /* The lower bound is inclusive */ - return x << afs_page_dirty_resolution(page); + return x << afs_folio_dirty_resolution(folio); } -static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv) +static inline size_t afs_folio_dirty_to(struct folio *folio, unsigned long priv) { - unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; + unsigned long x = (priv >> __AFS_FOLIO_PRIV_SHIFT) & __AFS_FOLIO_PRIV_MASK; /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_page_dirty_resolution(page); + return (x + 1) << afs_folio_dirty_resolution(folio); } -static inline unsigned long afs_page_dirty(struct page *page, size_t from, size_t to) +static inline unsigned long afs_folio_dirty(struct folio *folio, size_t from, size_t to) { - unsigned int res = afs_page_dirty_resolution(page); + unsigned int res = afs_folio_dirty_resolution(folio); from >>= res; to = (to - 1) >> res; - return (to << __AFS_PAGE_PRIV_SHIFT) | from; + return (to << __AFS_FOLIO_PRIV_SHIFT) | from; } -static inline unsigned long afs_page_dirty_mmapped(unsigned long priv) +static inline unsigned long afs_folio_dirty_mmapped(unsigned long priv) { - return priv | __AFS_PAGE_PRIV_MMAPPED; + return priv | __AFS_FOLIO_PRIV_MMAPPED; } -static inline bool afs_is_page_dirty_mmapped(unsigned long priv) +static inline bool afs_is_folio_dirty_mmapped(unsigned long priv) { - return priv & __AFS_PAGE_PRIV_MMAPPED; + return priv & __AFS_FOLIO_PRIV_MMAPPED; } #include diff --git a/fs/afs/write.c b/fs/afs/write.c index 8b1d9c2f6bec..ca4909baf5e6 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -32,7 +32,7 @@ int afs_write_begin(struct file *file, struct address_space *mapping, struct page **_page, void **fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct page *page; + struct folio *folio; unsigned long priv; unsigned f, from; unsigned t, to; @@ -46,12 +46,12 @@ int afs_write_begin(struct file *file, struct address_space *mapping, * file. We need to do this before we get a lock on the page in case * there's more than one writer competing for the same cache block. */ - ret = netfs_write_begin(file, mapping, pos, len, flags, &page, fsdata, + ret = netfs_write_begin(file, mapping, pos, len, flags, &folio, fsdata, &afs_req_ops, NULL); if (ret < 0) return ret; - index = page->index; + index = folio_index(folio); from = pos - index * PAGE_SIZE; to = from + len; @@ -59,14 +59,14 @@ try_again: /* See if this page is already partially written in a way that we can * merge the new write with. */ - if (PagePrivate(page)) { - priv = page_private(page); - f = afs_page_dirty_from(page, priv); - t = afs_page_dirty_to(page, priv); + if (folio_test_private(folio)) { + priv = (unsigned long)folio_get_private(folio); + f = afs_folio_dirty_from(folio, priv); + t = afs_folio_dirty_to(folio, priv); ASSERTCMP(f, <=, t); - if (PageWriteback(page)) { - trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), page); + if (folio_test_writeback(folio)) { + trace_afs_folio_dirty(vnode, tracepoint_string("alrdy"), folio); goto flush_conflicting_write; } /* If the file is being filled locally, allow inter-write @@ -78,7 +78,7 @@ try_again: goto flush_conflicting_write; } - *_page = page; + *_page = &folio->page; _leave(" = 0"); return 0; @@ -87,17 +87,17 @@ try_again: */ flush_conflicting_write: _debug("flush conflict"); - ret = write_one_page(page); + ret = folio_write_one(folio); if (ret < 0) goto error; - ret = lock_page_killable(page); + ret = folio_lock_killable(folio); if (ret < 0) goto error; goto try_again; error: - put_page(page); + folio_put(folio); _leave(" = %d", ret); return ret; } @@ -107,24 +107,25 @@ error: */ int afs_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct page *subpage, void *fsdata) { + struct folio *folio = page_folio(subpage); struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); unsigned long priv; - unsigned int f, from = pos & (thp_size(page) - 1); + unsigned int f, from = offset_in_folio(folio, pos); unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; _enter("{%llx:%llu},{%lx}", - vnode->fid.vid, vnode->fid.vnode, page->index); + vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { if (copied < len) { copied = 0; goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } if (copied == 0) @@ -141,29 +142,29 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_sequnlock(&vnode->cb_lock); } - if (PagePrivate(page)) { - priv = page_private(page); - f = afs_page_dirty_from(page, priv); - t = afs_page_dirty_to(page, priv); + if (folio_test_private(folio)) { + priv = (unsigned long)folio_get_private(folio); + f = afs_folio_dirty_from(folio, priv); + t = afs_folio_dirty_to(folio, priv); if (from < f) f = from; if (to > t) t = to; - priv = afs_page_dirty(page, f, t); - set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page); + priv = afs_folio_dirty(folio, f, t); + folio_change_private(folio, (void *)priv); + trace_afs_folio_dirty(vnode, tracepoint_string("dirty+"), folio); } else { - priv = afs_page_dirty(page, from, to); - attach_page_private(page, (void *)priv); - trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page); + priv = afs_folio_dirty(folio, from, to); + folio_attach_private(folio, (void *)priv); + trace_afs_folio_dirty(vnode, tracepoint_string("dirty"), folio); } - if (set_page_dirty(page)) - _debug("dirtied %lx", page->index); + if (folio_mark_dirty(folio)) + _debug("dirtied %lx", folio_index(folio)); out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); return copied; } @@ -174,40 +175,32 @@ static void afs_kill_pages(struct address_space *mapping, loff_t start, loff_t len) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct pagevec pv; - unsigned int loop, psize; + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; _enter("{%llx:%llu},%llx @%llx", vnode->fid.vid, vnode->fid.vnode, len, start); - pagevec_init(&pv); - do { - _debug("kill %llx @%llx", len, start); + _debug("kill %lx (to %lx)", index, last); - pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE, - PAGEVEC_SIZE, pv.pages); - if (pv.nr == 0) - break; - - for (loop = 0; loop < pv.nr; loop++) { - struct page *page = pv.pages[loop]; - - if (page->index * PAGE_SIZE >= start + len) - break; - - psize = thp_size(page); - start += psize; - len -= psize; - ClearPageUptodate(page); - end_page_writeback(page); - lock_page(page); - generic_error_remove_page(mapping, page); - unlock_page(page); + folio = filemap_get_folio(mapping, index); + if (!folio) { + next = index + 1; + continue; } - __pagevec_release(&pv); - } while (len > 0); + next = folio_next_index(folio); + + folio_clear_uptodate(folio); + folio_end_writeback(folio); + folio_lock(folio); + generic_error_remove_page(mapping, &folio->page); + folio_unlock(folio); + folio_put(folio); + + } while (index = next, index <= last); _leave(""); } @@ -220,37 +213,27 @@ static void afs_redirty_pages(struct writeback_control *wbc, loff_t start, loff_t len) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct pagevec pv; - unsigned int loop, psize; + struct folio *folio; + pgoff_t index = start / PAGE_SIZE; + pgoff_t last = (start + len - 1) / PAGE_SIZE, next; _enter("{%llx:%llu},%llx @%llx", vnode->fid.vid, vnode->fid.vnode, len, start); - pagevec_init(&pv); - do { _debug("redirty %llx @%llx", len, start); - pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE, - PAGEVEC_SIZE, pv.pages); - if (pv.nr == 0) - break; - - for (loop = 0; loop < pv.nr; loop++) { - struct page *page = pv.pages[loop]; - - if (page->index * PAGE_SIZE >= start + len) - break; - - psize = thp_size(page); - start += psize; - len -= psize; - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + folio = filemap_get_folio(mapping, index); + if (!folio) { + next = index + 1; + continue; } - __pagevec_release(&pv); - } while (len > 0); + next = index + folio_nr_pages(folio); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); + folio_put(folio); + } while (index = next, index <= last); _leave(""); } @@ -261,7 +244,7 @@ static void afs_redirty_pages(struct writeback_control *wbc, static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { struct address_space *mapping = vnode->vfs_inode.i_mapping; - struct page *page; + struct folio *folio; pgoff_t end; XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); @@ -272,15 +255,16 @@ static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsign rcu_read_lock(); end = (start + len - 1) / PAGE_SIZE; - xas_for_each(&xas, page, end) { - if (!PageWriteback(page)) { - kdebug("bad %x @%llx page %lx %lx", len, start, page->index, end); - ASSERT(PageWriteback(page)); + xas_for_each(&xas, folio, end) { + if (!folio_test_writeback(folio)) { + kdebug("bad %x @%llx page %lx %lx", + len, start, folio_index(folio), end); + ASSERT(folio_test_writeback(folio)); } - trace_afs_page_dirty(vnode, tracepoint_string("clear"), page); - detach_page_private(page); - page_endio(page, true, 0); + trace_afs_folio_dirty(vnode, tracepoint_string("clear"), folio); + folio_detach_private(folio); + folio_end_writeback(folio); } rcu_read_unlock(); @@ -437,7 +421,7 @@ static void afs_extend_writeback(struct address_space *mapping, unsigned int *_len) { struct pagevec pvec; - struct page *page; + struct folio *folio; unsigned long priv; unsigned int psize, filler = 0; unsigned int f, t; @@ -456,43 +440,43 @@ static void afs_extend_writeback(struct address_space *mapping, */ rcu_read_lock(); - xas_for_each(&xas, page, ULONG_MAX) { + xas_for_each(&xas, folio, ULONG_MAX) { stop = true; - if (xas_retry(&xas, page)) + if (xas_retry(&xas, folio)) continue; - if (xa_is_value(page)) + if (xa_is_value(folio)) break; - if (page->index != index) + if (folio_index(folio) != index) break; - if (!page_cache_get_speculative(page)) { + if (!folio_try_get_rcu(folio)) { xas_reset(&xas); continue; } /* Has the page moved or been split? */ - if (unlikely(page != xas_reload(&xas))) { - put_page(page); + if (unlikely(folio != xas_reload(&xas))) { + folio_put(folio); break; } - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); break; } - if (!PageDirty(page) || PageWriteback(page)) { - unlock_page(page); - put_page(page); + if (!folio_test_dirty(folio) || folio_test_writeback(folio)) { + folio_unlock(folio); + folio_put(folio); break; } - psize = thp_size(page); - priv = page_private(page); - f = afs_page_dirty_from(page, priv); - t = afs_page_dirty_to(page, priv); + psize = folio_size(folio); + priv = (unsigned long)folio_get_private(folio); + f = afs_folio_dirty_from(folio, priv); + t = afs_folio_dirty_to(folio, priv); if (f != 0 && !new_content) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); break; } @@ -503,8 +487,8 @@ static void afs_extend_writeback(struct address_space *mapping, else if (t == psize || new_content) stop = false; - index += thp_nr_pages(page); - if (!pagevec_add(&pvec, page)) + index += folio_nr_pages(folio); + if (!pagevec_add(&pvec, &folio->page)) break; if (stop) break; @@ -521,16 +505,16 @@ static void afs_extend_writeback(struct address_space *mapping, break; for (i = 0; i < pagevec_count(&pvec); i++) { - page = pvec.pages[i]; - trace_afs_page_dirty(vnode, tracepoint_string("store+"), page); + folio = page_folio(pvec.pages[i]); + trace_afs_folio_dirty(vnode, tracepoint_string("store+"), folio); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) BUG(); - if (test_set_page_writeback(page)) + if (folio_start_writeback(folio)) BUG(); - *_count -= thp_nr_pages(page); - unlock_page(page); + *_count -= folio_nr_pages(folio); + folio_unlock(folio); } pagevec_release(&pvec); @@ -544,10 +528,10 @@ static void afs_extend_writeback(struct address_space *mapping, * Synchronously write back the locked page and any subsequent non-locked dirty * pages. */ -static ssize_t afs_write_back_from_locked_page(struct address_space *mapping, - struct writeback_control *wbc, - struct page *page, - loff_t start, loff_t end) +static ssize_t afs_write_back_from_locked_folio(struct address_space *mapping, + struct writeback_control *wbc, + struct folio *folio, + loff_t start, loff_t end) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct iov_iter iter; @@ -558,22 +542,22 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping, long count = wbc->nr_to_write; int ret; - _enter(",%lx,%llx-%llx", page->index, start, end); + _enter(",%lx,%llx-%llx", folio_index(folio), start, end); - if (test_set_page_writeback(page)) + if (folio_start_writeback(folio)) BUG(); - count -= thp_nr_pages(page); + count -= folio_nr_pages(folio); /* Find all consecutive lockable dirty pages that have contiguous * written regions, stopping when we find a page that is not * immediately lockable, is not dirty or is missing, or we reach the * end of the range. */ - priv = page_private(page); - offset = afs_page_dirty_from(page, priv); - to = afs_page_dirty_to(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("store"), page); + priv = (unsigned long)folio_get_private(folio); + offset = afs_folio_dirty_from(folio, priv); + to = afs_folio_dirty_to(folio, priv); + trace_afs_folio_dirty(vnode, tracepoint_string("store"), folio); len = to - offset; start += offset; @@ -586,7 +570,7 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping, max_len = min_t(unsigned long long, max_len, i_size - start); if (len < max_len && - (to == thp_size(page) || new_content)) + (to == folio_size(folio) || new_content)) afs_extend_writeback(mapping, vnode, &count, start, max_len, new_content, &len); len = min_t(loff_t, len, max_len); @@ -596,7 +580,7 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping, * set; the first page is still locked at this point, but all the rest * have been unlocked. */ - unlock_page(page); + folio_unlock(folio); if (start < i_size) { _debug("write back %x @%llx [%llx]", len, start, i_size); @@ -657,16 +641,17 @@ static ssize_t afs_write_back_from_locked_page(struct address_space *mapping, * write a page back to the server * - the caller locked the page for us */ -int afs_writepage(struct page *page, struct writeback_control *wbc) +int afs_writepage(struct page *subpage, struct writeback_control *wbc) { + struct folio *folio = page_folio(subpage); ssize_t ret; loff_t start; - _enter("{%lx},", page->index); + _enter("{%lx},", folio_index(folio)); - start = page->index * PAGE_SIZE; - ret = afs_write_back_from_locked_page(page->mapping, wbc, page, - start, LLONG_MAX - start); + start = folio_index(folio) * PAGE_SIZE; + ret = afs_write_back_from_locked_folio(folio_mapping(folio), wbc, + folio, start, LLONG_MAX - start); if (ret < 0) { _leave(" = %zd", ret); return ret; @@ -683,7 +668,8 @@ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, loff_t start, loff_t end, loff_t *_next) { - struct page *page; + struct folio *folio; + struct page *head_page; ssize_t ret; int n; @@ -693,13 +679,14 @@ static int afs_writepages_region(struct address_space *mapping, pgoff_t index = start / PAGE_SIZE; n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, - PAGECACHE_TAG_DIRTY, 1, &page); + PAGECACHE_TAG_DIRTY, 1, &head_page); if (!n) break; - start = (loff_t)page->index * PAGE_SIZE; /* May regress with THPs */ + folio = page_folio(head_page); + start = folio_pos(folio); /* May regress with THPs */ - _debug("wback %lx", page->index); + _debug("wback %lx", folio_index(folio)); /* At this point we hold neither the i_pages lock nor the * page lock: the page may be truncated or invalidated @@ -707,37 +694,38 @@ static int afs_writepages_region(struct address_space *mapping, * back from swapper_space to tmpfs file mapping */ if (wbc->sync_mode != WB_SYNC_NONE) { - ret = lock_page_killable(page); + ret = folio_lock_killable(folio); if (ret < 0) { - put_page(page); + folio_put(folio); return ret; } } else { - if (!trylock_page(page)) { - put_page(page); + if (!folio_trylock(folio)) { + folio_put(folio); return 0; } } - if (page->mapping != mapping || !PageDirty(page)) { - start += thp_size(page); - unlock_page(page); - put_page(page); + if (folio_mapping(folio) != mapping || + !folio_test_dirty(folio)) { + start += folio_size(folio); + folio_unlock(folio); + folio_put(folio); continue; } - if (PageWriteback(page)) { - unlock_page(page); + if (folio_test_writeback(folio)) { + folio_unlock(folio); if (wbc->sync_mode != WB_SYNC_NONE) - wait_on_page_writeback(page); - put_page(page); + folio_wait_writeback(folio); + folio_put(folio); continue; } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) BUG(); - ret = afs_write_back_from_locked_page(mapping, wbc, page, start, end); - put_page(page); + ret = afs_write_back_from_locked_folio(mapping, wbc, folio, start, end); + folio_put(folio); if (ret < 0) { _leave(" = %zd", ret); return ret; @@ -862,7 +850,6 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { struct folio *folio = page_folio(vmf->page); - struct page *page = &folio->page; struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); @@ -870,7 +857,7 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) unsigned long priv; vm_fault_t ret = VM_FAULT_RETRY; - _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); + _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, folio_index(folio)); afs_validate(vnode, af->key); @@ -880,18 +867,18 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) * be modified. We then assume the entire page will need writing back. */ #ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page) && - wait_on_page_fscache_killable(page) < 0) + if (folio_test_fscache(folio) && + folio_wait_fscache_killable(folio) < 0) goto out; #endif if (folio_wait_writeback_killable(folio)) goto out; - if (lock_page_killable(page) < 0) + if (folio_lock_killable(folio) < 0) goto out; - /* We mustn't change page->private until writeback is complete as that + /* We mustn't change folio->private until writeback is complete as that * details the portion of the page we need to write back and we might * need to redirty the page if there's a problem. */ @@ -900,14 +887,14 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) goto out; } - priv = afs_page_dirty(page, 0, thp_size(page)); - priv = afs_page_dirty_mmapped(priv); - if (PagePrivate(page)) { - set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("mkwrite+"), page); + priv = afs_folio_dirty(folio, 0, folio_size(folio)); + priv = afs_folio_dirty_mmapped(priv); + if (folio_test_private(folio)) { + folio_change_private(folio, (void *)priv); + trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite+"), folio); } else { - attach_page_private(page, (void *)priv); - trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), page); + folio_attach_private(folio, (void *)priv); + trace_afs_folio_dirty(vnode, tracepoint_string("mkwrite"), folio); } file_update_time(file); @@ -948,38 +935,38 @@ void afs_prune_wb_keys(struct afs_vnode *vnode) /* * Clean up a page during invalidation. */ -int afs_launder_page(struct page *page) +int afs_launder_page(struct page *subpage) { - struct address_space *mapping = page->mapping; - struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct folio *folio = page_folio(subpage); + struct afs_vnode *vnode = AFS_FS_I(folio_inode(folio)); struct iov_iter iter; struct bio_vec bv[1]; unsigned long priv; unsigned int f, t; int ret = 0; - _enter("{%lx}", page->index); + _enter("{%lx}", folio_index(folio)); - priv = page_private(page); - if (clear_page_dirty_for_io(page)) { + priv = (unsigned long)folio_get_private(folio); + if (folio_clear_dirty_for_io(folio)) { f = 0; - t = thp_size(page); - if (PagePrivate(page)) { - f = afs_page_dirty_from(page, priv); - t = afs_page_dirty_to(page, priv); + t = folio_size(folio); + if (folio_test_private(folio)) { + f = afs_folio_dirty_from(folio, priv); + t = afs_folio_dirty_to(folio, priv); } - bv[0].bv_page = page; + bv[0].bv_page = &folio->page; bv[0].bv_offset = f; bv[0].bv_len = t - f; iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); - trace_afs_page_dirty(vnode, tracepoint_string("launder"), page); - ret = afs_store_data(vnode, &iter, page_offset(page) + f, true); + trace_afs_folio_dirty(vnode, tracepoint_string("launder"), folio); + ret = afs_store_data(vnode, &iter, folio_pos(folio) + f, true); } - trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page); - detach_page_private(page); - wait_on_page_fscache(page); + trace_afs_folio_dirty(vnode, tracepoint_string("laundered"), folio); + folio_detach_private(folio); + folio_wait_fscache(folio); return ret; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 99b80b5c7a93..e53c8541f5b2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -63,7 +63,7 @@ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct page *page, void **_fsdata); + struct folio *folio, void **_fsdata); static inline struct ceph_snap_context *page_snap_context(struct page *page) { @@ -317,13 +317,14 @@ static const struct netfs_read_request_ops ceph_netfs_read_ops = { }; /* read a single page, without unlocking it. */ -static int ceph_readpage(struct file *file, struct page *page) +static int ceph_readpage(struct file *file, struct page *subpage) { + struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_vino vino = ceph_vino(inode); - u64 off = page_offset(page); - u64 len = thp_size(page); + size_t len = folio_size(folio); + u64 off = folio_file_pos(folio); if (ci->i_inline_version != CEPH_INLINE_NONE) { /* @@ -331,19 +332,19 @@ static int ceph_readpage(struct file *file, struct page *page) * into page cache while getting Fcr caps. */ if (off == 0) { - unlock_page(page); + folio_unlock(folio); return -EINVAL; } - zero_user_segment(page, 0, thp_size(page)); - SetPageUptodate(page); - unlock_page(page); + zero_user_segment(&folio->page, 0, folio_size(folio)); + folio_mark_uptodate(folio); + folio_unlock(folio); return 0; } - dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", - vino.ino, vino.snap, file, off, len, page, page->index); + dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n", + vino.ino, vino.snap, file, off, len, folio, folio_index(folio)); - return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL); + return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL); } static void ceph_readahead(struct readahead_control *ractl) @@ -724,7 +725,7 @@ static int ceph_writepages_start(struct address_space *mapping, wbc->sync_mode == WB_SYNC_NONE ? "NONE" : (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); - if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { + if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { pr_warn_ratelimited( "writepage_start %p %lld forced umount\n", @@ -1145,12 +1146,12 @@ static struct ceph_snap_context * ceph_find_incompatible(struct page *page) { struct inode *inode = page->mapping->host; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); - if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { - dout(" page %p forced umount\n", page); - return ERR_PTR(-EIO); + if (ceph_inode_is_shutdown(inode)) { + dout(" page %p %llx:%llx is shutdown\n", page, + ceph_vinop(inode)); + return ERR_PTR(-ESTALE); } for (;;) { @@ -1187,18 +1188,18 @@ ceph_find_incompatible(struct page *page) } static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, - struct page *page, void **_fsdata) + struct folio *folio, void **_fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(page); + snapc = ceph_find_incompatible(folio_page(folio, 0)); if (snapc) { int r; - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (IS_ERR(snapc)) return PTR_ERR(snapc); @@ -1216,12 +1217,12 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned * clean, or already dirty within the same snap context. */ static int ceph_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, unsigned flags, + loff_t pos, unsigned len, unsigned aop_flags, struct page **pagep, void **fsdata) { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct page *page = NULL; + struct folio *folio = NULL; pgoff_t index = pos >> PAGE_SHIFT; int r; @@ -1230,39 +1231,43 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, * for inline_version sent to the MDS. */ if (ci->i_inline_version != CEPH_INLINE_NONE) { - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) + unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + if (aop_flags & AOP_FLAG_NOFS) + fgp_flags |= FGP_NOFS; + folio = __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); + if (!folio) return -ENOMEM; /* * The inline_version on a new inode is set to 1. If that's the - * case, then the page is brand new and isn't yet Uptodate. + * case, then the folio is brand new and isn't yet Uptodate. */ r = 0; if (index == 0 && ci->i_inline_version != 1) { - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", ci->i_inline_version); r = -EINVAL; } goto out; } - zero_user_segment(page, 0, thp_size(page)); - SetPageUptodate(page); + zero_user_segment(&folio->page, 0, folio_size(folio)); + folio_mark_uptodate(folio); goto out; } - r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL, + r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL, &ceph_netfs_read_ops, NULL); out: if (r == 0) - wait_on_page_fscache(page); + folio_wait_fscache(folio); if (r < 0) { - if (page) - put_page(page); + if (folio) + folio_put(folio); } else { - WARN_ON_ONCE(!PageLocked(page)); - *pagep = page; + WARN_ON_ONCE(!folio_test_locked(folio)); + *pagep = &folio->page; } return r; } @@ -1273,32 +1278,33 @@ out: */ static int ceph_write_end(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned copied, - struct page *page, void *fsdata) + struct page *subpage, void *fsdata) { + struct folio *folio = page_folio(subpage); struct inode *inode = file_inode(file); bool check_cap = false; - dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, - inode, page, (int)pos, (int)copied, (int)len); + dout("write_end file %p inode %p folio %p %d~%d (%d)\n", file, + inode, folio, (int)pos, (int)copied, (int)len); - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { /* just return that nothing was copied on a short copy */ if (copied < len) { copied = 0; goto out; } - SetPageUptodate(page); + folio_mark_uptodate(folio); } /* did file size increase? */ if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); - set_page_dirty(page); + folio_mark_dirty(folio); out: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (check_cap) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); @@ -1306,17 +1312,6 @@ out: return copied; } -/* - * we set .direct_IO to indicate direct io is supported, but since we - * intercept O_DIRECT reads and writes early, this function should - * never get called. - */ -static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter) -{ - WARN_ON(1); - return -EINVAL; -} - const struct address_space_operations ceph_aops = { .readpage = ceph_readpage, .readahead = ceph_readahead, @@ -1327,7 +1322,7 @@ const struct address_space_operations ceph_aops = { .set_page_dirty = ceph_set_page_dirty, .invalidatepage = ceph_invalidatepage, .releasepage = ceph_releasepage, - .direct_IO = ceph_direct_io, + .direct_IO = noop_direct_IO, }; static void ceph_block_sigs(sigset_t *oldset) @@ -1356,6 +1351,9 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS; + if (ceph_inode_is_shutdown(inode)) + return ret; + ceph_block_sigs(&oldset); dout("filemap_fault %p %llx.%llx %llu trying to get caps\n", @@ -1447,6 +1445,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sigset_t oldset; vm_fault_t ret = VM_FAULT_SIGBUS; + if (ceph_inode_is_shutdown(inode)) + return ret; + prealloc_cf = ceph_alloc_cap_flush(); if (!prealloc_cf) return VM_FAULT_OOM; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 9cfadbb86568..457afda5498a 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -12,12 +12,6 @@ #include "super.h" #include "cache.h" -struct ceph_aux_inode { - u64 version; - u64 mtime_sec; - u64 mtime_nsec; -}; - struct fscache_netfs ceph_cache_netfs = { .name = "ceph", .version = 0, @@ -109,20 +103,14 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( void *cookie_netfs_data, const void *data, uint16_t dlen, loff_t object_size) { - struct ceph_aux_inode aux; struct ceph_inode_info* ci = cookie_netfs_data; struct inode* inode = &ci->vfs_inode; - if (dlen != sizeof(aux) || + if (dlen != sizeof(ci->i_version) || i_size_read(inode) != object_size) return FSCACHE_CHECKAUX_OBSOLETE; - memset(&aux, 0, sizeof(aux)); - aux.version = ci->i_version; - aux.mtime_sec = inode->i_mtime.tv_sec; - aux.mtime_nsec = inode->i_mtime.tv_nsec; - - if (memcmp(data, &aux, sizeof(aux)) != 0) + if (*(u64 *)data != ci->i_version) return FSCACHE_CHECKAUX_OBSOLETE; dout("ceph inode 0x%p cached okay\n", ci); @@ -139,7 +127,6 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_aux_inode aux; /* No caching for filesystem */ if (!fsc->fscache) @@ -151,14 +138,10 @@ void ceph_fscache_register_inode_cookie(struct inode *inode) inode_lock_nested(inode, I_MUTEX_CHILD); if (!ci->fscache) { - memset(&aux, 0, sizeof(aux)); - aux.version = ci->i_version; - aux.mtime_sec = inode->i_mtime.tv_sec; - aux.mtime_nsec = inode->i_mtime.tv_nsec; ci->fscache = fscache_acquire_cookie(fsc->fscache, &ceph_fscache_inode_object_def, &ci->i_vino, sizeof(ci->i_vino), - &aux, sizeof(aux), + &ci->i_version, sizeof(ci->i_version), ci, i_size_read(inode), false); } inode_unlock(inode); diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 8f537f1d9d1d..b9460b6fb76f 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1188,11 +1188,11 @@ void ceph_remove_cap(struct ceph_cap *cap, bool queue_release) lockdep_assert_held(&ci->i_ceph_lock); - fsc = ceph_sb_to_client(ci->vfs_inode.i_sb); + fsc = ceph_inode_to_client(&ci->vfs_inode); WARN_ON_ONCE(ci->i_auth_cap == cap && !list_empty(&ci->i_dirty_item) && !fsc->blocklisted && - READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN); + !ceph_inode_is_shutdown(&ci->vfs_inode)); __ceph_remove_cap(cap, queue_release); } @@ -1968,8 +1968,8 @@ retry: } } - dout("check_caps %p file_want %s used %s dirty %s flushing %s" - " issued %s revoking %s retain %s %s%s\n", inode, + dout("check_caps %llx.%llx file_want %s used %s dirty %s flushing %s" + " issued %s revoking %s retain %s %s%s\n", ceph_vinop(inode), ceph_cap_string(file_wanted), ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps), ceph_cap_string(ci->i_flushing_caps), @@ -1990,7 +1990,8 @@ retry: (revoking & (CEPH_CAP_FILE_CACHE| CEPH_CAP_FILE_LAZYIO)) && /* or revoking cache */ !tried_invalidate) { - dout("check_caps trying to invalidate on %p\n", inode); + dout("check_caps trying to invalidate on %llx.%llx\n", + ceph_vinop(inode)); if (try_nonblocking_invalidate(inode) < 0) { dout("check_caps queuing invalidate\n"); queue_invalidate = true; @@ -2629,9 +2630,9 @@ void ceph_take_cap_refs(struct ceph_inode_info *ci, int got, * * Returns 0 if caps were not able to be acquired (yet), 1 if succeed, * or a negative error code. There are 3 speical error codes: - * -EAGAIN: need to sleep but non-blocking is specified - * -EFBIG: ask caller to call check_max_size() and try again. - * -ESTALE: ask caller to call ceph_renew_caps() and try again. + * -EAGAIN: need to sleep but non-blocking is specified + * -EFBIG: ask caller to call check_max_size() and try again. + * -EUCLEAN: ask caller to call ceph_renew_caps() and try again. */ enum { /* first 8 bits are reserved for CEPH_FILE_MODE_FOO */ @@ -2679,7 +2680,7 @@ again: dout("get_cap_refs %p endoff %llu > maxsize %llu\n", inode, endoff, ci->i_max_size); if (endoff > ci->i_requested_max_size) - ret = ci->i_auth_cap ? -EFBIG : -ESTALE; + ret = ci->i_auth_cap ? -EFBIG : -EUCLEAN; goto out_unlock; } /* @@ -2749,9 +2750,9 @@ again: goto out_unlock; } - if (READ_ONCE(mdsc->fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { - dout("get_cap_refs %p forced umount\n", inode); - ret = -EIO; + if (ceph_inode_is_shutdown(inode)) { + dout("get_cap_refs %p inode is shutdown\n", inode); + ret = -ESTALE; goto out_unlock; } mds_wanted = __ceph_caps_mds_wanted(ci, false); @@ -2759,7 +2760,7 @@ again: dout("get_cap_refs %p need %s > mds_wanted %s\n", inode, ceph_cap_string(need), ceph_cap_string(mds_wanted)); - ret = -ESTALE; + ret = -EUCLEAN; goto out_unlock; } @@ -2843,7 +2844,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, ret = try_get_cap_refs(inode, need, want, 0, flags, got); /* three special error codes */ - if (ret == -EAGAIN || ret == -EFBIG || ret == -ESTALE) + if (ret == -EAGAIN || ret == -EFBIG || ret == -EUCLEAN) ret = 0; return ret; } @@ -2926,7 +2927,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got } if (ret < 0) { - if (ret == -EFBIG || ret == -ESTALE) { + if (ret == -EFBIG || ret == -EUCLEAN) { int ret2 = ceph_wait_on_async_create(inode); if (ret2 < 0) return ret2; @@ -2935,7 +2936,7 @@ int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got check_max_size(inode, endoff); continue; } - if (ret == -ESTALE) { + if (ret == -EUCLEAN) { /* session was killed, try renew caps */ ret = ceph_renew_caps(inode, flags); if (ret == 0) @@ -4315,7 +4316,7 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s) i_dirty_item); inode = &ci->vfs_inode; ihold(inode); - dout("flush_dirty_caps %p\n", inode); + dout("flush_dirty_caps %llx.%llx\n", ceph_vinop(inode)); spin_unlock(&mdsc->cap_dirty_lock); ceph_check_caps(ci, CHECK_CAPS_FLUSH, NULL); iput(inode); @@ -4560,3 +4561,119 @@ int ceph_encode_dentry_release(void **p, struct dentry *dentry, spin_unlock(&dentry->d_lock); return ret; } + +static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_cap_snap *capsnap; + int capsnap_release = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); + + while (!list_empty(&ci->i_cap_snaps)) { + capsnap = list_first_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, ci_item); + __ceph_remove_capsnap(inode, capsnap, NULL, NULL); + ceph_put_snap_context(capsnap->context); + ceph_put_cap_snap(capsnap); + capsnap_release++; + } + wake_up_all(&ci->i_cap_wq); + wake_up_all(&mdsc->cap_flushing_wq); + return capsnap_release; +} + +int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate) +{ + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_inode_info *ci = ceph_inode(inode); + bool is_auth; + bool dirty_dropped = false; + int iputs = 0; + + lockdep_assert_held(&ci->i_ceph_lock); + + dout("removing cap %p, ci is %p, inode is %p\n", + cap, ci, &ci->vfs_inode); + + is_auth = (cap == ci->i_auth_cap); + __ceph_remove_cap(cap, false); + if (is_auth) { + struct ceph_cap_flush *cf; + + if (ceph_inode_is_shutdown(inode)) { + if (inode->i_data.nrpages > 0) + *invalidate = true; + if (ci->i_wrbuffer_ref > 0) + mapping_set_error(&inode->i_data, -EIO); + } + + spin_lock(&mdsc->cap_dirty_lock); + + /* trash all of the cap flushes for this inode */ + while (!list_empty(&ci->i_cap_flush_list)) { + cf = list_first_entry(&ci->i_cap_flush_list, + struct ceph_cap_flush, i_list); + list_del_init(&cf->g_list); + list_del_init(&cf->i_list); + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + } + + if (!list_empty(&ci->i_dirty_item)) { + pr_warn_ratelimited( + " dropping dirty %s state for %p %lld\n", + ceph_cap_string(ci->i_dirty_caps), + inode, ceph_ino(inode)); + ci->i_dirty_caps = 0; + list_del_init(&ci->i_dirty_item); + dirty_dropped = true; + } + if (!list_empty(&ci->i_flushing_item)) { + pr_warn_ratelimited( + " dropping dirty+flushing %s state for %p %lld\n", + ceph_cap_string(ci->i_flushing_caps), + inode, ceph_ino(inode)); + ci->i_flushing_caps = 0; + list_del_init(&ci->i_flushing_item); + mdsc->num_cap_flushing--; + dirty_dropped = true; + } + spin_unlock(&mdsc->cap_dirty_lock); + + if (dirty_dropped) { + mapping_set_error(inode->i_mapping, -EIO); + + if (ci->i_wrbuffer_ref_head == 0 && + ci->i_wr_ref == 0 && + ci->i_dirty_caps == 0 && + ci->i_flushing_caps == 0) { + ceph_put_snap_context(ci->i_head_snapc); + ci->i_head_snapc = NULL; + } + } + + if (atomic_read(&ci->i_filelock_ref) > 0) { + /* make further file lock syscall return -EIO */ + ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; + pr_warn_ratelimited(" dropping file locks for %p %lld\n", + inode, ceph_ino(inode)); + } + + if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { + cf = ci->i_prealloc_cap_flush; + ci->i_prealloc_cap_flush = NULL; + if (!cf->is_capsnap) + ceph_free_cap_flush(cf); + } + + if (!list_empty(&ci->i_cap_snaps)) + iputs = remove_capsnaps(mdsc, inode); + } + if (dirty_dropped) + ++iputs; + return iputs; +} diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 38b78b45811f..3cf7c9c1085b 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -146,82 +146,93 @@ static int mdsc_show(struct seq_file *s, void *p) name, total, avg, _min, max, sum); \ } -static int metric_show(struct seq_file *s, void *p) +static int metrics_file_show(struct seq_file *s, void *p) { struct ceph_fs_client *fsc = s->private; - struct ceph_mds_client *mdsc = fsc->mdsc; - struct ceph_client_metric *m = &mdsc->metric; - int nr_caps = 0; - s64 total, sum, avg, min, max, sq; - u64 sum_sz, avg_sz, min_sz, max_sz; + struct ceph_client_metric *m = &fsc->mdsc->metric; - sum = percpu_counter_sum(&m->total_inodes); seq_printf(s, "item total\n"); seq_printf(s, "------------------------------------------\n"); - seq_printf(s, "%-35s%lld / %lld\n", "opened files / total inodes", - atomic64_read(&m->opened_files), sum); - seq_printf(s, "%-35s%lld / %lld\n", "pinned i_caps / total inodes", - atomic64_read(&m->total_caps), sum); - seq_printf(s, "%-35s%lld / %lld\n", "opened inodes / total inodes", - percpu_counter_sum(&m->opened_inodes), sum); + seq_printf(s, "%-35s%lld\n", "total inodes", + percpu_counter_sum(&m->total_inodes)); + seq_printf(s, "%-35s%lld\n", "opened files", + atomic64_read(&m->opened_files)); + seq_printf(s, "%-35s%lld\n", "pinned i_caps", + atomic64_read(&m->total_caps)); + seq_printf(s, "%-35s%lld\n", "opened inodes", + percpu_counter_sum(&m->opened_inodes)); + return 0; +} + +static const char * const metric_str[] = { + "read", + "write", + "metadata", + "copyfrom" +}; +static int metrics_latency_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_client_metric *cm = &fsc->mdsc->metric; + struct ceph_metric *m; + s64 total, sum, avg, min, max, sq; + int i; - seq_printf(s, "\n"); seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); seq_printf(s, "-----------------------------------------------------------------------------------\n"); - spin_lock(&m->read_metric_lock); - total = m->total_reads; - sum = m->read_latency_sum; - avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; - min = m->read_latency_min; - max = m->read_latency_max; - sq = m->read_latency_sq_sum; - spin_unlock(&m->read_metric_lock); - CEPH_LAT_METRIC_SHOW("read", total, avg, min, max, sq); + for (i = 0; i < METRIC_MAX; i++) { + m = &cm->metric[i]; + spin_lock(&m->lock); + total = m->total; + sum = m->latency_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->latency_min; + max = m->latency_max; + sq = m->latency_sq_sum; + spin_unlock(&m->lock); + CEPH_LAT_METRIC_SHOW(metric_str[i], total, avg, min, max, sq); + } - spin_lock(&m->write_metric_lock); - total = m->total_writes; - sum = m->write_latency_sum; - avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; - min = m->write_latency_min; - max = m->write_latency_max; - sq = m->write_latency_sq_sum; - spin_unlock(&m->write_metric_lock); - CEPH_LAT_METRIC_SHOW("write", total, avg, min, max, sq); + return 0; +} - spin_lock(&m->metadata_metric_lock); - total = m->total_metadatas; - sum = m->metadata_latency_sum; - avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; - min = m->metadata_latency_min; - max = m->metadata_latency_max; - sq = m->metadata_latency_sq_sum; - spin_unlock(&m->metadata_metric_lock); - CEPH_LAT_METRIC_SHOW("metadata", total, avg, min, max, sq); +static int metrics_size_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_client_metric *cm = &fsc->mdsc->metric; + struct ceph_metric *m; + s64 total; + u64 sum, avg, min, max; + int i; - seq_printf(s, "\n"); seq_printf(s, "item total avg_sz(bytes) min_sz(bytes) max_sz(bytes) total_sz(bytes)\n"); seq_printf(s, "----------------------------------------------------------------------------------------\n"); - spin_lock(&m->read_metric_lock); - total = m->total_reads; - sum_sz = m->read_size_sum; - avg_sz = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0; - min_sz = m->read_size_min; - max_sz = m->read_size_max; - spin_unlock(&m->read_metric_lock); - CEPH_SZ_METRIC_SHOW("read", total, avg_sz, min_sz, max_sz, sum_sz); + for (i = 0; i < METRIC_MAX; i++) { + /* skip 'metadata' as it doesn't use the size metric */ + if (i == METRIC_METADATA) + continue; + m = &cm->metric[i]; + spin_lock(&m->lock); + total = m->total; + sum = m->size_sum; + avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; + min = m->size_min; + max = m->size_max; + spin_unlock(&m->lock); + CEPH_SZ_METRIC_SHOW(metric_str[i], total, avg, min, max, sum); + } - spin_lock(&m->write_metric_lock); - total = m->total_writes; - sum_sz = m->write_size_sum; - avg_sz = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum_sz, total) : 0; - min_sz = m->write_size_min; - max_sz = m->write_size_max; - spin_unlock(&m->write_metric_lock); - CEPH_SZ_METRIC_SHOW("write", total, avg_sz, min_sz, max_sz, sum_sz); + return 0; +} + +static int metrics_caps_show(struct seq_file *s, void *p) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_client_metric *m = &fsc->mdsc->metric; + int nr_caps = 0; - seq_printf(s, "\n"); seq_printf(s, "item total miss hit\n"); seq_printf(s, "-------------------------------------------------\n"); @@ -350,8 +361,11 @@ DEFINE_SHOW_ATTRIBUTE(mdsmap); DEFINE_SHOW_ATTRIBUTE(mdsc); DEFINE_SHOW_ATTRIBUTE(caps); DEFINE_SHOW_ATTRIBUTE(mds_sessions); -DEFINE_SHOW_ATTRIBUTE(metric); DEFINE_SHOW_ATTRIBUTE(status); +DEFINE_SHOW_ATTRIBUTE(metrics_file); +DEFINE_SHOW_ATTRIBUTE(metrics_latency); +DEFINE_SHOW_ATTRIBUTE(metrics_size); +DEFINE_SHOW_ATTRIBUTE(metrics_caps); /* @@ -385,8 +399,9 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_mdsmap); debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); - debugfs_remove(fsc->debugfs_metric); + debugfs_remove(fsc->debugfs_status); debugfs_remove(fsc->debugfs_mdsc); + debugfs_remove_recursive(fsc->debugfs_metrics_dir); } void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) @@ -426,12 +441,6 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) fsc, &mdsc_fops); - fsc->debugfs_metric = debugfs_create_file("metrics", - 0400, - fsc->client->debugfs_dir, - fsc, - &metric_fops); - fsc->debugfs_caps = debugfs_create_file("caps", 0400, fsc->client->debugfs_dir, @@ -443,6 +452,18 @@ void ceph_fs_debugfs_init(struct ceph_fs_client *fsc) fsc->client->debugfs_dir, fsc, &status_fops); + + fsc->debugfs_metrics_dir = debugfs_create_dir("metrics", + fsc->client->debugfs_dir); + + debugfs_create_file("file", 0400, fsc->debugfs_metrics_dir, fsc, + &metrics_file_fops); + debugfs_create_file("latency", 0400, fsc->debugfs_metrics_dir, fsc, + &metrics_latency_fops); + debugfs_create_file("size", 0400, fsc->debugfs_metrics_dir, fsc, + &metrics_size_fops); + debugfs_create_file("caps", 0400, fsc->debugfs_metrics_dir, fsc, + &metrics_caps_fops); } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index 1d65934c1262..e0fa66ac8b9f 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -157,6 +157,11 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino) ceph_mdsc_put_request(req); if (!inode) return err < 0 ? ERR_PTR(err) : ERR_PTR(-ESTALE); + } else { + if (ceph_inode_is_shutdown(inode)) { + iput(inode); + return ERR_PTR(-ESTALE); + } } return inode; } @@ -223,8 +228,13 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, return ERR_PTR(-ESTALE); inode = ceph_find_inode(sb, vino); - if (inode) + if (inode) { + if (ceph_inode_is_shutdown(inode)) { + iput(inode); + return ERR_PTR(-ESTALE); + } return d_obtain_alias(inode); + } req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, USE_ANY_MDS); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index b129ea551378..02a0a0fd9ccd 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -525,6 +525,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, if (result) { struct dentry *dentry = req->r_dentry; + struct inode *inode = d_inode(dentry); int pathlen = 0; u64 base = 0; char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen, @@ -534,7 +535,8 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, if (!d_unhashed(dentry)) d_drop(dentry); - /* FIXME: start returning I/O errors on all accesses? */ + ceph_inode_shutdown(inode); + pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n", base, IS_ERR(path) ? "<>" : path, result); ceph_mdsc_free_path(path, pathlen); @@ -556,7 +558,7 @@ static void ceph_async_create_cb(struct ceph_mds_client *mdsc, } ceph_kick_flushing_inode_caps(req->r_session, ci); spin_unlock(&ci->i_ceph_lock); - } else { + } else if (!result) { pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__, req->r_deleg_ino); } @@ -845,6 +847,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, ssize_t ret; u64 off = iocb->ki_pos; u64 len = iov_iter_count(to); + u64 i_size; dout("sync_read on file %p %llu~%u %s\n", file, off, (unsigned)len, (file->f_flags & O_DIRECT) ? "O_DIRECT" : ""); @@ -868,7 +871,6 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, struct page **pages; int num_pages; size_t page_off; - u64 i_size; bool more; int idx; size_t left; @@ -951,11 +953,14 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, } if (off > iocb->ki_pos) { - if (ret >= 0 && - iov_iter_count(to) > 0 && off >= i_size_read(inode)) + if (off >= i_size) { *retry_op = CHECK_EOF; - ret = off - iocb->ki_pos; - iocb->ki_pos = off; + ret = i_size - iocb->ki_pos; + iocb->ki_pos = i_size; + } else { + ret = off - iocb->ki_pos; + iocb->ki_pos = off; + } } dout("sync_read result %zd retry_op %d\n", ret, *retry_op); @@ -1526,6 +1531,9 @@ again: dout("aio_read %p %llx.%llx %llu~%u trying to get caps on %p\n", inode, ceph_vinop(inode), iocb->ki_pos, (unsigned)len, inode); + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + if (direct_lock) ceph_start_io_direct(inode); else @@ -1678,6 +1686,9 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) loff_t pos; loff_t limit = max(i_size_read(inode), fsc->max_file_size); + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; @@ -2200,6 +2211,54 @@ static int is_file_size_ok(struct inode *src_inode, struct inode *dst_inode, return 0; } +static struct ceph_osd_request * +ceph_alloc_copyfrom_request(struct ceph_osd_client *osdc, + u64 src_snapid, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + struct ceph_object_id *dst_oid, + struct ceph_object_locator *dst_oloc, + u32 truncate_seq, u64 truncate_size) +{ + struct ceph_osd_request *req; + int ret; + u32 src_fadvise_flags = + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_NOCACHE; + u32 dst_fadvise_flags = + CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | + CEPH_OSD_OP_FLAG_FADVISE_DONTNEED; + + req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); + + req->r_flags = CEPH_OSD_FLAG_WRITE; + + ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); + ceph_oid_copy(&req->r_t.base_oid, dst_oid); + + ret = osd_req_op_copy_from_init(req, src_snapid, 0, + src_oid, src_oloc, + src_fadvise_flags, + dst_fadvise_flags, + truncate_seq, + truncate_size, + CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); + if (ret) + goto out; + + ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); + if (ret) + goto out; + + return req; + +out: + ceph_osdc_put_request(req); + return ERR_PTR(ret); +} + static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off, struct ceph_inode_info *dst_ci, u64 *dst_off, struct ceph_fs_client *fsc, @@ -2207,6 +2266,8 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off { struct ceph_object_locator src_oloc, dst_oloc; struct ceph_object_id src_oid, dst_oid; + struct ceph_osd_client *osdc; + struct ceph_osd_request *req; size_t bytes = 0; u64 src_objnum, src_objoff, dst_objnum, dst_objoff; u32 src_objlen, dst_objlen; @@ -2217,6 +2278,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off src_oloc.pool_ns = ceph_try_get_string(src_ci->i_layout.pool_ns); dst_oloc.pool = dst_ci->i_layout.pool_id; dst_oloc.pool_ns = ceph_try_get_string(dst_ci->i_layout.pool_ns); + osdc = &fsc->client->osdc; while (len >= object_size) { ceph_calc_file_object_mapping(&src_ci->i_layout, *src_off, @@ -2232,17 +2294,22 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off ceph_oid_printf(&dst_oid, "%llx.%08llx", dst_ci->i_vino.ino, dst_objnum); /* Do an object remote copy */ - ret = ceph_osdc_copy_from(&fsc->client->osdc, - src_ci->i_vino.snap, 0, - &src_oid, &src_oloc, - CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_NOCACHE, - &dst_oid, &dst_oloc, - CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL | - CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, - dst_ci->i_truncate_seq, - dst_ci->i_truncate_size, - CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ); + req = ceph_alloc_copyfrom_request(osdc, src_ci->i_vino.snap, + &src_oid, &src_oloc, + &dst_oid, &dst_oloc, + dst_ci->i_truncate_seq, + dst_ci->i_truncate_size); + if (IS_ERR(req)) + ret = PTR_ERR(req); + else { + ceph_osdc_start_request(osdc, req, false); + ret = ceph_osdc_wait_request(osdc, req); + ceph_update_copyfrom_metrics(&fsc->mdsc->metric, + req->r_start_latency, + req->r_end_latency, + object_size, ret); + ceph_osdc_put_request(req); + } if (ret) { if (ret == -EOPNOTSUPP) { fsc->have_copy_from2 = false; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 1c7574105478..e3322fcb2e8d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1841,15 +1841,14 @@ void ceph_queue_inode_work(struct inode *inode, int work_bit) static void ceph_do_invalidate_pages(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); u32 orig_gen; int check = 0; mutex_lock(&ci->i_truncate_mutex); - if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { - pr_warn_ratelimited("invalidate_pages %p %lld forced umount\n", - inode, ceph_ino(inode)); + if (ceph_inode_is_shutdown(inode)) { + pr_warn_ratelimited("%s: inode %llx.%llx is shut down\n", + __func__, ceph_vinop(inode)); mapping_set_error(inode->i_mapping, -EIO); truncate_pagecache(inode, 0); mutex_unlock(&ci->i_truncate_mutex); @@ -1871,7 +1870,8 @@ static void ceph_do_invalidate_pages(struct inode *inode) ceph_fscache_invalidate(inode); if (invalidate_inode_pages2(inode->i_mapping) < 0) { - pr_err("invalidate_pages %p fails\n", inode); + pr_err("invalidate_inode_pages2 %llx.%llx failed\n", + ceph_vinop(inode)); } spin_lock(&ci->i_ceph_lock); @@ -2103,12 +2103,14 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) loff_t isize = i_size_read(inode); dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) { - i_size_write(inode, attr->ia_size); - inode->i_blocks = calc_inode_blocks(attr->ia_size); - ci->i_reported_size = attr->ia_size; - dirtied |= CEPH_CAP_FILE_EXCL; - ia_valid |= ATTR_MTIME; + if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size >= isize) { + if (attr->ia_size > isize) { + i_size_write(inode, attr->ia_size); + inode->i_blocks = calc_inode_blocks(attr->ia_size); + ci->i_reported_size = attr->ia_size; + dirtied |= CEPH_CAP_FILE_EXCL; + ia_valid |= ATTR_MTIME; + } } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || attr->ia_size != isize) { req->r_args.setattr.size = cpu_to_le64(attr->ia_size); @@ -2217,6 +2219,9 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, if (ceph_snap(inode) != CEPH_NOSNAP) return -EROFS; + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + err = setattr_prepare(&init_user_ns, dentry, attr); if (err != 0) return err; @@ -2347,6 +2352,9 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, u32 valid_mask = STATX_BASIC_STATS; int err = 0; + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + /* Skip the getattr altogether if we're asked not to sync */ if (!(flags & AT_STATX_DONT_SYNC)) { err = ceph_do_getattr(inode, @@ -2394,3 +2402,27 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, stat->result_mask = request_mask & valid_mask; return err; } + +void ceph_inode_shutdown(struct inode *inode) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + struct rb_node *p; + int iputs = 0; + bool invalidate = false; + + spin_lock(&ci->i_ceph_lock); + ci->i_ceph_flags |= CEPH_I_SHUTDOWN; + p = rb_first(&ci->i_caps); + while (p) { + struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node); + + p = rb_next(p); + iputs += ceph_purge_inode_cap(inode, cap, &invalidate); + } + spin_unlock(&ci->i_ceph_lock); + + if (invalidate) + ceph_queue_invalidate(inode); + while (iputs--) + iput(inode); +} diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index d8c31069fbf2..d1f154aec249 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -241,6 +241,9 @@ int ceph_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + dout("ceph_lock, fl_owner: %p\n", fl->fl_owner); /* set wait bit as appropriate, then make command as Ceph expects it*/ @@ -303,6 +306,9 @@ int ceph_flock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_FLOCK)) return -ENOLCK; + if (ceph_inode_is_shutdown(inode)) + return -ESTALE; + dout("ceph_flock, fl_file: %p\n", fl->fl_file); spin_lock(&ci->i_ceph_lock); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d64413adc0fd..250aad330a10 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -1590,129 +1590,23 @@ out: return ret; } -static int remove_capsnaps(struct ceph_mds_client *mdsc, struct inode *inode) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_cap_snap *capsnap; - int capsnap_release = 0; - - lockdep_assert_held(&ci->i_ceph_lock); - - dout("removing capsnaps, ci is %p, inode is %p\n", ci, inode); - - while (!list_empty(&ci->i_cap_snaps)) { - capsnap = list_first_entry(&ci->i_cap_snaps, - struct ceph_cap_snap, ci_item); - __ceph_remove_capsnap(inode, capsnap, NULL, NULL); - ceph_put_snap_context(capsnap->context); - ceph_put_cap_snap(capsnap); - capsnap_release++; - } - wake_up_all(&ci->i_cap_wq); - wake_up_all(&mdsc->cap_flushing_wq); - return capsnap_release; -} - static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, void *arg) { - struct ceph_fs_client *fsc = (struct ceph_fs_client *)arg; - struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_inode_info *ci = ceph_inode(inode); - LIST_HEAD(to_remove); - bool dirty_dropped = false; bool invalidate = false; - int capsnap_release = 0; + int iputs; dout("removing cap %p, ci is %p, inode is %p\n", cap, ci, &ci->vfs_inode); spin_lock(&ci->i_ceph_lock); - __ceph_remove_cap(cap, false); - if (!ci->i_auth_cap) { - struct ceph_cap_flush *cf; - - if (READ_ONCE(fsc->mount_state) >= CEPH_MOUNT_SHUTDOWN) { - if (inode->i_data.nrpages > 0) - invalidate = true; - if (ci->i_wrbuffer_ref > 0) - mapping_set_error(&inode->i_data, -EIO); - } - - while (!list_empty(&ci->i_cap_flush_list)) { - cf = list_first_entry(&ci->i_cap_flush_list, - struct ceph_cap_flush, i_list); - list_move(&cf->i_list, &to_remove); - } - - spin_lock(&mdsc->cap_dirty_lock); - - list_for_each_entry(cf, &to_remove, i_list) - list_del_init(&cf->g_list); - - if (!list_empty(&ci->i_dirty_item)) { - pr_warn_ratelimited( - " dropping dirty %s state for %p %lld\n", - ceph_cap_string(ci->i_dirty_caps), - inode, ceph_ino(inode)); - ci->i_dirty_caps = 0; - list_del_init(&ci->i_dirty_item); - dirty_dropped = true; - } - if (!list_empty(&ci->i_flushing_item)) { - pr_warn_ratelimited( - " dropping dirty+flushing %s state for %p %lld\n", - ceph_cap_string(ci->i_flushing_caps), - inode, ceph_ino(inode)); - ci->i_flushing_caps = 0; - list_del_init(&ci->i_flushing_item); - mdsc->num_cap_flushing--; - dirty_dropped = true; - } - spin_unlock(&mdsc->cap_dirty_lock); - - if (dirty_dropped) { - mapping_set_error(inode->i_mapping, -EIO); - - if (ci->i_wrbuffer_ref_head == 0 && - ci->i_wr_ref == 0 && - ci->i_dirty_caps == 0 && - ci->i_flushing_caps == 0) { - ceph_put_snap_context(ci->i_head_snapc); - ci->i_head_snapc = NULL; - } - } - - if (atomic_read(&ci->i_filelock_ref) > 0) { - /* make further file lock syscall return -EIO */ - ci->i_ceph_flags |= CEPH_I_ERROR_FILELOCK; - pr_warn_ratelimited(" dropping file locks for %p %lld\n", - inode, ceph_ino(inode)); - } - - if (!ci->i_dirty_caps && ci->i_prealloc_cap_flush) { - list_add(&ci->i_prealloc_cap_flush->i_list, &to_remove); - ci->i_prealloc_cap_flush = NULL; - } - - if (!list_empty(&ci->i_cap_snaps)) - capsnap_release = remove_capsnaps(mdsc, inode); - } + iputs = ceph_purge_inode_cap(inode, cap, &invalidate); spin_unlock(&ci->i_ceph_lock); - while (!list_empty(&to_remove)) { - struct ceph_cap_flush *cf; - cf = list_first_entry(&to_remove, - struct ceph_cap_flush, i_list); - list_del_init(&cf->i_list); - if (!cf->is_capsnap) - ceph_free_cap_flush(cf); - } wake_up_all(&ci->i_cap_wq); if (invalidate) ceph_queue_invalidate(inode); - if (dirty_dropped) - iput(inode); - while (capsnap_release--) + while (iputs--) iput(inode); return 0; } @@ -3467,9 +3361,14 @@ static void handle_session(struct ceph_mds_session *session, if (msg_version >= 3) { u32 len; - /* version >= 2, metadata */ - if (__decode_session_metadata(&p, end, &blocklisted) < 0) + /* version >= 2 and < 5, decode metadata, skip otherwise + * as it's handled via flags. + */ + if (msg_version >= 5) + ceph_decode_skip_map(&p, end, string, string, bad); + else if (__decode_session_metadata(&p, end, &blocklisted) < 0) goto bad; + /* version >= 3, feature bits */ ceph_decode_32_safe(&p, end, len, bad); if (len) { @@ -3478,6 +3377,18 @@ static void handle_session(struct ceph_mds_session *session, } } + if (msg_version >= 5) { + u32 flags; + /* version >= 4, struct_v, struct_cv, len, metric_spec */ + ceph_decode_skip_n(&p, end, 2 + sizeof(u32) * 2, bad); + /* version >= 5, flags */ + ceph_decode_32_safe(&p, end, flags, bad); + if (flags & CEPH_SESSION_BLOCKLISTED) { + pr_warn("mds%d session blocklisted\n", session->s_mds); + blocklisted = true; + } + } + mutex_lock(&mdsc->mutex); if (op == CEPH_SESSION_CLOSE) { ceph_get_mds_session(session); @@ -5072,7 +4983,8 @@ void ceph_mdsc_handle_fsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) return; bad: - pr_err("error decoding fsmap\n"); + pr_err("error decoding fsmap %d. Shutting down mount.\n", err); + ceph_umount_begin(mdsc->fsc->sb); err_out: mutex_lock(&mdsc->mutex); mdsc->mdsmap_err = err; @@ -5139,7 +5051,8 @@ void ceph_mdsc_handle_mdsmap(struct ceph_mds_client *mdsc, struct ceph_msg *msg) bad_unlock: mutex_unlock(&mdsc->mutex); bad: - pr_err("error decoding mdsmap %d\n", err); + pr_err("error decoding mdsmap %d. Shutting down mount.\n", err); + ceph_umount_begin(mdsc->fsc->sb); return; } diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 61d67cbcb367..30387733765d 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -263,10 +263,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end, bool msgr2) goto nomem; for (j = 0; j < num_export_targets; j++) { target = ceph_decode_32(&pexport_targets); - if (target >= m->possible_max_rank) { - err = -EIO; - goto corrupt; - } info->export_targets[j] = target; } } else { diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 04d5df29bbbf..c57699d8408d 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -62,7 +62,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, read->header.ver = 1; read->header.compat = 1; read->header.data_len = cpu_to_le32(sizeof(*read) - header_len); - sum = m->read_latency_sum; + sum = m->metric[METRIC_READ].latency_sum; jiffies_to_timespec64(sum, &ts); read->sec = cpu_to_le32(ts.tv_sec); read->nsec = cpu_to_le32(ts.tv_nsec); @@ -74,7 +74,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, write->header.ver = 1; write->header.compat = 1; write->header.data_len = cpu_to_le32(sizeof(*write) - header_len); - sum = m->write_latency_sum; + sum = m->metric[METRIC_WRITE].latency_sum; jiffies_to_timespec64(sum, &ts); write->sec = cpu_to_le32(ts.tv_sec); write->nsec = cpu_to_le32(ts.tv_nsec); @@ -86,7 +86,7 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, meta->header.ver = 1; meta->header.compat = 1; meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len); - sum = m->metadata_latency_sum; + sum = m->metric[METRIC_METADATA].latency_sum; jiffies_to_timespec64(sum, &ts); meta->sec = cpu_to_le32(ts.tv_sec); meta->nsec = cpu_to_le32(ts.tv_nsec); @@ -141,8 +141,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, rsize->header.ver = 1; rsize->header.compat = 1; rsize->header.data_len = cpu_to_le32(sizeof(*rsize) - header_len); - rsize->total_ops = cpu_to_le64(m->total_reads); - rsize->total_size = cpu_to_le64(m->read_size_sum); + rsize->total_ops = cpu_to_le64(m->metric[METRIC_READ].total); + rsize->total_size = cpu_to_le64(m->metric[METRIC_READ].size_sum); items++; /* encode the write io size metric */ @@ -151,8 +151,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, wsize->header.ver = 1; wsize->header.compat = 1; wsize->header.data_len = cpu_to_le32(sizeof(*wsize) - header_len); - wsize->total_ops = cpu_to_le64(m->total_writes); - wsize->total_size = cpu_to_le64(m->write_size_sum); + wsize->total_ops = cpu_to_le64(m->metric[METRIC_WRITE].total); + wsize->total_size = cpu_to_le64(m->metric[METRIC_WRITE].size_sum); items++; put_unaligned_le32(items, &head->num); @@ -220,7 +220,8 @@ static void metric_delayed_work(struct work_struct *work) int ceph_metric_init(struct ceph_client_metric *m) { - int ret; + struct ceph_metric *metric; + int ret, i; if (!m) return -EINVAL; @@ -243,32 +244,18 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_i_caps_mis; - spin_lock_init(&m->read_metric_lock); - m->read_latency_sq_sum = 0; - m->read_latency_min = KTIME_MAX; - m->read_latency_max = 0; - m->total_reads = 0; - m->read_latency_sum = 0; - m->read_size_min = U64_MAX; - m->read_size_max = 0; - m->read_size_sum = 0; - - spin_lock_init(&m->write_metric_lock); - m->write_latency_sq_sum = 0; - m->write_latency_min = KTIME_MAX; - m->write_latency_max = 0; - m->total_writes = 0; - m->write_latency_sum = 0; - m->write_size_min = U64_MAX; - m->write_size_max = 0; - m->write_size_sum = 0; - - spin_lock_init(&m->metadata_metric_lock); - m->metadata_latency_sq_sum = 0; - m->metadata_latency_min = KTIME_MAX; - m->metadata_latency_max = 0; - m->total_metadatas = 0; - m->metadata_latency_sum = 0; + for (i = 0; i < METRIC_MAX; i++) { + metric = &m->metric[i]; + spin_lock_init(&metric->lock); + metric->size_sum = 0; + metric->size_min = U64_MAX; + metric->size_max = 0; + metric->total = 0; + metric->latency_sum = 0; + metric->latency_sq_sum = 0; + metric->latency_min = KTIME_MAX; + metric->latency_max = 0; + } atomic64_set(&m->opened_files, 0); ret = percpu_counter_init(&m->opened_inodes, 0, GFP_KERNEL); @@ -338,9 +325,9 @@ static inline void __update_stdev(ktime_t total, ktime_t lsum, *sq_sump += sq; } -void ceph_update_read_metrics(struct ceph_client_metric *m, - ktime_t r_start, ktime_t r_end, - unsigned int size, int rc) +void ceph_update_metrics(struct ceph_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc) { ktime_t lat = ktime_sub(r_end, r_start); ktime_t total; @@ -348,63 +335,12 @@ void ceph_update_read_metrics(struct ceph_client_metric *m, if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) return; - spin_lock(&m->read_metric_lock); - total = ++m->total_reads; - m->read_size_sum += size; - m->read_latency_sum += lat; - METRIC_UPDATE_MIN_MAX(m->read_size_min, - m->read_size_max, - size); - METRIC_UPDATE_MIN_MAX(m->read_latency_min, - m->read_latency_max, - lat); - __update_stdev(total, m->read_latency_sum, - &m->read_latency_sq_sum, lat); - spin_unlock(&m->read_metric_lock); -} - -void ceph_update_write_metrics(struct ceph_client_metric *m, - ktime_t r_start, ktime_t r_end, - unsigned int size, int rc) -{ - ktime_t lat = ktime_sub(r_end, r_start); - ktime_t total; - - if (unlikely(rc && rc != -ETIMEDOUT)) - return; - - spin_lock(&m->write_metric_lock); - total = ++m->total_writes; - m->write_size_sum += size; - m->write_latency_sum += lat; - METRIC_UPDATE_MIN_MAX(m->write_size_min, - m->write_size_max, - size); - METRIC_UPDATE_MIN_MAX(m->write_latency_min, - m->write_latency_max, - lat); - __update_stdev(total, m->write_latency_sum, - &m->write_latency_sq_sum, lat); - spin_unlock(&m->write_metric_lock); -} - -void ceph_update_metadata_metrics(struct ceph_client_metric *m, - ktime_t r_start, ktime_t r_end, - int rc) -{ - ktime_t lat = ktime_sub(r_end, r_start); - ktime_t total; - - if (unlikely(rc && rc != -ENOENT)) - return; - - spin_lock(&m->metadata_metric_lock); - total = ++m->total_metadatas; - m->metadata_latency_sum += lat; - METRIC_UPDATE_MIN_MAX(m->metadata_latency_min, - m->metadata_latency_max, - lat); - __update_stdev(total, m->metadata_latency_sum, - &m->metadata_latency_sq_sum, lat); - spin_unlock(&m->metadata_metric_lock); + spin_lock(&m->lock); + total = ++m->total; + m->size_sum += size; + METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size); + m->latency_sum += lat; + METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat); + __update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat); + spin_unlock(&m->lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index 0133955a3c6a..bb45608181e7 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -125,6 +125,26 @@ struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; +enum metric_type { + METRIC_READ, + METRIC_WRITE, + METRIC_METADATA, + METRIC_COPYFROM, + METRIC_MAX +}; + +struct ceph_metric { + spinlock_t lock; + u64 total; + u64 size_sum; + u64 size_min; + u64 size_max; + ktime_t latency_sum; + ktime_t latency_sq_sum; + ktime_t latency_min; + ktime_t latency_max; +}; + /* This is the global metrics */ struct ceph_client_metric { atomic64_t total_dentries; @@ -135,32 +155,7 @@ struct ceph_client_metric { struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; - spinlock_t read_metric_lock; - u64 total_reads; - u64 read_size_sum; - u64 read_size_min; - u64 read_size_max; - ktime_t read_latency_sum; - ktime_t read_latency_sq_sum; - ktime_t read_latency_min; - ktime_t read_latency_max; - - spinlock_t write_metric_lock; - u64 total_writes; - u64 write_size_sum; - u64 write_size_min; - u64 write_size_max; - ktime_t write_latency_sum; - ktime_t write_latency_sq_sum; - ktime_t write_latency_min; - ktime_t write_latency_max; - - spinlock_t metadata_metric_lock; - u64 total_metadatas; - ktime_t metadata_latency_sum; - ktime_t metadata_latency_sq_sum; - ktime_t metadata_latency_min; - ktime_t metadata_latency_max; + struct ceph_metric metric[METRIC_MAX]; /* The total number of directories and files that are opened */ atomic64_t opened_files; @@ -195,13 +190,36 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) percpu_counter_inc(&m->i_caps_mis); } -extern void ceph_update_read_metrics(struct ceph_client_metric *m, - ktime_t r_start, ktime_t r_end, - unsigned int size, int rc); -extern void ceph_update_write_metrics(struct ceph_client_metric *m, - ktime_t r_start, ktime_t r_end, - unsigned int size, int rc); -extern void ceph_update_metadata_metrics(struct ceph_client_metric *m, - ktime_t r_start, ktime_t r_end, - int rc); +extern void ceph_update_metrics(struct ceph_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc); + +static inline void ceph_update_read_metrics(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc) +{ + ceph_update_metrics(&m->metric[METRIC_READ], + r_start, r_end, size, rc); +} +static inline void ceph_update_write_metrics(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc) +{ + ceph_update_metrics(&m->metric[METRIC_WRITE], + r_start, r_end, size, rc); +} +static inline void ceph_update_metadata_metrics(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + int rc) +{ + ceph_update_metrics(&m->metric[METRIC_METADATA], + r_start, r_end, 0, rc); +} +static inline void ceph_update_copyfrom_metrics(struct ceph_client_metric *m, + ktime_t r_start, ktime_t r_end, + unsigned int size, int rc) +{ + ceph_update_metrics(&m->metric[METRIC_COPYFROM], + r_start, r_end, size, rc); +} #endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/super.c b/fs/ceph/super.c index fd8742bae847..bab61232dc5a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -52,8 +52,7 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) struct ceph_fs_client *fsc = ceph_inode_to_client(d_inode(dentry)); struct ceph_mon_client *monc = &fsc->client->monc; struct ceph_statfs st; - u64 fsid; - int err; + int i, err; u64 data_pool; if (fsc->mdsc->mdsmap->m_num_data_pg_pools == 1) { @@ -99,12 +98,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_namelen = NAME_MAX; /* Must convert the fsid, for consistent values across arches */ + buf->f_fsid.val[0] = 0; mutex_lock(&monc->mutex); - fsid = le64_to_cpu(*(__le64 *)(&monc->monmap->fsid)) ^ - le64_to_cpu(*((__le64 *)&monc->monmap->fsid + 1)); + for (i = 0 ; i < sizeof(monc->monmap->fsid) / sizeof(__le32) ; ++i) + buf->f_fsid.val[0] ^= le32_to_cpu(((__le32 *)&monc->monmap->fsid)[i]); mutex_unlock(&monc->mutex); - buf->f_fsid = u64_to_fsid(fsid); + /* fold the fs_cluster_id into the upper bits */ + buf->f_fsid.val[1] = monc->fs_cluster_id; return 0; } @@ -577,8 +578,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) if (fsopt->flags & CEPH_MOUNT_OPT_CLEANRECOVER) seq_show_option(m, "recover_session", "clean"); - if (fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS) - seq_puts(m, ",nowsync"); + if (!(fsopt->flags & CEPH_MOUNT_OPT_ASYNC_DIROPS)) + seq_puts(m, ",wsync"); if (fsopt->wsize != CEPH_MAX_WRITE_SIZE) seq_printf(m, ",wsize=%u", fsopt->wsize); @@ -842,7 +843,7 @@ static void __ceph_umount_begin(struct ceph_fs_client *fsc) * ceph_umount_begin - initiate forced umount. Tear down the * mount, skipping steps that may hang while waiting for server(s). */ -static void ceph_umount_begin(struct super_block *sb) +void ceph_umount_begin(struct super_block *sb) { struct ceph_fs_client *fsc = ceph_sb_to_client(sb); diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 14f951cd5b61..ac331aa07cfa 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -48,7 +48,8 @@ #define CEPH_MOUNT_OPT_DEFAULT \ (CEPH_MOUNT_OPT_DCACHE | \ - CEPH_MOUNT_OPT_NOCOPYFROM) + CEPH_MOUNT_OPT_NOCOPYFROM | \ + CEPH_MOUNT_OPT_ASYNC_DIROPS) #define ceph_set_mount_opt(fsc, opt) \ (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt @@ -128,9 +129,9 @@ struct ceph_fs_client { struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; - struct dentry *debugfs_metric; struct dentry *debugfs_status; struct dentry *debugfs_mds_sessions; + struct dentry *debugfs_metrics_dir; #endif #ifdef CONFIG_CEPH_FSCACHE @@ -580,6 +581,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */ #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) +#define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ /* * Masks of ceph inode work. @@ -939,6 +941,7 @@ extern void ceph_put_snapid_map(struct ceph_mds_client* mdsc, struct ceph_snapid_map *sm); extern void ceph_trim_snapid_map(struct ceph_mds_client *mdsc); extern void ceph_cleanup_snapid_map(struct ceph_mds_client *mdsc); +void ceph_umount_begin(struct super_block *sb); /* @@ -1027,6 +1030,16 @@ extern int ceph_setattr(struct user_namespace *mnt_userns, extern int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); +void ceph_inode_shutdown(struct inode *inode); + +static inline bool ceph_inode_is_shutdown(struct inode *inode) +{ + unsigned long flags = READ_ONCE(ceph_inode(inode)->i_ceph_flags); + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + int state = READ_ONCE(fsc->mount_state); + + return (flags & CEPH_I_SHUTDOWN) || state >= CEPH_MOUNT_SHUTDOWN; +} /* xattr.c */ int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int); @@ -1198,6 +1211,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); extern int ceph_uninline_data(struct file *filp, struct page *locked_page); extern int ceph_pool_perm_check(struct inode *inode, int need); extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc); +int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate); /* file.c */ extern const struct file_operations ceph_file_fops; diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bcb1b91b234f..9a249bfc2770 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -96,16 +96,9 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * a compressed_pages[] placeholder in order to avoid - * being filled with file pages for in-place decompression. - */ -#define PAGE_UNALLOCATED ((void *)0x5F0E4B1D) - /* how to allocate cached pages for a pcluster */ enum z_erofs_cache_alloctype { DONTALLOC, /* don't allocate any cached pages */ - DELAYEDALLOC, /* delayed allocation (at the time of submitting io) */ /* * try to use cached I/O if page allocation succeeds or fallback * to in-place I/O instead to avoid any direct reclaim. @@ -267,10 +260,6 @@ static void preload_compressed_pages(struct z_erofs_collector *clt, /* I/O is needed, no possible to decompress directly */ standalone = false; switch (type) { - case DELAYEDALLOC: - t = tagptr_init(compressed_page_t, - PAGE_UNALLOCATED); - break; case TRYALLOC: newpage = erofs_allocpage(pagepool, gfp); if (!newpage) @@ -371,8 +360,8 @@ static bool z_erofs_try_inplace_io(struct z_erofs_collector *clt, /* callers must be with collection lock held */ static int z_erofs_attach_page(struct z_erofs_collector *clt, - struct page *page, - enum z_erofs_page_type type) + struct page *page, enum z_erofs_page_type type, + bool pvec_safereuse) { int ret; @@ -382,9 +371,9 @@ static int z_erofs_attach_page(struct z_erofs_collector *clt, z_erofs_try_inplace_io(clt, page)) return 0; - ret = z_erofs_pagevec_enqueue(&clt->vector, page, type); + ret = z_erofs_pagevec_enqueue(&clt->vector, page, type, + pvec_safereuse); clt->cl->vcnt += (unsigned int)ret; - return ret ? 0 : -EAGAIN; } @@ -727,7 +716,8 @@ hitted: tight &= (clt->mode >= COLLECT_PRIMARY_FOLLOWED); retry: - err = z_erofs_attach_page(clt, page, page_type); + err = z_erofs_attach_page(clt, page, page_type, + clt->mode >= COLLECT_PRIMARY_FOLLOWED); /* should allocate an additional short-lived page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = @@ -735,7 +725,7 @@ retry: set_page_private(newpage, Z_EROFS_SHORTLIVED_PAGE); err = z_erofs_attach_page(clt, newpage, - Z_EROFS_PAGE_TYPE_EXCLUSIVE); + Z_EROFS_PAGE_TYPE_EXCLUSIVE, true); if (!err) goto retry; } @@ -1089,15 +1079,6 @@ repeat: if (!page) goto out_allocpage; - /* - * the cached page has not been allocated and - * an placeholder is out there, prepare it now. - */ - if (page == PAGE_UNALLOCATED) { - tocache = true; - goto out_allocpage; - } - /* process the target tagged pointer */ t = tagptr_init(compressed_page_t, page); justfound = tagptr_unfold_tags(t); diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h index 879df5362777..4a69515dea75 100644 --- a/fs/erofs/zdata.h +++ b/fs/erofs/zdata.h @@ -179,4 +179,3 @@ static inline void z_erofs_onlinepage_endio(struct page *page) #define Z_EROFS_VMAP_GLOBAL_PAGES 2048 #endif - diff --git a/fs/erofs/zpvec.h b/fs/erofs/zpvec.h index dfd7fe0503bb..b05464f4a808 100644 --- a/fs/erofs/zpvec.h +++ b/fs/erofs/zpvec.h @@ -106,11 +106,18 @@ static inline void z_erofs_pagevec_ctor_init(struct z_erofs_pagevec_ctor *ctor, static inline bool z_erofs_pagevec_enqueue(struct z_erofs_pagevec_ctor *ctor, struct page *page, - enum z_erofs_page_type type) + enum z_erofs_page_type type, + bool pvec_safereuse) { - if (!ctor->next && type) - if (ctor->index + 1 == ctor->nr) + if (!ctor->next) { + /* some pages cannot be reused as pvec safely without I/O */ + if (type == Z_EROFS_PAGE_TYPE_EXCLUSIVE && !pvec_safereuse) + type = Z_EROFS_VLE_PAGE_TYPE_TAIL_SHARED; + + if (type != Z_EROFS_PAGE_TYPE_EXCLUSIVE && + ctor->index + 1 == ctor->nr) return false; + } if (ctor->index >= ctor->nr) z_erofs_pagevec_ctor_pagedown(ctor, false); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 83e9bc0f91ff..f1693d45bb78 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -653,7 +653,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) return PTR_ERR(inode); } - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) { iput(inode); goto err_out; @@ -705,9 +705,6 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. @@ -1162,7 +1159,8 @@ static bool __need_flush_quota(struct f2fs_sb_info *sbi) if (!is_journalled_quota(sbi)) return false; - down_write(&sbi->quota_sem); + if (!down_write_trylock(&sbi->quota_sem)) + return true; if (is_sbi_flag_set(sbi, SBI_QUOTA_SKIP_FLUSH)) { ret = false; } else if (is_sbi_flag_set(sbi, SBI_QUOTA_NEED_REPAIR)) { diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 20a083dc9042..a0d5cfab75e4 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -882,6 +882,25 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) return is_page_in_cluster(cc, index); } +bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, + int index, int nr_pages) +{ + unsigned long pgidx; + int i; + + if (nr_pages - index < cc->cluster_size) + return false; + + pgidx = pvec->pages[index]->index; + + for (i = 1; i < cc->cluster_size; i++) { + if (pvec->pages[index + i]->index != pgidx + i) + return false; + } + + return true; +} + static bool cluster_has_invalid_data(struct compress_ctx *cc) { loff_t i_size = i_size_read(cc->inode); @@ -1531,6 +1550,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc, if (cluster_may_compress(cc)) { err = f2fs_compress_pages(cc); if (err == -EAGAIN) { + add_compr_block_stat(cc->inode, cc->cluster_size); goto write; } else if (err) { f2fs_put_rpages_wbc(cc, wbc, true, 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 005d6144e5f3..3bd0cc2c1d2e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1470,10 +1470,15 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, struct extent_info ei = {0, }; block_t blkaddr; unsigned int start_pgofs; + int bidx = 0; if (!maxblocks) return 0; + map->m_bdev = inode->i_sb->s_bdev; + map->m_multidev_dio = + f2fs_allow_multi_device_dio(F2FS_I_SB(inode), flag); + map->m_len = 0; map->m_flags = 0; @@ -1496,6 +1501,21 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (flag == F2FS_GET_BLOCK_DIO) f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + map->m_len = min(map->m_len, + FDEV(bidx).end_blk + 1 - map->m_pblk); + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + } goto out; } @@ -1614,6 +1634,9 @@ next_block: if (flag == F2FS_GET_BLOCK_PRE_AIO) goto skip; + if (map->m_multidev_dio) + bidx = f2fs_target_device_index(sbi, blkaddr); + if (map->m_len == 0) { /* preallocated unwritten block should be mapped for fiemap. */ if (blkaddr == NEW_ADDR) @@ -1622,10 +1645,15 @@ next_block: map->m_pblk = blkaddr; map->m_len = 1; + + if (map->m_multidev_dio) + map->m_bdev = FDEV(bidx).bdev; } else if ((map->m_pblk != NEW_ADDR && blkaddr == (map->m_pblk + ofs)) || (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || flag == F2FS_GET_BLOCK_PRE_DIO) { + if (map->m_multidev_dio && map->m_bdev != FDEV(bidx).bdev) + goto sync_out; ofs++; map->m_len++; } else { @@ -1678,10 +1706,32 @@ skip: sync_out: - /* for hardware encryption, but to avoid potential issue in future */ - if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) + if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED) { + /* + * for hardware encryption, but to avoid potential issue + * in future + */ f2fs_wait_on_block_writeback_range(inode, map->m_pblk, map->m_len); + invalidate_mapping_pages(META_MAPPING(sbi), + map->m_pblk, map->m_pblk); + + if (map->m_multidev_dio) { + block_t blk_addr = map->m_pblk; + + bidx = f2fs_target_device_index(sbi, map->m_pblk); + + map->m_bdev = FDEV(bidx).bdev; + map->m_pblk -= FDEV(bidx).start_blk; + + if (map->m_may_create) + f2fs_update_device_state(sbi, inode->i_ino, + blk_addr, map->m_len); + + f2fs_bug_on(sbi, blk_addr + map->m_len > + FDEV(bidx).end_blk + 1); + } + } if (flag == F2FS_GET_BLOCK_PRECACHE) { if (map->m_flags & F2FS_MAP_MAPPED) { @@ -1701,7 +1751,7 @@ unlock_out: f2fs_balance_fs(sbi, dn.node_changed); } out: - trace_f2fs_map_blocks(inode, map, err); + trace_f2fs_map_blocks(inode, map, create, flag, err); return err; } @@ -1760,6 +1810,9 @@ static int __get_data_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = blks_to_bytes(inode, map.m_len); + + if (map.m_multidev_dio) + bh->b_bdev = map.m_bdev; } return err; } @@ -2994,6 +3047,10 @@ readd: need_readd = false; #ifdef CONFIG_F2FS_FS_COMPRESSION if (f2fs_compressed_file(inode)) { + void *fsdata = NULL; + struct page *pagep; + int ret2; + ret = f2fs_init_compress_ctx(&cc); if (ret) { done = 1; @@ -3012,27 +3069,23 @@ readd: if (unlikely(f2fs_cp_error(sbi))) goto lock_page; - if (f2fs_cluster_is_empty(&cc)) { - void *fsdata = NULL; - struct page *pagep; - int ret2; + if (!f2fs_cluster_is_empty(&cc)) + goto lock_page; - ret2 = f2fs_prepare_compress_overwrite( + ret2 = f2fs_prepare_compress_overwrite( inode, &pagep, page->index, &fsdata); - if (ret2 < 0) { - ret = ret2; - done = 1; - break; - } else if (ret2 && - !f2fs_compress_write_end(inode, - fsdata, page->index, - 1)) { - retry = 1; - break; - } - } else { - goto lock_page; + if (ret2 < 0) { + ret = ret2; + done = 1; + break; + } else if (ret2 && + (!f2fs_compress_write_end(inode, + fsdata, page->index, 1) || + !f2fs_all_cluster_page_loaded(&cc, + &pvec, i, nr_pages))) { + retry = 1; + break; } } #endif diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2308aaedeb10..64bcb81ec1c6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -55,6 +55,7 @@ enum { FAULT_DISCARD, FAULT_WRITE_IO, FAULT_SLAB_ALLOC, + FAULT_DQUOT_INIT, FAULT_MAX, }; @@ -561,6 +562,9 @@ enum { #define MAX_DIR_RA_PAGES 4 /* maximum ra pages of dir */ +/* dirty segments threshold for triggering CP */ +#define DEFAULT_DIRTY_THRESHOLD 4 + /* for in-memory extent cache entry */ #define F2FS_MIN_EXTENT_LEN 64 /* minimum extent length */ @@ -617,6 +621,7 @@ struct extent_tree { F2FS_MAP_UNWRITTEN) struct f2fs_map_blocks { + struct block_device *m_bdev; /* for multi-device dio */ block_t m_pblk; block_t m_lblk; unsigned int m_len; @@ -625,6 +630,7 @@ struct f2fs_map_blocks { pgoff_t *m_next_extent; /* point to next possible extent */ int m_seg_type; bool m_may_create; /* indicate it is from write path */ + bool m_multidev_dio; /* indicate it allows multi-device dio */ }; /* for flag in get_data_block */ @@ -1284,8 +1290,10 @@ enum { }; enum { - FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ - FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */ + FS_MODE_LFS, /* use lfs allocation only */ + FS_MODE_FRAGMENT_SEG, /* segment fragmentation mode */ + FS_MODE_FRAGMENT_BLK, /* block fragmentation mode */ }; enum { @@ -1728,12 +1736,15 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + struct mutex umount_mutex; + unsigned int shrinker_run_no; + + /* For multi devices */ int s_ndevs; /* number of devices */ struct f2fs_dev_info *devs; /* for device list */ unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ - struct mutex umount_mutex; - unsigned int shrinker_run_no; + bool aligned_blksize; /* all devices has the same logical blksize */ /* For write statistics */ u64 sectors_written_start; @@ -1756,6 +1767,9 @@ struct f2fs_sb_info { unsigned long seq_file_ra_mul; /* multiplier for ra_pages of seq. files in fadvise */ + int max_fragment_chunk; /* max chunk size for block fragmentation mode */ + int max_fragment_hole; /* max hole size for block fragmentation mode */ + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -3363,6 +3377,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) */ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); +int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); int f2fs_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); @@ -3492,6 +3507,8 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, struct f2fs_io_info *fio); +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt); void f2fs_wait_on_page_writeback(struct page *page, enum page_type type, bool ordered, bool locked); void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); @@ -3516,6 +3533,16 @@ unsigned int f2fs_usable_segs_in_sec(struct f2fs_sb_info *sbi, unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned int segno); +#define DEF_FRAGMENT_SIZE 4 +#define MIN_FRAGMENT_SIZE 1 +#define MAX_FRAGMENT_SIZE 512 + +static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) +{ + return F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG || + F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK; +} + /* * checkpoint.c */ @@ -4027,6 +4054,8 @@ void f2fs_end_read_compressed_page(struct page *page, bool failed, block_t blkaddr); bool f2fs_cluster_is_empty(struct compress_ctx *cc); bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index); +bool f2fs_all_cluster_page_loaded(struct compress_ctx *cc, struct pagevec *pvec, + int index, int nr_pages); bool f2fs_sanity_check_cluster(struct dnode_of_data *dn); void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page); int f2fs_write_multi_pages(struct compress_ctx *cc, @@ -4152,8 +4181,7 @@ static inline bool f2fs_disable_compressed_file(struct inode *inode) if (!f2fs_compressed_file(inode)) return true; - if (S_ISREG(inode->i_mode) && - (get_dirty_pages(inode) || atomic_read(&fi->i_compr_blocks))) + if (S_ISREG(inode->i_mode) && F2FS_HAS_BLOCKS(inode)) return false; fi->i_flags &= ~F2FS_COMPR_FL; @@ -4302,6 +4330,16 @@ static inline int block_unaligned_IO(struct inode *inode, return align & blocksize_mask; } +static inline bool f2fs_allow_multi_device_dio(struct f2fs_sb_info *sbi, + int flag) +{ + if (!f2fs_is_multi_device(sbi)) + return false; + if (flag != F2FS_GET_BLOCK_DIO) + return false; + return sbi->aligned_blksize; +} + static inline bool f2fs_force_buffered_io(struct inode *inode, struct kiocb *iocb, struct iov_iter *iter) { @@ -4314,7 +4352,9 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, return true; if (f2fs_compressed_file(inode)) return true; - if (f2fs_is_multi_device(sbi)) + + /* disallow direct IO if any of devices has unaligned blksize */ + if (f2fs_is_multi_device(sbi) && !sbi->aligned_blksize) return true; /* * for blkzoned device, fallback direct IO to buffered IO, so diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index eb971e1e7227..92ec2699bc85 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -786,7 +786,7 @@ int f2fs_truncate(struct inode *inode) return -EIO; } - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; @@ -916,7 +916,7 @@ int f2fs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; if (is_quota_modification(inode, attr)) { - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; } @@ -3020,7 +3020,7 @@ static int f2fs_ioc_setproject(struct inode *inode, __u32 projid) } f2fs_put_page(ipage, 1); - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 77391e3b7d68..a946ce0ead34 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -257,7 +258,9 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->max_search = sbi->max_victim_search; /* let's select beginning hot/small space first in no_heap mode*/ - if (test_opt(sbi, NOHEAP) && + if (f2fs_need_rand_seg(sbi)) + p->offset = prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); + else if (test_opt(sbi, NOHEAP) && (type == CURSEG_HOT_DATA || IS_NODESEG(type))) p->offset = 0; else diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 2311c76ffc37..928fea695373 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -210,7 +210,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) return 0; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 9141147b5bb0..0f8b2df3e1e0 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -527,7 +527,7 @@ make_now: inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); } else if (S_ISLNK(inode->i_mode)) { if (file_is_encrypt(inode)) inode->i_op = &f2fs_encrypted_symlink_inode_operations; @@ -754,7 +754,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) { err = 0; set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 9c528e583c9d..a728a0af9ce0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -74,7 +74,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (err) goto fail_drop; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto fail_drop; @@ -345,7 +345,7 @@ static int f2fs_create(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -404,7 +404,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, F2FS_I(old_dentry->d_inode)->i_projid))) return -EXDEV; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -460,7 +460,7 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -598,10 +598,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) goto fail; } - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) goto fail; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto fail; @@ -675,7 +675,7 @@ static int f2fs_symlink(struct user_namespace *mnt_userns, struct inode *dir, if (err) return err; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -746,7 +746,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, if (unlikely(f2fs_cp_error(sbi))) return -EIO; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -757,7 +757,7 @@ static int f2fs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; inode->i_mapping->a_ops = &f2fs_dblock_aops; - inode_nohighmem(inode); + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); @@ -803,7 +803,7 @@ static int f2fs_mknod(struct user_namespace *mnt_userns, struct inode *dir, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -841,7 +841,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - err = dquot_initialize(dir); + err = f2fs_dquot_initialize(dir); if (err) return err; @@ -965,16 +965,16 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, return err; } - err = dquot_initialize(old_dir); + err = f2fs_dquot_initialize(old_dir); if (err) goto out; - err = dquot_initialize(new_dir); + err = f2fs_dquot_initialize(new_dir); if (err) goto out; if (new_inode) { - err = dquot_initialize(new_inode); + err = f2fs_dquot_initialize(new_inode); if (err) goto out; } @@ -1138,11 +1138,11 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, F2FS_I(new_dentry->d_inode)->i_projid))) return -EXDEV; - err = dquot_initialize(old_dir); + err = f2fs_dquot_initialize(old_dir); if (err) goto out; - err = dquot_initialize(new_dir); + err = f2fs_dquot_initialize(new_dir); if (err) goto out; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e863136081b4..556fcd8457f3 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1443,6 +1443,7 @@ page_hit: nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); err = -EINVAL; out_err: ClearPageUptodate(page); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index ff14a6e5ac1c..18b98cf0465b 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -138,11 +138,6 @@ static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) return NM_I(sbi)->nat_cnt[TOTAL_NAT] >= DEF_NAT_CACHE_THRESHOLD; } -static inline bool excess_dirty_nodes(struct f2fs_sb_info *sbi) -{ - return get_pages(sbi, F2FS_DIRTY_NODES) >= sbi->blocks_per_seg * 8; -} - enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 04655511d7f5..6a1b4668d933 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -81,7 +81,7 @@ static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, if (IS_ERR(inode)) return ERR_CAST(inode); - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) goto err_out; @@ -203,7 +203,7 @@ retry: goto out_put; } - err = dquot_initialize(einode); + err = f2fs_dquot_initialize(einode); if (err) { iput(einode); goto out_put; @@ -508,7 +508,7 @@ got_it: if (IS_ERR(inode)) return PTR_ERR(inode); - ret = dquot_initialize(inode); + ret = f2fs_dquot_initialize(inode); if (ret) { iput(inode); return ret; @@ -787,8 +787,6 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sbi->sb->s_flags |= SB_ACTIVE; /* Turn on quotas so that they are updated correctly */ quota_enabled = f2fs_enable_quota_files(sbi, s_flags & SB_RDONLY); #endif @@ -816,10 +814,8 @@ int f2fs_recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &tmp_inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); - else { - /* restore s_flags to let iput() trash data */ - sbi->sb->s_flags = s_flags; - } + else + f2fs_bug_on(sbi, sbi->sb->s_flags & SB_ACTIVE); skip: fix_curseg_write_pointer = !check_only || list_empty(&inode_list); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a135d2247415..df9ed75f0b7a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -529,6 +530,25 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) } } +static inline bool excess_dirty_threshold(struct f2fs_sb_info *sbi) +{ + int factor = rwsem_is_locked(&sbi->cp_rwsem) ? 3 : 2; + unsigned int dents = get_pages(sbi, F2FS_DIRTY_DENTS); + unsigned int qdata = get_pages(sbi, F2FS_DIRTY_QDATA); + unsigned int nodes = get_pages(sbi, F2FS_DIRTY_NODES); + unsigned int meta = get_pages(sbi, F2FS_DIRTY_META); + unsigned int imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + unsigned int threshold = sbi->blocks_per_seg * factor * + DEFAULT_DIRTY_THRESHOLD; + unsigned int global_threshold = threshold * 3 / 2; + + if (dents >= threshold || qdata >= threshold || + nodes >= threshold || meta >= threshold || + imeta >= threshold) + return true; + return dents + qdata + nodes + meta + imeta > global_threshold; +} + void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) { if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -547,8 +567,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) else f2fs_build_free_nids(sbi, false, false); - if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) || - excess_prefree_segs(sbi)) + if (excess_dirty_nats(sbi) || excess_dirty_threshold(sbi) || + excess_prefree_segs(sbi) || !f2fs_space_for_roll_forward(sbi)) goto do_sync; /* there is background inflight IO or foreground operation recently */ @@ -561,7 +581,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg) goto do_sync; /* checkpoint is the only way to shrink partial cached entries */ - if (f2fs_available_free_memory(sbi, NAT_ENTRIES) || + if (f2fs_available_free_memory(sbi, NAT_ENTRIES) && f2fs_available_free_memory(sbi, INO_ENTRIES)) return; @@ -2630,6 +2650,8 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) unsigned short seg_type = curseg->seg_type; sanity_check_seg_type(sbi, seg_type); + if (f2fs_need_rand_seg(sbi)) + return prandom_u32() % (MAIN_SECS(sbi) * sbi->segs_per_sec); /* if segs_per_sec is large than 1, we need to keep original policy. */ if (__is_large_section(sbi)) @@ -2681,6 +2703,9 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->next_segno = segno; reset_curseg(sbi, type, 1); curseg->alloc_type = LFS; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + curseg->fragment_remained_chunk = + prandom_u32() % sbi->max_fragment_chunk + 1; } static int __next_free_blkoff(struct f2fs_sb_info *sbi, @@ -2707,12 +2732,22 @@ static int __next_free_blkoff(struct f2fs_sb_info *sbi, static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { - if (seg->alloc_type == SSR) + if (seg->alloc_type == SSR) { seg->next_blkoff = __next_free_blkoff(sbi, seg->segno, seg->next_blkoff + 1); - else + } else { seg->next_blkoff++; + if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) { + /* To allocate block chunks in different sizes, use random number */ + if (--seg->fragment_remained_chunk <= 0) { + seg->fragment_remained_chunk = + prandom_u32() % sbi->max_fragment_chunk + 1; + seg->next_blkoff += + prandom_u32() % sbi->max_fragment_hole + 1; + } + } + } } bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) @@ -3485,24 +3520,30 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, up_read(&SM_I(sbi)->curseg_lock); } -static void update_device_state(struct f2fs_io_info *fio) +void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, + block_t blkaddr, unsigned int blkcnt) { - struct f2fs_sb_info *sbi = fio->sbi; - unsigned int devidx; - if (!f2fs_is_multi_device(sbi)) return; - devidx = f2fs_target_device_index(sbi, fio->new_blkaddr); + while (1) { + unsigned int devidx = f2fs_target_device_index(sbi, blkaddr); + unsigned int blks = FDEV(devidx).end_blk - blkaddr + 1; - /* update device state for fsync */ - f2fs_set_dirty_device(sbi, fio->ino, devidx, FLUSH_INO); + /* update device state for fsync */ + f2fs_set_dirty_device(sbi, ino, devidx, FLUSH_INO); - /* update device state for checkpoint */ - if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { - spin_lock(&sbi->dev_lock); - f2fs_set_bit(devidx, (char *)&sbi->dirty_device); - spin_unlock(&sbi->dev_lock); + /* update device state for checkpoint */ + if (!f2fs_test_bit(devidx, (char *)&sbi->dirty_device)) { + spin_lock(&sbi->dev_lock); + f2fs_set_bit(devidx, (char *)&sbi->dirty_device); + spin_unlock(&sbi->dev_lock); + } + + if (blkcnt <= blks) + break; + blkcnt -= blks; + blkaddr += blks; } } @@ -3529,7 +3570,7 @@ reallocate: goto reallocate; } - update_device_state(fio); + f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); if (keep_order) up_read(&fio->sbi->io_order_lock); @@ -3611,6 +3652,9 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) goto drop_bio; } + invalidate_mapping_pages(META_MAPPING(sbi), + fio->new_blkaddr, fio->new_blkaddr); + stat_inc_inplace_blocks(fio->sbi); if (fio->bio && !(SM_I(sbi)->ipu_policy & (1 << F2FS_IPU_NOCACHE))) @@ -3618,7 +3662,8 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) else err = f2fs_submit_page_bio(fio); if (!err) { - update_device_state(fio); + f2fs_update_device_state(fio->sbi, fio->ino, + fio->new_blkaddr, 1); f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 89fff258727d..46fde9f3f28e 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -314,6 +314,7 @@ struct curseg_info { unsigned short next_blkoff; /* next block offset to write */ unsigned int zone; /* current zone number */ unsigned int next_segno; /* preallocated segment */ + int fragment_remained_chunk; /* remained block size in a chunk for block fragmentation mode */ bool inited; /* indicate inmem log is inited */ }; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index cf049a042482..7960ce066c1b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -58,6 +58,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_DISCARD] = "discard error", [FAULT_WRITE_IO] = "write IO error", [FAULT_SLAB_ALLOC] = "slab alloc", + [FAULT_DQUOT_INIT] = "dquot initialize", }; void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned int rate, @@ -817,6 +818,10 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE; } else if (!strcmp(name, "lfs")) { F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS; + } else if (!strcmp(name, "fragment:segment")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_SEG; + } else if (!strcmp(name, "fragment:block")) { + F2FS_OPTION(sbi).fs_mode = FS_MODE_FRAGMENT_BLK; } else { kfree(name); return -EINVAL; @@ -1292,7 +1297,7 @@ default_check: /* Not pass down write hints if the number of active logs is lesser * than NR_CURSEG_PERSIST_TYPE. */ - if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_TYPE) + if (F2FS_OPTION(sbi).active_logs != NR_CURSEG_PERSIST_TYPE) F2FS_OPTION(sbi).whint_mode = WHINT_MODE_OFF; if (f2fs_sb_has_readonly(sbi) && !f2fs_readonly(sbi->sb)) { @@ -1896,6 +1901,10 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, "adaptive"); else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS) seq_puts(seq, "lfs"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_SEG) + seq_puts(seq, "fragment:segment"); + else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) + seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); if (test_opt(sbi, RESERVE_ROOT)) seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", @@ -2491,6 +2500,16 @@ retry: return len - towrite; } +int f2fs_dquot_initialize(struct inode *inode) +{ + if (time_to_inject(F2FS_I_SB(inode), FAULT_DQUOT_INIT)) { + f2fs_show_injection_info(F2FS_I_SB(inode), FAULT_DQUOT_INIT); + return -ESRCH; + } + + return dquot_initialize(inode); +} + static struct dquot **f2fs_get_dquots(struct inode *inode) { return F2FS_I(inode)->i_dquot; @@ -2875,6 +2894,11 @@ static const struct quotactl_ops f2fs_quotactl_ops = { .get_nextdqblk = dquot_get_next_dqblk, }; #else +int f2fs_dquot_initialize(struct inode *inode) +{ + return 0; +} + int f2fs_quota_sync(struct super_block *sb, int type) { return 0; @@ -3486,7 +3510,7 @@ skip_cross: NR_CURSEG_PERSIST_TYPE + nat_bits_blocks >= blocks_per_seg)) { f2fs_warn(sbi, "Insane cp_payload: %u, nat_bits_blocks: %u)", cp_payload, nat_bits_blocks); - return -EFSCORRUPTED; + return 1; } if (unlikely(f2fs_cp_error(sbi))) { @@ -3522,6 +3546,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; sbi->migration_granularity = sbi->segs_per_sec; sbi->seq_file_ra_mul = MIN_RA_MUL; + sbi->max_fragment_chunk = DEF_FRAGMENT_SIZE; + sbi->max_fragment_hole = DEF_FRAGMENT_SIZE; sbi->dir_level = DEF_DIR_LEVEL; sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; @@ -3746,6 +3772,7 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); unsigned int max_devices = MAX_DEVICES; + unsigned int logical_blksize; int i; /* Initialize single device information */ @@ -3766,6 +3793,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) if (!sbi->devs) return -ENOMEM; + logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); + sbi->aligned_blksize = true; + for (i = 0; i < max_devices; i++) { if (i > 0 && !RDEV(i).path[0]) @@ -3802,6 +3832,9 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) /* to release errored devices */ sbi->s_ndevs = i + 1; + if (logical_blksize != bdev_logical_block_size(FDEV(i).bdev)) + sbi->aligned_blksize = false; + #ifdef CONFIG_BLK_DEV_ZONED if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && !f2fs_sb_has_blkzoned(sbi)) { @@ -4351,6 +4384,8 @@ free_node_inode: free_stats: f2fs_destroy_stats(sbi); free_nm: + /* stop discard thread before destroying node manager */ + f2fs_stop_discard_thread(sbi); f2fs_destroy_node_manager(sbi); free_sm: f2fs_destroy_segment_manager(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index a32fe31c33b8..7d289249cd7e 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -196,7 +196,7 @@ static ssize_t encoding_show(struct f2fs_attr *a, struct super_block *sb = sbi->sb; if (f2fs_sb_has_casefold(sbi)) - return snprintf(buf, PAGE_SIZE, "%s (%d.%d.%d)\n", + return sysfs_emit(buf, "%s (%d.%d.%d)\n", sb->s_encoding->charset, (sb->s_encoding->version >> 16) & 0xff, (sb->s_encoding->version >> 8) & 0xff, @@ -245,7 +245,7 @@ static ssize_t avg_vblocks_show(struct f2fs_attr *a, static ssize_t main_blkaddr_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { - return snprintf(buf, PAGE_SIZE, "%llu\n", + return sysfs_emit(buf, "%llu\n", (unsigned long long)MAIN_BLKADDR(sbi)); } @@ -551,6 +551,22 @@ out: return count; } + if (!strcmp(a->attr.name, "max_fragment_chunk")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_chunk = t; + else + return -EINVAL; + return count; + } + + if (!strcmp(a->attr.name, "max_fragment_hole")) { + if (t >= MIN_FRAGMENT_SIZE && t <= MAX_FRAGMENT_SIZE) + sbi->max_fragment_hole = t; + else + return -EINVAL; + return count; + } + *ui = (unsigned int)t; return count; @@ -781,6 +797,8 @@ F2FS_RW_ATTR(ATGC_INFO, atgc_management, atgc_age_threshold, age_threshold); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, seq_file_ra_mul, seq_file_ra_mul); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_segment_mode, gc_segment_mode); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, gc_reclaimed_segments, gc_reclaimed_segs); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_chunk, max_fragment_chunk); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_fragment_hole, max_fragment_hole); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -859,6 +877,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(seq_file_ra_mul), ATTR_LIST(gc_segment_mode), ATTR_LIST(gc_reclaimed_segments), + ATTR_LIST(max_fragment_chunk), + ATTR_LIST(max_fragment_hole), NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index 03549b5ba204..fe5acdccaae1 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -136,7 +136,7 @@ static int f2fs_begin_enable_verity(struct file *filp) * here and not rely on ->open() doing it. This must be done before * evicting the inline data. */ - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1d2d29dcd41c..e348f33bcb2b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -773,7 +773,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (!f2fs_is_checkpoint_ready(sbi)) return -ENOSPC; - err = dquot_initialize(inode); + err = f2fs_dquot_initialize(inode); if (err) return err; diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 994ec22d4040..9320a42dfaf9 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -230,7 +230,7 @@ static void netfs_rreq_completed(struct netfs_read_request *rreq, bool was_async /* * Deal with the completion of writing the data to the cache. We have to clear - * the PG_fscache bits on the pages involved and release the caller's ref. + * the PG_fscache bits on the folios involved and release the caller's ref. * * May be called in softirq mode and we inherit a ref from the caller. */ @@ -238,7 +238,7 @@ static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq, bool was_async) { struct netfs_read_subrequest *subreq; - struct page *page; + struct folio *folio; pgoff_t unlocked = 0; bool have_unlocked = false; @@ -247,14 +247,14 @@ static void netfs_rreq_unmark_after_write(struct netfs_read_request *rreq, list_for_each_entry(subreq, &rreq->subrequests, rreq_link) { XA_STATE(xas, &rreq->mapping->i_pages, subreq->start / PAGE_SIZE); - xas_for_each(&xas, page, (subreq->start + subreq->len - 1) / PAGE_SIZE) { + xas_for_each(&xas, folio, (subreq->start + subreq->len - 1) / PAGE_SIZE) { /* We might have multiple writes from the same huge - * page, but we mustn't unlock a page more than once. + * folio, but we mustn't unlock a folio more than once. */ - if (have_unlocked && page->index <= unlocked) + if (have_unlocked && folio_index(folio) <= unlocked) continue; - unlocked = page->index; - end_page_fscache(page); + unlocked = folio_index(folio); + folio_end_fscache(folio); have_unlocked = true; } } @@ -367,18 +367,17 @@ static void netfs_rreq_write_to_cache(struct netfs_read_request *rreq, } /* - * Unlock the pages in a read operation. We need to set PG_fscache on any - * pages we're going to write back before we unlock them. + * Unlock the folios in a read operation. We need to set PG_fscache on any + * folios we're going to write back before we unlock them. */ static void netfs_rreq_unlock(struct netfs_read_request *rreq) { struct netfs_read_subrequest *subreq; - struct page *page; + struct folio *folio; unsigned int iopos, account = 0; pgoff_t start_page = rreq->start / PAGE_SIZE; pgoff_t last_page = ((rreq->start + rreq->len) / PAGE_SIZE) - 1; bool subreq_failed = false; - int i; XA_STATE(xas, &rreq->mapping->i_pages, start_page); @@ -403,9 +402,9 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq) trace_netfs_rreq(rreq, netfs_rreq_trace_unlock); rcu_read_lock(); - xas_for_each(&xas, page, last_page) { - unsigned int pgpos = (page->index - start_page) * PAGE_SIZE; - unsigned int pgend = pgpos + thp_size(page); + xas_for_each(&xas, folio, last_page) { + unsigned int pgpos = (folio_index(folio) - start_page) * PAGE_SIZE; + unsigned int pgend = pgpos + folio_size(folio); bool pg_failed = false; for (;;) { @@ -414,7 +413,7 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq) break; } if (test_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags)) - set_page_fscache(page); + folio_start_fscache(folio); pg_failed |= subreq_failed; if (pgend < iopos + subreq->len) break; @@ -433,17 +432,16 @@ static void netfs_rreq_unlock(struct netfs_read_request *rreq) } if (!pg_failed) { - for (i = 0; i < thp_nr_pages(page); i++) - flush_dcache_page(page); - SetPageUptodate(page); + flush_dcache_folio(folio); + folio_mark_uptodate(folio); } - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_PAGES, &rreq->flags)) { - if (page->index == rreq->no_unlock_page && - test_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags)) + if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { + if (folio_index(folio) == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) _debug("no unlock"); else - unlock_page(page); + folio_unlock(folio); } } rcu_read_unlock(); @@ -876,7 +874,6 @@ void netfs_readahead(struct readahead_control *ractl, void *netfs_priv) { struct netfs_read_request *rreq; - struct page *page; unsigned int debug_index = 0; int ret; @@ -911,11 +908,11 @@ void netfs_readahead(struct readahead_control *ractl, } while (rreq->submitted < rreq->len); - /* Drop the refs on the pages here rather than in the cache or + /* Drop the refs on the folios here rather than in the cache or * filesystem. The locks will be dropped in netfs_rreq_unlock(). */ - while ((page = readahead_page(ractl))) - put_page(page); + while (readahead_folio(ractl)) + ; /* If we decrement nr_rd_ops to 0, the ref belongs to us. */ if (atomic_dec_and_test(&rreq->nr_rd_ops)) @@ -935,7 +932,7 @@ EXPORT_SYMBOL(netfs_readahead); /** * netfs_readpage - Helper to manage a readpage request * @file: The file to read from - * @page: The page to read + * @folio: The folio to read * @ops: The network filesystem's operations for the helper to use * @netfs_priv: Private netfs data to be retained in the request * @@ -950,7 +947,7 @@ EXPORT_SYMBOL(netfs_readahead); * This is usable whether or not caching is enabled. */ int netfs_readpage(struct file *file, - struct page *page, + struct folio *folio, const struct netfs_read_request_ops *ops, void *netfs_priv) { @@ -958,23 +955,23 @@ int netfs_readpage(struct file *file, unsigned int debug_index = 0; int ret; - _enter("%lx", page_index(page)); + _enter("%lx", folio_index(folio)); rreq = netfs_alloc_read_request(ops, netfs_priv, file); if (!rreq) { if (netfs_priv) - ops->cleanup(netfs_priv, page_file_mapping(page)); - unlock_page(page); + ops->cleanup(netfs_priv, folio_file_mapping(folio)); + folio_unlock(folio); return -ENOMEM; } - rreq->mapping = page_file_mapping(page); - rreq->start = page_file_offset(page); - rreq->len = thp_size(page); + rreq->mapping = folio_file_mapping(folio); + rreq->start = folio_file_pos(folio); + rreq->len = folio_size(folio); if (ops->begin_cache_operation) { ret = ops->begin_cache_operation(rreq); if (ret == -ENOMEM || ret == -EINTR || ret == -ERESTARTSYS) { - unlock_page(page); + folio_unlock(folio); goto out; } } @@ -1012,40 +1009,40 @@ out: EXPORT_SYMBOL(netfs_readpage); /** - * netfs_skip_page_read - prep a page for writing without reading first - * @page: page being prepared + * netfs_skip_folio_read - prep a folio for writing without reading first + * @folio: The folio being prepared * @pos: starting position for the write * @len: length of write * * In some cases, write_begin doesn't need to read at all: - * - full page write - * - write that lies in a page that is completely beyond EOF - * - write that covers the the page from start to EOF or beyond it + * - full folio write + * - write that lies in a folio that is completely beyond EOF + * - write that covers the folio from start to EOF or beyond it * * If any of these criteria are met, then zero out the unwritten parts - * of the page and return true. Otherwise, return false. + * of the folio and return true. Otherwise, return false. */ -static bool netfs_skip_page_read(struct page *page, loff_t pos, size_t len) +static bool netfs_skip_folio_read(struct folio *folio, loff_t pos, size_t len) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio_inode(folio); loff_t i_size = i_size_read(inode); - size_t offset = offset_in_thp(page, pos); + size_t offset = offset_in_folio(folio, pos); - /* Full page write */ - if (offset == 0 && len >= thp_size(page)) + /* Full folio write */ + if (offset == 0 && len >= folio_size(folio)) return true; - /* pos beyond last page in the file */ + /* pos beyond last folio in the file */ if (pos - offset >= i_size) goto zero_out; - /* Write that covers from the start of the page to EOF or beyond */ + /* Write that covers from the start of the folio to EOF or beyond */ if (offset == 0 && (pos + len) >= i_size) goto zero_out; return false; zero_out: - zero_user_segments(page, 0, offset, offset + len, thp_size(page)); + zero_user_segments(&folio->page, 0, offset, offset + len, folio_size(folio)); return true; } @@ -1054,9 +1051,9 @@ zero_out: * @file: The file to read from * @mapping: The mapping to read from * @pos: File position at which the write will begin - * @len: The length of the write (may extend beyond the end of the page chosen) - * @flags: AOP_* flags - * @_page: Where to put the resultant page + * @len: The length of the write (may extend beyond the end of the folio chosen) + * @aop_flags: AOP_* flags + * @_folio: Where to put the resultant folio * @_fsdata: Place for the netfs to store a cookie * @ops: The network filesystem's operations for the helper to use * @netfs_priv: Private netfs data to be retained in the request @@ -1072,37 +1069,41 @@ zero_out: * issue_op, is mandatory. * * The check_write_begin() operation can be provided to check for and flush - * conflicting writes once the page is grabbed and locked. It is passed a + * conflicting writes once the folio is grabbed and locked. It is passed a * pointer to the fsdata cookie that gets returned to the VM to be passed to * write_end. It is permitted to sleep. It should return 0 if the request - * should go ahead; unlock the page and return -EAGAIN to cause the page to be - * regot; or return an error. + * should go ahead; unlock the folio and return -EAGAIN to cause the folio to + * be regot; or return an error. * * This is usable whether or not caching is enabled. */ int netfs_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned int len, unsigned int flags, - struct page **_page, void **_fsdata, + loff_t pos, unsigned int len, unsigned int aop_flags, + struct folio **_folio, void **_fsdata, const struct netfs_read_request_ops *ops, void *netfs_priv) { struct netfs_read_request *rreq; - struct page *page, *xpage; + struct folio *folio; struct inode *inode = file_inode(file); - unsigned int debug_index = 0; + unsigned int debug_index = 0, fgp_flags; pgoff_t index = pos >> PAGE_SHIFT; int ret; DEFINE_READAHEAD(ractl, file, NULL, mapping, index); retry: - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) + fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE; + if (aop_flags & AOP_FLAG_NOFS) + fgp_flags |= FGP_NOFS; + folio = __filemap_get_folio(mapping, index, fgp_flags, + mapping_gfp_mask(mapping)); + if (!folio) return -ENOMEM; if (ops->check_write_begin) { /* Allow the netfs (eg. ceph) to flush conflicts. */ - ret = ops->check_write_begin(file, pos, len, page, _fsdata); + ret = ops->check_write_begin(file, pos, len, folio, _fsdata); if (ret < 0) { trace_netfs_failure(NULL, NULL, ret, netfs_fail_check_write_begin); if (ret == -EAGAIN) @@ -1111,28 +1112,28 @@ retry: } } - if (PageUptodate(page)) - goto have_page; + if (folio_test_uptodate(folio)) + goto have_folio; /* If the page is beyond the EOF, we want to clear it - unless it's * within the cache granule containing the EOF, in which case we need * to preload the granule. */ if (!ops->is_cache_enabled(inode) && - netfs_skip_page_read(page, pos, len)) { + netfs_skip_folio_read(folio, pos, len)) { netfs_stat(&netfs_n_rh_write_zskip); - goto have_page_no_wait; + goto have_folio_no_wait; } ret = -ENOMEM; rreq = netfs_alloc_read_request(ops, netfs_priv, file); if (!rreq) goto error; - rreq->mapping = page->mapping; - rreq->start = page_offset(page); - rreq->len = thp_size(page); - rreq->no_unlock_page = page->index; - __set_bit(NETFS_RREQ_NO_UNLOCK_PAGE, &rreq->flags); + rreq->mapping = folio_file_mapping(folio); + rreq->start = folio_file_pos(folio); + rreq->len = folio_size(folio); + rreq->no_unlock_folio = folio_index(folio); + __set_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags); netfs_priv = NULL; if (ops->begin_cache_operation) { @@ -1147,14 +1148,14 @@ retry: /* Expand the request to meet caching requirements and download * preferences. */ - ractl._nr_pages = thp_nr_pages(page); + ractl._nr_pages = folio_nr_pages(folio); netfs_rreq_expand(rreq, &ractl); netfs_get_read_request(rreq); - /* We hold the page locks, so we can drop the references */ - while ((xpage = readahead_page(&ractl))) - if (xpage != page) - put_page(xpage); + /* We hold the folio locks, so we can drop the references */ + folio_get(folio); + while (readahead_folio(&ractl)) + ; atomic_set(&rreq->nr_rd_ops, 1); do { @@ -1184,22 +1185,22 @@ retry: if (ret < 0) goto error; -have_page: - ret = wait_on_page_fscache_killable(page); +have_folio: + ret = folio_wait_fscache_killable(folio); if (ret < 0) goto error; -have_page_no_wait: +have_folio_no_wait: if (netfs_priv) ops->cleanup(netfs_priv, mapping); - *_page = page; + *_folio = folio; _leave(" = 0"); return 0; error_put: netfs_put_read_request(rreq, false); error: - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); if (netfs_priv) ops->cleanup(netfs_priv, mapping); _leave(" = %d", ret); diff --git a/include/linux/ceph/ceph_fs.h b/include/linux/ceph/ceph_fs.h index bc2699feddbe..7ad6c3d0db7d 100644 --- a/include/linux/ceph/ceph_fs.h +++ b/include/linux/ceph/ceph_fs.h @@ -302,6 +302,8 @@ enum { CEPH_SESSION_REQUEST_FLUSH_MDLOG, }; +#define CEPH_SESSION_BLOCKLISTED (1 << 0) /* session blocklisted */ + extern const char *ceph_session_op_name(int op); struct ceph_mds_session_head { diff --git a/include/linux/ceph/osd_client.h b/include/linux/ceph/osd_client.h index 83fa08a06507..3431011f364d 100644 --- a/include/linux/ceph/osd_client.h +++ b/include/linux/ceph/osd_client.h @@ -475,6 +475,14 @@ extern void osd_req_op_alloc_hint_init(struct ceph_osd_request *osd_req, u64 expected_object_size, u64 expected_write_size, u32 flags); +extern int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, + u8 copy_from_flags); extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, struct ceph_snap_context *snapc, @@ -515,17 +523,6 @@ int ceph_osdc_call(struct ceph_osd_client *osdc, struct page *req_page, size_t req_len, struct page **resp_pages, size_t *resp_len); -int ceph_osdc_copy_from(struct ceph_osd_client *osdc, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - struct ceph_object_id *dst_oid, - struct ceph_object_locator *dst_oloc, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags); - /* watch/notify */ struct ceph_osd_linger_request * ceph_osdc_watch(struct ceph_osd_client *osdc, diff --git a/include/linux/efi.h b/include/linux/efi.h index 6b5d36babfcc..dbd39b20e034 100644 --- a/include/linux/efi.h +++ b/include/linux/efi.h @@ -362,6 +362,7 @@ void efi_native_runtime_setup(void); /* OEM GUIDs */ #define DELLEMC_EFI_RCI2_TABLE_GUID EFI_GUID(0x2d9f28a2, 0xa886, 0x456a, 0x97, 0xa8, 0xf1, 0x1e, 0xf2, 0x4f, 0xf4, 0x55) +#define AMD_SEV_MEM_ENCRYPT_GUID EFI_GUID(0x0cf29b71, 0x9e51, 0x433a, 0xa3, 0xb7, 0x81, 0xf3, 0xab, 0x16, 0xb8, 0x75) typedef struct { efi_guid_t guid; diff --git a/include/linux/kvm_host.h b/include/linux/kvm_host.h index 60a35d9fe259..9e0667e3723e 100644 --- a/include/linux/kvm_host.h +++ b/include/linux/kvm_host.h @@ -150,7 +150,7 @@ static inline bool is_error_page(struct page *page) #define KVM_REQ_MMU_RELOAD (1 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQ_UNBLOCK 2 #define KVM_REQ_UNHALT 3 -#define KVM_REQ_VM_BUGGED (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) +#define KVM_REQ_VM_DEAD (4 | KVM_REQUEST_WAIT | KVM_REQUEST_NO_WAKEUP) #define KVM_REQUEST_ARCH_BASE 8 #define KVM_ARCH_REQ_FLAGS(nr, flags) ({ \ @@ -617,6 +617,7 @@ struct kvm { unsigned int max_halt_poll_ns; u32 dirty_ring_size; bool vm_bugged; + bool vm_dead; #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER struct notifier_block pm_notifier; @@ -650,12 +651,19 @@ struct kvm { #define vcpu_err(vcpu, fmt, ...) \ kvm_err("vcpu%i " fmt, (vcpu)->vcpu_id, ## __VA_ARGS__) +static inline void kvm_vm_dead(struct kvm *kvm) +{ + kvm->vm_dead = true; + kvm_make_all_cpus_request(kvm, KVM_REQ_VM_DEAD); +} + static inline void kvm_vm_bugged(struct kvm *kvm) { kvm->vm_bugged = true; - kvm_make_all_cpus_request(kvm, KVM_REQ_VM_BUGGED); + kvm_vm_dead(kvm); } + #define KVM_BUG(cond, kvm, fmt...) \ ({ \ int __ret = (cond); \ diff --git a/include/linux/lsm_hook_defs.h b/include/linux/lsm_hook_defs.h index 442a611fa0fb..df8de62f4710 100644 --- a/include/linux/lsm_hook_defs.h +++ b/include/linux/lsm_hook_defs.h @@ -335,8 +335,6 @@ LSM_HOOK(int, 0, sctp_bind_connect, struct sock *sk, int optname, struct sockaddr *address, int addrlen) LSM_HOOK(void, LSM_RET_VOID, sctp_sk_clone, struct sctp_association *asoc, struct sock *sk, struct sock *newsk) -LSM_HOOK(void, LSM_RET_VOID, sctp_assoc_established, struct sctp_association *asoc, - struct sk_buff *skb) #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/linux/lsm_hooks.h b/include/linux/lsm_hooks.h index d6823214d5c1..d45b6f6e27fd 100644 --- a/include/linux/lsm_hooks.h +++ b/include/linux/lsm_hooks.h @@ -1050,11 +1050,6 @@ * @asoc pointer to current sctp association structure. * @sk pointer to current sock structure. * @newsk pointer to new sock structure. - * @sctp_assoc_established: - * Passes the @asoc and @chunk->skb of the association COOKIE_ACK packet - * to the security module. - * @asoc pointer to sctp association structure. - * @skb pointer to skbuff of association packet. * * Security hooks for Infiniband * diff --git a/include/linux/netfs.h b/include/linux/netfs.h index 12c4177f7703..ca0683b9e3d1 100644 --- a/include/linux/netfs.h +++ b/include/linux/netfs.h @@ -166,13 +166,13 @@ struct netfs_read_request { short error; /* 0 or error that occurred */ loff_t i_size; /* Size of the file */ loff_t start; /* Start position */ - pgoff_t no_unlock_page; /* Don't unlock this page after read */ + pgoff_t no_unlock_folio; /* Don't unlock this folio after read */ refcount_t usage; unsigned long flags; #define NETFS_RREQ_INCOMPLETE_IO 0 /* Some ioreqs terminated short or with error */ #define NETFS_RREQ_WRITE_TO_CACHE 1 /* Need to write to the cache */ -#define NETFS_RREQ_NO_UNLOCK_PAGE 2 /* Don't unlock no_unlock_page on completion */ -#define NETFS_RREQ_DONT_UNLOCK_PAGES 3 /* Don't unlock the pages on completion */ +#define NETFS_RREQ_NO_UNLOCK_FOLIO 2 /* Don't unlock no_unlock_folio on completion */ +#define NETFS_RREQ_DONT_UNLOCK_FOLIOS 3 /* Don't unlock the folios on completion */ #define NETFS_RREQ_FAILED 4 /* The request failed */ #define NETFS_RREQ_IN_PROGRESS 5 /* Unlocked when the request completes */ const struct netfs_read_request_ops *netfs_ops; @@ -190,7 +190,7 @@ struct netfs_read_request_ops { void (*issue_op)(struct netfs_read_subrequest *subreq); bool (*is_still_valid)(struct netfs_read_request *rreq); int (*check_write_begin)(struct file *file, loff_t pos, unsigned len, - struct page *page, void **_fsdata); + struct folio *folio, void **_fsdata); void (*done)(struct netfs_read_request *rreq); void (*cleanup)(struct address_space *mapping, void *netfs_priv); }; @@ -240,11 +240,11 @@ extern void netfs_readahead(struct readahead_control *, const struct netfs_read_request_ops *, void *); extern int netfs_readpage(struct file *, - struct page *, + struct folio *, const struct netfs_read_request_ops *, void *); extern int netfs_write_begin(struct file *, struct address_space *, - loff_t, unsigned int, unsigned int, struct page **, + loff_t, unsigned int, unsigned int, struct folio **, void **, const struct netfs_read_request_ops *, void *); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 18da82ac820e..4a6c91b18326 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -253,6 +253,20 @@ static inline struct address_space *page_mapping_file(struct page *page) return folio_mapping(folio); } +/** + * folio_inode - Get the host inode for this folio. + * @folio: The folio. + * + * For folios which are in the page cache, return the inode that this folio + * belongs to. + * + * Do not call this for folios which aren't in the page cache. + */ +static inline struct inode *folio_inode(struct folio *folio) +{ + return folio->mapping->host; +} + static inline bool page_cache_add_speculative(struct page *page, int count) { VM_BUG_ON_PAGE(PageTail(page), page); @@ -279,6 +293,25 @@ static inline void folio_attach_private(struct folio *folio, void *data) folio_set_private(folio); } +/** + * folio_change_private - Change private data on a folio. + * @folio: Folio to change the data on. + * @data: Data to set on the folio. + * + * Change the private data attached to a folio and return the old + * data. The page must previously have had data attached and the data + * must be detached before the folio will be freed. + * + * Return: Data that was previously attached to the folio. + */ +static inline void *folio_change_private(struct folio *folio, void *data) +{ + void *old = folio_get_private(folio); + + folio->private = data; + return old; +} + /** * folio_detach_private - Detach private data from a folio. * @folio: Folio to detach data from. diff --git a/include/linux/pci.h b/include/linux/pci.h index 138d764c1c35..0b9d58098c8b 100644 --- a/include/linux/pci.h +++ b/include/linux/pci.h @@ -1666,6 +1666,7 @@ void pci_cfg_access_lock(struct pci_dev *dev); bool pci_cfg_access_trylock(struct pci_dev *dev); void pci_cfg_access_unlock(struct pci_dev *dev); +void pci_dev_lock(struct pci_dev *dev); int pci_dev_trylock(struct pci_dev *dev); void pci_dev_unlock(struct pci_dev *dev); diff --git a/include/linux/security.h b/include/linux/security.h index 06eac4e61a13..bbf44a466832 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -1430,8 +1430,6 @@ int security_sctp_bind_connect(struct sock *sk, int optname, struct sockaddr *address, int addrlen); void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, struct sock *newsk); -void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb); #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct sock *sock, @@ -1651,11 +1649,6 @@ static inline void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *newsk) { } - -static inline void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ -} #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h index bca73e8c8cde..499f5fabd20f 100644 --- a/include/trace/events/afs.h +++ b/include/trace/events/afs.h @@ -1016,31 +1016,32 @@ TRACE_EVENT(afs_dir_check_failed, __entry->vnode, __entry->off, __entry->i_size) ); -TRACE_EVENT(afs_page_dirty, - TP_PROTO(struct afs_vnode *vnode, const char *where, struct page *page), +TRACE_EVENT(afs_folio_dirty, + TP_PROTO(struct afs_vnode *vnode, const char *where, struct folio *folio), - TP_ARGS(vnode, where, page), + TP_ARGS(vnode, where, folio), TP_STRUCT__entry( __field(struct afs_vnode *, vnode ) __field(const char *, where ) - __field(pgoff_t, page ) + __field(pgoff_t, index ) __field(unsigned long, from ) __field(unsigned long, to ) ), TP_fast_assign( + unsigned long priv = (unsigned long)folio_get_private(folio); __entry->vnode = vnode; __entry->where = where; - __entry->page = page->index; - __entry->from = afs_page_dirty_from(page, page->private); - __entry->to = afs_page_dirty_to(page, page->private); - __entry->to |= (afs_is_page_dirty_mmapped(page->private) ? - (1UL << (BITS_PER_LONG - 1)) : 0); + __entry->index = folio_index(folio); + __entry->from = afs_folio_dirty_from(folio, priv); + __entry->to = afs_folio_dirty_to(folio, priv); + __entry->to |= (afs_is_folio_dirty_mmapped(priv) ? + (1UL << (BITS_PER_LONG - 1)) : 0); ), TP_printk("vn=%p %lx %s %lx-%lx%s", - __entry->vnode, __entry->page, __entry->where, + __entry->vnode, __entry->index, __entry->where, __entry->from, __entry->to & ~(1UL << (BITS_PER_LONG - 1)), __entry->to & (1UL << (BITS_PER_LONG - 1)) ? " M" : "") diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 4e881d91c874..f8cb916f3595 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -570,9 +570,10 @@ TRACE_EVENT(f2fs_file_write_iter, ); TRACE_EVENT(f2fs_map_blocks, - TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, int ret), + TP_PROTO(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag, int ret), - TP_ARGS(inode, map, ret), + TP_ARGS(inode, map, create, flag, ret), TP_STRUCT__entry( __field(dev_t, dev) @@ -583,11 +584,14 @@ TRACE_EVENT(f2fs_map_blocks, __field(unsigned int, m_flags) __field(int, m_seg_type) __field(bool, m_may_create) + __field(bool, m_multidev_dio) + __field(int, create) + __field(int, flag) __field(int, ret) ), TP_fast_assign( - __entry->dev = inode->i_sb->s_dev; + __entry->dev = map->m_bdev->bd_dev; __entry->ino = inode->i_ino; __entry->m_lblk = map->m_lblk; __entry->m_pblk = map->m_pblk; @@ -595,12 +599,16 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_flags = map->m_flags; __entry->m_seg_type = map->m_seg_type; __entry->m_may_create = map->m_may_create; + __entry->m_multidev_dio = map->m_multidev_dio; + __entry->create = create; + __entry->flag = flag; __entry->ret = ret; ), TP_printk("dev = (%d,%d), ino = %lu, file offset = %llu, " - "start blkaddr = 0x%llx, len = 0x%llx, flags = %u," - "seg_type = %d, may_create = %d, err = %d", + "start blkaddr = 0x%llx, len = 0x%llx, flags = %u, " + "seg_type = %d, may_create = %d, multidevice = %d, " + "create = %d, flag = %d, err = %d", show_dev_ino(__entry), (unsigned long long)__entry->m_lblk, (unsigned long long)__entry->m_pblk, @@ -608,6 +616,9 @@ TRACE_EVENT(f2fs_map_blocks, __entry->m_flags, __entry->m_seg_type, __entry->m_may_create, + __entry->m_multidev_dio, + __entry->create, + __entry->flag, __entry->ret) ); @@ -807,20 +818,20 @@ TRACE_EVENT(f2fs_lookup_start, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(const char *, name) + __string(name, dentry->d_name.name) __field(unsigned int, flags) ), TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->flags = flags; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, flags:%u", show_dev_ino(__entry), - __entry->name, + __get_str(name), __entry->flags) ); @@ -834,7 +845,7 @@ TRACE_EVENT(f2fs_lookup_end, TP_STRUCT__entry( __field(dev_t, dev) __field(ino_t, ino) - __field(const char *, name) + __string(name, dentry->d_name.name) __field(nid_t, cino) __field(int, err) ), @@ -842,14 +853,14 @@ TRACE_EVENT(f2fs_lookup_end, TP_fast_assign( __entry->dev = dir->i_sb->s_dev; __entry->ino = dir->i_ino; - __entry->name = dentry->d_name.name; + __assign_str(name, dentry->d_name.name); __entry->cino = ino; __entry->err = err; ), TP_printk("dev = (%d,%d), pino = %lu, name:%s, ino:%u, err:%d", show_dev_ino(__entry), - __entry->name, + __get_str(name), __entry->cino, __entry->err) ); diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 78f0719cc2a3..1daa45268de2 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -1130,6 +1130,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_BINARY_STATS_FD 203 #define KVM_CAP_EXIT_ON_EMULATION_FAILURE 204 #define KVM_CAP_ARM_MTE 205 +#define KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM 206 #ifdef KVM_CAP_IRQ_ROUTING diff --git a/kernel/trace/trace_events_hist.c b/kernel/trace/trace_events_hist.c index 0abc9a413b4d..8a10046c775f 100644 --- a/kernel/trace/trace_events_hist.c +++ b/kernel/trace/trace_events_hist.c @@ -1953,9 +1953,10 @@ static struct hist_field *create_hist_field(struct hist_trigger_data *hist_data, if (!hist_field->type) goto free; - if (field->filter_type == FILTER_STATIC_STRING) + if (field->filter_type == FILTER_STATIC_STRING) { hist_field->fn = hist_field_string; - else if (field->filter_type == FILTER_DYN_STRING) + hist_field->size = field->size; + } else if (field->filter_type == FILTER_DYN_STRING) hist_field->fn = hist_field_dynstring; else hist_field->fn = hist_field_pstring; @@ -2580,7 +2581,8 @@ static struct hist_field *parse_expr(struct hist_trigger_data *hist_data, operand1_str = str; str = sep+1; - if (!operand1_str || !str) + /* Binary operator requires both operands */ + if (*operand1_str == '\0' || *str == '\0') goto free; operand_flags = 0; @@ -3025,7 +3027,7 @@ static inline void __update_field_vars(struct tracing_map_elt *elt, char *str = elt_data->field_var_str[j++]; char *val_str = (char *)(uintptr_t)var_val; - strscpy(str, val_str, STR_VAR_LEN_MAX); + strscpy(str, val_str, val->size); var_val = (u64)(uintptr_t)str; } tracing_map_set_var(elt, var_idx, var_val); @@ -4920,7 +4922,7 @@ static void hist_trigger_elt_update(struct hist_trigger_data *hist_data, str = elt_data->field_var_str[idx]; val_str = (char *)(uintptr_t)hist_val; - strscpy(str, val_str, STR_VAR_LEN_MAX); + strscpy(str, val_str, hist_field->size); hist_val = (u64)(uintptr_t)str; } diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c index 3e4a1651e329..7520d43aed55 100644 --- a/kernel/trace/trace_osnoise.c +++ b/kernel/trace/trace_osnoise.c @@ -55,7 +55,8 @@ struct osnoise_instance { struct list_head list; struct trace_array *tr; }; -struct list_head osnoise_instances; + +static struct list_head osnoise_instances; static bool osnoise_has_registered_instances(void) { diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 2d498bb62248..a613f8ef6a02 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -2967,7 +2967,7 @@ EXPORT_SYMBOL_GPL(folio_wait_writeback_killable); */ void folio_wait_stable(struct folio *folio) { - if (folio->mapping->host->i_sb->s_iflags & SB_I_STABLE_WRITES) + if (folio_inode(folio)->i_sb->s_iflags & SB_I_STABLE_WRITES) folio_wait_writeback(folio); } EXPORT_SYMBOL_GPL(folio_wait_stable); diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 013cbdb6cfe2..6a6898ee4049 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -1153,12 +1153,11 @@ static int build_initial_monmap(struct ceph_mon_client *monc) int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) { - int err = 0; + int err; dout("init\n"); memset(monc, 0, sizeof(*monc)); monc->client = cl; - monc->monmap = NULL; mutex_init(&monc->mutex); err = build_initial_monmap(monc); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index ff8624a7c964..1c5815530e0d 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -5310,14 +5310,14 @@ void ceph_osdc_stop(struct ceph_osd_client *osdc) ceph_msgpool_destroy(&osdc->msgpool_op_reply); } -static int osd_req_op_copy_from_init(struct ceph_osd_request *req, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags) +int osd_req_op_copy_from_init(struct ceph_osd_request *req, + u64 src_snapid, u64 src_version, + struct ceph_object_id *src_oid, + struct ceph_object_locator *src_oloc, + u32 src_fadvise_flags, + u32 dst_fadvise_flags, + u32 truncate_seq, u64 truncate_size, + u8 copy_from_flags) { struct ceph_osd_req_op *op; struct page **pages; @@ -5346,49 +5346,7 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, op->indata_len, 0, false, true); return 0; } - -int ceph_osdc_copy_from(struct ceph_osd_client *osdc, - u64 src_snapid, u64 src_version, - struct ceph_object_id *src_oid, - struct ceph_object_locator *src_oloc, - u32 src_fadvise_flags, - struct ceph_object_id *dst_oid, - struct ceph_object_locator *dst_oloc, - u32 dst_fadvise_flags, - u32 truncate_seq, u64 truncate_size, - u8 copy_from_flags) -{ - struct ceph_osd_request *req; - int ret; - - req = ceph_osdc_alloc_request(osdc, NULL, 1, false, GFP_KERNEL); - if (!req) - return -ENOMEM; - - req->r_flags = CEPH_OSD_FLAG_WRITE; - - ceph_oloc_copy(&req->r_t.base_oloc, dst_oloc); - ceph_oid_copy(&req->r_t.base_oid, dst_oid); - - ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, - src_oloc, src_fadvise_flags, - dst_fadvise_flags, truncate_seq, - truncate_size, copy_from_flags); - if (ret) - goto out; - - ret = ceph_osdc_alloc_messages(req, GFP_KERNEL); - if (ret) - goto out; - - ceph_osdc_start_request(osdc, req, false); - ret = ceph_osdc_wait_request(osdc, req); - -out: - ceph_osdc_put_request(req); - return ret; -} -EXPORT_SYMBOL(ceph_osdc_copy_from); +EXPORT_SYMBOL(osd_req_op_copy_from_init); int __init ceph_osdc_setup(void) { diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 39ba82ee87ce..354c1c4de19b 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -946,7 +946,7 @@ enum sctp_disposition sctp_sf_do_5_1E_ca(struct net *net, sctp_add_cmd_sf(commands, SCTP_CMD_INIT_COUNTER_RESET, SCTP_NULL()); /* Set peer label for connection. */ - security_sctp_assoc_established((struct sctp_association *)asoc, chunk->skb); + security_inet_conn_established(ep->base.sk, chunk->skb); /* RFC 2960 5.1 Normal Establishment of an Association * diff --git a/scripts/coccinelle/misc/do_div.cocci b/scripts/coccinelle/misc/do_div.cocci new file mode 100644 index 000000000000..79db083c5208 --- /dev/null +++ b/scripts/coccinelle/misc/do_div.cocci @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-only +/// do_div() does a 64-by-32 division. +/// When the divisor is long, unsigned long, u64, or s64, +/// do_div() truncates it to 32 bits, this means it can test +/// non-zero and be truncated to 0 for division on 64bit platforms. +/// +//# This makes an effort to find those inappropriate do_div() calls. +// +// Confidence: Moderate +// Copyright: (C) 2020 Wen Yang, Alibaba. +// Comments: +// Options: --no-includes --include-headers + +virtual context +virtual org +virtual report + +@initialize:python@ +@@ + +def get_digit_type_and_value(str): + is_digit = False + value = 0 + + try: + if (str.isdigit()): + is_digit = True + value = int(str, 0) + elif (str.upper().endswith('ULL')): + is_digit = True + value = int(str[:-3], 0) + elif (str.upper().endswith('LL')): + is_digit = True + value = int(str[:-2], 0) + elif (str.upper().endswith('UL')): + is_digit = True + value = int(str[:-2], 0) + elif (str.upper().endswith('L')): + is_digit = True + value = int(str[:-1], 0) + elif (str.upper().endswith('U')): + is_digit = True + value = int(str[:-1], 0) + except Exception as e: + print('Error:',e) + is_digit = False + value = 0 + finally: + return is_digit, value + +def filter_out_safe_constants(str): + is_digit, value = get_digit_type_and_value(str) + if (is_digit): + if (value >= 0x100000000): + return True + else: + return False + else: + return True + +def construct_warnings(suggested_fun): + msg="WARNING: do_div() does a 64-by-32 division, please consider using %s instead." + return msg % suggested_fun + +@depends on context@ +expression f; +long l: script:python() { filter_out_safe_constants(l) }; +unsigned long ul : script:python() { filter_out_safe_constants(ul) }; +u64 ul64 : script:python() { filter_out_safe_constants(ul64) }; +s64 sl64 : script:python() { filter_out_safe_constants(sl64) }; + +@@ +( +* do_div(f, l); +| +* do_div(f, ul); +| +* do_div(f, ul64); +| +* do_div(f, sl64); +) + +@r depends on (org || report)@ +expression f; +position p; +long l: script:python() { filter_out_safe_constants(l) }; +unsigned long ul : script:python() { filter_out_safe_constants(ul) }; +u64 ul64 : script:python() { filter_out_safe_constants(ul64) }; +s64 sl64 : script:python() { filter_out_safe_constants(sl64) }; +@@ +( +do_div@p(f, l); +| +do_div@p(f, ul); +| +do_div@p(f, ul64); +| +do_div@p(f, sl64); +) + +@script:python depends on org@ +p << r.p; +ul << r.ul; +@@ + +coccilib.org.print_todo(p[0], construct_warnings("div64_ul")) + +@script:python depends on org@ +p << r.p; +l << r.l; +@@ + +coccilib.org.print_todo(p[0], construct_warnings("div64_long")) + +@script:python depends on org@ +p << r.p; +ul64 << r.ul64; +@@ + +coccilib.org.print_todo(p[0], construct_warnings("div64_u64")) + +@script:python depends on org@ +p << r.p; +sl64 << r.sl64; +@@ + +coccilib.org.print_todo(p[0], construct_warnings("div64_s64")) + +@script:python depends on report@ +p << r.p; +ul << r.ul; +@@ + +coccilib.report.print_report(p[0], construct_warnings("div64_ul")) + +@script:python depends on report@ +p << r.p; +l << r.l; +@@ + +coccilib.report.print_report(p[0], construct_warnings("div64_long")) + +@script:python depends on report@ +p << r.p; +sl64 << r.sl64; +@@ + +coccilib.report.print_report(p[0], construct_warnings("div64_s64")) + +@script:python depends on report@ +p << r.p; +ul64 << r.ul64; +@@ + +coccilib.report.print_report(p[0], construct_warnings("div64_u64")) diff --git a/scripts/remove-stale-files b/scripts/remove-stale-files index c3eb81c3f7de..0114c41e6938 100755 --- a/scripts/remove-stale-files +++ b/scripts/remove-stale-files @@ -28,4 +28,9 @@ if [ -n "${building_out_of_srctree}" ]; then do rm -f arch/arm/boot/compressed/${f} done + + for f in uart-ath79.c ashldi3.c bswapdi.c bswapsi.c + do + rm -f arch/mips/boot/compressed/${f} + done fi diff --git a/security/security.c b/security/security.c index 779a9edea0a0..c88167a414b4 100644 --- a/security/security.c +++ b/security/security.c @@ -2388,13 +2388,6 @@ void security_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk, } EXPORT_SYMBOL(security_sctp_sk_clone); -void security_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ - call_void_hook(sctp_assoc_established, asoc, skb); -} -EXPORT_SYMBOL(security_sctp_assoc_established); - #endif /* CONFIG_SECURITY_NETWORK */ #ifdef CONFIG_SECURITY_INFINIBAND diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 5e5215fe2e83..62d30c0a30c2 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -5502,8 +5502,7 @@ static void selinux_sctp_sk_clone(struct sctp_association *asoc, struct sock *sk if (!selinux_policycap_extsockclass()) return selinux_sk_clone_security(sk, newsk); - if (asoc->secid != SECSID_WILD) - newsksec->sid = asoc->secid; + newsksec->sid = asoc->secid; newsksec->peer_sid = asoc->peer_secid; newsksec->sclass = sksec->sclass; selinux_netlbl_sctp_sk_clone(sk, newsk); @@ -5559,16 +5558,6 @@ static void selinux_inet_conn_established(struct sock *sk, struct sk_buff *skb) selinux_skb_peerlbl_sid(skb, family, &sksec->peer_sid); } -static void selinux_sctp_assoc_established(struct sctp_association *asoc, - struct sk_buff *skb) -{ - struct sk_security_struct *sksec = asoc->base.sk->sk_security; - - selinux_inet_conn_established(asoc->base.sk, skb); - asoc->peer_secid = sksec->peer_sid; - asoc->secid = SECSID_WILD; -} - static int selinux_secmark_relabel_packet(u32 sid) { const struct task_security_struct *__tsec; @@ -7239,7 +7228,6 @@ static struct security_hook_list selinux_hooks[] __lsm_ro_after_init = { LSM_HOOK_INIT(sctp_assoc_request, selinux_sctp_assoc_request), LSM_HOOK_INIT(sctp_sk_clone, selinux_sctp_sk_clone), LSM_HOOK_INIT(sctp_bind_connect, selinux_sctp_bind_connect), - LSM_HOOK_INIT(sctp_assoc_established, selinux_sctp_assoc_established), LSM_HOOK_INIT(inet_conn_request, selinux_inet_conn_request), LSM_HOOK_INIT(inet_csk_clone, selinux_inet_csk_clone), LSM_HOOK_INIT(inet_conn_established, selinux_inet_conn_established), diff --git a/tools/testing/selftests/kvm/Makefile b/tools/testing/selftests/kvm/Makefile index c23e89dea0b6..c4e34717826a 100644 --- a/tools/testing/selftests/kvm/Makefile +++ b/tools/testing/selftests/kvm/Makefile @@ -73,7 +73,8 @@ TEST_GEN_PROGS_x86_64 += x86_64/tsc_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/vmx_pmu_msrs_test TEST_GEN_PROGS_x86_64 += x86_64/xen_shinfo_test TEST_GEN_PROGS_x86_64 += x86_64/xen_vmcall_test -TEST_GEN_PROGS_x86_64 += access_tracking_perf_test +TEST_GEN_PROGS_x86_64 += x86_64/vmx_pi_mmio_test +TEST_GEN_PROGS_x86_64 += x86_64/sev_migrate_tests TEST_GEN_PROGS_x86_64 += demand_paging_test TEST_GEN_PROGS_x86_64 += dirty_log_test TEST_GEN_PROGS_x86_64 += dirty_log_perf_test diff --git a/tools/testing/selftests/kvm/include/kvm_util.h b/tools/testing/selftests/kvm/include/kvm_util.h index f6b3794f306b..6a1a37f30494 100644 --- a/tools/testing/selftests/kvm/include/kvm_util.h +++ b/tools/testing/selftests/kvm/include/kvm_util.h @@ -82,6 +82,7 @@ struct vm_guest_mode_params { }; extern const struct vm_guest_mode_params vm_guest_mode_params[]; +int open_path_or_exit(const char *path, int flags); int open_kvm_dev_path_or_exit(void); int kvm_check_cap(long cap); int vm_enable_cap(struct kvm_vm *vm, struct kvm_enable_cap *cap); diff --git a/tools/testing/selftests/kvm/include/x86_64/svm_util.h b/tools/testing/selftests/kvm/include/x86_64/svm_util.h index b7531c83b8ae..587fbe408b99 100644 --- a/tools/testing/selftests/kvm/include/x86_64/svm_util.h +++ b/tools/testing/selftests/kvm/include/x86_64/svm_util.h @@ -46,4 +46,6 @@ static inline bool cpu_has_svm(void) return ecx & CPUID_SVM; } +int open_sev_dev_path_or_exit(void); + #endif /* SELFTEST_KVM_SVM_UTILS_H */ diff --git a/tools/testing/selftests/kvm/lib/kvm_util.c b/tools/testing/selftests/kvm/lib/kvm_util.c index 041004c0fda7..14bb4d5b6bb7 100644 --- a/tools/testing/selftests/kvm/lib/kvm_util.c +++ b/tools/testing/selftests/kvm/lib/kvm_util.c @@ -31,6 +31,19 @@ static void *align(void *x, size_t size) return (void *) (((size_t) x + mask) & ~mask); } +int open_path_or_exit(const char *path, int flags) +{ + int fd; + + fd = open(path, flags); + if (fd < 0) { + print_skip("%s not available (errno: %d)", path, errno); + exit(KSFT_SKIP); + } + + return fd; +} + /* * Open KVM_DEV_PATH if available, otherwise exit the entire program. * @@ -42,16 +55,7 @@ static void *align(void *x, size_t size) */ static int _open_kvm_dev_path_or_exit(int flags) { - int fd; - - fd = open(KVM_DEV_PATH, flags); - if (fd < 0) { - print_skip("%s not available, is KVM loaded? (errno: %d)", - KVM_DEV_PATH, errno); - exit(KSFT_SKIP); - } - - return fd; + return open_path_or_exit(KVM_DEV_PATH, flags); } int open_kvm_dev_path_or_exit(void) diff --git a/tools/testing/selftests/kvm/lib/x86_64/svm.c b/tools/testing/selftests/kvm/lib/x86_64/svm.c index 161eba7cd128..0ebc03ce079c 100644 --- a/tools/testing/selftests/kvm/lib/x86_64/svm.c +++ b/tools/testing/selftests/kvm/lib/x86_64/svm.c @@ -13,6 +13,8 @@ #include "processor.h" #include "svm_util.h" +#define SEV_DEV_PATH "/dev/sev" + struct gpr64_regs guest_regs; u64 rflags; @@ -172,3 +174,14 @@ void nested_svm_check_supported(void) exit(KSFT_SKIP); } } + +/* + * Open SEV_DEV_PATH if available, otherwise exit the entire program. + * + * Return: + * The opened file descriptor of /dev/sev. + */ +int open_sev_dev_path_or_exit(void) +{ + return open_path_or_exit(SEV_DEV_PATH, 0); +} diff --git a/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c new file mode 100644 index 000000000000..5ba325cd64bf --- /dev/null +++ b/tools/testing/selftests/kvm/x86_64/sev_migrate_tests.c @@ -0,0 +1,203 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include +#include +#include +#include +#include +#include +#include + +#include "test_util.h" +#include "kvm_util.h" +#include "processor.h" +#include "svm_util.h" +#include "kselftest.h" +#include "../lib/kvm_util_internal.h" + +#define SEV_POLICY_ES 0b100 + +#define NR_MIGRATE_TEST_VCPUS 4 +#define NR_MIGRATE_TEST_VMS 3 +#define NR_LOCK_TESTING_THREADS 3 +#define NR_LOCK_TESTING_ITERATIONS 10000 + +static void sev_ioctl(int vm_fd, int cmd_id, void *data) +{ + struct kvm_sev_cmd cmd = { + .id = cmd_id, + .data = (uint64_t)data, + .sev_fd = open_sev_dev_path_or_exit(), + }; + int ret; + + ret = ioctl(vm_fd, KVM_MEMORY_ENCRYPT_OP, &cmd); + TEST_ASSERT((ret == 0 || cmd.error == SEV_RET_SUCCESS), + "%d failed: return code: %d, errno: %d, fw error: %d", + cmd_id, ret, errno, cmd.error); +} + +static struct kvm_vm *sev_vm_create(bool es) +{ + struct kvm_vm *vm; + struct kvm_sev_launch_start start = { 0 }; + int i; + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + sev_ioctl(vm->fd, es ? KVM_SEV_ES_INIT : KVM_SEV_INIT, NULL); + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + vm_vcpu_add(vm, i); + if (es) + start.policy |= SEV_POLICY_ES; + sev_ioctl(vm->fd, KVM_SEV_LAUNCH_START, &start); + if (es) + sev_ioctl(vm->fd, KVM_SEV_LAUNCH_UPDATE_VMSA, NULL); + return vm; +} + +static struct kvm_vm *__vm_create(void) +{ + struct kvm_vm *vm; + int i; + + vm = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + for (i = 0; i < NR_MIGRATE_TEST_VCPUS; ++i) + vm_vcpu_add(vm, i); + + return vm; +} + +static int __sev_migrate_from(int dst_fd, int src_fd) +{ + struct kvm_enable_cap cap = { + .cap = KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM, + .args = { src_fd } + }; + + return ioctl(dst_fd, KVM_ENABLE_CAP, &cap); +} + + +static void sev_migrate_from(int dst_fd, int src_fd) +{ + int ret; + + ret = __sev_migrate_from(dst_fd, src_fd); + TEST_ASSERT(!ret, "Migration failed, ret: %d, errno: %d\n", ret, errno); +} + +static void test_sev_migrate_from(bool es) +{ + struct kvm_vm *src_vm; + struct kvm_vm *dst_vms[NR_MIGRATE_TEST_VMS]; + int i; + + src_vm = sev_vm_create(es); + for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) + dst_vms[i] = __vm_create(); + + /* Initial migration from the src to the first dst. */ + sev_migrate_from(dst_vms[0]->fd, src_vm->fd); + + for (i = 1; i < NR_MIGRATE_TEST_VMS; i++) + sev_migrate_from(dst_vms[i]->fd, dst_vms[i - 1]->fd); + + /* Migrate the guest back to the original VM. */ + sev_migrate_from(src_vm->fd, dst_vms[NR_MIGRATE_TEST_VMS - 1]->fd); + + kvm_vm_free(src_vm); + for (i = 0; i < NR_MIGRATE_TEST_VMS; ++i) + kvm_vm_free(dst_vms[i]); +} + +struct locking_thread_input { + struct kvm_vm *vm; + int source_fds[NR_LOCK_TESTING_THREADS]; +}; + +static void *locking_test_thread(void *arg) +{ + int i, j; + struct locking_thread_input *input = (struct locking_thread_input *)arg; + + for (i = 0; i < NR_LOCK_TESTING_ITERATIONS; ++i) { + j = i % NR_LOCK_TESTING_THREADS; + __sev_migrate_from(input->vm->fd, input->source_fds[j]); + } + + return NULL; +} + +static void test_sev_migrate_locking(void) +{ + struct locking_thread_input input[NR_LOCK_TESTING_THREADS]; + pthread_t pt[NR_LOCK_TESTING_THREADS]; + int i; + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) { + input[i].vm = sev_vm_create(/* es= */ false); + input[0].source_fds[i] = input[i].vm->fd; + } + for (i = 1; i < NR_LOCK_TESTING_THREADS; ++i) + memcpy(input[i].source_fds, input[0].source_fds, + sizeof(input[i].source_fds)); + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + pthread_create(&pt[i], NULL, locking_test_thread, &input[i]); + + for (i = 0; i < NR_LOCK_TESTING_THREADS; ++i) + pthread_join(pt[i], NULL); +} + +static void test_sev_migrate_parameters(void) +{ + struct kvm_vm *sev_vm, *sev_es_vm, *vm_no_vcpu, *vm_no_sev, + *sev_es_vm_no_vmsa; + int ret; + + sev_vm = sev_vm_create(/* es= */ false); + sev_es_vm = sev_vm_create(/* es= */ true); + vm_no_vcpu = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + vm_no_sev = __vm_create(); + sev_es_vm_no_vmsa = vm_create(VM_MODE_DEFAULT, 0, O_RDWR); + sev_ioctl(sev_es_vm_no_vmsa->fd, KVM_SEV_ES_INIT, NULL); + vm_vcpu_add(sev_es_vm_no_vmsa, 1); + + + ret = __sev_migrate_from(sev_vm->fd, sev_es_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able migrate to SEV enabled VM. ret: %d, errno: %d\n", + ret, errno); + + ret = __sev_migrate_from(sev_es_vm->fd, sev_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "Should not be able migrate to SEV-ES enabled VM. ret: %d, errno: %d\n", + ret, errno); + + ret = __sev_migrate_from(vm_no_vcpu->fd, sev_es_vm->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV-ES migrations require same number of vCPUS. ret: %d, errno: %d\n", + ret, errno); + + ret = __sev_migrate_from(vm_no_vcpu->fd, sev_es_vm_no_vmsa->fd); + TEST_ASSERT( + ret == -1 && errno == EINVAL, + "SEV-ES migrations require UPDATE_VMSA. ret %d, errno: %d\n", + ret, errno); + + ret = __sev_migrate_from(vm_no_vcpu->fd, vm_no_sev->fd); + TEST_ASSERT(ret == -1 && errno == EINVAL, + "Migrations require SEV enabled. ret %d, errno: %d\n", ret, + errno); +} + +int main(int argc, char *argv[]) +{ + test_sev_migrate_from(/* es= */ false); + test_sev_migrate_from(/* es= */ true); + test_sev_migrate_locking(); + test_sev_migrate_parameters(); + return 0; +} diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index 3f6d450355f0..d31724500501 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -3747,7 +3747,7 @@ static long kvm_vcpu_ioctl(struct file *filp, struct kvm_fpu *fpu = NULL; struct kvm_sregs *kvm_sregs = NULL; - if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) return -EIO; if (unlikely(_IOC_TYPE(ioctl) != KVMIO)) @@ -3957,7 +3957,7 @@ static long kvm_vcpu_compat_ioctl(struct file *filp, void __user *argp = compat_ptr(arg); int r; - if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_bugged) + if (vcpu->kvm->mm != current->mm || vcpu->kvm->vm_dead) return -EIO; switch (ioctl) { @@ -4023,7 +4023,7 @@ static long kvm_device_ioctl(struct file *filp, unsigned int ioctl, { struct kvm_device *dev = filp->private_data; - if (dev->kvm->mm != current->mm || dev->kvm->vm_bugged) + if (dev->kvm->mm != current->mm || dev->kvm->vm_dead) return -EIO; switch (ioctl) { @@ -4345,7 +4345,7 @@ static long kvm_vm_ioctl(struct file *filp, void __user *argp = (void __user *)arg; int r; - if (kvm->mm != current->mm || kvm->vm_bugged) + if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; switch (ioctl) { case KVM_CREATE_VCPU: @@ -4556,7 +4556,7 @@ static long kvm_vm_compat_ioctl(struct file *filp, struct kvm *kvm = filp->private_data; int r; - if (kvm->mm != current->mm || kvm->vm_bugged) + if (kvm->mm != current->mm || kvm->vm_dead) return -EIO; switch (ioctl) { #ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT