diff --git a/Documentation/ABI/testing/ima_policy b/Documentation/ABI/testing/ima_policy index bc8e1cbe5e61..070779e8d836 100644 --- a/Documentation/ABI/testing/ima_policy +++ b/Documentation/ABI/testing/ima_policy @@ -29,7 +29,7 @@ Description: option: [[appraise_type=]] [template=] [permit_directio] [appraise_flag=] [keyrings=] base: - func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK]MODULE_CHECK] + func:= [BPRM_CHECK][MMAP_CHECK][CREDS_CHECK][FILE_CHECK][MODULE_CHECK] [FIRMWARE_CHECK] [KEXEC_KERNEL_CHECK] [KEXEC_INITRAMFS_CHECK] [KEXEC_CMDLINE] [KEY_CHECK] [CRITICAL_DATA] diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 9acc57f68f31..64c62b979f2f 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2098,7 +2098,7 @@ If the program returns 0, the attempt fails with -EPERM, otherwise it succeeds. An example of BPF_CGROUP_DEVICE program may be found in the kernel -source tree in the tools/testing/selftests/bpf/dev_cgroup.c file. +source tree in the tools/testing/selftests/bpf/progs/dev_cgroup.c file. RDMA diff --git a/Documentation/admin-guide/cifs/authors.rst b/Documentation/admin-guide/cifs/authors.rst index b02d6dd6c070..5c1d2f0fa7d1 100644 --- a/Documentation/admin-guide/cifs/authors.rst +++ b/Documentation/admin-guide/cifs/authors.rst @@ -5,10 +5,10 @@ Authors Original Author --------------- -Steve French (sfrench@samba.org) +Steve French (smfrench@gmail.com, sfrench@samba.org) The author wishes to express his appreciation and thanks to: -Andrew Tridgell (Samba team) for his early suggestions about smb/cifs VFS +Andrew Tridgell (Samba team) for his early suggestions about SMB/CIFS VFS improvements. Thanks to IBM for allowing me time and test resources to pursue this project, to Jim McDonough from IBM (and the Samba Team) for his help, to the IBM Linux JFS team for explaining many esoteric Linux filesystem features. @@ -51,7 +51,7 @@ Patch Contributors - Ronnie Sahlberg (for SMB3 xattr work, bug fixes, and lots of great work on compounding) - Shirish Pargaonkar (for many ACL patches over the years) - Sachin Prabhu (many bug fixes, including for reconnect, copy offload and security) -- Paulo Alcantara +- Paulo Alcantara (for some excellent work in DFS, and in booting from SMB3) - Long Li (some great work on RDMA, SMB Direct) diff --git a/Documentation/admin-guide/cifs/changes.rst b/Documentation/admin-guide/cifs/changes.rst index 71f2ecb62299..3147bbae9c43 100644 --- a/Documentation/admin-guide/cifs/changes.rst +++ b/Documentation/admin-guide/cifs/changes.rst @@ -3,6 +3,7 @@ Changes ======= See https://wiki.samba.org/index.php/LinuxCIFSKernel for summary -information (that may be easier to read than parsing the output of -"git log fs/cifs") about fixes/improvements to CIFS/SMB2/SMB3 support (changes +information about fixes/improvements to CIFS/SMB2/SMB3 support (changes to cifs.ko module) by kernel version (and cifs internal module version). +This may be easier to read than parsing the output of "git log fs/cifs" +by release. diff --git a/Documentation/admin-guide/cifs/introduction.rst b/Documentation/admin-guide/cifs/introduction.rst index cc2851d93d17..53ea62906aa5 100644 --- a/Documentation/admin-guide/cifs/introduction.rst +++ b/Documentation/admin-guide/cifs/introduction.rst @@ -7,19 +7,19 @@ Introduction protocol which was the successor to the Server Message Block (SMB) protocol, the native file sharing mechanism for most early PC operating systems. New and improved versions of CIFS are now - called SMB2 and SMB3. Use of SMB3 (and later, including SMB3.1.1) - is strongly preferred over using older dialects like CIFS due to - security reasons. All modern dialects, including the most recent, - SMB3.1.1 are supported by the CIFS VFS module. The SMB3 protocol - is implemented and supported by all major file servers - such as all modern versions of Windows (including Windows 2016 - Server), as well as by Samba (which provides excellent - CIFS/SMB2/SMB3 server support and tools for Linux and many other - operating systems). Apple systems also support SMB3 well, as - do most Network Attached Storage vendors, so this network - filesystem client can mount to a wide variety of systems. - It also supports mounting to the cloud (for example - Microsoft Azure), including the necessary security features. + called SMB2 and SMB3. Use of SMB3 (and later, including SMB3.1.1 + the most current dialect) is strongly preferred over using older + dialects like CIFS due to security reasons. All modern dialects, + including the most recent, SMB3.1.1, are supported by the CIFS VFS + module. The SMB3 protocol is implemented and supported by all major + file servers such as Windows (including Windows 2019 Server), as + well as by Samba (which provides excellent CIFS/SMB2/SMB3 server + support and tools for Linux and many other operating systems). + Apple systems also support SMB3 well, as do most Network Attached + Storage vendors, so this network filesystem client can mount to a + wide variety of systems. It also supports mounting to the cloud + (for example Microsoft Azure), including the necessary security + features. The intent of this module is to provide the most advanced network file system function for SMB3 compliant servers, including advanced @@ -27,8 +27,8 @@ Introduction POSIX compliance, secure per-user session establishment, encryption, high performance safe distributed caching (leases/oplocks), optional packet signing, large files, Unicode support and other internationalization - improvements. Since both Samba server and this filesystem client support - the CIFS Unix extensions (and in the future SMB3 POSIX extensions), + improvements. Since both Samba server and this filesystem client support the + CIFS Unix extensions, and the Linux client also suppors SMB3 POSIX extensions, the combination can provide a reasonable alternative to other network and cluster file systems for fileserving in some Linux to Linux environments, not just in Linux to Windows (or Linux to Mac) environments. diff --git a/Documentation/admin-guide/cifs/todo.rst b/Documentation/admin-guide/cifs/todo.rst index 25f11576e7b9..2646ed2e2d3e 100644 --- a/Documentation/admin-guide/cifs/todo.rst +++ b/Documentation/admin-guide/cifs/todo.rst @@ -13,24 +13,26 @@ is a partial list of the known problems and missing features: a) SMB3 (and SMB3.1.1) missing optional features: - - multichannel (started), integration with RDMA - - directory leases (improved metadata caching), started (root dir only) + - multichannel (partially integrated), integration of multichannel with RDMA + - directory leases (improved metadata caching). Currently only implemented for root dir - T10 copy offload ie "ODX" (copy chunk, and "Duplicate Extents" ioctl currently the only two server side copy mechanisms supported) b) improved sparse file support (fiemap and SEEK_HOLE are implemented - but additional features would be supportable by the protocol). + but additional features would be supportable by the protocol such + as FALLOC_FL_COLLAPSE_RANGE and FALLOC_FL_INSERT_RANGE) c) Directory entry caching relies on a 1 second timer, rather than using Directory Leases, currently only the root file handle is cached longer + by leveraging Directory Leases -d) quota support (needs minor kernel change since quota calls - to make it to network filesystems or deviceless filesystems) +d) quota support (needs minor kernel change since quota calls otherwise + won't make it to network filesystems or deviceless filesystems). e) Additional use cases can be optimized to use "compounding" (e.g. open/query/close and open/setinfo/close) to reduce the number of roundtrips to the server and improve performance. Various cases - (stat, statfs, create, unlink, mkdir) already have been improved by + (stat, statfs, create, unlink, mkdir, xattrs) already have been improved by using compounding but more can be done. In addition we could significantly reduce redundant opens by using deferred close (with handle caching leases) and better using reference counters on file @@ -60,7 +62,9 @@ k) Add tools to take advantage of more smb3 specific ioctls and features metadata attributes easier from tools (e.g. extending what was done in smb-info tool). -l) encrypted file support +l) encrypted file support (currently the attribute showing the file is + encrypted on the server is reported, but changing the attribute is not + supported). m) improved stats gathering tools (perhaps integration with nfsometer?) to extend and make easier to use what is currently in /proc/fs/cifs/Stats @@ -69,14 +73,13 @@ n) Add support for claims based ACLs ("DAC") o) mount helper GUI (to simplify the various configuration options on mount) -p) Add support for witness protocol (perhaps ioctl to cifs.ko from user space - tool listening on witness protocol RPC) to allow for notification of share - move, server failover, and server adapter changes. And also improve other - failover scenarios, e.g. when client knows multiple DFS entries point to - different servers, and the server we are connected to has gone down. +p) Expand support for witness protocol to allow for notification of share + move, and server network adapter changes. Currently only notifications by + the witness protocol for server move is supported by the Linux client. q) Allow mount.cifs to be more verbose in reporting errors with dialect - or unsupported feature errors. + or unsupported feature errors. This would now be easier due to the + implementation of the new mount API. r) updating cifs documentation, and user guide. @@ -87,11 +90,10 @@ t) split cifs and smb3 support into separate modules so legacy (and less secure) CIFS dialect can be disabled in environments that don't need it and simplify the code. -v) POSIX Extensions for SMB3.1.1 (started, create and mkdir support added - so far). +v) Additional testing of POSIX Extensions for SMB3.1.1 w) Add support for additional strong encryption types, and additional spnego - authentication mechanisms (see MS-SMB2) + authentication mechanisms (see MS-SMB2). GCM-256 is now partially implemented. x) Finish support for SMB3.1.1 compression diff --git a/Documentation/admin-guide/cifs/usage.rst b/Documentation/admin-guide/cifs/usage.rst index b6d9f02bc12b..13783dc68ab7 100644 --- a/Documentation/admin-guide/cifs/usage.rst +++ b/Documentation/admin-guide/cifs/usage.rst @@ -83,7 +83,7 @@ and encrypted shares and stronger signing and authentication algorithms. There are additional mount options that may be helpful for SMB3 to get improved POSIX behavior (NB: can use vers=3.0 to force only SMB3, never 2.1): - ``mfsymlinks`` and ``cifsacl`` and ``idsfromsid`` + ``mfsymlinks`` and either ``cifsacl`` or ``modefromsid`` (usually with ``idsfromsid``) Allowing User Mounts ==================== diff --git a/Documentation/admin-guide/xfs.rst b/Documentation/admin-guide/xfs.rst index 6178153d3320..5422407a96d7 100644 --- a/Documentation/admin-guide/xfs.rst +++ b/Documentation/admin-guide/xfs.rst @@ -284,6 +284,9 @@ The following sysctls are available for the XFS filesystem: removes unused preallocation from clean inodes and releases the unused space back to the free pool. + fs.xfs.speculative_cow_prealloc_lifetime + This is an alias for speculative_prealloc_lifetime. + fs.xfs.error_level (Min: 0 Default: 3 Max: 11) A volume knob for error reporting when internal errors occur. This will generate detailed messages & backtraces for filesystem @@ -356,12 +359,13 @@ The following sysctls are available for the XFS filesystem: Deprecated Sysctls ================== -=========================== ================ - Name Removal Schedule -=========================== ================ -fs.xfs.irix_sgid_inherit September 2025 -fs.xfs.irix_symlink_mode September 2025 -=========================== ================ +=========================================== ================ + Name Removal Schedule +=========================================== ================ +fs.xfs.irix_sgid_inherit September 2025 +fs.xfs.irix_symlink_mode September 2025 +fs.xfs.speculative_cow_prealloc_lifetime September 2025 +=========================================== ================ Removed Sysctls diff --git a/Documentation/conf.py b/Documentation/conf.py index 5bd45d5fb0a0..fd65168c10f8 100644 --- a/Documentation/conf.py +++ b/Documentation/conf.py @@ -49,8 +49,7 @@ extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', if major >= 3: sys.stderr.write('''WARNING: The kernel documentation build process support for Sphinx v3.0 and above is brand new. Be prepared for - possible issues in the generated output. - ''') + possible issues in the generated output.\n''') if (major > 3) or (minor > 0 or patch >= 2): # Sphinx c function parser is more pedantic with regards to type # checking. Due to that, having macros at c:function cause problems. diff --git a/Documentation/features/core/cBPF-JIT/arch-support.txt b/Documentation/features/core/cBPF-JIT/arch-support.txt index 399935616813..e59b5215402d 100644 --- a/Documentation/features/core/cBPF-JIT/arch-support.txt +++ b/Documentation/features/core/cBPF-JIT/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/core/eBPF-JIT/arch-support.txt b/Documentation/features/core/eBPF-JIT/arch-support.txt index 79409bfe0263..dcbd8679f514 100644 --- a/Documentation/features/core/eBPF-JIT/arch-support.txt +++ b/Documentation/features/core/eBPF-JIT/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/core/generic-idle-thread/arch-support.txt b/Documentation/features/core/generic-idle-thread/arch-support.txt index 9ea60e416efd..4efcba7b5239 100644 --- a/Documentation/features/core/generic-idle-thread/arch-support.txt +++ b/Documentation/features/core/generic-idle-thread/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | ok | diff --git a/Documentation/features/core/jump-labels/arch-support.txt b/Documentation/features/core/jump-labels/arch-support.txt index 894d9693b380..0c801d1bd2da 100644 --- a/Documentation/features/core/jump-labels/arch-support.txt +++ b/Documentation/features/core/jump-labels/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/core/tracehook/arch-support.txt b/Documentation/features/core/tracehook/arch-support.txt index cd3510e2eedb..af34308fce7f 100644 --- a/Documentation/features/core/tracehook/arch-support.txt +++ b/Documentation/features/core/tracehook/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | ok | | csky: | ok | | h8300: | TODO | | hexagon: | ok | diff --git a/Documentation/features/debug/KASAN/arch-support.txt b/Documentation/features/debug/KASAN/arch-support.txt index c3fe9b266e7b..be66d15e0406 100644 --- a/Documentation/features/debug/KASAN/arch-support.txt +++ b/Documentation/features/debug/KASAN/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt index 1c49723e7534..7aff505af706 100644 --- a/Documentation/features/debug/debug-vm-pgtable/arch-support.txt +++ b/Documentation/features/debug/debug-vm-pgtable/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | TODO | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/debug/gcov-profile-all/arch-support.txt b/Documentation/features/debug/gcov-profile-all/arch-support.txt index 7563a494ddb8..b39c1a5de3f3 100644 --- a/Documentation/features/debug/gcov-profile-all/arch-support.txt +++ b/Documentation/features/debug/gcov-profile-all/arch-support.txt @@ -10,14 +10,13 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | | m68k: | TODO | | microblaze: | ok | - | mips: | TODO | + | mips: | ok | | nds32: | TODO | | nios2: | TODO | | openrisc: | TODO | diff --git a/Documentation/features/debug/kcov/arch-support.txt b/Documentation/features/debug/kcov/arch-support.txt index ab0ee1c933c2..7e44013cc320 100644 --- a/Documentation/features/debug/kcov/arch-support.txt +++ b/Documentation/features/debug/kcov/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/debug/kgdb/arch-support.txt b/Documentation/features/debug/kgdb/arch-support.txt index bc45bac20442..2cb0576f9180 100644 --- a/Documentation/features/debug/kgdb/arch-support.txt +++ b/Documentation/features/debug/kgdb/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | ok | | hexagon: | ok | diff --git a/Documentation/features/debug/kmemleak/arch-support.txt b/Documentation/features/debug/kmemleak/arch-support.txt index 2db76807ec6f..e9ac415f8aec 100644 --- a/Documentation/features/debug/kmemleak/arch-support.txt +++ b/Documentation/features/debug/kmemleak/arch-support.txt @@ -10,8 +10,7 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | TODO | - | csky: | TODO | + | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | diff --git a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt index 6225cfe0c5bf..96156e8802a7 100644 --- a/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt +++ b/Documentation/features/debug/kprobes-on-ftrace/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | @@ -23,7 +22,7 @@ | openrisc: | TODO | | parisc: | ok | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | TODO | | sparc: | TODO | diff --git a/Documentation/features/debug/kprobes/arch-support.txt b/Documentation/features/debug/kprobes/arch-support.txt index 371f0ac488f5..ee95ed61909a 100644 --- a/Documentation/features/debug/kprobes/arch-support.txt +++ b/Documentation/features/debug/kprobes/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | @@ -23,7 +22,7 @@ | openrisc: | TODO | | parisc: | ok | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | ok | | sparc: | ok | diff --git a/Documentation/features/debug/kretprobes/arch-support.txt b/Documentation/features/debug/kretprobes/arch-support.txt index 38e95251deed..612cb97d47b8 100644 --- a/Documentation/features/debug/kretprobes/arch-support.txt +++ b/Documentation/features/debug/kretprobes/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | @@ -23,7 +22,7 @@ | openrisc: | TODO | | parisc: | ok | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | ok | | sparc: | ok | diff --git a/Documentation/features/debug/optprobes/arch-support.txt b/Documentation/features/debug/optprobes/arch-support.txt index 7f4a20e6a12b..d6ff141a6122 100644 --- a/Documentation/features/debug/optprobes/arch-support.txt +++ b/Documentation/features/debug/optprobes/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | TODO | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/debug/stackprotector/arch-support.txt b/Documentation/features/debug/stackprotector/arch-support.txt index 3329559c8207..ad4de22a71ab 100644 --- a/Documentation/features/debug/stackprotector/arch-support.txt +++ b/Documentation/features/debug/stackprotector/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/debug/uprobes/arch-support.txt b/Documentation/features/debug/uprobes/arch-support.txt index 43cac6ee0c68..8bd5548a4485 100644 --- a/Documentation/features/debug/uprobes/arch-support.txt +++ b/Documentation/features/debug/uprobes/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | @@ -23,7 +22,7 @@ | openrisc: | TODO | | parisc: | TODO | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | TODO | | sparc: | ok | diff --git a/Documentation/features/debug/user-ret-profiler/arch-support.txt b/Documentation/features/debug/user-ret-profiler/arch-support.txt index d636ed0e679f..2a3fe812a5fa 100644 --- a/Documentation/features/debug/user-ret-profiler/arch-support.txt +++ b/Documentation/features/debug/user-ret-profiler/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/io/dma-contiguous/arch-support.txt b/Documentation/features/io/dma-contiguous/arch-support.txt index dfc93d074e3d..bece89586efa 100644 --- a/Documentation/features/io/dma-contiguous/arch-support.txt +++ b/Documentation/features/io/dma-contiguous/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/locking/cmpxchg-local/arch-support.txt b/Documentation/features/locking/cmpxchg-local/arch-support.txt index 1815c7fed06d..52bdda004f5c 100644 --- a/Documentation/features/locking/cmpxchg-local/arch-support.txt +++ b/Documentation/features/locking/cmpxchg-local/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/locking/lockdep/arch-support.txt b/Documentation/features/locking/lockdep/arch-support.txt index 940b0bd02957..a8cd163c8b7e 100644 --- a/Documentation/features/locking/lockdep/arch-support.txt +++ b/Documentation/features/locking/lockdep/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | ok | diff --git a/Documentation/features/locking/queued-rwlocks/arch-support.txt b/Documentation/features/locking/queued-rwlocks/arch-support.txt index 4dd5e554873f..8c85949752b3 100644 --- a/Documentation/features/locking/queued-rwlocks/arch-support.txt +++ b/Documentation/features/locking/queued-rwlocks/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/locking/queued-spinlocks/arch-support.txt b/Documentation/features/locking/queued-spinlocks/arch-support.txt index b16d4f71e5ce..5f4e1b3841af 100644 --- a/Documentation/features/locking/queued-spinlocks/arch-support.txt +++ b/Documentation/features/locking/queued-spinlocks/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/perf/kprobes-event/arch-support.txt b/Documentation/features/perf/kprobes-event/arch-support.txt index 04c17c2106a4..78f3fe080f0e 100644 --- a/Documentation/features/perf/kprobes-event/arch-support.txt +++ b/Documentation/features/perf/kprobes-event/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | ok | @@ -23,7 +22,7 @@ | openrisc: | TODO | | parisc: | ok | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | ok | | sparc: | ok | diff --git a/Documentation/features/perf/perf-regs/arch-support.txt b/Documentation/features/perf/perf-regs/arch-support.txt index e7450fbb8253..5bf3b1854a1f 100644 --- a/Documentation/features/perf/perf-regs/arch-support.txt +++ b/Documentation/features/perf/perf-regs/arch-support.txt @@ -10,14 +10,13 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | | m68k: | TODO | | microblaze: | TODO | - | mips: | TODO | + | mips: | ok | | nds32: | TODO | | nios2: | TODO | | openrisc: | TODO | diff --git a/Documentation/features/perf/perf-stackdump/arch-support.txt b/Documentation/features/perf/perf-stackdump/arch-support.txt index 98e79d128d9b..d88659bb4fc1 100644 --- a/Documentation/features/perf/perf-stackdump/arch-support.txt +++ b/Documentation/features/perf/perf-stackdump/arch-support.txt @@ -10,14 +10,13 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | | ia64: | TODO | | m68k: | TODO | | microblaze: | TODO | - | mips: | TODO | + | mips: | ok | | nds32: | TODO | | nios2: | TODO | | openrisc: | TODO | diff --git a/Documentation/features/sched/membarrier-sync-core/arch-support.txt b/Documentation/features/sched/membarrier-sync-core/arch-support.txt index 47e6903f47a5..883d33b265d6 100644 --- a/Documentation/features/sched/membarrier-sync-core/arch-support.txt +++ b/Documentation/features/sched/membarrier-sync-core/arch-support.txt @@ -33,7 +33,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/sched/numa-balancing/arch-support.txt b/Documentation/features/sched/numa-balancing/arch-support.txt index 964457ad26c1..9affb7c2c500 100644 --- a/Documentation/features/sched/numa-balancing/arch-support.txt +++ b/Documentation/features/sched/numa-balancing/arch-support.txt @@ -10,7 +10,6 @@ | arc: | .. | | arm: | .. | | arm64: | ok | - | c6x: | .. | | csky: | .. | | h8300: | .. | | hexagon: | .. | @@ -23,7 +22,7 @@ | openrisc: | .. | | parisc: | .. | | powerpc: | ok | - | riscv: | TODO | + | riscv: | ok | | s390: | ok | | sh: | .. | | sparc: | TODO | diff --git a/Documentation/features/seccomp/seccomp-filter/arch-support.txt b/Documentation/features/seccomp/seccomp-filter/arch-support.txt index eb3d74092c61..26eec58ab819 100644 --- a/Documentation/features/seccomp/seccomp-filter/arch-support.txt +++ b/Documentation/features/seccomp/seccomp-filter/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/time/arch-tick-broadcast/arch-support.txt b/Documentation/features/time/arch-tick-broadcast/arch-support.txt index 4d11cbb3c09b..8639fe8315f5 100644 --- a/Documentation/features/time/arch-tick-broadcast/arch-support.txt +++ b/Documentation/features/time/arch-tick-broadcast/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/time/clockevents/arch-support.txt b/Documentation/features/time/clockevents/arch-support.txt index 6863a3fbddad..9a81cb03b1fd 100644 --- a/Documentation/features/time/clockevents/arch-support.txt +++ b/Documentation/features/time/clockevents/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | TODO | | arm64: | ok | - | c6x: | ok | | csky: | ok | | h8300: | ok | | hexagon: | ok | diff --git a/Documentation/features/time/context-tracking/arch-support.txt b/Documentation/features/time/context-tracking/arch-support.txt index 52aea275aab7..4ed116c2ec39 100644 --- a/Documentation/features/time/context-tracking/arch-support.txt +++ b/Documentation/features/time/context-tracking/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/time/irq-time-acct/arch-support.txt b/Documentation/features/time/irq-time-acct/arch-support.txt index 6fc03deb1c38..bc30c15557c7 100644 --- a/Documentation/features/time/irq-time-acct/arch-support.txt +++ b/Documentation/features/time/irq-time-acct/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/time/virt-cpuacct/arch-support.txt b/Documentation/features/time/virt-cpuacct/arch-support.txt index e51f3af38e31..050de43bbbb9 100644 --- a/Documentation/features/time/virt-cpuacct/arch-support.txt +++ b/Documentation/features/time/virt-cpuacct/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | ok | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/vm/ELF-ASLR/arch-support.txt b/Documentation/features/vm/ELF-ASLR/arch-support.txt index eccda0732474..99cb6d7f5005 100644 --- a/Documentation/features/vm/ELF-ASLR/arch-support.txt +++ b/Documentation/features/vm/ELF-ASLR/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/vm/PG_uncached/arch-support.txt b/Documentation/features/vm/PG_uncached/arch-support.txt index c74e3f8040e1..6cde38458596 100644 --- a/Documentation/features/vm/PG_uncached/arch-support.txt +++ b/Documentation/features/vm/PG_uncached/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/vm/THP/arch-support.txt b/Documentation/features/vm/THP/arch-support.txt index 1c0b95f2b40d..e8238cb2a4da 100644 --- a/Documentation/features/vm/THP/arch-support.txt +++ b/Documentation/features/vm/THP/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | .. | | csky: | .. | | h8300: | .. | | hexagon: | .. | diff --git a/Documentation/features/vm/TLB/arch-support.txt b/Documentation/features/vm/TLB/arch-support.txt index 30f75a79ce01..48a5ca548399 100644 --- a/Documentation/features/vm/TLB/arch-support.txt +++ b/Documentation/features/vm/TLB/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | TODO | - | c6x: | .. | | csky: | TODO | | h8300: | .. | | hexagon: | TODO | diff --git a/Documentation/features/vm/huge-vmap/arch-support.txt b/Documentation/features/vm/huge-vmap/arch-support.txt index c5ff3a427722..439fd9069b8b 100644 --- a/Documentation/features/vm/huge-vmap/arch-support.txt +++ b/Documentation/features/vm/huge-vmap/arch-support.txt @@ -10,7 +10,6 @@ | arc: | TODO | | arm: | TODO | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/vm/ioremap_prot/arch-support.txt b/Documentation/features/vm/ioremap_prot/arch-support.txt index b5fb37c28cc6..9a0c8783b84d 100644 --- a/Documentation/features/vm/ioremap_prot/arch-support.txt +++ b/Documentation/features/vm/ioremap_prot/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | TODO | | arm64: | TODO | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 13d0e1e17001..40b969f3a6bb 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -10,7 +10,6 @@ | arc: | ok | | arm: | ok | | arm64: | ok | - | c6x: | TODO | | csky: | TODO | | h8300: | TODO | | hexagon: | TODO | diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 633610253f45..0302035781be 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -883,3 +883,10 @@ For bvec based itererators bio_iov_iter_get_pages() now doesn't copy bvecs but uses the one provided. Anyone issuing kiocb-I/O should ensure that the bvec and page references stay until I/O has completed, i.e. until ->ki_complete() has been called or returned with non -EIOCBQUEUED code. + +--- + +**mandatory** + +mnt_want_write_file() can now only be paired with mnt_drop_write_file(), +whereas previously it could be paired with mnt_drop_write() as well. diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 1bfea7de8055..544c6382a691 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -697,6 +697,10 @@ files are there, and which are missing. number of processes currently runnable (running or on ready queue); total number of processes in system; last pid created. + All fields are separated by one space except "number of + processes currently runnable" and "total number of processes + in system", which are separated by a slash ('/'). Example: + 0.61 0.61 0.55 3/828 22084 locks Kernel locks meminfo Memory info misc Miscellaneous diff --git a/Documentation/powerpc/syscall64-abi.rst b/Documentation/powerpc/syscall64-abi.rst index cf9b2857c72a..dabee3729e5a 100644 --- a/Documentation/powerpc/syscall64-abi.rst +++ b/Documentation/powerpc/syscall64-abi.rst @@ -46,25 +46,38 @@ stack frame LR and CR save fields are not used. Register preservation rules --------------------------- -Register preservation rules match the ELF ABI calling sequence with the -following differences: +Register preservation rules match the ELF ABI calling sequence with some +differences. -+------------------------------------------------------------------------+ -| For the sc instruction, differences with the ELF ABI | -+--------------+--------------+------------------------------------------+ -| r0 | Volatile | (System call number.) | -| rr3 | Volatile | (Parameter 1, and return value.) | -| rr4-r8 | Volatile | (Parameters 2-6.) | -| rcr0 | Volatile | (cr0.SO is the return error condition.) | -| rcr1, cr5-7 | Nonvolatile | | -| rlr | Nonvolatile | | -+--------------+--------------+------------------------------------------+ -| For the scv 0 instruction, differences with the ELF ABI | -+--------------+--------------+------------------------------------------+ -| r0 | Volatile | (System call number.) | -| r3 | Volatile | (Parameter 1, and return value.) | -| r4-r8 | Volatile | (Parameters 2-6.) | -+--------------+--------------+------------------------------------------+ +For the sc instruction, the differences from the ELF ABI are as follows: + ++--------------+--------------------+-----------------------------------------+ +| Register | Preservation Rules | Purpose | ++==============+====================+=========================================+ +| r0 | Volatile | (System call number.) | ++--------------+--------------------+-----------------------------------------+ +| r3 | Volatile | (Parameter 1, and return value.) | ++--------------+--------------------+-----------------------------------------+ +| r4-r8 | Volatile | (Parameters 2-6.) | ++--------------+--------------------+-----------------------------------------+ +| cr0 | Volatile | (cr0.SO is the return error condition.) | ++--------------+--------------------+-----------------------------------------+ +| cr1, cr5-7 | Nonvolatile | | ++--------------+--------------------+-----------------------------------------+ +| lr | Nonvolatile | | ++--------------+--------------------+-----------------------------------------+ + +For the scv 0 instruction, the differences from the ELF ABI are as follows: + ++--------------+--------------------+-----------------------------------------+ +| Register | Preservation Rules | Purpose | ++==============+====================+=========================================+ +| r0 | Volatile | (System call number.) | ++--------------+--------------------+-----------------------------------------+ +| r3 | Volatile | (Parameter 1, and return value.) | ++--------------+--------------------+-----------------------------------------+ +| r4-r8 | Volatile | (Parameters 2-6.) | ++--------------+--------------------+-----------------------------------------+ All floating point and vector data registers as well as control and status registers are nonvolatile. diff --git a/Documentation/process/4.Coding.rst b/Documentation/process/4.Coding.rst index 0825dc496f22..1f0d81f44e14 100644 --- a/Documentation/process/4.Coding.rst +++ b/Documentation/process/4.Coding.rst @@ -242,7 +242,7 @@ and try to avoid "fixes" which make the warning go away without addressing its cause. Note that not all compiler warnings are enabled by default. Build the -kernel with "make EXTRA_CFLAGS=-W" to get the full set. +kernel with "make KCFLAGS=-W" to get the full set. The kernel provides several configuration options which turn on debugging features; most of these are found in the "kernel hacking" submenu. Several diff --git a/Documentation/process/submit-checklist.rst b/Documentation/process/submit-checklist.rst index f709beaf02c9..b1bc2d37bd0a 100644 --- a/Documentation/process/submit-checklist.rst +++ b/Documentation/process/submit-checklist.rst @@ -96,7 +96,7 @@ and elsewhere regarding submitting Linux kernel patches. injection might be appropriate. 20) Newly-added code has been compiled with ``gcc -W`` (use - ``make EXTRA_CFLAGS=-W``). This will generate lots of noise, but is good + ``make KCFLAGS=-W``). This will generate lots of noise, but is good for finding bugs like "warning: comparison between signed and unsigned". 21) Tested after it has been merged into the -mm patchset to make sure diff --git a/Documentation/translations/it_IT/process/4.Coding.rst b/Documentation/translations/it_IT/process/4.Coding.rst index a5e36aa60448..8012fe9497ae 100644 --- a/Documentation/translations/it_IT/process/4.Coding.rst +++ b/Documentation/translations/it_IT/process/4.Coding.rst @@ -256,7 +256,7 @@ e cercate di evitare le "riparazioni" che fan sparire l'avvertimento senza però averne trovato la causa. Tenete a mente che non tutti gli avvertimenti sono disabilitati di default. -Costruite il kernel con "make EXTRA_CFLAGS=-W" per ottenerli tutti. +Costruite il kernel con "make KCFLAGS=-W" per ottenerli tutti. Il kernel fornisce differenti opzioni che abilitano funzionalità di debugging; molti di queste sono trovano all'interno del sotto menu "kernel hacking". diff --git a/Documentation/translations/it_IT/process/submit-checklist.rst b/Documentation/translations/it_IT/process/submit-checklist.rst index 3e575502690f..614fc17d9086 100644 --- a/Documentation/translations/it_IT/process/submit-checklist.rst +++ b/Documentation/translations/it_IT/process/submit-checklist.rst @@ -104,7 +104,7 @@ sottomissione delle patch, in particolare l'iniezione di fallimenti specifici per il sottosistema. 22) Il nuovo codice è stato compilato con ``gcc -W`` (usate - ``make EXTRA_CFLAGS=-W``). Questo genererà molti avvisi, ma è ottimo + ``make KCFLAGS=-W``). Questo genererà molti avvisi, ma è ottimo per scovare bachi come "warning: comparison between signed and unsigned". 23) La patch è stata verificata dopo essere stata inclusa nella serie di patch diff --git a/Documentation/translations/zh_CN/process/4.Coding.rst b/Documentation/translations/zh_CN/process/4.Coding.rst index 959a06ba025c..66cd8ee07606 100644 --- a/Documentation/translations/zh_CN/process/4.Coding.rst +++ b/Documentation/translations/zh_CN/process/4.Coding.rst @@ -165,7 +165,7 @@ Linus对这个问题给出了最佳答案: 通常,这些警告都指向真正的问题。提交以供审阅的代码通常不会产生任何编译器警告。 在消除警告时,注意了解真正的原因,并尽量避免“修复”,使警告消失而不解决其原因。 -请注意,并非所有编译器警告都默认启用。使用“make EXTRA_CFLAGS=-W”构建内核以 +请注意,并非所有编译器警告都默认启用。使用“make KCFLAGS=-W”构建内核以 获得完整集合。 内核提供了几个配置选项,可以打开调试功能;大多数配置选项位于“kernel hacking” diff --git a/arch/alpha/kernel/process.c b/arch/alpha/kernel/process.c index 6c71554206cc..5112ab996394 100644 --- a/arch/alpha/kernel/process.c +++ b/arch/alpha/kernel/process.c @@ -249,7 +249,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, childti->pcb.ksp = (unsigned long) childstack; childti->pcb.flags = 1; /* set FEN, clear everything else */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ memset(childstack, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs)); diff --git a/arch/arc/kernel/process.c b/arch/arc/kernel/process.c index 37f724ad5e39..d838d0d57696 100644 --- a/arch/arc/kernel/process.c +++ b/arch/arc/kernel/process.c @@ -191,7 +191,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, childksp[0] = 0; /* fp */ childksp[1] = (unsigned long)ret_from_fork; /* blink */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(c_regs, 0, sizeof(struct pt_regs)); c_callee->r13 = kthread_arg; diff --git a/arch/arm/kernel/process.c b/arch/arm/kernel/process.c index ee3aee69e444..5199a2bb4111 100644 --- a/arch/arm/kernel/process.c +++ b/arch/arm/kernel/process.c @@ -243,7 +243,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, thread->cpu_domain = get_domain(); #endif - if (likely(!(p->flags & PF_KTHREAD))) { + if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) { *childregs = *current_pt_regs(); childregs->ARM_r0 = 0; if (stack_start) diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c index 9a24b09478d4..b12b0cc72a69 100644 --- a/arch/arm64/kernel/process.c +++ b/arch/arm64/kernel/process.c @@ -396,7 +396,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, ptrauth_thread_init_kernel(p); - if (likely(!(p->flags & PF_KTHREAD))) { + if (likely(!(p->flags & (PF_KTHREAD | PF_IO_WORKER)))) { *childregs = *current_pt_regs(); childregs->regs[0] = 0; diff --git a/arch/csky/kernel/process.c b/arch/csky/kernel/process.c index 69af6bc87e64..3d0ca22cd0e2 100644 --- a/arch/csky/kernel/process.c +++ b/arch/csky/kernel/process.c @@ -49,7 +49,7 @@ int copy_thread(unsigned long clone_flags, /* setup thread.sp for switch_to !!! */ p->thread.sp = (unsigned long)childstack; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childregs, 0, sizeof(struct pt_regs)); childstack->r15 = (unsigned long) ret_from_kernel_thread; childstack->r10 = kthread_arg; diff --git a/arch/h8300/kernel/process.c b/arch/h8300/kernel/process.c index bc1364db58fe..46b1342ce515 100644 --- a/arch/h8300/kernel/process.c +++ b/arch/h8300/kernel/process.c @@ -112,7 +112,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, childregs = (struct pt_regs *) (THREAD_SIZE + task_stack_page(p)) - 1; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childregs, 0, sizeof(struct pt_regs)); childregs->retpc = (unsigned long) ret_from_kernel_thread; childregs->er4 = topstk; /* arg */ diff --git a/arch/hexagon/kernel/process.c b/arch/hexagon/kernel/process.c index 6a980cba7b29..c61165c99ae0 100644 --- a/arch/hexagon/kernel/process.c +++ b/arch/hexagon/kernel/process.c @@ -73,7 +73,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, sizeof(*ss)); ss->lr = (unsigned long)ret_from_fork; p->thread.switch_sp = ss; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childregs, 0, sizeof(struct pt_regs)); /* r24 <- fn, r25 <- arg */ ss->r24 = usp; diff --git a/arch/ia64/kernel/process.c b/arch/ia64/kernel/process.c index 4ebbfa076a26..7e1a1525e202 100644 --- a/arch/ia64/kernel/process.c +++ b/arch/ia64/kernel/process.c @@ -338,7 +338,7 @@ copy_thread(unsigned long clone_flags, unsigned long user_stack_base, ia64_drop_fpu(p); /* don't pick up stale state from a CPU's fph */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { if (unlikely(!user_stack_base)) { /* fork_idle() called us */ return 0; diff --git a/arch/m68k/kernel/process.c b/arch/m68k/kernel/process.c index 08359a6e058f..da83cc83e791 100644 --- a/arch/m68k/kernel/process.c +++ b/arch/m68k/kernel/process.c @@ -157,7 +157,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, */ p->thread.fs = get_fs().seg; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ memset(frame, 0, sizeof(struct fork_frame)); frame->regs.sr = PS_S; diff --git a/arch/microblaze/kernel/process.c b/arch/microblaze/kernel/process.c index 657c2beb665e..62aa237180b6 100644 --- a/arch/microblaze/kernel/process.c +++ b/arch/microblaze/kernel/process.c @@ -59,7 +59,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, struct pt_regs *childregs = task_pt_regs(p); struct thread_info *ti = task_thread_info(p); - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* if we're creating a new kernel thread then just zeroing all * the registers. That's OK for a brand new thread.*/ memset(childregs, 0, sizeof(struct pt_regs)); diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index af4c862ec5ff..7efa0d1a4c2b 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -120,7 +120,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, /* Put the stack after the struct pt_regs. */ childksp = (unsigned long) childregs; p->thread.cp0_status = (read_c0_status() & ~(ST0_CU2|ST0_CU1)) | ST0_KERNEL_CUMASK; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ unsigned long status = p->thread.cp0_status; memset(childregs, 0, sizeof(struct pt_regs)); diff --git a/arch/nds32/kernel/process.c b/arch/nds32/kernel/process.c index e01ad5d17224..c1327e552ec6 100644 --- a/arch/nds32/kernel/process.c +++ b/arch/nds32/kernel/process.c @@ -156,7 +156,7 @@ int copy_thread(unsigned long clone_flags, unsigned long stack_start, memset(&p->thread.cpu_context, 0, sizeof(struct cpu_context)); - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childregs, 0, sizeof(struct pt_regs)); /* kernel thread fn */ p->thread.cpu_context.r6 = stack_start; diff --git a/arch/nios2/kernel/process.c b/arch/nios2/kernel/process.c index 50b4eb19a6cc..c5f916ca6845 100644 --- a/arch/nios2/kernel/process.c +++ b/arch/nios2/kernel/process.c @@ -109,7 +109,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, struct switch_stack *childstack = ((struct switch_stack *)childregs) - 1; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childstack, 0, sizeof(struct switch_stack) + sizeof(struct pt_regs)); diff --git a/arch/openrisc/Kbuild b/arch/openrisc/Kbuild new file mode 100644 index 000000000000..4234b4c03e72 --- /dev/null +++ b/arch/openrisc/Kbuild @@ -0,0 +1,3 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-y += lib/ kernel/ mm/ +obj-y += boot/dts/ diff --git a/arch/openrisc/Makefile b/arch/openrisc/Makefile index bf10141c7426..410e7abfac69 100644 --- a/arch/openrisc/Makefile +++ b/arch/openrisc/Makefile @@ -24,6 +24,10 @@ LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) KBUILD_CFLAGS += -pipe -ffixed-r10 -D__linux__ +all: vmlinux.bin + +boot := arch/$(ARCH)/boot + ifeq ($(CONFIG_OPENRISC_HAVE_INST_MUL),y) KBUILD_CFLAGS += $(call cc-option,-mhard-mul) else @@ -38,14 +42,13 @@ endif head-y := arch/openrisc/kernel/head.o -core-y += arch/openrisc/lib/ \ - arch/openrisc/kernel/ \ - arch/openrisc/mm/ +core-y += arch/openrisc/ libs-y += $(LIBGCC) -ifneq '$(CONFIG_OPENRISC_BUILTIN_DTB)' '""' -BUILTIN_DTB := y -else -BUILTIN_DTB := n -endif -core-$(BUILTIN_DTB) += arch/openrisc/boot/dts/ +PHONY += vmlinux.bin + +vmlinux.bin: vmlinux + $(Q)$(MAKE) $(build)=$(boot) $(boot)/$@ + +archclean: + $(Q)$(MAKE) $(clean)=$(boot) diff --git a/arch/openrisc/boot/.gitignore b/arch/openrisc/boot/.gitignore new file mode 100644 index 000000000000..007d6fea3145 --- /dev/null +++ b/arch/openrisc/boot/.gitignore @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +vmlinux.bin diff --git a/arch/openrisc/boot/Makefile b/arch/openrisc/boot/Makefile new file mode 100644 index 000000000000..5b28538f4dd1 --- /dev/null +++ b/arch/openrisc/boot/Makefile @@ -0,0 +1,10 @@ +# SPDX-License-Identifier: GPL-2.0 +# +# Makefile for bootable kernel images +# + +targets += vmlinux.bin + +OBJCOPYFLAGS_vmlinux.bin := -O binary +$(obj)/vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c index 3c98728cce24..eb62429681fc 100644 --- a/arch/openrisc/kernel/process.c +++ b/arch/openrisc/kernel/process.c @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -49,10 +50,16 @@ */ struct thread_info *current_thread_info_set[NR_CPUS] = { &init_thread_info, }; -void machine_restart(void) +void machine_restart(char *cmd) { - printk(KERN_INFO "*** MACHINE RESTART ***\n"); - __asm__("l.nop 1"); + do_kernel_restart(cmd); + + /* Give a grace period for failure to restart of 1s */ + mdelay(1000); + + /* Whoops - the platform was unable to reboot. Tell the user! */ + pr_emerg("Reboot failed -- System halted\n"); + while (1); } /* @@ -167,7 +174,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, sp -= sizeof(struct pt_regs); kregs = (struct pt_regs *)sp; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(kregs, 0, sizeof(struct pt_regs)); kregs->gpr[20] = usp; /* fn, kernel thread */ kregs->gpr[22] = arg; diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c index 29c82ef2e207..48e1092a64de 100644 --- a/arch/openrisc/kernel/smp.c +++ b/arch/openrisc/kernel/smp.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -60,22 +61,32 @@ void __init smp_prepare_boot_cpu(void) void __init smp_init_cpus(void) { - int i; + struct device_node *cpu; + u32 cpu_id; - for (i = 0; i < NR_CPUS; i++) - set_cpu_possible(i, true); + for_each_of_cpu_node(cpu) { + if (of_property_read_u32(cpu, "reg", &cpu_id)) { + pr_warn("%s missing reg property", cpu->full_name); + continue; + } + + if (cpu_id < NR_CPUS) + set_cpu_possible(cpu_id, true); + } } void __init smp_prepare_cpus(unsigned int max_cpus) { - int i; + unsigned int cpu; /* * Initialise the present map, which describes the set of CPUs * actually populated at the present time. */ - for (i = 0; i < max_cpus; i++) - set_cpu_present(i, true); + for_each_possible_cpu(cpu) { + if (cpu < max_cpus) + set_cpu_present(cpu, true); + } } void __init smp_cpus_done(unsigned int max_cpus) diff --git a/arch/parisc/kernel/process.c b/arch/parisc/kernel/process.c index fda1c1a6a444..b144fbe29bc1 100644 --- a/arch/parisc/kernel/process.c +++ b/arch/parisc/kernel/process.c @@ -200,7 +200,7 @@ copy_thread(unsigned long clone_flags, unsigned long usp, extern void * const ret_from_kernel_thread; extern void * const child_return; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ memset(cregs, 0, sizeof(struct pt_regs)); if (!usp) /* idle thread */ diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index 924d023dad0a..3231c2df9e26 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -1670,7 +1670,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, /* Copy registers */ sp -= sizeof(struct pt_regs); childregs = (struct pt_regs *) sp; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gpr[1] = sp + sizeof(struct pt_regs); diff --git a/arch/riscv/kernel/process.c b/arch/riscv/kernel/process.c index 19f4688f2f36..6f728e731bed 100644 --- a/arch/riscv/kernel/process.c +++ b/arch/riscv/kernel/process.c @@ -124,7 +124,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, struct pt_regs *childregs = task_pt_regs(p); /* p->thread holds context to be restored by __switch_to() */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* Kernel thread */ memset(childregs, 0, sizeof(struct pt_regs)); childregs->gp = gp_in_global; diff --git a/arch/s390/include/asm/irq_work.h b/arch/s390/include/asm/irq_work.h new file mode 100644 index 000000000000..603783766d0a --- /dev/null +++ b/arch/s390/include/asm/irq_work.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_S390_IRQ_WORK_H +#define _ASM_S390_IRQ_WORK_H + +static inline bool arch_irq_work_has_interrupt(void) +{ + return true; +} + +void arch_irq_work_raise(void); + +#endif /* _ASM_S390_IRQ_WORK_H */ diff --git a/arch/s390/include/asm/pgalloc.h b/arch/s390/include/asm/pgalloc.h index d1297d6bbdcf..6b187cd72251 100644 --- a/arch/s390/include/asm/pgalloc.h +++ b/arch/s390/include/asm/pgalloc.h @@ -135,7 +135,7 @@ static inline void pmd_populate(struct mm_struct *mm, #define pmd_populate_kernel(mm, pmd, pte) pmd_populate(mm, pmd, pte) #define pmd_pgtable(pmd) \ - (pgtable_t)(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE) + ((pgtable_t)__va(pmd_val(pmd) & -sizeof(pte_t)*PTRS_PER_PTE)) /* * page table entry allocation/free routines. diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 794746a32806..29c7ecd5ad1d 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1219,8 +1219,8 @@ static inline pte_t mk_pte(struct page *page, pgprot_t pgprot) #define pud_index(address) (((address) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) #define pmd_index(address) (((address) >> PMD_SHIFT) & (PTRS_PER_PMD-1)) -#define p4d_deref(pud) (p4d_val(pud) & _REGION_ENTRY_ORIGIN) -#define pgd_deref(pgd) (pgd_val(pgd) & _REGION_ENTRY_ORIGIN) +#define p4d_deref(pud) ((unsigned long)__va(p4d_val(pud) & _REGION_ENTRY_ORIGIN)) +#define pgd_deref(pgd) ((unsigned long)__va(pgd_val(pgd) & _REGION_ENTRY_ORIGIN)) static inline unsigned long pmd_deref(pmd_t pmd) { @@ -1229,12 +1229,12 @@ static inline unsigned long pmd_deref(pmd_t pmd) origin_mask = _SEGMENT_ENTRY_ORIGIN; if (pmd_large(pmd)) origin_mask = _SEGMENT_ENTRY_ORIGIN_LARGE; - return pmd_val(pmd) & origin_mask; + return (unsigned long)__va(pmd_val(pmd) & origin_mask); } static inline unsigned long pmd_pfn(pmd_t pmd) { - return pmd_deref(pmd) >> PAGE_SHIFT; + return __pa(pmd_deref(pmd)) >> PAGE_SHIFT; } static inline unsigned long pud_deref(pud_t pud) @@ -1244,12 +1244,12 @@ static inline unsigned long pud_deref(pud_t pud) origin_mask = _REGION_ENTRY_ORIGIN; if (pud_large(pud)) origin_mask = _REGION3_ENTRY_ORIGIN_LARGE; - return pud_val(pud) & origin_mask; + return (unsigned long)__va(pud_val(pud) & origin_mask); } static inline unsigned long pud_pfn(pud_t pud) { - return pud_deref(pud) >> PAGE_SHIFT; + return __pa(pud_deref(pud)) >> PAGE_SHIFT; } /* @@ -1329,7 +1329,7 @@ static inline bool gup_fast_permitted(unsigned long start, unsigned long end) } #define gup_fast_permitted gup_fast_permitted -#define pfn_pte(pfn,pgprot) mk_pte_phys(__pa((pfn) << PAGE_SHIFT),(pgprot)) +#define pfn_pte(pfn, pgprot) mk_pte_phys(((pfn) << PAGE_SHIFT), (pgprot)) #define pte_pfn(x) (pte_val(x) >> PAGE_SHIFT) #define pte_page(x) pfn_to_page(pte_pfn(x)) @@ -1636,7 +1636,7 @@ static inline pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, } #define pmdp_collapse_flush pmdp_collapse_flush -#define pfn_pmd(pfn, pgprot) mk_pmd_phys(__pa((pfn) << PAGE_SHIFT), (pgprot)) +#define pfn_pmd(pfn, pgprot) mk_pmd_phys(((pfn) << PAGE_SHIFT), (pgprot)) #define mk_pmd(page, pgprot) pfn_pmd(page_to_pfn(page), (pgprot)) static inline int pmd_trans_huge(pmd_t pmd) diff --git a/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h b/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h new file mode 100644 index 000000000000..3d8284b95f87 --- /dev/null +++ b/arch/s390/include/uapi/asm/perf_cpum_cf_diag.h @@ -0,0 +1,51 @@ +/* SPDX-License-Identifier: GPL-2.0 WITH Linux-syscall-note */ +/* + * Copyright IBM Corp. 2021 + * Interface implementation for communication with the CPU Measurement + * counter facility device driver. + * + * Author(s): Thomas Richter + * + * Define for ioctl() commands to communicate with the CPU Measurement + * counter facility device driver. + */ + +#ifndef _PERF_CPUM_CF_DIAG_H +#define _PERF_CPUM_CF_DIAG_H + +#include +#include + +#define S390_HWCTR_DEVICE "hwctr" +#define S390_HWCTR_START_VERSION 1 + +struct s390_ctrset_start { /* Set CPUs to operate on */ + __u64 version; /* Version of interface */ + __u64 data_bytes; /* # of bytes required */ + __u64 cpumask_len; /* Length of CPU mask in bytes */ + __u64 *cpumask; /* Pointer to CPU mask */ + __u64 counter_sets; /* Bit mask of counter sets to get */ +}; + +struct s390_ctrset_setdata { /* Counter set data */ + __u32 set; /* Counter set number */ + __u32 no_cnts; /* # of counters stored in cv[] */ + __u64 cv[0]; /* Counter values (variable length) */ +}; + +struct s390_ctrset_cpudata { /* Counter set data per CPU */ + __u32 cpu_nr; /* CPU number */ + __u32 no_sets; /* # of counters sets in data[] */ + struct s390_ctrset_setdata data[0]; +}; + +struct s390_ctrset_read { /* Structure to get all ctr sets */ + __u64 no_cpus; /* Total # of CPUs data taken from */ + struct s390_ctrset_cpudata data[0]; +}; + +#define S390_HWCTR_MAGIC 'C' /* Random magic # for ioctls */ +#define S390_HWCTR_START _IOWR(S390_HWCTR_MAGIC, 1, struct s390_ctrset_start) +#define S390_HWCTR_STOP _IO(S390_HWCTR_MAGIC, 2) +#define S390_HWCTR_READ _IOWR(S390_HWCTR_MAGIC, 3, struct s390_ctrset_read) +#endif diff --git a/arch/s390/kernel/perf_cpum_cf_diag.c b/arch/s390/kernel/perf_cpum_cf_diag.c index b5c86fb70d63..db4877bbb9aa 100644 --- a/arch/s390/kernel/perf_cpum_cf_diag.c +++ b/arch/s390/kernel/perf_cpum_cf_diag.c @@ -2,7 +2,7 @@ /* * Performance event support for s390x - CPU-measurement Counter Sets * - * Copyright IBM Corp. 2019 + * Copyright IBM Corp. 2019, 2021 * Author(s): Hendrik Brueckner * Thomas Richer */ @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include @@ -24,15 +26,20 @@ #include #include -#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ +#include +#define CF_DIAG_CTRSET_DEF 0xfeef /* Counter set header mark */ +#define CF_DIAG_MIN_INTERVAL 60 /* Minimum counter set read */ + /* interval in seconds */ +static unsigned long cf_diag_interval = CF_DIAG_MIN_INTERVAL; static unsigned int cf_diag_cpu_speed; static debug_info_t *cf_diag_dbg; -struct cf_diag_csd { /* Counter set data per CPU */ +struct cf_diag_csd { /* Counter set data per CPU */ size_t used; /* Bytes used in data/start */ unsigned char start[PAGE_SIZE]; /* Counter set at event start */ unsigned char data[PAGE_SIZE]; /* Counter set at event delete */ + unsigned int sets; /* # Counter set saved in data */ }; static DEFINE_PER_CPU(struct cf_diag_csd, cf_diag_csd); @@ -178,18 +185,35 @@ static void cf_diag_disable(struct pmu *pmu) /* Number of perf events counting hardware events */ static atomic_t cf_diag_events = ATOMIC_INIT(0); +/* Used to avoid races in calling reserve/release_cpumf_hardware */ +static DEFINE_MUTEX(cf_diag_reserve_mutex); /* Release the PMU if event is the last perf event */ static void cf_diag_perf_event_destroy(struct perf_event *event) { debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d cf_diag_events %d\n", - __func__, event, event->cpu, + __func__, event, smp_processor_id(), atomic_read(&cf_diag_events)); if (atomic_dec_return(&cf_diag_events) == 0) __kernel_cpumcf_end(); } +static int get_authctrsets(void) +{ + struct cpu_cf_events *cpuhw; + unsigned long auth = 0; + enum cpumf_ctr_set i; + + cpuhw = &get_cpu_var(cpu_cf_events); + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) + auth |= cpumf_ctr_ctl[i]; + } + put_cpu_var(cpu_cf_events); + return auth; +} + /* Setup the event. Test for authorized counter sets and only include counter * sets which are authorized at the time of the setup. Including unauthorized * counter sets result in specification exception (and panic). @@ -197,15 +221,12 @@ static void cf_diag_perf_event_destroy(struct perf_event *event) static int __hw_perf_event_init(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; - struct cpu_cf_events *cpuhw; - enum cpumf_ctr_set i; int err = 0; debug_sprintf_event(cf_diag_dbg, 5, "%s event %p cpu %d\n", __func__, event, event->cpu); event->hw.config = attr->config; - event->hw.config_base = 0; /* Add all authorized counter sets to config_base. The * the hardware init function is either called per-cpu or just once @@ -215,11 +236,7 @@ static int __hw_perf_event_init(struct perf_event *event) * Checking the authorization on any CPU is fine as the hardware * applies the same authorization settings to all CPUs. */ - cpuhw = &get_cpu_var(cpu_cf_events); - for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) - if (cpuhw->info.auth_ctl & cpumf_ctr_ctl[i]) - event->hw.config_base |= cpumf_ctr_ctl[i]; - put_cpu_var(cpu_cf_events); + event->hw.config_base = get_authctrsets(); /* No authorized counter sets, nothing to count/sample */ if (!event->hw.config_base) { @@ -237,6 +254,25 @@ out: return err; } +/* Return 0 if the CPU-measurement counter facility is currently free + * and an error otherwise. + */ +static int cf_diag_perf_event_inuse(void) +{ + int err = 0; + + if (!atomic_inc_not_zero(&cf_diag_events)) { + mutex_lock(&cf_diag_reserve_mutex); + if (atomic_read(&cf_diag_events) == 0 && + __kernel_cpumcf_begin()) + err = -EBUSY; + else + err = atomic_inc_return(&cf_diag_events); + mutex_unlock(&cf_diag_reserve_mutex); + } + return err; +} + static int cf_diag_event_init(struct perf_event *event) { struct perf_event_attr *attr = &event->attr; @@ -264,13 +300,9 @@ static int cf_diag_event_init(struct perf_event *event) } /* Initialize for using the CPU-measurement counter facility */ - if (atomic_inc_return(&cf_diag_events) == 1) { - if (__kernel_cpumcf_begin()) { - atomic_dec(&cf_diag_events); - err = -EBUSY; - goto out; - } - } + err = cf_diag_perf_event_inuse(); + if (err < 0) + goto out; event->destroy = cf_diag_perf_event_destroy; err = __hw_perf_event_init(event); @@ -599,6 +631,8 @@ static void cf_diag_del(struct perf_event *event, int flags) cpuhw->flags &= ~PMU_F_IN_USE; } +/* Default counter set events and format attribute groups */ + CPUMF_EVENT_ATTR(CF_DIAG, CF_DIAG, PERF_EVENT_CPUM_CF_DIAG); static struct attribute *cf_diag_events_attr[] = { @@ -663,6 +697,452 @@ static void cf_diag_get_cpu_speed(void) } } +/* Code to create device and file I/O operations */ +static atomic_t ctrset_opencnt = ATOMIC_INIT(0); /* Excl. access */ + +static int cf_diag_open(struct inode *inode, struct file *file) +{ + int err = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (atomic_xchg(&ctrset_opencnt, 1)) + return -EBUSY; + + /* Avoid concurrent access with perf_event_open() system call */ + mutex_lock(&cf_diag_reserve_mutex); + if (atomic_read(&cf_diag_events) || __kernel_cpumcf_begin()) + err = -EBUSY; + mutex_unlock(&cf_diag_reserve_mutex); + if (err) { + atomic_set(&ctrset_opencnt, 0); + return err; + } + file->private_data = NULL; + debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__); + /* nonseekable_open() never fails */ + return nonseekable_open(inode, file); +} + +/* Variables for ioctl() interface support */ +static DEFINE_MUTEX(cf_diag_ctrset_mutex); +static struct cf_diag_ctrset { + unsigned long ctrset; /* Bit mask of counter set to read */ + cpumask_t mask; /* CPU mask to read from */ + time64_t lastread; /* Epoch counter set last read */ +} cf_diag_ctrset; + +static void cf_diag_ctrset_clear(void) +{ + cpumask_clear(&cf_diag_ctrset.mask); + cf_diag_ctrset.ctrset = 0; +} + +static void cf_diag_release_cpu(void *p) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + + debug_sprintf_event(cf_diag_dbg, 3, "%s cpu %d\n", __func__, + smp_processor_id()); + lcctl(0); /* Reset counter sets */ + cpuhw->state = 0; /* Save state in CPU hardware state */ +} + +/* Release function is also called when application gets terminated without + * doing a proper ioctl(..., S390_HWCTR_STOP, ...) command. + * Since only one application is allowed to open the device, simple stop all + * CPU counter sets. + */ +static int cf_diag_release(struct inode *inode, struct file *file) +{ + on_each_cpu(cf_diag_release_cpu, NULL, 1); + cf_diag_ctrset_clear(); + atomic_set(&ctrset_opencnt, 0); + __kernel_cpumcf_end(); + debug_sprintf_event(cf_diag_dbg, 2, "%s\n", __func__); + return 0; +} + +struct cf_diag_call_on_cpu_parm { /* Parm struct for smp_call_on_cpu */ + unsigned int sets; /* Counter set bit mask */ + atomic_t cpus_ack; /* # CPUs successfully executed func */ +}; + +static int cf_diag_all_copy(unsigned long arg, cpumask_t *mask) +{ + struct s390_ctrset_read __user *ctrset_read; + unsigned int cpu, cpus, rc; + void __user *uptr; + + ctrset_read = (struct s390_ctrset_read __user *)arg; + uptr = ctrset_read->data; + for_each_cpu(cpu, mask) { + struct cf_diag_csd *csd = per_cpu_ptr(&cf_diag_csd, cpu); + struct s390_ctrset_cpudata __user *ctrset_cpudata; + + ctrset_cpudata = uptr; + debug_sprintf_event(cf_diag_dbg, 5, "%s cpu %d used %zd\n", + __func__, cpu, csd->used); + rc = put_user(cpu, &ctrset_cpudata->cpu_nr); + rc |= put_user(csd->sets, &ctrset_cpudata->no_sets); + rc |= copy_to_user(ctrset_cpudata->data, csd->data, csd->used); + if (rc) + return -EFAULT; + uptr += sizeof(struct s390_ctrset_cpudata) + csd->used; + cond_resched(); + } + cpus = cpumask_weight(mask); + if (put_user(cpus, &ctrset_read->no_cpus)) + return -EFAULT; + debug_sprintf_event(cf_diag_dbg, 5, "%s copied %ld\n", + __func__, uptr - (void __user *)ctrset_read->data); + return 0; +} + +static size_t cf_diag_cpuset_read(struct s390_ctrset_setdata *p, int ctrset, + int ctrset_size, size_t room) +{ + size_t need = 0; + int rc = -1; + + need = sizeof(*p) + sizeof(u64) * ctrset_size; + debug_sprintf_event(cf_diag_dbg, 5, + "%s room %zd need %zd set %#x set_size %d\n", + __func__, room, need, ctrset, ctrset_size); + if (need <= room) { + p->set = cpumf_ctr_ctl[ctrset]; + p->no_cnts = ctrset_size; + rc = ctr_stcctm(ctrset, ctrset_size, (u64 *)p->cv); + if (rc == 3) /* Nothing stored */ + need = 0; + } + debug_sprintf_event(cf_diag_dbg, 5, "%s need %zd rc %d\n", __func__, + need, rc); + return need; +} + +/* Read all counter sets. Since the perf_event_open() system call with + * event cpum_cf_diag/.../ is blocked when this interface is active, reuse + * the perf_event_open() data buffer to store the counter sets. + */ +static void cf_diag_cpu_read(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cf_diag_csd *csd = this_cpu_ptr(&cf_diag_csd); + struct cf_diag_call_on_cpu_parm *p = parm; + int set, set_size; + size_t space; + + debug_sprintf_event(cf_diag_dbg, 5, + "%s new %#x flags %#x state %#llx\n", + __func__, p->sets, cpuhw->flags, + cpuhw->state); + /* No data saved yet */ + csd->used = 0; + csd->sets = 0; + memset(csd->data, 0, sizeof(csd->data)); + + /* Scan the counter sets */ + for (set = CPUMF_CTR_SET_BASIC; set < CPUMF_CTR_SET_MAX; ++set) { + struct s390_ctrset_setdata *sp = (void *)csd->data + csd->used; + + if (!(p->sets & cpumf_ctr_ctl[set])) + continue; /* Counter set not in list */ + set_size = cf_diag_ctrset_size(set, &cpuhw->info); + space = sizeof(csd->data) - csd->used; + space = cf_diag_cpuset_read(sp, set, set_size, space); + if (space) { + csd->used += space; + csd->sets += 1; + } + debug_sprintf_event(cf_diag_dbg, 5, "%s sp %px space %zd\n", + __func__, sp, space); + } + debug_sprintf_event(cf_diag_dbg, 5, "%s sets %d used %zd\n", __func__, + csd->sets, csd->used); +} + +static int cf_diag_all_read(unsigned long arg) +{ + struct cf_diag_call_on_cpu_parm p; + cpumask_var_t mask; + time64_t now; + int rc = 0; + + debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + now = ktime_get_seconds(); + if (cf_diag_ctrset.lastread + cf_diag_interval > now) { + debug_sprintf_event(cf_diag_dbg, 5, "%s now %lld " + " lastread %lld\n", __func__, now, + cf_diag_ctrset.lastread); + rc = -EAGAIN; + goto out; + } else { + cf_diag_ctrset.lastread = now; + } + p.sets = cf_diag_ctrset.ctrset; + cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); + on_each_cpu_mask(mask, cf_diag_cpu_read, &p, 1); + rc = cf_diag_all_copy(arg, mask); +out: + free_cpumask_var(mask); + debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d\n", __func__, rc); + return rc; +} + +/* Stop all counter sets via ioctl interface */ +static void cf_diag_ioctl_off(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cf_diag_call_on_cpu_parm *p = parm; + int rc; + + debug_sprintf_event(cf_diag_dbg, 5, + "%s new %#x flags %#x state %#llx\n", + __func__, p->sets, cpuhw->flags, + cpuhw->state); + + ctr_set_multiple_disable(&cpuhw->state, p->sets); + ctr_set_multiple_stop(&cpuhw->state, p->sets); + rc = lcctl(cpuhw->state); /* Stop counter sets */ + if (!cpuhw->state) + cpuhw->flags &= ~PMU_F_IN_USE; + debug_sprintf_event(cf_diag_dbg, 5, + "%s rc %d flags %#x state %#llx\n", __func__, + rc, cpuhw->flags, cpuhw->state); +} + +/* Start counter sets on particular CPU */ +static void cf_diag_ioctl_on(void *parm) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + struct cf_diag_call_on_cpu_parm *p = parm; + int rc; + + debug_sprintf_event(cf_diag_dbg, 5, + "%s new %#x flags %#x state %#llx\n", + __func__, p->sets, cpuhw->flags, + cpuhw->state); + + if (!(cpuhw->flags & PMU_F_IN_USE)) + cpuhw->state = 0; + cpuhw->flags |= PMU_F_IN_USE; + rc = lcctl(cpuhw->state); /* Reset unused counter sets */ + ctr_set_multiple_enable(&cpuhw->state, p->sets); + ctr_set_multiple_start(&cpuhw->state, p->sets); + rc |= lcctl(cpuhw->state); /* Start counter sets */ + if (!rc) + atomic_inc(&p->cpus_ack); + debug_sprintf_event(cf_diag_dbg, 5, "%s rc %d state %#llx\n", + __func__, rc, cpuhw->state); +} + +static int cf_diag_all_stop(void) +{ + struct cf_diag_call_on_cpu_parm p = { + .sets = cf_diag_ctrset.ctrset, + }; + cpumask_var_t mask; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); + on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1); + free_cpumask_var(mask); + return 0; +} + +static int cf_diag_all_start(void) +{ + struct cf_diag_call_on_cpu_parm p = { + .sets = cf_diag_ctrset.ctrset, + .cpus_ack = ATOMIC_INIT(0), + }; + cpumask_var_t mask; + int rc = 0; + + if (!alloc_cpumask_var(&mask, GFP_KERNEL)) + return -ENOMEM; + cpumask_and(mask, &cf_diag_ctrset.mask, cpu_online_mask); + on_each_cpu_mask(mask, cf_diag_ioctl_on, &p, 1); + if (atomic_read(&p.cpus_ack) != cpumask_weight(mask)) { + on_each_cpu_mask(mask, cf_diag_ioctl_off, &p, 1); + rc = -EIO; + } + free_cpumask_var(mask); + return rc; +} + +/* Return the maximum required space for all possible CPUs in case one + * CPU will be onlined during the START, READ, STOP cycles. + * To find out the size of the counter sets, any one CPU will do. They + * all have the same counter sets. + */ +static size_t cf_diag_needspace(unsigned int sets) +{ + struct cpu_cf_events *cpuhw = this_cpu_ptr(&cpu_cf_events); + size_t bytes = 0; + int i; + + for (i = CPUMF_CTR_SET_BASIC; i < CPUMF_CTR_SET_MAX; ++i) { + if (!(sets & cpumf_ctr_ctl[i])) + continue; + bytes += cf_diag_ctrset_size(i, &cpuhw->info) * sizeof(u64) + + sizeof(((struct s390_ctrset_setdata *)0)->set) + + sizeof(((struct s390_ctrset_setdata *)0)->no_cnts); + } + bytes = sizeof(((struct s390_ctrset_read *)0)->no_cpus) + nr_cpu_ids * + (bytes + sizeof(((struct s390_ctrset_cpudata *)0)->cpu_nr) + + sizeof(((struct s390_ctrset_cpudata *)0)->no_sets)); + debug_sprintf_event(cf_diag_dbg, 5, "%s bytes %ld\n", __func__, + bytes); + return bytes; +} + +static long cf_diag_ioctl_read(unsigned long arg) +{ + struct s390_ctrset_read read; + int ret = 0; + + debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); + if (copy_from_user(&read, (char __user *)arg, sizeof(read))) + return -EFAULT; + ret = cf_diag_all_read(arg); + debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret); + return ret; +} + +static long cf_diag_ioctl_stop(void) +{ + int ret; + + debug_sprintf_event(cf_diag_dbg, 5, "%s\n", __func__); + ret = cf_diag_all_stop(); + cf_diag_ctrset_clear(); + debug_sprintf_event(cf_diag_dbg, 5, "%s ret %d\n", __func__, ret); + return ret; +} + +static long cf_diag_ioctl_start(unsigned long arg) +{ + struct s390_ctrset_start __user *ustart; + struct s390_ctrset_start start; + void __user *umask; + unsigned int len; + int ret = 0; + size_t need; + + if (cf_diag_ctrset.ctrset) + return -EBUSY; + ustart = (struct s390_ctrset_start __user *)arg; + if (copy_from_user(&start, ustart, sizeof(start))) + return -EFAULT; + if (start.version != S390_HWCTR_START_VERSION) + return -EINVAL; + if (start.counter_sets & ~(cpumf_ctr_ctl[CPUMF_CTR_SET_BASIC] | + cpumf_ctr_ctl[CPUMF_CTR_SET_USER] | + cpumf_ctr_ctl[CPUMF_CTR_SET_CRYPTO] | + cpumf_ctr_ctl[CPUMF_CTR_SET_EXT] | + cpumf_ctr_ctl[CPUMF_CTR_SET_MT_DIAG])) + return -EINVAL; /* Invalid counter set */ + if (!start.counter_sets) + return -EINVAL; /* No counter set at all? */ + cpumask_clear(&cf_diag_ctrset.mask); + len = min_t(u64, start.cpumask_len, cpumask_size()); + umask = (void __user *)start.cpumask; + if (copy_from_user(&cf_diag_ctrset.mask, umask, len)) + return -EFAULT; + if (cpumask_empty(&cf_diag_ctrset.mask)) + return -EINVAL; + need = cf_diag_needspace(start.counter_sets); + if (put_user(need, &ustart->data_bytes)) + ret = -EFAULT; + if (ret) + goto out; + cf_diag_ctrset.ctrset = start.counter_sets; + ret = cf_diag_all_start(); +out: + if (ret) + cf_diag_ctrset_clear(); + debug_sprintf_event(cf_diag_dbg, 2, "%s sets %#lx need %ld ret %d\n", + __func__, cf_diag_ctrset.ctrset, need, ret); + return ret; +} + +static long cf_diag_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + int ret; + + debug_sprintf_event(cf_diag_dbg, 2, "%s cmd %#x arg %lx\n", __func__, + cmd, arg); + get_online_cpus(); + mutex_lock(&cf_diag_ctrset_mutex); + switch (cmd) { + case S390_HWCTR_START: + ret = cf_diag_ioctl_start(arg); + break; + case S390_HWCTR_STOP: + ret = cf_diag_ioctl_stop(); + break; + case S390_HWCTR_READ: + ret = cf_diag_ioctl_read(arg); + break; + default: + ret = -ENOTTY; + break; + } + mutex_unlock(&cf_diag_ctrset_mutex); + put_online_cpus(); + debug_sprintf_event(cf_diag_dbg, 2, "%s ret %d\n", __func__, ret); + return ret; +} + +static const struct file_operations cf_diag_fops = { + .owner = THIS_MODULE, + .open = cf_diag_open, + .release = cf_diag_release, + .unlocked_ioctl = cf_diag_ioctl, + .compat_ioctl = cf_diag_ioctl, + .llseek = no_llseek +}; + +static struct miscdevice cf_diag_dev = { + .name = S390_HWCTR_DEVICE, + .minor = MISC_DYNAMIC_MINOR, + .fops = &cf_diag_fops, +}; + +static int cf_diag_online_cpu(unsigned int cpu) +{ + struct cf_diag_call_on_cpu_parm p; + + mutex_lock(&cf_diag_ctrset_mutex); + if (!cf_diag_ctrset.ctrset) + goto out; + p.sets = cf_diag_ctrset.ctrset; + cf_diag_ioctl_on(&p); +out: + mutex_unlock(&cf_diag_ctrset_mutex); + return 0; +} + +static int cf_diag_offline_cpu(unsigned int cpu) +{ + struct cf_diag_call_on_cpu_parm p; + + mutex_lock(&cf_diag_ctrset_mutex); + if (!cf_diag_ctrset.ctrset) + goto out; + p.sets = cf_diag_ctrset.ctrset; + cf_diag_ioctl_off(&p); +out: + mutex_unlock(&cf_diag_ctrset_mutex); + return 0; +} + /* Initialize the counter set PMU to generate complete counter set data as * event raw data. This relies on the CPU Measurement Counter Facility device * already being loaded and initialized. @@ -685,21 +1165,43 @@ static int __init cf_diag_init(void) return -ENOMEM; } + rc = misc_register(&cf_diag_dev); + if (rc) { + pr_err("Registration of /dev/" S390_HWCTR_DEVICE + "failed rc=%d\n", rc); + goto out; + } + /* Setup s390dbf facility */ cf_diag_dbg = debug_register(KMSG_COMPONENT, 2, 1, 128); if (!cf_diag_dbg) { pr_err("Registration of s390dbf(cpum_cf_diag) failed\n"); - return -ENOMEM; + rc = -ENOMEM; + goto out_dbf; } debug_register_view(cf_diag_dbg, &debug_sprintf_view); rc = perf_pmu_register(&cf_diag, "cpum_cf_diag", -1); if (rc) { - debug_unregister_view(cf_diag_dbg, &debug_sprintf_view); - debug_unregister(cf_diag_dbg); pr_err("Registration of PMU(cpum_cf_diag) failed with rc=%i\n", rc); + goto out_perf; } + rc = cpuhp_setup_state_nocalls(CPUHP_AP_PERF_S390_CFD_ONLINE, + "perf/s390/cfd:online", + cf_diag_online_cpu, cf_diag_offline_cpu); + if (!rc) + goto out; + + pr_err("Registration of CPUHP_AP_PERF_S390_CFD_ONLINE failed rc=%i\n", + rc); + perf_pmu_unregister(&cf_diag); +out_perf: + debug_unregister_view(cf_diag_dbg, &debug_sprintf_view); + debug_unregister(cf_diag_dbg); +out_dbf: + misc_deregister(&cf_diag_dev); +out: return rc; } -arch_initcall(cf_diag_init); +device_initcall(cf_diag_init); diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c index 367bd000f6d1..e20bed1ed34a 100644 --- a/arch/s390/kernel/process.c +++ b/arch/s390/kernel/process.c @@ -130,7 +130,7 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp, frame->sf.gprs[9] = (unsigned long)frame; /* Store access registers to kernel stack of new process. */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { /* kernel thread */ memset(&frame->childregs, 0, sizeof(struct pt_regs)); frame->childregs.psw.mask = PSW_KERNEL_BITS | PSW_MASK_DAT | diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c index e299892440b6..58c8afa3da65 100644 --- a/arch/s390/kernel/smp.c +++ b/arch/s390/kernel/smp.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -62,6 +63,7 @@ enum { ec_call_function_single, ec_stop_cpu, ec_mcck_pending, + ec_irq_work, }; enum { @@ -434,10 +436,12 @@ void notrace smp_yield_cpu(int cpu) */ void notrace smp_emergency_stop(void) { - cpumask_t cpumask; + static arch_spinlock_t lock = __ARCH_SPIN_LOCK_UNLOCKED; + static cpumask_t cpumask; u64 end; int cpu; + arch_spin_lock(&lock); cpumask_copy(&cpumask, cpu_online_mask); cpumask_clear_cpu(smp_processor_id(), &cpumask); @@ -458,6 +462,7 @@ void notrace smp_emergency_stop(void) break; cpu_relax(); } + arch_spin_unlock(&lock); } NOKPROBE_SYMBOL(smp_emergency_stop); @@ -505,6 +510,8 @@ static void smp_handle_ext_call(void) generic_smp_call_function_single_interrupt(); if (test_bit(ec_mcck_pending, &bits)) __s390_handle_mcck(); + if (test_bit(ec_irq_work, &bits)) + irq_work_run(); } static void do_ext_call_interrupt(struct ext_code ext_code, @@ -537,6 +544,13 @@ void smp_send_reschedule(int cpu) pcpu_ec_call(pcpu_devices + cpu, ec_schedule); } +#ifdef CONFIG_IRQ_WORK +void arch_irq_work_raise(void) +{ + pcpu_ec_call(pcpu_devices + smp_processor_id(), ec_irq_work); +} +#endif + /* * parameter area for the set/clear control bit callbacks */ @@ -775,11 +789,13 @@ static int smp_add_core(struct sclp_core_entry *core, cpumask_t *avail, static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) { struct sclp_core_entry *core; - cpumask_t avail; + static cpumask_t avail; bool configured; u16 core_id; int nr, i; + get_online_cpus(); + mutex_lock(&smp_cpu_state_mutex); nr = 0; cpumask_xor(&avail, cpu_possible_mask, cpu_present_mask); /* @@ -800,6 +816,8 @@ static int __smp_rescan_cpus(struct sclp_core_info *info, bool early) configured = i < info->configured; nr += smp_add_core(&info->core[i], &avail, configured, early); } + mutex_unlock(&smp_cpu_state_mutex); + put_online_cpus(); return nr; } @@ -847,9 +865,7 @@ void __init smp_detect_cpus(void) pr_info("%d configured CPUs, %d standby CPUs\n", c_cpus, s_cpus); /* Add CPUs present at boot */ - get_online_cpus(); __smp_rescan_cpus(info, true); - put_online_cpus(); memblock_free_early((unsigned long)info, sizeof(*info)); } @@ -1178,11 +1194,7 @@ int __ref smp_rescan_cpus(void) if (!info) return -ENOMEM; smp_get_core_info(info, 0); - get_online_cpus(); - mutex_lock(&smp_cpu_state_mutex); nr = __smp_rescan_cpus(info, false); - mutex_unlock(&smp_cpu_state_mutex); - put_online_cpus(); kfree(info); if (nr) topology_schedule_update(); diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c index ca47141a5be9..e7ce447651b9 100644 --- a/arch/s390/kernel/topology.c +++ b/arch/s390/kernel/topology.c @@ -62,16 +62,16 @@ static struct mask_info drawer_info; struct cpu_topology_s390 cpu_topology[NR_CPUS]; EXPORT_SYMBOL_GPL(cpu_topology); -static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) +static void cpu_group_map(cpumask_t *dst, struct mask_info *info, unsigned int cpu) { - cpumask_t mask; + static cpumask_t mask; cpumask_copy(&mask, cpumask_of(cpu)); switch (topology_mode) { case TOPOLOGY_MODE_HW: while (info) { if (cpumask_test_cpu(cpu, &info->mask)) { - mask = info->mask; + cpumask_copy(&mask, &info->mask); break; } info = info->next; @@ -89,23 +89,24 @@ static cpumask_t cpu_group_map(struct mask_info *info, unsigned int cpu) break; } cpumask_and(&mask, &mask, cpu_online_mask); - return mask; + cpumask_copy(dst, &mask); } -static cpumask_t cpu_thread_map(unsigned int cpu) +static void cpu_thread_map(cpumask_t *dst, unsigned int cpu) { - cpumask_t mask; + static cpumask_t mask; int i; cpumask_copy(&mask, cpumask_of(cpu)); if (topology_mode != TOPOLOGY_MODE_HW) - return mask; + goto out; cpu -= cpu % (smp_cpu_mtid + 1); for (i = 0; i <= smp_cpu_mtid; i++) if (cpu_present(cpu + i)) cpumask_set_cpu(cpu + i, &mask); cpumask_and(&mask, &mask, cpu_online_mask); - return mask; +out: + cpumask_copy(dst, &mask); } #define TOPOLOGY_CORE_BITS 64 @@ -250,10 +251,10 @@ void update_cpu_masks(void) for_each_possible_cpu(cpu) { topo = &cpu_topology[cpu]; - topo->thread_mask = cpu_thread_map(cpu); - topo->core_mask = cpu_group_map(&socket_info, cpu); - topo->book_mask = cpu_group_map(&book_info, cpu); - topo->drawer_mask = cpu_group_map(&drawer_info, cpu); + cpu_thread_map(&topo->thread_mask, cpu); + cpu_group_map(&topo->core_mask, &socket_info, cpu); + cpu_group_map(&topo->book_mask, &book_info, cpu); + cpu_group_map(&topo->drawer_mask, &drawer_info, cpu); topo->booted_cores = 0; if (topology_mode != TOPOLOGY_MODE_HW) { id = topology_mode == TOPOLOGY_MODE_PACKAGE ? 0 : cpu; diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 4e87c819ddea..781965f7210e 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -58,7 +58,7 @@ unsigned long *crst_table_alloc(struct mm_struct *mm) if (!page) return NULL; arch_set_page_dat(page, 2); - return (unsigned long *) page_to_phys(page); + return (unsigned long *) page_to_virt(page); } void crst_table_free(struct mm_struct *mm, unsigned long *table) @@ -161,7 +161,7 @@ struct page *page_table_alloc_pgste(struct mm_struct *mm) page = alloc_page(GFP_KERNEL); if (page) { - table = (u64 *)page_to_phys(page); + table = (u64 *)page_to_virt(page); memset64(table, _PAGE_INVALID, PTRS_PER_PTE); memset64(table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } @@ -194,7 +194,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) mask = atomic_read(&page->_refcount) >> 24; mask = (mask | (mask >> 4)) & 3; if (mask != 3) { - table = (unsigned long *) page_to_phys(page); + table = (unsigned long *) page_to_virt(page); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; @@ -217,7 +217,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm) } arch_set_page_dat(page, 0); /* Initialize page table */ - table = (unsigned long *) page_to_phys(page); + table = (unsigned long *) page_to_virt(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ atomic_xor_bits(&page->_refcount, 3 << 24); @@ -239,10 +239,10 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) struct page *page; unsigned int bit, mask; - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + page = virt_to_page(table); if (!mm_alloc_pgste(mm)) { /* Free 2K page table fragment of a 4K page */ - bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); + bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); mask >>= 24; @@ -269,14 +269,14 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, unsigned int bit, mask; mm = tlb->mm; - page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + page = virt_to_page(table); if (mm_alloc_pgste(mm)) { gmap_unlink(mm, table, vmaddr); - table = (unsigned long *) (__pa(table) | 3); + table = (unsigned long *) ((unsigned long)table | 3); tlb_remove_table(tlb, table); return; } - bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); + bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); mask >>= 24; @@ -285,7 +285,7 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, else list_del(&page->lru); spin_unlock_bh(&mm->context.lock); - table = (unsigned long *) (__pa(table) | (1U << bit)); + table = (unsigned long *) ((unsigned long) table | (1U << bit)); tlb_remove_table(tlb, table); } @@ -293,7 +293,7 @@ void __tlb_remove_table(void *_table) { unsigned int mask = (unsigned long) _table & 3; void *table = (void *)((unsigned long) _table ^ mask); - struct page *page = pfn_to_page(__pa(table) >> PAGE_SHIFT); + struct page *page = virt_to_page(table); switch (mask) { case 0: /* pmd, pud, or p4d */ diff --git a/arch/s390/mm/vmem.c b/arch/s390/mm/vmem.c index 82dbf9450105..96897fab89dc 100644 --- a/arch/s390/mm/vmem.c +++ b/arch/s390/mm/vmem.c @@ -27,14 +27,14 @@ static void __ref *vmem_alloc_pages(unsigned int order) if (slab_is_available()) return (void *)__get_free_pages(GFP_KERNEL, order); - return (void *) memblock_phys_alloc(size, size); + return memblock_alloc(size, size); } static void vmem_free_pages(unsigned long addr, int order) { /* We don't expect boot memory to be removed ever. */ if (!slab_is_available() || - WARN_ON_ONCE(PageReserved(phys_to_page(addr)))) + WARN_ON_ONCE(PageReserved(virt_to_page(addr)))) return; free_pages(addr, order); } @@ -57,7 +57,7 @@ pte_t __ref *vmem_pte_alloc(void) if (slab_is_available()) pte = (pte_t *) page_table_alloc(&init_mm); else - pte = (pte_t *) memblock_phys_alloc(size, size); + pte = (pte_t *) memblock_alloc(size, size); if (!pte) return NULL; memset64((u64 *)pte, _PAGE_INVALID, PTRS_PER_PTE); @@ -85,7 +85,7 @@ static void vmemmap_flush_unused_sub_pmd(void) { if (!unused_sub_pmd_start) return; - memset(__va(unused_sub_pmd_start), PAGE_UNUSED, + memset((void *)unused_sub_pmd_start, PAGE_UNUSED, ALIGN(unused_sub_pmd_start, PMD_SIZE) - unused_sub_pmd_start); unused_sub_pmd_start = 0; } @@ -98,7 +98,7 @@ static void vmemmap_mark_sub_pmd_used(unsigned long start, unsigned long end) * getting removed (just in case the memmap never gets initialized, * e.g., because the memory block never gets onlined). */ - memset(__va(start), 0, sizeof(struct page)); + memset((void *)start, 0, sizeof(struct page)); } static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) @@ -119,7 +119,7 @@ static void vmemmap_use_sub_pmd(unsigned long start, unsigned long end) static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) { - void *page = __va(ALIGN_DOWN(start, PMD_SIZE)); + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); vmemmap_flush_unused_sub_pmd(); @@ -128,7 +128,7 @@ static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) /* Mark the unused parts of the new memmap page PAGE_UNUSED. */ if (!IS_ALIGNED(start, PMD_SIZE)) - memset(page, PAGE_UNUSED, start - __pa(page)); + memset((void *)page, PAGE_UNUSED, start - page); /* * We want to avoid memset(PAGE_UNUSED) when populating the vmemmap of * consecutive sections. Remember for the last added PMD the last @@ -141,11 +141,11 @@ static void vmemmap_use_new_sub_pmd(unsigned long start, unsigned long end) /* Returns true if the PMD is completely unused and can be freed. */ static bool vmemmap_unuse_sub_pmd(unsigned long start, unsigned long end) { - void *page = __va(ALIGN_DOWN(start, PMD_SIZE)); + unsigned long page = ALIGN_DOWN(start, PMD_SIZE); vmemmap_flush_unused_sub_pmd(); - memset(__va(start), PAGE_UNUSED, end - start); - return !memchr_inv(page, PAGE_UNUSED, PMD_SIZE); + memset((void *)start, PAGE_UNUSED, end - start); + return !memchr_inv((void *)page, PAGE_UNUSED, PMD_SIZE); } /* __ref: we'll only call vmemmap_alloc_block() via vmemmap_populate() */ @@ -166,7 +166,7 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, if (pte_none(*pte)) continue; if (!direct) - vmem_free_pages(pfn_to_phys(pte_pfn(*pte)), 0); + vmem_free_pages((unsigned long) pfn_to_virt(pte_pfn(*pte)), 0); pte_clear(&init_mm, addr, pte); } else if (pte_none(*pte)) { if (!direct) { @@ -176,7 +176,7 @@ static int __ref modify_pte_table(pmd_t *pmd, unsigned long addr, goto out; pte_val(*pte) = __pa(new_page) | prot; } else { - pte_val(*pte) = addr | prot; + pte_val(*pte) = __pa(addr) | prot; } } else { continue; @@ -201,7 +201,7 @@ static void try_free_pte_table(pmd_t *pmd, unsigned long start) if (!pte_none(*pte)) return; } - vmem_pte_free(__va(pmd_deref(*pmd))); + vmem_pte_free((unsigned long *) pmd_deref(*pmd)); pmd_clear(pmd); } @@ -242,7 +242,7 @@ static int __ref modify_pmd_table(pud_t *pud, unsigned long addr, IS_ALIGNED(next, PMD_SIZE) && MACHINE_HAS_EDAT1 && addr && direct && !debug_pagealloc_enabled()) { - pmd_val(*pmd) = addr | prot; + pmd_val(*pmd) = __pa(addr) | prot; pages++; continue; } else if (!direct && MACHINE_HAS_EDAT1) { @@ -338,7 +338,7 @@ static int modify_pud_table(p4d_t *p4d, unsigned long addr, unsigned long end, IS_ALIGNED(next, PUD_SIZE) && MACHINE_HAS_EDAT2 && addr && direct && !debug_pagealloc_enabled()) { - pud_val(*pud) = addr | prot; + pud_val(*pud) = __pa(addr) | prot; pages++; continue; } diff --git a/arch/s390/tools/opcodes.txt b/arch/s390/tools/opcodes.txt index 46d8ed96cf06..0e207c46e8da 100644 --- a/arch/s390/tools/opcodes.txt +++ b/arch/s390/tools/opcodes.txt @@ -597,7 +597,7 @@ b9b3 cu42 RRE_RR b9bd trtre RRF_U0RR b9be srstu RRE_RR b9bf trte RRF_U0RR -b9c0 selhhhr RRF_RURR +b9c0 selfhr RRF_RURR b9c8 ahhhr RRF_R0RR2 b9c9 shhhr RRF_R0RR2 b9ca alhhhr RRF_R0RR2 diff --git a/arch/sh/boards/mach-landisk/gio.c b/arch/sh/boards/mach-landisk/gio.c index 1c0da99dfc60..ff2200fec29a 100644 --- a/arch/sh/boards/mach-landisk/gio.c +++ b/arch/sh/boards/mach-landisk/gio.c @@ -27,11 +27,10 @@ static int openCnt; static int gio_open(struct inode *inode, struct file *filp) { - int minor; + int minor = iminor(inode); int ret = -ENOENT; preempt_disable(); - minor = MINOR(inode->i_rdev); if (minor < DEVCOUNT) { if (openCnt > 0) { ret = -EALREADY; @@ -46,9 +45,8 @@ static int gio_open(struct inode *inode, struct file *filp) static int gio_close(struct inode *inode, struct file *filp) { - int minor; + int minor = iminor(inode); - minor = MINOR(inode->i_rdev); if (minor < DEVCOUNT) { openCnt--; } diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 80a5d1c66a51..1aa508eb0823 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -114,7 +114,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp, unsigned long arg, childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childregs, 0, sizeof(struct pt_regs)); p->thread.pc = (unsigned long) ret_from_kernel_thread; childregs->regs[4] = arg; diff --git a/arch/sparc/kernel/process_32.c b/arch/sparc/kernel/process_32.c index 7649b14d69f8..b91e88058e0c 100644 --- a/arch/sparc/kernel/process_32.c +++ b/arch/sparc/kernel/process_32.c @@ -309,7 +309,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, ti->ksp = (unsigned long) new_stack; p->thread.kregs = childregs; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { extern int nwindows; unsigned long psr; memset(new_stack, 0, STACKFRAME_SZ + TRACEREG_SZ); diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c index 6f8c7822fc06..7afd0a859a78 100644 --- a/arch/sparc/kernel/process_64.c +++ b/arch/sparc/kernel/process_64.c @@ -597,7 +597,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, sizeof(struct sparc_stackf)); t->fpsaved[0] = 0; - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(child_trap_frame, 0, child_stack_sz); __thread_flag_byte_ptr(t)[TI_FLAG_BYTE_CWP] = (current_pt_regs()->tstate + 1) & TSTATE_CWP; diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c index 81d508daf67c..c5011064b5dd 100644 --- a/arch/um/kernel/process.c +++ b/arch/um/kernel/process.c @@ -157,7 +157,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct * p, unsigned long tls) { void (*handler)(void); - int kthread = current->flags & PF_KTHREAD; + int kthread = current->flags & (PF_KTHREAD | PF_IO_WORKER); int ret = 0; p->thread = (struct thread_struct) INIT_THREAD; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 145a7ac0c19a..9c214d7085a4 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -161,7 +161,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, unsigned long arg, #endif /* Kernel thread ? */ - if (unlikely(p->flags & PF_KTHREAD)) { + if (unlikely(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { memset(childregs, 0, sizeof(struct pt_regs)); kthread_frame_init(frame, sp, arg); return 0; diff --git a/arch/xtensa/kernel/process.c b/arch/xtensa/kernel/process.c index 397a7de56377..9534ef515d74 100644 --- a/arch/xtensa/kernel/process.c +++ b/arch/xtensa/kernel/process.c @@ -217,7 +217,7 @@ int copy_thread(unsigned long clone_flags, unsigned long usp_thread_fn, p->thread.sp = (unsigned long)childregs; - if (!(p->flags & PF_KTHREAD)) { + if (!(p->flags & (PF_KTHREAD | PF_IO_WORKER))) { struct pt_regs *regs = current_pt_regs(); unsigned long usp = usp_thread_fn ? usp_thread_fn : regs->areg[1]; diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 81d20ec93375..a370cde3ddd4 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -663,7 +663,7 @@ static inline int is_loop_device(struct file *file) { struct inode *i = file->f_mapping->host; - return i && S_ISBLK(i->i_mode) && MAJOR(i->i_rdev) == LOOP_MAJOR; + return i && S_ISBLK(i->i_mode) && imajor(i) == LOOP_MAJOR; } static int loop_validate_file(struct file *file, struct block_device *bdev) diff --git a/drivers/dax/super.c b/drivers/dax/super.c index cadbd0a1a1ef..5fa6ae9dbc8b 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -480,7 +480,7 @@ static void dax_free_inode(struct inode *inode) kfree(dax_dev->host); dax_dev->host = NULL; if (inode->i_rdev) - ida_simple_remove(&dax_minor_ida, MINOR(inode->i_rdev)); + ida_simple_remove(&dax_minor_ida, iminor(inode)); kmem_cache_free(dax_cache, dax_dev); } diff --git a/drivers/gpu/drm/tilcdc/Makefile b/drivers/gpu/drm/tilcdc/Makefile index 662bf3a348c9..f5190477de72 100644 --- a/drivers/gpu/drm/tilcdc/Makefile +++ b/drivers/gpu/drm/tilcdc/Makefile @@ -1,5 +1,5 @@ # SPDX-License-Identifier: GPL-2.0 -ifeq (, $(findstring -W,$(EXTRA_CFLAGS))) +ifeq (, $(findstring -W,$(KCFLAGS))) ccflags-y += -Werror endif diff --git a/drivers/i2c/busses/i2c-brcmstb.c b/drivers/i2c/busses/i2c-brcmstb.c index d4e0a0f6732a..ba766d24219e 100644 --- a/drivers/i2c/busses/i2c-brcmstb.c +++ b/drivers/i2c/busses/i2c-brcmstb.c @@ -316,7 +316,7 @@ static int brcmstb_send_i2c_cmd(struct brcmstb_i2c_dev *dev, goto cmd_out; } - if ((CMD_RD || CMD_WR) && + if ((cmd == CMD_RD || cmd == CMD_WR) && bsc_readl(dev, iic_enable) & BSC_IIC_EN_NOACK_MASK) { rc = -EREMOTEIO; dev_dbg(dev->device, "controller received NOACK intr for %s\n", diff --git a/drivers/i2c/busses/i2c-designware-core.h b/drivers/i2c/busses/i2c-designware-core.h index 85307cfa7109..5392b82f68a4 100644 --- a/drivers/i2c/busses/i2c-designware-core.h +++ b/drivers/i2c/busses/i2c-designware-core.h @@ -38,6 +38,8 @@ #define DW_IC_CON_TX_EMPTY_CTRL BIT(8) #define DW_IC_CON_RX_FIFO_FULL_HLD_CTRL BIT(9) +#define DW_IC_DATA_CMD_DAT GENMASK(7, 0) + /* * Registers offset */ diff --git a/drivers/i2c/busses/i2c-designware-master.c b/drivers/i2c/busses/i2c-designware-master.c index d6425ad6e6a3..dd27b9dbe931 100644 --- a/drivers/i2c/busses/i2c-designware-master.c +++ b/drivers/i2c/busses/i2c-designware-master.c @@ -432,7 +432,7 @@ i2c_dw_read(struct dw_i2c_dev *dev) regmap_read(dev->map, DW_IC_DATA_CMD, &tmp); /* Ensure length byte is a valid value */ if (flags & I2C_M_RECV_LEN && - tmp <= I2C_SMBUS_BLOCK_MAX && tmp > 0) { + (tmp & DW_IC_DATA_CMD_DAT) <= I2C_SMBUS_BLOCK_MAX && tmp > 0) { len = i2c_dw_recv_len(dev, tmp); } *buf++ = tmp; diff --git a/drivers/i2c/busses/i2c-exynos5.c b/drivers/i2c/busses/i2c-exynos5.c index 20a9881a0d6c..5ac30d95650c 100644 --- a/drivers/i2c/busses/i2c-exynos5.c +++ b/drivers/i2c/busses/i2c-exynos5.c @@ -606,6 +606,7 @@ static void exynos5_i2c_message_start(struct exynos5_i2c *i2c, int stop) u32 i2c_ctl; u32 int_en = 0; u32 i2c_auto_conf = 0; + u32 i2c_addr = 0; u32 fifo_ctl; unsigned long flags; unsigned short trig_lvl; @@ -640,7 +641,12 @@ static void exynos5_i2c_message_start(struct exynos5_i2c *i2c, int stop) int_en |= HSI2C_INT_TX_ALMOSTEMPTY_EN; } - writel(HSI2C_SLV_ADDR_MAS(i2c->msg->addr), i2c->regs + HSI2C_ADDR); + i2c_addr = HSI2C_SLV_ADDR_MAS(i2c->msg->addr); + + if (i2c->op_clock >= I2C_MAX_FAST_MODE_PLUS_FREQ) + i2c_addr |= HSI2C_MASTER_ID(MASTER_ID(i2c->adap.nr)); + + writel(i2c_addr, i2c->regs + HSI2C_ADDR); writel(fifo_ctl, i2c->regs + HSI2C_FIFO_CTL); writel(i2c_ctl, i2c->regs + HSI2C_CTL); diff --git a/drivers/i2c/busses/i2c-qcom-geni.c b/drivers/i2c/busses/i2c-qcom-geni.c index c3f584795911..214b4c913a13 100644 --- a/drivers/i2c/busses/i2c-qcom-geni.c +++ b/drivers/i2c/busses/i2c-qcom-geni.c @@ -375,32 +375,6 @@ static void geni_i2c_tx_msg_cleanup(struct geni_i2c_dev *gi2c, } } -static void geni_i2c_stop_xfer(struct geni_i2c_dev *gi2c) -{ - int ret; - u32 geni_status; - struct i2c_msg *cur; - - /* Resume device, as runtime suspend can happen anytime during transfer */ - ret = pm_runtime_get_sync(gi2c->se.dev); - if (ret < 0) { - dev_err(gi2c->se.dev, "Failed to resume device: %d\n", ret); - return; - } - - geni_status = readl_relaxed(gi2c->se.base + SE_GENI_STATUS); - if (geni_status & M_GENI_CMD_ACTIVE) { - cur = gi2c->cur; - geni_i2c_abort_xfer(gi2c); - if (cur->flags & I2C_M_RD) - geni_i2c_rx_msg_cleanup(gi2c, cur); - else - geni_i2c_tx_msg_cleanup(gi2c, cur); - } - - pm_runtime_put_sync_suspend(gi2c->se.dev); -} - static int geni_i2c_rx_one_msg(struct geni_i2c_dev *gi2c, struct i2c_msg *msg, u32 m_param) { @@ -676,13 +650,6 @@ static int geni_i2c_remove(struct platform_device *pdev) return 0; } -static void geni_i2c_shutdown(struct platform_device *pdev) -{ - struct geni_i2c_dev *gi2c = platform_get_drvdata(pdev); - - geni_i2c_stop_xfer(gi2c); -} - static int __maybe_unused geni_i2c_runtime_suspend(struct device *dev) { int ret; @@ -747,7 +714,6 @@ MODULE_DEVICE_TABLE(of, geni_i2c_dt_match); static struct platform_driver geni_i2c_driver = { .probe = geni_i2c_probe, .remove = geni_i2c_remove, - .shutdown = geni_i2c_shutdown, .driver = { .name = "geni_i2c", .pm = &geni_i2c_pm_ops, diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 646823ddd317..2d73407ee52e 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -20,7 +20,6 @@ #include #include #include -#include #include #include #include @@ -4582,12 +4581,8 @@ int sdhci_setup_host(struct sdhci_host *host) mmc->max_segs = SDHCI_MAX_SEGS; } else if (host->flags & SDHCI_USE_SDMA) { mmc->max_segs = 1; - if (swiotlb_max_segment()) { - unsigned int max_req_size = (1 << IO_TLB_SHIFT) * - IO_TLB_SEGSIZE; - mmc->max_req_size = min(mmc->max_req_size, - max_req_size); - } + mmc->max_req_size = min_t(size_t, mmc->max_req_size, + dma_max_mapping_size(mmc_dev(mmc))); } else { /* PIO */ mmc->max_segs = SDHCI_MAX_SEGS; } diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 7b6632c00ffd..38b0d694dfc9 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2632,6 +2632,7 @@ static void nvme_reset_work(struct work_struct *work) * Don't limit the IOMMU merged segment size. */ dma_set_max_seg_size(dev->dev, 0xffffffff); + dma_set_min_align_mask(dev->dev, NVME_CTRL_PAGE_SIZE - 1); mutex_unlock(&dev->shutdown_lock); diff --git a/drivers/rtc/rtc-m41t80.c b/drivers/rtc/rtc-m41t80.c index cce5b5284658..89128fc29ccc 100644 --- a/drivers/rtc/rtc-m41t80.c +++ b/drivers/rtc/rtc-m41t80.c @@ -785,7 +785,7 @@ static long wdt_unlocked_ioctl(struct file *file, unsigned int cmd, */ static int wdt_open(struct inode *inode, struct file *file) { - if (MINOR(inode->i_rdev) == WATCHDOG_MINOR) { + if (iminor(inode) == WATCHDOG_MINOR) { mutex_lock(&m41t80_rtc_mutex); if (test_and_set_bit(0, &wdt_is_open)) { mutex_unlock(&m41t80_rtc_mutex); @@ -809,7 +809,7 @@ static int wdt_open(struct inode *inode, struct file *file) */ static int wdt_release(struct inode *inode, struct file *file) { - if (MINOR(inode->i_rdev) == WATCHDOG_MINOR) + if (iminor(inode) == WATCHDOG_MINOR) clear_bit(0, &wdt_is_open); return 0; } diff --git a/drivers/s390/char/vmur.c b/drivers/s390/char/vmur.c index 1bbf27b98cf6..68f49e2e964c 100644 --- a/drivers/s390/char/vmur.c +++ b/drivers/s390/char/vmur.c @@ -681,7 +681,7 @@ static int ur_open(struct inode *inode, struct file *file) * We treat the minor number as the devno of the ur device * to find in the driver tree. */ - devno = MINOR(file_inode(file)->i_rdev); + devno = iminor(file_inode(file)); urd = urdev_get_from_devno(devno); if (!urd) { diff --git a/drivers/s390/virtio/virtio_ccw.c b/drivers/s390/virtio/virtio_ccw.c index 5730572b52cd..54e686dca6de 100644 --- a/drivers/s390/virtio/virtio_ccw.c +++ b/drivers/s390/virtio/virtio_ccw.c @@ -117,7 +117,7 @@ struct virtio_rev_info { }; /* the highest virtio-ccw revision we support */ -#define VIRTIO_CCW_REV_MAX 1 +#define VIRTIO_CCW_REV_MAX 2 struct virtio_ccw_vq_info { struct virtqueue *vq; @@ -952,7 +952,7 @@ static u8 virtio_ccw_get_status(struct virtio_device *vdev) u8 old_status = vcdev->dma_area->status; struct ccw1 *ccw; - if (vcdev->revision < 1) + if (vcdev->revision < 2) return vcdev->dma_area->status; ccw = ccw_device_dma_zalloc(vcdev->cdev, sizeof(*ccw)); diff --git a/drivers/soc/litex/Kconfig b/drivers/soc/litex/Kconfig index 7a7c38282e11..e7011d665b15 100644 --- a/drivers/soc/litex/Kconfig +++ b/drivers/soc/litex/Kconfig @@ -12,9 +12,21 @@ config LITEX_SOC_CONTROLLER select LITEX help This option enables the SoC Controller Driver which verifies - LiteX CSR access and provides common litex_get_reg/litex_set_reg + LiteX CSR access and provides common litex_[read|write]* accessors. All drivers that use functions from litex.h must depend on LITEX. +config LITEX_SUBREG_SIZE + int "Size of a LiteX CSR subregister, in bytes" + depends on LITEX + range 1 4 + default 4 + help + LiteX MMIO registers (referred to as Configuration and Status + registers, or CSRs) are spread across adjacent 8- or 32-bit + subregisters, located at 32-bit aligned MMIO addresses. Use + this to select the appropriate size (1 or 4 bytes) matching + your particular LiteX build. + endmenu diff --git a/drivers/soc/litex/litex_soc_ctrl.c b/drivers/soc/litex/litex_soc_ctrl.c index 9b0766384570..6268bfa7f0d6 100644 --- a/drivers/soc/litex/litex_soc_ctrl.c +++ b/drivers/soc/litex/litex_soc_ctrl.c @@ -15,79 +15,11 @@ #include #include #include +#include -/* - * LiteX SoC Generator, depending on the configuration, can split a single - * logical CSR (Control&Status Register) into a series of consecutive physical - * registers. - * - * For example, in the configuration with 8-bit CSR Bus, 32-bit aligned (the - * default one for 32-bit CPUs) a 32-bit logical CSR will be generated as four - * 32-bit physical registers, each one containing one byte of meaningful data. - * - * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus - * - * The purpose of `litex_set_reg`/`litex_get_reg` is to implement the logic - * of writing to/reading from the LiteX CSR in a single place that can be - * then reused by all LiteX drivers. - */ - -/** - * litex_set_reg() - Writes the value to the LiteX CSR (Control&Status Register) - * @reg: Address of the CSR - * @reg_size: The width of the CSR expressed in the number of bytes - * @val: Value to be written to the CSR - * - * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned), - * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers, - * each one containing one byte of meaningful data. - * - * This function splits a single possibly multi-byte write into a series of - * single-byte writes with a proper offset. - */ -void litex_set_reg(void __iomem *reg, unsigned long reg_size, - unsigned long val) -{ - unsigned long shifted_data, shift, i; - - for (i = 0; i < reg_size; ++i) { - shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT); - shifted_data = val >> shift; - - WRITE_LITEX_SUBREGISTER(shifted_data, reg, i); - } -} -EXPORT_SYMBOL_GPL(litex_set_reg); - -/** - * litex_get_reg() - Reads the value of the LiteX CSR (Control&Status Register) - * @reg: Address of the CSR - * @reg_size: The width of the CSR expressed in the number of bytes - * - * Return: Value read from the CSR - * - * In the currently supported LiteX configuration (8-bit CSR Bus, 32-bit aligned), - * a 32-bit LiteX CSR is generated as 4 consecutive 32-bit physical registers, - * each one containing one byte of meaningful data. - * - * This function generates a series of single-byte reads with a proper offset - * and joins their results into a single multi-byte value. - */ -unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_size) -{ - unsigned long shifted_data, shift, i; - unsigned long result = 0; - - for (i = 0; i < reg_size; ++i) { - shifted_data = READ_LITEX_SUBREGISTER(reg, i); - - shift = ((reg_size - i - 1) * LITEX_SUBREG_SIZE_BIT); - result |= (shifted_data << shift); - } - - return result; -} -EXPORT_SYMBOL_GPL(litex_get_reg); +/* reset register located at the base address */ +#define RESET_REG_OFF 0x00 +#define RESET_REG_VALUE 0x00000001 #define SCRATCH_REG_OFF 0x04 #define SCRATCH_REG_VALUE 0x12345678 @@ -131,15 +63,27 @@ static int litex_check_csr_access(void __iomem *reg_addr) /* restore original value of the SCRATCH register */ litex_write32(reg_addr + SCRATCH_REG_OFF, SCRATCH_REG_VALUE); - pr_info("LiteX SoC Controller driver initialized"); + pr_info("LiteX SoC Controller driver initialized: subreg:%d, align:%d", + LITEX_SUBREG_SIZE, LITEX_SUBREG_ALIGN); return 0; } struct litex_soc_ctrl_device { void __iomem *base; + struct notifier_block reset_nb; }; +static int litex_reset_handler(struct notifier_block *this, unsigned long mode, + void *cmd) +{ + struct litex_soc_ctrl_device *soc_ctrl_dev = + container_of(this, struct litex_soc_ctrl_device, reset_nb); + + litex_write32(soc_ctrl_dev->base + RESET_REG_OFF, RESET_REG_VALUE); + return NOTIFY_DONE; +} + #ifdef CONFIG_OF static const struct of_device_id litex_soc_ctrl_of_match[] = { {.compatible = "litex,soc-controller"}, @@ -151,6 +95,7 @@ MODULE_DEVICE_TABLE(of, litex_soc_ctrl_of_match); static int litex_soc_ctrl_probe(struct platform_device *pdev) { struct litex_soc_ctrl_device *soc_ctrl_dev; + int error; soc_ctrl_dev = devm_kzalloc(&pdev->dev, sizeof(*soc_ctrl_dev), GFP_KERNEL); if (!soc_ctrl_dev) @@ -160,7 +105,29 @@ static int litex_soc_ctrl_probe(struct platform_device *pdev) if (IS_ERR(soc_ctrl_dev->base)) return PTR_ERR(soc_ctrl_dev->base); - return litex_check_csr_access(soc_ctrl_dev->base); + error = litex_check_csr_access(soc_ctrl_dev->base); + if (error) + return error; + + platform_set_drvdata(pdev, soc_ctrl_dev); + + soc_ctrl_dev->reset_nb.notifier_call = litex_reset_handler; + soc_ctrl_dev->reset_nb.priority = 128; + error = register_restart_handler(&soc_ctrl_dev->reset_nb); + if (error) { + dev_warn(&pdev->dev, "cannot register restart handler: %d\n", + error); + } + + return 0; +} + +static int litex_soc_ctrl_remove(struct platform_device *pdev) +{ + struct litex_soc_ctrl_device *soc_ctrl_dev = platform_get_drvdata(pdev); + + unregister_restart_handler(&soc_ctrl_dev->reset_nb); + return 0; } static struct platform_driver litex_soc_ctrl_driver = { @@ -169,6 +136,7 @@ static struct platform_driver litex_soc_ctrl_driver = { .of_match_table = of_match_ptr(litex_soc_ctrl_of_match) }, .probe = litex_soc_ctrl_probe, + .remove = litex_soc_ctrl_remove, }; module_platform_driver(litex_soc_ctrl_driver); diff --git a/drivers/staging/vme/devices/vme_user.c b/drivers/staging/vme/devices/vme_user.c index 35d7260e2271..e3fa38bd7f12 100644 --- a/drivers/staging/vme/devices/vme_user.c +++ b/drivers/staging/vme/devices/vme_user.c @@ -175,7 +175,7 @@ static ssize_t buffer_from_user(unsigned int minor, const char __user *buf, static ssize_t vme_user_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { - unsigned int minor = MINOR(file_inode(file)->i_rdev); + unsigned int minor = iminor(file_inode(file)); ssize_t retval; size_t image_size; @@ -218,7 +218,7 @@ static ssize_t vme_user_read(struct file *file, char __user *buf, size_t count, static ssize_t vme_user_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { - unsigned int minor = MINOR(file_inode(file)->i_rdev); + unsigned int minor = iminor(file_inode(file)); ssize_t retval; size_t image_size; @@ -260,7 +260,7 @@ static ssize_t vme_user_write(struct file *file, const char __user *buf, static loff_t vme_user_llseek(struct file *file, loff_t off, int whence) { - unsigned int minor = MINOR(file_inode(file)->i_rdev); + unsigned int minor = iminor(file_inode(file)); size_t image_size; loff_t res; @@ -294,7 +294,7 @@ static int vme_user_ioctl(struct inode *inode, struct file *file, struct vme_slave slave; struct vme_irq_id irq_req; unsigned long copied; - unsigned int minor = MINOR(inode->i_rdev); + unsigned int minor = iminor(inode); int retval; dma_addr_t pci_addr; void __user *argp = (void __user *)arg; @@ -412,7 +412,7 @@ vme_user_unlocked_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int ret; struct inode *inode = file_inode(file); - unsigned int minor = MINOR(inode->i_rdev); + unsigned int minor = iminor(inode); mutex_lock(&image[minor].mutex); ret = vme_user_ioctl(inode, file, cmd, arg); @@ -481,7 +481,7 @@ static int vme_user_master_mmap(unsigned int minor, struct vm_area_struct *vma) static int vme_user_mmap(struct file *file, struct vm_area_struct *vma) { - unsigned int minor = MINOR(file_inode(file)->i_rdev); + unsigned int minor = iminor(file_inode(file)); if (type[minor] == MASTER_MINOR) return vme_user_master_mmap(minor, vma); diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 648eb4c4cf7f..8d97f0b45e9c 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -1139,9 +1139,6 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, struct super_block *sb, unsigned int flags) { umode_t mode; - char ext[32]; - char tag_name[14]; - unsigned int i_nlink; struct v9fs_session_info *v9ses = sb->s_fs_info; struct v9fs_inode *v9inode = V9FS_I(inode); @@ -1159,18 +1156,18 @@ v9fs_stat2inode(struct p9_wstat *stat, struct inode *inode, inode->i_gid = stat->n_gid; } if ((S_ISREG(inode->i_mode)) || (S_ISDIR(inode->i_mode))) { - if (v9fs_proto_dotu(v9ses) && (stat->extension[0] != '\0')) { + if (v9fs_proto_dotu(v9ses)) { + unsigned int i_nlink; /* - * Hadlink support got added later to - * to the .u extension. So there can be - * server out there that doesn't support - * this even with .u extension. So check - * for non NULL stat->extension + * Hadlink support got added later to the .u extension. + * So there can be a server out there that doesn't + * support this even with .u extension. That would + * just leave us with stat->extension being an empty + * string, though. */ - strlcpy(ext, stat->extension, sizeof(ext)); /* HARDLINKCOUNT %u */ - sscanf(ext, "%13s %u", tag_name, &i_nlink); - if (!strncmp(tag_name, "HARDLINKCOUNT", 13)) + if (sscanf(stat->extension, + " HARDLINKCOUNT %u", &i_nlink) == 1) set_nlink(inode, i_nlink); } } diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index aa697ccfa9dc..3aedc484e440 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -133,11 +133,12 @@ cifs_dump_channel(struct seq_file *m, int i, struct cifs_chan *chan) { struct TCP_Server_Info *server = chan->server; - seq_printf(m, "\t\tChannel %d Number of credits: %d Dialect 0x%x " - "TCP status: %d Instance: %d Local Users To Server: %d " - "SecMode: 0x%x Req On Wire: %d In Send: %d " - "In MaxReq Wait: %d\n", - i+1, + seq_printf(m, "\n\n\t\tChannel: %d ConnectionId: 0x%llx" + "\n\t\tNumber of credits: %d Dialect 0x%x" + "\n\t\tTCP status: %d Instance: %d" + "\n\t\tLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d" + "\n\t\tIn Send: %d In MaxReq Wait: %d", + i+1, server->conn_id, server->credits, server->dialect, server->tcpStatus, @@ -227,7 +228,7 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) struct TCP_Server_Info *server; struct cifs_ses *ses; struct cifs_tcon *tcon; - int i, j; + int c, i, j; seq_puts(m, "Display Internal CIFS Data Structures for Debugging\n" @@ -275,14 +276,25 @@ static int cifs_debug_data_proc_show(struct seq_file *m, void *v) seq_putc(m, '\n'); seq_printf(m, "CIFSMaxBufSize: %d\n", CIFSMaxBufSize); seq_printf(m, "Active VFS Requests: %d\n", GlobalTotalActiveXid); - seq_printf(m, "Servers:"); - i = 0; + seq_printf(m, "\nServers: "); + + c = 0; spin_lock(&cifs_tcp_ses_lock); list_for_each(tmp1, &cifs_tcp_ses_list) { server = list_entry(tmp1, struct TCP_Server_Info, tcp_ses_list); + /* channel info will be printed as a part of sessions below */ + if (server->is_channel) + continue; + + c++; + seq_printf(m, "\n%d) ConnectionId: 0x%llx ", + c, server->conn_id); + + if (server->hostname) + seq_printf(m, "Hostname: %s ", server->hostname); #ifdef CONFIG_CIFS_SMB_DIRECT if (!server->rdma) goto skip_rdma; @@ -362,46 +374,48 @@ skip_rdma: if (server->posix_ext_supported) seq_printf(m, " posix"); - i++; + if (server->rdma) + seq_printf(m, "\nRDMA "); + seq_printf(m, "\nTCP status: %d Instance: %d" + "\nLocal Users To Server: %d SecMode: 0x%x Req On Wire: %d", + server->tcpStatus, + server->reconnect_instance, + server->srv_count, + server->sec_mode, in_flight(server)); + + seq_printf(m, "\nIn Send: %d In MaxReq Wait: %d", + atomic_read(&server->in_send), + atomic_read(&server->num_waiters)); + + seq_printf(m, "\n\n\tSessions: "); + i = 0; list_for_each(tmp2, &server->smb_ses_list) { ses = list_entry(tmp2, struct cifs_ses, smb_ses_list); + i++; if ((ses->serverDomain == NULL) || (ses->serverOS == NULL) || (ses->serverNOS == NULL)) { - seq_printf(m, "\n%d) Name: %s Uses: %d Capability: 0x%x\tSession Status: %d ", - i, ses->serverName, ses->ses_count, + seq_printf(m, "\n\t%d) Address: %s Uses: %d Capability: 0x%x\tSession Status: %d ", + i, ses->ip_addr, ses->ses_count, ses->capabilities, ses->status); if (ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) - seq_printf(m, "Guest\t"); + seq_printf(m, "Guest "); else if (ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) - seq_printf(m, "Anonymous\t"); + seq_printf(m, "Anonymous "); } else { seq_printf(m, - "\n%d) Name: %s Domain: %s Uses: %d OS:" - " %s\n\tNOS: %s\tCapability: 0x%x\n\tSMB" - " session status: %d ", - i, ses->serverName, ses->serverDomain, + "\n\t%d) Name: %s Domain: %s Uses: %d OS: %s " + "\n\tNOS: %s\tCapability: 0x%x" + "\n\tSMB session status: %d ", + i, ses->ip_addr, ses->serverDomain, ses->ses_count, ses->serverOS, ses->serverNOS, ses->capabilities, ses->status); } - seq_printf(m,"Security type: %s\n", + seq_printf(m, "\n\tSecurity type: %s ", get_security_type_str(server->ops->select_sectype(server, ses->sectype))); - if (server->rdma) - seq_printf(m, "RDMA\n\t"); - seq_printf(m, "TCP status: %d Instance: %d\n\tLocal Users To " - "Server: %d SecMode: 0x%x Req On Wire: %d", - server->tcpStatus, - server->reconnect_instance, - server->srv_count, - server->sec_mode, in_flight(server)); - - seq_printf(m, " In Send: %d In MaxReq Wait: %d", - atomic_read(&server->in_send), - atomic_read(&server->num_waiters)); - /* dump session id helpful for use with network trace */ seq_printf(m, " SessionId: 0x%llx", ses->Suid); if (ses->session_flags & SMB2_SESSION_FLAG_ENCRYPT_DATA) @@ -414,13 +428,13 @@ skip_rdma: from_kuid(&init_user_ns, ses->cred_uid)); if (ses->chan_count > 1) { - seq_printf(m, "\n\n\tExtra Channels: %zu\n", + seq_printf(m, "\n\n\tExtra Channels: %zu ", ses->chan_count-1); for (j = 1; j < ses->chan_count; j++) cifs_dump_channel(m, j, &ses->chans[j]); } - seq_puts(m, "\n\n\tShares:"); + seq_puts(m, "\n\n\tShares: "); j = 0; seq_printf(m, "\n\t%d) IPC: ", j); @@ -437,38 +451,43 @@ skip_rdma: cifs_debug_tcon(m, tcon); } - seq_puts(m, "\n\tMIDs:\n"); - - spin_lock(&GlobalMid_Lock); - list_for_each(tmp3, &server->pending_mid_q) { - mid_entry = list_entry(tmp3, struct mid_q_entry, - qhead); - seq_printf(m, "\tState: %d com: %d pid:" - " %d cbdata: %p mid %llu\n", - mid_entry->mid_state, - le16_to_cpu(mid_entry->command), - mid_entry->pid, - mid_entry->callback_data, - mid_entry->mid); - } - spin_unlock(&GlobalMid_Lock); - spin_lock(&ses->iface_lock); if (ses->iface_count) - seq_printf(m, "\n\tServer interfaces: %zu\n", + seq_printf(m, "\n\n\tServer interfaces: %zu", ses->iface_count); for (j = 0; j < ses->iface_count; j++) { struct cifs_server_iface *iface; iface = &ses->iface_list[j]; - seq_printf(m, "\t%d)", j); + seq_printf(m, "\n\t%d)", j+1); cifs_dump_iface(m, iface); if (is_ses_using_iface(ses, iface)) seq_puts(m, "\t\t[CONNECTED]\n"); } spin_unlock(&ses->iface_lock); } + if (i == 0) + seq_printf(m, "\n\t\t[NONE]"); + + seq_puts(m, "\n\n\tMIDs: "); + spin_lock(&GlobalMid_Lock); + list_for_each(tmp3, &server->pending_mid_q) { + mid_entry = list_entry(tmp3, struct mid_q_entry, + qhead); + seq_printf(m, "\n\tState: %d com: %d pid:" + " %d cbdata: %p mid %llu\n", + mid_entry->mid_state, + le16_to_cpu(mid_entry->command), + mid_entry->pid, + mid_entry->callback_data, + mid_entry->mid); + } + spin_unlock(&GlobalMid_Lock); + seq_printf(m, "\n--\n"); } + if (c == 0) + seq_printf(m, "\n\t[NONE]"); + spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); diff --git a/fs/cifs/cifs_swn.c b/fs/cifs/cifs_swn.c index d35f599aa00e..f2d730fffccb 100644 --- a/fs/cifs/cifs_swn.c +++ b/fs/cifs/cifs_swn.c @@ -272,7 +272,7 @@ static struct cifs_swn_reg *cifs_find_swn_reg(struct cifs_tcon *tcon) if (IS_ERR(share_name)) { int ret; - ret = PTR_ERR(net_name); + ret = PTR_ERR(share_name); cifs_dbg(VFS, "%s: failed to extract share name from target '%s': %d\n", __func__, tcon->treeName, ret); kfree(net_name); diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 562913e2b3f2..9d29eb9660c2 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -267,10 +267,11 @@ is_well_known_sid(const struct cifs_sid *psid, uint32_t *puid, bool is_group) return true; /* well known sid found, uid returned */ } -static void +static __u16 cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) { int i; + __u16 size = 1 + 1 + 6; dst->revision = src->revision; dst->num_subauth = min_t(u8, src->num_subauth, SID_MAX_SUB_AUTHORITIES); @@ -278,6 +279,9 @@ cifs_copy_sid(struct cifs_sid *dst, const struct cifs_sid *src) dst->authority[i] = src->authority[i]; for (i = 0; i < dst->num_subauth; ++i) dst->sub_auth[i] = src->sub_auth[i]; + size += (dst->num_subauth * 4); + + return size; } static int @@ -521,8 +525,11 @@ exit_cifs_idmap(void) } /* copy ntsd, owner sid, and group sid from a security descriptor to another */ -static void copy_sec_desc(const struct cifs_ntsd *pntsd, - struct cifs_ntsd *pnntsd, __u32 sidsoffset) +static __u32 copy_sec_desc(const struct cifs_ntsd *pntsd, + struct cifs_ntsd *pnntsd, + __u32 sidsoffset, + struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid) { struct cifs_sid *owner_sid_ptr, *group_sid_ptr; struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; @@ -536,19 +543,25 @@ static void copy_sec_desc(const struct cifs_ntsd *pntsd, pnntsd->gsidoffset = cpu_to_le32(sidsoffset + sizeof(struct cifs_sid)); /* copy owner sid */ - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (pownersid) + owner_sid_ptr = pownersid; + else + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->osidoffset)); nowner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset); cifs_copy_sid(nowner_sid_ptr, owner_sid_ptr); /* copy group sid */ - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + if (pgrpsid) + group_sid_ptr = pgrpsid; + else + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + le32_to_cpu(pntsd->gsidoffset)); ngroup_sid_ptr = (struct cifs_sid *)((char *)pnntsd + sidsoffset + sizeof(struct cifs_sid)); cifs_copy_sid(ngroup_sid_ptr, group_sid_ptr); - return; + return sidsoffset + (2 * sizeof(struct cifs_sid)); } @@ -663,6 +676,25 @@ static void mode_to_access_flags(umode_t mode, umode_t bits_to_use, return; } +static __u16 cifs_copy_ace(struct cifs_ace *dst, struct cifs_ace *src, struct cifs_sid *psid) +{ + __u16 size = 1 + 1 + 2 + 4; + + dst->type = src->type; + dst->flags = src->flags; + dst->access_req = src->access_req; + + /* Check if there's a replacement sid specified */ + if (psid) + size += cifs_copy_sid(&dst->sid, psid); + else + size += cifs_copy_sid(&dst->sid, &src->sid); + + dst->size = cpu_to_le16(size); + + return size; +} + static __u16 fill_ace_for_sid(struct cifs_ace *pntace, const struct cifs_sid *psid, __u64 nmode, umode_t bits, __u8 access_type, @@ -907,29 +939,30 @@ unsigned int setup_special_user_owner_ACE(struct cifs_ace *pntace) return ace_size; } -static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, - struct cifs_sid *pgrpsid, __u64 *pnmode, bool modefromsid) +static void populate_new_aces(char *nacl_base, + struct cifs_sid *pownersid, + struct cifs_sid *pgrpsid, + __u64 *pnmode, u32 *pnum_aces, u16 *pnsize, + bool modefromsid) { - u16 size = 0; - u32 num_aces = 0; - struct cifs_acl *pnndacl; __u64 nmode; + u32 num_aces = 0; + u16 nsize = 0; __u64 user_mode; __u64 group_mode; __u64 other_mode; __u64 deny_user_mode = 0; __u64 deny_group_mode = 0; bool sticky_set = false; - - pnndacl = (struct cifs_acl *)((char *)pndacl + sizeof(struct cifs_acl)); + struct cifs_ace *pnntace = NULL; nmode = *pnmode; + num_aces = *pnum_aces; + nsize = *pnsize; if (modefromsid) { - struct cifs_ace *pntace = - (struct cifs_ace *)((char *)pnndacl + size); - - size += setup_special_mode_ACE(pntace, nmode); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += setup_special_mode_ACE(pnntace, nmode); num_aces++; goto set_size; } @@ -966,40 +999,170 @@ static int set_chmod_dacl(struct cifs_acl *pndacl, struct cifs_sid *pownersid, sticky_set = true; if (deny_user_mode) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pownersid, deny_user_mode, 0700, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pownersid, deny_user_mode, + 0700, ACCESS_DENIED, false); num_aces++; } + /* Group DENY ACE does not conflict with owner ALLOW ACE. Keep in preferred order*/ if (deny_group_mode && !(deny_group_mode & (user_mode >> 3))) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, + 0070, ACCESS_DENIED, false); num_aces++; } - size += fill_ace_for_sid((struct cifs_ace *) ((char *)pnndacl + size), - pownersid, user_mode, 0700, ACCESS_ALLOWED, true); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pownersid, user_mode, + 0700, ACCESS_ALLOWED, true); num_aces++; + /* Group DENY ACE conflicts with owner ALLOW ACE. So keep it after. */ if (deny_group_mode && (deny_group_mode & (user_mode >> 3))) { - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, deny_group_mode, 0070, ACCESS_DENIED, false); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, deny_group_mode, + 0070, ACCESS_DENIED, false); num_aces++; } - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - pgrpsid, group_mode, 0070, ACCESS_ALLOWED, !sticky_set); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, pgrpsid, group_mode, + 0070, ACCESS_ALLOWED, !sticky_set); num_aces++; - size += fill_ace_for_sid((struct cifs_ace *)((char *)pnndacl + size), - &sid_everyone, other_mode, 0007, ACCESS_ALLOWED, !sticky_set); + + pnntace = (struct cifs_ace *) (nacl_base + nsize); + nsize += fill_ace_for_sid(pnntace, &sid_everyone, other_mode, + 0007, ACCESS_ALLOWED, !sticky_set); num_aces++; set_size: + *pnum_aces = num_aces; + *pnsize = nsize; +} + +static __u16 replace_sids_and_copy_aces(struct cifs_acl *pdacl, struct cifs_acl *pndacl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + struct cifs_sid *pnownersid, struct cifs_sid *pngrpsid) +{ + int i; + u16 size = 0; + struct cifs_ace *pntace = NULL; + char *acl_base = NULL; + u32 src_num_aces = 0; + u16 nsize = 0; + struct cifs_ace *pnntace = NULL; + char *nacl_base = NULL; + u16 ace_size = 0; + + acl_base = (char *)pdacl; + size = sizeof(struct cifs_acl); + src_num_aces = le32_to_cpu(pdacl->num_aces); + + nacl_base = (char *)pndacl; + nsize = sizeof(struct cifs_acl); + + /* Go through all the ACEs */ + for (i = 0; i < src_num_aces; ++i) { + pntace = (struct cifs_ace *) (acl_base + size); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + + if (pnownersid && compare_sids(&pntace->sid, pownersid) == 0) + ace_size = cifs_copy_ace(pnntace, pntace, pnownersid); + else if (pngrpsid && compare_sids(&pntace->sid, pgrpsid) == 0) + ace_size = cifs_copy_ace(pnntace, pntace, pngrpsid); + else + ace_size = cifs_copy_ace(pnntace, pntace, NULL); + + size += le16_to_cpu(pntace->size); + nsize += ace_size; + } + + return nsize; +} + +static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, + struct cifs_sid *pownersid, struct cifs_sid *pgrpsid, + __u64 *pnmode, bool mode_from_sid) +{ + int i; + u16 size = 0; + struct cifs_ace *pntace = NULL; + char *acl_base = NULL; + u32 src_num_aces = 0; + u16 nsize = 0; + struct cifs_ace *pnntace = NULL; + char *nacl_base = NULL; + u32 num_aces = 0; + __u64 nmode; + bool new_aces_set = false; + + /* Assuming that pndacl and pnmode are never NULL */ + nmode = *pnmode; + nacl_base = (char *)pndacl; + nsize = sizeof(struct cifs_acl); + + /* If pdacl is NULL, we don't have a src. Simply populate new ACL. */ + if (!pdacl) { + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + goto finalize_dacl; + } + + acl_base = (char *)pdacl; + size = sizeof(struct cifs_acl); + src_num_aces = le32_to_cpu(pdacl->num_aces); + + /* Retain old ACEs which we can retain */ + for (i = 0; i < src_num_aces; ++i) { + pntace = (struct cifs_ace *) (acl_base + size); + pnntace = (struct cifs_ace *) (nacl_base + nsize); + + if (!new_aces_set && (pntace->flags & INHERITED_ACE)) { + /* Place the new ACEs in between existing explicit and inherited */ + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + + new_aces_set = true; + } + + /* If it's any one of the ACE we're replacing, skip! */ + if ((compare_sids(&pntace->sid, &sid_unix_NFS_mode) == 0) || + (compare_sids(&pntace->sid, pownersid) == 0) || + (compare_sids(&pntace->sid, pgrpsid) == 0) || + (compare_sids(&pntace->sid, &sid_everyone) == 0) || + (compare_sids(&pntace->sid, &sid_authusers) == 0)) { + goto next_ace; + } + + nsize += cifs_copy_ace(pnntace, pntace, NULL); + num_aces++; + +next_ace: + size += le16_to_cpu(pntace->size); + } + + /* If inherited ACEs are not present, place the new ones at the tail */ + if (!new_aces_set) { + populate_new_aces(nacl_base, + pownersid, pgrpsid, + pnmode, &num_aces, &nsize, + mode_from_sid); + + new_aces_set = true; + } + +finalize_dacl: pndacl->num_aces = cpu_to_le32(num_aces); - pndacl->size = cpu_to_le16(size + sizeof(struct cifs_acl)); + pndacl->size = cpu_to_le16(nsize); return 0; } - static int parse_sid(struct cifs_sid *psid, char *end_of_acl) { /* BB need to add parm so we can store the SID BB */ @@ -1094,7 +1257,7 @@ static int parse_sec_desc(struct cifs_sb_info *cifs_sb, /* Convert permission bits from mode to equivalent CIFS ACL */ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, - __u32 secdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, + __u32 secdesclen, __u32 *pnsecdesclen, __u64 *pnmode, kuid_t uid, kgid_t gid, bool mode_from_sid, bool id_from_sid, int *aclflag) { int rc = 0; @@ -1102,39 +1265,59 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, __u32 ndacloffset; __u32 sidsoffset; struct cifs_sid *owner_sid_ptr, *group_sid_ptr; - struct cifs_sid *nowner_sid_ptr, *ngroup_sid_ptr; + struct cifs_sid *nowner_sid_ptr = NULL, *ngroup_sid_ptr = NULL; struct cifs_acl *dacl_ptr = NULL; /* no need for SACL ptr */ struct cifs_acl *ndacl_ptr = NULL; /* no need for SACL ptr */ + char *end_of_acl = ((char *)pntsd) + secdesclen; + u16 size = 0; + + dacloffset = le32_to_cpu(pntsd->dacloffset); + if (dacloffset) { + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + if (end_of_acl < (char *)dacl_ptr + le16_to_cpu(dacl_ptr->size)) { + cifs_dbg(VFS, "Server returned illegal ACL size\n"); + return -EINVAL; + } + } + + owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->osidoffset)); + group_sid_ptr = (struct cifs_sid *)((char *)pntsd + + le32_to_cpu(pntsd->gsidoffset)); if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ - owner_sid_ptr = (struct cifs_sid *)((char *)pntsd + - le32_to_cpu(pntsd->osidoffset)); - group_sid_ptr = (struct cifs_sid *)((char *)pntsd + - le32_to_cpu(pntsd->gsidoffset)); - dacloffset = le32_to_cpu(pntsd->dacloffset); - dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); ndacloffset = sizeof(struct cifs_ntsd); ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); - ndacl_ptr->revision = dacl_ptr->revision; - ndacl_ptr->size = 0; - ndacl_ptr->num_aces = 0; + ndacl_ptr->revision = + dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); - rc = set_chmod_dacl(ndacl_ptr, owner_sid_ptr, group_sid_ptr, + ndacl_ptr->size = cpu_to_le16(0); + ndacl_ptr->num_aces = cpu_to_le32(0); + + rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr, pnmode, mode_from_sid); + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); - /* copy sec desc control portion & owner and group sids */ - copy_sec_desc(pntsd, pnntsd, sidsoffset); - *aclflag = CIFS_ACL_DACL; + /* copy the non-dacl portion of secdesc */ + *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset, + NULL, NULL); + + *aclflag |= CIFS_ACL_DACL; } else { - memcpy(pnntsd, pntsd, secdesclen); + ndacloffset = sizeof(struct cifs_ntsd); + ndacl_ptr = (struct cifs_acl *)((char *)pnntsd + ndacloffset); + ndacl_ptr->revision = + dacloffset ? dacl_ptr->revision : cpu_to_le16(ACL_REVISION); + ndacl_ptr->num_aces = dacl_ptr->num_aces; + if (uid_valid(uid)) { /* chown */ uid_t id; - owner_sid_ptr = (struct cifs_sid *)((char *)pnntsd + - le32_to_cpu(pnntsd->osidoffset)); nowner_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); - if (!nowner_sid_ptr) - return -ENOMEM; + if (!nowner_sid_ptr) { + rc = -ENOMEM; + goto chown_chgrp_exit; + } id = from_kuid(&init_user_ns, uid); if (id_from_sid) { struct owner_sid *osid = (struct owner_sid *)nowner_sid_ptr; @@ -1145,27 +1328,25 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, osid->SubAuthorities[0] = cpu_to_le32(88); osid->SubAuthorities[1] = cpu_to_le32(1); osid->SubAuthorities[2] = cpu_to_le32(id); + } else { /* lookup sid with upcall */ rc = id_to_sid(id, SIDOWNER, nowner_sid_ptr); if (rc) { cifs_dbg(FYI, "%s: Mapping error %d for owner id %d\n", __func__, rc, id); - kfree(nowner_sid_ptr); - return rc; + goto chown_chgrp_exit; } } - cifs_copy_sid(owner_sid_ptr, nowner_sid_ptr); - kfree(nowner_sid_ptr); - *aclflag = CIFS_ACL_OWNER; + *aclflag |= CIFS_ACL_OWNER; } if (gid_valid(gid)) { /* chgrp */ gid_t id; - group_sid_ptr = (struct cifs_sid *)((char *)pnntsd + - le32_to_cpu(pnntsd->gsidoffset)); ngroup_sid_ptr = kmalloc(sizeof(struct cifs_sid), GFP_KERNEL); - if (!ngroup_sid_ptr) - return -ENOMEM; + if (!ngroup_sid_ptr) { + rc = -ENOMEM; + goto chown_chgrp_exit; + } id = from_kgid(&init_user_ns, gid); if (id_from_sid) { struct owner_sid *gsid = (struct owner_sid *)ngroup_sid_ptr; @@ -1176,19 +1357,35 @@ static int build_sec_desc(struct cifs_ntsd *pntsd, struct cifs_ntsd *pnntsd, gsid->SubAuthorities[0] = cpu_to_le32(88); gsid->SubAuthorities[1] = cpu_to_le32(2); gsid->SubAuthorities[2] = cpu_to_le32(id); + } else { /* lookup sid with upcall */ rc = id_to_sid(id, SIDGROUP, ngroup_sid_ptr); if (rc) { cifs_dbg(FYI, "%s: Mapping error %d for group id %d\n", __func__, rc, id); - kfree(ngroup_sid_ptr); - return rc; + goto chown_chgrp_exit; } } - cifs_copy_sid(group_sid_ptr, ngroup_sid_ptr); - kfree(ngroup_sid_ptr); - *aclflag = CIFS_ACL_GROUP; + *aclflag |= CIFS_ACL_GROUP; } + + if (dacloffset) { + /* Replace ACEs for old owner with new one */ + size = replace_sids_and_copy_aces(dacl_ptr, ndacl_ptr, + owner_sid_ptr, group_sid_ptr, + nowner_sid_ptr, ngroup_sid_ptr); + ndacl_ptr->size = cpu_to_le16(size); + } + + sidsoffset = ndacloffset + le16_to_cpu(ndacl_ptr->size); + /* copy the non-dacl portion of secdesc */ + *pnsecdesclen = copy_sec_desc(pntsd, pnntsd, sidsoffset, + nowner_sid_ptr, ngroup_sid_ptr); + +chown_chgrp_exit: + /* errors could jump here. So make sure we return soon after this */ + kfree(nowner_sid_ptr); + kfree(ngroup_sid_ptr); } return rc; @@ -1384,6 +1581,9 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, int rc = 0; int aclflag = CIFS_ACL_DACL; /* default flag to set */ __u32 secdesclen = 0; + __u32 nsecdesclen = 0; + __u32 dacloffset = 0; + struct cifs_acl *dacl_ptr = NULL; struct cifs_ntsd *pntsd = NULL; /* acl obtained from server */ struct cifs_ntsd *pnntsd = NULL; /* modified acl to be sent to server */ struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1414,20 +1614,6 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, return rc; } - /* - * Add three ACEs for owner, group, everyone getting rid of other ACEs - * as chmod disables ACEs and set the security descriptor. Allocate - * memory for the smb header, set security descriptor request security - * descriptor parameters, and secuirty descriptor itself - */ - secdesclen = max_t(u32, secdesclen, DEFAULT_SEC_DESC_LEN); - pnntsd = kmalloc(secdesclen, GFP_KERNEL); - if (!pnntsd) { - kfree(pntsd); - cifs_put_tlink(tlink); - return -ENOMEM; - } - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MODE_FROM_SID) mode_from_sid = true; else @@ -1438,7 +1624,42 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, else id_from_sid = false; - rc = build_sec_desc(pntsd, pnntsd, secdesclen, pnmode, uid, gid, + /* Potentially, five new ACEs can be added to the ACL for U,G,O mapping */ + nsecdesclen = secdesclen; + if (pnmode && *pnmode != NO_CHANGE_64) { /* chmod */ + if (mode_from_sid) + nsecdesclen += sizeof(struct cifs_ace); + else /* cifsacl */ + nsecdesclen += 5 * sizeof(struct cifs_ace); + } else { /* chown */ + /* When ownership changes, changes new owner sid length could be different */ + nsecdesclen = sizeof(struct cifs_ntsd) + (sizeof(struct cifs_sid) * 2); + dacloffset = le32_to_cpu(pntsd->dacloffset); + if (dacloffset) { + dacl_ptr = (struct cifs_acl *)((char *)pntsd + dacloffset); + if (mode_from_sid) + nsecdesclen += + le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct cifs_ace); + else /* cifsacl */ + nsecdesclen += le16_to_cpu(dacl_ptr->size); + } + } + + /* + * Add three ACEs for owner, group, everyone getting rid of other ACEs + * as chmod disables ACEs and set the security descriptor. Allocate + * memory for the smb header, set security descriptor request security + * descriptor parameters, and secuirty descriptor itself + */ + nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); + pnntsd = kmalloc(nsecdesclen, GFP_KERNEL); + if (!pnntsd) { + kfree(pntsd); + cifs_put_tlink(tlink); + return -ENOMEM; + } + + rc = build_sec_desc(pntsd, pnntsd, secdesclen, &nsecdesclen, pnmode, uid, gid, mode_from_sid, id_from_sid, &aclflag); cifs_dbg(NOISY, "build_sec_desc rc: %d\n", rc); @@ -1448,7 +1669,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, if (!rc) { /* Set the security descriptor */ - rc = ops->set_acl(pnntsd, secdesclen, inode, path, aclflag); + rc = ops->set_acl(pnntsd, nsecdesclen, inode, path, aclflag); cifs_dbg(NOISY, "set_cifs_acl rc: %d\n", rc); } cifs_put_tlink(tlink); diff --git a/fs/cifs/cifsacl.h b/fs/cifs/cifsacl.h index ff7fd0862e28..d9e704979d99 100644 --- a/fs/cifs/cifsacl.h +++ b/fs/cifs/cifsacl.h @@ -31,8 +31,8 @@ #define EXEC_BIT 0x1 #define ACL_OWNER_MASK 0700 -#define ACL_GROUP_MASK 0770 -#define ACL_EVERYONE_MASK 0777 +#define ACL_GROUP_MASK 0070 +#define ACL_EVERYONE_MASK 0007 #define UBITSHIFT 6 #define GBITSHIFT 3 diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 51d53e4bdf6b..b8f1ff9a83f3 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -568,15 +568,15 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, return rc; } } else { - /* We use ses->serverName if no domain name available */ - len = strlen(ses->serverName); + /* We use ses->ip_addr if no domain name available */ + len = strlen(ses->ip_addr); server = kmalloc(2 + (len * 2), GFP_KERNEL); if (server == NULL) { rc = -ENOMEM; return rc; } - len = cifs_strtoUTF16((__le16 *)server, ses->serverName, len, + len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); rc = crypto_shash_update(&ses->server->secmech.sdeschmacmd5->shash, diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 38534e066cc7..d43e935d2df4 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -638,8 +638,18 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",snapshot=%llu", tcon->snapshot_time); if (tcon->handle_timeout) seq_printf(s, ",handletimeout=%u", tcon->handle_timeout); - /* convert actimeo and display it in seconds */ - seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->actimeo / HZ); + + /* + * Display file and directory attribute timeout in seconds. + * If file and directory attribute timeout the same then actimeo + * was likely specified on mount + */ + if (cifs_sb->ctx->acdirmax == cifs_sb->ctx->acregmax) + seq_printf(s, ",actimeo=%lu", cifs_sb->ctx->acregmax / HZ); + else { + seq_printf(s, ",acdirmax=%lu", cifs_sb->ctx->acdirmax / HZ); + seq_printf(s, ",acregmax=%lu", cifs_sb->ctx->acregmax / HZ); + } if (tcon->ses->chan_max > 1) seq_printf(s, ",multichannel,max_channels=%zu", @@ -1526,6 +1536,7 @@ init_cifs(void) */ atomic_set(&sesInfoAllocCount, 0); atomic_set(&tconInfoAllocCount, 0); + atomic_set(&tcpSesNextId, 0); atomic_set(&tcpSesAllocCount, 0); atomic_set(&tcpSesReconnectCount, 0); atomic_set(&tconInfoReconnectCount, 0); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 71e9c6abb2a6..0d7ef150dbb2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -165,5 +165,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.30" +#define CIFS_VERSION "2.31" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 50fcb65920e8..3de3c5908a72 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -504,6 +505,8 @@ struct smb_version_operations { loff_t (*llseek)(struct file *, struct cifs_tcon *, loff_t, int); /* Check for STATUS_IO_TIMEOUT */ bool (*is_status_io_timeout)(char *buf); + /* Check for STATUS_NETWORK_NAME_DELETED */ + void (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); }; struct smb_version_values { @@ -577,6 +580,7 @@ inc_rfc1001_len(void *buf, int count) struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; + __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ /* 15 character server name + 0x20 16th byte indicating type = srv */ char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; @@ -901,7 +905,7 @@ struct cifs_ses { kuid_t linux_uid; /* overriding owner of files on the mount */ kuid_t cred_uid; /* owner of credentials */ unsigned int capabilities; - char serverName[SERVER_NAME_LEN_WITH_NULL]; + char ip_addr[INET6_ADDRSTRLEN + 1]; /* Max ipv6 (or v4) addr string len */ char *user_name; /* must not be null except during init of sess and after mount option parsing we fill it */ char *domainName; @@ -1704,7 +1708,9 @@ static inline bool is_retryable_error(int error) #define CIFS_ECHO_OP 0x080 /* echo request */ #define CIFS_OBREAK_OP 0x0100 /* oplock break request */ #define CIFS_NEG_OP 0x0200 /* negotiate request */ -#define CIFS_OP_MASK 0x0380 /* mask request type */ +/* Lower bitmask values are reserved by others below. */ +#define CIFS_SESS_OP 0x2000 /* session setup request */ +#define CIFS_OP_MASK 0x2380 /* mask request type */ #define CIFS_HAS_CREDITS 0x0400 /* already has credits */ #define CIFS_TRANSFORM_REQ 0x0800 /* transform request before sending */ @@ -1844,6 +1850,7 @@ GLOBAL_EXTERN spinlock_t GlobalMid_Lock; /* protects above & list operations */ */ GLOBAL_EXTERN atomic_t sesInfoAllocCount; GLOBAL_EXTERN atomic_t tconInfoAllocCount; +GLOBAL_EXTERN atomic_t tcpSesNextId; GLOBAL_EXTERN atomic_t tcpSesAllocCount; GLOBAL_EXTERN atomic_t tcpSesReconnectCount; GLOBAL_EXTERN atomic_t tconInfoReconnectCount; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 32f7a013402e..75ce6f742b8d 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -232,6 +232,8 @@ extern unsigned int setup_special_user_owner_ACE(struct cifs_ace *pace); extern void dequeue_mid(struct mid_q_entry *mid, bool malformed); extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, unsigned int to_read); +extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server, + size_t to_read); extern int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 0496934feecb..c279527aae92 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1451,9 +1451,9 @@ cifs_discard_remaining_data(struct TCP_Server_Info *server) while (remaining > 0) { int length; - length = cifs_read_from_socket(server, server->bigbuf, - min_t(unsigned int, remaining, - CIFSMaxBufSize + MAX_HEADER_SIZE(server))); + length = cifs_discard_from_socket(server, + min_t(size_t, remaining, + CIFSMaxBufSize + MAX_HEADER_SIZE(server))); if (length < 0) return length; server->total_read += length; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 4bb9decbbf27..112692300fb6 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -242,7 +242,7 @@ cifs_reconnect(struct TCP_Server_Info *server) server->max_read = 0; cifs_dbg(FYI, "Mark tcp session as need reconnect\n"); - trace_smb3_reconnect(server->CurrentMid, server->hostname); + trace_smb3_reconnect(server->CurrentMid, server->conn_id, server->hostname); /* before reconnecting the tcp session, mark the smb session (uid) and the tid bad so they are not used until reconnected */ @@ -564,6 +564,23 @@ cifs_read_from_socket(struct TCP_Server_Info *server, char *buf, return cifs_readv_from_socket(server, &smb_msg); } +ssize_t +cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read) +{ + struct msghdr smb_msg; + + /* + * iov_iter_discard already sets smb_msg.type and count and iov_offset + * and cifs_readv_from_socket sets msg_control and msg_controllen + * so little to initialize in struct msghdr + */ + smb_msg.msg_name = NULL; + smb_msg.msg_namelen = 0; + iov_iter_discard(&smb_msg.msg_iter, READ, to_read); + + return cifs_readv_from_socket(server, &smb_msg); +} + int cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page, unsigned int page_offset, unsigned int to_read) @@ -846,7 +863,7 @@ static void smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buffer; - int scredits = server->credits; + int scredits, in_flight; /* * SMB1 does not use credits. @@ -857,12 +874,14 @@ smb2_add_credits_from_hdr(char *buffer, struct TCP_Server_Info *server) if (shdr->CreditRequest) { spin_lock(&server->req_lock); server->credits += le16_to_cpu(shdr->CreditRequest); + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, - le16_to_cpu(shdr->CreditRequest)); + server->conn_id, server->hostname, scredits, + le16_to_cpu(shdr->CreditRequest), in_flight); cifs_server_dbg(FYI, "%s: added %u credits total=%d\n", __func__, le16_to_cpu(shdr->CreditRequest), scredits); @@ -993,6 +1012,10 @@ next_pdu: if (mids[i] != NULL) { mids[i]->resp_buf_size = server->pdu_size; + if (bufs[i] && server->ops->is_network_name_deleted) + server->ops->is_network_name_deleted(bufs[i], + server); + if (!mids[i]->multiRsp || mids[i]->multiEnd) mids[i]->callback(mids[i]); @@ -1317,6 +1340,7 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx) goto out_err_crypto_release; } + tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); tcp_ses->noblockcnt = ctx->rootfs; tcp_ses->noblocksnd = ctx->noblocksnd || ctx->rootfs; tcp_ses->noautotune = ctx->noautotune; @@ -1838,9 +1862,9 @@ cifs_get_smb_ses(struct TCP_Server_Info *server, struct smb3_fs_context *ctx) /* new SMB session uses our server ref */ ses->server = server; if (server->dstaddr.ss_family == AF_INET6) - sprintf(ses->serverName, "%pI6", &addr6->sin6_addr); + sprintf(ses->ip_addr, "%pI6", &addr6->sin6_addr); else - sprintf(ses->serverName, "%pI4", &addr->sin_addr); + sprintf(ses->ip_addr, "%pI4", &addr->sin_addr); if (ctx->username) { ses->user_name = kstrdup(ctx->username, GFP_KERNEL); @@ -2269,7 +2293,9 @@ compare_mount_options(struct super_block *sb, struct cifs_mnt_data *mnt_data) if (strcmp(old->local_nls->charset, new->local_nls->charset)) return 0; - if (old->ctx->actimeo != new->ctx->actimeo) + if (old->ctx->acregmax != new->ctx->acregmax) + return 0; + if (old->ctx->acdirmax != new->ctx->acdirmax) return 0; return 1; @@ -2911,7 +2937,7 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, #ifdef CONFIG_CIFS_DFS_UPCALL /* * cifs_build_path_to_root returns full path to root when we do not have an - * exiting connection (tcon) + * existing connection (tcon) */ static char * build_unc_path_to_root(const struct smb3_fs_context *ctx, @@ -3038,96 +3064,91 @@ static int update_vol_info(const struct dfs_cache_tgt_iterator *tgt_it, return 0; } -static int setup_dfs_tgt_conn(const char *path, const char *full_path, - const struct dfs_cache_tgt_iterator *tgt_it, - struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, - unsigned int *xid, struct TCP_Server_Info **server, - struct cifs_ses **ses, struct cifs_tcon **tcon) -{ - int rc; - struct dfs_info3_param ref = {0}; - char *mdata = NULL; - struct smb3_fs_context fake_ctx = {NULL}; - char *fake_devname = NULL; - - cifs_dbg(FYI, "%s: dfs path: %s\n", __func__, path); - - rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); - if (rc) - return rc; - - mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, - full_path + 1, &ref, - &fake_devname); - free_dfs_info_param(&ref); - - if (IS_ERR(mdata)) { - rc = PTR_ERR(mdata); - mdata = NULL; - } else - rc = cifs_setup_volume_info(&fake_ctx, mdata, fake_devname); - - kfree(mdata); - kfree(fake_devname); - - if (!rc) { - /* - * We use a 'fake_ctx' here because we need pass it down to the - * mount_{get,put} functions to test connection against new DFS - * targets. - */ - mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon); - rc = mount_get_conns(&fake_ctx, cifs_sb, xid, server, ses, - tcon); - if (!rc || (*server && *ses)) { - /* - * We were able to connect to new target server. - * Update current context with new target server. - */ - rc = update_vol_info(tgt_it, &fake_ctx, ctx); - } - } - smb3_cleanup_fs_context_contents(&fake_ctx); - return rc; -} - static int do_dfs_failover(const char *path, const char *full_path, struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, struct cifs_ses *root_ses, unsigned int *xid, struct TCP_Server_Info **server, struct cifs_ses **ses, struct cifs_tcon **tcon) { int rc; - struct dfs_cache_tgt_list tgt_list; + struct dfs_cache_tgt_list tgt_list = {0}; struct dfs_cache_tgt_iterator *tgt_it = NULL; + struct smb3_fs_context tmp_ctx = {NULL}; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) return -EOPNOTSUPP; + cifs_dbg(FYI, "%s: path=%s full_path=%s\n", __func__, path, full_path); + rc = dfs_cache_noreq_find(path, NULL, &tgt_list); if (rc) return rc; + /* + * We use a 'tmp_ctx' here because we need pass it down to the mount_{get,put} functions to + * test connection against new DFS targets. + */ + rc = smb3_fs_context_dup(&tmp_ctx, ctx); + if (rc) + goto out; for (;;) { + struct dfs_info3_param ref = {0}; + char *fake_devname = NULL, *mdata = NULL; + /* Get next DFS target server - if any */ rc = get_next_dfs_tgt(path, &tgt_list, &tgt_it); if (rc) break; - /* Connect to next DFS target */ - rc = setup_dfs_tgt_conn(path, full_path, tgt_it, cifs_sb, ctx, xid, server, ses, - tcon); - if (!rc || (*server && *ses)) + + rc = dfs_cache_get_tgt_referral(path, tgt_it, &ref); + if (rc) break; + + cifs_dbg(FYI, "%s: old ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); + + mdata = cifs_compose_mount_options(cifs_sb->ctx->mount_options, full_path + 1, &ref, + &fake_devname); + free_dfs_info_param(&ref); + + if (IS_ERR(mdata)) { + rc = PTR_ERR(mdata); + mdata = NULL; + } else + rc = cifs_setup_volume_info(&tmp_ctx, mdata, fake_devname); + + kfree(mdata); + kfree(fake_devname); + + if (rc) + break; + + cifs_dbg(FYI, "%s: new ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); + + mount_put_conns(cifs_sb, *xid, *server, *ses, *tcon); + rc = mount_get_conns(&tmp_ctx, cifs_sb, xid, server, ses, tcon); + if (!rc || (*server && *ses)) { + /* + * We were able to connect to new target server. Update current context with + * new target server. + */ + rc = update_vol_info(tgt_it, &tmp_ctx, ctx); + break; + } } if (!rc) { + cifs_dbg(FYI, "%s: final ctx: UNC=%s prepath=%s\n", __func__, tmp_ctx.UNC, + tmp_ctx.prepath); /* - * Update DFS target hint in DFS referral cache with the target - * server we successfully reconnected to. + * Update DFS target hint in DFS referral cache with the target server we + * successfully reconnected to. */ - rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, - cifs_sb->local_nls, - cifs_remap(cifs_sb), path, - tgt_it); + rc = dfs_cache_update_tgthint(*xid, root_ses ? root_ses : *ses, cifs_sb->local_nls, + cifs_remap(cifs_sb), path, tgt_it); } + +out: + smb3_cleanup_fs_context_contents(&tmp_ctx); dfs_cache_free_tgts(&tgt_list); return rc; } @@ -3285,77 +3306,77 @@ static void put_root_ses(struct cifs_ses *ses) cifs_put_smb_ses(ses); } -/* Check if a path component is remote and then update @dfs_path accordingly */ -static int check_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, - const unsigned int xid, struct TCP_Server_Info *server, - struct cifs_tcon *tcon, char **dfs_path) +/* Set up next dfs prefix path in @dfs_path */ +static int next_dfs_prepath(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx, + const unsigned int xid, struct TCP_Server_Info *server, + struct cifs_tcon *tcon, char **dfs_path) { - char *path, *s; - char sep = CIFS_DIR_SEP(cifs_sb), tmp; - char *npath; - int rc = 0; - int added_treename = tcon->Flags & SMB_SHARE_IS_IN_DFS; - int skip = added_treename; + char *path, *npath; + int added_treename = is_tcon_dfs(tcon); + int rc; path = cifs_build_path_to_root(ctx, cifs_sb, tcon, added_treename); if (!path) return -ENOMEM; - /* - * Walk through the path components in @path and check if they're accessible. In case any of - * the components is -EREMOTE, then update @dfs_path with the next DFS referral request path - * (NOT including the remaining components). - */ - s = path; - do { - /* skip separators */ - while (*s && *s == sep) - s++; - if (!*s) - break; - /* next separator */ - while (*s && *s != sep) - s++; - /* - * if the treename is added, we then have to skip the first - * part within the separators - */ - if (skip) { - skip = 0; - continue; + rc = is_path_remote(cifs_sb, ctx, xid, server, tcon); + if (rc == -EREMOTE) { + struct smb3_fs_context v = {NULL}; + /* if @path contains a tree name, skip it in the prefix path */ + if (added_treename) { + rc = smb3_parse_devname(path, &v); + if (rc) + goto out; + npath = build_unc_path_to_root(&v, cifs_sb, true); + smb3_cleanup_fs_context_contents(&v); + } else { + v.UNC = ctx->UNC; + v.prepath = path + 1; + npath = build_unc_path_to_root(&v, cifs_sb, true); } - tmp = *s; - *s = 0; - rc = server->ops->is_path_accessible(xid, tcon, cifs_sb, path); - if (rc && rc == -EREMOTE) { - struct smb3_fs_context v = {NULL}; - /* if @path contains a tree name, skip it in the prefix path */ - if (added_treename) { - rc = smb3_parse_devname(path, &v); - if (rc) - break; - rc = -EREMOTE; - npath = build_unc_path_to_root(&v, cifs_sb, true); - smb3_cleanup_fs_context_contents(&v); - } else { - v.UNC = ctx->UNC; - v.prepath = path + 1; - npath = build_unc_path_to_root(&v, cifs_sb, true); - } - if (IS_ERR(npath)) { - rc = PTR_ERR(npath); - break; - } - kfree(*dfs_path); - *dfs_path = npath; - } - *s = tmp; - } while (rc == 0); + if (IS_ERR(npath)) { + rc = PTR_ERR(npath); + goto out; + } + + kfree(*dfs_path); + *dfs_path = npath; + rc = -EREMOTE; + } + +out: kfree(path); return rc; } +/* Check if resolved targets can handle any DFS referrals */ +static int is_referral_server(const char *ref_path, struct cifs_tcon *tcon, bool *ref_server) +{ + int rc; + struct dfs_info3_param ref = {0}; + + if (is_tcon_dfs(tcon)) { + *ref_server = true; + } else { + cifs_dbg(FYI, "%s: ref_path=%s\n", __func__, ref_path); + + rc = dfs_cache_noreq_find(ref_path, &ref, NULL); + if (rc) { + cifs_dbg(VFS, "%s: dfs_cache_noreq_find: failed (rc=%d)\n", __func__, rc); + return rc; + } + cifs_dbg(FYI, "%s: ref.flags=0x%x\n", __func__, ref.flags); + /* + * Check if all targets are capable of handling DFS referrals as per + * MS-DFSC 2.2.4 RESP_GET_DFS_REFERRAL. + */ + *ref_server = !!(ref.flags & DFSREF_REFERRAL_SERVER); + free_dfs_info_param(&ref); + } + return 0; +} + int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) { int rc = 0; @@ -3367,18 +3388,19 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) char *ref_path = NULL, *full_path = NULL; char *oldmnt = NULL; char *mntdata = NULL; + bool ref_server = false; rc = mount_get_conns(ctx, cifs_sb, &xid, &server, &ses, &tcon); /* - * Unconditionally try to get an DFS referral (even cached) to determine whether it is an - * DFS mount. + * If called with 'nodfs' mount option, then skip DFS resolving. Otherwise unconditionally + * try to get an DFS referral (even cached) to determine whether it is an DFS mount. * * Skip prefix path to provide support for DFS referrals from w2k8 servers which don't seem * to respond with PATH_NOT_COVERED to requests that include the prefix. */ - if (dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL, + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS) || + dfs_cache_find(xid, ses, cifs_sb->local_nls, cifs_remap(cifs_sb), ctx->UNC + 1, NULL, NULL)) { - /* No DFS referral was returned. Looks like a regular share. */ if (rc) goto error; /* Check if it is fully accessible and then mount it */ @@ -3432,13 +3454,18 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) break; if (!tcon) continue; + /* Make sure that requests go through new root servers */ - if (is_tcon_dfs(tcon)) { + rc = is_referral_server(ref_path + 1, tcon, &ref_server); + if (rc) + break; + if (ref_server) { put_root_ses(root_ses); set_root_ses(cifs_sb, ses, &root_ses); } - /* Check for remaining path components and then continue chasing them (-EREMOTE) */ - rc = check_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path); + + /* Get next dfs path and then continue chasing them if -EREMOTE */ + rc = next_dfs_prepath(cifs_sb, ctx, xid, server, tcon, &ref_path); /* Prevent recursion on broken link referrals */ if (rc == -EREMOTE && ++count > MAX_NESTED_LINKS) rc = -ELOOP; diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 4950ab0486ae..098b4bc8da59 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -37,11 +37,12 @@ struct cache_dfs_tgt { struct cache_entry { struct hlist_node hlist; const char *path; - int ttl; - int srvtype; - int flags; + int hdr_flags; /* RESP_GET_DFS_REFERRAL.ReferralHeaderFlags */ + int ttl; /* DFS_REREFERRAL_V3.TimeToLive */ + int srvtype; /* DFS_REREFERRAL_V3.ServerType */ + int ref_flags; /* DFS_REREFERRAL_V3.ReferralEntryFlags */ struct timespec64 etime; - int path_consumed; + int path_consumed; /* RESP_GET_DFS_REFERRAL.PathConsumed */ int numtgts; struct list_head tlist; struct cache_dfs_tgt *tgthint; @@ -166,14 +167,11 @@ static int dfscache_proc_show(struct seq_file *m, void *v) continue; seq_printf(m, - "cache entry: path=%s,type=%s,ttl=%d,etime=%ld," - "interlink=%s,path_consumed=%d,expired=%s\n", - ce->path, - ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", - ce->ttl, ce->etime.tv_nsec, - IS_INTERLINK_SET(ce->flags) ? "yes" : "no", - ce->path_consumed, - cache_entry_expired(ce) ? "yes" : "no"); + "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", + ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", + ce->ttl, ce->etime.tv_nsec, ce->ref_flags, ce->hdr_flags, + IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", + ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); list_for_each_entry(t, &ce->tlist, list) { seq_printf(m, " %s%s\n", @@ -236,11 +234,12 @@ static inline void dump_tgts(const struct cache_entry *ce) static inline void dump_ce(const struct cache_entry *ce) { - cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,interlink=%s,path_consumed=%d,expired=%s\n", + cifs_dbg(FYI, "cache entry: path=%s,type=%s,ttl=%d,etime=%ld,hdr_flags=0x%x,ref_flags=0x%x,interlink=%s,path_consumed=%d,expired=%s\n", ce->path, ce->srvtype == DFS_TYPE_ROOT ? "root" : "link", ce->ttl, ce->etime.tv_nsec, - IS_INTERLINK_SET(ce->flags) ? "yes" : "no", + ce->hdr_flags, ce->ref_flags, + IS_INTERLINK_SET(ce->hdr_flags) ? "yes" : "no", ce->path_consumed, cache_entry_expired(ce) ? "yes" : "no"); dump_tgts(ce); @@ -381,7 +380,8 @@ static int copy_ref_data(const struct dfs_info3_param *refs, int numrefs, ce->ttl = refs[0].ttl; ce->etime = get_expire_time(ce->ttl); ce->srvtype = refs[0].server_type; - ce->flags = refs[0].ref_flag; + ce->hdr_flags = refs[0].flags; + ce->ref_flags = refs[0].ref_flag; ce->path_consumed = refs[0].path_consumed; for (i = 0; i < numrefs; i++) { @@ -799,7 +799,8 @@ static int setup_referral(const char *path, struct cache_entry *ce, ref->path_consumed = ce->path_consumed; ref->ttl = ce->ttl; ref->server_type = ce->srvtype; - ref->ref_flag = ce->flags; + ref->ref_flag = ce->ref_flags; + ref->flags = ce->hdr_flags; return 0; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 6d001905c8e5..26de4329d161 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -580,7 +580,7 @@ int cifs_open(struct inode *inode, struct file *file) } else if ((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { if (tcon->ses->serverNOS) cifs_dbg(VFS, "server %s of type %s returned unexpected error on SMB posix open, disabling posix open support. Check if server update available.\n", - tcon->ses->serverName, + tcon->ses->ip_addr, tcon->ses->serverNOS); tcon->broken_posix_open = true; } else if ((rc != -EIO) && (rc != -EREMOTE) && diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 12a5da0230b5..892f51a21278 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -140,6 +140,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("rsize", Opt_rsize), fsparam_u32("wsize", Opt_wsize), fsparam_u32("actimeo", Opt_actimeo), + fsparam_u32("acdirmax", Opt_acdirmax), + fsparam_u32("acregmax", Opt_acregmax), fsparam_u32("echo_interval", Opt_echo_interval), fsparam_u32("max_credits", Opt_max_credits), fsparam_u32("handletimeout", Opt_handletimeout), @@ -397,7 +399,7 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) ctx->vals = &smb3any_values; break; case Smb_default: - ctx->ops = &smb30_operations; /* currently identical with 3.0 */ + ctx->ops = &smb30_operations; ctx->vals = &smbdefault_values; break; default: @@ -542,20 +544,37 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, /* BB Need to add support for sep= here TBD */ while ((key = strsep(&options, ",")) != NULL) { - if (*key) { - size_t v_len = 0; - char *value = strchr(key, '='); + size_t len; + char *value; - if (value) { - if (value == key) - continue; - *value++ = 0; - v_len = strlen(value); - } - ret = vfs_parse_fs_string(fc, key, value, v_len); - if (ret < 0) - break; + if (*key == 0) + break; + + /* Check if following character is the deliminator If yes, + * we have encountered a double deliminator reset the NULL + * character to the deliminator + */ + while (options && options[0] == ',') { + len = strlen(key); + strcpy(key + len, options); + options = strchr(options, ','); + if (options) + *options++ = 0; } + + + len = 0; + value = strchr(key, '='); + if (value) { + if (value == key) + continue; + *value++ = 0; + len = strlen(value); + } + + ret = vfs_parse_fs_string(fc, key, value, len); + if (ret < 0) + break; } return ret; @@ -929,13 +948,32 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->wsize = result.uint_32; ctx->got_wsize = true; break; - case Opt_actimeo: - ctx->actimeo = HZ * result.uint_32; - if (ctx->actimeo > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "attribute cache timeout too large\n"); + case Opt_acregmax: + ctx->acregmax = HZ * result.uint_32; + if (ctx->acregmax > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "acregmax too large\n"); goto cifs_parse_mount_err; } break; + case Opt_acdirmax: + ctx->acdirmax = HZ * result.uint_32; + if (ctx->acdirmax > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "acdirmax too large\n"); + goto cifs_parse_mount_err; + } + break; + case Opt_actimeo: + if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) { + cifs_dbg(VFS, "timeout too large\n"); + goto cifs_parse_mount_err; + } + if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) || + (ctx->acregmax != CIFS_DEF_ACTIMEO)) { + cifs_dbg(VFS, "actimeo ignored since acregmax or acdirmax specified\n"); + break; + } + ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; + break; case Opt_echo_interval: ctx->echo_interval = result.uint_32; break; @@ -1361,7 +1399,8 @@ int smb3_init_fs_context(struct fs_context *fc) /* default is to use strict cifs caching semantics */ ctx->strict_io = true; - ctx->actimeo = CIFS_DEF_ACTIMEO; + ctx->acregmax = CIFS_DEF_ACTIMEO; + ctx->acdirmax = CIFS_DEF_ACTIMEO; /* Most clients set timeout to 0, allows server to use its default */ ctx->handle_timeout = 0; /* See MS-SMB2 spec section 2.2.14.2.12 */ diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 1c44a460e2c0..87dd1f7168f2 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -118,6 +118,8 @@ enum cifs_param { Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_acdirmax, + Opt_acregmax, Opt_echo_interval, Opt_max_credits, Opt_snapshot, @@ -232,7 +234,9 @@ struct smb3_fs_context { unsigned int wsize; unsigned int min_offload; bool sockopt_tcp_nodelay:1; - unsigned long actimeo; /* attribute cache timeout (jiffies) */ + /* attribute cache timemout for files and directories in jiffies */ + unsigned long acregmax; + unsigned long acdirmax; struct smb_version_operations *ops; struct smb_version_values *vals; char *prepath; diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 3e9c7bb23f26..7c61bc9573c0 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -2199,12 +2199,23 @@ cifs_inode_needs_reval(struct inode *inode) if (!lookupCacheEnabled) return true; - if (!cifs_sb->ctx->actimeo) - return true; - - if (!time_in_range(jiffies, cifs_i->time, - cifs_i->time + cifs_sb->ctx->actimeo)) - return true; + /* + * depending on inode type, check if attribute caching disabled for + * files or directories + */ + if (S_ISDIR(inode->i_mode)) { + if (!cifs_sb->ctx->acdirmax) + return true; + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->ctx->acdirmax)) + return true; + } else { /* file */ + if (!cifs_sb->ctx->acregmax) + return true; + if (!time_in_range(jiffies, cifs_i->time, + cifs_i->time + cifs_sb->ctx->acregmax)) + return true; + } /* hardlinked files w/ noserverino get "special" treatment */ if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 213465718fa8..183a3a868d7b 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -218,7 +218,7 @@ cifs_ses_add_channel(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, /* UNC and paths */ /* XXX: Use ses->server->hostname? */ - sprintf(unc, unc_fmt, ses->serverName); + sprintf(unc, unc_fmt, ses->ip_addr); ctx.UNC = unc; ctx.prepath = ""; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f19274857292..f5087295424c 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -63,17 +63,19 @@ smb2_add_credits(struct TCP_Server_Info *server, const struct cifs_credits *credits, const int optype) { int *val, rc = -1; + int scredits, in_flight; unsigned int add = credits->value; unsigned int instance = credits->instance; bool reconnect_detected = false; + bool reconnect_with_invalid_credits = false; spin_lock(&server->req_lock); val = server->ops->get_credits_field(server, optype); /* eg found case where write overlapping reconnect messed up credits */ if (((optype & CIFS_OP_MASK) == CIFS_NEG_OP) && (*val != 0)) - trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, - server->hostname, *val, add); + reconnect_with_invalid_credits = true; + if ((instance == 0) || (instance == server->reconnect_instance)) *val += add; else @@ -84,7 +86,9 @@ smb2_add_credits(struct TCP_Server_Info *server, pr_warn_once("server overflowed SMB3 credits\n"); } server->in_flight--; - if (server->in_flight == 0 && (optype & CIFS_OP_MASK) != CIFS_NEG_OP) + if (server->in_flight == 0 && + ((optype & CIFS_OP_MASK) != CIFS_NEG_OP) && + ((optype & CIFS_OP_MASK) != CIFS_SESS_OP)) rc = change_conf(server); /* * Sometimes server returns 0 credits on oplock break ack - we need to @@ -97,14 +101,26 @@ smb2_add_credits(struct TCP_Server_Info *server, server->oplock_credits++; } } + scredits = *val; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); if (reconnect_detected) { + trace_smb3_reconnect_detected(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "trying to put %d credits from the old server instance %d\n", add, instance); } + if (reconnect_with_invalid_credits) { + trace_smb3_reconnect_with_invalid_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "Negotiate operation when server credits is non-zero. Optype: %d, server credits: %d, credits added: %d\n", + optype, scredits, add); + } + if (server->tcpStatus == CifsNeedReconnect || server->tcpStatus == CifsExiting) return; @@ -123,23 +139,30 @@ smb2_add_credits(struct TCP_Server_Info *server, cifs_dbg(FYI, "disabling oplocks\n"); break; default: - trace_smb3_add_credits(server->CurrentMid, - server->hostname, rc, add); - cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, rc); + /* change_conf rebalanced credits for different types */ + break; } + + trace_smb3_add_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, add, in_flight); + cifs_dbg(FYI, "%s: added %u credits total=%d\n", __func__, add, scredits); } static void smb2_set_credits(struct TCP_Server_Info *server, const int val) { + int scredits, in_flight; + spin_lock(&server->req_lock); server->credits = val; if (val == 1) server->reconnect_instance++; + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_set_credits(server->CurrentMid, - server->hostname, val, val); + server->conn_id, server->hostname, scredits, val, in_flight); cifs_dbg(FYI, "%s: set %u credits\n", __func__, val); /* don't log while holding the lock */ @@ -171,7 +194,7 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, unsigned int *num, struct cifs_credits *credits) { int rc = 0; - unsigned int scredits; + unsigned int scredits, in_flight; spin_lock(&server->req_lock); while (1) { @@ -208,17 +231,18 @@ smb2_wait_mtu_credits(struct TCP_Server_Info *server, unsigned int size, DIV_ROUND_UP(*num, SMB2_MAX_BUFFER_SIZE); credits->instance = server->reconnect_instance; server->credits -= credits->value; - scredits = server->credits; server->in_flight++; if (server->in_flight > server->max_in_flight) server->max_in_flight = server->in_flight; break; } } + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, -(credits->value)); + server->conn_id, server->hostname, scredits, -(credits->value), in_flight); cifs_dbg(FYI, "%s: removed %u credits total=%d\n", __func__, credits->value, scredits); @@ -231,14 +255,14 @@ smb2_adjust_credits(struct TCP_Server_Info *server, const unsigned int payload_size) { int new_val = DIV_ROUND_UP(payload_size, SMB2_MAX_BUFFER_SIZE); - int scredits; + int scredits, in_flight; if (!credits->value || credits->value == new_val) return 0; if (credits->value < new_val) { trace_smb3_too_many_credits(server->CurrentMid, - server->hostname, 0, credits->value - new_val); + server->conn_id, server->hostname, 0, credits->value - new_val, 0); cifs_server_dbg(VFS, "request has less credits (%d) than required (%d)", credits->value, new_val); @@ -248,9 +272,13 @@ smb2_adjust_credits(struct TCP_Server_Info *server, spin_lock(&server->req_lock); if (server->reconnect_instance != credits->instance) { + scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); + trace_smb3_reconnect_detected(server->CurrentMid, - server->hostname, 0, 0); + server->conn_id, server->hostname, scredits, + credits->value - new_val, in_flight); cifs_server_dbg(VFS, "trying to return %d credits to old session\n", credits->value - new_val); return -EAGAIN; @@ -258,15 +286,18 @@ smb2_adjust_credits(struct TCP_Server_Info *server, server->credits += credits->value - new_val; scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); - credits->value = new_val; trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, credits->value - new_val); + server->conn_id, server->hostname, scredits, + credits->value - new_val, in_flight); cifs_dbg(FYI, "%s: adjust added %u credits total=%d\n", __func__, credits->value - new_val, scredits); + credits->value = new_val; + return 0; } @@ -2369,7 +2400,7 @@ static bool smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) { struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; - int scredits; + int scredits, in_flight; if (shdr->Status != STATUS_PENDING) return false; @@ -2378,11 +2409,13 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server) spin_lock(&server->req_lock); server->credits += le16_to_cpu(shdr->CreditRequest); scredits = server->credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); wake_up(&server->request_q); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, le16_to_cpu(shdr->CreditRequest)); + server->conn_id, server->hostname, scredits, + le16_to_cpu(shdr->CreditRequest), in_flight); cifs_dbg(FYI, "%s: status pending add %u credits total=%d\n", __func__, le16_to_cpu(shdr->CreditRequest), scredits); } @@ -2418,6 +2451,34 @@ smb2_is_status_io_timeout(char *buf) return false; } +static void +smb2_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) +{ + struct smb2_sync_hdr *shdr = (struct smb2_sync_hdr *)buf; + struct list_head *tmp, *tmp1; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + if (shdr->Status != STATUS_NETWORK_NAME_DELETED) + return; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &server->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + list_for_each(tmp1, &ses->tcon_list) { + tcon = list_entry(tmp1, struct cifs_tcon, tcon_list); + if (tcon->tid == shdr->TreeId) { + tcon->need_reconnect = true; + spin_unlock(&cifs_tcp_ses_lock); + pr_warn_once("Server share %s deleted.\n", + tcon->treeName); + return; + } + } + } + spin_unlock(&cifs_tcp_ses_lock); +} + static int smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, struct cifsInodeInfo *cinode) @@ -4605,6 +4666,10 @@ static void smb2_decrypt_offload(struct work_struct *work) #ifdef CONFIG_CIFS_STATS2 mid->when_received = jiffies; #endif + if (dw->server->ops->is_network_name_deleted) + dw->server->ops->is_network_name_deleted(dw->buf, + dw->server); + mid->callback(mid); } else { spin_lock(&GlobalMid_Lock); @@ -4723,6 +4788,12 @@ non_offloaded_decrypt: rc = handle_read_data(server, *mid, buf, server->vals->read_rsp_size, pages, npages, len, false); + if (rc >= 0) { + if (server->ops->is_network_name_deleted) { + server->ops->is_network_name_deleted(buf, + server); + } + } } free_pages: @@ -5072,6 +5143,7 @@ struct smb_version_operations smb20_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb21_operations = { @@ -5173,6 +5245,7 @@ struct smb_version_operations smb21_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb30_operations = { @@ -5286,6 +5359,7 @@ struct smb_version_operations smb30_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_operations smb311_operations = { @@ -5399,6 +5473,7 @@ struct smb_version_operations smb311_operations = { .fiemap = smb3_fiemap, .llseek = smb3_llseek, .is_status_io_timeout = smb2_is_status_io_timeout, + .is_network_name_deleted = smb2_is_network_name_deleted, }; struct smb_version_values smb20_values = { diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 794fc3b68b4f..4bbb6126b14d 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -814,8 +814,9 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) SMB3ANY_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); req->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - req->DialectCount = cpu_to_le16(2); - total_len += 4; + req->Dialects[2] = cpu_to_le16(SMB311_PROT_ID); + req->DialectCount = cpu_to_le16(3); + total_len += 6; } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { req->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); @@ -848,6 +849,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) memcpy(req->ClientGUID, server->client_guid, SMB2_CLIENT_GUID_SIZE); if ((server->vals->protocol_id == SMB311_PROT_ID) || + (strcmp(server->vals->version_string, + SMB3ANY_VERSION_STRING) == 0) || (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0)) assemble_neg_contexts(req, server, &total_len); @@ -883,6 +886,10 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) cifs_server_dbg(VFS, "SMB2.1 dialect returned but not requested\n"); return -EIO; + } else if (rsp->DialectRevision == cpu_to_le16(SMB311_PROT_ID)) { + /* ops set to 3.0 by default for default so update */ + server->ops = &smb311_operations; + server->vals = &smb311_values; } } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { @@ -1042,10 +1049,11 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) SMB3ANY_VERSION_STRING) == 0) { pneg_inbuf->Dialects[0] = cpu_to_le16(SMB30_PROT_ID); pneg_inbuf->Dialects[1] = cpu_to_le16(SMB302_PROT_ID); - pneg_inbuf->DialectCount = cpu_to_le16(2); - /* structure is big enough for 3 dialects, sending only 2 */ + pneg_inbuf->Dialects[2] = cpu_to_le16(SMB311_PROT_ID); + pneg_inbuf->DialectCount = cpu_to_le16(3); + /* SMB 2.1 not included so subtract one dialect from len */ inbuflen = sizeof(*pneg_inbuf) - - (2 * sizeof(pneg_inbuf->Dialects[0])); + (sizeof(pneg_inbuf->Dialects[0])); } else if (strcmp(server->vals->version_string, SMBDEFAULT_VERSION_STRING) == 0) { pneg_inbuf->Dialects[0] = cpu_to_le16(SMB21_PROT_ID); @@ -1053,7 +1061,7 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Dialects[2] = cpu_to_le16(SMB302_PROT_ID); pneg_inbuf->Dialects[3] = cpu_to_le16(SMB311_PROT_ID); pneg_inbuf->DialectCount = cpu_to_le16(4); - /* structure is big enough for 3 dialects */ + /* structure is big enough for 4 dialects */ inbuflen = sizeof(*pneg_inbuf); } else { /* otherwise specific dialect was requested */ @@ -1253,7 +1261,7 @@ SMB2_sess_sendreceive(struct SMB2_sess_data *sess_data) cifs_ses_server(sess_data->ses), &rqst, &sess_data->buf0_type, - CIFS_LOG_ERROR | CIFS_NEG_OP, &rsp_iov); + CIFS_LOG_ERROR | CIFS_SESS_OP, &rsp_iov); cifs_small_buf_release(sess_data->iov[0].iov_base); memcpy(&sess_data->iov[0], &rsp_iov, sizeof(struct kvec)); diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index c3d1a584f251..d6df908dccad 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -851,17 +851,21 @@ DEFINE_SMB3_LEASE_ERR_EVENT(lease_err); DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_PROTO(__u64 currmid, + __u64 conn_id, char *hostname), - TP_ARGS(currmid, hostname), + TP_ARGS(currmid, conn_id, hostname), TP_STRUCT__entry( __field(__u64, currmid) + __field(__u64, conn_id) __field(char *, hostname) ), TP_fast_assign( __entry->currmid = currmid; + __entry->conn_id = conn_id; __entry->hostname = hostname; ), - TP_printk("server=%s current_mid=0x%llx", + TP_printk("conn_id=0x%llx server=%s current_mid=%llu", + __entry->conn_id, __entry->hostname, __entry->currmid) ) @@ -869,44 +873,56 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, #define DEFINE_SMB3_RECONNECT_EVENT(name) \ DEFINE_EVENT(smb3_reconnect_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ - char *hostname), \ - TP_ARGS(currmid, hostname)) + __u64 conn_id, \ + char *hostname), \ + TP_ARGS(currmid, conn_id, hostname)) DEFINE_SMB3_RECONNECT_EVENT(reconnect); DEFINE_SMB3_RECONNECT_EVENT(partial_send_reconnect); DECLARE_EVENT_CLASS(smb3_credit_class, TP_PROTO(__u64 currmid, + __u64 conn_id, char *hostname, int credits, - int credits_to_add), - TP_ARGS(currmid, hostname, credits, credits_to_add), + int credits_to_add, + int in_flight), + TP_ARGS(currmid, conn_id, hostname, credits, credits_to_add, in_flight), TP_STRUCT__entry( __field(__u64, currmid) + __field(__u64, conn_id) __field(char *, hostname) __field(int, credits) __field(int, credits_to_add) + __field(int, in_flight) ), TP_fast_assign( __entry->currmid = currmid; + __entry->conn_id = conn_id; __entry->hostname = hostname; __entry->credits = credits; __entry->credits_to_add = credits_to_add; + __entry->in_flight = in_flight; ), - TP_printk("server=%s current_mid=0x%llx credits=%d credits_to_add=%d", + TP_printk("conn_id=0x%llx server=%s current_mid=%llu " + "credits=%d credit_change=%d in_flight=%d", + __entry->conn_id, __entry->hostname, __entry->currmid, __entry->credits, - __entry->credits_to_add) + __entry->credits_to_add, + __entry->in_flight) ) #define DEFINE_SMB3_CREDIT_EVENT(name) \ DEFINE_EVENT(smb3_credit_class, smb3_##name, \ TP_PROTO(__u64 currmid, \ + __u64 conn_id, \ char *hostname, \ int credits, \ - int credits_to_add), \ - TP_ARGS(currmid, hostname, credits, credits_to_add)) + int credits_to_add, \ + int in_flight), \ + TP_ARGS(currmid, conn_id, hostname, credits, credits_to_add, in_flight)) DEFINE_SMB3_CREDIT_EVENT(reconnect_with_invalid_credits); DEFINE_SMB3_CREDIT_EVENT(reconnect_detected); diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 4a2b836eb017..e90a1d1380b0 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -445,7 +445,7 @@ unmask: */ server->tcpStatus = CifsNeedReconnect; trace_smb3_partial_send_reconnect(server->CurrentMid, - server->hostname); + server->conn_id, server->hostname); } smbd_done: if (rc < 0 && rc != -EINTR) @@ -527,7 +527,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, int *credits; int optype; long int t; - int scredits = server->credits; + int scredits, in_flight; if (timeout < 0) t = MAX_JIFFY_OFFSET; @@ -551,23 +551,39 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, server->max_in_flight = server->in_flight; *credits -= 1; *instance = server->reconnect_instance; + scredits = *credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); + + trace_smb3_add_credits(server->CurrentMid, + server->conn_id, server->hostname, scredits, -1, in_flight); + cifs_dbg(FYI, "%s: remove %u credits total=%d\n", + __func__, 1, scredits); + return 0; } while (1) { if (*credits < num_credits) { + scredits = *credits; spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); rc = wait_event_killable_timeout(server->request_q, has_credits(server, credits, num_credits), t); cifs_num_waiters_dec(server); if (!rc) { + spin_lock(&server->req_lock); + scredits = *credits; + in_flight = server->in_flight; + spin_unlock(&server->req_lock); + trace_smb3_credit_timeout(server->CurrentMid, - server->hostname, num_credits, 0); + server->conn_id, server->hostname, scredits, + num_credits, in_flight); cifs_server_dbg(VFS, "wait timed out after %d ms\n", - timeout); - return -ENOTSUPP; + timeout); + return -EBUSY; } if (rc == -ERESTARTSYS) return -ERESTARTSYS; @@ -595,6 +611,7 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, server->in_flight > 2 * MAX_COMPOUND && *credits <= MAX_COMPOUND) { spin_unlock(&server->req_lock); + cifs_num_waiters_inc(server); rc = wait_event_killable_timeout( server->request_q, @@ -603,13 +620,18 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, t); cifs_num_waiters_dec(server); if (!rc) { + spin_lock(&server->req_lock); + scredits = *credits; + in_flight = server->in_flight; + spin_unlock(&server->req_lock); + trace_smb3_credit_timeout( - server->CurrentMid, - server->hostname, num_credits, - 0); + server->CurrentMid, + server->conn_id, server->hostname, + scredits, num_credits, in_flight); cifs_server_dbg(VFS, "wait timed out after %d ms\n", - timeout); - return -ENOTSUPP; + timeout); + return -EBUSY; } if (rc == -ERESTARTSYS) return -ERESTARTSYS; @@ -625,16 +647,18 @@ wait_for_free_credits(struct TCP_Server_Info *server, const int num_credits, /* update # of requests on the wire to server */ if ((flags & CIFS_TIMEOUT_MASK) != CIFS_BLOCKING_OP) { *credits -= num_credits; - scredits = *credits; server->in_flight += num_credits; if (server->in_flight > server->max_in_flight) server->max_in_flight = server->in_flight; *instance = server->reconnect_instance; } + scredits = *credits; + in_flight = server->in_flight; spin_unlock(&server->req_lock); trace_smb3_add_credits(server->CurrentMid, - server->hostname, scredits, -(num_credits)); + server->conn_id, server->hostname, scredits, + -(num_credits), in_flight); cifs_dbg(FYI, "%s: remove %u credits total=%d\n", __func__, num_credits, scredits); break; @@ -656,13 +680,13 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num, const int flags, unsigned int *instance) { int *credits; - int scredits, sin_flight; + int scredits, in_flight; credits = server->ops->get_credits_field(server, flags & CIFS_OP_MASK); spin_lock(&server->req_lock); scredits = *credits; - sin_flight = server->in_flight; + in_flight = server->in_flight; if (*credits < num) { /* @@ -684,10 +708,11 @@ wait_for_compound_request(struct TCP_Server_Info *server, int num, if (server->in_flight == 0) { spin_unlock(&server->req_lock); trace_smb3_insufficient_credits(server->CurrentMid, - server->hostname, scredits, sin_flight); + server->conn_id, server->hostname, scredits, + num, in_flight); cifs_dbg(FYI, "%s: %d requests in flight, needed %d total=%d\n", - __func__, sin_flight, num, scredits); - return -ENOTSUPP; + __func__, in_flight, num, scredits); + return -EDEADLK; } } spin_unlock(&server->req_lock); @@ -1171,7 +1196,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) smb311_update_preauth_hash(ses, rqst[0].rq_iov, rqst[0].rq_nvec); @@ -1236,7 +1261,7 @@ compound_send_recv(const unsigned int xid, struct cifs_ses *ses, /* * Compounding is never used during session establish. */ - if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP)) { + if ((ses->status == CifsNew) || (optype & CIFS_NEG_OP) || (optype & CIFS_SESS_OP)) { struct kvec iov = { .iov_base = resp_iov[0].iov_base, .iov_len = resp_iov[0].iov_len diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 68376fa03880..c9775d5c6594 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -491,8 +491,8 @@ static void init_dinode(struct gfs2_inode *dip, struct gfs2_inode *ip, di = (struct gfs2_dinode *)dibh->b_data; gfs2_dinode_out(ip, di); - di->di_major = cpu_to_be32(MAJOR(ip->i_inode.i_rdev)); - di->di_minor = cpu_to_be32(MINOR(ip->i_inode.i_rdev)); + di->di_major = cpu_to_be32(imajor(&ip->i_inode)); + di->di_minor = cpu_to_be32(iminor(&ip->i_inode)); di->__pad1 = 0; di->__pad2 = 0; di->__pad3 = 0; diff --git a/fs/inode.c b/fs/inode.c index 6dba963d3f6d..a047ab306f9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -142,6 +142,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) atomic_set(&inode->i_count, 1); inode->i_op = &empty_iops; inode->i_fop = &no_open_fops; + inode->i_ino = 0; inode->__i_nlink = 1; inode->i_opflags = 0; if (sb->s_xattr) diff --git a/fs/io-wq.c b/fs/io-wq.c index c36bbcd823ce..44e20248805a 100644 --- a/fs/io-wq.c +++ b/fs/io-wq.c @@ -13,13 +13,9 @@ #include #include #include -#include #include -#include -#include -#include -#include #include +#include #include "../kernel/sched/sched.h" #include "io-wq.h" @@ -36,7 +32,6 @@ enum { enum { IO_WQ_BIT_EXIT = 0, /* wq exiting */ - IO_WQ_BIT_ERROR = 1, /* error on setup */ }; enum { @@ -57,14 +52,12 @@ struct io_worker { struct io_wq_work *cur_work; spinlock_t lock; - struct rcu_head rcu; - struct mm_struct *mm; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *blkcg_css; -#endif const struct cred *cur_creds; const struct cred *saved_creds; - struct nsproxy *restore_nsproxy; + + struct completion ref_done; + + struct rcu_head rcu; }; #if BITS_PER_LONG == 64 @@ -93,7 +86,6 @@ struct io_wqe { struct { raw_spinlock_t lock; struct io_wq_work_list work_list; - unsigned long hash_map; unsigned flags; } ____cacheline_aligned_in_smp; @@ -103,6 +95,8 @@ struct io_wqe { struct hlist_nulls_head free_list; struct list_head all_list; + struct wait_queue_entry wait; + struct io_wq *wq; struct io_wq_work *hash_tail[IO_WQ_NR_HASH_BUCKETS]; }; @@ -119,12 +113,15 @@ struct io_wq { struct task_struct *manager; struct user_struct *user; + + struct io_wq_hash *hash; + refcount_t refs; struct completion done; struct hlist_node cpuhp_node; - refcount_t use_refs; + pid_t task_pid; }; static enum cpuhp_state io_wq_online; @@ -137,62 +134,7 @@ static bool io_worker_get(struct io_worker *worker) static void io_worker_release(struct io_worker *worker) { if (refcount_dec_and_test(&worker->ref)) - wake_up_process(worker->task); -} - -/* - * Note: drops the wqe->lock if returning true! The caller must re-acquire - * the lock in that case. Some callers need to restart handling if this - * happens, so we can't just re-acquire the lock on behalf of the caller. - */ -static bool __io_worker_unuse(struct io_wqe *wqe, struct io_worker *worker) -{ - bool dropped_lock = false; - - if (worker->saved_creds) { - revert_creds(worker->saved_creds); - worker->cur_creds = worker->saved_creds = NULL; - } - - if (current->files) { - __acquire(&wqe->lock); - raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - - task_lock(current); - current->files = NULL; - current->nsproxy = worker->restore_nsproxy; - task_unlock(current); - } - - if (current->fs) - current->fs = NULL; - - /* - * If we have an active mm, we need to drop the wq lock before unusing - * it. If we do, return true and let the caller retry the idle loop. - */ - if (worker->mm) { - if (!dropped_lock) { - __acquire(&wqe->lock); - raw_spin_unlock_irq(&wqe->lock); - dropped_lock = true; - } - __set_current_state(TASK_RUNNING); - kthread_unuse_mm(worker->mm); - mmput(worker->mm); - worker->mm = NULL; - } - -#ifdef CONFIG_BLK_CGROUP - if (worker->blkcg_css) { - kthread_associate_blkcg(NULL); - worker->blkcg_css = NULL; - } -#endif - if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - return dropped_lock; + complete(&worker->ref_done); } static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, @@ -204,9 +146,10 @@ static inline struct io_wqe_acct *io_work_get_acct(struct io_wqe *wqe, return &wqe->acct[IO_WQ_ACCT_BOUND]; } -static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe, - struct io_worker *worker) +static inline struct io_wqe_acct *io_wqe_get_acct(struct io_worker *worker) { + struct io_wqe *wqe = worker->wqe; + if (worker->flags & IO_WORKER_F_BOUND) return &wqe->acct[IO_WQ_ACCT_BOUND]; @@ -216,39 +159,36 @@ static inline struct io_wqe_acct *io_wqe_get_acct(struct io_wqe *wqe, static void io_worker_exit(struct io_worker *worker) { struct io_wqe *wqe = worker->wqe; - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + unsigned flags; - /* - * If we're not at zero, someone else is holding a brief reference - * to the worker. Wait for that to go away. - */ - set_current_state(TASK_INTERRUPTIBLE); - if (!refcount_dec_and_test(&worker->ref)) - schedule(); - __set_current_state(TASK_RUNNING); + if (refcount_dec_and_test(&worker->ref)) + complete(&worker->ref_done); + wait_for_completion(&worker->ref_done); preempt_disable(); current->flags &= ~PF_IO_WORKER; - if (worker->flags & IO_WORKER_F_RUNNING) + flags = worker->flags; + worker->flags = 0; + if (flags & IO_WORKER_F_RUNNING) atomic_dec(&acct->nr_running); - if (!(worker->flags & IO_WORKER_F_BOUND)) - atomic_dec(&wqe->wq->user->processes); worker->flags = 0; preempt_enable(); - raw_spin_lock_irq(&wqe->lock); - hlist_nulls_del_rcu(&worker->nulls_node); - list_del_rcu(&worker->all_list); - if (__io_worker_unuse(wqe, worker)) { - __release(&wqe->lock); - raw_spin_lock_irq(&wqe->lock); + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; } + + raw_spin_lock_irq(&wqe->lock); + if (flags & IO_WORKER_F_FREE) + hlist_nulls_del_rcu(&worker->nulls_node); + list_del_rcu(&worker->all_list); acct->nr_workers--; raw_spin_unlock_irq(&wqe->lock); kfree_rcu(worker, rcu); - if (refcount_dec_and_test(&wqe->wq->refs)) - complete(&wqe->wq->done); + io_wq_put(wqe->wq); } static inline bool io_wqe_run_queue(struct io_wqe *wqe) @@ -306,33 +246,27 @@ static void io_wqe_wake_worker(struct io_wqe *wqe, struct io_wqe_acct *acct) wake_up_process(wqe->wq->manager); } -static void io_wqe_inc_running(struct io_wqe *wqe, struct io_worker *worker) +static void io_wqe_inc_running(struct io_worker *worker) { - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); atomic_inc(&acct->nr_running); } -static void io_wqe_dec_running(struct io_wqe *wqe, struct io_worker *worker) +static void io_wqe_dec_running(struct io_worker *worker) __must_hold(wqe->lock) { - struct io_wqe_acct *acct = io_wqe_get_acct(wqe, worker); + struct io_wqe_acct *acct = io_wqe_get_acct(worker); + struct io_wqe *wqe = worker->wqe; if (atomic_dec_and_test(&acct->nr_running) && io_wqe_run_queue(wqe)) io_wqe_wake_worker(wqe, acct); } -static void io_worker_start(struct io_wqe *wqe, struct io_worker *worker) +static void io_worker_start(struct io_worker *worker) { - allow_kernel_signal(SIGINT); - - current->flags |= PF_IO_WORKER; - current->fs = NULL; - current->files = NULL; - worker->flags |= (IO_WORKER_F_UP | IO_WORKER_F_RUNNING); - worker->restore_nsproxy = current->nsproxy; - io_wqe_inc_running(wqe, worker); + io_wqe_inc_running(worker); } /* @@ -357,19 +291,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, worker_bound = (worker->flags & IO_WORKER_F_BOUND) != 0; work_bound = (work->flags & IO_WQ_WORK_UNBOUND) == 0; if (worker_bound != work_bound) { - io_wqe_dec_running(wqe, worker); + io_wqe_dec_running(worker); if (work_bound) { worker->flags |= IO_WORKER_F_BOUND; wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers--; wqe->acct[IO_WQ_ACCT_BOUND].nr_workers++; - atomic_dec(&wqe->wq->user->processes); } else { worker->flags &= ~IO_WORKER_F_BOUND; wqe->acct[IO_WQ_ACCT_UNBOUND].nr_workers++; wqe->acct[IO_WQ_ACCT_BOUND].nr_workers--; - atomic_inc(&wqe->wq->user->processes); } - io_wqe_inc_running(wqe, worker); + io_wqe_inc_running(worker); } } @@ -380,15 +312,17 @@ static void __io_worker_busy(struct io_wqe *wqe, struct io_worker *worker, * retry the loop in that case (we changed task state), we don't regrab * the lock if we return success. */ -static bool __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) +static void __io_worker_idle(struct io_wqe *wqe, struct io_worker *worker) __must_hold(wqe->lock) { if (!(worker->flags & IO_WORKER_F_FREE)) { worker->flags |= IO_WORKER_F_FREE; hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); } - - return __io_worker_unuse(wqe, worker); + if (worker->saved_creds) { + revert_creds(worker->saved_creds); + worker->cur_creds = worker->saved_creds = NULL; + } } static inline unsigned int io_get_work_hash(struct io_wq_work *work) @@ -396,14 +330,31 @@ static inline unsigned int io_get_work_hash(struct io_wq_work *work) return work->flags >> IO_WQ_HASH_SHIFT; } +static void io_wait_on_hash(struct io_wqe *wqe, unsigned int hash) +{ + struct io_wq *wq = wqe->wq; + + spin_lock(&wq->hash->wait.lock); + if (list_empty(&wqe->wait.entry)) { + __add_wait_queue(&wq->hash->wait, &wqe->wait); + if (!test_bit(hash, &wq->hash->map)) { + __set_current_state(TASK_RUNNING); + list_del_init(&wqe->wait.entry); + } + } + spin_unlock(&wq->hash->wait.lock); +} + static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) __must_hold(wqe->lock) { struct io_wq_work_node *node, *prev; struct io_wq_work *work, *tail; - unsigned int hash; + unsigned int stall_hash = -1U; wq_list_for_each(node, prev, &wqe->work_list) { + unsigned int hash; + work = container_of(node, struct io_wq_work, list); /* not hashed, can run anytime */ @@ -412,111 +363,60 @@ static struct io_wq_work *io_get_next_work(struct io_wqe *wqe) return work; } - /* hashed, can run if not already running */ hash = io_get_work_hash(work); - if (!(wqe->hash_map & BIT(hash))) { - wqe->hash_map |= BIT(hash); - /* all items with this hash lie in [work, tail] */ - tail = wqe->hash_tail[hash]; + /* all items with this hash lie in [work, tail] */ + tail = wqe->hash_tail[hash]; + + /* hashed, can run if not already running */ + if (!test_and_set_bit(hash, &wqe->wq->hash->map)) { wqe->hash_tail[hash] = NULL; wq_list_cut(&wqe->work_list, &tail->list, prev); return work; } + if (stall_hash == -1U) + stall_hash = hash; + /* fast forward to a next hash, for-each will fix up @prev */ + node = &tail->list; + } + + if (stall_hash != -1U) { + raw_spin_unlock(&wqe->lock); + io_wait_on_hash(wqe, stall_hash); + raw_spin_lock(&wqe->lock); } return NULL; } -static void io_wq_switch_mm(struct io_worker *worker, struct io_wq_work *work) +static void io_flush_signals(void) { - if (worker->mm) { - kthread_unuse_mm(worker->mm); - mmput(worker->mm); - worker->mm = NULL; + if (unlikely(test_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL))) { + if (current->task_works) + task_work_run(); + clear_tsk_thread_flag(current, TIF_NOTIFY_SIGNAL); } - - if (mmget_not_zero(work->identity->mm)) { - kthread_use_mm(work->identity->mm); - worker->mm = work->identity->mm; - return; - } - - /* failed grabbing mm, ensure work gets cancelled */ - work->flags |= IO_WQ_WORK_CANCEL; -} - -static inline void io_wq_switch_blkcg(struct io_worker *worker, - struct io_wq_work *work) -{ -#ifdef CONFIG_BLK_CGROUP - if (!(work->flags & IO_WQ_WORK_BLKCG)) - return; - if (work->identity->blkcg_css != worker->blkcg_css) { - kthread_associate_blkcg(work->identity->blkcg_css); - worker->blkcg_css = work->identity->blkcg_css; - } -#endif } static void io_wq_switch_creds(struct io_worker *worker, struct io_wq_work *work) { - const struct cred *old_creds = override_creds(work->identity->creds); + const struct cred *old_creds = override_creds(work->creds); - worker->cur_creds = work->identity->creds; + worker->cur_creds = work->creds; if (worker->saved_creds) put_cred(old_creds); /* creds set by previous switch */ else worker->saved_creds = old_creds; } -static void io_impersonate_work(struct io_worker *worker, - struct io_wq_work *work) -{ - if ((work->flags & IO_WQ_WORK_FILES) && - current->files != work->identity->files) { - task_lock(current); - current->files = work->identity->files; - current->nsproxy = work->identity->nsproxy; - task_unlock(current); - if (!work->identity->files) { - /* failed grabbing files, ensure work gets cancelled */ - work->flags |= IO_WQ_WORK_CANCEL; - } - } - if ((work->flags & IO_WQ_WORK_FS) && current->fs != work->identity->fs) - current->fs = work->identity->fs; - if ((work->flags & IO_WQ_WORK_MM) && work->identity->mm != worker->mm) - io_wq_switch_mm(worker, work); - if ((work->flags & IO_WQ_WORK_CREDS) && - worker->cur_creds != work->identity->creds) - io_wq_switch_creds(worker, work); - if (work->flags & IO_WQ_WORK_FSIZE) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = work->identity->fsize; - else if (current->signal->rlim[RLIMIT_FSIZE].rlim_cur != RLIM_INFINITY) - current->signal->rlim[RLIMIT_FSIZE].rlim_cur = RLIM_INFINITY; - io_wq_switch_blkcg(worker, work); -#ifdef CONFIG_AUDIT - current->loginuid = work->identity->loginuid; - current->sessionid = work->identity->sessionid; -#endif -} - static void io_assign_current_work(struct io_worker *worker, struct io_wq_work *work) { if (work) { - /* flush pending signals before assigning new work */ - if (signal_pending(current)) - flush_signals(current); + io_flush_signals(); cond_resched(); } -#ifdef CONFIG_AUDIT - current->loginuid = KUIDT_INIT(AUDIT_UID_UNSET); - current->sessionid = AUDIT_SID_UNSET; -#endif - spin_lock_irq(&worker->lock); worker->cur_work = work; spin_unlock_irq(&worker->lock); @@ -550,6 +450,7 @@ get_next: if (!work) break; io_assign_current_work(worker, work); + __set_current_state(TASK_RUNNING); /* handle a whole dependent link */ do { @@ -557,7 +458,8 @@ get_next: unsigned int hash = io_get_work_hash(work); next_hashed = wq_next_work(work); - io_impersonate_work(worker, work); + if (work->creds && worker->cur_creds != work->creds) + io_wq_switch_creds(worker, work); wq->do_work(work); io_assign_current_work(worker, NULL); @@ -572,8 +474,10 @@ get_next: io_wqe_enqueue(wqe, linked); if (hash != -1U && !next_hashed) { + clear_bit(hash, &wq->hash->map); + if (wq_has_sleeper(&wq->hash->wait)) + wake_up(&wq->hash->wait); raw_spin_lock_irq(&wqe->lock); - wqe->hash_map &= ~BIT_ULL(hash); wqe->flags &= ~IO_WQE_FLAG_STALLED; /* skip unnecessary unlock-lock wqe->lock */ if (!work) @@ -592,27 +496,23 @@ static int io_wqe_worker(void *data) struct io_wqe *wqe = worker->wqe; struct io_wq *wq = wqe->wq; - io_worker_start(wqe, worker); + io_worker_start(worker); while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)) { set_current_state(TASK_INTERRUPTIBLE); loop: raw_spin_lock_irq(&wqe->lock); if (io_wqe_run_queue(wqe)) { - __set_current_state(TASK_RUNNING); io_worker_handle_work(worker); goto loop; } - /* drops the lock on success, retry */ - if (__io_worker_idle(wqe, worker)) { - __release(&wqe->lock); - goto loop; - } + __io_worker_idle(wqe, worker); raw_spin_unlock_irq(&wqe->lock); - if (signal_pending(current)) - flush_signals(current); + io_flush_signals(); if (schedule_timeout(WORKER_IDLE_TIMEOUT)) continue; + if (fatal_signal_pending(current)) + break; /* timed out, exit unless we're the fixed worker */ if (test_bit(IO_WQ_BIT_EXIT, &wq->state) || !(worker->flags & IO_WORKER_F_FIXED)) @@ -636,15 +536,16 @@ loop: */ void io_wq_worker_running(struct task_struct *tsk) { - struct io_worker *worker = kthread_data(tsk); - struct io_wqe *wqe = worker->wqe; + struct io_worker *worker = tsk->pf_io_worker; + if (!worker) + return; if (!(worker->flags & IO_WORKER_F_UP)) return; if (worker->flags & IO_WORKER_F_RUNNING) return; worker->flags |= IO_WORKER_F_RUNNING; - io_wqe_inc_running(wqe, worker); + io_wqe_inc_running(worker); } /* @@ -654,9 +555,10 @@ void io_wq_worker_running(struct task_struct *tsk) */ void io_wq_worker_sleeping(struct task_struct *tsk) { - struct io_worker *worker = kthread_data(tsk); - struct io_wqe *wqe = worker->wqe; + struct io_worker *worker = tsk->pf_io_worker; + if (!worker) + return; if (!(worker->flags & IO_WORKER_F_UP)) return; if (!(worker->flags & IO_WORKER_F_RUNNING)) @@ -664,32 +566,27 @@ void io_wq_worker_sleeping(struct task_struct *tsk) worker->flags &= ~IO_WORKER_F_RUNNING; - raw_spin_lock_irq(&wqe->lock); - io_wqe_dec_running(wqe, worker); - raw_spin_unlock_irq(&wqe->lock); + raw_spin_lock_irq(&worker->wqe->lock); + io_wqe_dec_running(worker); + raw_spin_unlock_irq(&worker->wqe->lock); } -static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +static int task_thread(void *data, int index) { + struct io_worker *worker = data; + struct io_wqe *wqe = worker->wqe; struct io_wqe_acct *acct = &wqe->acct[index]; - struct io_worker *worker; + struct io_wq *wq = wqe->wq; + char buf[TASK_COMM_LEN]; - worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); - if (!worker) - return false; + sprintf(buf, "iou-wrk-%d", wq->task_pid); + set_task_comm(current, buf); - refcount_set(&worker->ref, 1); - worker->nulls_node.pprev = NULL; - worker->wqe = wqe; - spin_lock_init(&worker->lock); + current->pf_io_worker = worker; + worker->task = current; - worker->task = kthread_create_on_node(io_wqe_worker, worker, wqe->node, - "io_wqe_worker-%d/%d", index, wqe->node); - if (IS_ERR(worker->task)) { - kfree(worker); - return false; - } - kthread_bind_mask(worker->task, cpumask_of_node(wqe->node)); + set_cpus_allowed_ptr(current, cpumask_of_node(wqe->node)); + current->flags |= PF_NO_SETAFFINITY; raw_spin_lock_irq(&wqe->lock); hlist_nulls_add_head_rcu(&worker->nulls_node, &wqe->free_list); @@ -702,11 +599,63 @@ static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) acct->nr_workers++; raw_spin_unlock_irq(&wqe->lock); - if (index == IO_WQ_ACCT_UNBOUND) - atomic_inc(&wq->user->processes); + io_wqe_worker(data); + do_exit(0); +} + +static int task_thread_bound(void *data) +{ + return task_thread(data, IO_WQ_ACCT_BOUND); +} + +static int task_thread_unbound(void *data) +{ + return task_thread(data, IO_WQ_ACCT_UNBOUND); +} + +pid_t io_wq_fork_thread(int (*fn)(void *), void *arg) +{ + unsigned long flags = CLONE_FS|CLONE_FILES|CLONE_SIGHAND|CLONE_THREAD| + CLONE_IO|SIGCHLD; + struct kernel_clone_args args = { + .flags = ((lower_32_bits(flags) | CLONE_VM | + CLONE_UNTRACED) & ~CSIGNAL), + .exit_signal = (lower_32_bits(flags) & CSIGNAL), + .stack = (unsigned long)fn, + .stack_size = (unsigned long)arg, + }; + + return kernel_clone(&args); +} + +static bool create_io_worker(struct io_wq *wq, struct io_wqe *wqe, int index) +{ + struct io_worker *worker; + pid_t pid; + + __set_current_state(TASK_RUNNING); + + worker = kzalloc_node(sizeof(*worker), GFP_KERNEL, wqe->node); + if (!worker) + return false; + + refcount_set(&worker->ref, 1); + worker->nulls_node.pprev = NULL; + worker->wqe = wqe; + spin_lock_init(&worker->lock); + init_completion(&worker->ref_done); refcount_inc(&wq->refs); - wake_up_process(worker->task); + + if (index == IO_WQ_ACCT_BOUND) + pid = io_wq_fork_thread(task_thread_bound, worker); + else + pid = io_wq_fork_thread(task_thread_unbound, worker); + if (pid < 0) { + io_wq_put(wq); + kfree(worker); + return false; + } return true; } @@ -752,93 +701,57 @@ static bool io_wq_worker_wake(struct io_worker *worker, void *data) return false; } +static void io_wq_check_workers(struct io_wq *wq) +{ + int node; + + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; + bool fork_worker[2] = { false, false }; + + if (!node_online(node)) + continue; + + raw_spin_lock_irq(&wqe->lock); + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) + fork_worker[IO_WQ_ACCT_BOUND] = true; + if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) + fork_worker[IO_WQ_ACCT_UNBOUND] = true; + raw_spin_unlock_irq(&wqe->lock); + if (fork_worker[IO_WQ_ACCT_BOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); + if (fork_worker[IO_WQ_ACCT_UNBOUND]) + create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); + } +} + /* * Manager thread. Tasked with creating new workers, if we need them. */ static int io_wq_manager(void *data) { struct io_wq *wq = data; - int node; + char buf[TASK_COMM_LEN]; - /* create fixed workers */ - refcount_set(&wq->refs, 1); - for_each_node(node) { - if (!node_online(node)) - continue; - if (create_io_worker(wq, wq->wqes[node], IO_WQ_ACCT_BOUND)) - continue; - set_bit(IO_WQ_BIT_ERROR, &wq->state); - set_bit(IO_WQ_BIT_EXIT, &wq->state); - goto out; - } + sprintf(buf, "iou-mgr-%d", wq->task_pid); + set_task_comm(current, buf); + current->flags |= PF_IO_WORKER; + wq->manager = current; complete(&wq->done); - while (!kthread_should_stop()) { - if (current->task_works) - task_work_run(); - - for_each_node(node) { - struct io_wqe *wqe = wq->wqes[node]; - bool fork_worker[2] = { false, false }; - - if (!node_online(node)) - continue; - - raw_spin_lock_irq(&wqe->lock); - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_BOUND)) - fork_worker[IO_WQ_ACCT_BOUND] = true; - if (io_wqe_need_worker(wqe, IO_WQ_ACCT_UNBOUND)) - fork_worker[IO_WQ_ACCT_UNBOUND] = true; - raw_spin_unlock_irq(&wqe->lock); - if (fork_worker[IO_WQ_ACCT_BOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_BOUND); - if (fork_worker[IO_WQ_ACCT_UNBOUND]) - create_io_worker(wq, wqe, IO_WQ_ACCT_UNBOUND); - } + do { set_current_state(TASK_INTERRUPTIBLE); + io_wq_check_workers(wq); schedule_timeout(HZ); - } + if (fatal_signal_pending(current)) + set_bit(IO_WQ_BIT_EXIT, &wq->state); + } while (!test_bit(IO_WQ_BIT_EXIT, &wq->state)); - if (current->task_works) - task_work_run(); - -out: - if (refcount_dec_and_test(&wq->refs)) { - complete(&wq->done); - return 0; - } - /* if ERROR is set and we get here, we have workers to wake */ - if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { - rcu_read_lock(); - for_each_node(node) - io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); - rcu_read_unlock(); - } - return 0; -} - -static bool io_wq_can_queue(struct io_wqe *wqe, struct io_wqe_acct *acct, - struct io_wq_work *work) -{ - bool free_worker; - - if (!(work->flags & IO_WQ_WORK_UNBOUND)) - return true; - if (atomic_read(&acct->nr_running)) - return true; - - rcu_read_lock(); - free_worker = !hlist_nulls_empty(&wqe->free_list); - rcu_read_unlock(); - if (free_worker) - return true; - - if (atomic_read(&wqe->wq->user->processes) >= acct->max_workers && - !(capable(CAP_SYS_RESOURCE) || capable(CAP_SYS_ADMIN))) - return false; - - return true; + io_wq_check_workers(wq); + wq->manager = NULL; + io_wq_put(wq); + do_exit(0); } static void io_run_cancel(struct io_wq_work *work, struct io_wqe *wqe) @@ -872,20 +785,37 @@ append: wq_list_add_after(&work->list, &tail->list, &wqe->work_list); } +static int io_wq_fork_manager(struct io_wq *wq) +{ + int ret; + + if (wq->manager) + return 0; + + clear_bit(IO_WQ_BIT_EXIT, &wq->state); + refcount_inc(&wq->refs); + current->flags |= PF_IO_WORKER; + ret = io_wq_fork_thread(io_wq_manager, wq); + current->flags &= ~PF_IO_WORKER; + if (ret >= 0) { + wait_for_completion(&wq->done); + return 0; + } + + io_wq_put(wq); + return ret; +} + static void io_wqe_enqueue(struct io_wqe *wqe, struct io_wq_work *work) { struct io_wqe_acct *acct = io_work_get_acct(wqe, work); int work_flags; unsigned long flags; - /* - * Do early check to see if we need a new unbound worker, and if we do, - * if we're allowed to do so. This isn't 100% accurate as there's a - * gap between this check and incrementing the value, but that's OK. - * It's close enough to not be an issue, fork() has the same delay. - */ - if (unlikely(!io_wq_can_queue(wqe, acct, work))) { - io_run_cancel(work, wqe); + /* Can only happen if manager creation fails after exec */ + if (unlikely(io_wq_fork_manager(wqe->wq))) { + work->flags |= IO_WQ_WORK_CANCEL; + wqe->wq->do_work(work); return; } @@ -939,7 +869,7 @@ static bool io_wq_worker_cancel(struct io_worker *worker, void *data) spin_lock_irqsave(&worker->lock, flags); if (worker->cur_work && match->fn(worker->cur_work, match->data)) { - send_sig(SIGINT, worker->task, 1); + set_notify_signal(worker->task); match->nr_running++; } spin_unlock_irqrestore(&worker->lock, flags); @@ -1043,6 +973,24 @@ enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, return IO_WQ_CANCEL_NOTFOUND; } +static int io_wqe_hash_wake(struct wait_queue_entry *wait, unsigned mode, + int sync, void *key) +{ + struct io_wqe *wqe = container_of(wait, struct io_wqe, wait); + int ret; + + list_del_init(&wait->entry); + + rcu_read_lock(); + ret = io_wqe_activate_free_worker(wqe); + rcu_read_unlock(); + + if (!ret) + wake_up_process(wqe->wq->manager); + + return 1; +} + struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) { int ret = -ENOMEM, node; @@ -1063,12 +1011,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) if (ret) goto err_wqes; + refcount_inc(&data->hash->refs); + wq->hash = data->hash; wq->free_work = data->free_work; wq->do_work = data->do_work; - /* caller must already hold a reference to this */ - wq->user = data->user; - ret = -ENOMEM; for_each_node(node) { struct io_wqe *wqe; @@ -1083,11 +1030,11 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) wqe->node = alloc_node; wqe->acct[IO_WQ_ACCT_BOUND].max_workers = bounded; atomic_set(&wqe->acct[IO_WQ_ACCT_BOUND].nr_running, 0); - if (wq->user) { - wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = + wqe->acct[IO_WQ_ACCT_UNBOUND].max_workers = task_rlimit(current, RLIMIT_NPROC); - } atomic_set(&wqe->acct[IO_WQ_ACCT_UNBOUND].nr_running, 0); + wqe->wait.func = io_wqe_hash_wake; + INIT_LIST_HEAD(&wqe->wait.entry); wqe->wq = wq; raw_spin_lock_init(&wqe->lock); INIT_WQ_LIST(&wqe->work_list); @@ -1095,23 +1042,16 @@ struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data) INIT_LIST_HEAD(&wqe->all_list); } + wq->task_pid = current->pid; init_completion(&wq->done); + refcount_set(&wq->refs, 1); - wq->manager = kthread_create(io_wq_manager, wq, "io_wq_manager"); - if (!IS_ERR(wq->manager)) { - wake_up_process(wq->manager); - wait_for_completion(&wq->done); - if (test_bit(IO_WQ_BIT_ERROR, &wq->state)) { - ret = -ENOMEM; - goto err; - } - refcount_set(&wq->use_refs, 1); - reinit_completion(&wq->done); + ret = io_wq_fork_manager(wq); + if (!ret) return wq; - } - ret = PTR_ERR(wq->manager); - complete(&wq->done); + io_wq_put(wq); + io_wq_put_hash(data->hash); err: cpuhp_state_remove_instance_nocalls(io_wq_online, &wq->cpuhp_node); for_each_node(node) @@ -1123,15 +1063,7 @@ err_wq: return ERR_PTR(ret); } -bool io_wq_get(struct io_wq *wq, struct io_wq_data *data) -{ - if (data->free_work != wq->free_work || data->do_work != wq->do_work) - return false; - - return refcount_inc_not_zero(&wq->use_refs); -} - -static void __io_wq_destroy(struct io_wq *wq) +static void io_wq_destroy(struct io_wq *wq) { int node; @@ -1139,30 +1071,31 @@ static void __io_wq_destroy(struct io_wq *wq) set_bit(IO_WQ_BIT_EXIT, &wq->state); if (wq->manager) - kthread_stop(wq->manager); + wake_up_process(wq->manager); rcu_read_lock(); for_each_node(node) io_wq_for_each_worker(wq->wqes[node], io_wq_worker_wake, NULL); rcu_read_unlock(); - wait_for_completion(&wq->done); + spin_lock_irq(&wq->hash->wait.lock); + for_each_node(node) { + struct io_wqe *wqe = wq->wqes[node]; - for_each_node(node) - kfree(wq->wqes[node]); + list_del_init(&wqe->wait.entry); + kfree(wqe); + } + spin_unlock_irq(&wq->hash->wait.lock); + io_wq_put_hash(wq->hash); kfree(wq->wqes); kfree(wq); + } -void io_wq_destroy(struct io_wq *wq) +void io_wq_put(struct io_wq *wq) { - if (refcount_dec_and_test(&wq->use_refs)) - __io_wq_destroy(wq); -} - -struct task_struct *io_wq_get_task(struct io_wq *wq) -{ - return wq->manager; + if (refcount_dec_and_test(&wq->refs)) + io_wq_destroy(wq); } static bool io_wq_worker_affinity(struct io_worker *worker, void *data) diff --git a/fs/io-wq.h b/fs/io-wq.h index 096f1021018e..b6ca12b60c35 100644 --- a/fs/io-wq.h +++ b/fs/io-wq.h @@ -1,6 +1,7 @@ #ifndef INTERNAL_IO_WQ_H #define INTERNAL_IO_WQ_H +#include #include struct io_wq; @@ -11,13 +12,6 @@ enum { IO_WQ_WORK_UNBOUND = 4, IO_WQ_WORK_CONCURRENT = 16, - IO_WQ_WORK_FILES = 32, - IO_WQ_WORK_FS = 64, - IO_WQ_WORK_MM = 128, - IO_WQ_WORK_CREDS = 256, - IO_WQ_WORK_BLKCG = 512, - IO_WQ_WORK_FSIZE = 1024, - IO_WQ_HASH_SHIFT = 24, /* upper 8 bits are used for hash key */ }; @@ -85,7 +79,7 @@ static inline void wq_list_del(struct io_wq_work_list *list, struct io_wq_work { struct io_wq_work_node list; - struct io_identity *identity; + const struct cred *creds; unsigned flags; }; @@ -100,20 +94,32 @@ static inline struct io_wq_work *wq_next_work(struct io_wq_work *work) typedef struct io_wq_work *(free_work_fn)(struct io_wq_work *); typedef void (io_wq_work_fn)(struct io_wq_work *); -struct io_wq_data { - struct user_struct *user; +struct io_wq_hash { + refcount_t refs; + unsigned long map; + struct wait_queue_head wait; +}; +static inline void io_wq_put_hash(struct io_wq_hash *hash) +{ + if (refcount_dec_and_test(&hash->refs)) + kfree(hash); +} + +struct io_wq_data { + struct io_wq_hash *hash; io_wq_work_fn *do_work; free_work_fn *free_work; }; struct io_wq *io_wq_create(unsigned bounded, struct io_wq_data *data); -bool io_wq_get(struct io_wq *wq, struct io_wq_data *data); -void io_wq_destroy(struct io_wq *wq); +void io_wq_put(struct io_wq *wq); void io_wq_enqueue(struct io_wq *wq, struct io_wq_work *work); void io_wq_hash_work(struct io_wq_work *work, void *val); +pid_t io_wq_fork_thread(int (*fn)(void *), void *arg); + static inline bool io_wq_is_hashed(struct io_wq_work *work) { return work->flags & IO_WQ_WORK_HASHED; @@ -124,8 +130,6 @@ typedef bool (work_cancel_fn)(struct io_wq_work *, void *); enum io_wq_cancel io_wq_cancel_cb(struct io_wq *wq, work_cancel_fn *cancel, void *data, bool cancel_all); -struct task_struct *io_wq_get_task(struct io_wq *wq); - #if defined(CONFIG_IO_WQ) extern void io_wq_worker_sleeping(struct task_struct *); extern void io_wq_worker_running(struct task_struct *); @@ -140,6 +144,7 @@ static inline void io_wq_worker_running(struct task_struct *tsk) static inline bool io_wq_current_is_worker(void) { - return in_task() && (current->flags & PF_IO_WORKER); + return in_task() && (current->flags & PF_IO_WORKER) && + current->pf_io_worker; } #endif diff --git a/fs/io_uring.c b/fs/io_uring.c index 14ce789927e4..4a088581b0f2 100644 --- a/fs/io_uring.c +++ b/fs/io_uring.c @@ -57,7 +57,6 @@ #include #include #include -#include #include #include #include @@ -104,6 +103,10 @@ #define IORING_MAX_RESTRICTIONS (IORING_RESTRICTION_LAST + \ IORING_REGISTER_LAST + IORING_OP_LAST) +#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ + IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ + IOSQE_BUFFER_SELECT) + struct io_uring { u32 head ____cacheline_aligned_in_smp; u32 tail ____cacheline_aligned_in_smp; @@ -232,6 +235,7 @@ struct fixed_rsrc_data { struct fixed_rsrc_ref_node *node; struct percpu_ref refs; struct completion done; + bool quiesce; }; struct io_buffer { @@ -249,6 +253,11 @@ struct io_restriction { bool registered; }; +enum { + IO_SQ_THREAD_SHOULD_STOP = 0, + IO_SQ_THREAD_SHOULD_PARK, +}; + struct io_sq_data { refcount_t refs; struct mutex lock; @@ -262,6 +271,13 @@ struct io_sq_data { struct wait_queue_head wait; unsigned sq_thread_idle; + int sq_cpu; + pid_t task_pid; + + unsigned long state; + struct completion startup; + struct completion completion; + struct completion exited; }; #define IO_IOPOLL_BATCH 8 @@ -279,8 +295,14 @@ struct io_comp_state { struct list_head locked_free_list; }; +struct io_submit_link { + struct io_kiocb *head; + struct io_kiocb *last; +}; + struct io_submit_state { struct blk_plug plug; + struct io_submit_link link; /* * io_kiocb alloc cache @@ -312,12 +334,12 @@ struct io_ring_ctx { struct { unsigned int flags; unsigned int compat: 1; - unsigned int limit_mem: 1; unsigned int cq_overflow_flushed: 1; unsigned int drain_next: 1; unsigned int eventfd_async: 1; unsigned int restricted: 1; unsigned int sqo_dead: 1; + unsigned int sqo_exec: 1; /* * Ring buffer of indices into array of io_uring_sqe, which is @@ -339,6 +361,9 @@ struct io_ring_ctx { unsigned cached_cq_overflow; unsigned long sq_check_overflow; + /* hashed buffered write serialization */ + struct io_wq_hash *hash_map; + struct list_head defer_list; struct list_head timeout_list; struct list_head cq_overflow_list; @@ -355,22 +380,14 @@ struct io_ring_ctx { struct io_rings *rings; - /* IO offload */ - struct io_wq *io_wq; - /* - * For SQPOLL usage - we hold a reference to the parent task, so we - * have access to the ->files + * For SQPOLL usage */ struct task_struct *sqo_task; /* Only used for accounting purposes */ struct mm_struct *mm_account; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *sqo_blkcg_css; -#endif - struct io_sq_data *sq_data; /* if using sq thread polling */ struct wait_queue_head sqo_sq_wait; @@ -390,13 +407,6 @@ struct io_ring_ctx { struct user_struct *user; - const struct cred *creds; - -#ifdef CONFIG_AUDIT - kuid_t loginuid; - unsigned int sessionid; -#endif - struct completion ref_comp; struct completion sq_thread_comp; @@ -445,6 +455,11 @@ struct io_ring_ctx { struct io_restriction restrictions; + /* exit task_work */ + struct callback_head *exit_task_work; + + struct wait_queue_head hash_wait; + /* Keep this last, we don't need it for the fast path */ struct work_struct exit_work; }; @@ -827,7 +842,6 @@ struct io_op_def { unsigned plug : 1; /* size of async data needed, if any */ unsigned short async_size; - unsigned work_flags; }; static const struct io_op_def io_op_defs[] = { @@ -840,7 +854,6 @@ static const struct io_op_def io_op_defs[] = { .needs_async_data = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITEV] = { .needs_file = 1, @@ -850,12 +863,9 @@ static const struct io_op_def io_op_defs[] = { .needs_async_data = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FSIZE, }, [IORING_OP_FSYNC] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_READ_FIXED] = { .needs_file = 1, @@ -863,7 +873,6 @@ static const struct io_op_def io_op_defs[] = { .pollin = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_WRITE_FIXED] = { .needs_file = 1, @@ -872,8 +881,6 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE | - IO_WQ_WORK_MM, }, [IORING_OP_POLL_ADD] = { .needs_file = 1, @@ -882,7 +889,6 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_POLL_REMOVE] = {}, [IORING_OP_SYNC_FILE_RANGE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_SENDMSG] = { .needs_file = 1, @@ -890,8 +896,6 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, }, [IORING_OP_RECVMSG] = { .needs_file = 1, @@ -900,29 +904,23 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_msghdr), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS, }, [IORING_OP_TIMEOUT] = { .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_TIMEOUT_REMOVE] = { /* used by timeout updates' prep() */ - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_ACCEPT] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES, }, [IORING_OP_ASYNC_CANCEL] = {}, [IORING_OP_LINK_TIMEOUT] = { .needs_async_data = 1, .async_size = sizeof(struct io_timeout_data), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_CONNECT] = { .needs_file = 1, @@ -930,26 +928,14 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .needs_async_data = 1, .async_size = sizeof(struct io_async_connect), - .work_flags = IO_WQ_WORK_MM, }, [IORING_OP_FALLOCATE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE, - }, - [IORING_OP_OPENAT] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FS | IO_WQ_WORK_MM, - }, - [IORING_OP_CLOSE] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_BLKCG, - }, - [IORING_OP_FILES_UPDATE] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM, - }, - [IORING_OP_STATX] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_MM | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, }, + [IORING_OP_OPENAT] = {}, + [IORING_OP_CLOSE] = {}, + [IORING_OP_FILES_UPDATE] = {}, + [IORING_OP_STATX] = {}, [IORING_OP_READ] = { .needs_file = 1, .unbound_nonreg_file = 1, @@ -957,7 +943,6 @@ static const struct io_op_def io_op_defs[] = { .buffer_select = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_WRITE] = { .needs_file = 1, @@ -965,42 +950,31 @@ static const struct io_op_def io_op_defs[] = { .pollout = 1, .plug = 1, .async_size = sizeof(struct io_async_rw), - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | - IO_WQ_WORK_FSIZE, }, [IORING_OP_FADVISE] = { .needs_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, - }, - [IORING_OP_MADVISE] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, + [IORING_OP_MADVISE] = {}, [IORING_OP_SEND] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollout = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_RECV] = { .needs_file = 1, .unbound_nonreg_file = 1, .pollin = 1, .buffer_select = 1, - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG, }, [IORING_OP_OPENAT2] = { - .work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS | - IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM, }, [IORING_OP_EPOLL_CTL] = { .unbound_nonreg_file = 1, - .work_flags = IO_WQ_WORK_FILES, }, [IORING_OP_SPLICE] = { .needs_file = 1, .hash_reg_file = 1, .unbound_nonreg_file = 1, - .work_flags = IO_WQ_WORK_BLKCG, }, [IORING_OP_PROVIDE_BUFFERS] = {}, [IORING_OP_REMOVE_BUFFERS] = {}, @@ -1012,24 +986,18 @@ static const struct io_op_def io_op_defs[] = { [IORING_OP_SHUTDOWN] = { .needs_file = 1, }, - [IORING_OP_RENAMEAT] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, - }, - [IORING_OP_UNLINKAT] = { - .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FILES | - IO_WQ_WORK_FS | IO_WQ_WORK_BLKCG, - }, + [IORING_OP_RENAMEAT] = {}, + [IORING_OP_UNLINKAT] = {}, }; static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct task_struct *task, struct files_struct *files); +static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx); static void destroy_fixed_rsrc_ref_node(struct fixed_rsrc_ref_node *ref_node); static struct fixed_rsrc_ref_node *alloc_fixed_rsrc_ref_node( struct io_ring_ctx *ctx); -static void init_fixed_file_ref_node(struct io_ring_ctx *ctx, - struct fixed_rsrc_ref_node *ref_node); +static void io_ring_file_put(struct io_ring_ctx *ctx, struct io_rsrc_put *prsrc); static bool io_rw_reissue(struct io_kiocb *req); static void io_cqring_fill_event(struct io_kiocb *req, long res); @@ -1116,161 +1084,18 @@ static bool io_match_task(struct io_kiocb *head, continue; if (req->file && req->file->f_op == &io_uring_fops) return true; - if ((req->work.flags & IO_WQ_WORK_FILES) && - req->work.identity->files == files) + if (req->task->files == files) return true; } return false; } -static void io_sq_thread_drop_mm_files(void) -{ - struct files_struct *files = current->files; - struct mm_struct *mm = current->mm; - - if (mm) { - kthread_unuse_mm(mm); - mmput(mm); - current->mm = NULL; - } - if (files) { - struct nsproxy *nsproxy = current->nsproxy; - - task_lock(current); - current->files = NULL; - current->nsproxy = NULL; - task_unlock(current); - put_files_struct(files); - put_nsproxy(nsproxy); - } -} - -static int __io_sq_thread_acquire_files(struct io_ring_ctx *ctx) -{ - if (!current->files) { - struct files_struct *files; - struct nsproxy *nsproxy; - - task_lock(ctx->sqo_task); - files = ctx->sqo_task->files; - if (!files) { - task_unlock(ctx->sqo_task); - return -EOWNERDEAD; - } - atomic_inc(&files->count); - get_nsproxy(ctx->sqo_task->nsproxy); - nsproxy = ctx->sqo_task->nsproxy; - task_unlock(ctx->sqo_task); - - task_lock(current); - current->files = files; - current->nsproxy = nsproxy; - task_unlock(current); - } - return 0; -} - -static int __io_sq_thread_acquire_mm(struct io_ring_ctx *ctx) -{ - struct mm_struct *mm; - - if (current->mm) - return 0; - - task_lock(ctx->sqo_task); - mm = ctx->sqo_task->mm; - if (unlikely(!mm || !mmget_not_zero(mm))) - mm = NULL; - task_unlock(ctx->sqo_task); - - if (mm) { - kthread_use_mm(mm); - return 0; - } - - return -EFAULT; -} - -static int __io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - int ret; - - if (def->work_flags & IO_WQ_WORK_MM) { - ret = __io_sq_thread_acquire_mm(ctx); - if (unlikely(ret)) - return ret; - } - - if (def->needs_file || (def->work_flags & IO_WQ_WORK_FILES)) { - ret = __io_sq_thread_acquire_files(ctx); - if (unlikely(ret)) - return ret; - } - - return 0; -} - -static inline int io_sq_thread_acquire_mm_files(struct io_ring_ctx *ctx, - struct io_kiocb *req) -{ - if (!(ctx->flags & IORING_SETUP_SQPOLL)) - return 0; - return __io_sq_thread_acquire_mm_files(ctx, req); -} - -static void io_sq_thread_associate_blkcg(struct io_ring_ctx *ctx, - struct cgroup_subsys_state **cur_css) - -{ -#ifdef CONFIG_BLK_CGROUP - /* puts the old one when swapping */ - if (*cur_css != ctx->sqo_blkcg_css) { - kthread_associate_blkcg(ctx->sqo_blkcg_css); - *cur_css = ctx->sqo_blkcg_css; - } -#endif -} - -static void io_sq_thread_unassociate_blkcg(void) -{ -#ifdef CONFIG_BLK_CGROUP - kthread_associate_blkcg(NULL); -#endif -} - static inline void req_set_fail_links(struct io_kiocb *req) { if ((req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) == REQ_F_LINK) req->flags |= REQ_F_FAIL_LINK; } -/* - * None of these are dereferenced, they are simply used to check if any of - * them have changed. If we're under current and check they are still the - * same, we're fine to grab references to them for actual out-of-line use. - */ -static void io_init_identity(struct io_identity *id) -{ - id->files = current->files; - id->mm = current->mm; -#ifdef CONFIG_BLK_CGROUP - rcu_read_lock(); - id->blkcg_css = blkcg_css(); - rcu_read_unlock(); -#endif - id->creds = current_cred(); - id->nsproxy = current->nsproxy; - id->fs = current->fs; - id->fsize = rlimit(RLIMIT_FSIZE); -#ifdef CONFIG_AUDIT - id->loginuid = current->loginuid; - id->sessionid = current->sessionid; -#endif - refcount_set(&id->count, 1); -} - static inline void __io_req_init_async(struct io_kiocb *req) { memset(&req->work, 0, sizeof(req->work)); @@ -1283,17 +1108,10 @@ static inline void __io_req_init_async(struct io_kiocb *req) */ static inline void io_req_init_async(struct io_kiocb *req) { - struct io_uring_task *tctx = current->io_uring; - if (req->flags & REQ_F_WORK_INITIALIZED) return; __io_req_init_async(req); - - /* Grab a ref if this isn't our static identity */ - req->work.identity = tctx->identity; - if (tctx->identity != &tctx->__identity) - refcount_inc(&req->work.identity->count); } static void io_ring_ctx_ref_free(struct percpu_ref *ref) @@ -1378,40 +1196,14 @@ static bool req_need_defer(struct io_kiocb *req, u32 seq) return false; } -static void io_put_identity(struct io_uring_task *tctx, struct io_kiocb *req) -{ - if (req->work.identity == &tctx->__identity) - return; - if (refcount_dec_and_test(&req->work.identity->count)) - kfree(req->work.identity); -} - static void io_req_clean_work(struct io_kiocb *req) { if (!(req->flags & REQ_F_WORK_INITIALIZED)) return; - if (req->work.flags & IO_WQ_WORK_MM) - mmdrop(req->work.identity->mm); -#ifdef CONFIG_BLK_CGROUP - if (req->work.flags & IO_WQ_WORK_BLKCG) - css_put(req->work.identity->blkcg_css); -#endif - if (req->work.flags & IO_WQ_WORK_CREDS) - put_cred(req->work.identity->creds); - if (req->work.flags & IO_WQ_WORK_FS) { - struct fs_struct *fs = req->work.identity->fs; - - spin_lock(&req->work.identity->fs->lock); - if (--fs->users) - fs = NULL; - spin_unlock(&req->work.identity->fs->lock); - if (fs) - free_fs_struct(fs); - } - if (req->work.flags & IO_WQ_WORK_FILES) { - put_files_struct(req->work.identity->files); - put_nsproxy(req->work.identity->nsproxy); + if (req->work.creds) { + put_cred(req->work.creds); + req->work.creds = NULL; } if (req->flags & REQ_F_INFLIGHT) { struct io_ring_ctx *ctx = req->ctx; @@ -1427,54 +1219,6 @@ static void io_req_clean_work(struct io_kiocb *req) } req->flags &= ~REQ_F_WORK_INITIALIZED; - req->work.flags &= ~(IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG | IO_WQ_WORK_FS | - IO_WQ_WORK_CREDS | IO_WQ_WORK_FILES); - io_put_identity(req->task->io_uring, req); -} - -/* - * Create a private copy of io_identity, since some fields don't match - * the current context. - */ -static bool io_identity_cow(struct io_kiocb *req) -{ - struct io_uring_task *tctx = current->io_uring; - const struct cred *creds = NULL; - struct io_identity *id; - - if (req->work.flags & IO_WQ_WORK_CREDS) - creds = req->work.identity->creds; - - id = kmemdup(req->work.identity, sizeof(*id), GFP_KERNEL); - if (unlikely(!id)) { - req->work.flags |= IO_WQ_WORK_CANCEL; - return false; - } - - /* - * We can safely just re-init the creds we copied Either the field - * matches the current one, or we haven't grabbed it yet. The only - * exception is ->creds, through registered personalities, so handle - * that one separately. - */ - io_init_identity(id); - if (creds) - id->creds = creds; - - /* add one for this request */ - refcount_inc(&id->count); - - /* drop tctx and req identity references, if needed */ - if (tctx->identity != &tctx->__identity && - refcount_dec_and_test(&tctx->identity->count)) - kfree(tctx->identity); - if (req->work.identity != &tctx->__identity && - refcount_dec_and_test(&req->work.identity->count)) - kfree(req->work.identity); - - req->work.identity = id; - tctx->identity = id; - return true; } static void io_req_track_inflight(struct io_kiocb *req) @@ -1491,79 +1235,6 @@ static void io_req_track_inflight(struct io_kiocb *req) } } -static bool io_grab_identity(struct io_kiocb *req) -{ - const struct io_op_def *def = &io_op_defs[req->opcode]; - struct io_identity *id = req->work.identity; - - if (def->work_flags & IO_WQ_WORK_FSIZE) { - if (id->fsize != rlimit(RLIMIT_FSIZE)) - return false; - req->work.flags |= IO_WQ_WORK_FSIZE; - } -#ifdef CONFIG_BLK_CGROUP - if (!(req->work.flags & IO_WQ_WORK_BLKCG) && - (def->work_flags & IO_WQ_WORK_BLKCG)) { - rcu_read_lock(); - if (id->blkcg_css != blkcg_css()) { - rcu_read_unlock(); - return false; - } - /* - * This should be rare, either the cgroup is dying or the task - * is moving cgroups. Just punt to root for the handful of ios. - */ - if (css_tryget_online(id->blkcg_css)) - req->work.flags |= IO_WQ_WORK_BLKCG; - rcu_read_unlock(); - } -#endif - if (!(req->work.flags & IO_WQ_WORK_CREDS)) { - if (id->creds != current_cred()) - return false; - get_cred(id->creds); - req->work.flags |= IO_WQ_WORK_CREDS; - } -#ifdef CONFIG_AUDIT - if (!uid_eq(current->loginuid, id->loginuid) || - current->sessionid != id->sessionid) - return false; -#endif - if (!(req->work.flags & IO_WQ_WORK_FS) && - (def->work_flags & IO_WQ_WORK_FS)) { - if (current->fs != id->fs) - return false; - spin_lock(&id->fs->lock); - if (!id->fs->in_exec) { - id->fs->users++; - req->work.flags |= IO_WQ_WORK_FS; - } else { - req->work.flags |= IO_WQ_WORK_CANCEL; - } - spin_unlock(¤t->fs->lock); - } - if (!(req->work.flags & IO_WQ_WORK_FILES) && - (def->work_flags & IO_WQ_WORK_FILES) && - !(req->flags & REQ_F_NO_FILE_TABLE)) { - if (id->files != current->files || - id->nsproxy != current->nsproxy) - return false; - atomic_inc(&id->files->count); - get_nsproxy(id->nsproxy); - req->work.flags |= IO_WQ_WORK_FILES; - io_req_track_inflight(req); - } - if (!(req->work.flags & IO_WQ_WORK_MM) && - (def->work_flags & IO_WQ_WORK_MM)) { - if (id->mm != current->mm) - return false; - mmgrab(id->mm); - req->work.flags |= IO_WQ_WORK_MM; - } - - return true; -} - static void io_prep_async_work(struct io_kiocb *req) { const struct io_op_def *def = &io_op_defs[req->opcode]; @@ -1581,17 +1252,8 @@ static void io_prep_async_work(struct io_kiocb *req) if (def->unbound_nonreg_file) req->work.flags |= IO_WQ_WORK_UNBOUND; } - - /* if we fail grabbing identity, we must COW, regrab, and retry */ - if (io_grab_identity(req)) - return; - - if (!io_identity_cow(req)) - return; - - /* can't fail at this point */ - if (!io_grab_identity(req)) - WARN_ON(1); + if (!req->work.creds) + req->work.creds = get_current_cred(); } static void io_prep_async_link(struct io_kiocb *req) @@ -1606,10 +1268,14 @@ static struct io_kiocb *__io_queue_async_work(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_kiocb *link = io_prep_linked_timeout(req); + struct io_uring_task *tctx = req->task->io_uring; + + BUG_ON(!tctx); + BUG_ON(!tctx->io_wq); trace_io_uring_queue_async_work(ctx, io_wq_is_hashed(&req->work), req, &req->work, req->flags); - io_wq_enqueue(ctx->io_wq, &req->work); + io_wq_enqueue(tctx->io_wq, &req->work); return link; } @@ -2303,11 +1969,14 @@ static int io_req_task_work_add(struct io_kiocb *req) static void io_req_task_work_add_fallback(struct io_kiocb *req, task_work_func_t cb) { - struct task_struct *tsk = io_wq_get_task(req->ctx->io_wq); + struct io_ring_ctx *ctx = req->ctx; + struct callback_head *head; init_task_work(&req->task_work, cb); - task_work_add(tsk, &req->task_work, TWA_NONE); - wake_up_process(tsk); + do { + head = READ_ONCE(ctx->exit_task_work); + req->task_work.next = head; + } while (cmpxchg(&ctx->exit_task_work, head, &req->task_work) != head); } static void __io_req_task_cancel(struct io_kiocb *req, int error) @@ -2329,7 +1998,9 @@ static void io_req_task_cancel(struct callback_head *cb) struct io_kiocb *req = container_of(cb, struct io_kiocb, task_work); struct io_ring_ctx *ctx = req->ctx; - __io_req_task_cancel(req, -ECANCELED); + mutex_lock(&ctx->uring_lock); + __io_req_task_cancel(req, req->result); + mutex_unlock(&ctx->uring_lock); percpu_ref_put(&ctx->refs); } @@ -2339,15 +2010,11 @@ static void __io_req_task_submit(struct io_kiocb *req) /* ctx stays valid until unlock, even if we drop all ours ctx->refs */ mutex_lock(&ctx->uring_lock); - if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && - !io_sq_thread_acquire_mm_files(ctx, req)) + if (!ctx->sqo_dead && !(current->flags & PF_EXITING) && !current->in_execve) __io_queue_sqe(req); else __io_req_task_cancel(req, -EFAULT); mutex_unlock(&ctx->uring_lock); - - if (ctx->flags & IORING_SETUP_SQPOLL) - io_sq_thread_drop_mm_files(); } static void io_req_task_submit(struct callback_head *cb) @@ -2364,11 +2031,22 @@ static void io_req_task_queue(struct io_kiocb *req) req->task_work.func = io_req_task_submit; ret = io_req_task_work_add(req); if (unlikely(ret)) { + req->result = -ECANCELED; percpu_ref_get(&req->ctx->refs); io_req_task_work_add_fallback(req, io_req_task_cancel); } } +static void io_req_task_queue_fail(struct io_kiocb *req, int ret) +{ + percpu_ref_get(&req->ctx->refs); + req->result = ret; + req->task_work.func = io_req_task_cancel; + + if (unlikely(io_req_task_work_add(req))) + io_req_task_work_add_fallback(req, io_req_task_cancel); +} + static inline void io_queue_next(struct io_kiocb *req) { struct io_kiocb *nxt = io_req_find_next(req); @@ -2800,18 +2478,22 @@ static bool io_rw_reissue(struct io_kiocb *req) { #ifdef CONFIG_BLOCK umode_t mode = file_inode(req->file)->i_mode; - int ret; if (!S_ISBLK(mode) && !S_ISREG(mode)) return false; if ((req->flags & REQ_F_NOWAIT) || io_wq_current_is_worker()) return false; + /* + * If ref is dying, we might be running poll reap from the exit work. + * Don't attempt to reissue from that path, just let it fail with + * -EAGAIN. + */ + if (percpu_ref_is_dying(&req->ctx->refs)) + return false; lockdep_assert_held(&req->ctx->uring_lock); - ret = io_sq_thread_acquire_mm_files(req->ctx, req); - - if (!ret && io_resubmit_prep(req)) { + if (io_resubmit_prep(req)) { refcount_inc(&req->refs); io_queue_async_work(req); return true; @@ -3467,19 +3149,9 @@ static inline int io_rw_prep_async(struct io_kiocb *req, int rw) static int io_read_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret; - - ret = io_prep_rw(req, sqe); - if (ret) - return ret; - if (unlikely(!(req->file->f_mode & FMODE_READ))) return -EBADF; - - /* either don't need iovec imported or already have it */ - if (!req->async_data) - return 0; - return io_rw_prep_async(req, READ); + return io_prep_rw(req, sqe); } /* @@ -3607,10 +3279,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) ret = io_iter_do_read(req, iter); if (ret == -EIOCBQUEUED) { - /* it's faster to check here then delegate to kfree */ - if (iovec) - kfree(iovec); - return 0; + goto out_free; } else if (ret == -EAGAIN) { /* IOPOLL retry should happen for io-wq threads */ if (!force_nonblock && !(req->ctx->flags & IORING_SETUP_IOPOLL)) @@ -3631,6 +3300,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) if (ret2) return ret2; + iovec = NULL; rw = req->async_data; /* now use our persistent iterator, if we aren't already */ iter = &rw->iter; @@ -3657,24 +3327,18 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) } while (ret > 0 && ret < io_size); done: kiocb_done(kiocb, ret, issue_flags); +out_free: + /* it's faster to check here then delegate to kfree */ + if (iovec) + kfree(iovec); return 0; } static int io_write_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - ssize_t ret; - - ret = io_prep_rw(req, sqe); - if (ret) - return ret; - if (unlikely(!(req->file->f_mode & FMODE_WRITE))) return -EBADF; - - /* either don't need iovec imported or already have it */ - if (!req->async_data) - return 0; - return io_rw_prep_async(req, WRITE); + return io_prep_rw(req, sqe); } static int io_write(struct io_kiocb *req, unsigned int issue_flags) @@ -4011,7 +3675,7 @@ static int io_nop(struct io_kiocb *req, unsigned int issue_flags) return 0; } -static int io_prep_fsync(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_fsync_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; @@ -4598,13 +4262,10 @@ err: return 0; } -static int io_prep_sfr(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_sfr_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_ring_ctx *ctx = req->ctx; - if (!req->file) - return -EBADF; - if (unlikely(ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; if (unlikely(sqe->addr || sqe->ioprio || sqe->buf_index)) @@ -4664,11 +4325,21 @@ static int io_sendmsg_copy_hdr(struct io_kiocb *req, req->sr_msg.msg_flags, &iomsg->free_iov); } +static int io_sendmsg_prep_async(struct io_kiocb *req) +{ + int ret; + + if (!io_op_defs[req->opcode].needs_async_data) + return 0; + ret = io_sendmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { - struct io_async_msghdr *async_msg = req->async_data; struct io_sr_msg *sr = &req->sr_msg; - int ret; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4681,13 +4352,7 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - - if (!async_msg || !io_op_defs[req->opcode].needs_async_data) - return 0; - ret = io_sendmsg_copy_hdr(req, async_msg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return 0; } static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) @@ -4881,13 +4546,22 @@ static inline unsigned int io_put_recv_kbuf(struct io_kiocb *req) return io_put_kbuf(req, req->sr_msg.kbuf); } -static int io_recvmsg_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_recvmsg_prep_async(struct io_kiocb *req) { - struct io_async_msghdr *async_msg = req->async_data; - struct io_sr_msg *sr = &req->sr_msg; int ret; + if (!io_op_defs[req->opcode].needs_async_data) + return 0; + ret = io_recvmsg_copy_hdr(req, req->async_data); + if (!ret) + req->flags |= REQ_F_NEED_CLEANUP; + return ret; +} + +static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) +{ + struct io_sr_msg *sr = &req->sr_msg; + if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -4900,13 +4574,7 @@ static int io_recvmsg_prep(struct io_kiocb *req, if (req->ctx->compat) sr->msg_flags |= MSG_CMSG_COMPAT; #endif - - if (!async_msg || !io_op_defs[req->opcode].needs_async_data) - return 0; - ret = io_recvmsg_copy_hdr(req, async_msg); - if (!ret) - req->flags |= REQ_F_NEED_CLEANUP; - return ret; + return 0; } static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) @@ -5059,10 +4727,17 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) return 0; } +static int io_connect_prep_async(struct io_kiocb *req) +{ + struct io_async_connect *io = req->async_data; + struct io_connect *conn = &req->connect; + + return move_addr_to_kernel(conn->addr, conn->addr_len, &io->address); +} + static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) { struct io_connect *conn = &req->connect; - struct io_async_connect *io = req->async_data; if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL)) return -EINVAL; @@ -5071,12 +4746,7 @@ static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) conn->addr = u64_to_user_ptr(READ_ONCE(sqe->addr)); conn->addr_len = READ_ONCE(sqe->addr2); - - if (!io) - return 0; - - return move_addr_to_kernel(conn->addr, conn->addr_len, - &io->address); + return 0; } static int io_connect(struct io_kiocb *req, unsigned int issue_flags) @@ -5121,56 +4791,32 @@ out: return 0; } #else /* !CONFIG_NET */ -static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; +#define IO_NETOP_FN(op) \ +static int io_##op(struct io_kiocb *req, unsigned int issue_flags) \ +{ \ + return -EOPNOTSUPP; \ } -static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; +#define IO_NETOP_PREP(op) \ +IO_NETOP_FN(op) \ +static int io_##op##_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) \ +{ \ + return -EOPNOTSUPP; \ +} \ + +#define IO_NETOP_PREP_ASYNC(op) \ +IO_NETOP_PREP(op) \ +static int io_##op##_prep_async(struct io_kiocb *req) \ +{ \ + return -EOPNOTSUPP; \ } -static int io_send(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_recvmsg_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_recv(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_accept_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_accept(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} - -static int io_connect_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) -{ - return -EOPNOTSUPP; -} - -static int io_connect(struct io_kiocb *req, unsigned int issue_flags) -{ - return -EOPNOTSUPP; -} +IO_NETOP_PREP_ASYNC(sendmsg); +IO_NETOP_PREP_ASYNC(recvmsg); +IO_NETOP_PREP_ASYNC(connect); +IO_NETOP_PREP(accept); +IO_NETOP_FN(send); +IO_NETOP_FN(recv); #endif /* CONFIG_NET */ struct io_poll_table { @@ -5952,12 +5598,15 @@ static bool io_cancel_cb(struct io_wq_work *work, void *data) return req->user_data == (unsigned long) data; } -static int io_async_cancel_one(struct io_ring_ctx *ctx, void *sqe_addr) +static int io_async_cancel_one(struct io_uring_task *tctx, void *sqe_addr) { enum io_wq_cancel cancel_ret; int ret = 0; - cancel_ret = io_wq_cancel_cb(ctx->io_wq, io_cancel_cb, sqe_addr, false); + if (!tctx->io_wq) + return -ENOENT; + + cancel_ret = io_wq_cancel_cb(tctx->io_wq, io_cancel_cb, sqe_addr, false); switch (cancel_ret) { case IO_WQ_CANCEL_OK: ret = 0; @@ -5980,7 +5629,8 @@ static void io_async_find_and_cancel(struct io_ring_ctx *ctx, unsigned long flags; int ret; - ret = io_async_cancel_one(ctx, (void *) (unsigned long) sqe_addr); + ret = io_async_cancel_one(req->task->io_uring, + (void *) (unsigned long) sqe_addr); if (ret != -ENOENT) { spin_lock_irqsave(&ctx->completion_lock, flags); goto done; @@ -6084,9 +5734,9 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) case IORING_OP_POLL_REMOVE: return io_poll_remove_prep(req, sqe); case IORING_OP_FSYNC: - return io_prep_fsync(req, sqe); + return io_fsync_prep(req, sqe); case IORING_OP_SYNC_FILE_RANGE: - return io_prep_sfr(req, sqe); + return io_sfr_prep(req, sqe); case IORING_OP_SENDMSG: case IORING_OP_SEND: return io_sendmsg_prep(req, sqe); @@ -6144,14 +5794,39 @@ static int io_req_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) return-EINVAL; } -static int io_req_defer_prep(struct io_kiocb *req, - const struct io_uring_sqe *sqe) +static int io_req_prep_async(struct io_kiocb *req) { - if (!sqe) + switch (req->opcode) { + case IORING_OP_READV: + case IORING_OP_READ_FIXED: + case IORING_OP_READ: + return io_rw_prep_async(req, READ); + case IORING_OP_WRITEV: + case IORING_OP_WRITE_FIXED: + case IORING_OP_WRITE: + return io_rw_prep_async(req, WRITE); + case IORING_OP_SENDMSG: + case IORING_OP_SEND: + return io_sendmsg_prep_async(req); + case IORING_OP_RECVMSG: + case IORING_OP_RECV: + return io_recvmsg_prep_async(req); + case IORING_OP_CONNECT: + return io_connect_prep_async(req); + } + return 0; +} + +static int io_req_defer_prep(struct io_kiocb *req) +{ + if (!io_op_defs[req->opcode].needs_async_data) return 0; - if (io_alloc_async_data(req)) + /* some opcodes init it during the inital prep */ + if (req->async_data) + return 0; + if (__io_alloc_async_data(req)) return -EAGAIN; - return io_req_prep(req, sqe); + return io_req_prep_async(req); } static u32 io_get_sequence(struct io_kiocb *req) @@ -6167,7 +5842,7 @@ static u32 io_get_sequence(struct io_kiocb *req) return total_submitted - nr_reqs; } -static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static int io_req_defer(struct io_kiocb *req) { struct io_ring_ctx *ctx = req->ctx; struct io_defer_entry *de; @@ -6184,11 +5859,9 @@ static int io_req_defer(struct io_kiocb *req, const struct io_uring_sqe *sqe) if (!req_need_defer(req, seq) && list_empty_careful(&ctx->defer_list)) return 0; - if (!req->async_data) { - ret = io_req_defer_prep(req, sqe); - if (ret) - return ret; - } + ret = io_req_defer_prep(req); + if (ret) + return ret; io_prep_async_link(req); de = kmalloc(sizeof(*de), GFP_KERNEL); if (!de) @@ -6427,29 +6100,11 @@ static void io_wq_submit_work(struct io_wq_work *work) } while (1); } + /* avoid locking problems by failing it from a clean context */ if (ret) { - struct io_ring_ctx *lock_ctx = NULL; - - if (req->ctx->flags & IORING_SETUP_IOPOLL) - lock_ctx = req->ctx; - - /* - * io_iopoll_complete() does not hold completion_lock to - * complete polled io, so here for polled io, we can not call - * io_req_complete() directly, otherwise there maybe concurrent - * access to cqring, defer_list, etc, which is not safe. Given - * that io_iopoll_complete() is always called under uring_lock, - * so here for polled io, we also get uring_lock to complete - * it. - */ - if (lock_ctx) - mutex_lock(&lock_ctx->uring_lock); - - req_set_fail_links(req); - io_req_complete(req, ret); - - if (lock_ctx) - mutex_unlock(&lock_ctx->uring_lock); + /* io-wq is going to take one down */ + refcount_inc(&req->refs); + io_req_task_queue_fail(req, ret); } } @@ -6564,10 +6219,9 @@ static void __io_queue_sqe(struct io_kiocb *req) const struct cred *old_creds = NULL; int ret; - if ((req->flags & REQ_F_WORK_INITIALIZED) && - (req->work.flags & IO_WQ_WORK_CREDS) && - req->work.identity->creds != current_cred()) - old_creds = override_creds(req->work.identity->creds); + if ((req->flags & REQ_F_WORK_INITIALIZED) && req->work.creds && + req->work.creds != current_cred()) + old_creds = override_creds(req->work.creds); ret = io_issue_sqe(req, IO_URING_F_NONBLOCK|IO_URING_F_COMPLETE_DEFER); @@ -6607,11 +6261,11 @@ static void __io_queue_sqe(struct io_kiocb *req) io_queue_linked_timeout(linked_timeout); } -static void io_queue_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe) +static void io_queue_sqe(struct io_kiocb *req) { int ret; - ret = io_req_defer(req, sqe); + ret = io_req_defer(req); if (ret) { if (ret != -EIOCBQUEUED) { fail_req: @@ -6620,42 +6274,139 @@ fail_req: io_req_complete(req, ret); } } else if (req->flags & REQ_F_FORCE_ASYNC) { - if (!req->async_data) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - } + ret = io_req_defer_prep(req); + if (unlikely(ret)) + goto fail_req; io_queue_async_work(req); } else { - if (sqe) { - ret = io_req_prep(req, sqe); - if (unlikely(ret)) - goto fail_req; - } __io_queue_sqe(req); } } -static inline void io_queue_link_head(struct io_kiocb *req) +/* + * Check SQE restrictions (opcode and flags). + * + * Returns 'true' if SQE is allowed, 'false' otherwise. + */ +static inline bool io_check_restriction(struct io_ring_ctx *ctx, + struct io_kiocb *req, + unsigned int sqe_flags) { - if (unlikely(req->flags & REQ_F_FAIL_LINK)) { - io_put_req(req); - io_req_complete(req, -ECANCELED); - } else - io_queue_sqe(req, NULL); + if (!ctx->restricted) + return true; + + if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) + return false; + + if ((sqe_flags & ctx->restrictions.sqe_flags_required) != + ctx->restrictions.sqe_flags_required) + return false; + + if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | + ctx->restrictions.sqe_flags_required)) + return false; + + return true; } -struct io_submit_link { - struct io_kiocb *head; - struct io_kiocb *last; -}; - -static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, - struct io_submit_link *link) +static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) { - struct io_ring_ctx *ctx = req->ctx; + struct io_submit_state *state; + unsigned int sqe_flags; + int id, ret = 0; + + req->opcode = READ_ONCE(sqe->opcode); + /* same numerical values with corresponding REQ_F_*, safe to copy */ + req->flags = sqe_flags = READ_ONCE(sqe->flags); + req->user_data = READ_ONCE(sqe->user_data); + req->async_data = NULL; + req->file = NULL; + req->ctx = ctx; + req->link = NULL; + req->fixed_rsrc_refs = NULL; + /* one is dropped after submission, the other at completion */ + refcount_set(&req->refs, 2); + req->task = current; + req->result = 0; + + /* enforce forwards compatibility on users */ + if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) { + req->flags = 0; + return -EINVAL; + } + + if (unlikely(req->opcode >= IORING_OP_LAST)) + return -EINVAL; + + if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) + return -EACCES; + + if ((sqe_flags & IOSQE_BUFFER_SELECT) && + !io_op_defs[req->opcode].buffer_select) + return -EOPNOTSUPP; + + id = READ_ONCE(sqe->personality); + if (id) { + __io_req_init_async(req); + req->work.creds = idr_find(&ctx->personality_idr, id); + if (unlikely(!req->work.creds)) + return -EINVAL; + get_cred(req->work.creds); + } + + state = &ctx->submit_state; + + /* + * Plug now if we have more than 1 IO left after this, and the target + * is potentially a read/write to block based storage. + */ + if (!state->plug_started && state->ios_left > 1 && + io_op_defs[req->opcode].plug) { + blk_start_plug(&state->plug); + state->plug_started = true; + } + + if (io_op_defs[req->opcode].needs_file) { + bool fixed = req->flags & REQ_F_FIXED_FILE; + + req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); + if (unlikely(!req->file)) + ret = -EBADF; + } + + state->ios_left--; + return ret; +} + +static int io_submit_sqe(struct io_ring_ctx *ctx, struct io_kiocb *req, + const struct io_uring_sqe *sqe) +{ + struct io_submit_link *link = &ctx->submit_state.link; int ret; + ret = io_init_req(ctx, req, sqe); + if (unlikely(ret)) { +fail_req: + io_put_req(req); + io_req_complete(req, ret); + if (link->head) { + /* fail even hard links since we don't submit */ + link->head->flags |= REQ_F_FAIL_LINK; + io_put_req(link->head); + io_req_complete(link->head, -ECANCELED); + link->head = NULL; + } + return ret; + } + ret = io_req_prep(req, sqe); + if (unlikely(ret)) + goto fail_req; + + /* don't need @sqe from now on */ + trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, + true, ctx->flags & IORING_SETUP_SQPOLL); + /* * If we already have a head request, queue this one for async * submittal once the head completes. If we don't have a head but @@ -6677,19 +6428,16 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, head->flags |= REQ_F_IO_DRAIN; ctx->drain_next = 1; } - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) { - /* fail even hard links since we don't submit */ - head->flags |= REQ_F_FAIL_LINK; - return ret; - } + ret = io_req_defer_prep(req); + if (unlikely(ret)) + goto fail_req; trace_io_uring_link(ctx, req, head); link->last->link = req; link->last = req; /* last request of a link, enqueue the link */ if (!(req->flags & (REQ_F_LINK | REQ_F_HARDLINK))) { - io_queue_link_head(head); + io_queue_sqe(head); link->head = NULL; } } else { @@ -6698,13 +6446,10 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, ctx->drain_next = 0; } if (req->flags & (REQ_F_LINK | REQ_F_HARDLINK)) { - ret = io_req_defer_prep(req, sqe); - if (unlikely(ret)) - req->flags |= REQ_F_FAIL_LINK; link->head = req; link->last = req; } else { - io_queue_sqe(req, sqe); + io_queue_sqe(req); } } @@ -6717,6 +6462,8 @@ static int io_submit_sqe(struct io_kiocb *req, const struct io_uring_sqe *sqe, static void io_submit_state_end(struct io_submit_state *state, struct io_ring_ctx *ctx) { + if (state->link.head) + io_queue_sqe(state->link.head); if (state->comp.nr) io_submit_flush_completions(&state->comp, ctx); if (state->plug_started) @@ -6732,6 +6479,8 @@ static void io_submit_state_start(struct io_submit_state *state, { state->plug_started = false; state->ios_left = max_ios; + /* set only head, no need to init link_last in advance */ + state->link.head = NULL; } static void io_commit_sqring(struct io_ring_ctx *ctx) @@ -6777,117 +6526,9 @@ static const struct io_uring_sqe *io_get_sqe(struct io_ring_ctx *ctx) return NULL; } -/* - * Check SQE restrictions (opcode and flags). - * - * Returns 'true' if SQE is allowed, 'false' otherwise. - */ -static inline bool io_check_restriction(struct io_ring_ctx *ctx, - struct io_kiocb *req, - unsigned int sqe_flags) -{ - if (!ctx->restricted) - return true; - - if (!test_bit(req->opcode, ctx->restrictions.sqe_op)) - return false; - - if ((sqe_flags & ctx->restrictions.sqe_flags_required) != - ctx->restrictions.sqe_flags_required) - return false; - - if (sqe_flags & ~(ctx->restrictions.sqe_flags_allowed | - ctx->restrictions.sqe_flags_required)) - return false; - - return true; -} - -#define SQE_VALID_FLAGS (IOSQE_FIXED_FILE|IOSQE_IO_DRAIN|IOSQE_IO_LINK| \ - IOSQE_IO_HARDLINK | IOSQE_ASYNC | \ - IOSQE_BUFFER_SELECT) - -static int io_init_req(struct io_ring_ctx *ctx, struct io_kiocb *req, - const struct io_uring_sqe *sqe) -{ - struct io_submit_state *state; - unsigned int sqe_flags; - int id, ret = 0; - - req->opcode = READ_ONCE(sqe->opcode); - /* same numerical values with corresponding REQ_F_*, safe to copy */ - req->flags = sqe_flags = READ_ONCE(sqe->flags); - req->user_data = READ_ONCE(sqe->user_data); - req->async_data = NULL; - req->file = NULL; - req->ctx = ctx; - req->link = NULL; - req->fixed_rsrc_refs = NULL; - /* one is dropped after submission, the other at completion */ - refcount_set(&req->refs, 2); - req->task = current; - req->result = 0; - - /* enforce forwards compatibility on users */ - if (unlikely(sqe_flags & ~SQE_VALID_FLAGS)) - return -EINVAL; - - if (unlikely(req->opcode >= IORING_OP_LAST)) - return -EINVAL; - - if (unlikely(io_sq_thread_acquire_mm_files(ctx, req))) - return -EFAULT; - - if (unlikely(!io_check_restriction(ctx, req, sqe_flags))) - return -EACCES; - - if ((sqe_flags & IOSQE_BUFFER_SELECT) && - !io_op_defs[req->opcode].buffer_select) - return -EOPNOTSUPP; - - id = READ_ONCE(sqe->personality); - if (id) { - struct io_identity *iod; - - iod = idr_find(&ctx->personality_idr, id); - if (unlikely(!iod)) - return -EINVAL; - refcount_inc(&iod->count); - - __io_req_init_async(req); - get_cred(iod->creds); - req->work.identity = iod; - req->work.flags |= IO_WQ_WORK_CREDS; - } - - state = &ctx->submit_state; - - /* - * Plug now if we have more than 1 IO left after this, and the target - * is potentially a read/write to block based storage. - */ - if (!state->plug_started && state->ios_left > 1 && - io_op_defs[req->opcode].plug) { - blk_start_plug(&state->plug); - state->plug_started = true; - } - - if (io_op_defs[req->opcode].needs_file) { - bool fixed = req->flags & REQ_F_FIXED_FILE; - - req->file = io_file_get(state, req, READ_ONCE(sqe->fd), fixed); - if (unlikely(!req->file)) - ret = -EBADF; - } - - state->ios_left--; - return ret; -} - static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) { - struct io_submit_link link; - int i, submitted = 0; + int submitted = 0; /* if we have a backlog and couldn't flush it all, return BUSY */ if (test_bit(0, &ctx->sq_check_overflow)) { @@ -6903,14 +6544,11 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) percpu_counter_add(¤t->io_uring->inflight, nr); refcount_add(nr, ¤t->usage); - io_submit_state_start(&ctx->submit_state, nr); - link.head = NULL; - for (i = 0; i < nr; i++) { + while (submitted < nr) { const struct io_uring_sqe *sqe; struct io_kiocb *req; - int err; req = io_alloc_req(ctx); if (unlikely(!req)) { @@ -6925,20 +6563,8 @@ static int io_submit_sqes(struct io_ring_ctx *ctx, unsigned int nr) } /* will complete beyond this point, count as submitted */ submitted++; - - err = io_init_req(ctx, req, sqe); - if (unlikely(err)) { -fail_req: - io_put_req(req); - io_req_complete(req, err); + if (io_submit_sqe(ctx, req, sqe)) break; - } - - trace_io_uring_submit_sqe(ctx, req->opcode, req->user_data, - true, ctx->flags & IORING_SETUP_SQPOLL); - err = io_submit_sqe(req, sqe, &link); - if (err) - goto fail_req; } if (unlikely(submitted != nr)) { @@ -6950,10 +6576,8 @@ fail_req: percpu_counter_sub(&tctx->inflight, unused); put_task_struct_many(current, unused); } - if (link.head) - io_queue_link_head(link.head); - io_submit_state_end(&ctx->submit_state, ctx); + io_submit_state_end(&ctx->submit_state, ctx); /* Commit SQ ring head once we've consumed and submitted all SQEs */ io_commit_sqring(ctx); @@ -7030,71 +6654,97 @@ static void io_sqd_init_new(struct io_sq_data *sqd) io_sqd_update_thread_idle(sqd); } +static bool io_sq_thread_should_stop(struct io_sq_data *sqd) +{ + return test_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); +} + +static bool io_sq_thread_should_park(struct io_sq_data *sqd) +{ + return test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); +} + +static void io_sq_thread_parkme(struct io_sq_data *sqd) +{ + for (;;) { + /* + * TASK_PARKED is a special state; we must serialize against + * possible pending wakeups to avoid store-store collisions on + * task->state. + * + * Such a collision might possibly result in the task state + * changin from TASK_PARKED and us failing the + * wait_task_inactive() in kthread_park(). + */ + set_special_state(TASK_PARKED); + if (!test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)) + break; + + /* + * Thread is going to call schedule(), do not preempt it, + * or the caller of kthread_park() may spend more time in + * wait_task_inactive(). + */ + preempt_disable(); + complete(&sqd->completion); + schedule_preempt_disabled(); + preempt_enable(); + } + __set_current_state(TASK_RUNNING); +} + static int io_sq_thread(void *data) { - struct cgroup_subsys_state *cur_css = NULL; - struct files_struct *old_files = current->files; - struct nsproxy *old_nsproxy = current->nsproxy; - const struct cred *old_cred = NULL; struct io_sq_data *sqd = data; struct io_ring_ctx *ctx; unsigned long timeout = 0; + char buf[TASK_COMM_LEN]; DEFINE_WAIT(wait); - task_lock(current); - current->files = NULL; - current->nsproxy = NULL; - task_unlock(current); + sprintf(buf, "iou-sqp-%d", sqd->task_pid); + set_task_comm(current, buf); + sqd->thread = current; + current->pf_io_worker = NULL; - while (!kthread_should_stop()) { + if (sqd->sq_cpu != -1) + set_cpus_allowed_ptr(current, cpumask_of(sqd->sq_cpu)); + else + set_cpus_allowed_ptr(current, cpu_online_mask); + current->flags |= PF_NO_SETAFFINITY; + + complete(&sqd->completion); + + wait_for_completion(&sqd->startup); + + while (!io_sq_thread_should_stop(sqd)) { int ret; bool cap_entries, sqt_spin, needs_sched; /* * Any changes to the sqd lists are synchronized through the - * kthread parking. This synchronizes the thread vs users, + * thread parking. This synchronizes the thread vs users, * the users are synchronized on the sqd->ctx_lock. */ - if (kthread_should_park()) { - kthread_parkme(); - /* - * When sq thread is unparked, in case the previous park operation - * comes from io_put_sq_data(), which means that sq thread is going - * to be stopped, so here needs to have a check. - */ - if (kthread_should_stop()) - break; + if (io_sq_thread_should_park(sqd)) { + io_sq_thread_parkme(sqd); + continue; } - if (unlikely(!list_empty(&sqd->ctx_new_list))) { io_sqd_init_new(sqd); timeout = jiffies + sqd->sq_thread_idle; } - + if (fatal_signal_pending(current)) + break; sqt_spin = false; cap_entries = !list_is_singular(&sqd->ctx_list); list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { - if (current->cred != ctx->creds) { - if (old_cred) - revert_creds(old_cred); - old_cred = override_creds(ctx->creds); - } - io_sq_thread_associate_blkcg(ctx, &cur_css); -#ifdef CONFIG_AUDIT - current->loginuid = ctx->loginuid; - current->sessionid = ctx->sessionid; -#endif - ret = __io_sq_thread(ctx, cap_entries); if (!sqt_spin && (ret > 0 || !list_empty(&ctx->iopoll_list))) sqt_spin = true; - - io_sq_thread_drop_mm_files(); } if (sqt_spin || !time_after(jiffies, timeout)) { io_run_task_work(); - io_sq_thread_drop_mm_files(); cond_resched(); if (sqt_spin) timeout = jiffies + sqd->sq_thread_idle; @@ -7115,7 +6765,7 @@ static int io_sq_thread(void *data) } } - if (needs_sched && !kthread_should_park()) { + if (needs_sched && !io_sq_thread_should_park(sqd)) { list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) io_ring_set_wakeup_flag(ctx); @@ -7128,22 +6778,25 @@ static int io_sq_thread(void *data) timeout = jiffies + sqd->sq_thread_idle; } + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) + io_uring_cancel_sqpoll(ctx); + io_run_task_work(); - io_sq_thread_drop_mm_files(); - if (cur_css) - io_sq_thread_unassociate_blkcg(); - if (old_cred) - revert_creds(old_cred); + /* + * Clear thread under lock so that concurrent parks work correctly + */ + complete_all(&sqd->completion); + mutex_lock(&sqd->lock); + sqd->thread = NULL; + list_for_each_entry(ctx, &sqd->ctx_list, sqd_list) { + ctx->sqo_exec = 1; + io_ring_set_wakeup_flag(ctx); + } + mutex_unlock(&sqd->lock); - task_lock(current); - current->files = old_files; - current->nsproxy = old_nsproxy; - task_unlock(current); - - kthread_parkme(); - - return 0; + complete(&sqd->exited); + do_exit(0); } struct io_wait_queue { @@ -7328,38 +6981,59 @@ static void io_sqe_rsrc_set_node(struct io_ring_ctx *ctx, percpu_ref_get(&rsrc_data->refs); } -static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, - struct io_ring_ctx *ctx, - struct fixed_rsrc_ref_node *backup_node) +static void io_sqe_rsrc_kill_node(struct io_ring_ctx *ctx, struct fixed_rsrc_data *data) { - struct fixed_rsrc_ref_node *ref_node; - int ret; + struct fixed_rsrc_ref_node *ref_node = NULL; io_rsrc_ref_lock(ctx); ref_node = data->node; + data->node = NULL; io_rsrc_ref_unlock(ctx); if (ref_node) percpu_ref_kill(&ref_node->refs); +} - percpu_ref_kill(&data->refs); +static int io_rsrc_ref_quiesce(struct fixed_rsrc_data *data, + struct io_ring_ctx *ctx, + void (*rsrc_put)(struct io_ring_ctx *ctx, + struct io_rsrc_put *prsrc)) +{ + struct fixed_rsrc_ref_node *backup_node; + int ret; - /* wait for all refs nodes to complete */ - flush_delayed_work(&ctx->rsrc_put_work); + if (data->quiesce) + return -ENXIO; + + data->quiesce = true; do { + ret = -ENOMEM; + backup_node = alloc_fixed_rsrc_ref_node(ctx); + if (!backup_node) + break; + backup_node->rsrc_data = data; + backup_node->rsrc_put = rsrc_put; + + io_sqe_rsrc_kill_node(ctx, data); + percpu_ref_kill(&data->refs); + flush_delayed_work(&ctx->rsrc_put_work); + ret = wait_for_completion_interruptible(&data->done); if (!ret) break; - ret = io_run_task_work_sig(); - if (ret < 0) { - percpu_ref_resurrect(&data->refs); - reinit_completion(&data->done); - io_sqe_rsrc_set_node(ctx, data, backup_node); - return ret; - } - } while (1); - destroy_fixed_rsrc_ref_node(backup_node); - return 0; + percpu_ref_resurrect(&data->refs); + io_sqe_rsrc_set_node(ctx, data, backup_node); + backup_node = NULL; + reinit_completion(&data->done); + mutex_unlock(&ctx->uring_lock); + ret = io_run_task_work_sig(); + mutex_lock(&ctx->uring_lock); + } while (ret >= 0); + data->quiesce = false; + + if (backup_node) + destroy_fixed_rsrc_ref_node(backup_node); + return ret; } static struct fixed_rsrc_data *alloc_fixed_rsrc_data(struct io_ring_ctx *ctx) @@ -7390,18 +7064,17 @@ static void free_fixed_rsrc_data(struct fixed_rsrc_data *data) static int io_sqe_files_unregister(struct io_ring_ctx *ctx) { struct fixed_rsrc_data *data = ctx->file_data; - struct fixed_rsrc_ref_node *backup_node; unsigned nr_tables, i; int ret; - if (!data) + /* + * percpu_ref_is_dying() is to stop parallel files unregister + * Since we possibly drop uring lock later in this function to + * run task work. + */ + if (!data || percpu_ref_is_dying(&data->refs)) return -ENXIO; - backup_node = alloc_fixed_rsrc_ref_node(ctx); - if (!backup_node) - return -ENOMEM; - init_fixed_file_ref_node(ctx, backup_node); - - ret = io_rsrc_ref_quiesce(data, ctx, backup_node); + ret = io_rsrc_ref_quiesce(data, ctx, io_ring_file_put); if (ret) return ret; @@ -7415,20 +7088,74 @@ static int io_sqe_files_unregister(struct io_ring_ctx *ctx) return 0; } +static void io_sq_thread_unpark(struct io_sq_data *sqd) + __releases(&sqd->lock) +{ + if (!sqd->thread) + return; + if (sqd->thread == current) + return; + clear_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + wake_up_state(sqd->thread, TASK_PARKED); + mutex_unlock(&sqd->lock); +} + +static bool io_sq_thread_park(struct io_sq_data *sqd) + __acquires(&sqd->lock) +{ + if (sqd->thread == current) + return true; + mutex_lock(&sqd->lock); + if (!sqd->thread) { + mutex_unlock(&sqd->lock); + return false; + } + set_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state); + wake_up_process(sqd->thread); + wait_for_completion(&sqd->completion); + return true; +} + +static void io_sq_thread_stop(struct io_sq_data *sqd) +{ + if (!sqd->thread) + return; + + set_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + WARN_ON_ONCE(test_bit(IO_SQ_THREAD_SHOULD_PARK, &sqd->state)); + wake_up_process(sqd->thread); + wait_for_completion(&sqd->exited); +} + static void io_put_sq_data(struct io_sq_data *sqd) { if (refcount_dec_and_test(&sqd->refs)) { - /* - * The park is a bit of a work-around, without it we get - * warning spews on shutdown with SQPOLL set and affinity - * set to a single CPU. - */ + io_sq_thread_stop(sqd); + kfree(sqd); + } +} + +static void io_sq_thread_finish(struct io_ring_ctx *ctx) +{ + struct io_sq_data *sqd = ctx->sq_data; + + if (sqd) { + complete(&sqd->startup); if (sqd->thread) { - kthread_park(sqd->thread); - kthread_stop(sqd->thread); + wait_for_completion(&ctx->sq_thread_comp); + io_sq_thread_park(sqd); } - kfree(sqd); + mutex_lock(&sqd->ctx_lock); + list_del(&ctx->sqd_list); + io_sqd_update_thread_idle(sqd); + mutex_unlock(&sqd->ctx_lock); + + if (sqd->thread) + io_sq_thread_unpark(sqd); + + io_put_sq_data(sqd); + ctx->sq_data = NULL; } } @@ -7475,68 +7202,12 @@ static struct io_sq_data *io_get_sq_data(struct io_uring_params *p) mutex_init(&sqd->ctx_lock); mutex_init(&sqd->lock); init_waitqueue_head(&sqd->wait); + init_completion(&sqd->startup); + init_completion(&sqd->completion); + init_completion(&sqd->exited); return sqd; } -static void io_sq_thread_unpark(struct io_sq_data *sqd) - __releases(&sqd->lock) -{ - if (!sqd->thread) - return; - kthread_unpark(sqd->thread); - mutex_unlock(&sqd->lock); -} - -static void io_sq_thread_park(struct io_sq_data *sqd) - __acquires(&sqd->lock) -{ - if (!sqd->thread) - return; - mutex_lock(&sqd->lock); - kthread_park(sqd->thread); -} - -static void io_sq_thread_stop(struct io_ring_ctx *ctx) -{ - struct io_sq_data *sqd = ctx->sq_data; - - if (sqd) { - if (sqd->thread) { - /* - * We may arrive here from the error branch in - * io_sq_offload_create() where the kthread is created - * without being waked up, thus wake it up now to make - * sure the wait will complete. - */ - wake_up_process(sqd->thread); - wait_for_completion(&ctx->sq_thread_comp); - - io_sq_thread_park(sqd); - } - - mutex_lock(&sqd->ctx_lock); - list_del(&ctx->sqd_list); - io_sqd_update_thread_idle(sqd); - mutex_unlock(&sqd->ctx_lock); - - if (sqd->thread) - io_sq_thread_unpark(sqd); - - io_put_sq_data(sqd); - ctx->sq_data = NULL; - } -} - -static void io_finish_async(struct io_ring_ctx *ctx) -{ - io_sq_thread_stop(ctx); - - if (ctx->io_wq) { - io_wq_destroy(ctx->io_wq); - ctx->io_wq = NULL; - } -} - #if defined(CONFIG_UNIX) /* * Ensure the UNIX gc is aware of our file set, so we are certain that @@ -7563,7 +7234,7 @@ static int __io_sqe_files_scm(struct io_ring_ctx *ctx, int nr, int offset) skb->sk = sk; nr_files = 0; - fpl->user = get_uid(ctx->user); + fpl->user = get_uid(current_user()); for (i = 0; i < nr; i++) { struct file *file = io_file_from_index(ctx, i + offset); @@ -8095,54 +7766,34 @@ static struct io_wq_work *io_free_work(struct io_wq_work *work) return req ? &req->work : NULL; } -static int io_init_wq_offload(struct io_ring_ctx *ctx, - struct io_uring_params *p) +static struct io_wq *io_init_wq_offload(struct io_ring_ctx *ctx) { + struct io_wq_hash *hash; struct io_wq_data data; - struct fd f; - struct io_ring_ctx *ctx_attach; unsigned int concurrency; - int ret = 0; - data.user = ctx->user; + hash = ctx->hash_map; + if (!hash) { + hash = kzalloc(sizeof(*hash), GFP_KERNEL); + if (!hash) + return ERR_PTR(-ENOMEM); + refcount_set(&hash->refs, 1); + init_waitqueue_head(&hash->wait); + ctx->hash_map = hash; + } + + data.hash = hash; data.free_work = io_free_work; data.do_work = io_wq_submit_work; - if (!(p->flags & IORING_SETUP_ATTACH_WQ)) { - /* Do QD, or 4 * CPUS, whatever is smallest */ - concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); + /* Do QD, or 4 * CPUS, whatever is smallest */ + concurrency = min(ctx->sq_entries, 4 * num_online_cpus()); - ctx->io_wq = io_wq_create(concurrency, &data); - if (IS_ERR(ctx->io_wq)) { - ret = PTR_ERR(ctx->io_wq); - ctx->io_wq = NULL; - } - return ret; - } - - f = fdget(p->wq_fd); - if (!f.file) - return -EBADF; - - if (f.file->f_op != &io_uring_fops) { - ret = -EINVAL; - goto out_fput; - } - - ctx_attach = f.file->private_data; - /* @io_wq is protected by holding the fd */ - if (!io_wq_get(ctx_attach->io_wq, &data)) { - ret = -EINVAL; - goto out_fput; - } - - ctx->io_wq = ctx_attach->io_wq; -out_fput: - fdput(f); - return ret; + return io_wq_create(concurrency, &data); } -static int io_uring_alloc_task_context(struct task_struct *task) +static int io_uring_alloc_task_context(struct task_struct *task, + struct io_ring_ctx *ctx) { struct io_uring_task *tctx; int ret; @@ -8157,13 +7808,19 @@ static int io_uring_alloc_task_context(struct task_struct *task) return ret; } + tctx->io_wq = io_init_wq_offload(ctx); + if (IS_ERR(tctx->io_wq)) { + ret = PTR_ERR(tctx->io_wq); + percpu_counter_destroy(&tctx->inflight); + kfree(tctx); + return ret; + } + xa_init(&tctx->xa); init_waitqueue_head(&tctx->wait); tctx->last = NULL; atomic_set(&tctx->in_idle, 0); tctx->sqpoll = false; - io_init_identity(&tctx->__identity); - tctx->identity = &tctx->__identity; task->io_uring = tctx; spin_lock_init(&tctx->task_lock); INIT_WQ_LIST(&tctx->task_list); @@ -8177,19 +7834,49 @@ void __io_uring_free(struct task_struct *tsk) struct io_uring_task *tctx = tsk->io_uring; WARN_ON_ONCE(!xa_empty(&tctx->xa)); - WARN_ON_ONCE(refcount_read(&tctx->identity->count) != 1); - if (tctx->identity != &tctx->__identity) - kfree(tctx->identity); percpu_counter_destroy(&tctx->inflight); kfree(tctx); tsk->io_uring = NULL; } +static int io_sq_thread_fork(struct io_sq_data *sqd, struct io_ring_ctx *ctx) +{ + int ret; + + clear_bit(IO_SQ_THREAD_SHOULD_STOP, &sqd->state); + reinit_completion(&sqd->completion); + ctx->sqo_dead = ctx->sqo_exec = 0; + sqd->task_pid = current->pid; + current->flags |= PF_IO_WORKER; + ret = io_wq_fork_thread(io_sq_thread, sqd); + current->flags &= ~PF_IO_WORKER; + if (ret < 0) { + sqd->thread = NULL; + return ret; + } + wait_for_completion(&sqd->completion); + return io_uring_alloc_task_context(sqd->thread, ctx); +} + static int io_sq_offload_create(struct io_ring_ctx *ctx, struct io_uring_params *p) { int ret; + /* Retain compatibility with failing for an invalid attach attempt */ + if ((ctx->flags & (IORING_SETUP_ATTACH_WQ | IORING_SETUP_SQPOLL)) == + IORING_SETUP_ATTACH_WQ) { + struct fd f; + + f = fdget(p->wq_fd); + if (!f.file) + return -ENXIO; + if (f.file->f_op != &io_uring_fops) { + fdput(f); + return -EINVAL; + } + fdput(f); + } if (ctx->flags & IORING_SETUP_SQPOLL) { struct io_sq_data *sqd; @@ -8215,7 +7902,7 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, ctx->sq_thread_idle = HZ; if (sqd->thread) - goto done; + return 0; if (p->flags & IORING_SETUP_SQ_AFF) { int cpu = p->sq_thread_cpu; @@ -8226,18 +7913,21 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, if (!cpu_online(cpu)) goto err; - sqd->thread = kthread_create_on_cpu(io_sq_thread, sqd, - cpu, "io_uring-sq"); + sqd->sq_cpu = cpu; } else { - sqd->thread = kthread_create(io_sq_thread, sqd, - "io_uring-sq"); + sqd->sq_cpu = -1; } - if (IS_ERR(sqd->thread)) { - ret = PTR_ERR(sqd->thread); + + sqd->task_pid = current->pid; + current->flags |= PF_IO_WORKER; + ret = io_wq_fork_thread(io_sq_thread, sqd); + current->flags &= ~PF_IO_WORKER; + if (ret < 0) { sqd->thread = NULL; goto err; } - ret = io_uring_alloc_task_context(sqd->thread); + wait_for_completion(&sqd->completion); + ret = io_uring_alloc_task_context(sqd->thread, ctx); if (ret) goto err; } else if (p->flags & IORING_SETUP_SQ_AFF) { @@ -8246,14 +7936,9 @@ static int io_sq_offload_create(struct io_ring_ctx *ctx, goto err; } -done: - ret = io_init_wq_offload(ctx, p); - if (ret) - goto err; - return 0; err: - io_finish_async(ctx); + io_sq_thread_finish(ctx); return ret; } @@ -8261,8 +7946,8 @@ static void io_sq_offload_start(struct io_ring_ctx *ctx) { struct io_sq_data *sqd = ctx->sq_data; - if ((ctx->flags & IORING_SETUP_SQPOLL) && sqd->thread) - wake_up_process(sqd->thread); + if (ctx->flags & IORING_SETUP_SQPOLL) + complete(&sqd->startup); } static inline void __io_unaccount_mem(struct user_struct *user, @@ -8292,7 +7977,7 @@ static inline int __io_account_mem(struct user_struct *user, static void io_unaccount_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { - if (ctx->limit_mem) + if (ctx->user) __io_unaccount_mem(ctx->user, nr_pages); if (ctx->mm_account) @@ -8303,7 +7988,7 @@ static int io_account_mem(struct io_ring_ctx *ctx, unsigned long nr_pages) { int ret; - if (ctx->limit_mem) { + if (ctx->user) { ret = __io_account_mem(ctx->user, nr_pages); if (ret) return ret; @@ -8702,19 +8387,23 @@ static void io_req_cache_free(struct list_head *list, struct task_struct *tsk) static void io_req_caches_free(struct io_ring_ctx *ctx, struct task_struct *tsk) { struct io_submit_state *submit_state = &ctx->submit_state; + struct io_comp_state *cs = &ctx->submit_state.comp; mutex_lock(&ctx->uring_lock); - if (submit_state->free_reqs) + if (submit_state->free_reqs) { kmem_cache_free_bulk(req_cachep, submit_state->free_reqs, submit_state->reqs); - - io_req_cache_free(&submit_state->comp.free_list, NULL); + submit_state->free_reqs = 0; + } spin_lock_irq(&ctx->completion_lock); - io_req_cache_free(&submit_state->comp.locked_free_list, NULL); + list_splice_init(&cs->locked_free_list, &cs->free_list); + cs->locked_free_nr = 0; spin_unlock_irq(&ctx->completion_lock); + io_req_cache_free(&cs->free_list, NULL); + mutex_unlock(&ctx->uring_lock); } @@ -8728,22 +8417,17 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) mutex_lock(&ctx->uring_lock); mutex_unlock(&ctx->uring_lock); - io_finish_async(ctx); + io_sq_thread_finish(ctx); io_sqe_buffers_unregister(ctx); - if (ctx->sqo_task) { - put_task_struct(ctx->sqo_task); - ctx->sqo_task = NULL; + if (ctx->mm_account) { mmdrop(ctx->mm_account); ctx->mm_account = NULL; } -#ifdef CONFIG_BLK_CGROUP - if (ctx->sqo_blkcg_css) - css_put(ctx->sqo_blkcg_css); -#endif - + mutex_lock(&ctx->uring_lock); io_sqe_files_unregister(ctx); + mutex_unlock(&ctx->uring_lock); io_eventfd_unregister(ctx); io_destroy_buffers(ctx); idr_destroy(&ctx->personality_idr); @@ -8760,8 +8444,9 @@ static void io_ring_ctx_free(struct io_ring_ctx *ctx) percpu_ref_exit(&ctx->refs); free_uid(ctx->user); - put_cred(ctx->creds); io_req_caches_free(ctx, NULL); + if (ctx->hash_map) + io_wq_put_hash(ctx->hash_map); kfree(ctx->cancel_hash); kfree(ctx); } @@ -8808,13 +8493,11 @@ static int io_uring_fasync(int fd, struct file *file, int on) static int io_unregister_personality(struct io_ring_ctx *ctx, unsigned id) { - struct io_identity *iod; + const struct cred *creds; - iod = idr_remove(&ctx->personality_idr, id); - if (iod) { - put_cred(iod->creds); - if (refcount_dec_and_test(&iod->count)) - kfree(iod); + creds = idr_remove(&ctx->personality_idr, id); + if (creds) { + put_cred(creds); return 0; } @@ -8829,6 +8512,28 @@ static int io_remove_personalities(int id, void *p, void *data) return 0; } +static void io_run_ctx_fallback(struct io_ring_ctx *ctx) +{ + struct callback_head *work, *head, *next; + + do { + do { + head = NULL; + work = READ_ONCE(ctx->exit_task_work); + } while (cmpxchg(&ctx->exit_task_work, work, head) != work); + + if (!work) + break; + + do { + next = work->next; + work->func(work); + work = next; + cond_resched(); + } while (work); + } while (1); +} + static void io_ring_exit_work(struct work_struct *work) { struct io_ring_ctx *ctx = container_of(work, struct io_ring_ctx, @@ -8842,17 +8547,11 @@ static void io_ring_exit_work(struct work_struct *work) */ do { io_uring_try_cancel_requests(ctx, NULL, NULL); + io_run_ctx_fallback(ctx); } while (!wait_for_completion_timeout(&ctx->ref_comp, HZ/20)); io_ring_ctx_free(ctx); } -static bool io_cancel_ctx_cb(struct io_wq_work *work, void *data) -{ - struct io_kiocb *req = container_of(work, struct io_kiocb, work); - - return req->ctx == data; -} - static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) { mutex_lock(&ctx->uring_lock); @@ -8871,9 +8570,6 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) io_kill_timeouts(ctx, NULL, NULL); io_poll_remove_all(ctx, NULL, NULL); - if (ctx->io_wq) - io_wq_cancel_cb(ctx->io_wq, io_cancel_ctx_cb, ctx, true); - /* if we failed setting up the ctx, we might not have any rings */ io_iopoll_try_reap_events(ctx); @@ -8952,13 +8648,14 @@ static void io_uring_try_cancel_requests(struct io_ring_ctx *ctx, struct files_struct *files) { struct io_task_cancel cancel = { .task = task, .files = files, }; + struct io_uring_task *tctx = current->io_uring; while (1) { enum io_wq_cancel cret; bool ret = false; - if (ctx->io_wq) { - cret = io_wq_cancel_cb(ctx->io_wq, io_cancel_task_cb, + if (tctx && tctx->io_wq) { + cret = io_wq_cancel_cb(tctx->io_wq, io_cancel_task_cb, &cancel, true); ret |= (cret != IO_WQ_CANCEL_NOTFOUND); } @@ -9041,12 +8738,15 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, struct files_struct *files) { struct task_struct *task = current; + bool did_park = false; if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { io_disable_sqo_submit(ctx); - task = ctx->sq_data->thread; - atomic_inc(&task->io_uring->in_idle); - io_sq_thread_park(ctx->sq_data); + did_park = io_sq_thread_park(ctx->sq_data); + if (did_park) { + task = ctx->sq_data->thread; + atomic_inc(&task->io_uring->in_idle); + } } io_cancel_defer_files(ctx, task, files); @@ -9055,7 +8755,7 @@ static void io_uring_cancel_task_requests(struct io_ring_ctx *ctx, if (!files) io_uring_try_cancel_requests(ctx, task, NULL); - if ((ctx->flags & IORING_SETUP_SQPOLL) && ctx->sq_data) { + if (did_park) { atomic_dec(&task->io_uring->in_idle); io_sq_thread_unpark(ctx->sq_data); } @@ -9070,7 +8770,7 @@ static int io_uring_add_task_file(struct io_ring_ctx *ctx, struct file *file) int ret; if (unlikely(!tctx)) { - ret = io_uring_alloc_task_context(current); + ret = io_uring_alloc_task_context(current, ctx); if (unlikely(ret)) return ret; tctx = current->io_uring; @@ -9140,8 +8840,13 @@ void __io_uring_files_cancel(struct files_struct *files) io_uring_cancel_task_requests(file->private_data, files); atomic_dec(&tctx->in_idle); - if (files) + if (files) { io_uring_remove_task_files(tctx); + if (tctx->io_wq) { + io_wq_put(tctx->io_wq); + tctx->io_wq = NULL; + } + } } static s64 tctx_inflight(struct io_uring_task *tctx) @@ -9151,14 +8856,17 @@ static s64 tctx_inflight(struct io_uring_task *tctx) static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) { + struct io_sq_data *sqd = ctx->sq_data; struct io_uring_task *tctx; s64 inflight; DEFINE_WAIT(wait); - if (!ctx->sq_data) + if (!sqd) + return; + io_disable_sqo_submit(ctx); + if (!io_sq_thread_park(sqd)) return; tctx = ctx->sq_data->thread->io_uring; - io_disable_sqo_submit(ctx); atomic_inc(&tctx->in_idle); do { @@ -9179,6 +8887,7 @@ static void io_uring_cancel_sqpoll(struct io_ring_ctx *ctx) finish_wait(&tctx->wait, &wait); } while (1); atomic_dec(&tctx->in_idle); + io_sq_thread_unpark(sqd); } /* @@ -9232,11 +8941,17 @@ static int io_uring_flush(struct file *file, void *data) struct io_uring_task *tctx = current->io_uring; struct io_ring_ctx *ctx = file->private_data; + /* Ignore helper thread files exit */ + if (current->flags & PF_IO_WORKER) + return 0; + if (fatal_signal_pending(current) || (current->flags & PF_EXITING)) { io_uring_cancel_task_requests(ctx, NULL); io_req_caches_free(ctx, current); } + io_run_ctx_fallback(ctx); + if (!tctx) return 0; @@ -9435,6 +9150,12 @@ SYSCALL_DEFINE6(io_uring_enter, unsigned int, fd, u32, to_submit, if (ctx->flags & IORING_SETUP_SQPOLL) { io_cqring_overflow_flush(ctx, false, NULL, NULL); + if (unlikely(ctx->sqo_exec)) { + ret = io_sq_thread_fork(ctx->sq_data, ctx); + if (ret) + goto out; + ctx->sqo_exec = 0; + } ret = -EOWNERDEAD; if (unlikely(ctx->sqo_dead)) goto out; @@ -9491,8 +9212,7 @@ out_fput: #ifdef CONFIG_PROC_FS static int io_uring_show_cred(int id, void *p, void *data) { - struct io_identity *iod = p; - const struct cred *cred = iod->creds; + const struct cred *cred = p; struct seq_file *m = data; struct user_namespace *uns = seq_user_ns(m); struct group_info *gi; @@ -9537,8 +9257,11 @@ static void __io_uring_show_fdinfo(struct io_ring_ctx *ctx, struct seq_file *m) */ has_lock = mutex_trylock(&ctx->uring_lock); - if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) + if (has_lock && (ctx->flags & IORING_SETUP_SQPOLL)) { sq = ctx->sq_data; + if (!sq->thread) + sq = NULL; + } seq_printf(m, "SqThread:\t%d\n", sq ? task_pid_nr(sq->thread) : -1); seq_printf(m, "SqThreadCpu:\t%d\n", sq ? task_cpu(sq->thread) : -1); @@ -9698,7 +9421,6 @@ static struct file *io_uring_get_file(struct io_ring_ctx *ctx) static int io_uring_create(unsigned entries, struct io_uring_params *p, struct io_uring_params __user *params) { - struct user_struct *user = NULL; struct io_ring_ctx *ctx; struct file *file; int ret; @@ -9740,22 +9462,13 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, p->cq_entries = 2 * p->sq_entries; } - user = get_uid(current_user()); - ctx = io_ring_ctx_alloc(p); - if (!ctx) { - free_uid(user); + if (!ctx) return -ENOMEM; - } ctx->compat = in_compat_syscall(); - ctx->limit_mem = !capable(CAP_IPC_LOCK); - ctx->user = user; - ctx->creds = get_current_cred(); -#ifdef CONFIG_AUDIT - ctx->loginuid = current->loginuid; - ctx->sessionid = current->sessionid; -#endif - ctx->sqo_task = get_task_struct(current); + if (!capable(CAP_IPC_LOCK)) + ctx->user = get_uid(current_user()); + ctx->sqo_task = current; /* * This is just grabbed for accounting purposes. When a process exits, @@ -9766,24 +9479,6 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, mmgrab(current->mm); ctx->mm_account = current->mm; -#ifdef CONFIG_BLK_CGROUP - /* - * The sq thread will belong to the original cgroup it was inited in. - * If the cgroup goes offline (e.g. disabling the io controller), then - * issued bios will be associated with the closest cgroup later in the - * block layer. - */ - rcu_read_lock(); - ctx->sqo_blkcg_css = blkcg_css(); - ret = css_tryget_online(ctx->sqo_blkcg_css); - rcu_read_unlock(); - if (!ret) { - /* don't init against a dying cgroup, have the user try again */ - ctx->sqo_blkcg_css = NULL; - ret = -ENODEV; - goto err; - } -#endif ret = io_allocate_scq_urings(ctx, p); if (ret) goto err; @@ -9817,7 +9512,7 @@ static int io_uring_create(unsigned entries, struct io_uring_params *p, IORING_FEAT_SUBMIT_STABLE | IORING_FEAT_RW_CUR_POS | IORING_FEAT_CUR_PERSONALITY | IORING_FEAT_FAST_POLL | IORING_FEAT_POLL_32BITS | IORING_FEAT_SQPOLL_NONFIXED | - IORING_FEAT_EXT_ARG; + IORING_FEAT_EXT_ARG | IORING_FEAT_NATIVE_WORKERS; if (copy_to_user(params, p, sizeof(*p))) { ret = -EFAULT; @@ -9923,21 +9618,15 @@ out: static int io_register_personality(struct io_ring_ctx *ctx) { - struct io_identity *id; + const struct cred *creds; int ret; - id = kmalloc(sizeof(*id), GFP_KERNEL); - if (unlikely(!id)) - return -ENOMEM; + creds = get_current_cred(); - io_init_identity(id); - id->creds = get_current_cred(); - - ret = idr_alloc_cyclic(&ctx->personality_idr, id, 1, USHRT_MAX, GFP_KERNEL); - if (ret < 0) { - put_cred(id->creds); - kfree(id); - } + ret = idr_alloc_cyclic(&ctx->personality_idr, (void *) creds, 1, + USHRT_MAX, GFP_KERNEL); + if (ret < 0) + put_cred(creds); return ret; } @@ -10196,6 +9885,8 @@ SYSCALL_DEFINE4(io_uring_register, unsigned int, fd, unsigned int, opcode, ctx = f.file->private_data; + io_run_task_work(); + mutex_lock(&ctx->uring_lock); ret = __io_uring_register(ctx, opcode, arg, nr_args); mutex_unlock(&ctx->uring_lock); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0d9d1a6a947e..7ffcd7ef33d4 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1458,13 +1458,6 @@ iomap_do_writepage(struct page *page, struct writeback_control *wbc, void *data) PF_MEMALLOC)) goto redirty; - /* - * Given that we do not allow direct reclaim to call us, we should - * never be called in a recursive filesystem reclaim context. - */ - if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS)) - goto redirty; - /* * Is this page beyond the end of the file? * diff --git a/fs/jfs/super.c b/fs/jfs/super.c index b2dc4d1f9dcc..1f0ffabbde56 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -551,7 +551,6 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) ret = -ENOMEM; goto out_unload; } - inode->i_ino = 0; inode->i_size = i_size_read(sb->s_bdev->bd_inode); inode->i_mapping->a_ops = &jfs_metapage_aops; inode_fake_hash(inode); diff --git a/fs/namespace.c b/fs/namespace.c index 062fe984a697..56bb5a5fdc0d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -380,51 +380,37 @@ int mnt_want_write(struct vfsmount *m) } EXPORT_SYMBOL_GPL(mnt_want_write); -/** - * mnt_clone_write - get write access to a mount - * @mnt: the mount on which to take a write - * - * This is effectively like mnt_want_write, except - * it must only be used to take an extra write reference - * on a mountpoint that we already know has a write reference - * on it. This allows some optimisation. - * - * After finished, mnt_drop_write must be called as usual to - * drop the reference. - */ -int mnt_clone_write(struct vfsmount *mnt) -{ - /* superblock may be r/o */ - if (__mnt_is_readonly(mnt)) - return -EROFS; - preempt_disable(); - mnt_inc_writers(real_mount(mnt)); - preempt_enable(); - return 0; -} -EXPORT_SYMBOL_GPL(mnt_clone_write); - /** * __mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like __mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already + * This is like __mnt_want_write, but if the file is already open for writing it + * skips incrementing mnt_writers (since the open file already has a reference) + * and instead only does the check for emergency r/o remounts. This must be + * paired with __mnt_drop_write_file. */ int __mnt_want_write_file(struct file *file) { - if (!(file->f_mode & FMODE_WRITER)) - return __mnt_want_write(file->f_path.mnt); - else - return mnt_clone_write(file->f_path.mnt); + if (file->f_mode & FMODE_WRITER) { + /* + * Superblock may have become readonly while there are still + * writable fd's, e.g. due to a fs error with errors=remount-ro + */ + if (__mnt_is_readonly(file->f_path.mnt)) + return -EROFS; + return 0; + } + return __mnt_want_write(file->f_path.mnt); } /** * mnt_want_write_file - get write access to a file's mount * @file: the file who's mount on which to take a write * - * This is like mnt_want_write, but it takes a file and can - * do some optimisations if the file is open for write already + * This is like mnt_want_write, but if the file is already open for writing it + * skips incrementing mnt_writers (since the open file already has a reference) + * and instead only does the freeze protection and the check for emergency r/o + * remounts. This must be paired with mnt_drop_write_file. */ int mnt_want_write_file(struct file *file) { @@ -470,7 +456,8 @@ EXPORT_SYMBOL_GPL(mnt_drop_write); void __mnt_drop_write_file(struct file *file) { - __mnt_drop_write(file->f_path.mnt); + if (!(file->f_mode & FMODE_WRITER)) + __mnt_drop_write(file->f_path.mnt); } void mnt_drop_write_file(struct file *file) diff --git a/fs/proc/self.c b/fs/proc/self.c index a4012154e109..72cd69bcaf4a 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -16,13 +16,6 @@ static const char *proc_self_get_link(struct dentry *dentry, pid_t tgid = task_tgid_nr_ns(current, ns); char *name; - /* - * Not currently supported. Once we can inherit all of struct pid, - * we can allow this. - */ - if (current->flags & PF_IO_WORKER) - return ERR_PTR(-EOPNOTSUPP); - if (!tgid) return ERR_PTR(-ENOENT); /* max length of unsigned int in decimal + NULL term */ diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index d56681d86d28..a553273fbd41 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -17,13 +17,6 @@ static const char *proc_thread_self_get_link(struct dentry *dentry, pid_t pid = task_pid_nr_ns(current, ns); char *name; - /* - * Not currently supported. Once we can inherit all of struct pid, - * we can allow this. - */ - if (current->flags & PF_IO_WORKER) - return ERR_PTR(-EOPNOTSUPP); - if (!pid) return ERR_PTR(-ENOENT); name = kmalloc(10 + 6 + 10 + 1, dentry ? GFP_KERNEL : GFP_ATOMIC); diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c index b56ff451adce..5b6fcb9b44e2 100644 --- a/fs/xfs/libxfs/xfs_btree.c +++ b/fs/xfs/libxfs/xfs_btree.c @@ -2805,7 +2805,7 @@ xfs_btree_split_worker( struct xfs_btree_split_args *args = container_of(work, struct xfs_btree_split_args, work); unsigned long pflags; - unsigned long new_pflags = PF_MEMALLOC_NOFS; + unsigned long new_pflags = 0; /* * we are in a transaction context here, but may also be doing work @@ -2817,12 +2817,20 @@ xfs_btree_split_worker( new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD; current_set_flags_nested(&pflags, new_pflags); + xfs_trans_set_context(args->cur->bc_tp); args->result = __xfs_btree_split(args->cur, args->level, args->ptrp, args->key, args->curp, args->stat); + + xfs_trans_clear_context(args->cur->bc_tp); + current_restore_flags_nested(&pflags, new_pflags); + + /* + * Do not access args after complete() has run here. We don't own args + * and the owner may run and free args before we return here. + */ complete(args->done); - current_restore_flags_nested(&pflags, new_pflags); } /* diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 4304c6416fbb..b4186d666157 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -62,7 +62,7 @@ xfs_setfilesize_trans_alloc( * We hand off the transaction to the completion thread now, so * clear the flag here. */ - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + xfs_trans_clear_context(tp); return 0; } @@ -125,7 +125,7 @@ xfs_setfilesize_ioend( * thus we need to mark ourselves as being in a transaction manually. * Similarly for freeze protection. */ - current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + xfs_trans_set_context(tp); __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); /* we abort the update if there was an IO error */ @@ -568,6 +568,12 @@ xfs_vm_writepage( { struct xfs_writepage_ctx wpc = { }; + if (WARN_ON_ONCE(current->journal_info)) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops); } @@ -578,6 +584,13 @@ xfs_vm_writepages( { struct xfs_writepage_ctx wpc = { }; + /* + * Writing back data in a transaction context can result in recursive + * transactions. This is bad, so issue a warning and get out of here. + */ + if (WARN_ON_ONCE(current->journal_info)) + return 0; + xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); } diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index 3991e59cfd18..ef17c1f6db32 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -344,7 +344,6 @@ xfs_extent_busy_trim( ASSERT(*len > 0); spin_lock(&args->pag->pagb_lock); -restart: fbno = *bno; flen = *len; rbp = args->pag->pagb_tree.rb_node; @@ -363,19 +362,6 @@ restart: continue; } - /* - * If this is a metadata allocation, try to reuse the busy - * extent instead of trimming the allocation. - */ - if (!(args->datatype & XFS_ALLOC_USERDATA) && - !(busyp->flags & XFS_EXTENT_BUSY_DISCARDED)) { - if (!xfs_extent_busy_update_extent(args->mp, args->pag, - busyp, fbno, flen, - false)) - goto restart; - continue; - } - if (bbno <= fbno) { /* start overlap */ diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c index 145e06c47744..546a6cd96729 100644 --- a/fs/xfs/xfs_sysctl.c +++ b/fs/xfs/xfs_sysctl.c @@ -51,7 +51,7 @@ xfs_panic_mask_proc_handler( #endif /* CONFIG_PROC_FS */ STATIC int -xfs_deprecate_irix_sgid_inherit_proc_handler( +xfs_deprecated_dointvec_minmax( struct ctl_table *ctl, int write, void *buffer, @@ -59,24 +59,8 @@ xfs_deprecate_irix_sgid_inherit_proc_handler( loff_t *ppos) { if (write) { - printk_once(KERN_WARNING - "XFS: " "%s sysctl option is deprecated.\n", - ctl->procname); - } - return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); -} - -STATIC int -xfs_deprecate_irix_symlink_mode_proc_handler( - struct ctl_table *ctl, - int write, - void *buffer, - size_t *lenp, - loff_t *ppos) -{ - if (write) { - printk_once(KERN_WARNING - "XFS: " "%s sysctl option is deprecated.\n", + printk_ratelimited(KERN_WARNING + "XFS: %s sysctl option is deprecated.\n", ctl->procname); } return proc_dointvec_minmax(ctl, write, buffer, lenp, ppos); @@ -88,7 +72,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.sgid_inherit.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = xfs_deprecate_irix_sgid_inherit_proc_handler, + .proc_handler = xfs_deprecated_dointvec_minmax, .extra1 = &xfs_params.sgid_inherit.min, .extra2 = &xfs_params.sgid_inherit.max }, @@ -97,7 +81,7 @@ static struct ctl_table xfs_table[] = { .data = &xfs_params.symlink_mode.val, .maxlen = sizeof(int), .mode = 0644, - .proc_handler = xfs_deprecate_irix_symlink_mode_proc_handler, + .proc_handler = xfs_deprecated_dointvec_minmax, .extra1 = &xfs_params.symlink_mode.min, .extra2 = &xfs_params.symlink_mode.max }, @@ -201,6 +185,15 @@ static struct ctl_table xfs_table[] = { .extra1 = &xfs_params.blockgc_timer.min, .extra2 = &xfs_params.blockgc_timer.max, }, + { + .procname = "speculative_cow_prealloc_lifetime", + .data = &xfs_params.blockgc_timer.val, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = xfs_deprecated_dointvec_minmax, + .extra1 = &xfs_params.blockgc_timer.min, + .extra2 = &xfs_params.blockgc_timer.max, + }, /* please keep this the last entry */ #ifdef CONFIG_PROC_FS { diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 44f72c09c203..b22a09e9daee 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -72,6 +72,7 @@ xfs_trans_free( xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false); trace_xfs_trans_free(tp, _RET_IP_); + xfs_trans_clear_context(tp); if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT)) sb_end_intwrite(tp->t_mountp->m_super); xfs_trans_free_dqinfo(tp); @@ -123,7 +124,8 @@ xfs_trans_dup( ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used; tp->t_rtx_res = tp->t_rtx_res_used; - ntp->t_pflags = tp->t_pflags; + + xfs_trans_switch_context(tp, ntp); /* move deferred ops over to the new tp */ xfs_defer_move(ntp, tp); @@ -157,9 +159,6 @@ xfs_trans_reserve( int error = 0; bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0; - /* Mark this thread as being in a transaction */ - current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - /* * Attempt to reserve the needed disk blocks by decrementing * the number needed from the number available. This will @@ -167,10 +166,8 @@ xfs_trans_reserve( */ if (blocks > 0) { error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd); - if (error != 0) { - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); + if (error != 0) return -ENOSPC; - } tp->t_blk_res += blocks; } @@ -244,9 +241,6 @@ undo_blocks: xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd); tp->t_blk_res = 0; } - - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - return error; } @@ -260,6 +254,7 @@ xfs_trans_alloc( struct xfs_trans **tpp) { struct xfs_trans *tp; + bool want_retry = true; int error; /* @@ -267,9 +262,11 @@ xfs_trans_alloc( * GFP_NOFS allocation context so that we avoid lockdep false positives * by doing GFP_KERNEL allocations inside sb_start_intwrite(). */ +retry: tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL); if (!(flags & XFS_TRANS_NO_WRITECOUNT)) sb_start_intwrite(mp->m_super); + xfs_trans_set_context(tp); /* * Zero-reservation ("empty") transactions can't modify anything, so @@ -289,7 +286,9 @@ xfs_trans_alloc( tp->t_firstblock = NULLFSBLOCK; error = xfs_trans_reserve(tp, resp, blocks, rtextents); - if (error == -ENOSPC) { + if (error == -ENOSPC && want_retry) { + xfs_trans_cancel(tp); + /* * We weren't able to reserve enough space for the transaction. * Flush the other speculative space allocations to free space. @@ -297,8 +296,11 @@ xfs_trans_alloc( * other locks. */ error = xfs_blockgc_free_space(mp, NULL); - if (!error) - error = xfs_trans_reserve(tp, resp, blocks, rtextents); + if (error) + return error; + + want_retry = false; + goto retry; } if (error) { xfs_trans_cancel(tp); @@ -893,7 +895,6 @@ __xfs_trans_commit( xfs_log_commit_cil(mp, tp, &commit_lsn, regrant); - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free(tp); /* @@ -925,7 +926,6 @@ out_unreserve: xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket); tp->t_ticket = NULL; } - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); xfs_trans_free_items(tp, !!error); xfs_trans_free(tp); @@ -985,9 +985,6 @@ xfs_trans_cancel( tp->t_ticket = NULL; } - /* mark this thread as no longer being in a transaction */ - current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS); - xfs_trans_free_items(tp, dirty); xfs_trans_free(tp); } diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index 8b03fbfe9a1b..9dd745cf77c9 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -281,4 +281,34 @@ int xfs_trans_alloc_ichange(struct xfs_inode *ip, struct xfs_dquot *udqp, struct xfs_dquot *gdqp, struct xfs_dquot *pdqp, bool force, struct xfs_trans **tpp); +static inline void +xfs_trans_set_context( + struct xfs_trans *tp) +{ + ASSERT(current->journal_info == NULL); + tp->t_pflags = memalloc_nofs_save(); + current->journal_info = tp; +} + +static inline void +xfs_trans_clear_context( + struct xfs_trans *tp) +{ + if (current->journal_info == tp) { + memalloc_nofs_restore(tp->t_pflags); + current->journal_info = NULL; + } +} + +static inline void +xfs_trans_switch_context( + struct xfs_trans *old_tp, + struct xfs_trans *new_tp) +{ + ASSERT(current->journal_info == old_tp); + new_tp->t_pflags = old_tp->t_pflags; + old_tp->t_pflags = 0; + current->journal_info = new_tp; +} + #endif /* __XFS_TRANS_H__ */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index ee09a39627d6..9129ba231423 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -168,6 +168,7 @@ enum cpuhp_state { CPUHP_AP_PERF_X86_CQM_ONLINE, CPUHP_AP_PERF_X86_CSTATE_ONLINE, CPUHP_AP_PERF_S390_CF_ONLINE, + CPUHP_AP_PERF_S390_CFD_ONLINE, CPUHP_AP_PERF_S390_SF_ONLINE, CPUHP_AP_PERF_ARM_CCI_ONLINE, CPUHP_AP_PERF_ARM_CCN_ONLINE, diff --git a/include/linux/device.h b/include/linux/device.h index 7619a84f8ce4..ba660731bd25 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -291,6 +291,7 @@ struct device_dma_parameters { * sg limitations. */ unsigned int max_segment_size; + unsigned int min_align_mask; unsigned long segment_boundary_mask; }; diff --git a/include/linux/dma-mapping.h b/include/linux/dma-mapping.h index fbfa3f5abd94..2a984cb4d1e0 100644 --- a/include/linux/dma-mapping.h +++ b/include/linux/dma-mapping.h @@ -509,6 +509,22 @@ static inline int dma_set_seg_boundary(struct device *dev, unsigned long mask) return -EIO; } +static inline unsigned int dma_get_min_align_mask(struct device *dev) +{ + if (dev->dma_parms) + return dev->dma_parms->min_align_mask; + return 0; +} + +static inline int dma_set_min_align_mask(struct device *dev, + unsigned int min_align_mask) +{ + if (WARN_ON_ONCE(!dev->dma_parms)) + return -EIO; + dev->dma_parms->min_align_mask = min_align_mask; + return 0; +} + static inline int dma_get_cache_alignment(void) { #ifdef ARCH_DMA_MINALIGN diff --git a/include/linux/io_uring.h b/include/linux/io_uring.h index 2eb6d19de336..51ede771cd99 100644 --- a/include/linux/io_uring.h +++ b/include/linux/io_uring.h @@ -5,23 +5,6 @@ #include #include -struct io_identity { - struct files_struct *files; - struct mm_struct *mm; -#ifdef CONFIG_BLK_CGROUP - struct cgroup_subsys_state *blkcg_css; -#endif - const struct cred *creds; - struct nsproxy *nsproxy; - struct fs_struct *fs; - unsigned long fsize; -#ifdef CONFIG_AUDIT - kuid_t loginuid; - unsigned int sessionid; -#endif - refcount_t count; -}; - struct io_wq_work_node { struct io_wq_work_node *next; }; @@ -36,9 +19,8 @@ struct io_uring_task { struct xarray xa; struct wait_queue_head wait; struct file *last; + void *io_wq; struct percpu_counter inflight; - struct io_identity __identity; - struct io_identity *identity; atomic_t in_idle; bool sqpoll; @@ -61,7 +43,7 @@ static inline void io_uring_task_cancel(void) } static inline void io_uring_files_cancel(struct files_struct *files) { - if (current->io_uring && !xa_empty(¤t->io_uring->xa)) + if (current->io_uring) __io_uring_files_cancel(files); } static inline void io_uring_free(struct task_struct *tsk) diff --git a/include/linux/litex.h b/include/linux/litex.h index 40f5be503593..5ea9ccf5cce4 100644 --- a/include/linux/litex.h +++ b/include/linux/litex.h @@ -3,9 +3,6 @@ * Common LiteX header providing * helper functions for accessing CSRs. * - * Implementation of the functions is provided by - * the LiteX SoC Controller driver. - * * Copyright (C) 2019-2020 Antmicro */ @@ -13,90 +10,147 @@ #define _LINUX_LITEX_H #include -#include -#include -/* - * The parameters below are true for LiteX SoCs configured for 8-bit CSR Bus, - * 32-bit aligned. - * - * Supporting other configurations will require extending the logic in this - * header and in the LiteX SoC controller driver. - */ -#define LITEX_REG_SIZE 0x4 -#define LITEX_SUBREG_SIZE 0x1 +/* LiteX SoCs support 8- or 32-bit CSR Bus data width (i.e., subreg. size) */ +#if defined(CONFIG_LITEX_SUBREG_SIZE) && \ + (CONFIG_LITEX_SUBREG_SIZE == 1 || CONFIG_LITEX_SUBREG_SIZE == 4) +#define LITEX_SUBREG_SIZE CONFIG_LITEX_SUBREG_SIZE +#else +#error LiteX subregister size (LITEX_SUBREG_SIZE) must be 4 or 1! +#endif #define LITEX_SUBREG_SIZE_BIT (LITEX_SUBREG_SIZE * 8) -#define WRITE_LITEX_SUBREGISTER(val, base_offset, subreg_id) \ - writel((u32 __force)cpu_to_le32(val), base_offset + (LITEX_REG_SIZE * subreg_id)) +/* LiteX subregisters of any width are always aligned on a 4-byte boundary */ +#define LITEX_SUBREG_ALIGN 0x4 -#define READ_LITEX_SUBREGISTER(base_offset, subreg_id) \ - le32_to_cpu((__le32 __force)readl(base_offset + (LITEX_REG_SIZE * subreg_id))) +static inline void _write_litex_subregister(u32 val, void __iomem *addr) +{ + writel((u32 __force)cpu_to_le32(val), addr); +} -void litex_set_reg(void __iomem *reg, unsigned long reg_sz, unsigned long val); +static inline u32 _read_litex_subregister(void __iomem *addr) +{ + return le32_to_cpu((__le32 __force)readl(addr)); +} -unsigned long litex_get_reg(void __iomem *reg, unsigned long reg_sz); +/* + * LiteX SoC Generator, depending on the configuration, can split a single + * logical CSR (Control&Status Register) into a series of consecutive physical + * registers. + * + * For example, in the configuration with 8-bit CSR Bus, a 32-bit aligned, + * 32-bit wide logical CSR will be laid out as four 32-bit physical + * subregisters, each one containing one byte of meaningful data. + * + * For details see: https://github.com/enjoy-digital/litex/wiki/CSR-Bus + */ + +/* number of LiteX subregisters needed to store a register of given reg_size */ +#define _litex_num_subregs(reg_size) \ + (((reg_size) - 1) / LITEX_SUBREG_SIZE + 1) + +/* + * since the number of 4-byte aligned subregisters required to store a single + * LiteX CSR (MMIO) register varies with LITEX_SUBREG_SIZE, the offset of the + * next adjacent LiteX CSR register w.r.t. the offset of the current one also + * depends on how many subregisters the latter is spread across + */ +#define _next_reg_off(off, size) \ + ((off) + _litex_num_subregs(size) * LITEX_SUBREG_ALIGN) + +/* + * The purpose of `_litex_[set|get]_reg()` is to implement the logic of + * writing to/reading from the LiteX CSR in a single place that can be then + * reused by all LiteX drivers via the `litex_[write|read][8|16|32|64]()` + * accessors for the appropriate data width. + * NOTE: direct use of `_litex_[set|get]_reg()` by LiteX drivers is strongly + * discouraged, as they perform no error checking on the requested data width! + */ + +/** + * _litex_set_reg() - Writes a value to the LiteX CSR (Control&Status Register) + * @reg: Address of the CSR + * @reg_size: The width of the CSR expressed in the number of bytes + * @val: Value to be written to the CSR + * + * This function splits a single (possibly multi-byte) LiteX CSR write into + * a series of subregister writes with a proper offset. + * NOTE: caller is responsible for ensuring (0 < reg_size <= sizeof(u64)). + */ +static inline void _litex_set_reg(void __iomem *reg, size_t reg_size, u64 val) +{ + u8 shift = _litex_num_subregs(reg_size) * LITEX_SUBREG_SIZE_BIT; + + while (shift > 0) { + shift -= LITEX_SUBREG_SIZE_BIT; + _write_litex_subregister(val >> shift, reg); + reg += LITEX_SUBREG_ALIGN; + } +} + +/** + * _litex_get_reg() - Reads a value of the LiteX CSR (Control&Status Register) + * @reg: Address of the CSR + * @reg_size: The width of the CSR expressed in the number of bytes + * + * Return: Value read from the CSR + * + * This function generates a series of subregister reads with a proper offset + * and joins their results into a single (possibly multi-byte) LiteX CSR value. + * NOTE: caller is responsible for ensuring (0 < reg_size <= sizeof(u64)). + */ +static inline u64 _litex_get_reg(void __iomem *reg, size_t reg_size) +{ + u64 r; + u8 i; + + r = _read_litex_subregister(reg); + for (i = 1; i < _litex_num_subregs(reg_size); i++) { + r <<= LITEX_SUBREG_SIZE_BIT; + reg += LITEX_SUBREG_ALIGN; + r |= _read_litex_subregister(reg); + } + return r; +} static inline void litex_write8(void __iomem *reg, u8 val) { - WRITE_LITEX_SUBREGISTER(val, reg, 0); + _litex_set_reg(reg, sizeof(u8), val); } static inline void litex_write16(void __iomem *reg, u16 val) { - WRITE_LITEX_SUBREGISTER(val >> 8, reg, 0); - WRITE_LITEX_SUBREGISTER(val, reg, 1); + _litex_set_reg(reg, sizeof(u16), val); } static inline void litex_write32(void __iomem *reg, u32 val) { - WRITE_LITEX_SUBREGISTER(val >> 24, reg, 0); - WRITE_LITEX_SUBREGISTER(val >> 16, reg, 1); - WRITE_LITEX_SUBREGISTER(val >> 8, reg, 2); - WRITE_LITEX_SUBREGISTER(val, reg, 3); + _litex_set_reg(reg, sizeof(u32), val); } static inline void litex_write64(void __iomem *reg, u64 val) { - WRITE_LITEX_SUBREGISTER(val >> 56, reg, 0); - WRITE_LITEX_SUBREGISTER(val >> 48, reg, 1); - WRITE_LITEX_SUBREGISTER(val >> 40, reg, 2); - WRITE_LITEX_SUBREGISTER(val >> 32, reg, 3); - WRITE_LITEX_SUBREGISTER(val >> 24, reg, 4); - WRITE_LITEX_SUBREGISTER(val >> 16, reg, 5); - WRITE_LITEX_SUBREGISTER(val >> 8, reg, 6); - WRITE_LITEX_SUBREGISTER(val, reg, 7); + _litex_set_reg(reg, sizeof(u64), val); } static inline u8 litex_read8(void __iomem *reg) { - return READ_LITEX_SUBREGISTER(reg, 0); + return _litex_get_reg(reg, sizeof(u8)); } static inline u16 litex_read16(void __iomem *reg) { - return (READ_LITEX_SUBREGISTER(reg, 0) << 8) - | (READ_LITEX_SUBREGISTER(reg, 1)); + return _litex_get_reg(reg, sizeof(u16)); } static inline u32 litex_read32(void __iomem *reg) { - return (READ_LITEX_SUBREGISTER(reg, 0) << 24) - | (READ_LITEX_SUBREGISTER(reg, 1) << 16) - | (READ_LITEX_SUBREGISTER(reg, 2) << 8) - | (READ_LITEX_SUBREGISTER(reg, 3)); + return _litex_get_reg(reg, sizeof(u32)); } static inline u64 litex_read64(void __iomem *reg) { - return ((u64)READ_LITEX_SUBREGISTER(reg, 0) << 56) - | ((u64)READ_LITEX_SUBREGISTER(reg, 1) << 48) - | ((u64)READ_LITEX_SUBREGISTER(reg, 2) << 40) - | ((u64)READ_LITEX_SUBREGISTER(reg, 3) << 32) - | ((u64)READ_LITEX_SUBREGISTER(reg, 4) << 24) - | ((u64)READ_LITEX_SUBREGISTER(reg, 5) << 16) - | ((u64)READ_LITEX_SUBREGISTER(reg, 6) << 8) - | ((u64)READ_LITEX_SUBREGISTER(reg, 7)); + return _litex_get_reg(reg, sizeof(u64)); } #endif /* _LINUX_LITEX_H */ diff --git a/include/linux/mount.h b/include/linux/mount.h index 161f4419db6c..5d92a7e1a742 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -86,7 +86,6 @@ struct path; extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); -extern int mnt_clone_write(struct vfsmount *mnt); extern void mnt_drop_write(struct vfsmount *mnt); extern void mnt_drop_write_file(struct file *file); extern void mntput(struct vfsmount *mnt); diff --git a/include/linux/net.h b/include/linux/net.h index 9e2324efc26a..ba736b457a06 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -42,8 +42,6 @@ struct net; #define SOCK_PASSCRED 3 #define SOCK_PASSSEC 4 -#define PROTO_CMSG_DATA_ONLY 0x0001 - #ifndef ARCH_HAS_SOCKET_TYPES /** * enum sock_type - Socket types @@ -138,7 +136,6 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, struct proto_ops { int family; - unsigned int flags; struct module *owner; int (*release) (struct socket *sock); int (*bind) (struct socket *sock, diff --git a/include/linux/sched.h b/include/linux/sched.h index 6d2ed84d326c..79fc40a66828 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -896,6 +896,9 @@ struct task_struct { /* CLONE_CHILD_CLEARTID: */ int __user *clear_child_tid; + /* PF_IO_WORKER */ + void *pf_io_worker; + u64 utime; u64 stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index d9c9fc9ca5d2..5857a937c637 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,6 +29,7 @@ enum swiotlb_force { * controllable. */ #define IO_TLB_SHIFT 11 +#define IO_TLB_SIZE (1 << IO_TLB_SHIFT) /* default to 64MB */ #define IO_TLB_DEFAULT_SIZE (64UL<<20) diff --git a/include/uapi/linux/io_uring.h b/include/uapi/linux/io_uring.h index ac4e1738a9af..2514eb6b1cf2 100644 --- a/include/uapi/linux/io_uring.h +++ b/include/uapi/linux/io_uring.h @@ -262,6 +262,7 @@ struct io_uring_params { #define IORING_FEAT_POLL_32BITS (1U << 6) #define IORING_FEAT_SQPOLL_NONFIXED (1U << 7) #define IORING_FEAT_EXT_ARG (1U << 8) +#define IORING_FEAT_NATIVE_WORKERS (1U << 9) /* * io_uring_register(2) opcodes and arguments diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 5b3f01da172b..60739d5e3373 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -84,7 +84,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa dentry = kern_path_locked(pathname, &path); if (IS_ERR(dentry)) - return (void *)dentry; /* returning an error */ + return ERR_CAST(dentry); /* returning an error */ inode = path.dentry->d_inode; inode_unlock(inode); diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c index 7c42df6e6100..c10e855a03bc 100644 --- a/kernel/dma/swiotlb.c +++ b/kernel/dma/swiotlb.c @@ -50,9 +50,6 @@ #define CREATE_TRACE_POINTS #include -#define OFFSET(val,align) ((unsigned long) \ - ( (val) & ( (align) - 1))) - #define SLABS_PER_PAGE (1 << (PAGE_SHIFT - IO_TLB_SHIFT)) /* @@ -102,6 +99,11 @@ static unsigned int max_segment; #define INVALID_PHYS_ADDR (~(phys_addr_t)0) static phys_addr_t *io_tlb_orig_addr; +/* + * The mapped buffer's size should be validated during a sync operation. + */ +static size_t *io_tlb_orig_size; + /* * Protect the above data structures in the map and unmap calls */ @@ -171,7 +173,7 @@ void __init swiotlb_adjust_size(unsigned long new_size) * adjust/expand SWIOTLB size for their use. */ if (!io_tlb_nslabs) { - size = ALIGN(new_size, 1 << IO_TLB_SHIFT); + size = ALIGN(new_size, IO_TLB_SIZE); io_tlb_nslabs = size >> IO_TLB_SHIFT; io_tlb_nslabs = ALIGN(io_tlb_nslabs, IO_TLB_SEGSIZE); @@ -192,6 +194,16 @@ void swiotlb_print_info(void) bytes >> 20); } +static inline unsigned long io_tlb_offset(unsigned long val) +{ + return val & (IO_TLB_SEGSIZE - 1); +} + +static inline unsigned long nr_slots(u64 val) +{ + return DIV_ROUND_UP(val, IO_TLB_SIZE); +} + /* * Early SWIOTLB allocation may be too early to allow an architecture to * perform the desired operations. This function allows the architecture to @@ -240,9 +252,16 @@ int __init swiotlb_init_with_tbl(char *tlb, unsigned long nslabs, int verbose) panic("%s: Failed to allocate %zu bytes align=0x%lx\n", __func__, alloc_size, PAGE_SIZE); + alloc_size = PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t)); + io_tlb_orig_size = memblock_alloc(alloc_size, PAGE_SIZE); + if (!io_tlb_orig_size) + panic("%s: Failed to allocate %zu bytes align=0x%lx\n", + __func__, alloc_size, PAGE_SIZE); + for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -363,7 +382,7 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) * between io_tlb_start and io_tlb_end. */ io_tlb_list = (unsigned int *)__get_free_pages(GFP_KERNEL, - get_order(io_tlb_nslabs * sizeof(int))); + get_order(io_tlb_nslabs * sizeof(int))); if (!io_tlb_list) goto cleanup3; @@ -374,9 +393,18 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) if (!io_tlb_orig_addr) goto cleanup4; + io_tlb_orig_size = (size_t *) + __get_free_pages(GFP_KERNEL, + get_order(io_tlb_nslabs * + sizeof(size_t))); + if (!io_tlb_orig_size) + goto cleanup5; + + for (i = 0; i < io_tlb_nslabs; i++) { - io_tlb_list[i] = IO_TLB_SEGSIZE - OFFSET(i, IO_TLB_SEGSIZE); + io_tlb_list[i] = IO_TLB_SEGSIZE - io_tlb_offset(i); io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } io_tlb_index = 0; no_iotlb_memory = false; @@ -389,6 +417,10 @@ swiotlb_late_init_with_tbl(char *tlb, unsigned long nslabs) return 0; +cleanup5: + free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * + sizeof(phys_addr_t))); + cleanup4: free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * sizeof(int))); @@ -404,6 +436,8 @@ void __init swiotlb_exit(void) return; if (late_alloc) { + free_pages((unsigned long)io_tlb_orig_size, + get_order(io_tlb_nslabs * sizeof(size_t))); free_pages((unsigned long)io_tlb_orig_addr, get_order(io_tlb_nslabs * sizeof(phys_addr_t))); free_pages((unsigned long)io_tlb_list, get_order(io_tlb_nslabs * @@ -413,6 +447,8 @@ void __init swiotlb_exit(void) } else { memblock_free_late(__pa(io_tlb_orig_addr), PAGE_ALIGN(io_tlb_nslabs * sizeof(phys_addr_t))); + memblock_free_late(__pa(io_tlb_orig_size), + PAGE_ALIGN(io_tlb_nslabs * sizeof(size_t))); memblock_free_late(__pa(io_tlb_list), PAGE_ALIGN(io_tlb_nslabs * sizeof(int))); memblock_free_late(io_tlb_start, @@ -461,19 +497,119 @@ static void swiotlb_bounce(phys_addr_t orig_addr, phys_addr_t tlb_addr, } } -phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, +#define slot_addr(start, idx) ((start) + ((idx) << IO_TLB_SHIFT)) + +/* + * Return the offset into a iotlb slot required to keep the device happy. + */ +static unsigned int swiotlb_align_offset(struct device *dev, u64 addr) +{ + return addr & dma_get_min_align_mask(dev) & (IO_TLB_SIZE - 1); +} + +/* + * Carefully handle integer overflow which can occur when boundary_mask == ~0UL. + */ +static inline unsigned long get_max_slots(unsigned long boundary_mask) +{ + if (boundary_mask == ~0UL) + return 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); + return nr_slots(boundary_mask + 1); +} + +static unsigned int wrap_index(unsigned int index) +{ + if (index >= io_tlb_nslabs) + return 0; + return index; +} + +/* + * Find a suitable number of IO TLB entries size that will fit this request and + * allocate a buffer from that IO TLB pool. + */ +static int find_slots(struct device *dev, phys_addr_t orig_addr, + size_t alloc_size) +{ + unsigned long boundary_mask = dma_get_seg_boundary(dev); + dma_addr_t tbl_dma_addr = + phys_to_dma_unencrypted(dev, io_tlb_start) & boundary_mask; + unsigned long max_slots = get_max_slots(boundary_mask); + unsigned int iotlb_align_mask = + dma_get_min_align_mask(dev) & ~(IO_TLB_SIZE - 1); + unsigned int nslots = nr_slots(alloc_size), stride; + unsigned int index, wrap, count = 0, i; + unsigned long flags; + + BUG_ON(!nslots); + + /* + * For mappings with an alignment requirement don't bother looping to + * unaligned slots once we found an aligned one. For allocations of + * PAGE_SIZE or larger only look for page aligned allocations. + */ + stride = (iotlb_align_mask >> IO_TLB_SHIFT) + 1; + if (alloc_size >= PAGE_SIZE) + stride = max(stride, stride << (PAGE_SHIFT - IO_TLB_SHIFT)); + + spin_lock_irqsave(&io_tlb_lock, flags); + if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) + goto not_found; + + index = wrap = wrap_index(ALIGN(io_tlb_index, stride)); + do { + if ((slot_addr(tbl_dma_addr, index) & iotlb_align_mask) != + (orig_addr & iotlb_align_mask)) { + index = wrap_index(index + 1); + continue; + } + + /* + * If we find a slot that indicates we have 'nslots' number of + * contiguous buffers, we allocate the buffers from that slot + * and mark the entries as '0' indicating unavailable. + */ + if (!iommu_is_span_boundary(index, nslots, + nr_slots(tbl_dma_addr), + max_slots)) { + if (io_tlb_list[index] >= nslots) + goto found; + } + index = wrap_index(index + stride); + } while (index != wrap); + +not_found: + spin_unlock_irqrestore(&io_tlb_lock, flags); + return -1; + +found: + for (i = index; i < index + nslots; i++) + io_tlb_list[i] = 0; + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && + io_tlb_list[i]; i--) + io_tlb_list[i] = ++count; + + /* + * Update the indices to avoid searching in the next round. + */ + if (index + nslots < io_tlb_nslabs) + io_tlb_index = index + nslots; + else + io_tlb_index = 0; + io_tlb_used += nslots; + + spin_unlock_irqrestore(&io_tlb_lock, flags); + return index; +} + +phys_addr_t swiotlb_tbl_map_single(struct device *dev, phys_addr_t orig_addr, size_t mapping_size, size_t alloc_size, enum dma_data_direction dir, unsigned long attrs) { - dma_addr_t tbl_dma_addr = phys_to_dma_unencrypted(hwdev, io_tlb_start); - unsigned long flags; + unsigned int offset = swiotlb_align_offset(dev, orig_addr); + unsigned int index, i; phys_addr_t tlb_addr; - unsigned int nslots, stride, index, wrap; - int i; - unsigned long mask; - unsigned long offset_slots; - unsigned long max_slots; - unsigned long tmp_io_tlb_used; if (no_iotlb_memory) panic("Can not allocate SWIOTLB buffer earlier and can't now provide you with the DMA bounce buffer"); @@ -482,114 +618,47 @@ phys_addr_t swiotlb_tbl_map_single(struct device *hwdev, phys_addr_t orig_addr, pr_warn_once("Memory encryption is active and system is using DMA bounce buffers\n"); if (mapping_size > alloc_size) { - dev_warn_once(hwdev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", + dev_warn_once(dev, "Invalid sizes (mapping: %zd bytes, alloc: %zd bytes)", mapping_size, alloc_size); return (phys_addr_t)DMA_MAPPING_ERROR; } - mask = dma_get_seg_boundary(hwdev); - - tbl_dma_addr &= mask; - - offset_slots = ALIGN(tbl_dma_addr, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - - /* - * Carefully handle integer overflow which can occur when mask == ~0UL. - */ - max_slots = mask + 1 - ? ALIGN(mask + 1, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT - : 1UL << (BITS_PER_LONG - IO_TLB_SHIFT); - - /* - * For mappings greater than or equal to a page, we limit the stride - * (and hence alignment) to a page size. - */ - nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - if (alloc_size >= PAGE_SIZE) - stride = (1 << (PAGE_SHIFT - IO_TLB_SHIFT)); - else - stride = 1; - - BUG_ON(!nslots); - - /* - * Find suitable number of IO TLB entries size that will fit this - * request and allocate a buffer from that IO TLB pool. - */ - spin_lock_irqsave(&io_tlb_lock, flags); - - if (unlikely(nslots > io_tlb_nslabs - io_tlb_used)) - goto not_found; - - index = ALIGN(io_tlb_index, stride); - if (index >= io_tlb_nslabs) - index = 0; - wrap = index; - - do { - while (iommu_is_span_boundary(index, nslots, offset_slots, - max_slots)) { - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - if (index == wrap) - goto not_found; - } - - /* - * If we find a slot that indicates we have 'nslots' number of - * contiguous buffers, we allocate the buffers from that slot - * and mark the entries as '0' indicating unavailable. - */ - if (io_tlb_list[index] >= nslots) { - int count = 0; - - for (i = index; i < (int) (index + nslots); i++) - io_tlb_list[i] = 0; - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE - 1) && io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; - tlb_addr = io_tlb_start + (index << IO_TLB_SHIFT); - - /* - * Update the indices to avoid searching in the next - * round. - */ - io_tlb_index = ((index + nslots) < io_tlb_nslabs - ? (index + nslots) : 0); - - goto found; - } - index += stride; - if (index >= io_tlb_nslabs) - index = 0; - } while (index != wrap); - -not_found: - tmp_io_tlb_used = io_tlb_used; - - spin_unlock_irqrestore(&io_tlb_lock, flags); - if (!(attrs & DMA_ATTR_NO_WARN) && printk_ratelimit()) - dev_warn(hwdev, "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", - alloc_size, io_tlb_nslabs, tmp_io_tlb_used); - return (phys_addr_t)DMA_MAPPING_ERROR; -found: - io_tlb_used += nslots; - spin_unlock_irqrestore(&io_tlb_lock, flags); + index = find_slots(dev, orig_addr, alloc_size + offset); + if (index == -1) { + if (!(attrs & DMA_ATTR_NO_WARN)) + dev_warn_ratelimited(dev, + "swiotlb buffer is full (sz: %zd bytes), total %lu (slots), used %lu (slots)\n", + alloc_size, io_tlb_nslabs, io_tlb_used); + return (phys_addr_t)DMA_MAPPING_ERROR; + } /* * Save away the mapping from the original address to the DMA address. * This is needed when we sync the memory. Then we sync the buffer if * needed. */ - for (i = 0; i < nslots; i++) - io_tlb_orig_addr[index+i] = orig_addr + (i << IO_TLB_SHIFT); + for (i = 0; i < nr_slots(alloc_size + offset); i++) { + io_tlb_orig_addr[index + i] = slot_addr(orig_addr, i); + io_tlb_orig_size[index+i] = alloc_size - (i << IO_TLB_SHIFT); + } + tlb_addr = slot_addr(io_tlb_start, index) + offset; if (!(attrs & DMA_ATTR_SKIP_CPU_SYNC) && (dir == DMA_TO_DEVICE || dir == DMA_BIDIRECTIONAL)) swiotlb_bounce(orig_addr, tlb_addr, mapping_size, DMA_TO_DEVICE); - return tlb_addr; } +static void validate_sync_size_and_truncate(struct device *hwdev, size_t orig_size, size_t *size) +{ + if (*size > orig_size) { + /* Warn and truncate mapping_size */ + dev_WARN_ONCE(hwdev, 1, + "Attempt for buffer overflow. Original size: %zu. Mapping size: %zu.\n", + orig_size, *size); + *size = orig_size; + } +} + /* * tlb_addr is the physical address of the bounce buffer to unmap. */ @@ -598,10 +667,13 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_data_direction dir, unsigned long attrs) { unsigned long flags; - int i, count, nslots = ALIGN(alloc_size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; - int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + unsigned int offset = swiotlb_align_offset(hwdev, tlb_addr); + int i, count, nslots = nr_slots(alloc_size + offset); + int index = (tlb_addr - offset - io_tlb_start) >> IO_TLB_SHIFT; phys_addr_t orig_addr = io_tlb_orig_addr[index]; + validate_sync_size_and_truncate(hwdev, io_tlb_orig_size[index], &mapping_size); + /* * First, sync the memory before unmapping the entry */ @@ -617,26 +689,30 @@ void swiotlb_tbl_unmap_single(struct device *hwdev, phys_addr_t tlb_addr, * with slots below and above the pool being returned. */ spin_lock_irqsave(&io_tlb_lock, flags); - { - count = ((index + nslots) < ALIGN(index + 1, IO_TLB_SEGSIZE) ? - io_tlb_list[index + nslots] : 0); - /* - * Step 1: return the slots to the free list, merging the - * slots with superceeding slots - */ - for (i = index + nslots - 1; i >= index; i--) { - io_tlb_list[i] = ++count; - io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; - } - /* - * Step 2: merge the returned slots with the preceding slots, - * if available (non zero) - */ - for (i = index - 1; (OFFSET(i, IO_TLB_SEGSIZE) != IO_TLB_SEGSIZE -1) && io_tlb_list[i]; i--) - io_tlb_list[i] = ++count; + if (index + nslots < ALIGN(index + 1, IO_TLB_SEGSIZE)) + count = io_tlb_list[index + nslots]; + else + count = 0; - io_tlb_used -= nslots; + /* + * Step 1: return the slots to the free list, merging the slots with + * superceeding slots + */ + for (i = index + nslots - 1; i >= index; i--) { + io_tlb_list[i] = ++count; + io_tlb_orig_addr[i] = INVALID_PHYS_ADDR; + io_tlb_orig_size[i] = 0; } + + /* + * Step 2: merge the returned slots with the preceding slots, if + * available (non zero) + */ + for (i = index - 1; + io_tlb_offset(i) != IO_TLB_SEGSIZE - 1 && io_tlb_list[i]; + i--) + io_tlb_list[i] = ++count; + io_tlb_used -= nslots; spin_unlock_irqrestore(&io_tlb_lock, flags); } @@ -645,11 +721,13 @@ void swiotlb_tbl_sync_single(struct device *hwdev, phys_addr_t tlb_addr, enum dma_sync_target target) { int index = (tlb_addr - io_tlb_start) >> IO_TLB_SHIFT; + size_t orig_size = io_tlb_orig_size[index]; phys_addr_t orig_addr = io_tlb_orig_addr[index]; if (orig_addr == INVALID_PHYS_ADDR) return; - orig_addr += (unsigned long)tlb_addr & ((1 << IO_TLB_SHIFT) - 1); + + validate_sync_size_and_truncate(hwdev, orig_size, &size); switch (target) { case SYNC_FOR_CPU: @@ -707,7 +785,7 @@ dma_addr_t swiotlb_map(struct device *dev, phys_addr_t paddr, size_t size, size_t swiotlb_max_mapping_size(struct device *dev) { - return ((size_t)1 << IO_TLB_SHIFT) * IO_TLB_SEGSIZE; + return ((size_t)IO_TLB_SIZE) * IO_TLB_SEGSIZE; } bool is_swiotlb_active(void) diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 61db50f7ca86..821cf1723814 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -375,7 +375,7 @@ static int ptrace_attach(struct task_struct *task, long request, audit_ptrace(task); retval = -EPERM; - if (unlikely(task->flags & PF_KTHREAD)) + if (unlikely(task->flags & (PF_KTHREAD | PF_IO_WORKER))) goto out; if (same_thread_group(task, current)) goto out; diff --git a/kernel/signal.c b/kernel/signal.c index 2d24e3769301..f4bfd3decb75 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -91,7 +91,7 @@ static bool sig_task_ignored(struct task_struct *t, int sig, bool force) return true; /* Only allow kernel generated signals to this kthread */ - if (unlikely((t->flags & PF_KTHREAD) && + if (unlikely((t->flags & (PF_KTHREAD | PF_IO_WORKER)) && (handler == SIG_KTHREAD_KERNEL) && !force)) return true; @@ -1096,7 +1096,7 @@ static int __send_signal(int sig, struct kernel_siginfo *info, struct task_struc /* * Skip useless siginfo allocation for SIGKILL and kernel threads. */ - if ((sig == SIGKILL) || (t->flags & PF_KTHREAD)) + if ((sig == SIGKILL) || (t->flags & (PF_KTHREAD | PF_IO_WORKER))) goto out_set; /* diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index a02ce89b56b5..1355e6c0d567 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -1021,7 +1021,6 @@ static int inet_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon const struct proto_ops inet_stream_ops = { .family = PF_INET, - .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet_release, .bind = inet_bind, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 1fb75f01756c..802f5111805a 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -665,7 +665,6 @@ int inet6_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, const struct proto_ops inet6_stream_ops = { .family = PF_INET6, - .flags = PROTO_CMSG_DATA_ONLY, .owner = THIS_MODULE, .release = inet6_release, .bind = inet6_bind, diff --git a/net/socket.c b/net/socket.c index 23c7842389de..84a8049c2b09 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2413,10 +2413,6 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg, long __sys_sendmsg_sock(struct socket *sock, struct msghdr *msg, unsigned int flags) { - /* disallow ancillary data requests from this path */ - if (msg->msg_control || msg->msg_controllen) - return -EINVAL; - return ____sys_sendmsg(sock, msg, flags, NULL, 0); } @@ -2625,12 +2621,6 @@ long __sys_recvmsg_sock(struct socket *sock, struct msghdr *msg, struct user_msghdr __user *umsg, struct sockaddr __user *uaddr, unsigned int flags) { - if (msg->msg_control || msg->msg_controllen) { - /* disallow ancillary data reqs unless cmsg is plain data */ - if (!(sock->ops->flags & PROTO_CMSG_DATA_ONLY)) - return -EINVAL; - } - return ____sys_recvmsg(sock, msg, umsg, uaddr, flags, 0); } diff --git a/scripts/kernel-doc b/scripts/kernel-doc index e046e16e4411..8b5bc7bf4bb8 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1553,7 +1553,7 @@ sub create_parameterlist($$$$) { } elsif ($arg =~ m/\(.+\)\s*\(/) { # pointer-to-function $arg =~ tr/#/,/; - $arg =~ m/[^\(]+\(\*?\s*([\w\.]*)\s*\)/; + $arg =~ m/[^\(]+\(\*?\s*([\w\[\]\.]*)\s*\)/; $param = $1; $type = $arg; $type =~ s/([^\(]+\(\*?)\s*$param/$1/;