diff --git a/Documentation/admin-guide/cgroup-v1/hugetlb.rst b/Documentation/admin-guide/cgroup-v1/hugetlb.rst index 338f2c7d7a1c..0fa724d82abb 100644 --- a/Documentation/admin-guide/cgroup-v1/hugetlb.rst +++ b/Documentation/admin-guide/cgroup-v1/hugetlb.rst @@ -29,12 +29,14 @@ Brief summary of control files:: hugetlb..max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded hugetlb..usage_in_bytes # show current usage for "hugepagesize" hugetlb hugetlb..failcnt # show the number of allocation failure due to HugeTLB usage limit + hugetlb..numa_stat # show the numa information of the hugetlb memory charged to this cgroup For a system supporting three hugepage sizes (64k, 32M and 1G), the control files include:: hugetlb.1GB.limit_in_bytes hugetlb.1GB.max_usage_in_bytes + hugetlb.1GB.numa_stat hugetlb.1GB.usage_in_bytes hugetlb.1GB.failcnt hugetlb.1GB.rsvd.limit_in_bytes @@ -43,6 +45,7 @@ files include:: hugetlb.1GB.rsvd.failcnt hugetlb.64KB.limit_in_bytes hugetlb.64KB.max_usage_in_bytes + hugetlb.64KB.numa_stat hugetlb.64KB.usage_in_bytes hugetlb.64KB.failcnt hugetlb.64KB.rsvd.limit_in_bytes @@ -51,6 +54,7 @@ files include:: hugetlb.64KB.rsvd.failcnt hugetlb.32MB.limit_in_bytes hugetlb.32MB.max_usage_in_bytes + hugetlb.32MB.numa_stat hugetlb.32MB.usage_in_bytes hugetlb.32MB.failcnt hugetlb.32MB.rsvd.limit_in_bytes diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 4f400b03dddf..5aa368d165da 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -2266,6 +2266,11 @@ HugeTLB Interface Files are local to the cgroup i.e. not hierarchical. The file modified event generated on this file reflects only the local events. + hugetlb..numa_stat + Similar to memory.numa_stat, it shows the numa information of the + hugetlb pages of in this cgroup. Only active in + use hugetlb pages are included. The per-node values are in bytes. + Misc ---- diff --git a/Documentation/admin-guide/mm/damon/reclaim.rst b/Documentation/admin-guide/mm/damon/reclaim.rst index fb9def3a7355..0af51a9705b1 100644 --- a/Documentation/admin-guide/mm/damon/reclaim.rst +++ b/Documentation/admin-guide/mm/damon/reclaim.rst @@ -208,6 +208,31 @@ PID of the DAMON thread. If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else, -1. +nr_reclaim_tried_regions +------------------------ + +Number of memory regions that tried to be reclaimed by DAMON_RECLAIM. + +bytes_reclaim_tried_regions +--------------------------- + +Total bytes of memory regions that tried to be reclaimed by DAMON_RECLAIM. + +nr_reclaimed_regions +-------------------- + +Number of memory regions that successfully be reclaimed by DAMON_RECLAIM. + +bytes_reclaimed_regions +----------------------- + +Total bytes of memory regions that successfully be reclaimed by DAMON_RECLAIM. + +nr_quota_exceeds +---------------- + +Number of times that the time/space quota limits have exceeded. + Example ======= diff --git a/Documentation/admin-guide/mm/damon/usage.rst b/Documentation/admin-guide/mm/damon/usage.rst index ed96bbf0daff..59b84904a854 100644 --- a/Documentation/admin-guide/mm/damon/usage.rst +++ b/Documentation/admin-guide/mm/damon/usage.rst @@ -7,37 +7,40 @@ Detailed Usages DAMON provides below three interfaces for different users. - *DAMON user space tool.* - This is for privileged people such as system administrators who want a - just-working human-friendly interface. Using this, users can use the DAMON’s - major features in a human-friendly way. It may not be highly tuned for - special cases, though. It supports both virtual and physical address spaces - monitoring. + `This `_ is for privileged people such as + system administrators who want a just-working human-friendly interface. + Using this, users can use the DAMON’s major features in a human-friendly way. + It may not be highly tuned for special cases, though. It supports both + virtual and physical address spaces monitoring. For more detail, please + refer to its `usage document + `_. - *debugfs interface.* - This is for privileged user space programmers who want more optimized use of - DAMON. Using this, users can use DAMON’s major features by reading - from and writing to special debugfs files. Therefore, you can write and use - your personalized DAMON debugfs wrapper programs that reads/writes the - debugfs files instead of you. The DAMON user space tool is also a reference - implementation of such programs. It supports both virtual and physical - address spaces monitoring. + :ref:`This ` is for privileged user space programmers who + want more optimized use of DAMON. Using this, users can use DAMON’s major + features by reading from and writing to special debugfs files. Therefore, + you can write and use your personalized DAMON debugfs wrapper programs that + reads/writes the debugfs files instead of you. The `DAMON user space tool + `_ is one example of such programs. It + supports both virtual and physical address spaces monitoring. Note that this + interface provides only simple :ref:`statistics ` for the + monitoring results. For detailed monitoring results, DAMON provides a + :ref:`tracepoint `. - *Kernel Space Programming Interface.* - This is for kernel space programmers. Using this, users can utilize every - feature of DAMON most flexibly and efficiently by writing kernel space - DAMON application programs for you. You can even extend DAMON for various - address spaces. + :doc:`This ` is for kernel space programmers. Using this, + users can utilize every feature of DAMON most flexibly and efficiently by + writing kernel space DAMON application programs for you. You can even extend + DAMON for various address spaces. For detail, please refer to the interface + :doc:`document `. -Nevertheless, you could write your own user space tool using the debugfs -interface. A reference implementation is available at -https://github.com/awslabs/damo. If you are a kernel programmer, you could -refer to :doc:`/vm/damon/api` for the kernel space programming interface. For -the reason, this document describes only the debugfs interface + +.. _debugfs_interface: debugfs Interface ================= -DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``, -``schemes`` and ``monitor_on`` under its debugfs directory, -``/damon/``. +DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``, +``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and +``rm_contexts`` under its debugfs directory, ``/damon/``. Attributes @@ -131,24 +134,38 @@ Schemes For usual DAMON-based data access aware memory management optimizations, users would simply want the system to apply a memory management action to a memory -region of a specific size having a specific access frequency for a specific -time. DAMON receives such formalized operation schemes from the user and -applies those to the target processes. It also counts the total number and -size of regions that each scheme is applied. This statistics can be used for -online analysis or tuning of the schemes. +region of a specific access pattern. DAMON receives such formalized operation +schemes from the user and applies those to the target processes. Users can get and set the schemes by reading from and writing to ``schemes`` debugfs file. Reading the file also shows the statistics of each scheme. To -the file, each of the schemes should be represented in each line in below form: +the file, each of the schemes should be represented in each line in below +form:: - min-size max-size min-acc max-acc min-age max-age action + -Note that the ranges are closed interval. Bytes for the size of regions -(``min-size`` and ``max-size``), number of monitored accesses per aggregate -interval for access frequency (``min-acc`` and ``max-acc``), number of -aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a -predefined integer for memory management actions should be used. The supported -numbers and their meanings are as below. +You can disable schemes by simply writing an empty string to the file. + +Target Access Pattern +~~~~~~~~~~~~~~~~~~~~~ + +The ```` is constructed with three ranges in below +form:: + + min-size max-size min-acc max-acc min-age max-age + +Specifically, bytes for the size of regions (``min-size`` and ``max-size``), +number of monitored accesses per aggregate interval for access frequency +(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of +regions (``min-age`` and ``max-age``) are specified. Note that the ranges are +closed interval. + +Action +~~~~~~ + +The ```` is a predefined integer for memory management actions, which +DAMON will apply to the regions having the target access pattern. The +supported numbers and their meanings are as below. - 0: Call ``madvise()`` for the region with ``MADV_WILLNEED`` - 1: Call ``madvise()`` for the region with ``MADV_COLD`` @@ -157,20 +174,82 @@ numbers and their meanings are as below. - 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE`` - 5: Do nothing but count the statistics -You can disable schemes by simply writing an empty string to the file. For -example, below commands applies a scheme saying "If a memory region of size in -[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate -interval in [10, 20], page out the region", check the entered scheme again, and -finally remove the scheme. :: +Quota +~~~~~ + +Optimal ``target access pattern`` for each ``action`` is workload dependent, so +not easy to find. Worse yet, setting a scheme of some action too aggressive +can cause severe overhead. To avoid such overhead, users can limit time and +size quota for the scheme via the ```` in below form:: + + + +This makes DAMON to try to use only up to ```` milliseconds for applying +the action to memory regions of the ``target access pattern`` within the +```` milliseconds, and to apply the action to only up to +```` bytes of memory regions within the ````. Setting both +```` and ```` zero disables the quota limits. + +When the quota limit is expected to be exceeded, DAMON prioritizes found memory +regions of the ``target access pattern`` based on their size, access frequency, +and age. For personalized prioritization, users can set the weights for the +three properties in ```` in below form:: + + + +Watermarks +~~~~~~~~~~ + +Some schemes would need to run based on current value of the system's specific +metrics like free memory ratio. For such cases, users can specify watermarks +for the condition.:: + + + +```` is a predefined integer for the metric to be checked. The +supported numbers and their meanings are as below. + + - 0: Ignore the watermarks + - 1: System's free memory rate (per thousand) + +The value of the metric is checked every ```` microseconds. + +If the value is higher than ```` or lower than ````, the +scheme is deactivated. If the value is lower than ````, the scheme +is activated. + +.. _damos_stats: + +Statistics +~~~~~~~~~~ + +It also counts the total number and bytes of regions that each scheme is tried +to be applied, the two numbers for the regions that each scheme is successfully +applied, and the total number of the quota limit exceeds. This statistics can +be used for online analysis or tuning of the schemes. + +The statistics can be shown by reading the ``schemes`` file. Reading the file +will show each scheme you entered in each line, and the five numbers for the +statistics will be added at the end of each line. + +Example +~~~~~~~ + +Below commands applies a scheme saying "If a memory region of size in [4KiB, +8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate +interval in [10, 20], page out the region. For the paging out, use only up to +10ms per second, and also don't page out more than 1GiB per second. Under the +limitation, page out memory regions having longer age first. Also, check the +free memory rate of the system every 5 seconds, start the monitoring and paging +out when the free memory rate becomes lower than 50%, but stop it if the free +memory rate becomes larger than 60%, or lower than 30%".:: # cd /damon - # echo "4096 8192 0 5 10 20 2" > schemes - # cat schemes - 4096 8192 0 5 10 20 2 0 0 - # echo > schemes - -The last two integers in the 4th line of above example is the total number and -the total size of the regions that the scheme is applied. + # scheme="4096 8192 0 5 10 20 2" # target access pattern and action + # scheme+=" 10 $((1024*1024*1024)) 1000" # quotas + # scheme+=" 0 0 100" # prioritization weights + # scheme+=" 1 5000000 600 500 300" # watermarks + # echo "$scheme" > schemes Turning On/Off @@ -195,6 +274,54 @@ the monitoring is turned on. If you write to the files while DAMON is running, an error code such as ``-EBUSY`` will be returned. +Monitoring Thread PID +--------------------- + +DAMON does requested monitoring with a kernel thread called ``kdamond``. You +can get the pid of the thread by reading the ``kdamond_pid`` file. When the +monitoring is turned off, reading the file returns ``none``. :: + + # cd /damon + # cat monitor_on + off + # cat kdamond_pid + none + # echo on > monitor_on + # cat kdamond_pid + 18594 + + +Using Multiple Monitoring Threads +--------------------------------- + +One ``kdamond`` thread is created for each monitoring context. You can create +and remove monitoring contexts for multiple ``kdamond`` required use case using +the ``mk_contexts`` and ``rm_contexts`` files. + +Writing the name of the new context to the ``mk_contexts`` file creates a +directory of the name on the DAMON debugfs directory. The directory will have +DAMON debugfs files for the context. :: + + # cd /damon + # ls foo + # ls: cannot access 'foo': No such file or directory + # echo foo > mk_contexts + # ls foo + # attrs init_regions kdamond_pid schemes target_ids + +If the context is not needed anymore, you can remove it and the corresponding +directory by putting the name of the context to the ``rm_contexts`` file. :: + + # echo foo > rm_contexts + # ls foo + # ls: cannot access 'foo': No such file or directory + +Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the +root directory only. + + +.. _tracepoint: + Tracepoint for Monitoring Results ================================= diff --git a/Documentation/admin-guide/mm/numa_memory_policy.rst b/Documentation/admin-guide/mm/numa_memory_policy.rst index 64fd0ba0d057..5a6afecbb0d0 100644 --- a/Documentation/admin-guide/mm/numa_memory_policy.rst +++ b/Documentation/admin-guide/mm/numa_memory_policy.rst @@ -408,7 +408,7 @@ follows: Memory Policy APIs ================== -Linux supports 3 system calls for controlling memory policy. These APIS +Linux supports 4 system calls for controlling memory policy. These APIS always affect only the calling task, the calling task's address space, or some shared object mapped into the calling task's address space. @@ -460,6 +460,20 @@ requested via the 'flags' argument. See the mbind(2) man page for more details. +Set home node for a Range of Task's Address Spacec:: + + long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); + +sys_set_mempolicy_home_node set the home node for a VMA policy present in the +task's address range. The system call updates the home node only for the existing +mempolicy range. Other address ranges are ignored. A home node is the NUMA node +closest to which page allocation will come from. Specifying the home node override +the default allocation policy to allocate memory close to the local node for an +executing CPU. + + Memory Policy Command Line Interface ==================================== diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst index 5e795202111f..f4804ce37c58 100644 --- a/Documentation/admin-guide/sysctl/vm.rst +++ b/Documentation/admin-guide/sysctl/vm.rst @@ -948,7 +948,7 @@ how much memory needs to be free before kswapd goes back to sleep. The unit is in fractions of 10,000. The default value of 10 means the distances between watermarks are 0.1% of the available memory in the -node/system. The maximum value is 1000, or 10% of memory. +node/system. The maximum value is 3000, or 30% of memory. A high rate of threads entering direct reclaim (allocstall) or kswapd going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate diff --git a/Documentation/vm/arch_pgtable_helpers.rst b/Documentation/vm/arch_pgtable_helpers.rst index b3166c33db39..f8b225fc9190 100644 --- a/Documentation/vm/arch_pgtable_helpers.rst +++ b/Documentation/vm/arch_pgtable_helpers.rst @@ -66,9 +66,11 @@ PTE Page Table Helpers +---------------------------+--------------------------------------------------+ | pte_mknotpresent | Invalidates a mapped PTE | +---------------------------+--------------------------------------------------+ -| ptep_get_and_clear | Clears a PTE | +| ptep_clear | Clears a PTE | +---------------------------+--------------------------------------------------+ -| ptep_get_and_clear_full | Clears a PTE | +| ptep_get_and_clear | Clears and returns PTE | ++---------------------------+--------------------------------------------------+ +| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) | +---------------------------+--------------------------------------------------+ | ptep_test_and_clear_young | Clears young from a PTE | +---------------------------+--------------------------------------------------+ diff --git a/Documentation/vm/index.rst b/Documentation/vm/index.rst index 6f5ffef4b716..932440805453 100644 --- a/Documentation/vm/index.rst +++ b/Documentation/vm/index.rst @@ -31,10 +31,12 @@ algorithms. If you are looking for advice on simply allocating memory, see the page_migration page_frags page_owner + page_table_check remap_file_pages slub split_page_table_lock transhuge unevictable-lru + vmalloced-kernel-stacks z3fold zsmalloc diff --git a/Documentation/vm/page_migration.rst b/Documentation/vm/page_migration.rst index 08810f549f70..8c5cb8147e55 100644 --- a/Documentation/vm/page_migration.rst +++ b/Documentation/vm/page_migration.rst @@ -263,15 +263,15 @@ Monitoring Migration The following events (counters) can be used to monitor page migration. 1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a - page was migrated. If the page was a non-THP page, then this counter is - increased by one. If the page was a THP, then this counter is increased by - the number of THP subpages. For example, migration of a single 2MB THP that - has 4KB-size base pages (subpages) will cause this counter to increase by - 512. + page was migrated. If the page was a non-THP and non-hugetlb page, then + this counter is increased by one. If the page was a THP or hugetlb, then + this counter is increased by the number of THP or hugetlb subpages. + For example, migration of a single 2MB THP that has 4KB-size base pages + (subpages) will cause this counter to increase by 512. 2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages, - if it was a THP. + if it was a THP or hugetlb. 3. THP_MIGRATION_SUCCESS: A THP was migrated without being split. diff --git a/Documentation/vm/page_table_check.rst b/Documentation/vm/page_table_check.rst new file mode 100644 index 000000000000..81f521ff7ea7 --- /dev/null +++ b/Documentation/vm/page_table_check.rst @@ -0,0 +1,56 @@ +.. SPDX-License-Identifier: GPL-2.0 + +.. _page_table_check: + +================ +Page Table Check +================ + +Introduction +============ + +Page table check allows to hardern the kernel by ensuring that some types of +the memory corruptions are prevented. + +Page table check performs extra verifications at the time when new pages become +accessible from the userspace by getting their page table entries (PTEs PMDs +etc.) added into the table. + +In case of detected corruption, the kernel is crashed. There is a small +performance and memory overhead associated with the page table check. Therefore, +it is disabled by default, but can be optionally enabled on systems where the +extra hardening outweighs the performance costs. Also, because page table check +is synchronous, it can help with debugging double map memory corruption issues, +by crashing kernel at the time wrong mapping occurs instead of later which is +often the case with memory corruptions bugs. + +Double mapping detection logic +============================== + ++-------------------+-------------------+-------------------+------------------+ +| Current Mapping | New mapping | Permissions | Rule | ++===================+===================+===================+==================+ +| Anonymous | Anonymous | Read | Allow | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Anonymous | Read / Write | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Anonymous | Named | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Anonymous | Any | Prohibit | ++-------------------+-------------------+-------------------+------------------+ +| Named | Named | Any | Allow | ++-------------------+-------------------+-------------------+------------------+ + +Enabling Page Table Check +========================= + +Build kernel with: + +- PAGE_TABLE_CHECK=y + Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK + is available. + +- Boot with 'page_table_check=on' kernel parameter. + +Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page +table support without extra kernel parameter. diff --git a/Documentation/vm/vmalloced-kernel-stacks.rst b/Documentation/vm/vmalloced-kernel-stacks.rst new file mode 100644 index 000000000000..fc8c67833af6 --- /dev/null +++ b/Documentation/vm/vmalloced-kernel-stacks.rst @@ -0,0 +1,153 @@ +.. SPDX-License-Identifier: GPL-2.0 + +===================================== +Virtually Mapped Kernel Stack Support +===================================== + +:Author: Shuah Khan + +.. contents:: :local: + +Overview +-------- + +This is a compilation of information from the code and original patch +series that introduced the `Virtually Mapped Kernel Stacks feature +` + +Introduction +------------ + +Kernel stack overflows are often hard to debug and make the kernel +susceptible to exploits. Problems could show up at a later time making +it difficult to isolate and root-cause. + +Virtually-mapped kernel stacks with guard pages causes kernel stack +overflows to be caught immediately rather than causing difficult to +diagnose corruptions. + +HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable +support for virtually mapped stacks with guard pages. This feature +causes reliable faults when the stack overflows. The usability of +the stack trace after overflow and response to the overflow itself +is architecture dependent. + +.. note:: + As of this writing, arm64, powerpc, riscv, s390, um, and x86 have + support for VMAP_STACK. + +HAVE_ARCH_VMAP_STACK +-------------------- + +Architectures that can support Virtually Mapped Kernel Stacks should +enable this bool configuration option. The requirements are: + +- vmalloc space must be large enough to hold many kernel stacks. This + may rule out many 32-bit architectures. +- Stacks in vmalloc space need to work reliably. For example, if + vmap page tables are created on demand, either this mechanism + needs to work while the stack points to a virtual address with + unpopulated page tables or arch code (switch_to() and switch_mm(), + most likely) needs to ensure that the stack's page table entries + are populated before running on a possibly unpopulated stack. +- If the stack overflows into a guard page, something reasonable + should happen. The definition of "reasonable" is flexible, but + instantly rebooting without logging anything would be unfriendly. + +VMAP_STACK +---------- + +VMAP_STACK bool configuration option when enabled allocates virtually +mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK. + +- Enable this if you want the use virtually-mapped kernel stacks + with guard pages. This causes kernel stack overflows to be caught + immediately rather than causing difficult-to-diagnose corruption. + +.. note:: + + Using this feature with KASAN requires architecture support + for backing virtual mappings with real shadow memory, and + KASAN_VMALLOC must be enabled. + +.. note:: + + VMAP_STACK is enabled, it is not possible to run DMA on stack + allocated data. + +Kernel configuration options and dependencies keep changing. Refer to +the latest code base: + +`Kconfig ` + +Allocation +----------- + +When a new kernel thread is created, thread stack is allocated from +virtually contiguous memory pages from the page level allocator. These +pages are mapped into contiguous kernel virtual space with PAGE_KERNEL +protections. + +alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack +with PAGE_KERNEL protections. + +- Allocated stacks are cached and later reused by new threads, so memcg + accounting is performed manually on assigning/releasing stacks to tasks. + Hence, __vmalloc_node_range is called without __GFP_ACCOUNT. +- vm_struct is cached to be able to find when thread free is initiated + in interrupt context. free_thread_stack() can be called in interrupt + context. +- On arm64, all VMAP's stacks need to have the same alignment to ensure + that VMAP'd stack overflow detection works correctly. Arch specific + vmap stack allocator takes care of this detail. +- This does not address interrupt stacks - according to the original patch + +Thread stack allocation is initiated from clone(), fork(), vfork(), +kernel_thread() via kernel_clone(). Leaving a few hints for searching +the code base to understand when and how thread stack is allocated. + +Bulk of the code is in: +`kernel/fork.c `. + +stack_vm_area pointer in task_struct keeps track of the virtually allocated +stack and a non-null stack_vm_area pointer serves as a indication that the +virtually mapped kernel stacks are enabled. + +:: + + struct vm_struct *stack_vm_area; + +Stack overflow handling +----------------------- + +Leading and trailing guard pages help detect stack overflows. When stack +overflows into the guard pages, handlers have to be careful not overflow +the stack again. When handlers are called, it is likely that very little +stack space is left. + +On x86, this is done by handling the page fault indicating the kernel +stack overflow on the double-fault stack. + +Testing VMAP allocation with guard pages +---------------------------------------- + +How do we ensure that VMAP_STACK is actually allocating with a leading +and trailing guard page? The following lkdtm tests can help detect any +regressions. + +:: + + void lkdtm_STACK_GUARD_PAGE_LEADING() + void lkdtm_STACK_GUARD_PAGE_TRAILING() + +Conclusions +----------- + +- A percpu cache of vmalloced stacks appears to be a bit faster than a + high-order stack allocation, at least when the cache hits. +- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and + simply embed the thread_info (containing only flags) and 'int cpu' into + task_struct. +- The thread stack can be free'ed as soon as the task is dead (without + waiting for RCU) and then, if vmapped stacks are in use, cache the + entire stack for reuse on the same cpu. diff --git a/MAINTAINERS b/MAINTAINERS index 7a5c5ba97f80..5556b170a61d 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -14548,6 +14548,15 @@ F: include/net/page_pool.h F: include/trace/events/page_pool.h F: net/core/page_pool.c +PAGE TABLE CHECK +M: Pasha Tatashin +M: Andrew Morton +L: linux-mm@kvack.org +S: Maintained +F: Documentation/vm/page_table_check.rst +F: include/linux/page_table_check.h +F: mm/page_table_check.c + PANASONIC LAPTOP ACPI EXTRAS DRIVER M: Kenneth Chan L: platform-driver-x86@vger.kernel.org diff --git a/arch/Kconfig b/arch/Kconfig index 847fde3d22cd..5a1692392a4d 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -1297,6 +1297,9 @@ config HAVE_ARCH_PFN_VALID config ARCH_SUPPORTS_DEBUG_PAGEALLOC bool +config ARCH_SUPPORTS_PAGE_TABLE_CHECK + bool + config ARCH_SPLIT_ARG64 bool help diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ca5a32228cd6..3515bc4f16a4 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -489,3 +489,4 @@ # 557 reserved for memfd_secret 558 common process_mrelease sys_process_mrelease 559 common futex_waitv sys_futex_waitv +560 common set_mempolicy_home_node sys_ni_syscall diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 543100151f2b..ac964612d8b0 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -463,3 +463,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 6bdb5f5db438..4e65da3445c7 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 450 +#define __NR_compat_syscalls 451 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 41ea1195e44b..604a2053d006 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -905,6 +905,8 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self) __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 707ae121f6d3..78b1d03e86e1 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -370,3 +370,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 45bc32a41b90..b1f3940bc298 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -449,3 +449,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index 2204bde3ce4a..820145e47350 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -455,3 +455,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index 72d02d363f36..253ff994ed2e 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -388,3 +388,4 @@ # 447 reserved for memfd_secret 448 n32 process_mrelease sys_process_mrelease 449 n32 futex_waitv sys_futex_waitv +450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index e2c481fcede6..3f1886ad9d80 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -364,3 +364,4 @@ # 447 reserved for memfd_secret 448 n64 process_mrelease sys_process_mrelease 449 n64 futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index 3714c97b2643..8f243e35a7b2 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -437,3 +437,4 @@ # 447 reserved for memfd_secret 448 o32 process_mrelease sys_process_mrelease 449 o32 futex_waitv sys_futex_waitv +450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index 358c00000755..68b46fe2f17c 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -447,3 +447,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 15109af9d075..2600b4237292 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -529,3 +529,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index ed9c5c2eafad..799147658dee 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index d9539d28bdaa..2de85c977f54 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -452,3 +452,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 46adabcb1720..4398cc6fb68d 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -495,3 +495,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 976dd6b532bf..407533c835fe 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -104,6 +104,7 @@ config X86 select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_DEBUG_PAGEALLOC + select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64 select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096 select ARCH_SUPPORTS_LTO_CLANG diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 7e25543693de..320480a8db4f 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -454,3 +454,4 @@ 447 i386 memfd_secret sys_memfd_secret 448 i386 process_mrelease sys_process_mrelease 449 i386 futex_waitv sys_futex_waitv +450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index fe8f8dd157b4..c84d12608cd2 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -371,6 +371,7 @@ 447 common memfd_secret sys_memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node # # Due to a historical design error, certain syscalls are numbered differently diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index dea9fe8a56cc..8a9432fb3802 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -27,6 +27,7 @@ #include #include #include +#include extern pgd_t early_top_pgt[PTRS_PER_PGD]; bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd); @@ -1007,18 +1008,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp) static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { + page_table_check_pte_set(mm, addr, ptep, pte); set_pte(ptep, pte); } static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(mm, addr, pmdp, pmd); set_pmd(pmdp, pmd); } static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, pud_t *pudp, pud_t pud) { + page_table_check_pud_set(mm, addr, pudp, pud); native_set_pud(pudp, pud); } @@ -1049,6 +1053,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); return pte; } @@ -1064,12 +1069,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm, * care about updates and native needs no locking */ pte = native_local_ptep_get_and_clear(ptep); + page_table_check_pte_clear(mm, addr, pte); } else { pte = ptep_get_and_clear(mm, addr, ptep); } return pte; } +#define __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK)) + ptep_get_and_clear(mm, addr, ptep); + else + pte_clear(mm, addr, ptep); +} + #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long addr, pte_t *ptep) @@ -1110,14 +1126,22 @@ static inline int pmd_write(pmd_t pmd) static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp) { - return native_pmdp_get_and_clear(pmdp); + pmd_t pmd = native_pmdp_get_and_clear(pmdp); + + page_table_check_pmd_clear(mm, addr, pmd); + + return pmd; } #define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pud_t *pudp) { - return native_pudp_get_and_clear(pudp); + pud_t pud = native_pudp_get_and_clear(pudp); + + page_table_check_pud_clear(mm, addr, pud); + + return pud; } #define __HAVE_ARCH_PMDP_SET_WRPROTECT @@ -1138,6 +1162,7 @@ static inline int pud_write(pud_t pud) static inline pmd_t pmdp_establish(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp, pmd_t pmd) { + page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd); if (IS_ENABLED(CONFIG_SMP)) { return xchg(pmdp, pmd); } else { diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 3e3e1a506bed..52c94ab5c205 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -420,3 +420,4 @@ # 447 reserved for memfd_secret 448 common process_mrelease sys_process_mrelease 449 common futex_waitv sys_futex_waitv +450 common set_mempolicy_home_node sys_set_mempolicy_home_node diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index f6da5293b913..cb253d80d72b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -1903,14 +1903,7 @@ static struct attribute *zram_disk_attrs[] = { NULL, }; -static const struct attribute_group zram_disk_attr_group = { - .attrs = zram_disk_attrs, -}; - -static const struct attribute_group *zram_disk_attr_groups[] = { - &zram_disk_attr_group, - NULL, -}; +ATTRIBUTE_GROUPS(zram_disk); /* * Allocate and initialize new zram device. the function returns @@ -1983,7 +1976,7 @@ static int zram_add(void) blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX); blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue); - ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups); + ret = device_add_disk(NULL, zram->disk, zram_disk_groups); if (ret) goto out_cleanup_disk; diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 1077ce7e189f..74c91da585d7 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -27,8 +27,8 @@ #include #include #include -#include #include +#include #include "ext4_jbd2.h" #include "ext4_extents.h" #include "xattr.h" @@ -4404,8 +4404,7 @@ retry: err = ext4_es_remove_extent(inode, last_block, EXT_MAX_BLOCKS - last_block); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) @@ -4413,8 +4412,7 @@ retry: retry_remove_space: err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry_remove_space; } return err; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index ec2b21ff19d8..fb073da56b99 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -7,7 +7,7 @@ #include #include #include -#include +#include #include "ext4_jbd2.h" #include "ext4.h" @@ -1943,8 +1943,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline) retry: err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (err == -ENOMEM) { - cond_resched(); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(GFP_ATOMIC); goto retry; } if (err) diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c index 9cb261714991..1d370364230e 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include "ext4_jbd2.h" #include "xattr.h" @@ -523,12 +523,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io, ret = PTR_ERR(bounce_page); if (ret == -ENOMEM && (io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) { - gfp_flags = GFP_NOFS; + gfp_t new_gfp_flags = GFP_NOFS; if (io->io_bio) ext4_io_submit(io); else - gfp_flags |= __GFP_NOFAIL; - congestion_wait(BLK_RW_ASYNC, HZ/50); + new_gfp_flags |= __GFP_NOFAIL; + memalloc_retry_wait(gfp_flags); + gfp_flags = new_gfp_flags; goto retry_encrypt; } diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 3bd0cc2c1d2e..9b02ad3303de 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -8,9 +8,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -2547,7 +2547,7 @@ retry_encrypt: /* flush pending IOs and wait for a while in the ENOMEM case */ if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { f2fs_flush_merged_writes(fio->sbi); - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); gfp_flags |= __GFP_NOFAIL; goto retry_encrypt; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index a946ce0ead34..374bbb5294d9 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -7,7 +7,6 @@ */ #include #include -#include #include #include #include @@ -15,6 +14,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -1375,8 +1375,7 @@ retry: if (err) { clear_page_private_gcing(page); if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 0f8b2df3e1e0..4c11254a07d4 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -8,8 +8,8 @@ #include #include #include -#include #include +#include #include "f2fs.h" #include "node.h" @@ -562,7 +562,7 @@ retry: inode = f2fs_iget(sb, ino); if (IS_ERR(inode)) { if (PTR_ERR(inode) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 556fcd8457f3..219506ca9a97 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -8,7 +8,7 @@ #include #include #include -#include +#include #include #include #include @@ -2750,7 +2750,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) retry: ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); if (!ipage) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry; } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 6a1b4668d933..d1664a0567ef 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -8,6 +8,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" #include "segment.h" @@ -587,7 +588,7 @@ retry_dn: err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry_dn; } goto out; @@ -670,8 +671,7 @@ retry_prev: err = check_index_in_prev_nodes(sbi, dest, &dn); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto retry_prev; } goto err; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index df9ed75f0b7a..40fdb4a8daeb 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -245,9 +246,7 @@ retry: LOOKUP_NODE); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); + memalloc_retry_wait(GFP_NOFS); goto retry; } err = -EAGAIN; @@ -424,9 +423,7 @@ retry: err = f2fs_do_write_data_page(&fio); if (err) { if (err == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); - cond_resched(); + memalloc_retry_wait(GFP_NOFS); goto retry; } unlock_page(page); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 040b6d02e1d8..3bace24f8800 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -8,9 +8,9 @@ #include #include #include +#include #include #include -#include #include #include #include @@ -2415,8 +2415,7 @@ repeat: page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); if (IS_ERR(page)) { if (PTR_ERR(page) == -ENOMEM) { - congestion_wait(BLK_RW_ASYNC, - DEFAULT_IO_TIMEOUT); + memalloc_retry_wait(GFP_NOFS); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 49d2e686be74..a7c6c7498be0 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end) struct vm_area_struct *vma; /* - * end == 0 indicates that the entire range after - * start should be unmapped. + * end == 0 indicates that the entire range after start should be + * unmapped. Note, end is exclusive, whereas the interval tree takes + * an inclusive "last". */ - vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) { + vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) { unsigned long v_offset; unsigned long v_end; diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 6f49bf39183c..c557a030acfe 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -4,7 +4,6 @@ * All Rights Reserved. */ #include "xfs.h" -#include #include "xfs_message.h" #include "xfs_trace.h" @@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags) "%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)", current->comm, current->pid, (unsigned int)size, __func__, lflags); - congestion_wait(BLK_RW_ASYNC, HZ/50); + memalloc_retry_wait(lflags); } while (1); } diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index bbb0fbd34e64..b45e0d50a405 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -394,7 +394,7 @@ xfs_buf_alloc_pages( } XFS_STATS_INC(bp->b_mount, xb_page_retries); - congestion_wait(BLK_RW_ASYNC, HZ / 50); + memalloc_retry_wait(gfp_mask); } return 0; } diff --git a/include/linux/ceph/libceph.h b/include/linux/ceph/libceph.h index 409d8c29bc4f..309acbcb5a8a 100644 --- a/include/linux/ceph/libceph.h +++ b/include/linux/ceph/libceph.h @@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data); extern const char *ceph_msg_type_name(int type); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); -extern void *ceph_kvmalloc(size_t size, gfp_t flags); struct fs_parameter; struct fc_log; diff --git a/include/linux/damon.h b/include/linux/damon.h index b4d4be3cc987..5e1e3a128b77 100644 --- a/include/linux/damon.h +++ b/include/linux/damon.h @@ -11,12 +11,19 @@ #include #include #include +#include /* Minimal region size. Every damon_region is aligned by this. */ #define DAMON_MIN_REGION PAGE_SIZE /* Max priority score for DAMON-based operation schemes */ #define DAMOS_MAX_SCORE (99) +/* Get a random number in [l, r) */ +static inline unsigned long damon_rand(unsigned long l, unsigned long r) +{ + return l + prandom_u32_max(r - l); +} + /** * struct damon_addr_range - Represents an address region of [@start, @end). * @start: Start address of the region (inclusive). @@ -185,6 +192,22 @@ struct damos_watermarks { bool activated; }; +/** + * struct damos_stat - Statistics on a given scheme. + * @nr_tried: Total number of regions that the scheme is tried to be applied. + * @sz_tried: Total size of regions that the scheme is tried to be applied. + * @nr_applied: Total number of regions that the scheme is applied. + * @sz_applied: Total size of regions that the scheme is applied. + * @qt_exceeds: Total number of times the quota of the scheme has exceeded. + */ +struct damos_stat { + unsigned long nr_tried; + unsigned long sz_tried; + unsigned long nr_applied; + unsigned long sz_applied; + unsigned long qt_exceeds; +}; + /** * struct damos - Represents a Data Access Monitoring-based Operation Scheme. * @min_sz_region: Minimum size of target regions. @@ -196,8 +219,7 @@ struct damos_watermarks { * @action: &damo_action to be applied to the target regions. * @quota: Control the aggressiveness of this scheme. * @wmarks: Watermarks for automated (in)activation of this scheme. - * @stat_count: Total number of regions that this scheme is applied. - * @stat_sz: Total size of regions that this scheme is applied. + * @stat: Statistics of this scheme. * @list: List head for siblings. * * For each aggregation interval, DAMON finds regions which fit in the @@ -228,8 +250,7 @@ struct damos { enum damos_action action; struct damos_quota quota; struct damos_watermarks wmarks; - unsigned long stat_count; - unsigned long stat_sz; + struct damos_stat stat; struct list_head list; }; @@ -274,7 +295,8 @@ struct damon_ctx; * as an integer in [0, &DAMOS_MAX_SCORE]. * @apply_scheme is called from @kdamond when a region for user provided * DAMON-based operation scheme is found. It should apply the scheme's action - * to the region. This is not used for &DAMON_ARBITRARY_TARGET case. + * to the region and return bytes of the region that the action is successfully + * applied. * @target_valid should check whether the target is still valid for the * monitoring. * @cleanup is called from @kdamond just before its termination. @@ -288,8 +310,9 @@ struct damon_primitive { int (*get_scheme_score)(struct damon_ctx *context, struct damon_target *t, struct damon_region *r, struct damos *scheme); - int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); + unsigned long (*apply_scheme)(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme); bool (*target_valid)(void *target); void (*cleanup)(struct damon_ctx *context); }; @@ -392,14 +415,20 @@ struct damon_ctx { struct list_head schemes; }; -#define damon_next_region(r) \ - (container_of(r->list.next, struct damon_region, list)) +static inline struct damon_region *damon_next_region(struct damon_region *r) +{ + return container_of(r->list.next, struct damon_region, list); +} -#define damon_prev_region(r) \ - (container_of(r->list.prev, struct damon_region, list)) +static inline struct damon_region *damon_prev_region(struct damon_region *r) +{ + return container_of(r->list.prev, struct damon_region, list); +} -#define damon_last_region(t) \ - (list_last_entry(&t->regions_list, struct damon_region, list)) +static inline struct damon_region *damon_last_region(struct damon_target *t) +{ + return list_last_entry(&t->regions_list, struct damon_region, list); +} #define damon_for_each_region(r, t) \ list_for_each_entry(r, &t->regions_list, list) @@ -422,9 +451,18 @@ struct damon_ctx { #ifdef CONFIG_DAMON struct damon_region *damon_new_region(unsigned long start, unsigned long end); -inline void damon_insert_region(struct damon_region *r, + +/* + * Add a region between two other regions + */ +static inline void damon_insert_region(struct damon_region *r, struct damon_region *prev, struct damon_region *next, - struct damon_target *t); + struct damon_target *t) +{ + __list_add(&r->list, &prev->list, &next->list); + t->nr_regions++; +} + void damon_add_region(struct damon_region *r, struct damon_target *t); void damon_destroy_region(struct damon_region *r, struct damon_target *t); @@ -461,34 +499,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs); #endif /* CONFIG_DAMON */ #ifdef CONFIG_DAMON_VADDR - -/* Monitoring primitives for virtual memory address spaces */ -void damon_va_init(struct damon_ctx *ctx); -void damon_va_update(struct damon_ctx *ctx); -void damon_va_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_va_check_accesses(struct damon_ctx *ctx); bool damon_va_target_valid(void *t); -void damon_va_cleanup(struct damon_ctx *ctx); -int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_va_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_VADDR */ #ifdef CONFIG_DAMON_PADDR - -/* Monitoring primitives for the physical memory address space */ -void damon_pa_prepare_access_checks(struct damon_ctx *ctx); -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx); bool damon_pa_target_valid(void *t); -int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme); void damon_pa_set_primitives(struct damon_ctx *ctx); - #endif /* CONFIG_DAMON_PADDR */ #endif /* _DAMON_H */ diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 9baa0116b73d..eafb5b7485d5 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -302,7 +302,9 @@ struct vm_area_struct; * lowest zone as a type of emergency reserve. * * %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit - * address. + * address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory + * because the DMA32 kmalloc cache array is not implemented. + * (Reason: there is no such user in kernel). * * %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace, * do not need to be directly accessible by the kernel but that cannot @@ -589,9 +591,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order); struct folio *folio_alloc(gfp_t gfp, unsigned order); extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, struct vm_area_struct *vma, unsigned long addr, - int node, bool hugepage); + bool hugepage); #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ - alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true) + alloc_pages_vma(gfp_mask, order, vma, addr, true) #else static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order) { @@ -601,14 +603,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order) { return __folio_alloc_node(gfp, order, numa_node_id()); } -#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\ +#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\ alloc_pages(gfp_mask, order) #define alloc_hugepage_vma(gfp_mask, vma, addr, order) \ alloc_pages(gfp_mask, order) #endif #define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0) #define alloc_page_vma(gfp_mask, vma, addr) \ - alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false) + alloc_pages_vma(gfp_mask, 0, vma, addr, false) extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); extern unsigned long get_zeroed_page(gfp_t gfp_mask); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 00351ccb49a3..d1897a69c540 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -622,8 +622,8 @@ struct hstate { #endif #ifdef CONFIG_CGROUP_HUGETLB /* cgroup control files */ - struct cftype cgroup_files_dfl[7]; - struct cftype cgroup_files_legacy[9]; + struct cftype cgroup_files_dfl[8]; + struct cftype cgroup_files_legacy[10]; #endif char name[HSTATE_NAME_LEN]; }; diff --git a/include/linux/hugetlb_cgroup.h b/include/linux/hugetlb_cgroup.h index ba025ae27882..379344828e78 100644 --- a/include/linux/hugetlb_cgroup.h +++ b/include/linux/hugetlb_cgroup.h @@ -36,6 +36,11 @@ enum hugetlb_memory_event { HUGETLB_NR_MEMORY_EVENTS, }; +struct hugetlb_cgroup_per_node { + /* hugetlb usage in pages over all hstates. */ + unsigned long usage[HUGE_MAX_HSTATE]; +}; + struct hugetlb_cgroup { struct cgroup_subsys_state css; @@ -57,6 +62,8 @@ struct hugetlb_cgroup { /* Handle for "hugetlb.events.local" */ struct cgroup_file events_local_file[HUGE_MAX_HSTATE]; + + struct hugetlb_cgroup_per_node *nodeinfo[]; }; static inline struct hugetlb_cgroup * diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h index 3c7595e81150..668389b4b53d 100644 --- a/include/linux/mempolicy.h +++ b/include/linux/mempolicy.h @@ -46,6 +46,7 @@ struct mempolicy { unsigned short mode; /* See MPOL_* above */ unsigned short flags; /* See set_mempolicy() MPOL_F_* above */ nodemask_t nodes; /* interleave/bind/perfer */ + int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */ union { nodemask_t cpuset_mems_allowed; /* relative to these nodes */ diff --git a/include/linux/mm.h b/include/linux/mm.h index 34a92d9e8059..aa47705191bc 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -820,19 +820,15 @@ static inline int page_mapcount(struct page *page) #ifdef CONFIG_TRANSPARENT_HUGEPAGE int total_mapcount(struct page *page); -int page_trans_huge_mapcount(struct page *page, int *total_mapcount); +int page_trans_huge_mapcount(struct page *page); #else static inline int total_mapcount(struct page *page) { return page_mapcount(page); } -static inline int page_trans_huge_mapcount(struct page *page, - int *total_mapcount) +static inline int page_trans_huge_mapcount(struct page *page) { - int mapcount = page_mapcount(page); - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; + return page_mapcount(page); } #endif @@ -3112,7 +3108,6 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *, #endif void drop_slab(void); -void drop_slab_node(int nid); #ifndef CONFIG_MMU #define randomize_va_space 0 @@ -3165,6 +3160,7 @@ enum mf_flags { MF_ACTION_REQUIRED = 1 << 1, MF_MUST_KILL = 1 << 2, MF_SOFT_OFFLINE = 1 << 3, + MF_UNPOISON = 1 << 4, }; extern int memory_failure(unsigned long pfn, int flags); extern void memory_failure_queue(unsigned long pfn, int flags); @@ -3205,7 +3201,6 @@ enum mf_action_page_type { MF_MSG_KERNEL_HIGH_ORDER, MF_MSG_SLAB, MF_MSG_DIFFERENT_COMPOUND, - MF_MSG_POISONED_HUGE, MF_MSG_HUGE, MF_MSG_FREE_HUGE, MF_MSG_NON_PMD_HUGE, @@ -3220,7 +3215,6 @@ enum mf_action_page_type { MF_MSG_CLEAN_LRU, MF_MSG_TRUNCATED_LRU, MF_MSG_BUDDY, - MF_MSG_BUDDY_2ND, MF_MSG_DAX, MF_MSG_UNSPLIT_THP, MF_MSG_UNKNOWN, diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 07bfd6f99ca0..3764c1b51b02 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -647,7 +647,7 @@ struct mm_struct { atomic_t tlb_flush_pending; #ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH /* See flush_tlb_batched_pending() */ - bool tlb_flush_batched; + atomic_t tlb_flush_batched; #endif struct uprobes_state uprobes_state; #ifdef CONFIG_PREEMPT_RT diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 0098d6f62d72..7533138a5ee2 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1045,6 +1045,15 @@ static inline int is_highmem_idx(enum zone_type idx) #endif } +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void); +#else +static inline bool has_managed_dma(void) +{ + return false; +} +#endif + /** * is_highmem - helper function to quickly check if a struct zone is a * highmem zone or not. This is an attempt to keep references diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index b3d353d537e2..129421002443 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -380,7 +380,7 @@ static __always_inline int TestClearPage##uname(struct page *page) \ TESTCLEARFLAG(uname, lname, policy) #define TESTPAGEFLAG_FALSE(uname, lname) \ -static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \ +static inline bool folio_test_##lname(const struct folio *folio) { return false; } \ static inline int Page##uname(const struct page *page) { return 0; } #define SETPAGEFLAG_NOOP(uname, lname) \ @@ -519,7 +519,11 @@ PAGEFLAG_FALSE(Uncached, uncached) PAGEFLAG(HWPoison, hwpoison, PF_ANY) TESTSCFLAG(HWPoison, hwpoison, PF_ANY) #define __PG_HWPOISON (1UL << PG_hwpoison) +#define MAGIC_HWPOISON 0x48575053U /* HWPS */ +extern void SetPageHWPoisonTakenOff(struct page *page); +extern void ClearPageHWPoisonTakenOff(struct page *page); extern bool take_page_off_buddy(struct page *page); +extern bool put_page_back_buddy(struct page *page); #else PAGEFLAG_FALSE(HWPoison, hwpoison) #define __PG_HWPOISON 0 diff --git a/include/linux/page_idle.h b/include/linux/page_idle.h index 83abf95e9fa7..4663dfed1293 100644 --- a/include/linux/page_idle.h +++ b/include/linux/page_idle.h @@ -13,7 +13,6 @@ * If there is not enough space to store Idle and Young bits in page flags, use * page ext flags instead. */ -extern struct page_ext_operations page_idle_ops; static inline bool folio_test_young(struct folio *folio) { diff --git a/include/linux/page_table_check.h b/include/linux/page_table_check.h new file mode 100644 index 000000000000..38cace1da7b6 --- /dev/null +++ b/include/linux/page_table_check.h @@ -0,0 +1,147 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +/* + * Copyright (c) 2021, Google LLC. + * Pasha Tatashin + */ +#ifndef __LINUX_PAGE_TABLE_CHECK_H +#define __LINUX_PAGE_TABLE_CHECK_H + +#ifdef CONFIG_PAGE_TABLE_CHECK +#include + +extern struct static_key_true page_table_check_disabled; +extern struct page_ext_operations page_table_check_ops; + +void __page_table_check_zero(struct page *page, unsigned int order); +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte); +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd); +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud); +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte); +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd); +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud); + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_zero(page, order); +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_clear(mm, addr, pte); +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_clear(mm, addr, pmd); +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_clear(mm, addr, pud); +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pte_set(mm, addr, ptep, pte); +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pmd_set(mm, addr, pmdp, pmd); +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ + if (static_branch_likely(&page_table_check_disabled)) + return; + + __page_table_check_pud_set(mm, addr, pudp, pud); +} + +#else + +static inline void page_table_check_alloc(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_free(struct page *page, unsigned int order) +{ +} + +static inline void page_table_check_pte_clear(struct mm_struct *mm, + unsigned long addr, pte_t pte) +{ +} + +static inline void page_table_check_pmd_clear(struct mm_struct *mm, + unsigned long addr, pmd_t pmd) +{ +} + +static inline void page_table_check_pud_clear(struct mm_struct *mm, + unsigned long addr, pud_t pud) +{ +} + +static inline void page_table_check_pte_set(struct mm_struct *mm, + unsigned long addr, pte_t *ptep, + pte_t pte) +{ +} + +static inline void page_table_check_pmd_set(struct mm_struct *mm, + unsigned long addr, pmd_t *pmdp, + pmd_t pmd) +{ +} + +static inline void page_table_check_pud_set(struct mm_struct *mm, + unsigned long addr, pud_t *pudp, + pud_t pud) +{ +} + +#endif /* CONFIG_PAGE_TABLE_CHECK */ +#endif /* __LINUX_PAGE_TABLE_CHECK_H */ diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h index e24d2c992b11..bc8713a76e03 100644 --- a/include/linux/pgtable.h +++ b/include/linux/pgtable.h @@ -258,6 +258,14 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma, #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #endif +#ifndef __HAVE_ARCH_PTEP_CLEAR +static inline void ptep_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) +{ + pte_clear(mm, addr, ptep); +} +#endif + #ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index aca874d33fe6..aa5f09ca5bcf 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -214,6 +214,32 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif +/* Any memory-allocation retry loop should use + * memalloc_retry_wait(), and pass the flags for the most + * constrained allocation attempt that might have failed. + * This provides useful documentation of where loops are, + * and a central place to fine tune the waiting as the MM + * implementation changes. + */ +static inline void memalloc_retry_wait(gfp_t gfp_flags) +{ + /* We use io_schedule_timeout because waiting for memory + * typically included waiting for dirty pages to be + * written out, which requires IO. + */ + __set_current_state(TASK_UNINTERRUPTIBLE); + gfp_flags = current_gfp_context(gfp_flags); + if (gfpflags_allow_blocking(gfp_flags) && + !(gfp_flags & __GFP_NORETRY)) + /* Probably waited already, no need for much more */ + io_schedule_timeout(1); + else + /* Probably didn't wait, and has now released a lock, + * so now is a good time to wait + */ + io_schedule_timeout(HZ/50); +} + /** * might_alloc - Mark possible allocation sites * @gfp_mask: gfp_t flags that would be used to allocate diff --git a/include/linux/swap.h b/include/linux/swap.h index d1ea44b31f19..1d38d9475c4d 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -514,7 +514,7 @@ extern int __swp_swapcount(swp_entry_t entry); extern int swp_swapcount(swp_entry_t entry); extern struct swap_info_struct *page_swap_info(struct page *); extern struct swap_info_struct *swp_swap_info(swp_entry_t entry); -extern bool reuse_swap_page(struct page *, int *); +extern bool reuse_swap_page(struct page *); extern int try_to_free_swap(struct page *); struct backing_dev_info; extern int init_swap_address_space(unsigned int type, unsigned long nr_pages); @@ -680,8 +680,8 @@ static inline int swp_swapcount(swp_entry_t entry) return 0; } -#define reuse_swap_page(page, total_map_swapcount) \ - (page_trans_huge_mapcount(page, total_map_swapcount) == 1) +#define reuse_swap_page(page) \ + (page_trans_huge_mapcount(page) == 1) static inline int try_to_free_swap(struct page *page) { diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 528a478dbda8..819c0cb00b6d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1057,6 +1057,9 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru const void __user *rule_attr, __u32 flags); asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags); asmlinkage long sys_memfd_secret(unsigned int flags); +asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len, + unsigned long home_node, + unsigned long flags); /* * Architecture-specific system calls diff --git a/include/linux/vm_event_item.h b/include/linux/vm_event_item.h index a185cc75ff52..7b2363388bfa 100644 --- a/include/linux/vm_event_item.h +++ b/include/linux/vm_event_item.h @@ -98,6 +98,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT, THP_SPLIT_PAGE_FAILED, THP_DEFERRED_SPLIT_PAGE, THP_SPLIT_PMD, + THP_SCAN_EXCEED_NONE_PTE, + THP_SCAN_EXCEED_SWAP_PTE, + THP_SCAN_EXCEED_SHARED_PTE, #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD THP_SPLIT_PUD, #endif diff --git a/include/ras/ras_event.h b/include/ras/ras_event.h index 0bdbc0d17d2f..d0337a41141c 100644 --- a/include/ras/ras_event.h +++ b/include/ras/ras_event.h @@ -358,7 +358,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \ EM ( MF_MSG_SLAB, "kernel slab page" ) \ EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \ - EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \ EM ( MF_MSG_HUGE, "huge page" ) \ EM ( MF_MSG_FREE_HUGE, "free huge page" ) \ EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \ @@ -373,7 +372,6 @@ TRACE_EVENT(aer_event, EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \ EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \ EM ( MF_MSG_BUDDY, "free buddy page" ) \ - EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \ EM ( MF_MSG_DAX, "dax page" ) \ EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \ EMe ( MF_MSG_UNKNOWN, "unknown page" ) diff --git a/include/trace/events/compaction.h b/include/trace/events/compaction.h index 54e5bf081171..7d48e7079e48 100644 --- a/include/trace/events/compaction.h +++ b/include/trace/events/compaction.h @@ -68,10 +68,9 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages, TRACE_EVENT(mm_compaction_migratepages, TP_PROTO(unsigned long nr_all, - int migrate_rc, - struct list_head *migratepages), + unsigned int nr_succeeded), - TP_ARGS(nr_all, migrate_rc, migratepages), + TP_ARGS(nr_all, nr_succeeded), TP_STRUCT__entry( __field(unsigned long, nr_migrated) @@ -79,23 +78,8 @@ TRACE_EVENT(mm_compaction_migratepages, ), TP_fast_assign( - unsigned long nr_failed = 0; - struct list_head *page_lru; - - /* - * migrate_pages() returns either a non-negative number - * with the number of pages that failed migration, or an - * error code, in which case we need to count the remaining - * pages manually - */ - if (migrate_rc >= 0) - nr_failed = migrate_rc; - else - list_for_each(page_lru, migratepages) - nr_failed++; - - __entry->nr_migrated = nr_all - nr_failed; - __entry->nr_failed = nr_failed; + __entry->nr_migrated = nr_succeeded; + __entry->nr_failed = nr_all - nr_succeeded; ), TP_printk("nr_migrated=%lu nr_failed=%lu", diff --git a/include/trace/events/damon.h b/include/trace/events/damon.h index 2f422f4f1fb9..c79f1d4c39af 100644 --- a/include/trace/events/damon.h +++ b/include/trace/events/damon.h @@ -11,10 +11,10 @@ TRACE_EVENT(damon_aggregated, - TP_PROTO(struct damon_target *t, struct damon_region *r, - unsigned int nr_regions), + TP_PROTO(struct damon_target *t, unsigned int target_id, + struct damon_region *r, unsigned int nr_regions), - TP_ARGS(t, r, nr_regions), + TP_ARGS(t, target_id, r, nr_regions), TP_STRUCT__entry( __field(unsigned long, target_id) @@ -22,19 +22,22 @@ TRACE_EVENT(damon_aggregated, __field(unsigned long, start) __field(unsigned long, end) __field(unsigned int, nr_accesses) + __field(unsigned int, age) ), TP_fast_assign( - __entry->target_id = t->id; + __entry->target_id = target_id; __entry->nr_regions = nr_regions; __entry->start = r->ar.start; __entry->end = r->ar.end; __entry->nr_accesses = r->nr_accesses; + __entry->age = r->age; ), - TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u", + TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u", __entry->target_id, __entry->nr_regions, - __entry->start, __entry->end, __entry->nr_accesses) + __entry->start, __entry->end, + __entry->nr_accesses, __entry->age) ); #endif /* _TRACE_DAMON_H */ diff --git a/include/trace/events/thp.h b/include/trace/events/thp.h index d7fbbe551841..ca3f2767828a 100644 --- a/include/trace/events/thp.h +++ b/include/trace/events/thp.h @@ -8,24 +8,6 @@ #include #include -TRACE_EVENT(hugepage_invalidate, - - TP_PROTO(unsigned long addr, unsigned long pte), - TP_ARGS(addr, pte), - TP_STRUCT__entry( - __field(unsigned long, addr) - __field(unsigned long, pte) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->pte = pte; - ), - - TP_printk("hugepage invalidate at addr 0x%lx and pte = 0x%lx", - __entry->addr, __entry->pte) -); - TRACE_EVENT(hugepage_set_pmd, TP_PROTO(unsigned long addr, unsigned long pmd), @@ -65,23 +47,6 @@ TRACE_EVENT(hugepage_update, TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set) ); -TRACE_EVENT(hugepage_splitting, - - TP_PROTO(unsigned long addr, unsigned long pte), - TP_ARGS(addr, pte), - TP_STRUCT__entry( - __field(unsigned long, addr) - __field(unsigned long, pte) - ), - - TP_fast_assign( - __entry->addr = addr; - __entry->pte = pte; - ), - - TP_printk("hugepage splitting at addr 0x%lx and pte = 0x%lx", - __entry->addr, __entry->pte) -); #endif /* _TRACE_THP_H */ diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index 4557a8b6086f..1c48b0ae3ba3 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease) #define __NR_futex_waitv 449 __SYSCALL(__NR_futex_waitv, sys_futex_waitv) +#define __NR_set_mempolicy_home_node 450 +__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node) + #undef __NR_syscalls -#define __NR_syscalls 450 +#define __NR_syscalls 451 /* * 32 bit systems traditionally used different diff --git a/kernel/dma/pool.c b/kernel/dma/pool.c index 5f84e6cdb78e..4d40dcce7604 100644 --- a/kernel/dma/pool.c +++ b/kernel/dma/pool.c @@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void) GFP_KERNEL); if (!atomic_pool_kernel) ret = -ENOMEM; - if (IS_ENABLED(CONFIG_ZONE_DMA)) { + if (has_managed_dma()) { atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size, GFP_KERNEL | GFP_DMA); if (!atomic_pool_dma) @@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp) if (prev == NULL) { if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32)) return atomic_pool_dma32; - if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA)) + if (atomic_pool_dma && (gfp & GFP_DMA)) return atomic_pool_dma; return atomic_pool_kernel; } diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index d1944258cfc0..a492f159624f 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -297,6 +297,7 @@ COND_SYSCALL(get_mempolicy); COND_SYSCALL(set_mempolicy); COND_SYSCALL(migrate_pages); COND_SYSCALL(move_pages); +COND_SYSCALL(set_mempolicy_home_node); COND_SYSCALL(perf_event_open); COND_SYSCALL(accept4); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index d7ed1dffa426..ef77be575d87 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -123,6 +123,7 @@ static unsigned long long_max = LONG_MAX; static int one_hundred = 100; static int two_hundred = 200; static int one_thousand = 1000; +static int three_thousand = 3000; #ifdef CONFIG_PRINTK static int ten_thousand = 10000; #endif @@ -2960,7 +2961,7 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = watermark_scale_factor_sysctl_handler, .extra1 = SYSCTL_ONE, - .extra2 = &one_thousand, + .extra2 = &three_thousand, }, { .procname = "percpu_pagelist_high_fraction", diff --git a/lib/test_hmm.c b/lib/test_hmm.c index e2ce8f9b7605..767538089a62 100644 --- a/lib/test_hmm.c +++ b/lib/test_hmm.c @@ -1086,9 +1086,33 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp, return 0; } +static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma) +{ + unsigned long addr; + + for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) { + struct page *page; + int ret; + + page = alloc_page(GFP_KERNEL | __GFP_ZERO); + if (!page) + return -ENOMEM; + + ret = vm_insert_page(vma, addr, page); + if (ret) { + __free_page(page); + return ret; + } + put_page(page); + } + + return 0; +} + static const struct file_operations dmirror_fops = { .open = dmirror_fops_open, .release = dmirror_fops_release, + .mmap = dmirror_fops_mmap, .unlocked_ioctl = dmirror_fops_unlocked_ioctl, .llseek = default_llseek, .owner = THIS_MODULE, diff --git a/mm/Kconfig.debug b/mm/Kconfig.debug index 1e73717802f8..5bd5bb097252 100644 --- a/mm/Kconfig.debug +++ b/mm/Kconfig.debug @@ -62,6 +62,30 @@ config PAGE_OWNER If unsure, say N. +config PAGE_TABLE_CHECK + bool "Check for invalid mappings in user page tables" + depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK + select PAGE_EXTENSION + help + Check that anonymous page is not being mapped twice with read write + permissions. Check that anonymous and file pages are not being + erroneously shared. Since the checking is performed at the time + entries are added and removed to user page tables, leaking, corruption + and double mapping problems are detected synchronously. + + If unsure say "n". + +config PAGE_TABLE_CHECK_ENFORCED + bool "Enforce the page table checking by default" + depends on PAGE_TABLE_CHECK + help + Always enable page table checking. By default the page table checking + is disabled, and can be optionally enabled via page_table_check=on + kernel parameter. This config enforces that page table check is always + enabled. + + If unsure say "n". + config PAGE_POISONING bool "Poison pages after freeing" help diff --git a/mm/Makefile b/mm/Makefile index 7919cd7f13f2..588d3113f3b0 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -114,6 +114,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o obj-$(CONFIG_CMA) += cma.o obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o +obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o obj-$(CONFIG_SECRETMEM) += secretmem.o obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o diff --git a/mm/compaction.c b/mm/compaction.c index 6e446094ce90..b4e94cda3019 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -2280,6 +2280,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) unsigned long last_migrated_pfn; const bool sync = cc->mode != MIGRATE_ASYNC; bool update_cached; + unsigned int nr_succeeded = 0; /* * These counters track activities during zone compaction. Initialize @@ -2398,10 +2399,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc) err = migrate_pages(&cc->migratepages, compaction_alloc, compaction_free, (unsigned long)cc, cc->mode, - MR_COMPACTION, NULL); + MR_COMPACTION, &nr_succeeded); - trace_mm_compaction_migratepages(cc->nr_migratepages, err, - &cc->migratepages); + trace_mm_compaction_migratepages(cc->nr_migratepages, + nr_succeeded); /* All pages were either migrated or will be released */ cc->nr_migratepages = 0; diff --git a/mm/damon/core.c b/mm/damon/core.c index e92497895202..1dd153c31c9e 100644 --- a/mm/damon/core.c +++ b/mm/damon/core.c @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -23,9 +22,6 @@ #define DAMON_MIN_REGION 1 #endif -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) - static DEFINE_MUTEX(damon_lock); static int nr_running_ctxs; @@ -53,17 +49,6 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end) return region; } -/* - * Add a region between two other regions - */ -inline void damon_insert_region(struct damon_region *r, - struct damon_region *prev, struct damon_region *next, - struct damon_target *t) -{ - __list_add(&r->list, &prev->list, &next->list); - t->nr_regions++; -} - void damon_add_region(struct damon_region *r, struct damon_target *t) { list_add_tail(&r->list, &t->regions_list); @@ -106,8 +91,7 @@ struct damos *damon_new_scheme( scheme->min_age_region = min_age_region; scheme->max_age_region = max_age_region; scheme->action = action; - scheme->stat_count = 0; - scheme->stat_sz = 0; + scheme->stat = (struct damos_stat){}; INIT_LIST_HEAD(&scheme->list); scheme->quota.ms = quota->ms; @@ -530,15 +514,17 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx) static void kdamond_reset_aggregated(struct damon_ctx *c) { struct damon_target *t; + unsigned int ti = 0; /* target's index */ damon_for_each_target(t, c) { struct damon_region *r; damon_for_each_region(r, t) { - trace_damon_aggregated(t, r, damon_nr_regions(t)); + trace_damon_aggregated(t, ti, r, damon_nr_regions(t)); r->last_nr_accesses = r->nr_accesses; r->nr_accesses = 0; } + ti++; } } @@ -578,6 +564,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, struct damos_quota *quota = &s->quota; unsigned long sz = r->ar.end - r->ar.start; struct timespec64 begin, end; + unsigned long sz_applied = 0; if (!s->wmarks.activated) continue; @@ -631,7 +618,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c, damon_split_region_at(c, t, r, sz); } ktime_get_coarse_ts64(&begin); - c->primitive.apply_scheme(c, t, r, s); + sz_applied = c->primitive.apply_scheme(c, t, r, s); ktime_get_coarse_ts64(&end); quota->total_charged_ns += timespec64_to_ns(&end) - timespec64_to_ns(&begin); @@ -645,8 +632,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c, r->age = 0; update_stat: - s->stat_count++; - s->stat_sz += sz; + s->stat.nr_tried++; + s->stat.sz_tried += sz; + if (sz_applied) + s->stat.nr_applied++; + s->stat.sz_applied += sz_applied; } } @@ -694,6 +684,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c) if (time_after_eq(jiffies, quota->charged_from + msecs_to_jiffies( quota->reset_interval))) { + if (quota->esz && quota->charged_sz >= quota->esz) + s->stat.qt_exceeds++; quota->total_charged_sz += quota->charged_sz; quota->charged_from = jiffies; quota->charged_sz = 0; @@ -733,7 +725,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c) } } -#define sz_damon_region(r) (r->ar.end - r->ar.start) +static inline unsigned long sz_damon_region(struct damon_region *r) +{ + return r->ar.end - r->ar.start; +} /* * Merge two adjacent regions into one region @@ -750,8 +745,6 @@ static void damon_merge_two_regions(struct damon_target *t, damon_destroy_region(r, t); } -#define diff_of(a, b) (a > b ? a - b : b - a) - /* * Merge adjacent regions having similar access frequencies * @@ -765,13 +758,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres, struct damon_region *r, *prev = NULL, *next; damon_for_each_region_safe(r, next, t) { - if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres) + if (abs(r->nr_accesses - r->last_nr_accesses) > thres) r->age = 0; else r->age++; if (prev && prev->ar.end == r->ar.start && - diff_of(prev->nr_accesses, r->nr_accesses) <= thres && + abs(prev->nr_accesses - r->nr_accesses) <= thres && sz_damon_region(prev) + sz_damon_region(r) <= sz_limit) damon_merge_two_regions(t, prev, r); else diff --git a/mm/damon/dbgfs.c b/mm/damon/dbgfs.c index ad65436756af..5b899601e56c 100644 --- a/mm/damon/dbgfs.c +++ b/mm/damon/dbgfs.c @@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) damon_for_each_scheme(s, c) { rc = scnprintf(&buf[written], len - written, - "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n", + "%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n", s->min_sz_region, s->max_sz_region, s->min_nr_accesses, s->max_nr_accesses, s->min_age_region, s->max_age_region, @@ -117,7 +117,9 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len) s->quota.weight_age, s->wmarks.metric, s->wmarks.interval, s->wmarks.high, s->wmarks.mid, s->wmarks.low, - s->stat_count, s->stat_sz); + s->stat.nr_tried, s->stat.sz_tried, + s->stat.nr_applied, s->stat.sz_applied, + s->stat.qt_exceeds); if (!rc) return -ENOMEM; @@ -213,6 +215,13 @@ static struct damos **str_to_schemes(const char *str, ssize_t len, if (!damos_action_valid(action)) goto fail; + if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age) + goto fail; + + if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low || + wmarks.mid < wmarks.low) + goto fail; + pos += parsed; scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a, min_age, max_age, action, "a, &wmarks); @@ -355,7 +364,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file, struct damon_ctx *ctx = file->private_data; struct damon_target *t, *next_t; bool id_is_pid = true; - char *kbuf, *nrs; + char *kbuf; unsigned long *targets; ssize_t nr_targets; ssize_t ret; @@ -365,14 +374,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file, if (IS_ERR(kbuf)) return PTR_ERR(kbuf); - nrs = kbuf; if (!strncmp(kbuf, "paddr\n", count)) { id_is_pid = false; /* target id is meaningless here, but we set it just for fun */ scnprintf(kbuf, count, "42 "); } - targets = str_to_target_ids(nrs, count, &nr_targets); + targets = str_to_target_ids(kbuf, count, &nr_targets); if (!targets) { ret = -ENOMEM; goto out; diff --git a/mm/damon/paddr.c b/mm/damon/paddr.c index a496d6f203d6..5e8244f65a1a 100644 --- a/mm/damon/paddr.c +++ b/mm/damon/paddr.c @@ -73,7 +73,7 @@ static void __damon_pa_prepare_access_check(struct damon_ctx *ctx, damon_pa_mkold(r->sampling_addr); } -void damon_pa_prepare_access_checks(struct damon_ctx *ctx) +static void damon_pa_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -192,7 +192,7 @@ static void __damon_pa_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct damon_region *r; @@ -213,14 +213,15 @@ bool damon_pa_target_valid(void *t) return true; } -int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { - unsigned long addr; + unsigned long addr, applied; LIST_HEAD(page_list); if (scheme->action != DAMOS_PAGEOUT) - return -EINVAL; + return 0; for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) { struct page *page = damon_get_page(PHYS_PFN(addr)); @@ -241,13 +242,14 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, put_page(page); } } - reclaim_pages(&page_list); + applied = reclaim_pages(&page_list); cond_resched(); - return 0; + return applied * PAGE_SIZE; } -int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_pa_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { case DAMOS_PAGEOUT: diff --git a/mm/damon/prmtv-common.h b/mm/damon/prmtv-common.h index 61f27037603e..e790cb5f8fe0 100644 --- a/mm/damon/prmtv-common.h +++ b/mm/damon/prmtv-common.h @@ -6,10 +6,6 @@ */ #include -#include - -/* Get a random number in [l, r) */ -#define damon_rand(l, r) (l + prandom_u32_max(r - l)) struct page *damon_get_page(unsigned long pfn); diff --git a/mm/damon/reclaim.c b/mm/damon/reclaim.c index dc1485044eaf..bc476cef688e 100644 --- a/mm/damon/reclaim.c +++ b/mm/damon/reclaim.c @@ -185,6 +185,36 @@ module_param(monitor_region_end, ulong, 0600); static int kdamond_pid __read_mostly = -1; module_param(kdamond_pid, int, 0400); +/* + * Number of memory regions that tried to be reclaimed. + */ +static unsigned long nr_reclaim_tried_regions __read_mostly; +module_param(nr_reclaim_tried_regions, ulong, 0400); + +/* + * Total bytes of memory regions that tried to be reclaimed. + */ +static unsigned long bytes_reclaim_tried_regions __read_mostly; +module_param(bytes_reclaim_tried_regions, ulong, 0400); + +/* + * Number of memory regions that successfully be reclaimed. + */ +static unsigned long nr_reclaimed_regions __read_mostly; +module_param(nr_reclaimed_regions, ulong, 0400); + +/* + * Total bytes of memory regions that successfully be reclaimed. + */ +static unsigned long bytes_reclaimed_regions __read_mostly; +module_param(bytes_reclaimed_regions, ulong, 0400); + +/* + * Number of times that the time/space quota limits have exceeded + */ +static unsigned long nr_quota_exceeds __read_mostly; +module_param(nr_quota_exceeds, ulong, 0400); + static struct damon_ctx *ctx; static struct damon_target *target; @@ -333,6 +363,21 @@ static void damon_reclaim_timer_fn(struct work_struct *work) } static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn); +static int damon_reclaim_after_aggregation(struct damon_ctx *c) +{ + struct damos *s; + + /* update the stats parameter */ + damon_for_each_scheme(s, c) { + nr_reclaim_tried_regions = s->stat.nr_tried; + bytes_reclaim_tried_regions = s->stat.sz_tried; + nr_reclaimed_regions = s->stat.nr_applied; + bytes_reclaimed_regions = s->stat.sz_applied; + nr_quota_exceeds = s->stat.qt_exceeds; + } + return 0; +} + static int __init damon_reclaim_init(void) { ctx = damon_new_ctx(); @@ -340,6 +385,7 @@ static int __init damon_reclaim_init(void) return -ENOMEM; damon_pa_set_primitives(ctx); + ctx->callback.after_aggregation = damon_reclaim_after_aggregation; /* 4242 means nothing but fun */ target = damon_new_target(4242); diff --git a/mm/damon/vaddr.c b/mm/damon/vaddr.c index 20a9a9d69eb1..89b6468da2b9 100644 --- a/mm/damon/vaddr.c +++ b/mm/damon/vaddr.c @@ -26,8 +26,10 @@ * 't->id' should be the pointer to the relevant 'struct pid' having reference * count. Caller must put the returned task, unless it is NULL. */ -#define damon_get_task_struct(t) \ - (get_pid_task((struct pid *)t->id, PIDTYPE_PID)) +static inline struct task_struct *damon_get_task_struct(struct damon_target *t) +{ + return get_pid_task((struct pid *)t->id, PIDTYPE_PID); +} /* * Get the mm_struct of the given target @@ -98,16 +100,6 @@ static unsigned long sz_range(struct damon_addr_range *r) return r->end - r->start; } -static void swap_ranges(struct damon_addr_range *r1, - struct damon_addr_range *r2) -{ - struct damon_addr_range tmp; - - tmp = *r1; - *r1 = *r2; - *r2 = tmp; -} - /* * Find three regions separated by two biggest unmapped regions * @@ -146,9 +138,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma, gap.start = last_vma->vm_end; gap.end = vma->vm_start; if (sz_range(&gap) > sz_range(&second_gap)) { - swap_ranges(&gap, &second_gap); + swap(gap, second_gap); if (sz_range(&second_gap) > sz_range(&first_gap)) - swap_ranges(&second_gap, &first_gap); + swap(second_gap, first_gap); } next: last_vma = vma; @@ -159,7 +151,7 @@ next: /* Sort the two biggest gaps by address */ if (first_gap.start > second_gap.start) - swap_ranges(&first_gap, &second_gap); + swap(first_gap, second_gap); /* Store the result */ regions[0].start = ALIGN(start, DAMON_MIN_REGION); @@ -240,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t, static void __damon_va_init_regions(struct damon_ctx *ctx, struct damon_target *t) { + struct damon_target *ti; struct damon_region *r; struct damon_addr_range regions[3]; unsigned long sz = 0, nr_pieces; - int i; + int i, tidx = 0; if (damon_va_three_regions(t, regions)) { - pr_err("Failed to get three regions of target %lu\n", t->id); + damon_for_each_target(ti, ctx) { + if (ti == t) + break; + tidx++; + } + pr_debug("Failed to get three regions of %dth target\n", tidx); return; } @@ -272,7 +270,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx, } /* Initialize '->regions_list' of every target (task) */ -void damon_va_init(struct damon_ctx *ctx) +static void damon_va_init(struct damon_ctx *ctx) { struct damon_target *t; @@ -292,7 +290,8 @@ void damon_va_init(struct damon_ctx *ctx) * * Returns true if it is. */ -static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re) +static bool damon_intersect(struct damon_region *r, + struct damon_addr_range *re) { return !(r->ar.end <= re->start || re->end <= r->ar.start); } @@ -356,7 +355,7 @@ static void damon_va_apply_three_regions(struct damon_target *t, /* * Update regions for current memory mappings */ -void damon_va_update(struct damon_ctx *ctx) +static void damon_va_update(struct damon_ctx *ctx) { struct damon_addr_range three_regions[3]; struct damon_target *t; @@ -395,8 +394,65 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm, + struct vm_area_struct *vma, unsigned long addr) +{ + bool referenced = false; + pte_t entry = huge_ptep_get(pte); + struct page *page = pte_page(entry); + + if (!page) + return; + + get_page(page); + + if (pte_young(entry)) { + referenced = true; + entry = pte_mkold(entry); + huge_ptep_set_access_flags(vma, addr, pte, entry, + vma->vm_flags & VM_WRITE); + } + +#ifdef CONFIG_MMU_NOTIFIER + if (mmu_notifier_clear_young(mm, addr, + addr + huge_page_size(hstate_vma(vma)))) + referenced = true; +#endif /* CONFIG_MMU_NOTIFIER */ + + if (referenced) + set_page_young(page); + + set_page_idle(page); + put_page(page); +} + +static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct hstate *h = hstate_vma(walk->vma); + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_mkold_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_mkold_ops = { .pmd_entry = damon_mkold_pmd_entry, + .hugetlb_entry = damon_mkold_hugetlb_entry, }; static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) @@ -410,7 +466,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr) * Functions for the access checking of the regions */ -static void damon_va_prepare_access_check(struct damon_ctx *ctx, +static void __damon_va_prepare_access_check(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { r->sampling_addr = damon_rand(r->ar.start, r->ar.end); @@ -418,7 +474,7 @@ static void damon_va_prepare_access_check(struct damon_ctx *ctx, damon_va_mkold(mm, r->sampling_addr); } -void damon_va_prepare_access_checks(struct damon_ctx *ctx) +static void damon_va_prepare_access_checks(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -429,7 +485,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) - damon_va_prepare_access_check(ctx, mm, r); + __damon_va_prepare_access_check(ctx, mm, r); mmput(mm); } } @@ -491,8 +547,47 @@ out: return 0; } +#ifdef CONFIG_HUGETLB_PAGE +static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask, + unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct damon_young_walk_private *priv = walk->private; + struct hstate *h = hstate_vma(walk->vma); + struct page *page; + spinlock_t *ptl; + pte_t entry; + + ptl = huge_pte_lock(h, walk->mm, pte); + entry = huge_ptep_get(pte); + if (!pte_present(entry)) + goto out; + + page = pte_page(entry); + if (!page) + goto out; + + get_page(page); + + if (pte_young(entry) || !page_is_idle(page) || + mmu_notifier_test_young(walk->mm, addr)) { + *priv->page_sz = huge_page_size(h); + priv->young = true; + } + + put_page(page); + +out: + spin_unlock(ptl); + return 0; +} +#else +#define damon_young_hugetlb_entry NULL +#endif /* CONFIG_HUGETLB_PAGE */ + static const struct mm_walk_ops damon_young_ops = { .pmd_entry = damon_young_pmd_entry, + .hugetlb_entry = damon_young_hugetlb_entry, }; static bool damon_va_young(struct mm_struct *mm, unsigned long addr, @@ -515,7 +610,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr, * mm 'mm_struct' for the given virtual address space * r the region to be checked */ -static void damon_va_check_access(struct damon_ctx *ctx, +static void __damon_va_check_access(struct damon_ctx *ctx, struct mm_struct *mm, struct damon_region *r) { static struct mm_struct *last_mm; @@ -539,7 +634,7 @@ static void damon_va_check_access(struct damon_ctx *ctx, last_addr = r->sampling_addr; } -unsigned int damon_va_check_accesses(struct damon_ctx *ctx) +static unsigned int damon_va_check_accesses(struct damon_ctx *ctx) { struct damon_target *t; struct mm_struct *mm; @@ -551,7 +646,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx) if (!mm) continue; damon_for_each_region(r, t) { - damon_va_check_access(ctx, mm, r); + __damon_va_check_access(ctx, mm, r); max_nr_accesses = max(r->nr_accesses, max_nr_accesses); } mmput(mm); @@ -579,32 +674,34 @@ bool damon_va_target_valid(void *target) } #ifndef CONFIG_ADVISE_SYSCALLS -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { - return -EINVAL; + return 0; } #else -static int damos_madvise(struct damon_target *target, struct damon_region *r, - int behavior) +static unsigned long damos_madvise(struct damon_target *target, + struct damon_region *r, int behavior) { struct mm_struct *mm; - int ret = -ENOMEM; + unsigned long start = PAGE_ALIGN(r->ar.start); + unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start); + unsigned long applied; mm = damon_get_mm(target); if (!mm) - goto out; + return 0; - ret = do_madvise(mm, PAGE_ALIGN(r->ar.start), - PAGE_ALIGN(r->ar.end - r->ar.start), behavior); + applied = do_madvise(mm, start, len, behavior) ? 0 : len; mmput(mm); -out: - return ret; + + return applied; } #endif /* CONFIG_ADVISE_SYSCALLS */ -int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { int madv_action; @@ -627,14 +724,15 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t, case DAMOS_STAT: return 0; default: - return -EINVAL; + return 0; } return damos_madvise(t, r, madv_action); } -int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t, - struct damon_region *r, struct damos *scheme) +static int damon_va_scheme_score(struct damon_ctx *context, + struct damon_target *t, struct damon_region *r, + struct damos *scheme) { switch (scheme->action) { diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 2a2b24e87877..a7ac97c76762 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -652,7 +652,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args) set_pte_at(args->mm, args->vaddr, args->ptep, pte); flush_dcache_page(page); barrier(); - pte_clear(args->mm, args->vaddr, args->ptep); + ptep_clear(args->mm, args->vaddr, args->ptep); pte = ptep_get(args->ptep); WARN_ON(!pte_none(pte)); } diff --git a/mm/dmapool.c b/mm/dmapool.c index 64b537b3ccb0..a7eb5d0eb2da 100644 --- a/mm/dmapool.c +++ b/mm/dmapool.c @@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev, else if ((boundary < size) || (boundary & (boundary - 1))) return NULL; - retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev)); + retval = kmalloc(sizeof(*retval), GFP_KERNEL); if (!retval) return retval; diff --git a/mm/hmm.c b/mm/hmm.c index 842e26599238..bd56641c79d4 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr, * Since each architecture defines a struct page for the zero page, just * fall through and treat it like a normal page. */ - if (pte_special(pte) && !pte_devmap(pte) && + if (!vm_normal_page(walk->vma, addr, pte) && + !pte_devmap(pte) && !is_zero_pfn(pte_pfn(pte))) { if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) { pte_unmap(ptep); @@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end, struct hmm_range *range = hmm_vma_walk->range; struct vm_area_struct *vma = walk->vma; - if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) && + if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) && vma->vm_flags & VM_READ) return 0; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index f58524394dc1..406a3c28c026 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf) * We can only reuse the page if nobody else maps the huge page or it's * part. */ - if (reuse_swap_page(page, NULL)) { + if (reuse_swap_page(page)) { pmd_t entry; entry = pmd_mkyoung(orig_pmd); entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma); @@ -2542,38 +2542,28 @@ int total_mapcount(struct page *page) * need full accuracy to avoid breaking page pinning, because * page_trans_huge_mapcount() is slower than page_mapcount(). */ -int page_trans_huge_mapcount(struct page *page, int *total_mapcount) +int page_trans_huge_mapcount(struct page *page) { - int i, ret, _total_mapcount, mapcount; + int i, ret; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); - if (likely(!PageTransCompound(page))) { - mapcount = atomic_read(&page->_mapcount) + 1; - if (total_mapcount) - *total_mapcount = mapcount; - return mapcount; - } + if (likely(!PageTransCompound(page))) + return atomic_read(&page->_mapcount) + 1; page = compound_head(page); - _total_mapcount = ret = 0; + ret = 0; for (i = 0; i < thp_nr_pages(page); i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; + int mapcount = atomic_read(&page[i]._mapcount) + 1; ret = max(ret, mapcount); - _total_mapcount += mapcount; } - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) ret -= 1; - _total_mapcount -= thp_nr_pages(page); - } - mapcount = compound_mapcount(page); - ret += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; - return ret; + + return ret + compound_mapcount(page); } /* Racy check whether the huge page can be split */ diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a1baa198519a..61895cc01d09 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -4684,8 +4684,8 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr struct page *new_page) { __SetPageUptodate(new_page); - set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugepage_add_new_anon_rmap(new_page, vma, addr); + set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1)); hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm); ClearHPageRestoreReserve(new_page); SetHPageMigratable(new_page); @@ -5259,10 +5259,10 @@ retry_avoidcopy: /* Break COW */ huge_ptep_clear_flush(vma, haddr, ptep); mmu_notifier_invalidate_range(mm, range.start, range.end); - set_huge_pte_at(mm, haddr, ptep, - make_huge_pte(vma, new_page, 1)); page_remove_rmap(old_page, true); hugepage_add_new_anon_rmap(new_page, vma, haddr); + set_huge_pte_at(mm, haddr, ptep, + make_huge_pte(vma, new_page, 1)); SetHPageMigratable(new_page); /* Make the old page be freed below */ new_page = old_page; diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index 79d93534ef1e..f9942841df18 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, } } +static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup) +{ + int node; + + for_each_node(node) + kfree(h_cgroup->nodeinfo[node]); + kfree(h_cgroup); +} + static struct cgroup_subsys_state * hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) { struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css); struct hugetlb_cgroup *h_cgroup; + int node; + + h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids), + GFP_KERNEL); - h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL); if (!h_cgroup) return ERR_PTR(-ENOMEM); if (!parent_h_cgroup) root_h_cgroup = h_cgroup; + /* + * TODO: this routine can waste much memory for nodes which will + * never be onlined. It's better to use memory hotplug callback + * function. + */ + for_each_node(node) { + /* Set node_to_alloc to -1 for offline nodes. */ + int node_to_alloc = + node_state(node, N_NORMAL_MEMORY) ? node : -1; + h_cgroup->nodeinfo[node] = + kzalloc_node(sizeof(struct hugetlb_cgroup_per_node), + GFP_KERNEL, node_to_alloc); + if (!h_cgroup->nodeinfo[node]) + goto fail_alloc_nodeinfo; + } + hugetlb_cgroup_init(h_cgroup, parent_h_cgroup); return &h_cgroup->css; + +fail_alloc_nodeinfo: + hugetlb_cgroup_free(h_cgroup); + return ERR_PTR(-ENOMEM); } static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css) { - struct hugetlb_cgroup *h_cgroup; - - h_cgroup = hugetlb_cgroup_from_css(css); - kfree(h_cgroup); + hugetlb_cgroup_free(hugetlb_cgroup_from_css(css)); } /* @@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, return; __set_hugetlb_cgroup(page, h_cg, rsvd); - return; + if (!rsvd) { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage + nr_pages); + } } void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages, @@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, if (rsvd) css_put(&h_cg->css); - - return; + else { + unsigned long usage = + h_cg->nodeinfo[page_to_nid(page)]->usage[idx]; + /* + * This write is not atomic due to fetching usage and writing + * to it, but that's fine because we call this with + * hugetlb_lock held anyway. + */ + WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx], + usage - nr_pages); + } } void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages, @@ -418,6 +466,59 @@ enum { RES_RSVD_FAILCNT, }; +static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy) +{ + int nid; + struct cftype *cft = seq_cft(seq); + int idx = MEMFILE_IDX(cft->private); + bool legacy = MEMFILE_ATTR(cft->private); + struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq)); + struct cgroup_subsys_state *css; + unsigned long usage; + + if (legacy) { + /* Add up usage across all nodes for the non-hierarchical total. */ + usage = 0; + for_each_node_state(nid, N_MEMORY) + usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]); + seq_printf(seq, "total=%lu", usage * PAGE_SIZE); + + /* Simply print the per-node usage for the non-hierarchical total. */ + for_each_node_state(nid, N_MEMORY) + seq_printf(seq, " N%d=%lu", nid, + READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) * + PAGE_SIZE); + seq_putc(seq, '\n'); + } + + /* + * The hierarchical total is pretty much the value recorded by the + * counter, so use that. + */ + seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "", + page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE); + + /* + * For each node, transverse the css tree to obtain the hierarchical + * node usage. + */ + for_each_node_state(nid, N_MEMORY) { + usage = 0; + rcu_read_lock(); + css_for_each_descendant_pre(css, &h_cg->css) { + usage += READ_ONCE(hugetlb_cgroup_from_css(css) + ->nodeinfo[nid] + ->usage[idx]); + } + rcu_read_unlock(); + seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE); + } + + seq_putc(seq, '\n'); + + return 0; +} + static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, struct cftype *cft) { @@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx) events_local_file[idx]); cft->flags = CFTYPE_NOT_ON_ROOT; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_dfl[6]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + cft->flags = CFTYPE_NOT_ON_ROOT; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_dfl[7]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys, @@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx) cft->write = hugetlb_cgroup_reset; cft->read_u64 = hugetlb_cgroup_read_u64; - /* NULL terminate the last cft */ + /* Add the numa stat file */ cft = &h->cgroup_files_legacy[8]; + snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf); + cft->private = MEMFILE_PRIVATE(idx, 1); + cft->seq_show = hugetlb_cgroup_read_numa_stat; + + /* NULL terminate the last cft */ + cft = &h->cgroup_files_legacy[9]; memset(cft, 0, sizeof(*cft)); WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys, diff --git a/mm/internal.h b/mm/internal.h index 7fb09515679e..d80300392a19 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -23,7 +23,7 @@ struct folio_batch; #define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\ __GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\ __GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\ - __GFP_ATOMIC) + __GFP_ATOMIC|__GFP_NOLOCKDEP) /* The GFP flags allowed during early boot */ #define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS)) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 2e1911cc3466..35f14d0a00a6 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -618,6 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out; } } @@ -636,6 +637,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out; } @@ -681,7 +683,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, goto out; } if (!pte_write(pteval) && PageSwapCache(page) && - !reuse_swap_page(page, NULL)) { + !reuse_swap_page(page)) { /* * Page is in the swap cache and cannot be re-used. * It cannot be collapsed into a THP. @@ -756,11 +758,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * ptl mostly unnecessary. */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); spin_unlock(ptl); } } else { @@ -774,11 +772,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page, * inside page_remove_rmap(). */ spin_lock(ptl); - /* - * paravirt calls inside pte_clear here are - * superfluous. - */ - pte_clear(vma->vm_mm, address, _pte); + ptep_clear(vma->vm_mm, address, _pte); page_remove_rmap(src_page, false); spin_unlock(ptl); free_page_and_swap_cache(src_page); @@ -1261,6 +1255,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); goto out_unmap; } } @@ -1270,6 +1265,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, continue; } else { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); goto out_unmap; } } @@ -1298,6 +1294,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, if (page_mapcount(page) > 1 && ++shared > khugepaged_max_ptes_shared) { result = SCAN_EXCEED_SHARED_PTE; + count_vm_event(THP_SCAN_EXCEED_SHARED_PTE); goto out_unmap; } @@ -1306,7 +1303,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm, /* * Record which node the original page is from and save this * information to khugepaged_node_load[]. - * Khupaged will allocate hugepage from the node has the max + * Khugepaged will allocate hugepage from the node has the max * hit record. */ node = page_to_nid(page); @@ -2014,6 +2011,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (xa_is_value(page)) { if (++swap > khugepaged_max_ptes_swap) { result = SCAN_EXCEED_SWAP_PTE; + count_vm_event(THP_SCAN_EXCEED_SWAP_PTE); break; } continue; @@ -2064,6 +2062,7 @@ static void khugepaged_scan_file(struct mm_struct *mm, if (result == SCAN_SUCCEED) { if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) { result = SCAN_EXCEED_NONE_PTE; + count_vm_event(THP_SCAN_EXCEED_NONE_PTE); } else { node = khugepaged_find_target_node(); collapse_file(mm, file, start, hpage, node); diff --git a/mm/ksm.c b/mm/ksm.c index f34476ac0a41..c20bd4d9a0d9 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -2576,8 +2576,8 @@ struct page *ksm_might_need_to_copy(struct page *page, return page; /* no need to copy it */ } else if (!anon_vma) { return page; /* no need to copy it */ - } else if (anon_vma->root == vma->anon_vma->root && - page->index == linear_page_index(vma, address)) { + } else if (page->index == linear_page_index(vma, address) && + anon_vma->root == vma->anon_vma->root) { return page; /* still no need to copy it */ } if (!PageUptodate(page)) diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 5190a8ee553c..14ae5c18e776 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -723,7 +723,6 @@ static const char * const action_page_types[] = { [MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page", [MF_MSG_SLAB] = "kernel slab page", [MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking", - [MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned", [MF_MSG_HUGE] = "huge page", [MF_MSG_FREE_HUGE] = "free huge page", [MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page", @@ -738,7 +737,6 @@ static const char * const action_page_types[] = { [MF_MSG_CLEAN_LRU] = "clean LRU page", [MF_MSG_TRUNCATED_LRU] = "already truncated LRU page", [MF_MSG_BUDDY] = "free buddy page", - [MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)", [MF_MSG_DAX] = "dax page", [MF_MSG_UNSPLIT_THP] = "unsplit thp", [MF_MSG_UNKNOWN] = "unknown page", @@ -1162,6 +1160,22 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +static inline bool PageHWPoisonTakenOff(struct page *page) +{ + return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON; +} + +void SetPageHWPoisonTakenOff(struct page *page) +{ + set_page_private(page, MAGIC_HWPOISON); +} + +void ClearPageHWPoisonTakenOff(struct page *page) +{ + if (PageHWPoison(page)) + set_page_private(page, 0); +} + /* * Return true if a page type of a given page is supported by hwpoison * mechanism (while handling could fail), otherwise false. This function @@ -1264,6 +1278,27 @@ out: return ret; } +static int __get_unpoison_page(struct page *page) +{ + struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * PageHWPoisonTakenOff pages are not only marked as PG_hwpoison, + * but also isolated from buddy freelist, so need to identify the + * state and have to cancel both operations to unpoison. + */ + if (PageHWPoisonTakenOff(page)) + return -EHWPOISON; + + return get_page_unless_zero(page) ? 1 : 0; +} + /** * get_hwpoison_page() - Get refcount for memory error handling * @p: Raw error page (hit by memory error) @@ -1271,7 +1306,7 @@ out: * * get_hwpoison_page() takes a page refcount of an error page to handle memory * error on it, after checking that the error page is in a well-defined state - * (defined as a page-type we can successfully handle the memor error on it, + * (defined as a page-type we can successfully handle the memory error on it, * such as LRU page and hugetlb page). * * Memory error handling could be triggered at any time on any type of page, @@ -1280,18 +1315,26 @@ out: * extra care for the error page's state (as done in __get_hwpoison_page()), * and has some retry logic in get_any_page(). * + * When called from unpoison_memory(), the caller should already ensure that + * the given page has PG_hwpoison. So it's never reused for other page + * allocations, and __get_unpoison_page() never races with them. + * * Return: 0 on failure, * 1 on success for in-use pages in a well-defined state, * -EIO for pages on which we can not handle memory errors, * -EBUSY when get_hwpoison_page() has raced with page lifecycle - * operations like allocation and free. + * operations like allocation and free, + * -EHWPOISON when the page is hwpoisoned and taken off from buddy. */ static int get_hwpoison_page(struct page *p, unsigned long flags) { int ret; zone_pcp_disable(page_zone(p)); - ret = get_any_page(p, flags); + if (flags & MF_UNPOISON) + ret = __get_unpoison_page(p); + else + ret = get_any_page(p, flags); zone_pcp_enable(page_zone(p)); return ret; @@ -1502,14 +1545,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) lock_page(head); page_flags = head->flags; - if (!PageHWPoison(head)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(head); - put_page(head); - return 0; - } - /* * TODO: hwpoison for pud-sized hugetlb doesn't work right now, so * simply disable it. In order to make it work properly, we need @@ -1623,6 +1658,8 @@ out: return rc; } +static DEFINE_MUTEX(mf_mutex); + /** * memory_failure - Handle memory failure of a page. * @pfn: Page Number of the corrupted page @@ -1649,7 +1686,6 @@ int memory_failure(unsigned long pfn, int flags) int res = 0; unsigned long page_flags; bool retry = true; - static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1790,16 +1826,6 @@ try_again: */ page_flags = p->flags; - /* - * unpoison always clear PG_hwpoison inside page lock - */ - if (!PageHWPoison(p)) { - pr_err("Memory failure: %#lx: just unpoisoned\n", pfn); - num_poisoned_pages_dec(); - unlock_page(p); - put_page(p); - goto unlock_mutex; - } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); @@ -1963,6 +1989,28 @@ core_initcall(memory_failure_init); pr_info(fmt, pfn); \ }) +static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p) +{ + if (TestClearPageHWPoison(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + num_poisoned_pages_dec(); + return 1; + } + return 0; +} + +static inline int unpoison_taken_off_page(struct ratelimit_state *rs, + struct page *p) +{ + if (put_page_back_buddy(p)) { + unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", + page_to_pfn(p), rs); + return 0; + } + return -EBUSY; +} + /** * unpoison_memory - Unpoison a previously poisoned page * @pfn: Page number of the to be unpoisoned page @@ -1979,8 +2027,7 @@ int unpoison_memory(unsigned long pfn) { struct page *page; struct page *p; - int freeit = 0; - unsigned long flags = 0; + int ret = -EBUSY; static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); @@ -1990,69 +2037,60 @@ int unpoison_memory(unsigned long pfn) p = pfn_to_page(pfn); page = compound_head(p); + mutex_lock(&mf_mutex); + if (!PageHWPoison(p)) { unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_count(page) > 1) { unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapped(page)) { unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } if (page_mapping(page)) { unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n", pfn, &unpoison_rs); - return 0; + goto unlock_mutex; } - /* - * unpoison_memory() can encounter thp only when the thp is being - * worked by memory_failure() and the page lock is not held yet. - * In such case, we yield to memory_failure() and make unpoison fail. - */ - if (!PageHuge(page) && PageTransHuge(page)) { - unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n", - pfn, &unpoison_rs); - return 0; - } + if (PageSlab(page) || PageTable(page)) + goto unlock_mutex; - if (!get_hwpoison_page(p, flags)) { - if (TestClearPageHWPoison(p)) - num_poisoned_pages_dec(); - unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n", - pfn, &unpoison_rs); - return 0; - } + ret = get_hwpoison_page(p, MF_UNPOISON); + if (!ret) { + if (clear_page_hwpoison(&unpoison_rs, page)) + ret = 0; + else + ret = -EBUSY; + } else if (ret < 0) { + if (ret == -EHWPOISON) { + ret = unpoison_taken_off_page(&unpoison_rs, p); + } else + unpoison_pr_info("Unpoison: failed to grab page %#lx\n", + pfn, &unpoison_rs); + } else { + int freeit = clear_page_hwpoison(&unpoison_rs, p); - lock_page(page); - /* - * This test is racy because PG_hwpoison is set outside of page lock. - * That's acceptable because that won't trigger kernel panic. Instead, - * the PG_hwpoison page will be caught and isolated on the entrance to - * the free buddy page pool. - */ - if (TestClearPageHWPoison(page)) { - unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n", - pfn, &unpoison_rs); - num_poisoned_pages_dec(); - freeit = 1; - } - unlock_page(page); - - put_page(page); - if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) put_page(page); + if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) { + put_page(page); + ret = 0; + } + } - return 0; +unlock_mutex: + mutex_unlock(&mf_mutex); + return ret; } EXPORT_SYMBOL(unpoison_memory); @@ -2233,9 +2271,12 @@ int soft_offline_page(unsigned long pfn, int flags) return -EIO; } + mutex_lock(&mf_mutex); + if (PageHWPoison(page)) { pr_info("%s: %#lx page already poisoned\n", __func__, pfn); put_ref_page(ref_page); + mutex_unlock(&mf_mutex); return 0; } @@ -2254,5 +2295,7 @@ retry: } } + mutex_unlock(&mf_mutex); + return ret; } diff --git a/mm/memory.c b/mm/memory.c index a95b66c7fc11..3f1fd445dad8 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -722,8 +722,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, else if (is_writable_device_exclusive_entry(entry)) pte = maybe_mkwrite(pte_mkdirty(pte), vma); - set_pte_at(vma->vm_mm, address, ptep, pte); - /* * No need to take a page reference as one was already * created when the swap entry was made. @@ -737,6 +735,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma, */ WARN_ON_ONCE(!PageAnon(page)); + set_pte_at(vma->vm_mm, address, ptep, pte); + if (vma->vm_flags & VM_LOCKED) mlock_vma_page(page); @@ -3652,7 +3652,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES); dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS); pte = mk_pte(page, vma->vm_page_prot); - if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) { + if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) { pte = maybe_mkwrite(pte_mkdirty(pte), vma); vmf->flags &= ~FAULT_FLAG_WRITE; ret |= VM_FAULT_WRITE; @@ -3665,8 +3665,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) pte = pte_mkuffd_wp(pte); pte = pte_wrprotect(pte); } - set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); - arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); vmf->orig_pte = pte; /* ksm created a completely new copy */ @@ -3677,6 +3675,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf) do_page_add_anon_rmap(page, vma, vmf->address, exclusive); } + set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte); + arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte); + swap_free(entry); if (mem_cgroup_swap_full(page) || (vma->vm_flags & VM_LOCKED) || PageMlocked(page)) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 679f47b3a079..028e8dd82b44 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES]; * @node: Node id to start the search * * Lookup the next closest node by distance if @nid is not online. + * + * Return: this @node if it is online, otherwise the closest node by distance */ int numa_map_to_online_node(int node) { @@ -296,6 +298,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags, atomic_set(&policy->refcnt, 1); policy->mode = mode; policy->flags = flags; + policy->home_node = NUMA_NO_NODE; return policy; } @@ -1478,6 +1481,77 @@ static long kernel_mbind(unsigned long start, unsigned long len, return do_mbind(start, len, lmode, mode_flags, &nodes, flags); } +SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len, + unsigned long, home_node, unsigned long, flags) +{ + struct mm_struct *mm = current->mm; + struct vm_area_struct *vma; + struct mempolicy *new; + unsigned long vmstart; + unsigned long vmend; + unsigned long end; + int err = -ENOENT; + + start = untagged_addr(start); + if (start & ~PAGE_MASK) + return -EINVAL; + /* + * flags is used for future extension if any. + */ + if (flags != 0) + return -EINVAL; + + /* + * Check home_node is online to avoid accessing uninitialized + * NODE_DATA. + */ + if (home_node >= MAX_NUMNODES || !node_online(home_node)) + return -EINVAL; + + len = (len + PAGE_SIZE - 1) & PAGE_MASK; + end = start + len; + + if (end < start) + return -EINVAL; + if (end == start) + return 0; + mmap_write_lock(mm); + vma = find_vma(mm, start); + for (; vma && vma->vm_start < end; vma = vma->vm_next) { + + vmstart = max(start, vma->vm_start); + vmend = min(end, vma->vm_end); + new = mpol_dup(vma_policy(vma)); + if (IS_ERR(new)) { + err = PTR_ERR(new); + break; + } + /* + * Only update home node if there is an existing vma policy + */ + if (!new) + continue; + + /* + * If any vma in the range got policy other than MPOL_BIND + * or MPOL_PREFERRED_MANY we return error. We don't reset + * the home node for vmas we already updated before. + */ + if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) { + err = -EOPNOTSUPP; + break; + } + + new->home_node = home_node; + err = mbind_range(mm, vmstart, vmend, new); + mpol_put(new); + if (err) + break; + } + mmap_write_unlock(mm); + return err; +} + SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len, unsigned long, mode, const unsigned long __user *, nmask, unsigned long, maxnode, unsigned int, flags) @@ -1802,6 +1876,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd) WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE)); } + if ((policy->mode == MPOL_BIND || + policy->mode == MPOL_PREFERRED_MANY) && + policy->home_node != NUMA_NO_NODE) + return policy->home_node; + return nd; } @@ -2062,7 +2141,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL); page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes); if (!page) - page = __alloc_pages(gfp, order, numa_node_id(), NULL); + page = __alloc_pages(gfp, order, nid, NULL); return page; } @@ -2073,7 +2152,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * @order: Order of the GFP allocation. * @vma: Pointer to VMA or NULL if not available. * @addr: Virtual address of the allocation. Must be inside @vma. - * @node: Which node to prefer for allocation (modulo policy). * @hugepage: For hugepages try only the preferred node if possible. * * Allocate a page for a specific address in @vma, using the appropriate @@ -2084,9 +2162,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order, * Return: The page on success or NULL if allocation fails. */ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, - unsigned long addr, int node, bool hugepage) + unsigned long addr, bool hugepage) { struct mempolicy *pol; + int node = numa_node_id(); struct page *page; int preferred_nid; nodemask_t *nmask; @@ -2103,6 +2182,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma, } if (pol->mode == MPOL_PREFERRED_MANY) { + node = policy_node(gfp, pol, node); page = alloc_pages_preferred_many(gfp, order, node, pol); mpol_cond_put(pol); goto out; @@ -2186,7 +2266,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order) page = alloc_page_interleave(gfp, order, interleave_nodes(pol)); else if (pol->mode == MPOL_PREFERRED_MANY) page = alloc_pages_preferred_many(gfp, order, - numa_node_id(), pol); + policy_node(gfp, pol, numa_node_id()), pol); else page = __alloc_pages(gfp, order, policy_node(gfp, pol, numa_node_id()), @@ -2342,6 +2422,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b) return false; if (a->flags != b->flags) return false; + if (a->home_node != b->home_node) + return false; if (mpol_store_user_nodemask(a)) if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask)) return false; @@ -2885,7 +2967,7 @@ static const char * const policy_modes[] = * Format of input: * [=][:] * - * On success, returns 0, else 1 + * Return: %0 on success, else %1 */ int mpol_parse_str(char *str, struct mempolicy **mpol) { diff --git a/mm/migrate.c b/mm/migrate.c index 7079e6b7dbe7..18ce840914f0 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -50,6 +50,7 @@ #include #include #include +#include #include @@ -236,20 +237,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma, pte = pte_mkhuge(pte); pte = arch_make_huge_pte(pte, shift, vma->vm_flags); - set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); if (PageAnon(new)) hugepage_add_anon_rmap(new, vma, pvmw.address); else page_dup_rmap(new, true); + set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } else #endif { - set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); - if (PageAnon(new)) page_add_anon_rmap(new, vma, pvmw.address, false); else page_add_file_rmap(new, false); + set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte); } if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new)) mlock_vma_page(new); @@ -1084,80 +1084,6 @@ out: return rc; } - -/* - * node_demotion[] example: - * - * Consider a system with two sockets. Each socket has - * three classes of memory attached: fast, medium and slow. - * Each memory class is placed in its own NUMA node. The - * CPUs are placed in the node with the "fast" memory. The - * 6 NUMA nodes (0-5) might be split among the sockets like - * this: - * - * Socket A: 0, 1, 2 - * Socket B: 3, 4, 5 - * - * When Node 0 fills up, its memory should be migrated to - * Node 1. When Node 1 fills up, it should be migrated to - * Node 2. The migration path start on the nodes with the - * processors (since allocations default to this node) and - * fast memory, progress through medium and end with the - * slow memory: - * - * 0 -> 1 -> 2 -> stop - * 3 -> 4 -> 5 -> stop - * - * This is represented in the node_demotion[] like this: - * - * { 1, // Node 0 migrates to 1 - * 2, // Node 1 migrates to 2 - * -1, // Node 2 does not migrate - * 4, // Node 3 migrates to 4 - * 5, // Node 4 migrates to 5 - * -1} // Node 5 does not migrate - */ - -/* - * Writes to this array occur without locking. Cycles are - * not allowed: Node X demotes to Y which demotes to X... - * - * If multiple reads are performed, a single rcu_read_lock() - * must be held over all reads to ensure that no cycles are - * observed. - */ -static int node_demotion[MAX_NUMNODES] __read_mostly = - {[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE}; - -/** - * next_demotion_node() - Get the next node in the demotion path - * @node: The starting node to lookup the next node - * - * Return: node id for next memory node in the demotion path hierarchy - * from @node; NUMA_NO_NODE if @node is terminal. This does not keep - * @node online or guarantee that it *continues* to be the next demotion - * target. - */ -int next_demotion_node(int node) -{ - int target; - - /* - * node_demotion[] is updated without excluding this - * function from running. RCU doesn't provide any - * compiler barriers, so the READ_ONCE() is required - * to avoid compiler reordering or read merging. - * - * Make sure to use RCU over entire code blocks if - * node_demotion[] reads need to be consistent. - */ - rcu_read_lock(); - target = READ_ONCE(node_demotion[node]); - rcu_read_unlock(); - - return target; -} - /* * Obtain the lock on page, remove all ptes and migrate the page * to the newly allocated page in newpage. @@ -1413,7 +1339,7 @@ static inline int try_split_thp(struct page *page, struct page **page2, * @mode: The migration mode that specifies the constraints for * page migration, if any. * @reason: The reason for page migration. - * @ret_succeeded: Set to the number of pages migrated successfully if + * @ret_succeeded: Set to the number of normal pages migrated successfully if * the caller passes a non-NULL pointer. * * The function returns after 10 attempts or if no pages are movable any more @@ -1421,7 +1347,9 @@ static inline int try_split_thp(struct page *page, struct page **page2, * It is caller's responsibility to call putback_movable_pages() to return pages * to the LRU or free list only if ret != 0. * - * Returns the number of pages that were not migrated, or an error code. + * Returns the number of {normal page, THP, hugetlb} that were not migrated, or + * an error code. The number of THP splits will be considered as the number of + * non-migrated THP, no matter how many subpages of the THP are migrated successfully. */ int migrate_pages(struct list_head *from, new_page_t get_new_page, free_page_t put_new_page, unsigned long private, @@ -1430,6 +1358,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int retry = 1; int thp_retry = 1; int nr_failed = 0; + int nr_failed_pages = 0; int nr_succeeded = 0; int nr_thp_succeeded = 0; int nr_thp_failed = 0; @@ -1441,13 +1370,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page, int swapwrite = current->flags & PF_SWAPWRITE; int rc, nr_subpages; LIST_HEAD(ret_pages); + LIST_HEAD(thp_split_pages); bool nosplit = (reason == MR_NUMA_MISPLACED); + bool no_subpage_counting = false; trace_mm_migrate_pages_start(mode, reason); if (!swapwrite) current->flags |= PF_SWAPWRITE; +thp_subpage_migration: for (pass = 0; pass < 10 && (retry || thp_retry); pass++) { retry = 0; thp_retry = 0; @@ -1460,7 +1392,7 @@ retry: * during migration. */ is_thp = PageTransHuge(page) && !PageHuge(page); - nr_subpages = thp_nr_pages(page); + nr_subpages = compound_nr(page); cond_resched(); if (PageHuge(page)) @@ -1496,18 +1428,20 @@ retry: case -ENOSYS: /* THP migration is unsupported */ if (is_thp) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } /* Hugetlb migration is unsupported */ - nr_failed++; + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; case -ENOMEM: /* @@ -1516,16 +1450,19 @@ retry: * THP NUMA faulting doesn't split THP to retry. */ if (is_thp && !nosplit) { - if (!try_split_thp(page, &page2, from)) { + nr_thp_failed++; + if (!try_split_thp(page, &page2, &thp_split_pages)) { nr_thp_split++; goto retry; } - nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; goto out; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; goto out; case -EAGAIN: if (is_thp) { @@ -1535,12 +1472,11 @@ retry: retry++; break; case MIGRATEPAGE_SUCCESS: + nr_succeeded += nr_subpages; if (is_thp) { nr_thp_succeeded++; - nr_succeeded += nr_subpages; break; } - nr_succeeded++; break; default: /* @@ -1551,17 +1487,37 @@ retry: */ if (is_thp) { nr_thp_failed++; - nr_failed += nr_subpages; + nr_failed_pages += nr_subpages; break; } - nr_failed++; + + if (!no_subpage_counting) + nr_failed++; + nr_failed_pages += nr_subpages; break; } } } - nr_failed += retry + thp_retry; + nr_failed += retry; nr_thp_failed += thp_retry; - rc = nr_failed; + /* + * Try to migrate subpages of fail-to-migrate THPs, no nr_failed + * counting in this round, since all subpages of a THP is counted + * as 1 failure in the first round. + */ + if (!list_empty(&thp_split_pages)) { + /* + * Move non-migrated pages (after 10 retries) to ret_pages + * to avoid migrating them again. + */ + list_splice_init(from, &ret_pages); + list_splice_init(&thp_split_pages, from); + no_subpage_counting = true; + retry = 1; + goto thp_subpage_migration; + } + + rc = nr_failed + nr_thp_failed; out: /* * Put the permanent failure page back to migration list, they @@ -1570,11 +1526,11 @@ out: list_splice(&ret_pages, from); count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded); - count_vm_events(PGMIGRATE_FAIL, nr_failed); + count_vm_events(PGMIGRATE_FAIL, nr_failed_pages); count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded); count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed); count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split); - trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded, + trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded, nr_thp_failed, nr_thp_split, mode, reason); if (!swapwrite) @@ -2516,8 +2472,7 @@ static bool migrate_vma_check_page(struct page *page) static void migrate_vma_unmap(struct migrate_vma *migrate) { const unsigned long npages = migrate->npages; - const unsigned long start = migrate->start; - unsigned long addr, i, restore = 0; + unsigned long i, restore = 0; bool allow_drain = true; lru_add_drain(); @@ -2563,7 +2518,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate) } } - for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) { + for (i = 0; i < npages && restore; i++) { struct page *page = migrate_pfn_to_page(migrate->src[i]); if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE)) @@ -2961,14 +2916,152 @@ void migrate_vma_finalize(struct migrate_vma *migrate) EXPORT_SYMBOL(migrate_vma_finalize); #endif /* CONFIG_DEVICE_PRIVATE */ +/* + * node_demotion[] example: + * + * Consider a system with two sockets. Each socket has + * three classes of memory attached: fast, medium and slow. + * Each memory class is placed in its own NUMA node. The + * CPUs are placed in the node with the "fast" memory. The + * 6 NUMA nodes (0-5) might be split among the sockets like + * this: + * + * Socket A: 0, 1, 2 + * Socket B: 3, 4, 5 + * + * When Node 0 fills up, its memory should be migrated to + * Node 1. When Node 1 fills up, it should be migrated to + * Node 2. The migration path start on the nodes with the + * processors (since allocations default to this node) and + * fast memory, progress through medium and end with the + * slow memory: + * + * 0 -> 1 -> 2 -> stop + * 3 -> 4 -> 5 -> stop + * + * This is represented in the node_demotion[] like this: + * + * { nr=1, nodes[0]=1 }, // Node 0 migrates to 1 + * { nr=1, nodes[0]=2 }, // Node 1 migrates to 2 + * { nr=0, nodes[0]=-1 }, // Node 2 does not migrate + * { nr=1, nodes[0]=4 }, // Node 3 migrates to 4 + * { nr=1, nodes[0]=5 }, // Node 4 migrates to 5 + * { nr=0, nodes[0]=-1 }, // Node 5 does not migrate + * + * Moreover some systems may have multiple slow memory nodes. + * Suppose a system has one socket with 3 memory nodes, node 0 + * is fast memory type, and node 1/2 both are slow memory + * type, and the distance between fast memory node and slow + * memory node is same. So the migration path should be: + * + * 0 -> 1/2 -> stop + * + * This is represented in the node_demotion[] like this: + * { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2 + * { nr=0, nodes[0]=-1, }, // Node 1 dose not migrate + * { nr=0, nodes[0]=-1, }, // Node 2 does not migrate + */ + +/* + * Writes to this array occur without locking. Cycles are + * not allowed: Node X demotes to Y which demotes to X... + * + * If multiple reads are performed, a single rcu_read_lock() + * must be held over all reads to ensure that no cycles are + * observed. + */ +#define DEFAULT_DEMOTION_TARGET_NODES 15 + +#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES +#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1) +#else +#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES +#endif + +struct demotion_nodes { + unsigned short nr; + short nodes[DEMOTION_TARGET_NODES]; +}; + +static struct demotion_nodes *node_demotion __read_mostly; + +/** + * next_demotion_node() - Get the next node in the demotion path + * @node: The starting node to lookup the next node + * + * Return: node id for next memory node in the demotion path hierarchy + * from @node; NUMA_NO_NODE if @node is terminal. This does not keep + * @node online or guarantee that it *continues* to be the next demotion + * target. + */ +int next_demotion_node(int node) +{ + struct demotion_nodes *nd; + unsigned short target_nr, index; + int target; + + if (!node_demotion) + return NUMA_NO_NODE; + + nd = &node_demotion[node]; + + /* + * node_demotion[] is updated without excluding this + * function from running. RCU doesn't provide any + * compiler barriers, so the READ_ONCE() is required + * to avoid compiler reordering or read merging. + * + * Make sure to use RCU over entire code blocks if + * node_demotion[] reads need to be consistent. + */ + rcu_read_lock(); + target_nr = READ_ONCE(nd->nr); + + switch (target_nr) { + case 0: + target = NUMA_NO_NODE; + goto out; + case 1: + index = 0; + break; + default: + /* + * If there are multiple target nodes, just select one + * target node randomly. + * + * In addition, we can also use round-robin to select + * target node, but we should introduce another variable + * for node_demotion[] to record last selected target node, + * that may cause cache ping-pong due to the changing of + * last target node. Or introducing per-cpu data to avoid + * caching issue, which seems more complicated. So selecting + * target node randomly seems better until now. + */ + index = get_random_int() % target_nr; + break; + } + + target = READ_ONCE(nd->nodes[index]); + +out: + rcu_read_unlock(); + return target; +} + #if defined(CONFIG_HOTPLUG_CPU) /* Disable reclaim-based migration. */ static void __disable_all_migrate_targets(void) { - int node; + int node, i; - for_each_online_node(node) - node_demotion[node] = NUMA_NO_NODE; + if (!node_demotion) + return; + + for_each_online_node(node) { + node_demotion[node].nr = 0; + for (i = 0; i < DEMOTION_TARGET_NODES; i++) + node_demotion[node].nodes[i] = NUMA_NO_NODE; + } } static void disable_all_migrate_targets(void) @@ -2995,26 +3088,40 @@ static void disable_all_migrate_targets(void) * Failing here is OK. It might just indicate * being at the end of a chain. */ -static int establish_migrate_target(int node, nodemask_t *used) +static int establish_migrate_target(int node, nodemask_t *used, + int best_distance) { - int migration_target; + int migration_target, index, val; + struct demotion_nodes *nd; - /* - * Can not set a migration target on a - * node with it already set. - * - * No need for READ_ONCE() here since this - * in the write path for node_demotion[]. - * This should be the only thread writing. - */ - if (node_demotion[node] != NUMA_NO_NODE) + if (!node_demotion) return NUMA_NO_NODE; + nd = &node_demotion[node]; + migration_target = find_next_best_node(node, used); if (migration_target == NUMA_NO_NODE) return NUMA_NO_NODE; - node_demotion[node] = migration_target; + /* + * If the node has been set a migration target node before, + * which means it's the best distance between them. Still + * check if this node can be demoted to other target nodes + * if they have a same best distance. + */ + if (best_distance != -1) { + val = node_distance(node, migration_target); + if (val > best_distance) + return NUMA_NO_NODE; + } + + index = nd->nr; + if (WARN_ONCE(index >= DEMOTION_TARGET_NODES, + "Exceeds maximum demotion target nodes\n")) + return NUMA_NO_NODE; + + nd->nodes[index] = migration_target; + nd->nr++; return migration_target; } @@ -3030,7 +3137,9 @@ static int establish_migrate_target(int node, nodemask_t *used) * * The difference here is that cycles must be avoided. If * node0 migrates to node1, then neither node1, nor anything - * node1 migrates to can migrate to node0. + * node1 migrates to can migrate to node0. Also one node can + * be migrated to multiple nodes if the target nodes all have + * a same best-distance against the source node. * * This function can run simultaneously with readers of * node_demotion[]. However, it can not run simultaneously @@ -3042,7 +3151,7 @@ static void __set_migration_target_nodes(void) nodemask_t next_pass = NODE_MASK_NONE; nodemask_t this_pass = NODE_MASK_NONE; nodemask_t used_targets = NODE_MASK_NONE; - int node; + int node, best_distance; /* * Avoid any oddities like cycles that could occur @@ -3071,18 +3180,33 @@ again: * multiple source nodes to share a destination. */ nodes_or(used_targets, used_targets, this_pass); - for_each_node_mask(node, this_pass) { - int target_node = establish_migrate_target(node, &used_targets); - if (target_node == NUMA_NO_NODE) - continue; + for_each_node_mask(node, this_pass) { + best_distance = -1; /* - * Visit targets from this pass in the next pass. - * Eventually, every node will have been part of - * a pass, and will become set in 'used_targets'. + * Try to set up the migration path for the node, and the target + * migration nodes can be multiple, so doing a loop to find all + * the target nodes if they all have a best node distance. */ - node_set(target_node, next_pass); + do { + int target_node = + establish_migrate_target(node, &used_targets, + best_distance); + + if (target_node == NUMA_NO_NODE) + break; + + if (best_distance == -1) + best_distance = node_distance(node, target_node); + + /* + * Visit targets from this pass in the next pass. + * Eventually, every node will have been part of + * a pass, and will become set in 'used_targets'. + */ + node_set(target_node, next_pass); + } while (1); } /* * 'next_pass' contains nodes which became migration @@ -3183,6 +3307,11 @@ static int __init migrate_on_reclaim_init(void) { int ret; + node_demotion = kmalloc_array(nr_node_ids, + sizeof(struct demotion_nodes), + GFP_KERNEL); + WARN_ON(!node_demotion); + ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline", NULL, migration_offline_cpu); /* diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 3390316c8a32..3934ff500878 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -1058,7 +1058,7 @@ bool out_of_memory(struct oom_control *oc) if (!is_memcg_oom(oc)) { blocking_notifier_call_chain(&oom_notify_list, 0, &freed); - if (freed > 0) + if (freed > 0 && !is_sysrq_oom(oc)) /* Got some memory back in the last second. */ return true; } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d59023a676ed..d4205e5e41d1 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -63,6 +64,7 @@ #include #include #include +#include #include #include #include @@ -1307,6 +1309,7 @@ static __always_inline bool free_pages_prepare(struct page *page, if (memcg_kmem_enabled() && PageMemcgKmem(page)) __memcg_kmem_uncharge_page(page, order); reset_page_owner(page, order); + page_table_check_free(page, order); return false; } @@ -1346,6 +1349,7 @@ static __always_inline bool free_pages_prepare(struct page *page, page_cpupid_reset_last(page); page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP; reset_page_owner(page, order); + page_table_check_free(page, order); if (!PageHighMem(page)) { debug_check_no_locks_freed(page_address(page), @@ -2420,6 +2424,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order, } set_page_owner(page, order, gfp_flags); + page_table_check_alloc(page, order); } static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags, @@ -4214,7 +4219,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...) va_list args; static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1); - if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs)) + if ((gfp_mask & __GFP_NOWARN) || + !__ratelimit(&nopage_rs) || + ((gfp_mask & __GFP_DMA) && !has_managed_dma())) return; va_start(args, fmt); @@ -8224,7 +8231,7 @@ void __init mem_init_print_info(void) */ #define adj_init_size(start, end, size, pos, adj) \ do { \ - if (start <= pos && pos < end && size > adj) \ + if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \ size -= adj; \ } while (0) @@ -9268,8 +9275,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, * for allocation requests which can not be fulfilled with the buddy allocator. * * The allocated memory is always aligned to a page boundary. If nr_pages is a - * power of two then the alignment is guaranteed to be to the given nr_pages - * (e.g. 1GB request would be aligned to 1GB). + * power of two, then allocated range is also guaranteed to be aligned to same + * nr_pages (e.g. 1GB request would be aligned to 1GB). * * Allocated pages can be freed with free_contig_range() or by manually calling * __free_page() on each allocated page. @@ -9502,6 +9509,7 @@ bool take_page_off_buddy(struct page *page) del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); + SetPageHWPoisonTakenOff(page); if (!is_migrate_isolate(migratetype)) __mod_zone_freepage_state(zone, -1, migratetype); ret = true; @@ -9513,4 +9521,44 @@ bool take_page_off_buddy(struct page *page) spin_unlock_irqrestore(&zone->lock, flags); return ret; } + +/* + * Cancel takeoff done by take_page_off_buddy(). + */ +bool put_page_back_buddy(struct page *page) +{ + struct zone *zone = page_zone(page); + unsigned long pfn = page_to_pfn(page); + unsigned long flags; + int migratetype = get_pfnblock_migratetype(page, pfn); + bool ret = false; + + spin_lock_irqsave(&zone->lock, flags); + if (put_page_testzero(page)) { + ClearPageHWPoisonTakenOff(page); + __free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE); + if (TestClearPageHWPoison(page)) { + num_poisoned_pages_dec(); + ret = true; + } + } + spin_unlock_irqrestore(&zone->lock, flags); + + return ret; +} #endif + +#ifdef CONFIG_ZONE_DMA +bool has_managed_dma(void) +{ + struct pglist_data *pgdat; + + for_each_online_pgdat(pgdat) { + struct zone *zone = &pgdat->node_zones[ZONE_DMA]; + + if (managed_zone(zone)) + return true; + } + return false; +} +#endif /* CONFIG_ZONE_DMA */ diff --git a/mm/page_ext.c b/mm/page_ext.c index 6242afb24d84..2e66d934d63f 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -8,6 +8,7 @@ #include #include #include +#include /* * struct page extension @@ -63,18 +64,21 @@ static bool need_page_idle(void) { return true; } -struct page_ext_operations page_idle_ops = { +static struct page_ext_operations page_idle_ops __initdata = { .need = need_page_idle, }; #endif -static struct page_ext_operations *page_ext_ops[] = { +static struct page_ext_operations *page_ext_ops[] __initdata = { #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif #if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT) &page_idle_ops, #endif +#ifdef CONFIG_PAGE_TABLE_CHECK + &page_table_check_ops, +#endif }; unsigned long page_ext_size = sizeof(struct page_ext); diff --git a/mm/page_isolation.c b/mm/page_isolation.c index f67c4c70f17f..6a0ddda6b3c5 100644 --- a/mm/page_isolation.c +++ b/mm/page_isolation.c @@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype) * onlining - just onlined memory won't immediately be considered for * allocation. */ - if (!isolated_page) { + if (!isolated_page && PageBuddy(page)) { nr_pages = move_freepages_block(zone, page, migratetype, NULL); __mod_zone_freepage_state(zone, nr_pages, migratetype); } diff --git a/mm/page_owner.c b/mm/page_owner.c index 4f924957ce7a..5eea061bb1e5 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf) } early_param("page_owner", early_page_owner_param); -static bool need_page_owner(void) +static __init bool need_page_owner(void) { return page_owner_enabled; } @@ -75,7 +75,7 @@ static noinline void register_early_stack(void) early_handle = create_dummy_stack(); } -static void init_page_owner(void) +static __init void init_page_owner(void) { if (!page_owner_enabled) return; diff --git a/mm/page_table_check.c b/mm/page_table_check.c new file mode 100644 index 000000000000..7504e7caa2a1 --- /dev/null +++ b/mm/page_table_check.c @@ -0,0 +1,270 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Copyright (c) 2021, Google LLC. + * Pasha Tatashin + */ +#include +#include + +#undef pr_fmt +#define pr_fmt(fmt) "page_table_check: " fmt + +struct page_table_check { + atomic_t anon_map_count; + atomic_t file_map_count; +}; + +static bool __page_table_check_enabled __initdata = + IS_ENABLED(CONFIG_PAGE_TABLE_CHECK_ENFORCED); + +DEFINE_STATIC_KEY_TRUE(page_table_check_disabled); +EXPORT_SYMBOL(page_table_check_disabled); + +static int __init early_page_table_check_param(char *buf) +{ + if (!buf) + return -EINVAL; + + if (strcmp(buf, "on") == 0) + __page_table_check_enabled = true; + else if (strcmp(buf, "off") == 0) + __page_table_check_enabled = false; + + return 0; +} + +early_param("page_table_check", early_page_table_check_param); + +static bool __init need_page_table_check(void) +{ + return __page_table_check_enabled; +} + +static void __init init_page_table_check(void) +{ + if (!__page_table_check_enabled) + return; + static_branch_disable(&page_table_check_disabled); +} + +struct page_ext_operations page_table_check_ops = { + .size = sizeof(struct page_table_check), + .need = need_page_table_check, + .init = init_page_table_check, +}; + +static struct page_table_check *get_page_table_check(struct page_ext *page_ext) +{ + BUG_ON(!page_ext); + return (void *)(page_ext) + page_table_check_ops.offset; +} + +static inline bool pte_user_accessible_page(pte_t pte) +{ + return (pte_val(pte) & _PAGE_PRESENT) && (pte_val(pte) & _PAGE_USER); +} + +static inline bool pmd_user_accessible_page(pmd_t pmd) +{ + return pmd_leaf(pmd) && (pmd_val(pmd) & _PAGE_PRESENT) && + (pmd_val(pmd) & _PAGE_USER); +} + +static inline bool pud_user_accessible_page(pud_t pud) +{ + return pud_leaf(pud) && (pud_val(pud) & _PAGE_PRESENT) && + (pud_val(pud) & _PAGE_USER); +} + +/* + * An enty is removed from the page table, decrement the counters for that page + * verify that it is of correct type and counters do not become negative. + */ +static void page_table_check_clear(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt) +{ + struct page_ext *page_ext; + struct page *page; + bool anon; + int i; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_dec_return(&ptc->anon_map_count) < 0); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_dec_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * A new enty is added to the page table, increment the counters for that page + * verify that it is of correct type and is not being mapped with a different + * type to a different process. + */ +static void page_table_check_set(struct mm_struct *mm, unsigned long addr, + unsigned long pfn, unsigned long pgcnt, + bool rw) +{ + struct page_ext *page_ext; + struct page *page; + bool anon; + int i; + + if (!pfn_valid(pfn)) + return; + + page = pfn_to_page(pfn); + page_ext = lookup_page_ext(page); + anon = PageAnon(page); + + for (i = 0; i < pgcnt; i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + if (anon) { + BUG_ON(atomic_read(&ptc->file_map_count)); + BUG_ON(atomic_inc_return(&ptc->anon_map_count) > 1 && rw); + } else { + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_inc_return(&ptc->file_map_count) < 0); + } + page_ext = page_ext_next(page_ext); + } +} + +/* + * page is on free list, or is being allocated, verify that counters are zeroes + * crash if they are not. + */ +void __page_table_check_zero(struct page *page, unsigned int order) +{ + struct page_ext *page_ext = lookup_page_ext(page); + int i; + + BUG_ON(!page_ext); + for (i = 0; i < (1 << order); i++) { + struct page_table_check *ptc = get_page_table_check(page_ext); + + BUG_ON(atomic_read(&ptc->anon_map_count)); + BUG_ON(atomic_read(&ptc->file_map_count)); + page_ext = page_ext_next(page_ext); + } +} + +void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t pte) +{ + if (&init_mm == mm) + return; + + if (pte_user_accessible_page(pte)) { + page_table_check_clear(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pte_clear); + +void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr, + pmd_t pmd) +{ + if (&init_mm == mm) + return; + + if (pmd_user_accessible_page(pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_clear); + +void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr, + pud_t pud) +{ + if (&init_mm == mm) + return; + + if (pud_user_accessible_page(pud)) { + page_table_check_clear(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } +} +EXPORT_SYMBOL(__page_table_check_pud_clear); + +void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr, + pte_t *ptep, pte_t pte) +{ + pte_t old_pte; + + if (&init_mm == mm) + return; + + old_pte = *ptep; + if (pte_user_accessible_page(old_pte)) { + page_table_check_clear(mm, addr, pte_pfn(old_pte), + PAGE_SIZE >> PAGE_SHIFT); + } + + if (pte_user_accessible_page(pte)) { + page_table_check_set(mm, addr, pte_pfn(pte), + PAGE_SIZE >> PAGE_SHIFT, + pte_write(pte)); + } +} +EXPORT_SYMBOL(__page_table_check_pte_set); + +void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old_pmd; + + if (&init_mm == mm) + return; + + old_pmd = *pmdp; + if (pmd_user_accessible_page(old_pmd)) { + page_table_check_clear(mm, addr, pmd_pfn(old_pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT); + } + + if (pmd_user_accessible_page(pmd)) { + page_table_check_set(mm, addr, pmd_pfn(pmd), + PMD_PAGE_SIZE >> PAGE_SHIFT, + pmd_write(pmd)); + } +} +EXPORT_SYMBOL(__page_table_check_pmd_set); + +void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) +{ + pud_t old_pud; + + if (&init_mm == mm) + return; + + old_pud = *pudp; + if (pud_user_accessible_page(old_pud)) { + page_table_check_clear(mm, addr, pud_pfn(old_pud), + PUD_PAGE_SIZE >> PAGE_SHIFT); + } + + if (pud_user_accessible_page(pud)) { + page_table_check_set(mm, addr, pud_pfn(pud), + PUD_PAGE_SIZE >> PAGE_SHIFT, + pud_write(pud)); + } +} +EXPORT_SYMBOL(__page_table_check_pud_set); diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h index 639662c20c82..411d1593ef23 100644 --- a/mm/percpu-internal.h +++ b/mm/percpu-internal.h @@ -113,6 +113,24 @@ static inline int pcpu_chunk_map_bits(struct pcpu_chunk *chunk) return pcpu_nr_pages_to_map_bits(chunk->nr_pages); } +#ifdef CONFIG_MEMCG_KMEM +/** + * pcpu_obj_full_size - helper to calculate size of each accounted object + * @size: size of area to allocate in bytes + * + * For each accounted object there is an extra space which is used to store + * obj_cgroup membership. Charge it too. + */ +static inline size_t pcpu_obj_full_size(size_t size) +{ + size_t extra_size; + + extra_size = size / PCPU_MIN_ALLOC_SIZE * sizeof(struct obj_cgroup *); + + return size * num_possible_cpus() + extra_size; +} +#endif /* CONFIG_MEMCG_KMEM */ + #ifdef CONFIG_PERCPU_STATS #include diff --git a/mm/percpu.c b/mm/percpu.c index 537f2c0e1761..526c400ea78e 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -1635,7 +1635,7 @@ static bool pcpu_memcg_pre_alloc_hook(size_t size, gfp_t gfp, if (!objcg) return true; - if (obj_cgroup_charge(objcg, gfp, size * num_possible_cpus())) { + if (obj_cgroup_charge(objcg, gfp, pcpu_obj_full_size(size))) { obj_cgroup_put(objcg); return false; } @@ -1656,10 +1656,10 @@ static void pcpu_memcg_post_alloc_hook(struct obj_cgroup *objcg, rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - size * num_possible_cpus()); + pcpu_obj_full_size(size)); rcu_read_unlock(); } else { - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); obj_cgroup_put(objcg); } } @@ -1676,11 +1676,11 @@ static void pcpu_memcg_free_hook(struct pcpu_chunk *chunk, int off, size_t size) return; chunk->obj_cgroups[off >> PCPU_MIN_ALLOC_SHIFT] = NULL; - obj_cgroup_uncharge(objcg, size * num_possible_cpus()); + obj_cgroup_uncharge(objcg, pcpu_obj_full_size(size)); rcu_read_lock(); mod_memcg_state(obj_cgroup_memcg(objcg), MEMCG_PERCPU_B, - -(size * num_possible_cpus())); + -pcpu_obj_full_size(size)); rcu_read_unlock(); obj_cgroup_put(objcg); diff --git a/mm/rmap.c b/mm/rmap.c index 163ac4e6bcee..6a1e8c7f6213 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -621,9 +621,20 @@ void try_to_unmap_flush_dirty(void) try_to_unmap_flush(); } +/* + * Bits 0-14 of mm->tlb_flush_batched record pending generations. + * Bits 16-30 of mm->tlb_flush_batched bit record flushed generations. + */ +#define TLB_FLUSH_BATCH_FLUSHED_SHIFT 16 +#define TLB_FLUSH_BATCH_PENDING_MASK \ + ((1 << (TLB_FLUSH_BATCH_FLUSHED_SHIFT - 1)) - 1) +#define TLB_FLUSH_BATCH_PENDING_LARGE \ + (TLB_FLUSH_BATCH_PENDING_MASK / 2) + static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) { struct tlbflush_unmap_batch *tlb_ubc = ¤t->tlb_ubc; + int batch, nbatch; arch_tlbbatch_add_mm(&tlb_ubc->arch, mm); tlb_ubc->flush_required = true; @@ -633,7 +644,22 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable) * before the PTE is cleared. */ barrier(); - mm->tlb_flush_batched = true; + batch = atomic_read(&mm->tlb_flush_batched); +retry: + if ((batch & TLB_FLUSH_BATCH_PENDING_MASK) > TLB_FLUSH_BATCH_PENDING_LARGE) { + /* + * Prevent `pending' from catching up with `flushed' because of + * overflow. Reset `pending' and `flushed' to be 1 and 0 if + * `pending' becomes large. + */ + nbatch = atomic_cmpxchg(&mm->tlb_flush_batched, batch, 1); + if (nbatch != batch) { + batch = nbatch; + goto retry; + } + } else { + atomic_inc(&mm->tlb_flush_batched); + } /* * If the PTE was dirty then it's best to assume it's writable. The @@ -680,15 +706,18 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ void flush_tlb_batched_pending(struct mm_struct *mm) { - if (data_race(mm->tlb_flush_batched)) { - flush_tlb_mm(mm); + int batch = atomic_read(&mm->tlb_flush_batched); + int pending = batch & TLB_FLUSH_BATCH_PENDING_MASK; + int flushed = batch >> TLB_FLUSH_BATCH_FLUSHED_SHIFT; + if (pending != flushed) { + flush_tlb_mm(mm); /* - * Do not allow the compiler to re-order the clearing of - * tlb_flush_batched before the tlb is flushed. + * If the new TLB flushing is pending during flushing, leave + * mm->tlb_flush_batched as is, to avoid losing flushing. */ - barrier(); - mm->tlb_flush_batched = false; + atomic_cmpxchg(&mm->tlb_flush_batched, batch, + pending | (pending << TLB_FLUSH_BATCH_FLUSHED_SHIFT)); } } #else diff --git a/mm/shmem.c b/mm/shmem.c index cb6f49379299..66909efd0a1b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1546,8 +1546,7 @@ static struct page *shmem_alloc_hugepage(gfp_t gfp, return NULL; shmem_pseudo_vma_init(&pvma, info, hindex); - page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, numa_node_id(), - true); + page = alloc_pages_vma(gfp, HPAGE_PMD_ORDER, &pvma, 0, true); shmem_pseudo_vma_destroy(&pvma); if (page) prep_transhuge_page(page); diff --git a/mm/slab_common.c b/mm/slab_common.c index 40f2d1b01c88..23f2ab0713b7 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -819,7 +819,7 @@ void __init setup_kmalloc_cache_index_table(void) if (KMALLOC_MIN_SIZE >= 64) { /* - * The 96 byte size cache is not used if the alignment + * The 96 byte sized cache is not used if the alignment * is 64 byte. */ for (i = 64 + 8; i <= 96; i += 8) diff --git a/mm/swap.c b/mm/swap.c index 74f6b311d7ee..bcf3ac288b56 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -882,7 +882,7 @@ void lru_cache_disable(void) * all online CPUs so any calls of lru_cache_disabled wrapped by * local_lock or preemption disabled would be ordered by that. * The atomic operation doesn't need to have stronger ordering - * requirements because that is enforeced by the scheduling + * requirements because that is enforced by the scheduling * guarantees. */ __lru_add_drain_all(true); diff --git a/mm/swapfile.c b/mm/swapfile.c index e59e08ef46e1..caa9f81a0d15 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1601,31 +1601,30 @@ static bool page_swapped(struct page *page) return false; } -static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, +static int page_trans_huge_map_swapcount(struct page *page, int *total_swapcount) { - int i, map_swapcount, _total_mapcount, _total_swapcount; + int i, map_swapcount, _total_swapcount; unsigned long offset = 0; struct swap_info_struct *si; struct swap_cluster_info *ci = NULL; unsigned char *map = NULL; - int mapcount, swapcount = 0; + int swapcount = 0; /* hugetlbfs shouldn't call it */ VM_BUG_ON_PAGE(PageHuge(page), page); if (!IS_ENABLED(CONFIG_THP_SWAP) || likely(!PageTransCompound(page))) { - mapcount = page_trans_huge_mapcount(page, total_mapcount); if (PageSwapCache(page)) swapcount = page_swapcount(page); if (total_swapcount) *total_swapcount = swapcount; - return mapcount + swapcount; + return swapcount + page_trans_huge_mapcount(page); } page = compound_head(page); - _total_mapcount = _total_swapcount = map_swapcount = 0; + _total_swapcount = map_swapcount = 0; if (PageSwapCache(page)) { swp_entry_t entry; @@ -1639,8 +1638,7 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, if (map) ci = lock_cluster(si, offset); for (i = 0; i < HPAGE_PMD_NR; i++) { - mapcount = atomic_read(&page[i]._mapcount) + 1; - _total_mapcount += mapcount; + int mapcount = atomic_read(&page[i]._mapcount) + 1; if (map) { swapcount = swap_count(map[offset + i]); _total_swapcount += swapcount; @@ -1648,19 +1646,14 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, map_swapcount = max(map_swapcount, mapcount + swapcount); } unlock_cluster(ci); - if (PageDoubleMap(page)) { + + if (PageDoubleMap(page)) map_swapcount -= 1; - _total_mapcount -= HPAGE_PMD_NR; - } - mapcount = compound_mapcount(page); - map_swapcount += mapcount; - _total_mapcount += mapcount; - if (total_mapcount) - *total_mapcount = _total_mapcount; + if (total_swapcount) *total_swapcount = _total_swapcount; - return map_swapcount; + return map_swapcount + compound_mapcount(page); } /* @@ -1668,22 +1661,15 @@ static int page_trans_huge_map_swapcount(struct page *page, int *total_mapcount, * to it. And as a side-effect, free up its swap: because the old content * on disk will never be read, and seeking back there to write new content * later would only waste time away from clustering. - * - * NOTE: total_map_swapcount should not be relied upon by the caller if - * reuse_swap_page() returns false, but it may be always overwritten - * (see the other implementation for CONFIG_SWAP=n). */ -bool reuse_swap_page(struct page *page, int *total_map_swapcount) +bool reuse_swap_page(struct page *page) { - int count, total_mapcount, total_swapcount; + int count, total_swapcount; VM_BUG_ON_PAGE(!PageLocked(page), page); if (unlikely(PageKsm(page))) return false; - count = page_trans_huge_map_swapcount(page, &total_mapcount, - &total_swapcount); - if (total_map_swapcount) - *total_map_swapcount = total_mapcount + total_swapcount; + count = page_trans_huge_map_swapcount(page, &total_swapcount); if (count == 1 && PageSwapCache(page) && (likely(!PageTransCompound(page)) || /* The remaining swap count will be freed soon */ @@ -1917,14 +1903,14 @@ static int unuse_pte(struct vm_area_struct *vma, pmd_t *pmd, dec_mm_counter(vma->vm_mm, MM_SWAPENTS); inc_mm_counter(vma->vm_mm, MM_ANONPAGES); get_page(page); - set_pte_at(vma->vm_mm, addr, pte, - pte_mkold(mk_pte(page, vma->vm_page_prot))); if (page == swapcache) { page_add_anon_rmap(page, vma, addr, false); } else { /* ksm created a completely new copy */ page_add_new_anon_rmap(page, vma, addr, false); lru_cache_add_inactive_or_unevictable(page, vma); } + set_pte_at(vma->vm_mm, addr, pte, + pte_mkold(mk_pte(page, vma->vm_page_prot))); swap_free(entry); out: pte_unmap_unlock(pte, ptl); diff --git a/mm/util.c b/mm/util.c index 4eb081e02344..74c154caedfa 100644 --- a/mm/util.c +++ b/mm/util.c @@ -553,13 +553,10 @@ EXPORT_SYMBOL(vm_mmap); * Uses kmalloc to get the memory but if the allocation fails then falls back * to the vmalloc allocator. Use kvfree for freeing the memory. * - * Reclaim modifiers - __GFP_NORETRY and __GFP_NOFAIL are not supported. + * GFP_NOWAIT and GFP_ATOMIC are not supported, neither is the __GFP_NORETRY modifier. * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not - * fall back to vmalloc. - * * Return: pointer to the allocated memory of %NULL in case of failure */ void *kvmalloc_node(size_t size, gfp_t flags, int node) @@ -567,13 +564,6 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) gfp_t kmalloc_flags = flags; void *ret; - /* - * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) - * so the given set of flags has to be compatible. - */ - if ((flags & GFP_KERNEL) != GFP_KERNEL) - return kmalloc_node(size, flags, node); - /* * We want to attempt a large physically contiguous block first because * it is less likely to fragment multiple larger blocks and therefore @@ -586,6 +576,9 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) if (!(kmalloc_flags & __GFP_RETRY_MAYFAIL)) kmalloc_flags |= __GFP_NORETRY; + + /* nofail semantic is implemented by the vmalloc fallback */ + kmalloc_flags &= ~__GFP_NOFAIL; } ret = kmalloc_node(size, kmalloc_flags, node); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 80c6de4c425f..4165304d3547 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -2846,6 +2847,8 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * more permissive. */ if (!order) { + gfp_t bulk_gfp = gfp & ~__GFP_NOFAIL; + while (nr_allocated < nr_pages) { unsigned int nr, nr_pages_request; @@ -2863,12 +2866,12 @@ vm_area_alloc_pages(gfp_t gfp, int nid, * but mempolcy want to alloc memory by interleaving. */ if (IS_ENABLED(CONFIG_NUMA) && nid == NUMA_NO_NODE) - nr = alloc_pages_bulk_array_mempolicy(gfp, + nr = alloc_pages_bulk_array_mempolicy(bulk_gfp, nr_pages_request, pages + nr_allocated); else - nr = alloc_pages_bulk_array_node(gfp, nid, + nr = alloc_pages_bulk_array_node(bulk_gfp, nid, nr_pages_request, pages + nr_allocated); @@ -2923,11 +2926,14 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, { const gfp_t nested_gfp = (gfp_mask & GFP_RECLAIM_MASK) | __GFP_ZERO; const gfp_t orig_gfp_mask = gfp_mask; + bool nofail = gfp_mask & __GFP_NOFAIL; unsigned long addr = (unsigned long)area->addr; unsigned long size = get_vm_area_size(area); unsigned long array_size; unsigned int nr_small_pages = size >> PAGE_SHIFT; unsigned int page_order; + unsigned int flags; + int ret; array_size = (unsigned long)nr_small_pages * sizeof(struct page *); gfp_mask |= __GFP_NOWARN; @@ -2976,8 +2982,28 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask, goto fail; } - if (vmap_pages_range(addr, addr + size, prot, area->pages, - page_shift) < 0) { + /* + * page tables allocations ignore external gfp mask, enforce it + * by the scope API + */ + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + flags = memalloc_nofs_save(); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + flags = memalloc_noio_save(); + + do { + ret = vmap_pages_range(addr, addr + size, prot, area->pages, + page_shift); + if (nofail && (ret < 0)) + schedule_timeout_uninterruptible(1); + } while (nofail && (ret < 0)); + + if ((gfp_mask & (__GFP_FS | __GFP_IO)) == __GFP_IO) + memalloc_nofs_restore(flags); + else if ((gfp_mask & (__GFP_FS | __GFP_IO)) == 0) + memalloc_noio_restore(flags); + + if (ret < 0) { warn_alloc(orig_gfp_mask, NULL, "vmalloc error: size %lu, failed to map pages", area->nr_pages * PAGE_SIZE); @@ -3005,12 +3031,14 @@ fail: * * Allocate enough pages to cover @size from the page level * allocator with @gfp_mask flags. Please note that the full set of gfp - * flags are not supported. GFP_KERNEL would be a preferred allocation mode - * but GFP_NOFS and GFP_NOIO are supported as well. Zone modifiers are not - * supported. From the reclaim modifiers__GFP_DIRECT_RECLAIM is required (aka - * GFP_NOWAIT is not supported) and only __GFP_NOFAIL is supported (aka - * __GFP_NORETRY and __GFP_RETRY_MAYFAIL are not supported). - * __GFP_NOWARN can be used to suppress error messages about failures. + * flags are not supported. GFP_KERNEL, GFP_NOFS and GFP_NOIO are all + * supported. + * Zone modifiers are not supported. From the reclaim modifiers + * __GFP_DIRECT_RECLAIM is required (aka GFP_NOWAIT is not supported) + * and only __GFP_NOFAIL is supported (i.e. __GFP_NORETRY and + * __GFP_RETRY_MAYFAIL are not supported). + * + * __GFP_NOWARN can be used to suppress failures messages. * * Map them into contiguous kernel virtual space, using a pagetable * protection of @prot. @@ -3065,9 +3093,14 @@ again: VM_UNINITIALIZED | vm_flags, start, end, node, gfp_mask, caller); if (!area) { + bool nofail = gfp_mask & __GFP_NOFAIL; warn_alloc(gfp_mask, NULL, - "vmalloc error: size %lu, vm_struct allocation failed", - real_size); + "vmalloc error: size %lu, vm_struct allocation failed%s", + real_size, (nofail) ? ". Retrying." : ""); + if (nofail) { + schedule_timeout_uninterruptible(1); + goto again; + } goto fail; } diff --git a/mm/vmscan.c b/mm/vmscan.c index ad2327076c0f..a8a1b99f9822 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -954,7 +954,7 @@ out: return freed; } -void drop_slab_node(int nid) +static void drop_slab_node(int nid) { unsigned long freed; int shift = 0; diff --git a/mm/vmstat.c b/mm/vmstat.c index 952897342ab6..c44c2cba2825 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1351,6 +1351,9 @@ const char * const vmstat_text[] = { "thp_split_page_failed", "thp_deferred_split_page", "thp_split_pmd", + "thp_scan_exceed_none_pte", + "thp_scan_exceed_swap_pte", + "thp_scan_exceed_share_pte", #ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD "thp_split_pud", #endif diff --git a/mm/zpool.c b/mm/zpool.c index 6d9ed48141e5..68facc193496 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -24,16 +24,11 @@ struct zpool { const struct zpool_ops *ops; bool evictable; bool can_sleep_mapped; - - struct list_head list; }; static LIST_HEAD(drivers_head); static DEFINE_SPINLOCK(drivers_lock); -static LIST_HEAD(pools_head); -static DEFINE_SPINLOCK(pools_lock); - /** * zpool_register_driver() - register a zpool implementation. * @driver: driver to register @@ -195,10 +190,6 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, pr_debug("created pool type %s\n", type); - spin_lock(&pools_lock); - list_add(&zpool->list, &pools_head); - spin_unlock(&pools_lock); - return zpool; } @@ -217,9 +208,6 @@ void zpool_destroy_pool(struct zpool *zpool) { pr_debug("destroying pool type %s\n", zpool->driver->type); - spin_lock(&pools_lock); - list_del(&zpool->list); - spin_unlock(&pools_lock); zpool->driver->destroy(zpool->pool); zpool_put_driver(zpool->driver); kfree(zpool); diff --git a/net/ceph/buffer.c b/net/ceph/buffer.c index 5622763ad402..7e51f128045d 100644 --- a/net/ceph/buffer.c +++ b/net/ceph/buffer.c @@ -7,7 +7,7 @@ #include #include -#include /* for ceph_kvmalloc */ +#include /* for kvmalloc */ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) { @@ -17,7 +17,7 @@ struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp) if (!b) return NULL; - b->vec.iov_base = ceph_kvmalloc(len, gfp); + b->vec.iov_base = kvmalloc(len, gfp); if (!b->vec.iov_base) { kfree(b); return NULL; diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97d6ea763e32..9441b4a4912b 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -190,33 +190,6 @@ int ceph_compare_options(struct ceph_options *new_opt, } EXPORT_SYMBOL(ceph_compare_options); -/* - * kvmalloc() doesn't fall back to the vmalloc allocator unless flags are - * compatible with (a superset of) GFP_KERNEL. This is because while the - * actual pages are allocated with the specified flags, the page table pages - * are always allocated with GFP_KERNEL. - * - * ceph_kvmalloc() may be called with GFP_KERNEL, GFP_NOFS or GFP_NOIO. - */ -void *ceph_kvmalloc(size_t size, gfp_t flags) -{ - void *p; - - if ((flags & (__GFP_IO | __GFP_FS)) == (__GFP_IO | __GFP_FS)) { - p = kvmalloc(size, flags); - } else if ((flags & (__GFP_IO | __GFP_FS)) == __GFP_IO) { - unsigned int nofs_flag = memalloc_nofs_save(); - p = kvmalloc(size, GFP_KERNEL); - memalloc_nofs_restore(nofs_flag); - } else { - unsigned int noio_flag = memalloc_noio_save(); - p = kvmalloc(size, GFP_KERNEL); - memalloc_noio_restore(noio_flag); - } - - return p; -} - static int parse_fsid(const char *str, struct ceph_fsid *fsid) { int i = 0; diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c index 92d89b331645..051d22c0e4ad 100644 --- a/net/ceph/crypto.c +++ b/net/ceph/crypto.c @@ -147,7 +147,7 @@ void ceph_crypto_key_destroy(struct ceph_crypto_key *key) static const u8 *aes_iv = (u8 *)CEPH_AES_IV; /* - * Should be used for buffers allocated with ceph_kvmalloc(). + * Should be used for buffers allocated with kvmalloc(). * Currently these are encrypt out-buffer (ceph_buffer) and decrypt * in-buffer (msg front). * diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 57d043b382ed..7b891be799d2 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1920,7 +1920,7 @@ struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items, /* front */ if (front_len) { - m->front.iov_base = ceph_kvmalloc(front_len, flags); + m->front.iov_base = kvmalloc(front_len, flags); if (m->front.iov_base == NULL) { dout("ceph_msg_new can't allocate %d bytes\n", front_len); diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index cc40ce4e02fb..c4099b641b38 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -308,7 +308,7 @@ static void *alloc_conn_buf(struct ceph_connection *con, int len) if (WARN_ON(con->v2.conn_buf_cnt >= ARRAY_SIZE(con->v2.conn_bufs))) return NULL; - buf = ceph_kvmalloc(len, GFP_NOIO); + buf = kvmalloc(len, GFP_NOIO); if (!buf) return NULL; diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c index 75b738083523..2823bb3cff55 100644 --- a/net/ceph/osdmap.c +++ b/net/ceph/osdmap.c @@ -980,7 +980,7 @@ static struct crush_work *alloc_workspace(const struct crush_map *c) work_size = crush_work_size(c, CEPH_PG_MAX_SIZE); dout("%s work_size %zu bytes\n", __func__, work_size); - work = ceph_kvmalloc(work_size, GFP_NOIO); + work = kvmalloc(work_size, GFP_NOIO); if (!work) return NULL; @@ -1190,9 +1190,9 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (max == map->max_osd) return 0; - state = ceph_kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); - weight = ceph_kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); - addr = ceph_kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); + state = kvmalloc(array_size(max, sizeof(*state)), GFP_NOFS); + weight = kvmalloc(array_size(max, sizeof(*weight)), GFP_NOFS); + addr = kvmalloc(array_size(max, sizeof(*addr)), GFP_NOFS); if (!state || !weight || !addr) { kvfree(state); kvfree(weight); @@ -1222,7 +1222,7 @@ static int osdmap_set_max_osd(struct ceph_osdmap *map, u32 max) if (map->osd_primary_affinity) { u32 *affinity; - affinity = ceph_kvmalloc(array_size(max, sizeof(*affinity)), + affinity = kvmalloc(array_size(max, sizeof(*affinity)), GFP_NOFS); if (!affinity) return -ENOMEM; @@ -1503,7 +1503,7 @@ static int set_primary_affinity(struct ceph_osdmap *map, int osd, u32 aff) if (!map->osd_primary_affinity) { int i; - map->osd_primary_affinity = ceph_kvmalloc( + map->osd_primary_affinity = kvmalloc( array_size(map->max_osd, sizeof(*map->osd_primary_affinity)), GFP_NOFS); if (!map->osd_primary_affinity) diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 1e99ba1b9d72..9cb18b822ab2 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -6,6 +6,7 @@ */ #include +#include #include #include #include @@ -688,7 +689,7 @@ static int svc_alloc_arg(struct svc_rqst *rqstp) return -EINTR; } trace_svc_alloc_arg_err(pages); - schedule_timeout(msecs_to_jiffies(500)); + memalloc_retry_wait(GFP_KERNEL); } rqstp->rq_page_end = &rqstp->rq_pages[pages]; rqstp->rq_pages[pages] = NULL; /* this might be seen in nfsd_splice_actor() */ diff --git a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh index fe8fcfb334e0..a5cb4b09a46c 100644 --- a/tools/testing/selftests/vm/charge_reserved_hugetlb.sh +++ b/tools/testing/selftests/vm/charge_reserved_hugetlb.sh @@ -24,19 +24,23 @@ if [[ "$1" == "-cgroup-v2" ]]; then reservation_usage_file=rsvd.current fi -cgroup_path=/dev/cgroup/memory -if [[ ! -e $cgroup_path ]]; then - mkdir -p $cgroup_path - if [[ $cgroup2 ]]; then +if [[ $cgroup2 ]]; then + cgroup_path=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory mount -t cgroup2 none $cgroup_path - else + do_umount=1 + fi + echo "+hugetlb" >$cgroup_path/cgroup.subtree_control +else + cgroup_path=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$cgroup_path" ]]; then + cgroup_path=/dev/cgroup/memory mount -t cgroup memory,hugetlb $cgroup_path + do_umount=1 fi fi - -if [[ $cgroup2 ]]; then - echo "+hugetlb" >/dev/cgroup/memory/cgroup.subtree_control -fi +export cgroup_path function cleanup() { if [[ $cgroup2 ]]; then @@ -108,7 +112,7 @@ function setup_cgroup() { function wait_for_hugetlb_memory_to_get_depleted() { local cgroup="$1" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" # Wait for hugetlbfs memory to get depleted. while [ $(cat $path) != 0 ]; do echo Waiting for hugetlb memory to get depleted. @@ -121,7 +125,7 @@ function wait_for_hugetlb_memory_to_get_reserved() { local cgroup="$1" local size="$2" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$reservation_usage_file" # Wait for hugetlbfs memory to get written. while [ $(cat $path) != $size ]; do echo Waiting for hugetlb memory reservation to reach size $size. @@ -134,7 +138,7 @@ function wait_for_hugetlb_memory_to_get_written() { local cgroup="$1" local size="$2" - local path="/dev/cgroup/memory/$cgroup/hugetlb.${MB}MB.$fault_usage_file" + local path="$cgroup_path/$cgroup/hugetlb.${MB}MB.$fault_usage_file" # Wait for hugetlbfs memory to get written. while [ $(cat $path) != $size ]; do echo Waiting for hugetlb memory to reach size $size. @@ -574,5 +578,7 @@ for populate in "" "-o"; do done # populate done # method -umount $cgroup_path -rmdir $cgroup_path +if [[ $do_umount ]]; then + umount $cgroup_path + rmdir $cgroup_path +fi diff --git a/tools/testing/selftests/vm/hmm-tests.c b/tools/testing/selftests/vm/hmm-tests.c index 864f126ffd78..203323967b50 100644 --- a/tools/testing/selftests/vm/hmm-tests.c +++ b/tools/testing/selftests/vm/hmm-tests.c @@ -1248,6 +1248,48 @@ TEST_F(hmm, anon_teardown) } } +/* + * Test memory snapshot without faulting in pages accessed by the device. + */ +TEST_F(hmm, mixedmap) +{ + struct hmm_buffer *buffer; + unsigned long npages; + unsigned long size; + unsigned char *m; + int ret; + + npages = 1; + size = npages << self->page_shift; + + buffer = malloc(sizeof(*buffer)); + ASSERT_NE(buffer, NULL); + + buffer->fd = -1; + buffer->size = size; + buffer->mirror = malloc(npages); + ASSERT_NE(buffer->mirror, NULL); + + + /* Reserve a range of addresses. */ + buffer->ptr = mmap(NULL, size, + PROT_READ | PROT_WRITE, + MAP_PRIVATE, + self->fd, 0); + ASSERT_NE(buffer->ptr, MAP_FAILED); + + /* Simulate a device snapshotting CPU pagetables. */ + ret = hmm_dmirror_cmd(self->fd, HMM_DMIRROR_SNAPSHOT, buffer, npages); + ASSERT_EQ(ret, 0); + ASSERT_EQ(buffer->cpages, npages); + + /* Check what the device saw. */ + m = buffer->mirror; + ASSERT_EQ(m[0], HMM_DMIRROR_PROT_READ); + + hmm_buffer_free(buffer); +} + /* * Test memory snapshot without faulting in pages accessed by the device. */ diff --git a/tools/testing/selftests/vm/hugepage-mremap.c b/tools/testing/selftests/vm/hugepage-mremap.c index 257df94697a5..2a7c33631a29 100644 --- a/tools/testing/selftests/vm/hugepage-mremap.c +++ b/tools/testing/selftests/vm/hugepage-mremap.c @@ -4,7 +4,11 @@ * * Example of remapping huge page memory in a user application using the * mremap system call. Code assumes a hugetlbfs filesystem is mounted - * at './huge'. The code will use 10MB worth of huge pages. + * at './huge'. The amount of memory used by this test is decided by a command + * line argument in MBs. If missing, the default amount is 10MB. + * + * To make sure the test triggers pmd sharing and goes through the 'unshare' + * path in the mremap code use 1GB (1024) or more. */ #define _GNU_SOURCE @@ -18,8 +22,10 @@ #include #include -#define LENGTH (1UL * 1024 * 1024 * 1024) +#define DEFAULT_LENGTH_MB 10UL +#define MB_TO_BYTES(x) (x * 1024 * 1024) +#define FILE_NAME "huge/hugepagefile" #define PROTECTION (PROT_READ | PROT_WRITE | PROT_EXEC) #define FLAGS (MAP_SHARED | MAP_ANONYMOUS) @@ -28,20 +34,20 @@ static void check_bytes(char *addr) printf("First hex is %x\n", *((unsigned int *)addr)); } -static void write_bytes(char *addr) +static void write_bytes(char *addr, size_t len) { unsigned long i; - for (i = 0; i < LENGTH; i++) + for (i = 0; i < len; i++) *(addr + i) = (char)i; } -static int read_bytes(char *addr) +static int read_bytes(char *addr, size_t len) { unsigned long i; check_bytes(addr); - for (i = 0; i < LENGTH; i++) + for (i = 0; i < len; i++) if (*(addr + i) != (char)i) { printf("Mismatch at %lu\n", i); return 1; @@ -99,11 +105,19 @@ static void register_region_with_uffd(char *addr, size_t len) } } -int main(void) +int main(int argc, char *argv[]) { + /* Read memory length as the first arg if valid, otherwise fallback to + * the default length. Any additional args are ignored. + */ + size_t length = argc > 1 ? (size_t)atoi(argv[1]) : 0UL; + + length = length > 0 ? length : DEFAULT_LENGTH_MB; + length = MB_TO_BYTES(length); + int ret = 0; - int fd = open("/huge/test", O_CREAT | O_RDWR, 0755); + int fd = open(FILE_NAME, O_CREAT | O_RDWR, 0755); if (fd < 0) { perror("Open failed"); @@ -112,7 +126,7 @@ int main(void) /* mmap to a PUD aligned address to hopefully trigger pmd sharing. */ unsigned long suggested_addr = 0x7eaa40000000; - void *haddr = mmap((void *)suggested_addr, LENGTH, PROTECTION, + void *haddr = mmap((void *)suggested_addr, length, PROTECTION, MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); printf("Map haddr: Returned address is %p\n", haddr); if (haddr == MAP_FAILED) { @@ -122,7 +136,7 @@ int main(void) /* mmap again to a dummy address to hopefully trigger pmd sharing. */ suggested_addr = 0x7daa40000000; - void *daddr = mmap((void *)suggested_addr, LENGTH, PROTECTION, + void *daddr = mmap((void *)suggested_addr, length, PROTECTION, MAP_HUGETLB | MAP_SHARED | MAP_POPULATE, fd, 0); printf("Map daddr: Returned address is %p\n", daddr); if (daddr == MAP_FAILED) { @@ -132,16 +146,16 @@ int main(void) suggested_addr = 0x7faa40000000; void *vaddr = - mmap((void *)suggested_addr, LENGTH, PROTECTION, FLAGS, -1, 0); + mmap((void *)suggested_addr, length, PROTECTION, FLAGS, -1, 0); printf("Map vaddr: Returned address is %p\n", vaddr); if (vaddr == MAP_FAILED) { perror("mmap2"); exit(1); } - register_region_with_uffd(haddr, LENGTH); + register_region_with_uffd(haddr, length); - void *addr = mremap(haddr, LENGTH, LENGTH, + void *addr = mremap(haddr, length, length, MREMAP_MAYMOVE | MREMAP_FIXED, vaddr); if (addr == MAP_FAILED) { perror("mremap"); @@ -150,10 +164,10 @@ int main(void) printf("Mremap: Returned address is %p\n", addr); check_bytes(addr); - write_bytes(addr); - ret = read_bytes(addr); + write_bytes(addr, length); + ret = read_bytes(addr, length); - munmap(addr, LENGTH); + munmap(addr, length); return ret; } diff --git a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh index 4a9a3afe9fd4..bf2d2a684edf 100644 --- a/tools/testing/selftests/vm/hugetlb_reparenting_test.sh +++ b/tools/testing/selftests/vm/hugetlb_reparenting_test.sh @@ -18,19 +18,24 @@ if [[ "$1" == "-cgroup-v2" ]]; then usage_file=current fi -CGROUP_ROOT='/dev/cgroup/memory' -MNT='/mnt/huge/' -if [[ ! -e $CGROUP_ROOT ]]; then - mkdir -p $CGROUP_ROOT - if [[ $cgroup2 ]]; then +if [[ $cgroup2 ]]; then + CGROUP_ROOT=$(mount -t cgroup2 | head -1 | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory mount -t cgroup2 none $CGROUP_ROOT - sleep 1 - echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control - else + do_umount=1 + fi + echo "+hugetlb +memory" >$CGROUP_ROOT/cgroup.subtree_control +else + CGROUP_ROOT=$(mount -t cgroup | grep ",hugetlb" | awk -e '{print $3}') + if [[ -z "$CGROUP_ROOT" ]]; then + CGROUP_ROOT=/dev/cgroup/memory mount -t cgroup memory,hugetlb $CGROUP_ROOT + do_umount=1 fi fi +MNT='/mnt/huge/' function get_machine_hugepage_size() { hpz=$(grep -i hugepagesize /proc/meminfo) diff --git a/tools/testing/selftests/vm/run_vmtests.sh b/tools/testing/selftests/vm/run_vmtests.sh index a24d30af3094..75d401741394 100755 --- a/tools/testing/selftests/vm/run_vmtests.sh +++ b/tools/testing/selftests/vm/run_vmtests.sh @@ -111,7 +111,7 @@ fi echo "-----------------------" echo "running hugepage-mremap" echo "-----------------------" -./hugepage-mremap +./hugepage-mremap 256 if [ $? -ne 0 ]; then echo "[FAIL]" exitcode=1 diff --git a/tools/testing/selftests/vm/userfaultfd.c b/tools/testing/selftests/vm/userfaultfd.c index 990f7aecf4a3..d3fd24f9fae8 100644 --- a/tools/testing/selftests/vm/userfaultfd.c +++ b/tools/testing/selftests/vm/userfaultfd.c @@ -87,7 +87,7 @@ static bool test_uffdio_minor = false; static bool map_shared; static int shm_fd; -static int huge_fd = -1; /* only used for hugetlb_shared test */ +static int huge_fd; static char *huge_fd_off0; static unsigned long long *count_verify; static int uffd = -1; @@ -223,9 +223,6 @@ static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset) static void hugetlb_release_pages(char *rel_area) { - if (huge_fd == -1) - return; - if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, rel_area == huge_fd_off0 ? 0 : nr_pages * page_size, nr_pages * page_size)) @@ -238,17 +235,17 @@ static void hugetlb_allocate_area(void **alloc_area) char **alloc_area_alias; *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - map_shared ? MAP_SHARED : - MAP_PRIVATE | MAP_HUGETLB | + (map_shared ? MAP_SHARED : MAP_PRIVATE) | + MAP_HUGETLB | (*alloc_area == area_src ? 0 : MAP_NORESERVE), - huge_fd, - *alloc_area == area_src ? 0 : nr_pages * page_size); + huge_fd, *alloc_area == area_src ? 0 : + nr_pages * page_size); if (*alloc_area == MAP_FAILED) err("mmap of hugetlbfs file failed"); if (map_shared) { area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, - MAP_SHARED, + MAP_SHARED | MAP_HUGETLB, huge_fd, *alloc_area == area_src ? 0 : nr_pages * page_size); if (area_alias == MAP_FAILED) @@ -648,7 +645,7 @@ static int uffd_read_msg(int ufd, struct uffd_msg *msg) if (ret != sizeof(*msg)) { if (ret < 0) { - if (errno == EAGAIN) + if (errno == EAGAIN || errno == EINTR) return 1; err("blocking read error"); } else { @@ -724,8 +721,11 @@ static void *uffd_poll_thread(void *arg) for (;;) { ret = poll(pollfd, 2, -1); - if (ret <= 0) + if (ret <= 0) { + if (errno == EINTR || errno == EAGAIN) + continue; err("poll error: %d", ret); + } if (pollfd[1].revents & POLLIN) { if (read(pollfd[1].fd, &tmp_chr, 1) != 1) err("read pipefd error"); diff --git a/tools/testing/selftests/vm/write_hugetlb_memory.sh b/tools/testing/selftests/vm/write_hugetlb_memory.sh index d3d0d108924d..70a02301f4c2 100644 --- a/tools/testing/selftests/vm/write_hugetlb_memory.sh +++ b/tools/testing/selftests/vm/write_hugetlb_memory.sh @@ -14,7 +14,7 @@ want_sleep=$8 reserve=$9 echo "Putting task in cgroup '$cgroup'" -echo $$ > /dev/cgroup/memory/"$cgroup"/cgroup.procs +echo $$ > ${cgroup_path:-/dev/cgroup/memory}/"$cgroup"/cgroup.procs echo "Method is $method"