mirror of
https://github.com/hardkernel/linux.git
synced 2026-06-06 19:08:57 +09:00
Merge f56caedaf9 ("Merge branch 'akpm' (patches from Andrew)") into android-mainline
Steps on the way to 5.17-rc1 Signed-off-by: Greg Kroah-Hartman <gregkh@google.com> Change-Id: I1c65cc3cfdec641b4810661364847b46c1d3ebfb
This commit is contained in:
@@ -29,12 +29,14 @@ Brief summary of control files::
|
||||
hugetlb.<hugepagesize>.max_usage_in_bytes # show max "hugepagesize" hugetlb usage recorded
|
||||
hugetlb.<hugepagesize>.usage_in_bytes # show current usage for "hugepagesize" hugetlb
|
||||
hugetlb.<hugepagesize>.failcnt # show the number of allocation failure due to HugeTLB usage limit
|
||||
hugetlb.<hugepagesize>.numa_stat # show the numa information of the hugetlb memory charged to this cgroup
|
||||
|
||||
For a system supporting three hugepage sizes (64k, 32M and 1G), the control
|
||||
files include::
|
||||
|
||||
hugetlb.1GB.limit_in_bytes
|
||||
hugetlb.1GB.max_usage_in_bytes
|
||||
hugetlb.1GB.numa_stat
|
||||
hugetlb.1GB.usage_in_bytes
|
||||
hugetlb.1GB.failcnt
|
||||
hugetlb.1GB.rsvd.limit_in_bytes
|
||||
@@ -43,6 +45,7 @@ files include::
|
||||
hugetlb.1GB.rsvd.failcnt
|
||||
hugetlb.64KB.limit_in_bytes
|
||||
hugetlb.64KB.max_usage_in_bytes
|
||||
hugetlb.64KB.numa_stat
|
||||
hugetlb.64KB.usage_in_bytes
|
||||
hugetlb.64KB.failcnt
|
||||
hugetlb.64KB.rsvd.limit_in_bytes
|
||||
@@ -51,6 +54,7 @@ files include::
|
||||
hugetlb.64KB.rsvd.failcnt
|
||||
hugetlb.32MB.limit_in_bytes
|
||||
hugetlb.32MB.max_usage_in_bytes
|
||||
hugetlb.32MB.numa_stat
|
||||
hugetlb.32MB.usage_in_bytes
|
||||
hugetlb.32MB.failcnt
|
||||
hugetlb.32MB.rsvd.limit_in_bytes
|
||||
|
||||
@@ -2266,6 +2266,11 @@ HugeTLB Interface Files
|
||||
are local to the cgroup i.e. not hierarchical. The file modified event
|
||||
generated on this file reflects only the local events.
|
||||
|
||||
hugetlb.<hugepagesize>.numa_stat
|
||||
Similar to memory.numa_stat, it shows the numa information of the
|
||||
hugetlb pages of <hugepagesize> in this cgroup. Only active in
|
||||
use hugetlb pages are included. The per-node values are in bytes.
|
||||
|
||||
Misc
|
||||
----
|
||||
|
||||
|
||||
@@ -208,6 +208,31 @@ PID of the DAMON thread.
|
||||
If DAMON_RECLAIM is enabled, this becomes the PID of the worker thread. Else,
|
||||
-1.
|
||||
|
||||
nr_reclaim_tried_regions
|
||||
------------------------
|
||||
|
||||
Number of memory regions that DAMON_RECLAIM tried to reclaim.
|
||||
|
||||
bytes_reclaim_tried_regions
|
||||
---------------------------
|
||||
|
||||
Total bytes of memory regions that DAMON_RECLAIM tried to reclaim.
|
||||
|
||||
nr_reclaimed_regions
|
||||
--------------------
|
||||
|
||||
Number of memory regions that were successfully reclaimed by DAMON_RECLAIM.
|
||||
|
||||
bytes_reclaimed_regions
|
||||
-----------------------
|
||||
|
||||
Total bytes of memory regions that were successfully reclaimed by DAMON_RECLAIM.
|
||||
|
||||
nr_quota_exceeds
|
||||
----------------
|
||||
|
||||
Number of times that the time/space quota limits have been exceeded.
|
||||
|
||||
Example
|
||||
=======
|
||||
|
||||
|
||||
@@ -7,37 +7,40 @@ Detailed Usages
|
||||
DAMON provides below three interfaces for different users.
|
||||
|
||||
- *DAMON user space tool.*
|
||||
This is for privileged people such as system administrators who want a
|
||||
just-working human-friendly interface. Using this, users can use the DAMON’s
|
||||
major features in a human-friendly way. It may not be highly tuned for
|
||||
special cases, though. It supports both virtual and physical address spaces
|
||||
monitoring.
|
||||
`This <https://github.com/awslabs/damo>`_ is for privileged people such as
|
||||
system administrators who want a just-working human-friendly interface.
|
||||
Using this, users can use the DAMON’s major features in a human-friendly way.
|
||||
It may not be highly tuned for special cases, though. It supports both
|
||||
virtual and physical address spaces monitoring. For more detail, please
|
||||
refer to its `usage document
|
||||
<https://github.com/awslabs/damo/blob/next/USAGE.md>`_.
|
||||
- *debugfs interface.*
|
||||
This is for privileged user space programmers who want more optimized use of
|
||||
DAMON. Using this, users can use DAMON’s major features by reading
|
||||
from and writing to special debugfs files. Therefore, you can write and use
|
||||
your personalized DAMON debugfs wrapper programs that reads/writes the
|
||||
debugfs files instead of you. The DAMON user space tool is also a reference
|
||||
implementation of such programs. It supports both virtual and physical
|
||||
address spaces monitoring.
|
||||
:ref:`This <debugfs_interface>` is for privileged user space programmers who
|
||||
want more optimized use of DAMON. Using this, users can use DAMON’s major
|
||||
features by reading from and writing to special debugfs files. Therefore,
|
||||
you can write and use your personalized DAMON debugfs wrapper programs that
|
||||
reads/writes the debugfs files instead of you. The `DAMON user space tool
|
||||
<https://github.com/awslabs/damo>`_ is one example of such programs. It
|
||||
supports both virtual and physical address spaces monitoring. Note that this
|
||||
interface provides only simple :ref:`statistics <damos_stats>` for the
|
||||
monitoring results. For detailed monitoring results, DAMON provides a
|
||||
:ref:`tracepoint <tracepoint>`.
|
||||
- *Kernel Space Programming Interface.*
|
||||
This is for kernel space programmers. Using this, users can utilize every
|
||||
feature of DAMON most flexibly and efficiently by writing kernel space
|
||||
DAMON application programs for you. You can even extend DAMON for various
|
||||
address spaces.
|
||||
:doc:`This </vm/damon/api>` is for kernel space programmers. Using this,
|
||||
users can utilize every feature of DAMON most flexibly and efficiently by
|
||||
writing kernel space DAMON application programs for you. You can even extend
|
||||
DAMON for various address spaces. For detail, please refer to the interface
|
||||
:doc:`document </vm/damon/api>`.
|
||||
|
||||
Nevertheless, you could write your own user space tool using the debugfs
|
||||
interface. A reference implementation is available at
|
||||
https://github.com/awslabs/damo. If you are a kernel programmer, you could
|
||||
refer to :doc:`/vm/damon/api` for the kernel space programming interface. For
|
||||
the reason, this document describes only the debugfs interface
|
||||
|
||||
.. _debugfs_interface:
|
||||
|
||||
debugfs Interface
|
||||
=================
|
||||
|
||||
DAMON exports five files, ``attrs``, ``target_ids``, ``init_regions``,
|
||||
``schemes`` and ``monitor_on`` under its debugfs directory,
|
||||
``<debugfs>/damon/``.
|
||||
DAMON exports eight files, ``attrs``, ``target_ids``, ``init_regions``,
|
||||
``schemes``, ``monitor_on``, ``kdamond_pid``, ``mk_contexts`` and
|
||||
``rm_contexts`` under its debugfs directory, ``<debugfs>/damon/``.
|
||||
|
||||
|
||||
Attributes
|
||||
@@ -131,24 +134,38 @@ Schemes
|
||||
|
||||
For usual DAMON-based data access aware memory management optimizations, users
|
||||
would simply want the system to apply a memory management action to a memory
|
||||
region of a specific size having a specific access frequency for a specific
|
||||
time. DAMON receives such formalized operation schemes from the user and
|
||||
applies those to the target processes. It also counts the total number and
|
||||
size of regions that each scheme is applied. This statistics can be used for
|
||||
online analysis or tuning of the schemes.
|
||||
region of a specific access pattern. DAMON receives such formalized operation
|
||||
schemes from the user and applies those to the target processes.
|
||||
|
||||
Users can get and set the schemes by reading from and writing to ``schemes``
|
||||
debugfs file. Reading the file also shows the statistics of each scheme. To
|
||||
the file, each of the schemes should be represented in each line in below form:
|
||||
the file, each of the schemes should be represented in each line in below
|
||||
form::
|
||||
|
||||
min-size max-size min-acc max-acc min-age max-age action
|
||||
<target access pattern> <action> <quota> <watermarks>
|
||||
|
||||
Note that the ranges are closed interval. Bytes for the size of regions
|
||||
(``min-size`` and ``max-size``), number of monitored accesses per aggregate
|
||||
interval for access frequency (``min-acc`` and ``max-acc``), number of
|
||||
aggregate intervals for the age of regions (``min-age`` and ``max-age``), and a
|
||||
predefined integer for memory management actions should be used. The supported
|
||||
numbers and their meanings are as below.
|
||||
You can disable schemes by simply writing an empty string to the file.
|
||||
|
||||
Target Access Pattern
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The ``<target access pattern>`` is constructed with three ranges in below
|
||||
form::
|
||||
|
||||
min-size max-size min-acc max-acc min-age max-age
|
||||
|
||||
Specifically, bytes for the size of regions (``min-size`` and ``max-size``),
|
||||
number of monitored accesses per aggregate interval for access frequency
|
||||
(``min-acc`` and ``max-acc``), number of aggregate intervals for the age of
|
||||
regions (``min-age`` and ``max-age``) are specified. Note that the ranges are
|
||||
closed interval.
|
||||
|
||||
Action
|
||||
~~~~~~
|
||||
|
||||
The ``<action>`` is a predefined integer for memory management actions, which
|
||||
DAMON will apply to the regions having the target access pattern. The
|
||||
supported numbers and their meanings are as below.
|
||||
|
||||
- 0: Call ``madvise()`` for the region with ``MADV_WILLNEED``
|
||||
- 1: Call ``madvise()`` for the region with ``MADV_COLD``
|
||||
@@ -157,20 +174,82 @@ numbers and their meanings are as below.
|
||||
- 4: Call ``madvise()`` for the region with ``MADV_NOHUGEPAGE``
|
||||
- 5: Do nothing but count the statistics
|
||||
|
||||
You can disable schemes by simply writing an empty string to the file. For
|
||||
example, below commands applies a scheme saying "If a memory region of size in
|
||||
[4KiB, 8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
|
||||
interval in [10, 20], page out the region", check the entered scheme again, and
|
||||
finally remove the scheme. ::
|
||||
Quota
|
||||
~~~~~
|
||||
|
||||
Optimal ``target access pattern`` for each ``action`` is workload dependent, so
|
||||
not easy to find. Worse yet, setting a scheme of some action too aggressive
|
||||
can cause severe overhead. To avoid such overhead, users can limit time and
|
||||
size quota for the scheme via the ``<quota>`` in below form::
|
||||
|
||||
<ms> <sz> <reset interval> <priority weights>
|
||||
|
||||
This makes DAMON try to use only up to ``<ms>`` milliseconds for applying
|
||||
the action to memory regions of the ``target access pattern`` within the
|
||||
``<reset interval>`` milliseconds, and to apply the action to only up to
|
||||
``<sz>`` bytes of memory regions within the ``<reset interval>``. Setting both
|
||||
``<ms>`` and ``<sz>`` zero disables the quota limits.
|
||||
|
||||
When the quota limit is expected to be exceeded, DAMON prioritizes found memory
|
||||
regions of the ``target access pattern`` based on their size, access frequency,
|
||||
and age. For personalized prioritization, users can set the weights for the
|
||||
three properties in ``<priority weights>`` in below form::
|
||||
|
||||
<size weight> <access frequency weight> <age weight>
|
||||
|
||||
Watermarks
|
||||
~~~~~~~~~~
|
||||
|
||||
Some schemes would need to run based on current value of the system's specific
|
||||
metrics like free memory ratio. For such cases, users can specify watermarks
|
||||
for the condition.::
|
||||
|
||||
<metric> <check interval> <high mark> <middle mark> <low mark>
|
||||
|
||||
``<metric>`` is a predefined integer for the metric to be checked. The
|
||||
supported numbers and their meanings are as below.
|
||||
|
||||
- 0: Ignore the watermarks
|
||||
- 1: System's free memory rate (per thousand)
|
||||
|
||||
The value of the metric is checked every ``<check interval>`` microseconds.
|
||||
|
||||
If the value is higher than ``<high mark>`` or lower than ``<low mark>``, the
|
||||
scheme is deactivated. If the value is lower than ``<middle mark>``, the scheme
|
||||
is activated.
|
||||
|
||||
.. _damos_stats:
|
||||
|
||||
Statistics
|
||||
~~~~~~~~~~
|
||||
|
||||
It also counts the total number and bytes of regions that each scheme is tried
|
||||
to be applied, the two numbers for the regions that each scheme is successfully
|
||||
applied, and the total number of the quota limit exceeds. These statistics can
|
||||
be used for online analysis or tuning of the schemes.
|
||||
|
||||
The statistics can be shown by reading the ``schemes`` file. Reading the file
|
||||
will show each scheme you entered in each line, and the five numbers for the
|
||||
statistics will be added at the end of each line.
|
||||
|
||||
Example
|
||||
~~~~~~~
|
||||
|
||||
The commands below apply a scheme saying "If a memory region of size in [4KiB,
|
||||
8KiB] is showing accesses per aggregate interval in [0, 5] for aggregate
|
||||
interval in [10, 20], page out the region. For the paging out, use only up to
|
||||
10ms per second, and also don't page out more than 1GiB per second. Under the
|
||||
limitation, page out memory regions having longer age first. Also, check the
|
||||
free memory rate of the system every 5 seconds, start the monitoring and paging
|
||||
out when the free memory rate becomes lower than 50%, but stop it if the free
|
||||
memory rate becomes larger than 60%, or lower than 30%".::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# echo "4096 8192 0 5 10 20 2" > schemes
|
||||
# cat schemes
|
||||
4096 8192 0 5 10 20 2 0 0
|
||||
# echo > schemes
|
||||
|
||||
The last two integers in the 4th line of above example is the total number and
|
||||
the total size of the regions that the scheme is applied.
|
||||
# scheme="4096 8192 0 5 10 20 2" # target access pattern and action
|
||||
# scheme+=" 10 $((1024*1024*1024)) 1000" # quotas
|
||||
# scheme+=" 0 0 100" # prioritization weights
|
||||
# scheme+=" 1 5000000 600 500 300" # watermarks
|
||||
# echo "$scheme" > schemes
|
||||
|
||||
|
||||
Turning On/Off
|
||||
@@ -195,6 +274,54 @@ the monitoring is turned on. If you write to the files while DAMON is running,
|
||||
an error code such as ``-EBUSY`` will be returned.
|
||||
|
||||
|
||||
Monitoring Thread PID
|
||||
---------------------
|
||||
|
||||
DAMON does requested monitoring with a kernel thread called ``kdamond``. You
|
||||
can get the pid of the thread by reading the ``kdamond_pid`` file. When the
|
||||
monitoring is turned off, reading the file returns ``none``. ::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# cat monitor_on
|
||||
off
|
||||
# cat kdamond_pid
|
||||
none
|
||||
# echo on > monitor_on
|
||||
# cat kdamond_pid
|
||||
18594
|
||||
|
||||
|
||||
Using Multiple Monitoring Threads
|
||||
---------------------------------
|
||||
|
||||
One ``kdamond`` thread is created for each monitoring context. You can create
|
||||
and remove monitoring contexts for multiple ``kdamond`` required use case using
|
||||
the ``mk_contexts`` and ``rm_contexts`` files.
|
||||
|
||||
Writing the name of the new context to the ``mk_contexts`` file creates a
|
||||
directory of the name on the DAMON debugfs directory. The directory will have
|
||||
DAMON debugfs files for the context. ::
|
||||
|
||||
# cd <debugfs>/damon
|
||||
# ls foo
|
||||
# ls: cannot access 'foo': No such file or directory
|
||||
# echo foo > mk_contexts
|
||||
# ls foo
|
||||
# attrs init_regions kdamond_pid schemes target_ids
|
||||
|
||||
If the context is not needed anymore, you can remove it and the corresponding
|
||||
directory by putting the name of the context to the ``rm_contexts`` file. ::
|
||||
|
||||
# echo foo > rm_contexts
|
||||
# ls foo
|
||||
# ls: cannot access 'foo': No such file or directory
|
||||
|
||||
Note that ``mk_contexts``, ``rm_contexts``, and ``monitor_on`` files are in the
|
||||
root directory only.
|
||||
|
||||
|
||||
.. _tracepoint:
|
||||
|
||||
Tracepoint for Monitoring Results
|
||||
=================================
|
||||
|
||||
|
||||
@@ -408,7 +408,7 @@ follows:
|
||||
Memory Policy APIs
|
||||
==================
|
||||
|
||||
Linux supports 3 system calls for controlling memory policy. These APIS
|
||||
Linux supports 4 system calls for controlling memory policy. These APIs
|
||||
always affect only the calling task, the calling task's address space, or
|
||||
some shared object mapped into the calling task's address space.
|
||||
|
||||
@@ -460,6 +460,20 @@ requested via the 'flags' argument.
|
||||
|
||||
See the mbind(2) man page for more details.
|
||||
|
||||
Set home node for a Range of Task's Address Space::
|
||||
|
||||
long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
|
||||
unsigned long home_node,
|
||||
unsigned long flags);
|
||||
|
||||
sys_set_mempolicy_home_node sets the home node for a VMA policy present in the
|
||||
task's address range. The system call updates the home node only for the existing
|
||||
mempolicy range. Other address ranges are ignored. A home node is the NUMA node
|
||||
closest to which page allocation will come from. Specifying the home node overrides
|
||||
the default allocation policy to allocate memory close to the local node for an
|
||||
executing CPU.
|
||||
|
||||
|
||||
Memory Policy Command Line Interface
|
||||
====================================
|
||||
|
||||
|
||||
@@ -948,7 +948,7 @@ how much memory needs to be free before kswapd goes back to sleep.
|
||||
|
||||
The unit is in fractions of 10,000. The default value of 10 means the
|
||||
distances between watermarks are 0.1% of the available memory in the
|
||||
node/system. The maximum value is 1000, or 10% of memory.
|
||||
node/system. The maximum value is 3000, or 30% of memory.
|
||||
|
||||
A high rate of threads entering direct reclaim (allocstall) or kswapd
|
||||
going to sleep prematurely (kswapd_low_wmark_hit_quickly) can indicate
|
||||
|
||||
@@ -66,9 +66,11 @@ PTE Page Table Helpers
|
||||
+---------------------------+--------------------------------------------------+
|
||||
| pte_mknotpresent | Invalidates a mapped PTE |
|
||||
+---------------------------+--------------------------------------------------+
|
||||
| ptep_get_and_clear | Clears a PTE |
|
||||
| ptep_clear | Clears a PTE |
|
||||
+---------------------------+--------------------------------------------------+
|
||||
| ptep_get_and_clear_full | Clears a PTE |
|
||||
| ptep_get_and_clear | Clears and returns PTE |
|
||||
+---------------------------+--------------------------------------------------+
|
||||
| ptep_get_and_clear_full | Clears and returns PTE (batched PTE unmap) |
|
||||
+---------------------------+--------------------------------------------------+
|
||||
| ptep_test_and_clear_young | Clears young from a PTE |
|
||||
+---------------------------+--------------------------------------------------+
|
||||
|
||||
@@ -31,10 +31,12 @@ algorithms. If you are looking for advice on simply allocating memory, see the
|
||||
page_migration
|
||||
page_frags
|
||||
page_owner
|
||||
page_table_check
|
||||
remap_file_pages
|
||||
slub
|
||||
split_page_table_lock
|
||||
transhuge
|
||||
unevictable-lru
|
||||
vmalloced-kernel-stacks
|
||||
z3fold
|
||||
zsmalloc
|
||||
|
||||
@@ -263,15 +263,15 @@ Monitoring Migration
|
||||
The following events (counters) can be used to monitor page migration.
|
||||
|
||||
1. PGMIGRATE_SUCCESS: Normal page migration success. Each count means that a
|
||||
page was migrated. If the page was a non-THP page, then this counter is
|
||||
increased by one. If the page was a THP, then this counter is increased by
|
||||
the number of THP subpages. For example, migration of a single 2MB THP that
|
||||
has 4KB-size base pages (subpages) will cause this counter to increase by
|
||||
512.
|
||||
page was migrated. If the page was a non-THP and non-hugetlb page, then
|
||||
this counter is increased by one. If the page was a THP or hugetlb, then
|
||||
this counter is increased by the number of THP or hugetlb subpages.
|
||||
For example, migration of a single 2MB THP that has 4KB-size base pages
|
||||
(subpages) will cause this counter to increase by 512.
|
||||
|
||||
2. PGMIGRATE_FAIL: Normal page migration failure. Same counting rules as for
|
||||
PGMIGRATE_SUCCESS, above: this will be increased by the number of subpages,
|
||||
if it was a THP.
|
||||
if it was a THP or hugetlb.
|
||||
|
||||
3. THP_MIGRATION_SUCCESS: A THP was migrated without being split.
|
||||
|
||||
|
||||
56
Documentation/vm/page_table_check.rst
Normal file
56
Documentation/vm/page_table_check.rst
Normal file
@@ -0,0 +1,56 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
.. _page_table_check:
|
||||
|
||||
================
|
||||
Page Table Check
|
||||
================
|
||||
|
||||
Introduction
|
||||
============
|
||||
|
||||
Page table check allows hardening the kernel by ensuring that some types of
|
||||
the memory corruptions are prevented.
|
||||
|
||||
Page table check performs extra verifications at the time when new pages become
|
||||
accessible from the userspace by getting their page table entries (PTEs, PMDs
|
||||
etc.) added into the table.
|
||||
|
||||
In case of detected corruption, the kernel is crashed. There is a small
|
||||
performance and memory overhead associated with the page table check. Therefore,
|
||||
it is disabled by default, but can be optionally enabled on systems where the
|
||||
extra hardening outweighs the performance costs. Also, because page table check
|
||||
is synchronous, it can help with debugging double map memory corruption issues,
|
||||
by crashing kernel at the time wrong mapping occurs instead of later which is
|
||||
often the case with memory corruption bugs.
|
||||
|
||||
Double mapping detection logic
|
||||
==============================
|
||||
|
||||
+-------------------+-------------------+-------------------+------------------+
|
||||
| Current Mapping | New mapping | Permissions | Rule |
|
||||
+===================+===================+===================+==================+
|
||||
| Anonymous | Anonymous | Read | Allow |
|
||||
+-------------------+-------------------+-------------------+------------------+
|
||||
| Anonymous | Anonymous | Read / Write | Prohibit |
|
||||
+-------------------+-------------------+-------------------+------------------+
|
||||
| Anonymous | Named | Any | Prohibit |
|
||||
+-------------------+-------------------+-------------------+------------------+
|
||||
| Named | Anonymous | Any | Prohibit |
|
||||
+-------------------+-------------------+-------------------+------------------+
|
||||
| Named | Named | Any | Allow |
|
||||
+-------------------+-------------------+-------------------+------------------+
|
||||
|
||||
Enabling Page Table Check
|
||||
=========================
|
||||
|
||||
Build kernel with:
|
||||
|
||||
- PAGE_TABLE_CHECK=y
|
||||
Note, it can only be enabled on platforms where ARCH_SUPPORTS_PAGE_TABLE_CHECK
|
||||
is available.
|
||||
|
||||
- Boot with 'page_table_check=on' kernel parameter.
|
||||
|
||||
Optionally, build kernel with PAGE_TABLE_CHECK_ENFORCED in order to have page
|
||||
table support without extra kernel parameter.
|
||||
153
Documentation/vm/vmalloced-kernel-stacks.rst
Normal file
153
Documentation/vm/vmalloced-kernel-stacks.rst
Normal file
@@ -0,0 +1,153 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
=====================================
|
||||
Virtually Mapped Kernel Stack Support
|
||||
=====================================
|
||||
|
||||
:Author: Shuah Khan <skhan@linuxfoundation.org>
|
||||
|
||||
.. contents:: :local:
|
||||
|
||||
Overview
|
||||
--------
|
||||
|
||||
This is a compilation of information from the code and original patch
|
||||
series that introduced the `Virtually Mapped Kernel Stacks feature
|
||||
<https://lwn.net/Articles/694348/>`_
|
||||
|
||||
Introduction
|
||||
------------
|
||||
|
||||
Kernel stack overflows are often hard to debug and make the kernel
|
||||
susceptible to exploits. Problems could show up at a later time making
|
||||
it difficult to isolate and root-cause.
|
||||
|
||||
Virtually-mapped kernel stacks with guard pages cause kernel stack
|
||||
overflows to be caught immediately rather than causing difficult to
|
||||
diagnose corruptions.
|
||||
|
||||
HAVE_ARCH_VMAP_STACK and VMAP_STACK configuration options enable
|
||||
support for virtually mapped stacks with guard pages. This feature
|
||||
causes reliable faults when the stack overflows. The usability of
|
||||
the stack trace after overflow and response to the overflow itself
|
||||
is architecture dependent.
|
||||
|
||||
.. note::
|
||||
As of this writing, arm64, powerpc, riscv, s390, um, and x86 have
|
||||
support for VMAP_STACK.
|
||||
|
||||
HAVE_ARCH_VMAP_STACK
|
||||
--------------------
|
||||
|
||||
Architectures that can support Virtually Mapped Kernel Stacks should
|
||||
enable this bool configuration option. The requirements are:
|
||||
|
||||
- vmalloc space must be large enough to hold many kernel stacks. This
|
||||
may rule out many 32-bit architectures.
|
||||
- Stacks in vmalloc space need to work reliably. For example, if
|
||||
vmap page tables are created on demand, either this mechanism
|
||||
needs to work while the stack points to a virtual address with
|
||||
unpopulated page tables or arch code (switch_to() and switch_mm(),
|
||||
most likely) needs to ensure that the stack's page table entries
|
||||
are populated before running on a possibly unpopulated stack.
|
||||
- If the stack overflows into a guard page, something reasonable
|
||||
should happen. The definition of "reasonable" is flexible, but
|
||||
instantly rebooting without logging anything would be unfriendly.
|
||||
|
||||
VMAP_STACK
|
||||
----------
|
||||
|
||||
VMAP_STACK bool configuration option when enabled allocates virtually
|
||||
mapped task stacks. This option depends on HAVE_ARCH_VMAP_STACK.
|
||||
|
||||
- Enable this if you want to use virtually-mapped kernel stacks
|
||||
with guard pages. This causes kernel stack overflows to be caught
|
||||
immediately rather than causing difficult-to-diagnose corruption.
|
||||
|
||||
.. note::
|
||||
|
||||
Using this feature with KASAN requires architecture support
|
||||
for backing virtual mappings with real shadow memory, and
|
||||
KASAN_VMALLOC must be enabled.
|
||||
|
||||
.. note::
|
||||
|
||||
When VMAP_STACK is enabled, it is not possible to run DMA on stack
|
||||
allocated data.
|
||||
|
||||
Kernel configuration options and dependencies keep changing. Refer to
|
||||
the latest code base:
|
||||
|
||||
`Kconfig <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/arch/Kconfig>`_
|
||||
|
||||
Allocation
|
||||
-----------
|
||||
|
||||
When a new kernel thread is created, thread stack is allocated from
|
||||
virtually contiguous memory pages from the page level allocator. These
|
||||
pages are mapped into contiguous kernel virtual space with PAGE_KERNEL
|
||||
protections.
|
||||
|
||||
alloc_thread_stack_node() calls __vmalloc_node_range() to allocate stack
|
||||
with PAGE_KERNEL protections.
|
||||
|
||||
- Allocated stacks are cached and later reused by new threads, so memcg
|
||||
accounting is performed manually on assigning/releasing stacks to tasks.
|
||||
Hence, __vmalloc_node_range is called without __GFP_ACCOUNT.
|
||||
- vm_struct is cached to be able to find when thread free is initiated
|
||||
in interrupt context. free_thread_stack() can be called in interrupt
|
||||
context.
|
||||
- On arm64, all VMAP's stacks need to have the same alignment to ensure
|
||||
that VMAP'd stack overflow detection works correctly. Arch specific
|
||||
vmap stack allocator takes care of this detail.
|
||||
- This does not address interrupt stacks - according to the original patch
|
||||
|
||||
Thread stack allocation is initiated from clone(), fork(), vfork(),
|
||||
kernel_thread() via kernel_clone(). Leaving a few hints for searching
|
||||
the code base to understand when and how thread stack is allocated.
|
||||
|
||||
Bulk of the code is in:
|
||||
`kernel/fork.c <https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/kernel/fork.c>`_.
|
||||
|
||||
stack_vm_area pointer in task_struct keeps track of the virtually allocated
|
||||
stack and a non-null stack_vm_area pointer serves as an indication that the
|
||||
virtually mapped kernel stacks are enabled.
|
||||
|
||||
::
|
||||
|
||||
struct vm_struct *stack_vm_area;
|
||||
|
||||
Stack overflow handling
|
||||
-----------------------
|
||||
|
||||
Leading and trailing guard pages help detect stack overflows. When stack
|
||||
overflows into the guard pages, handlers have to be careful not to overflow
|
||||
the stack again. When handlers are called, it is likely that very little
|
||||
stack space is left.
|
||||
|
||||
On x86, this is done by handling the page fault indicating the kernel
|
||||
stack overflow on the double-fault stack.
|
||||
|
||||
Testing VMAP allocation with guard pages
|
||||
----------------------------------------
|
||||
|
||||
How do we ensure that VMAP_STACK is actually allocating with a leading
|
||||
and trailing guard page? The following lkdtm tests can help detect any
|
||||
regressions.
|
||||
|
||||
::
|
||||
|
||||
void lkdtm_STACK_GUARD_PAGE_LEADING()
|
||||
void lkdtm_STACK_GUARD_PAGE_TRAILING()
|
||||
|
||||
Conclusions
|
||||
-----------
|
||||
|
||||
- A percpu cache of vmalloced stacks appears to be a bit faster than a
|
||||
high-order stack allocation, at least when the cache hits.
|
||||
- THREAD_INFO_IN_TASK gets rid of arch-specific thread_info entirely and
|
||||
simply embeds the thread_info (containing only flags) and 'int cpu' into
|
||||
task_struct.
|
||||
- The thread stack can be free'ed as soon as the task is dead (without
|
||||
waiting for RCU) and then, if vmapped stacks are in use, cache the
|
||||
entire stack for reuse on the same cpu.
|
||||
@@ -14548,6 +14548,15 @@ F: include/net/page_pool.h
|
||||
F: include/trace/events/page_pool.h
|
||||
F: net/core/page_pool.c
|
||||
|
||||
PAGE TABLE CHECK
|
||||
M: Pasha Tatashin <pasha.tatashin@soleen.com>
|
||||
M: Andrew Morton <akpm@linux-foundation.org>
|
||||
L: linux-mm@kvack.org
|
||||
S: Maintained
|
||||
F: Documentation/vm/page_table_check.rst
|
||||
F: include/linux/page_table_check.h
|
||||
F: mm/page_table_check.c
|
||||
|
||||
PANASONIC LAPTOP ACPI EXTRAS DRIVER
|
||||
M: Kenneth Chan <kenneth.t.chan@gmail.com>
|
||||
L: platform-driver-x86@vger.kernel.org
|
||||
|
||||
@@ -1297,6 +1297,9 @@ config HAVE_ARCH_PFN_VALID
|
||||
config ARCH_SUPPORTS_DEBUG_PAGEALLOC
|
||||
bool
|
||||
|
||||
config ARCH_SUPPORTS_PAGE_TABLE_CHECK
|
||||
bool
|
||||
|
||||
config ARCH_SPLIT_ARG64
|
||||
bool
|
||||
help
|
||||
|
||||
@@ -489,3 +489,4 @@
|
||||
# 557 reserved for memfd_secret
|
||||
558 common process_mrelease sys_process_mrelease
|
||||
559 common futex_waitv sys_futex_waitv
|
||||
560 common set_mempolicy_home_node sys_ni_syscall
|
||||
|
||||
@@ -463,3 +463,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -38,7 +38,7 @@
|
||||
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
|
||||
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
|
||||
|
||||
#define __NR_compat_syscalls 450
|
||||
#define __NR_compat_syscalls 451
|
||||
#endif
|
||||
|
||||
#define __ARCH_WANT_SYS_CLONE
|
||||
|
||||
@@ -905,6 +905,8 @@ __SYSCALL(__NR_landlock_restrict_self, sys_landlock_restrict_self)
|
||||
__SYSCALL(__NR_process_mrelease, sys_process_mrelease)
|
||||
#define __NR_futex_waitv 449
|
||||
__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
|
||||
#define __NR_set_mempolicy_home_node 450
|
||||
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
|
||||
|
||||
/*
|
||||
* Please add new compat syscalls above this comment and update
|
||||
|
||||
@@ -370,3 +370,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -449,3 +449,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -455,3 +455,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -388,3 +388,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 n32 process_mrelease sys_process_mrelease
|
||||
449 n32 futex_waitv sys_futex_waitv
|
||||
450 n32 set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -364,3 +364,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 n64 process_mrelease sys_process_mrelease
|
||||
449 n64 futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -437,3 +437,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 o32 process_mrelease sys_process_mrelease
|
||||
449 o32 futex_waitv sys_futex_waitv
|
||||
450 o32 set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -447,3 +447,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -529,3 +529,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 nospu set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -452,3 +452,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -452,3 +452,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -495,3 +495,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -104,6 +104,7 @@ config X86
|
||||
select ARCH_SUPPORTS_ACPI
|
||||
select ARCH_SUPPORTS_ATOMIC_RMW
|
||||
select ARCH_SUPPORTS_DEBUG_PAGEALLOC
|
||||
select ARCH_SUPPORTS_PAGE_TABLE_CHECK if X86_64
|
||||
select ARCH_SUPPORTS_NUMA_BALANCING if X86_64
|
||||
select ARCH_SUPPORTS_KMAP_LOCAL_FORCE_MAP if NR_CPUS <= 4096
|
||||
select ARCH_SUPPORTS_LTO_CLANG
|
||||
|
||||
@@ -454,3 +454,4 @@
|
||||
447 i386 memfd_secret sys_memfd_secret
|
||||
448 i386 process_mrelease sys_process_mrelease
|
||||
449 i386 futex_waitv sys_futex_waitv
|
||||
450 i386 set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -371,6 +371,7 @@
|
||||
447 common memfd_secret sys_memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
#
|
||||
# Due to a historical design error, certain syscalls are numbered differently
|
||||
|
||||
@@ -27,6 +27,7 @@
|
||||
#include <asm/pkru.h>
|
||||
#include <asm/fpu/api.h>
|
||||
#include <asm-generic/pgtable_uffd.h>
|
||||
#include <linux/page_table_check.h>
|
||||
|
||||
extern pgd_t early_top_pgt[PTRS_PER_PGD];
|
||||
bool __init __early_make_pgtable(unsigned long address, pmdval_t pmd);
|
||||
@@ -1007,18 +1008,21 @@ static inline pud_t native_local_pudp_get_and_clear(pud_t *pudp)
|
||||
static inline void set_pte_at(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep, pte_t pte)
|
||||
{
|
||||
page_table_check_pte_set(mm, addr, ptep, pte);
|
||||
set_pte(ptep, pte);
|
||||
}
|
||||
|
||||
static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmdp, pmd_t pmd)
|
||||
{
|
||||
page_table_check_pmd_set(mm, addr, pmdp, pmd);
|
||||
set_pmd(pmdp, pmd);
|
||||
}
|
||||
|
||||
static inline void set_pud_at(struct mm_struct *mm, unsigned long addr,
|
||||
pud_t *pudp, pud_t pud)
|
||||
{
|
||||
page_table_check_pud_set(mm, addr, pudp, pud);
|
||||
native_set_pud(pudp, pud);
|
||||
}
|
||||
|
||||
@@ -1049,6 +1053,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep)
|
||||
{
|
||||
pte_t pte = native_ptep_get_and_clear(ptep);
|
||||
page_table_check_pte_clear(mm, addr, pte);
|
||||
return pte;
|
||||
}
|
||||
|
||||
@@ -1064,12 +1069,23 @@ static inline pte_t ptep_get_and_clear_full(struct mm_struct *mm,
|
||||
* care about updates and native needs no locking
|
||||
*/
|
||||
pte = native_local_ptep_get_and_clear(ptep);
|
||||
page_table_check_pte_clear(mm, addr, pte);
|
||||
} else {
|
||||
pte = ptep_get_and_clear(mm, addr, ptep);
|
||||
}
|
||||
return pte;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_CLEAR
|
||||
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep)
|
||||
{
|
||||
if (IS_ENABLED(CONFIG_PAGE_TABLE_CHECK))
|
||||
ptep_get_and_clear(mm, addr, ptep);
|
||||
else
|
||||
pte_clear(mm, addr, ptep);
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PTEP_SET_WRPROTECT
|
||||
static inline void ptep_set_wrprotect(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep)
|
||||
@@ -1110,14 +1126,22 @@ static inline int pmd_write(pmd_t pmd)
|
||||
static inline pmd_t pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmdp)
|
||||
{
|
||||
return native_pmdp_get_and_clear(pmdp);
|
||||
pmd_t pmd = native_pmdp_get_and_clear(pmdp);
|
||||
|
||||
page_table_check_pmd_clear(mm, addr, pmd);
|
||||
|
||||
return pmd;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PUDP_HUGE_GET_AND_CLEAR
|
||||
static inline pud_t pudp_huge_get_and_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pud_t *pudp)
|
||||
{
|
||||
return native_pudp_get_and_clear(pudp);
|
||||
pud_t pud = native_pudp_get_and_clear(pudp);
|
||||
|
||||
page_table_check_pud_clear(mm, addr, pud);
|
||||
|
||||
return pud;
|
||||
}
|
||||
|
||||
#define __HAVE_ARCH_PMDP_SET_WRPROTECT
|
||||
@@ -1138,6 +1162,7 @@ static inline int pud_write(pud_t pud)
|
||||
static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
|
||||
unsigned long address, pmd_t *pmdp, pmd_t pmd)
|
||||
{
|
||||
page_table_check_pmd_set(vma->vm_mm, address, pmdp, pmd);
|
||||
if (IS_ENABLED(CONFIG_SMP)) {
|
||||
return xchg(pmdp, pmd);
|
||||
} else {
|
||||
|
||||
@@ -420,3 +420,4 @@
|
||||
# 447 reserved for memfd_secret
|
||||
448 common process_mrelease sys_process_mrelease
|
||||
449 common futex_waitv sys_futex_waitv
|
||||
450 common set_mempolicy_home_node sys_set_mempolicy_home_node
|
||||
|
||||
@@ -1903,14 +1903,7 @@ static struct attribute *zram_disk_attrs[] = {
|
||||
NULL,
|
||||
};
|
||||
|
||||
static const struct attribute_group zram_disk_attr_group = {
|
||||
.attrs = zram_disk_attrs,
|
||||
};
|
||||
|
||||
static const struct attribute_group *zram_disk_attr_groups[] = {
|
||||
&zram_disk_attr_group,
|
||||
NULL,
|
||||
};
|
||||
ATTRIBUTE_GROUPS(zram_disk);
|
||||
|
||||
/*
|
||||
* Allocate and initialize a new zram device. The function returns
|
||||
@@ -1983,7 +1976,7 @@ static int zram_add(void)
|
||||
blk_queue_max_write_zeroes_sectors(zram->disk->queue, UINT_MAX);
|
||||
|
||||
blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, zram->disk->queue);
|
||||
ret = device_add_disk(NULL, zram->disk, zram_disk_attr_groups);
|
||||
ret = device_add_disk(NULL, zram->disk, zram_disk_groups);
|
||||
if (ret)
|
||||
goto out_cleanup_disk;
|
||||
|
||||
|
||||
@@ -27,8 +27,8 @@
|
||||
#include <linux/slab.h>
|
||||
#include <linux/uaccess.h>
|
||||
#include <linux/fiemap.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/iomap.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include "ext4_jbd2.h"
|
||||
#include "ext4_extents.h"
|
||||
#include "xattr.h"
|
||||
@@ -4404,8 +4404,7 @@ retry:
|
||||
err = ext4_es_remove_extent(inode, last_block,
|
||||
EXT_MAX_BLOCKS - last_block);
|
||||
if (err == -ENOMEM) {
|
||||
cond_resched();
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/50);
|
||||
memalloc_retry_wait(GFP_ATOMIC);
|
||||
goto retry;
|
||||
}
|
||||
if (err)
|
||||
@@ -4413,8 +4412,7 @@ retry:
|
||||
retry_remove_space:
|
||||
err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
|
||||
if (err == -ENOMEM) {
|
||||
cond_resched();
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/50);
|
||||
memalloc_retry_wait(GFP_ATOMIC);
|
||||
goto retry_remove_space;
|
||||
}
|
||||
return err;
|
||||
|
||||
@@ -7,7 +7,7 @@
|
||||
#include <linux/iomap.h>
|
||||
#include <linux/fiemap.h>
|
||||
#include <linux/iversion.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#include "ext4_jbd2.h"
|
||||
#include "ext4.h"
|
||||
@@ -1943,8 +1943,7 @@ int ext4_inline_data_truncate(struct inode *inode, int *has_inline)
|
||||
retry:
|
||||
err = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS);
|
||||
if (err == -ENOMEM) {
|
||||
cond_resched();
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/50);
|
||||
memalloc_retry_wait(GFP_ATOMIC);
|
||||
goto retry;
|
||||
}
|
||||
if (err)
|
||||
|
||||
@@ -24,7 +24,7 @@
|
||||
#include <linux/kernel.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#include "ext4_jbd2.h"
|
||||
#include "xattr.h"
|
||||
@@ -523,12 +523,13 @@ int ext4_bio_write_page(struct ext4_io_submit *io,
|
||||
ret = PTR_ERR(bounce_page);
|
||||
if (ret == -ENOMEM &&
|
||||
(io->io_bio || wbc->sync_mode == WB_SYNC_ALL)) {
|
||||
gfp_flags = GFP_NOFS;
|
||||
gfp_t new_gfp_flags = GFP_NOFS;
|
||||
if (io->io_bio)
|
||||
ext4_io_submit(io);
|
||||
else
|
||||
gfp_flags |= __GFP_NOFAIL;
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/50);
|
||||
new_gfp_flags |= __GFP_NOFAIL;
|
||||
memalloc_retry_wait(gfp_flags);
|
||||
gfp_flags = new_gfp_flags;
|
||||
goto retry_encrypt;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,9 +8,9 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/mpage.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/bio.h>
|
||||
@@ -2547,7 +2547,7 @@ retry_encrypt:
|
||||
/* flush pending IOs and wait for a while in the ENOMEM case */
|
||||
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
|
||||
f2fs_flush_merged_writes(fio->sbi);
|
||||
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
gfp_flags |= __GFP_NOFAIL;
|
||||
goto retry_encrypt;
|
||||
}
|
||||
|
||||
@@ -7,7 +7,6 @@
|
||||
*/
|
||||
#include <linux/fs.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/kthread.h>
|
||||
@@ -15,6 +14,7 @@
|
||||
#include <linux/freezer.h>
|
||||
#include <linux/sched/signal.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#include "f2fs.h"
|
||||
#include "node.h"
|
||||
@@ -1375,8 +1375,7 @@ retry:
|
||||
if (err) {
|
||||
clear_page_private_gcing(page);
|
||||
if (err == -ENOMEM) {
|
||||
congestion_wait(BLK_RW_ASYNC,
|
||||
DEFAULT_IO_TIMEOUT);
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
goto retry;
|
||||
}
|
||||
if (is_dirty)
|
||||
|
||||
@@ -8,8 +8,8 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/writeback.h>
|
||||
#include <linux/sched/mm.h>
|
||||
|
||||
#include "f2fs.h"
|
||||
#include "node.h"
|
||||
@@ -562,7 +562,7 @@ retry:
|
||||
inode = f2fs_iget(sb, ino);
|
||||
if (IS_ERR(inode)) {
|
||||
if (PTR_ERR(inode) == -ENOMEM) {
|
||||
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
goto retry;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -8,7 +8,7 @@
|
||||
#include <linux/fs.h>
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/mpage.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/pagevec.h>
|
||||
#include <linux/swap.h>
|
||||
@@ -2750,7 +2750,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
|
||||
retry:
|
||||
ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
|
||||
if (!ipage) {
|
||||
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <asm/unaligned.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include "f2fs.h"
|
||||
#include "node.h"
|
||||
#include "segment.h"
|
||||
@@ -587,7 +588,7 @@ retry_dn:
|
||||
err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
|
||||
if (err) {
|
||||
if (err == -ENOMEM) {
|
||||
congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
goto retry_dn;
|
||||
}
|
||||
goto out;
|
||||
@@ -670,8 +671,7 @@ retry_prev:
|
||||
err = check_index_in_prev_nodes(sbi, dest, &dn);
|
||||
if (err) {
|
||||
if (err == -ENOMEM) {
|
||||
congestion_wait(BLK_RW_ASYNC,
|
||||
DEFAULT_IO_TIMEOUT);
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
goto retry_prev;
|
||||
}
|
||||
goto err;
|
||||
|
||||
@@ -9,6 +9,7 @@
|
||||
#include <linux/f2fs_fs.h>
|
||||
#include <linux/bio.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/prefetch.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/swap.h>
|
||||
@@ -245,9 +246,7 @@ retry:
|
||||
LOOKUP_NODE);
|
||||
if (err) {
|
||||
if (err == -ENOMEM) {
|
||||
congestion_wait(BLK_RW_ASYNC,
|
||||
DEFAULT_IO_TIMEOUT);
|
||||
cond_resched();
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
goto retry;
|
||||
}
|
||||
err = -EAGAIN;
|
||||
@@ -424,9 +423,7 @@ retry:
|
||||
err = f2fs_do_write_data_page(&fio);
|
||||
if (err) {
|
||||
if (err == -ENOMEM) {
|
||||
congestion_wait(BLK_RW_ASYNC,
|
||||
DEFAULT_IO_TIMEOUT);
|
||||
cond_resched();
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
goto retry;
|
||||
}
|
||||
unlock_page(page);
|
||||
|
||||
@@ -8,9 +8,9 @@
|
||||
#include <linux/module.h>
|
||||
#include <linux/init.h>
|
||||
#include <linux/fs.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/statfs.h>
|
||||
#include <linux/buffer_head.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/parser.h>
|
||||
#include <linux/mount.h>
|
||||
@@ -2415,8 +2415,7 @@ repeat:
|
||||
page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
|
||||
if (IS_ERR(page)) {
|
||||
if (PTR_ERR(page) == -ENOMEM) {
|
||||
congestion_wait(BLK_RW_ASYNC,
|
||||
DEFAULT_IO_TIMEOUT);
|
||||
memalloc_retry_wait(GFP_NOFS);
|
||||
goto repeat;
|
||||
}
|
||||
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
|
||||
|
||||
@@ -409,10 +409,11 @@ hugetlb_vmdelete_list(struct rb_root_cached *root, pgoff_t start, pgoff_t end)
|
||||
struct vm_area_struct *vma;
|
||||
|
||||
/*
|
||||
* end == 0 indicates that the entire range after
|
||||
* start should be unmapped.
|
||||
* end == 0 indicates that the entire range after start should be
|
||||
* unmapped. Note, end is exclusive, whereas the interval tree takes
|
||||
* an inclusive "last".
|
||||
*/
|
||||
vma_interval_tree_foreach(vma, root, start, end ? end : ULONG_MAX) {
|
||||
vma_interval_tree_foreach(vma, root, start, end ? end - 1 : ULONG_MAX) {
|
||||
unsigned long v_offset;
|
||||
unsigned long v_end;
|
||||
|
||||
|
||||
@@ -4,7 +4,6 @@
|
||||
* All Rights Reserved.
|
||||
*/
|
||||
#include "xfs.h"
|
||||
#include <linux/backing-dev.h>
|
||||
#include "xfs_message.h"
|
||||
#include "xfs_trace.h"
|
||||
|
||||
@@ -26,6 +25,6 @@ kmem_alloc(size_t size, xfs_km_flags_t flags)
|
||||
"%s(%u) possible memory allocation deadlock size %u in %s (mode:0x%x)",
|
||||
current->comm, current->pid,
|
||||
(unsigned int)size, __func__, lflags);
|
||||
congestion_wait(BLK_RW_ASYNC, HZ/50);
|
||||
memalloc_retry_wait(lflags);
|
||||
} while (1);
|
||||
}
|
||||
|
||||
@@ -394,7 +394,7 @@ xfs_buf_alloc_pages(
|
||||
}
|
||||
|
||||
XFS_STATS_INC(bp->b_mount, xb_page_retries);
|
||||
congestion_wait(BLK_RW_ASYNC, HZ / 50);
|
||||
memalloc_retry_wait(gfp_mask);
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -295,7 +295,6 @@ extern bool libceph_compatible(void *data);
|
||||
|
||||
extern const char *ceph_msg_type_name(int type);
|
||||
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
|
||||
extern void *ceph_kvmalloc(size_t size, gfp_t flags);
|
||||
|
||||
struct fs_parameter;
|
||||
struct fc_log;
|
||||
|
||||
@@ -11,12 +11,19 @@
|
||||
#include <linux/mutex.h>
|
||||
#include <linux/time64.h>
|
||||
#include <linux/types.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
/* Minimal region size. Every damon_region is aligned by this. */
|
||||
#define DAMON_MIN_REGION PAGE_SIZE
|
||||
/* Max priority score for DAMON-based operation schemes */
|
||||
#define DAMOS_MAX_SCORE (99)
|
||||
|
||||
/* Get a random number in [l, r) */
|
||||
static inline unsigned long damon_rand(unsigned long l, unsigned long r)
|
||||
{
|
||||
return l + prandom_u32_max(r - l);
|
||||
}
|
||||
|
||||
/**
|
||||
* struct damon_addr_range - Represents an address region of [@start, @end).
|
||||
* @start: Start address of the region (inclusive).
|
||||
@@ -185,6 +192,22 @@ struct damos_watermarks {
|
||||
bool activated;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct damos_stat - Statistics on a given scheme.
|
||||
* @nr_tried: Total number of regions on which applying the scheme was attempted.
|
||||
* @sz_tried: Total size of regions on which applying the scheme was attempted.
|
||||
* @nr_applied: Total number of regions that the scheme has been applied to.
|
||||
* @sz_applied: Total size of regions that the scheme has been applied to.
|
||||
* @qt_exceeds: Total number of times the quota of the scheme has been exceeded.
|
||||
*/
|
||||
struct damos_stat {
|
||||
unsigned long nr_tried;
|
||||
unsigned long sz_tried;
|
||||
unsigned long nr_applied;
|
||||
unsigned long sz_applied;
|
||||
unsigned long qt_exceeds;
|
||||
};
|
||||
|
||||
/**
|
||||
* struct damos - Represents a Data Access Monitoring-based Operation Scheme.
|
||||
* @min_sz_region: Minimum size of target regions.
|
||||
@@ -196,8 +219,7 @@ struct damos_watermarks {
|
||||
* @action: &damos_action to be applied to the target regions.
|
||||
* @quota: Control the aggressiveness of this scheme.
|
||||
* @wmarks: Watermarks for automated (in)activation of this scheme.
|
||||
* @stat_count: Total number of regions that this scheme is applied.
|
||||
* @stat_sz: Total size of regions that this scheme is applied.
|
||||
* @stat: Statistics of this scheme.
|
||||
* @list: List head for siblings.
|
||||
*
|
||||
* For each aggregation interval, DAMON finds regions which fit in the
|
||||
@@ -228,8 +250,7 @@ struct damos {
|
||||
enum damos_action action;
|
||||
struct damos_quota quota;
|
||||
struct damos_watermarks wmarks;
|
||||
unsigned long stat_count;
|
||||
unsigned long stat_sz;
|
||||
struct damos_stat stat;
|
||||
struct list_head list;
|
||||
};
|
||||
|
||||
@@ -274,7 +295,8 @@ struct damon_ctx;
|
||||
* as an integer in [0, &DAMOS_MAX_SCORE].
|
||||
* @apply_scheme is called from @kdamond when a region for user provided
|
||||
* DAMON-based operation scheme is found. It should apply the scheme's action
|
||||
* to the region. This is not used for &DAMON_ARBITRARY_TARGET case.
|
||||
* to the region and return bytes of the region that the action is successfully
|
||||
* applied.
|
||||
* @target_valid should check whether the target is still valid for the
|
||||
* monitoring.
|
||||
* @cleanup is called from @kdamond just before its termination.
|
||||
@@ -288,8 +310,9 @@ struct damon_primitive {
|
||||
int (*get_scheme_score)(struct damon_ctx *context,
|
||||
struct damon_target *t, struct damon_region *r,
|
||||
struct damos *scheme);
|
||||
int (*apply_scheme)(struct damon_ctx *context, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme);
|
||||
unsigned long (*apply_scheme)(struct damon_ctx *context,
|
||||
struct damon_target *t, struct damon_region *r,
|
||||
struct damos *scheme);
|
||||
bool (*target_valid)(void *target);
|
||||
void (*cleanup)(struct damon_ctx *context);
|
||||
};
|
||||
@@ -392,14 +415,20 @@ struct damon_ctx {
|
||||
struct list_head schemes;
|
||||
};
|
||||
|
||||
#define damon_next_region(r) \
|
||||
(container_of(r->list.next, struct damon_region, list))
|
||||
static inline struct damon_region *damon_next_region(struct damon_region *r)
|
||||
{
|
||||
return container_of(r->list.next, struct damon_region, list);
|
||||
}
|
||||
|
||||
#define damon_prev_region(r) \
|
||||
(container_of(r->list.prev, struct damon_region, list))
|
||||
static inline struct damon_region *damon_prev_region(struct damon_region *r)
|
||||
{
|
||||
return container_of(r->list.prev, struct damon_region, list);
|
||||
}
|
||||
|
||||
#define damon_last_region(t) \
|
||||
(list_last_entry(&t->regions_list, struct damon_region, list))
|
||||
static inline struct damon_region *damon_last_region(struct damon_target *t)
|
||||
{
|
||||
return list_last_entry(&t->regions_list, struct damon_region, list);
|
||||
}
|
||||
|
||||
#define damon_for_each_region(r, t) \
|
||||
list_for_each_entry(r, &t->regions_list, list)
|
||||
@@ -422,9 +451,18 @@ struct damon_ctx {
|
||||
#ifdef CONFIG_DAMON
|
||||
|
||||
struct damon_region *damon_new_region(unsigned long start, unsigned long end);
|
||||
inline void damon_insert_region(struct damon_region *r,
|
||||
|
||||
/*
|
||||
* Add a region between two other regions
|
||||
*/
|
||||
static inline void damon_insert_region(struct damon_region *r,
|
||||
struct damon_region *prev, struct damon_region *next,
|
||||
struct damon_target *t);
|
||||
struct damon_target *t)
|
||||
{
|
||||
__list_add(&r->list, &prev->list, &next->list);
|
||||
t->nr_regions++;
|
||||
}
|
||||
|
||||
void damon_add_region(struct damon_region *r, struct damon_target *t);
|
||||
void damon_destroy_region(struct damon_region *r, struct damon_target *t);
|
||||
|
||||
@@ -461,34 +499,13 @@ int damon_stop(struct damon_ctx **ctxs, int nr_ctxs);
|
||||
#endif /* CONFIG_DAMON */
|
||||
|
||||
#ifdef CONFIG_DAMON_VADDR
|
||||
|
||||
/* Monitoring primitives for virtual memory address spaces */
|
||||
void damon_va_init(struct damon_ctx *ctx);
|
||||
void damon_va_update(struct damon_ctx *ctx);
|
||||
void damon_va_prepare_access_checks(struct damon_ctx *ctx);
|
||||
unsigned int damon_va_check_accesses(struct damon_ctx *ctx);
|
||||
bool damon_va_target_valid(void *t);
|
||||
void damon_va_cleanup(struct damon_ctx *ctx);
|
||||
int damon_va_apply_scheme(struct damon_ctx *context, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme);
|
||||
int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme);
|
||||
void damon_va_set_primitives(struct damon_ctx *ctx);
|
||||
|
||||
#endif /* CONFIG_DAMON_VADDR */
|
||||
|
||||
#ifdef CONFIG_DAMON_PADDR
|
||||
|
||||
/* Monitoring primitives for the physical memory address space */
|
||||
void damon_pa_prepare_access_checks(struct damon_ctx *ctx);
|
||||
unsigned int damon_pa_check_accesses(struct damon_ctx *ctx);
|
||||
bool damon_pa_target_valid(void *t);
|
||||
int damon_pa_apply_scheme(struct damon_ctx *context, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme);
|
||||
int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme);
|
||||
void damon_pa_set_primitives(struct damon_ctx *ctx);
|
||||
|
||||
#endif /* CONFIG_DAMON_PADDR */
|
||||
|
||||
#endif /* _DAMON_H */
|
||||
|
||||
@@ -302,7 +302,9 @@ struct vm_area_struct;
|
||||
* lowest zone as a type of emergency reserve.
|
||||
*
|
||||
* %GFP_DMA32 is similar to %GFP_DMA except that the caller requires a 32-bit
|
||||
* address.
|
||||
* address. Note that kmalloc(..., GFP_DMA32) does not return DMA32 memory
|
||||
* because the DMA32 kmalloc cache array is not implemented.
|
||||
* (Reason: there is no such user in kernel).
|
||||
*
|
||||
* %GFP_HIGHUSER is for userspace allocations that may be mapped to userspace,
|
||||
* do not need to be directly accessible by the kernel but that cannot
|
||||
@@ -589,9 +591,9 @@ struct page *alloc_pages(gfp_t gfp, unsigned int order);
|
||||
struct folio *folio_alloc(gfp_t gfp, unsigned order);
|
||||
extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order,
|
||||
struct vm_area_struct *vma, unsigned long addr,
|
||||
int node, bool hugepage);
|
||||
bool hugepage);
|
||||
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
|
||||
alloc_pages_vma(gfp_mask, order, vma, addr, numa_node_id(), true)
|
||||
alloc_pages_vma(gfp_mask, order, vma, addr, true)
|
||||
#else
|
||||
static inline struct page *alloc_pages(gfp_t gfp_mask, unsigned int order)
|
||||
{
|
||||
@@ -601,14 +603,14 @@ static inline struct folio *folio_alloc(gfp_t gfp, unsigned int order)
|
||||
{
|
||||
return __folio_alloc_node(gfp, order, numa_node_id());
|
||||
}
|
||||
#define alloc_pages_vma(gfp_mask, order, vma, addr, node, false)\
|
||||
#define alloc_pages_vma(gfp_mask, order, vma, addr, false)\
|
||||
alloc_pages(gfp_mask, order)
|
||||
#define alloc_hugepage_vma(gfp_mask, vma, addr, order) \
|
||||
alloc_pages(gfp_mask, order)
|
||||
#endif
|
||||
#define alloc_page(gfp_mask) alloc_pages(gfp_mask, 0)
|
||||
#define alloc_page_vma(gfp_mask, vma, addr) \
|
||||
alloc_pages_vma(gfp_mask, 0, vma, addr, numa_node_id(), false)
|
||||
alloc_pages_vma(gfp_mask, 0, vma, addr, false)
|
||||
|
||||
extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order);
|
||||
extern unsigned long get_zeroed_page(gfp_t gfp_mask);
|
||||
|
||||
@@ -622,8 +622,8 @@ struct hstate {
|
||||
#endif
|
||||
#ifdef CONFIG_CGROUP_HUGETLB
|
||||
/* cgroup control files */
|
||||
struct cftype cgroup_files_dfl[7];
|
||||
struct cftype cgroup_files_legacy[9];
|
||||
struct cftype cgroup_files_dfl[8];
|
||||
struct cftype cgroup_files_legacy[10];
|
||||
#endif
|
||||
char name[HSTATE_NAME_LEN];
|
||||
};
|
||||
|
||||
@@ -36,6 +36,11 @@ enum hugetlb_memory_event {
|
||||
HUGETLB_NR_MEMORY_EVENTS,
|
||||
};
|
||||
|
||||
struct hugetlb_cgroup_per_node {
|
||||
/* hugetlb usage in pages over all hstates. */
|
||||
unsigned long usage[HUGE_MAX_HSTATE];
|
||||
};
|
||||
|
||||
struct hugetlb_cgroup {
|
||||
struct cgroup_subsys_state css;
|
||||
|
||||
@@ -57,6 +62,8 @@ struct hugetlb_cgroup {
|
||||
|
||||
/* Handle for "hugetlb.events.local" */
|
||||
struct cgroup_file events_local_file[HUGE_MAX_HSTATE];
|
||||
|
||||
struct hugetlb_cgroup_per_node *nodeinfo[];
|
||||
};
|
||||
|
||||
static inline struct hugetlb_cgroup *
|
||||
|
||||
@@ -46,6 +46,7 @@ struct mempolicy {
|
||||
unsigned short mode; /* See MPOL_* above */
|
||||
unsigned short flags; /* See set_mempolicy() MPOL_F_* above */
|
||||
nodemask_t nodes; /* interleave/bind/perfer */
|
||||
int home_node; /* Home node to use for MPOL_BIND and MPOL_PREFERRED_MANY */
|
||||
|
||||
union {
|
||||
nodemask_t cpuset_mems_allowed; /* relative to these nodes */
|
||||
|
||||
@@ -820,19 +820,15 @@ static inline int page_mapcount(struct page *page)
|
||||
|
||||
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
|
||||
int total_mapcount(struct page *page);
|
||||
int page_trans_huge_mapcount(struct page *page, int *total_mapcount);
|
||||
int page_trans_huge_mapcount(struct page *page);
|
||||
#else
|
||||
static inline int total_mapcount(struct page *page)
|
||||
{
|
||||
return page_mapcount(page);
|
||||
}
|
||||
static inline int page_trans_huge_mapcount(struct page *page,
|
||||
int *total_mapcount)
|
||||
static inline int page_trans_huge_mapcount(struct page *page)
|
||||
{
|
||||
int mapcount = page_mapcount(page);
|
||||
if (total_mapcount)
|
||||
*total_mapcount = mapcount;
|
||||
return mapcount;
|
||||
return page_mapcount(page);
|
||||
}
|
||||
#endif
|
||||
|
||||
@@ -3112,7 +3108,6 @@ int drop_caches_sysctl_handler(struct ctl_table *, int, void *, size_t *,
|
||||
#endif
|
||||
|
||||
void drop_slab(void);
|
||||
void drop_slab_node(int nid);
|
||||
|
||||
#ifndef CONFIG_MMU
|
||||
#define randomize_va_space 0
|
||||
@@ -3165,6 +3160,7 @@ enum mf_flags {
|
||||
MF_ACTION_REQUIRED = 1 << 1,
|
||||
MF_MUST_KILL = 1 << 2,
|
||||
MF_SOFT_OFFLINE = 1 << 3,
|
||||
MF_UNPOISON = 1 << 4,
|
||||
};
|
||||
extern int memory_failure(unsigned long pfn, int flags);
|
||||
extern void memory_failure_queue(unsigned long pfn, int flags);
|
||||
@@ -3205,7 +3201,6 @@ enum mf_action_page_type {
|
||||
MF_MSG_KERNEL_HIGH_ORDER,
|
||||
MF_MSG_SLAB,
|
||||
MF_MSG_DIFFERENT_COMPOUND,
|
||||
MF_MSG_POISONED_HUGE,
|
||||
MF_MSG_HUGE,
|
||||
MF_MSG_FREE_HUGE,
|
||||
MF_MSG_NON_PMD_HUGE,
|
||||
@@ -3220,7 +3215,6 @@ enum mf_action_page_type {
|
||||
MF_MSG_CLEAN_LRU,
|
||||
MF_MSG_TRUNCATED_LRU,
|
||||
MF_MSG_BUDDY,
|
||||
MF_MSG_BUDDY_2ND,
|
||||
MF_MSG_DAX,
|
||||
MF_MSG_UNSPLIT_THP,
|
||||
MF_MSG_UNKNOWN,
|
||||
|
||||
@@ -647,7 +647,7 @@ struct mm_struct {
|
||||
atomic_t tlb_flush_pending;
|
||||
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
||||
/* See flush_tlb_batched_pending() */
|
||||
bool tlb_flush_batched;
|
||||
atomic_t tlb_flush_batched;
|
||||
#endif
|
||||
struct uprobes_state uprobes_state;
|
||||
#ifdef CONFIG_PREEMPT_RT
|
||||
|
||||
@@ -1045,6 +1045,15 @@ static inline int is_highmem_idx(enum zone_type idx)
|
||||
#endif
|
||||
}
|
||||
|
||||
#ifdef CONFIG_ZONE_DMA
|
||||
bool has_managed_dma(void);
|
||||
#else
|
||||
static inline bool has_managed_dma(void)
|
||||
{
|
||||
return false;
|
||||
}
|
||||
#endif
|
||||
|
||||
/**
|
||||
* is_highmem - helper function to quickly check if a struct zone is a
|
||||
* highmem zone or not. This is an attempt to keep references
|
||||
|
||||
@@ -380,7 +380,7 @@ static __always_inline int TestClearPage##uname(struct page *page) \
|
||||
TESTCLEARFLAG(uname, lname, policy)
|
||||
|
||||
#define TESTPAGEFLAG_FALSE(uname, lname) \
|
||||
static inline bool folio_test_##lname(const struct folio *folio) { return 0; } \
|
||||
static inline bool folio_test_##lname(const struct folio *folio) { return false; } \
|
||||
static inline int Page##uname(const struct page *page) { return 0; }
|
||||
|
||||
#define SETPAGEFLAG_NOOP(uname, lname) \
|
||||
@@ -519,7 +519,11 @@ PAGEFLAG_FALSE(Uncached, uncached)
|
||||
PAGEFLAG(HWPoison, hwpoison, PF_ANY)
|
||||
TESTSCFLAG(HWPoison, hwpoison, PF_ANY)
|
||||
#define __PG_HWPOISON (1UL << PG_hwpoison)
|
||||
#define MAGIC_HWPOISON 0x48575053U /* HWPS */
|
||||
extern void SetPageHWPoisonTakenOff(struct page *page);
|
||||
extern void ClearPageHWPoisonTakenOff(struct page *page);
|
||||
extern bool take_page_off_buddy(struct page *page);
|
||||
extern bool put_page_back_buddy(struct page *page);
|
||||
#else
|
||||
PAGEFLAG_FALSE(HWPoison, hwpoison)
|
||||
#define __PG_HWPOISON 0
|
||||
|
||||
@@ -13,7 +13,6 @@
|
||||
* If there is not enough space to store Idle and Young bits in page flags, use
|
||||
* page ext flags instead.
|
||||
*/
|
||||
extern struct page_ext_operations page_idle_ops;
|
||||
|
||||
static inline bool folio_test_young(struct folio *folio)
|
||||
{
|
||||
|
||||
147
include/linux/page_table_check.h
Normal file
147
include/linux/page_table_check.h
Normal file
@@ -0,0 +1,147 @@
|
||||
/* SPDX-License-Identifier: GPL-2.0 */
|
||||
|
||||
/*
|
||||
* Copyright (c) 2021, Google LLC.
|
||||
* Pasha Tatashin <pasha.tatashin@soleen.com>
|
||||
*/
|
||||
#ifndef __LINUX_PAGE_TABLE_CHECK_H
|
||||
#define __LINUX_PAGE_TABLE_CHECK_H
|
||||
|
||||
#ifdef CONFIG_PAGE_TABLE_CHECK
|
||||
#include <linux/jump_label.h>
|
||||
|
||||
extern struct static_key_true page_table_check_disabled;
|
||||
extern struct page_ext_operations page_table_check_ops;
|
||||
|
||||
void __page_table_check_zero(struct page *page, unsigned int order);
|
||||
void __page_table_check_pte_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t pte);
|
||||
void __page_table_check_pmd_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t pmd);
|
||||
void __page_table_check_pud_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pud_t pud);
|
||||
void __page_table_check_pte_set(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep, pte_t pte);
|
||||
void __page_table_check_pmd_set(struct mm_struct *mm, unsigned long addr,
|
||||
pmd_t *pmdp, pmd_t pmd);
|
||||
void __page_table_check_pud_set(struct mm_struct *mm, unsigned long addr,
|
||||
pud_t *pudp, pud_t pud);
|
||||
|
||||
static inline void page_table_check_alloc(struct page *page, unsigned int order)
|
||||
{
|
||||
if (static_branch_likely(&page_table_check_disabled))
|
||||
return;
|
||||
|
||||
__page_table_check_zero(page, order);
|
||||
}
|
||||
|
||||
static inline void page_table_check_free(struct page *page, unsigned int order)
|
||||
{
|
||||
if (static_branch_likely(&page_table_check_disabled))
|
||||
return;
|
||||
|
||||
__page_table_check_zero(page, order);
|
||||
}
|
||||
|
||||
static inline void page_table_check_pte_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t pte)
|
||||
{
|
||||
if (static_branch_likely(&page_table_check_disabled))
|
||||
return;
|
||||
|
||||
__page_table_check_pte_clear(mm, addr, pte);
|
||||
}
|
||||
|
||||
static inline void page_table_check_pmd_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pmd_t pmd)
|
||||
{
|
||||
if (static_branch_likely(&page_table_check_disabled))
|
||||
return;
|
||||
|
||||
__page_table_check_pmd_clear(mm, addr, pmd);
|
||||
}
|
||||
|
||||
static inline void page_table_check_pud_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pud_t pud)
|
||||
{
|
||||
if (static_branch_likely(&page_table_check_disabled))
|
||||
return;
|
||||
|
||||
__page_table_check_pud_clear(mm, addr, pud);
|
||||
}
|
||||
|
||||
static inline void page_table_check_pte_set(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep,
|
||||
pte_t pte)
|
||||
{
|
||||
if (static_branch_likely(&page_table_check_disabled))
|
||||
return;
|
||||
|
||||
__page_table_check_pte_set(mm, addr, ptep, pte);
|
||||
}
|
||||
|
||||
static inline void page_table_check_pmd_set(struct mm_struct *mm,
|
||||
unsigned long addr, pmd_t *pmdp,
|
||||
pmd_t pmd)
|
||||
{
|
||||
if (static_branch_likely(&page_table_check_disabled))
|
||||
return;
|
||||
|
||||
__page_table_check_pmd_set(mm, addr, pmdp, pmd);
|
||||
}
|
||||
|
||||
static inline void page_table_check_pud_set(struct mm_struct *mm,
|
||||
unsigned long addr, pud_t *pudp,
|
||||
pud_t pud)
|
||||
{
|
||||
if (static_branch_likely(&page_table_check_disabled))
|
||||
return;
|
||||
|
||||
__page_table_check_pud_set(mm, addr, pudp, pud);
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
static inline void page_table_check_alloc(struct page *page, unsigned int order)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void page_table_check_free(struct page *page, unsigned int order)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void page_table_check_pte_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t pte)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void page_table_check_pmd_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pmd_t pmd)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void page_table_check_pud_clear(struct mm_struct *mm,
|
||||
unsigned long addr, pud_t pud)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void page_table_check_pte_set(struct mm_struct *mm,
|
||||
unsigned long addr, pte_t *ptep,
|
||||
pte_t pte)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void page_table_check_pmd_set(struct mm_struct *mm,
|
||||
unsigned long addr, pmd_t *pmdp,
|
||||
pmd_t pmd)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void page_table_check_pud_set(struct mm_struct *mm,
|
||||
unsigned long addr, pud_t *pudp,
|
||||
pud_t pud)
|
||||
{
|
||||
}
|
||||
|
||||
#endif /* CONFIG_PAGE_TABLE_CHECK */
|
||||
#endif /* __LINUX_PAGE_TABLE_CHECK_H */
|
||||
@@ -258,6 +258,14 @@ static inline int pmdp_clear_flush_young(struct vm_area_struct *vma,
|
||||
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */
|
||||
#endif
|
||||
|
||||
#ifndef __HAVE_ARCH_PTEP_CLEAR
|
||||
static inline void ptep_clear(struct mm_struct *mm, unsigned long addr,
|
||||
pte_t *ptep)
|
||||
{
|
||||
pte_clear(mm, addr, ptep);
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifndef __HAVE_ARCH_PTEP_GET_AND_CLEAR
|
||||
static inline pte_t ptep_get_and_clear(struct mm_struct *mm,
|
||||
unsigned long address,
|
||||
|
||||
@@ -214,6 +214,32 @@ static inline void fs_reclaim_acquire(gfp_t gfp_mask) { }
|
||||
static inline void fs_reclaim_release(gfp_t gfp_mask) { }
|
||||
#endif
|
||||
|
||||
/* Any memory-allocation retry loop should use
|
||||
* memalloc_retry_wait(), and pass the flags for the most
|
||||
* constrained allocation attempt that might have failed.
|
||||
* This provides useful documentation of where loops are,
|
||||
* and a central place to fine tune the waiting as the MM
|
||||
* implementation changes.
|
||||
*/
|
||||
static inline void memalloc_retry_wait(gfp_t gfp_flags)
|
||||
{
|
||||
/* We use io_schedule_timeout because waiting for memory
|
||||
* typically includes waiting for dirty pages to be
|
||||
* written out, which requires IO.
|
||||
*/
|
||||
__set_current_state(TASK_UNINTERRUPTIBLE);
|
||||
gfp_flags = current_gfp_context(gfp_flags);
|
||||
if (gfpflags_allow_blocking(gfp_flags) &&
|
||||
!(gfp_flags & __GFP_NORETRY))
|
||||
/* Probably waited already, no need for much more */
|
||||
io_schedule_timeout(1);
|
||||
else
|
||||
/* Probably didn't wait, and has now released a lock,
|
||||
* so now is a good time to wait
|
||||
*/
|
||||
io_schedule_timeout(HZ/50);
|
||||
}
|
||||
|
||||
/**
|
||||
* might_alloc - Mark possible allocation sites
|
||||
* @gfp_mask: gfp_t flags that would be used to allocate
|
||||
|
||||
@@ -514,7 +514,7 @@ extern int __swp_swapcount(swp_entry_t entry);
|
||||
extern int swp_swapcount(swp_entry_t entry);
|
||||
extern struct swap_info_struct *page_swap_info(struct page *);
|
||||
extern struct swap_info_struct *swp_swap_info(swp_entry_t entry);
|
||||
extern bool reuse_swap_page(struct page *, int *);
|
||||
extern bool reuse_swap_page(struct page *);
|
||||
extern int try_to_free_swap(struct page *);
|
||||
struct backing_dev_info;
|
||||
extern int init_swap_address_space(unsigned int type, unsigned long nr_pages);
|
||||
@@ -680,8 +680,8 @@ static inline int swp_swapcount(swp_entry_t entry)
|
||||
return 0;
|
||||
}
|
||||
|
||||
#define reuse_swap_page(page, total_map_swapcount) \
|
||||
(page_trans_huge_mapcount(page, total_map_swapcount) == 1)
|
||||
#define reuse_swap_page(page) \
|
||||
(page_trans_huge_mapcount(page) == 1)
|
||||
|
||||
static inline int try_to_free_swap(struct page *page)
|
||||
{
|
||||
|
||||
@@ -1057,6 +1057,9 @@ asmlinkage long sys_landlock_add_rule(int ruleset_fd, enum landlock_rule_type ru
|
||||
const void __user *rule_attr, __u32 flags);
|
||||
asmlinkage long sys_landlock_restrict_self(int ruleset_fd, __u32 flags);
|
||||
asmlinkage long sys_memfd_secret(unsigned int flags);
|
||||
asmlinkage long sys_set_mempolicy_home_node(unsigned long start, unsigned long len,
|
||||
unsigned long home_node,
|
||||
unsigned long flags);
|
||||
|
||||
/*
|
||||
* Architecture-specific system calls
|
||||
|
||||
@@ -98,6 +98,9 @@ enum vm_event_item { PGPGIN, PGPGOUT, PSWPIN, PSWPOUT,
|
||||
THP_SPLIT_PAGE_FAILED,
|
||||
THP_DEFERRED_SPLIT_PAGE,
|
||||
THP_SPLIT_PMD,
|
||||
THP_SCAN_EXCEED_NONE_PTE,
|
||||
THP_SCAN_EXCEED_SWAP_PTE,
|
||||
THP_SCAN_EXCEED_SHARED_PTE,
|
||||
#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
|
||||
THP_SPLIT_PUD,
|
||||
#endif
|
||||
|
||||
@@ -358,7 +358,6 @@ TRACE_EVENT(aer_event,
|
||||
EM ( MF_MSG_KERNEL_HIGH_ORDER, "high-order kernel page" ) \
|
||||
EM ( MF_MSG_SLAB, "kernel slab page" ) \
|
||||
EM ( MF_MSG_DIFFERENT_COMPOUND, "different compound page after locking" ) \
|
||||
EM ( MF_MSG_POISONED_HUGE, "huge page already hardware poisoned" ) \
|
||||
EM ( MF_MSG_HUGE, "huge page" ) \
|
||||
EM ( MF_MSG_FREE_HUGE, "free huge page" ) \
|
||||
EM ( MF_MSG_NON_PMD_HUGE, "non-pmd-sized huge page" ) \
|
||||
@@ -373,7 +372,6 @@ TRACE_EVENT(aer_event,
|
||||
EM ( MF_MSG_CLEAN_LRU, "clean LRU page" ) \
|
||||
EM ( MF_MSG_TRUNCATED_LRU, "already truncated LRU page" ) \
|
||||
EM ( MF_MSG_BUDDY, "free buddy page" ) \
|
||||
EM ( MF_MSG_BUDDY_2ND, "free buddy page (2nd try)" ) \
|
||||
EM ( MF_MSG_DAX, "dax page" ) \
|
||||
EM ( MF_MSG_UNSPLIT_THP, "unsplit thp" ) \
|
||||
EMe ( MF_MSG_UNKNOWN, "unknown page" )
|
||||
|
||||
@@ -68,10 +68,9 @@ DEFINE_EVENT(mm_compaction_isolate_template, mm_compaction_isolate_freepages,
|
||||
TRACE_EVENT(mm_compaction_migratepages,
|
||||
|
||||
TP_PROTO(unsigned long nr_all,
|
||||
int migrate_rc,
|
||||
struct list_head *migratepages),
|
||||
unsigned int nr_succeeded),
|
||||
|
||||
TP_ARGS(nr_all, migrate_rc, migratepages),
|
||||
TP_ARGS(nr_all, nr_succeeded),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned long, nr_migrated)
|
||||
@@ -79,23 +78,8 @@ TRACE_EVENT(mm_compaction_migratepages,
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
unsigned long nr_failed = 0;
|
||||
struct list_head *page_lru;
|
||||
|
||||
/*
|
||||
* migrate_pages() returns either a non-negative number
|
||||
* with the number of pages that failed migration, or an
|
||||
* error code, in which case we need to count the remaining
|
||||
* pages manually
|
||||
*/
|
||||
if (migrate_rc >= 0)
|
||||
nr_failed = migrate_rc;
|
||||
else
|
||||
list_for_each(page_lru, migratepages)
|
||||
nr_failed++;
|
||||
|
||||
__entry->nr_migrated = nr_all - nr_failed;
|
||||
__entry->nr_failed = nr_failed;
|
||||
__entry->nr_migrated = nr_succeeded;
|
||||
__entry->nr_failed = nr_all - nr_succeeded;
|
||||
),
|
||||
|
||||
TP_printk("nr_migrated=%lu nr_failed=%lu",
|
||||
|
||||
@@ -11,10 +11,10 @@
|
||||
|
||||
TRACE_EVENT(damon_aggregated,
|
||||
|
||||
TP_PROTO(struct damon_target *t, struct damon_region *r,
|
||||
unsigned int nr_regions),
|
||||
TP_PROTO(struct damon_target *t, unsigned int target_id,
|
||||
struct damon_region *r, unsigned int nr_regions),
|
||||
|
||||
TP_ARGS(t, r, nr_regions),
|
||||
TP_ARGS(t, target_id, r, nr_regions),
|
||||
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned long, target_id)
|
||||
@@ -22,19 +22,22 @@ TRACE_EVENT(damon_aggregated,
|
||||
__field(unsigned long, start)
|
||||
__field(unsigned long, end)
|
||||
__field(unsigned int, nr_accesses)
|
||||
__field(unsigned int, age)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->target_id = t->id;
|
||||
__entry->target_id = target_id;
|
||||
__entry->nr_regions = nr_regions;
|
||||
__entry->start = r->ar.start;
|
||||
__entry->end = r->ar.end;
|
||||
__entry->nr_accesses = r->nr_accesses;
|
||||
__entry->age = r->age;
|
||||
),
|
||||
|
||||
TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u",
|
||||
TP_printk("target_id=%lu nr_regions=%u %lu-%lu: %u %u",
|
||||
__entry->target_id, __entry->nr_regions,
|
||||
__entry->start, __entry->end, __entry->nr_accesses)
|
||||
__entry->start, __entry->end,
|
||||
__entry->nr_accesses, __entry->age)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_DAMON_H */
|
||||
|
||||
@@ -8,24 +8,6 @@
|
||||
#include <linux/types.h>
|
||||
#include <linux/tracepoint.h>
|
||||
|
||||
TRACE_EVENT(hugepage_invalidate,
|
||||
|
||||
TP_PROTO(unsigned long addr, unsigned long pte),
|
||||
TP_ARGS(addr, pte),
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned long, addr)
|
||||
__field(unsigned long, pte)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->addr = addr;
|
||||
__entry->pte = pte;
|
||||
),
|
||||
|
||||
TP_printk("hugepage invalidate at addr 0x%lx and pte = 0x%lx",
|
||||
__entry->addr, __entry->pte)
|
||||
);
|
||||
|
||||
TRACE_EVENT(hugepage_set_pmd,
|
||||
|
||||
TP_PROTO(unsigned long addr, unsigned long pmd),
|
||||
@@ -65,23 +47,6 @@ TRACE_EVENT(hugepage_update,
|
||||
|
||||
TP_printk("hugepage update at addr 0x%lx and pte = 0x%lx clr = 0x%lx, set = 0x%lx", __entry->addr, __entry->pte, __entry->clr, __entry->set)
|
||||
);
|
||||
TRACE_EVENT(hugepage_splitting,
|
||||
|
||||
TP_PROTO(unsigned long addr, unsigned long pte),
|
||||
TP_ARGS(addr, pte),
|
||||
TP_STRUCT__entry(
|
||||
__field(unsigned long, addr)
|
||||
__field(unsigned long, pte)
|
||||
),
|
||||
|
||||
TP_fast_assign(
|
||||
__entry->addr = addr;
|
||||
__entry->pte = pte;
|
||||
),
|
||||
|
||||
TP_printk("hugepage splitting at addr 0x%lx and pte = 0x%lx",
|
||||
__entry->addr, __entry->pte)
|
||||
);
|
||||
|
||||
#endif /* _TRACE_THP_H */
|
||||
|
||||
|
||||
@@ -883,8 +883,11 @@ __SYSCALL(__NR_process_mrelease, sys_process_mrelease)
|
||||
#define __NR_futex_waitv 449
|
||||
__SYSCALL(__NR_futex_waitv, sys_futex_waitv)
|
||||
|
||||
#define __NR_set_mempolicy_home_node 450
|
||||
__SYSCALL(__NR_set_mempolicy_home_node, sys_set_mempolicy_home_node)
|
||||
|
||||
#undef __NR_syscalls
|
||||
#define __NR_syscalls 450
|
||||
#define __NR_syscalls 451
|
||||
|
||||
/*
|
||||
* 32 bit systems traditionally used different
|
||||
|
||||
@@ -203,7 +203,7 @@ static int __init dma_atomic_pool_init(void)
|
||||
GFP_KERNEL);
|
||||
if (!atomic_pool_kernel)
|
||||
ret = -ENOMEM;
|
||||
if (IS_ENABLED(CONFIG_ZONE_DMA)) {
|
||||
if (has_managed_dma()) {
|
||||
atomic_pool_dma = __dma_atomic_pool_init(atomic_pool_size,
|
||||
GFP_KERNEL | GFP_DMA);
|
||||
if (!atomic_pool_dma)
|
||||
@@ -226,7 +226,7 @@ static inline struct gen_pool *dma_guess_pool(struct gen_pool *prev, gfp_t gfp)
|
||||
if (prev == NULL) {
|
||||
if (IS_ENABLED(CONFIG_ZONE_DMA32) && (gfp & GFP_DMA32))
|
||||
return atomic_pool_dma32;
|
||||
if (IS_ENABLED(CONFIG_ZONE_DMA) && (gfp & GFP_DMA))
|
||||
if (atomic_pool_dma && (gfp & GFP_DMA))
|
||||
return atomic_pool_dma;
|
||||
return atomic_pool_kernel;
|
||||
}
|
||||
|
||||
@@ -297,6 +297,7 @@ COND_SYSCALL(get_mempolicy);
|
||||
COND_SYSCALL(set_mempolicy);
|
||||
COND_SYSCALL(migrate_pages);
|
||||
COND_SYSCALL(move_pages);
|
||||
COND_SYSCALL(set_mempolicy_home_node);
|
||||
|
||||
COND_SYSCALL(perf_event_open);
|
||||
COND_SYSCALL(accept4);
|
||||
|
||||
@@ -123,6 +123,7 @@ static unsigned long long_max = LONG_MAX;
|
||||
static int one_hundred = 100;
|
||||
static int two_hundred = 200;
|
||||
static int one_thousand = 1000;
|
||||
static int three_thousand = 3000;
|
||||
#ifdef CONFIG_PRINTK
|
||||
static int ten_thousand = 10000;
|
||||
#endif
|
||||
@@ -2960,7 +2961,7 @@ static struct ctl_table vm_table[] = {
|
||||
.mode = 0644,
|
||||
.proc_handler = watermark_scale_factor_sysctl_handler,
|
||||
.extra1 = SYSCTL_ONE,
|
||||
.extra2 = &one_thousand,
|
||||
.extra2 = &three_thousand,
|
||||
},
|
||||
{
|
||||
.procname = "percpu_pagelist_high_fraction",
|
||||
|
||||
@@ -1086,9 +1086,33 @@ static long dmirror_fops_unlocked_ioctl(struct file *filp,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int dmirror_fops_mmap(struct file *file, struct vm_area_struct *vma)
|
||||
{
|
||||
unsigned long addr;
|
||||
|
||||
for (addr = vma->vm_start; addr < vma->vm_end; addr += PAGE_SIZE) {
|
||||
struct page *page;
|
||||
int ret;
|
||||
|
||||
page = alloc_page(GFP_KERNEL | __GFP_ZERO);
|
||||
if (!page)
|
||||
return -ENOMEM;
|
||||
|
||||
ret = vm_insert_page(vma, addr, page);
|
||||
if (ret) {
|
||||
__free_page(page);
|
||||
return ret;
|
||||
}
|
||||
put_page(page);
|
||||
}
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static const struct file_operations dmirror_fops = {
|
||||
.open = dmirror_fops_open,
|
||||
.release = dmirror_fops_release,
|
||||
.mmap = dmirror_fops_mmap,
|
||||
.unlocked_ioctl = dmirror_fops_unlocked_ioctl,
|
||||
.llseek = default_llseek,
|
||||
.owner = THIS_MODULE,
|
||||
|
||||
@@ -62,6 +62,30 @@ config PAGE_OWNER
|
||||
|
||||
If unsure, say N.
|
||||
|
||||
config PAGE_TABLE_CHECK
|
||||
bool "Check for invalid mappings in user page tables"
|
||||
depends on ARCH_SUPPORTS_PAGE_TABLE_CHECK
|
||||
select PAGE_EXTENSION
|
||||
help
|
||||
Check that an anonymous page is not being mapped twice with read/write
|
||||
permissions. Check that anonymous and file pages are not being
|
||||
erroneously shared. Since the checking is performed at the time
|
||||
entries are added to and removed from user page tables, leaking, corruption
|
||||
and double mapping problems are detected synchronously.
|
||||
|
||||
If unsure say "n".
|
||||
|
||||
config PAGE_TABLE_CHECK_ENFORCED
|
||||
bool "Enforce the page table checking by default"
|
||||
depends on PAGE_TABLE_CHECK
|
||||
help
|
||||
Always enable page table checking. By default the page table checking
|
||||
is disabled, and can be optionally enabled via page_table_check=on
|
||||
kernel parameter. This config enforces that page table check is always
|
||||
enabled.
|
||||
|
||||
If unsure say "n".
|
||||
|
||||
config PAGE_POISONING
|
||||
bool "Poison pages after freeing"
|
||||
help
|
||||
|
||||
@@ -114,6 +114,7 @@ obj-$(CONFIG_GENERIC_EARLY_IOREMAP) += early_ioremap.o
|
||||
obj-$(CONFIG_CMA) += cma.o
|
||||
obj-$(CONFIG_MEMORY_BALLOON) += balloon_compaction.o
|
||||
obj-$(CONFIG_PAGE_EXTENSION) += page_ext.o
|
||||
obj-$(CONFIG_PAGE_TABLE_CHECK) += page_table_check.o
|
||||
obj-$(CONFIG_CMA_DEBUGFS) += cma_debug.o
|
||||
obj-$(CONFIG_SECRETMEM) += secretmem.o
|
||||
obj-$(CONFIG_CMA_SYSFS) += cma_sysfs.o
|
||||
|
||||
@@ -2280,6 +2280,7 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
|
||||
unsigned long last_migrated_pfn;
|
||||
const bool sync = cc->mode != MIGRATE_ASYNC;
|
||||
bool update_cached;
|
||||
unsigned int nr_succeeded = 0;
|
||||
|
||||
/*
|
||||
* These counters track activities during zone compaction. Initialize
|
||||
@@ -2398,10 +2399,10 @@ compact_zone(struct compact_control *cc, struct capture_control *capc)
|
||||
|
||||
err = migrate_pages(&cc->migratepages, compaction_alloc,
|
||||
compaction_free, (unsigned long)cc, cc->mode,
|
||||
MR_COMPACTION, NULL);
|
||||
MR_COMPACTION, &nr_succeeded);
|
||||
|
||||
trace_mm_compaction_migratepages(cc->nr_migratepages, err,
|
||||
&cc->migratepages);
|
||||
trace_mm_compaction_migratepages(cc->nr_migratepages,
|
||||
nr_succeeded);
|
||||
|
||||
/* All pages were either migrated or will be released */
|
||||
cc->nr_migratepages = 0;
|
||||
|
||||
@@ -11,7 +11,6 @@
|
||||
#include <linux/delay.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/mm.h>
|
||||
#include <linux/random.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
|
||||
@@ -23,9 +22,6 @@
|
||||
#define DAMON_MIN_REGION 1
|
||||
#endif
|
||||
|
||||
/* Get a random number in [l, r) */
|
||||
#define damon_rand(l, r) (l + prandom_u32_max(r - l))
|
||||
|
||||
static DEFINE_MUTEX(damon_lock);
|
||||
static int nr_running_ctxs;
|
||||
|
||||
@@ -53,17 +49,6 @@ struct damon_region *damon_new_region(unsigned long start, unsigned long end)
|
||||
return region;
|
||||
}
|
||||
|
||||
/*
|
||||
* Add a region between two other regions
|
||||
*/
|
||||
inline void damon_insert_region(struct damon_region *r,
|
||||
struct damon_region *prev, struct damon_region *next,
|
||||
struct damon_target *t)
|
||||
{
|
||||
__list_add(&r->list, &prev->list, &next->list);
|
||||
t->nr_regions++;
|
||||
}
|
||||
|
||||
void damon_add_region(struct damon_region *r, struct damon_target *t)
|
||||
{
|
||||
list_add_tail(&r->list, &t->regions_list);
|
||||
@@ -106,8 +91,7 @@ struct damos *damon_new_scheme(
|
||||
scheme->min_age_region = min_age_region;
|
||||
scheme->max_age_region = max_age_region;
|
||||
scheme->action = action;
|
||||
scheme->stat_count = 0;
|
||||
scheme->stat_sz = 0;
|
||||
scheme->stat = (struct damos_stat){};
|
||||
INIT_LIST_HEAD(&scheme->list);
|
||||
|
||||
scheme->quota.ms = quota->ms;
|
||||
@@ -530,15 +514,17 @@ static bool kdamond_aggregate_interval_passed(struct damon_ctx *ctx)
|
||||
static void kdamond_reset_aggregated(struct damon_ctx *c)
|
||||
{
|
||||
struct damon_target *t;
|
||||
unsigned int ti = 0; /* target's index */
|
||||
|
||||
damon_for_each_target(t, c) {
|
||||
struct damon_region *r;
|
||||
|
||||
damon_for_each_region(r, t) {
|
||||
trace_damon_aggregated(t, r, damon_nr_regions(t));
|
||||
trace_damon_aggregated(t, ti, r, damon_nr_regions(t));
|
||||
r->last_nr_accesses = r->nr_accesses;
|
||||
r->nr_accesses = 0;
|
||||
}
|
||||
ti++;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -578,6 +564,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
|
||||
struct damos_quota *quota = &s->quota;
|
||||
unsigned long sz = r->ar.end - r->ar.start;
|
||||
struct timespec64 begin, end;
|
||||
unsigned long sz_applied = 0;
|
||||
|
||||
if (!s->wmarks.activated)
|
||||
continue;
|
||||
@@ -631,7 +618,7 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
|
||||
damon_split_region_at(c, t, r, sz);
|
||||
}
|
||||
ktime_get_coarse_ts64(&begin);
|
||||
c->primitive.apply_scheme(c, t, r, s);
|
||||
sz_applied = c->primitive.apply_scheme(c, t, r, s);
|
||||
ktime_get_coarse_ts64(&end);
|
||||
quota->total_charged_ns += timespec64_to_ns(&end) -
|
||||
timespec64_to_ns(&begin);
|
||||
@@ -645,8 +632,11 @@ static void damon_do_apply_schemes(struct damon_ctx *c,
|
||||
r->age = 0;
|
||||
|
||||
update_stat:
|
||||
s->stat_count++;
|
||||
s->stat_sz += sz;
|
||||
s->stat.nr_tried++;
|
||||
s->stat.sz_tried += sz;
|
||||
if (sz_applied)
|
||||
s->stat.nr_applied++;
|
||||
s->stat.sz_applied += sz_applied;
|
||||
}
|
||||
}
|
||||
|
||||
@@ -694,6 +684,8 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
|
||||
if (time_after_eq(jiffies, quota->charged_from +
|
||||
msecs_to_jiffies(
|
||||
quota->reset_interval))) {
|
||||
if (quota->esz && quota->charged_sz >= quota->esz)
|
||||
s->stat.qt_exceeds++;
|
||||
quota->total_charged_sz += quota->charged_sz;
|
||||
quota->charged_from = jiffies;
|
||||
quota->charged_sz = 0;
|
||||
@@ -733,7 +725,10 @@ static void kdamond_apply_schemes(struct damon_ctx *c)
|
||||
}
|
||||
}
|
||||
|
||||
#define sz_damon_region(r) (r->ar.end - r->ar.start)
|
||||
static inline unsigned long sz_damon_region(struct damon_region *r)
|
||||
{
|
||||
return r->ar.end - r->ar.start;
|
||||
}
|
||||
|
||||
/*
|
||||
* Merge two adjacent regions into one region
|
||||
@@ -750,8 +745,6 @@ static void damon_merge_two_regions(struct damon_target *t,
|
||||
damon_destroy_region(r, t);
|
||||
}
|
||||
|
||||
#define diff_of(a, b) (a > b ? a - b : b - a)
|
||||
|
||||
/*
|
||||
* Merge adjacent regions having similar access frequencies
|
||||
*
|
||||
@@ -765,13 +758,13 @@ static void damon_merge_regions_of(struct damon_target *t, unsigned int thres,
|
||||
struct damon_region *r, *prev = NULL, *next;
|
||||
|
||||
damon_for_each_region_safe(r, next, t) {
|
||||
if (diff_of(r->nr_accesses, r->last_nr_accesses) > thres)
|
||||
if (abs(r->nr_accesses - r->last_nr_accesses) > thres)
|
||||
r->age = 0;
|
||||
else
|
||||
r->age++;
|
||||
|
||||
if (prev && prev->ar.end == r->ar.start &&
|
||||
diff_of(prev->nr_accesses, r->nr_accesses) <= thres &&
|
||||
abs(prev->nr_accesses - r->nr_accesses) <= thres &&
|
||||
sz_damon_region(prev) + sz_damon_region(r) <= sz_limit)
|
||||
damon_merge_two_regions(t, prev, r);
|
||||
else
|
||||
|
||||
@@ -105,7 +105,7 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
|
||||
|
||||
damon_for_each_scheme(s, c) {
|
||||
rc = scnprintf(&buf[written], len - written,
|
||||
"%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu\n",
|
||||
"%lu %lu %u %u %u %u %d %lu %lu %lu %u %u %u %d %lu %lu %lu %lu %lu %lu %lu %lu %lu\n",
|
||||
s->min_sz_region, s->max_sz_region,
|
||||
s->min_nr_accesses, s->max_nr_accesses,
|
||||
s->min_age_region, s->max_age_region,
|
||||
@@ -117,7 +117,9 @@ static ssize_t sprint_schemes(struct damon_ctx *c, char *buf, ssize_t len)
|
||||
s->quota.weight_age,
|
||||
s->wmarks.metric, s->wmarks.interval,
|
||||
s->wmarks.high, s->wmarks.mid, s->wmarks.low,
|
||||
s->stat_count, s->stat_sz);
|
||||
s->stat.nr_tried, s->stat.sz_tried,
|
||||
s->stat.nr_applied, s->stat.sz_applied,
|
||||
s->stat.qt_exceeds);
|
||||
if (!rc)
|
||||
return -ENOMEM;
|
||||
|
||||
@@ -213,6 +215,13 @@ static struct damos **str_to_schemes(const char *str, ssize_t len,
|
||||
if (!damos_action_valid(action))
|
||||
goto fail;
|
||||
|
||||
if (min_sz > max_sz || min_nr_a > max_nr_a || min_age > max_age)
|
||||
goto fail;
|
||||
|
||||
if (wmarks.high < wmarks.mid || wmarks.high < wmarks.low ||
|
||||
wmarks.mid < wmarks.low)
|
||||
goto fail;
|
||||
|
||||
pos += parsed;
|
||||
scheme = damon_new_scheme(min_sz, max_sz, min_nr_a, max_nr_a,
|
||||
min_age, max_age, action, &quota, &wmarks);
|
||||
@@ -355,7 +364,7 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
|
||||
struct damon_ctx *ctx = file->private_data;
|
||||
struct damon_target *t, *next_t;
|
||||
bool id_is_pid = true;
|
||||
char *kbuf, *nrs;
|
||||
char *kbuf;
|
||||
unsigned long *targets;
|
||||
ssize_t nr_targets;
|
||||
ssize_t ret;
|
||||
@@ -365,14 +374,13 @@ static ssize_t dbgfs_target_ids_write(struct file *file,
|
||||
if (IS_ERR(kbuf))
|
||||
return PTR_ERR(kbuf);
|
||||
|
||||
nrs = kbuf;
|
||||
if (!strncmp(kbuf, "paddr\n", count)) {
|
||||
id_is_pid = false;
|
||||
/* target id is meaningless here, but we set it just for fun */
|
||||
scnprintf(kbuf, count, "42 ");
|
||||
}
|
||||
|
||||
targets = str_to_target_ids(nrs, count, &nr_targets);
|
||||
targets = str_to_target_ids(kbuf, count, &nr_targets);
|
||||
if (!targets) {
|
||||
ret = -ENOMEM;
|
||||
goto out;
|
||||
|
||||
@@ -73,7 +73,7 @@ static void __damon_pa_prepare_access_check(struct damon_ctx *ctx,
|
||||
damon_pa_mkold(r->sampling_addr);
|
||||
}
|
||||
|
||||
void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
|
||||
static void damon_pa_prepare_access_checks(struct damon_ctx *ctx)
|
||||
{
|
||||
struct damon_target *t;
|
||||
struct damon_region *r;
|
||||
@@ -192,7 +192,7 @@ static void __damon_pa_check_access(struct damon_ctx *ctx,
|
||||
last_addr = r->sampling_addr;
|
||||
}
|
||||
|
||||
unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
|
||||
static unsigned int damon_pa_check_accesses(struct damon_ctx *ctx)
|
||||
{
|
||||
struct damon_target *t;
|
||||
struct damon_region *r;
|
||||
@@ -213,14 +213,15 @@ bool damon_pa_target_valid(void *t)
|
||||
return true;
|
||||
}
|
||||
|
||||
int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme)
|
||||
static unsigned long damon_pa_apply_scheme(struct damon_ctx *ctx,
|
||||
struct damon_target *t, struct damon_region *r,
|
||||
struct damos *scheme)
|
||||
{
|
||||
unsigned long addr;
|
||||
unsigned long addr, applied;
|
||||
LIST_HEAD(page_list);
|
||||
|
||||
if (scheme->action != DAMOS_PAGEOUT)
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
|
||||
for (addr = r->ar.start; addr < r->ar.end; addr += PAGE_SIZE) {
|
||||
struct page *page = damon_get_page(PHYS_PFN(addr));
|
||||
@@ -241,13 +242,14 @@ int damon_pa_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
|
||||
put_page(page);
|
||||
}
|
||||
}
|
||||
reclaim_pages(&page_list);
|
||||
applied = reclaim_pages(&page_list);
|
||||
cond_resched();
|
||||
return 0;
|
||||
return applied * PAGE_SIZE;
|
||||
}
|
||||
|
||||
int damon_pa_scheme_score(struct damon_ctx *context, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme)
|
||||
static int damon_pa_scheme_score(struct damon_ctx *context,
|
||||
struct damon_target *t, struct damon_region *r,
|
||||
struct damos *scheme)
|
||||
{
|
||||
switch (scheme->action) {
|
||||
case DAMOS_PAGEOUT:
|
||||
|
||||
@@ -6,10 +6,6 @@
|
||||
*/
|
||||
|
||||
#include <linux/damon.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
/* Get a random number in [l, r) */
|
||||
#define damon_rand(l, r) (l + prandom_u32_max(r - l))
|
||||
|
||||
struct page *damon_get_page(unsigned long pfn);
|
||||
|
||||
|
||||
@@ -185,6 +185,36 @@ module_param(monitor_region_end, ulong, 0600);
|
||||
static int kdamond_pid __read_mostly = -1;
|
||||
module_param(kdamond_pid, int, 0400);
|
||||
|
||||
/*
|
||||
* Number of memory regions for which reclaim was tried.
|
||||
*/
|
||||
static unsigned long nr_reclaim_tried_regions __read_mostly;
|
||||
module_param(nr_reclaim_tried_regions, ulong, 0400);
|
||||
|
||||
/*
|
||||
* Total bytes of memory regions for which reclaim was tried.
|
||||
*/
|
||||
static unsigned long bytes_reclaim_tried_regions __read_mostly;
|
||||
module_param(bytes_reclaim_tried_regions, ulong, 0400);
|
||||
|
||||
/*
|
||||
* Number of memory regions that were successfully reclaimed.
|
||||
*/
|
||||
static unsigned long nr_reclaimed_regions __read_mostly;
|
||||
module_param(nr_reclaimed_regions, ulong, 0400);
|
||||
|
||||
/*
|
||||
* Total bytes of memory regions that were successfully reclaimed.
|
||||
*/
|
||||
static unsigned long bytes_reclaimed_regions __read_mostly;
|
||||
module_param(bytes_reclaimed_regions, ulong, 0400);
|
||||
|
||||
/*
|
||||
* Number of times that the time/space quota limits have been exceeded
|
||||
*/
|
||||
static unsigned long nr_quota_exceeds __read_mostly;
|
||||
module_param(nr_quota_exceeds, ulong, 0400);
|
||||
|
||||
static struct damon_ctx *ctx;
|
||||
static struct damon_target *target;
|
||||
|
||||
@@ -333,6 +363,21 @@ static void damon_reclaim_timer_fn(struct work_struct *work)
|
||||
}
|
||||
static DECLARE_DELAYED_WORK(damon_reclaim_timer, damon_reclaim_timer_fn);
|
||||
|
||||
static int damon_reclaim_after_aggregation(struct damon_ctx *c)
|
||||
{
|
||||
struct damos *s;
|
||||
|
||||
/* update the stats parameter */
|
||||
damon_for_each_scheme(s, c) {
|
||||
nr_reclaim_tried_regions = s->stat.nr_tried;
|
||||
bytes_reclaim_tried_regions = s->stat.sz_tried;
|
||||
nr_reclaimed_regions = s->stat.nr_applied;
|
||||
bytes_reclaimed_regions = s->stat.sz_applied;
|
||||
nr_quota_exceeds = s->stat.qt_exceeds;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
static int __init damon_reclaim_init(void)
|
||||
{
|
||||
ctx = damon_new_ctx();
|
||||
@@ -340,6 +385,7 @@ static int __init damon_reclaim_init(void)
|
||||
return -ENOMEM;
|
||||
|
||||
damon_pa_set_primitives(ctx);
|
||||
ctx->callback.after_aggregation = damon_reclaim_after_aggregation;
|
||||
|
||||
/* 4242 means nothing but fun */
|
||||
target = damon_new_target(4242);
|
||||
|
||||
182
mm/damon/vaddr.c
182
mm/damon/vaddr.c
@@ -26,8 +26,10 @@
|
||||
* 't->id' should be the pointer to the relevant 'struct pid' having reference
|
||||
* count. Caller must put the returned task, unless it is NULL.
|
||||
*/
|
||||
#define damon_get_task_struct(t) \
|
||||
(get_pid_task((struct pid *)t->id, PIDTYPE_PID))
|
||||
static inline struct task_struct *damon_get_task_struct(struct damon_target *t)
|
||||
{
|
||||
return get_pid_task((struct pid *)t->id, PIDTYPE_PID);
|
||||
}
|
||||
|
||||
/*
|
||||
* Get the mm_struct of the given target
|
||||
@@ -98,16 +100,6 @@ static unsigned long sz_range(struct damon_addr_range *r)
|
||||
return r->end - r->start;
|
||||
}
|
||||
|
||||
static void swap_ranges(struct damon_addr_range *r1,
|
||||
struct damon_addr_range *r2)
|
||||
{
|
||||
struct damon_addr_range tmp;
|
||||
|
||||
tmp = *r1;
|
||||
*r1 = *r2;
|
||||
*r2 = tmp;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find three regions separated by two biggest unmapped regions
|
||||
*
|
||||
@@ -146,9 +138,9 @@ static int __damon_va_three_regions(struct vm_area_struct *vma,
|
||||
gap.start = last_vma->vm_end;
|
||||
gap.end = vma->vm_start;
|
||||
if (sz_range(&gap) > sz_range(&second_gap)) {
|
||||
swap_ranges(&gap, &second_gap);
|
||||
swap(gap, second_gap);
|
||||
if (sz_range(&second_gap) > sz_range(&first_gap))
|
||||
swap_ranges(&second_gap, &first_gap);
|
||||
swap(second_gap, first_gap);
|
||||
}
|
||||
next:
|
||||
last_vma = vma;
|
||||
@@ -159,7 +151,7 @@ next:
|
||||
|
||||
/* Sort the two biggest gaps by address */
|
||||
if (first_gap.start > second_gap.start)
|
||||
swap_ranges(&first_gap, &second_gap);
|
||||
swap(first_gap, second_gap);
|
||||
|
||||
/* Store the result */
|
||||
regions[0].start = ALIGN(start, DAMON_MIN_REGION);
|
||||
@@ -240,13 +232,19 @@ static int damon_va_three_regions(struct damon_target *t,
|
||||
static void __damon_va_init_regions(struct damon_ctx *ctx,
|
||||
struct damon_target *t)
|
||||
{
|
||||
struct damon_target *ti;
|
||||
struct damon_region *r;
|
||||
struct damon_addr_range regions[3];
|
||||
unsigned long sz = 0, nr_pieces;
|
||||
int i;
|
||||
int i, tidx = 0;
|
||||
|
||||
if (damon_va_three_regions(t, regions)) {
|
||||
pr_err("Failed to get three regions of target %lu\n", t->id);
|
||||
damon_for_each_target(ti, ctx) {
|
||||
if (ti == t)
|
||||
break;
|
||||
tidx++;
|
||||
}
|
||||
pr_debug("Failed to get three regions of %dth target\n", tidx);
|
||||
return;
|
||||
}
|
||||
|
||||
@@ -272,7 +270,7 @@ static void __damon_va_init_regions(struct damon_ctx *ctx,
|
||||
}
|
||||
|
||||
/* Initialize '->regions_list' of every target (task) */
|
||||
void damon_va_init(struct damon_ctx *ctx)
|
||||
static void damon_va_init(struct damon_ctx *ctx)
|
||||
{
|
||||
struct damon_target *t;
|
||||
|
||||
@@ -292,7 +290,8 @@ void damon_va_init(struct damon_ctx *ctx)
|
||||
*
|
||||
* Returns true if it is.
|
||||
*/
|
||||
static bool damon_intersect(struct damon_region *r, struct damon_addr_range *re)
|
||||
static bool damon_intersect(struct damon_region *r,
|
||||
struct damon_addr_range *re)
|
||||
{
|
||||
return !(r->ar.end <= re->start || re->end <= r->ar.start);
|
||||
}
|
||||
@@ -356,7 +355,7 @@ static void damon_va_apply_three_regions(struct damon_target *t,
|
||||
/*
|
||||
* Update regions for current memory mappings
|
||||
*/
|
||||
void damon_va_update(struct damon_ctx *ctx)
|
||||
static void damon_va_update(struct damon_ctx *ctx)
|
||||
{
|
||||
struct damon_addr_range three_regions[3];
|
||||
struct damon_target *t;
|
||||
@@ -395,8 +394,65 @@ out:
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
/*
 * Age the hugetlb page mapped by 'pte' so that a later access is detectable.
 *
 * @pte:	huge page table entry to age
 * @mm:		address space the entry belongs to
 * @vma:	VMA covering 'addr'; used to look up the huge page size
 * @addr:	virtual address that 'pte' maps
 *
 * If the entry is young it is rewritten as old, and (with
 * CONFIG_MMU_NOTIFIER) the young state tracked by MMU notifier users is
 * cleared for the whole huge page range.  When either source reported an
 * access the page is marked young, and in every case the page is marked idle.
 *
 * damon_mkold_hugetlb_entry() calls this with the huge PTE lock held.
 */
static void damon_hugetlb_mkold(pte_t *pte, struct mm_struct *mm,
				struct vm_area_struct *vma, unsigned long addr)
{
	bool referenced = false;
	pte_t entry = huge_ptep_get(pte);
	struct page *page = pte_page(entry);

	if (!page)
		return;

	/* hold the page while its flags are updated */
	get_page(page);

	if (pte_young(entry)) {
		referenced = true;
		entry = pte_mkold(entry);
		/*
		 * Write the aged entry back directly.
		 * huge_ptep_set_access_flags() is a helper for raising access
		 * permissions and does not clear the accessed bit on every
		 * architecture, which would leave the entry young forever and
		 * make the region look permanently accessed.
		 */
		set_huge_pte_at(mm, addr, pte, entry);
	}

#ifdef CONFIG_MMU_NOTIFIER
	if (mmu_notifier_clear_young(mm, addr,
				     addr + huge_page_size(hstate_vma(vma))))
		referenced = true;
#endif /* CONFIG_MMU_NOTIFIER */

	/* keep the access information that was just consumed */
	if (referenced)
		set_page_young(page);

	set_page_idle(page);
	put_page(page);
}
|
||||
|
||||
/*
 * hugetlb_entry callback of the "mkold" page walk.
 *
 * Takes the huge PTE lock for 'pte' and, if the entry is present, ages the
 * mapped huge page with damon_hugetlb_mkold().  Non-present entries are left
 * untouched.  Always returns 0.
 */
static int damon_mkold_hugetlb_entry(pte_t *pte, unsigned long hmask,
				     unsigned long addr, unsigned long end,
				     struct mm_walk *walk)
{
	struct hstate *hs = hstate_vma(walk->vma);
	spinlock_t *lock = huge_pte_lock(hs, walk->mm, pte);

	if (pte_present(huge_ptep_get(pte)))
		damon_hugetlb_mkold(pte, walk->mm, walk->vma, addr);

	spin_unlock(lock);
	return 0;
}
|
||||
#else
|
||||
#define damon_mkold_hugetlb_entry NULL
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
||||
static const struct mm_walk_ops damon_mkold_ops = {
|
||||
.pmd_entry = damon_mkold_pmd_entry,
|
||||
.hugetlb_entry = damon_mkold_hugetlb_entry,
|
||||
};
|
||||
|
||||
static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
|
||||
@@ -410,7 +466,7 @@ static void damon_va_mkold(struct mm_struct *mm, unsigned long addr)
|
||||
* Functions for the access checking of the regions
|
||||
*/
|
||||
|
||||
static void damon_va_prepare_access_check(struct damon_ctx *ctx,
|
||||
static void __damon_va_prepare_access_check(struct damon_ctx *ctx,
|
||||
struct mm_struct *mm, struct damon_region *r)
|
||||
{
|
||||
r->sampling_addr = damon_rand(r->ar.start, r->ar.end);
|
||||
@@ -418,7 +474,7 @@ static void damon_va_prepare_access_check(struct damon_ctx *ctx,
|
||||
damon_va_mkold(mm, r->sampling_addr);
|
||||
}
|
||||
|
||||
void damon_va_prepare_access_checks(struct damon_ctx *ctx)
|
||||
static void damon_va_prepare_access_checks(struct damon_ctx *ctx)
|
||||
{
|
||||
struct damon_target *t;
|
||||
struct mm_struct *mm;
|
||||
@@ -429,7 +485,7 @@ void damon_va_prepare_access_checks(struct damon_ctx *ctx)
|
||||
if (!mm)
|
||||
continue;
|
||||
damon_for_each_region(r, t)
|
||||
damon_va_prepare_access_check(ctx, mm, r);
|
||||
__damon_va_prepare_access_check(ctx, mm, r);
|
||||
mmput(mm);
|
||||
}
|
||||
}
|
||||
@@ -491,8 +547,47 @@ out:
|
||||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
/*
 * hugetlb_entry callback of the "young" page walk.
 *
 * Under the huge PTE lock, checks whether the huge page mapped by 'pte' has
 * been accessed: the entry is young, the page is no longer idle, or an MMU
 * notifier user reports the address as young.  If so, the result is stored in
 * the walk's private data ('young' set, '*page_sz' set to the huge page size).
 * Non-present entries are ignored.  Always returns 0.
 */
static int damon_young_hugetlb_entry(pte_t *pte, unsigned long hmask,
				     unsigned long addr, unsigned long end,
				     struct mm_walk *walk)
{
	struct damon_young_walk_private *result = walk->private;
	struct hstate *hs = hstate_vma(walk->vma);
	spinlock_t *lock = huge_pte_lock(hs, walk->mm, pte);
	pte_t hpte = huge_ptep_get(pte);
	struct page *page = NULL;

	if (pte_present(hpte))
		page = pte_page(hpte);

	if (page) {
		/* hold the page while its idle flag is inspected */
		get_page(page);

		if (pte_young(hpte) || !page_is_idle(page) ||
		    mmu_notifier_test_young(walk->mm, addr)) {
			*result->page_sz = huge_page_size(hs);
			result->young = true;
		}

		put_page(page);
	}

	spin_unlock(lock);
	return 0;
}
|
||||
#else
|
||||
#define damon_young_hugetlb_entry NULL
|
||||
#endif /* CONFIG_HUGETLB_PAGE */
|
||||
|
||||
static const struct mm_walk_ops damon_young_ops = {
|
||||
.pmd_entry = damon_young_pmd_entry,
|
||||
.hugetlb_entry = damon_young_hugetlb_entry,
|
||||
};
|
||||
|
||||
static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
|
||||
@@ -515,7 +610,7 @@ static bool damon_va_young(struct mm_struct *mm, unsigned long addr,
|
||||
* mm 'mm_struct' for the given virtual address space
|
||||
* r the region to be checked
|
||||
*/
|
||||
static void damon_va_check_access(struct damon_ctx *ctx,
|
||||
static void __damon_va_check_access(struct damon_ctx *ctx,
|
||||
struct mm_struct *mm, struct damon_region *r)
|
||||
{
|
||||
static struct mm_struct *last_mm;
|
||||
@@ -539,7 +634,7 @@ static void damon_va_check_access(struct damon_ctx *ctx,
|
||||
last_addr = r->sampling_addr;
|
||||
}
|
||||
|
||||
unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
|
||||
static unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
|
||||
{
|
||||
struct damon_target *t;
|
||||
struct mm_struct *mm;
|
||||
@@ -551,7 +646,7 @@ unsigned int damon_va_check_accesses(struct damon_ctx *ctx)
|
||||
if (!mm)
|
||||
continue;
|
||||
damon_for_each_region(r, t) {
|
||||
damon_va_check_access(ctx, mm, r);
|
||||
__damon_va_check_access(ctx, mm, r);
|
||||
max_nr_accesses = max(r->nr_accesses, max_nr_accesses);
|
||||
}
|
||||
mmput(mm);
|
||||
@@ -579,32 +674,34 @@ bool damon_va_target_valid(void *target)
|
||||
}
|
||||
|
||||
#ifndef CONFIG_ADVISE_SYSCALLS
|
||||
static int damos_madvise(struct damon_target *target, struct damon_region *r,
|
||||
int behavior)
|
||||
static unsigned long damos_madvise(struct damon_target *target,
|
||||
struct damon_region *r, int behavior)
|
||||
{
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
#else
|
||||
static int damos_madvise(struct damon_target *target, struct damon_region *r,
|
||||
int behavior)
|
||||
static unsigned long damos_madvise(struct damon_target *target,
|
||||
struct damon_region *r, int behavior)
|
||||
{
|
||||
struct mm_struct *mm;
|
||||
int ret = -ENOMEM;
|
||||
unsigned long start = PAGE_ALIGN(r->ar.start);
|
||||
unsigned long len = PAGE_ALIGN(r->ar.end - r->ar.start);
|
||||
unsigned long applied;
|
||||
|
||||
mm = damon_get_mm(target);
|
||||
if (!mm)
|
||||
goto out;
|
||||
return 0;
|
||||
|
||||
ret = do_madvise(mm, PAGE_ALIGN(r->ar.start),
|
||||
PAGE_ALIGN(r->ar.end - r->ar.start), behavior);
|
||||
applied = do_madvise(mm, start, len, behavior) ? 0 : len;
|
||||
mmput(mm);
|
||||
out:
|
||||
return ret;
|
||||
|
||||
return applied;
|
||||
}
|
||||
#endif /* CONFIG_ADVISE_SYSCALLS */
|
||||
|
||||
int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme)
|
||||
static unsigned long damon_va_apply_scheme(struct damon_ctx *ctx,
|
||||
struct damon_target *t, struct damon_region *r,
|
||||
struct damos *scheme)
|
||||
{
|
||||
int madv_action;
|
||||
|
||||
@@ -627,14 +724,15 @@ int damon_va_apply_scheme(struct damon_ctx *ctx, struct damon_target *t,
|
||||
case DAMOS_STAT:
|
||||
return 0;
|
||||
default:
|
||||
return -EINVAL;
|
||||
return 0;
|
||||
}
|
||||
|
||||
return damos_madvise(t, r, madv_action);
|
||||
}
|
||||
|
||||
int damon_va_scheme_score(struct damon_ctx *context, struct damon_target *t,
|
||||
struct damon_region *r, struct damos *scheme)
|
||||
static int damon_va_scheme_score(struct damon_ctx *context,
|
||||
struct damon_target *t, struct damon_region *r,
|
||||
struct damos *scheme)
|
||||
{
|
||||
|
||||
switch (scheme->action) {
|
||||
|
||||
@@ -652,7 +652,7 @@ static void __init pte_clear_tests(struct pgtable_debug_args *args)
|
||||
set_pte_at(args->mm, args->vaddr, args->ptep, pte);
|
||||
flush_dcache_page(page);
|
||||
barrier();
|
||||
pte_clear(args->mm, args->vaddr, args->ptep);
|
||||
ptep_clear(args->mm, args->vaddr, args->ptep);
|
||||
pte = ptep_get(args->ptep);
|
||||
WARN_ON(!pte_none(pte));
|
||||
}
|
||||
|
||||
@@ -152,7 +152,7 @@ struct dma_pool *dma_pool_create(const char *name, struct device *dev,
|
||||
else if ((boundary < size) || (boundary & (boundary - 1)))
|
||||
return NULL;
|
||||
|
||||
retval = kmalloc_node(sizeof(*retval), GFP_KERNEL, dev_to_node(dev));
|
||||
retval = kmalloc(sizeof(*retval), GFP_KERNEL);
|
||||
if (!retval)
|
||||
return retval;
|
||||
|
||||
|
||||
5
mm/hmm.c
5
mm/hmm.c
@@ -300,7 +300,8 @@ static int hmm_vma_handle_pte(struct mm_walk *walk, unsigned long addr,
|
||||
* Since each architecture defines a struct page for the zero page, just
|
||||
* fall through and treat it like a normal page.
|
||||
*/
|
||||
if (pte_special(pte) && !pte_devmap(pte) &&
|
||||
if (!vm_normal_page(walk->vma, addr, pte) &&
|
||||
!pte_devmap(pte) &&
|
||||
!is_zero_pfn(pte_pfn(pte))) {
|
||||
if (hmm_pte_need_fault(hmm_vma_walk, pfn_req_flags, 0)) {
|
||||
pte_unmap(ptep);
|
||||
@@ -518,7 +519,7 @@ static int hmm_vma_walk_test(unsigned long start, unsigned long end,
|
||||
struct hmm_range *range = hmm_vma_walk->range;
|
||||
struct vm_area_struct *vma = walk->vma;
|
||||
|
||||
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP | VM_MIXEDMAP)) &&
|
||||
if (!(vma->vm_flags & (VM_IO | VM_PFNMAP)) &&
|
||||
vma->vm_flags & VM_READ)
|
||||
return 0;
|
||||
|
||||
|
||||
@@ -1322,7 +1322,7 @@ vm_fault_t do_huge_pmd_wp_page(struct vm_fault *vmf)
|
||||
* We can only reuse the page if nobody else maps the huge page or it's
|
||||
* part.
|
||||
*/
|
||||
if (reuse_swap_page(page, NULL)) {
|
||||
if (reuse_swap_page(page)) {
|
||||
pmd_t entry;
|
||||
entry = pmd_mkyoung(orig_pmd);
|
||||
entry = maybe_pmd_mkwrite(pmd_mkdirty(entry), vma);
|
||||
@@ -2542,38 +2542,28 @@ int total_mapcount(struct page *page)
|
||||
* need full accuracy to avoid breaking page pinning, because
|
||||
* page_trans_huge_mapcount() is slower than page_mapcount().
|
||||
*/
|
||||
int page_trans_huge_mapcount(struct page *page, int *total_mapcount)
|
||||
int page_trans_huge_mapcount(struct page *page)
|
||||
{
|
||||
int i, ret, _total_mapcount, mapcount;
|
||||
int i, ret;
|
||||
|
||||
/* hugetlbfs shouldn't call it */
|
||||
VM_BUG_ON_PAGE(PageHuge(page), page);
|
||||
|
||||
if (likely(!PageTransCompound(page))) {
|
||||
mapcount = atomic_read(&page->_mapcount) + 1;
|
||||
if (total_mapcount)
|
||||
*total_mapcount = mapcount;
|
||||
return mapcount;
|
||||
}
|
||||
if (likely(!PageTransCompound(page)))
|
||||
return atomic_read(&page->_mapcount) + 1;
|
||||
|
||||
page = compound_head(page);
|
||||
|
||||
_total_mapcount = ret = 0;
|
||||
ret = 0;
|
||||
for (i = 0; i < thp_nr_pages(page); i++) {
|
||||
mapcount = atomic_read(&page[i]._mapcount) + 1;
|
||||
int mapcount = atomic_read(&page[i]._mapcount) + 1;
|
||||
ret = max(ret, mapcount);
|
||||
_total_mapcount += mapcount;
|
||||
}
|
||||
if (PageDoubleMap(page)) {
|
||||
|
||||
if (PageDoubleMap(page))
|
||||
ret -= 1;
|
||||
_total_mapcount -= thp_nr_pages(page);
|
||||
}
|
||||
mapcount = compound_mapcount(page);
|
||||
ret += mapcount;
|
||||
_total_mapcount += mapcount;
|
||||
if (total_mapcount)
|
||||
*total_mapcount = _total_mapcount;
|
||||
return ret;
|
||||
|
||||
return ret + compound_mapcount(page);
|
||||
}
|
||||
|
||||
/* Racy check whether the huge page can be split */
|
||||
|
||||
@@ -4684,8 +4684,8 @@ hugetlb_install_page(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr
|
||||
struct page *new_page)
|
||||
{
|
||||
__SetPageUptodate(new_page);
|
||||
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
|
||||
hugepage_add_new_anon_rmap(new_page, vma, addr);
|
||||
set_huge_pte_at(vma->vm_mm, addr, ptep, make_huge_pte(vma, new_page, 1));
|
||||
hugetlb_count_add(pages_per_huge_page(hstate_vma(vma)), vma->vm_mm);
|
||||
ClearHPageRestoreReserve(new_page);
|
||||
SetHPageMigratable(new_page);
|
||||
@@ -5259,10 +5259,10 @@ retry_avoidcopy:
|
||||
/* Break COW */
|
||||
huge_ptep_clear_flush(vma, haddr, ptep);
|
||||
mmu_notifier_invalidate_range(mm, range.start, range.end);
|
||||
set_huge_pte_at(mm, haddr, ptep,
|
||||
make_huge_pte(vma, new_page, 1));
|
||||
page_remove_rmap(old_page, true);
|
||||
hugepage_add_new_anon_rmap(new_page, vma, haddr);
|
||||
set_huge_pte_at(mm, haddr, ptep,
|
||||
make_huge_pte(vma, new_page, 1));
|
||||
SetHPageMigratable(new_page);
|
||||
/* Make the old page be freed below */
|
||||
new_page = old_page;
|
||||
|
||||
@@ -123,29 +123,58 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup,
|
||||
}
|
||||
}
|
||||
|
||||
static void hugetlb_cgroup_free(struct hugetlb_cgroup *h_cgroup)
|
||||
{
|
||||
int node;
|
||||
|
||||
for_each_node(node)
|
||||
kfree(h_cgroup->nodeinfo[node]);
|
||||
kfree(h_cgroup);
|
||||
}
|
||||
|
||||
static struct cgroup_subsys_state *
|
||||
hugetlb_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
|
||||
{
|
||||
struct hugetlb_cgroup *parent_h_cgroup = hugetlb_cgroup_from_css(parent_css);
|
||||
struct hugetlb_cgroup *h_cgroup;
|
||||
int node;
|
||||
|
||||
h_cgroup = kzalloc(struct_size(h_cgroup, nodeinfo, nr_node_ids),
|
||||
GFP_KERNEL);
|
||||
|
||||
h_cgroup = kzalloc(sizeof(*h_cgroup), GFP_KERNEL);
|
||||
if (!h_cgroup)
|
||||
return ERR_PTR(-ENOMEM);
|
||||
|
||||
if (!parent_h_cgroup)
|
||||
root_h_cgroup = h_cgroup;
|
||||
|
||||
/*
|
||||
* TODO: this routine can waste much memory for nodes which will
|
||||
* never be onlined. It's better to use memory hotplug callback
|
||||
* function.
|
||||
*/
|
||||
for_each_node(node) {
|
||||
/* Set node_to_alloc to -1 for offline nodes. */
|
||||
int node_to_alloc =
|
||||
node_state(node, N_NORMAL_MEMORY) ? node : -1;
|
||||
h_cgroup->nodeinfo[node] =
|
||||
kzalloc_node(sizeof(struct hugetlb_cgroup_per_node),
|
||||
GFP_KERNEL, node_to_alloc);
|
||||
if (!h_cgroup->nodeinfo[node])
|
||||
goto fail_alloc_nodeinfo;
|
||||
}
|
||||
|
||||
hugetlb_cgroup_init(h_cgroup, parent_h_cgroup);
|
||||
return &h_cgroup->css;
|
||||
|
||||
fail_alloc_nodeinfo:
|
||||
hugetlb_cgroup_free(h_cgroup);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
static void hugetlb_cgroup_css_free(struct cgroup_subsys_state *css)
|
||||
{
|
||||
struct hugetlb_cgroup *h_cgroup;
|
||||
|
||||
h_cgroup = hugetlb_cgroup_from_css(css);
|
||||
kfree(h_cgroup);
|
||||
hugetlb_cgroup_free(hugetlb_cgroup_from_css(css));
|
||||
}
|
||||
|
||||
/*
|
||||
@@ -289,7 +318,17 @@ static void __hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
|
||||
return;
|
||||
|
||||
__set_hugetlb_cgroup(page, h_cg, rsvd);
|
||||
return;
|
||||
if (!rsvd) {
|
||||
unsigned long usage =
|
||||
h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
|
||||
/*
|
||||
* This write is not atomic due to fetching usage and writing
|
||||
* to it, but that's fine because we call this with
|
||||
* hugetlb_lock held anyway.
|
||||
*/
|
||||
WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
|
||||
usage + nr_pages);
|
||||
}
|
||||
}
|
||||
|
||||
void hugetlb_cgroup_commit_charge(int idx, unsigned long nr_pages,
|
||||
@@ -328,8 +367,17 @@ static void __hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
|
||||
|
||||
if (rsvd)
|
||||
css_put(&h_cg->css);
|
||||
|
||||
return;
|
||||
else {
|
||||
unsigned long usage =
|
||||
h_cg->nodeinfo[page_to_nid(page)]->usage[idx];
|
||||
/*
|
||||
* This write is not atomic due to fetching usage and writing
|
||||
* to it, but that's fine because we call this with
|
||||
* hugetlb_lock held anyway.
|
||||
*/
|
||||
WRITE_ONCE(h_cg->nodeinfo[page_to_nid(page)]->usage[idx],
|
||||
usage - nr_pages);
|
||||
}
|
||||
}
|
||||
|
||||
void hugetlb_cgroup_uncharge_page(int idx, unsigned long nr_pages,
|
||||
@@ -418,6 +466,59 @@ enum {
|
||||
RES_RSVD_FAILCNT,
|
||||
};
|
||||
|
||||
static int hugetlb_cgroup_read_numa_stat(struct seq_file *seq, void *dummy)
|
||||
{
|
||||
int nid;
|
||||
struct cftype *cft = seq_cft(seq);
|
||||
int idx = MEMFILE_IDX(cft->private);
|
||||
bool legacy = MEMFILE_ATTR(cft->private);
|
||||
struct hugetlb_cgroup *h_cg = hugetlb_cgroup_from_css(seq_css(seq));
|
||||
struct cgroup_subsys_state *css;
|
||||
unsigned long usage;
|
||||
|
||||
if (legacy) {
|
||||
/* Add up usage across all nodes for the non-hierarchical total. */
|
||||
usage = 0;
|
||||
for_each_node_state(nid, N_MEMORY)
|
||||
usage += READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]);
|
||||
seq_printf(seq, "total=%lu", usage * PAGE_SIZE);
|
||||
|
||||
/* Simply print the per-node usage for the non-hierarchical total. */
|
||||
for_each_node_state(nid, N_MEMORY)
|
||||
seq_printf(seq, " N%d=%lu", nid,
|
||||
READ_ONCE(h_cg->nodeinfo[nid]->usage[idx]) *
|
||||
PAGE_SIZE);
|
||||
seq_putc(seq, '\n');
|
||||
}
|
||||
|
||||
/*
|
||||
* The hierarchical total is pretty much the value recorded by the
|
||||
* counter, so use that.
|
||||
*/
|
||||
seq_printf(seq, "%stotal=%lu", legacy ? "hierarchical_" : "",
|
||||
page_counter_read(&h_cg->hugepage[idx]) * PAGE_SIZE);
|
||||
|
||||
/*
|
||||
* For each node, transverse the css tree to obtain the hierarchical
|
||||
* node usage.
|
||||
*/
|
||||
for_each_node_state(nid, N_MEMORY) {
|
||||
usage = 0;
|
||||
rcu_read_lock();
|
||||
css_for_each_descendant_pre(css, &h_cg->css) {
|
||||
usage += READ_ONCE(hugetlb_cgroup_from_css(css)
|
||||
->nodeinfo[nid]
|
||||
->usage[idx]);
|
||||
}
|
||||
rcu_read_unlock();
|
||||
seq_printf(seq, " N%d=%lu", nid, usage * PAGE_SIZE);
|
||||
}
|
||||
|
||||
seq_putc(seq, '\n');
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css,
|
||||
struct cftype *cft)
|
||||
{
|
||||
@@ -668,8 +769,14 @@ static void __init __hugetlb_cgroup_file_dfl_init(int idx)
|
||||
events_local_file[idx]);
|
||||
cft->flags = CFTYPE_NOT_ON_ROOT;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
/* Add the numa stat file */
|
||||
cft = &h->cgroup_files_dfl[6];
|
||||
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
|
||||
cft->seq_show = hugetlb_cgroup_read_numa_stat;
|
||||
cft->flags = CFTYPE_NOT_ON_ROOT;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
cft = &h->cgroup_files_dfl[7];
|
||||
memset(cft, 0, sizeof(*cft));
|
||||
|
||||
WARN_ON(cgroup_add_dfl_cftypes(&hugetlb_cgrp_subsys,
|
||||
@@ -739,8 +846,14 @@ static void __init __hugetlb_cgroup_file_legacy_init(int idx)
|
||||
cft->write = hugetlb_cgroup_reset;
|
||||
cft->read_u64 = hugetlb_cgroup_read_u64;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
/* Add the numa stat file */
|
||||
cft = &h->cgroup_files_legacy[8];
|
||||
snprintf(cft->name, MAX_CFTYPE_NAME, "%s.numa_stat", buf);
|
||||
cft->private = MEMFILE_PRIVATE(idx, 1);
|
||||
cft->seq_show = hugetlb_cgroup_read_numa_stat;
|
||||
|
||||
/* NULL terminate the last cft */
|
||||
cft = &h->cgroup_files_legacy[9];
|
||||
memset(cft, 0, sizeof(*cft));
|
||||
|
||||
WARN_ON(cgroup_add_legacy_cftypes(&hugetlb_cgrp_subsys,
|
||||
|
||||
@@ -23,7 +23,7 @@ struct folio_batch;
|
||||
#define GFP_RECLAIM_MASK (__GFP_RECLAIM|__GFP_HIGH|__GFP_IO|__GFP_FS|\
|
||||
__GFP_NOWARN|__GFP_RETRY_MAYFAIL|__GFP_NOFAIL|\
|
||||
__GFP_NORETRY|__GFP_MEMALLOC|__GFP_NOMEMALLOC|\
|
||||
__GFP_ATOMIC)
|
||||
__GFP_ATOMIC|__GFP_NOLOCKDEP)
|
||||
|
||||
/* The GFP flags allowed during early boot */
|
||||
#define GFP_BOOT_MASK (__GFP_BITS_MASK & ~(__GFP_RECLAIM|__GFP_IO|__GFP_FS))
|
||||
|
||||
@@ -618,6 +618,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
|
||||
continue;
|
||||
} else {
|
||||
result = SCAN_EXCEED_NONE_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
|
||||
goto out;
|
||||
}
|
||||
}
|
||||
@@ -636,6 +637,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
|
||||
if (page_mapcount(page) > 1 &&
|
||||
++shared > khugepaged_max_ptes_shared) {
|
||||
result = SCAN_EXCEED_SHARED_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
|
||||
goto out;
|
||||
}
|
||||
|
||||
@@ -681,7 +683,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma,
|
||||
goto out;
|
||||
}
|
||||
if (!pte_write(pteval) && PageSwapCache(page) &&
|
||||
!reuse_swap_page(page, NULL)) {
|
||||
!reuse_swap_page(page)) {
|
||||
/*
|
||||
* Page is in the swap cache and cannot be re-used.
|
||||
* It cannot be collapsed into a THP.
|
||||
@@ -756,11 +758,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
|
||||
* ptl mostly unnecessary.
|
||||
*/
|
||||
spin_lock(ptl);
|
||||
/*
|
||||
* paravirt calls inside pte_clear here are
|
||||
* superfluous.
|
||||
*/
|
||||
pte_clear(vma->vm_mm, address, _pte);
|
||||
ptep_clear(vma->vm_mm, address, _pte);
|
||||
spin_unlock(ptl);
|
||||
}
|
||||
} else {
|
||||
@@ -774,11 +772,7 @@ static void __collapse_huge_page_copy(pte_t *pte, struct page *page,
|
||||
* inside page_remove_rmap().
|
||||
*/
|
||||
spin_lock(ptl);
|
||||
/*
|
||||
* paravirt calls inside pte_clear here are
|
||||
* superfluous.
|
||||
*/
|
||||
pte_clear(vma->vm_mm, address, _pte);
|
||||
ptep_clear(vma->vm_mm, address, _pte);
|
||||
page_remove_rmap(src_page, false);
|
||||
spin_unlock(ptl);
|
||||
free_page_and_swap_cache(src_page);
|
||||
@@ -1261,6 +1255,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
continue;
|
||||
} else {
|
||||
result = SCAN_EXCEED_SWAP_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
|
||||
goto out_unmap;
|
||||
}
|
||||
}
|
||||
@@ -1270,6 +1265,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
continue;
|
||||
} else {
|
||||
result = SCAN_EXCEED_NONE_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
|
||||
goto out_unmap;
|
||||
}
|
||||
}
|
||||
@@ -1298,6 +1294,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
if (page_mapcount(page) > 1 &&
|
||||
++shared > khugepaged_max_ptes_shared) {
|
||||
result = SCAN_EXCEED_SHARED_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_SHARED_PTE);
|
||||
goto out_unmap;
|
||||
}
|
||||
|
||||
@@ -1306,7 +1303,7 @@ static int khugepaged_scan_pmd(struct mm_struct *mm,
|
||||
/*
|
||||
* Record which node the original page is from and save this
|
||||
* information to khugepaged_node_load[].
|
||||
* Khupaged will allocate hugepage from the node has the max
|
||||
* Khugepaged will allocate hugepage from the node has the max
|
||||
* hit record.
|
||||
*/
|
||||
node = page_to_nid(page);
|
||||
@@ -2014,6 +2011,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
|
||||
if (xa_is_value(page)) {
|
||||
if (++swap > khugepaged_max_ptes_swap) {
|
||||
result = SCAN_EXCEED_SWAP_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_SWAP_PTE);
|
||||
break;
|
||||
}
|
||||
continue;
|
||||
@@ -2064,6 +2062,7 @@ static void khugepaged_scan_file(struct mm_struct *mm,
|
||||
if (result == SCAN_SUCCEED) {
|
||||
if (present < HPAGE_PMD_NR - khugepaged_max_ptes_none) {
|
||||
result = SCAN_EXCEED_NONE_PTE;
|
||||
count_vm_event(THP_SCAN_EXCEED_NONE_PTE);
|
||||
} else {
|
||||
node = khugepaged_find_target_node();
|
||||
collapse_file(mm, file, start, hpage, node);
|
||||
|
||||
4
mm/ksm.c
4
mm/ksm.c
@@ -2576,8 +2576,8 @@ struct page *ksm_might_need_to_copy(struct page *page,
|
||||
return page; /* no need to copy it */
|
||||
} else if (!anon_vma) {
|
||||
return page; /* no need to copy it */
|
||||
} else if (anon_vma->root == vma->anon_vma->root &&
|
||||
page->index == linear_page_index(vma, address)) {
|
||||
} else if (page->index == linear_page_index(vma, address) &&
|
||||
anon_vma->root == vma->anon_vma->root) {
|
||||
return page; /* still no need to copy it */
|
||||
}
|
||||
if (!PageUptodate(page))
|
||||
|
||||
@@ -723,7 +723,6 @@ static const char * const action_page_types[] = {
|
||||
[MF_MSG_KERNEL_HIGH_ORDER] = "high-order kernel page",
|
||||
[MF_MSG_SLAB] = "kernel slab page",
|
||||
[MF_MSG_DIFFERENT_COMPOUND] = "different compound page after locking",
|
||||
[MF_MSG_POISONED_HUGE] = "huge page already hardware poisoned",
|
||||
[MF_MSG_HUGE] = "huge page",
|
||||
[MF_MSG_FREE_HUGE] = "free huge page",
|
||||
[MF_MSG_NON_PMD_HUGE] = "non-pmd-sized huge page",
|
||||
@@ -738,7 +737,6 @@ static const char * const action_page_types[] = {
|
||||
[MF_MSG_CLEAN_LRU] = "clean LRU page",
|
||||
[MF_MSG_TRUNCATED_LRU] = "already truncated LRU page",
|
||||
[MF_MSG_BUDDY] = "free buddy page",
|
||||
[MF_MSG_BUDDY_2ND] = "free buddy page (2nd try)",
|
||||
[MF_MSG_DAX] = "dax page",
|
||||
[MF_MSG_UNSPLIT_THP] = "unsplit thp",
|
||||
[MF_MSG_UNKNOWN] = "unknown page",
|
||||
@@ -1162,6 +1160,22 @@ static int page_action(struct page_state *ps, struct page *p,
|
||||
return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY;
|
||||
}
|
||||
|
||||
static inline bool PageHWPoisonTakenOff(struct page *page)
|
||||
{
|
||||
return PageHWPoison(page) && page_private(page) == MAGIC_HWPOISON;
|
||||
}
|
||||
|
||||
void SetPageHWPoisonTakenOff(struct page *page)
|
||||
{
|
||||
set_page_private(page, MAGIC_HWPOISON);
|
||||
}
|
||||
|
||||
/*
 * Reset the private field of a hwpoisoned page to 0, dropping the
 * MAGIC_HWPOISON marker.  Pages without PG_hwpoison are left untouched.
 */
void ClearPageHWPoisonTakenOff(struct page *page)
{
	if (!PageHWPoison(page))
		return;

	set_page_private(page, 0);
}
|
||||
|
||||
/*
|
||||
* Return true if a page type of a given page is supported by hwpoison
|
||||
* mechanism (while handling could fail), otherwise false. This function
|
||||
@@ -1264,6 +1278,27 @@ out:
|
||||
return ret;
|
||||
}
|
||||
|
||||
static int __get_unpoison_page(struct page *page)
|
||||
{
|
||||
struct page *head = compound_head(page);
|
||||
int ret = 0;
|
||||
bool hugetlb = false;
|
||||
|
||||
ret = get_hwpoison_huge_page(head, &hugetlb);
|
||||
if (hugetlb)
|
||||
return ret;
|
||||
|
||||
/*
|
||||
* PageHWPoisonTakenOff pages are not only marked as PG_hwpoison,
|
||||
* but also isolated from buddy freelist, so need to identify the
|
||||
* state and have to cancel both operations to unpoison.
|
||||
*/
|
||||
if (PageHWPoisonTakenOff(page))
|
||||
return -EHWPOISON;
|
||||
|
||||
return get_page_unless_zero(page) ? 1 : 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* get_hwpoison_page() - Get refcount for memory error handling
|
||||
* @p: Raw error page (hit by memory error)
|
||||
@@ -1271,7 +1306,7 @@ out:
|
||||
*
|
||||
* get_hwpoison_page() takes a page refcount of an error page to handle memory
|
||||
* error on it, after checking that the error page is in a well-defined state
|
||||
* (defined as a page-type we can successfully handle the memor error on it,
|
||||
* (defined as a page-type we can successfully handle the memory error on it,
|
||||
* such as LRU page and hugetlb page).
|
||||
*
|
||||
* Memory error handling could be triggered at any time on any type of page,
|
||||
@@ -1280,18 +1315,26 @@ out:
|
||||
* extra care for the error page's state (as done in __get_hwpoison_page()),
|
||||
* and has some retry logic in get_any_page().
|
||||
*
|
||||
* When called from unpoison_memory(), the caller should already ensure that
|
||||
* the given page has PG_hwpoison. So it's never reused for other page
|
||||
* allocations, and __get_unpoison_page() never races with them.
|
||||
*
|
||||
* Return: 0 on failure,
|
||||
* 1 on success for in-use pages in a well-defined state,
|
||||
* -EIO for pages on which we can not handle memory errors,
|
||||
* -EBUSY when get_hwpoison_page() has raced with page lifecycle
|
||||
* operations like allocation and free.
|
||||
* operations like allocation and free,
|
||||
* -EHWPOISON when the page is hwpoisoned and taken off from buddy.
|
||||
*/
|
||||
static int get_hwpoison_page(struct page *p, unsigned long flags)
|
||||
{
|
||||
int ret;
|
||||
|
||||
zone_pcp_disable(page_zone(p));
|
||||
ret = get_any_page(p, flags);
|
||||
if (flags & MF_UNPOISON)
|
||||
ret = __get_unpoison_page(p);
|
||||
else
|
||||
ret = get_any_page(p, flags);
|
||||
zone_pcp_enable(page_zone(p));
|
||||
|
||||
return ret;
|
||||
@@ -1502,14 +1545,6 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags)
|
||||
lock_page(head);
|
||||
page_flags = head->flags;
|
||||
|
||||
if (!PageHWPoison(head)) {
|
||||
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
|
||||
num_poisoned_pages_dec();
|
||||
unlock_page(head);
|
||||
put_page(head);
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* TODO: hwpoison for pud-sized hugetlb doesn't work right now, so
|
||||
* simply disable it. In order to make it work properly, we need
|
||||
@@ -1623,6 +1658,8 @@ out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
static DEFINE_MUTEX(mf_mutex);
|
||||
|
||||
/**
|
||||
* memory_failure - Handle memory failure of a page.
|
||||
* @pfn: Page Number of the corrupted page
|
||||
@@ -1649,7 +1686,6 @@ int memory_failure(unsigned long pfn, int flags)
|
||||
int res = 0;
|
||||
unsigned long page_flags;
|
||||
bool retry = true;
|
||||
static DEFINE_MUTEX(mf_mutex);
|
||||
|
||||
if (!sysctl_memory_failure_recovery)
|
||||
panic("Memory failure on page %lx", pfn);
|
||||
@@ -1790,16 +1826,6 @@ try_again:
|
||||
*/
|
||||
page_flags = p->flags;
|
||||
|
||||
/*
|
||||
* unpoison always clear PG_hwpoison inside page lock
|
||||
*/
|
||||
if (!PageHWPoison(p)) {
|
||||
pr_err("Memory failure: %#lx: just unpoisoned\n", pfn);
|
||||
num_poisoned_pages_dec();
|
||||
unlock_page(p);
|
||||
put_page(p);
|
||||
goto unlock_mutex;
|
||||
}
|
||||
if (hwpoison_filter(p)) {
|
||||
if (TestClearPageHWPoison(p))
|
||||
num_poisoned_pages_dec();
|
||||
@@ -1963,6 +1989,28 @@ core_initcall(memory_failure_init);
|
||||
pr_info(fmt, pfn); \
|
||||
})
|
||||
|
||||
/*
 * Test-and-clear the hwpoison flag of @p on behalf of unpoison_memory().
 *
 * @rs: ratelimit state handed to unpoison_pr_info() for the log message
 * @p:  page whose hwpoison flag should be cleared
 *
 * Return: 1 when TestClearPageHWPoison() reports that the flag was set
 * (the event is logged and the poisoned-page counter decremented),
 * 0 when there was nothing to clear.
 */
static inline int clear_page_hwpoison(struct ratelimit_state *rs, struct page *p)
{
	/* Nothing to undo if the flag was not set in the first place. */
	if (!TestClearPageHWPoison(p))
		return 0;

	unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
			 page_to_pfn(p), rs);
	num_poisoned_pages_dec();

	return 1;
}
|
||||
|
||||
static inline int unpoison_taken_off_page(struct ratelimit_state *rs,
|
||||
struct page *p)
|
||||
{
|
||||
if (put_page_back_buddy(p)) {
|
||||
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
|
||||
page_to_pfn(p), rs);
|
||||
return 0;
|
||||
}
|
||||
return -EBUSY;
|
||||
}
|
||||
|
||||
/**
|
||||
* unpoison_memory - Unpoison a previously poisoned page
|
||||
* @pfn: Page number of the to be unpoisoned page
|
||||
@@ -1979,8 +2027,7 @@ int unpoison_memory(unsigned long pfn)
|
||||
{
|
||||
struct page *page;
|
||||
struct page *p;
|
||||
int freeit = 0;
|
||||
unsigned long flags = 0;
|
||||
int ret = -EBUSY;
|
||||
static DEFINE_RATELIMIT_STATE(unpoison_rs, DEFAULT_RATELIMIT_INTERVAL,
|
||||
DEFAULT_RATELIMIT_BURST);
|
||||
|
||||
@@ -1990,69 +2037,60 @@ int unpoison_memory(unsigned long pfn)
|
||||
p = pfn_to_page(pfn);
|
||||
page = compound_head(p);
|
||||
|
||||
mutex_lock(&mf_mutex);
|
||||
|
||||
if (!PageHWPoison(p)) {
|
||||
unpoison_pr_info("Unpoison: Page was already unpoisoned %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
if (page_count(page) > 1) {
|
||||
unpoison_pr_info("Unpoison: Someone grabs the hwpoison page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
if (page_mapped(page)) {
|
||||
unpoison_pr_info("Unpoison: Someone maps the hwpoison page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
if (page_mapping(page)) {
|
||||
unpoison_pr_info("Unpoison: the hwpoison page has non-NULL mapping %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
goto unlock_mutex;
|
||||
}
|
||||
|
||||
/*
|
||||
* unpoison_memory() can encounter thp only when the thp is being
|
||||
* worked by memory_failure() and the page lock is not held yet.
|
||||
* In such case, we yield to memory_failure() and make unpoison fail.
|
||||
*/
|
||||
if (!PageHuge(page) && PageTransHuge(page)) {
|
||||
unpoison_pr_info("Unpoison: Memory failure is now running on %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
if (PageSlab(page) || PageTable(page))
|
||||
goto unlock_mutex;
|
||||
|
||||
if (!get_hwpoison_page(p, flags)) {
|
||||
if (TestClearPageHWPoison(p))
|
||||
num_poisoned_pages_dec();
|
||||
unpoison_pr_info("Unpoison: Software-unpoisoned free page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
return 0;
|
||||
}
|
||||
ret = get_hwpoison_page(p, MF_UNPOISON);
|
||||
if (!ret) {
|
||||
if (clear_page_hwpoison(&unpoison_rs, page))
|
||||
ret = 0;
|
||||
else
|
||||
ret = -EBUSY;
|
||||
} else if (ret < 0) {
|
||||
if (ret == -EHWPOISON) {
|
||||
ret = unpoison_taken_off_page(&unpoison_rs, p);
|
||||
} else
|
||||
unpoison_pr_info("Unpoison: failed to grab page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
} else {
|
||||
int freeit = clear_page_hwpoison(&unpoison_rs, p);
|
||||
|
||||
lock_page(page);
|
||||
/*
|
||||
* This test is racy because PG_hwpoison is set outside of page lock.
|
||||
* That's acceptable because that won't trigger kernel panic. Instead,
|
||||
* the PG_hwpoison page will be caught and isolated on the entrance to
|
||||
* the free buddy page pool.
|
||||
*/
|
||||
if (TestClearPageHWPoison(page)) {
|
||||
unpoison_pr_info("Unpoison: Software-unpoisoned page %#lx\n",
|
||||
pfn, &unpoison_rs);
|
||||
num_poisoned_pages_dec();
|
||||
freeit = 1;
|
||||
}
|
||||
unlock_page(page);
|
||||
|
||||
put_page(page);
|
||||
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1))
|
||||
put_page(page);
|
||||
if (freeit && !(pfn == my_zero_pfn(0) && page_count(p) == 1)) {
|
||||
put_page(page);
|
||||
ret = 0;
|
||||
}
|
||||
}
|
||||
|
||||
return 0;
|
||||
unlock_mutex:
|
||||
mutex_unlock(&mf_mutex);
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(unpoison_memory);
|
||||
|
||||
@@ -2233,9 +2271,12 @@ int soft_offline_page(unsigned long pfn, int flags)
|
||||
return -EIO;
|
||||
}
|
||||
|
||||
mutex_lock(&mf_mutex);
|
||||
|
||||
if (PageHWPoison(page)) {
|
||||
pr_info("%s: %#lx page already poisoned\n", __func__, pfn);
|
||||
put_ref_page(ref_page);
|
||||
mutex_unlock(&mf_mutex);
|
||||
return 0;
|
||||
}
|
||||
|
||||
@@ -2254,5 +2295,7 @@ retry:
|
||||
}
|
||||
}
|
||||
|
||||
mutex_unlock(&mf_mutex);
|
||||
|
||||
return ret;
|
||||
}
|
||||
|
||||
11
mm/memory.c
11
mm/memory.c
@@ -722,8 +722,6 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
|
||||
else if (is_writable_device_exclusive_entry(entry))
|
||||
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
|
||||
|
||||
set_pte_at(vma->vm_mm, address, ptep, pte);
|
||||
|
||||
/*
|
||||
* No need to take a page reference as one was already
|
||||
* created when the swap entry was made.
|
||||
@@ -737,6 +735,8 @@ static void restore_exclusive_pte(struct vm_area_struct *vma,
|
||||
*/
|
||||
WARN_ON_ONCE(!PageAnon(page));
|
||||
|
||||
set_pte_at(vma->vm_mm, address, ptep, pte);
|
||||
|
||||
if (vma->vm_flags & VM_LOCKED)
|
||||
mlock_vma_page(page);
|
||||
|
||||
@@ -3652,7 +3652,7 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
inc_mm_counter_fast(vma->vm_mm, MM_ANONPAGES);
|
||||
dec_mm_counter_fast(vma->vm_mm, MM_SWAPENTS);
|
||||
pte = mk_pte(page, vma->vm_page_prot);
|
||||
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page, NULL)) {
|
||||
if ((vmf->flags & FAULT_FLAG_WRITE) && reuse_swap_page(page)) {
|
||||
pte = maybe_mkwrite(pte_mkdirty(pte), vma);
|
||||
vmf->flags &= ~FAULT_FLAG_WRITE;
|
||||
ret |= VM_FAULT_WRITE;
|
||||
@@ -3665,8 +3665,6 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
pte = pte_mkuffd_wp(pte);
|
||||
pte = pte_wrprotect(pte);
|
||||
}
|
||||
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
|
||||
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
|
||||
vmf->orig_pte = pte;
|
||||
|
||||
/* ksm created a completely new copy */
|
||||
@@ -3677,6 +3675,9 @@ vm_fault_t do_swap_page(struct vm_fault *vmf)
|
||||
do_page_add_anon_rmap(page, vma, vmf->address, exclusive);
|
||||
}
|
||||
|
||||
set_pte_at(vma->vm_mm, vmf->address, vmf->pte, pte);
|
||||
arch_do_swap_page(vma->vm_mm, vma, vmf->address, pte, vmf->orig_pte);
|
||||
|
||||
swap_free(entry);
|
||||
if (mem_cgroup_swap_full(page) ||
|
||||
(vma->vm_flags & VM_LOCKED) || PageMlocked(page))
|
||||
|
||||
@@ -134,6 +134,8 @@ static struct mempolicy preferred_node_policy[MAX_NUMNODES];
|
||||
* @node: Node id to start the search
|
||||
*
|
||||
* Look up the next closest node by distance if @node is not online.
|
||||
*
|
||||
* Return: this @node if it is online, otherwise the closest node by distance
|
||||
*/
|
||||
int numa_map_to_online_node(int node)
|
||||
{
|
||||
@@ -296,6 +298,7 @@ static struct mempolicy *mpol_new(unsigned short mode, unsigned short flags,
|
||||
atomic_set(&policy->refcnt, 1);
|
||||
policy->mode = mode;
|
||||
policy->flags = flags;
|
||||
policy->home_node = NUMA_NO_NODE;
|
||||
|
||||
return policy;
|
||||
}
|
||||
@@ -1478,6 +1481,77 @@ static long kernel_mbind(unsigned long start, unsigned long len,
|
||||
return do_mbind(start, len, lmode, mode_flags, &nodes, flags);
|
||||
}
|
||||
|
||||
/*
 * set_mempolicy_home_node() - set the home node of the memory policies
 * attached to the VMAs in [start, start + len).
 *
 * @start:     page-aligned start address of the range
 * @len:       length of the range in bytes; rounded up to a page multiple
 * @home_node: node to record as home node; must be an online node
 * @flags:     reserved for future extension, must be 0
 *
 * Each VMA in the range that already has a policy gets a copy of that
 * policy with ->home_node set, installed through mbind_range(). VMAs
 * without a policy are skipped.
 *
 * Return: 0 on success or for an empty range,
 *         -EINVAL for an unaligned @start, non-zero @flags, an invalid or
 *         offline @home_node, or a range that wraps around,
 *         -ENOENT when no VMA in the range carried a policy to update,
 *         -EOPNOTSUPP when a VMA policy is neither MPOL_BIND nor
 *         MPOL_PREFERRED_MANY,
 *         or the error returned by mpol_dup() / mbind_range().
 */
SYSCALL_DEFINE4(set_mempolicy_home_node, unsigned long, start, unsigned long, len,
		unsigned long, home_node, unsigned long, flags)
{
	struct mm_struct *mm = current->mm;
	struct vm_area_struct *vma;
	struct mempolicy *new;
	unsigned long vmstart;
	unsigned long vmend;
	unsigned long end;
	int err = -ENOENT;

	start = untagged_addr(start);
	if (start & ~PAGE_MASK)
		return -EINVAL;
	/*
	 * flags is used for future extension if any.
	 */
	if (flags != 0)
		return -EINVAL;

	/*
	 * Check home_node is online to avoid accessing uninitialized
	 * NODE_DATA.
	 */
	if (home_node >= MAX_NUMNODES || !node_online(home_node))
		return -EINVAL;

	len = (len + PAGE_SIZE - 1) & PAGE_MASK;
	end = start + len;

	if (end < start)
		return -EINVAL;
	if (end == start)
		return 0;

	mmap_write_lock(mm);
	vma = find_vma(mm, start);
	for (; vma && vma->vm_start < end; vma = vma->vm_next) {

		vmstart = max(start, vma->vm_start);
		vmend = min(end, vma->vm_end);
		new = mpol_dup(vma_policy(vma));
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}
		/*
		 * Only update home node if there is an existing vma policy
		 */
		if (!new)
			continue;

		/*
		 * If any vma in the range got policy other than MPOL_BIND
		 * or MPOL_PREFERRED_MANY we return error. We don't reset
		 * the home node for vmas we already updated before.
		 *
		 * The copy made by mpol_dup() above is ours, so drop it
		 * before bailing out; otherwise every such call leaks a
		 * struct mempolicy.
		 */
		if (new->mode != MPOL_BIND && new->mode != MPOL_PREFERRED_MANY) {
			mpol_put(new);
			err = -EOPNOTSUPP;
			break;
		}

		new->home_node = home_node;
		err = mbind_range(mm, vmstart, vmend, new);
		/* Drop our reference to the copy whether or not it was installed. */
		mpol_put(new);
		if (err)
			break;
	}
	mmap_write_unlock(mm);
	return err;
}
|
||||
|
||||
SYSCALL_DEFINE6(mbind, unsigned long, start, unsigned long, len,
|
||||
unsigned long, mode, const unsigned long __user *, nmask,
|
||||
unsigned long, maxnode, unsigned int, flags)
|
||||
@@ -1802,6 +1876,11 @@ static int policy_node(gfp_t gfp, struct mempolicy *policy, int nd)
|
||||
WARN_ON_ONCE(policy->mode == MPOL_BIND && (gfp & __GFP_THISNODE));
|
||||
}
|
||||
|
||||
if ((policy->mode == MPOL_BIND ||
|
||||
policy->mode == MPOL_PREFERRED_MANY) &&
|
||||
policy->home_node != NUMA_NO_NODE)
|
||||
return policy->home_node;
|
||||
|
||||
return nd;
|
||||
}
|
||||
|
||||
@@ -2062,7 +2141,7 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
|
||||
preferred_gfp &= ~(__GFP_DIRECT_RECLAIM | __GFP_NOFAIL);
|
||||
page = __alloc_pages(preferred_gfp, order, nid, &pol->nodes);
|
||||
if (!page)
|
||||
page = __alloc_pages(gfp, order, numa_node_id(), NULL);
|
||||
page = __alloc_pages(gfp, order, nid, NULL);
|
||||
|
||||
return page;
|
||||
}
|
||||
@@ -2073,7 +2152,6 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
|
||||
* @order: Order of the GFP allocation.
|
||||
* @vma: Pointer to VMA or NULL if not available.
|
||||
* @addr: Virtual address of the allocation. Must be inside @vma.
|
||||
* @node: Which node to prefer for allocation (modulo policy).
|
||||
* @hugepage: For hugepages try only the preferred node if possible.
|
||||
*
|
||||
* Allocate a page for a specific address in @vma, using the appropriate
|
||||
@@ -2084,9 +2162,10 @@ static struct page *alloc_pages_preferred_many(gfp_t gfp, unsigned int order,
|
||||
* Return: The page on success or NULL if allocation fails.
|
||||
*/
|
||||
struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
|
||||
unsigned long addr, int node, bool hugepage)
|
||||
unsigned long addr, bool hugepage)
|
||||
{
|
||||
struct mempolicy *pol;
|
||||
int node = numa_node_id();
|
||||
struct page *page;
|
||||
int preferred_nid;
|
||||
nodemask_t *nmask;
|
||||
@@ -2103,6 +2182,7 @@ struct page *alloc_pages_vma(gfp_t gfp, int order, struct vm_area_struct *vma,
|
||||
}
|
||||
|
||||
if (pol->mode == MPOL_PREFERRED_MANY) {
|
||||
node = policy_node(gfp, pol, node);
|
||||
page = alloc_pages_preferred_many(gfp, order, node, pol);
|
||||
mpol_cond_put(pol);
|
||||
goto out;
|
||||
@@ -2186,7 +2266,7 @@ struct page *alloc_pages(gfp_t gfp, unsigned order)
|
||||
page = alloc_page_interleave(gfp, order, interleave_nodes(pol));
|
||||
else if (pol->mode == MPOL_PREFERRED_MANY)
|
||||
page = alloc_pages_preferred_many(gfp, order,
|
||||
numa_node_id(), pol);
|
||||
policy_node(gfp, pol, numa_node_id()), pol);
|
||||
else
|
||||
page = __alloc_pages(gfp, order,
|
||||
policy_node(gfp, pol, numa_node_id()),
|
||||
@@ -2342,6 +2422,8 @@ bool __mpol_equal(struct mempolicy *a, struct mempolicy *b)
|
||||
return false;
|
||||
if (a->flags != b->flags)
|
||||
return false;
|
||||
if (a->home_node != b->home_node)
|
||||
return false;
|
||||
if (mpol_store_user_nodemask(a))
|
||||
if (!nodes_equal(a->w.user_nodemask, b->w.user_nodemask))
|
||||
return false;
|
||||
@@ -2885,7 +2967,7 @@ static const char * const policy_modes[] =
|
||||
* Format of input:
|
||||
* <mode>[=<flags>][:<nodelist>]
|
||||
*
|
||||
* On success, returns 0, else 1
|
||||
* Return: %0 on success, else %1
|
||||
*/
|
||||
int mpol_parse_str(char *str, struct mempolicy **mpol)
|
||||
{
|
||||
|
||||
377
mm/migrate.c
377
mm/migrate.c
@@ -50,6 +50,7 @@
|
||||
#include <linux/ptrace.h>
|
||||
#include <linux/oom.h>
|
||||
#include <linux/memory.h>
|
||||
#include <linux/random.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
|
||||
@@ -236,20 +237,19 @@ static bool remove_migration_pte(struct page *page, struct vm_area_struct *vma,
|
||||
|
||||
pte = pte_mkhuge(pte);
|
||||
pte = arch_make_huge_pte(pte, shift, vma->vm_flags);
|
||||
set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
|
||||
if (PageAnon(new))
|
||||
hugepage_add_anon_rmap(new, vma, pvmw.address);
|
||||
else
|
||||
page_dup_rmap(new, true);
|
||||
set_huge_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
|
||||
} else
|
||||
#endif
|
||||
{
|
||||
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
|
||||
|
||||
if (PageAnon(new))
|
||||
page_add_anon_rmap(new, vma, pvmw.address, false);
|
||||
else
|
||||
page_add_file_rmap(new, false);
|
||||
set_pte_at(vma->vm_mm, pvmw.address, pvmw.pte, pte);
|
||||
}
|
||||
if (vma->vm_flags & VM_LOCKED && !PageTransCompound(new))
|
||||
mlock_vma_page(new);
|
||||
@@ -1084,80 +1084,6 @@ out:
|
||||
return rc;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* node_demotion[] example:
|
||||
*
|
||||
* Consider a system with two sockets. Each socket has
|
||||
* three classes of memory attached: fast, medium and slow.
|
||||
* Each memory class is placed in its own NUMA node. The
|
||||
* CPUs are placed in the node with the "fast" memory. The
|
||||
* 6 NUMA nodes (0-5) might be split among the sockets like
|
||||
* this:
|
||||
*
|
||||
* Socket A: 0, 1, 2
|
||||
* Socket B: 3, 4, 5
|
||||
*
|
||||
* When Node 0 fills up, its memory should be migrated to
|
||||
* Node 1. When Node 1 fills up, it should be migrated to
|
||||
* Node 2. The migration path starts on the nodes with the
|
||||
* processors (since allocations default to this node) and
|
||||
* fast memory, progress through medium and end with the
|
||||
* slow memory:
|
||||
*
|
||||
* 0 -> 1 -> 2 -> stop
|
||||
* 3 -> 4 -> 5 -> stop
|
||||
*
|
||||
* This is represented in the node_demotion[] like this:
|
||||
*
|
||||
* { 1, // Node 0 migrates to 1
|
||||
* 2, // Node 1 migrates to 2
|
||||
* -1, // Node 2 does not migrate
|
||||
* 4, // Node 3 migrates to 4
|
||||
* 5, // Node 4 migrates to 5
|
||||
* -1} // Node 5 does not migrate
|
||||
*/
|
||||
|
||||
/*
|
||||
* Writes to this array occur without locking. Cycles are
|
||||
* not allowed: Node X demotes to Y which demotes to X...
|
||||
*
|
||||
* If multiple reads are performed, a single rcu_read_lock()
|
||||
* must be held over all reads to ensure that no cycles are
|
||||
* observed.
|
||||
*/
|
||||
static int node_demotion[MAX_NUMNODES] __read_mostly =
|
||||
{[0 ... MAX_NUMNODES - 1] = NUMA_NO_NODE};
|
||||
|
||||
/**
|
||||
* next_demotion_node() - Get the next node in the demotion path
|
||||
* @node: The starting node to lookup the next node
|
||||
*
|
||||
* Return: node id for next memory node in the demotion path hierarchy
|
||||
* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
|
||||
* @node online or guarantee that it *continues* to be the next demotion
|
||||
* target.
|
||||
*/
|
||||
int next_demotion_node(int node)
|
||||
{
|
||||
int target;
|
||||
|
||||
/*
|
||||
* node_demotion[] is updated without excluding this
|
||||
* function from running. RCU doesn't provide any
|
||||
* compiler barriers, so the READ_ONCE() is required
|
||||
* to avoid compiler reordering or read merging.
|
||||
*
|
||||
* Make sure to use RCU over entire code blocks if
|
||||
* node_demotion[] reads need to be consistent.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
target = READ_ONCE(node_demotion[node]);
|
||||
rcu_read_unlock();
|
||||
|
||||
return target;
|
||||
}
|
||||
|
||||
/*
|
||||
* Obtain the lock on page, remove all ptes and migrate the page
|
||||
* to the newly allocated page in newpage.
|
||||
@@ -1413,7 +1339,7 @@ static inline int try_split_thp(struct page *page, struct page **page2,
|
||||
* @mode: The migration mode that specifies the constraints for
|
||||
* page migration, if any.
|
||||
* @reason: The reason for page migration.
|
||||
* @ret_succeeded: Set to the number of pages migrated successfully if
|
||||
* @ret_succeeded: Set to the number of normal pages migrated successfully if
|
||||
* the caller passes a non-NULL pointer.
|
||||
*
|
||||
* The function returns after 10 attempts or if no pages are movable any more
|
||||
@@ -1421,7 +1347,9 @@ static inline int try_split_thp(struct page *page, struct page **page2,
|
||||
* It is caller's responsibility to call putback_movable_pages() to return pages
|
||||
* to the LRU or free list only if ret != 0.
|
||||
*
|
||||
* Returns the number of pages that were not migrated, or an error code.
|
||||
* Returns the number of {normal page, THP, hugetlb} that were not migrated, or
|
||||
* an error code. The number of THP splits will be considered as the number of
|
||||
* non-migrated THP, no matter how many subpages of the THP are migrated successfully.
|
||||
*/
|
||||
int migrate_pages(struct list_head *from, new_page_t get_new_page,
|
||||
free_page_t put_new_page, unsigned long private,
|
||||
@@ -1430,6 +1358,7 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
|
||||
int retry = 1;
|
||||
int thp_retry = 1;
|
||||
int nr_failed = 0;
|
||||
int nr_failed_pages = 0;
|
||||
int nr_succeeded = 0;
|
||||
int nr_thp_succeeded = 0;
|
||||
int nr_thp_failed = 0;
|
||||
@@ -1441,13 +1370,16 @@ int migrate_pages(struct list_head *from, new_page_t get_new_page,
|
||||
int swapwrite = current->flags & PF_SWAPWRITE;
|
||||
int rc, nr_subpages;
|
||||
LIST_HEAD(ret_pages);
|
||||
LIST_HEAD(thp_split_pages);
|
||||
bool nosplit = (reason == MR_NUMA_MISPLACED);
|
||||
bool no_subpage_counting = false;
|
||||
|
||||
trace_mm_migrate_pages_start(mode, reason);
|
||||
|
||||
if (!swapwrite)
|
||||
current->flags |= PF_SWAPWRITE;
|
||||
|
||||
thp_subpage_migration:
|
||||
for (pass = 0; pass < 10 && (retry || thp_retry); pass++) {
|
||||
retry = 0;
|
||||
thp_retry = 0;
|
||||
@@ -1460,7 +1392,7 @@ retry:
|
||||
* during migration.
|
||||
*/
|
||||
is_thp = PageTransHuge(page) && !PageHuge(page);
|
||||
nr_subpages = thp_nr_pages(page);
|
||||
nr_subpages = compound_nr(page);
|
||||
cond_resched();
|
||||
|
||||
if (PageHuge(page))
|
||||
@@ -1496,18 +1428,20 @@ retry:
|
||||
case -ENOSYS:
|
||||
/* THP migration is unsupported */
|
||||
if (is_thp) {
|
||||
if (!try_split_thp(page, &page2, from)) {
|
||||
nr_thp_failed++;
|
||||
if (!try_split_thp(page, &page2, &thp_split_pages)) {
|
||||
nr_thp_split++;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
nr_thp_failed++;
|
||||
nr_failed += nr_subpages;
|
||||
nr_failed_pages += nr_subpages;
|
||||
break;
|
||||
}
|
||||
|
||||
/* Hugetlb migration is unsupported */
|
||||
nr_failed++;
|
||||
if (!no_subpage_counting)
|
||||
nr_failed++;
|
||||
nr_failed_pages += nr_subpages;
|
||||
break;
|
||||
case -ENOMEM:
|
||||
/*
|
||||
@@ -1516,16 +1450,19 @@ retry:
|
||||
* THP NUMA faulting doesn't split THP to retry.
|
||||
*/
|
||||
if (is_thp && !nosplit) {
|
||||
if (!try_split_thp(page, &page2, from)) {
|
||||
nr_thp_failed++;
|
||||
if (!try_split_thp(page, &page2, &thp_split_pages)) {
|
||||
nr_thp_split++;
|
||||
goto retry;
|
||||
}
|
||||
|
||||
nr_thp_failed++;
|
||||
nr_failed += nr_subpages;
|
||||
nr_failed_pages += nr_subpages;
|
||||
goto out;
|
||||
}
|
||||
nr_failed++;
|
||||
|
||||
if (!no_subpage_counting)
|
||||
nr_failed++;
|
||||
nr_failed_pages += nr_subpages;
|
||||
goto out;
|
||||
case -EAGAIN:
|
||||
if (is_thp) {
|
||||
@@ -1535,12 +1472,11 @@ retry:
|
||||
retry++;
|
||||
break;
|
||||
case MIGRATEPAGE_SUCCESS:
|
||||
nr_succeeded += nr_subpages;
|
||||
if (is_thp) {
|
||||
nr_thp_succeeded++;
|
||||
nr_succeeded += nr_subpages;
|
||||
break;
|
||||
}
|
||||
nr_succeeded++;
|
||||
break;
|
||||
default:
|
||||
/*
|
||||
@@ -1551,17 +1487,37 @@ retry:
|
||||
*/
|
||||
if (is_thp) {
|
||||
nr_thp_failed++;
|
||||
nr_failed += nr_subpages;
|
||||
nr_failed_pages += nr_subpages;
|
||||
break;
|
||||
}
|
||||
nr_failed++;
|
||||
|
||||
if (!no_subpage_counting)
|
||||
nr_failed++;
|
||||
nr_failed_pages += nr_subpages;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
nr_failed += retry + thp_retry;
|
||||
nr_failed += retry;
|
||||
nr_thp_failed += thp_retry;
|
||||
rc = nr_failed;
|
||||
/*
|
||||
* Try to migrate subpages of fail-to-migrate THPs, no nr_failed
|
||||
* counting in this round, since all subpages of a THP are counted
|
||||
* as 1 failure in the first round.
|
||||
*/
|
||||
if (!list_empty(&thp_split_pages)) {
|
||||
/*
|
||||
* Move non-migrated pages (after 10 retries) to ret_pages
|
||||
* to avoid migrating them again.
|
||||
*/
|
||||
list_splice_init(from, &ret_pages);
|
||||
list_splice_init(&thp_split_pages, from);
|
||||
no_subpage_counting = true;
|
||||
retry = 1;
|
||||
goto thp_subpage_migration;
|
||||
}
|
||||
|
||||
rc = nr_failed + nr_thp_failed;
|
||||
out:
|
||||
/*
|
||||
* Put the permanent failure page back to migration list, they
|
||||
@@ -1570,11 +1526,11 @@ out:
|
||||
list_splice(&ret_pages, from);
|
||||
|
||||
count_vm_events(PGMIGRATE_SUCCESS, nr_succeeded);
|
||||
count_vm_events(PGMIGRATE_FAIL, nr_failed);
|
||||
count_vm_events(PGMIGRATE_FAIL, nr_failed_pages);
|
||||
count_vm_events(THP_MIGRATION_SUCCESS, nr_thp_succeeded);
|
||||
count_vm_events(THP_MIGRATION_FAIL, nr_thp_failed);
|
||||
count_vm_events(THP_MIGRATION_SPLIT, nr_thp_split);
|
||||
trace_mm_migrate_pages(nr_succeeded, nr_failed, nr_thp_succeeded,
|
||||
trace_mm_migrate_pages(nr_succeeded, nr_failed_pages, nr_thp_succeeded,
|
||||
nr_thp_failed, nr_thp_split, mode, reason);
|
||||
|
||||
if (!swapwrite)
|
||||
@@ -2516,8 +2472,7 @@ static bool migrate_vma_check_page(struct page *page)
|
||||
static void migrate_vma_unmap(struct migrate_vma *migrate)
|
||||
{
|
||||
const unsigned long npages = migrate->npages;
|
||||
const unsigned long start = migrate->start;
|
||||
unsigned long addr, i, restore = 0;
|
||||
unsigned long i, restore = 0;
|
||||
bool allow_drain = true;
|
||||
|
||||
lru_add_drain();
|
||||
@@ -2563,7 +2518,7 @@ static void migrate_vma_unmap(struct migrate_vma *migrate)
|
||||
}
|
||||
}
|
||||
|
||||
for (addr = start, i = 0; i < npages && restore; addr += PAGE_SIZE, i++) {
|
||||
for (i = 0; i < npages && restore; i++) {
|
||||
struct page *page = migrate_pfn_to_page(migrate->src[i]);
|
||||
|
||||
if (!page || (migrate->src[i] & MIGRATE_PFN_MIGRATE))
|
||||
@@ -2961,14 +2916,152 @@ void migrate_vma_finalize(struct migrate_vma *migrate)
|
||||
EXPORT_SYMBOL(migrate_vma_finalize);
|
||||
#endif /* CONFIG_DEVICE_PRIVATE */
|
||||
|
||||
/*
|
||||
* node_demotion[] example:
|
||||
*
|
||||
* Consider a system with two sockets. Each socket has
|
||||
* three classes of memory attached: fast, medium and slow.
|
||||
* Each memory class is placed in its own NUMA node. The
|
||||
* CPUs are placed in the node with the "fast" memory. The
|
||||
* 6 NUMA nodes (0-5) might be split among the sockets like
|
||||
* this:
|
||||
*
|
||||
* Socket A: 0, 1, 2
|
||||
* Socket B: 3, 4, 5
|
||||
*
|
||||
* When Node 0 fills up, its memory should be migrated to
|
||||
* Node 1. When Node 1 fills up, it should be migrated to
|
||||
* Node 2. The migration path starts on the nodes with the
|
||||
* processors (since allocations default to this node) and
|
||||
* fast memory, progress through medium and end with the
|
||||
* slow memory:
|
||||
*
|
||||
* 0 -> 1 -> 2 -> stop
|
||||
* 3 -> 4 -> 5 -> stop
|
||||
*
|
||||
* This is represented in the node_demotion[] like this:
|
||||
*
|
||||
* { nr=1, nodes[0]=1 }, // Node 0 migrates to 1
|
||||
* { nr=1, nodes[0]=2 }, // Node 1 migrates to 2
|
||||
* { nr=0, nodes[0]=-1 }, // Node 2 does not migrate
|
||||
* { nr=1, nodes[0]=4 }, // Node 3 migrates to 4
|
||||
* { nr=1, nodes[0]=5 }, // Node 4 migrates to 5
|
||||
* { nr=0, nodes[0]=-1 }, // Node 5 does not migrate
|
||||
*
|
||||
* Moreover some systems may have multiple slow memory nodes.
|
||||
* Suppose a system has one socket with 3 memory nodes, node 0
|
||||
* is fast memory type, and node 1/2 both are slow memory
|
||||
* type, and the distance between fast memory node and slow
|
||||
* memory node is same. So the migration path should be:
|
||||
*
|
||||
* 0 -> 1/2 -> stop
|
||||
*
|
||||
* This is represented in the node_demotion[] like this:
|
||||
* { nr=2, {nodes[0]=1, nodes[1]=2} }, // Node 0 migrates to node 1 and node 2
|
||||
* { nr=0, nodes[0]=-1, }, // Node 1 does not migrate
|
||||
* { nr=0, nodes[0]=-1, }, // Node 2 does not migrate
|
||||
*/
|
||||
|
||||
/*
|
||||
* Writes to this array occur without locking. Cycles are
|
||||
* not allowed: Node X demotes to Y which demotes to X...
|
||||
*
|
||||
* If multiple reads are performed, a single rcu_read_lock()
|
||||
* must be held over all reads to ensure that no cycles are
|
||||
* observed.
|
||||
*/
|
||||
#define DEFAULT_DEMOTION_TARGET_NODES 15
|
||||
|
||||
#if MAX_NUMNODES < DEFAULT_DEMOTION_TARGET_NODES
|
||||
#define DEMOTION_TARGET_NODES (MAX_NUMNODES - 1)
|
||||
#else
|
||||
#define DEMOTION_TARGET_NODES DEFAULT_DEMOTION_TARGET_NODES
|
||||
#endif
|
||||
|
||||
struct demotion_nodes {
|
||||
unsigned short nr;
|
||||
short nodes[DEMOTION_TARGET_NODES];
|
||||
};
|
||||
|
||||
static struct demotion_nodes *node_demotion __read_mostly;
|
||||
|
||||
/**
|
||||
* next_demotion_node() - Get the next node in the demotion path
|
||||
* @node: The starting node to lookup the next node
|
||||
*
|
||||
* Return: node id for next memory node in the demotion path hierarchy
|
||||
* from @node; NUMA_NO_NODE if @node is terminal. This does not keep
|
||||
* @node online or guarantee that it *continues* to be the next demotion
|
||||
* target.
|
||||
*/
|
||||
int next_demotion_node(int node)
|
||||
{
|
||||
struct demotion_nodes *nd;
|
||||
unsigned short target_nr, index;
|
||||
int target;
|
||||
|
||||
if (!node_demotion)
|
||||
return NUMA_NO_NODE;
|
||||
|
||||
nd = &node_demotion[node];
|
||||
|
||||
/*
|
||||
* node_demotion[] is updated without excluding this
|
||||
* function from running. RCU doesn't provide any
|
||||
* compiler barriers, so the READ_ONCE() is required
|
||||
* to avoid compiler reordering or read merging.
|
||||
*
|
||||
* Make sure to use RCU over entire code blocks if
|
||||
* node_demotion[] reads need to be consistent.
|
||||
*/
|
||||
rcu_read_lock();
|
||||
target_nr = READ_ONCE(nd->nr);
|
||||
|
||||
switch (target_nr) {
|
||||
case 0:
|
||||
target = NUMA_NO_NODE;
|
||||
goto out;
|
||||
case 1:
|
||||
index = 0;
|
||||
break;
|
||||
default:
|
||||
/*
|
||||
* If there are multiple target nodes, just select one
|
||||
* target node randomly.
|
||||
*
|
||||
* In addition, we can also use round-robin to select
|
||||
* target node, but we should introduce another variable
|
||||
* for node_demotion[] to record last selected target node,
|
||||
* that may cause cache ping-pong due to the changing of
|
||||
* last target node. Or introducing per-cpu data to avoid
|
||||
* caching issue, which seems more complicated. So selecting
|
||||
* target node randomly seems better until now.
|
||||
*/
|
||||
index = get_random_int() % target_nr;
|
||||
break;
|
||||
}
|
||||
|
||||
target = READ_ONCE(nd->nodes[index]);
|
||||
|
||||
out:
|
||||
rcu_read_unlock();
|
||||
return target;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_HOTPLUG_CPU)
|
||||
/* Disable reclaim-based migration. */
|
||||
static void __disable_all_migrate_targets(void)
|
||||
{
|
||||
int node;
|
||||
int node, i;
|
||||
|
||||
for_each_online_node(node)
|
||||
node_demotion[node] = NUMA_NO_NODE;
|
||||
if (!node_demotion)
|
||||
return;
|
||||
|
||||
for_each_online_node(node) {
|
||||
node_demotion[node].nr = 0;
|
||||
for (i = 0; i < DEMOTION_TARGET_NODES; i++)
|
||||
node_demotion[node].nodes[i] = NUMA_NO_NODE;
|
||||
}
|
||||
}
|
||||
|
||||
static void disable_all_migrate_targets(void)
|
||||
@@ -2995,26 +3088,40 @@ static void disable_all_migrate_targets(void)
|
||||
* Failing here is OK. It might just indicate
|
||||
* being at the end of a chain.
|
||||
*/
|
||||
static int establish_migrate_target(int node, nodemask_t *used,
				    int best_distance)
{
	int migration_target, index, val;
	struct demotion_nodes *nd;

	/*
	 * @best_distance is -1 while @node has no target yet; otherwise it
	 * is the distance to the target(s) already recorded for @node.
	 *
	 * No need for READ_ONCE() here since this is in the write path
	 * for node_demotion[]. This should be the only thread writing.
	 */
	if (!node_demotion)
		return NUMA_NO_NODE;

	nd = &node_demotion[node];

	migration_target = find_next_best_node(node, used);
	if (migration_target == NUMA_NO_NODE)
		return NUMA_NO_NODE;

	/*
	 * If the node already has a migration target, that target is at
	 * the best distance. Only accept further targets that are no
	 * farther away than that.
	 */
	if (best_distance != -1) {
		val = node_distance(node, migration_target);
		if (val > best_distance)
			return NUMA_NO_NODE;
	}

	/* Append the new target, refusing to overflow the fixed-size array. */
	index = nd->nr;
	if (WARN_ONCE(index >= DEMOTION_TARGET_NODES,
		      "Exceeds maximum demotion target nodes\n"))
		return NUMA_NO_NODE;

	nd->nodes[index] = migration_target;
	nd->nr++;

	return migration_target;
}
|
||||
@@ -3030,7 +3137,9 @@ static int establish_migrate_target(int node, nodemask_t *used)
|
||||
*
|
||||
* The difference here is that cycles must be avoided. If
|
||||
* node0 migrates to node1, then neither node1, nor anything
|
||||
* node1 migrates to can migrate to node0.
|
||||
* node1 migrates to can migrate to node0. Also one node can
|
||||
* be migrated to multiple nodes if the target nodes all have
|
||||
* the same best distance from the source node.
|
||||
*
|
||||
* This function can run simultaneously with readers of
|
||||
* node_demotion[]. However, it can not run simultaneously
|
||||
@@ -3042,7 +3151,7 @@ static void __set_migration_target_nodes(void)
|
||||
nodemask_t next_pass = NODE_MASK_NONE;
|
||||
nodemask_t this_pass = NODE_MASK_NONE;
|
||||
nodemask_t used_targets = NODE_MASK_NONE;
|
||||
int node;
|
||||
int node, best_distance;
|
||||
|
||||
/*
|
||||
* Avoid any oddities like cycles that could occur
|
||||
@@ -3071,18 +3180,33 @@ again:
|
||||
* multiple source nodes to share a destination.
|
||||
*/
|
||||
nodes_or(used_targets, used_targets, this_pass);
|
||||
for_each_node_mask(node, this_pass) {
|
||||
int target_node = establish_migrate_target(node, &used_targets);
|
||||
|
||||
if (target_node == NUMA_NO_NODE)
|
||||
continue;
|
||||
for_each_node_mask(node, this_pass) {
|
||||
best_distance = -1;
|
||||
|
||||
/*
|
||||
* Visit targets from this pass in the next pass.
|
||||
* Eventually, every node will have been part of
|
||||
* a pass, and will become set in 'used_targets'.
|
||||
* Try to set up the migration path for the node, and the target
|
||||
* migration nodes can be multiple, so loop to find all
|
||||
* the target nodes that share the best node distance.
|
||||
*/
|
||||
node_set(target_node, next_pass);
|
||||
do {
|
||||
int target_node =
|
||||
establish_migrate_target(node, &used_targets,
|
||||
best_distance);
|
||||
|
||||
if (target_node == NUMA_NO_NODE)
|
||||
break;
|
||||
|
||||
if (best_distance == -1)
|
||||
best_distance = node_distance(node, target_node);
|
||||
|
||||
/*
|
||||
* Visit targets from this pass in the next pass.
|
||||
* Eventually, every node will have been part of
|
||||
* a pass, and will become set in 'used_targets'.
|
||||
*/
|
||||
node_set(target_node, next_pass);
|
||||
} while (1);
|
||||
}
|
||||
/*
|
||||
* 'next_pass' contains nodes which became migration
|
||||
@@ -3183,6 +3307,11 @@ static int __init migrate_on_reclaim_init(void)
|
||||
{
|
||||
int ret;
|
||||
|
||||
node_demotion = kmalloc_array(nr_node_ids,
|
||||
sizeof(struct demotion_nodes),
|
||||
GFP_KERNEL);
|
||||
WARN_ON(!node_demotion);
|
||||
|
||||
ret = cpuhp_setup_state_nocalls(CPUHP_MM_DEMOTION_DEAD, "mm/demotion:offline",
|
||||
NULL, migration_offline_cpu);
|
||||
/*
|
||||
|
||||
@@ -1058,7 +1058,7 @@ bool out_of_memory(struct oom_control *oc)
|
||||
|
||||
if (!is_memcg_oom(oc)) {
|
||||
blocking_notifier_call_chain(&oom_notify_list, 0, &freed);
|
||||
if (freed > 0)
|
||||
if (freed > 0 && !is_sysrq_oom(oc))
|
||||
/* Got some memory back in the last second. */
|
||||
return true;
|
||||
}
|
||||
|
||||
@@ -19,6 +19,7 @@
|
||||
#include <linux/mm.h>
|
||||
#include <linux/highmem.h>
|
||||
#include <linux/swap.h>
|
||||
#include <linux/swapops.h>
|
||||
#include <linux/interrupt.h>
|
||||
#include <linux/pagemap.h>
|
||||
#include <linux/jiffies.h>
|
||||
@@ -63,6 +64,7 @@
|
||||
#include <linux/sched/rt.h>
|
||||
#include <linux/sched/mm.h>
|
||||
#include <linux/page_owner.h>
|
||||
#include <linux/page_table_check.h>
|
||||
#include <linux/kthread.h>
|
||||
#include <linux/memcontrol.h>
|
||||
#include <linux/ftrace.h>
|
||||
@@ -1307,6 +1309,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
if (memcg_kmem_enabled() && PageMemcgKmem(page))
|
||||
__memcg_kmem_uncharge_page(page, order);
|
||||
reset_page_owner(page, order);
|
||||
page_table_check_free(page, order);
|
||||
return false;
|
||||
}
|
||||
|
||||
@@ -1346,6 +1349,7 @@ static __always_inline bool free_pages_prepare(struct page *page,
|
||||
page_cpupid_reset_last(page);
|
||||
page->flags &= ~PAGE_FLAGS_CHECK_AT_PREP;
|
||||
reset_page_owner(page, order);
|
||||
page_table_check_free(page, order);
|
||||
|
||||
if (!PageHighMem(page)) {
|
||||
debug_check_no_locks_freed(page_address(page),
|
||||
@@ -2420,6 +2424,7 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
|
||||
}
|
||||
|
||||
set_page_owner(page, order, gfp_flags);
|
||||
page_table_check_alloc(page, order);
|
||||
}
|
||||
|
||||
static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags,
|
||||
@@ -4214,7 +4219,9 @@ void warn_alloc(gfp_t gfp_mask, nodemask_t *nodemask, const char *fmt, ...)
|
||||
va_list args;
|
||||
static DEFINE_RATELIMIT_STATE(nopage_rs, 10*HZ, 1);
|
||||
|
||||
if ((gfp_mask & __GFP_NOWARN) || !__ratelimit(&nopage_rs))
|
||||
if ((gfp_mask & __GFP_NOWARN) ||
|
||||
!__ratelimit(&nopage_rs) ||
|
||||
((gfp_mask & __GFP_DMA) && !has_managed_dma()))
|
||||
return;
|
||||
|
||||
va_start(args, fmt);
|
||||
@@ -8224,7 +8231,7 @@ void __init mem_init_print_info(void)
|
||||
*/
|
||||
#define adj_init_size(start, end, size, pos, adj) \
|
||||
do { \
|
||||
if (start <= pos && pos < end && size > adj) \
|
||||
if (&start[0] <= &pos[0] && &pos[0] < &end[0] && size > adj) \
|
||||
size -= adj; \
|
||||
} while (0)
|
||||
|
||||
@@ -9268,8 +9275,8 @@ static bool zone_spans_last_pfn(const struct zone *zone,
|
||||
* for allocation requests which can not be fulfilled with the buddy allocator.
|
||||
*
|
||||
* The allocated memory is always aligned to a page boundary. If nr_pages is a
|
||||
* power of two then the alignment is guaranteed to be to the given nr_pages
|
||||
* (e.g. 1GB request would be aligned to 1GB).
|
||||
* power of two, the allocated range is also guaranteed to be aligned to the same
|
||||
* nr_pages (e.g. 1GB request would be aligned to 1GB).
|
||||
*
|
||||
* Allocated pages can be freed with free_contig_range() or by manually calling
|
||||
* __free_page() on each allocated page.
|
||||
@@ -9502,6 +9509,7 @@ bool take_page_off_buddy(struct page *page)
|
||||
del_page_from_free_list(page_head, zone, page_order);
|
||||
break_down_buddy_pages(zone, page_head, page, 0,
|
||||
page_order, migratetype);
|
||||
SetPageHWPoisonTakenOff(page);
|
||||
if (!is_migrate_isolate(migratetype))
|
||||
__mod_zone_freepage_state(zone, -1, migratetype);
|
||||
ret = true;
|
||||
@@ -9513,4 +9521,44 @@ bool take_page_off_buddy(struct page *page)
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
return ret;
|
||||
}
|
||||
|
||||
/*
|
||||
* Cancel takeoff done by take_page_off_buddy().
|
||||
*/
|
||||
bool put_page_back_buddy(struct page *page)
|
||||
{
|
||||
struct zone *zone = page_zone(page);
|
||||
unsigned long pfn = page_to_pfn(page);
|
||||
unsigned long flags;
|
||||
int migratetype = get_pfnblock_migratetype(page, pfn);
|
||||
bool ret = false;
|
||||
|
||||
spin_lock_irqsave(&zone->lock, flags);
|
||||
if (put_page_testzero(page)) {
|
||||
ClearPageHWPoisonTakenOff(page);
|
||||
__free_one_page(page, pfn, zone, 0, migratetype, FPI_NONE);
|
||||
if (TestClearPageHWPoison(page)) {
|
||||
num_poisoned_pages_dec();
|
||||
ret = true;
|
||||
}
|
||||
}
|
||||
spin_unlock_irqrestore(&zone->lock, flags);
|
||||
|
||||
return ret;
|
||||
}
|
||||
#endif
|
||||
|
||||
#ifdef CONFIG_ZONE_DMA
|
||||
bool has_managed_dma(void)
|
||||
{
|
||||
struct pglist_data *pgdat;
|
||||
|
||||
for_each_online_pgdat(pgdat) {
|
||||
struct zone *zone = &pgdat->node_zones[ZONE_DMA];
|
||||
|
||||
if (managed_zone(zone))
|
||||
return true;
|
||||
}
|
||||
return false;
|
||||
}
|
||||
#endif /* CONFIG_ZONE_DMA */
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include <linux/kmemleak.h>
|
||||
#include <linux/page_owner.h>
|
||||
#include <linux/page_idle.h>
|
||||
#include <linux/page_table_check.h>
|
||||
|
||||
/*
|
||||
* struct page extension
|
||||
@@ -63,18 +64,21 @@ static bool need_page_idle(void)
|
||||
{
|
||||
return true;
|
||||
}
|
||||
struct page_ext_operations page_idle_ops = {
|
||||
static struct page_ext_operations page_idle_ops __initdata = {
|
||||
.need = need_page_idle,
|
||||
};
|
||||
#endif
|
||||
|
||||
static struct page_ext_operations *page_ext_ops[] = {
|
||||
static struct page_ext_operations *page_ext_ops[] __initdata = {
|
||||
#ifdef CONFIG_PAGE_OWNER
|
||||
&page_owner_ops,
|
||||
#endif
|
||||
#if defined(CONFIG_PAGE_IDLE_FLAG) && !defined(CONFIG_64BIT)
|
||||
&page_idle_ops,
|
||||
#endif
|
||||
#ifdef CONFIG_PAGE_TABLE_CHECK
|
||||
&page_table_check_ops,
|
||||
#endif
|
||||
};
|
||||
|
||||
unsigned long page_ext_size = sizeof(struct page_ext);
|
||||
|
||||
@@ -115,7 +115,7 @@ static void unset_migratetype_isolate(struct page *page, unsigned migratetype)
|
||||
* onlining - just onlined memory won't immediately be considered for
|
||||
* allocation.
|
||||
*/
|
||||
if (!isolated_page) {
|
||||
if (!isolated_page && PageBuddy(page)) {
|
||||
nr_pages = move_freepages_block(zone, page, migratetype, NULL);
|
||||
__mod_zone_freepage_state(zone, nr_pages, migratetype);
|
||||
}
|
||||
|
||||
@@ -46,7 +46,7 @@ static int __init early_page_owner_param(char *buf)
|
||||
}
|
||||
early_param("page_owner", early_page_owner_param);
|
||||
|
||||
static bool need_page_owner(void)
|
||||
static __init bool need_page_owner(void)
|
||||
{
|
||||
return page_owner_enabled;
|
||||
}
|
||||
@@ -75,7 +75,7 @@ static noinline void register_early_stack(void)
|
||||
early_handle = create_dummy_stack();
|
||||
}
|
||||
|
||||
static void init_page_owner(void)
|
||||
static __init void init_page_owner(void)
|
||||
{
|
||||
if (!page_owner_enabled)
|
||||
return;
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user