From a27c597f262ddf2356a9cfb1df28fd2588d6ba76 Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Thu, 20 Feb 2025 15:58:01 +0000 Subject: [PATCH 1/7] arm64: mte: Do not allow PROT_MTE on MAP_HUGETLB user mappings PROT_MTE (memory tagging extensions) is not supported on all user mmap() types for various reasons (memory attributes, backing storage, CoW handling). The arm64 arch_validate_flags() function checks whether the VM_MTE_ALLOWED flag has been set for a vma during mmap(), usually by arch_calc_vm_flag_bits(). Linux prior to 6.13 does not support PROT_MTE hugetlb mappings. This was added by commit 25c17c4b55de ("hugetlb: arm64: add mte support"). However, earlier kernels inadvertently set VM_MTE_ALLOWED on (MAP_ANONYMOUS | MAP_HUGETLB) mappings by only checking for MAP_ANONYMOUS. Explicitly check MAP_HUGETLB in arch_calc_vm_flag_bits() and avoid setting VM_MTE_ALLOWED for such mappings. Fixes: 9f3419315f3c ("arm64: mte: Add PROT_MTE support to mmap() and mprotect()") Cc: # 5.10.x-6.12.x Reported-by: Naresh Kamboju Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/mman.h | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arm64/include/asm/mman.h b/arch/arm64/include/asm/mman.h index ef35c52aabd6..101771c60d80 100644 --- a/arch/arm64/include/asm/mman.h +++ b/arch/arm64/include/asm/mman.h @@ -31,9 +31,12 @@ static inline unsigned long arch_calc_vm_flag_bits(struct file *file, * backed by tags-capable memory. The vm_flags may be overridden by a * filesystem supporting MTE (RAM-based). 
*/ - if (system_supports_mte() && - ((flags & MAP_ANONYMOUS) || shmem_file(file))) - return VM_MTE_ALLOWED; + if (system_supports_mte()) { + if ((flags & MAP_ANONYMOUS) && !(flags & MAP_HUGETLB)) + return VM_MTE_ALLOWED; + if (shmem_file(file)) + return VM_MTE_ALLOWED; + } return 0; } From e5c4b7b19b6b59dda8d2959740ea1ebaf0746a2c Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:12 +0800 Subject: [PATCH 2/7] md/md-bitmap: replace md_bitmap_status() with a new helper md_bitmap_get_stats() [ Upstream commit 38f287d7e495ae00d4481702f44ff7ca79f5c9bc ] There are no functional changes, and the new helper will be used in multiple places in following patches to avoid dereferencing bitmap directly. Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-3-yukuai1@huaweicloud.com Signed-off-by: Song Liu Stable-dep-of: 8d28d0ddb986 ("md/md-bitmap: Synchronize bitmap_get_stats() with bitmap lifetime") Signed-off-by: Sasha Levin --- drivers/md/md-bitmap.c | 25 ++++++------------------- drivers/md/md-bitmap.h | 8 +++++++- drivers/md/md.c | 29 ++++++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 21 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 9d8ac04c2346..736268447d3e 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2022,32 +2022,19 @@ int md_bitmap_copy_from_slot(struct mddev *mddev, int slot, } EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); - -void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap) +int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) { - unsigned long chunk_kb; struct bitmap_counts *counts; if (!bitmap) - return; + return -ENOENT; counts = &bitmap->counts; + stats->missing_pages = counts->missing_pages; + stats->pages = counts->pages; + stats->file = bitmap->storage.file; - chunk_kb = bitmap->mddev->bitmap_info.chunksize >> 10; - seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], " - "%lu%s chunk", - counts->pages - 
counts->missing_pages, - counts->pages, - (counts->pages - counts->missing_pages) - << (PAGE_SHIFT - 10), - chunk_kb ? chunk_kb : bitmap->mddev->bitmap_info.chunksize, - chunk_kb ? "KB" : "B"); - if (bitmap->storage.file) { - seq_printf(seq, ", file: "); - seq_file_path(seq, bitmap->storage.file, " \t\n"); - } - - seq_printf(seq, "\n"); + return 0; } int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 3a4750952b3a..00ac4c3ecf4d 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -233,6 +233,12 @@ struct bitmap { int cluster_slot; /* Slot offset for clustered env */ }; +struct md_bitmap_stats { + unsigned long missing_pages; + unsigned long pages; + struct file *file; +}; + /* the bitmap API */ /* these are used only by md/bitmap */ @@ -243,7 +249,7 @@ void md_bitmap_destroy(struct mddev *mddev); void md_bitmap_print_sb(struct bitmap *bitmap); void md_bitmap_update_sb(struct bitmap *bitmap); -void md_bitmap_status(struct seq_file *seq, struct bitmap *bitmap); +int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats); int md_bitmap_setallbits(struct bitmap *bitmap); void md_bitmap_write_all(struct bitmap *bitmap); diff --git a/drivers/md/md.c b/drivers/md/md.c index 297c86f5c70b..a52843826a95 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8318,6 +8318,33 @@ static void md_seq_stop(struct seq_file *seq, void *v) mddev_put(mddev); } +static void md_bitmap_status(struct seq_file *seq, struct mddev *mddev) +{ + struct md_bitmap_stats stats; + unsigned long used_pages; + unsigned long chunk_kb; + int err; + + err = md_bitmap_get_stats(mddev->bitmap, &stats); + if (err) + return; + + chunk_kb = mddev->bitmap_info.chunksize >> 10; + used_pages = stats.pages - stats.missing_pages; + + seq_printf(seq, "bitmap: %lu/%lu pages [%luKB], %lu%s chunk", + used_pages, stats.pages, used_pages << (PAGE_SHIFT - 10), + chunk_kb ? 
chunk_kb : mddev->bitmap_info.chunksize, + chunk_kb ? "KB" : "B"); + + if (stats.file) { + seq_puts(seq, ", file: "); + seq_file_path(seq, stats.file, " \t\n"); + } + + seq_putc(seq, '\n'); +} + static int md_seq_show(struct seq_file *seq, void *v) { struct mddev *mddev = v; @@ -8406,7 +8433,7 @@ static int md_seq_show(struct seq_file *seq, void *v) } else seq_printf(seq, "\n "); - md_bitmap_status(seq, mddev->bitmap); + md_bitmap_status(seq, mddev); seq_printf(seq, "\n"); } From e83e6ea58964cdc16b65aee3e1d4f7651bc6fdea Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:15 +0800 Subject: [PATCH 3/7] md/md-cluster: fix sparse warnings for __le64 [ Upstream commit 82697ccf7e495c1ba81e315c2886d6220ff84c2c ] drivers/md/md-cluster.c:1220:22: warning: incorrect type in assignment (different base types) drivers/md/md-cluster.c:1220:22: expected unsigned long my_sync_size drivers/md/md-cluster.c:1220:22: got restricted __le64 [usertype] sync_size drivers/md/md-cluster.c:1252:35: warning: incorrect type in assignment (different base types) drivers/md/md-cluster.c:1252:35: expected unsigned long sync_size drivers/md/md-cluster.c:1252:35: got restricted __le64 [usertype] sync_size drivers/md/md-cluster.c:1253:41: warning: restricted __le64 degrades to integer Fix the warnings by using le64_to_cpu() to convert __le64 to integer. 
Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-6-yukuai1@huaweicloud.com Signed-off-by: Song Liu Stable-dep-of: 8d28d0ddb986 ("md/md-bitmap: Synchronize bitmap_get_stats() with bitmap lifetime") Signed-off-by: Sasha Levin --- drivers/md/md-cluster.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index 10e0c5381d01..a0d3f6c39770 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1195,7 +1195,7 @@ static int cluster_check_sync_size(struct mddev *mddev) struct dlm_lock_resource *bm_lockres; sb = kmap_atomic(bitmap->storage.sb_page); - my_sync_size = sb->sync_size; + my_sync_size = le64_to_cpu(sb->sync_size); kunmap_atomic(sb); for (i = 0; i < node_num; i++) { @@ -1227,8 +1227,8 @@ static int cluster_check_sync_size(struct mddev *mddev) sb = kmap_atomic(bitmap->storage.sb_page); if (sync_size == 0) - sync_size = sb->sync_size; - else if (sync_size != sb->sync_size) { + sync_size = le64_to_cpu(sb->sync_size); + else if (sync_size != le64_to_cpu(sb->sync_size)) { kunmap_atomic(sb); md_bitmap_free(bitmap); return -1; From 249d9b9da2c940e44b6a625331e33327750707c6 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Mon, 26 Aug 2024 15:44:16 +0800 Subject: [PATCH 4/7] md/md-bitmap: add 'sync_size' into struct md_bitmap_stats [ Upstream commit ec6bb299c7c3dd4ca1724d13d5f5fae3ee54fc65 ] To avoid dereferencing bitmap directly in md-cluster to prepare inventing a new bitmap. 
BTW, also fix following checkpatch warnings: WARNING: Deprecated use of 'kmap_atomic', prefer 'kmap_local_page' instead WARNING: Deprecated use of 'kunmap_atomic', prefer 'kunmap_local' instead Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20240826074452.1490072-7-yukuai1@huaweicloud.com Signed-off-by: Song Liu Stable-dep-of: 8d28d0ddb986 ("md/md-bitmap: Synchronize bitmap_get_stats() with bitmap lifetime") Signed-off-by: Sasha Levin --- drivers/md/md-bitmap.c | 6 ++++++ drivers/md/md-bitmap.h | 1 + drivers/md/md-cluster.c | 34 ++++++++++++++++++++-------------- 3 files changed, 27 insertions(+), 14 deletions(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index 736268447d3e..bddf4f3d27a7 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2025,10 +2025,15 @@ EXPORT_SYMBOL_GPL(md_bitmap_copy_from_slot); int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) { struct bitmap_counts *counts; + bitmap_super_t *sb; if (!bitmap) return -ENOENT; + sb = kmap_local_page(bitmap->storage.sb_page); + stats->sync_size = le64_to_cpu(sb->sync_size); + kunmap_local(sb); + counts = &bitmap->counts; stats->missing_pages = counts->missing_pages; stats->pages = counts->pages; @@ -2036,6 +2041,7 @@ int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) return 0; } +EXPORT_SYMBOL_GPL(md_bitmap_get_stats); int md_bitmap_resize(struct bitmap *bitmap, sector_t blocks, int chunksize, int init) diff --git a/drivers/md/md-bitmap.h b/drivers/md/md-bitmap.h index 00ac4c3ecf4d..7b7a701f74be 100644 --- a/drivers/md/md-bitmap.h +++ b/drivers/md/md-bitmap.h @@ -235,6 +235,7 @@ struct bitmap { struct md_bitmap_stats { unsigned long missing_pages; + unsigned long sync_size; unsigned long pages; struct file *file; }; diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c index a0d3f6c39770..7484bb83171a 100644 --- a/drivers/md/md-cluster.c +++ b/drivers/md/md-cluster.c @@ -1185,18 +1185,21 @@ out: */ static 
int cluster_check_sync_size(struct mddev *mddev) { - int i, rv; - bitmap_super_t *sb; - unsigned long my_sync_size, sync_size = 0; - int node_num = mddev->bitmap_info.nodes; int current_slot = md_cluster_ops->slot_number(mddev); + int node_num = mddev->bitmap_info.nodes; struct bitmap *bitmap = mddev->bitmap; - char str[64]; struct dlm_lock_resource *bm_lockres; + struct md_bitmap_stats stats; + unsigned long sync_size = 0; + unsigned long my_sync_size; + char str[64]; + int i, rv; - sb = kmap_atomic(bitmap->storage.sb_page); - my_sync_size = le64_to_cpu(sb->sync_size); - kunmap_atomic(sb); + rv = md_bitmap_get_stats(bitmap, &stats); + if (rv) + return rv; + + my_sync_size = stats.sync_size; for (i = 0; i < node_num; i++) { if (i == current_slot) @@ -1225,15 +1228,18 @@ static int cluster_check_sync_size(struct mddev *mddev) md_bitmap_update_sb(bitmap); lockres_free(bm_lockres); - sb = kmap_atomic(bitmap->storage.sb_page); - if (sync_size == 0) - sync_size = le64_to_cpu(sb->sync_size); - else if (sync_size != le64_to_cpu(sb->sync_size)) { - kunmap_atomic(sb); + rv = md_bitmap_get_stats(bitmap, &stats); + if (rv) { + md_bitmap_free(bitmap); + return rv; + } + + if (sync_size == 0) { + sync_size = stats.sync_size; + } else if (sync_size != stats.sync_size) { md_bitmap_free(bitmap); return -1; } - kunmap_atomic(sb); md_bitmap_free(bitmap); } From 032fa54f486eac5507976e7e31f079a767bc13a8 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 24 Jan 2025 17:20:55 +0800 Subject: [PATCH 5/7] md/md-bitmap: Synchronize bitmap_get_stats() with bitmap lifetime [ Upstream commit 8d28d0ddb986f56920ac97ae704cc3340a699a30 ] After commit ec6bb299c7c3 ("md/md-bitmap: add 'sync_size' into struct md_bitmap_stats"), following panic is reported: Oops: general protection fault, probably for non-canonical address RIP: 0010:bitmap_get_stats+0x2b/0xa0 Call Trace: md_seq_show+0x2d2/0x5b0 seq_read_iter+0x2b9/0x470 seq_read+0x12f/0x180 proc_reg_read+0x57/0xb0 vfs_read+0xf6/0x380 
ksys_read+0x6c/0xf0 do_syscall_64+0x82/0x170 entry_SYSCALL_64_after_hwframe+0x76/0x7e Root cause is that bitmap_get_stats() can be called at any time if mddev is still there, even if bitmap is destroyed, or not fully initialized. Dereferencing bitmap in this case can crash the kernel. Meanwhile, the above commit starts dereferencing bitmap->storage, making the problem easier to trigger. Fix the problem by protecting bitmap_get_stats() with bitmap_info.mutex. Cc: stable@vger.kernel.org # v6.12+ Fixes: 32a7627cf3a3 ("[PATCH] md: optimised resync using Bitmap based intent logging") Reported-and-tested-by: Harshit Mogalapalli Closes: https://lore.kernel.org/linux-raid/ca3a91a2-50ae-4f68-b317-abd9889f3907@oracle.com/T/#m6e5086c95201135e4941fe38f9efa76daf9666c5 Signed-off-by: Yu Kuai Link: https://lore.kernel.org/r/20250124092055.4050195-1-yukuai1@huaweicloud.com Signed-off-by: Song Liu Signed-off-by: Sasha Levin --- drivers/md/md-bitmap.c | 5 ++++- drivers/md/md.c | 4 ++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c index bddf4f3d27a7..e18e21b24210 100644 --- a/drivers/md/md-bitmap.c +++ b/drivers/md/md-bitmap.c @@ -2029,7 +2029,10 @@ int md_bitmap_get_stats(struct bitmap *bitmap, struct md_bitmap_stats *stats) if (!bitmap) return -ENOENT; - + if (bitmap->mddev->bitmap_info.external) + return -ENOENT; + if (!bitmap->storage.sb_page) /* no superblock */ + return -EINVAL; sb = kmap_local_page(bitmap->storage.sb_page); stats->sync_size = le64_to_cpu(sb->sync_size); kunmap_local(sb); diff --git a/drivers/md/md.c b/drivers/md/md.c index a52843826a95..5e2751d42f64 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -8368,6 +8368,9 @@ static int md_seq_show(struct seq_file *seq, void *v) return 0; } + /* prevent bitmap to be freed after checking */ + mutex_lock(&mddev->bitmap_info.mutex); + spin_lock(&mddev->lock); if (mddev->pers || mddev->raid_disks || !list_empty(&mddev->disks)) { seq_printf(seq, "%s : 
%sactive", mdname(mddev), @@ -8438,6 +8441,7 @@ static int md_seq_show(struct seq_file *seq, void *v) seq_printf(seq, "\n"); } spin_unlock(&mddev->lock); + mutex_unlock(&mddev->bitmap_info.mutex); return 0; } From 2ea46587834458f44efbb4b190884e7fb57e0376 Mon Sep 17 00:00:00 2001 From: Carlos Galo Date: Fri, 23 Feb 2024 17:32:49 +0000 Subject: [PATCH 6/7] mm: update mark_victim tracepoints fields MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 72ba14deb40a9e9668ec5e66a341ed657e5215c2 ] The current implementation of the mark_victim tracepoint provides only the process ID (pid) of the victim process. This limitation poses challenges for userspace tools requiring real-time OOM analysis and intervention. Although this information is available from the kernel logs, it’s not the appropriate format to provide OOM notifications. In Android, BPF programs are used with the mark_victim trace events to notify userspace of an OOM kill. For consistency, update the trace event to include the same information about the OOMed victim as the kernel logs. - UID In Android each installed application has a unique UID. Including the `uid` assists in correlating OOM events with specific apps. - Process Name (comm) Enables identification of the affected process. - OOM Score Will allow userspace to get additional insight of the relative kill priority of the OOM victim. In Android, the oom_score_adj is used to categorize app state (foreground, background, etc.), which aids in analyzing user-perceptible impacts of OOM events [1]. - Total VM, RSS Stats, and pgtables Amount of memory used by the victim that will, potentially, be freed up by killing it. 
[1] https://cs.android.com/android/platform/superproject/main/+/246dc8fc95b6d93afcba5c6d6c133307abb3ac2e:frameworks/base/services/core/java/com/android/server/am/ProcessList.java;l=188-283 Signed-off-by: Carlos Galo Reviewed-by: Steven Rostedt Cc: Suren Baghdasaryan Cc: Michal Hocko Cc: "Masami Hiramatsu (Google)" Cc: Mathieu Desnoyers Signed-off-by: Andrew Morton Stable-dep-of: ade81479c7dd ("memcg: fix soft lockup in the OOM process") Signed-off-by: Sasha Levin --- include/trace/events/oom.h | 36 ++++++++++++++++++++++++++++++++---- mm/oom_kill.c | 6 +++++- 2 files changed, 37 insertions(+), 5 deletions(-) diff --git a/include/trace/events/oom.h b/include/trace/events/oom.h index 26a11e4a2c36..b799f3bcba82 100644 --- a/include/trace/events/oom.h +++ b/include/trace/events/oom.h @@ -7,6 +7,8 @@ #include #include +#define PG_COUNT_TO_KB(x) ((x) << (PAGE_SHIFT - 10)) + TRACE_EVENT(oom_score_adj_update, TP_PROTO(struct task_struct *task), @@ -72,19 +74,45 @@ TRACE_EVENT(reclaim_retry_zone, ); TRACE_EVENT(mark_victim, - TP_PROTO(int pid), + TP_PROTO(struct task_struct *task, uid_t uid), - TP_ARGS(pid), + TP_ARGS(task, uid), TP_STRUCT__entry( __field(int, pid) + __string(comm, task->comm) + __field(unsigned long, total_vm) + __field(unsigned long, anon_rss) + __field(unsigned long, file_rss) + __field(unsigned long, shmem_rss) + __field(uid_t, uid) + __field(unsigned long, pgtables) + __field(short, oom_score_adj) ), TP_fast_assign( - __entry->pid = pid; + __entry->pid = task->pid; + __assign_str(comm, task->comm); + __entry->total_vm = PG_COUNT_TO_KB(task->mm->total_vm); + __entry->anon_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_ANONPAGES)); + __entry->file_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_FILEPAGES)); + __entry->shmem_rss = PG_COUNT_TO_KB(get_mm_counter(task->mm, MM_SHMEMPAGES)); + __entry->uid = uid; + __entry->pgtables = mm_pgtables_bytes(task->mm) >> 10; + __entry->oom_score_adj = task->signal->oom_score_adj; ), - TP_printk("pid=%d", 
__entry->pid) + TP_printk("pid=%d comm=%s total-vm=%lukB anon-rss=%lukB file-rss:%lukB shmem-rss:%lukB uid=%u pgtables=%lukB oom_score_adj=%hd", + __entry->pid, + __get_str(comm), + __entry->total_vm, + __entry->anon_rss, + __entry->file_rss, + __entry->shmem_rss, + __entry->uid, + __entry->pgtables, + __entry->oom_score_adj + ) ); TRACE_EVENT(wake_reaper, diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 1276e49b31b0..4de30c6c5183 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -44,6 +44,7 @@ #include #include #include +#include #include #include "internal.h" @@ -757,6 +758,7 @@ static inline void queue_oom_reaper(struct task_struct *tsk) */ static void mark_oom_victim(struct task_struct *tsk) { + const struct cred *cred; struct mm_struct *mm = tsk->mm; WARN_ON(oom_killer_disabled); @@ -776,7 +778,9 @@ static void mark_oom_victim(struct task_struct *tsk) */ __thaw_task(tsk); atomic_inc(&oom_victims); - trace_mark_victim(tsk->pid); + cred = get_task_cred(tsk); + trace_mark_victim(tsk, cred->uid.val); + put_cred(cred); } /** From 0a09d56e1682c951046bf15542b3e9553046c9f6 Mon Sep 17 00:00:00 2001 From: Chen Ridong Date: Tue, 24 Dec 2024 02:52:38 +0000 Subject: [PATCH 7/7] memcg: fix soft lockup in the OOM process MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit ade81479c7dda1ce3eedb215c78bc615bbd04f06 ] A soft lockup issue was found in the product with about 56,000 tasks were in the OOM cgroup, it was traversing them when the soft lockup was triggered. watchdog: BUG: soft lockup - CPU#2 stuck for 23s! 
[VM Thread:1503066] CPU: 2 PID: 1503066 Comm: VM Thread Kdump: loaded Tainted: G Hardware name: Huawei Cloud OpenStack Nova, BIOS RIP: 0010:console_unlock+0x343/0x540 RSP: 0000:ffffb751447db9a0 EFLAGS: 00000247 ORIG_RAX: ffffffffffffff13 RAX: 0000000000000001 RBX: 0000000000000000 RCX: 00000000ffffffff RDX: 0000000000000000 RSI: 0000000000000004 RDI: 0000000000000247 RBP: ffffffffafc71f90 R08: 0000000000000000 R09: 0000000000000040 R10: 0000000000000080 R11: 0000000000000000 R12: ffffffffafc74bd0 R13: ffffffffaf60a220 R14: 0000000000000247 R15: 0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f2fe6ad91f0 CR3: 00000004b2076003 CR4: 0000000000360ee0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: vprintk_emit+0x193/0x280 printk+0x52/0x6e dump_task+0x114/0x130 mem_cgroup_scan_tasks+0x76/0x100 dump_header+0x1fe/0x210 oom_kill_process+0xd1/0x100 out_of_memory+0x125/0x570 mem_cgroup_out_of_memory+0xb5/0xd0 try_charge+0x720/0x770 mem_cgroup_try_charge+0x86/0x180 mem_cgroup_try_charge_delay+0x1c/0x40 do_anonymous_page+0xb5/0x390 handle_mm_fault+0xc4/0x1f0 This is because thousands of processes are in the OOM cgroup, and it takes a long time to traverse all of them. As a result, this leads to a soft lockup in the OOM process. To fix this issue, call 'cond_resched' in the 'mem_cgroup_scan_tasks' function roughly every 1000 iterations (every 1024 in the code). For global OOM, call 'touch_softlockup_watchdog' roughly every 1000 iterations (every 1024 in the code) to avoid this issue. 
Link: https://lkml.kernel.org/r/20241224025238.3768787-1-chenridong@huaweicloud.com Fixes: 9cbb78bb3143 ("mm, memcg: introduce own oom handler to iterate only over its own threads") Signed-off-by: Chen Ridong Acked-by: Michal Hocko Cc: Roman Gushchin Cc: Johannes Weiner Cc: Shakeel Butt Cc: Muchun Song Cc: Michal Koutný Cc: Signed-off-by: Andrew Morton Signed-off-by: Sasha Levin --- mm/memcontrol.c | 7 ++++++- mm/oom_kill.c | 8 +++++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 280bb6969c0b..3f7cab196eb6 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1242,6 +1242,7 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, { struct mem_cgroup *iter; int ret = 0; + int i = 0; BUG_ON(memcg == root_mem_cgroup); @@ -1250,8 +1251,12 @@ int mem_cgroup_scan_tasks(struct mem_cgroup *memcg, struct task_struct *task; css_task_iter_start(&iter->css, CSS_TASK_ITER_PROCS, &it); - while (!ret && (task = css_task_iter_next(&it))) + while (!ret && (task = css_task_iter_next(&it))) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + cond_resched(); ret = fn(task, arg); + } css_task_iter_end(&it); if (ret) { mem_cgroup_iter_break(memcg, iter); diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 4de30c6c5183..f4c8ef863ea7 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include "internal.h" @@ -430,10 +431,15 @@ static void dump_tasks(struct oom_control *oc) mem_cgroup_scan_tasks(oc->memcg, dump_task, oc); else { struct task_struct *p; + int i = 0; rcu_read_lock(); - for_each_process(p) + for_each_process(p) { + /* Avoid potential softlockup warning */ + if ((++i & 1023) == 0) + touch_softlockup_watchdog(); dump_task(p, oc); + } rcu_read_unlock(); } }