From 9e3ae1fbd38957a9bbcaf5a97977901ad0f2da6f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 14 Feb 2022 10:16:57 +0100 Subject: [PATCH] UPSTREAM: sched: Fix yet more sched_fork() races Where commit 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") fixed a fork race vs cgroup, it opened up a race vs syscalls by not placing the task on the runqueue before it gets exposed through the pidhash. Commit 13765de8148f ("sched/fair: Fix fault in reweight_entity") is trying to fix a single instance of this, instead fix the whole class of issues, effectively reverting this commit. Bug: 255159688 Fixes: 4ef0c5c6b5ba ("kernel/sched: Fix sched_fork() access an invalid sched_task_group") Reported-by: Linus Torvalds Signed-off-by: Peter Zijlstra (Intel) Tested-by: Tadeusz Struk Tested-by: Zhang Qiao Tested-by: Dietmar Eggemann Link: https://lkml.kernel.org/r/YgoeCbwj5mbCR0qA@hirez.programming.kicks-ass.net (cherry picked from commit b1e8206582f9d680cff7d04828708c8b6ab32957) Signed-off-by: Woody Lin Change-Id: Ic593aafb0cc8dae5ba382cdc4ab68526973fdfca --- include/linux/sched/task.h | 4 ++-- kernel/fork.c | 13 ++++++++++++- kernel/sched/core.c | 23 ++++++++++++++++------- 3 files changed, 30 insertions(+), 10 deletions(-) diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h index fa75f325dad5..cea4bdfd0f05 100644 --- a/include/linux/sched/task.h +++ b/include/linux/sched/task.h @@ -55,8 +55,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); extern void init_idle(struct task_struct *idle, int cpu); extern int sched_fork(unsigned long clone_flags, struct task_struct *p); -extern void sched_post_fork(struct task_struct *p, - struct kernel_clone_args *kargs); +extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); +extern void sched_post_fork(struct task_struct *p); extern void sched_dead(struct task_struct *p); void __noreturn do_task_dead(void); diff --git a/kernel/fork.c b/kernel/fork.c index a33cb21a05a5..8d2c3eb1b8fa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -2260,6 +2260,17 @@ static __latent_entropy struct task_struct *copy_process( if (retval) goto bad_fork_put_pidfd; + /* + * Now that the cgroups are pinned, re-clone the parent cgroup and put + * the new task on the correct runqueue. All this *before* the task + * becomes visible. + * + * This isn't part of ->can_fork() because while the re-cloning is + * cgroup specific, it unconditionally needs to place the task on a + * runqueue. + */ + sched_cgroup_fork(p, args); + /* * From this point on we must avoid any synchronous user-space * communication until we take the tasklist-lock. In particular, we do @@ -2367,7 +2378,7 @@ static __latent_entropy struct task_struct *copy_process( fd_install(pidfd, pidfile); proc_fork_connector(p); - sched_post_fork(p, args); + sched_post_fork(p); cgroup_post_fork(p, args); perf_event_fork(p); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 5000f595fac0..583094de1de0 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3506,6 +3506,7 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) init_entity_runnable_average(&p->se); trace_android_rvh_finish_prio_fork(p); + #ifdef CONFIG_SCHED_INFO if (likely(sched_info_on())) memset(&p->sched_info, 0, sizeof(p->sched_info)); @@ -3521,18 +3522,23 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) return 0; } -void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) +void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) { unsigned long flags; -#ifdef CONFIG_CGROUP_SCHED - struct task_group *tg; -#endif + /* + * Because we're not yet on the pid-hash, p->pi_lock isn't strictly + * required yet, but lockdep gets upset if rules are violated. + */ raw_spin_lock_irqsave(&p->pi_lock, flags); #ifdef CONFIG_CGROUP_SCHED - tg = container_of(kargs->cset->subsys[cpu_cgrp_id], - struct task_group, css); - p->sched_task_group = autogroup_task_group(p, tg); + if (1) { + struct task_group *tg; + tg = container_of(kargs->cset->subsys[cpu_cgrp_id], + struct task_group, css); + tg = autogroup_task_group(p, tg); + p->sched_task_group = tg; + } #endif rseq_migrate(p); /* @@ -3543,7 +3549,10 @@ void sched_post_fork(struct task_struct *p, struct kernel_clone_args *kargs) if (p->sched_class->task_fork) p->sched_class->task_fork(p); raw_spin_unlock_irqrestore(&p->pi_lock, flags); +} +void sched_post_fork(struct task_struct *p) +{ uclamp_post_fork(p); }