From 02bdd918e6247322696b2ddff33431c263f6070e Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman <gregkh@google.com>
Date: Sat, 25 Feb 2023 16:07:41 +0000
Subject: [PATCH] Revert "sched/psi: Stop relying on timer_pending() for
 poll_work rescheduling"

This reverts commit afec25854ca7c49b60b02bf47674630f9d7e0377 which is
commit 710ffe671e014d5ccbcff225130a178b088ef090 upstream.

The reverted commit breaks the kernel ABI by adding the poll_scheduled
member to struct psi_group. It will be brought back in the near future
when an ABI break is allowed.

Bug: 161946584
Cc: Suren Baghdasaryan <surenb@google.com>
Change-Id: If05233e46a6d8baf11e53d4fbdb8ac3fcc5e7d0a
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
---
 include/linux/psi_types.h |  1 -
 kernel/sched/psi.c        | 62 +++++++--------------------------------
 2 files changed, 10 insertions(+), 53 deletions(-)

diff --git a/include/linux/psi_types.h b/include/linux/psi_types.h
index 14a1ebb74e11..6e4372735068 100644
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -177,7 +177,6 @@ struct psi_group {
 	struct timer_list poll_timer;
 	wait_queue_head_t poll_wait;
 	atomic_t poll_wakeup;
-	atomic_t poll_scheduled;
 
 	/* Protects data used by the monitor */
 	struct mutex trigger_lock;
diff --git a/kernel/sched/psi.c b/kernel/sched/psi.c
index e83c321461cf..48fedeee15c5 100644
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -189,7 +189,6 @@ static void group_init(struct psi_group *group)
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
 	/* Init trigger-related members */
-	atomic_set(&group->poll_scheduled, 0);
 	mutex_init(&group->trigger_lock);
 	INIT_LIST_HEAD(&group->triggers);
 	group->poll_min_period = U32_MAX;
@@ -566,17 +565,18 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }
 
-/* Schedule polling if it's not already scheduled or forced. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
-				   bool force)
+/* Schedule polling if it's not already scheduled. */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 {
 	struct task_struct *task;
 
 	/*
-	 * atomic_xchg should be called even when !force to provide a
-	 * full memory barrier (see the comment inside psi_poll_work).
+	 * Do not reschedule if already scheduled.
+	 * Possible race with a timer scheduled after this check but before
+	 * mod_timer below can be tolerated because group->polling_next_update
+	 * will keep updates on schedule.
 	 */
-	if (atomic_xchg(&group->poll_scheduled, 1) && !force)
+	if (timer_pending(&group->poll_timer))
 		return;
 
 	rcu_read_lock();
@@ -588,15 +588,12 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
 	 */
 	if (likely(task))
 		mod_timer(&group->poll_timer, jiffies + delay);
-	else
-		atomic_set(&group->poll_scheduled, 0);
 
 	rcu_read_unlock();
 }
 
 static void psi_poll_work(struct psi_group *group)
 {
-	bool force_reschedule = false;
 	u32 changed_states;
 	u64 now;
 
@@ -604,43 +601,6 @@ static void psi_poll_work(struct psi_group *group)
 
 	now = sched_clock();
 
-	if (now > group->polling_until) {
-		/*
-		 * We are either about to start or might stop polling if no
-		 * state change was recorded. Resetting poll_scheduled leaves
-		 * a small window for psi_group_change to sneak in and schedule
-		 * an immediate poll_work before we get to rescheduling. One
-		 * potential extra wakeup at the end of the polling window
-		 * should be negligible and polling_next_update still keeps
-		 * updates correctly on schedule.
-		 */
-		atomic_set(&group->poll_scheduled, 0);
-		/*
-		 * A task change can race with the poll worker that is supposed to
-		 * report on it. To avoid missing events, ensure ordering between
-		 * poll_scheduled and the task state accesses, such that if the poll
-		 * worker misses the state update, the task change is guaranteed to
-		 * reschedule the poll worker:
-		 *
-		 * poll worker:
-		 *   atomic_set(poll_scheduled, 0)
-		 *   smp_mb()
-		 *   LOAD states
-		 *
-		 * task change:
-		 *   STORE states
-		 *   if atomic_xchg(poll_scheduled, 1) == 0:
-		 *     schedule poll worker
-		 *
-		 * The atomic_xchg() implies a full barrier.
-		 */
-		smp_mb();
-	} else {
-		/* Polling window is not over, keep rescheduling */
-		force_reschedule = true;
-	}
-
-
 	collect_percpu_times(group, PSI_POLL, &changed_states);
 
 	if (changed_states & group->poll_states) {
@@ -666,8 +626,7 @@ static void psi_poll_work(struct psi_group *group)
 		group->polling_next_update = update_triggers(group, now);
 
 	psi_schedule_poll_work(group,
-		nsecs_to_jiffies(group->polling_next_update - now) + 1,
-		force_reschedule);
+		nsecs_to_jiffies(group->polling_next_update - now) + 1);
 
 out:
 	mutex_unlock(&group->trigger_lock);
@@ -828,7 +787,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	write_seqcount_end(&groupc->seq);
 
 	if (state_mask & group->poll_states)
-		psi_schedule_poll_work(group, 1, false);
+		psi_schedule_poll_work(group, 1);
 
 	if (wake_clock && !delayed_work_pending(&group->avgs_work))
 		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
@@ -982,7 +941,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 		write_seqcount_end(&groupc->seq);
 
 		if (group->poll_states & (1 << PSI_IRQ_FULL))
-			psi_schedule_poll_work(group, 1, false);
+			psi_schedule_poll_work(group, 1);
 	} while ((group = group->parent));
 }
 #endif
@@ -1369,7 +1328,6 @@ void psi_trigger_destroy(struct psi_trigger *t)
 		 * can no longer be found through group->poll_task.
 		 */
 		kthread_stop(task_to_destroy);
-		atomic_set(&group->poll_scheduled, 0);
 	}
 	kfree(t);
 }