Revert "sched/psi: Stop relying on timer_pending() for poll_work rescheduling"

This reverts commit afec25854c which is commit 710ffe671e upstream.

The reverted change breaks the kernel ABI, but it will be brought back in the near future when an ABI break is allowed.

Bug: 161946584
Cc: Suren Baghdasaryan <surenb@google.com>
Change-Id: If05233e46a6d8baf11e53d4fbdb8ac3fcc5e7d0a
Signed-off-by: Greg Kroah-Hartman <gregkh@google.com>
2026-06-07 03:15:31 +09:00 · 2023-02-25 16:07:41 +00:00
parent 4b898a5ba4
commit 02bdd918e6
2 changed files with 10 additions and 53 deletions
--- a/include/linux/psi_types.h
+++ b/include/linux/psi_types.h
@@ -177,7 +177,6 @@ struct psi_group {
 	struct timer_list poll_timer;
 	wait_queue_head_t poll_wait;
 	atomic_t poll_wakeup;
-	atomic_t poll_scheduled;

 	/* Protects data used by the monitor */
 	struct mutex trigger_lock;
--- a/kernel/sched/psi.c
+++ b/kernel/sched/psi.c
@@ -189,7 +189,6 @@ static void group_init(struct psi_group *group)
 	INIT_DELAYED_WORK(&group->avgs_work, psi_avgs_work);
 	mutex_init(&group->avgs_lock);
 	/* Init trigger-related members */
-	atomic_set(&group->poll_scheduled, 0);
 	mutex_init(&group->trigger_lock);
 	INIT_LIST_HEAD(&group->triggers);
 	group->poll_min_period = U32_MAX;
@@ -566,17 +565,18 @@ static u64 update_triggers(struct psi_group *group, u64 now)
 	return now + group->poll_min_period;
 }

-/* Schedule polling if it's not already scheduled or forced. */
-static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
-				   bool force)
+/* Schedule polling if it's not already scheduled. */
+static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay)
 {
 	struct task_struct *task;

 	/*
-	 * atomic_xchg should be called even when !force to provide a
-	 * full memory barrier (see the comment inside psi_poll_work).
+	 * Do not reschedule if already scheduled.
+	 * Possible race with a timer scheduled after this check but before
+	 * mod_timer below can be tolerated because group->polling_next_update
+	 * will keep updates on schedule.
 	 */
-	if (atomic_xchg(&group->poll_scheduled, 1) && !force)
+	if (timer_pending(&group->poll_timer))
 		return;

 	rcu_read_lock();
@@ -588,15 +588,12 @@ static void psi_schedule_poll_work(struct psi_group *group, unsigned long delay,
 	 */
 	if (likely(task))
 		mod_timer(&group->poll_timer, jiffies + delay);
-	else
-		atomic_set(&group->poll_scheduled, 0);

 	rcu_read_unlock();
 }

 static void psi_poll_work(struct psi_group *group)
 {
-	bool force_reschedule = false;
 	u32 changed_states;
 	u64 now;

@@ -604,43 +601,6 @@ static void psi_poll_work(struct psi_group *group)

 	now = sched_clock();

-	if (now > group->polling_until) {
-		/*
-		 * We are either about to start or might stop polling if no
-		 * state change was recorded. Resetting poll_scheduled leaves
-		 * a small window for psi_group_change to sneak in and schedule
-		 * an immediate poll_work before we get to rescheduling. One
-		 * potential extra wakeup at the end of the polling window
-		 * should be negligible and polling_next_update still keeps
-		 * updates correctly on schedule.
-		 */
-		atomic_set(&group->poll_scheduled, 0);
-		/*
-		 * A task change can race with the poll worker that is supposed to
-		 * report on it. To avoid missing events, ensure ordering between
-		 * poll_scheduled and the task state accesses, such that if the poll
-		 * worker misses the state update, the task change is guaranteed to
-		 * reschedule the poll worker:
-		 *
-		 * poll worker:
-		 *   atomic_set(poll_scheduled, 0)
-		 *   smp_mb()
-		 *   LOAD states
-		 *
-		 * task change:
-		 *   STORE states
-		 *   if atomic_xchg(poll_scheduled, 1) == 0:
-		 *     schedule poll worker
-		 *
-		 * The atomic_xchg() implies a full barrier.
-		 */
-		smp_mb();
-	} else {
-		/* Polling window is not over, keep rescheduling */
-		force_reschedule = true;
-	}
-
-
 	collect_percpu_times(group, PSI_POLL, &changed_states);

 	if (changed_states & group->poll_states) {
@@ -666,8 +626,7 @@ static void psi_poll_work(struct psi_group *group)
 		group->polling_next_update = update_triggers(group, now);

 	psi_schedule_poll_work(group,
-		nsecs_to_jiffies(group->polling_next_update - now) + 1,
-		force_reschedule);
+		nsecs_to_jiffies(group->polling_next_update - now) + 1);

 out:
 	mutex_unlock(&group->trigger_lock);
@@ -828,7 +787,7 @@ static void psi_group_change(struct psi_group *group, int cpu,
 	write_seqcount_end(&groupc->seq);

 	if (state_mask & group->poll_states)
-		psi_schedule_poll_work(group, 1, false);
+		psi_schedule_poll_work(group, 1);

 	if (wake_clock && !delayed_work_pending(&group->avgs_work))
 		schedule_delayed_work(&group->avgs_work, PSI_FREQ);
@@ -982,7 +941,7 @@ void psi_account_irqtime(struct task_struct *task, u32 delta)
 		write_seqcount_end(&groupc->seq);

 		if (group->poll_states & (1 << PSI_IRQ_FULL))
-			psi_schedule_poll_work(group, 1, false);
+			psi_schedule_poll_work(group, 1);
 	} while ((group = group->parent));
 }
 #endif
@@ -1369,7 +1328,6 @@ void psi_trigger_destroy(struct psi_trigger *t)
 		 * can no longer be found through group->poll_task.
 		 */
 		kthread_stop(task_to_destroy);
-		atomic_set(&group->poll_scheduled, 0);
 	}
 	kfree(t);
 }