diff --git a/include/linux/sched/sysctl.h b/include/linux/sched/sysctl.h index 2d2cf5daca63..abe17fcf8fd0 100644 --- a/include/linux/sched/sysctl.h +++ b/include/linux/sched/sysctl.h @@ -18,6 +18,7 @@ extern unsigned int sysctl_sched_latency; extern unsigned int sysctl_sched_min_granularity; extern unsigned int sysctl_sched_wakeup_granularity; extern unsigned int sysctl_sched_child_runs_first; +extern unsigned int sysctl_sched_cstate_aware; enum sched_tunable_scaling { SCHED_TUNABLESCALING_NONE, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 34b6bf939335..0523b7e23de1 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -51,6 +51,7 @@ unsigned int sysctl_sched_latency = 6000000ULL; unsigned int normalized_sysctl_sched_latency = 6000000ULL; +unsigned int sysctl_sched_cstate_aware = 1; /* * The initial- and re-scaling of tunables is configurable * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) @@ -6132,33 +6133,87 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, int t static int select_idle_sibling(struct task_struct *p, int prev, int target) { struct sched_domain *sd; - int i; + struct sched_group *sg; + int i = task_cpu(p); + int best_idle = -1; + int best_idle_cstate = -1; + int best_idle_capacity = INT_MAX; - if (idle_cpu(target)) - return target; + if (!sysctl_sched_cstate_aware) { + if (idle_cpu(target)) + return target; + + /* + * If the prevous cpu is cache affine and idle, don't be stupid. + */ + if (i != target && cpus_share_cache(i, target) && idle_cpu(i)) + return i; + + sd = rcu_dereference(per_cpu(sd_llc, target)); + if (!sd) + return target; + + i = select_idle_core(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_cpu(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + + i = select_idle_smt(p, sd, target); + if ((unsigned)i < nr_cpumask_bits) + return i; + } /* - * If the previous cpu is cache affine and idle, don't be stupid. + * Otherwise, iterate the domains and find an elegible idle cpu. */ - if (prev != target && cpus_share_cache(prev, target) && idle_cpu(prev)) - return prev; - sd = rcu_dereference(per_cpu(sd_llc, target)); - if (!sd) - return target; + for_each_lower_domain(sd) { + sg = sd->groups; + do { + if (!cpumask_intersects(sched_group_cpus(sg), + tsk_cpus_allowed(p))) + goto next; - i = select_idle_core(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; - i = select_idle_cpu(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (sysctl_sched_cstate_aware) { + for_each_cpu_and(i, tsk_cpus_allowed(p), sched_group_cpus(sg)) { + struct rq *rq = cpu_rq(i); + int idle_idx = idle_get_state_idx(rq); + unsigned long new_usage = boosted_task_util(p); + unsigned long capacity_orig = capacity_orig_of(i); + if (new_usage > capacity_orig || !idle_cpu(i)) + goto next; - i = select_idle_smt(p, sd, target); - if ((unsigned)i < nr_cpumask_bits) - return i; + if (i == target && new_usage <= capacity_curr_of(target)) + return target; + if (best_idle < 0 || (idle_idx < best_idle_cstate && capacity_orig <= best_idle_capacity)) { + best_idle = i; + best_idle_cstate = idle_idx; + best_idle_capacity = capacity_orig; + } + } + } else { + for_each_cpu(i, sched_group_cpus(sg)) { + if (i == target || !idle_cpu(i)) + goto next; + } + + target = cpumask_first_and(sched_group_cpus(sg), + tsk_cpus_allowed(p)); + goto done; + } +next: + sg = sg->next; + } while (sg != sd->groups); + } + if (best_idle > 0) + target = best_idle; + +done: return target; } diff --git a/kernel/sysctl.c b/kernel/sysctl.c index bcc012aad7bb..3a3567990388 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -307,6 +307,13 @@ static struct ctl_table kern_table[] = { .extra1 = &min_sched_granularity_ns, .extra2 = &max_sched_granularity_ns, }, + { + .procname = "sched_cstate_aware", + .data = &sysctl_sched_cstate_aware, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_dointvec, + }, { .procname = "sched_wakeup_granularity_ns", .data = &sysctl_sched_wakeup_granularity,