diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index a25e13accaf1..315078c28816 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5429,16 +5429,66 @@ static inline bool energy_aware(void)
 	return sched_feat(ENERGY_AWARE);
 }
 
+/*
+ * CPU candidates.
+ *
+ * These are labels to reference CPU candidates for an energy_diff.
+ * Currently three candidates are supported: the task's previous CPU, a main
+ * alternative candidate CPU and a backup alternative candidate CPU.
+ * More advanced/aggressive EAS selection policies can consider more
+ * candidates.
+ */
+#define EAS_CPU_PRV	0
+#define EAS_CPU_NXT	1
+#define EAS_CPU_BKP	2
+#define EAS_CPU_CNT	3
+
+/*
+ * struct energy_env - supports the computation of the estimated energy impact
+ * in moving a "task"'s "util_delta" between different CPU candidates.
+ */
 struct energy_env {
+	/* Utilization to move */
+	struct task_struct	*p;
+	int			util_delta;
+
+	/* Mask of CPUs candidates to evaluate */
+	cpumask_t		cpus_mask;
+
+	/* CPU candidates to evaluate */
+	struct {
+
+		/* CPU ID, must be in cpus_mask */
+		int	cpu_id;
+
+		/*
+		 * Index (into sched_group_energy::cap_states) of the OPP the
+		 * CPU needs to run at if the task is placed on it.
+		 * This includes both the active and blocked load, due to
+		 * other tasks on this CPU, as well as the task's own
+		 * utilization.
+		 */
+		int	cap_idx;
+		int	cap;
+
+		/* Estimated system energy */
+		unsigned int energy;
+
+		/* Estimated energy variation wrt EAS_CPU_PRV */
+		int	nrg_delta;
+
+	} cpu[EAS_CPU_CNT];
+
+	/*
+	 * Index (into energy_env::cpu) of the most energy efficient CPU for
+	 * the specified energy_env::task
+	 */
+	int			next_idx;
+
+	/* Support data */
 	struct sched_group	*sg_top;
 	struct sched_group	*sg_cap;
-	int			cap_idx;
-	int			util_delta;
-	int			src_cpu;
-	int			dst_cpu;
-	int			trg_cpu;
-	int			energy;
-	struct task_struct	*p;
+	struct sched_group	*sg;
 };
 
 static int cpu_util_wake(int cpu, struct task_struct *p);
@@ -5466,7 +5516,7 @@ static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity)
 	return (util << SCHED_CAPACITY_SHIFT)/capacity;
 }
 
-static unsigned long group_max_util(struct energy_env *eenv)
+static unsigned long group_max_util(struct energy_env *eenv, int cpu_idx)
 {
 	unsigned long max_util = 0;
 	unsigned long util;
@@ -5480,7 +5530,7 @@ static unsigned long group_max_util(struct energy_env *eenv)
 		 * then we should add the (estimated) utilization of the task
 		 * assuming we will wake it up on that CPU.
 		 */
-		if (unlikely(cpu == eenv->trg_cpu))
+		if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
 			util += eenv->util_delta;
 
 		max_util = max(max_util, util);
@@ -5501,13 +5551,13 @@ static unsigned long group_max_util(struct energy_env *eenv)
  * estimate (more busy).
  */
 static unsigned
-long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
+long group_norm_util(struct energy_env *eenv, int cpu_idx)
 {
-	unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap;
+	unsigned long capacity = eenv->cpu[cpu_idx].cap;
 	unsigned long util, util_sum = 0;
 	int cpu;
 
-	for_each_cpu(cpu, sched_group_cpus(sg)) {
+	for_each_cpu(cpu, sched_group_cpus(eenv->sg)) {
 		util = cpu_util_wake(cpu, eenv->p);
 
 		/*
@@ -5515,7 +5565,7 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
 		 * then we should add the (estimated) utilization of the task
 		 * assuming we will wake it up on that CPU.
 		 */
-		if (unlikely(cpu == eenv->trg_cpu))
+		if (unlikely(cpu == eenv->cpu[cpu_idx].cpu_id))
 			util += eenv->util_delta;
 
 		util_sum += __cpu_norm_util(util, capacity);
@@ -5524,27 +5574,31 @@ long group_norm_util(struct energy_env *eenv, struct sched_group *sg)
 	return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE);
 }
 
-static int find_new_capacity(struct energy_env *eenv,
-	const struct sched_group_energy * const sge)
+static int find_new_capacity(struct energy_env *eenv, int cpu_idx)
 {
+	const struct sched_group_energy *sge = eenv->sg->sge;
 	int idx, max_idx = sge->nr_cap_states - 1;
-	unsigned long util = group_max_util(eenv);
+	unsigned long util = group_max_util(eenv, cpu_idx);
 
 	/* default is max_cap if we don't find a match */
-	eenv->cap_idx = max_idx;
+	eenv->cpu[cpu_idx].cap_idx = max_idx;
+	eenv->cpu[cpu_idx].cap = sge->cap_states[max_idx].cap;
 
 	for (idx = 0; idx < sge->nr_cap_states; idx++) {
 		if (sge->cap_states[idx].cap >= util) {
-			eenv->cap_idx = idx;
+			/* Keep track of SG's capacity */
+			eenv->cpu[cpu_idx].cap_idx = idx;
+			eenv->cpu[cpu_idx].cap = sge->cap_states[idx].cap;
 			break;
 		}
 	}
 
-	return eenv->cap_idx;
+	return eenv->cpu[cpu_idx].cap_idx;
 }
 
-static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
+static int group_idle_state(struct energy_env *eenv, int cpu_idx)
 {
+	struct sched_group *sg = eenv->sg;
 	int i, state = INT_MAX;
 	int src_in_grp, dst_in_grp;
 	long grp_util = 0;
@@ -5556,8 +5610,10 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
 	/* Take non-cpuidle idling into account (active idle/arch_cpu_idle()) */
 	state++;
 
-	src_in_grp = cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg));
-	dst_in_grp = cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg));
+	src_in_grp = cpumask_test_cpu(eenv->cpu[EAS_CPU_PRV].cpu_id,
+				      sched_group_cpus(sg));
+	dst_in_grp = cpumask_test_cpu(eenv->cpu[cpu_idx].cpu_id,
+				      sched_group_cpus(sg));
 	if (src_in_grp == dst_in_grp) {
 		/* both CPUs under consideration are in the same group or not in
 		 * either group, migration should leave idle state the same.
@@ -5571,7 +5627,7 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg)
 	 */
 	for_each_cpu(i, sched_group_cpus(sg)) {
 		grp_util += cpu_util_wake(i, eenv->p);
-		if (unlikely(i == eenv->trg_cpu))
+		if (unlikely(i == eenv->cpu[cpu_idx].cpu_id))
 			grp_util += eenv->util_delta;
 	}
 
@@ -5607,19 +5663,62 @@ end:
 }
 
 /*
- * sched_group_energy(): Computes the absolute energy consumption of cpus
- * belonging to the sched_group including shared resources shared only by
- * members of the group. Iterates over all cpus in the hierarchy below the
- * sched_group starting from the bottom working it's way up before going to
- * the next cpu until all cpus are covered at all levels. The current
- * implementation is likely to gather the same util statistics multiple times.
- * This can probably be done in a faster but more complex way.
- * Note: sched_group_energy() may fail when racing with sched_domain updates.
+ * calc_sg_energy: compute energy for the eenv's SG (i.e. eenv->sg).
+ *
+ * This works in iterations to compute the SG's energy for each CPU
+ * candidate defined by the energy_env's cpu array.
  */
-static int sched_group_energy(struct energy_env *eenv)
+static void calc_sg_energy(struct energy_env *eenv)
+{
+	struct sched_group *sg = eenv->sg;
+	int busy_energy, idle_energy;
+	unsigned int busy_power;
+	unsigned int idle_power;
+	unsigned long sg_util;
+	int cap_idx, idle_idx;
+	int total_energy = 0;
+	int cpu_idx;
+
+	for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+
+
+		if (eenv->cpu[cpu_idx].cpu_id == -1)
+			continue;
+		/* Compute ACTIVE energy */
+		cap_idx = find_new_capacity(eenv, cpu_idx);
+		busy_power = sg->sge->cap_states[cap_idx].power;
+		/*
+		 * in order to calculate cpu_norm_util, we need to know which
+		 * capacity level the group will be at, so calculate that first
+		 */
+		sg_util = group_norm_util(eenv, cpu_idx);
+
+		busy_energy   = sg_util * busy_power;
+		busy_energy >>= SCHED_CAPACITY_SHIFT;
+
+		/* Compute IDLE energy */
+		idle_idx = group_idle_state(eenv, cpu_idx);
+		idle_power = sg->sge->idle_states[idle_idx].power;
+
+		idle_energy   = SCHED_CAPACITY_SCALE - sg_util;
+		idle_energy  *= idle_power;
+		idle_energy >>= SCHED_CAPACITY_SHIFT;
+
+		total_energy = busy_energy + idle_energy;
+		eenv->cpu[cpu_idx].energy += total_energy;
+	}
+}
+
+/*
+ * compute_energy() computes the absolute variation in energy consumption by
+ * moving eenv.util_delta from EAS_CPU_PRV to EAS_CPU_NXT.
+ *
+ * NOTE: compute_energy() may fail when racing with sched_domain updates, in
+ *       which case we abort by returning -EINVAL.
+ */
+static int compute_energy(struct energy_env *eenv)
 {
 	struct cpumask visit_cpus;
-	u64 total_energy = 0;
 
 	WARN_ON(!eenv->sg_top->sge);
 
@@ -5635,7 +5734,6 @@ static int sched_group_energy(struct energy_env *eenv)
 		 * sched_group?
 		 */
 		sd = rcu_dereference(per_cpu(sd_scs, cpu));
-
 		if (sd && sd->parent)
 			sg_shared_cap = sd->parent->groups;
 
@@ -5647,25 +5745,18 @@ static int sched_group_energy(struct energy_env *eenv)
 				break;
 
 			do {
-				unsigned long group_util;
-				int sg_busy_energy, sg_idle_energy;
-				int cap_idx, idle_idx;
-
+				eenv->sg_cap = sg;
 				if (sg_shared_cap && sg_shared_cap->group_weight >= sg->group_weight)
 					eenv->sg_cap = sg_shared_cap;
-				else
-					eenv->sg_cap = sg;
 
-				cap_idx = find_new_capacity(eenv, sg->sge);
-				idle_idx = group_idle_state(eenv, sg);
-				group_util = group_norm_util(eenv, sg);
-
-				sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power);
-				sg_idle_energy = ((SCHED_CAPACITY_SCALE-group_util)
-								* sg->sge->idle_states[idle_idx].power);
-
-				total_energy += sg_busy_energy + sg_idle_energy;
+				/*
+				 * Compute the energy for all the candidate
+				 * CPUs in the current visited SG.
+				 */
+				eenv->sg = sg;
+				calc_sg_energy(eenv);
 
+				/* remove CPUs we have just visited */
 				if (!sd->child)
 					cpumask_xor(&visit_cpus, &visit_cpus, sched_group_cpus(sg));
 
@@ -5687,7 +5778,6 @@ next_cpu:
 		continue;
 	}
 
-	eenv->energy += (total_energy >> SCHED_CAPACITY_SHIFT);
 	return 0;
 }
 
@@ -5696,62 +5786,94 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu)
 	return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg));
 }
 
-static inline unsigned long task_util(struct task_struct *p);
-
 /*
- * energy_diff(): Estimate the energy impact of changing the utilization
- * distribution. eenv specifies the change: utilisation amount, source, and
- * destination cpu. Source or destination cpu may be -1 in which case the
- * utilization is removed from or added to the system (e.g. task wake-up). If
- * both are specified, the utilization is migrated.
+ * select_energy_cpu_idx(): estimate the energy impact of changing the
+ * utilization distribution.
+ *
+ * The eenv parameter specifies the changes: utilisation amount and a pair of
+ * possible CPU candidates (the previous CPU and a different target CPU).
+ *
+ * This function returns the index of a CPU candidate specified by the
+ * energy_env which corresponds to the first CPU saving energy.
+ * Thus, 0 (EAS_CPU_PRV) means that none of the CPU candidates is more energy
+ * efficient than running on prev_cpu. This is also the value returned in case
+ * of abort due to error conditions during the computations.
+ * A value greater than zero means that the first energy-efficient CPU is the
+ * one represented by eenv->cpu[eenv->next_idx].cpu_id.
  */
-static inline int energy_diff(struct energy_env *eenv)
+static inline int select_energy_cpu_idx(struct energy_env *eenv)
 {
 	struct sched_domain *sd;
 	struct sched_group *sg;
-	int energy_diff = 0;
 	int sd_cpu = -1;
+	int cpu_idx;
 	int margin;
 
-	struct energy_env eenv_before = {
-		.util_delta	= task_util(eenv->p),
-		.src_cpu	= eenv->src_cpu,
-		.dst_cpu	= eenv->dst_cpu,
-		.trg_cpu	= eenv->src_cpu,
-		.energy		= 0,
-		.p		= eenv->p,
-	};
-
-	if (eenv->src_cpu == eenv->dst_cpu)
-		return 0;
-
-	sd_cpu = (eenv->src_cpu != -1) ? eenv->src_cpu : eenv->dst_cpu;
+	sd_cpu = eenv->cpu[EAS_CPU_PRV].cpu_id;
 	sd = rcu_dereference(per_cpu(sd_ea, sd_cpu));
-
 	if (!sd)
-		return 0; /* Error */
+		return EAS_CPU_PRV;
+
+	cpumask_clear(&eenv->cpus_mask);
+	for (cpu_idx = EAS_CPU_PRV; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+		int cpu = eenv->cpu[cpu_idx].cpu_id;
+
+		if (cpu < 0)
+			continue;
+		cpumask_set_cpu(cpu, &eenv->cpus_mask);
+	}
 
 	sg = sd->groups;
-
 	do {
-		if (cpu_in_sg(sg, eenv->src_cpu) || cpu_in_sg(sg, eenv->dst_cpu)) {
-			eenv_before.sg_top = eenv->sg_top = sg;
-			if (sched_group_energy(&eenv_before))
-				return 0; /* Invalid result abort */
-			if (sched_group_energy(eenv))
-				return 0; /* Invalid result abort */
-		}
+		/* Skip SGs which do not contain a candidate CPU */
+		if (!cpumask_intersects(&eenv->cpus_mask, sched_group_cpus(sg)))
+			continue;
+
+		eenv->sg_top = sg;
+		if (compute_energy(eenv) == -EINVAL)
+			return EAS_CPU_PRV;
+
 	} while (sg = sg->next, sg != sd->groups);
 
-	energy_diff = eenv->energy - eenv_before.energy;
 	/*
-	 * Dead-zone margin preventing too many migrations.
+	 * Compute the dead-zone margin used to prevent too many task
+	 * migrations with negligible energy savings.
+	 * An energy saving is considered meaningful if it reduces the energy
+	 * consumption of EAS_CPU_PRV CPU candidate by at least ~1.56%
 	 */
-	margin = eenv->energy >> 6; /* ~1.56% */
-	if (abs(energy_diff) < margin)
-		energy_diff = 0;
+	margin = eenv->cpu[EAS_CPU_PRV].energy >> 6;
 
-	return energy_diff;
+	/*
+	 * By default the EAS_CPU_PRV CPU is considered the most energy
+	 * efficient, with a 0 energy variation.
+	 */
+	eenv->next_idx = EAS_CPU_PRV;
+	eenv->cpu[EAS_CPU_PRV].nrg_delta = 0;
+
+	/*
+	 * Compare the other CPU candidates to find a CPU which can be
+	 * more energy efficient than EAS_CPU_PRV
+	 */
+	for (cpu_idx = EAS_CPU_NXT; cpu_idx < EAS_CPU_CNT; ++cpu_idx) {
+		/* Skip not valid scheduled candidates */
+		if (eenv->cpu[cpu_idx].cpu_id < 0)
+			continue;
+		/* Compute energy delta wrt EAS_CPU_PRV */
+		eenv->cpu[cpu_idx].nrg_delta =
+			eenv->cpu[cpu_idx].energy -
+			eenv->cpu[EAS_CPU_PRV].energy;
+		/* filter energy variations within the dead-zone margin */
+		if (abs(eenv->cpu[cpu_idx].nrg_delta) < margin)
+			eenv->cpu[cpu_idx].nrg_delta = 0;
+		/* update the schedule candidate with min(nrg_delta) */
+		if (eenv->cpu[cpu_idx].nrg_delta <
+		    eenv->cpu[eenv->next_idx].nrg_delta) {
+			eenv->next_idx = cpu_idx;
+			break;
+		}
+	}
+
+	return eenv->next_idx;
 }
 
 /*
@@ -6910,11 +7032,20 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
 	if (next_cpu != prev_cpu) {
 		int delta = 0;
 		struct energy_env eenv = {
-			.util_delta	= task_util(p),
-			.src_cpu	= prev_cpu,
-			.dst_cpu	= next_cpu,
-			.trg_cpu	= next_cpu,
 			.p		= p,
+			.util_delta	= task_util(p),
+			/* Task's previous CPU candidate */
+			.cpu[EAS_CPU_PRV] = {
+				.cpu_id = prev_cpu,
+			},
+			/* Main alternative CPU candidate */
+			.cpu[EAS_CPU_NXT] = {
+				.cpu_id = next_cpu,
+			},
+			/* Backup alternative CPU candidate */
+			.cpu[EAS_CPU_BKP] = {
+				.cpu_id = backup_cpu,
+			},
 		};
 
 
@@ -6931,24 +7062,17 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync
 			goto unlock;
 		}
 
-		target_cpu = next_cpu;
-		if (energy_diff(&eenv) >= 0) {
-			/* No energy saving for target_cpu, try backup */
-			target_cpu = backup_cpu;
-			eenv.dst_cpu = backup_cpu;
-			eenv.trg_cpu = backup_cpu;
-			if (backup_cpu < 0 ||
-			    backup_cpu == prev_cpu ||
-			    energy_diff(&eenv) >= 0) {
-				schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
-				schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
-				target_cpu = prev_cpu;
-				goto unlock;
-			}
+		/* Check if EAS_CPU_NXT is a more energy efficient CPU */
+		if (select_energy_cpu_idx(&eenv) != EAS_CPU_PRV) {
+			schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
+			schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
+			target_cpu = eenv.cpu[eenv.next_idx].cpu_id;
+			goto unlock;
 		}
 
-		schedstat_inc(p->se.statistics.nr_wakeups_secb_nrg_sav);
-		schedstat_inc(this_rq()->eas_stats.secb_nrg_sav);
+		schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav);
+		schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav);
+		target_cpu = prev_cpu;
 		goto unlock;
 	}
 