diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c7a317543b62..b1939abf23b4 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2618,6 +2618,7 @@ void wake_up_new_task(struct task_struct *p) __set_task_cpu(p, select_task_rq(p, task_cpu(p), SD_BALANCE_FORK, 0)); #endif rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); post_init_entity_util_avg(&p->se); walt_mark_task_starting(p); @@ -3175,7 +3176,9 @@ static void sched_freq_tick_pelt(int cpu) * utilization and to harm its performance the least, request * a jump to a higher OPP as soon as the margin of free capacity * is impacted (specified by capacity_margin). + * Remember CPU utilization in sched_capacity_reqs should be normalised. */ + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, cpu_utilization); } @@ -3202,7 +3205,9 @@ static void sched_freq_tick_walt(int cpu) * It is likely that the load is growing so we * keep the added margin in our request as an * extra boost. + * Remember CPU utilization in sched_capacity_reqs should be normalised. */ + cpu_utilization = cpu_utilization * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, cpu_utilization); } @@ -3819,6 +3824,7 @@ void rt_mutex_setprio(struct task_struct *p, int prio) BUG_ON(prio > MAX_PRIO); rq = __task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Idle task boosting is a nono in general. There is one @@ -3915,6 +3921,8 @@ void set_user_nice(struct task_struct *p, long nice) * the task might be in the middle of scheduling on another CPU. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + /* * The RT priorities are set via sched_setscheduler(), but we still * allow the 'normal' nice value to be set - but as expected @@ -4347,6 +4355,7 @@ recheck: * runqueue lock must be held. */ rq = task_rq_lock(p, &rf); + update_rq_clock(rq); /* * Changing the policy of the stop threads its a very bad idea @@ -8697,6 +8706,7 @@ static void cpu_cgroup_fork(struct task_struct *task) rq = task_rq_lock(task, &rf); + update_rq_clock(rq); sched_change_group(task, TASK_SET_GROUP); task_rq_unlock(rq, task, &rf); diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index 1b19f2643f48..b0ef17317846 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -202,7 +202,7 @@ static void update_fdomain_capacity_request(int cpu) } /* Convert the new maximum capacity request into a cpu frequency */ - freq_new = capacity * policy->max >> SCHED_CAPACITY_SHIFT; + freq_new = capacity * policy->cpuinfo.max_freq >> SCHED_CAPACITY_SHIFT; index_new = cpufreq_frequency_table_target(policy, freq_new, CPUFREQ_RELATION_L); freq_new = policy->freq_table[index_new].frequency; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 550fa4cecff8..78eef4f12be9 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4803,7 +4803,7 @@ static void update_capacity_of(int cpu) if (!sched_freq()) return; - /* Convert scale-invariant capacity to cpu. */ + /* Normalize scale-invariant capacity to cpu. */ req_cap = boosted_cpu_util(cpu); req_cap = req_cap * SCHED_CAPACITY_SCALE / capacity_orig_of(cpu); set_cfs_cpu_capacity(cpu, true, req_cap); @@ -4996,7 +4996,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) if (rq->cfs.nr_running) update_capacity_of(cpu_of(rq)); else if (sched_freq()) - set_cfs_cpu_capacity(cpu_of(rq), false, 0); + set_cfs_cpu_capacity(cpu_of(rq), false, 0); /* no normalization required for 0 */ } } @@ -5481,6 +5481,7 @@ struct energy_env { int util_delta; int src_cpu; int dst_cpu; + int trg_cpu; int energy; int payoff; struct task_struct *task; @@ -5497,11 +5498,14 @@ struct energy_env { } cap; }; +static int cpu_util_wake(int cpu, struct task_struct *p); + /* * __cpu_norm_util() returns the cpu util relative to a specific capacity, - * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE] which is useful for - * energy calculations. Using the scale-invariant util returned by - * cpu_util() and approximating scale-invariant util by: + * i.e. it's busy ratio, in the range [0..SCHED_LOAD_SCALE], which is useful for + * energy calculations. + * + * Since util is a scale-invariant utilization defined as: * * util ~ (curr_freq/max_freq)*1024 * capacity_orig/1024 * running_time/time * @@ -5511,34 +5515,32 @@ struct energy_env { * * norm_util = running_time/time ~ util/capacity */ -static unsigned long __cpu_norm_util(int cpu, unsigned long capacity, int delta) +static unsigned long __cpu_norm_util(unsigned long util, unsigned long capacity) { - int util = __cpu_util(cpu, delta); - if (util >= capacity) return SCHED_CAPACITY_SCALE; return (util << SCHED_CAPACITY_SHIFT)/capacity; } -static int calc_util_delta(struct energy_env *eenv, int cpu) +static unsigned long group_max_util(struct energy_env *eenv) { - if (cpu == eenv->src_cpu) - return -eenv->util_delta; - if (cpu == eenv->dst_cpu) - return eenv->util_delta; - return 0; -} - -static -unsigned long group_max_util(struct energy_env *eenv) -{ - int i, delta; unsigned long max_util = 0; + unsigned long util; + int cpu; - for_each_cpu(i, sched_group_cpus(eenv->sg_cap)) { - delta = calc_util_delta(eenv, i); - max_util = max(max_util, __cpu_util(i, delta)); + for_each_cpu(cpu, sched_group_cpus(eenv->sg_cap)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; + + max_util = max(max_util, util); } return max_util; @@ -5546,44 +5548,56 @@ unsigned long group_max_util(struct energy_env *eenv) /* * group_norm_util() returns the approximated group util relative to it's - * current capacity (busy ratio) in the range [0..SCHED_LOAD_SCALE] for use in - * energy calculations. Since task executions may or may not overlap in time in - * the group the true normalized util is between max(cpu_norm_util(i)) and - * sum(cpu_norm_util(i)) when iterating over all cpus in the group, i. The - * latter is used as the estimate as it leads to a more pessimistic energy + * current capacity (busy ratio), in the range [0..SCHED_LOAD_SCALE], for use + * in energy calculations. + * + * Since task executions may or may not overlap in time in the group the true + * normalized util is between MAX(cpu_norm_util(i)) and SUM(cpu_norm_util(i)) + * when iterating over all CPUs in the group. + * The latter estimate is used as it leads to a more pessimistic energy * estimate (more busy). */ static unsigned long group_norm_util(struct energy_env *eenv, struct sched_group *sg) { - int i, delta; - unsigned long util_sum = 0; unsigned long capacity = sg->sge->cap_states[eenv->cap_idx].cap; + unsigned long util, util_sum = 0; + int cpu; - for_each_cpu(i, sched_group_cpus(sg)) { - delta = calc_util_delta(eenv, i); - util_sum += __cpu_norm_util(i, capacity, delta); + for_each_cpu(cpu, sched_group_cpus(sg)) { + util = cpu_util_wake(cpu, eenv->task); + + /* + * If we are looking at the target CPU specified by the eenv, + * then we should add the (estimated) utilization of the task + * assuming we will wake it up on that CPU. + */ + if (unlikely(cpu == eenv->trg_cpu)) + util += eenv->util_delta; + + util_sum += __cpu_norm_util(util, capacity); } - if (util_sum > SCHED_CAPACITY_SCALE) - return SCHED_CAPACITY_SCALE; - return util_sum; + return min_t(unsigned long, util_sum, SCHED_CAPACITY_SCALE); } static int find_new_capacity(struct energy_env *eenv, const struct sched_group_energy * const sge) { - int idx; + int idx, max_idx = sge->nr_cap_states - 1; unsigned long util = group_max_util(eenv); + /* default is max_cap if we don't find a match */ + eenv->cap_idx = max_idx; + for (idx = 0; idx < sge->nr_cap_states; idx++) { - if (sge->cap_states[idx].cap >= util) + if (sge->cap_states[idx].cap >= util) { + eenv->cap_idx = idx; break; + } } - eenv->cap_idx = idx; - - return idx; + return eenv->cap_idx; } static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) @@ -5706,13 +5720,13 @@ static int sched_group_energy(struct energy_env *eenv) if (sg->group_weight == 1) { /* Remove capacity of src CPU (before task move) */ - if (eenv->util_delta == 0 && + if (eenv->trg_cpu == eenv->src_cpu && cpumask_test_cpu(eenv->src_cpu, sched_group_cpus(sg))) { eenv->cap.before = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta -= eenv->cap.before; } /* Add capacity of dst CPU (after task move) */ - if (eenv->util_delta != 0 && + if (eenv->trg_cpu == eenv->dst_cpu && cpumask_test_cpu(eenv->dst_cpu, sched_group_cpus(sg))) { eenv->cap.after = sg->sge->cap_states[cap_idx].cap; eenv->cap.delta += eenv->cap.after; @@ -5760,6 +5774,8 @@ static inline bool cpu_in_sg(struct sched_group *sg, int cpu) return cpu != -1 && cpumask_test_cpu(cpu, sched_group_cpus(sg)); } +static inline unsigned long task_util(struct task_struct *p); + /* * energy_diff(): Estimate the energy impact of changing the utilization * distribution. eenv specifies the change: utilisation amount, source, and @@ -5775,11 +5791,13 @@ static inline int __energy_diff(struct energy_env *eenv) int diff, margin; struct energy_env eenv_before = { - .util_delta = 0, + .util_delta = task_util(eenv->task), .src_cpu = eenv->src_cpu, .dst_cpu = eenv->dst_cpu, + .trg_cpu = eenv->src_cpu, .nrg = { 0, 0, 0, 0}, .cap = { 0, 0, 0 }, + .task = eenv->task, }; if (eenv->src_cpu == eenv->dst_cpu) @@ -5887,8 +5905,14 @@ energy_diff(struct energy_env *eenv) __energy_diff(eenv); /* Return energy diff when boost margin is 0 */ - if (boost == 0) + if (boost == 0) { + trace_sched_energy_diff(eenv->task, + eenv->src_cpu, eenv->dst_cpu, eenv->util_delta, + eenv->nrg.before, eenv->nrg.after, eenv->nrg.diff, + eenv->cap.before, eenv->cap.after, eenv->cap.delta, + 0, -eenv->nrg.diff); return eenv->nrg.diff; + } /* Compute normalized energy diff */ nrg_delta = normalize_energy(eenv->nrg.diff); @@ -6151,8 +6175,6 @@ boosted_task_util(struct task_struct *task) return util + margin; } -static int cpu_util_wake(int cpu, struct task_struct *p); - static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) { return capacity_orig_of(cpu) - cpu_util_wake(cpu, p); @@ -6161,6 +6183,8 @@ static unsigned long capacity_spare_wake(int cpu, struct task_struct *p) /* * find_idlest_group finds and returns the least busy CPU group within the * domain. + * + * Assumes p is allowed on at least one CPU in sd. */ static struct sched_group * find_idlest_group(struct sched_domain *sd, struct task_struct *p, @@ -6168,16 +6192,21 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, { struct sched_group *idlest = NULL, *group = sd->groups; struct sched_group *most_spare_sg = NULL; - unsigned long min_load = ULONG_MAX, this_load = 0; + unsigned long min_runnable_load = ULONG_MAX; + unsigned long this_runnable_load = ULONG_MAX; + unsigned long min_avg_load = ULONG_MAX, this_avg_load = ULONG_MAX; unsigned long most_spare = 0, this_spare = 0; int load_idx = sd->forkexec_idx; - int imbalance = 100 + (sd->imbalance_pct-100)/2; + int imbalance_scale = 100 + (sd->imbalance_pct-100)/2; + unsigned long imbalance = scale_load_down(NICE_0_LOAD) * + (sd->imbalance_pct-100) / 100; if (sd_flag & SD_BALANCE_WAKE) load_idx = sd->wake_idx; do { - unsigned long load, avg_load, spare_cap, max_spare_cap; + unsigned long load, avg_load, runnable_load; + unsigned long spare_cap, max_spare_cap; int local_group; int i; @@ -6194,6 +6223,7 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * the group containing the CPU with most spare capacity. */ avg_load = 0; + runnable_load = 0; max_spare_cap = 0; for_each_cpu(i, sched_group_cpus(group)) { @@ -6203,7 +6233,9 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, else load = target_load(i, load_idx); - avg_load += load; + runnable_load += load; + + avg_load += cfs_rq_load_avg(&cpu_rq(i)->cfs); spare_cap = capacity_spare_wake(i, p); @@ -6212,14 +6244,32 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, } /* Adjust by relative CPU capacity of the group */ - avg_load = (avg_load * SCHED_CAPACITY_SCALE) / group->sgc->capacity; + avg_load = (avg_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; + runnable_load = (runnable_load * SCHED_CAPACITY_SCALE) / + group->sgc->capacity; if (local_group) { - this_load = avg_load; + this_runnable_load = runnable_load; + this_avg_load = avg_load; this_spare = max_spare_cap; } else { - if (avg_load < min_load) { - min_load = avg_load; + if (min_runnable_load > (runnable_load + imbalance)) { + /* + * The runnable load is significantly smaller + * so we can pick this new cpu + */ + min_runnable_load = runnable_load; + min_avg_load = avg_load; + idlest = group; + } else if ((runnable_load < (min_runnable_load + imbalance)) && + (100*min_avg_load > imbalance_scale*avg_load)) { + /* + * The runnable loads are close so we take + * into account blocked load through avg_load + * which is blocked + runnable load + */ + min_avg_load = avg_load; idlest = group; } @@ -6236,23 +6286,32 @@ find_idlest_group(struct sched_domain *sd, struct task_struct *p, * utilized systems if we require spare_capacity > task_util(p), * so we allow for some task stuffing by using * spare_capacity > task_util(p)/2. + * spare capacity can't be used for fork because the utilization has + * not been set yet as it need to get a rq to init the utilization */ + if (sd_flag & SD_BALANCE_FORK) + goto skip_spare; + if (this_spare > task_util(p) / 2 && - imbalance*this_spare > 100*most_spare) + imbalance_scale*this_spare > 100*most_spare) return NULL; else if (most_spare > task_util(p) / 2) return most_spare_sg; - if (!idlest || 100*this_load < imbalance*min_load) +skip_spare: + if (!idlest || + (min_runnable_load > (this_runnable_load + imbalance)) || + ((this_runnable_load < (min_runnable_load + imbalance)) && + (100*this_avg_load < imbalance_scale*min_avg_load))) return NULL; return idlest; } /* - * find_idlest_cpu - find the idlest cpu among the cpus in group. + * find_idlest_group_cpu - find the idlest cpu among the cpus in group. */ static int -find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) +find_idlest_group_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) { unsigned long load, min_load = ULONG_MAX; unsigned int min_exit_latency = UINT_MAX; @@ -6301,6 +6360,68 @@ find_idlest_cpu(struct sched_group *group, struct task_struct *p, int this_cpu) return shallowest_idle_cpu != -1 ? shallowest_idle_cpu : least_loaded_cpu; } +static inline int find_idlest_cpu(struct sched_domain *sd, struct task_struct *p, + int cpu, int prev_cpu, int sd_flag) +{ + int wu = sd_flag & SD_BALANCE_WAKE; + int cas_cpu = -1; + int new_cpu = cpu; + + if (wu) { + schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts); + schedstat_inc(this_rq()->eas_stats.cas_attempts); + } + + if (!cpumask_intersects(sched_domain_span(sd), &p->cpus_allowed)) + return prev_cpu; + + while (sd) { + struct sched_group *group; + struct sched_domain *tmp; + int weight; + + if (wu) + schedstat_inc(sd->eas_stats.cas_attempts); + + if (!(sd->flags & sd_flag)) { + sd = sd->child; + continue; + } + + group = find_idlest_group(sd, p, cpu, sd_flag); + if (!group) { + sd = sd->child; + continue; + } + + new_cpu = find_idlest_group_cpu(group, p, cpu); + if (new_cpu == cpu) { + /* Now try balancing at a lower domain level of cpu */ + sd = sd->child; + continue; + } + + /* Now try balancing at a lower domain level of new_cpu */ + cpu = cas_cpu = new_cpu; + weight = sd->span_weight; + sd = NULL; + for_each_domain(cpu, tmp) { + if (weight <= tmp->span_weight) + break; + if (tmp->flags & sd_flag) + sd = tmp; + } + /* while loop will break here if sd == NULL */ + } + + if (wu && (cas_cpu >= 0)) { + schedstat_inc(p->se.statistics.nr_wakeups_cas_count); + schedstat_inc(this_rq()->eas_stats.cas_count); + } + + return new_cpu; +} + #ifdef CONFIG_SCHED_SMT static inline void set_idle_cores(int cpu, int val) @@ -6605,9 +6726,6 @@ static int start_cpu(bool boosted) { struct root_domain *rd = cpu_rq(smp_processor_id())->rd; - RCU_LOCKDEP_WARN(rcu_read_lock_sched_held(), - "sched RCU must be held"); - return boosted ? rd->max_cap_orig_cpu : rd->min_cap_orig_cpu; } @@ -6757,6 +6875,19 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu, continue; } + /* + * Enforce EAS mode + * + * For non latency sensitive tasks, skip CPUs that + * will be overutilized by moving the task there. + * + * The goal here is to remain in EAS mode as long as + * possible at least for !prefer_idle tasks. + */ + if ((new_util * capacity_margin) > + (capacity_orig * SCHED_CAPACITY_SCALE)) + continue; + /* * Case B) Non latency sensitive tasks on IDLE CPUs. * @@ -6953,6 +7084,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync .src_cpu = prev_cpu, .dst_cpu = target_cpu, .task = p, + .trg_cpu = target_cpu, }; /* Not enough spare capacity on previous cpu */ @@ -6966,7 +7098,10 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync /* No energy saving for target_cpu, try backup */ target_cpu = tmp_backup; eenv.dst_cpu = target_cpu; - if (tmp_backup < 0 || energy_diff(&eenv) >= 0) { + eenv.trg_cpu = target_cpu; + if (tmp_backup < 0 || + tmp_backup == prev_cpu || + energy_diff(&eenv) >= 0) { schedstat_inc(p->se.statistics.nr_wakeups_secb_no_nrg_sav); schedstat_inc(this_rq()->eas_stats.secb_no_nrg_sav); target_cpu = prev_cpu; @@ -7044,62 +7179,21 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int sd_flag, int wake_f new_cpu = cpu; } + if (sd && !(sd_flag & SD_BALANCE_FORK)) { + /* + * We're going to need the task's util for capacity_spare_wake + * in find_idlest_group. Sync it up to prev_cpu's + * last_update_time. + */ + sync_entity_load_avg(&p->se); + } + if (!sd) { if (sd_flag & SD_BALANCE_WAKE) /* XXX always ? */ new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); } else { - int wu = sd_flag & SD_BALANCE_WAKE; - int cas_cpu = -1; - - if (wu) { - schedstat_inc(p->se.statistics.nr_wakeups_cas_attempts); - schedstat_inc(this_rq()->eas_stats.cas_attempts); - } - - - while (sd) { - struct sched_group *group; - int weight; - - if (wu) - schedstat_inc(sd->eas_stats.cas_attempts); - - if (!(sd->flags & sd_flag)) { - sd = sd->child; - continue; - } - - group = find_idlest_group(sd, p, cpu, sd_flag); - if (!group) { - sd = sd->child; - continue; - } - - new_cpu = find_idlest_cpu(group, p, cpu); - if (new_cpu == -1 || new_cpu == cpu) { - /* Now try balancing at a lower domain level of cpu */ - sd = sd->child; - continue; - } - - /* Now try balancing at a lower domain level of new_cpu */ - cpu = cas_cpu = new_cpu; - weight = sd->span_weight; - sd = NULL; - for_each_domain(cpu, tmp) { - if (weight <= tmp->span_weight) - break; - if (tmp->flags & sd_flag) - sd = tmp; - } - /* while loop will break here if sd == NULL */ - } - - if (wu && (cas_cpu >= 0)) { - schedstat_inc(p->se.statistics.nr_wakeups_cas_count); - schedstat_inc(this_rq()->eas_stats.cas_count); - } + new_cpu = find_idlest_cpu(sd, p, cpu, prev_cpu, sd_flag); } rcu_read_unlock(); @@ -9077,8 +9171,11 @@ static struct sched_group *find_busiest_group(struct lb_env *env) if (busiest->group_type == group_imbalanced) goto force_balance; - /* SD_BALANCE_NEWIDLE trumps SMP nice when underutilized */ - if (env->idle == CPU_NEWLY_IDLE && group_has_capacity(env, local) && + /* + * When dst_cpu is idle, prevent SMP nice and/or asymmetric group + * capacities from resulting in underutilization due to avg_load. + */ + if (env->idle != CPU_NOT_IDLE && group_has_capacity(env, local) && busiest->group_no_capacity) goto force_balance; @@ -9364,6 +9461,7 @@ redo: more_balance: raw_spin_lock_irqsave(&busiest->lock, flags); + update_rq_clock(busiest); /* * cur_ld_moved - load moved in current iteration @@ -9756,6 +9854,7 @@ static int active_load_balance_cpu_stop(void *data) }; schedstat_inc(sd->alb_count); + update_rq_clock(busiest_rq); p = detach_one_task(&env); if (p) { @@ -10584,7 +10683,8 @@ void online_fair_sched_group(struct task_group *tg) se = tg->se[i]; raw_spin_lock_irq(&rq->lock); - post_init_entity_util_avg(se); + update_rq_clock(rq); + attach_entity_cfs_rq(se); sync_throttle(tg, i); raw_spin_unlock_irq(&rq->lock); } diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 552e6c551dc3..c1884b6f9993 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1667,6 +1667,10 @@ static inline bool sched_freq(void) return static_key_false(&__sched_freq); } +/* + * sched_capacity_reqs expects capacity requests to be normalised. + * All capacities should sum to the range of 0-1024. + */ DECLARE_PER_CPU(struct sched_capacity_reqs, cpu_sched_capacity_reqs); void update_cpu_capacity_request(int cpu, bool request);