Merge branch 'eas-dev' into android-mainline

Merge android-specific EAS patches

Bug: 120440300
Change-Id: I9e9c3dff41f1e941e57390db971d58a1457c69a3
Signed-off-by: Todd Kjos <tkjos@google.com>
This commit is contained in:
Todd Kjos
2019-07-29 15:44:19 -07:00
12 changed files with 331 additions and 276 deletions

View File

@@ -30,6 +30,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu);
/* Replace task scheduler's default frequency-invariant accounting */
#define arch_scale_freq_capacity topology_get_freq_scale
/* Replace task scheduler's default max-frequency-invariant accounting */
#define arch_scale_max_freq_capacity topology_get_max_freq_scale
/* Replace task scheduler's default cpu-invariant accounting */
#define arch_scale_cpu_capacity topology_get_cpu_scale

View File

@@ -71,6 +71,7 @@ CONFIG_COMPAT=y
CONFIG_RANDOMIZE_BASE=y
CONFIG_HIBERNATION=y
CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y
CONFIG_ENERGY_MODEL=y
CONFIG_ARM_CPUIDLE=y
CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_STAT=y

View File

@@ -42,6 +42,9 @@ int pcibus_to_node(struct pci_bus *bus);
/* Replace task scheduler's default frequency-invariant accounting */
#define arch_scale_freq_capacity topology_get_freq_scale
/* Replace task scheduler's default max-frequency-invariant accounting */
#define arch_scale_max_freq_capacity topology_get_max_freq_scale
/* Replace task scheduler's default cpu-invariant accounting */
#define arch_scale_cpu_capacity topology_get_cpu_scale

View File

@@ -17,6 +17,8 @@
#include <linux/cpuset.h>
DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
DEFINE_PER_CPU(unsigned long, max_cpu_freq);
DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
unsigned long max_freq)
@@ -26,8 +28,29 @@ void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
for_each_cpu(i, cpus)
for_each_cpu(i, cpus) {
per_cpu(freq_scale, i) = scale;
per_cpu(max_cpu_freq, i) = max_freq;
}
}
void arch_set_max_freq_scale(struct cpumask *cpus,
unsigned long policy_max_freq)
{
unsigned long scale, max_freq;
int cpu = cpumask_first(cpus);
if (cpu > nr_cpu_ids)
return;
max_freq = per_cpu(max_cpu_freq, cpu);
if (!max_freq)
return;
scale = (policy_max_freq << SCHED_CAPACITY_SHIFT) / max_freq;
for_each_cpu(cpu, cpus)
per_cpu(max_freq_scale, cpu) = scale;
}
DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;

View File

@@ -153,6 +153,12 @@ __weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
}
EXPORT_SYMBOL_GPL(arch_set_freq_scale);
__weak void arch_set_max_freq_scale(struct cpumask *cpus,
unsigned long policy_max_freq)
{
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_scale);
/*
* This is a generic cpufreq init() routine which can be used by cpufreq
* drivers of SMP systems. It will do following:
@@ -2416,6 +2422,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
policy->max = new_policy->max;
trace_cpu_frequency_limits(policy);
arch_set_max_freq_scale(policy->cpus, policy->max);
policy->cached_target_freq = UINT_MAX;
pr_debug("new min and max freqs are %u - %u kHz\n",

View File

@@ -144,6 +144,7 @@ config THERMAL_GOV_USER_SPACE
config THERMAL_GOV_POWER_ALLOCATOR
bool "Power allocator thermal governor"
depends on ENERGY_MODEL
help
Enable this to manage platform thermals by dynamically
allocating and limiting power to devices.

View File

@@ -19,6 +19,7 @@
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/cpu_cooling.h>
#include <linux/energy_model.h>
#include <trace/events/thermal.h>
@@ -36,19 +37,6 @@
* ...
*/
/**
* struct freq_table - frequency table along with power entries
* @frequency: frequency in KHz
* @power: power in mW
*
* This structure is built when the cooling device registers and helps
* in translating frequency to power and vice versa.
*/
struct freq_table {
u32 frequency;
u32 power;
};
/**
* struct time_in_idle - Idle time stats
* @time: previous reading of the absolute time that this cpu was idle
@@ -70,7 +58,7 @@ struct time_in_idle {
* frequency.
* @max_level: maximum cooling level. One less than total number of valid
* cpufreq frequencies.
* @freq_table: Freq table in descending order of frequencies
* @em: Reference on the Energy Model of the device
* @cdev: thermal_cooling_device pointer to keep track of the
* registered cooling device.
* @policy: cpufreq policy.
@@ -86,7 +74,7 @@ struct cpufreq_cooling_device {
unsigned int cpufreq_state;
unsigned int clipped_freq;
unsigned int max_level;
struct freq_table *freq_table; /* In descending order */
struct em_perf_domain *em;
struct cpufreq_policy *policy;
struct list_head node;
struct time_in_idle *idle_time;
@@ -96,28 +84,6 @@ static DEFINE_IDA(cpufreq_ida);
static DEFINE_MUTEX(cooling_list_lock);
static LIST_HEAD(cpufreq_cdev_list);
/* Below code defines functions to be used for cpufreq as cooling device */
/**
* get_level: Find the level for a particular frequency
* @cpufreq_cdev: cpufreq_cdev for which the property is required
* @freq: Frequency
*
* Return: level corresponding to the frequency.
*/
static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
unsigned int freq)
{
struct freq_table *freq_table = cpufreq_cdev->freq_table;
unsigned long level;
for (level = 1; level <= cpufreq_cdev->max_level; level++)
if (freq > freq_table[level].frequency)
break;
return level - 1;
}
/**
* cpufreq_thermal_notifier - notifier callback for cpufreq policy change.
* @nb: struct notifier_block * with callback info.
@@ -171,104 +137,51 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
return NOTIFY_OK;
}
#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
/**
* update_freq_table() - Update the freq table with power numbers
* @cpufreq_cdev: the cpufreq cooling device in which to update the table
* @capacitance: dynamic power coefficient for these cpus
* get_level: Find the level for a particular frequency
* @cpufreq_cdev: cpufreq_cdev for which the property is required
* @freq: Frequency
*
* Update the freq table with power numbers. This table will be used in
* cpu_power_to_freq() and cpu_freq_to_power() to convert between power and
* frequency efficiently. Power is stored in mW, frequency in KHz. The
* resulting table is in descending order.
*
* Return: 0 on success, -EINVAL if there are no OPPs for any CPUs,
* or -ENOMEM if we run out of memory.
* Return: level corresponding to the frequency.
*/
static int update_freq_table(struct cpufreq_cooling_device *cpufreq_cdev,
u32 capacitance)
static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
unsigned int freq)
{
struct freq_table *freq_table = cpufreq_cdev->freq_table;
struct dev_pm_opp *opp;
struct device *dev = NULL;
int num_opps = 0, cpu = cpufreq_cdev->policy->cpu, i;
int i;
dev = get_cpu_device(cpu);
if (unlikely(!dev)) {
pr_warn("No cpu device for cpu %d\n", cpu);
return -ENODEV;
for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
if (freq > cpufreq_cdev->em->table[i].frequency)
break;
}
num_opps = dev_pm_opp_get_opp_count(dev);
if (num_opps < 0)
return num_opps;
/*
* The cpufreq table is also built from the OPP table and so the count
* should match.
*/
if (num_opps != cpufreq_cdev->max_level + 1) {
dev_warn(dev, "Number of OPPs not matching with max_levels\n");
return -EINVAL;
}
for (i = 0; i <= cpufreq_cdev->max_level; i++) {
unsigned long freq = freq_table[i].frequency * 1000;
u32 freq_mhz = freq_table[i].frequency / 1000;
u64 power;
u32 voltage_mv;
/*
* Find ceil frequency as 'freq' may be slightly lower than OPP
* freq due to truncation while converting to kHz.
*/
opp = dev_pm_opp_find_freq_ceil(dev, &freq);
if (IS_ERR(opp)) {
dev_err(dev, "failed to get opp for %lu frequency\n",
freq);
return -EINVAL;
}
voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
dev_pm_opp_put(opp);
/*
* Do the multiplication with MHz and millivolt so as
* to not overflow.
*/
power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
do_div(power, 1000000000);
/* power is stored in mW */
freq_table[i].power = power;
}
return 0;
return cpufreq_cdev->max_level - i - 1;
}
static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
u32 freq)
{
int i;
struct freq_table *freq_table = cpufreq_cdev->freq_table;
for (i = 1; i <= cpufreq_cdev->max_level; i++)
if (freq > freq_table[i].frequency)
for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
if (freq > cpufreq_cdev->em->table[i].frequency)
break;
}
return freq_table[i - 1].power;
return cpufreq_cdev->em->table[i + 1].power;
}
static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
u32 power)
{
int i;
struct freq_table *freq_table = cpufreq_cdev->freq_table;
for (i = 1; i <= cpufreq_cdev->max_level; i++)
if (power > freq_table[i].power)
for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
if (power > cpufreq_cdev->em->table[i].power)
break;
}
return freq_table[i - 1].frequency;
return cpufreq_cdev->em->table[i + 1].frequency;
}
/**
@@ -319,80 +232,6 @@ static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev,
return (raw_cpu_power * cpufreq_cdev->last_load) / 100;
}
/* cpufreq cooling device callback functions are defined below */
/**
* cpufreq_get_max_state - callback function to get the max cooling state.
* @cdev: thermal cooling device pointer.
* @state: fill this variable with the max cooling state.
*
* Callback for the thermal cooling device to return the cpufreq
* max cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
unsigned long *state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
*state = cpufreq_cdev->max_level;
return 0;
}
/**
* cpufreq_get_cur_state - callback function to get the current cooling state.
* @cdev: thermal cooling device pointer.
* @state: fill this variable with the current cooling state.
*
* Callback for the thermal cooling device to return the cpufreq
* current cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
unsigned long *state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
*state = cpufreq_cdev->cpufreq_state;
return 0;
}
/**
* cpufreq_set_cur_state - callback function to set the current cooling state.
* @cdev: thermal cooling device pointer.
* @state: set this variable to the current cooling state.
*
* Callback for the thermal cooling device to change the cpufreq
* current cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
unsigned long state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
unsigned int clip_freq;
/* Request state should be less than max_level */
if (WARN_ON(state > cpufreq_cdev->max_level))
return -EINVAL;
/* Check if the old cooling action is same as new cooling action */
if (cpufreq_cdev->cpufreq_state == state)
return 0;
clip_freq = cpufreq_cdev->freq_table[state].frequency;
cpufreq_cdev->cpufreq_state = state;
cpufreq_cdev->clipped_freq = clip_freq;
cpufreq_update_policy(cpufreq_cdev->policy->cpu);
return 0;
}
/**
* cpufreq_get_requested_power() - get the current power
* @cdev: &thermal_cooling_device pointer
@@ -483,7 +322,7 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
struct thermal_zone_device *tz,
unsigned long state, u32 *power)
{
unsigned int freq, num_cpus;
unsigned int freq, num_cpus, idx;
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
/* Request state should be less than max_level */
@@ -492,7 +331,8 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus);
freq = cpufreq_cdev->freq_table[state].frequency;
idx = cpufreq_cdev->max_level - state;
freq = cpufreq_cdev->em->table[idx].frequency;
*power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus;
return 0;
@@ -537,21 +377,139 @@ static int cpufreq_power2state(struct thermal_cooling_device *cdev,
return 0;
}
static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
struct em_perf_domain *em) {
struct cpufreq_policy *policy;
unsigned int nr_levels;
if (!em)
return false;
policy = cpufreq_cdev->policy;
if (!cpumask_equal(policy->related_cpus, to_cpumask(em->cpus))) {
pr_err("The span of pd %*pbl is misaligned with cpufreq policy %*pbl\n",
cpumask_pr_args(to_cpumask(em->cpus)),
cpumask_pr_args(policy->related_cpus));
return false;
}
nr_levels = cpufreq_cdev->max_level + 1;
if (em->nr_cap_states != nr_levels) {
pr_err("The number of cap states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n",
cpumask_pr_args(to_cpumask(em->cpus)),
em->nr_cap_states, nr_levels);
return false;
}
return true;
}
#endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
unsigned long state)
{
struct cpufreq_policy *policy;
unsigned long idx;
#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
/* Use the Energy Model table if available */
if (cpufreq_cdev->em) {
idx = cpufreq_cdev->max_level - state;
return cpufreq_cdev->em->table[idx].frequency;
}
#endif
/* Otherwise, fallback on the CPUFreq table */
policy = cpufreq_cdev->policy;
if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
idx = cpufreq_cdev->max_level - state;
else
idx = state;
return policy->freq_table[idx].frequency;
}
/* cpufreq cooling device callback functions are defined below */
/**
* cpufreq_get_max_state - callback function to get the max cooling state.
* @cdev: thermal cooling device pointer.
* @state: fill this variable with the max cooling state.
*
* Callback for the thermal cooling device to return the cpufreq
* max cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
unsigned long *state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
*state = cpufreq_cdev->max_level;
return 0;
}
/**
* cpufreq_get_cur_state - callback function to get the current cooling state.
* @cdev: thermal cooling device pointer.
* @state: fill this variable with the current cooling state.
*
* Callback for the thermal cooling device to return the cpufreq
* current cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
unsigned long *state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
*state = cpufreq_cdev->cpufreq_state;
return 0;
}
/**
* cpufreq_set_cur_state - callback function to set the current cooling state.
* @cdev: thermal cooling device pointer.
* @state: set this variable to the current cooling state.
*
* Callback for the thermal cooling device to change the cpufreq
* current cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
unsigned long state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
unsigned int clip_freq;
/* Request state should be less than max_level */
if (WARN_ON(state > cpufreq_cdev->max_level))
return -EINVAL;
/* Check if the old cooling action is same as new cooling action */
if (cpufreq_cdev->cpufreq_state == state)
return 0;
clip_freq = get_state_freq(cpufreq_cdev, state);
cpufreq_cdev->cpufreq_state = state;
cpufreq_cdev->clipped_freq = clip_freq;
cpufreq_update_policy(cpufreq_cdev->policy->cpu);
return 0;
}
/* Bind cpufreq callbacks to thermal cooling device ops */
static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
.get_max_state = cpufreq_get_max_state,
.get_cur_state = cpufreq_get_cur_state,
.set_cur_state = cpufreq_set_cur_state,
};
static struct thermal_cooling_device_ops cpufreq_power_cooling_ops = {
.get_max_state = cpufreq_get_max_state,
.get_cur_state = cpufreq_get_cur_state,
.set_cur_state = cpufreq_set_cur_state,
.get_requested_power = cpufreq_get_requested_power,
.state2power = cpufreq_state2power,
.power2state = cpufreq_power2state,
};
/* Notifier for cpufreq policy change */
@@ -559,26 +517,12 @@ static struct notifier_block thermal_cpufreq_notifier_block = {
.notifier_call = cpufreq_thermal_notifier,
};
static unsigned int find_next_max(struct cpufreq_frequency_table *table,
unsigned int prev_max)
{
struct cpufreq_frequency_table *pos;
unsigned int max = 0;
cpufreq_for_each_valid_entry(pos, table) {
if (pos->frequency > max && pos->frequency < prev_max)
max = pos->frequency;
}
return max;
}
/**
* __cpufreq_cooling_register - helper function to create cpufreq cooling device
* @np: a valid struct device_node to the cooling device device tree node
* @policy: cpufreq policy
* Normally this should be same as cpufreq policy->related_cpus.
* @capacitance: dynamic power coefficient for these cpus
* @em: Energy Model of the cpufreq policy
*
* This interface function registers the cpufreq cooling device with the name
* "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
@@ -590,12 +534,13 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
*/
static struct thermal_cooling_device *
__cpufreq_cooling_register(struct device_node *np,
struct cpufreq_policy *policy, u32 capacitance)
struct cpufreq_policy *policy,
struct em_perf_domain *em)
{
struct thermal_cooling_device *cdev;
struct cpufreq_cooling_device *cpufreq_cdev;
char dev_name[THERMAL_NAME_LENGTH];
unsigned int freq, i, num_cpus;
unsigned int i, num_cpus;
int ret;
struct thermal_cooling_device_ops *cooling_ops;
bool first;
@@ -629,46 +574,30 @@ __cpufreq_cooling_register(struct device_node *np,
/* max_level is an index, not a counter */
cpufreq_cdev->max_level = i - 1;
cpufreq_cdev->freq_table = kmalloc_array(i,
sizeof(*cpufreq_cdev->freq_table),
GFP_KERNEL);
if (!cpufreq_cdev->freq_table) {
cdev = ERR_PTR(-ENOMEM);
goto free_idle_time;
}
ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL);
if (ret < 0) {
cdev = ERR_PTR(ret);
goto free_table;
goto free_idle_time;
}
cpufreq_cdev->id = ret;
snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
cpufreq_cdev->id);
/* Fill freq-table in descending order of frequencies */
for (i = 0, freq = -1; i <= cpufreq_cdev->max_level; i++) {
freq = find_next_max(policy->freq_table, freq);
cpufreq_cdev->freq_table[i].frequency = freq;
/* Warn for duplicate entries */
if (!freq)
pr_warn("%s: table has duplicate entries\n", __func__);
else
pr_debug("%s: freq:%u KHz\n", __func__, freq);
}
if (capacitance) {
ret = update_freq_table(cpufreq_cdev, capacitance);
if (ret) {
cdev = ERR_PTR(ret);
goto remove_ida;
}
cooling_ops = &cpufreq_power_cooling_ops;
} else {
cooling_ops = &cpufreq_cooling_ops;
cooling_ops = &cpufreq_cooling_ops;
#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
if (em_is_sane(cpufreq_cdev, em)) {
cpufreq_cdev->em = em;
cooling_ops->get_requested_power = cpufreq_get_requested_power;
cooling_ops->state2power = cpufreq_state2power;
cooling_ops->power2state = cpufreq_power2state;
} else
#endif
if (policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED) {
pr_err("%s: unsorted frequency tables are not supported\n",
__func__);
cdev = ERR_PTR(-EINVAL);
goto remove_ida;
}
cdev = thermal_of_cooling_device_register(np, dev_name, cpufreq_cdev,
@@ -676,7 +605,7 @@ __cpufreq_cooling_register(struct device_node *np,
if (IS_ERR(cdev))
goto remove_ida;
cpufreq_cdev->clipped_freq = cpufreq_cdev->freq_table[0].frequency;
cpufreq_cdev->clipped_freq = get_state_freq(cpufreq_cdev, 0);
mutex_lock(&cooling_list_lock);
/* Register the notifier for first cpufreq cooling device */
@@ -692,8 +621,6 @@ __cpufreq_cooling_register(struct device_node *np,
remove_ida:
ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
free_table:
kfree(cpufreq_cdev->freq_table);
free_idle_time:
kfree(cpufreq_cdev->idle_time);
free_cdev:
@@ -715,7 +642,7 @@ free_cdev:
struct thermal_cooling_device *
cpufreq_cooling_register(struct cpufreq_policy *policy)
{
return __cpufreq_cooling_register(NULL, policy, 0);
return __cpufreq_cooling_register(NULL, policy, NULL);
}
EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
@@ -743,7 +670,6 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy)
{
struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
struct thermal_cooling_device *cdev = NULL;
u32 capacitance = 0;
if (!np) {
pr_err("cpu_cooling: OF node not available for cpu%d\n",
@@ -752,10 +678,9 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy)
}
if (of_find_property(np, "#cooling-cells", NULL)) {
of_property_read_u32(np, "dynamic-power-coefficient",
&capacitance);
struct em_perf_domain *em = em_cpu_get(policy->cpu);
cdev = __cpufreq_cooling_register(np, policy, capacitance);
cdev = __cpufreq_cooling_register(np, policy, em);
if (IS_ERR(cdev)) {
pr_err("cpu_cooling: cpu%d failed to register as cooling device: %ld\n",
policy->cpu, PTR_ERR(cdev));
@@ -797,7 +722,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
thermal_cooling_device_unregister(cdev);
ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
kfree(cpufreq_cdev->idle_time);
kfree(cpufreq_cdev->freq_table);
kfree(cpufreq_cdev);
}
EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);

View File

@@ -33,4 +33,12 @@ unsigned long topology_get_freq_scale(int cpu)
return per_cpu(freq_scale, cpu);
}
DECLARE_PER_CPU(unsigned long, max_freq_scale);
static inline
unsigned long topology_get_max_freq_scale(struct sched_domain *sd, int cpu)
{
return per_cpu(max_freq_scale, cpu);
}
#endif /* _LINUX_ARCH_TOPOLOGY_H_ */

View File

@@ -984,6 +984,8 @@ extern unsigned int arch_freq_get_on_cpu(int cpu);
extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
unsigned long max_freq);
extern void arch_set_max_freq_scale(struct cpumask *cpus,
unsigned long policy_max_freq);
/* the following are really really optional */
extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;

View File

@@ -6226,7 +6226,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
return 0;
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
/* Minimum capacity is close to max, no need to abort wake_affine */
if (max_cap - min_cap < max_cap >> 3)
@@ -7141,6 +7141,7 @@ struct lb_env {
int new_dst_cpu;
enum cpu_idle_type idle;
long imbalance;
unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus;
@@ -7754,10 +7755,9 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
};
}
static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
static unsigned long scale_rt_capacity(int cpu, unsigned long max)
{
struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free;
unsigned long irq;
@@ -7777,12 +7777,47 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
return scale_irq_capacity(free, irq, max);
}
void init_max_cpu_capacity(struct max_cpu_capacity *mcc) {
raw_spin_lock_init(&mcc->lock);
mcc->val = 0;
mcc->cpu = -1;
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{
unsigned long capacity = scale_rt_capacity(sd, cpu);
unsigned long capacity = arch_scale_cpu_capacity(cpu);
struct sched_group *sdg = sd->groups;
struct max_cpu_capacity *mcc;
unsigned long max_capacity;
int max_cap_cpu;
unsigned long flags;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= arch_scale_max_freq_capacity(sd, cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
raw_spin_lock_irqsave(&mcc->lock, flags);
max_capacity = mcc->val;
max_cap_cpu = mcc->cpu;
if ((max_capacity > capacity && max_cap_cpu == cpu) ||
(max_capacity < capacity)) {
mcc->val = capacity;
mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUG
raw_spin_unlock_irqrestore(&mcc->lock, flags);
printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
cpu, capacity);
goto skip_unlock;
#endif
}
raw_spin_unlock_irqrestore(&mcc->lock, flags);
skip_unlock: __attribute__ ((unused));
capacity = scale_rt_capacity(cpu, capacity);
if (!capacity)
capacity = 1;
@@ -7886,7 +7921,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
{
return rq->misfit_task_load &&
(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity.val ||
check_cpu_capacity(rq, sd));
}
@@ -8291,6 +8326,8 @@ next_group:
if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat);
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
if (!env->sd->parent) {
struct root_domain *rd = env->dst_rq->rd;
@@ -8419,7 +8456,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */
if (capa_move > capa_now)
if (capa_move > capa_now) {
env->imbalance = busiest->load_per_task;
return;
}
/* We can't see throughput improvement with the load-based
* method, but it is possible depending upon group size and
* capacity range that there might still be an underutilized
* cpu available in an asymmetric capacity system. Do one last
* check just in case.
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
busiest->group_type == group_overloaded &&
busiest->sum_nr_running > busiest->group_weight &&
local->sum_nr_running < local->group_weight &&
local->group_capacity < busiest->group_capacity)
env->imbalance = busiest->load_per_task;
}
@@ -8488,8 +8540,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE;
/* Boost imbalance to allow misfit task to be balanced. */
if (busiest->group_type == group_misfit_task) {
/* Boost imbalance to allow misfit task to be balanced.
* Always do this if we are doing a NEWLY_IDLE balance
* on the assumption that any tasks we have must not be
* long-running (and hence we cannot rely upon load).
* However if we are not idle, we should assume the tasks
* we have are longer running and not override load-based
* calculations above unless we are sure that the local
* group is underutilized.
*/
if (busiest->group_type == group_misfit_task &&
(env->idle == CPU_NEWLY_IDLE ||
local->sum_nr_running < local->group_weight)) {
env->imbalance = max_t(long, env->imbalance,
busiest->group_misfit_task_load);
}
@@ -8767,6 +8829,17 @@ static int need_active_balance(struct lb_env *env)
if (voluntary_active_balance(env))
return 1;
if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
env->src_rq->cfs.h_nr_running == 1 &&
cpu_overutilized(env->src_cpu) &&
!cpu_overutilized(env->dst_cpu)) {
return 1;
}
if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
return 1;
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
}
@@ -8985,7 +9058,8 @@ more_balance:
* excessive cache_hot migrations and active balances.
*/
if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++;
if (env.src_grp_nr_running > 1)
sd->nr_balance_failed++;
if (need_active_balance(&env)) {
unsigned long flags;

View File

@@ -710,6 +710,12 @@ struct perf_domain {
struct rcu_head rcu;
};
struct max_cpu_capacity {
raw_spinlock_t lock;
unsigned long val;
int cpu;
};
/* Scheduling group status flags */
#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
@@ -768,7 +774,8 @@ struct root_domain {
cpumask_var_t rto_mask;
struct cpupri cpupri;
unsigned long max_cpu_capacity;
/* Maximum cpu capacity in the system. */
struct max_cpu_capacity max_cpu_capacity;
/*
* NULL-terminated list of performance domains intersecting with the
@@ -781,6 +788,7 @@ extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex;
extern void init_defrootdomain(void);
extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd);
@@ -1942,6 +1950,15 @@ unsigned long arch_scale_freq_capacity(int cpu)
}
#endif
#ifndef arch_scale_max_freq_capacity
struct sched_domain;
static __always_inline
unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
#ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT

View File

@@ -510,6 +510,9 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0)
goto free_cpudl;
init_max_cpu_capacity(&rd->max_cpu_capacity);
return 0;
free_cpudl:
@@ -1930,7 +1933,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
enum s_alloc alloc_state;
struct sched_domain *sd;
struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM;
struct sched_domain_topology_level *tl_asym;
bool has_asym = false;
@@ -1993,13 +1995,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */
rcu_read_lock();
for_each_cpu(i, cpu_map) {
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i);
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
cpu_attach_domain(sd, d.rd, i);
}
rcu_read_unlock();
@@ -2007,11 +2003,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (has_asym)
static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
if (rq && sched_debug_enabled) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
ret = 0;
error:
__free_domain_allocs(&d, alloc_state, cpu_map);