Merge branch 'eas-dev' into android-mainline

Merge android-specific EAS patches

Bug: 120440300
Change-Id: I9e9c3dff41f1e941e57390db971d58a1457c69a3
Signed-off-by: Todd Kjos <tkjos@google.com>
This commit is contained in:
Todd Kjos
2019-07-29 15:44:19 -07:00
12 changed files with 331 additions and 276 deletions

View File

@@ -30,6 +30,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu);
/* Replace task scheduler's default frequency-invariant accounting */ /* Replace task scheduler's default frequency-invariant accounting */
#define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale
/* Replace task scheduler's default max-frequency-invariant accounting */
#define arch_scale_max_freq_capacity topology_get_max_freq_scale
/* Replace task scheduler's default cpu-invariant accounting */ /* Replace task scheduler's default cpu-invariant accounting */
#define arch_scale_cpu_capacity topology_get_cpu_scale #define arch_scale_cpu_capacity topology_get_cpu_scale

View File

@@ -71,6 +71,7 @@ CONFIG_COMPAT=y
CONFIG_RANDOMIZE_BASE=y CONFIG_RANDOMIZE_BASE=y
CONFIG_HIBERNATION=y CONFIG_HIBERNATION=y
CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y
CONFIG_ENERGY_MODEL=y
CONFIG_ARM_CPUIDLE=y CONFIG_ARM_CPUIDLE=y
CONFIG_CPU_FREQ=y CONFIG_CPU_FREQ=y
CONFIG_CPU_FREQ_STAT=y CONFIG_CPU_FREQ_STAT=y

View File

@@ -42,6 +42,9 @@ int pcibus_to_node(struct pci_bus *bus);
/* Replace task scheduler's default frequency-invariant accounting */ /* Replace task scheduler's default frequency-invariant accounting */
#define arch_scale_freq_capacity topology_get_freq_scale #define arch_scale_freq_capacity topology_get_freq_scale
/* Replace task scheduler's default max-frequency-invariant accounting */
#define arch_scale_max_freq_capacity topology_get_max_freq_scale
/* Replace task scheduler's default cpu-invariant accounting */ /* Replace task scheduler's default cpu-invariant accounting */
#define arch_scale_cpu_capacity topology_get_cpu_scale #define arch_scale_cpu_capacity topology_get_cpu_scale

View File

@@ -17,6 +17,8 @@
#include <linux/cpuset.h> #include <linux/cpuset.h>
DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE; DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
DEFINE_PER_CPU(unsigned long, max_cpu_freq);
DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;
void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
unsigned long max_freq) unsigned long max_freq)
@@ -26,8 +28,29 @@ void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq; scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
for_each_cpu(i, cpus) for_each_cpu(i, cpus) {
per_cpu(freq_scale, i) = scale; per_cpu(freq_scale, i) = scale;
per_cpu(max_cpu_freq, i) = max_freq;
}
}
/**
 * arch_set_max_freq_scale() - publish the policy frequency cap as a scale
 * @cpus: CPUs whose max_freq_scale value is to be updated
 * @policy_max_freq: maximum frequency currently allowed by the policy
 *
 * Computes policy_max_freq relative to the max_cpu_freq value recorded for
 * the first CPU of @cpus, expressed in SCHED_CAPACITY_SHIFT fixed point, and
 * stores the result in the per-CPU max_freq_scale of every CPU in @cpus.
 *
 * Nothing is written when @cpus is empty or when no maximum frequency has
 * been recorded yet for the first CPU (max_cpu_freq still zero), which also
 * avoids a division by zero.
 */
void arch_set_max_freq_scale(struct cpumask *cpus,
			     unsigned long policy_max_freq)
{
	unsigned long scale, max_freq;
	int cpu = cpumask_first(cpus);

	/*
	 * cpumask_first() reports an empty mask with a value >= nr_cpu_ids,
	 * so nr_cpu_ids itself must be rejected too: it is not a valid CPU
	 * number and must never be used to index per-CPU data.
	 */
	if (cpu >= nr_cpu_ids)
		return;

	max_freq = per_cpu(max_cpu_freq, cpu);
	if (!max_freq)
		return;

	scale = (policy_max_freq << SCHED_CAPACITY_SHIFT) / max_freq;

	for_each_cpu(cpu, cpus)
		per_cpu(max_freq_scale, cpu) = scale;
}
DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE; DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;

View File

@@ -153,6 +153,12 @@ __weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
} }
EXPORT_SYMBOL_GPL(arch_set_freq_scale); EXPORT_SYMBOL_GPL(arch_set_freq_scale);
/*
 * Default implementation of arch_set_max_freq_scale(): deliberately does
 * nothing. It is declared __weak so that a non-weak definition of the same
 * symbol, when one is linked in, is used instead of this stub.
 */
__weak void arch_set_max_freq_scale(struct cpumask *cpus,
unsigned long policy_max_freq)
{
}
EXPORT_SYMBOL_GPL(arch_set_max_freq_scale);
/* /*
* This is a generic cpufreq init() routine which can be used by cpufreq * This is a generic cpufreq init() routine which can be used by cpufreq
* drivers of SMP systems. It will do following: * drivers of SMP systems. It will do following:
@@ -2416,6 +2422,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
policy->max = new_policy->max; policy->max = new_policy->max;
trace_cpu_frequency_limits(policy); trace_cpu_frequency_limits(policy);
arch_set_max_freq_scale(policy->cpus, policy->max);
policy->cached_target_freq = UINT_MAX; policy->cached_target_freq = UINT_MAX;
pr_debug("new min and max freqs are %u - %u kHz\n", pr_debug("new min and max freqs are %u - %u kHz\n",

View File

@@ -144,6 +144,7 @@ config THERMAL_GOV_USER_SPACE
config THERMAL_GOV_POWER_ALLOCATOR config THERMAL_GOV_POWER_ALLOCATOR
bool "Power allocator thermal governor" bool "Power allocator thermal governor"
depends on ENERGY_MODEL
help help
Enable this to manage platform thermals by dynamically Enable this to manage platform thermals by dynamically
allocating and limiting power to devices. allocating and limiting power to devices.

View File

@@ -19,6 +19,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/cpu.h> #include <linux/cpu.h>
#include <linux/cpu_cooling.h> #include <linux/cpu_cooling.h>
#include <linux/energy_model.h>
#include <trace/events/thermal.h> #include <trace/events/thermal.h>
@@ -36,19 +37,6 @@
* ... * ...
*/ */
/**
* struct freq_table - frequency table along with power entries
* @frequency: frequency in KHz
* @power: power in mW
*
* This structure is built when the cooling device registers and helps
* in translating frequency to power and vice versa.
*/
struct freq_table {
u32 frequency;
u32 power;
};
/** /**
* struct time_in_idle - Idle time stats * struct time_in_idle - Idle time stats
* @time: previous reading of the absolute time that this cpu was idle * @time: previous reading of the absolute time that this cpu was idle
@@ -70,7 +58,7 @@ struct time_in_idle {
* frequency. * frequency.
* @max_level: maximum cooling level. One less than total number of valid * @max_level: maximum cooling level. One less than total number of valid
* cpufreq frequencies. * cpufreq frequencies.
* @freq_table: Freq table in descending order of frequencies * @em: Reference on the Energy Model of the device
* @cdev: thermal_cooling_device pointer to keep track of the * @cdev: thermal_cooling_device pointer to keep track of the
* registered cooling device. * registered cooling device.
* @policy: cpufreq policy. * @policy: cpufreq policy.
@@ -86,7 +74,7 @@ struct cpufreq_cooling_device {
unsigned int cpufreq_state; unsigned int cpufreq_state;
unsigned int clipped_freq; unsigned int clipped_freq;
unsigned int max_level; unsigned int max_level;
struct freq_table *freq_table; /* In descending order */ struct em_perf_domain *em;
struct cpufreq_policy *policy; struct cpufreq_policy *policy;
struct list_head node; struct list_head node;
struct time_in_idle *idle_time; struct time_in_idle *idle_time;
@@ -96,28 +84,6 @@ static DEFINE_IDA(cpufreq_ida);
static DEFINE_MUTEX(cooling_list_lock); static DEFINE_MUTEX(cooling_list_lock);
static LIST_HEAD(cpufreq_cdev_list); static LIST_HEAD(cpufreq_cdev_list);
/* Below code defines functions to be used for cpufreq as cooling device */
/**
* get_level: Find the level for a particular frequency
* @cpufreq_cdev: cpufreq_cdev for which the property is required
* @freq: Frequency
*
* Return: level corresponding to the frequency.
*/
static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
unsigned int freq)
{
struct freq_table *freq_table = cpufreq_cdev->freq_table;
unsigned long level;
for (level = 1; level <= cpufreq_cdev->max_level; level++)
if (freq > freq_table[level].frequency)
break;
return level - 1;
}
/** /**
* cpufreq_thermal_notifier - notifier callback for cpufreq policy change. * cpufreq_thermal_notifier - notifier callback for cpufreq policy change.
* @nb: struct notifier_block * with callback info. * @nb: struct notifier_block * with callback info.
@@ -171,104 +137,51 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
return NOTIFY_OK; return NOTIFY_OK;
} }
#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
/** /**
* update_freq_table() - Update the freq table with power numbers * get_level: Find the level for a particular frequency
* @cpufreq_cdev: the cpufreq cooling device in which to update the table * @cpufreq_cdev: cpufreq_cdev for which the property is required
* @capacitance: dynamic power coefficient for these cpus * @freq: Frequency
* *
* Update the freq table with power numbers. This table will be used in * Return: level corresponding to the frequency.
* cpu_power_to_freq() and cpu_freq_to_power() to convert between power and
* frequency efficiently. Power is stored in mW, frequency in KHz. The
* resulting table is in descending order.
*
* Return: 0 on success, -EINVAL if there are no OPPs for any CPUs,
* or -ENOMEM if we run out of memory.
*/ */
static int update_freq_table(struct cpufreq_cooling_device *cpufreq_cdev, static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
u32 capacitance) unsigned int freq)
{ {
struct freq_table *freq_table = cpufreq_cdev->freq_table; int i;
struct dev_pm_opp *opp;
struct device *dev = NULL;
int num_opps = 0, cpu = cpufreq_cdev->policy->cpu, i;
dev = get_cpu_device(cpu); for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
if (unlikely(!dev)) { if (freq > cpufreq_cdev->em->table[i].frequency)
pr_warn("No cpu device for cpu %d\n", cpu); break;
return -ENODEV;
} }
num_opps = dev_pm_opp_get_opp_count(dev); return cpufreq_cdev->max_level - i - 1;
if (num_opps < 0)
return num_opps;
/*
* The cpufreq table is also built from the OPP table and so the count
* should match.
*/
if (num_opps != cpufreq_cdev->max_level + 1) {
dev_warn(dev, "Number of OPPs not matching with max_levels\n");
return -EINVAL;
}
for (i = 0; i <= cpufreq_cdev->max_level; i++) {
unsigned long freq = freq_table[i].frequency * 1000;
u32 freq_mhz = freq_table[i].frequency / 1000;
u64 power;
u32 voltage_mv;
/*
* Find ceil frequency as 'freq' may be slightly lower than OPP
* freq due to truncation while converting to kHz.
*/
opp = dev_pm_opp_find_freq_ceil(dev, &freq);
if (IS_ERR(opp)) {
dev_err(dev, "failed to get opp for %lu frequency\n",
freq);
return -EINVAL;
}
voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
dev_pm_opp_put(opp);
/*
* Do the multiplication with MHz and millivolt so as
* to not overflow.
*/
power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
do_div(power, 1000000000);
/* power is stored in mW */
freq_table[i].power = power;
}
return 0;
} }
static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev, static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
u32 freq) u32 freq)
{ {
int i; int i;
struct freq_table *freq_table = cpufreq_cdev->freq_table;
for (i = 1; i <= cpufreq_cdev->max_level; i++) for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
if (freq > freq_table[i].frequency) if (freq > cpufreq_cdev->em->table[i].frequency)
break; break;
}
return freq_table[i - 1].power; return cpufreq_cdev->em->table[i + 1].power;
} }
static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev, static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
u32 power) u32 power)
{ {
int i; int i;
struct freq_table *freq_table = cpufreq_cdev->freq_table;
for (i = 1; i <= cpufreq_cdev->max_level; i++) for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
if (power > freq_table[i].power) if (power > cpufreq_cdev->em->table[i].power)
break; break;
}
return freq_table[i - 1].frequency; return cpufreq_cdev->em->table[i + 1].frequency;
} }
/** /**
@@ -319,80 +232,6 @@ static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev,
return (raw_cpu_power * cpufreq_cdev->last_load) / 100; return (raw_cpu_power * cpufreq_cdev->last_load) / 100;
} }
/* cpufreq cooling device callback functions are defined below */
/**
* cpufreq_get_max_state - callback function to get the max cooling state.
* @cdev: thermal cooling device pointer.
* @state: fill this variable with the max cooling state.
*
* Callback for the thermal cooling device to return the cpufreq
* max cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
unsigned long *state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
*state = cpufreq_cdev->max_level;
return 0;
}
/**
* cpufreq_get_cur_state - callback function to get the current cooling state.
* @cdev: thermal cooling device pointer.
* @state: fill this variable with the current cooling state.
*
* Callback for the thermal cooling device to return the cpufreq
* current cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
unsigned long *state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
*state = cpufreq_cdev->cpufreq_state;
return 0;
}
/**
* cpufreq_set_cur_state - callback function to set the current cooling state.
* @cdev: thermal cooling device pointer.
* @state: set this variable to the current cooling state.
*
* Callback for the thermal cooling device to change the cpufreq
* current cooling state.
*
* Return: 0 on success, an error code otherwise.
*/
static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
unsigned long state)
{
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
unsigned int clip_freq;
/* Request state should be less than max_level */
if (WARN_ON(state > cpufreq_cdev->max_level))
return -EINVAL;
/* Check if the old cooling action is same as new cooling action */
if (cpufreq_cdev->cpufreq_state == state)
return 0;
clip_freq = cpufreq_cdev->freq_table[state].frequency;
cpufreq_cdev->cpufreq_state = state;
cpufreq_cdev->clipped_freq = clip_freq;
cpufreq_update_policy(cpufreq_cdev->policy->cpu);
return 0;
}
/** /**
* cpufreq_get_requested_power() - get the current power * cpufreq_get_requested_power() - get the current power
* @cdev: &thermal_cooling_device pointer * @cdev: &thermal_cooling_device pointer
@@ -483,7 +322,7 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
struct thermal_zone_device *tz, struct thermal_zone_device *tz,
unsigned long state, u32 *power) unsigned long state, u32 *power)
{ {
unsigned int freq, num_cpus; unsigned int freq, num_cpus, idx;
struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata; struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
/* Request state should be less than max_level */ /* Request state should be less than max_level */
@@ -492,7 +331,8 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus); num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus);
freq = cpufreq_cdev->freq_table[state].frequency; idx = cpufreq_cdev->max_level - state;
freq = cpufreq_cdev->em->table[idx].frequency;
*power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus; *power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus;
return 0; return 0;
@@ -537,21 +377,139 @@ static int cpufreq_power2state(struct thermal_cooling_device *cdev,
return 0; return 0;
} }
/**
 * em_is_sane() - check that an Energy Model matches a cooling device
 * @cpufreq_cdev: cooling device the Energy Model would be attached to
 * @em: candidate performance domain, may be NULL
 *
 * The model is accepted only when it spans exactly the CPUs in the policy's
 * related_cpus mask and has one capacity state per cooling level
 * (max_level + 1). A mismatch is reported with pr_err().
 *
 * Return: true if @em can be used, false if it is NULL or inconsistent.
 */
static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
			      struct em_perf_domain *em)
{
	struct cpumask *pd_span;
	struct cpumask *policy_span;
	unsigned int expected_states;

	if (!em)
		return false;

	pd_span = to_cpumask(em->cpus);
	policy_span = cpufreq_cdev->policy->related_cpus;

	if (!cpumask_equal(policy_span, pd_span)) {
		pr_err("The span of pd %*pbl is misaligned with cpufreq policy %*pbl\n",
		       cpumask_pr_args(pd_span),
		       cpumask_pr_args(policy_span));
		return false;
	}

	/* max_level is the highest state index, so the count is one more */
	expected_states = cpufreq_cdev->max_level + 1;
	if (expected_states != em->nr_cap_states) {
		pr_err("The number of cap states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n",
		       cpumask_pr_args(pd_span),
		       em->nr_cap_states, expected_states);
		return false;
	}

	return true;
}
#endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
/**
 * get_state_freq() - translate a cooling state into a frequency
 * @cpufreq_cdev: cooling device being queried
 * @state: cooling state to translate
 *
 * When the power allocator governor is built in and an Energy Model is
 * attached, the frequency is read from the model's table at index
 * max_level - @state. Otherwise the cpufreq table of the policy is used:
 * at index max_level - @state when the table is sorted in ascending order,
 * and at index @state in every other case.
 *
 * Return: the frequency stored in the selected table entry.
 */
static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
				   unsigned long state)
{
	struct cpufreq_policy *policy = cpufreq_cdev->policy;
	unsigned long mirrored = cpufreq_cdev->max_level - state;

#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
	/* Prefer the Energy Model table whenever one is attached */
	if (cpufreq_cdev->em)
		return cpufreq_cdev->em->table[mirrored].frequency;
#endif

	/* No Energy Model: fall back on the cpufreq table of the policy */
	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
		return policy->freq_table[mirrored].frequency;

	return policy->freq_table[state].frequency;
}
/* cpufreq cooling device callback functions are defined below */
/**
 * cpufreq_get_max_state - report the deepest cooling state available.
 * @cdev: thermal cooling device pointer.
 * @state: output, receives the maximum cooling state.
 *
 * Thermal cooling device callback. Copies the max_level of the cpufreq
 * cooling device stored in @cdev->devdata into @state.
 *
 * Return: always 0.
 */
static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	struct cpufreq_cooling_device *cooling = cdev->devdata;

	*state = cooling->max_level;

	return 0;
}
/**
 * cpufreq_get_cur_state - report the cooling state currently applied.
 * @cdev: thermal cooling device pointer.
 * @state: output, receives the current cooling state.
 *
 * Thermal cooling device callback. Copies the cpufreq_state of the cpufreq
 * cooling device stored in @cdev->devdata into @state.
 *
 * Return: always 0.
 */
static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long *state)
{
	struct cpufreq_cooling_device *cooling = cdev->devdata;

	*state = cooling->cpufreq_state;

	return 0;
}
/**
 * cpufreq_set_cur_state - apply a new cooling state.
 * @cdev: thermal cooling device pointer.
 * @state: cooling state to switch to.
 *
 * Thermal cooling device callback. Records @state and the frequency that
 * get_state_freq() maps it to as the new clipped frequency, then asks
 * cpufreq to re-evaluate the policy of the device's CPU.
 *
 * Return: 0 on success or when @state is already in effect, -EINVAL (after
 * a WARN_ON) when @state is greater than max_level.
 */
static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
				 unsigned long state)
{
	struct cpufreq_cooling_device *cooling = cdev->devdata;
	unsigned int new_clip;

	/* States run from 0 up to and including max_level */
	if (WARN_ON(state > cooling->max_level))
		return -EINVAL;

	/* Nothing to do when the requested state is already applied */
	if (state == cooling->cpufreq_state)
		return 0;

	new_clip = get_state_freq(cooling, state);

	cooling->cpufreq_state = state;
	cooling->clipped_freq = new_clip;

	cpufreq_update_policy(cooling->policy->cpu);

	return 0;
}
/* Bind cpufreq callbacks to thermal cooling device ops */ /* Bind cpufreq callbacks to thermal cooling device ops */
static struct thermal_cooling_device_ops cpufreq_cooling_ops = { static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
.get_max_state = cpufreq_get_max_state,
.get_cur_state = cpufreq_get_cur_state,
.set_cur_state = cpufreq_set_cur_state,
};
static struct thermal_cooling_device_ops cpufreq_power_cooling_ops = {
.get_max_state = cpufreq_get_max_state, .get_max_state = cpufreq_get_max_state,
.get_cur_state = cpufreq_get_cur_state, .get_cur_state = cpufreq_get_cur_state,
.set_cur_state = cpufreq_set_cur_state, .set_cur_state = cpufreq_set_cur_state,
.get_requested_power = cpufreq_get_requested_power,
.state2power = cpufreq_state2power,
.power2state = cpufreq_power2state,
}; };
/* Notifier for cpufreq policy change */ /* Notifier for cpufreq policy change */
@@ -559,26 +517,12 @@ static struct notifier_block thermal_cpufreq_notifier_block = {
.notifier_call = cpufreq_thermal_notifier, .notifier_call = cpufreq_thermal_notifier,
}; };
static unsigned int find_next_max(struct cpufreq_frequency_table *table,
unsigned int prev_max)
{
struct cpufreq_frequency_table *pos;
unsigned int max = 0;
cpufreq_for_each_valid_entry(pos, table) {
if (pos->frequency > max && pos->frequency < prev_max)
max = pos->frequency;
}
return max;
}
/** /**
* __cpufreq_cooling_register - helper function to create cpufreq cooling device * __cpufreq_cooling_register - helper function to create cpufreq cooling device
* @np: a valid struct device_node to the cooling device device tree node * @np: a valid struct device_node to the cooling device device tree node
* @policy: cpufreq policy * @policy: cpufreq policy
* Normally this should be same as cpufreq policy->related_cpus. * Normally this should be same as cpufreq policy->related_cpus.
* @capacitance: dynamic power coefficient for these cpus * @em: Energy Model of the cpufreq policy
* *
* This interface function registers the cpufreq cooling device with the name * This interface function registers the cpufreq cooling device with the name
* "thermal-cpufreq-%x". This api can support multiple instances of cpufreq * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
@@ -590,12 +534,13 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
*/ */
static struct thermal_cooling_device * static struct thermal_cooling_device *
__cpufreq_cooling_register(struct device_node *np, __cpufreq_cooling_register(struct device_node *np,
struct cpufreq_policy *policy, u32 capacitance) struct cpufreq_policy *policy,
struct em_perf_domain *em)
{ {
struct thermal_cooling_device *cdev; struct thermal_cooling_device *cdev;
struct cpufreq_cooling_device *cpufreq_cdev; struct cpufreq_cooling_device *cpufreq_cdev;
char dev_name[THERMAL_NAME_LENGTH]; char dev_name[THERMAL_NAME_LENGTH];
unsigned int freq, i, num_cpus; unsigned int i, num_cpus;
int ret; int ret;
struct thermal_cooling_device_ops *cooling_ops; struct thermal_cooling_device_ops *cooling_ops;
bool first; bool first;
@@ -629,46 +574,30 @@ __cpufreq_cooling_register(struct device_node *np,
/* max_level is an index, not a counter */ /* max_level is an index, not a counter */
cpufreq_cdev->max_level = i - 1; cpufreq_cdev->max_level = i - 1;
cpufreq_cdev->freq_table = kmalloc_array(i,
sizeof(*cpufreq_cdev->freq_table),
GFP_KERNEL);
if (!cpufreq_cdev->freq_table) {
cdev = ERR_PTR(-ENOMEM);
goto free_idle_time;
}
ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL); ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL);
if (ret < 0) { if (ret < 0) {
cdev = ERR_PTR(ret); cdev = ERR_PTR(ret);
goto free_table; goto free_idle_time;
} }
cpufreq_cdev->id = ret; cpufreq_cdev->id = ret;
snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d", snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
cpufreq_cdev->id); cpufreq_cdev->id);
/* Fill freq-table in descending order of frequencies */ cooling_ops = &cpufreq_cooling_ops;
for (i = 0, freq = -1; i <= cpufreq_cdev->max_level; i++) { #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
freq = find_next_max(policy->freq_table, freq); if (em_is_sane(cpufreq_cdev, em)) {
cpufreq_cdev->freq_table[i].frequency = freq; cpufreq_cdev->em = em;
cooling_ops->get_requested_power = cpufreq_get_requested_power;
/* Warn for duplicate entries */ cooling_ops->state2power = cpufreq_state2power;
if (!freq) cooling_ops->power2state = cpufreq_power2state;
pr_warn("%s: table has duplicate entries\n", __func__); } else
else #endif
pr_debug("%s: freq:%u KHz\n", __func__, freq); if (policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED) {
} pr_err("%s: unsorted frequency tables are not supported\n",
__func__);
if (capacitance) { cdev = ERR_PTR(-EINVAL);
ret = update_freq_table(cpufreq_cdev, capacitance); goto remove_ida;
if (ret) {
cdev = ERR_PTR(ret);
goto remove_ida;
}
cooling_ops = &cpufreq_power_cooling_ops;
} else {
cooling_ops = &cpufreq_cooling_ops;
} }
cdev = thermal_of_cooling_device_register(np, dev_name, cpufreq_cdev, cdev = thermal_of_cooling_device_register(np, dev_name, cpufreq_cdev,
@@ -676,7 +605,7 @@ __cpufreq_cooling_register(struct device_node *np,
if (IS_ERR(cdev)) if (IS_ERR(cdev))
goto remove_ida; goto remove_ida;
cpufreq_cdev->clipped_freq = cpufreq_cdev->freq_table[0].frequency; cpufreq_cdev->clipped_freq = get_state_freq(cpufreq_cdev, 0);
mutex_lock(&cooling_list_lock); mutex_lock(&cooling_list_lock);
/* Register the notifier for first cpufreq cooling device */ /* Register the notifier for first cpufreq cooling device */
@@ -692,8 +621,6 @@ __cpufreq_cooling_register(struct device_node *np,
remove_ida: remove_ida:
ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id); ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
free_table:
kfree(cpufreq_cdev->freq_table);
free_idle_time: free_idle_time:
kfree(cpufreq_cdev->idle_time); kfree(cpufreq_cdev->idle_time);
free_cdev: free_cdev:
@@ -715,7 +642,7 @@ free_cdev:
struct thermal_cooling_device * struct thermal_cooling_device *
cpufreq_cooling_register(struct cpufreq_policy *policy) cpufreq_cooling_register(struct cpufreq_policy *policy)
{ {
return __cpufreq_cooling_register(NULL, policy, 0); return __cpufreq_cooling_register(NULL, policy, NULL);
} }
EXPORT_SYMBOL_GPL(cpufreq_cooling_register); EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
@@ -743,7 +670,6 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy)
{ {
struct device_node *np = of_get_cpu_node(policy->cpu, NULL); struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
struct thermal_cooling_device *cdev = NULL; struct thermal_cooling_device *cdev = NULL;
u32 capacitance = 0;
if (!np) { if (!np) {
pr_err("cpu_cooling: OF node not available for cpu%d\n", pr_err("cpu_cooling: OF node not available for cpu%d\n",
@@ -752,10 +678,9 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy)
} }
if (of_find_property(np, "#cooling-cells", NULL)) { if (of_find_property(np, "#cooling-cells", NULL)) {
of_property_read_u32(np, "dynamic-power-coefficient", struct em_perf_domain *em = em_cpu_get(policy->cpu);
&capacitance);
cdev = __cpufreq_cooling_register(np, policy, capacitance); cdev = __cpufreq_cooling_register(np, policy, em);
if (IS_ERR(cdev)) { if (IS_ERR(cdev)) {
pr_err("cpu_cooling: cpu%d failed to register as cooling device: %ld\n", pr_err("cpu_cooling: cpu%d failed to register as cooling device: %ld\n",
policy->cpu, PTR_ERR(cdev)); policy->cpu, PTR_ERR(cdev));
@@ -797,7 +722,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
thermal_cooling_device_unregister(cdev); thermal_cooling_device_unregister(cdev);
ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id); ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
kfree(cpufreq_cdev->idle_time); kfree(cpufreq_cdev->idle_time);
kfree(cpufreq_cdev->freq_table);
kfree(cpufreq_cdev); kfree(cpufreq_cdev);
} }
EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister); EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);

View File

@@ -33,4 +33,12 @@ unsigned long topology_get_freq_scale(int cpu)
return per_cpu(freq_scale, cpu); return per_cpu(freq_scale, cpu);
} }
DECLARE_PER_CPU(unsigned long, max_freq_scale);
/*
 * Return the per-CPU max_freq_scale value of @cpu.
 * @sd is accepted but not used by this implementation.
 */
static inline
unsigned long topology_get_max_freq_scale(struct sched_domain *sd, int cpu)
{
return per_cpu(max_freq_scale, cpu);
}
#endif /* _LINUX_ARCH_TOPOLOGY_H_ */ #endif /* _LINUX_ARCH_TOPOLOGY_H_ */

View File

@@ -984,6 +984,8 @@ extern unsigned int arch_freq_get_on_cpu(int cpu);
extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq, extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
unsigned long max_freq); unsigned long max_freq);
extern void arch_set_max_freq_scale(struct cpumask *cpus,
unsigned long policy_max_freq);
/* the following are really really optional */ /* the following are really really optional */
extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs; extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;

View File

@@ -6226,7 +6226,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
return 0; return 0;
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu)); min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
max_cap = cpu_rq(cpu)->rd->max_cpu_capacity; max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
/* Minimum capacity is close to max, no need to abort wake_affine */ /* Minimum capacity is close to max, no need to abort wake_affine */
if (max_cap - min_cap < max_cap >> 3) if (max_cap - min_cap < max_cap >> 3)
@@ -7141,6 +7141,7 @@ struct lb_env {
int new_dst_cpu; int new_dst_cpu;
enum cpu_idle_type idle; enum cpu_idle_type idle;
long imbalance; long imbalance;
unsigned int src_grp_nr_running;
/* The set of CPUs under consideration for load-balancing */ /* The set of CPUs under consideration for load-balancing */
struct cpumask *cpus; struct cpumask *cpus;
@@ -7754,10 +7755,9 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
}; };
} }
static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu) static unsigned long scale_rt_capacity(int cpu, unsigned long max)
{ {
struct rq *rq = cpu_rq(cpu); struct rq *rq = cpu_rq(cpu);
unsigned long max = arch_scale_cpu_capacity(cpu);
unsigned long used, free; unsigned long used, free;
unsigned long irq; unsigned long irq;
@@ -7777,12 +7777,47 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
return scale_irq_capacity(free, irq, max); return scale_irq_capacity(free, irq, max);
} }
/*
 * init_max_cpu_capacity() - put a max_cpu_capacity record in its initial state
 * @mcc: record to initialise
 *
 * Initialises the raw spinlock protecting the record, sets the recorded
 * capacity to 0 and the recorded CPU to -1 (presumably "no CPU recorded
 * yet" - confirm against the users of this structure).
 */
void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
{
	mcc->cpu = -1;
	mcc->val = 0;
	raw_spin_lock_init(&mcc->lock);
}
static void update_cpu_capacity(struct sched_domain *sd, int cpu) static void update_cpu_capacity(struct sched_domain *sd, int cpu)
{ {
unsigned long capacity = scale_rt_capacity(sd, cpu); unsigned long capacity = arch_scale_cpu_capacity(cpu);
struct sched_group *sdg = sd->groups; struct sched_group *sdg = sd->groups;
struct max_cpu_capacity *mcc;
unsigned long max_capacity;
int max_cap_cpu;
unsigned long flags;
cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu); cpu_rq(cpu)->cpu_capacity_orig = capacity;
capacity *= arch_scale_max_freq_capacity(sd, cpu);
capacity >>= SCHED_CAPACITY_SHIFT;
mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
raw_spin_lock_irqsave(&mcc->lock, flags);
max_capacity = mcc->val;
max_cap_cpu = mcc->cpu;
if ((max_capacity > capacity && max_cap_cpu == cpu) ||
(max_capacity < capacity)) {
mcc->val = capacity;
mcc->cpu = cpu;
#ifdef CONFIG_SCHED_DEBUG
raw_spin_unlock_irqrestore(&mcc->lock, flags);
printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
cpu, capacity);
goto skip_unlock;
#endif
}
raw_spin_unlock_irqrestore(&mcc->lock, flags);
skip_unlock: __attribute__ ((unused));
capacity = scale_rt_capacity(cpu, capacity);
if (!capacity) if (!capacity)
capacity = 1; capacity = 1;
@@ -7886,7 +7921,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd) static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
{ {
return rq->misfit_task_load && return rq->misfit_task_load &&
(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity || (rq->cpu_capacity_orig < rq->rd->max_cpu_capacity.val ||
check_cpu_capacity(rq, sd)); check_cpu_capacity(rq, sd));
} }
@@ -8291,6 +8326,8 @@ next_group:
if (env->sd->flags & SD_NUMA) if (env->sd->flags & SD_NUMA)
env->fbq_type = fbq_classify_group(&sds->busiest_stat); env->fbq_type = fbq_classify_group(&sds->busiest_stat);
env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
if (!env->sd->parent) { if (!env->sd->parent) {
struct root_domain *rd = env->dst_rq->rd; struct root_domain *rd = env->dst_rq->rd;
@@ -8419,7 +8456,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
capa_move /= SCHED_CAPACITY_SCALE; capa_move /= SCHED_CAPACITY_SCALE;
/* Move if we gain throughput */ /* Move if we gain throughput */
if (capa_move > capa_now) if (capa_move > capa_now) {
env->imbalance = busiest->load_per_task;
return;
}
/* We can't see throughput improvement with the load-based
* method, but it is possible depending upon group size and
* capacity range that there might still be an underutilized
* cpu available in an asymmetric capacity system. Do one last
* check just in case.
*/
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
busiest->group_type == group_overloaded &&
busiest->sum_nr_running > busiest->group_weight &&
local->sum_nr_running < local->group_weight &&
local->group_capacity < busiest->group_capacity)
env->imbalance = busiest->load_per_task; env->imbalance = busiest->load_per_task;
} }
@@ -8488,8 +8540,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
(sds->avg_load - local->avg_load) * local->group_capacity (sds->avg_load - local->avg_load) * local->group_capacity
) / SCHED_CAPACITY_SCALE; ) / SCHED_CAPACITY_SCALE;
/* Boost imbalance to allow misfit task to be balanced. */ /* Boost imbalance to allow misfit task to be balanced.
if (busiest->group_type == group_misfit_task) { * Always do this if we are doing a NEWLY_IDLE balance
* on the assumption that any tasks we have must not be
* long-running (and hence we cannot rely upon load).
* However if we are not idle, we should assume the tasks
* we have are longer running and not override load-based
* calculations above unless we are sure that the local
* group is underutilized.
*/
if (busiest->group_type == group_misfit_task &&
(env->idle == CPU_NEWLY_IDLE ||
local->sum_nr_running < local->group_weight)) {
env->imbalance = max_t(long, env->imbalance, env->imbalance = max_t(long, env->imbalance,
busiest->group_misfit_task_load); busiest->group_misfit_task_load);
} }
@@ -8767,6 +8829,17 @@ static int need_active_balance(struct lb_env *env)
if (voluntary_active_balance(env)) if (voluntary_active_balance(env))
return 1; return 1;
if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
env->src_rq->cfs.h_nr_running == 1 &&
cpu_overutilized(env->src_cpu) &&
!cpu_overutilized(env->dst_cpu)) {
return 1;
}
if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
return 1;
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2); return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
} }
@@ -8985,7 +9058,8 @@ more_balance:
* excessive cache_hot migrations and active balances. * excessive cache_hot migrations and active balances.
*/ */
if (idle != CPU_NEWLY_IDLE) if (idle != CPU_NEWLY_IDLE)
sd->nr_balance_failed++; if (env.src_grp_nr_running > 1)
sd->nr_balance_failed++;
if (need_active_balance(&env)) { if (need_active_balance(&env)) {
unsigned long flags; unsigned long flags;

View File

@@ -710,6 +710,12 @@ struct perf_domain {
struct rcu_head rcu; struct rcu_head rcu;
}; };
/*
 * Maximum CPU capacity record, embedded in struct root_domain as its
 * max_cpu_capacity member.
 */
struct max_cpu_capacity {
/* Released with raw_spin_unlock_irqrestore() by the capacity update code; presumably serializes updates of val and cpu - TODO confirm. */
raw_spinlock_t lock;
/* The capacity value; compared against rq->cpu_capacity_orig by check_misfit_status(). */
unsigned long val;
/* NOTE(review): presumably the CPU that currently provides val - confirm against the code that writes it. */
int cpu;
};
/* Scheduling group status flags */ /* Scheduling group status flags */
#define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */ #define SG_OVERLOAD 0x1 /* More than one runnable task on a CPU. */
#define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */ #define SG_OVERUTILIZED 0x2 /* One or more CPUs are over-utilized. */
@@ -768,7 +774,8 @@ struct root_domain {
cpumask_var_t rto_mask; cpumask_var_t rto_mask;
struct cpupri cpupri; struct cpupri cpupri;
unsigned long max_cpu_capacity; /* Maximum cpu capacity in the system. */
struct max_cpu_capacity max_cpu_capacity;
/* /*
* NULL-terminated list of performance domains intersecting with the * NULL-terminated list of performance domains intersecting with the
@@ -781,6 +788,7 @@ extern struct root_domain def_root_domain;
extern struct mutex sched_domains_mutex; extern struct mutex sched_domains_mutex;
extern void init_defrootdomain(void); extern void init_defrootdomain(void);
extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
extern int sched_init_domains(const struct cpumask *cpu_map); extern int sched_init_domains(const struct cpumask *cpu_map);
extern void rq_attach_root(struct rq *rq, struct root_domain *rd); extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
extern void sched_get_rd(struct root_domain *rd); extern void sched_get_rd(struct root_domain *rd);
@@ -1942,6 +1950,15 @@ unsigned long arch_scale_freq_capacity(int cpu)
} }
#endif #endif
#ifndef arch_scale_max_freq_capacity
struct sched_domain;
/*
 * Generic fallback, compiled only when no earlier header has defined
 * arch_scale_max_freq_capacity as a macro (architectures override it with
 * a #define). Both @sd and @cpu are ignored and the result is always
 * SCHED_CAPACITY_SCALE.
 */
static __always_inline
unsigned long arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
{
return SCHED_CAPACITY_SCALE;
}
#endif
#ifdef CONFIG_SMP #ifdef CONFIG_SMP
#ifdef CONFIG_PREEMPT #ifdef CONFIG_PREEMPT

View File

@@ -510,6 +510,9 @@ static int init_rootdomain(struct root_domain *rd)
if (cpupri_init(&rd->cpupri) != 0) if (cpupri_init(&rd->cpupri) != 0)
goto free_cpudl; goto free_cpudl;
init_max_cpu_capacity(&rd->max_cpu_capacity);
return 0; return 0;
free_cpudl: free_cpudl:
@@ -1930,7 +1933,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
enum s_alloc alloc_state; enum s_alloc alloc_state;
struct sched_domain *sd; struct sched_domain *sd;
struct s_data d; struct s_data d;
struct rq *rq = NULL;
int i, ret = -ENOMEM; int i, ret = -ENOMEM;
struct sched_domain_topology_level *tl_asym; struct sched_domain_topology_level *tl_asym;
bool has_asym = false; bool has_asym = false;
@@ -1993,13 +1995,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
/* Attach the domains */ /* Attach the domains */
rcu_read_lock(); rcu_read_lock();
for_each_cpu(i, cpu_map) { for_each_cpu(i, cpu_map) {
rq = cpu_rq(i);
sd = *per_cpu_ptr(d.sd, i); sd = *per_cpu_ptr(d.sd, i);
/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
cpu_attach_domain(sd, d.rd, i); cpu_attach_domain(sd, d.rd, i);
} }
rcu_read_unlock(); rcu_read_unlock();
@@ -2007,11 +2003,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
if (has_asym) if (has_asym)
static_branch_enable_cpuslocked(&sched_asym_cpucapacity); static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
if (rq && sched_debug_enabled) {
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
}
ret = 0; ret = 0;
error: error:
__free_domain_allocs(&d, alloc_state, cpu_map); __free_domain_allocs(&d, alloc_state, cpu_map);