Merge branch 'eas-dev' into android-mainline

Merge android-specific EAS patches

Bug: 120440300
Change-Id: I9e9c3dff41f1e941e57390db971d58a1457c69a3
Signed-off-by: Todd Kjos <tkjos@google.com>
2026-06-09 12:17:12 +09:00 · 2019-07-29 15:44:19 -07:00
parent c0c919240f f3e9811919
commit 01ca13d85f
12 changed files with 331 additions and 276 deletions
--- a/arch/arm/include/asm/topology.h
+++ b/arch/arm/include/asm/topology.h
@@ -30,6 +30,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu);
 /* Replace task scheduler's default frequency-invariant accounting */
 #define arch_scale_freq_capacity topology_get_freq_scale
 /* Replace task scheduler's default max-frequency-invariant accounting */
 #define arch_scale_max_freq_capacity topology_get_max_freq_scale
 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
--- a/arch/arm64/configs/defconfig
+++ b/arch/arm64/configs/defconfig
@@ -71,6 +71,7 @@ CONFIG_COMPAT=y
 CONFIG_RANDOMIZE_BASE=y
 CONFIG_HIBERNATION=y
 CONFIG_WQ_POWER_EFFICIENT_DEFAULT=y
 CONFIG_ENERGY_MODEL=y
 CONFIG_ARM_CPUIDLE=y
 CONFIG_CPU_FREQ=y
 CONFIG_CPU_FREQ_STAT=y
--- a/arch/arm64/include/asm/topology.h
+++ b/arch/arm64/include/asm/topology.h
@@ -42,6 +42,9 @@ int pcibus_to_node(struct pci_bus *bus);
 /* Replace task scheduler's default frequency-invariant accounting */
 #define arch_scale_freq_capacity topology_get_freq_scale
 /* Replace task scheduler's default max-frequency-invariant accounting */
 #define arch_scale_max_freq_capacity topology_get_max_freq_scale
 /* Replace task scheduler's default cpu-invariant accounting */
 #define arch_scale_cpu_capacity topology_get_cpu_scale
--- a/drivers/base/arch_topology.c
+++ b/drivers/base/arch_topology.c
@@ -17,6 +17,8 @@
 #include <linux/cpuset.h>
 DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
 DEFINE_PER_CPU(unsigned long, max_cpu_freq);
 DEFINE_PER_CPU(unsigned long, max_freq_scale) = SCHED_CAPACITY_SCALE;
 void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 			 unsigned long max_freq)
@@ -26,8 +28,29 @@ void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 	scale = (cur_freq << SCHED_CAPACITY_SHIFT) / max_freq;
-	for_each_cpu(i, cpus)
+	for_each_cpu(i, cpus) {
 		per_cpu(freq_scale, i) = scale;
 		per_cpu(max_cpu_freq, i) = max_freq;
 	}
 }
 /**
 * arch_set_max_freq_scale() - record the policy frequency cap as a scale
 * @cpus: CPUs whose max_freq_scale is to be updated
 * @policy_max_freq: maximum frequency currently allowed by the policy
 *
 * Computes policy_max_freq / max_cpu_freq in SCHED_CAPACITY_SHIFT fixed point,
 * using the max_cpu_freq recorded for the first CPU of @cpus, and stores the
 * result in max_freq_scale for every CPU of @cpus. Nothing is written when
 * @cpus is empty or when no max_cpu_freq has been recorded for that CPU yet.
 */
 void arch_set_max_freq_scale(struct cpumask *cpus,
 			     unsigned long policy_max_freq)
 {
 	unsigned long scale, max_freq;
 	int cpu = cpumask_first(cpus);
 	/*
 	 * cpumask_first() returns a value >= nr_cpu_ids for an empty mask, so
 	 * nr_cpu_ids itself must be rejected too before indexing per-cpu data.
 	 */
 	if (cpu >= nr_cpu_ids)
 		return;
 	max_freq = per_cpu(max_cpu_freq, cpu);
 	/* Avoid dividing by zero when the maximum frequency is still unset */
 	if (!max_freq)
 		return;
 	scale = (policy_max_freq << SCHED_CAPACITY_SHIFT) / max_freq;
 	for_each_cpu(cpu, cpus)
 		per_cpu(max_freq_scale, cpu) = scale;
 }
 DEFINE_PER_CPU(unsigned long, cpu_scale) = SCHED_CAPACITY_SCALE;
--- a/drivers/cpufreq/cpufreq.c
+++ b/drivers/cpufreq/cpufreq.c
@@ -153,6 +153,12 @@ __weak void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 }
 EXPORT_SYMBOL_GPL(arch_set_freq_scale);
 /*
 * Default arch_set_max_freq_scale(): does nothing. It is declared __weak so
 * that a non-weak definition of the same symbol takes its place when one is
 * built in. cpufreq_set_policy() calls it with policy->cpus and policy->max
 * after updating policy->max.
 */
 __weak void arch_set_max_freq_scale(struct cpumask *cpus,
 				    unsigned long policy_max_freq)
 {
 }
 EXPORT_SYMBOL_GPL(arch_set_max_freq_scale);
 /*
 * This is a generic cpufreq init() routine which can be used by cpufreq
 * drivers of SMP systems. It will do following:
@@ -2416,6 +2422,8 @@ int cpufreq_set_policy(struct cpufreq_policy *policy,
 	policy->max = new_policy->max;
 	trace_cpu_frequency_limits(policy);
 	arch_set_max_freq_scale(policy->cpus, policy->max);
 	policy->cached_target_freq = UINT_MAX;
 	pr_debug("new min and max freqs are %u - %u kHz\n",
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -144,6 +144,7 @@ config THERMAL_GOV_USER_SPACE
 config THERMAL_GOV_POWER_ALLOCATOR
 	bool "Power allocator thermal governor"
 	depends on ENERGY_MODEL
 	help
 	  Enable this to manage platform thermals by dynamically
 	  allocating and limiting power to devices.
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -19,6 +19,7 @@
 #include <linux/slab.h>
 #include <linux/cpu.h>
 #include <linux/cpu_cooling.h>
 #include <linux/energy_model.h>
 #include <trace/events/thermal.h>
@@ -36,19 +37,6 @@
 *	...
 */
 /**
 * struct freq_table - frequency table along with power entries
 * @frequency:	frequency in KHz
 * @power:	power in mW
 *
 * This structure is built when the cooling device registers and helps
 * in translating frequency to power and vice versa.
 */
 struct freq_table {
 	u32 frequency;
 	u32 power;
 };
 /**
 * struct time_in_idle - Idle time stats
 * @time: previous reading of the absolute time that this cpu was idle
@@ -70,7 +58,7 @@ struct time_in_idle {
 *	frequency.
 * @max_level: maximum cooling level. One less than total number of valid
 *	cpufreq frequencies.
- * @freq_table: Freq table in descending order of frequencies
+ * @em: Reference on the Energy Model of the device
 * @cdev: thermal_cooling_device pointer to keep track of the
 *	registered cooling device.
 * @policy: cpufreq policy.
@@ -86,7 +74,7 @@ struct cpufreq_cooling_device {
 	unsigned int cpufreq_state;
 	unsigned int clipped_freq;
 	unsigned int max_level;
-	struct freq_table *freq_table;	/* In descending order */
+	struct em_perf_domain *em;
 	struct cpufreq_policy *policy;
 	struct list_head node;
 	struct time_in_idle *idle_time;
@@ -96,28 +84,6 @@ static DEFINE_IDA(cpufreq_ida);
 static DEFINE_MUTEX(cooling_list_lock);
 static LIST_HEAD(cpufreq_cdev_list);
 /* Below code defines functions to be used for cpufreq as cooling device */
 /**
 * get_level: Find the level for a particular frequency
 * @cpufreq_cdev: cpufreq_cdev for which the property is required
 * @freq: Frequency
 *
 * Return: level corresponding to the frequency.
 */
 static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
 			       unsigned int freq)
 {
 	struct freq_table *freq_table = cpufreq_cdev->freq_table;
 	unsigned long level;
 	for (level = 1; level <= cpufreq_cdev->max_level; level++)
 		if (freq > freq_table[level].frequency)
 			break;
 	return level - 1;
 }
 /**
 * cpufreq_thermal_notifier - notifier callback for cpufreq policy change.
 * @nb:	struct notifier_block * with callback info.
@@ -171,104 +137,51 @@ static int cpufreq_thermal_notifier(struct notifier_block *nb,
 	return NOTIFY_OK;
 }
 #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
 /**
- * update_freq_table() - Update the freq table with power numbers
+ * get_level: Find the level for a particular frequency
- * @cpufreq_cdev:	the cpufreq cooling device in which to update the table
+ * @cpufreq_cdev: cpufreq_cdev for which the property is required
- * @capacitance: dynamic power coefficient for these cpus
+ * @freq: Frequency
 *
- * Update the freq table with power numbers.  This table will be used in
+ * Return: level corresponding to the frequency.
 * cpu_power_to_freq() and cpu_freq_to_power() to convert between power and
 * frequency efficiently.  Power is stored in mW, frequency in KHz.  The
 * resulting table is in descending order.
 *
 * Return: 0 on success, -EINVAL if there are no OPPs for any CPUs,
 * or -ENOMEM if we run out of memory.
 */
-static int update_freq_table(struct cpufreq_cooling_device *cpufreq_cdev,
+static unsigned long get_level(struct cpufreq_cooling_device *cpufreq_cdev,
-			     u32 capacitance)
+			       unsigned int freq)
 {
-	struct freq_table *freq_table = cpufreq_cdev->freq_table;
+	int i;
 	struct dev_pm_opp *opp;
 	struct device *dev = NULL;
 	int num_opps = 0, cpu = cpufreq_cdev->policy->cpu, i;
-	dev = get_cpu_device(cpu);
+	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
-	if (unlikely(!dev)) {
+		if (freq > cpufreq_cdev->em->table[i].frequency)
-		pr_warn("No cpu device for cpu %d\n", cpu);
+			break;
 		return -ENODEV;
 	}
-	num_opps = dev_pm_opp_get_opp_count(dev);
+	return cpufreq_cdev->max_level - i - 1;
 	if (num_opps < 0)
 		return num_opps;
 	/*
 	 * The cpufreq table is also built from the OPP table and so the count
 	 * should match.
 	 */
 	if (num_opps != cpufreq_cdev->max_level + 1) {
 		dev_warn(dev, "Number of OPPs not matching with max_levels\n");
 		return -EINVAL;
 	}
 	for (i = 0; i <= cpufreq_cdev->max_level; i++) {
 		unsigned long freq = freq_table[i].frequency * 1000;
 		u32 freq_mhz = freq_table[i].frequency / 1000;
 		u64 power;
 		u32 voltage_mv;
 		/*
 		 * Find ceil frequency as 'freq' may be slightly lower than OPP
 		 * freq due to truncation while converting to kHz.
 		 */
 		opp = dev_pm_opp_find_freq_ceil(dev, &freq);
 		if (IS_ERR(opp)) {
 			dev_err(dev, "failed to get opp for %lu frequency\n",
 				freq);
 			return -EINVAL;
 		}
 		voltage_mv = dev_pm_opp_get_voltage(opp) / 1000;
 		dev_pm_opp_put(opp);
 		/*
 		 * Do the multiplication with MHz and millivolt so as
 		 * to not overflow.
 		 */
 		power = (u64)capacitance * freq_mhz * voltage_mv * voltage_mv;
 		do_div(power, 1000000000);
 		/* power is stored in mW */
 		freq_table[i].power = power;
 	}
 	return 0;
 }
 static u32 cpu_freq_to_power(struct cpufreq_cooling_device *cpufreq_cdev,
 			     u32 freq)
 {
 	int i;
 	struct freq_table *freq_table = cpufreq_cdev->freq_table;
-	for (i = 1; i <= cpufreq_cdev->max_level; i++)
+	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
-		if (freq > freq_table[i].frequency)
+		if (freq > cpufreq_cdev->em->table[i].frequency)
 			break;
 	}
-	return freq_table[i - 1].power;
+	return cpufreq_cdev->em->table[i + 1].power;
 }
 static u32 cpu_power_to_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 			     u32 power)
 {
 	int i;
 	struct freq_table *freq_table = cpufreq_cdev->freq_table;
-	for (i = 1; i <= cpufreq_cdev->max_level; i++)
+	for (i = cpufreq_cdev->max_level - 1; i >= 0; i--) {
-		if (power > freq_table[i].power)
+		if (power > cpufreq_cdev->em->table[i].power)
 			break;
 	}
-	return freq_table[i - 1].frequency;
+	return cpufreq_cdev->em->table[i + 1].frequency;
 }
 /**
@@ -319,80 +232,6 @@ static u32 get_dynamic_power(struct cpufreq_cooling_device *cpufreq_cdev,
 	return (raw_cpu_power * cpufreq_cdev->last_load) / 100;
 }
 /* cpufreq cooling device callback functions are defined below */
 /**
 * cpufreq_get_max_state - callback function to get the max cooling state.
 * @cdev: thermal cooling device pointer.
 * @state: fill this variable with the max cooling state.
 *
 * Callback for the thermal cooling device to return the cpufreq
 * max cooling state.
 *
 * Return: 0 on success, an error code otherwise.
 */
 static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
 				 unsigned long *state)
 {
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 	*state = cpufreq_cdev->max_level;
 	return 0;
 }
 /**
 * cpufreq_get_cur_state - callback function to get the current cooling state.
 * @cdev: thermal cooling device pointer.
 * @state: fill this variable with the current cooling state.
 *
 * Callback for the thermal cooling device to return the cpufreq
 * current cooling state.
 *
 * Return: 0 on success, an error code otherwise.
 */
 static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
 				 unsigned long *state)
 {
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 	*state = cpufreq_cdev->cpufreq_state;
 	return 0;
 }
 /**
 * cpufreq_set_cur_state - callback function to set the current cooling state.
 * @cdev: thermal cooling device pointer.
 * @state: set this variable to the current cooling state.
 *
 * Callback for the thermal cooling device to change the cpufreq
 * current cooling state.
 *
 * Return: 0 on success, an error code otherwise.
 */
 static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
 				 unsigned long state)
 {
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 	unsigned int clip_freq;
 	/* Request state should be less than max_level */
 	if (WARN_ON(state > cpufreq_cdev->max_level))
 		return -EINVAL;
 	/* Check if the old cooling action is same as new cooling action */
 	if (cpufreq_cdev->cpufreq_state == state)
 		return 0;
 	clip_freq = cpufreq_cdev->freq_table[state].frequency;
 	cpufreq_cdev->cpufreq_state = state;
 	cpufreq_cdev->clipped_freq = clip_freq;
 	cpufreq_update_policy(cpufreq_cdev->policy->cpu);
 	return 0;
 }
 /**
 * cpufreq_get_requested_power() - get the current power
 * @cdev:	&thermal_cooling_device pointer
@@ -483,7 +322,7 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
 			       struct thermal_zone_device *tz,
 			       unsigned long state, u32 *power)
 {
-	unsigned int freq, num_cpus;
+	unsigned int freq, num_cpus, idx;
 	struct cpufreq_cooling_device *cpufreq_cdev = cdev->devdata;
 	/* Request state should be less than max_level */
@@ -492,7 +331,8 @@ static int cpufreq_state2power(struct thermal_cooling_device *cdev,
 	num_cpus = cpumask_weight(cpufreq_cdev->policy->cpus);
-	freq = cpufreq_cdev->freq_table[state].frequency;
+	idx = cpufreq_cdev->max_level - state;
 	freq = cpufreq_cdev->em->table[idx].frequency;
 	*power = cpu_freq_to_power(cpufreq_cdev, freq) * num_cpus;
 	return 0;
@@ -537,21 +377,139 @@ static int cpufreq_power2state(struct thermal_cooling_device *cdev,
 	return 0;
 }
 /**
 * em_is_sane() - check that an Energy Model fits a cooling device
 * @cpufreq_cdev: cooling device whose policy and max_level are compared
 * @em: candidate performance domain, may be NULL
 *
 * Return: true when @em is non-NULL, its CPU span equals
 * policy->related_cpus and it has exactly max_level + 1 capacity states.
 * false otherwise; each of the two mismatch cases is reported with pr_err().
 */
 static inline bool em_is_sane(struct cpufreq_cooling_device *cpufreq_cdev,
 			      struct em_perf_domain *em)
 {
 	struct cpufreq_policy *policy;
 	struct cpumask *pd_cpus;
 	unsigned int nr_levels;
 	/* No Energy Model at all: nothing to validate */
 	if (!em)
 		return false;
 	pd_cpus = to_cpumask(em->cpus);
 	policy = cpufreq_cdev->policy;
 	/* The performance domain must cover the same CPUs as the policy */
 	if (!cpumask_equal(policy->related_cpus, pd_cpus)) {
 		pr_err("The span of pd %*pbl is misaligned with cpufreq policy %*pbl\n",
 			cpumask_pr_args(pd_cpus),
 			cpumask_pr_args(policy->related_cpus));
 		return false;
 	}
 	/* max_level is an index, so the number of levels is one more */
 	nr_levels = cpufreq_cdev->max_level + 1;
 	if (em->nr_cap_states == nr_levels)
 		return true;
 	pr_err("The number of cap states in pd %*pbl (%u) doesn't match the number of cooling levels (%u)\n",
 		cpumask_pr_args(pd_cpus),
 		em->nr_cap_states, nr_levels);
 	return false;
 }
 #endif /* CONFIG_THERMAL_GOV_POWER_ALLOCATOR */
 /*
 * get_state_freq() - frequency associated with a cooling state
 * @cpufreq_cdev: cooling device to query
 * @state: cooling state to translate
 *
 * With CONFIG_THERMAL_GOV_POWER_ALLOCATOR and an Energy Model attached, the
 * frequency is read from the Energy Model table at index max_level - state.
 * Otherwise it is read from the cpufreq table: at index max_level - state
 * when that table is sorted in ascending order, at index state if not.
 */
 static unsigned int get_state_freq(struct cpufreq_cooling_device *cpufreq_cdev,
 				   unsigned long state)
 {
 	unsigned long from_top = cpufreq_cdev->max_level - state;
 	struct cpufreq_policy *policy;
 #ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
 	/* Use the Energy Model table if available */
 	if (cpufreq_cdev->em)
 		return cpufreq_cdev->em->table[from_top].frequency;
 #endif
 	/* Otherwise, fallback on the CPUFreq table */
 	policy = cpufreq_cdev->policy;
 	if (policy->freq_table_sorted == CPUFREQ_TABLE_SORTED_ASCENDING)
 		return policy->freq_table[from_top].frequency;
 	return policy->freq_table[state].frequency;
 }
 /* cpufreq cooling device callback functions are defined below */
 /**
 * cpufreq_get_max_state - callback function to get the max cooling state.
 * @cdev: thermal cooling device pointer.
 * @state: output, receives the max cooling state.
 *
 * Copies the max_level of the cpufreq cooling device stored in
 * @cdev->devdata into @state.
 *
 * Return: always 0.
 */
 static int cpufreq_get_max_state(struct thermal_cooling_device *cdev,
 				 unsigned long *state)
 {
 	struct cpufreq_cooling_device *cooling = cdev->devdata;
 	unsigned long deepest = cooling->max_level;
 	*state = deepest;
 	return 0;
 }
 /**
 * cpufreq_get_cur_state - callback function to get the current cooling state.
 * @cdev: thermal cooling device pointer.
 * @state: output, receives the current cooling state.
 *
 * Copies the cpufreq_state of the cpufreq cooling device stored in
 * @cdev->devdata into @state.
 *
 * Return: always 0.
 */
 static int cpufreq_get_cur_state(struct thermal_cooling_device *cdev,
 				 unsigned long *state)
 {
 	struct cpufreq_cooling_device *cooling = cdev->devdata;
 	unsigned long current_state = cooling->cpufreq_state;
 	*state = current_state;
 	return 0;
 }
 /**
 * cpufreq_set_cur_state - callback function to set the current cooling state.
 * @cdev: thermal cooling device pointer.
 * @state: cooling state to apply, from 0 up to and including max_level.
 *
 * Looks up the frequency for @state with get_state_freq(), records the state
 * and that frequency in the cooling device, then calls
 * cpufreq_update_policy() for the policy's CPU. Nothing is changed when
 * @state already is the current cooling state.
 *
 * Return: 0 on success, -EINVAL if @state is greater than max_level.
 */
 static int cpufreq_set_cur_state(struct thermal_cooling_device *cdev,
 				 unsigned long state)
 {
 	struct cpufreq_cooling_device *cooling = cdev->devdata;
 	/* Valid states run from 0 up to and including max_level */
 	if (WARN_ON(state > cooling->max_level))
 		return -EINVAL;
 	/* Only act when the requested state differs from the current one */
 	if (state != cooling->cpufreq_state) {
 		unsigned int clip_freq = get_state_freq(cooling, state);
 		cooling->cpufreq_state = state;
 		cooling->clipped_freq = clip_freq;
 		cpufreq_update_policy(cooling->policy->cpu);
 	}
 	return 0;
 }
 /*
 * Bind cpufreq callbacks to thermal cooling device ops: only the three
 * state callbacks (max state, current state, set state) are filled in here.
 */
 static struct thermal_cooling_device_ops cpufreq_cooling_ops = {
 	.get_max_state = cpufreq_get_max_state,
 	.get_cur_state = cpufreq_get_cur_state,
 	.set_cur_state = cpufreq_set_cur_state,
 };
 /*
 * Cooling device ops with the same three state callbacks as
 * cpufreq_cooling_ops plus the power callbacks (get_requested_power,
 * state2power, power2state).
 */
 static struct thermal_cooling_device_ops cpufreq_power_cooling_ops = {
 	.get_max_state		= cpufreq_get_max_state,
 	.get_cur_state		= cpufreq_get_cur_state,
 	.set_cur_state		= cpufreq_set_cur_state,
 	.get_requested_power	= cpufreq_get_requested_power,
 	.state2power		= cpufreq_state2power,
 	.power2state		= cpufreq_power2state,
 };
 /* Notifier for cpufreq policy change */
@@ -559,26 +517,12 @@ static struct notifier_block thermal_cpufreq_notifier_block = {
 	.notifier_call = cpufreq_thermal_notifier,
 };
 static unsigned int find_next_max(struct cpufreq_frequency_table *table,
 				  unsigned int prev_max)
 {
 	struct cpufreq_frequency_table *pos;
 	unsigned int max = 0;
 	cpufreq_for_each_valid_entry(pos, table) {
 		if (pos->frequency > max && pos->frequency < prev_max)
 			max = pos->frequency;
 	}
 	return max;
 }
 /**
 * __cpufreq_cooling_register - helper function to create cpufreq cooling device
 * @np: a valid struct device_node to the cooling device device tree node
 * @policy: cpufreq policy
 * Normally this should be same as cpufreq policy->related_cpus.
- * @capacitance: dynamic power coefficient for these cpus
+ * @em: Energy Model of the cpufreq policy
 *
 * This interface function registers the cpufreq cooling device with the name
 * "thermal-cpufreq-%x". This api can support multiple instances of cpufreq
@@ -590,12 +534,13 @@ static unsigned int find_next_max(struct cpufreq_frequency_table *table,
 */
 static struct thermal_cooling_device *
 __cpufreq_cooling_register(struct device_node *np,
-			struct cpufreq_policy *policy, u32 capacitance)
+			struct cpufreq_policy *policy,
 			struct em_perf_domain *em)
 {
 	struct thermal_cooling_device *cdev;
 	struct cpufreq_cooling_device *cpufreq_cdev;
 	char dev_name[THERMAL_NAME_LENGTH];
-	unsigned int freq, i, num_cpus;
+	unsigned int i, num_cpus;
 	int ret;
 	struct thermal_cooling_device_ops *cooling_ops;
 	bool first;
@@ -629,46 +574,30 @@ __cpufreq_cooling_register(struct device_node *np,
 	/* max_level is an index, not a counter */
 	cpufreq_cdev->max_level = i - 1;
 	cpufreq_cdev->freq_table = kmalloc_array(i,
 					sizeof(*cpufreq_cdev->freq_table),
 					GFP_KERNEL);
 	if (!cpufreq_cdev->freq_table) {
 		cdev = ERR_PTR(-ENOMEM);
 		goto free_idle_time;
 	}
 	ret = ida_simple_get(&cpufreq_ida, 0, 0, GFP_KERNEL);
 	if (ret < 0) {
 		cdev = ERR_PTR(ret);
-		goto free_table;
+		goto free_idle_time;
 	}
 	cpufreq_cdev->id = ret;
 	snprintf(dev_name, sizeof(dev_name), "thermal-cpufreq-%d",
 		 cpufreq_cdev->id);
-	/* Fill freq-table in descending order of frequencies */
+	cooling_ops = &cpufreq_cooling_ops;
-	for (i = 0, freq = -1; i <= cpufreq_cdev->max_level; i++) {
+#ifdef CONFIG_THERMAL_GOV_POWER_ALLOCATOR
-		freq = find_next_max(policy->freq_table, freq);
+	if (em_is_sane(cpufreq_cdev, em)) {
-		cpufreq_cdev->freq_table[i].frequency = freq;
+		cpufreq_cdev->em = em;
-
+		cooling_ops->get_requested_power = cpufreq_get_requested_power;
-		/* Warn for duplicate entries */
+		cooling_ops->state2power = cpufreq_state2power;
-		if (!freq)
+		cooling_ops->power2state = cpufreq_power2state;
-			pr_warn("%s: table has duplicate entries\n", __func__);
+	} else
-		else
+#endif
-			pr_debug("%s: freq:%u KHz\n", __func__, freq);
+	if (policy->freq_table_sorted == CPUFREQ_TABLE_UNSORTED) {
-	}
+		pr_err("%s: unsorted frequency tables are not supported\n",
-
+				__func__);
-	if (capacitance) {
+		cdev = ERR_PTR(-EINVAL);
-		ret = update_freq_table(cpufreq_cdev, capacitance);
+		goto remove_ida;
 		if (ret) {
 			cdev = ERR_PTR(ret);
 			goto remove_ida;
 		}
 		cooling_ops = &cpufreq_power_cooling_ops;
 	} else {
 		cooling_ops = &cpufreq_cooling_ops;
 	}
 	cdev = thermal_of_cooling_device_register(np, dev_name, cpufreq_cdev,
@@ -676,7 +605,7 @@ __cpufreq_cooling_register(struct device_node *np,
 	if (IS_ERR(cdev))
 		goto remove_ida;
-	cpufreq_cdev->clipped_freq = cpufreq_cdev->freq_table[0].frequency;
+	cpufreq_cdev->clipped_freq = get_state_freq(cpufreq_cdev, 0);
 	mutex_lock(&cooling_list_lock);
 	/* Register the notifier for first cpufreq cooling device */
@@ -692,8 +621,6 @@ __cpufreq_cooling_register(struct device_node *np,
 remove_ida:
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
 free_table:
 	kfree(cpufreq_cdev->freq_table);
 free_idle_time:
 	kfree(cpufreq_cdev->idle_time);
 free_cdev:
@@ -715,7 +642,7 @@ free_cdev:
 struct thermal_cooling_device *
 cpufreq_cooling_register(struct cpufreq_policy *policy)
 {
-	return __cpufreq_cooling_register(NULL, policy, 0);
+	return __cpufreq_cooling_register(NULL, policy, NULL);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_register);
@@ -743,7 +670,6 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy)
 {
 	struct device_node *np = of_get_cpu_node(policy->cpu, NULL);
 	struct thermal_cooling_device *cdev = NULL;
 	u32 capacitance = 0;
 	if (!np) {
 		pr_err("cpu_cooling: OF node not available for cpu%d\n",
@@ -752,10 +678,9 @@ of_cpufreq_cooling_register(struct cpufreq_policy *policy)
 	}
 	if (of_find_property(np, "#cooling-cells", NULL)) {
-		of_property_read_u32(np, "dynamic-power-coefficient",
+		struct em_perf_domain *em = em_cpu_get(policy->cpu);
 				     &capacitance);
-		cdev = __cpufreq_cooling_register(np, policy, capacitance);
+		cdev = __cpufreq_cooling_register(np, policy, em);
 		if (IS_ERR(cdev)) {
 			pr_err("cpu_cooling: cpu%d failed to register as cooling device: %ld\n",
 			       policy->cpu, PTR_ERR(cdev));
@@ -797,7 +722,6 @@ void cpufreq_cooling_unregister(struct thermal_cooling_device *cdev)
 	thermal_cooling_device_unregister(cdev);
 	ida_simple_remove(&cpufreq_ida, cpufreq_cdev->id);
 	kfree(cpufreq_cdev->idle_time);
 	kfree(cpufreq_cdev->freq_table);
 	kfree(cpufreq_cdev);
 }
 EXPORT_SYMBOL_GPL(cpufreq_cooling_unregister);
--- a/include/linux/arch_topology.h
+++ b/include/linux/arch_topology.h
@@ -33,4 +33,12 @@ unsigned long topology_get_freq_scale(int cpu)
 	return per_cpu(freq_scale, cpu);
 }
 DECLARE_PER_CPU(unsigned long, max_freq_scale);
 /*
 * Return the max_freq_scale value stored for @cpu. @sd is accepted so the
 * signature matches arch_scale_max_freq_capacity(), but it is not used.
 */
 static inline unsigned long
 topology_get_max_freq_scale(struct sched_domain *sd, int cpu)
 {
 	return per_cpu(max_freq_scale, cpu);
 }
 #endif /* _LINUX_ARCH_TOPOLOGY_H_ */
--- a/include/linux/cpufreq.h
+++ b/include/linux/cpufreq.h
@@ -984,6 +984,8 @@ extern unsigned int arch_freq_get_on_cpu(int cpu);
 extern void arch_set_freq_scale(struct cpumask *cpus, unsigned long cur_freq,
 				unsigned long max_freq);
 extern void arch_set_max_freq_scale(struct cpumask *cpus,
 				    unsigned long policy_max_freq);
 /* the following are really really optional */
 extern struct freq_attr cpufreq_freq_attr_scaling_available_freqs;
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6226,7 +6226,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
 		return 0;
 	min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
-	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
+	max_cap = cpu_rq(cpu)->rd->max_cpu_capacity.val;
 	/* Minimum capacity is close to max, no need to abort wake_affine */
 	if (max_cap - min_cap < max_cap >> 3)
@@ -7141,6 +7141,7 @@ struct lb_env {
 	int			new_dst_cpu;
 	enum cpu_idle_type	idle;
 	long			imbalance;
 	unsigned int		src_grp_nr_running;
 	/* The set of CPUs under consideration for load-balancing */
 	struct cpumask		*cpus;
@@ -7754,10 +7755,9 @@ static inline void init_sd_lb_stats(struct sd_lb_stats *sds)
 	};
 }
-static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
+static unsigned long scale_rt_capacity(int cpu, unsigned long max)
 {
 	struct rq *rq = cpu_rq(cpu);
 	unsigned long max = arch_scale_cpu_capacity(cpu);
 	unsigned long used, free;
 	unsigned long irq;
@@ -7777,12 +7777,47 @@ static unsigned long scale_rt_capacity(struct sched_domain *sd, int cpu)
 	return scale_irq_capacity(free, irq, max);
 }
 /*
 * Put a max_cpu_capacity tracker in its initial state: no capacity recorded
 * (val = 0), no owning CPU (cpu = -1) and an initialised lock.
 */
 void init_max_cpu_capacity(struct max_cpu_capacity *mcc)
 {
 	mcc->cpu = -1;
 	mcc->val = 0;
 	raw_spin_lock_init(&mcc->lock);
 }
 static void update_cpu_capacity(struct sched_domain *sd, int cpu)
 {
-	unsigned long capacity = scale_rt_capacity(sd, cpu);
+	unsigned long capacity = arch_scale_cpu_capacity(cpu);
 	struct sched_group *sdg = sd->groups;
 	struct max_cpu_capacity *mcc;
 	unsigned long max_capacity;
 	int max_cap_cpu;
 	unsigned long flags;
-	cpu_rq(cpu)->cpu_capacity_orig = arch_scale_cpu_capacity(cpu);
+	cpu_rq(cpu)->cpu_capacity_orig = capacity;
 	capacity *= arch_scale_max_freq_capacity(sd, cpu);
 	capacity >>= SCHED_CAPACITY_SHIFT;
 	mcc = &cpu_rq(cpu)->rd->max_cpu_capacity;
 	raw_spin_lock_irqsave(&mcc->lock, flags);
 	max_capacity = mcc->val;
 	max_cap_cpu = mcc->cpu;
 	if ((max_capacity > capacity && max_cap_cpu == cpu) ||
 	    (max_capacity < capacity)) {
 		mcc->val = capacity;
 		mcc->cpu = cpu;
 #ifdef CONFIG_SCHED_DEBUG
 		raw_spin_unlock_irqrestore(&mcc->lock, flags);
 		printk_deferred(KERN_INFO "CPU%d: update max cpu_capacity %lu\n",
 				cpu, capacity);
 		goto skip_unlock;
 #endif
 	}
 	raw_spin_unlock_irqrestore(&mcc->lock, flags);
 skip_unlock: __attribute__ ((unused));
 	capacity = scale_rt_capacity(cpu, capacity);
 	if (!capacity)
 		capacity = 1;
@@ -7886,7 +7921,7 @@ check_cpu_capacity(struct rq *rq, struct sched_domain *sd)
 static inline int check_misfit_status(struct rq *rq, struct sched_domain *sd)
 {
 	return rq->misfit_task_load &&
-		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity ||
+		(rq->cpu_capacity_orig < rq->rd->max_cpu_capacity.val ||
 		 check_cpu_capacity(rq, sd));
 }
@@ -8291,6 +8326,8 @@ next_group:
 	if (env->sd->flags & SD_NUMA)
 		env->fbq_type = fbq_classify_group(&sds->busiest_stat);
 	env->src_grp_nr_running = sds->busiest_stat.sum_nr_running;
 	if (!env->sd->parent) {
 		struct root_domain *rd = env->dst_rq->rd;
@@ -8419,7 +8456,22 @@ void fix_small_imbalance(struct lb_env *env, struct sd_lb_stats *sds)
 	capa_move /= SCHED_CAPACITY_SCALE;
 	/* Move if we gain throughput */
-	if (capa_move > capa_now)
+	if (capa_move > capa_now) {
 		env->imbalance = busiest->load_per_task;
 		return;
 	}
 	/* We can't see throughput improvement with the load-based
 	 * method, but it is possible depending upon group size and
 	 * capacity range that there might still be an underutilized
 	 * cpu available in an asymmetric capacity system. Do one last
 	 * check just in case.
 	 */
 	if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
 		busiest->group_type == group_overloaded &&
 		busiest->sum_nr_running > busiest->group_weight &&
 		local->sum_nr_running < local->group_weight &&
 		local->group_capacity < busiest->group_capacity)
 		env->imbalance = busiest->load_per_task;
 }
@@ -8488,8 +8540,18 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
 		(sds->avg_load - local->avg_load) * local->group_capacity
 	) / SCHED_CAPACITY_SCALE;
-	/* Boost imbalance to allow misfit task to be balanced. */
+	/* Boost imbalance to allow misfit task to be balanced.
-	if (busiest->group_type == group_misfit_task) {
+	 * Always do this if we are doing a NEWLY_IDLE balance
 	 * on the assumption that any tasks we have must not be
 	 * long-running (and hence we cannot rely upon load).
 	 * However if we are not idle, we should assume the tasks
 	 * we have are longer running and not override load-based
 	 * calculations above unless we are sure that the local
 	 * group is underutilized.
 	 */
 	if (busiest->group_type == group_misfit_task &&
 		(env->idle == CPU_NEWLY_IDLE ||
 		local->sum_nr_running < local->group_weight)) {
 		env->imbalance = max_t(long, env->imbalance,
 				       busiest->group_misfit_task_load);
 	}
@@ -8767,6 +8829,17 @@ static int need_active_balance(struct lb_env *env)
 	if (voluntary_active_balance(env))
 		return 1;
 	if ((capacity_of(env->src_cpu) < capacity_of(env->dst_cpu)) &&
 				env->src_rq->cfs.h_nr_running == 1 &&
 				cpu_overutilized(env->src_cpu) &&
 				!cpu_overutilized(env->dst_cpu)) {
 		return 1;
 	}
 	if (env->src_grp_type == group_overloaded && env->src_rq->misfit_task_load)
 		return 1;
 	return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
 }
@@ -8985,7 +9058,8 @@ more_balance:
 		 * excessive cache_hot migrations and active balances.
 		 */
 		if (idle != CPU_NEWLY_IDLE)
-			sd->nr_balance_failed++;
+			if (env.src_grp_nr_running > 1)
 				sd->nr_balance_failed++;
 		if (need_active_balance(&env)) {
 			unsigned long flags;
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -710,6 +710,12 @@ struct perf_domain {
 	struct rcu_head rcu;
 };
 /*
 * Largest CPU capacity seen in a root domain, together with the CPU it was
 * recorded for. update_cpu_capacity() reads and updates val and cpu with
 * lock held; init_max_cpu_capacity() sets val to 0 and cpu to -1.
 */
 struct max_cpu_capacity {
 	raw_spinlock_t lock;	/* taken around accesses to val and cpu */
 	unsigned long val;	/* recorded capacity value */
 	int cpu;		/* CPU that val was recorded for, -1 initially */
 };
 /* Scheduling group status flags */
 #define SG_OVERLOAD		0x1 /* More than one runnable task on a CPU. */
 #define SG_OVERUTILIZED		0x2 /* One or more CPUs are over-utilized. */
@@ -768,7 +774,8 @@ struct root_domain {
 	cpumask_var_t		rto_mask;
 	struct cpupri		cpupri;
-	unsigned long		max_cpu_capacity;
+	/* Maximum cpu capacity in the system. */
 	struct max_cpu_capacity max_cpu_capacity;
 	/*
 	 * NULL-terminated list of performance domains intersecting with the
@@ -781,6 +788,7 @@ extern struct root_domain def_root_domain;
 extern struct mutex sched_domains_mutex;
 extern void init_defrootdomain(void);
 extern void init_max_cpu_capacity(struct max_cpu_capacity *mcc);
 extern int sched_init_domains(const struct cpumask *cpu_map);
 extern void rq_attach_root(struct rq *rq, struct root_domain *rd);
 extern void sched_get_rd(struct root_domain *rd);
@@ -1942,6 +1950,15 @@ unsigned long arch_scale_freq_capacity(int cpu)
 }
 #endif
 #ifndef arch_scale_max_freq_capacity
 struct sched_domain;
 /*
 * Fallback used when the architecture does not define
 * arch_scale_max_freq_capacity: always reports SCHED_CAPACITY_SCALE,
 * whatever @sd and @cpu are.
 */
 static __always_inline unsigned long
 arch_scale_max_freq_capacity(struct sched_domain *sd, int cpu)
 {
 	return SCHED_CAPACITY_SCALE;
 }
 #endif
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -510,6 +510,9 @@ static int init_rootdomain(struct root_domain *rd)
 	if (cpupri_init(&rd->cpupri) != 0)
 		goto free_cpudl;
 	init_max_cpu_capacity(&rd->max_cpu_capacity);
 	return 0;
 free_cpudl:
@@ -1930,7 +1933,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	enum s_alloc alloc_state;
 	struct sched_domain *sd;
 	struct s_data d;
 	struct rq *rq = NULL;
 	int i, ret = -ENOMEM;
 	struct sched_domain_topology_level *tl_asym;
 	bool has_asym = false;
@@ -1993,13 +1995,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	/* Attach the domains */
 	rcu_read_lock();
 	for_each_cpu(i, cpu_map) {
 		rq = cpu_rq(i);
 		sd = *per_cpu_ptr(d.sd, i);
 		/* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */
 		if (rq->cpu_capacity_orig > READ_ONCE(d.rd->max_cpu_capacity))
 			WRITE_ONCE(d.rd->max_cpu_capacity, rq->cpu_capacity_orig);
 		cpu_attach_domain(sd, d.rd, i);
 	}
 	rcu_read_unlock();
@@ -2007,11 +2003,6 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
 	if (has_asym)
 		static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
 	if (rq && sched_debug_enabled) {
 		pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
 			cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
 	}
 	ret = 0;
 error:
 	__free_domain_allocs(&d, alloc_state, cpu_map);