From eda09706b240ca9129ac4e1fbb4eb1e2bc67aadc Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Michal=20Koutn=C3=BD?= Date: Wed, 3 Nov 2021 17:58:45 +0100 Subject: [PATCH 01/49] cgroup: rstat: Mark benign data race to silence KCSAN MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There is a race between updaters and flushers (flush can possibly miss the latest update(s)). This is expected as explained in cgroup_rstat_updated() comment, add also machine readable annotation so that KCSAN results aren't noisy. Reported-by: Hao Sun Link: https://lore.kernel.org/r/CACkBjsbPVdkub=e-E-p1WBOLxS515ith-53SFdmFHWV_QMo40w@mail.gmail.com Suggested-by: Hao Sun Signed-off-by: Michal Koutný Reviewed-by: Shakeel Butt Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1486768f2318..1abe74114527 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -35,7 +35,7 @@ void cgroup_rstat_updated(struct cgroup *cgrp, int cpu) * instead of NULL, we can tell whether @cgrp is on the list by * testing the next pointer for NULL. */ - if (cgroup_rstat_cpu(cgrp, cpu)->updated_next) + if (data_race(cgroup_rstat_cpu(cgrp, cpu)->updated_next)) return; raw_spin_lock_irqsave(cpu_lock, flags); From 764cedc5638b0565de140575b90ce24983f2b664 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 18 Nov 2021 10:37:20 -0800 Subject: [PATCH 02/49] thermal: int340x: Use struct_group() for memcpy() region In preparation for FORTIFY_SOURCE performing compile-time and run-time field bounds checking for memcpy(), avoid intentionally writing across neighboring fields. Use struct_group() in struct art around members weight, and ac[0-9]_max, so they can be referenced together. This will allow memcpy() and sizeof() to more easily reason about sizes, improve readability, and avoid future warnings about writing beyond the end of weight. 
"pahole" shows no size nor member offset changes to struct art. "objdump -d" shows no meaningful object code changes (i.e. only source line number induced differences). Signed-off-by: Kees Cook Reviewed-by: Srinivas Pandruvada Signed-off-by: Rafael J. Wysocki --- .../intel/int340x_thermal/acpi_thermal_rel.c | 5 +- .../intel/int340x_thermal/acpi_thermal_rel.h | 48 ++++++++++--------- 2 files changed, 29 insertions(+), 24 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c b/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c index a478cff8162a..e90690a234c4 100644 --- a/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c +++ b/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.c @@ -250,8 +250,9 @@ static int fill_art(char __user *ubuf) get_single_name(arts[i].source, art_user[i].source_device); get_single_name(arts[i].target, art_user[i].target_device); /* copy the rest int data in addition to source and target */ - memcpy(&art_user[i].weight, &arts[i].weight, - sizeof(u64) * (ACPI_NR_ART_ELEMENTS - 2)); + BUILD_BUG_ON(sizeof(art_user[i].data) != + sizeof(u64) * (ACPI_NR_ART_ELEMENTS - 2)); + memcpy(&art_user[i].data, &arts[i].data, sizeof(art_user[i].data)); } if (copy_to_user(ubuf, art_user, art_len)) diff --git a/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.h b/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.h index 58822575fd54..78d942477035 100644 --- a/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.h +++ b/drivers/thermal/intel/int340x_thermal/acpi_thermal_rel.h @@ -17,17 +17,19 @@ struct art { acpi_handle source; acpi_handle target; - u64 weight; - u64 ac0_max; - u64 ac1_max; - u64 ac2_max; - u64 ac3_max; - u64 ac4_max; - u64 ac5_max; - u64 ac6_max; - u64 ac7_max; - u64 ac8_max; - u64 ac9_max; + struct_group(data, + u64 weight; + u64 ac0_max; + u64 ac1_max; + u64 ac2_max; + u64 ac3_max; + u64 ac4_max; + u64 ac5_max; + u64 ac6_max; + u64 ac7_max; + u64 ac8_max; + u64 ac9_max; + ); } 
__packed; struct trt { @@ -47,17 +49,19 @@ union art_object { struct { char source_device[8]; /* ACPI single name */ char target_device[8]; /* ACPI single name */ - u64 weight; - u64 ac0_max_level; - u64 ac1_max_level; - u64 ac2_max_level; - u64 ac3_max_level; - u64 ac4_max_level; - u64 ac5_max_level; - u64 ac6_max_level; - u64 ac7_max_level; - u64 ac8_max_level; - u64 ac9_max_level; + struct_group(data, + u64 weight; + u64 ac0_max_level; + u64 ac1_max_level; + u64 ac2_max_level; + u64 ac3_max_level; + u64 ac4_max_level; + u64 ac5_max_level; + u64 ac6_max_level; + u64 ac7_max_level; + u64 ac8_max_level; + u64 ac9_max_level; + ); }; u64 __data[ACPI_NR_ART_ELEMENTS]; }; From 0c9e032a45e777b707bd8ae282242a2b9be3ac91 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 15 Nov 2021 15:09:59 +0300 Subject: [PATCH 03/49] PCI: Convert to device_create_managed_software_node() In quirk_huawei_pcie_sva(), device_add_properties() is used to inject additional device properties, but there is no device_remove_properties() call anywhere to remove those properties. The assumption is most likely that the device is never removed, and that the properties therefore never need to be removed either. Even though it is unlikely that the device is ever removed in this case, it is safer to make sure that the properties are also removed if the device ever does get unregistered. To achieve this, instead of adding a separate quirk for the case of device removal where device_remove_properties() is called, use device_create_managed_software_node() instead of device_add_properties(). Both functions create a software node (a type of fwnode) that holds the device properties, which is then assigned to the device very much the same way. The difference between the two functions is that device_create_managed_software_node() guarantees that the software node (together with the properties) is removed when the device is removed. 
The function device_add_properties() does not guarantee that, so the properties added with it should always be removed with device_remove_properties(). Reviewed-by: Andy Shevchenko Acked-by: Zhangfei Gao Acked-by: Bjorn Helgaas Signed-off-by: Heikki Krogerus Signed-off-by: Rafael J. Wysocki --- drivers/pci/quirks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c index 003950c738d2..b4cb658cce2b 100644 --- a/drivers/pci/quirks.c +++ b/drivers/pci/quirks.c @@ -1850,7 +1850,7 @@ static void quirk_huawei_pcie_sva(struct pci_dev *pdev) * can set it directly. */ if (!pdev->dev.of_node && - device_add_properties(&pdev->dev, properties)) + device_create_managed_software_node(&pdev->dev, properties, NULL)) pci_warn(pdev, "could not add stall property"); } DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_HUAWEI, 0xa250, quirk_huawei_pcie_sva); From 982b94ba0983bee504cdd317428cc012c8405c78 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 15 Nov 2021 15:10:00 +0300 Subject: [PATCH 04/49] driver core: Don't call device_remove_properties() from device_del() All the drivers that relied on device_del() to call device_remove_properties() have now been converted to either use device_create_managed_software_node() instead of device_add_properties(), or to register the software node completely separately from the device. This will make it finally possible to share and reuse the software nodes that hold the additional device properties. Signed-off-by: Heikki Krogerus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- drivers/base/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index fd034d742447..a40b6fb1ebb0 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -3582,7 +3582,6 @@ void device_del(struct device *dev) device_pm_remove(dev); driver_deferred_probe_del(dev); device_platform_notify_remove(dev); - device_remove_properties(dev); device_links_purge(dev); if (dev->bus) From 2338e7bcef445059a99848a3eddde0b556277663 Mon Sep 17 00:00:00 2001 From: Heikki Krogerus Date: Mon, 15 Nov 2021 15:10:01 +0300 Subject: [PATCH 05/49] device property: Remove device_add_properties() API There are no more users for it. Reviewed-by: Andy Shevchenko Signed-off-by: Heikki Krogerus Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 48 ---------------------------------------- include/linux/property.h | 4 ---- 2 files changed, 52 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index f1f35b48ab8b..d0960a9e8974 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -507,54 +507,6 @@ struct fwnode_handle *fwnode_find_reference(const struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(fwnode_find_reference); -/** - * device_remove_properties - Remove properties from a device object. - * @dev: Device whose properties to remove. - * - * The function removes properties previously associated to the device - * firmware node with device_add_properties(). Memory allocated to the - * properties will also be released. - */ -void device_remove_properties(struct device *dev) -{ - struct fwnode_handle *fwnode = dev_fwnode(dev); - - if (!fwnode) - return; - - if (is_software_node(fwnode->secondary)) { - fwnode_remove_software_node(fwnode->secondary); - set_secondary_fwnode(dev, NULL); - } -} -EXPORT_SYMBOL_GPL(device_remove_properties); - -/** - * device_add_properties - Add a collection of properties to a device object. - * @dev: Device to add properties to. 
- * @properties: Collection of properties to add. - * - * Associate a collection of device properties represented by @properties with - * @dev. The function takes a copy of @properties. - * - * WARNING: The callers should not use this function if it is known that there - * is no real firmware node associated with @dev! In that case the callers - * should create a software node and assign it to @dev directly. - */ -int device_add_properties(struct device *dev, - const struct property_entry *properties) -{ - struct fwnode_handle *fwnode; - - fwnode = fwnode_create_software_node(properties, NULL); - if (IS_ERR(fwnode)) - return PTR_ERR(fwnode); - - set_secondary_fwnode(dev, fwnode); - return 0; -} -EXPORT_SYMBOL_GPL(device_add_properties); - /** * fwnode_get_name - Return the name of a node * @fwnode: The firmware node diff --git a/include/linux/property.h b/include/linux/property.h index 88fa726a76df..16f736c698a2 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -378,10 +378,6 @@ property_entries_dup(const struct property_entry *properties); void property_entries_free(const struct property_entry *properties); -int device_add_properties(struct device *dev, - const struct property_entry *properties); -void device_remove_properties(struct device *dev); - bool device_dma_supported(struct device *dev); enum dev_dma_attr device_get_dma_attr(struct device *dev); From d7fbdc575b33c374ce88ccfe3ab364f7eb240f8a Mon Sep 17 00:00:00 2001 From: ran jianping Date: Fri, 12 Nov 2021 09:09:46 +0000 Subject: [PATCH 06/49] thermal: tools: tmon: remove unneeded local variable Fix the following coccicheck review: /tools/thermal/tmon/pid.c:57:5-8: Unneeded variable Remove unneeded variable used to store return value. Reported-by: Zeal Robot Signed-off-by: ran jianping Signed-off-by: Rafael J. 
Wysocki --- tools/thermal/tmon/pid.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/thermal/tmon/pid.c b/tools/thermal/tmon/pid.c index c54edb4f630c..296f69c00c57 100644 --- a/tools/thermal/tmon/pid.c +++ b/tools/thermal/tmon/pid.c @@ -54,7 +54,6 @@ static double xk_1, xk_2; /* input temperature x[k-#] */ */ int init_thermal_controller(void) { - int ret = 0; /* init pid params */ p_param.ts = ticktime; @@ -65,7 +64,7 @@ int init_thermal_controller(void) p_param.t_target = target_temp_user; - return ret; + return 0; } void controller_reset(void) From 8291471ea5f1b2e6782cbb9c6ed785f12435245f Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 27 Nov 2021 14:59:19 +0000 Subject: [PATCH 07/49] cgroup: get the wrong css for css_alloc() during cgroup_init_subsys() css_alloc() needs the parent css, while cgroup_css() gets the current cgroup's css. So we are getting the wrong css during cgroup_init_subsys(). Fortunately, cgrp_dfl_root.cgrp's css is not set yet, so the value we pass to css_alloc() is NULL anyway. Let's pass NULL directly during init, since we know there is no parent yet. 
Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index 919194de39c8..f522cee8e650 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -5711,7 +5711,7 @@ static void __init cgroup_init_subsys(struct cgroup_subsys *ss, bool early) /* Create the root cgroup state for this subsystem */ ss->root = &cgrp_dfl_root; - css = ss->css_alloc(cgroup_css(&cgrp_dfl_root.cgrp, ss)); + css = ss->css_alloc(NULL); /* We don't handle early failures gracefully */ BUG_ON(IS_ERR(css)); init_and_link_css(css, ss, &cgrp_dfl_root.cgrp); From 4cf2ddf16e175ee18c5c29865c32da7d6269cf44 Mon Sep 17 00:00:00 2001 From: Oleksij Rempel Date: Wed, 17 Nov 2021 11:34:26 +0100 Subject: [PATCH 08/49] thermal/drivers/imx: Implement runtime PM support MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Starting with commit d92ed2c9d3ff ("thermal: imx: Use driver's local data to decide whether to run a measurement") this driver started using the irq_enabled flag to decide whether to power the thermal core on or off. This triggered a regression, where after reaching the critical temperature, the alarm IRQ handler set irq_enabled to false, disabled the thermal core and was then not able to read the temperature and disable the cooling sequence. In case the cooling device is "CPU/GPU freq", the system will run with reduced performance until the next reboot. To solve this issue, we need to move all parts implementing hand-made runtime power management into runtime PM callbacks and let the actual runtime PM framework handle them. 
Fixes: d92ed2c9d3ff ("thermal: imx: Use driver's local data to decide whether to run a measurement") Signed-off-by: Oleksij Rempel Tested-by: Petr Beneš Link: https://lore.kernel.org/r/20211117103426.81813-1-o.rempel@pengutronix.de Signed-off-by: Daniel Lezcano --- drivers/thermal/imx_thermal.c | 147 +++++++++++++++++++++------------- 1 file changed, 92 insertions(+), 55 deletions(-) diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c index 2c7473d86a59..16663373b682 100644 --- a/drivers/thermal/imx_thermal.c +++ b/drivers/thermal/imx_thermal.c @@ -15,6 +15,7 @@ #include #include #include +#include #define REG_SET 0x4 #define REG_CLR 0x8 @@ -194,6 +195,7 @@ static struct thermal_soc_data thermal_imx7d_data = { }; struct imx_thermal_data { + struct device *dev; struct cpufreq_policy *policy; struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; @@ -252,44 +254,15 @@ static int imx_get_temp(struct thermal_zone_device *tz, int *temp) const struct thermal_soc_data *soc_data = data->socdata; struct regmap *map = data->tempmon; unsigned int n_meas; - bool wait, run_measurement; u32 val; + int ret; - run_measurement = !data->irq_enabled; - if (!run_measurement) { - /* Check if a measurement is currently in progress */ - regmap_read(map, soc_data->temp_data, &val); - wait = !(val & soc_data->temp_valid_mask); - } else { - /* - * Every time we measure the temperature, we will power on the - * temperature sensor, enable measurements, take a reading, - * disable measurements, power off the temperature sensor. - */ - regmap_write(map, soc_data->sensor_ctrl + REG_CLR, - soc_data->power_down_mask); - regmap_write(map, soc_data->sensor_ctrl + REG_SET, - soc_data->measure_temp_mask); - - wait = true; - } - - /* - * According to the temp sensor designers, it may require up to ~17us - * to complete a measurement. 
- */ - if (wait) - usleep_range(20, 50); + ret = pm_runtime_resume_and_get(data->dev); + if (ret < 0) + return ret; regmap_read(map, soc_data->temp_data, &val); - if (run_measurement) { - regmap_write(map, soc_data->sensor_ctrl + REG_CLR, - soc_data->measure_temp_mask); - regmap_write(map, soc_data->sensor_ctrl + REG_SET, - soc_data->power_down_mask); - } - if ((val & soc_data->temp_valid_mask) == 0) { dev_dbg(&tz->device, "temp measurement never finished\n"); return -EAGAIN; @@ -328,6 +301,8 @@ static int imx_get_temp(struct thermal_zone_device *tz, int *temp) enable_irq(data->irq); } + pm_runtime_put(data->dev); + return 0; } @@ -335,24 +310,16 @@ static int imx_change_mode(struct thermal_zone_device *tz, enum thermal_device_mode mode) { struct imx_thermal_data *data = tz->devdata; - struct regmap *map = data->tempmon; - const struct thermal_soc_data *soc_data = data->socdata; if (mode == THERMAL_DEVICE_ENABLED) { - regmap_write(map, soc_data->sensor_ctrl + REG_CLR, - soc_data->power_down_mask); - regmap_write(map, soc_data->sensor_ctrl + REG_SET, - soc_data->measure_temp_mask); + pm_runtime_get(data->dev); if (!data->irq_enabled) { data->irq_enabled = true; enable_irq(data->irq); } } else { - regmap_write(map, soc_data->sensor_ctrl + REG_CLR, - soc_data->measure_temp_mask); - regmap_write(map, soc_data->sensor_ctrl + REG_SET, - soc_data->power_down_mask); + pm_runtime_put(data->dev); if (data->irq_enabled) { disable_irq(data->irq); @@ -393,6 +360,11 @@ static int imx_set_trip_temp(struct thermal_zone_device *tz, int trip, int temp) { struct imx_thermal_data *data = tz->devdata; + int ret; + + ret = pm_runtime_resume_and_get(data->dev); + if (ret < 0) + return ret; /* do not allow changing critical threshold */ if (trip == IMX_TRIP_CRITICAL) @@ -406,6 +378,8 @@ static int imx_set_trip_temp(struct thermal_zone_device *tz, int trip, imx_set_alarm_temp(data, temp); + pm_runtime_put(data->dev); + return 0; } @@ -681,6 +655,8 @@ static int imx_thermal_probe(struct 
platform_device *pdev) if (!data) return -ENOMEM; + data->dev = &pdev->dev; + map = syscon_regmap_lookup_by_phandle(pdev->dev.of_node, "fsl,tempmon"); if (IS_ERR(map)) { ret = PTR_ERR(map); @@ -800,6 +776,16 @@ static int imx_thermal_probe(struct platform_device *pdev) data->socdata->power_down_mask); regmap_write(map, data->socdata->sensor_ctrl + REG_SET, data->socdata->measure_temp_mask); + /* After power up, we need a delay before first access can be done. */ + usleep_range(20, 50); + + /* the core was configured and enabled just before */ + pm_runtime_set_active(&pdev->dev); + pm_runtime_enable(data->dev); + + ret = pm_runtime_resume_and_get(data->dev); + if (ret < 0) + goto disable_runtime_pm; data->irq_enabled = true; ret = thermal_zone_device_enable(data->tz); @@ -814,10 +800,15 @@ static int imx_thermal_probe(struct platform_device *pdev) goto thermal_zone_unregister; } + pm_runtime_put(data->dev); + return 0; thermal_zone_unregister: thermal_zone_device_unregister(data->tz); +disable_runtime_pm: + pm_runtime_put_noidle(data->dev); + pm_runtime_disable(data->dev); clk_disable: clk_disable_unprepare(data->thermal_clk); legacy_cleanup: @@ -829,13 +820,9 @@ legacy_cleanup: static int imx_thermal_remove(struct platform_device *pdev) { struct imx_thermal_data *data = platform_get_drvdata(pdev); - struct regmap *map = data->tempmon; - /* Disable measurements */ - regmap_write(map, data->socdata->sensor_ctrl + REG_SET, - data->socdata->power_down_mask); - if (!IS_ERR(data->thermal_clk)) - clk_disable_unprepare(data->thermal_clk); + pm_runtime_put_noidle(data->dev); + pm_runtime_disable(data->dev); thermal_zone_device_unregister(data->tz); imx_thermal_unregister_legacy_cooling(data); @@ -858,9 +845,8 @@ static int __maybe_unused imx_thermal_suspend(struct device *dev) ret = thermal_zone_device_disable(data->tz); if (ret) return ret; - clk_disable_unprepare(data->thermal_clk); - return 0; + return pm_runtime_force_suspend(data->dev); } static int __maybe_unused 
imx_thermal_resume(struct device *dev) @@ -868,19 +854,70 @@ static int __maybe_unused imx_thermal_resume(struct device *dev) struct imx_thermal_data *data = dev_get_drvdata(dev); int ret; - ret = clk_prepare_enable(data->thermal_clk); + ret = pm_runtime_force_resume(data->dev); if (ret) return ret; /* Enabled thermal sensor after resume */ - ret = thermal_zone_device_enable(data->tz); + return thermal_zone_device_enable(data->tz); +} + +static int __maybe_unused imx_thermal_runtime_suspend(struct device *dev) +{ + struct imx_thermal_data *data = dev_get_drvdata(dev); + const struct thermal_soc_data *socdata = data->socdata; + struct regmap *map = data->tempmon; + int ret; + + ret = regmap_write(map, socdata->sensor_ctrl + REG_CLR, + socdata->measure_temp_mask); if (ret) return ret; + ret = regmap_write(map, socdata->sensor_ctrl + REG_SET, + socdata->power_down_mask); + if (ret) + return ret; + + clk_disable_unprepare(data->thermal_clk); + return 0; } -static SIMPLE_DEV_PM_OPS(imx_thermal_pm_ops, - imx_thermal_suspend, imx_thermal_resume); +static int __maybe_unused imx_thermal_runtime_resume(struct device *dev) +{ + struct imx_thermal_data *data = dev_get_drvdata(dev); + const struct thermal_soc_data *socdata = data->socdata; + struct regmap *map = data->tempmon; + int ret; + + ret = clk_prepare_enable(data->thermal_clk); + if (ret) + return ret; + + ret = regmap_write(map, socdata->sensor_ctrl + REG_CLR, + socdata->power_down_mask); + if (ret) + return ret; + + ret = regmap_write(map, socdata->sensor_ctrl + REG_SET, + socdata->measure_temp_mask); + if (ret) + return ret; + + /* + * According to the temp sensor designers, it may require up to ~17us + * to complete a measurement. 
+ */ + usleep_range(20, 50); + + return 0; +} + +static const struct dev_pm_ops imx_thermal_pm_ops = { + SET_SYSTEM_SLEEP_PM_OPS(imx_thermal_suspend, imx_thermal_resume) + SET_RUNTIME_PM_OPS(imx_thermal_runtime_suspend, + imx_thermal_runtime_resume, NULL) +}; static struct platform_driver imx_thermal = { .driver = { From 8152d2a9e73deb2b483bef9035d53ded27f50317 Mon Sep 17 00:00:00 2001 From: Rikard Falkeborn Date: Sun, 28 Nov 2021 22:46:41 +0100 Subject: [PATCH 09/49] thermal/drivers/intel_powerclamp: Constify static thermal_cooling_device_ops The only usage of powerclamp_cooling_ops is to pass its address to thermal_cooling_device_register(), which takes a pointer to const struct thermal_cooling_device_ops. Make it const to allow the compiler to put it in read-only memory. Signed-off-by: Rikard Falkeborn Link: https://lore.kernel.org/r/20211128214641.30953-1-rikard.falkeborn@gmail.com Signed-off-by: Daniel Lezcano --- drivers/thermal/intel/intel_powerclamp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/thermal/intel/intel_powerclamp.c b/drivers/thermal/intel/intel_powerclamp.c index 9b68489a2356..14256421d98c 100644 --- a/drivers/thermal/intel/intel_powerclamp.c +++ b/drivers/thermal/intel/intel_powerclamp.c @@ -641,7 +641,7 @@ exit_set: } /* bind to generic thermal layer as cooling device*/ -static struct thermal_cooling_device_ops powerclamp_cooling_ops = { +static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { .get_max_state = powerclamp_get_max_state, .get_cur_state = powerclamp_get_cur_state, .set_cur_state = powerclamp_set_cur_state, From 9460347192add5644236d492f79ecab6d83504d4 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 30 Nov 2021 15:57:56 +0000 Subject: [PATCH 10/49] dt-bindings: thermal: Document Renesas RZ/G2L TSU Document the Thermal Sensor Unit(TSU) in the RZ/G2L SoC. 
Signed-off-by: Biju Das Reviewed-by: Lad Prabhakar Reviewed-by: Rob Herring Link: https://lore.kernel.org/r/20211130155757.17837-2-biju.das.jz@bp.renesas.com Signed-off-by: Daniel Lezcano --- .../bindings/thermal/rzg2l-thermal.yaml | 76 +++++++++++++++++++ 1 file changed, 76 insertions(+) create mode 100644 Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml diff --git a/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml b/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml new file mode 100644 index 000000000000..ccab9511a042 --- /dev/null +++ b/Documentation/devicetree/bindings/thermal/rzg2l-thermal.yaml @@ -0,0 +1,76 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/thermal/rzg2l-thermal.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Renesas RZ/G2L Thermal Sensor Unit + +description: + On RZ/G2L SoCs, the thermal sensor unit (TSU) measures the + temperature(Tj) inside the LSI. 
+ +maintainers: + - Biju Das + +properties: + compatible: + items: + - enum: + - renesas,r9a07g044-tsu # RZ/G2{L,LC} + - const: renesas,rzg2l-tsu + + reg: + maxItems: 1 + + clocks: + maxItems: 1 + + power-domains: + maxItems: 1 + + resets: + maxItems: 1 + + "#thermal-sensor-cells": + const: 1 + +required: + - compatible + - reg + - clocks + - power-domains + - resets + - "#thermal-sensor-cells" + +additionalProperties: false + +examples: + - | + #include + + tsu: thermal@10059400 { + compatible = "renesas,r9a07g044-tsu", + "renesas,rzg2l-tsu"; + reg = <0x10059400 0x400>; + clocks = <&cpg CPG_MOD R9A07G044_TSU_PCLK>; + resets = <&cpg R9A07G044_TSU_PRESETN>; + power-domains = <&cpg>; + #thermal-sensor-cells = <1>; + }; + + thermal-zones { + cpu-thermal { + polling-delay-passive = <250>; + polling-delay = <1000>; + thermal-sensors = <&tsu 0>; + + trips { + sensor_crit: sensor-crit { + temperature = <125000>; + hysteresis = <1000>; + type = "critical"; + }; + }; + }; + }; From 673c68bd48390dad01f7d17670de3e33b60860ac Mon Sep 17 00:00:00 2001 From: Biju Das Date: Tue, 30 Nov 2021 15:57:57 +0000 Subject: [PATCH 11/49] thermal/drivers: Add TSU driver for RZ/G2L MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RZ/G2L SoC incorporates a thermal sensor unit (TSU) that measures the temperature inside the LSI. The thermal sensor in this unit measures temperatures in the range from −40 degree Celsius to 125 degree Celsius with an accuracy of ±3°C. The TSU repeats measurement at 20 microseconds intervals and automatically updates the results of measurement. The TSU has no interrupts as well as no external pins. This patch adds Thermal Sensor Unit(TSU) driver for RZ/G2L SoC. 
Signed-off-by: Biju Das Reviewed-by: Lad Prabhakar Link: https://lore.kernel.org/r/20211130155757.17837-3-biju.das.jz@bp.renesas.com Signed-off-by: Daniel Lezcano --- drivers/thermal/Kconfig | 9 ++ drivers/thermal/Makefile | 1 + drivers/thermal/rzg2l_thermal.c | 240 ++++++++++++++++++++++++++++++++ 3 files changed, 250 insertions(+) create mode 100644 drivers/thermal/rzg2l_thermal.c diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig index d7f44deab5b1..e37691e0bf20 100644 --- a/drivers/thermal/Kconfig +++ b/drivers/thermal/Kconfig @@ -354,6 +354,15 @@ config RCAR_GEN3_THERMAL Enable this to plug the R-Car Gen3 or RZ/G2 thermal sensor driver into the Linux thermal framework. +config RZG2L_THERMAL + tristate "Renesas RZ/G2L thermal driver" + depends on ARCH_RENESAS || COMPILE_TEST + depends on HAS_IOMEM + depends on OF + help + Enable this to plug the RZ/G2L thermal sensor driver into the Linux + thermal framework. + config KIRKWOOD_THERMAL tristate "Temperature sensor on Marvell Kirkwood SoCs" depends on MACH_KIRKWOOD || COMPILE_TEST diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile index 82fc3e616e54..f0c36a1530d5 100644 --- a/drivers/thermal/Makefile +++ b/drivers/thermal/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_SUN8I_THERMAL) += sun8i_thermal.o obj-$(CONFIG_ROCKCHIP_THERMAL) += rockchip_thermal.o obj-$(CONFIG_RCAR_THERMAL) += rcar_thermal.o obj-$(CONFIG_RCAR_GEN3_THERMAL) += rcar_gen3_thermal.o +obj-$(CONFIG_RZG2L_THERMAL) += rzg2l_thermal.o obj-$(CONFIG_KIRKWOOD_THERMAL) += kirkwood_thermal.o obj-y += samsung/ obj-$(CONFIG_DOVE_THERMAL) += dove_thermal.o diff --git a/drivers/thermal/rzg2l_thermal.c b/drivers/thermal/rzg2l_thermal.c new file mode 100644 index 000000000000..d47d4a30cd6c --- /dev/null +++ b/drivers/thermal/rzg2l_thermal.c @@ -0,0 +1,240 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Renesas RZ/G2L TSU Thermal Sensor Driver + * + * Copyright (C) 2021 Renesas Electronics Corporation + */ +#include +#include +#include 
+#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "thermal_hwmon.h" + +#define CTEMP_MASK 0xFFF + +/* default calibration values, if FUSE values are missing */ +#define SW_CALIB0_VAL 3148 +#define SW_CALIB1_VAL 503 + +/* Register offsets */ +#define TSU_SM 0x00 +#define TSU_ST 0x04 +#define TSU_SAD 0x0C +#define TSU_SS 0x10 + +#define OTPTSUTRIM_REG(n) (0x18 + ((n) * 0x4)) + +/* Sensor Mode Register(TSU_SM) */ +#define TSU_SM_EN_TS BIT(0) +#define TSU_SM_ADC_EN_TS BIT(1) +#define TSU_SM_NORMAL_MODE (TSU_SM_EN_TS | TSU_SM_ADC_EN_TS) + +/* TSU_ST bits */ +#define TSU_ST_START BIT(0) + +#define TSU_SS_CONV_RUNNING BIT(0) + +#define TS_CODE_AVE_SCALE(x) ((x) * 1000000) +#define MCELSIUS(temp) ((temp) * MILLIDEGREE_PER_DEGREE) +#define TS_CODE_CAP_TIMES 8 /* Capture times */ + +#define RZG2L_THERMAL_GRAN 500 /* milli Celsius */ +#define RZG2L_TSU_SS_TIMEOUT_US 1000 + +#define CURVATURE_CORRECTION_CONST 13 + +struct rzg2l_thermal_priv { + struct device *dev; + void __iomem *base; + struct thermal_zone_device *zone; + struct reset_control *rstc; + u32 calib0, calib1; +}; + +static inline u32 rzg2l_thermal_read(struct rzg2l_thermal_priv *priv, u32 reg) +{ + return ioread32(priv->base + reg); +} + +static inline void rzg2l_thermal_write(struct rzg2l_thermal_priv *priv, u32 reg, + u32 data) +{ + iowrite32(data, priv->base + reg); +} + +static int rzg2l_thermal_get_temp(void *devdata, int *temp) +{ + struct rzg2l_thermal_priv *priv = devdata; + u32 result = 0, dsensor, ts_code_ave; + int val, i; + + for (i = 0; i < TS_CODE_CAP_TIMES ; i++) { + /* TSU repeats measurement at 20 microseconds intervals and + * automatically updates the results of measurement. As per + * the HW manual for measuring temperature we need to read 8 + * values consecutively and then take the average. 
+ * ts_code_ave = (ts_code[0] + ⋯ + ts_code[7]) / 8 + */ + result += rzg2l_thermal_read(priv, TSU_SAD) & CTEMP_MASK; + usleep_range(20, 30); + } + + ts_code_ave = result / TS_CODE_CAP_TIMES; + + /* Calculate actual sensor value by applying curvature correction formula + * dsensor = ts_code_ave / (1 + ts_code_ave * 0.000013). Here we are doing + * integer calculation by scaling all the values by 1000000. + */ + dsensor = TS_CODE_AVE_SCALE(ts_code_ave) / + (TS_CODE_AVE_SCALE(1) + (ts_code_ave * CURVATURE_CORRECTION_CONST)); + + /* The temperature Tj is calculated by the formula + * Tj = (dsensor − calib1) * 165/ (calib0 − calib1) − 40 + * where calib0 and calib1 are the caliberation values. + */ + val = ((dsensor - priv->calib1) * (MCELSIUS(165) / + (priv->calib0 - priv->calib1))) - MCELSIUS(40); + + *temp = roundup(val, RZG2L_THERMAL_GRAN); + + return 0; +} + +static const struct thermal_zone_of_device_ops rzg2l_tz_of_ops = { + .get_temp = rzg2l_thermal_get_temp, +}; + +static int rzg2l_thermal_init(struct rzg2l_thermal_priv *priv) +{ + u32 reg_val; + + rzg2l_thermal_write(priv, TSU_SM, TSU_SM_NORMAL_MODE); + rzg2l_thermal_write(priv, TSU_ST, 0); + + /* Before setting the START bit, TSU should be in normal operating + * mode. As per the HW manual, it will take 60 µs to place the TSU + * into normal operating mode. 
+ */ + usleep_range(60, 80); + + reg_val = rzg2l_thermal_read(priv, TSU_ST); + reg_val |= TSU_ST_START; + rzg2l_thermal_write(priv, TSU_ST, reg_val); + + return readl_poll_timeout(priv->base + TSU_SS, reg_val, + reg_val == TSU_SS_CONV_RUNNING, 50, + RZG2L_TSU_SS_TIMEOUT_US); +} + +static void rzg2l_thermal_reset_assert_pm_disable_put(struct platform_device *pdev) +{ + struct rzg2l_thermal_priv *priv = dev_get_drvdata(&pdev->dev); + + pm_runtime_put(&pdev->dev); + pm_runtime_disable(&pdev->dev); + reset_control_assert(priv->rstc); +} + +static int rzg2l_thermal_remove(struct platform_device *pdev) +{ + struct rzg2l_thermal_priv *priv = dev_get_drvdata(&pdev->dev); + + thermal_remove_hwmon_sysfs(priv->zone); + rzg2l_thermal_reset_assert_pm_disable_put(pdev); + + return 0; +} + +static int rzg2l_thermal_probe(struct platform_device *pdev) +{ + struct thermal_zone_device *zone; + struct rzg2l_thermal_priv *priv; + struct device *dev = &pdev->dev; + int ret; + + priv = devm_kzalloc(dev, sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + priv->base = devm_platform_ioremap_resource(pdev, 0); + if (IS_ERR(priv->base)) + return PTR_ERR(priv->base); + + priv->dev = dev; + priv->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL); + if (IS_ERR(priv->rstc)) + return dev_err_probe(dev, PTR_ERR(priv->rstc), + "failed to get cpg reset"); + + reset_control_deassert(priv->rstc); + + pm_runtime_enable(dev); + pm_runtime_get_sync(dev); + + priv->calib0 = rzg2l_thermal_read(priv, OTPTSUTRIM_REG(0)); + if (!priv->calib0) + priv->calib0 = SW_CALIB0_VAL; + + priv->calib1 = rzg2l_thermal_read(priv, OTPTSUTRIM_REG(1)); + if (!priv->calib1) + priv->calib1 = SW_CALIB1_VAL; + + platform_set_drvdata(pdev, priv); + ret = rzg2l_thermal_init(priv); + if (ret) { + dev_err(dev, "Failed to start TSU"); + goto err; + } + + zone = devm_thermal_zone_of_sensor_register(dev, 0, priv, + &rzg2l_tz_of_ops); + if (IS_ERR(zone)) { + dev_err(dev, "Can't register thermal zone"); + ret = 
PTR_ERR(zone); + goto err; + } + + priv->zone = zone; + priv->zone->tzp->no_hwmon = false; + ret = thermal_add_hwmon_sysfs(priv->zone); + if (ret) + goto err; + + dev_dbg(dev, "TSU probed with %s caliberation values", + rzg2l_thermal_read(priv, OTPTSUTRIM_REG(0)) ? "hw" : "sw"); + + return 0; + +err: + rzg2l_thermal_reset_assert_pm_disable_put(pdev); + return ret; +} + +static const struct of_device_id rzg2l_thermal_dt_ids[] = { + { .compatible = "renesas,rzg2l-tsu", }, + { /* sentinel */ } +}; +MODULE_DEVICE_TABLE(of, rzg2l_thermal_dt_ids); + +static struct platform_driver rzg2l_thermal_driver = { + .driver = { + .name = "rzg2l_thermal", + .of_match_table = rzg2l_thermal_dt_ids, + }, + .probe = rzg2l_thermal_probe, + .remove = rzg2l_thermal_remove, +}; +module_platform_driver(rzg2l_thermal_driver); + +MODULE_DESCRIPTION("Renesas RZ/G2L TSU Thermal Sensor Driver"); +MODULE_AUTHOR("Biju Das "); +MODULE_LICENSE("GPL v2"); From 443378f0664a78756c3e3aeaab92750fe1e05735 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Tue, 30 Nov 2021 17:00:30 -0800 Subject: [PATCH 12/49] workqueue: Upgrade queue_work_on() comment The current queue_work_on() docbook comment says that the caller must ensure that the specified CPU can't go away, but does not spell out the consequences, which turn out to be quite mild. Therefore expand this comment to explicitly say that the penalty for failing to nail down the specified CPU is that the workqueue handler might find itself executing on some other CPU. Cc: Tejun Heo Cc: Lai Jiangshan Signed-off-by: Paul E. McKenney Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 613917bbc4e7..332361cf215f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1531,7 +1531,8 @@ out: * @work: work to queue * * We queue the work to a specific CPU, the caller must ensure it - * can't go away. + * can't go away. 
Callers that fail to ensure that the specified + * CPU cannot go away will have their work execute on a randomly chosen CPU. * * Return: %false if @work was already on a queue, %true otherwise. */ From af3bf054661fb11497a7f712ece8b838521227a4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Wed, 1 Dec 2021 01:17:36 +0000 Subject: [PATCH 13/49] cgroup: fix a typo in comment In commit 8699b7762a62 ("cgroup: s/child_subsys_mask/subtree_ss_mask/"), child_subsys_mask was renamed to subtree_ss_mask, but the rename was missed in this comment. Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- include/linux/cgroup-defs.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h index db2e147e069f..bb1e79791ed5 100644 --- a/include/linux/cgroup-defs.h +++ b/include/linux/cgroup-defs.h @@ -413,7 +413,7 @@ struct cgroup { /* * The bitmask of subsystems enabled on the child cgroups. * ->subtree_control is the one configured through - * "cgroup.subtree_control" while ->child_ss_mask is the effective + * "cgroup.subtree_control" while ->subtree_ss_mask is the effective * one which may have more subsystems enabled. Controller knobs * are made available iff it's enabled in ->subtree_control. */ From e14da77113bb890d7bf9e5d17031bdd476a7ce5e Mon Sep 17 00:00:00 2001 From: William Kucharski Date: Wed, 1 Dec 2021 09:56:58 -0700 Subject: [PATCH 14/49] cgroup: Trace event cgroup id fields should be u64 Various trace event fields that store cgroup IDs were declared as ints, but cgroup_id() returns a u64 and the structures and associated TP_printk() calls were not updated to reflect this.
Fixes: 743210386c03 ("cgroup: use cgrp->kn->id as the cgroup ID") Signed-off-by: William Kucharski Reviewed-by: Steven Rostedt (VMware) Signed-off-by: Tejun Heo --- include/trace/events/cgroup.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/trace/events/cgroup.h b/include/trace/events/cgroup.h index 7f42a3de59e6..dd7d7c9efecd 100644 --- a/include/trace/events/cgroup.h +++ b/include/trace/events/cgroup.h @@ -59,8 +59,8 @@ DECLARE_EVENT_CLASS(cgroup, TP_STRUCT__entry( __field( int, root ) - __field( int, id ) __field( int, level ) + __field( u64, id ) __string( path, path ) ), @@ -71,7 +71,7 @@ DECLARE_EVENT_CLASS(cgroup, __assign_str(path, path); ), - TP_printk("root=%d id=%d level=%d path=%s", + TP_printk("root=%d id=%llu level=%d path=%s", __entry->root, __entry->id, __entry->level, __get_str(path)) ); @@ -126,8 +126,8 @@ DECLARE_EVENT_CLASS(cgroup_migrate, TP_STRUCT__entry( __field( int, dst_root ) - __field( int, dst_id ) __field( int, dst_level ) + __field( u64, dst_id ) __field( int, pid ) __string( dst_path, path ) __string( comm, task->comm ) @@ -142,7 +142,7 @@ DECLARE_EVENT_CLASS(cgroup_migrate, __assign_str(comm, task->comm); ), - TP_printk("dst_root=%d dst_id=%d dst_level=%d dst_path=%s pid=%d comm=%s", + TP_printk("dst_root=%d dst_id=%llu dst_level=%d dst_path=%s pid=%d comm=%s", __entry->dst_root, __entry->dst_id, __entry->dst_level, __get_str(dst_path), __entry->pid, __get_str(comm)) ); @@ -171,8 +171,8 @@ DECLARE_EVENT_CLASS(cgroup_event, TP_STRUCT__entry( __field( int, root ) - __field( int, id ) __field( int, level ) + __field( u64, id ) __string( path, path ) __field( int, val ) ), @@ -185,7 +185,7 @@ DECLARE_EVENT_CLASS(cgroup_event, __entry->val = val; ), - TP_printk("root=%d id=%d level=%d path=%s val=%d", + TP_printk("root=%d id=%llu level=%d path=%s val=%d", __entry->root, __entry->id, __entry->level, __get_str(path), __entry->val) ); From 3de89d8842a2b5d3dd22ebf97dd561ae0a330948 Mon Sep 17 00:00:00 
2001 From: Paul Gerber Date: Mon, 22 Nov 2021 12:42:25 +0100 Subject: [PATCH 15/49] thermal/drivers/imx8mm: Enable ADC when enabling monitor The i.MX 8MP has a ADC_PD bit in the TMU_TER register that controls the operating mode of the ADC: * 0 means normal operating mode * 1 means power down mode When enabling/disabling the TMU, the ADC operating mode must be set accordingly. i.MX 8M Mini & Nano are lacking this bit. Signed-off-by: Paul Gerber Signed-off-by: Alexander Stein Fixes: 2b8f1f0337c5 ("thermal: imx8mm: Add i.MX8MP support") Link: https://lore.kernel.org/r/20211122114225.196280-1-alexander.stein@ew.tq-group.com Signed-off-by: Daniel Lezcano --- drivers/thermal/imx8mm_thermal.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/thermal/imx8mm_thermal.c b/drivers/thermal/imx8mm_thermal.c index 7442e013738f..af666bd9e8d4 100644 --- a/drivers/thermal/imx8mm_thermal.c +++ b/drivers/thermal/imx8mm_thermal.c @@ -21,6 +21,7 @@ #define TPS 0x4 #define TRITSR 0x20 /* TMU immediate temp */ +#define TER_ADC_PD BIT(30) #define TER_EN BIT(31) #define TRITSR_TEMP0_VAL_MASK 0xff #define TRITSR_TEMP1_VAL_MASK 0xff0000 @@ -113,6 +114,8 @@ static void imx8mm_tmu_enable(struct imx8mm_tmu *tmu, bool enable) val = readl_relaxed(tmu->base + TER); val = enable ? (val | TER_EN) : (val & ~TER_EN); + if (tmu->socdata->version == TMU_VER2) + val = enable ? (val & ~TER_ADC_PD) : (val | TER_ADC_PD); writel_relaxed(val, tmu->base + TER); } From 07edfece8bcb0580a1828d939e6f8d91a8603eb2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 1 Dec 2021 16:19:44 +0100 Subject: [PATCH 16/49] workqueue: Fix unbind_workers() VS wq_worker_running() race At CPU-hotplug time, unbind_worker() may preempt a worker while it is waking up. In that case the following scenario can happen: unbind_workers() wq_worker_running() -------------- ------------------- if (!(worker->flags & WORKER_NOT_RUNNING)) //PREEMPTED by unbind_workers worker->flags |= WORKER_UNBOUND; [...] 
atomic_set(&pool->nr_running, 0); //resume to worker atomic_inc(&worker->pool->nr_running); After unbind_worker() resets pool->nr_running, the value is expected to remain 0 until the pool ever gets rebound in case cpu_up() is called on the target CPU in the future. But here the race leaves pool->nr_running with a value of 1, triggering the following warning when the worker goes idle: WARNING: CPU: 3 PID: 34 at kernel/workqueue.c:1823 worker_enter_idle+0x95/0xc0 Modules linked in: CPU: 3 PID: 34 Comm: kworker/3:0 Not tainted 5.16.0-rc1+ #34 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 Workqueue: 0x0 (rcu_par_gp) RIP: 0010:worker_enter_idle+0x95/0xc0 Code: 04 85 f8 ff ff ff 39 c1 7f 09 48 8b 43 50 48 85 c0 74 1b 83 e2 04 75 99 8b 43 34 39 43 30 75 91 8b 83 00 03 00 00 85 c0 74 87 <0f> 0b 5b c3 48 8b 35 70 f1 37 01 48 8d 7b 48 48 81 c6 e0 93 0 RSP: 0000:ffff9b7680277ed0 EFLAGS: 00010086 RAX: 00000000ffffffff RBX: ffff93465eae9c00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9346418a0000 RDI: ffff934641057140 RBP: ffff934641057170 R08: 0000000000000001 R09: ffff9346418a0080 R10: ffff9b768027fdf0 R11: 0000000000002400 R12: ffff93465eae9c20 R13: ffff93465eae9c20 R14: ffff93465eae9c70 R15: ffff934641057140 FS: 0000000000000000(0000) GS:ffff93465eac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000001cc0c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: worker_thread+0x89/0x3d0 ? process_one_work+0x400/0x400 kthread+0x162/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Also due to this incorrect "nr_running == 1", further queued work may end up not being served, because no worker is awaken at work insert time. This raises rcutorture writer stalls for example. 
Fix this with disabling preemption in the right place in wq_worker_running(). It's worth noting that if the worker migrates and runs concurrently with unbind_workers(), it is guaranteed to see the WORKER_UNBOUND flag update due to set_cpus_allowed_ptr() acquiring/releasing rq->lock. Fixes: 6d25be5782e4 ("sched/core, workqueues: Distangle worker accounting from rq lock") Reviewed-by: Lai Jiangshan Tested-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Signed-off-by: Tejun Heo --- kernel/workqueue.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 332361cf215f..5094573e8b45 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -868,8 +868,17 @@ void wq_worker_running(struct task_struct *task) if (!worker->sleeping) return; + + /* + * If preempted by unbind_workers() between the WORKER_NOT_RUNNING check + * and the nr_running increment below, we may ruin the nr_running reset + * and leave with an unexpected pool->nr_running == 1 on the newly unbound + * pool. Protect against such race. + */ + preempt_disable(); if (!(worker->flags & WORKER_NOT_RUNNING)) atomic_inc(&worker->pool->nr_running); + preempt_enable(); worker->sleeping = 0; } From 45c753f5f24d2d4717acb38ce35e604ff9abcb50 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 1 Dec 2021 16:19:45 +0100 Subject: [PATCH 17/49] workqueue: Fix unbind_workers() VS wq_worker_sleeping() race At CPU-hotplug time, unbind_workers() may preempt a worker while it is going to sleep. In that case the following scenario can happen: unbind_workers() wq_worker_sleeping() -------------- ------------------- if (worker->flags & WORKER_NOT_RUNNING) return; //PREEMPTED by unbind_workers worker->flags |= WORKER_UNBOUND; [...] 
atomic_set(&pool->nr_running, 0); //resume to worker atomic_dec_and_test(&pool->nr_running); After unbind_worker() resets pool->nr_running, the value is expected to remain 0 until the pool ever gets rebound in case cpu_up() is called on the target CPU in the future. But here the race leaves pool->nr_running with a value of -1, triggering the following warning when the worker goes idle: WARNING: CPU: 3 PID: 34 at kernel/workqueue.c:1823 worker_enter_idle+0x95/0xc0 Modules linked in: CPU: 3 PID: 34 Comm: kworker/3:0 Not tainted 5.16.0-rc1+ #34 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 Workqueue: 0x0 (rcu_par_gp) RIP: 0010:worker_enter_idle+0x95/0xc0 Code: 04 85 f8 ff ff ff 39 c1 7f 09 48 8b 43 50 48 85 c0 74 1b 83 e2 04 75 99 8b 43 34 39 43 30 75 91 8b 83 00 03 00 00 85 c0 74 87 <0f> 0b 5b c3 48 8b 35 70 f1 37 01 48 8d 7b 48 48 81 c6 e0 93 0 RSP: 0000:ffff9b7680277ed0 EFLAGS: 00010086 RAX: 00000000ffffffff RBX: ffff93465eae9c00 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffff9346418a0000 RDI: ffff934641057140 RBP: ffff934641057170 R08: 0000000000000001 R09: ffff9346418a0080 R10: ffff9b768027fdf0 R11: 0000000000002400 R12: ffff93465eae9c20 R13: ffff93465eae9c20 R14: ffff93465eae9c70 R15: ffff934641057140 FS: 0000000000000000(0000) GS:ffff93465eac0000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000000 CR3: 000000001cc0c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Call Trace: worker_thread+0x89/0x3d0 ? process_one_work+0x400/0x400 kthread+0x162/0x190 ? set_kthread_struct+0x40/0x40 ret_from_fork+0x22/0x30 Also due to this incorrect "nr_running == -1", all sorts of hazards can happen, starting with queued works being ignored because no workers are awaken at insert_work() time. 
Fix this with checking again the worker flags while pool->lock is locked. Fixes: b945efcdd07d ("sched: Remove pointless preemption disable in sched_submit_work()") Reviewed-by: Lai Jiangshan Tested-by: Paul E. McKenney Acked-by: Peter Zijlstra (Intel) Signed-off-by: Frederic Weisbecker Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Paul E. McKenney Cc: Sebastian Andrzej Siewior Cc: Daniel Bristot de Oliveira Signed-off-by: Tejun Heo --- kernel/workqueue.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 5094573e8b45..5557d19ea81c 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -912,6 +912,16 @@ void wq_worker_sleeping(struct task_struct *task) worker->sleeping = 1; raw_spin_lock_irq(&pool->lock); + /* + * Recheck in case unbind_workers() preempted us. We don't + * want to decrement nr_running after the worker is unbound + * and nr_running has been reset. + */ + if (worker->flags & WORKER_NOT_RUNNING) { + raw_spin_unlock_irq(&pool->lock); + return; + } + /* * The counterpart of the following dec_and_test, implied mb, * worklist not empty test sequence is in insert_work(). From 52e68cd60ddf11802f5135921aba77c0833909a8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 27 Nov 2021 15:27:35 +0100 Subject: [PATCH 18/49] vsprintf: Use non-atomic bitmap API when applicable The 'set' bitmap is local to this function. No concurrent access to it is possible. So prefer the non-atomic '__[set|clear]_bit()' function to save a few cycles. 
Signed-off-by: Christophe JAILLET Reviewed-by: Petr Mladek Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/1abf81a5e509d372393bd22041eed4ebc07ef9f7.1638023178.git.christophe.jaillet@wanadoo.fr --- lib/vsprintf.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 58d5e567f836..53d6081f9e8b 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -3564,7 +3564,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) ++fmt; for ( ; *fmt && *fmt != ']'; ++fmt, ++len) - set_bit((u8)*fmt, set); + __set_bit((u8)*fmt, set); /* no ']' or no character set found */ if (!*fmt || !len) @@ -3574,7 +3574,7 @@ int vsscanf(const char *buf, const char *fmt, va_list args) if (negate) { bitmap_complement(set, set, 256); /* exclude null '\0' byte */ - clear_bit(0, set); + __clear_bit(0, set); } /* match must be non-empty */ From ed758b30d541e9bf713cd58612a4414e57dc6d73 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:45 +0100 Subject: [PATCH 19/49] printk/console: Split out code that enables default console Put the code enabling a console by default into a separate function called try_enable_default_console(). Rename try_enable_new_console() to try_enable_preferred_console() to make the purpose of the different variants more clear. It is a code refactoring without any functional change. 
Signed-off-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Link: https://lore.kernel.org/r/20211122132649.12737-2-pmladek@suse.com --- kernel/printk/printk.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 57b132b658e1..1acbe39dd47c 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2861,7 +2861,8 @@ early_param("keep_bootcon", keep_bootcon_setup); * Care need to be taken with consoles that are statically * enabled such as netconsole */ -static int try_enable_new_console(struct console *newcon, bool user_specified) +static int try_enable_preferred_console(struct console *newcon, + bool user_specified) { struct console_cmdline *c; int i, err; @@ -2909,6 +2910,23 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) return -ENOENT; } +/* Try to enable the console unconditionally */ +static void try_enable_default_console(struct console *newcon) +{ + if (newcon->index < 0) + newcon->index = 0; + + if (newcon->setup && newcon->setup(newcon, NULL) != 0) + return; + + newcon->flags |= CON_ENABLED; + + if (newcon->device) { + newcon->flags |= CON_CONSDEV; + has_preferred_console = true; + } +} + /* * The console driver calls this routine during kernel initialization * to register the console printing procedure with printk() and to @@ -2964,25 +2982,15 @@ void register_console(struct console *newcon) * didn't select a console we take the first one * that registers here. 
*/ - if (!has_preferred_console) { - if (newcon->index < 0) - newcon->index = 0; - if (newcon->setup == NULL || - newcon->setup(newcon, NULL) == 0) { - newcon->flags |= CON_ENABLED; - if (newcon->device) { - newcon->flags |= CON_CONSDEV; - has_preferred_console = true; - } - } - } + if (!has_preferred_console) + try_enable_default_console(newcon); /* See if this console matches one we selected on the command line */ - err = try_enable_new_console(newcon, true); + err = try_enable_preferred_console(newcon, true); /* If not, try to match against the platform default(s) */ if (err == -ENOENT) - err = try_enable_new_console(newcon, false); + err = try_enable_preferred_console(newcon, false); /* printk() messages are not printed to the Braille console. */ if (err || newcon->flags & CON_BRL) From a6953370d2fcf8c3878f1588771d20d3d972fcf3 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:46 +0100 Subject: [PATCH 20/49] printk/console: Rename has_preferred_console to need_default_console The logic around the variable @has_preferred_console made my head spin many times. Part of the problem is the ambiguous name. There is the variable @preferred_console. It points to the last non-braille console in @console_cmdline array. This array contains consoles preferred via the command line, device tree, or SPCR. Then there is the variable @has_preferred_console. It is set to "true" when @preferred_console is enabled or when a console with tty binding gets enabled by default. It might get reset back by the magic condition: if (!has_preferred_console || bcon || !console_drivers) has_preferred_console = preferred_console >= 0; It is a puzzle. Dumb explanation is that it gets re-evaluated when: + it was not set before (see above when it gets set) + there is still an early console enabled (bcon) + there is no console enabled (!console_drivers) This is still a puzzle. It gets more clear when we see where the value is checked. 
The only meaning of the variable is to decide whether we should try to enable the new console by default. Rename the variable according to the single situation where the value is checked. The rename requires an inverted logic. Otherwise, it is a simple search & replace. It does not change the functionality. Signed-off-by: Petr Mladek Reviewed-by: Sergey Senozhatsky Link: https://lore.kernel.org/r/20211122132649.12737-3-pmladek@suse.com --- kernel/printk/printk.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 1acbe39dd47c..4c5f496877b0 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,7 +280,7 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; -static bool has_preferred_console; +static bool need_default_console = true; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -2894,7 +2894,7 @@ static int try_enable_preferred_console(struct console *newcon, newcon->flags |= CON_ENABLED; if (i == preferred_console) { newcon->flags |= CON_CONSDEV; - has_preferred_console = true; + need_default_console = false; } return 0; } @@ -2923,7 +2923,7 @@ static void try_enable_default_console(struct console *newcon) if (newcon->device) { newcon->flags |= CON_CONSDEV; - has_preferred_console = true; + need_default_console = false; } } @@ -2974,15 +2974,15 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (!has_preferred_console || bcon || !console_drivers) - has_preferred_console = preferred_console >= 0; + if (need_default_console || bcon || !console_drivers) + need_default_console = preferred_console < 0; /* * See if we want to use this console driver. If we * didn't select a console we take the first one * that registers here. 
*/ - if (!has_preferred_console) + if (need_default_console) try_enable_default_console(newcon); /* See if this console matches one we selected on the command line */ From f873efe841f813303e8a4af0d4cc48ff1f43bbe2 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:47 +0100 Subject: [PATCH 21/49] printk/console: Remove unnecessary need_default_console manipulation There is no need to clear @need_default_console when a console preferred by the command line, device tree, or SPCR, gets enabled. The code is called only when some non-braille console matched a console in @console_cmdline array. It means that a non-braille console was added in __add_preferred_console() and the variable preferred_console is set to a number >= 0. As a result, @need_default_console is always set to "false" in the magic condition: if (need_default_console || bcon || !console_drivers) need_default_console = preferred_console < 0; This is one small step in removing the above magic condition that is hard to follow. The patch removes one superfluous assignment and should not change the functionality. 
Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211122132649.12737-4-pmladek@suse.com --- kernel/printk/printk.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 4c5f496877b0..3f845daa3a4a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2892,10 +2892,8 @@ static int try_enable_preferred_console(struct console *newcon, return err; } newcon->flags |= CON_ENABLED; - if (i == preferred_console) { + if (i == preferred_console) newcon->flags |= CON_CONSDEV; - need_default_console = false; - } return 0; } From 4f546939259f8e1130b60553433892774a42ea68 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:48 +0100 Subject: [PATCH 22/49] printk/console: Remove need_default_console variable The variable @need_default_console is used to decide whether a newly registered console should get enabled by default. The logic is complicated. It can be modified in a register_console() call. But it is always re-evaluated in the next call by the following condition: if (need_default_console || bcon || !console_drivers) need_default_console = preferred_console < 0; In short, the value is updated when either of the condition is valid: + the value is still, or again, "true" + boot/early console is still the first in @console_driver list + @console_driver list is empty The value is updated according to @preferred_console. In particular, it is set to "false" when a @preferred_console was set by __add_preferred_console(). This happens when a non-braille console was added via the command line, device tree, or SPCR. It far from clear what this all means together. Let's look at @need_default_console from another angle: 1. The value is "true" by default. It means that it is always set according to @preferred_console during the first register_console() call. 
In other words, the first register_console() call will register the console by default only when no non-braille console was defined via the command line, device tree, or SPCR. 2. The value will always stay "false" when @preferred_console is set. In other words, try_enable_default_console() will never get called when a non-braille console is explicitly required. 3. The value might be set to "false" in try_enable_default_console() when a console with tty binding (driver) gets enabled. In this case CON_CONSDEV is set as well. As a result, the console will be inserted as the first one into the list @console_drivers. It might be either a real or a boot/early console. 4. The value will be set _back_ to "true" in the next register_console() call when: + The console added by the previous register_console() had been a boot/early one. + The last console has been unregistered in the meantime and a boot/early console became first in @console_drivers list again. Or the list became empty. In other words, the value will stay "false" only when the last registered console was real, had tty binding, and was not removed in the meantime. The main logic looks clear: + Consoles are enabled by default only when none is preferred via the command line, device tree, or SPCR. + By default, any console is enabled until a real console with tty binding gets registered. The behavior when the real console with tty binding is later removed is a bit unclear: + By default, any new console is registered again only when there is no console or the first console in the list is a boot one. The question is why the code is suddenly happy when a real console without tty binding is the first in the list. It looks like an oversight and a bug. Conclusion: The state of @preferred_console and the first console in @console_drivers list should be enough to decide whether we need to enable the given console by default. The rules are simple.
New consoles are _not_ enabled by default when either of the following conditions is true: + @preferred_console is set. It means that a non-braille console is explicitly configured via the command line, device tree, or SPCR. + A real console with tty binding is registered. Such a console will have CON_CONSDEV flag set and will always be the first in @console_drivers list. Note: The new code does not use @bcon variable. The meaning of the variable is far from clear. The direct check of the first console in the list makes it more clear that only a real console fulfills the requirements of the default console. Behavior change: As already discussed above. There was one situation where the original code worked in a strange way. Let's have: + console A: real console without tty binding + console B: real console with tty binding and do: register_console(A); /* 1st step */ register_console(B); /* 2nd step */ unregister_console(B); /* 3rd step */ register_console(B); /* 4th step */ The original code will not register the console B in the 4th step. @need_default_console is set to "false" in the 2nd step. The real console with tty binding (driver) is then removed in the 3rd step. But @need_default_console will stay "false" in the 4th step because there is no boot/early console and @console_drivers list is not empty. The new code will register the console B in the 4th step because it checks whether the first console has tty binding (->device). This behavior change should be acceptable: 1. The scenario requires manual intervention (console removal). The system should boot with the same consoles as before. 2. Console B is registered again probably because the user wants to use it. The most likely scenario is that the related module is reloaded. 3. It makes the behavior more consistent and predictable.
Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211122132649.12737-5-pmladek@suse.com --- kernel/printk/printk.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 3f845daa3a4a..6591da285a83 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -280,7 +280,6 @@ static struct console *exclusive_console; static struct console_cmdline console_cmdline[MAX_CMDLINECONSOLES]; static int preferred_console = -1; -static bool need_default_console = true; int console_set_on_cmdline; EXPORT_SYMBOL(console_set_on_cmdline); @@ -2919,10 +2918,8 @@ static void try_enable_default_console(struct console *newcon) newcon->flags |= CON_ENABLED; - if (newcon->device) { + if (newcon->device) newcon->flags |= CON_CONSDEV; - need_default_console = false; - } } /* @@ -2972,16 +2969,24 @@ void register_console(struct console *newcon) if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; - if (need_default_console || bcon || !console_drivers) - need_default_console = preferred_console < 0; - /* - * See if we want to use this console driver. If we - * didn't select a console we take the first one - * that registers here. + * See if we want to enable this console driver by default. + * + * Nope when a console is preferred by the command line, device + * tree, or SPCR. + * + * The first real console with tty binding (driver) wins. More + * consoles might get enabled before the right one is found. + * + * Note that a console with tty binding will have CON_CONSDEV + * flag set and will be first in the list. 
*/ - if (need_default_console) - try_enable_default_console(newcon); + if (preferred_console < 0) { + if (!console_drivers || !console_drivers->device || + console_drivers->flags & CON_BOOT) { + try_enable_default_console(newcon); + } + } /* See if this console matches one we selected on the command line */ err = try_enable_preferred_console(newcon, true); From 5e8ba485b2522808ab2d65208839e1c915e113dd Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Mon, 22 Nov 2021 14:26:49 +0100 Subject: [PATCH 23/49] printk/console: Clean up boot console handling in register_console() The variable @bcon has two meanings. It is used several times for iterating the list of registered consoles. In the meantime, it holds the information whether a boot console is first in @console_drivers list. The information about the 1st console driver used to be important for the decision whether to install the new console by default or not. It allowed to re-evaluate the variable @need_default_console when a real console with tty binding has been unregistered in the meantime. The decision about the default console is not longer affected by @bcon variable. The current code checks whether the first driver is real and has tty binding directly. The information about the first console is still used for two more decisions: 1. It prevents duplicate output on non-boot consoles with CON_CONSDEV flag set. 2. Early/boot consoles are unregistered when a real console with CON_CONSDEV is registered and @keep_bootcon is not set. The behavior in the real life is far from obvious. @bcon is set according to the first console @console_drivers list. But the first position in the list is special: 1. Consoles with CON_CONSDEV flag are put at the beginning of the list. It is either the preferred console or any console with tty binding registered by default. 2. Another console might become the first in the list when the first console in the list is unregistered. 
It might happen either explicitly or automatically when boot consoles are unregistered. There is one more important rule: + Boot consoles can't be registered when any real console is already registered. It is a puzzle. The main complication is the dependency on the first position in the list and the complicated rules around it. Let's try to make it easier: 1. Add variable @bootcon_enabled and set it by iterating all registered consoles. The variable has obvious meaning and more predictable behavior. Any speed optimization and other tricks are not worth it. 2. Use a generic name for the variable that is used to iterate the list of registered console drivers. Behavior change: No, maybe surprisingly, there is _no_ behavior change! Let's provide the proof by contradiction. Both operations, duplicate output prevention and boot consoles removal, are done only when the newly added console has CON_CONSDEV flag set. The behavior would change when the new @bootcon_enabled has a different value than the original @bcon. In other words, the behavior would change when the following conditions are true: + a console with CON_CONSDEV flag is added + a real (non-boot) console is the first in the list + a boot console is later in the list Now, a real console might be first in the list only when: + It was the first registered console. In this case, there can't be any boot console because any later ones were rejected. + It was put at the first position because it had CON_CONSDEV flag set. It was either the preferred console or it was a console with tty binding registered by default. We are interested only in real consoles here. And a real console with tty binding fulfills conditions of the default console. Now, there is always only one console that is either preferred or fulfills conditions of the default console. It can't be already in the list and being registered at the same time. As a result, the above three conditions could never be "true" at the same time.
Therefore the behavior can't change. Final dilemma: OK, the new code has the same behavior. But is the change in the right direction? What if the handling of @console_drivers is updated in the future? OK, let's look at it from another angle: 1. The ordering of @console_drivers list is important only in console_device() function. The first console driver with tty binding gets associated with /dev/console. 2. CON_CONSDEV flag is shown in /proc/consoles. And it should be set for the driver that is returned by console_device(). 3. A boot console is removed and the duplicated output is prevented when the real console with CON_CONSDEV flag is registered. Now, in the ideal world: + The driver associated with /dev/console should be either a console preferred via the command line, device tree, or SPCR. Or it should be the first real console with tty binding registered by default. + The code should match the related boot and real console drivers. It should unregister only the obsolete boot driver. And the duplicated output should be prevented only on the related real driver. It is clear that it is not guaranteed by the current code. Instead, the current code looks like a maze of heuristics that try to achieve the above. It is the result of adding several features over the last few decades. For example, a possibility to register more consoles, unregister consoles, boot consoles, consoles without tty binding, device tree, SPCR, braille consoles. Anyway, there is no reason why the decision, about removing boot consoles and preventing duplicated output, should depend on the first console in the list. The current code makes the decisions primarily by the CON_CONSDEV flag that is used for the preferred console. It looks like a good compromise. And the change seems to be in the right direction.
Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20211122132649.12737-6-pmladek@suse.com --- kernel/printk/printk.c | 47 +++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 23 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 6591da285a83..155229f0cf0f 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -2943,31 +2943,30 @@ static void try_enable_default_console(struct console *newcon) */ void register_console(struct console *newcon) { - struct console *bcon = NULL; + struct console *con; + bool bootcon_enabled = false; + bool realcon_enabled = false; int err; - for_each_console(bcon) { - if (WARN(bcon == newcon, "console '%s%d' already registered\n", - bcon->name, bcon->index)) + for_each_console(con) { + if (WARN(con == newcon, "console '%s%d' already registered\n", + con->name, con->index)) return; } - /* - * before we register a new CON_BOOT console, make sure we don't - * already have a valid console - */ - if (newcon->flags & CON_BOOT) { - for_each_console(bcon) { - if (!(bcon->flags & CON_BOOT)) { - pr_info("Too late to register bootconsole %s%d\n", - newcon->name, newcon->index); - return; - } - } + for_each_console(con) { + if (con->flags & CON_BOOT) + bootcon_enabled = true; + else + realcon_enabled = true; } - if (console_drivers && console_drivers->flags & CON_BOOT) - bcon = console_drivers; + /* Do not register boot consoles when there already is a real one. */ + if (newcon->flags & CON_BOOT && realcon_enabled) { + pr_info("Too late to register bootconsole %s%d\n", + newcon->name, newcon->index); + return; + } /* * See if we want to enable this console driver by default. 
@@ -3005,8 +3004,10 @@ void register_console(struct console *newcon) * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + if (bootcon_enabled && + ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; + } /* * Put this console in the list - keep the @@ -3062,15 +3063,15 @@ void register_console(struct console *newcon) pr_info("%sconsole [%s%d] enabled\n", (newcon->flags & CON_BOOT) ? "boot" : "" , newcon->name, newcon->index); - if (bcon && + if (bootcon_enabled && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV) && !keep_bootcon) { /* We need to iterate through all boot consoles, to make * sure we print everything out, before we unregister them. */ - for_each_console(bcon) - if (bcon->flags & CON_BOOT) - unregister_console(bcon); + for_each_console(con) + if (con->flags & CON_BOOT) + unregister_console(con); } } EXPORT_SYMBOL(register_console); From 8ee1c0f6526ce942b7595951c7bb0165010051c2 Mon Sep 17 00:00:00 2001 From: Biju Das Date: Wed, 8 Dec 2021 16:40:09 +0000 Subject: [PATCH 24/49] thermal/drivers/rz2gl: Add error check for reset_control_deassert() If reset_control_deassert() fails, then we won't be able to access the device registers. Therefore check the return code of reset_control_deassert() and bail out in case of error. While at it replace the parameter "&pdev->dev" -> "dev" in devm_reset_control_get_exclusive(). 
Suggested-by: Philipp Zabel Signed-off-by: Biju Das Reviewed-by: Philipp Zabel Link: https://lore.kernel.org/r/20211208164010.4130-1-biju.das.jz@bp.renesas.com Signed-off-by: Daniel Lezcano --- drivers/thermal/rzg2l_thermal.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/thermal/rzg2l_thermal.c b/drivers/thermal/rzg2l_thermal.c index d47d4a30cd6c..7a9cdc1f37ca 100644 --- a/drivers/thermal/rzg2l_thermal.c +++ b/drivers/thermal/rzg2l_thermal.c @@ -170,12 +170,14 @@ static int rzg2l_thermal_probe(struct platform_device *pdev) return PTR_ERR(priv->base); priv->dev = dev; - priv->rstc = devm_reset_control_get_exclusive(&pdev->dev, NULL); + priv->rstc = devm_reset_control_get_exclusive(dev, NULL); if (IS_ERR(priv->rstc)) return dev_err_probe(dev, PTR_ERR(priv->rstc), "failed to get cpg reset"); - reset_control_deassert(priv->rstc); + ret = reset_control_deassert(priv->rstc); + if (ret) + return dev_err_probe(dev, ret, "failed to deassert"); pm_runtime_enable(dev); pm_runtime_get_sync(dev); From ccf45156fd167a234baf038c11c1f367c7ccabd4 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:37 +0800 Subject: [PATCH 25/49] workqueue: Remove the outdated comment before wq_worker_sleeping() It isn't called with preempt disabled now. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 613917bbc4e7..2964dbb783fe 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -878,8 +878,7 @@ void wq_worker_running(struct task_struct *task) * @task: task going to sleep * * This function is called from schedule() when a busy worker is - * going to sleep. Preemption needs to be disabled to protect ->sleeping - * assignment. + * going to sleep. 
*/ void wq_worker_sleeping(struct task_struct *task) { From 3e5f39ea33b1189ccaa4ae2a9de2bce07753d2e0 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:38 +0800 Subject: [PATCH 26/49] workqueue: Remove the advanced kicking of the idle workers in rebind_workers() The commit 6d25be5782e4 ("sched/core, workqueues: Distangle worker accounting from rq lock") changed the schedule callbacks for workqueue and removed the local-wake-up functionality. Now the wakingup of workers is done by normal fashion and workers not yet migrated to the specific CPU in concurrency managed pool can also be woken up by workers that already bound to the specific cpu now. So this advanced kicking of the idle workers to migrate them to the associated CPU is unneeded now. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 2964dbb783fe..f7f4a5fc7736 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -5057,17 +5057,6 @@ static void rebind_workers(struct worker_pool *pool) for_each_pool_worker(worker, pool) { unsigned int worker_flags = worker->flags; - /* - * A bound idle worker should actually be on the runqueue - * of the associated CPU for local wake-ups targeting it to - * work. Kick all idle workers so that they migrate to the - * associated CPU. Doing this in the same loop as - * replacing UNBOUND with REBOUND is safe as no worker will - * be bound before @pool->lock is released. - */ - if (worker_flags & WORKER_IDLE) - wake_up_process(worker->task); - /* * We want to clear UNBOUND but can't directly call * worker_clr_flags() or adjust nr_running. 
Atomically From 11b45b0bf402b53c94c86737a440363fc36f03cd Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:39 +0800 Subject: [PATCH 27/49] workqueue: Remove outdated comment about exceptional workers in unbind_workers() Long time before, workers are not ALL bound after CPU_ONLINE, they can still be running in other CPUs before self rebinding. But the commit a9ab775bcadf ("workqueue: directly restore CPU affinity of workers from CPU_ONLINE") makes rebind_workers() bind them all. So all workers are on the CPU before the CPU is down. And the comment in unbind_workers() refers to the workers "which are still executing works from before the last CPU down" is outdated. Just removed it. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index f7f4a5fc7736..ae58c6ace23f 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4978,9 +4978,7 @@ static void unbind_workers(int cpu) /* * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers - * except for the ones which are still executing works from - * before the last CPU down must be on the cpu. After - * this, they may become diasporas. + * must be on the cpu. After this, they may become diasporas. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; From b4ac9384ac057c5bf035fbe82fc162fa2f7b15a9 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:40 +0800 Subject: [PATCH 28/49] workqueue: Remove schedule() in unbind_workers() The commit 6d25be5782e4 ("sched/core, workqueues: Distangle worker accounting from rq lock") changed the schedule callbacks for workqueue and moved the schedule callback from the wakeup code to at end of schedule() in the worker's process context. 
It means that the callback wq_worker_running() is guaranteed that it sees the %WORKER_UNBOUND flag after scheduled since unbind_workers() is running on the same CPU that all the pool's workers bound to. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index ae58c6ace23f..499a264183ef 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4979,6 +4979,9 @@ static void unbind_workers(int cpu) * We've blocked all attach/detach operations. Make all workers * unbound and set DISASSOCIATED. Before this, all workers * must be on the cpu. After this, they may become diasporas. + * And the preemption disabled section in their sched callbacks + * are guaranteed to see WORKER_UNBOUND since the code here + * is on the same cpu. */ for_each_pool_worker(worker, pool) worker->flags |= WORKER_UNBOUND; @@ -4994,14 +4997,6 @@ static void unbind_workers(int cpu) mutex_unlock(&wq_pool_attach_mutex); - /* - * Call schedule() so that we cross rq->lock and thus can - * guarantee sched callbacks see the %WORKER_UNBOUND flag. - * This is necessary as scheduler callbacks may be invoked - * from other cpus. - */ - schedule(); - /* * Sched callbacks are disabled now. Zap nr_running. * After this, nr_running stays zero and need_more_worker() From 989442d73757868118a73b92732b549a73c9ce35 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:41 +0800 Subject: [PATCH 29/49] workqueue: Move the code of waking a worker up in unbind_workers() In unbind_workers(), there are two pool->lock held sections separated by the code of zapping nr_running. wake_up_worker() needs to be in pool->lock held section and after zapping nr_running. And zapping nr_running had to be after schedule() when the local wake up functionality was in use. 
Now, the call to schedule() has been removed along with the local wake up functionality, so the code can be merged into the same pool->lock held section. The diffstat shows that it is other code moved down because the diff tools can not know the meaning of merging lock sections by swapping two code blocks. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 38 +++++++++++++++----------------------- 1 file changed, 15 insertions(+), 23 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 499a264183ef..403387e9a924 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -1810,14 +1810,8 @@ static void worker_enter_idle(struct worker *worker) if (too_many_workers(pool) && !timer_pending(&pool->idle_timer)) mod_timer(&pool->idle_timer, jiffies + IDLE_WORKER_TIMEOUT); - /* - * Sanity check nr_running. Because unbind_workers() releases - * pool->lock between setting %WORKER_UNBOUND and zapping - * nr_running, the warning may trigger spuriously. Check iff - * unbind is not in progress. - */ - WARN_ON_ONCE(!(pool->flags & POOL_DISASSOCIATED) && - pool->nr_workers == pool->nr_idle && + /* Sanity check nr_running. */ + WARN_ON_ONCE(pool->nr_workers == pool->nr_idle && atomic_read(&pool->nr_running)); } @@ -4988,21 +4982,12 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; - raw_spin_unlock_irq(&pool->lock); - - for_each_pool_worker(worker, pool) { - kthread_set_per_cpu(worker->task, -1); - WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); - } - - mutex_unlock(&wq_pool_attach_mutex); - /* - * Sched callbacks are disabled now. Zap nr_running. - * After this, nr_running stays zero and need_more_worker() - * and keep_working() are always true as long as the - * worklist is not empty. This pool now behaves as an - * unbound (in terms of concurrency management) pool which + * The handling of nr_running in sched callbacks are disabled + * now. Zap nr_running. 
After this, nr_running stays zero and + * need_more_worker() and keep_working() are always true as + * long as the worklist is not empty. This pool now behaves as + * an unbound (in terms of concurrency management) pool which * are served by workers tied to the pool. */ atomic_set(&pool->nr_running, 0); @@ -5012,9 +4997,16 @@ static void unbind_workers(int cpu) * worker blocking could lead to lengthy stalls. Kick off * unbound chain execution of currently pending work items. */ - raw_spin_lock_irq(&pool->lock); wake_up_worker(pool); + raw_spin_unlock_irq(&pool->lock); + + for_each_pool_worker(worker, pool) { + kthread_set_per_cpu(worker->task, -1); + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0); + } + + mutex_unlock(&wq_pool_attach_mutex); } } From 84f91c62d675480ffd3d870ee44c07965cbd8b21 Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Tue, 7 Dec 2021 15:35:42 +0800 Subject: [PATCH 30/49] workqueue: Remove the cacheline_aligned for nr_running nr_running is never modified remotely after the schedule callback in wakeup path is removed. Rather nr_running is often accessed with other fields in the pool together, so the cacheline_aligned for nr_running isn't needed. Signed-off-by: Lai Jiangshan Signed-off-by: Tejun Heo --- kernel/workqueue.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/kernel/workqueue.c b/kernel/workqueue.c index 403387e9a924..b583141c5481 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -154,6 +154,9 @@ struct worker_pool { unsigned long watchdog_ts; /* L: watchdog timestamp */ + /* The current concurrency level. */ + atomic_t nr_running; + struct list_head worklist; /* L: list of pending works */ int nr_workers; /* L: total number of workers */ @@ -177,19 +180,12 @@ struct worker_pool { struct hlist_node hash_node; /* PL: unbound_pool_hash node */ int refcnt; /* PL: refcnt for unbound pools */ - /* - * The current concurrency level. 
As it's likely to be accessed - * from other CPUs during try_to_wake_up(), put it in a separate - * cacheline. - */ - atomic_t nr_running ____cacheline_aligned_in_smp; - /* * Destruction of pool is RCU protected to allow dereferences * from get_work_pool(). */ struct rcu_head rcu; -} ____cacheline_aligned_in_smp; +}; /* * The per-pool workqueue. While queued, the lower WORK_STRUCT_FLAG_BITS From 1f1562fcd04a485734e94390660e741c3be47867 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Sun, 5 Dec 2021 13:32:14 -0500 Subject: [PATCH 31/49] cgroup/cpuset: Don't let child cpusets restrict parent in default hierarchy In validate_change(), there is a check since v2.6.12 to make sure that each of the child cpusets must be a subset of a parent cpuset. IOW, it allows child cpusets to restrict what changes can be made to a parent's "cpuset.cpus". This actually violates one of the core principles of the default hierarchy where a cgroup higher up in the hierarchy should be able to change configuration however it sees fit as deligation breaks down otherwise. To address this issue, the check is now removed for the default hierarchy to free parent cpusets from being restricted by child cpusets. The check will still apply for legacy hierarchy. 
Suggested-by: Tejun Heo Signed-off-by: Waiman Long Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index d0e163a02099..0dd7d853ed17 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -616,19 +616,11 @@ static int validate_change(struct cpuset *cur, struct cpuset *trial) struct cpuset *c, *par; int ret; - rcu_read_lock(); - - /* Each of our child cpusets must be a subset of us */ - ret = -EBUSY; - cpuset_for_each_child(c, css, cur) - if (!is_cpuset_subset(c, trial)) - goto out; - - /* Remaining checks don't apply to root cpuset */ - ret = 0; + /* The checks don't apply to root cpuset */ if (cur == &top_cpuset) - goto out; + return 0; + rcu_read_lock(); par = parent_cs(cur); /* On legacy hierarchy, we must be a subset of our parent cpuset. */ From 1815775e74541d7b498c0baf15726ad3d1247abf Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Tue, 14 Dec 2021 00:46:07 +0000 Subject: [PATCH 32/49] cgroup: return early if it is already on preloaded list If a cset is already on preloaded list, this means we have already setup this cset properly for migration. This patch just relocates the root cgrp lookup which isn't used anyway when the cset is already on the preloaded list. 
[tj@kernel.org: rephrase the commit log] Signed-off-by: Wei Yang Signed-off-by: Tejun Heo --- kernel/cgroup/cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c index f522cee8e650..4f77bf1eaf9f 100644 --- a/kernel/cgroup/cgroup.c +++ b/kernel/cgroup/cgroup.c @@ -2650,11 +2650,11 @@ void cgroup_migrate_add_src(struct css_set *src_cset, if (src_cset->dead) return; - src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); - if (!list_empty(&src_cset->mg_preload_node)) return; + src_cgrp = cset_cgroup_from_root(src_cset, dst_cgrp->root); + WARN_ON(src_cset->mg_src_cgrp); WARN_ON(src_cset->mg_dst_cgrp); WARN_ON(!list_empty(&src_cset->mg_tasks)); From deaee2704a157dfcca77301ddaa10c62a9840952 Mon Sep 17 00:00:00 2001 From: John Ogness Date: Wed, 15 Dec 2021 16:16:22 +0106 Subject: [PATCH 33/49] scripts/gdb: lx-dmesg: read records individually For the gdb command lx-dmesg, the entire descriptor, info, and text data regions are read into memory before printing any records. For large kernel log buffers, this not only causes a huge delay before seeing any records, but it may also lead to python errors of too much memory allocation. Rather than reading in all these regions in advance, read them as needed and only read the regions for the particular record that is being printed. The gdb macro "dmesg" in Documentation/admin-guide/kdump/gdbmacros.txt already prints out the kernel log buffer like this. 
Signed-off-by: John Ogness Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/874k79c3a9.fsf@jogness.linutronix.de --- scripts/gdb/linux/dmesg.py | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/scripts/gdb/linux/dmesg.py b/scripts/gdb/linux/dmesg.py index a92c55bd8de5..d5983cf3db7d 100644 --- a/scripts/gdb/linux/dmesg.py +++ b/scripts/gdb/linux/dmesg.py @@ -44,19 +44,17 @@ class LxDmesg(gdb.Command): sz = prb_desc_ring_type.get_type().sizeof desc_ring = utils.read_memoryview(inf, addr, sz).tobytes() - # read in descriptor array + # read in descriptor count, size, and address off = prb_desc_ring_type.get_type()['count_bits'].bitpos // 8 desc_ring_count = 1 << utils.read_u32(desc_ring, off) desc_sz = prb_desc_type.get_type().sizeof off = prb_desc_ring_type.get_type()['descs'].bitpos // 8 - addr = utils.read_ulong(desc_ring, off) - descs = utils.read_memoryview(inf, addr, desc_sz * desc_ring_count).tobytes() + desc_addr = utils.read_ulong(desc_ring, off) - # read in info array + # read in info size and address info_sz = printk_info_type.get_type().sizeof off = prb_desc_ring_type.get_type()['infos'].bitpos // 8 - addr = utils.read_ulong(desc_ring, off) - infos = utils.read_memoryview(inf, addr, info_sz * desc_ring_count).tobytes() + info_addr = utils.read_ulong(desc_ring, off) # read in text data ring structure off = printk_ringbuffer_type.get_type()['text_data_ring'].bitpos // 8 @@ -64,12 +62,11 @@ class LxDmesg(gdb.Command): sz = prb_data_ring_type.get_type().sizeof text_data_ring = utils.read_memoryview(inf, addr, sz).tobytes() - # read in text data + # read in text data size and address off = prb_data_ring_type.get_type()['size_bits'].bitpos // 8 text_data_sz = 1 << utils.read_u32(text_data_ring, off) off = prb_data_ring_type.get_type()['data'].bitpos // 8 - addr = utils.read_ulong(text_data_ring, off) - text_data = utils.read_memoryview(inf, addr, text_data_sz).tobytes() + text_data_addr = 
utils.read_ulong(text_data_ring, off) counter_off = atomic_long_type.get_type()['counter'].bitpos // 8 @@ -102,17 +99,20 @@ class LxDmesg(gdb.Command): desc_off = desc_sz * ind info_off = info_sz * ind + desc = utils.read_memoryview(inf, desc_addr + desc_off, desc_sz).tobytes() + # skip non-committed record - state = 3 & (utils.read_u64(descs, desc_off + sv_off + - counter_off) >> desc_flags_shift) + state = 3 & (utils.read_u64(desc, sv_off + counter_off) >> desc_flags_shift) if state != desc_committed and state != desc_finalized: if did == head_id: break did = (did + 1) & desc_id_mask continue - begin = utils.read_ulong(descs, desc_off + begin_off) % text_data_sz - end = utils.read_ulong(descs, desc_off + next_off) % text_data_sz + begin = utils.read_ulong(desc, begin_off) % text_data_sz + end = utils.read_ulong(desc, next_off) % text_data_sz + + info = utils.read_memoryview(inf, info_addr + info_off, info_sz).tobytes() # handle data-less record if begin & 1 == 1: @@ -125,16 +125,17 @@ class LxDmesg(gdb.Command): # skip over descriptor id text_start = begin + utils.get_long_type().sizeof - text_len = utils.read_u16(infos, info_off + len_off) + text_len = utils.read_u16(info, len_off) # handle truncated message if end - text_start < text_len: text_len = end - text_start - text = text_data[text_start:text_start + text_len].decode( - encoding='utf8', errors='replace') + text_data = utils.read_memoryview(inf, text_data_addr + text_start, + text_len).tobytes() + text = text_data[0:text_len].decode(encoding='utf8', errors='replace') - time_stamp = utils.read_u64(infos, info_off + ts_off) + time_stamp = utils.read_u64(info, ts_off) for line in text.splitlines(): msg = u"[{time:12.6f}] {line}\n".format( From 4a7f4110f79163fd53ea65438041994ed615e3af Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 14:59:29 +0200 Subject: [PATCH 34/49] device property: Fix fwnode_graph_devcon_match() fwnode leak For each endpoint it encounters, fwnode_graph_devcon_match() 
checks whether the endpoint's remote port parent device is available. If it is not, it ignores the endpoint but does not put the reference to the remote endpoint port parent fwnode. For available devices the fwnode handle reference is put as expected. Put the reference for unavailable devices now. Fixes: 637e9e52b185 ("device connection: Find device connections also from device graphs") Cc: 5.1+ # 5.1+ Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- drivers/base/property.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index d0960a9e8974..b7b3a7b86006 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1158,8 +1158,10 @@ fwnode_graph_devcon_match(struct fwnode_handle *fwnode, const char *con_id, fwnode_graph_for_each_endpoint(fwnode, ep) { node = fwnode_graph_get_remote_port_parent(ep); - if (!fwnode_device_is_available(node)) + if (!fwnode_device_is_available(node)) { + fwnode_handle_put(node); continue; + } ret = match(node, con_id, data); fwnode_handle_put(node); From 49f39cb0ef198ae3c73765c9b9ee3034e4c9f076 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 14:59:30 +0200 Subject: [PATCH 35/49] device property: Fix documentation for FWNODE_GRAPH_DEVICE_DISABLED FWNODE_GRAPH_DEVICE_DISABLED flag was meant for also returning endpoints connected to disabled devices, but it also may return endpoints that are not connected. Fix this in documentation. Also fwnode_graph_get_endpoint_by_id() was affeced by this. Also improve the language a little bit. Fixes: 0fcc2bdc8aff ("device property: Add fwnode_graph_get_endpoint_by_id()") Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- drivers/base/property.c | 4 ++-- include/linux/property.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index b7b3a7b86006..e2860abf9889 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1063,8 +1063,8 @@ EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_node); * has not been found, look for the closest endpoint ID greater than the * specified one and return the endpoint that corresponds to it, if present. * - * Do not return endpoints that belong to disabled devices, unless - * FWNODE_GRAPH_DEVICE_DISABLED is passed in @flags. + * Does not return endpoints that belong to disabled devices or endpoints that + * are unconnected, unless FWNODE_GRAPH_DEVICE_DISABLED is passed in @flags. * * The returned endpoint needs to be released by calling fwnode_handle_put() on * it when it is not needed any more. diff --git a/include/linux/property.h b/include/linux/property.h index 16f736c698a2..7a2df45ec3ae 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -414,7 +414,8 @@ static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode) * one. * @FWNODE_GRAPH_DEVICE_DISABLED: That the device to which the remote * endpoint of the given endpoint belongs to, - * may be disabled. + * may be disabled, or that the endpoint is not + * connected. */ #define FWNODE_GRAPH_ENDPOINT_NEXT BIT(0) #define FWNODE_GRAPH_DEVICE_DISABLED BIT(1) From a11174952205d082f1658fab4314f0caf706e0a8 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 14:59:31 +0200 Subject: [PATCH 36/49] Documentation: ACPI: Fix data node reference documentation The data node reference documentation was missing a package that must contain the property values, instead property name and multiple values being present in a single package. This is not aligned with the _DSD spec. Fix it by adding the package for the values. Also add the missing "reg" properties to two numbered nodes. 
Fixes: b10134a3643d ("ACPI: property: Document hierarchical data extension references") Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- .../firmware-guide/acpi/dsd/data-node-references.rst | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/Documentation/firmware-guide/acpi/dsd/data-node-references.rst b/Documentation/firmware-guide/acpi/dsd/data-node-references.rst index b7ad47df49de..8b65b32e6e40 100644 --- a/Documentation/firmware-guide/acpi/dsd/data-node-references.rst +++ b/Documentation/firmware-guide/acpi/dsd/data-node-references.rst @@ -5,7 +5,7 @@ Referencing hierarchical data nodes =================================== -:Copyright: |copy| 2018 Intel Corporation +:Copyright: |copy| 2018, 2021 Intel Corporation :Author: Sakari Ailus ACPI in general allows referring to device objects in the tree only. @@ -52,12 +52,14 @@ the ANOD object which is also the final target node of the reference. Name (NOD0, Package() { ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), Package () { + Package () { "reg", 0 }, Package () { "random-property", 3 }, } }) Name (NOD1, Package() { ToUUID("dbb8e3e6-5886-4ba6-8795-1319f52a966b"), Package () { + Package () { "reg", 1 }, Package () { "anothernode", "ANOD" }, } }) @@ -74,7 +76,11 @@ the ANOD object which is also the final target node of the reference. Name (_DSD, Package () { ToUUID("daffd814-6eba-4d8c-8a91-bc9bbf4aa301"), Package () { - Package () { "reference", ^DEV0, "node@1", "anothernode" }, + Package () { + "reference", Package () { + ^DEV0, "node@1", "anothernode" + } + }, } }) } From 59f3f98284baa29da60410d04e498467c044aac2 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 14:59:32 +0200 Subject: [PATCH 37/49] Documentation: ACPI: Update references Update references for the ACPI _DSD documentation. 
In particular: - Substitute _DSD property and hierarchical data extension documents with the newer DSD guide that replaces both, and use its HTML form. - Refer to the latest ACPI spec. - Add data node reference documentation reference to graph documentation. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. Wysocki --- .../acpi/dsd/data-node-references.rst | 18 ++++----- .../firmware-guide/acpi/dsd/graph.rst | 40 +++++++++---------- .../firmware-guide/acpi/dsd/leds.rst | 40 +++++++++---------- Documentation/firmware-guide/acpi/dsd/phy.rst | 28 +++++++------ 4 files changed, 58 insertions(+), 68 deletions(-) diff --git a/Documentation/firmware-guide/acpi/dsd/data-node-references.rst b/Documentation/firmware-guide/acpi/dsd/data-node-references.rst index 8b65b32e6e40..8d8b53e96bcf 100644 --- a/Documentation/firmware-guide/acpi/dsd/data-node-references.rst +++ b/Documentation/firmware-guide/acpi/dsd/data-node-references.rst @@ -13,9 +13,9 @@ Hierarchical data extension nodes may not be referred to directly, hence this document defines a scheme to implement such references. A reference consist of the device object name followed by one or more -hierarchical data extension [1] keys. Specifically, the hierarchical data -extension node which is referred to by the key shall lie directly under the -parent object i.e. either the device object or another hierarchical data +hierarchical data extension [dsd-guide] keys. Specifically, the hierarchical +data extension node which is referred to by the key shall lie directly under +the parent object i.e. either the device object or another hierarchical data extension node. The keys in the hierarchical data nodes shall consist of the name of the node, @@ -33,7 +33,7 @@ extension key. 
Example ======= -In the ASL snippet below, the "reference" _DSD property [2] contains a +In the ASL snippet below, the "reference" _DSD property contains a device object reference to DEV0 and under that device object, a hierarchical data extension key "node@1" referring to the NOD1 object and lastly, a hierarchical data extension key "anothernode" referring to @@ -91,10 +91,6 @@ Documentation/firmware-guide/acpi/dsd/graph.rst. References ========== -[1] Hierarchical Data Extension UUID For _DSD. -, -referenced 2018-07-17. - -[2] Device Properties UUID For _DSD. -, -referenced 2016-10-04. +[dsd-guide] DSD Guide. + https://github.com/UEFI/DSD-Guide/blob/main/dsd-guide.adoc, referenced + 2021-11-30. diff --git a/Documentation/firmware-guide/acpi/dsd/graph.rst b/Documentation/firmware-guide/acpi/dsd/graph.rst index 0ced07cb1be3..b9dbfc73ed25 100644 --- a/Documentation/firmware-guide/acpi/dsd/graph.rst +++ b/Documentation/firmware-guide/acpi/dsd/graph.rst @@ -7,11 +7,11 @@ Graphs _DSD ==== -_DSD (Device Specific Data) [7] is a predefined ACPI device +_DSD (Device Specific Data) [dsd-guide] is a predefined ACPI device configuration object that can be used to convey information on hardware features which are not specifically covered by the ACPI -specification [1][6]. There are two _DSD extensions that are relevant -for graphs: property [4] and hierarchical data extensions [5]. The +specification [acpi]. There are two _DSD extensions that are relevant +for graphs: property [dsd-guide] and hierarchical data extensions. The property extension provides generic key-value pairs whereas the hierarchical data extension supports nodes with references to other nodes, forming a tree. The nodes in the tree may contain properties as @@ -36,8 +36,9 @@ Ports and endpoints =================== The port and endpoint concepts are very similar to those in Devicetree -[3]. A port represents an interface in a device, and an endpoint -represents a connection to that interface. 
+[devicetree, graph-bindings]. A port represents an interface in a device, and +an endpoint represents a connection to that interface. Also see [data-node-ref] +for generic data node references. All port nodes are located under the device's "_DSD" node in the hierarchical data extension tree. The data extension related to each port node must begin @@ -153,25 +154,20 @@ the "ISP" device and vice versa. References ========== -[1] _DSD (Device Specific Data) Implementation Guide. - https://www.uefi.org/sites/default/files/resources/_DSD-implementation-guide-toplevel-1_1.htm, - referenced 2016-10-03. +[acpi] Advanced Configuration and Power Interface Specification. + https://uefi.org/specifications/ACPI/6.4/, referenced 2021-11-30. -[2] Devicetree. https://www.devicetree.org, referenced 2016-10-03. +[data-node-ref] Documentation/firmware-guide/acpi/dsd/data-node-references.rst -[3] Documentation/devicetree/bindings/graph.txt +[devicetree] Devicetree. https://www.devicetree.org, referenced 2016-10-03. -[4] Device Properties UUID For _DSD. - https://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf, - referenced 2016-10-04. +[dsd-guide] DSD Guide. + https://github.com/UEFI/DSD-Guide/blob/main/dsd-guide.adoc, referenced + 2021-11-30. -[5] Hierarchical Data Extension UUID For _DSD. - https://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf, - referenced 2016-10-04. - -[6] Advanced Configuration and Power Interface Specification. - https://www.uefi.org/sites/default/files/resources/ACPI_6_1.pdf, - referenced 2016-10-04. - -[7] _DSD Device Properties Usage Rules. +[dsd-rules] _DSD Device Properties Usage Rules. Documentation/firmware-guide/acpi/DSD-properties-rules.rst + +[graph-bindings] Common bindings for device graphs (Devicetree). + https://github.com/devicetree-org/dt-schema/blob/main/schemas/graph.yaml, + referenced 2021-11-30. 
diff --git a/Documentation/firmware-guide/acpi/dsd/leds.rst b/Documentation/firmware-guide/acpi/dsd/leds.rst index b99fff8e06f2..93db592c93c7 100644 --- a/Documentation/firmware-guide/acpi/dsd/leds.rst +++ b/Documentation/firmware-guide/acpi/dsd/leds.rst @@ -5,19 +5,20 @@ Describing and referring to LEDs in ACPI ======================================== -Individual LEDs are described by hierarchical data extension [6] nodes under the +Individual LEDs are described by hierarchical data extension [5] nodes under the device node, the LED driver chip. The "reg" property in the LED specific nodes tells the numerical ID of each individual LED output to which the LEDs are -connected. [3] The hierarchical data nodes are named "led@X", where X is the +connected. [leds] The hierarchical data nodes are named "led@X", where X is the number of the LED output. -Referring to LEDs in Device tree is documented in [4], in "flash-leds" property -documentation. In short, LEDs are directly referred to by using phandles. +Referring to LEDs in Device tree is documented in [video-interfaces], in +"flash-leds" property documentation. In short, LEDs are directly referred to by +using phandles. -While Device tree allows referring to any node in the tree[1], in ACPI -references are limited to device nodes only [2]. For this reason using the same -mechanism on ACPI is not possible. A mechanism to refer to non-device ACPI nodes -is documented in [7]. +While Device tree allows referring to any node in the tree [devicetree], in +ACPI references are limited to device nodes only [acpi]. For this reason using +the same mechanism on ACPI is not possible. A mechanism to refer to non-device +ACPI nodes is documented in [data-node-ref]. ACPI allows (as does DT) using integer arguments after the reference. A combination of the LED driver device reference and an integer argument, @@ -90,22 +91,17 @@ where References ========== -[1] Device tree. https://www.devicetree.org, referenced 2019-02-21. 
+[acpi] Advanced Configuration and Power Interface Specification. + https://uefi.org/specifications/ACPI/6.4/, referenced 2021-11-30. -[2] Advanced Configuration and Power Interface Specification. - https://uefi.org/sites/default/files/resources/ACPI_6_3_final_Jan30.pdf, - referenced 2019-02-21. +[data-node-ref] Documentation/firmware-guide/acpi/dsd/data-node-references.rst -[3] Documentation/devicetree/bindings/leds/common.txt +[devicetree] Devicetree. https://www.devicetree.org, referenced 2019-02-21. -[4] Documentation/devicetree/bindings/media/video-interfaces.txt +[dsd-guide] DSD Guide. + https://github.com/UEFI/DSD-Guide/blob/main/dsd-guide.adoc, referenced + 2021-11-30. -[5] Device Properties UUID For _DSD. - https://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf, - referenced 2019-02-21. +[leds] Documentation/devicetree/bindings/leds/common.yaml -[6] Hierarchical Data Extension UUID For _DSD. - https://www.uefi.org/sites/default/files/resources/_DSD-hierarchical-data-extension-UUID-v1.1.pdf, - referenced 2019-02-21. - -[7] Documentation/firmware-guide/acpi/dsd/data-node-references.rst +[video-interfaces] Documentation/devicetree/bindings/media/video-interfaces.yaml diff --git a/Documentation/firmware-guide/acpi/dsd/phy.rst b/Documentation/firmware-guide/acpi/dsd/phy.rst index 680ad179e5f9..673ac374f92a 100644 --- a/Documentation/firmware-guide/acpi/dsd/phy.rst +++ b/Documentation/firmware-guide/acpi/dsd/phy.rst @@ -4,17 +4,17 @@ MDIO bus and PHYs in ACPI ========================= -The PHYs on an MDIO bus [1] are probed and registered using +The PHYs on an MDIO bus [phy] are probed and registered using fwnode_mdiobus_register_phy(). Later, for connecting these PHYs to their respective MACs, the PHYs registered on the MDIO bus have to be referenced. This document introduces two _DSD properties that are to be used -for connecting PHYs on the MDIO bus [3] to the MAC layer. 
+for connecting PHYs on the MDIO bus [dsd-properties-rules] to the MAC layer. These properties are defined in accordance with the "Device -Properties UUID For _DSD" [2] document and the +Properties UUID For _DSD" [dsd-guide] document and the daffd814-6eba-4d8c-8a91-bc9bbf4aa301 UUID must be used in the Device Data Descriptors containing them. @@ -48,22 +48,22 @@ as device object references (e.g. \_SB.MDI0.PHY1). phy-mode -------- The "phy-mode" _DSD property is used to describe the connection to -the PHY. The valid values for "phy-mode" are defined in [4]. +the PHY. The valid values for "phy-mode" are defined in [ethernet-controller]. managed ------- Optional property, which specifies the PHY management type. -The valid values for "managed" are defined in [4]. +The valid values for "managed" are defined in [ethernet-controller]. fixed-link ---------- The "fixed-link" is described by a data-only subnode of the MAC port, which is linked in the _DSD package via hierarchical data extension (UUID dbb8e3e6-5886-4ba6-8795-1319f52a966b -in accordance with [5] "_DSD Implementation Guide" document). +in accordance with [dsd-guide] "_DSD Implementation Guide" document). The subnode should comprise a required property ("speed") and possibly the optional ones - complete list of parameters and -their values are specified in [4]. +their values are specified in [ethernet-controller]. The following ASL example illustrates the usage of these properties. @@ -188,12 +188,14 @@ MAC node example with a "fixed-link" subnode. 
References ========== -[1] Documentation/networking/phy.rst +[phy] Documentation/networking/phy.rst -[2] https://www.uefi.org/sites/default/files/resources/_DSD-device-properties-UUID.pdf +[dsd-properties-rules] + Documentation/firmware-guide/acpi/DSD-properties-rules.rst -[3] Documentation/firmware-guide/acpi/DSD-properties-rules.rst +[ethernet-controller] + Documentation/devicetree/bindings/net/ethernet-controller.yaml -[4] Documentation/devicetree/bindings/net/ethernet-controller.yaml - -[5] https://github.com/UEFI/DSD-Guide/blob/main/dsd-guide.pdf +[dsd-guide] DSD Guide. + https://github.com/UEFI/DSD-Guide/blob/main/dsd-guide.adoc, referenced + 2021-11-30. From c87b8fc569667610b4891cad1e4a663e5a94d8f8 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 14:59:33 +0200 Subject: [PATCH 38/49] device property: Implement fwnode_graph_get_endpoint_count() Add fwnode_graph_get_endpoint_count() function to provide generic implementation of of_graph_get_endpoint_count(). The former by default only counts endpoints to available devices which is consistent with the rest of the fwnode graph API. By providing FWNODE_GRAPH_DEVICE_DISABLED flag, also unconnected endpoints and endpoints to disabled devices are counted. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- drivers/base/property.c | 49 ++++++++++++++++++++++++++++++++-------- include/linux/property.h | 2 ++ 2 files changed, 41 insertions(+), 10 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index e2860abf9889..39435ba64dac 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1049,6 +1049,18 @@ fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port_id, } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_node); +static bool fwnode_graph_remote_available(struct fwnode_handle *ep) +{ + struct fwnode_handle *dev_node; + bool available; + + dev_node = fwnode_graph_get_remote_port_parent(ep); + available = fwnode_device_is_available(dev_node); + fwnode_handle_put(dev_node); + + return available; +} + /** * fwnode_graph_get_endpoint_by_id - get endpoint by port and endpoint numbers * @fwnode: parent fwnode_handle containing the graph @@ -1082,16 +1094,8 @@ fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, struct fwnode_endpoint fwnode_ep = { 0 }; int ret; - if (enabled_only) { - struct fwnode_handle *dev_node; - bool available; - - dev_node = fwnode_graph_get_remote_port_parent(ep); - available = fwnode_device_is_available(dev_node); - fwnode_handle_put(dev_node); - if (!available) - continue; - } + if (enabled_only && !fwnode_graph_remote_available(ep)) + continue; ret = fwnode_graph_parse_endpoint(ep, &fwnode_ep); if (ret < 0) @@ -1124,6 +1128,31 @@ fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, } EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_by_id); +/** + * fwnode_graph_get_endpoint_count - Count endpoints on a device node + * @fwnode: The node related to a device + * @flags: fwnode lookup flags + * Count endpoints in a device node. + * + * If FWNODE_GRAPH_DEVICE_DISABLED flag is specified, also unconnected endpoints + * and endpoints connected to disabled devices are counted. 
+ */ +unsigned int fwnode_graph_get_endpoint_count(struct fwnode_handle *fwnode, + unsigned long flags) +{ + struct fwnode_handle *ep; + unsigned int count = 0; + + fwnode_graph_for_each_endpoint(fwnode, ep) { + if (flags & FWNODE_GRAPH_DEVICE_DISABLED || + fwnode_graph_remote_available(ep)) + count++; + } + + return count; +} +EXPORT_SYMBOL_GPL(fwnode_graph_get_endpoint_count); + /** * fwnode_graph_parse_endpoint - parse common endpoint node properties * @fwnode: pointer to endpoint fwnode_handle diff --git a/include/linux/property.h b/include/linux/property.h index 7a2df45ec3ae..8c0104871252 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -423,6 +423,8 @@ static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode) struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags); +unsigned int fwnode_graph_get_endpoint_count(struct fwnode_handle *fwnode, + unsigned long flags); #define fwnode_graph_for_each_endpoint(fwnode, child) \ for (child = NULL; \ From 0d82017b70517b9b209ce7181d798a050ce23234 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 14:59:34 +0200 Subject: [PATCH 39/49] device property: Use fwnode_graph_for_each_endpoint() macro Now that we have fwnode_graph_for_each_endpoint() macro, use it instead of calling fwnode_graph_get_next_endpoint() directly. It manages the iterator variable for the user without manual intervention. Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- drivers/base/property.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index 39435ba64dac..3ff44dd92e52 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1024,9 +1024,9 @@ struct fwnode_handle * fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port_id, u32 endpoint_id) { - struct fwnode_handle *endpoint = NULL; + struct fwnode_handle *endpoint; - while ((endpoint = fwnode_graph_get_next_endpoint(fwnode, endpoint))) { + fwnode_graph_for_each_endpoint(fwnode, endpoint) { struct fwnode_endpoint fwnode_ep; struct fwnode_handle *remote; int ret; @@ -1085,12 +1085,12 @@ struct fwnode_handle * fwnode_graph_get_endpoint_by_id(const struct fwnode_handle *fwnode, u32 port, u32 endpoint, unsigned long flags) { - struct fwnode_handle *ep = NULL, *best_ep = NULL; + struct fwnode_handle *ep, *best_ep = NULL; unsigned int best_ep_id = 0; bool endpoint_next = flags & FWNODE_GRAPH_ENDPOINT_NEXT; bool enabled_only = !(flags & FWNODE_GRAPH_DEVICE_DISABLED); - while ((ep = fwnode_graph_get_next_endpoint(fwnode, ep))) { + fwnode_graph_for_each_endpoint(fwnode, ep) { struct fwnode_endpoint fwnode_ep = { 0 }; int ret; From c49eea6ffec626c059ace085fce1bf501b05dbc7 Mon Sep 17 00:00:00 2001 From: Sakari Ailus Date: Wed, 1 Dec 2021 15:01:15 +0200 Subject: [PATCH 40/49] device property: Drop fwnode_graph_get_remote_node() fwnode_graph_get_remote_node() is only used by the tegra-video driver. Convert it to use newer fwnode_graph_get_endpoint_by_id() and drop now-unused fwnode_graph_get_remote_node(). Signed-off-by: Sakari Ailus Reviewed-by: Andy Shevchenko Signed-off-by: Rafael J. 
Wysocki --- drivers/base/property.c | 38 -------------------------- drivers/staging/media/tegra-video/vi.c | 12 +++++--- include/linux/property.h | 3 -- 3 files changed, 8 insertions(+), 45 deletions(-) diff --git a/drivers/base/property.c b/drivers/base/property.c index 3ff44dd92e52..5379eae478b1 100644 --- a/drivers/base/property.c +++ b/drivers/base/property.c @@ -1011,44 +1011,6 @@ fwnode_graph_get_remote_endpoint(const struct fwnode_handle *fwnode) } EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_endpoint); -/** - * fwnode_graph_get_remote_node - get remote parent node for given port/endpoint - * @fwnode: pointer to parent fwnode_handle containing graph port/endpoint - * @port_id: identifier of the parent port node - * @endpoint_id: identifier of the endpoint node - * - * Return: Remote fwnode handle associated with remote endpoint node linked - * to @node. Use fwnode_node_put() on it when done. - */ -struct fwnode_handle * -fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port_id, - u32 endpoint_id) -{ - struct fwnode_handle *endpoint; - - fwnode_graph_for_each_endpoint(fwnode, endpoint) { - struct fwnode_endpoint fwnode_ep; - struct fwnode_handle *remote; - int ret; - - ret = fwnode_graph_parse_endpoint(endpoint, &fwnode_ep); - if (ret < 0) - continue; - - if (fwnode_ep.port != port_id || fwnode_ep.id != endpoint_id) - continue; - - remote = fwnode_graph_get_remote_port_parent(endpoint); - if (!remote) - return NULL; - - return fwnode_device_is_available(remote) ? 
remote : NULL; - } - - return NULL; -} -EXPORT_SYMBOL_GPL(fwnode_graph_get_remote_node); - static bool fwnode_graph_remote_available(struct fwnode_handle *ep) { struct fwnode_handle *dev_node; diff --git a/drivers/staging/media/tegra-video/vi.c b/drivers/staging/media/tegra-video/vi.c index 69d9787d5338..d1f43f465c22 100644 --- a/drivers/staging/media/tegra-video/vi.c +++ b/drivers/staging/media/tegra-video/vi.c @@ -1845,7 +1845,6 @@ static int tegra_vi_graph_init(struct tegra_vi *vi) struct tegra_vi_channel *chan; struct fwnode_handle *fwnode = dev_fwnode(vi->dev); int ret; - struct fwnode_handle *remote = NULL; /* * Walk the links to parse the full graph. Each channel will have @@ -1857,11 +1856,16 @@ static int tegra_vi_graph_init(struct tegra_vi *vi) * next channels. */ list_for_each_entry(chan, &vi->vi_chans, list) { - remote = fwnode_graph_get_remote_node(fwnode, chan->portnos[0], - 0); - if (!remote) + struct fwnode_handle *ep, *remote; + + ep = fwnode_graph_get_endpoint_by_id(fwnode, + chan->portnos[0], 0, 0); + if (!ep) continue; + remote = fwnode_graph_get_remote_port_parent(ep); + fwnode_handle_put(ep); + ret = tegra_vi_graph_parse_one(chan, remote); fwnode_handle_put(remote); if (ret < 0 || list_empty(&chan->notifier.asd_list)) diff --git a/include/linux/property.h b/include/linux/property.h index 8c0104871252..8355f99ebd47 100644 --- a/include/linux/property.h +++ b/include/linux/property.h @@ -397,9 +397,6 @@ struct fwnode_handle *fwnode_graph_get_remote_port( const struct fwnode_handle *fwnode); struct fwnode_handle *fwnode_graph_get_remote_endpoint( const struct fwnode_handle *fwnode); -struct fwnode_handle * -fwnode_graph_get_remote_node(const struct fwnode_handle *fwnode, u32 port, - u32 endpoint); static inline bool fwnode_graph_is_endpoint(struct fwnode_handle *fwnode) { From c5fc5ba8b6b7bebc05e45036a33405b4c5036c2f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Cl=C3=A9ment=20L=C3=A9ger?= Date: Mon, 20 Dec 2021 22:05:33 +0100 Subject: [PATCH 41/49] 
software node: fix wrong node passed to find nargs_prop MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit nargs_prop refers to a property located in the reference that is found within the nargs property. Use the correct reference node in call to property_entry_read_int_array() to retrieve the correct nargs value. Fixes: b06184acf751 ("software node: Add software_node_get_reference_args()") Signed-off-by: Clément Léger Reviewed-by: Sakari Ailus Reviewed-by: Daniel Scally Acked-by: Heikki Krogerus Signed-off-by: Rafael J. Wysocki --- drivers/base/swnode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/swnode.c b/drivers/base/swnode.c index 4debcea4fb12..0a482212c7e8 100644 --- a/drivers/base/swnode.c +++ b/drivers/base/swnode.c @@ -529,7 +529,7 @@ software_node_get_reference_args(const struct fwnode_handle *fwnode, return -ENOENT; if (nargs_prop) { - error = property_entry_read_int_array(swnode->node->properties, + error = property_entry_read_int_array(ref->node->properties, nargs_prop, sizeof(u32), &nargs_prop_val, 1); if (error) From 3a571fc19673bc00c36b2cd8a2b9811c013115d7 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Tue, 21 Dec 2021 15:32:15 +0200 Subject: [PATCH 42/49] software node: Update MAINTAINERS data base There are two updates to the MAINTAINERS regarding to software node API: - add Daniel Scally to be designated reviewer - add Sakari Ailus to be designated reviewer - add rather tightly related device property files to the list - due to above adjust section name accordingly Signed-off-by: Andy Shevchenko Acked-by: Heikki Krogerus Acked-by: Daniel Scally Acked-by: Sakari Ailus Signed-off-by: Rafael J. 
Wysocki --- MAINTAINERS | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 5250298d2817..e5b6ed281c97 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -17640,12 +17640,17 @@ F: drivers/firmware/arm_sdei.c F: include/linux/arm_sdei.h F: include/uapi/linux/arm_sdei.h -SOFTWARE NODES +SOFTWARE NODES AND DEVICE PROPERTIES R: Andy Shevchenko +R: Daniel Scally R: Heikki Krogerus +R: Sakari Ailus L: linux-acpi@vger.kernel.org S: Maintained +F: drivers/base/property.c F: drivers/base/swnode.c +F: include/linux/fwnode.h +F: include/linux/property.h SOFTWARE RAID (Multiple Disks) SUPPORT M: Song Liu From 2685c77b80a80c57e2a25a726b82fb31e6e212ab Mon Sep 17 00:00:00 2001 From: Sumeet Pawnikar Date: Thu, 23 Dec 2021 15:12:36 +0530 Subject: [PATCH 43/49] thermal/drivers/int340x: Fix RFIM mailbox write commands The existing mail mechanism only supports writing of workload types. However, mailbox command for RFIM (cmd = 0x08) also requires write operation which is ignored. This results in failing to store RFI restriction. Fixing this requires enhancing mailbox writes for non workload commands too, so remove the check for MBOX_CMD_WORKLOAD_TYPE_WRITE in mailbox write to allow these other write commands to be supported. At the same time, however, we have to make sure that there is no impact on read commands, by avoiding to write anything into the mailbox data register. To properly implement that, add two separate functions for mbox read and write commands for the processor thermal workload command type. This helps to distinguish the read and write workload command types from each other while sending mbox commands. Fixes: 5d6fbc96bd36 ("thermal/drivers/int340x: processor_thermal: Export additional attributes") Signed-off-by: Sumeet Pawnikar Cc: 5.14+ # 5.14+ Acked-by: Srinivas Pandruvada [ rjw: Changelog edits ] Signed-off-by: Rafael J.
Wysocki --- .../processor_thermal_device.h | 3 +- .../int340x_thermal/processor_thermal_mbox.c | 110 ++++++++++-------- .../int340x_thermal/processor_thermal_rfim.c | 23 ++-- 3 files changed, 78 insertions(+), 58 deletions(-) diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h index be27f633e40a..9b2a64ef55d0 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_device.h @@ -80,7 +80,8 @@ void proc_thermal_rfim_remove(struct pci_dev *pdev); int proc_thermal_mbox_add(struct pci_dev *pdev, struct proc_thermal_device *proc_priv); void proc_thermal_mbox_remove(struct pci_dev *pdev); -int processor_thermal_send_mbox_cmd(struct pci_dev *pdev, u16 cmd_id, u32 cmd_data, u64 *cmd_resp); +int processor_thermal_send_mbox_read_cmd(struct pci_dev *pdev, u16 id, u64 *resp); +int processor_thermal_send_mbox_write_cmd(struct pci_dev *pdev, u16 id, u32 data); int proc_thermal_add(struct device *dev, struct proc_thermal_device *priv); void proc_thermal_remove(struct proc_thermal_device *proc_priv); int proc_thermal_suspend(struct device *dev); diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_mbox.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_mbox.c index 01008ae00e7f..0b89a4340ff4 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_mbox.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_mbox.c @@ -24,19 +24,15 @@ static DEFINE_MUTEX(mbox_lock); -static int send_mbox_cmd(struct pci_dev *pdev, u16 cmd_id, u32 cmd_data, u64 *cmd_resp) +static int wait_for_mbox_ready(struct proc_thermal_device *proc_priv) { - struct proc_thermal_device *proc_priv; u32 retries, data; int ret; - mutex_lock(&mbox_lock); - proc_priv = pci_get_drvdata(pdev); - /* Poll for rb bit == 0 */ retries = MBOX_RETRY_COUNT; do { - data = readl((void __iomem *) 
(proc_priv->mmio_base + MBOX_OFFSET_INTERFACE)); + data = readl(proc_priv->mmio_base + MBOX_OFFSET_INTERFACE); if (data & BIT_ULL(MBOX_BUSY_BIT)) { ret = -EBUSY; continue; @@ -45,53 +41,78 @@ static int send_mbox_cmd(struct pci_dev *pdev, u16 cmd_id, u32 cmd_data, u64 *cm break; } while (--retries); + return ret; +} + +static int send_mbox_write_cmd(struct pci_dev *pdev, u16 id, u32 data) +{ + struct proc_thermal_device *proc_priv; + u32 reg_data; + int ret; + + proc_priv = pci_get_drvdata(pdev); + + mutex_lock(&mbox_lock); + + ret = wait_for_mbox_ready(proc_priv); if (ret) goto unlock_mbox; - if (cmd_id == MBOX_CMD_WORKLOAD_TYPE_WRITE) - writel(cmd_data, (void __iomem *) ((proc_priv->mmio_base + MBOX_OFFSET_DATA))); - + writel(data, (proc_priv->mmio_base + MBOX_OFFSET_DATA)); /* Write command register */ - data = BIT_ULL(MBOX_BUSY_BIT) | cmd_id; - writel(data, (void __iomem *) ((proc_priv->mmio_base + MBOX_OFFSET_INTERFACE))); + reg_data = BIT_ULL(MBOX_BUSY_BIT) | id; + writel(reg_data, (proc_priv->mmio_base + MBOX_OFFSET_INTERFACE)); - /* Poll for rb bit == 0 */ - retries = MBOX_RETRY_COUNT; - do { - data = readl((void __iomem *) (proc_priv->mmio_base + MBOX_OFFSET_INTERFACE)); - if (data & BIT_ULL(MBOX_BUSY_BIT)) { - ret = -EBUSY; - continue; - } - - if (data) { - ret = -ENXIO; - goto unlock_mbox; - } - - ret = 0; - - if (!cmd_resp) - break; - - if (cmd_id == MBOX_CMD_WORKLOAD_TYPE_READ) - *cmd_resp = readl((void __iomem *) (proc_priv->mmio_base + MBOX_OFFSET_DATA)); - else - *cmd_resp = readq((void __iomem *) (proc_priv->mmio_base + MBOX_OFFSET_DATA)); - - break; - } while (--retries); + ret = wait_for_mbox_ready(proc_priv); unlock_mbox: mutex_unlock(&mbox_lock); return ret; } -int processor_thermal_send_mbox_cmd(struct pci_dev *pdev, u16 cmd_id, u32 cmd_data, u64 *cmd_resp) +static int send_mbox_read_cmd(struct pci_dev *pdev, u16 id, u64 *resp) { - return send_mbox_cmd(pdev, cmd_id, cmd_data, cmd_resp); + struct proc_thermal_device *proc_priv; + u32 reg_data; 
+ int ret; + + proc_priv = pci_get_drvdata(pdev); + + mutex_lock(&mbox_lock); + + ret = wait_for_mbox_ready(proc_priv); + if (ret) + goto unlock_mbox; + + /* Write command register */ + reg_data = BIT_ULL(MBOX_BUSY_BIT) | id; + writel(reg_data, (proc_priv->mmio_base + MBOX_OFFSET_INTERFACE)); + + ret = wait_for_mbox_ready(proc_priv); + if (ret) + goto unlock_mbox; + + if (id == MBOX_CMD_WORKLOAD_TYPE_READ) + *resp = readl(proc_priv->mmio_base + MBOX_OFFSET_DATA); + else + *resp = readq(proc_priv->mmio_base + MBOX_OFFSET_DATA); + +unlock_mbox: + mutex_unlock(&mbox_lock); + return ret; } -EXPORT_SYMBOL_GPL(processor_thermal_send_mbox_cmd); + +int processor_thermal_send_mbox_read_cmd(struct pci_dev *pdev, u16 id, u64 *resp) +{ + return send_mbox_read_cmd(pdev, id, resp); +} +EXPORT_SYMBOL_NS_GPL(processor_thermal_send_mbox_read_cmd, INT340X_THERMAL); + +int processor_thermal_send_mbox_write_cmd(struct pci_dev *pdev, u16 id, u32 data) +{ + return send_mbox_write_cmd(pdev, id, data); +} +EXPORT_SYMBOL_NS_GPL(processor_thermal_send_mbox_write_cmd, INT340X_THERMAL); /* List of workload types */ static const char * const workload_types[] = { @@ -104,7 +125,6 @@ static const char * const workload_types[] = { NULL }; - static ssize_t workload_available_types_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -146,7 +166,7 @@ static ssize_t workload_type_store(struct device *dev, data |= ret; - ret = send_mbox_cmd(pdev, MBOX_CMD_WORKLOAD_TYPE_WRITE, data, NULL); + ret = send_mbox_write_cmd(pdev, MBOX_CMD_WORKLOAD_TYPE_WRITE, data); if (ret) return false; @@ -161,7 +181,7 @@ static ssize_t workload_type_show(struct device *dev, u64 cmd_resp; int ret; - ret = send_mbox_cmd(pdev, MBOX_CMD_WORKLOAD_TYPE_READ, 0, &cmd_resp); + ret = send_mbox_read_cmd(pdev, MBOX_CMD_WORKLOAD_TYPE_READ, &cmd_resp); if (ret) return false; @@ -186,8 +206,6 @@ static const struct attribute_group workload_req_attribute_group = { .name = "workload_request" }; - - static bool 
workload_req_created; int proc_thermal_mbox_add(struct pci_dev *pdev, struct proc_thermal_device *proc_priv) @@ -196,7 +214,7 @@ int proc_thermal_mbox_add(struct pci_dev *pdev, struct proc_thermal_device *proc int ret; /* Check if there is a mailbox support, if fails return success */ - ret = send_mbox_cmd(pdev, MBOX_CMD_WORKLOAD_TYPE_READ, 0, &cmd_resp); + ret = send_mbox_read_cmd(pdev, MBOX_CMD_WORKLOAD_TYPE_READ, &cmd_resp); if (ret) return 0; diff --git a/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c b/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c index e693ec8234fb..8c42e7662033 100644 --- a/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c +++ b/drivers/thermal/intel/int340x_thermal/processor_thermal_rfim.c @@ -9,6 +9,8 @@ #include #include "processor_thermal_device.h" +MODULE_IMPORT_NS(INT340X_THERMAL); + struct mmio_reg { int read_only; u32 offset; @@ -194,8 +196,7 @@ static ssize_t rfi_restriction_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - u16 cmd_id = 0x0008; - u64 cmd_resp; + u16 id = 0x0008; u32 input; int ret; @@ -203,7 +204,7 @@ static ssize_t rfi_restriction_store(struct device *dev, if (ret) return ret; - ret = processor_thermal_send_mbox_cmd(to_pci_dev(dev), cmd_id, input, &cmd_resp); + ret = processor_thermal_send_mbox_write_cmd(to_pci_dev(dev), id, input); if (ret) return ret; @@ -214,30 +215,30 @@ static ssize_t rfi_restriction_show(struct device *dev, struct device_attribute *attr, char *buf) { - u16 cmd_id = 0x0007; - u64 cmd_resp; + u16 id = 0x0007; + u64 resp; int ret; - ret = processor_thermal_send_mbox_cmd(to_pci_dev(dev), cmd_id, 0, &cmd_resp); + ret = processor_thermal_send_mbox_read_cmd(to_pci_dev(dev), id, &resp); if (ret) return ret; - return sprintf(buf, "%llu\n", cmd_resp); + return sprintf(buf, "%llu\n", resp); } static ssize_t ddr_data_rate_show(struct device *dev, struct device_attribute *attr, char *buf) { - u16 cmd_id = 0x0107; - u64 
cmd_resp; + u16 id = 0x0107; + u64 resp; int ret; - ret = processor_thermal_send_mbox_cmd(to_pci_dev(dev), cmd_id, 0, &cmd_resp); + ret = processor_thermal_send_mbox_read_cmd(to_pci_dev(dev), id, &resp); if (ret) return ret; - return sprintf(buf, "%llu\n", cmd_resp); + return sprintf(buf, "%llu\n", resp); } static DEVICE_ATTR_RW(rfi_restriction); From c5b990c71179763d0dab368ccb85ef46ee055335 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 5 Jan 2022 10:41:56 +0100 Subject: [PATCH 44/49] MAINTAINERS/vsprintf: Update link to printk git tree printk git tree has moved to printk/linux.git in February 2020. Acked-by: Andy Shevchenko Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220105094157.26216-2-pmladek@suse.com --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index 7a2345ce8521..2013a36cbf92 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -20399,7 +20399,7 @@ M: Sergey Senozhatsky R: Andy Shevchenko R: Rasmus Villemoes S: Maintained -T: git git://git.kernel.org/pub/scm/linux/kernel/git/pmladek/printk.git +T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git F: Documentation/core-api/printk-formats.rst F: lib/test_printf.c F: lib/test_scanf.c From 73d86812a35965a3eab179abb462b77b4dd8a740 Mon Sep 17 00:00:00 2001 From: Petr Mladek Date: Wed, 5 Jan 2022 10:41:57 +0100 Subject: [PATCH 45/49] MAINTAIERS/printk: Add link to printk git It might also help to avoid confusion with the historic pmladek/printk.git that has got obsoleted by printk/linux.git in February 2020. 
Acked-by: Andy Shevchenko Signed-off-by: Petr Mladek Link: https://lore.kernel.org/r/20220105094157.26216-3-pmladek@suse.com --- MAINTAINERS | 1 + 1 file changed, 1 insertion(+) diff --git a/MAINTAINERS b/MAINTAINERS index 2013a36cbf92..140173502f63 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -15329,6 +15329,7 @@ M: Sergey Senozhatsky R: Steven Rostedt R: John Ogness S: Maintained +T: git git://git.kernel.org/pub/scm/linux/kernel/git/printk/linux.git F: include/linux/printk.h F: kernel/printk/ From 0da41f7348fff193d01d031ce255088fa98324b7 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 Dec 2021 00:09:31 +0000 Subject: [PATCH 46/49] cgroup: rstat: explicitly put loop variant in while MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Instead of looping unconditionally with while (true), let's put the loop variant (the termination condition) in the while statement. Signed-off-by: Wei Yang Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index 1abe74114527..bc6993258271 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -124,12 +124,10 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, prstatc = cgroup_rstat_cpu(parent, cpu); nextp = &prstatc->updated_children; - while (true) { + while (*nextp != pos) { struct cgroup_rstat_cpu *nrstatc; nrstatc = cgroup_rstat_cpu(*nextp, cpu); - if (*nextp == pos) - break; WARN_ON_ONCE(*nextp == parent); nextp = &nrstatc->updated_next; } From f5f60d235e7058da13a643c33fc7599c05ec0b73 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Sat, 25 Dec 2021 00:09:32 +0000 Subject: [PATCH 47/49] cgroup/rstat: check updated_next only for root MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After commit dc26532aed0a ("cgroup: rstat: punt root-level optimization to individual controllers"), each rstat on the updated_children list has its
->updated_next not NULL. This means we can remove the check on ->updated_next, if we make sure the subtree from @root is on list, which could be done by checking updated_next for root. tj: Coding style fixes. Signed-off-by: Wei Yang Reviewed-by: Michal Koutný Signed-off-by: Tejun Heo --- kernel/cgroup/rstat.c | 43 +++++++++++++++++++++---------------------- 1 file changed, 21 insertions(+), 22 deletions(-) diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index bc6993258271..9d331ba44870 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -88,6 +88,7 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, struct cgroup *root, int cpu) { struct cgroup_rstat_cpu *rstatc; + struct cgroup *parent; if (pos == root) return NULL; @@ -96,10 +97,14 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * We're gonna walk down to the first leaf and visit/remove it. We * can pick whatever unvisited node as the starting point. */ - if (!pos) + if (!pos) { pos = root; - else + /* return NULL if this subtree is not on-list */ + if (!cgroup_rstat_cpu(pos, cpu)->updated_next) + return NULL; + } else { pos = cgroup_parent(pos); + } /* walk down to the first leaf */ while (true) { @@ -115,31 +120,25 @@ static struct cgroup *cgroup_rstat_cpu_pop_updated(struct cgroup *pos, * However, due to the way we traverse, @pos will be the first * child in most cases. The only exception is @root. 
*/ - if (rstatc->updated_next) { - struct cgroup *parent = cgroup_parent(pos); + parent = cgroup_parent(pos); + if (parent) { + struct cgroup_rstat_cpu *prstatc; + struct cgroup **nextp; - if (parent) { - struct cgroup_rstat_cpu *prstatc; - struct cgroup **nextp; + prstatc = cgroup_rstat_cpu(parent, cpu); + nextp = &prstatc->updated_children; + while (*nextp != pos) { + struct cgroup_rstat_cpu *nrstatc; - prstatc = cgroup_rstat_cpu(parent, cpu); - nextp = &prstatc->updated_children; - while (*nextp != pos) { - struct cgroup_rstat_cpu *nrstatc; - - nrstatc = cgroup_rstat_cpu(*nextp, cpu); - WARN_ON_ONCE(*nextp == parent); - nextp = &nrstatc->updated_next; - } - *nextp = rstatc->updated_next; + nrstatc = cgroup_rstat_cpu(*nextp, cpu); + WARN_ON_ONCE(*nextp == parent); + nextp = &nrstatc->updated_next; } - - rstatc->updated_next = NULL; - return pos; + *nextp = rstatc->updated_next; } - /* only happens for @root */ - return NULL; + rstatc->updated_next = NULL; + return pos; } /* see cgroup_rstat_flush() */ From d4296faebd337e5f76c0fddb815de33d2b0ad118 Mon Sep 17 00:00:00 2001 From: Qi Zheng Date: Sun, 19 Dec 2021 10:41:54 +0800 Subject: [PATCH 48/49] cpuset: convert 'allowed' in __cpuset_node_allowed() to be boolean Convert 'allowed' in __cpuset_node_allowed() to be boolean since the return types of node_isset() and __cpuset_node_allowed() are both boolean. Signed-off-by: Qi Zheng Signed-off-by: Tejun Heo --- kernel/cgroup/cpuset.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 0dd7d853ed17..dc653ab26e50 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -3528,7 +3528,7 @@ static struct cpuset *nearest_hardwall_ancestor(struct cpuset *cs) bool __cpuset_node_allowed(int node, gfp_t gfp_mask) { struct cpuset *cs; /* current cpuset ancestors */ - int allowed; /* is allocation in zone z allowed? */ + bool allowed; /* is allocation in zone z allowed? 
*/ unsigned long flags; if (in_interrupt()) From 68514dacf2715d11b91ca50d88de047c086fea9c Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Mon, 10 Jan 2022 19:19:23 +0100 Subject: [PATCH 49/49] select: Fix indefinitely sleeping task in poll_schedule_timeout() A task can end up indefinitely sleeping in do_select() -> poll_schedule_timeout() when the following race happens: TASK1 (thread1) TASK2 TASK1 (thread2) do_select() setup poll_wqueues table with 'fd' write data to 'fd' pollwake() table->triggered = 1 closes 'fd' thread1 is waiting for poll_schedule_timeout() - sees table->triggered table->triggered = 0 return -EINTR loop back in do_select() But at this point when TASK1 loops back, the fdget() in the setup of poll_wqueues fails. So now we never find that 'fd' is ready for reading and we sleep in poll_schedule_timeout() indefinitely. Treat an fd that got closed as an fd on which some event happened. This makes sure we cannot block indefinitely in do_select(). Another option would be to return -EBADF in this case but that has the potential of subtly breaking applications that exercise this behavior and for which it happens to work. So returning fd as active seems like a safer choice.
Suggested-by: Linus Torvalds CC: stable@vger.kernel.org Signed-off-by: Jan Kara Signed-off-by: Linus Torvalds --- fs/select.c | 63 ++++++++++++++++++++++++++++------------------------- 1 file changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/select.c b/fs/select.c index 02cd8cb5e69f..0ee55af1a55c 100644 --- a/fs/select.c +++ b/fs/select.c @@ -459,9 +459,11 @@ get_max: return max; } -#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR) -#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR) -#define POLLEX_SET (EPOLLPRI) +#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\ + EPOLLNVAL) +#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\ + EPOLLNVAL) +#define POLLEX_SET (EPOLLPRI | EPOLLNVAL) static inline void wait_key_set(poll_table *wait, unsigned long in, unsigned long out, unsigned long bit, @@ -528,6 +530,7 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) break; if (!(bit & all_bits)) continue; + mask = EPOLLNVAL; f = fdget(i); if (f.file) { wait_key_set(wait, in, out, bit, @@ -535,34 +538,34 @@ static int do_select(int n, fd_set_bits *fds, struct timespec64 *end_time) mask = vfs_poll(f.file, wait); fdput(f); - if ((mask & POLLIN_SET) && (in & bit)) { - res_in |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLOUT_SET) && (out & bit)) { - res_out |= bit; - retval++; - wait->_qproc = NULL; - } - if ((mask & POLLEX_SET) && (ex & bit)) { - res_ex |= bit; - retval++; - wait->_qproc = NULL; - } - /* got something, stop busy polling */ - if (retval) { - can_busy_loop = false; - busy_flag = 0; - - /* - * only remember a returned - * POLL_BUSY_LOOP if we asked for it - */ - } else if (busy_flag & mask) - can_busy_loop = true; - } + if ((mask & POLLIN_SET) && (in & bit)) { + res_in |= bit; + retval++; + wait->_qproc = NULL; + } + if ((mask & POLLOUT_SET) && (out & bit)) { + res_out |= bit; + retval++; + wait->_qproc = NULL; + } + 
if ((mask & POLLEX_SET) && (ex & bit)) { + res_ex |= bit; + retval++; + wait->_qproc = NULL; + } + /* got something, stop busy polling */ + if (retval) { + can_busy_loop = false; + busy_flag = 0; + + /* + * only remember a returned + * POLL_BUSY_LOOP if we asked for it + */ + } else if (busy_flag & mask) + can_busy_loop = true; + } if (res_in) *rinp = res_in;